[Mesa-dev] [PATCH 1/2] radv: overhaul fragment shader sample positions.
Bas Nieuwenhuizen
bas at basnieuwenhuizen.nl
Mon Apr 3 19:40:54 UTC 2017
Series is
Reviewed-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
Not really a fan of yet another flag in the command buffer, but not
sure what would be the best solution.
On Mon, Apr 3, 2017 at 5:44 AM, Dave Airlie <airlied at gmail.com> wrote:
> From: Dave Airlie <airlied at redhat.com>
>
> The current code was broken, and I decided to redesign it instead.
>
> This puts the sample positions for all samples into the queue
> constant descriptor buffer after all the spill/ring descriptors.
>
> It then uses a single offset register to point how far into the
> samples the samples for num_samples are. This saves one user sgpr
> and means we only generate the sample position data in the rare
> single case where we need it currently.
>
> This doesn't fix the failing CTS tests without the followup
> fix.
>
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
> src/amd/common/ac_nir_to_llvm.c | 31 +++++++++++---------
> src/amd/common/ac_nir_to_llvm.h | 4 ++-
> src/amd/vulkan/radv_cmd_buffer.c | 61 ++++++++++++++++++++--------------------
> src/amd/vulkan/radv_device.c | 40 ++++++++++++++++++++++----
> src/amd/vulkan/radv_private.h | 2 ++
> 5 files changed, 87 insertions(+), 51 deletions(-)
>
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 520e4cf..aabcabe 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -109,7 +109,7 @@ struct nir_to_llvm_context {
> LLVMValueRef hs_ring_tess_factor;
>
> LLVMValueRef prim_mask;
> - LLVMValueRef sample_positions;
> + LLVMValueRef sample_pos_offset;
> LLVMValueRef persp_sample, persp_center, persp_centroid;
> LLVMValueRef linear_sample, linear_center, linear_centroid;
> LLVMValueRef front_face;
> @@ -573,6 +573,7 @@ static void create_function(struct nir_to_llvm_context *ctx)
> ctx->stage == MESA_SHADER_VERTEX ||
> ctx->stage == MESA_SHADER_TESS_CTRL ||
> ctx->stage == MESA_SHADER_TESS_EVAL ||
> + ctx->stage == MESA_SHADER_FRAGMENT ||
> ctx->is_gs_copy_shader)
> need_ring_offsets = true;
>
> @@ -584,7 +585,7 @@ static void create_function(struct nir_to_llvm_context *ctx)
> need_push_constants = false;
>
> if (need_ring_offsets && !ctx->options->supports_spill) {
> - arg_types[arg_idx++] = const_array(ctx->v16i8, 8); /* address of rings */
> + arg_types[arg_idx++] = const_array(ctx->v16i8, 16); /* address of rings */
> }
>
> /* 1 for each descriptor set */
> @@ -679,7 +680,7 @@ static void create_function(struct nir_to_llvm_context *ctx)
> arg_types[arg_idx++] = ctx->i32; // GS instance id
> break;
> case MESA_SHADER_FRAGMENT:
> - arg_types[arg_idx++] = const_array(ctx->f32, 32); /* sample positions */
> + arg_types[arg_idx++] = ctx->i32; /* sample position offset */
> user_sgpr_count = arg_idx;
> arg_types[arg_idx++] = ctx->i32; /* prim mask */
> sgpr_count = arg_idx;
> @@ -735,7 +736,7 @@ static void create_function(struct nir_to_llvm_context *ctx)
> LLVMPointerType(ctx->i8, CONST_ADDR_SPACE),
> NULL, 0, AC_FUNC_ATTR_READNONE);
> ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets,
> - const_array(ctx->v16i8, 8), "");
> + const_array(ctx->v16i8, 16), "");
> } else
> ctx->ring_offsets = LLVMGetParam(ctx->main_function, arg_idx++);
> }
> @@ -844,9 +845,9 @@ static void create_function(struct nir_to_llvm_context *ctx)
> ctx->gs_invocation_id = LLVMGetParam(ctx->main_function, arg_idx++);
> break;
> case MESA_SHADER_FRAGMENT:
> - set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS, user_sgpr_idx, 2);
> - user_sgpr_idx += 2;
> - ctx->sample_positions = LLVMGetParam(ctx->main_function, arg_idx++);
> + set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS_OFFSET, user_sgpr_idx, 1);
> + user_sgpr_idx += 1;
> + ctx->sample_pos_offset = LLVMGetParam(ctx->main_function, arg_idx++);
> ctx->prim_mask = LLVMGetParam(ctx->main_function, arg_idx++);
> ctx->persp_sample = LLVMGetParam(ctx->main_function, arg_idx++);
> ctx->persp_center = LLVMGetParam(ctx->main_function, arg_idx++);
> @@ -3518,15 +3519,17 @@ static LLVMValueRef lookup_interp_param(struct nir_to_llvm_context *ctx,
> static LLVMValueRef load_sample_position(struct nir_to_llvm_context *ctx,
> LLVMValueRef sample_id)
> {
> - /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
> - LLVMValueRef offset0 = LLVMBuildMul(ctx->builder, sample_id, LLVMConstInt(ctx->i32, 8, false), "");
> - LLVMValueRef offset1 = LLVMBuildAdd(ctx->builder, offset0, LLVMConstInt(ctx->i32, 4, false), "");
> - LLVMValueRef result[2];
> + LLVMValueRef result;
> + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_PS_SAMPLE_POSITIONS, false));
>
> - result[0] = ac_build_indexed_load_const(&ctx->ac, ctx->sample_positions, offset0);
> - result[1] = ac_build_indexed_load_const(&ctx->ac, ctx->sample_positions, offset1);
> + ptr = LLVMBuildBitCast(ctx->builder, ptr,
> + const_array(ctx->v2f32, 64), "");
>
> - return ac_build_gather_values(&ctx->ac, result, 2);
> + sample_id = LLVMBuildAdd(ctx->builder, sample_id, ctx->sample_pos_offset, "");
> + result = ac_build_indexed_load(&ctx->ac, ptr, sample_id, false);
> +
> + ctx->shader_info->fs.uses_sample_positions = true;
> + return result;
> }
>
> static LLVMValueRef load_sample_pos(struct nir_to_llvm_context *ctx)
> diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
> index c468d93..3d0b456 100644
> --- a/src/amd/common/ac_nir_to_llvm.h
> +++ b/src/amd/common/ac_nir_to_llvm.h
> @@ -88,7 +88,7 @@ enum ac_ud_index {
> AC_UD_VS_BASE_VERTEX_START_INSTANCE,
> AC_UD_VS_LS_TCS_IN_LAYOUT,
> AC_UD_VS_MAX_UD,
> - AC_UD_PS_SAMPLE_POS = AC_UD_SHADER_START,
> + AC_UD_PS_SAMPLE_POS_OFFSET = AC_UD_SHADER_START,
> AC_UD_PS_MAX_UD,
> AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
> AC_UD_CS_MAX_UD,
> @@ -109,6 +109,7 @@ enum ac_ud_index {
> #define RING_GSVS_GS 4
> #define RING_HS_TESS_FACTOR 5
> #define RING_HS_TESS_OFFCHIP 6
> +#define RING_PS_SAMPLE_POSITIONS 7
>
> // Match MAX_SETS from radv_descriptor_set.h
> #define AC_UD_MAX_SETS MAX_SETS
> @@ -165,6 +166,7 @@ struct ac_shader_variant_info {
> bool force_persample;
> bool prim_id_input;
> bool layer_input;
> + bool uses_sample_positions;
> } fs;
> struct {
> unsigned block_size[3];
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> index c95388e..b0a6854 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -222,6 +222,7 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
> cmd_buffer->esgs_ring_size_needed = 0;
> cmd_buffer->gsvs_ring_size_needed = 0;
> cmd_buffer->tess_rings_needed = false;
> + cmd_buffer->sample_positions_needed = false;
>
> if (cmd_buffer->upload.upload_bo)
> cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
> @@ -452,37 +453,35 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
>
> radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples);
>
> - uint32_t samples_offset;
> - void *samples_ptr;
> - void *src;
> - radv_cmd_buffer_upload_alloc(cmd_buffer, num_samples * 4 * 2, 256, &samples_offset,
> - &samples_ptr);
> - switch (num_samples) {
> - case 1:
> - src = cmd_buffer->device->sample_locations_1x;
> - break;
> - case 2:
> - src = cmd_buffer->device->sample_locations_2x;
> - break;
> - case 4:
> - src = cmd_buffer->device->sample_locations_4x;
> - break;
> - case 8:
> - src = cmd_buffer->device->sample_locations_8x;
> - break;
> - case 16:
> - src = cmd_buffer->device->sample_locations_16x;
> - break;
> - default:
> - unreachable("unknown number of samples");
> - }
> - memcpy(samples_ptr, src, num_samples * 4 * 2);
> -
> - uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
> - va += samples_offset;
> + if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.uses_sample_positions) {
> + uint32_t offset;
> + struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_FRAGMENT, AC_UD_PS_SAMPLE_POS_OFFSET);
> + uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_FRAGMENT, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
> + if (loc->sgpr_idx == -1)
> + return;
> + assert(loc->num_sgprs == 1);
> + assert(!loc->indirect);
> + switch (num_samples) {
> + default:
> + offset = 0;
> + break;
> + case 2:
> + offset = 1;
> + break;
> + case 4:
> + offset = 3;
> + break;
> + case 8:
> + offset = 7;
> + break;
> + case 16:
> + offset = 15;
> + break;
> + }
>
> - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT,
> - AC_UD_PS_SAMPLE_POS, va);
> + radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, offset);
> + cmd_buffer->sample_positions_needed = true;
> + }
> }
>
> static void
> @@ -2197,6 +2196,8 @@ void radv_CmdExecuteCommands(
> primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
> if (secondary->tess_rings_needed)
> primary->tess_rings_needed = true;
> + if (secondary->sample_positions_needed)
> + primary->sample_positions_needed = true;
>
> if (secondary->ring_offsets_idx != -1) {
> if (primary->ring_offsets_idx == -1)
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index 5c48be1..24219e4 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -1188,6 +1188,7 @@ static void radv_dump_trace(struct radv_device *device,
> static void
> fill_geom_tess_rings(struct radv_queue *queue,
> uint32_t *map,
> + bool add_sample_positions,
> uint32_t esgs_ring_size,
> struct radeon_winsys_bo *esgs_ring_bo,
> uint32_t gsvs_ring_size,
> @@ -1315,6 +1316,18 @@ fill_geom_tess_rings(struct radv_queue *queue,
> S_008F0C_ELEMENT_SIZE(0) |
> S_008F0C_INDEX_STRIDE(0) |
> S_008F0C_ADD_TID_ENABLE(false);
> + desc += 4;
> +
> + /* add sample positions after all rings */
> + memcpy(desc, queue->device->sample_locations_1x, 8);
> + desc += 2;
> + memcpy(desc, queue->device->sample_locations_2x, 16);
> + desc += 4;
> + memcpy(desc, queue->device->sample_locations_4x, 32);
> + desc += 8;
> + memcpy(desc, queue->device->sample_locations_8x, 64);
> + desc += 16;
> + memcpy(desc, queue->device->sample_locations_16x, 128);
> }
>
> static unsigned
> @@ -1374,6 +1387,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
> uint32_t esgs_ring_size,
> uint32_t gsvs_ring_size,
> bool needs_tess_rings,
> + bool needs_sample_positions,
> struct radeon_winsys_cs **initial_preamble_cs,
> struct radeon_winsys_cs **continue_preamble_cs)
> {
> @@ -1385,7 +1399,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
> struct radeon_winsys_bo *tess_factor_ring_bo = NULL;
> struct radeon_winsys_bo *tess_offchip_ring_bo = NULL;
> struct radeon_winsys_cs *dest_cs[2] = {0};
> - bool add_tess_rings = false;
> + bool add_tess_rings = false, add_sample_positions = false;
> unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
> unsigned max_offchip_buffers;
> unsigned hs_offchip_param = 0;
> @@ -1393,6 +1407,10 @@ radv_get_preamble_cs(struct radv_queue *queue,
> if (needs_tess_rings)
> add_tess_rings = true;
> }
> + if (!queue->has_sample_positions) {
> + if (needs_sample_positions)
> + add_sample_positions = true;
> + }
> tess_factor_ring_size = 32768 * queue->device->physical_device->rad_info.max_se;
> hs_offchip_param = radv_get_hs_offchip_param(queue->device,
> &max_offchip_buffers);
> @@ -1403,7 +1421,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
> compute_scratch_size <= queue->compute_scratch_size &&
> esgs_ring_size <= queue->esgs_ring_size &&
> gsvs_ring_size <= queue->gsvs_ring_size &&
> - !add_tess_rings &&
> + !add_tess_rings && !add_sample_positions &&
> queue->initial_preamble_cs) {
> *initial_preamble_cs = queue->initial_preamble_cs;
> *continue_preamble_cs = queue->continue_preamble_cs;
> @@ -1485,11 +1503,14 @@ radv_get_preamble_cs(struct radv_queue *queue,
> esgs_ring_bo != queue->esgs_ring_bo ||
> gsvs_ring_bo != queue->gsvs_ring_bo ||
> tess_factor_ring_bo != queue->tess_factor_ring_bo ||
> - tess_offchip_ring_bo != queue->tess_offchip_ring_bo) {
> + tess_offchip_ring_bo != queue->tess_offchip_ring_bo || add_sample_positions) {
> uint32_t size = 0;
> if (gsvs_ring_bo || esgs_ring_bo ||
> - tess_factor_ring_bo || tess_offchip_ring_bo)
> + tess_factor_ring_bo || tess_offchip_ring_bo || add_sample_positions) {
> size = 112; /* 2 dword + 2 padding + 4 dword * 6 */
> + if (add_sample_positions)
> + size += 256; /* 32+16+8+4+2+1 samples * 4 * 2 = 248 bytes. */
> + }
> else if (scratch_bo)
> size = 8; /* 2 dword */
>
> @@ -1541,8 +1562,9 @@ radv_get_preamble_cs(struct radv_queue *queue,
> map[1] = rsrc1;
> }
>
> - if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo)
> - fill_geom_tess_rings(queue, map,
> + if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo ||
> + add_sample_positions)
> + fill_geom_tess_rings(queue, map, add_sample_positions,
> esgs_ring_size, esgs_ring_bo,
> gsvs_ring_size, gsvs_ring_bo,
> tess_factor_ring_size, tess_factor_ring_bo,
> @@ -1685,6 +1707,9 @@ radv_get_preamble_cs(struct radv_queue *queue,
> queue->descriptor_bo = descriptor_bo;
> }
>
> + if (add_sample_positions)
> + queue->has_sample_positions = true;
> +
> *initial_preamble_cs = queue->initial_preamble_cs;
> *continue_preamble_cs = queue->continue_preamble_cs;
> if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
> @@ -1730,6 +1755,7 @@ VkResult radv_QueueSubmit(
> VkResult result;
> bool fence_emitted = false;
> bool tess_rings_needed = false;
> + bool sample_positions_needed = false;
>
> /* Do this first so failing to allocate scratch buffers can't result in
> * partially executed submissions. */
> @@ -1744,11 +1770,13 @@ VkResult radv_QueueSubmit(
> esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
> gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
> tess_rings_needed |= cmd_buffer->tess_rings_needed;
> + sample_positions_needed |= cmd_buffer->sample_positions_needed;
> }
> }
>
> result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
> esgs_ring_size, gsvs_ring_size, tess_rings_needed,
> + sample_positions_needed,
> &initial_preamble_cs, &continue_preamble_cs);
> if (result != VK_SUCCESS)
> return result;
> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
> index 3f92d59..a28c825 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -460,6 +460,7 @@ struct radv_queue {
> uint32_t esgs_ring_size;
> uint32_t gsvs_ring_size;
> bool has_tess_rings;
> + bool has_sample_positions;
>
> struct radeon_winsys_bo *scratch_bo;
> struct radeon_winsys_bo *descriptor_bo;
> @@ -748,6 +749,7 @@ struct radv_cmd_buffer {
> uint32_t esgs_ring_size_needed;
> uint32_t gsvs_ring_size_needed;
> bool tess_rings_needed;
> + bool sample_positions_needed;
>
> int ring_offsets_idx; /* just used for verification */
> };
> --
> 2.7.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list