[Mesa-dev] [PATCH 1/2] radv: overhaul fragment shader sample positions.

Bas Nieuwenhuizen bas at basnieuwenhuizen.nl
Mon Apr 3 19:40:54 UTC 2017


Series is

Reviewed-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>

Not really a fan of yet another flag in the command buffer, but not
sure what would be the best solution.

On Mon, Apr 3, 2017 at 5:44 AM, Dave Airlie <airlied at gmail.com> wrote:
> From: Dave Airlie <airlied at redhat.com>
>
> The current code was broken, and I decided to redesign it instead.
>
> This puts the sample positions for all samples into the queue
> constant descriptor buffer after all the spill/ring descriptors.
>
> It then uses a single offset register to point how far into the
> samples the samples for num_samples are. This saves one user sgpr
> and means we only generate the sample position data in the rare
> single case where we need it currently.
>
> This doesn't fix the failing CTS tests without the followup
> fix.
>
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
>  src/amd/common/ac_nir_to_llvm.c  | 31 +++++++++++---------
>  src/amd/common/ac_nir_to_llvm.h  |  4 ++-
>  src/amd/vulkan/radv_cmd_buffer.c | 61 ++++++++++++++++++++--------------------
>  src/amd/vulkan/radv_device.c     | 40 ++++++++++++++++++++++----
>  src/amd/vulkan/radv_private.h    |  2 ++
>  5 files changed, 87 insertions(+), 51 deletions(-)
>
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 520e4cf..aabcabe 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -109,7 +109,7 @@ struct nir_to_llvm_context {
>         LLVMValueRef hs_ring_tess_factor;
>
>         LLVMValueRef prim_mask;
> -       LLVMValueRef sample_positions;
> +       LLVMValueRef sample_pos_offset;
>         LLVMValueRef persp_sample, persp_center, persp_centroid;
>         LLVMValueRef linear_sample, linear_center, linear_centroid;
>         LLVMValueRef front_face;
> @@ -573,6 +573,7 @@ static void create_function(struct nir_to_llvm_context *ctx)
>             ctx->stage == MESA_SHADER_VERTEX ||
>             ctx->stage == MESA_SHADER_TESS_CTRL ||
>             ctx->stage == MESA_SHADER_TESS_EVAL ||
> +           ctx->stage == MESA_SHADER_FRAGMENT ||
>             ctx->is_gs_copy_shader)
>                 need_ring_offsets = true;
>
> @@ -584,7 +585,7 @@ static void create_function(struct nir_to_llvm_context *ctx)
>                 need_push_constants = false;
>
>         if (need_ring_offsets && !ctx->options->supports_spill) {
> -               arg_types[arg_idx++] = const_array(ctx->v16i8, 8); /* address of rings */
> +               arg_types[arg_idx++] = const_array(ctx->v16i8, 16); /* address of rings */
>         }
>
>         /* 1 for each descriptor set */
> @@ -679,7 +680,7 @@ static void create_function(struct nir_to_llvm_context *ctx)
>                 arg_types[arg_idx++] = ctx->i32; // GS instance id
>                 break;
>         case MESA_SHADER_FRAGMENT:
> -               arg_types[arg_idx++] = const_array(ctx->f32, 32); /* sample positions */
> +               arg_types[arg_idx++] = ctx->i32; /* sample position offset */
>                 user_sgpr_count = arg_idx;
>                 arg_types[arg_idx++] = ctx->i32; /* prim mask */
>                 sgpr_count = arg_idx;
> @@ -735,7 +736,7 @@ static void create_function(struct nir_to_llvm_context *ctx)
>                                                                LLVMPointerType(ctx->i8, CONST_ADDR_SPACE),
>                                                                NULL, 0, AC_FUNC_ATTR_READNONE);
>                         ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets,
> -                                                            const_array(ctx->v16i8, 8), "");
> +                                                            const_array(ctx->v16i8, 16), "");
>                 } else
>                         ctx->ring_offsets = LLVMGetParam(ctx->main_function, arg_idx++);
>         }
> @@ -844,9 +845,9 @@ static void create_function(struct nir_to_llvm_context *ctx)
>                 ctx->gs_invocation_id = LLVMGetParam(ctx->main_function, arg_idx++);
>                 break;
>         case MESA_SHADER_FRAGMENT:
> -               set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS, user_sgpr_idx, 2);
> -               user_sgpr_idx += 2;
> -               ctx->sample_positions = LLVMGetParam(ctx->main_function, arg_idx++);
> +               set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS_OFFSET, user_sgpr_idx, 1);
> +               user_sgpr_idx += 1;
> +               ctx->sample_pos_offset = LLVMGetParam(ctx->main_function, arg_idx++);
>                 ctx->prim_mask = LLVMGetParam(ctx->main_function, arg_idx++);
>                 ctx->persp_sample = LLVMGetParam(ctx->main_function, arg_idx++);
>                 ctx->persp_center = LLVMGetParam(ctx->main_function, arg_idx++);
> @@ -3518,15 +3519,17 @@ static LLVMValueRef lookup_interp_param(struct nir_to_llvm_context *ctx,
>  static LLVMValueRef load_sample_position(struct nir_to_llvm_context *ctx,
>                                          LLVMValueRef sample_id)
>  {
> -       /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
> -       LLVMValueRef offset0 = LLVMBuildMul(ctx->builder, sample_id, LLVMConstInt(ctx->i32, 8, false), "");
> -       LLVMValueRef offset1 = LLVMBuildAdd(ctx->builder, offset0, LLVMConstInt(ctx->i32, 4, false), "");
> -       LLVMValueRef result[2];
> +       LLVMValueRef result;
> +       LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_PS_SAMPLE_POSITIONS, false));
>
> -       result[0] = ac_build_indexed_load_const(&ctx->ac, ctx->sample_positions, offset0);
> -       result[1] = ac_build_indexed_load_const(&ctx->ac, ctx->sample_positions, offset1);
> +       ptr = LLVMBuildBitCast(ctx->builder, ptr,
> +                              const_array(ctx->v2f32, 64), "");
>
> -       return ac_build_gather_values(&ctx->ac, result, 2);
> +       sample_id = LLVMBuildAdd(ctx->builder, sample_id, ctx->sample_pos_offset, "");
> +       result = ac_build_indexed_load(&ctx->ac, ptr, sample_id, false);
> +
> +       ctx->shader_info->fs.uses_sample_positions = true;
> +       return result;
>  }
>
>  static LLVMValueRef load_sample_pos(struct nir_to_llvm_context *ctx)
> diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
> index c468d93..3d0b456 100644
> --- a/src/amd/common/ac_nir_to_llvm.h
> +++ b/src/amd/common/ac_nir_to_llvm.h
> @@ -88,7 +88,7 @@ enum ac_ud_index {
>         AC_UD_VS_BASE_VERTEX_START_INSTANCE,
>         AC_UD_VS_LS_TCS_IN_LAYOUT,
>         AC_UD_VS_MAX_UD,
> -       AC_UD_PS_SAMPLE_POS = AC_UD_SHADER_START,
> +       AC_UD_PS_SAMPLE_POS_OFFSET = AC_UD_SHADER_START,
>         AC_UD_PS_MAX_UD,
>         AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
>         AC_UD_CS_MAX_UD,
> @@ -109,6 +109,7 @@ enum ac_ud_index {
>  #define RING_GSVS_GS 4
>  #define RING_HS_TESS_FACTOR 5
>  #define RING_HS_TESS_OFFCHIP 6
> +#define RING_PS_SAMPLE_POSITIONS 7
>
>  // Match MAX_SETS from radv_descriptor_set.h
>  #define AC_UD_MAX_SETS MAX_SETS
> @@ -165,6 +166,7 @@ struct ac_shader_variant_info {
>                         bool force_persample;
>                         bool prim_id_input;
>                         bool layer_input;
> +                       bool uses_sample_positions;
>                 } fs;
>                 struct {
>                         unsigned block_size[3];
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> index c95388e..b0a6854 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -222,6 +222,7 @@ static void  radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
>         cmd_buffer->esgs_ring_size_needed = 0;
>         cmd_buffer->gsvs_ring_size_needed = 0;
>         cmd_buffer->tess_rings_needed = false;
> +       cmd_buffer->sample_positions_needed = false;
>
>         if (cmd_buffer->upload.upload_bo)
>                 cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
> @@ -452,37 +453,35 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
>
>         radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples);
>
> -       uint32_t samples_offset;
> -       void *samples_ptr;
> -       void *src;
> -       radv_cmd_buffer_upload_alloc(cmd_buffer, num_samples * 4 * 2, 256, &samples_offset,
> -                                    &samples_ptr);
> -       switch (num_samples) {
> -       case 1:
> -               src = cmd_buffer->device->sample_locations_1x;
> -               break;
> -       case 2:
> -               src = cmd_buffer->device->sample_locations_2x;
> -               break;
> -       case 4:
> -               src = cmd_buffer->device->sample_locations_4x;
> -               break;
> -       case 8:
> -               src = cmd_buffer->device->sample_locations_8x;
> -               break;
> -       case 16:
> -               src = cmd_buffer->device->sample_locations_16x;
> -               break;
> -       default:
> -               unreachable("unknown number of samples");
> -       }
> -       memcpy(samples_ptr, src, num_samples * 4 * 2);
> -
> -       uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
> -       va += samples_offset;
> +       if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.uses_sample_positions) {
> +               uint32_t offset;
> +               struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_FRAGMENT, AC_UD_PS_SAMPLE_POS_OFFSET);
> +               uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_FRAGMENT, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
> +               if (loc->sgpr_idx == -1)
> +                       return;
> +               assert(loc->num_sgprs == 1);
> +               assert(!loc->indirect);
> +               switch (num_samples) {
> +               default:
> +                       offset = 0;
> +                       break;
> +               case 2:
> +                       offset = 1;
> +                       break;
> +               case 4:
> +                       offset = 3;
> +                       break;
> +               case 8:
> +                       offset = 7;
> +                       break;
> +               case 16:
> +                       offset = 15;
> +                       break;
> +               }
>
> -       radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT,
> -                                  AC_UD_PS_SAMPLE_POS, va);
> +               radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, offset);
> +               cmd_buffer->sample_positions_needed = true;
> +       }
>  }
>
>  static void
> @@ -2197,6 +2196,8 @@ void radv_CmdExecuteCommands(
>                         primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
>                 if (secondary->tess_rings_needed)
>                         primary->tess_rings_needed = true;
> +               if (secondary->sample_positions_needed)
> +                       primary->sample_positions_needed = true;
>
>                 if (secondary->ring_offsets_idx != -1) {
>                         if (primary->ring_offsets_idx == -1)
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index 5c48be1..24219e4 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -1188,6 +1188,7 @@ static void radv_dump_trace(struct radv_device *device,
>  static void
>  fill_geom_tess_rings(struct radv_queue *queue,
>                      uint32_t *map,
> +                    bool add_sample_positions,
>                      uint32_t esgs_ring_size,
>                      struct radeon_winsys_bo *esgs_ring_bo,
>                      uint32_t gsvs_ring_size,
> @@ -1315,6 +1316,18 @@ fill_geom_tess_rings(struct radv_queue *queue,
>                 S_008F0C_ELEMENT_SIZE(0) |
>                 S_008F0C_INDEX_STRIDE(0) |
>                 S_008F0C_ADD_TID_ENABLE(false);
> +       desc += 4;
> +
> +       /* add sample positions after all rings */
> +       memcpy(desc, queue->device->sample_locations_1x, 8);
> +       desc += 2;
> +       memcpy(desc, queue->device->sample_locations_2x, 16);
> +       desc += 4;
> +       memcpy(desc, queue->device->sample_locations_4x, 32);
> +       desc += 8;
> +       memcpy(desc, queue->device->sample_locations_8x, 64);
> +       desc += 16;
> +       memcpy(desc, queue->device->sample_locations_16x, 128);
>  }
>
>  static unsigned
> @@ -1374,6 +1387,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
>                      uint32_t esgs_ring_size,
>                      uint32_t gsvs_ring_size,
>                      bool needs_tess_rings,
> +                    bool needs_sample_positions,
>                       struct radeon_winsys_cs **initial_preamble_cs,
>                       struct radeon_winsys_cs **continue_preamble_cs)
>  {
> @@ -1385,7 +1399,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
>         struct radeon_winsys_bo *tess_factor_ring_bo = NULL;
>         struct radeon_winsys_bo *tess_offchip_ring_bo = NULL;
>         struct radeon_winsys_cs *dest_cs[2] = {0};
> -       bool add_tess_rings = false;
> +       bool add_tess_rings = false, add_sample_positions = false;
>         unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
>         unsigned max_offchip_buffers;
>         unsigned hs_offchip_param = 0;
> @@ -1393,6 +1407,10 @@ radv_get_preamble_cs(struct radv_queue *queue,
>                 if (needs_tess_rings)
>                         add_tess_rings = true;
>         }
> +       if (!queue->has_sample_positions) {
> +               if (needs_sample_positions)
> +                       add_sample_positions = true;
> +       }
>         tess_factor_ring_size = 32768 * queue->device->physical_device->rad_info.max_se;
>         hs_offchip_param = radv_get_hs_offchip_param(queue->device,
>                                                      &max_offchip_buffers);
> @@ -1403,7 +1421,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
>             compute_scratch_size <= queue->compute_scratch_size &&
>             esgs_ring_size <= queue->esgs_ring_size &&
>             gsvs_ring_size <= queue->gsvs_ring_size &&
> -           !add_tess_rings &&
> +           !add_tess_rings && !add_sample_positions &&
>             queue->initial_preamble_cs) {
>                 *initial_preamble_cs = queue->initial_preamble_cs;
>                 *continue_preamble_cs = queue->continue_preamble_cs;
> @@ -1485,11 +1503,14 @@ radv_get_preamble_cs(struct radv_queue *queue,
>             esgs_ring_bo != queue->esgs_ring_bo ||
>             gsvs_ring_bo != queue->gsvs_ring_bo ||
>             tess_factor_ring_bo != queue->tess_factor_ring_bo ||
> -           tess_offchip_ring_bo != queue->tess_offchip_ring_bo) {
> +           tess_offchip_ring_bo != queue->tess_offchip_ring_bo || add_sample_positions) {
>                 uint32_t size = 0;
>                 if (gsvs_ring_bo || esgs_ring_bo ||
> -                   tess_factor_ring_bo || tess_offchip_ring_bo)
> +                   tess_factor_ring_bo || tess_offchip_ring_bo || add_sample_positions) {
>                         size = 112; /* 2 dword + 2 padding + 4 dword * 6 */
> +                       if (add_sample_positions)
> +                               size += 256; /* 32+16+8+4+2+1 samples * 4 * 2 = 248 bytes. */
> +               }
>                 else if (scratch_bo)
>                         size = 8; /* 2 dword */
>
> @@ -1541,8 +1562,9 @@ radv_get_preamble_cs(struct radv_queue *queue,
>                                 map[1] = rsrc1;
>                         }
>
> -                       if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo)
> -                               fill_geom_tess_rings(queue, map,
> +                       if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo ||
> +                           add_sample_positions)
> +                               fill_geom_tess_rings(queue, map, add_sample_positions,
>                                                      esgs_ring_size, esgs_ring_bo,
>                                                      gsvs_ring_size, gsvs_ring_bo,
>                                                      tess_factor_ring_size, tess_factor_ring_bo,
> @@ -1685,6 +1707,9 @@ radv_get_preamble_cs(struct radv_queue *queue,
>                 queue->descriptor_bo = descriptor_bo;
>         }
>
> +       if (add_sample_positions)
> +               queue->has_sample_positions = true;
> +
>         *initial_preamble_cs = queue->initial_preamble_cs;
>         *continue_preamble_cs = queue->continue_preamble_cs;
>         if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
> @@ -1730,6 +1755,7 @@ VkResult radv_QueueSubmit(
>         VkResult result;
>         bool fence_emitted = false;
>         bool tess_rings_needed = false;
> +       bool sample_positions_needed = false;
>
>         /* Do this first so failing to allocate scratch buffers can't result in
>          * partially executed submissions. */
> @@ -1744,11 +1770,13 @@ VkResult radv_QueueSubmit(
>                         esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
>                         gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
>                         tess_rings_needed |= cmd_buffer->tess_rings_needed;
> +                       sample_positions_needed |= cmd_buffer->sample_positions_needed;
>                 }
>         }
>
>         result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
>                                       esgs_ring_size, gsvs_ring_size, tess_rings_needed,
> +                                     sample_positions_needed,
>                                       &initial_preamble_cs, &continue_preamble_cs);
>         if (result != VK_SUCCESS)
>                 return result;
> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
> index 3f92d59..a28c825 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -460,6 +460,7 @@ struct radv_queue {
>         uint32_t esgs_ring_size;
>         uint32_t gsvs_ring_size;
>         bool has_tess_rings;
> +       bool has_sample_positions;
>
>         struct radeon_winsys_bo *scratch_bo;
>         struct radeon_winsys_bo *descriptor_bo;
> @@ -748,6 +749,7 @@ struct radv_cmd_buffer {
>         uint32_t esgs_ring_size_needed;
>         uint32_t gsvs_ring_size_needed;
>         bool tess_rings_needed;
> +       bool sample_positions_needed;
>
>         int ring_offsets_idx; /* just used for verification */
>  };
> --
> 2.7.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list