[Mesa-dev] [PATCH 08/10] radv: move userdata sgpr ownership to compiler side.

Dave Airlie airlied at gmail.com
Mon Dec 5 00:02:02 UTC 2016


On 4 December 2016 at 08:04, Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl> wrote:
> On Mon, Nov 28, 2016 at 5:19 AM, Dave Airlie <airlied at gmail.com> wrote:
>> From: Dave Airlie <airlied at redhat.com>
>>
>> This isn't fully what we want yet, but is a good step on the way.
>>
>> This allows the compiler to create the information structures
>> for the state setting side; however, the state setting still expects
>> things to be pretty much in 2-SGPR-wide register sets and can't
>> handle the indirect setting yet.
>>
>> Signed-off-by: Dave Airlie <airlied at redhat.com>
>> ---
>>  src/amd/common/ac_nir_to_llvm.c  |  56 +++++++++++-
>>  src/amd/common/ac_nir_to_llvm.h  |  44 ++++++----
>>  src/amd/vulkan/radv_cmd_buffer.c | 182 +++++++++++++++++++++++++++------------
>>  3 files changed, 209 insertions(+), 73 deletions(-)
>>
>> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
>> index 2108902..54cae9a 100644
>> --- a/src/amd/common/ac_nir_to_llvm.c
>> +++ b/src/amd/common/ac_nir_to_llvm.c
>> @@ -62,7 +62,7 @@ struct nir_to_llvm_context {
>>         struct hash_table *defs;
>>         struct hash_table *phis;
>>
>> -       LLVMValueRef descriptor_sets[4];
>> +       LLVMValueRef descriptor_sets[AC_UD_MAX_SETS];
>>         LLVMValueRef push_constants;
>>         LLVMValueRef num_work_groups;
>>         LLVMValueRef workgroup_ids;
>> @@ -426,6 +426,45 @@ static LLVMValueRef build_indexed_load_const(struct nir_to_llvm_context *ctx,
>>         return result;
>>  }
>>
>> +static void set_userdata_location(struct ac_userdata_info *ud_info, uint8_t sgpr_idx, uint8_t num_sgprs)
>> +{
>> +       ud_info->sgpr_idx = sgpr_idx;
>> +       ud_info->num_sgprs = num_sgprs;
>> +       ud_info->indirect = false;
>> +       ud_info->indirect_offset = 0;
>> +}
>> +
>> +static void set_userdata_location_shader(struct nir_to_llvm_context *ctx,
>> +                                        int idx, uint8_t sgpr_idx, uint8_t num_sgprs)
>> +{
>> +       set_userdata_location(&ctx->shader_info->user_sgprs_locs.shader_data[idx], sgpr_idx, num_sgprs);
>> +}
>> +
>> +#if 0
>> +static void set_userdata_location_indirect(struct ac_userdata_info *ud_info, uint8_t sgpr_idx, uint8_t num_sgprs,
>> +                                          uint32_t indirect_offset)
>> +{
>> +       ud_info->sgpr_idx = sgpr_idx;
>> +       ud_info->num_sgprs = num_sgprs;
>> +       ud_info->indirect = true;
>> +       ud_info->indirect_offset = indirect_offset;
>> +}
>> +#endif
>> +
>> +#define AC_USERDATA_DESCRIPTOR_SET_0 0
>> +#define AC_USERDATA_DESCRIPTOR_SET_1 2
>> +#define AC_USERDATA_DESCRIPTOR_SET_2 4
>> +#define AC_USERDATA_DESCRIPTOR_SET_3 6
>> +#define AC_USERDATA_PUSH_CONST_DYN 8
>> +
>> +#define AC_USERDATA_VS_VERTEX_BUFFERS 10
>> +#define AC_USERDATA_VS_BASE_VERTEX 12
>> +#define AC_USERDATA_VS_START_INSTANCE 13
>> +
>> +#define AC_USERDATA_PS_SAMPLE_POS 10
>> +
>> +#define AC_USERDATA_CS_GRID_SIZE 10
>> +
>>  static void create_function(struct nir_to_llvm_context *ctx,
>>                              struct nir_shader *nir)
>>  {
>> @@ -510,14 +549,18 @@ static void create_function(struct nir_to_llvm_context *ctx,
>>                         ctx->shader_info->num_input_vgprs += llvm_get_type_size(arg_types[i]) / 4;
>>
>>         arg_idx = 0;
>> -       for (unsigned i = 0; i < 4; ++i)
>> +       for (unsigned i = 0; i < 4; ++i) {
>> +               set_userdata_location(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], i * 2, 2);
>>                 ctx->descriptor_sets[i] =
>>                     LLVMGetParam(ctx->main_function, arg_idx++);
>> +       }
>>
>>         ctx->push_constants = LLVMGetParam(ctx->main_function, arg_idx++);
>> +       set_userdata_location_shader(ctx, AC_UD_PUSH_CONSTANTS, AC_USERDATA_PUSH_CONST_DYN, 2);
>>
>>         switch (nir->stage) {
>>         case MESA_SHADER_COMPUTE:
>> +               set_userdata_location_shader(ctx, AC_UD_CS_GRID_SIZE, AC_USERDATA_CS_GRID_SIZE, 3);
>>                 ctx->num_work_groups =
>>                     LLVMGetParam(ctx->main_function, arg_idx++);
>>                 ctx->workgroup_ids =
>> @@ -528,7 +571,9 @@ static void create_function(struct nir_to_llvm_context *ctx,
>>                     LLVMGetParam(ctx->main_function, arg_idx++);
>>                 break;
>>         case MESA_SHADER_VERTEX:
>> +               set_userdata_location_shader(ctx, AC_UD_VS_VERTEX_BUFFERS, AC_USERDATA_VS_VERTEX_BUFFERS, 2);
>>                 ctx->vertex_buffers = LLVMGetParam(ctx->main_function, arg_idx++);
>> +               set_userdata_location_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, AC_USERDATA_VS_BASE_VERTEX, 2);
>>                 ctx->base_vertex = LLVMGetParam(ctx->main_function, arg_idx++);
>>                 ctx->start_instance = LLVMGetParam(ctx->main_function, arg_idx++);
>>                 ctx->vertex_id = LLVMGetParam(ctx->main_function, arg_idx++);
>> @@ -537,6 +582,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>>                 ctx->instance_id = LLVMGetParam(ctx->main_function, arg_idx++);
>>                 break;
>>         case MESA_SHADER_FRAGMENT:
>> +               set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS, AC_USERDATA_PS_SAMPLE_POS, 2);
>>                 ctx->sample_positions = LLVMGetParam(ctx->main_function, arg_idx++);
>>                 ctx->prim_mask = LLVMGetParam(ctx->main_function, arg_idx++);
>>                 ctx->persp_sample = LLVMGetParam(ctx->main_function, arg_idx++);
>> @@ -4564,6 +4610,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>>  {
>>         struct nir_to_llvm_context ctx = {0};
>>         struct nir_function *func;
>> +       unsigned i;
>>         ctx.options = options;
>>         ctx.shader_info = shader_info;
>>         ctx.context = LLVMContextCreate();
>> @@ -4579,6 +4626,11 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>>         ctx.builder = LLVMCreateBuilderInContext(ctx.context);
>>         ctx.stage = nir->stage;
>>
>> +       for (i = 0; i < AC_UD_MAX_SETS; i++)
>> +               shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
>> +       for (i = 0; i < AC_UD_MAX_UD; i++)
>> +               shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1;
>> +
>>         create_function(&ctx, nir);
>>
>>         if (nir->stage == MESA_SHADER_COMPUTE) {
>> diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
>> index ca06d05..b21fce3 100644
>> --- a/src/amd/common/ac_nir_to_llvm.h
>> +++ b/src/amd/common/ac_nir_to_llvm.h
>> @@ -56,7 +56,35 @@ struct ac_nir_compiler_options {
>>         enum chip_class chip_class;
>>  };
>>
>> +struct ac_userdata_info {
>> +       int8_t sgpr_idx;
>> +       uint8_t num_sgprs;
>> +       bool indirect;
>> +       uint32_t indirect_offset;
>
> I'm not too sure how we want to pass indirect vars: mainly how many
> buffers we use and how to split things into those buffers (when
> considering the whole problem, this includes vertex attribute
> descriptors and tess/gs ring buffers), as well as whether we want to
> only cache some values but still put them into the buffer (e.g. for
> push constants, to allow for indirect access?).
>
> Otherwise this series looks reasonable to me. Have you tested it on
> CTS and Talos?

Yes, I'm not sure how we should do indirects; the SI docs mention
that their compiler does all or nothing (at least it did when the docs
were written; look for "Flat Table Mode").

My current thinking is still to make user SGPRs 0,1 point to a buffer of
global pointers, i.e. s[0:1] point to a buffer laid out as:

dwords 0-1:  spill ring uint64_t pointer
dwords 2-5:  ring descriptor[0]
dwords 6-9:  ring descriptor[1]

where the ring descriptors are the es/gs/vs/tess rings.
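
As a rough sketch of that layout (nothing in this patch defines such a
struct; the name and the fixed set of two rings here are just
illustrative):

struct radv_global_ptr_buffer {
	/* dwords 0-1: pointer to the spill ring */
	uint64_t spill_ring_ptr;
	/* dwords 2-5 and 6-9: GCN buffer descriptors are 4 dwords each,
	 * one per ring (es/gs/vs/tess, depending on the stages in use) */
	uint32_t ring_desc[2][4];
};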

Then I'd probably reserve indirects for the descriptor set cases
where we have more than one set bound (AMD recommend using
a single set), and leave vertex buffers on their own as well.

I suppose it might be worth optimising push constants too, but I'd say
it could be very hit and miss which approach is optimal for which
application.
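
Purely as an illustration of the indirect path (the ac_userdata_info
fields come from this patch; radv_emit_indirect_userdata, the
radv_write_data helper and indirect_buf_va are hypothetical):

/* Instead of writing a descriptor-set VA straight into user SGPRs,
 * store it at indirect_offset inside a per-draw buffer whose address
 * already lives in the SGPR pair at sgpr_idx; the shader then loads
 * the value from that buffer. */
static void
radv_emit_indirect_userdata(struct radv_cmd_buffer *cmd_buffer,
			    const struct ac_userdata_info *loc,
			    uint64_t indirect_buf_va, uint64_t va)
{
	uint32_t data[2] = { (uint32_t)va, (uint32_t)(va >> 32) };

	assert(loc->indirect);
	/* Assumed helper: writes 'count' dwords to a GPU address,
	 * e.g. via a WRITE_DATA packet. */
	radv_write_data(cmd_buffer, indirect_buf_va + loc->indirect_offset,
			2, data);
}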

I'll do some more testing on this series, but I've seen no regressions
with it yet, and my initial shader-db run shows some improvements (not
that I can trust my shader-db setup to be doing anything sane yet).

Dave.
>> +};
>> +
>> +enum ac_ud_index {
>> +       AC_UD_PUSH_CONSTANTS = 0,
>> +       AC_UD_SHADER_START = 1,
>> +       AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
>> +       AC_UD_VS_BASE_VERTEX_START_INSTANCE,
>> +       AC_UD_VS_MAX_UD,
>> +       AC_UD_PS_SAMPLE_POS = AC_UD_SHADER_START,
>> +       AC_UD_PS_MAX_UD,
>> +       AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
>> +       AC_UD_CS_MAX_UD,
>> +       AC_UD_MAX_UD = AC_UD_VS_MAX_UD,
>> +};
>> +
>> +#define AC_UD_MAX_SETS 4
>> +
>> +struct ac_userdata_locations {
>> +       struct ac_userdata_info descriptor_sets[AC_UD_MAX_SETS];
>> +       struct ac_userdata_info shader_data[AC_UD_MAX_UD];
>> +};
>> +
>>  struct ac_shader_variant_info {
>> +       struct ac_userdata_locations user_sgprs_locs;
>>         unsigned num_user_sgprs;
>>         unsigned num_input_sgprs;
>>         unsigned num_input_vgprs;
>> @@ -96,20 +124,4 @@ void ac_compile_nir_shader(LLVMTargetMachineRef tm,
>>                             const struct ac_nir_compiler_options *options,
>>                            bool dump_shader);
>>
>> -/* SHADER ABI defines */
>> -
>> -/* offset in dwords */
>> -#define AC_USERDATA_DESCRIPTOR_SET_0 0
>> -#define AC_USERDATA_DESCRIPTOR_SET_1 2
>> -#define AC_USERDATA_DESCRIPTOR_SET_2 4
>> -#define AC_USERDATA_DESCRIPTOR_SET_3 6
>> -#define AC_USERDATA_PUSH_CONST_DYN 8
>> -
>> -#define AC_USERDATA_VS_VERTEX_BUFFERS 10
>> -#define AC_USERDATA_VS_BASE_VERTEX 12
>> -#define AC_USERDATA_VS_START_INSTANCE 13
>> -
>> -#define AC_USERDATA_PS_SAMPLE_POS 10
>> -
>> -#define AC_USERDATA_CS_GRID_SIZE 10
>>
>> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
>> index 99d5f9c..2dfb456 100644
>> --- a/src/amd/vulkan/radv_cmd_buffer.c
>> +++ b/src/amd/vulkan/radv_cmd_buffer.c
>> @@ -276,6 +276,46 @@ static unsigned radv_pack_float_12p4(float x)
>>                x >= 4096 ? 0xffff : x * 16;
>>  }
>>
>> +static uint32_t
>> +shader_stage_to_user_data_0(gl_shader_stage stage)
>> +{
>> +       switch (stage) {
>> +       case MESA_SHADER_FRAGMENT:
>> +               return R_00B030_SPI_SHADER_USER_DATA_PS_0;
>> +       case MESA_SHADER_VERTEX:
>> +               return R_00B130_SPI_SHADER_USER_DATA_VS_0;
>> +       case MESA_SHADER_COMPUTE:
>> +               return R_00B900_COMPUTE_USER_DATA_0;
>> +       default:
>> +               unreachable("unknown shader");
>> +       }
>> +}
>> +
>> +static struct ac_userdata_info *
>> +radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
>> +                     gl_shader_stage stage,
>> +                     int idx)
>> +{
>> +       return &pipeline->shaders[stage]->info.user_sgprs_locs.shader_data[idx];
>> +}
>> +
>> +static void
>> +radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer,
>> +                          struct radv_pipeline *pipeline,
>> +                          gl_shader_stage stage,
>> +                          int idx, uint64_t va)
>> +{
>> +       struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
>> +       uint32_t base_reg = shader_stage_to_user_data_0(stage);
>> +       if (loc->sgpr_idx == -1)
>> +               return;
>> +       assert(loc->num_sgprs == 2);
>> +       assert(!loc->indirect);
>> +       radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 2);
>> +       radeon_emit(cmd_buffer->cs, va);
>> +       radeon_emit(cmd_buffer->cs, va >> 32);
>> +}
>> +
>>  static void
>>  radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
>>                               struct radv_pipeline *pipeline)
>> @@ -327,9 +367,8 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
>>         uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
>>         va += samples_offset;
>>
>> -       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B030_SPI_SHADER_USER_DATA_PS_0 + AC_USERDATA_PS_SAMPLE_POS * 4, 2);
>> -       radeon_emit(cmd_buffer->cs, va);
>> -       radeon_emit(cmd_buffer->cs, va >> 32);
>> +       radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT,
>> +                                  AC_UD_PS_SAMPLE_POS, va);
>>  }
>>
>>  static void
>> @@ -896,34 +935,51 @@ radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
>>
>>  static void
>>  emit_stage_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer,
>> +                                  struct radv_pipeline *pipeline,
>>                                    int idx,
>>                                    uint64_t va,
>> -                                  uint32_t base_reg)
>> +                                  gl_shader_stage stage)
>>  {
>> +       struct ac_userdata_info *desc_set_loc = &pipeline->shaders[stage]->info.user_sgprs_locs.descriptor_sets[idx];
>> +       uint32_t base_reg = shader_stage_to_user_data_0(stage);
>> +
>> +       if (desc_set_loc->sgpr_idx == -1)
>> +               return;
>> +
>> +       assert(!desc_set_loc->indirect);
>> +       assert(desc_set_loc->num_sgprs == 2);
>>         radeon_set_sh_reg_seq(cmd_buffer->cs,
>> -                             base_reg + 8 * idx, 2);
>> +                             base_reg + desc_set_loc->sgpr_idx * 4, 2);
>>         radeon_emit(cmd_buffer->cs, va);
>>         radeon_emit(cmd_buffer->cs, va >> 32);
>>  }
>>
>>  static void
>>  radv_emit_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer,
>> +                                 struct radv_pipeline *pipeline,
>>                                   VkShaderStageFlags stages,
>>                                   struct radv_descriptor_set *set,
>>                                   unsigned idx)
>>  {
>>         if (stages & VK_SHADER_STAGE_FRAGMENT_BIT)
>> -               emit_stage_descriptor_set_userdata(cmd_buffer, idx, set->va, R_00B030_SPI_SHADER_USER_DATA_PS_0);
>> +               emit_stage_descriptor_set_userdata(cmd_buffer, pipeline,
>> +                                                  idx, set->va,
>> +                                                  MESA_SHADER_FRAGMENT);
>>
>>         if (stages & VK_SHADER_STAGE_VERTEX_BIT)
>> -               emit_stage_descriptor_set_userdata(cmd_buffer, idx, set->va, R_00B130_SPI_SHADER_USER_DATA_VS_0);
>> +               emit_stage_descriptor_set_userdata(cmd_buffer, pipeline,
>> +                                                  idx, set->va,
>> +                                                  MESA_SHADER_VERTEX);
>>
>>         if (stages & VK_SHADER_STAGE_COMPUTE_BIT)
>> -               emit_stage_descriptor_set_userdata(cmd_buffer, idx, set->va, R_00B900_COMPUTE_USER_DATA_0);
>> +               emit_stage_descriptor_set_userdata(cmd_buffer, pipeline,
>> +                                                  idx, set->va,
>> +                                                  MESA_SHADER_COMPUTE);
>>  }
>>
>>  static void
>>  radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
>> +                      struct radv_pipeline *pipeline,
>>                        VkShaderStageFlags stages)
>>  {
>>         unsigned i;
>> @@ -937,23 +993,12 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
>>                 if (!set)
>>                         continue;
>>
>> -               radv_emit_descriptor_set_userdata(cmd_buffer, stages, set, i);
>> +               radv_emit_descriptor_set_userdata(cmd_buffer, pipeline, stages, set, i);
>>         }
>>         cmd_buffer->state.descriptors_dirty = 0;
>>  }
>>
>>  static void
>> -emit_constants_set_userdata(struct radv_cmd_buffer *cmd_buffer,
>> -                           uint64_t va,
>> -                           uint32_t base_reg)
>> -{
>> -       radeon_set_sh_reg_seq(cmd_buffer->cs,
>> -                             base_reg + 4 * AC_USERDATA_PUSH_CONST_DYN, 2);
>> -       radeon_emit(cmd_buffer->cs, va);
>> -       radeon_emit(cmd_buffer->cs, va >> 32);
>> -}
>> -
>> -static void
>>  radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
>>                      struct radv_pipeline *pipeline,
>>                      VkShaderStageFlags stages)
>> @@ -979,13 +1024,16 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
>>         va += offset;
>>
>>         if (stages & VK_SHADER_STAGE_VERTEX_BIT)
>> -               emit_constants_set_userdata(cmd_buffer, va, R_00B130_SPI_SHADER_USER_DATA_VS_0);
>> +               radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX,
>> +                                          AC_UD_PUSH_CONSTANTS, va);
>>
>>         if (stages & VK_SHADER_STAGE_FRAGMENT_BIT)
>> -               emit_constants_set_userdata(cmd_buffer, va, R_00B030_SPI_SHADER_USER_DATA_PS_0);
>> +               radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT,
>> +                                          AC_UD_PUSH_CONSTANTS, va);
>>
>>         if (stages & VK_SHADER_STAGE_COMPUTE_BIT)
>> -               emit_constants_set_userdata(cmd_buffer, va, R_00B900_COMPUTE_USER_DATA_0);
>> +               radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_COMPUTE,
>> +                                          AC_UD_PUSH_CONSTANTS, va);
>>
>>         cmd_buffer->push_constant_stages &= ~stages;
>>  }
>> @@ -1036,11 +1084,9 @@ radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer)
>>
>>                 va = device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
>>                 va += vb_offset;
>> -               radeon_set_sh_reg_seq(cmd_buffer->cs,
>> -                                     R_00B130_SPI_SHADER_USER_DATA_VS_0 + AC_USERDATA_VS_VERTEX_BUFFERS * 4, 2);
>> -               radeon_emit(cmd_buffer->cs, va);
>> -               radeon_emit(cmd_buffer->cs, va >> 32);
>>
>> +               radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX,
>> +                                          AC_UD_VS_VERTEX_BUFFERS, va);
>>         }
>>
>>         cmd_buffer->state.vertex_descriptors_dirty = false;
>> @@ -1075,7 +1121,8 @@ radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer)
>>
>>         radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
>>
>> -       radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
>> +       radv_flush_descriptors(cmd_buffer, cmd_buffer->state.pipeline,
>> +                              VK_SHADER_STAGE_ALL_GRAPHICS);
>>         radv_flush_constants(cmd_buffer, cmd_buffer->state.pipeline,
>>                              VK_SHADER_STAGE_ALL_GRAPHICS);
>>
>> @@ -1820,9 +1867,13 @@ void radv_CmdDraw(
>>
>>         unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
>>
>> -       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B130_SPI_SHADER_USER_DATA_VS_0 + AC_USERDATA_VS_BASE_VERTEX * 4, 2);
>> -       radeon_emit(cmd_buffer->cs, firstVertex);
>> -       radeon_emit(cmd_buffer->cs, firstInstance);
>> +       struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
>> +                                                            AC_UD_VS_BASE_VERTEX_START_INSTANCE);
>> +       if (loc->sgpr_idx != -1) {
>> +               radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B130_SPI_SHADER_USER_DATA_VS_0 + loc->sgpr_idx * 4, 2);
>> +               radeon_emit(cmd_buffer->cs, firstVertex);
>> +               radeon_emit(cmd_buffer->cs, firstInstance);
>> +       }
>>         radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
>>         radeon_emit(cmd_buffer->cs, instanceCount);
>>
>> @@ -1867,9 +1918,13 @@ void radv_CmdDrawIndexed(
>>         radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
>>         radeon_emit(cmd_buffer->cs, cmd_buffer->state.index_type);
>>
>> -       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B130_SPI_SHADER_USER_DATA_VS_0 + AC_USERDATA_VS_BASE_VERTEX * 4, 2);
>> -       radeon_emit(cmd_buffer->cs, vertexOffset);
>> -       radeon_emit(cmd_buffer->cs, firstInstance);
>> +       struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
>> +                                                            AC_UD_VS_BASE_VERTEX_START_INSTANCE);
>> +       if (loc->sgpr_idx != -1) {
>> +               radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B130_SPI_SHADER_USER_DATA_VS_0 + loc->sgpr_idx * 4, 2);
>> +               radeon_emit(cmd_buffer->cs, vertexOffset);
>> +               radeon_emit(cmd_buffer->cs, firstInstance);
>> +       }
>>         radeon_emit(cmd_buffer->cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
>>         radeon_emit(cmd_buffer->cs, instanceCount);
>>
>> @@ -1914,6 +1969,9 @@ radv_emit_indirect_draw(struct radv_cmd_buffer *cmd_buffer,
>>
>>         cmd_buffer->device->ws->cs_add_buffer(cs, buffer->bo, 8);
>>
>> +       struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
>> +                                                            AC_UD_VS_BASE_VERTEX_START_INSTANCE);
>> +       assert(loc->sgpr_idx != -1);
>>         radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
>>         radeon_emit(cs, 1);
>>         radeon_emit(cs, indirect_va);
>> @@ -1923,8 +1981,8 @@ radv_emit_indirect_draw(struct radv_cmd_buffer *cmd_buffer,
>>                                        PKT3_DRAW_INDIRECT_MULTI,
>>                              8, false));
>>         radeon_emit(cs, 0);
>> -       radeon_emit(cs, ((R_00B130_SPI_SHADER_USER_DATA_VS_0 + AC_USERDATA_VS_BASE_VERTEX * 4) - SI_SH_REG_OFFSET) >> 2);
>> -       radeon_emit(cs, ((R_00B130_SPI_SHADER_USER_DATA_VS_0 + AC_USERDATA_VS_START_INSTANCE * 4) - SI_SH_REG_OFFSET) >> 2);
>> +       radeon_emit(cs, ((R_00B130_SPI_SHADER_USER_DATA_VS_0 + loc->sgpr_idx * 4) - SI_SH_REG_OFFSET) >> 2);
>> +       radeon_emit(cs, ((R_00B130_SPI_SHADER_USER_DATA_VS_0 + (loc->sgpr_idx + 1) * 4) - SI_SH_REG_OFFSET) >> 2);
>>         radeon_emit(cs, S_2C3_COUNT_INDIRECT_ENABLE(!!count_va)); /* draw_index and count_indirect enable */
>>         radeon_emit(cs, draw_count); /* count */
>>         radeon_emit(cs, count_va); /* count_addr */
>> @@ -2045,7 +2103,8 @@ static void
>>  radv_flush_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
>>  {
>>         radv_emit_compute_pipeline(cmd_buffer);
>> -       radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
>> +       radv_flush_descriptors(cmd_buffer, cmd_buffer->state.compute_pipeline,
>> +                              VK_SHADER_STAGE_COMPUTE_BIT);
>>         radv_flush_constants(cmd_buffer, cmd_buffer->state.compute_pipeline,
>>                              VK_SHADER_STAGE_COMPUTE_BIT);
>>         si_emit_cache_flush(cmd_buffer);
>> @@ -2060,12 +2119,19 @@ void radv_CmdDispatch(
>>         RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
>>
>>         radv_flush_compute_pipeline(cmd_buffer);
>> +
>>         unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 10);
>>
>> -       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + AC_USERDATA_CS_GRID_SIZE * 4, 3);
>> -       radeon_emit(cmd_buffer->cs, x);
>> -       radeon_emit(cmd_buffer->cs, y);
>> -       radeon_emit(cmd_buffer->cs, z);
>> +       struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
>> +                                                            MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
>> +       if (loc->sgpr_idx != -1) {
>> +               assert(!loc->indirect);
>> +               assert(loc->num_sgprs == 3);
>> +               radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
>> +               radeon_emit(cmd_buffer->cs, x);
>> +               radeon_emit(cmd_buffer->cs, y);
>> +               radeon_emit(cmd_buffer->cs, z);
>> +       }
>>
>>         radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
>>                     PKT3_SHADER_TYPE_S(1));
>> @@ -2092,15 +2158,18 @@ void radv_CmdDispatchIndirect(
>>         radv_flush_compute_pipeline(cmd_buffer);
>>
>>         unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 25);
>> -
>> -       for (unsigned i = 0; i < 3; ++i) {
>> -               radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0));
>> -               radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>> -                                           COPY_DATA_DST_SEL(COPY_DATA_REG));
>> -               radeon_emit(cmd_buffer->cs, (va +  4 * i));
>> -               radeon_emit(cmd_buffer->cs, (va + 4 * i) >> 32);
>> -               radeon_emit(cmd_buffer->cs, ((R_00B900_COMPUTE_USER_DATA_0 + AC_USERDATA_CS_GRID_SIZE * 4) >> 2) + i);
>> -               radeon_emit(cmd_buffer->cs, 0);
>> +       struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
>> +                                                            MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
>> +       if (loc->sgpr_idx != -1) {
>> +               for (unsigned i = 0; i < 3; ++i) {
>> +                       radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0));
>> +                       radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>> +                                   COPY_DATA_DST_SEL(COPY_DATA_REG));
>> +                       radeon_emit(cmd_buffer->cs, (va +  4 * i));
>> +                       radeon_emit(cmd_buffer->cs, (va + 4 * i) >> 32);
>> +                       radeon_emit(cmd_buffer->cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i);
>> +                       radeon_emit(cmd_buffer->cs, 0);
>> +               }
>>         }
>>
>>         radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_BASE, 2, 0) |
>> @@ -2151,11 +2220,14 @@ void radv_unaligned_dispatch(
>>                     S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]) |
>>                     S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
>>
>> -       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + AC_USERDATA_CS_GRID_SIZE * 4, 3);
>> -       radeon_emit(cmd_buffer->cs, blocks[0]);
>> -       radeon_emit(cmd_buffer->cs, blocks[1]);
>> -       radeon_emit(cmd_buffer->cs, blocks[2]);
>> -
>> +       struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.compute_pipeline,
>> +                                                            MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
>> +       if (loc->sgpr_idx != -1) {
>> +               radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
>> +               radeon_emit(cmd_buffer->cs, blocks[0]);
>> +               radeon_emit(cmd_buffer->cs, blocks[1]);
>> +               radeon_emit(cmd_buffer->cs, blocks[2]);
>> +       }
>>         radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
>>                     PKT3_SHADER_TYPE_S(1));
>>         radeon_emit(cmd_buffer->cs, blocks[0]);
>> --
>> 2.9.3
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev

