[Mesa-dev] [PATCH 2/2] radeonsi: don't load unused compute shader input SGPRs and VGPRs

Marek Olšák maraeo at gmail.com
Fri Apr 28 20:24:26 UTC 2017


On Tue, Apr 25, 2017 at 8:24 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> On 24.04.2017 18:22, Marek Olšák wrote:
>>
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> Basically, don't load GRID_SIZE or BLOCK_SIZE if they are unused,
>> determine
>> whether to load BLOCK_ID for each component separately, and set the number
>> of THREAD_ID VGPRs to load. Now we should get the maximum CS launch wave
>> rate in most cases.
>> ---
>>  src/gallium/drivers/radeonsi/si_compute.c         | 71
>> ++++++++++++++---------
>>  src/gallium/drivers/radeonsi/si_shader.c          | 37 ++++++++----
>>  src/gallium/drivers/radeonsi/si_shader.h          | 11 ----
>>  src/gallium/drivers/radeonsi/si_shader_internal.h |  5 ++
>>  4 files changed, 76 insertions(+), 48 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c
>> b/src/gallium/drivers/radeonsi/si_compute.c
>> index 2b2efae..b3399d1 100644
>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>> @@ -41,20 +41,22 @@ struct si_compute {
>>
>>         unsigned ir_type;
>>         unsigned local_size;
>>         unsigned private_size;
>>         unsigned input_size;
>>         struct si_shader shader;
>>
>>         struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
>>         unsigned use_code_object_v2 : 1;
>>         unsigned variable_group_size : 1;
>> +       unsigned uses_grid_size:1;
>> +       unsigned uses_block_size:1;
>>  };
>>
>>  struct dispatch_packet {
>>         uint16_t header;
>>         uint16_t setup;
>>         uint16_t workgroup_size_x;
>>         uint16_t workgroup_size_y;
>>         uint16_t workgroup_size_z;
>>         uint16_t reserved0;
>>         uint32_t grid_size_x;
>> @@ -114,37 +116,45 @@ static void si_create_compute_state_async(void *job,
>> int thread_index)
>>         memset(&sel, 0, sizeof(sel));
>>
>>         sel.screen = program->screen;
>>         tgsi_scan_shader(program->tokens, &sel.info);
>>         sel.tokens = program->tokens;
>>         sel.type = PIPE_SHADER_COMPUTE;
>>         sel.local_size = program->local_size;
>>
>>         program->shader.selector = &sel;
>>         program->shader.is_monolithic = true;
>> +       program->uses_grid_size = sel.info.uses_grid_size;
>> +       program->uses_block_size = sel.info.uses_block_size;
>>
>>         if (si_shader_create(program->screen, tm, &program->shader,
>> debug)) {
>>                 program->shader.compilation_failed = true;
>>         } else {
>>                 bool scratch_enabled =
>> shader->config.scratch_bytes_per_wave > 0;
>> +               unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS +
>> +                                     (sel.info.uses_grid_size ? 3 : 0) +
>> +                                     (sel.info.uses_block_size ? 3 : 0);
>>
>>                 shader->config.rsrc1 =
>>                         S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4)
>> |
>>                         S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8)
>> |
>>                         S_00B848_DX10_CLAMP(1) |
>>                         S_00B848_FLOAT_MODE(shader->config.float_mode);
>>
>>                 shader->config.rsrc2 =
>> -                       S_00B84C_USER_SGPR(SI_CS_NUM_USER_SGPR) |
>> +                       S_00B84C_USER_SGPR(user_sgprs) |
>>                         S_00B84C_SCRATCH_EN(scratch_enabled) |
>> -                       S_00B84C_TGID_X_EN(1) | S_00B84C_TGID_Y_EN(1) |
>> -                       S_00B84C_TGID_Z_EN(1) | S_00B84C_TIDIG_COMP_CNT(2)
>> |
>> +                       S_00B84C_TGID_X_EN(sel.info.uses_block_id[0]) |
>> +                       S_00B84C_TGID_Y_EN(sel.info.uses_block_id[1]) |
>> +                       S_00B84C_TGID_Z_EN(sel.info.uses_block_id[2]) |
>> +                       S_00B84C_TIDIG_COMP_CNT(sel.info.uses_thread_id[2]
>> ? 2 :
>> +                                               sel.info.uses_thread_id[1]
>> ? 1 : 0) |
>>                         S_00B84C_LDS_SIZE(shader->config.lds_size);
>>
>>                 program->variable_group_size =
>>
>> sel.info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
>>         }
>>
>>         FREE(program->tokens);
>>         program->shader.selector = NULL;
>>  }
>>
>> @@ -644,50 +654,57 @@ static bool si_upload_compute_input(struct
>> si_context *sctx,
>>         }
>>
>>         r600_resource_reference(&input_buffer, NULL);
>>
>>         return true;
>>  }
>>
>>  static void si_setup_tgsi_grid(struct si_context *sctx,
>>                                  const struct pipe_grid_info *info)
>>  {
>> +       struct si_compute *program = sctx->cs_shader_state.program;
>>         struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>>         unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 +
>> -                                 4 * SI_SGPR_GRID_SIZE;
>> +                                4 * SI_NUM_RESOURCE_SGPRS;
>> +       unsigned block_size_reg = grid_size_reg +
>> +                                 /* 12 bytes = 3 dwords. */
>> +                                 12 * program->uses_grid_size;
>>
>>         if (info->indirect) {
>> -               uint64_t base_va =
>> r600_resource(info->indirect)->gpu_address;
>> -               uint64_t va = base_va + info->indirect_offset;
>> -               int i;
>> -
>> -               radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>> -                                (struct r600_resource *)info->indirect,
>> -                                RADEON_USAGE_READ,
>> RADEON_PRIO_DRAW_INDIRECT);
>> -
>> -               for (i = 0; i < 3; ++i) {
>> -                       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
>> -                       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>> -                                       COPY_DATA_DST_SEL(COPY_DATA_REG));
>> -                       radeon_emit(cs, (va +  4 * i));
>> -                       radeon_emit(cs, (va + 4 * i) >> 32);
>> -                       radeon_emit(cs, (grid_size_reg >> 2) + i);
>> -                       radeon_emit(cs, 0);
>> +               if (program->uses_grid_size) {
>> +                       uint64_t base_va =
>> r600_resource(info->indirect)->gpu_address;
>> +                       uint64_t va = base_va + info->indirect_offset;
>> +                       int i;
>> +
>> +                       radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>> +                                        (struct r600_resource
>> *)info->indirect,
>> +                                        RADEON_USAGE_READ,
>> RADEON_PRIO_DRAW_INDIRECT);
>> +
>> +                       for (i = 0; i < 3; ++i) {
>> +                               radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4,
>> 0));
>> +                               radeon_emit(cs,
>> COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>> +
>> COPY_DATA_DST_SEL(COPY_DATA_REG));
>> +                               radeon_emit(cs, (va +  4 * i));
>
>
> Extra space (was in the original as well, but may as well fix it now).
>
>
>> +                               radeon_emit(cs, (va + 4 * i) >> 32);
>> +                               radeon_emit(cs, (grid_size_reg >> 2) + i);
>> +                               radeon_emit(cs, 0);
>> +                       }
>>                 }
>>         } else {
>> -               struct si_compute *program =
>> sctx->cs_shader_state.program;
>> -
>> -               radeon_set_sh_reg_seq(cs, grid_size_reg,
>> program->variable_group_size ? 6 : 3);
>> -               radeon_emit(cs, info->grid[0]);
>> -               radeon_emit(cs, info->grid[1]);
>> -               radeon_emit(cs, info->grid[2]);
>> -               if (program->variable_group_size) {
>> +               if (program->uses_grid_size) {
>> +                       radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
>> +                       radeon_emit(cs, info->grid[0]);
>> +                       radeon_emit(cs, info->grid[1]);
>> +                       radeon_emit(cs, info->grid[2]);
>> +               }
>> +               if (program->variable_group_size &&
>> program->uses_block_size) {
>> +                       radeon_set_sh_reg_seq(cs, block_size_reg, 3);
>>                         radeon_emit(cs, info->block[0]);
>>                         radeon_emit(cs, info->block[1]);
>>                         radeon_emit(cs, info->block[2]);
>
>
> This is a slight regression if both are used, though I guess variable group
> size is rare enough that it's not a big deal.
>
> Either way,
>
> Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> By the way: Do we have a test case that exercises the corner case of a
> compute shader that reads only the Y or only the Z component of BLOCK_ID?
> That might be a good idea.

There are no piglits for those, but I have a few here.

Marek


More information about the mesa-dev mailing list