[Mesa-dev] [PATCH 2/2] radeonsi: don't load unused compute shader input SGPRs and VGPRs
Marek Olšák
maraeo at gmail.com
Fri Apr 28 20:24:26 UTC 2017
On Tue, Apr 25, 2017 at 8:24 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> On 24.04.2017 18:22, Marek Olšák wrote:
>>
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> Basically, don't load GRID_SIZE or BLOCK_SIZE if they are unused,
>> determine
>> whether to load BLOCK_ID for each component separately, and set the number
>> of THREAD_ID VGPRs to load. Now we should get the maximum CS launch wave
>> rate in most cases.
>> ---
>> src/gallium/drivers/radeonsi/si_compute.c | 71
>> ++++++++++++++---------
>> src/gallium/drivers/radeonsi/si_shader.c | 37 ++++++++----
>> src/gallium/drivers/radeonsi/si_shader.h | 11 ----
>> src/gallium/drivers/radeonsi/si_shader_internal.h | 5 ++
>> 4 files changed, 76 insertions(+), 48 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c
>> b/src/gallium/drivers/radeonsi/si_compute.c
>> index 2b2efae..b3399d1 100644
>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>> @@ -41,20 +41,22 @@ struct si_compute {
>>
>> unsigned ir_type;
>> unsigned local_size;
>> unsigned private_size;
>> unsigned input_size;
>> struct si_shader shader;
>>
>> struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
>> unsigned use_code_object_v2 : 1;
>> unsigned variable_group_size : 1;
>> + unsigned uses_grid_size:1;
>> + unsigned uses_block_size:1;
>> };
>>
>> struct dispatch_packet {
>> uint16_t header;
>> uint16_t setup;
>> uint16_t workgroup_size_x;
>> uint16_t workgroup_size_y;
>> uint16_t workgroup_size_z;
>> uint16_t reserved0;
>> uint32_t grid_size_x;
>> @@ -114,37 +116,45 @@ static void si_create_compute_state_async(void *job,
>> int thread_index)
>> memset(&sel, 0, sizeof(sel));
>>
>> sel.screen = program->screen;
>> tgsi_scan_shader(program->tokens, &sel.info);
>> sel.tokens = program->tokens;
>> sel.type = PIPE_SHADER_COMPUTE;
>> sel.local_size = program->local_size;
>>
>> program->shader.selector = &sel;
>> program->shader.is_monolithic = true;
>> + program->uses_grid_size = sel.info.uses_grid_size;
>> + program->uses_block_size = sel.info.uses_block_size;
>>
>> if (si_shader_create(program->screen, tm, &program->shader,
>> debug)) {
>> program->shader.compilation_failed = true;
>> } else {
>> bool scratch_enabled =
>> shader->config.scratch_bytes_per_wave > 0;
>> + unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS +
>> + (sel.info.uses_grid_size ? 3 : 0) +
>> + (sel.info.uses_block_size ? 3 : 0);
>>
>> shader->config.rsrc1 =
>> S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4)
>> |
>> S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8)
>> |
>> S_00B848_DX10_CLAMP(1) |
>> S_00B848_FLOAT_MODE(shader->config.float_mode);
>>
>> shader->config.rsrc2 =
>> - S_00B84C_USER_SGPR(SI_CS_NUM_USER_SGPR) |
>> + S_00B84C_USER_SGPR(user_sgprs) |
>> S_00B84C_SCRATCH_EN(scratch_enabled) |
>> - S_00B84C_TGID_X_EN(1) | S_00B84C_TGID_Y_EN(1) |
>> - S_00B84C_TGID_Z_EN(1) | S_00B84C_TIDIG_COMP_CNT(2)
>> |
>> + S_00B84C_TGID_X_EN(sel.info.uses_block_id[0]) |
>> + S_00B84C_TGID_Y_EN(sel.info.uses_block_id[1]) |
>> + S_00B84C_TGID_Z_EN(sel.info.uses_block_id[2]) |
>> + S_00B84C_TIDIG_COMP_CNT(sel.info.uses_thread_id[2]
>> ? 2 :
>> + sel.info.uses_thread_id[1]
>> ? 1 : 0) |
>> S_00B84C_LDS_SIZE(shader->config.lds_size);
>>
>> program->variable_group_size =
>>
>> sel.info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
>> }
>>
>> FREE(program->tokens);
>> program->shader.selector = NULL;
>> }
>>
>> @@ -644,50 +654,57 @@ static bool si_upload_compute_input(struct
>> si_context *sctx,
>> }
>>
>> r600_resource_reference(&input_buffer, NULL);
>>
>> return true;
>> }
>>
>> static void si_setup_tgsi_grid(struct si_context *sctx,
>> const struct pipe_grid_info *info)
>> {
>> + struct si_compute *program = sctx->cs_shader_state.program;
>> struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>> unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 +
>> - 4 * SI_SGPR_GRID_SIZE;
>> + 4 * SI_NUM_RESOURCE_SGPRS;
>> + unsigned block_size_reg = grid_size_reg +
>> + /* 12 bytes = 3 dwords. */
>> + 12 * program->uses_grid_size;
>>
>> if (info->indirect) {
>> - uint64_t base_va =
>> r600_resource(info->indirect)->gpu_address;
>> - uint64_t va = base_va + info->indirect_offset;
>> - int i;
>> -
>> - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>> - (struct r600_resource *)info->indirect,
>> - RADEON_USAGE_READ,
>> RADEON_PRIO_DRAW_INDIRECT);
>> -
>> - for (i = 0; i < 3; ++i) {
>> - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
>> - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>> - COPY_DATA_DST_SEL(COPY_DATA_REG));
>> - radeon_emit(cs, (va + 4 * i));
>> - radeon_emit(cs, (va + 4 * i) >> 32);
>> - radeon_emit(cs, (grid_size_reg >> 2) + i);
>> - radeon_emit(cs, 0);
>> + if (program->uses_grid_size) {
>> + uint64_t base_va =
>> r600_resource(info->indirect)->gpu_address;
>> + uint64_t va = base_va + info->indirect_offset;
>> + int i;
>> +
>> + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>> + (struct r600_resource
>> *)info->indirect,
>> + RADEON_USAGE_READ,
>> RADEON_PRIO_DRAW_INDIRECT);
>> +
>> + for (i = 0; i < 3; ++i) {
>> + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4,
>> 0));
>> + radeon_emit(cs,
>> COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>> +
>> COPY_DATA_DST_SEL(COPY_DATA_REG));
>> + radeon_emit(cs, (va + 4 * i));
>
>
> Extra space (was in the original as well, but may as well fix it now).
>
>
>> + radeon_emit(cs, (va + 4 * i) >> 32);
>> + radeon_emit(cs, (grid_size_reg >> 2) + i);
>> + radeon_emit(cs, 0);
>> + }
>> }
>> } else {
>> - struct si_compute *program =
>> sctx->cs_shader_state.program;
>> -
>> - radeon_set_sh_reg_seq(cs, grid_size_reg,
>> program->variable_group_size ? 6 : 3);
>> - radeon_emit(cs, info->grid[0]);
>> - radeon_emit(cs, info->grid[1]);
>> - radeon_emit(cs, info->grid[2]);
>> - if (program->variable_group_size) {
>> + if (program->uses_grid_size) {
>> + radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
>> + radeon_emit(cs, info->grid[0]);
>> + radeon_emit(cs, info->grid[1]);
>> + radeon_emit(cs, info->grid[2]);
>> + }
>> + if (program->variable_group_size &&
>> program->uses_block_size) {
>> + radeon_set_sh_reg_seq(cs, block_size_reg, 3);
>> radeon_emit(cs, info->block[0]);
>> radeon_emit(cs, info->block[1]);
>> radeon_emit(cs, info->block[2]);
>
>
> This is a slight regression if both are used, though I guess variable group
> size is rare enough that it's not a big deal.
>
> Either way,
>
> Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> By the way: Do we have a test case that exercises the corner case of a
> compute shader that reads only the Y or only the Z component of BLOCK_ID?
> That might be a good idea.
There are no piglits for those, but I have a few here.
Marek
More information about the mesa-dev
mailing list