[Mesa-dev] [PATCH] radeonsi: support TGSI compute shaders with variable block size

Fri Sep 9 12:37:20 UTC 2016

On Fri, Sep 9, 2016 at 8:29 AM, Marek Olšák <maraeo at gmail.com> wrote:
> On Fri, Sep 9, 2016 at 10:12 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>
>> Not sure if it's possible to avoid programming the block size twice (once for
>> the userdata and once for the dispatch).
>>
>> Since the shaders are compiled with a pessimistic upper limit on the number of
>> registers, asynchronously compiling variants may be worth considering in the
>> future if we observe the shaders to be dispatched with small block sizes.
>> ---
>> I think this is sufficient to support variable group sizes on radeonsi, but
>> it's completely untested. Do you keep the latest version of your series in a
>> public repository somewhere?
>>
>>  src/gallium/drivers/radeonsi/si_compute.c | 10 +++++++++-
>>  src/gallium/drivers/radeonsi/si_shader.c  | 29 ++++++++++++++++++++---------
>>  src/gallium/drivers/radeonsi/si_shader.h  |  4 +++-
>>  3 files changed, 32 insertions(+), 11 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
>> index 5041761..26e096c 100644
>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>> @@ -379,25 +379,33 @@ static void si_setup_tgsi_grid(struct si_context *sctx,
>>                 for (i = 0; i < 3; ++i) {
>>                         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
>>                         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>>                                         COPY_DATA_DST_SEL(COPY_DATA_REG));
>>                         radeon_emit(cs, (va +  4 * i));
>>                         radeon_emit(cs, (va + 4 * i) >> 32);
>>                         radeon_emit(cs, (grid_size_reg >> 2) + i);
>>                         radeon_emit(cs, 0);
>>                 }
>>         } else {
>> +               struct si_compute *program = sctx->cs_shader_state.program;
>> +               bool variable_group_size =
>> +                       program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
>>
>> -               radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
>> +               radeon_set_sh_reg_seq(cs, grid_size_reg, variable_group_size ? 6 : 3);
>>                 radeon_emit(cs, info->grid[0]);
>>                 radeon_emit(cs, info->grid[1]);
>>                 radeon_emit(cs, info->grid[2]);
>> +               if (variable_group_size) {
>> +                       radeon_emit(cs, info->block[0]);
>> +                       radeon_emit(cs, info->block[1]);
>> +                       radeon_emit(cs, info->block[2]);
>> +               }
>>         }
>>  }
>>
>>  static void si_emit_dispatch_packets(struct si_context *sctx,
>>                                       const struct pipe_grid_info *info)
>>  {
>>         struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>>         bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
>>         unsigned waves_per_threadgroup =
>>                 DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 64);
>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
>> index 0b7de18..730ee21 100644
>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>> @@ -1783,30 +1783,35 @@ static void declare_system_value(
>>
>>         case TGSI_SEMANTIC_GRID_SIZE:
>>                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
>>                 break;
>>
>>         case TGSI_SEMANTIC_BLOCK_SIZE:
>>         {
>>                 LLVMValueRef values[3];
>>                 unsigned i;
>>                 unsigned *properties = ctx->shader->selector->info.properties;
>> -               unsigned sizes[3] = {
>> -                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
>> -                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
>> -                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
>> -               };
>>
>> -               for (i = 0; i < 3; ++i)
>> -                       values[i] = lp_build_const_int32(gallivm, sizes[i]);
>> +               if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
>> +                       unsigned sizes[3] = {
>> +                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
>> +                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
>> +                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
>> +                       };
>> +
>> +                       for (i = 0; i < 3; ++i)
>> +                               values[i] = lp_build_const_int32(gallivm, sizes[i]);
>>
>> -               value = lp_build_gather_values(gallivm, values, 3);
>> +                       value = lp_build_gather_values(gallivm, values, 3);
>> +               } else {
>> +                       value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE);
>> +               }
>>                 break;
>>         }
>>
>>         case TGSI_SEMANTIC_BLOCK_ID:
>>                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
>>                 break;
>>
>>         case TGSI_SEMANTIC_THREAD_ID:
>>                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
>>                 break;
>> @@ -5705,20 +5710,21 @@ static void create_function(struct si_shader_context *ctx)
>>
>>                         for (i = 0; i < num_return_sgprs; i++)
>>                                 returns[i] = ctx->i32;
>>                         for (; i < num_returns; i++)
>>                                 returns[i] = ctx->f32;
>>                 }
>>                 break;
>>
>>         case PIPE_SHADER_COMPUTE:
>>                 params[SI_PARAM_GRID_SIZE] = v3i32;
>> +               params[SI_PARAM_BLOCK_SIZE] = v3i32;
>>                 params[SI_PARAM_BLOCK_ID] = v3i32;
>>                 last_sgpr = SI_PARAM_BLOCK_ID;
>>
>>                 params[SI_PARAM_THREAD_ID] = v3i32;
>>                 num_params = SI_PARAM_THREAD_ID + 1;
>>                 break;
>>         default:
>>                 assert(0 && "unimplemented shader");
>>                 return;
>>         }
>> @@ -5741,21 +5747,26 @@ static void create_function(struct si_shader_context *ctx)
>>                                           S_0286D0_LINEAR_CENTROID_ENA(1) |
>>                                           S_0286D0_FRONT_FACE_ENA(1) |
>>                                           S_0286D0_POS_FIXED_PT_ENA(1));
>>         } else if (ctx->type == PIPE_SHADER_COMPUTE) {
>>                 const unsigned *properties = shader->selector->info.properties;
>>                 unsigned max_work_group_size =
>>                                properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
>>                                properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
>>                                properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
>>
>> -               assert(max_work_group_size);
>> +               if (!max_work_group_size) {
>> +                       /* This is a variable group size compute shader,
>> +                        * compile it for the maximum possible group size.
>> +                        */
>> +                       max_work_group_size = 2048;
>> +               }
>
> The VGPR limit is 32. That's too limiting. I'd prefer it if
> GL_MAX_COMPUTE_VARIABLE_WORK_GROUP_INVOCATIONS_ARB was set to 512 or
> 1024. That should move the limit to 128 or 64, respectively.

FWIW I'd definitely like to see that for nouveau eventually as well
(basically have the same issue, where we have to limit the # of regs
used artificially). But perhaps the initial iteration can just work
with the same maxes, and then be improved as a follow-on?

  -ilia