[Mesa-dev] [PATCH] radeonsi: support TGSI compute shaders with variable block size
Samuel Pitoiset
samuel.pitoiset at gmail.com
Fri Sep 9 16:07:44 UTC 2016
On 09/09/2016 02:37 PM, Ilia Mirkin wrote:
> On Fri, Sep 9, 2016 at 8:29 AM, Marek Olšák <maraeo at gmail.com> wrote:
>> On Fri, Sep 9, 2016 at 10:12 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>>
>>> Not sure if it's possible to avoid programming the block size twice (once for
>>> the userdata and once for the dispatch).
>>>
>>> Since the shaders are compiled with a pessimistic upper limit on the number of
>>> registers, asynchronously compiling variants may be worth considering in the
>>> future if we observe the shaders to be dispatched with small block sizes.
>>> ---
>>> I think this is sufficient to support variable group sizes on radeonsi, but
>>> it's completely untested. Do you keep the latest version of your series in a
>>> public repository somewhere?
>>>
>>> src/gallium/drivers/radeonsi/si_compute.c | 10 +++++++++-
>>> src/gallium/drivers/radeonsi/si_shader.c | 29 ++++++++++++++++++++---------
>>> src/gallium/drivers/radeonsi/si_shader.h | 4 +++-
>>> 3 files changed, 32 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
>>> index 5041761..26e096c 100644
>>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>>> @@ -379,25 +379,33 @@ static void si_setup_tgsi_grid(struct si_context *sctx,
>>> for (i = 0; i < 3; ++i) {
>>> radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
>>> radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>>> COPY_DATA_DST_SEL(COPY_DATA_REG));
>>> radeon_emit(cs, (va + 4 * i));
>>> radeon_emit(cs, (va + 4 * i) >> 32);
>>> radeon_emit(cs, (grid_size_reg >> 2) + i);
>>> radeon_emit(cs, 0);
>>> }
>>> } else {
>>> + struct si_compute *program = sctx->cs_shader_state.program;
>>> + bool variable_group_size =
>>> + program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
>>>
>>> - radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
>>> + radeon_set_sh_reg_seq(cs, grid_size_reg, variable_group_size ? 6 : 3);
>>> radeon_emit(cs, info->grid[0]);
>>> radeon_emit(cs, info->grid[1]);
>>> radeon_emit(cs, info->grid[2]);
>>> + if (variable_group_size) {
>>> + radeon_emit(cs, info->block[0]);
>>> + radeon_emit(cs, info->block[1]);
>>> + radeon_emit(cs, info->block[2]);
>>> + }
>>> }
>>> }
>>>
>>> static void si_emit_dispatch_packets(struct si_context *sctx,
>>> const struct pipe_grid_info *info)
>>> {
>>> struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>>> bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
>>> unsigned waves_per_threadgroup =
>>> DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 64);
>>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
>>> index 0b7de18..730ee21 100644
>>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>>> @@ -1783,30 +1783,35 @@ static void declare_system_value(
>>>
>>> case TGSI_SEMANTIC_GRID_SIZE:
>>> value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
>>> break;
>>>
>>> case TGSI_SEMANTIC_BLOCK_SIZE:
>>> {
>>> LLVMValueRef values[3];
>>> unsigned i;
>>> unsigned *properties = ctx->shader->selector->info.properties;
>>> - unsigned sizes[3] = {
>>> - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
>>> - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
>>> - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
>>> - };
>>>
>>> - for (i = 0; i < 3; ++i)
>>> - values[i] = lp_build_const_int32(gallivm, sizes[i]);
>>> + if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
>>> + unsigned sizes[3] = {
>>> + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
>>> + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
>>> + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
>>> + };
>>> +
>>> + for (i = 0; i < 3; ++i)
>>> + values[i] = lp_build_const_int32(gallivm, sizes[i]);
>>>
>>> - value = lp_build_gather_values(gallivm, values, 3);
>>> + value = lp_build_gather_values(gallivm, values, 3);
>>> + } else {
>>> + value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE);
>>> + }
>>> break;
>>> }
>>>
>>> case TGSI_SEMANTIC_BLOCK_ID:
>>> value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
>>> break;
>>>
>>> case TGSI_SEMANTIC_THREAD_ID:
>>> value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
>>> break;
>>> @@ -5705,20 +5710,21 @@ static void create_function(struct si_shader_context *ctx)
>>>
>>> for (i = 0; i < num_return_sgprs; i++)
>>> returns[i] = ctx->i32;
>>> for (; i < num_returns; i++)
>>> returns[i] = ctx->f32;
>>> }
>>> break;
>>>
>>> case PIPE_SHADER_COMPUTE:
>>> params[SI_PARAM_GRID_SIZE] = v3i32;
>>> + params[SI_PARAM_BLOCK_SIZE] = v3i32;
>>> params[SI_PARAM_BLOCK_ID] = v3i32;
>>> last_sgpr = SI_PARAM_BLOCK_ID;
>>>
>>> params[SI_PARAM_THREAD_ID] = v3i32;
>>> num_params = SI_PARAM_THREAD_ID + 1;
>>> break;
>>> default:
>>> assert(0 && "unimplemented shader");
>>> return;
>>> }
>>> @@ -5741,21 +5747,26 @@ static void create_function(struct si_shader_context *ctx)
>>> S_0286D0_LINEAR_CENTROID_ENA(1) |
>>> S_0286D0_FRONT_FACE_ENA(1) |
>>> S_0286D0_POS_FIXED_PT_ENA(1));
>>> } else if (ctx->type == PIPE_SHADER_COMPUTE) {
>>> const unsigned *properties = shader->selector->info.properties;
>>> unsigned max_work_group_size =
>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
>>>
>>> - assert(max_work_group_size);
>>> + if (!max_work_group_size) {
>>> + /* This is a variable group size compute shader,
>>> + * compile it for the maximum possible group size.
>>> + */
>>> + max_work_group_size = 2048;
>>> + }
>>
>> The VGPR limit is 32. That's too limiting. I'd prefer it if
>> GL_MAX_COMPUTE_VARIABLE_WORK_GROUP_INVOCATIONS_ARB was set to 512 or
>> 1024. That should move the limit to 128 or 64, respectively.
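
For reference, a minimal standalone sketch of the occupancy arithmetic
behind those numbers; the hardware constants (64-wide waves, 4 SIMDs per
CU, 256 VGPRs per SIMD) are assumptions stated for illustration, not taken
from the patch:

#include <stdio.h>

int main(void)
{
    /* Assumed GCN parameters, illustrative only: a whole threadgroup must
     * be resident on one CU, so its waves share the per-SIMD VGPR file. */
    const unsigned wave_size = 64;
    const unsigned simds_per_cu = 4;
    const unsigned vgprs_per_simd = 256;
    const unsigned group_sizes[] = { 2048, 1024, 512 };

    for (int i = 0; i < 3; i++) {
        unsigned waves = (group_sizes[i] + wave_size - 1) / wave_size;
        unsigned waves_per_simd = (waves + simds_per_cu - 1) / simds_per_cu;
        unsigned vgpr_budget = vgprs_per_simd / waves_per_simd;

        /* Prints: 2048 -> 32 VGPRs, 1024 -> 64, 512 -> 128. */
        printf("group size %4u: %2u waves, ~%3u VGPRs per thread\n",
               group_sizes[i], waves, vgpr_budget);
    }
    return 0;
}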
>
> FWIW I'd definitely like to see that for nouveau eventually as well
> (basically have the same issue, where we have to limit the # of regs
> used artificially). But perhaps the initial iteration can just work
> with the same maxes, and then be improved as a follow-on?
What about adding a new cap for MaxComputeVariableGroupSizeARB and
enabling the extension based on that cap (as Marek suggested, and I agree)?
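
For what it's worth, a minimal hypothetical sketch of how such a cap could
replace the hardcoded 2048 as the compile-time bound; the define and helper
below are made-up names for illustration, not existing Mesa interfaces:

#include <stdio.h>

/* Assumed value a driver would advertise for the new cap; purely
 * illustrative. */
#define MAX_VARIABLE_THREADS_PER_BLOCK 1024

static unsigned max_workgroup_size(unsigned width, unsigned height,
                                   unsigned depth)
{
    unsigned fixed = width * height * depth;

    /* A 0x0x0 fixed block size marks an ARB_compute_variable_group_size
     * shader: fall back to the advertised cap instead of the absolute
     * hardware maximum. */
    return fixed ? fixed : MAX_VARIABLE_THREADS_PER_BLOCK;
}

int main(void)
{
    printf("fixed 8x8x1 -> %u\n", max_workgroup_size(8, 8, 1));
    printf("variable    -> %u\n", max_workgroup_size(0, 0, 0));
    return 0;
}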
>
> -ilia
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>