[Mesa-dev] [PATCH] radeonsi: support TGSI compute shaders with variable block size

Fri Sep 9 16:11:55 UTC 2016

On 09.09.2016 18:07, Samuel Pitoiset wrote:
>
>
> On 09/09/2016 02:37 PM, Ilia Mirkin wrote:
>> On Fri, Sep 9, 2016 at 8:29 AM, Marek Olšák <maraeo at gmail.com> wrote:
>>> On Fri, Sep 9, 2016 at 10:12 AM, Nicolai Hähnle <nhaehnle at gmail.com>
>>> wrote:
>>>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>>>
>>>> Not sure if it's possible to avoid programming the block size twice
>>>> (once for
>>>> the userdata and once for the dispatch).
>>>>
>>>> Since the shaders are compiled with a pessimistic upper limit on the
>>>> number of
>>>> registers, asynchronously compiling variants may be worth
>>>> considering in the
>>>> future if we observe the shaders to be dispatched with small block
>>>> sizes.
>>>> ---
>>>> I think this is sufficient to support variable group sizes on
>>>> radeonsi, but
>>>> it's completely untested. Do you keep the latest version of your
>>>> series in a
>>>> public repository somewhere?
>>>>
>>>>  src/gallium/drivers/radeonsi/si_compute.c | 10 +++++++++-
>>>>  src/gallium/drivers/radeonsi/si_shader.c  | 29
>>>> ++++++++++++++++++++---------
>>>>  src/gallium/drivers/radeonsi/si_shader.h  |  4 +++-
>>>>  3 files changed, 32 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c
>>>> b/src/gallium/drivers/radeonsi/si_compute.c
>>>> index 5041761..26e096c 100644
>>>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>>>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>>>> @@ -379,25 +379,33 @@ static void si_setup_tgsi_grid(struct
>>>> si_context *sctx,
>>>>                 for (i = 0; i < 3; ++i) {
>>>>                         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
>>>>                         radeon_emit(cs,
>>>> COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>>>>
>>>> COPY_DATA_DST_SEL(COPY_DATA_REG));
>>>>                         radeon_emit(cs, (va +  4 * i));
>>>>                         radeon_emit(cs, (va + 4 * i) >> 32);
>>>>                         radeon_emit(cs, (grid_size_reg >> 2) + i);
>>>>                         radeon_emit(cs, 0);
>>>>                 }
>>>>         } else {
>>>> +               struct si_compute *program =
>>>> sctx->cs_shader_state.program;
>>>> +               bool variable_group_size =
>>>> +
>>>> program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]
>>>> == 0;
>>>>
>>>> -               radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
>>>> +               radeon_set_sh_reg_seq(cs, grid_size_reg,
>>>> variable_group_size ? 6 : 3);
>>>>                 radeon_emit(cs, info->grid[0]);
>>>>                 radeon_emit(cs, info->grid[1]);
>>>>                 radeon_emit(cs, info->grid[2]);
>>>> +               if (variable_group_size) {
>>>> +                       radeon_emit(cs, info->block[0]);
>>>> +                       radeon_emit(cs, info->block[1]);
>>>> +                       radeon_emit(cs, info->block[2]);
>>>> +               }
>>>>         }
>>>>  }
>>>>
>>>>  static void si_emit_dispatch_packets(struct si_context *sctx,
>>>>                                       const struct pipe_grid_info
>>>> *info)
>>>>  {
>>>>         struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>>>>         bool render_cond_bit = sctx->b.render_cond &&
>>>> !sctx->b.render_cond_force_off;
>>>>         unsigned waves_per_threadgroup =
>>>>                 DIV_ROUND_UP(info->block[0] * info->block[1] *
>>>> info->block[2], 64);
>>>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>>>> b/src/gallium/drivers/radeonsi/si_shader.c
>>>> index 0b7de18..730ee21 100644
>>>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>>>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>>>> @@ -1783,30 +1783,35 @@ static void declare_system_value(
>>>>
>>>>         case TGSI_SEMANTIC_GRID_SIZE:
>>>>                 value = LLVMGetParam(radeon_bld->main_fn,
>>>> SI_PARAM_GRID_SIZE);
>>>>                 break;
>>>>
>>>>         case TGSI_SEMANTIC_BLOCK_SIZE:
>>>>         {
>>>>                 LLVMValueRef values[3];
>>>>                 unsigned i;
>>>>                 unsigned *properties =
>>>> ctx->shader->selector->info.properties;
>>>> -               unsigned sizes[3] = {
>>>> -                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
>>>> -
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
>>>> -                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
>>>> -               };
>>>>
>>>> -               for (i = 0; i < 3; ++i)
>>>> -                       values[i] = lp_build_const_int32(gallivm,
>>>> sizes[i]);
>>>> +               if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]
>>>> != 0) {
>>>> +                       unsigned sizes[3] = {
>>>> +
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
>>>> +
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
>>>> +
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
>>>> +                       };
>>>> +
>>>> +                       for (i = 0; i < 3; ++i)
>>>> +                               values[i] =
>>>> lp_build_const_int32(gallivm, sizes[i]);
>>>>
>>>> -               value = lp_build_gather_values(gallivm, values, 3);
>>>> +                       value = lp_build_gather_values(gallivm,
>>>> values, 3);
>>>> +               } else {
>>>> +                       value = LLVMGetParam(radeon_bld->main_fn,
>>>> SI_PARAM_BLOCK_SIZE);
>>>> +               }
>>>>                 break;
>>>>         }
>>>>
>>>>         case TGSI_SEMANTIC_BLOCK_ID:
>>>>                 value = LLVMGetParam(radeon_bld->main_fn,
>>>> SI_PARAM_BLOCK_ID);
>>>>                 break;
>>>>
>>>>         case TGSI_SEMANTIC_THREAD_ID:
>>>>                 value = LLVMGetParam(radeon_bld->main_fn,
>>>> SI_PARAM_THREAD_ID);
>>>>                 break;
>>>> @@ -5705,20 +5710,21 @@ static void create_function(struct
>>>> si_shader_context *ctx)
>>>>
>>>>                         for (i = 0; i < num_return_sgprs; i++)
>>>>                                 returns[i] = ctx->i32;
>>>>                         for (; i < num_returns; i++)
>>>>                                 returns[i] = ctx->f32;
>>>>                 }
>>>>                 break;
>>>>
>>>>         case PIPE_SHADER_COMPUTE:
>>>>                 params[SI_PARAM_GRID_SIZE] = v3i32;
>>>> +               params[SI_PARAM_BLOCK_SIZE] = v3i32;
>>>>                 params[SI_PARAM_BLOCK_ID] = v3i32;
>>>>                 last_sgpr = SI_PARAM_BLOCK_ID;
>>>>
>>>>                 params[SI_PARAM_THREAD_ID] = v3i32;
>>>>                 num_params = SI_PARAM_THREAD_ID + 1;
>>>>                 break;
>>>>         default:
>>>>                 assert(0 && "unimplemented shader");
>>>>                 return;
>>>>         }
>>>> @@ -5741,21 +5747,26 @@ static void create_function(struct
>>>> si_shader_context *ctx)
>>>>
>>>> S_0286D0_LINEAR_CENTROID_ENA(1) |
>>>>                                           S_0286D0_FRONT_FACE_ENA(1) |
>>>>
>>>> S_0286D0_POS_FIXED_PT_ENA(1));
>>>>         } else if (ctx->type == PIPE_SHADER_COMPUTE) {
>>>>                 const unsigned *properties =
>>>> shader->selector->info.properties;
>>>>                 unsigned max_work_group_size =
>>>>
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
>>>>
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
>>>>
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
>>>>
>>>> -               assert(max_work_group_size);
>>>> +               if (!max_work_group_size) {
>>>> +                       /* This is a variable group size compute
>>>> shader,
>>>> +                        * compile it for the maximum possible group
>>>> size.
>>>> +                        */
>>>> +                       max_work_group_size = 2048;
>>>> +               }
>>>
>>> The VGPR limit is 32. That's too limiting. I'd prefer it if
>>> GL_MAX_COMPUTE_VARIABLE_WORK_GROUP_INVOCATIONS_ARB was set to 512 or
>>> 1024. That should move the limit to 128 or 64, respectively.
>>
>> FWIW I'd definitely like to see that for nouveau eventually as well
>> (basically have the same issue, where we have to limit the # of regs
>> used artificially). But perhaps the initial iteration can just work
>> with the same maxes, and then be improved as a follow-on?
>
> What about adding a new cap for MaxComputeVariableGroupSizeARB and
> enable the ext based on that cap (suggested by Marek and I do agree) ?

Yes, please. That would solve all the issues nicely.

Nicolai

>>
>>   -ilia
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>