[Mesa-dev] [PATCH] radeonsi: support TGSI compute shaders with variable block size
Nicolai Hähnle
nhaehnle at gmail.com
Fri Sep 9 16:11:55 UTC 2016
On 09.09.2016 18:07, Samuel Pitoiset wrote:
>
>
> On 09/09/2016 02:37 PM, Ilia Mirkin wrote:
>> On Fri, Sep 9, 2016 at 8:29 AM, Marek Olšák <maraeo at gmail.com> wrote:
>>> On Fri, Sep 9, 2016 at 10:12 AM, Nicolai Hähnle <nhaehnle at gmail.com>
>>> wrote:
>>>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>>>
>>>> Not sure if it's possible to avoid programming the block size twice
>>>> (once for
>>>> the userdata and once for the dispatch).
>>>>
>>>> Since the shaders are compiled with a pessimistic upper limit on the
>>>> number of
>>>> registers, asynchronously compiling variants may be worth
>>>> considering in the
>>>> future if we observe the shaders to be dispatched with small block
>>>> sizes.
>>>> ---
>>>> I think this is sufficient to support variable group sizes on
>>>> radeonsi, but
>>>> it's completely untested. Do you keep the latest version of your
>>>> series in a
>>>> public repository somewhere?
>>>>
>>>> src/gallium/drivers/radeonsi/si_compute.c | 10 +++++++++-
>>>> src/gallium/drivers/radeonsi/si_shader.c | 29
>>>> ++++++++++++++++++++---------
>>>> src/gallium/drivers/radeonsi/si_shader.h | 4 +++-
>>>> 3 files changed, 32 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c
>>>> b/src/gallium/drivers/radeonsi/si_compute.c
>>>> index 5041761..26e096c 100644
>>>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>>>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>>>> @@ -379,25 +379,33 @@ static void si_setup_tgsi_grid(struct
>>>> si_context *sctx,
>>>> for (i = 0; i < 3; ++i) {
>>>> radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
>>>> radeon_emit(cs,
>>>> COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>>>>
>>>> COPY_DATA_DST_SEL(COPY_DATA_REG));
>>>> radeon_emit(cs, (va + 4 * i));
>>>> radeon_emit(cs, (va + 4 * i) >> 32);
>>>> radeon_emit(cs, (grid_size_reg >> 2) + i);
>>>> radeon_emit(cs, 0);
>>>> }
>>>> } else {
>>>> + struct si_compute *program =
>>>> sctx->cs_shader_state.program;
>>>> + bool variable_group_size =
>>>> +
>>>> program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]
>>>> == 0;
>>>>
>>>> - radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
>>>> + radeon_set_sh_reg_seq(cs, grid_size_reg,
>>>> variable_group_size ? 6 : 3);
>>>> radeon_emit(cs, info->grid[0]);
>>>> radeon_emit(cs, info->grid[1]);
>>>> radeon_emit(cs, info->grid[2]);
>>>> + if (variable_group_size) {
>>>> + radeon_emit(cs, info->block[0]);
>>>> + radeon_emit(cs, info->block[1]);
>>>> + radeon_emit(cs, info->block[2]);
>>>> + }
>>>> }
>>>> }
>>>>
>>>> static void si_emit_dispatch_packets(struct si_context *sctx,
>>>> const struct pipe_grid_info
>>>> *info)
>>>> {
>>>> struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>>>> bool render_cond_bit = sctx->b.render_cond &&
>>>> !sctx->b.render_cond_force_off;
>>>> unsigned waves_per_threadgroup =
>>>> DIV_ROUND_UP(info->block[0] * info->block[1] *
>>>> info->block[2], 64);
>>>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>>>> b/src/gallium/drivers/radeonsi/si_shader.c
>>>> index 0b7de18..730ee21 100644
>>>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>>>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>>>> @@ -1783,30 +1783,35 @@ static void declare_system_value(
>>>>
>>>> case TGSI_SEMANTIC_GRID_SIZE:
>>>> value = LLVMGetParam(radeon_bld->main_fn,
>>>> SI_PARAM_GRID_SIZE);
>>>> break;
>>>>
>>>> case TGSI_SEMANTIC_BLOCK_SIZE:
>>>> {
>>>> LLVMValueRef values[3];
>>>> unsigned i;
>>>> unsigned *properties =
>>>> ctx->shader->selector->info.properties;
>>>> - unsigned sizes[3] = {
>>>> - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
>>>> -
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
>>>> - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
>>>> - };
>>>>
>>>> - for (i = 0; i < 3; ++i)
>>>> - values[i] = lp_build_const_int32(gallivm,
>>>> sizes[i]);
>>>> + if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]
>>>> != 0) {
>>>> + unsigned sizes[3] = {
>>>> +
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
>>>> +
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
>>>> +
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
>>>> + };
>>>> +
>>>> + for (i = 0; i < 3; ++i)
>>>> + values[i] =
>>>> lp_build_const_int32(gallivm, sizes[i]);
>>>>
>>>> - value = lp_build_gather_values(gallivm, values, 3);
>>>> + value = lp_build_gather_values(gallivm,
>>>> values, 3);
>>>> + } else {
>>>> + value = LLVMGetParam(radeon_bld->main_fn,
>>>> SI_PARAM_BLOCK_SIZE);
>>>> + }
>>>> break;
>>>> }
>>>>
>>>> case TGSI_SEMANTIC_BLOCK_ID:
>>>> value = LLVMGetParam(radeon_bld->main_fn,
>>>> SI_PARAM_BLOCK_ID);
>>>> break;
>>>>
>>>> case TGSI_SEMANTIC_THREAD_ID:
>>>> value = LLVMGetParam(radeon_bld->main_fn,
>>>> SI_PARAM_THREAD_ID);
>>>> break;
>>>> @@ -5705,20 +5710,21 @@ static void create_function(struct
>>>> si_shader_context *ctx)
>>>>
>>>> for (i = 0; i < num_return_sgprs; i++)
>>>> returns[i] = ctx->i32;
>>>> for (; i < num_returns; i++)
>>>> returns[i] = ctx->f32;
>>>> }
>>>> break;
>>>>
>>>> case PIPE_SHADER_COMPUTE:
>>>> params[SI_PARAM_GRID_SIZE] = v3i32;
>>>> + params[SI_PARAM_BLOCK_SIZE] = v3i32;
>>>> params[SI_PARAM_BLOCK_ID] = v3i32;
>>>> last_sgpr = SI_PARAM_BLOCK_ID;
>>>>
>>>> params[SI_PARAM_THREAD_ID] = v3i32;
>>>> num_params = SI_PARAM_THREAD_ID + 1;
>>>> break;
>>>> default:
>>>> assert(0 && "unimplemented shader");
>>>> return;
>>>> }
>>>> @@ -5741,21 +5747,26 @@ static void create_function(struct
>>>> si_shader_context *ctx)
>>>>
>>>> S_0286D0_LINEAR_CENTROID_ENA(1) |
>>>> S_0286D0_FRONT_FACE_ENA(1) |
>>>>
>>>> S_0286D0_POS_FIXED_PT_ENA(1));
>>>> } else if (ctx->type == PIPE_SHADER_COMPUTE) {
>>>> const unsigned *properties =
>>>> shader->selector->info.properties;
>>>> unsigned max_work_group_size =
>>>>
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
>>>>
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
>>>>
>>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
>>>>
>>>> - assert(max_work_group_size);
>>>> + if (!max_work_group_size) {
>>>> + /* This is a variable group size compute
>>>> shader,
>>>> + * compile it for the maximum possible group
>>>> size.
>>>> + */
>>>> + max_work_group_size = 2048;
>>>> + }
>>>
>>> The VGPR limit is 32. That's too limiting. I'd prefer it if
>>> GL_MAX_COMPUTE_VARIABLE_WORK_GROUP_INVOCATIONS_ARB was set to 512 or
>>> 1024. That should move the limit to 128 or 64, respectively.
>>
>> FWIW I'd definitely like to see that for nouveau eventually as well
>> (basically have the same issue, where we have to limit the # of regs
>> used artificially). But perhaps the initial iteration can just work
>> with the same maxes, and then be improved as a follow-on?
>
> What about adding a new cap for MaxComputeVariableGroupSizeARB and
> enable the ext based on that cap (suggested by Marek and I do agree) ?
Yes, please. That would solve all the issues nicely.
Nicolai
>>
>> -ilia
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>
More information about the mesa-dev
mailing list