[Mesa-dev] [PATCH] radeonsi: support TGSI compute shaders with variable block size
Samuel Pitoiset
samuel.pitoiset at gmail.com
Fri Sep 9 16:07:44 UTC 2016
On 09/09/2016 02:37 PM, Ilia Mirkin wrote:
> On Fri, Sep 9, 2016 at 8:29 AM, Marek Olšák <maraeo at gmail.com> wrote:
>> On Fri, Sep 9, 2016 at 10:12 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>>
>>> Not sure if it's possible to avoid programming the block size twice (once for
>>> the userdata and once for the dispatch).
>>>
>>> Since the shaders are compiled with a pessimistic upper limit on the number of
>>> registers, asynchronously compiling variants may be worth considering in the
>>> future if we observe the shaders to be dispatched with small block sizes.
>>> ---
>>> I think this is sufficient to support variable group sizes on radeonsi, but
>>> it's completely untested. Do you keep the latest version of your series in a
>>> public repository somewhere?
>>>
>>> src/gallium/drivers/radeonsi/si_compute.c | 10 +++++++++-
>>> src/gallium/drivers/radeonsi/si_shader.c | 29 ++++++++++++++++++++---------
>>> src/gallium/drivers/radeonsi/si_shader.h | 4 +++-
>>> 3 files changed, 32 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
>>> index 5041761..26e096c 100644
>>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>>> @@ -379,25 +379,33 @@ static void si_setup_tgsi_grid(struct si_context *sctx,
>>> for (i = 0; i < 3; ++i) {
>>> radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
>>> radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>>> COPY_DATA_DST_SEL(COPY_DATA_REG));
>>> radeon_emit(cs, (va + 4 * i));
>>> radeon_emit(cs, (va + 4 * i) >> 32);
>>> radeon_emit(cs, (grid_size_reg >> 2) + i);
>>> radeon_emit(cs, 0);
>>> }
>>> } else {
>>> + struct si_compute *program = sctx->cs_shader_state.program;
>>> + bool variable_group_size =
>>> + program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
>>>
>>> - radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
>>> + radeon_set_sh_reg_seq(cs, grid_size_reg, variable_group_size ? 6 : 3);
>>> radeon_emit(cs, info->grid[0]);
>>> radeon_emit(cs, info->grid[1]);
>>> radeon_emit(cs, info->grid[2]);
>>> + if (variable_group_size) {
>>> + radeon_emit(cs, info->block[0]);
>>> + radeon_emit(cs, info->block[1]);
>>> + radeon_emit(cs, info->block[2]);
>>> + }
>>> }
>>> }
>>>
>>> static void si_emit_dispatch_packets(struct si_context *sctx,
>>> const struct pipe_grid_info *info)
>>> {
>>> struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>>> bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
>>> unsigned waves_per_threadgroup =
>>> DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 64);
>>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
>>> index 0b7de18..730ee21 100644
>>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>>> @@ -1783,30 +1783,35 @@ static void declare_system_value(
>>>
>>> case TGSI_SEMANTIC_GRID_SIZE:
>>> value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
>>> break;
>>>
>>> case TGSI_SEMANTIC_BLOCK_SIZE:
>>> {
>>> LLVMValueRef values[3];
>>> unsigned i;
>>> unsigned *properties = ctx->shader->selector->info.properties;
>>> - unsigned sizes[3] = {
>>> - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
>>> - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
>>> - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
>>> - };
>>>
>>> - for (i = 0; i < 3; ++i)
>>> - values[i] = lp_build_const_int32(gallivm, sizes[i]);
>>> + if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
>>> + unsigned sizes[3] = {
>>> + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
>>> + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
>>> + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
>>> + };
>>> +
>>> + for (i = 0; i < 3; ++i)
>>> + values[i] = lp_build_const_int32(gallivm, sizes[i]);
>>>
>>> - value = lp_build_gather_values(gallivm, values, 3);
>>> + value = lp_build_gather_values(gallivm, values, 3);
>>> + } else {
>>> + value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE);
>>> + }
>>> break;
>>> }
>>>
>>> case TGSI_SEMANTIC_BLOCK_ID:
>>> value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
>>> break;
>>>
>>> case TGSI_SEMANTIC_THREAD_ID:
>>> value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
>>> break;
>>> @@ -5705,20 +5710,21 @@ static void create_function(struct si_shader_context *ctx)
>>>
>>> for (i = 0; i < num_return_sgprs; i++)
>>> returns[i] = ctx->i32;
>>> for (; i < num_returns; i++)
>>> returns[i] = ctx->f32;
>>> }
>>> break;
>>>
>>> case PIPE_SHADER_COMPUTE:
>>> params[SI_PARAM_GRID_SIZE] = v3i32;
>>> + params[SI_PARAM_BLOCK_SIZE] = v3i32;
>>> params[SI_PARAM_BLOCK_ID] = v3i32;
>>> last_sgpr = SI_PARAM_BLOCK_ID;
>>>
>>> params[SI_PARAM_THREAD_ID] = v3i32;
>>> num_params = SI_PARAM_THREAD_ID + 1;
>>> break;
>>> default:
>>> assert(0 && "unimplemented shader");
>>> return;
>>> }
>>> @@ -5741,21 +5747,26 @@ static void create_function(struct si_shader_context *ctx)
>>> S_0286D0_LINEAR_CENTROID_ENA(1) |
>>> S_0286D0_FRONT_FACE_ENA(1) |
>>> S_0286D0_POS_FIXED_PT_ENA(1));
>>> } else if (ctx->type == PIPE_SHADER_COMPUTE) {
>>> const unsigned *properties = shader->selector->info.properties;
>>> unsigned max_work_group_size =
>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
>>> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
>>>
>>> - assert(max_work_group_size);
>>> + if (!max_work_group_size) {
>>> + /* This is a variable group size compute shader,
>>> + * compile it for the maximum possible group size.
>>> + */
>>> + max_work_group_size = 2048;
>>> + }
>>
>> The VGPR limit is 32. That's too limiting. I'd prefer it if
>> GL_MAX_COMPUTE_VARIABLE_WORK_GROUP_INVOCATIONS_ARB was set to 512 or
>> 1024. That should move the limit to 128 or 64, respectively.
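
For reference, a minimal standalone sketch of the occupancy arithmetic
behind those numbers; the hardware constants (64-wide waves, 4 SIMDs per
CU, 256 VGPRs per SIMD) are assumptions stated for illustration, not taken
from the patch:

#include <stdio.h>

int main(void)
{
    /* Assumed GCN parameters, illustrative only: a whole threadgroup must
     * be resident on one CU, so its waves share the per-SIMD VGPR file. */
    const unsigned wave_size = 64;
    const unsigned simds_per_cu = 4;
    const unsigned vgprs_per_simd = 256;
    const unsigned group_sizes[] = { 2048, 1024, 512 };

    for (int i = 0; i < 3; i++) {
        unsigned waves = (group_sizes[i] + wave_size - 1) / wave_size;
        unsigned waves_per_simd = (waves + simds_per_cu - 1) / simds_per_cu;
        unsigned vgpr_budget = vgprs_per_simd / waves_per_simd;

        /* Prints: 2048 -> 32 VGPRs, 1024 -> 64, 512 -> 128. */
        printf("group size %4u: %2u waves, ~%3u VGPRs per thread\n",
               group_sizes[i], waves, vgpr_budget);
    }
    return 0;
}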
>
> FWIW I'd definitely like to see that for nouveau eventually as well
> (basically have the same issue, where we have to limit the # of regs
> used artificially). But perhaps the initial iteration can just work
> with the same maxes, and then be improved as a follow-on?
What about adding a new cap for MaxComputeVariableGroupSizeARB and
enabling the extension based on that cap (as Marek suggested, and I agree)?
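
For what it's worth, a minimal hypothetical sketch of how such a cap could
replace the hardcoded 2048 as the compile-time bound; the define and helper
below are made-up names for illustration, not existing Mesa interfaces:

#include <stdio.h>

/* Assumed value a driver would advertise for the new cap; purely
 * illustrative. */
#define MAX_VARIABLE_THREADS_PER_BLOCK 1024

static unsigned max_workgroup_size(unsigned width, unsigned height,
                                   unsigned depth)
{
    unsigned fixed = width * height * depth;

    /* A 0x0x0 fixed block size marks an ARB_compute_variable_group_size
     * shader: fall back to the advertised cap instead of the absolute
     * hardware maximum. */
    return fixed ? fixed : MAX_VARIABLE_THREADS_PER_BLOCK;
}

int main(void)
{
    printf("fixed 8x8x1 -> %u\n", max_workgroup_size(8, 8, 1));
    printf("variable    -> %u\n", max_workgroup_size(0, 0, 0));
    return 0;
}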
>
> -ilia
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>