[Mesa-dev] [PATCH] radeonsi: support TGSI compute shaders with variable block size
Samuel Pitoiset
samuel.pitoiset at gmail.com
Fri Sep 9 16:06:29 UTC 2016
On 09/09/2016 10:12 AM, Nicolai Hähnle wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> Not sure if it's possible to avoid programming the block size twice (once for
> the userdata and once for the dispatch).
>
> Since the shaders are compiled with a pessimistic upper limit on the number of
> registers, asynchronously compiling variants may be worth considering in the
> future if we observe the shaders to be dispatched with small block sizes.
> ---
> I think this is sufficient to support variable group sizes on radeonsi, but
> it's completely untested. Do you keep the latest version of your series in a
> public repository somewhere?
Yes, you can find the current version here:
https://cgit.freedesktop.org/~hakzsam/mesa/log/?h=arb_compute_variable_group_size
The next revision will be pushed to the arb_compute_variable_group_size_v1 branch
(once I have fixed everything locally).
>
> src/gallium/drivers/radeonsi/si_compute.c | 10 +++++++++-
> src/gallium/drivers/radeonsi/si_shader.c | 29 ++++++++++++++++++++---------
> src/gallium/drivers/radeonsi/si_shader.h | 4 +++-
> 3 files changed, 32 insertions(+), 11 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 5041761..26e096c 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -379,25 +379,33 @@ static void si_setup_tgsi_grid(struct si_context *sctx,
> for (i = 0; i < 3; ++i) {
> radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
> radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
> COPY_DATA_DST_SEL(COPY_DATA_REG));
> radeon_emit(cs, (va + 4 * i));
> radeon_emit(cs, (va + 4 * i) >> 32);
> radeon_emit(cs, (grid_size_reg >> 2) + i);
> radeon_emit(cs, 0);
> }
> } else {
> + struct si_compute *program = sctx->cs_shader_state.program;
> + bool variable_group_size =
> + program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
>
> - radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
> + radeon_set_sh_reg_seq(cs, grid_size_reg, variable_group_size ? 6 : 3);
> radeon_emit(cs, info->grid[0]);
> radeon_emit(cs, info->grid[1]);
> radeon_emit(cs, info->grid[2]);
> + if (variable_group_size) {
> + radeon_emit(cs, info->block[0]);
> + radeon_emit(cs, info->block[1]);
> + radeon_emit(cs, info->block[2]);
> + }
> }
> }
>
> static void si_emit_dispatch_packets(struct si_context *sctx,
> const struct pipe_grid_info *info)
> {
> struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
> unsigned waves_per_threadgroup =
> DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 64);
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 0b7de18..730ee21 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -1783,30 +1783,35 @@ static void declare_system_value(
>
> case TGSI_SEMANTIC_GRID_SIZE:
> value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
> break;
>
> case TGSI_SEMANTIC_BLOCK_SIZE:
> {
> LLVMValueRef values[3];
> unsigned i;
> unsigned *properties = ctx->shader->selector->info.properties;
> - unsigned sizes[3] = {
> - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
> - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
> - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
> - };
>
> - for (i = 0; i < 3; ++i)
> - values[i] = lp_build_const_int32(gallivm, sizes[i]);
> + if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
> + unsigned sizes[3] = {
> + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
> + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
> + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
> + };
> +
> + for (i = 0; i < 3; ++i)
> + values[i] = lp_build_const_int32(gallivm, sizes[i]);
>
> - value = lp_build_gather_values(gallivm, values, 3);
> + value = lp_build_gather_values(gallivm, values, 3);
> + } else {
> + value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE);
> + }
> break;
> }
>
> case TGSI_SEMANTIC_BLOCK_ID:
> value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
> break;
>
> case TGSI_SEMANTIC_THREAD_ID:
> value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
> break;
> @@ -5705,20 +5710,21 @@ static void create_function(struct si_shader_context *ctx)
>
> for (i = 0; i < num_return_sgprs; i++)
> returns[i] = ctx->i32;
> for (; i < num_returns; i++)
> returns[i] = ctx->f32;
> }
> break;
>
> case PIPE_SHADER_COMPUTE:
> params[SI_PARAM_GRID_SIZE] = v3i32;
> + params[SI_PARAM_BLOCK_SIZE] = v3i32;
> params[SI_PARAM_BLOCK_ID] = v3i32;
> last_sgpr = SI_PARAM_BLOCK_ID;
>
> params[SI_PARAM_THREAD_ID] = v3i32;
> num_params = SI_PARAM_THREAD_ID + 1;
> break;
> default:
> assert(0 && "unimplemented shader");
> return;
> }
> @@ -5741,21 +5747,26 @@ static void create_function(struct si_shader_context *ctx)
> S_0286D0_LINEAR_CENTROID_ENA(1) |
> S_0286D0_FRONT_FACE_ENA(1) |
> S_0286D0_POS_FIXED_PT_ENA(1));
> } else if (ctx->type == PIPE_SHADER_COMPUTE) {
> const unsigned *properties = shader->selector->info.properties;
> unsigned max_work_group_size =
> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
> properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
>
> - assert(max_work_group_size);
> + if (!max_work_group_size) {
> + /* This is a variable group size compute shader,
> + * compile it for the maximum possible group size.
> + */
> + max_work_group_size = 2048;
> + }
>
> radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
> "amdgpu-max-work-group-size",
> max_work_group_size);
> }
>
> shader->info.num_input_sgprs = 0;
> shader->info.num_input_vgprs = 0;
>
> for (i = 0; i <= last_sgpr; ++i)
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index fc1b22d..afdb3f5 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -120,21 +120,22 @@ enum {
> /* GS limits */
> SI_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
> SI_GSCOPY_NUM_USER_SGPR = SI_SGPR_RW_BUFFERS_HI + 1,
>
> /* PS only */
> SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
> SI_PS_NUM_USER_SGPR,
>
> /* CS only */
> SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS,
> - SI_CS_NUM_USER_SGPR = SI_SGPR_GRID_SIZE + 3
> + SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3,
> + SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3
> };
>
> /* LLVM function parameter indices */
> enum {
> SI_PARAM_RW_BUFFERS,
> SI_PARAM_CONST_BUFFERS,
> SI_PARAM_SAMPLERS,
> SI_PARAM_IMAGES,
> SI_PARAM_SHADER_BUFFERS,
> SI_NUM_RESOURCE_PARAMS,
> @@ -210,20 +211,21 @@ enum {
> SI_PARAM_POS_Y_FLOAT,
> SI_PARAM_POS_Z_FLOAT,
> SI_PARAM_POS_W_FLOAT,
> SI_PARAM_FRONT_FACE,
> SI_PARAM_ANCILLARY,
> SI_PARAM_SAMPLE_COVERAGE,
> SI_PARAM_POS_FIXED_PT,
>
> /* CS only parameters */
> SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS,
> + SI_PARAM_BLOCK_SIZE,
> SI_PARAM_BLOCK_ID,
> SI_PARAM_THREAD_ID,
>
> SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */
> };
>
> /* SI-specific system values. */
> enum {
> TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI = TGSI_SEMANTIC_COUNT,
> TGSI_SEMANTIC_DEFAULT_TESSINNER_SI,
>
More information about the mesa-dev
mailing list