[Mesa-dev] [PATCH 2/2] radv: inline push constants where possible.
Samuel Pitoiset
samuel.pitoiset at gmail.com
Thu Jan 11 21:30:34 UTC 2018
On 01/11/2018 04:03 AM, Dave Airlie wrote:
> From: Dave Airlie <airlied at redhat.com>
>
> Instead of putting the push constants into the upload buffer,
> if we have space in the sgprs we can upload the per-stage
> constants into the shaders directly.
>
> This saves a few reads from memory in the meta shaders; we should
> also be able to inline other objects, such as descriptors, in the
> same way.
>
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
> src/amd/common/ac_nir_to_llvm.c | 93 ++++++++++++++++++++++++++++++++++++----
> src/amd/common/ac_nir_to_llvm.h | 4 ++
> src/amd/common/ac_shader_info.c | 5 ++-
> src/amd/common/ac_shader_info.h | 1 +
> src/amd/vulkan/radv_cmd_buffer.c | 74 ++++++++++++++++++++++++--------
> 5 files changed, 150 insertions(+), 27 deletions(-)
>
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index c00220a9c3..818ce40168 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -92,6 +92,7 @@ struct nir_to_llvm_context {
> LLVMValueRef descriptor_sets[AC_UD_MAX_SETS];
> LLVMValueRef ring_offsets;
> LLVMValueRef push_constants;
> + LLVMValueRef inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
> LLVMValueRef view_index;
> LLVMValueRef num_work_groups;
> LLVMValueRef workgroup_ids[3];
> @@ -243,7 +244,7 @@ static void set_llvm_calling_convention(LLVMValueRef func,
> LLVMSetFunctionCallConv(func, calling_conv);
> }
>
> -#define MAX_ARGS 23
> +#define MAX_ARGS 32
> struct arg_info {
> LLVMTypeRef types[MAX_ARGS];
> LLVMValueRef *assign[MAX_ARGS];
> @@ -538,6 +539,8 @@ struct user_sgpr_info {
> bool need_ring_offsets;
> uint8_t sgpr_count;
> bool indirect_all_descriptor_sets;
> + uint8_t base_inline_push_consts;
> + uint8_t num_inline_push_consts;
> };
>
> static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
> @@ -609,8 +612,45 @@ static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
> } else {
> user_sgpr_info->sgpr_count += util_bitcount(ctx->shader_info->info.desc_set_used_mask) * 2;
> }
> +
> + if (ctx->shader_info->info.loads_push_constants) {
> + uint32_t remaining_sgprs = 16 - user_sgpr_info->sgpr_count;
This can be 32 on GFX9. Also, please have a look at the TODO above: the
number of user SGPRs has to be fixed first. That should be quite trivial,
but I haven't sent a patch for it yet.
> + if (!ctx->shader_info->info.has_indirect_push_constants &&
> + !ctx->shader_info->info.loads_dynamic_offsets)
> + remaining_sgprs += 2;
> +
> + if (ctx->options->layout->push_constant_size) {
> + uint8_t num_32bit_push_consts = (ctx->shader_info->info.max_push_constant_used -
> + ctx->shader_info->info.min_push_constant_used) / 4;
> + user_sgpr_info->base_inline_push_consts = ctx->shader_info->info.min_push_constant_used / 4;
> +
> + if (num_32bit_push_consts < remaining_sgprs) {
> + user_sgpr_info->num_inline_push_consts = num_32bit_push_consts;
> + if (!ctx->shader_info->info.has_indirect_push_constants)
> + ctx->shader_info->info.loads_push_constants = false;
> + } else {
> + user_sgpr_info->num_inline_push_consts = remaining_sgprs;
> + }
> +
> + if (user_sgpr_info->num_inline_push_consts > AC_UD_MAX_INLINE_PUSH_CONST)
> + user_sgpr_info->num_inline_push_consts = AC_UD_MAX_INLINE_PUSH_CONST;
> + }
> + }
> }
>
> +static void
> +declare_inline_push_consts(struct nir_to_llvm_context *ctx,
> + gl_shader_stage stage,
> + const struct user_sgpr_info *user_sgpr_info,
> + struct arg_info *args)
> +{
> + ctx->shader_info->inline_push_const_mask = (1 << user_sgpr_info->num_inline_push_consts) - 1;
> + ctx->shader_info->inline_push_const_mask <<= user_sgpr_info->base_inline_push_consts;
> +
> + for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
> + add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->inline_push_consts[i]);
> +
> +}
> static void
> declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
> gl_shader_stage stage,
> @@ -644,6 +684,9 @@ declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
> /* 1 for push constants and dynamic descriptors */
> add_array_arg(args, type, &ctx->push_constants);
> }
> +
> + if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)))
> + declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
> }
>
> static void
> @@ -651,6 +694,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
> gl_shader_stage stage,
> bool has_previous_stage,
> gl_shader_stage previous_stage,
> + const struct user_sgpr_info *user_sgpr_info,
> struct arg_info *args)
> {
> if (!ctx->is_gs_copy_shader &&
> @@ -660,6 +704,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
> add_arg(args, ARG_SGPR, const_array(ctx->ac.v4i32, 16),
> &ctx->vertex_buffers);
> }
> + declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
> add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
> add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance);
> if (ctx->shader_info->info.vs.needs_draw_id) {
> @@ -693,6 +738,16 @@ declare_tes_input_vgprs(struct nir_to_llvm_context *ctx, struct arg_info *args)
> add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id);
> }
>
> +static void
> +set_inline_pushconst_locs(struct nir_to_llvm_context *ctx,
> + const struct user_sgpr_info *user_sgpr_info,
> + uint8_t *user_sgpr_idx)
> +{
> + ctx->shader_info->user_sgprs_locs.push_const_base = user_sgpr_info->base_inline_push_consts;
> + for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
> + set_loc(&ctx->shader_info->user_sgprs_locs.inline_push_consts[i], user_sgpr_idx, 1, 0);
> +}
> +
> static void
> set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
> bool has_previous_stage, gl_shader_stage previous_stage,
> @@ -734,12 +789,17 @@ set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
> if (ctx->shader_info->info.loads_push_constants) {
> set_loc_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
> }
> +
> +
> + if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)))
> + set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
> }
>
> static void
> set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
> gl_shader_stage stage, bool has_previous_stage,
> gl_shader_stage previous_stage,
> + const struct user_sgpr_info *user_sgpr_info,
> uint8_t *user_sgpr_idx)
> {
> if (!ctx->is_gs_copy_shader &&
> @@ -750,6 +810,7 @@ set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
> user_sgpr_idx, 2);
> }
>
> + set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
> unsigned vs_num = 2;
> if (ctx->shader_info->info.vs.needs_draw_id)
> vs_num++;
> @@ -805,7 +866,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
> previous_stage, &user_sgpr_info,
> &args, &desc_sets);
> declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage,
> - previous_stage, &args);
> + previous_stage, &user_sgpr_info, &args);
>
> if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls && ctx->options->key.has_multiview_view_index))
> add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index);
> @@ -838,7 +899,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
> &desc_sets);
> declare_vs_specific_input_sgprs(ctx, stage,
> has_previous_stage,
> - previous_stage, &args);
> + previous_stage, &user_sgpr_info, &args);
>
> add_arg(&args, ARG_SGPR, ctx->ac.i32,
> &ctx->ls_out_layout);
> @@ -934,7 +995,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
> } else {
> declare_vs_specific_input_sgprs(ctx, stage,
> has_previous_stage,
> - previous_stage,
> + previous_stage, &user_sgpr_info,
> &args);
> }
>
> @@ -1076,7 +1137,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
> break;
> case MESA_SHADER_VERTEX:
> set_vs_specific_input_locs(ctx, stage, has_previous_stage,
> - previous_stage, &user_sgpr_idx);
> + previous_stage, &user_sgpr_info, &user_sgpr_idx);
> if (ctx->view_index)
> set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
> if (ctx->options->key.vs.as_ls) {
> @@ -1088,7 +1149,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
> break;
> case MESA_SHADER_TESS_CTRL:
> set_vs_specific_input_locs(ctx, stage, has_previous_stage,
> - previous_stage, &user_sgpr_idx);
> + previous_stage, &user_sgpr_info, &user_sgpr_idx);
> if (has_previous_stage)
> set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT,
> &user_sgpr_idx, 1);
> @@ -1108,6 +1169,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
> set_vs_specific_input_locs(ctx, stage,
> has_previous_stage,
> previous_stage,
> + &user_sgpr_info,
> &user_sgpr_idx);
> else
> set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT,
> @@ -2357,9 +2419,24 @@ static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx,
> nir_intrinsic_instr *instr)
> {
> LLVMValueRef ptr, addr;
> + LLVMValueRef src0 = get_src(ctx->nir, instr->src[0]);
> + unsigned index = nir_intrinsic_base(instr);
> +
> + if (LLVMIsConstant(src0)) {
> + unsigned array_index = index;
> + array_index += LLVMConstIntGetZExtValue(src0);
> + array_index /= 4;
> +
> + uint32_t bits = ((1 << instr->num_components) - 1) << array_index;
> +
> + if ((bits & ctx->shader_info->inline_push_const_mask) == bits) {
> + array_index -= ctx->shader_info->user_sgprs_locs.push_const_base;
> + return ac_build_gather_values(&ctx->ac, &ctx->inline_push_consts[array_index], instr->num_components);
> + }
> + }
>
> - addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
> - addr = LLVMBuildAdd(ctx->builder, addr, get_src(ctx->nir, instr->src[0]), "");
> + addr = LLVMConstInt(ctx->ac.i32, index, 0);
> + addr = LLVMBuildAdd(ctx->builder, addr, src0, "");
>
> ptr = ac_build_gep0(&ctx->ac, ctx->push_constants, addr);
> ptr = cast_ptr(ctx, ptr, get_def_type(ctx->nir, &instr->dest.ssa));
> diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
> index b3ad0a0985..9f9230d3e6 100644
> --- a/src/amd/common/ac_nir_to_llvm.h
> +++ b/src/amd/common/ac_nir_to_llvm.h
> @@ -127,10 +127,13 @@ enum ac_ud_index {
>
> // Match MAX_SETS from radv_descriptor_set.h
> #define AC_UD_MAX_SETS MAX_SETS
> +#define AC_UD_MAX_INLINE_PUSH_CONST 8
>
> struct ac_userdata_locations {
> struct ac_userdata_info descriptor_sets[AC_UD_MAX_SETS];
> struct ac_userdata_info shader_data[AC_UD_MAX_UD];
> + struct ac_userdata_info inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
> + uint8_t push_const_base;
> };
>
> struct ac_vs_output_info {
> @@ -156,6 +159,7 @@ struct ac_shader_variant_info {
> unsigned num_user_sgprs;
> unsigned num_input_sgprs;
> unsigned num_input_vgprs;
> + uint32_t inline_push_const_mask;
> bool need_indirect_descriptor_sets;
> struct {
> struct {
> diff --git a/src/amd/common/ac_shader_info.c b/src/amd/common/ac_shader_info.c
> index 18fa9e1c94..fbb46684ae 100644
> --- a/src/amd/common/ac_shader_info.c
> +++ b/src/amd/common/ac_shader_info.c
> @@ -179,9 +179,10 @@ ac_nir_shader_info_pass(struct nir_shader *nir,
> {
> struct nir_function *func = (struct nir_function *)exec_list_get_head(&nir->functions);
>
> -
> - if (options->layout->dynamic_offset_count)
> + if (options->layout->dynamic_offset_count) {
> info->loads_push_constants = true;
> + info->loads_dynamic_offsets = true;
> + }
>
> nir_foreach_variable(variable, &nir->inputs)
> gather_info_input_decl(nir, options, variable, info);
> diff --git a/src/amd/common/ac_shader_info.h b/src/amd/common/ac_shader_info.h
> index e35cde0ca9..e8ea33f2e3 100644
> --- a/src/amd/common/ac_shader_info.h
> +++ b/src/amd/common/ac_shader_info.h
> @@ -32,6 +32,7 @@ struct ac_shader_info {
> uint8_t min_push_constant_used;
> uint8_t max_push_constant_used;
> bool has_indirect_push_constants;
> + bool loads_dynamic_offsets;
> bool loads_push_constants;
> bool needs_multiview_view_index;
> bool uses_invocation_id;
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> index 60f19fb12b..17306eeaf8 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -1807,6 +1807,27 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
> assert(cmd_buffer->cs->cdw <= cdw_max);
> }
>
> +static struct ac_userdata_info *
> +radv_lookup_push_const_sgpr(struct radv_shader_variant *shader,
> + int idx)
> +{
> + idx -= shader->info.user_sgprs_locs.push_const_base;
> + return &shader->info.user_sgprs_locs.inline_push_consts[idx];
> +}
> +
> +static void
> +radv_emit_inline_pushconsts(struct radv_cmd_buffer *cmd_buffer,
> + struct radv_shader_variant *shader,
> + unsigned base_reg,
> + int idx, int count, uint32_t *values)
> +{
> + struct ac_userdata_info *loc = radv_lookup_push_const_sgpr(shader, idx);
> + assert (loc->sgpr_idx == -1);
> + assert (!loc->indirect);
> + radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
> + radeon_emit_array(cmd_buffer->cs, values, count);
> +}
> +
> static void
> radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
> struct radv_pipeline *pipeline,
> @@ -1816,36 +1837,55 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
> unsigned offset;
> void *ptr;
> uint64_t va;
> + bool need_push_constants = false;
>
> stages &= cmd_buffer->push_constant_stages;
> if (!stages ||
> (!layout->push_constant_size && !layout->dynamic_offset_count))
> return;
>
> - if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
> - 16 * layout->dynamic_offset_count,
> - 256, &offset, &ptr))
> - return;
> + radv_foreach_stage(stage, stages) {
> + if (!pipeline->shaders[stage])
> + continue;
> +
> + need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
> + need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;
>
> - memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
> - if (layout->dynamic_offset_count) {
> - memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
> - 16 * layout->dynamic_offset_count);
> + uint32_t mask = pipeline->shaders[stage]->info.inline_push_const_mask;
> + uint32_t base_reg = pipeline->user_data_0[stage];
> + while (mask) {
> + int start, count;
> + u_bit_scan_consecutive_range(&mask, &start, &count);
> + radv_emit_inline_pushconsts(cmd_buffer, pipeline->shaders[stage], base_reg,
> + start, count, (uint32_t *)&cmd_buffer->push_constants[start * 4]);
> + }
> }
>
> - va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
> - va += offset;
> + if (need_push_constants) {
> + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
> + 16 * layout->dynamic_offset_count,
> + 256, &offset, &ptr))
> + return;
>
> - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
> - cmd_buffer->cs, MESA_SHADER_STAGES * 4);
> + memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
> + if (layout->dynamic_offset_count) {
> + memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
> + 16 * layout->dynamic_offset_count);
> + }
>
> - radv_foreach_stage(stage, stages) {
> - if (pipeline->shaders[stage]) {
> - radv_emit_userdata_address(cmd_buffer, pipeline, stage,
> - AC_UD_PUSH_CONSTANTS, va);
> + va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
> + va += offset;
> +
> + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
> + cmd_buffer->cs, MESA_SHADER_STAGES * 4);
> +
> + radv_foreach_stage(stage, stages) {
> + if (pipeline->shaders[stage]) {
> + radv_emit_userdata_address(cmd_buffer, pipeline, stage,
> + AC_UD_PUSH_CONSTANTS, va);
> + }
> }
> }
> -
> cmd_buffer->push_constant_stages &= ~stages;
> assert(cmd_buffer->cs->cdw <= cdw_max);
> }
>
More information about the mesa-dev
mailing list