[Mesa-dev] [PATCH 2/2] [rfc] radv: inline push constants where possible. (v2)
Alex Smith
asmith at feralinteractive.com
Fri Jan 12 09:49:56 UTC 2018
Looks like it's working fine here now. One comment inline below.
On 12 January 2018 at 02:43, Dave Airlie <airlied at gmail.com> wrote:
> From: Dave Airlie <airlied at redhat.com>
>
> Instead of putting the push constants into the upload buffer,
> if we have space in the sgprs we can upload the per-stage
> constants into the shaders directly.
>
> This saves a few reads from memory in the meta shaders,
> we should also be able to inline other objects like
> descriptors.
>
> v2: fixup 16->available_sgprs (Samuel)
> fixup dynamic offsets. (Alex)
> bump to 12.
> handle push consts > 32 better, avoid F1 2017 crash
>
> TODO: proper vega support (Samuel)
>
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
> src/amd/common/ac_nir_to_llvm.c | 102 +++++++++++++++++++++++++++++++++++----
> src/amd/common/ac_nir_to_llvm.h | 5 ++
> src/amd/common/ac_shader_info.c | 5 +-
> src/amd/common/ac_shader_info.h | 1 +
> src/amd/vulkan/radv_cmd_buffer.c | 75 +++++++++++++++++++++-------
> 5 files changed, 159 insertions(+), 29 deletions(-)
>
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 6c578de3aca..00ad76a82f7 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -92,6 +92,7 @@ struct nir_to_llvm_context {
> LLVMValueRef descriptor_sets[AC_UD_MAX_SETS];
> LLVMValueRef ring_offsets;
> LLVMValueRef push_constants;
> + LLVMValueRef inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
> LLVMValueRef view_index;
> LLVMValueRef num_work_groups;
> LLVMValueRef workgroup_ids[3];
> @@ -243,7 +244,7 @@ static void set_llvm_calling_convention(LLVMValueRef func,
> LLVMSetFunctionCallConv(func, calling_conv);
> }
>
> -#define MAX_ARGS 23
> +#define MAX_ARGS 32
> struct arg_info {
> LLVMTypeRef types[MAX_ARGS];
> LLVMValueRef *assign[MAX_ARGS];
> @@ -538,6 +539,8 @@ struct user_sgpr_info {
> bool need_ring_offsets;
> uint8_t sgpr_count;
> bool indirect_all_descriptor_sets;
> + uint8_t base_inline_push_consts;
> + uint8_t num_inline_push_consts;
> };
>
> static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
> @@ -609,8 +612,49 @@ static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
> } else {
> user_sgpr_info->sgpr_count += util_bitcount(ctx->shader_info->info.desc_set_used_mask) * 2;
> }
> +
> + if (ctx->shader_info->info.loads_push_constants) {
> + uint32_t remaining_sgprs = available_sgprs - user_sgpr_info->sgpr_count;
> + if (!ctx->shader_info->info.has_indirect_push_constants &&
> + !ctx->shader_info->info.loads_dynamic_offsets)
> + remaining_sgprs += 2;
> +
> + if (ctx->options->layout->push_constant_size) {
> + uint8_t num_32bit_push_consts = (ctx->shader_info->info.max_push_constant_used -
> + ctx->shader_info->info.min_push_constant_used) / 4;
> +
> + if ((ctx->shader_info->info.min_push_constant_used / 4) <= 63 &&
> + (ctx->shader_info->info.max_push_constant_used / 4) <= 63) {
> + user_sgpr_info->base_inline_push_consts = ctx->shader_info->info.min_push_constant_used / 4;
> +
> + if (num_32bit_push_consts < remaining_sgprs) {
> + user_sgpr_info->num_inline_push_consts = num_32bit_push_consts;
> + if (!ctx->shader_info->info.has_indirect_push_constants)
> + ctx->shader_info->info.loads_push_constants = false;
> + } else {
> + user_sgpr_info->num_inline_push_consts = remaining_sgprs;
> + }
> +
> + if (user_sgpr_info->num_inline_push_consts > AC_UD_MAX_INLINE_PUSH_CONST)
> + user_sgpr_info->num_inline_push_consts = AC_UD_MAX_INLINE_PUSH_CONST;
>
This is done after possibly setting loads_push_constants to false. If the
count gets clamped here, not all of the used push constants end up inlined,
so shouldn't loads_push_constants stay true in that case?
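E.g. something like this (untested sketch, just to illustrate the ordering
I mean; "num_inline" is only a local for illustration):

    uint8_t num_inline = num_32bit_push_consts;
    if (num_inline > remaining_sgprs)
            num_inline = remaining_sgprs;
    if (num_inline > AC_UD_MAX_INLINE_PUSH_CONST)
            num_inline = AC_UD_MAX_INLINE_PUSH_CONST;

    user_sgpr_info->num_inline_push_consts = num_inline;

    /* Only skip the in-memory push constants when every used
     * 32-bit constant actually got inlined. */
    if (num_inline == num_32bit_push_consts &&
        !ctx->shader_info->info.has_indirect_push_constants)
            ctx->shader_info->info.loads_push_constants = false;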
With that fixed, for the series:
Reviewed-by: Alex Smith <asmith at feralinteractive.com>
Thanks,
Alex
> + }
> + }
> + }
> }
>
> +static void
> +declare_inline_push_consts(struct nir_to_llvm_context *ctx,
> + gl_shader_stage stage,
> + const struct user_sgpr_info *user_sgpr_info,
> + struct arg_info *args)
> +{
> + ctx->shader_info->inline_push_const_mask = (1ULL << user_sgpr_info->num_inline_push_consts) - 1;
> + ctx->shader_info->inline_push_const_base = user_sgpr_info->base_inline_push_consts;
> +
> + for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
> + add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->inline_push_consts[i]);
> +
> +}
> static void
> declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
> gl_shader_stage stage,
> @@ -640,10 +684,14 @@ declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
> add_array_arg(args, const_array(type, 32), desc_sets);
> }
>
> - if (ctx->shader_info->info.loads_push_constants) {
> + if (ctx->shader_info->info.loads_push_constants ||
> + ctx->shader_info->info.loads_dynamic_offsets) {
> /* 1 for push constants and dynamic descriptors */
> add_array_arg(args, type, &ctx->push_constants);
> }
> +
> + if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)))
> + declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
> }
>
> static void
> @@ -651,6 +699,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
> gl_shader_stage stage,
> bool has_previous_stage,
> gl_shader_stage previous_stage,
> + const struct user_sgpr_info *user_sgpr_info,
> struct arg_info *args)
> {
> if (!ctx->is_gs_copy_shader &&
> @@ -660,6 +709,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
> add_arg(args, ARG_SGPR, const_array(ctx->ac.v4i32, 16),
> &ctx->vertex_buffers);
> }
> + declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
> add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
> add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance);
> if (ctx->shader_info->info.vs.needs_draw_id) {
> @@ -693,6 +743,16 @@ declare_tes_input_vgprs(struct nir_to_llvm_context *ctx, struct arg_info *args)
> add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id);
> }
>
> +static void
> +set_inline_pushconst_locs(struct nir_to_llvm_context *ctx,
> + const struct user_sgpr_info *user_sgpr_info,
> + uint8_t *user_sgpr_idx)
> +{
> + ctx->shader_info->user_sgprs_locs.push_const_base = user_sgpr_info->base_inline_push_consts;
> + for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
> + set_loc(&ctx->shader_info->user_sgprs_locs.inline_push_consts[i], user_sgpr_idx, 1, 0);
> +}
> +
> static void
> set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
> bool has_previous_stage, gl_shader_stage previous_stage,
> @@ -731,15 +791,21 @@ set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
> ctx->shader_info->need_indirect_descriptor_sets = true;
> }
>
> - if (ctx->shader_info->info.loads_push_constants) {
> + if (ctx->shader_info->info.loads_push_constants ||
> + ctx->shader_info->info.loads_dynamic_offsets) {
> set_loc_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
> }
> +
> +
> + if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)))
> + set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
> }
>
> static void
> set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
> gl_shader_stage stage, bool has_previous_stage,
> gl_shader_stage previous_stage,
> + const struct user_sgpr_info *user_sgpr_info,
> uint8_t *user_sgpr_idx)
> {
> if (!ctx->is_gs_copy_shader &&
> @@ -750,6 +816,7 @@ set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
> user_sgpr_idx, 2);
> }
>
> + set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
> unsigned vs_num = 2;
> if (ctx->shader_info->info.vs.needs_draw_id)
> vs_num++;
> @@ -805,7 +872,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
> previous_stage, &user_sgpr_info,
> &args, &desc_sets);
> declare_vs_specific_input_sgprs(ctx, stage,
> has_previous_stage,
> - previous_stage, &args);
> + previous_stage, &user_sgpr_info, &args);
>
> if (ctx->shader_info->info.needs_multiview_view_index ||
> (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls &&
> ctx->options->key.has_multiview_view_index))
> add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index);
> @@ -838,7 +905,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
> &desc_sets);
> declare_vs_specific_input_sgprs(ctx, stage,
> has_previous_stage,
> - previous_stage, &args);
> + previous_stage, &user_sgpr_info, &args);
>
> add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->ls_out_layout);
> @@ -934,7 +1001,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
> } else {
> declare_vs_specific_input_sgprs(ctx, stage,
> has_previous_stage,
> - previous_stage,
> + previous_stage, &user_sgpr_info,
> &args);
> }
>
> @@ -1076,7 +1143,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
> break;
> case MESA_SHADER_VERTEX:
> set_vs_specific_input_locs(ctx, stage, has_previous_stage,
> - previous_stage, &user_sgpr_idx);
> + previous_stage, &user_sgpr_info, &user_sgpr_idx);
> if (ctx->view_index)
> set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
> if (ctx->options->key.vs.as_ls) {
> @@ -1088,7 +1155,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
> break;
> case MESA_SHADER_TESS_CTRL:
> set_vs_specific_input_locs(ctx, stage, has_previous_stage,
> - previous_stage, &user_sgpr_idx);
> + previous_stage, &user_sgpr_info, &user_sgpr_idx);
> if (has_previous_stage)
> set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT,
> &user_sgpr_idx, 1);
> @@ -1108,6 +1175,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
> set_vs_specific_input_locs(ctx, stage,
> has_previous_stage,
> previous_stage,
> + &user_sgpr_info,
> &user_sgpr_idx);
> else
> set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT,
> @@ -2371,9 +2439,23 @@ static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx,
> nir_intrinsic_instr *instr)
> {
> LLVMValueRef ptr, addr;
> + LLVMValueRef src0 = get_src(ctx->nir, instr->src[0]);
> + unsigned index = nir_intrinsic_base(instr);
> +
> + if (LLVMIsConstant(src0)) {
> + unsigned array_index = index;
> + array_index += LLVMConstIntGetZExtValue(src0);
> + array_index /= 4;
> +
> + array_index -= ctx->shader_info->inline_push_const_base;
> + uint64_t bits = (((1ULL << instr->num_components) - 1) << array_index);
> + if ((bits & ctx->shader_info->inline_push_const_mask) == bits) {
> + return ac_build_gather_values(&ctx->ac, &ctx->inline_push_consts[array_index], instr->num_components);
> + }
> + }
>
> - addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
> - addr = LLVMBuildAdd(ctx->builder, addr, get_src(ctx->nir, instr->src[0]), "");
> + addr = LLVMConstInt(ctx->ac.i32, index, 0);
> + addr = LLVMBuildAdd(ctx->builder, addr, src0, "");
>
> ptr = ac_build_gep0(&ctx->ac, ctx->push_constants, addr);
> ptr = cast_ptr(ctx, ptr, get_def_type(ctx->nir, &instr->dest.ssa));
> diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
> index b3ad0a09857..22330fdfbc4 100644
> --- a/src/amd/common/ac_nir_to_llvm.h
> +++ b/src/amd/common/ac_nir_to_llvm.h
> @@ -127,10 +127,13 @@ enum ac_ud_index {
>
> // Match MAX_SETS from radv_descriptor_set.h
> #define AC_UD_MAX_SETS MAX_SETS
> +#define AC_UD_MAX_INLINE_PUSH_CONST 12
>
> struct ac_userdata_locations {
> struct ac_userdata_info descriptor_sets[AC_UD_MAX_SETS];
> struct ac_userdata_info shader_data[AC_UD_MAX_UD];
> + struct ac_userdata_info inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
> + uint8_t push_const_base;
> };
>
> struct ac_vs_output_info {
> @@ -156,6 +159,8 @@ struct ac_shader_variant_info {
> unsigned num_user_sgprs;
> unsigned num_input_sgprs;
> unsigned num_input_vgprs;
> + uint64_t inline_push_const_mask;
> + uint32_t inline_push_const_base;
> bool need_indirect_descriptor_sets;
> struct {
> struct {
> diff --git a/src/amd/common/ac_shader_info.c b/src/amd/common/ac_shader_info.c
> index 18fa9e1c94c..fbb46684ae3 100644
> --- a/src/amd/common/ac_shader_info.c
> +++ b/src/amd/common/ac_shader_info.c
> @@ -179,9 +179,10 @@ ac_nir_shader_info_pass(struct nir_shader *nir,
> {
> struct nir_function *func = (struct nir_function *)exec_list_get_head(&nir->functions);
>
> -
> - if (options->layout->dynamic_offset_count)
> + if (options->layout->dynamic_offset_count) {
> info->loads_push_constants = true;
> + info->loads_dynamic_offsets = true;
> + }
>
> nir_foreach_variable(variable, &nir->inputs)
> gather_info_input_decl(nir, options, variable, info);
> diff --git a/src/amd/common/ac_shader_info.h b/src/amd/common/ac_shader_info.h
> index e35cde0ca97..e8ea33f2e3a 100644
> --- a/src/amd/common/ac_shader_info.h
> +++ b/src/amd/common/ac_shader_info.h
> @@ -32,6 +32,7 @@ struct ac_shader_info {
> uint8_t min_push_constant_used;
> uint8_t max_push_constant_used;
> bool has_indirect_push_constants;
> + bool loads_dynamic_offsets;
> bool loads_push_constants;
> bool needs_multiview_view_index;
> bool uses_invocation_id;
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> index d870a9bedb3..e1953a3a095 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -1807,6 +1807,27 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
> assert(cmd_buffer->cs->cdw <= cdw_max);
> }
>
> +static struct ac_userdata_info *
> +radv_lookup_push_const_sgpr(struct radv_shader_variant *shader,
> + int idx)
> +{
> + idx -= shader->info.user_sgprs_locs.push_const_base;
> + return &shader->info.user_sgprs_locs.inline_push_consts[idx];
> +}
> +
> +static void
> +radv_emit_inline_pushconsts(struct radv_cmd_buffer *cmd_buffer,
> + struct radv_shader_variant *shader,
> + unsigned base_reg,
> + int idx, int count, uint32_t *values)
> +{
> + struct ac_userdata_info *loc = radv_lookup_push_const_sgpr(shader, idx);
> + assert (loc->sgpr_idx == -1);
> + assert (!loc->indirect);
> + radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
> + radeon_emit_array(cmd_buffer->cs, values, count);
> +}
> +
> static void
> radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
> struct radv_pipeline *pipeline,
> @@ -1816,36 +1837,56 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
> unsigned offset;
> void *ptr;
> uint64_t va;
> + bool need_push_constants = false;
>
> stages &= cmd_buffer->push_constant_stages;
> if (!stages ||
> (!layout->push_constant_size && !layout->dynamic_offset_count))
> return;
>
> - if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
> - 16 * layout->dynamic_offset_count,
> - 256, &offset, &ptr))
> - return;
> + radv_foreach_stage(stage, stages) {
> + if (!pipeline->shaders[stage])
> + continue;
> +
> + need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
> + need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;
>
> - memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
> - if (layout->dynamic_offset_count) {
> - memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
> - 16 * layout->dynamic_offset_count);
> + uint32_t mask = pipeline->shaders[stage]->info.inline_push_const_mask;
> + uint32_t base_reg = pipeline->user_data_0[stage];
> + while (mask) {
> + int start, count;
> + u_bit_scan_consecutive_range(&mask, &start, &count);
> + start += pipeline->shaders[stage]->info.inline_push_const_base;
> + radv_emit_inline_pushconsts(cmd_buffer, pipeline->shaders[stage], base_reg,
> + start, count, (uint32_t *)&cmd_buffer->push_constants[start * 4]);
> + }
> }
>
> - va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
> - va += offset;
> + if (need_push_constants) {
> + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
> + 16 * layout->dynamic_offset_count,
> + 256, &offset, &ptr))
> + return;
>
> - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
> - cmd_buffer->cs, MESA_SHADER_STAGES * 4);
> + memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
> + if (layout->dynamic_offset_count) {
> + memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
> + 16 * layout->dynamic_offset_count);
> + }
>
> - radv_foreach_stage(stage, stages) {
> - if (pipeline->shaders[stage]) {
> - radv_emit_userdata_address(cmd_buffer, pipeline, stage,
> - AC_UD_PUSH_CONSTANTS, va);
> + va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
> + va += offset;
> +
> + MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
> + cmd_buffer->cs, MESA_SHADER_STAGES * 4);
> +
> + radv_foreach_stage(stage, stages) {
> + if (pipeline->shaders[stage]) {
> + radv_emit_userdata_address(cmd_buffer, pipeline, stage,
> + AC_UD_PUSH_CONSTANTS, va);
> + }
> }
> }
> -
> cmd_buffer->push_constant_stages &= ~stages;
> assert(cmd_buffer->cs->cdw <= cdw_max);
> }
> --
> 2.14.3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>