[Mesa-dev] [PATCH 2/2] radv: inline push constants where possible.

Samuel Pitoiset samuel.pitoiset at gmail.com
Thu Jan 11 21:30:34 UTC 2018



On 01/11/2018 04:03 AM, Dave Airlie wrote:
> From: Dave Airlie <airlied at redhat.com>
> 
> Instead of putting the push constants into the upload buffer,
> if we have space in the sgprs we can upload the per-stage
> constants into the shaders directly.
> 
> This saves a few reads from memory in the meta shaders.
> We should also be able to inline other objects, like
> descriptors.
> 
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
>   src/amd/common/ac_nir_to_llvm.c  | 93 ++++++++++++++++++++++++++++++++++++----
>   src/amd/common/ac_nir_to_llvm.h  |  4 ++
>   src/amd/common/ac_shader_info.c  |  5 ++-
>   src/amd/common/ac_shader_info.h  |  1 +
>   src/amd/vulkan/radv_cmd_buffer.c | 74 ++++++++++++++++++++++++--------
>   5 files changed, 150 insertions(+), 27 deletions(-)
> 
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index c00220a9c3..818ce40168 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -92,6 +92,7 @@ struct nir_to_llvm_context {
>   	LLVMValueRef descriptor_sets[AC_UD_MAX_SETS];
>   	LLVMValueRef ring_offsets;
>   	LLVMValueRef push_constants;
> +	LLVMValueRef inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
>   	LLVMValueRef view_index;
>   	LLVMValueRef num_work_groups;
>   	LLVMValueRef workgroup_ids[3];
> @@ -243,7 +244,7 @@ static void set_llvm_calling_convention(LLVMValueRef func,
>   	LLVMSetFunctionCallConv(func, calling_conv);
>   }
>   
> -#define MAX_ARGS 23
> +#define MAX_ARGS 32
>   struct arg_info {
>   	LLVMTypeRef types[MAX_ARGS];
>   	LLVMValueRef *assign[MAX_ARGS];
> @@ -538,6 +539,8 @@ struct user_sgpr_info {
>   	bool need_ring_offsets;
>   	uint8_t sgpr_count;
>   	bool indirect_all_descriptor_sets;
> +	uint8_t base_inline_push_consts;
> +	uint8_t num_inline_push_consts;
>   };
>   
>   static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
> @@ -609,8 +612,45 @@ static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
>   	} else {
>   		user_sgpr_info->sgpr_count += util_bitcount(ctx->shader_info->info.desc_set_used_mask) * 2;
>   	}
> +
> +	if (ctx->shader_info->info.loads_push_constants) {
> +		uint32_t remaining_sgprs = 16 - user_sgpr_info->sgpr_count;

This can be 32 on GFX9. Also, please have a look at the TODO above: the 
number of user SGPRs has to be fixed first. That should be quite trivial, 
but I haven't sent a patch yet.

> +		if (!ctx->shader_info->info.has_indirect_push_constants &&
> +		    !ctx->shader_info->info.loads_dynamic_offsets)
> +			remaining_sgprs += 2;
> +
> +		if (ctx->options->layout->push_constant_size) {
> +			uint8_t num_32bit_push_consts = (ctx->shader_info->info.max_push_constant_used -
> +							 ctx->shader_info->info.min_push_constant_used) / 4;
> +			user_sgpr_info->base_inline_push_consts = ctx->shader_info->info.min_push_constant_used / 4;
> +
> +			if (num_32bit_push_consts < remaining_sgprs) {
> +				user_sgpr_info->num_inline_push_consts = num_32bit_push_consts;
> +				if (!ctx->shader_info->info.has_indirect_push_constants)
> +					ctx->shader_info->info.loads_push_constants = false;
> +			} else {
> +				user_sgpr_info->num_inline_push_consts = remaining_sgprs;
> +			}
> +
> +			if (user_sgpr_info->num_inline_push_consts > AC_UD_MAX_INLINE_PUSH_CONST)
> +				user_sgpr_info->num_inline_push_consts = AC_UD_MAX_INLINE_PUSH_CONST;
> +		}
> +	}
>   }
>   
> +static void
> +declare_inline_push_consts(struct nir_to_llvm_context *ctx,
> +			   gl_shader_stage stage,
> +			   const struct user_sgpr_info *user_sgpr_info,
> +			   struct arg_info *args)
> +{
> +	ctx->shader_info->inline_push_const_mask = (1 << user_sgpr_info->num_inline_push_consts) - 1;
> +	ctx->shader_info->inline_push_const_mask <<= user_sgpr_info->base_inline_push_consts;
> +
> +	for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
> +		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->inline_push_consts[i]);
> +
> +}
>   static void
>   declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
>   			   gl_shader_stage stage,
> @@ -644,6 +684,9 @@ declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
>   		/* 1 for push constants and dynamic descriptors */
>   		add_array_arg(args, type, &ctx->push_constants);
>   	}
> +
> +	if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)))
> +		declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
>   }
>   
>   static void
> @@ -651,6 +694,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
>   				gl_shader_stage stage,
>   				bool has_previous_stage,
>   				gl_shader_stage previous_stage,
> +				const struct user_sgpr_info *user_sgpr_info,
>   				struct arg_info *args)
>   {
>   	if (!ctx->is_gs_copy_shader &&
> @@ -660,6 +704,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
>   			add_arg(args, ARG_SGPR, const_array(ctx->ac.v4i32, 16),
>   				&ctx->vertex_buffers);
>   		}
> +		declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
>   		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
>   		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance);
>   		if (ctx->shader_info->info.vs.needs_draw_id) {
> @@ -693,6 +738,16 @@ declare_tes_input_vgprs(struct nir_to_llvm_context *ctx, struct arg_info *args)
>   	add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id);
>   }
>   
> +static void
> +set_inline_pushconst_locs(struct nir_to_llvm_context *ctx,
> +			  const struct user_sgpr_info *user_sgpr_info,
> +			  uint8_t *user_sgpr_idx)
> +{
> +	ctx->shader_info->user_sgprs_locs.push_const_base = user_sgpr_info->base_inline_push_consts;
> +	for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
> +		set_loc(&ctx->shader_info->user_sgprs_locs.inline_push_consts[i], user_sgpr_idx, 1, 0);
> +}
> +
>   static void
>   set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
>   		      bool has_previous_stage, gl_shader_stage previous_stage,
> @@ -734,12 +789,17 @@ set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
>   	if (ctx->shader_info->info.loads_push_constants) {
>   		set_loc_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
>   	}
> +
> +
> +	if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)))
> +		set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
>   }
>   
>   static void
>   set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
>   			   gl_shader_stage stage, bool has_previous_stage,
>   			   gl_shader_stage previous_stage,
> +			   const struct user_sgpr_info *user_sgpr_info,
>   			   uint8_t *user_sgpr_idx)
>   {
>   	if (!ctx->is_gs_copy_shader &&
> @@ -750,6 +810,7 @@ set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
>   				       user_sgpr_idx, 2);
>   		}
>   
> +		set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
>   		unsigned vs_num = 2;
>   		if (ctx->shader_info->info.vs.needs_draw_id)
>   			vs_num++;
> @@ -805,7 +866,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>   					   previous_stage, &user_sgpr_info,
>   					   &args, &desc_sets);
>   		declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage,
> -						previous_stage, &args);
> +						previous_stage, &user_sgpr_info, &args);
>   
>   		if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls && ctx->options->key.has_multiview_view_index))
>   			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index);
> @@ -838,7 +899,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>   						   &desc_sets);
>   			declare_vs_specific_input_sgprs(ctx, stage,
>   							has_previous_stage,
> -							previous_stage, &args);
> +							previous_stage, &user_sgpr_info, &args);
>   
>   			add_arg(&args, ARG_SGPR, ctx->ac.i32,
>   				&ctx->ls_out_layout);
> @@ -934,7 +995,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>   			} else {
>   				declare_vs_specific_input_sgprs(ctx, stage,
>   								has_previous_stage,
> -								previous_stage,
> +								previous_stage, &user_sgpr_info,
>   								&args);
>   			}
>   
> @@ -1076,7 +1137,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>   		break;
>   	case MESA_SHADER_VERTEX:
>   		set_vs_specific_input_locs(ctx, stage, has_previous_stage,
> -					   previous_stage, &user_sgpr_idx);
> +					   previous_stage, &user_sgpr_info, &user_sgpr_idx);
>   		if (ctx->view_index)
>   			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
>   		if (ctx->options->key.vs.as_ls) {
> @@ -1088,7 +1149,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>   		break;
>   	case MESA_SHADER_TESS_CTRL:
>   		set_vs_specific_input_locs(ctx, stage, has_previous_stage,
> -					   previous_stage, &user_sgpr_idx);
> +					   previous_stage, &user_sgpr_info, &user_sgpr_idx);
>   		if (has_previous_stage)
>   			set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT,
>   				       &user_sgpr_idx, 1);
> @@ -1108,6 +1169,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>   				set_vs_specific_input_locs(ctx, stage,
>   							   has_previous_stage,
>   							   previous_stage,
> +							   &user_sgpr_info,
>   							   &user_sgpr_idx);
>   			else
>   				set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT,
> @@ -2357,9 +2419,24 @@ static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx,
>                                                nir_intrinsic_instr *instr)
>   {
>   	LLVMValueRef ptr, addr;
> +	LLVMValueRef src0 = get_src(ctx->nir, instr->src[0]);
> +	unsigned index = nir_intrinsic_base(instr);
> +
> +	if (LLVMIsConstant(src0)) {
> +		unsigned array_index = index;
> +		array_index += LLVMConstIntGetZExtValue(src0);
> +		array_index /= 4;
> +
> +		uint32_t bits = ((1 << instr->num_components) - 1) << array_index;
> +
> +		if ((bits & ctx->shader_info->inline_push_const_mask) == bits) {
> +			array_index -= ctx->shader_info->user_sgprs_locs.push_const_base;
> +			return ac_build_gather_values(&ctx->ac, &ctx->inline_push_consts[array_index], instr->num_components);
> +		}
> +	}
>   
> -	addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
> -	addr = LLVMBuildAdd(ctx->builder, addr, get_src(ctx->nir, instr->src[0]), "");
> +	addr = LLVMConstInt(ctx->ac.i32, index, 0);
> +	addr = LLVMBuildAdd(ctx->builder, addr, src0, "");
>   
>   	ptr = ac_build_gep0(&ctx->ac, ctx->push_constants, addr);
>   	ptr = cast_ptr(ctx, ptr, get_def_type(ctx->nir, &instr->dest.ssa));
> diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
> index b3ad0a0985..9f9230d3e6 100644
> --- a/src/amd/common/ac_nir_to_llvm.h
> +++ b/src/amd/common/ac_nir_to_llvm.h
> @@ -127,10 +127,13 @@ enum ac_ud_index {
>   
>   // Match MAX_SETS from radv_descriptor_set.h
>   #define AC_UD_MAX_SETS MAX_SETS
> +#define AC_UD_MAX_INLINE_PUSH_CONST 8
>   
>   struct ac_userdata_locations {
>   	struct ac_userdata_info descriptor_sets[AC_UD_MAX_SETS];
>   	struct ac_userdata_info shader_data[AC_UD_MAX_UD];
> +	struct ac_userdata_info inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
> +	uint8_t push_const_base;
>   };
>   
>   struct ac_vs_output_info {
> @@ -156,6 +159,7 @@ struct ac_shader_variant_info {
>   	unsigned num_user_sgprs;
>   	unsigned num_input_sgprs;
>   	unsigned num_input_vgprs;
> +	uint32_t inline_push_const_mask;
>   	bool need_indirect_descriptor_sets;
>   	struct {
>   		struct {
> diff --git a/src/amd/common/ac_shader_info.c b/src/amd/common/ac_shader_info.c
> index 18fa9e1c94..fbb46684ae 100644
> --- a/src/amd/common/ac_shader_info.c
> +++ b/src/amd/common/ac_shader_info.c
> @@ -179,9 +179,10 @@ ac_nir_shader_info_pass(struct nir_shader *nir,
>   {
>   	struct nir_function *func = (struct nir_function *)exec_list_get_head(&nir->functions);
>   
> -
> -	if (options->layout->dynamic_offset_count)
> +	if (options->layout->dynamic_offset_count) {
>   		info->loads_push_constants = true;
> +		info->loads_dynamic_offsets = true;
> +	}
>   
>   	nir_foreach_variable(variable, &nir->inputs)
>   		gather_info_input_decl(nir, options, variable, info);
> diff --git a/src/amd/common/ac_shader_info.h b/src/amd/common/ac_shader_info.h
> index e35cde0ca9..e8ea33f2e3 100644
> --- a/src/amd/common/ac_shader_info.h
> +++ b/src/amd/common/ac_shader_info.h
> @@ -32,6 +32,7 @@ struct ac_shader_info {
>   	uint8_t min_push_constant_used;
>   	uint8_t max_push_constant_used;
>   	bool has_indirect_push_constants;
> +	bool loads_dynamic_offsets;
>   	bool loads_push_constants;
>   	bool needs_multiview_view_index;
>   	bool uses_invocation_id;
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> index 60f19fb12b..17306eeaf8 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -1807,6 +1807,27 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
>   	assert(cmd_buffer->cs->cdw <= cdw_max);
>   }
>   
> +static struct ac_userdata_info *
> +radv_lookup_push_const_sgpr(struct radv_shader_variant *shader,
> +			    int idx)
> +{
> +	idx -= shader->info.user_sgprs_locs.push_const_base;
> +	return &shader->info.user_sgprs_locs.inline_push_consts[idx];
> +}
> +
> +static void
> +radv_emit_inline_pushconsts(struct radv_cmd_buffer *cmd_buffer,
> +			    struct radv_shader_variant *shader,
> +			    unsigned base_reg,
> +			    int idx, int count, uint32_t *values)
> +{
> +	struct ac_userdata_info *loc = radv_lookup_push_const_sgpr(shader, idx);
> +	assert (loc->sgpr_idx == -1);
> +	assert (!loc->indirect);
> +	radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
> +	radeon_emit_array(cmd_buffer->cs, values, count);
> +}
> +
>   static void
>   radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
>   		     struct radv_pipeline *pipeline,
> @@ -1816,36 +1837,55 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
>   	unsigned offset;
>   	void *ptr;
>   	uint64_t va;
> +	bool need_push_constants = false;
>   
>   	stages &= cmd_buffer->push_constant_stages;
>   	if (!stages ||
>   	    (!layout->push_constant_size && !layout->dynamic_offset_count))
>   		return;
>   
> -	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
> -					  16 * layout->dynamic_offset_count,
> -					  256, &offset, &ptr))
> -		return;
> +	radv_foreach_stage(stage, stages) {
> +		if (!pipeline->shaders[stage])
> +			continue;
> +
> +		need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
> +		need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;
>   
> -	memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
> -	if (layout->dynamic_offset_count) {
> -		memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
> -		       16 * layout->dynamic_offset_count);
> +		uint32_t mask = pipeline->shaders[stage]->info.inline_push_const_mask;
> +		uint32_t base_reg = pipeline->user_data_0[stage];
> +		while (mask) {
> +			int start, count;
> +			u_bit_scan_consecutive_range(&mask, &start, &count);
> +			radv_emit_inline_pushconsts(cmd_buffer, pipeline->shaders[stage], base_reg,
> +						    start, count, (uint32_t *)&cmd_buffer->push_constants[start * 4]);
> +		}
>   	}
>   
> -	va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
> -	va += offset;
> +	if (need_push_constants) {
> +		if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
> +						  16 * layout->dynamic_offset_count,
> +						  256, &offset, &ptr))
> +			return;
>   
> -	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
> -	                                                   cmd_buffer->cs, MESA_SHADER_STAGES * 4);
> +		memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
> +		if (layout->dynamic_offset_count) {
> +			memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
> +			       16 * layout->dynamic_offset_count);
> +		}
>   
> -	radv_foreach_stage(stage, stages) {
> -		if (pipeline->shaders[stage]) {
> -			radv_emit_userdata_address(cmd_buffer, pipeline, stage,
> -						   AC_UD_PUSH_CONSTANTS, va);
> +		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
> +		va += offset;
> +
> +		MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
> +								   cmd_buffer->cs, MESA_SHADER_STAGES * 4);
> +
> +		radv_foreach_stage(stage, stages) {
> +			if (pipeline->shaders[stage]) {
> +				radv_emit_userdata_address(cmd_buffer, pipeline, stage,
> +							   AC_UD_PUSH_CONSTANTS, va);
> +			}
>   		}
>   	}
> -
>   	cmd_buffer->push_constant_stages &= ~stages;
>   	assert(cmd_buffer->cs->cdw <= cdw_max);
>   }
> 


More information about the mesa-dev mailing list