[Mesa-dev] [PATCH 2/2] radv: inline push constants where possible.

Dave Airlie airlied at gmail.com
Thu Jan 11 03:03:35 UTC 2018


From: Dave Airlie <airlied at redhat.com>

Instead of putting the push constants into the upload buffer,
if we have space in the sgprs we can upload the per-stage
constants into the shaders directly.

This saves a few reads from memory in the meta shaders;
we should also be able to inline other objects, such as
descriptors.

Signed-off-by: Dave Airlie <airlied at redhat.com>
---
 src/amd/common/ac_nir_to_llvm.c  | 93 ++++++++++++++++++++++++++++++++++++----
 src/amd/common/ac_nir_to_llvm.h  |  4 ++
 src/amd/common/ac_shader_info.c  |  5 ++-
 src/amd/common/ac_shader_info.h  |  1 +
 src/amd/vulkan/radv_cmd_buffer.c | 74 ++++++++++++++++++++++++--------
 5 files changed, 150 insertions(+), 27 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index c00220a9c3..818ce40168 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -92,6 +92,7 @@ struct nir_to_llvm_context {
 	LLVMValueRef descriptor_sets[AC_UD_MAX_SETS];
 	LLVMValueRef ring_offsets;
 	LLVMValueRef push_constants;
+	LLVMValueRef inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
 	LLVMValueRef view_index;
 	LLVMValueRef num_work_groups;
 	LLVMValueRef workgroup_ids[3];
@@ -243,7 +244,7 @@ static void set_llvm_calling_convention(LLVMValueRef func,
 	LLVMSetFunctionCallConv(func, calling_conv);
 }
 
-#define MAX_ARGS 23
+#define MAX_ARGS 32
 struct arg_info {
 	LLVMTypeRef types[MAX_ARGS];
 	LLVMValueRef *assign[MAX_ARGS];
@@ -538,6 +539,8 @@ struct user_sgpr_info {
 	bool need_ring_offsets;
 	uint8_t sgpr_count;
 	bool indirect_all_descriptor_sets;
+	uint8_t base_inline_push_consts;
+	uint8_t num_inline_push_consts;
 };
 
 static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
@@ -609,8 +612,45 @@ static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
 	} else {
 		user_sgpr_info->sgpr_count += util_bitcount(ctx->shader_info->info.desc_set_used_mask) * 2;
 	}
+
+	if (ctx->shader_info->info.loads_push_constants) {
+		uint32_t remaining_sgprs = 16 - user_sgpr_info->sgpr_count;
+		if (!ctx->shader_info->info.has_indirect_push_constants &&
+		    !ctx->shader_info->info.loads_dynamic_offsets)
+			remaining_sgprs += 2;
+
+		if (ctx->options->layout->push_constant_size) {
+			uint8_t num_32bit_push_consts = (ctx->shader_info->info.max_push_constant_used -
+							 ctx->shader_info->info.min_push_constant_used) / 4;
+			user_sgpr_info->base_inline_push_consts = ctx->shader_info->info.min_push_constant_used / 4;
+
+			if (num_32bit_push_consts < remaining_sgprs) {
+				user_sgpr_info->num_inline_push_consts = num_32bit_push_consts;
+				if (!ctx->shader_info->info.has_indirect_push_constants)
+					ctx->shader_info->info.loads_push_constants = false;
+			} else {
+				user_sgpr_info->num_inline_push_consts = remaining_sgprs;
+			}
+
+			if (user_sgpr_info->num_inline_push_consts > AC_UD_MAX_INLINE_PUSH_CONST)
+				user_sgpr_info->num_inline_push_consts = AC_UD_MAX_INLINE_PUSH_CONST;
+		}
+	}
 }
 
+static void
+declare_inline_push_consts(struct nir_to_llvm_context *ctx,
+			   gl_shader_stage stage,
+			   const struct user_sgpr_info *user_sgpr_info,
+			   struct arg_info *args)
+{
+	ctx->shader_info->inline_push_const_mask = (1 << user_sgpr_info->num_inline_push_consts) - 1;
+	ctx->shader_info->inline_push_const_mask <<= user_sgpr_info->base_inline_push_consts;
+
+	for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
+		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->inline_push_consts[i]);
+}
+
 static void
 declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
 			   gl_shader_stage stage,
@@ -644,6 +684,9 @@ declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
 		/* 1 for push constants and dynamic descriptors */
 		add_array_arg(args, type, &ctx->push_constants);
 	}
+
+	if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)))
+		declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
 }
 
 static void
@@ -651,6 +694,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
 				gl_shader_stage stage,
 				bool has_previous_stage,
 				gl_shader_stage previous_stage,
+				const struct user_sgpr_info *user_sgpr_info,
 				struct arg_info *args)
 {
 	if (!ctx->is_gs_copy_shader &&
@@ -660,6 +704,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
 			add_arg(args, ARG_SGPR, const_array(ctx->ac.v4i32, 16),
 				&ctx->vertex_buffers);
 		}
+		declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
 		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
 		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance);
 		if (ctx->shader_info->info.vs.needs_draw_id) {
@@ -693,6 +738,16 @@ declare_tes_input_vgprs(struct nir_to_llvm_context *ctx, struct arg_info *args)
 	add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id);
 }
 
+static void
+set_inline_pushconst_locs(struct nir_to_llvm_context *ctx,
+			  const struct user_sgpr_info *user_sgpr_info,
+			  uint8_t *user_sgpr_idx)
+{
+	ctx->shader_info->user_sgprs_locs.push_const_base = user_sgpr_info->base_inline_push_consts;
+	for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
+		set_loc(&ctx->shader_info->user_sgprs_locs.inline_push_consts[i], user_sgpr_idx, 1, 0);
+}
+
 static void
 set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
 		      bool has_previous_stage, gl_shader_stage previous_stage,
@@ -734,12 +789,17 @@ set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
 	if (ctx->shader_info->info.loads_push_constants) {
 		set_loc_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
 	}
+
+
+	if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)))
+		set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
 }
 
 static void
 set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
 			   gl_shader_stage stage, bool has_previous_stage,
 			   gl_shader_stage previous_stage,
+			   const struct user_sgpr_info *user_sgpr_info,
 			   uint8_t *user_sgpr_idx)
 {
 	if (!ctx->is_gs_copy_shader &&
@@ -750,6 +810,7 @@ set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
 				       user_sgpr_idx, 2);
 		}
 
+		set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
 		unsigned vs_num = 2;
 		if (ctx->shader_info->info.vs.needs_draw_id)
 			vs_num++;
@@ -805,7 +866,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
 					   previous_stage, &user_sgpr_info,
 					   &args, &desc_sets);
 		declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage,
-						previous_stage, &args);
+						previous_stage, &user_sgpr_info, &args);
 
 		if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls && ctx->options->key.has_multiview_view_index))
 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index);
@@ -838,7 +899,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
 						   &desc_sets);
 			declare_vs_specific_input_sgprs(ctx, stage,
 							has_previous_stage,
-							previous_stage, &args);
+							previous_stage, &user_sgpr_info, &args);
 
 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
 				&ctx->ls_out_layout);
@@ -934,7 +995,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
 			} else {
 				declare_vs_specific_input_sgprs(ctx, stage,
 								has_previous_stage,
-								previous_stage,
+								previous_stage, &user_sgpr_info,
 								&args);
 			}
 
@@ -1076,7 +1137,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
 		break;
 	case MESA_SHADER_VERTEX:
 		set_vs_specific_input_locs(ctx, stage, has_previous_stage,
-					   previous_stage, &user_sgpr_idx);
+					   previous_stage, &user_sgpr_info, &user_sgpr_idx);
 		if (ctx->view_index)
 			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
 		if (ctx->options->key.vs.as_ls) {
@@ -1088,7 +1149,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
 		break;
 	case MESA_SHADER_TESS_CTRL:
 		set_vs_specific_input_locs(ctx, stage, has_previous_stage,
-					   previous_stage, &user_sgpr_idx);
+					   previous_stage, &user_sgpr_info, &user_sgpr_idx);
 		if (has_previous_stage)
 			set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT,
 				       &user_sgpr_idx, 1);
@@ -1108,6 +1169,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
 				set_vs_specific_input_locs(ctx, stage,
 							   has_previous_stage,
 							   previous_stage,
+							   &user_sgpr_info,
 							   &user_sgpr_idx);
 			else
 				set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT,
@@ -2357,9 +2419,24 @@ static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx,
                                              nir_intrinsic_instr *instr)
 {
 	LLVMValueRef ptr, addr;
+	LLVMValueRef src0 = get_src(ctx->nir, instr->src[0]);
+	unsigned index = nir_intrinsic_base(instr);
+
+	if (LLVMIsConstant(src0)) {
+		unsigned array_index = index;
+		array_index += LLVMConstIntGetZExtValue(src0);
+		array_index /= 4;
+
+		uint32_t bits = ((1 << instr->num_components) - 1) << array_index;
+
+		if ((bits & ctx->shader_info->inline_push_const_mask) == bits) {
+			array_index -= ctx->shader_info->user_sgprs_locs.push_const_base;
+			return ac_build_gather_values(&ctx->ac, &ctx->inline_push_consts[array_index], instr->num_components);
+		}
+	}
 
-	addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
-	addr = LLVMBuildAdd(ctx->builder, addr, get_src(ctx->nir, instr->src[0]), "");
+	addr = LLVMConstInt(ctx->ac.i32, index, 0);
+	addr = LLVMBuildAdd(ctx->builder, addr, src0, "");
 
 	ptr = ac_build_gep0(&ctx->ac, ctx->push_constants, addr);
 	ptr = cast_ptr(ctx, ptr, get_def_type(ctx->nir, &instr->dest.ssa));
diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
index b3ad0a0985..9f9230d3e6 100644
--- a/src/amd/common/ac_nir_to_llvm.h
+++ b/src/amd/common/ac_nir_to_llvm.h
@@ -127,10 +127,13 @@ enum ac_ud_index {
 
 // Match MAX_SETS from radv_descriptor_set.h
 #define AC_UD_MAX_SETS MAX_SETS
+#define AC_UD_MAX_INLINE_PUSH_CONST 8
 
 struct ac_userdata_locations {
 	struct ac_userdata_info descriptor_sets[AC_UD_MAX_SETS];
 	struct ac_userdata_info shader_data[AC_UD_MAX_UD];
+	struct ac_userdata_info inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
+	uint8_t push_const_base;
 };
 
 struct ac_vs_output_info {
@@ -156,6 +159,7 @@ struct ac_shader_variant_info {
 	unsigned num_user_sgprs;
 	unsigned num_input_sgprs;
 	unsigned num_input_vgprs;
+	uint32_t inline_push_const_mask;
 	bool need_indirect_descriptor_sets;
 	struct {
 		struct {
diff --git a/src/amd/common/ac_shader_info.c b/src/amd/common/ac_shader_info.c
index 18fa9e1c94..fbb46684ae 100644
--- a/src/amd/common/ac_shader_info.c
+++ b/src/amd/common/ac_shader_info.c
@@ -179,9 +179,10 @@ ac_nir_shader_info_pass(struct nir_shader *nir,
 {
 	struct nir_function *func = (struct nir_function *)exec_list_get_head(&nir->functions);
 
-
-	if (options->layout->dynamic_offset_count)
+	if (options->layout->dynamic_offset_count) {
 		info->loads_push_constants = true;
+		info->loads_dynamic_offsets = true;
+	}
 
 	nir_foreach_variable(variable, &nir->inputs)
 		gather_info_input_decl(nir, options, variable, info);
diff --git a/src/amd/common/ac_shader_info.h b/src/amd/common/ac_shader_info.h
index e35cde0ca9..e8ea33f2e3 100644
--- a/src/amd/common/ac_shader_info.h
+++ b/src/amd/common/ac_shader_info.h
@@ -32,6 +32,7 @@ struct ac_shader_info {
 	uint8_t min_push_constant_used;
 	uint8_t max_push_constant_used;
 	bool has_indirect_push_constants;
+	bool loads_dynamic_offsets;
 	bool loads_push_constants;
 	bool needs_multiview_view_index;
 	bool uses_invocation_id;
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 60f19fb12b..17306eeaf8 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1807,6 +1807,27 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
 	assert(cmd_buffer->cs->cdw <= cdw_max);
 }
 
+static struct ac_userdata_info *
+radv_lookup_push_const_sgpr(struct radv_shader_variant *shader,
+			    int idx)
+{
+	idx -= shader->info.user_sgprs_locs.push_const_base;
+	return &shader->info.user_sgprs_locs.inline_push_consts[idx];
+}
+
+static void
+radv_emit_inline_pushconsts(struct radv_cmd_buffer *cmd_buffer,
+			    struct radv_shader_variant *shader,
+			    unsigned base_reg,
+			    int idx, int count, uint32_t *values)
+{
+	struct ac_userdata_info *loc = radv_lookup_push_const_sgpr(shader, idx);
+	assert(loc->sgpr_idx != -1);
+	assert(!loc->indirect);
+	radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
+	radeon_emit_array(cmd_buffer->cs, values, count);
+}
+
 static void
 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
 		     struct radv_pipeline *pipeline,
@@ -1816,36 +1837,55 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
 	unsigned offset;
 	void *ptr;
 	uint64_t va;
+	bool need_push_constants = false;
 
 	stages &= cmd_buffer->push_constant_stages;
 	if (!stages ||
 	    (!layout->push_constant_size && !layout->dynamic_offset_count))
 		return;
 
-	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
-					  16 * layout->dynamic_offset_count,
-					  256, &offset, &ptr))
-		return;
+	radv_foreach_stage(stage, stages) {
+		if (!pipeline->shaders[stage])
+			continue;
+
+		need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
+		need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;
 
-	memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
-	if (layout->dynamic_offset_count) {
-		memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
-		       16 * layout->dynamic_offset_count);
+		uint32_t mask = pipeline->shaders[stage]->info.inline_push_const_mask;
+		uint32_t base_reg = pipeline->user_data_0[stage];
+		while (mask) {
+			int start, count;
+			u_bit_scan_consecutive_range(&mask, &start, &count);
+			radv_emit_inline_pushconsts(cmd_buffer, pipeline->shaders[stage], base_reg,
+						    start, count, (uint32_t *)&cmd_buffer->push_constants[start * 4]);
+		}
 	}
 
-	va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
-	va += offset;
+	if (need_push_constants) {
+		if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
+						  16 * layout->dynamic_offset_count,
+						  256, &offset, &ptr))
+			return;
 
-	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
-	                                                   cmd_buffer->cs, MESA_SHADER_STAGES * 4);
+		memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
+		if (layout->dynamic_offset_count) {
+			memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
+			       16 * layout->dynamic_offset_count);
+		}
 
-	radv_foreach_stage(stage, stages) {
-		if (pipeline->shaders[stage]) {
-			radv_emit_userdata_address(cmd_buffer, pipeline, stage,
-						   AC_UD_PUSH_CONSTANTS, va);
+		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
+		va += offset;
+
+		MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
+								   cmd_buffer->cs, MESA_SHADER_STAGES * 4);
+
+		radv_foreach_stage(stage, stages) {
+			if (pipeline->shaders[stage]) {
+				radv_emit_userdata_address(cmd_buffer, pipeline, stage,
+							   AC_UD_PUSH_CONSTANTS, va);
+			}
 		}
+		assert(cmd_buffer->cs->cdw <= cdw_max);
 	}
-
 	cmd_buffer->push_constant_stages &= ~stages;
-	assert(cmd_buffer->cs->cdw <= cdw_max);
 }
-- 
2.14.3



More information about the mesa-dev mailing list