[Mesa-dev] [RFC PATCH 2/2] ac/nir: Don't treat each element as a vec4 in compute shared memory

Alex Smith asmith at feralinteractive.com
Tue Jun 27 15:53:19 UTC 2017


Currently, every element in shared memory (including the individual
elements of an array) is treated as a vec4. For example, the following:

    shared uint array[1024];

gets treated as an array of 1024 vec4s with only the first component
ever used.
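
To put numbers on the example above (illustrative arithmetic, assuming
4-byte uints):

    1024 elements * 16 bytes per vec4 slot = 16KB of LDS,
    versus 1024 * 4 bytes = 4KB actually needed.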

This can be highly wasteful of LDS space. We have a shader that only
requires ~24KB of shared memory, but because each scalar is padded out
to a vec4, it actually ends up needing 96KB. That is more than the
hardware supports, and leads to an LLVM compilation failure.
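
(Hypothetical breakdown of where the 4x factor comes from: if the ~24KB
is mostly 4-byte scalars, that is ~6144 scalars; padded to a 16-byte
vec4 slot each, they occupy ~6144 * 16 bytes = ~96KB.)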

Therefore, pack data in LDS according to the std430 layout instead.
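
As a rough illustration of the resulting packing (made-up declarations,
not taken from the patch's test cases; sizes follow the std430 rules
for these types):

    shared uint  counts[64];   // std430 stride 4:  64 * 4  = 256 bytes
    shared uvec2 pairs[32];    // std430 stride 8:  32 * 8  = 256 bytes
    shared vec4  colors[8];    // std430 stride 16:  8 * 16 = 128 bytes

    // Packed total: 640 bytes of LDS, versus (64 + 32 + 8) vec4 slots
    // = 1664 bytes with the current per-element padding.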

Signed-off-by: Alex Smith <asmith at feralinteractive.com>
---
RFC since I'm not sure whether this is the best approach, and there may
be something I've missed.

No regressions seen in Mad Max or Dawn of War 3, both of which have a
few shaders that use shared memory. Also no regressions in the CTS
'dEQP-VK.compute.*' tests.

I've also checked over various test cases using arrays/structs/etc. in
shared memory, and as far as I can tell the generated LLVM/GCN code is
correct.
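
For reference, a minimal sketch of the kind of test case I mean
(hypothetical shader, not one of the shaders actually tested): a
workgroup writes a shared struct array and a shared counter, then reads
them back across barriers.

    #version 450
    layout(local_size_x = 64) in;

    struct Item { uint key; float weight; };

    shared Item items[64];
    shared uint total;

    layout(std430, binding = 0) buffer Result { uint sums[]; };

    void main() {
        uint i = gl_LocalInvocationIndex;
        // Store into shared struct members (direct and per-invocation
        // indexed access).
        items[i].key    = i;
        items[i].weight = float(i) * 0.5;
        if (i == 0u) total = 0u;
        memoryBarrierShared();
        barrier();
        // Read the shared struct back and accumulate into a shared scalar.
        atomicAdd(total, items[i].key + uint(items[i].weight));
        memoryBarrierShared();
        barrier();
        if (i == 0u) sums[gl_WorkGroupID.x] = total;
    }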
---
 src/amd/common/ac_nir_to_llvm.c | 55 ++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 468ce4d..a48bb3b 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -397,7 +397,7 @@ static LLVMValueRef get_shared_memory_ptr(struct nir_to_llvm_context *ctx,
 	LLVMValueRef ptr;
 	int addr_space;
 
-	offset = LLVMConstInt(ctx->i32, idx * 16, false);
+	offset = LLVMConstInt(ctx->i32, idx, false);
 
 	ptr = ctx->shared_memory;
 	ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, "");
@@ -2404,7 +2404,7 @@ static LLVMValueRef visit_load_ubo_buffer(struct nir_to_llvm_context *ctx,
 
 static void
 radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
-		      bool vs_in, unsigned *vertex_index_out,
+		      bool vs_in, bool compute_shared, unsigned *vertex_index_out,
 		      LLVMValueRef *vertex_index_ref,
 		      unsigned *const_out, LLVMValueRef *indir_out)
 {
@@ -2445,7 +2445,14 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
 		if (tail->deref_type == nir_deref_type_array) {
 			nir_deref_array *deref_array = nir_deref_as_array(tail);
 			LLVMValueRef index, stride, local_offset;
-			unsigned size = glsl_count_attribute_slots(tail->type, vs_in);
+
+			/* For compute LDS, data is packed rather than treating
+			 * each individual element as a vec4. The returned
+			 * index is used to index into an i32 array so divide
+			 * by 4.
+			 */
+			unsigned size = compute_shared ? glsl_get_std430_array_stride(tail->type, false) / 4
+						       : glsl_count_attribute_slots(tail->type, vs_in);
 
 			const_offset += size * deref_array->base_offset;
 			if (deref_array->deref_array_type == nir_deref_array_type_direct)
@@ -2465,7 +2472,11 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
 
 			for (unsigned i = 0; i < deref_struct->index; i++) {
 				const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
-				const_offset += glsl_count_attribute_slots(ft, vs_in);
+
+				/* Same as above. */
+				unsigned size = compute_shared ? glsl_get_std430_array_stride(ft, false) / 4
+							       : glsl_count_attribute_slots(ft, vs_in);
+				const_offset += size;
 			}
 		} else
 			unreachable("unsupported deref type");
@@ -2641,7 +2652,7 @@ load_tcs_input(struct nir_to_llvm_context *ctx,
 	const bool is_compact = instr->variables[0]->var->data.compact;
 	param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
 	radv_get_deref_offset(ctx, instr->variables[0],
-			      false, NULL, per_vertex ? &vertex_index : NULL,
+			      false, false, NULL, per_vertex ? &vertex_index : NULL,
 			      &const_index, &indir_index);
 
 	stride = unpack_param(ctx, ctx->tcs_in_layout, 13, 8);
@@ -2673,7 +2684,7 @@ load_tcs_output(struct nir_to_llvm_context *ctx,
 	const bool is_compact = instr->variables[0]->var->data.compact;
 	param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
 	radv_get_deref_offset(ctx, instr->variables[0],
-			      false, NULL, per_vertex ? &vertex_index : NULL,
+			      false, false, NULL, per_vertex ? &vertex_index : NULL,
 			      &const_index, &indir_index);
 
 	if (!instr->variables[0]->var->data.patch) {
@@ -2712,7 +2723,7 @@ store_tcs_output(struct nir_to_llvm_context *ctx,
 	const bool is_compact = instr->variables[0]->var->data.compact;
 
 	radv_get_deref_offset(ctx, instr->variables[0],
-			      false, NULL, per_vertex ? &vertex_index : NULL,
+			      false, false, NULL, per_vertex ? &vertex_index : NULL,
 			      &const_index, &indir_index);
 
 	param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
@@ -2779,7 +2790,7 @@ load_tes_input(struct nir_to_llvm_context *ctx,
 	const bool is_compact = instr->variables[0]->var->data.compact;
 
 	radv_get_deref_offset(ctx, instr->variables[0],
-			      false, NULL, per_vertex ? &vertex_index : NULL,
+			      false, false, NULL, per_vertex ? &vertex_index : NULL,
 			      &const_index, &indir_index);
 	param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
 	if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 &&
@@ -2808,7 +2819,7 @@ load_gs_input(struct nir_to_llvm_context *ctx,
 	LLVMValueRef value[4], result;
 	unsigned vertex_index;
 	radv_get_deref_offset(ctx, instr->variables[0],
-			      false, &vertex_index, NULL,
+			      false, false, &vertex_index, NULL,
 			      &const_index, &indir_index);
 	vtx_offset_param = vertex_index;
 	assert(vtx_offset_param < 6);
@@ -2849,8 +2860,9 @@ static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
 	unsigned const_index;
 	bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
 	             instr->variables[0]->var->data.mode == nir_var_shader_in;
-	radv_get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL,
-				      &const_index, &indir_index);
+	bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
+	radv_get_deref_offset(ctx, instr->variables[0], vs_in, compute_shared,
+			      NULL, NULL, &const_index, &indir_index);
 
 	if (instr->dest.ssa.bit_size == 64)
 		ve *= 2;
@@ -2925,9 +2937,6 @@ static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
 		LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
 		LLVMValueRef derived_ptr;
 
-		if (indir_index)
-			indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
-
 		for (unsigned chan = 0; chan < ve; chan++) {
 			LLVMValueRef index = LLVMConstInt(ctx->i32, chan, false);
 			if (indir_index)
@@ -2955,7 +2964,8 @@ visit_store_var(struct nir_to_llvm_context *ctx,
 	int writemask = instr->const_index[0];
 	LLVMValueRef indir_index;
 	unsigned const_index;
-	radv_get_deref_offset(ctx, instr->variables[0], false,
+	bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
+	radv_get_deref_offset(ctx, instr->variables[0], false, compute_shared,
 	                      NULL, NULL, &const_index, &indir_index);
 
 	if (get_elem_bits(ctx, LLVMTypeOf(src)) == 64) {
@@ -3040,9 +3050,6 @@ visit_store_var(struct nir_to_llvm_context *ctx,
 	case nir_var_shared: {
 		LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
 
-		if (indir_index)
-			indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
-
 		for (unsigned chan = 0; chan < 8; chan++) {
 			if (!(writemask & (1 << chan)))
 				continue;
@@ -5756,9 +5763,9 @@ handle_shader_outputs_post(struct nir_to_llvm_context *ctx)
 
 static void
 handle_shared_compute_var(struct nir_to_llvm_context *ctx,
-			  struct nir_variable *variable, uint32_t *offset, int idx)
+			  struct nir_variable *variable, uint32_t *offset)
 {
-	unsigned size = glsl_count_attribute_slots(variable->type, false);
+	unsigned size = glsl_get_std430_array_stride(variable->type, false);
 	variable->data.driver_location = *offset;
 	*offset += size;
 }
@@ -5926,16 +5933,12 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
 		nir_foreach_variable(variable, &nir->shared)
 			num_shared++;
 		if (num_shared) {
-			int idx = 0;
 			uint32_t shared_size = 0;
 			LLVMValueRef var;
 			LLVMTypeRef i8p = LLVMPointerType(ctx.i8, LOCAL_ADDR_SPACE);
-			nir_foreach_variable(variable, &nir->shared) {
-				handle_shared_compute_var(&ctx, variable, &shared_size, idx);
-				idx++;
-			}
+			nir_foreach_variable(variable, &nir->shared)
+				handle_shared_compute_var(&ctx, variable, &shared_size);
 
-			shared_size *= 16;
 			var = LLVMAddGlobalInAddressSpace(ctx.module,
 							  LLVMArrayType(ctx.i8, shared_size),
 							  "compute_lds",
-- 
2.9.4


