Mesa (master): gallium/radeon: reduce alloca of temporaries based on usagemask

Wed Aug 17 10:11:31 UTC 2016

Module: Mesa
Branch: master
Commit: 8916d1e2fae61c532e1e2013f0f76122ed1916b7
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=8916d1e2fae61c532e1e2013f0f76122ed1916b7

Author: Nicolai Hähnle <nicolai.haehnle at amd.com>
Date:   Mon Aug  8 22:31:02 2016 +0200

gallium/radeon: reduce alloca of temporaries based on usagemask

v2: take actual writemasks into account

Reviewed-by: Marek Olšák <marek.olsak at amd.com>

---

 src/gallium/drivers/radeon/radeon_llvm.h           |  2 +
 .../drivers/radeon/radeon_setup_tgsi_llvm.c        | 62 ++++++++++++++++++----
 2 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 4ed2c97..0276ef3 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -99,6 +99,8 @@ struct radeon_llvm_context {
 	struct tgsi_array_info *temp_arrays;
 	LLVMValueRef *temp_array_allocas;
 
+	LLVMValueRef undef_alloca;
+
 	LLVMValueRef main_fn;
 	LLVMTypeRef return_type;
 
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index dac0594..dd7d60b 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -185,6 +185,7 @@ get_pointer_into_array(struct radeon_llvm_context *ctx,
 		       const struct tgsi_ind_register *reg_indirect)
 {
 	unsigned array_id;
+	struct tgsi_array_info *array;
 	struct gallivm_state *gallivm = ctx->soa.bld_base.base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
 	LLVMValueRef idxs[2];
@@ -202,10 +203,23 @@ get_pointer_into_array(struct radeon_llvm_context *ctx,
 	if (!alloca)
 		return NULL;
 
+	array = &ctx->temp_arrays[array_id - 1];
+
+	if (!(array->writemask & (1 << swizzle)))
+		return ctx->undef_alloca;
+
 	index = emit_array_index(&ctx->soa, reg_indirect,
 				 reg_index - ctx->temp_arrays[array_id - 1].range.First);
-	index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, TGSI_NUM_CHANNELS), "");
-	index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, swizzle), "");
+	index = LLVMBuildMul(
+		builder, index,
+		lp_build_const_int32(gallivm, util_bitcount(array->writemask)),
+		"");
+	index = LLVMBuildAdd(
+		builder, index,
+		lp_build_const_int32(
+			gallivm,
+			util_bitcount(array->writemask & ((1 << swizzle) - 1))),
+		"");
 	idxs[0] = ctx->soa.bld_base.uint_bld.zero;
 	idxs[1] = index;
 	return LLVMBuildGEP(builder, alloca, idxs, 2, "");
@@ -479,11 +493,18 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
 		char name[16] = "";
 		LLVMValueRef array_alloca = NULL;
 		unsigned decl_size;
+		unsigned writemask = decl->Declaration.UsageMask;
 		first = decl->Range.First;
 		last = decl->Range.Last;
 		decl_size = 4 * ((last - first) + 1);
+
 		if (decl->Declaration.Array) {
 			unsigned id = decl->Array.ArrayID - 1;
+			unsigned array_size;
+
+			writemask &= ctx->temp_arrays[id].writemask;
+			ctx->temp_arrays[id].writemask = writemask;
+			array_size = ((last - first) + 1) * util_bitcount(writemask);
 
 			/* If the array has more than 16 elements, store it
 			 * in memory using an alloca that spans the entire
@@ -491,7 +512,8 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
 			 *
 			 * Otherwise, store each array element individually.
 			 * We will then generate vectors (per-channel, up to
-			 * <4 x float>) for indirect addressing.
+			 * <16 x float> if the usagemask is a single bit) for
+			 * indirect addressing.
 			 *
 			 * Note that 16 is the number of vector elements that
 			 * LLVM will store in a register, so theoretically an
@@ -503,10 +525,10 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
 			 * code path for arrays. LLVM should be smart enough to
 			 * promote allocas into registers when profitable.
 			 */
-			if (decl_size > 16) {
+			if (array_size > 16) {
 				array_alloca = LLVMBuildAlloca(builder,
 					LLVMArrayType(bld_base->base.vec_type,
-						      decl_size), "array");
+						      array_size), "array");
 				ctx->temp_array_allocas[id] = array_alloca;
 			}
 		}
@@ -531,14 +553,34 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
 				bld_base->uint_bld.zero,
 				NULL
 			};
+			unsigned j = 0;
+
+			if (writemask != TGSI_WRITEMASK_XYZW &&
+			    !ctx->undef_alloca) {
+				/* Create a dummy alloca. We use it so that we
+				 * have a pointer that is safe to load from if
+				 * a shader ever reads from a channel that
+				 * it never writes to.
+				 */
+				ctx->undef_alloca = si_build_alloca_undef(
+					bld_base->base.gallivm,
+					bld_base->base.vec_type, "undef");
+			}
+
 			for (i = 0; i < decl_size; ++i) {
+				LLVMValueRef ptr;
+				if (writemask & (1 << (i % 4))) {
 #ifdef DEBUG
-				snprintf(name, sizeof(name), "TEMP%d.%c",
-					 first + i / 4, "xyzw"[i % 4]);
+					snprintf(name, sizeof(name), "TEMP%d.%c",
+						 first + i / 4, "xyzw"[i % 4]);
 #endif
-				idxs[1] = lp_build_const_int32(bld_base->base.gallivm, i);
-				ctx->temps[first * TGSI_NUM_CHANNELS + i] =
-					LLVMBuildGEP(builder, array_alloca, idxs, 2, name);
+					idxs[1] = lp_build_const_int32(bld_base->base.gallivm, j);
+					ptr = LLVMBuildGEP(builder, array_alloca, idxs, 2, name);
+					j++;
+				} else {
+					ptr = ctx->undef_alloca;
+				}
+				ctx->temps[first * TGSI_NUM_CHANNELS + i] = ptr;
 			}
 		}
 		break;