[Mesa-dev] [PATCH 17/19] gallium/radeon: reduce alloca of temporaries based on usagemask

Tue Aug 9 10:36:46 UTC 2016

From: Nicolai Hähnle <nicolai.haehnle at amd.com>

---
 src/gallium/drivers/radeon/radeon_llvm.h           |  1 +
 .../drivers/radeon/radeon_setup_tgsi_llvm.c        | 47 +++++++++++++++++-----
 2 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 13f3336..4c946b5 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -45,20 +45,21 @@ struct radeon_llvm_branch {
 	unsigned has_else;
 };
 
 struct radeon_llvm_loop {
 	LLVMBasicBlockRef loop_block;
 	LLVMBasicBlockRef endloop_block;
 };
 
 struct radeon_llvm_array {
 	struct tgsi_declaration_range range;
+	ubyte usagemask; /* Declaration.UsageMask; alloca holds only these channels */
 	LLVMValueRef alloca;
 };
 
 struct radeon_llvm_context {
 	struct lp_build_tgsi_soa_context soa;
 
 	/*=== Front end configuration ===*/
 
 	/* Instructions that are not described by any of the TGSI opcodes. */
 
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index d8ab5b0..73e4ce2 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -191,22 +191,30 @@ get_pointer_into_array(struct radeon_llvm_context *ctx,
 	LLVMValueRef index;
 
 	if (file != TGSI_FILE_TEMPORARY)
 		return NULL;
 
 	array = get_temp_array(&ctx->soa.bld_base, reg_index, reg_indirect);
 	if (!array || !array->alloca)
 		return NULL;
 
 	index = emit_array_index(&ctx->soa, reg_indirect, reg_index - array->range.First);
-	index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, TGSI_NUM_CHANNELS), "");
-	index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, swizzle), "");
+	index = LLVMBuildMul(
+		builder, index,
+		lp_build_const_int32(gallivm, util_bitcount(array->usagemask)),
+		"");
+	index = LLVMBuildAdd(
+		builder, index,
+		lp_build_const_int32(
+			gallivm,
+			util_bitcount(array->usagemask & ((1 << swizzle) - 1))),
+		"");
 	idxs[0] = ctx->soa.bld_base.uint_bld.zero;
 	idxs[1] = index;
 	return LLVMBuildGEP(builder, array->alloca, idxs, 2, "");
 }
 
 LLVMValueRef
 radeon_llvm_emit_fetch_64bit(struct lp_build_tgsi_context *bld_base,
 			     enum tgsi_opcode_type type,
 			     LLVMValueRef ptr,
 			     LLVMValueRef ptr2)
@@ -466,54 +474,61 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
 			}
 		}
 		break;
 	}
 
 	case TGSI_FILE_TEMPORARY:
 	{
 		char name[16] = "";
 		LLVMValueRef array_alloca = NULL;
 		unsigned decl_size;
+		unsigned usagemask = decl->Declaration.UsageMask;
 		first = decl->Range.First;
 		last = decl->Range.Last;
 		decl_size = 4 * ((last - first) + 1);
+
 		if (decl->Declaration.Array) {
 			unsigned id = decl->Array.ArrayID - 1;
+			unsigned array_size;
+
 			if (!ctx->arrays) {
 				int size = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
 				ctx->arrays = CALLOC(size, sizeof(ctx->arrays[0]));
 			}
 
 			ctx->arrays[id].range = decl->Range;
+			ctx->arrays[id].usagemask = usagemask;
+			array_size = ((last - first) + 1) * util_bitcount(usagemask);
 
 			/* If the array has more than 16 elements, store it
 			 * in memory using an alloca that spans the entire
 			 * array.
 			 *
 			 * Otherwise, store each array element individually.
 			 * We will then generate vectors (per-channel, up to
-			 * <4 x float>) for indirect addressing.
+			 * <16 x float> if the usagemask is a single bit) for
+			 * indirect addressing.
 			 *
 			 * Note that 16 is the number of vector elements that
 			 * LLVM will store in a register, so theoretically an
 			 * array with up to 4 * 16 = 64 elements could be
 			 * handled this way, but whether that's a good idea
 			 * depends on VGPR register pressure elsewhere.
 			 *
 			 * FIXME: We shouldn't need to have the non-alloca
 			 * code path for arrays. LLVM should be smart enough to
 			 * promote allocas into registers when profitable.
 			 */
-			if (decl_size > 16) {
+			if (array_size > 16) {
 				array_alloca = LLVMBuildAlloca(builder,
 					LLVMArrayType(bld_base->base.vec_type,
-						      decl_size), "array");
+						      array_size), "array");
 				ctx->arrays[id].alloca = array_alloca;
 			}
 		}
 
 		if (!ctx->temps_count) {
 			ctx->temps_count = bld_base->info->file_max[TGSI_FILE_TEMPORARY] + 1;
 			ctx->temps = MALLOC(TGSI_NUM_CHANNELS * ctx->temps_count * sizeof(LLVMValueRef));
 		}
 		if (!array_alloca) {
 			for (i = 0; i < decl_size; ++i) {
@@ -524,28 +539,40 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
 				ctx->temps[first * TGSI_NUM_CHANNELS + i] =
 					si_build_alloca_undef(bld_base->base.gallivm,
 							      bld_base->base.vec_type,
 							      name);
 			}
 		} else {
 			LLVMValueRef idxs[2] = {
 				bld_base->uint_bld.zero,
 				NULL
 			};
+			LLVMValueRef undef = NULL;
+			unsigned j = 0;
+
+			if (usagemask != TGSI_WRITEMASK_XYZW)
+				undef = LLVMGetUndef(LLVMPointerType(bld_base->base.vec_type, 0));
+
 			for (i = 0; i < decl_size; ++i) {
+				LLVMValueRef ptr;
+				if (usagemask & (1 << (i % 4))) {
 #ifdef DEBUG
-				snprintf(name, sizeof(name), "TEMP%d.%c",
-					 first + i / 4, "xyzw"[i % 4]);
+					snprintf(name, sizeof(name), "TEMP%d.%c",
+						 first + i / 4, "xyzw"[i % 4]);
 #endif
-				idxs[1] = lp_build_const_int32(bld_base->base.gallivm, i);
-				ctx->temps[first * TGSI_NUM_CHANNELS + i] =
-					LLVMBuildGEP(builder, array_alloca, idxs, 2, name);
+					idxs[1] = lp_build_const_int32(bld_base->base.gallivm, j);
+					ptr = LLVMBuildGEP(builder, array_alloca, idxs, 2, name);
+					j++;
+				} else {
+					ptr = undef;
+				}
+				ctx->temps[first * TGSI_NUM_CHANNELS + i] = ptr;
 			}
 		}
 		break;
 	}
 	case TGSI_FILE_INPUT:
 	{
 		unsigned idx;
 		for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
 			if (ctx->load_input)
 				ctx->load_input(ctx, idx, decl);
-- 
2.7.4