[Mesa-dev] [PATCH 17/19] gallium/radeon: reduce alloca of temporaries based on usagemask
Nicolai Hähnle
nhaehnle at gmail.com
Tue Aug 9 10:36:46 UTC 2016
From: Nicolai Hähnle <nicolai.haehnle at amd.com>
---
src/gallium/drivers/radeon/radeon_llvm.h | 1 +
.../drivers/radeon/radeon_setup_tgsi_llvm.c | 47 +++++++++++++++++-----
2 files changed, 38 insertions(+), 10 deletions(-)
diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 13f3336..4c946b5 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -45,20 +45,21 @@ struct radeon_llvm_branch {
unsigned has_else;
};
struct radeon_llvm_loop {
LLVMBasicBlockRef loop_block;
LLVMBasicBlockRef endloop_block;
};
struct radeon_llvm_array {
struct tgsi_declaration_range range;
+ ubyte usagemask;
LLVMValueRef alloca;
};
struct radeon_llvm_context {
struct lp_build_tgsi_soa_context soa;
/*=== Front end configuration ===*/
/* Instructions that are not described by any of the TGSI opcodes. */
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index d8ab5b0..73e4ce2 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -191,22 +191,30 @@ get_pointer_into_array(struct radeon_llvm_context *ctx,
LLVMValueRef index;
if (file != TGSI_FILE_TEMPORARY)
return NULL;
array = get_temp_array(&ctx->soa.bld_base, reg_index, reg_indirect);
if (!array || !array->alloca)
return NULL;
index = emit_array_index(&ctx->soa, reg_indirect, reg_index - array->range.First);
- index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, TGSI_NUM_CHANNELS), "");
- index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, swizzle), "");
+ index = LLVMBuildMul(
+ builder, index,
+ lp_build_const_int32(gallivm, util_bitcount(array->usagemask)),
+ "");
+ index = LLVMBuildAdd(
+ builder, index,
+ lp_build_const_int32(
+ gallivm,
+ util_bitcount(array->usagemask & ((1 << swizzle) - 1))),
+ "");
idxs[0] = ctx->soa.bld_base.uint_bld.zero;
idxs[1] = index;
return LLVMBuildGEP(builder, array->alloca, idxs, 2, "");
}
LLVMValueRef
radeon_llvm_emit_fetch_64bit(struct lp_build_tgsi_context *bld_base,
enum tgsi_opcode_type type,
LLVMValueRef ptr,
LLVMValueRef ptr2)
@@ -466,54 +474,61 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
}
}
break;
}
case TGSI_FILE_TEMPORARY:
{
char name[16] = "";
LLVMValueRef array_alloca = NULL;
unsigned decl_size;
+ unsigned usagemask = decl->Declaration.UsageMask;
first = decl->Range.First;
last = decl->Range.Last;
decl_size = 4 * ((last - first) + 1);
+
if (decl->Declaration.Array) {
unsigned id = decl->Array.ArrayID - 1;
+ unsigned array_size;
+
if (!ctx->arrays) {
int size = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
ctx->arrays = CALLOC(size, sizeof(ctx->arrays[0]));
}
ctx->arrays[id].range = decl->Range;
+ ctx->arrays[id].usagemask = usagemask;
+ array_size = ((last - first) + 1) * util_bitcount(usagemask);
/* If the array has more than 16 elements, store it
* in memory using an alloca that spans the entire
* array.
*
* Otherwise, store each array element individually.
* We will then generate vectors (per-channel, up to
- * <4 x float>) for indirect addressing.
+ * <16 x float> if the usagemask is a single bit) for
+ * indirect addressing.
*
* Note that 16 is the number of vector elements that
* LLVM will store in a register, so theoretically an
* array with up to 4 * 16 = 64 elements could be
* handled this way, but whether that's a good idea
* depends on VGPR register pressure elsewhere.
*
* FIXME: We shouldn't need to have the non-alloca
* code path for arrays. LLVM should be smart enough to
* promote allocas into registers when profitable.
*/
- if (decl_size > 16) {
+ if (array_size > 16) {
array_alloca = LLVMBuildAlloca(builder,
LLVMArrayType(bld_base->base.vec_type,
- decl_size), "array");
+ array_size), "array");
ctx->arrays[id].alloca = array_alloca;
}
}
if (!ctx->temps_count) {
ctx->temps_count = bld_base->info->file_max[TGSI_FILE_TEMPORARY] + 1;
ctx->temps = MALLOC(TGSI_NUM_CHANNELS * ctx->temps_count * sizeof(LLVMValueRef));
}
if (!array_alloca) {
for (i = 0; i < decl_size; ++i) {
@@ -524,28 +539,40 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
ctx->temps[first * TGSI_NUM_CHANNELS + i] =
si_build_alloca_undef(bld_base->base.gallivm,
bld_base->base.vec_type,
name);
}
} else {
LLVMValueRef idxs[2] = {
bld_base->uint_bld.zero,
NULL
};
+ LLVMValueRef undef = NULL;
+ unsigned j = 0;
+
+ if (usagemask != TGSI_WRITEMASK_XYZW)
+ undef = LLVMGetUndef(LLVMPointerType(bld_base->base.vec_type, 0));
+
for (i = 0; i < decl_size; ++i) {
+ LLVMValueRef ptr;
+ if (usagemask & (1 << (i % 4))) {
#ifdef DEBUG
- snprintf(name, sizeof(name), "TEMP%d.%c",
- first + i / 4, "xyzw"[i % 4]);
+ snprintf(name, sizeof(name), "TEMP%d.%c",
+ first + i / 4, "xyzw"[i % 4]);
#endif
- idxs[1] = lp_build_const_int32(bld_base->base.gallivm, i);
- ctx->temps[first * TGSI_NUM_CHANNELS + i] =
- LLVMBuildGEP(builder, array_alloca, idxs, 2, name);
+ idxs[1] = lp_build_const_int32(bld_base->base.gallivm, j);
+ ptr = LLVMBuildGEP(builder, array_alloca, idxs, 2, name);
+ j++;
+ } else {
+ ptr = undef;
+ }
+ ctx->temps[first * TGSI_NUM_CHANNELS + i] = ptr;
}
}
break;
}
case TGSI_FILE_INPUT:
{
unsigned idx;
for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
if (ctx->load_input)
ctx->load_input(ctx, idx, decl);
--
2.7.4
More information about the mesa-dev mailing list