[Mesa-dev] [PATCH 3/3] radeon/llvm: Use alloca instructions for larger arrays
Marek Olšák
maraeo at gmail.com
Thu May 26 14:26:47 UTC 2016
For the series:
Reviewed-by: Marek Olšák <marek.olsak at amd.com>
Marek
On Wed, May 25, 2016 at 2:35 PM, Tom Stellard <thomas.stellard at amd.com> wrote:
> We were storing arrays in vectors, which was leading to some really bad
> spill code for large arrays. allocas instructions are a better fit for
> arrays and LLVM optimizations are more geared toward dealing with
> allocas instead of vectors.
>
> For arrays that have 16 or less 32-bit elements, we will continue to use
> vectors, because this will force LLVM to store them in registers and
> use indirect registers, which is usually faster for small arrays.
>
> In the future we should use allocas for all arrays and teach LLVM
> how to store allocas in registers.
>
> This fixes the piglit test:
>
> spec/glsl-1.50/execution/geometry/max-input-component
> ---
> src/gallium/drivers/radeon/radeon_llvm.h | 7 +-
> .../drivers/radeon/radeon_setup_tgsi_llvm.c | 169 ++++++++++++++++++---
> 2 files changed, 151 insertions(+), 25 deletions(-)
>
> diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
> index 3e11b36..5b524b6 100644
> --- a/src/gallium/drivers/radeon/radeon_llvm.h
> +++ b/src/gallium/drivers/radeon/radeon_llvm.h
> @@ -50,6 +50,11 @@ struct radeon_llvm_loop {
> LLVMBasicBlockRef endloop_block;
> };
>
> +struct radeon_llvm_array {
> + struct tgsi_declaration_range range;
> + LLVMValueRef alloca;
> +};
> +
> struct radeon_llvm_context {
> struct lp_build_tgsi_soa_context soa;
>
> @@ -96,7 +101,7 @@ struct radeon_llvm_context {
> unsigned loop_depth;
> unsigned loop_depth_max;
>
> - struct tgsi_declaration_range *arrays;
> + struct radeon_llvm_array *arrays;
>
> LLVMValueRef main_fn;
> LLVMTypeRef return_type;
> diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
> index 93bc307..cb35390 100644
> --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
> +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
> @@ -83,11 +83,25 @@ static LLVMValueRef emit_swizzle(
>
> static struct tgsi_declaration_range
> get_array_range(struct lp_build_tgsi_context *bld_base,
> - unsigned File, const struct tgsi_ind_register *reg)
> + unsigned File, unsigned reg_index,
> + const struct tgsi_ind_register *reg)
> {
> struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
>
> - if (File != TGSI_FILE_TEMPORARY || reg->ArrayID == 0 ||
> + if (!reg) {
> + unsigned i;
> + unsigned num_arrays = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
> + for (i = 0; i < num_arrays; i++) {
> + const struct tgsi_declaration_range *range =
> + &ctx->arrays[i].range;
> +
> + if (reg_index >= range->First && reg_index <= range->Last) {
> + return ctx->arrays[i].range;
> + }
> + }
> + }
> +
> + if (File != TGSI_FILE_TEMPORARY || !reg || reg->ArrayID == 0 ||
> reg->ArrayID > bld_base->info->array_max[TGSI_FILE_TEMPORARY]) {
> struct tgsi_declaration_range range;
> range.First = 0;
> @@ -95,7 +109,32 @@ get_array_range(struct lp_build_tgsi_context *bld_base,
> return range;
> }
>
> - return ctx->arrays[reg->ArrayID - 1];
> + return ctx->arrays[reg->ArrayID - 1].range;
> +}
> +
> +static LLVMValueRef get_alloca_for_array(
> + struct lp_build_tgsi_context *bld_base,
> + unsigned file,
> + unsigned index) {
> +
> + unsigned i;
> + unsigned num_arrays;
> + struct radeon_llvm_context *ctx = radeon_llvm_context(bld_base);
> +
> + if (file != TGSI_FILE_TEMPORARY) {
> + return NULL;
> + }
> +
> + num_arrays = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
> + for (i = 0; i < num_arrays; i++) {
> + const struct tgsi_declaration_range *range =
> + &ctx->arrays[i].range;
> +
> + if (index >= range->First && index <= range->Last) {
> + return ctx->arrays[i].alloca;
> + }
> + }
> + return NULL;
> }
>
> static LLVMValueRef
> @@ -106,6 +145,9 @@ emit_array_index(
> {
> struct gallivm_state * gallivm = bld->bld_base.base.gallivm;
>
> + if (!reg) {
> + return lp_build_const_int32(gallivm, offset);
> + }
> LLVMValueRef addr = LLVMBuildLoad(gallivm->builder, bld->addr[reg->Index][reg->Swizzle], "");
> return LLVMBuildAdd(gallivm->builder, addr, lp_build_const_int32(gallivm, offset), "");
> }
> @@ -154,7 +196,7 @@ emit_array_fetch(
> tmp_reg.Register.Index = i + range.First;
> LLVMValueRef temp = radeon_llvm_emit_fetch(bld_base, &tmp_reg, type, swizzle);
> result = LLVMBuildInsertElement(builder, result, temp,
> - lp_build_const_int32(gallivm, i), "");
> + lp_build_const_int32(gallivm, i), "array_vector");
> }
> return result;
> }
> @@ -169,13 +211,35 @@ load_value_from_array(
> const struct tgsi_ind_register *reg_indirect)
> {
> struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
> - LLVMBuilderRef builder = bld_base->base.gallivm->builder;
> - struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_indirect);
> + struct gallivm_state *gallivm = bld_base->base.gallivm;
> + LLVMBuilderRef builder = gallivm->builder;
> + struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_index, reg_indirect);
> + LLVMValueRef index = emit_array_index(bld, reg_indirect, reg_index - range.First);
> + LLVMValueRef array = get_alloca_for_array(bld_base, file, reg_index);
> + LLVMValueRef ptr, val, indices[2];
> +
> + if (!array) {
> + /* Handle the case where the array is stored as a vector. */
> + return LLVMBuildExtractElement(builder,
> + emit_array_fetch(bld_base, file, type, range, swizzle),
> + index, "");
> + }
>
> - return LLVMBuildExtractElement(builder,
> - emit_array_fetch(bld_base, file, type, range, swizzle),
> - emit_array_index(bld, reg_indirect, reg_index - range.First), "");
> + index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, TGSI_NUM_CHANNELS), "");
> + index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, swizzle), "");
> + indices[0] = bld_base->uint_bld.zero;
> + indices[1] = index;
> + ptr = LLVMBuildGEP(builder, array, indices, 2, "");
> + val = LLVMBuildLoad(builder, ptr, "");
> + if (type == TGSI_TYPE_DOUBLE) {
> + LLVMValueRef ptr_hi, val_hi;
> + indices[0] = lp_build_const_int32(gallivm, 1);
> + ptr_hi = LLVMBuildGEP(builder, ptr, indices, 1, "");
> + val_hi = LLVMBuildLoad(builder, ptr_hi, "");
> + val = radeon_llvm_emit_fetch_double(bld_base, val, val_hi);
>
> + }
> + return val;
> }
>
> static LLVMValueRef store_value_to_array(
> @@ -187,13 +251,26 @@ static LLVMValueRef store_value_to_array(
> const struct tgsi_ind_register *reg_indirect)
> {
> struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
> - LLVMBuilderRef builder = bld_base->base.gallivm->builder;
> - struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_indirect);
> -
> - return LLVMBuildInsertElement(builder,
> + struct gallivm_state *gallivm = bld_base->base.gallivm;
> + LLVMBuilderRef builder = gallivm->builder;
> + struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_index, reg_indirect);
> + LLVMValueRef index = emit_array_index(bld, reg_indirect, reg_index - range.First);
> + LLVMValueRef array = get_alloca_for_array(bld_base, file, reg_index);
> +
> + if (array) {
> + LLVMValueRef indices[2];
> + index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, TGSI_NUM_CHANNELS), "");
> + index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, chan_index), "");
> + indices[0] = bld_base->uint_bld.zero;
> + indices[1] = index;
> + LLVMValueRef pointer = LLVMBuildGEP(builder, array, indices, 2, "");
> + LLVMBuildStore(builder, value, pointer);
> + return NULL;
> + } else {
> + return LLVMBuildInsertElement(builder,
> emit_array_fetch(bld_base, file, TGSI_TYPE_FLOAT, range, chan_index),
> - value, emit_array_index(bld, reg_indirect, reg_index - range.First), "");
> - return NULL;
> + value, index, "");
> + }
> }
>
> LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
> @@ -217,8 +294,9 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
> }
>
> if (reg->Register.Indirect) {
> - return load_value_from_array(bld_base, reg->Register.File, type,
> + LLVMValueRef load = load_value_from_array(bld_base, reg->Register.File, type,
> swizzle, reg->Register.Index, ®->Indirect);
> + return bitcast(bld_base, type, load);
> }
>
> switch(reg->Register.File) {
> @@ -257,6 +335,11 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
> LLVMBuildLoad(builder, ptr, ""),
> LLVMBuildLoad(builder, ptr2, ""));
> }
> + LLVMValueRef array = get_alloca_for_array(bld_base, reg->Register.File, reg->Register.Index);
> + if (array) {
> + return bitcast(bld_base, type, load_value_from_array(bld_base, reg->Register.File, type,
> + swizzle, reg->Register.Index, NULL));
> + }
> result = LLVMBuildLoad(builder, ptr, "");
> break;
>
> @@ -309,6 +392,7 @@ static void emit_declaration(
> const struct tgsi_full_declaration *decl)
> {
> struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
> + LLVMBuilderRef builder = bld_base->base.gallivm->builder;
> unsigned first, last, i, idx;
> switch(decl->Declaration.File) {
> case TGSI_FILE_ADDRESS:
> @@ -326,13 +410,36 @@ static void emit_declaration(
> }
>
> case TGSI_FILE_TEMPORARY:
> + {
> + unsigned decl_size;
> + first = decl->Range.First;
> + last = decl->Range.Last;
> + decl_size = 4 * ((last - first) + 1);
> if (decl->Declaration.Array) {
> + unsigned id = decl->Array.ArrayID - 1;
> if (!ctx->arrays) {
> int size = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
> - ctx->arrays = MALLOC(sizeof(ctx->arrays[0]) * size);
> + ctx->arrays = CALLOC(size, sizeof(ctx->arrays[0]));
> + for (i = 0; i < size; ++i) {
> + assert(!ctx->arrays[i].alloca);}
> }
>
> - ctx->arrays[decl->Array.ArrayID - 1] = decl->Range;
> + ctx->arrays[id].range = decl->Range;
> +
> + /* If the array is more than 16 elements (each element
> + * is 32-bits), then store it in a vector. Storing the
> + * array in a vector will causes the compiler to store
> + * the array in registers and access it using indirect
> + * addressing. 16 is number of vector elements that
> + * LLVM will store in a register.
> + * FIXME: We shouldn't need to do this. LLVM should be
> + * smart enough to promote allocas int registers when
> + * profitable.
> + */
> + if (decl_size > 16) {
> + ctx->arrays[id].alloca = LLVMBuildAlloca(builder,
> + LLVMArrayType(bld_base->base.vec_type, decl_size),"array");
> + }
> }
> first = decl->Range.First;
> last = decl->Range.Last;
> @@ -349,7 +456,7 @@ static void emit_declaration(
> }
> }
> break;
> -
> + }
> case TGSI_FILE_INPUT:
> {
> unsigned idx;
> @@ -459,11 +566,16 @@ void radeon_llvm_emit_store(
>
> if (reg->Register.Indirect) {
> struct tgsi_declaration_range range = get_array_range(bld_base,
> - reg->Register.File, ®->Indirect);
> + reg->Register.File, reg->Register.Index, ®->Indirect);
>
> unsigned i, size = range.Last - range.First + 1;
> - LLVMValueRef array = store_value_to_array(bld_base, value, reg->Register.File, chan_index,
> - reg->Register.Index, ®->Indirect);
> + unsigned file = reg->Register.File;
> + unsigned reg_index = reg->Register.Index;
> + LLVMValueRef array = store_value_to_array(bld_base, value, file, chan_index,
> + reg_index, ®->Indirect);
> + if (get_alloca_for_array(bld_base, file, reg_index)) {
> + continue;
> + }
> for (i = 0; i < size; ++i) {
> switch(reg->Register.File) {
> case TGSI_FILE_OUTPUT:
> @@ -477,7 +589,7 @@ void radeon_llvm_emit_store(
> break;
>
> default:
> - return;
> + continue;
> }
> value = LLVMBuildExtractElement(builder, array,
> lp_build_const_int32(gallivm, i), "");
> @@ -493,14 +605,23 @@ void radeon_llvm_emit_store(
> break;
>
> case TGSI_FILE_TEMPORARY:
> + {
> + LLVMValueRef array;
> if (reg->Register.Index >= ctx->temps_count)
> continue;
> + array = get_alloca_for_array(bld_base, reg->Register.File, reg->Register.Index);
> +
> + if (array) {
> + store_value_to_array(bld_base, value, reg->Register.File, chan_index, reg->Register.Index,
> + NULL);
> + continue;
> + }
> temp_ptr = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index];
> if (dtype == TGSI_TYPE_DOUBLE)
> temp_ptr2 = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index + 1];
>
> break;
> -
> + }
> default:
> return;
> }
> --
> 2.0.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list