[Mesa-dev] [PATCH 3/3] radeon/llvm: Use alloca instructions for larger arrays

Thu May 26 14:26:47 UTC 2016

For the series:

Reviewed-by: Marek Olšák <marek.olsak at amd.com>

Marek

On Wed, May 25, 2016 at 2:35 PM, Tom Stellard <thomas.stellard at amd.com> wrote:
> We were storing arrays in vectors, which was leading to some really bad
> spill code for large arrays.  allocas instructions are a better fit for
> arrays and LLVM optimizations are more geared toward dealing with
> allocas instead of vectors.
>
> For arrays that have 16 or less 32-bit elements, we will continue to use
> vectors, because this will force LLVM to store them in registers and
> use indirect registers, which is usually faster for small arrays.
>
> In the future we should use allocas for all arrays and teach LLVM
> how to store allocas in registers.
>
> This fixes the piglit test:
>
> spec/glsl-1.50/execution/geometry/max-input-component
> ---
>  src/gallium/drivers/radeon/radeon_llvm.h           |   7 +-
>  .../drivers/radeon/radeon_setup_tgsi_llvm.c        | 169 ++++++++++++++++++---
>  2 files changed, 151 insertions(+), 25 deletions(-)
>
> diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
> index 3e11b36..5b524b6 100644
> --- a/src/gallium/drivers/radeon/radeon_llvm.h
> +++ b/src/gallium/drivers/radeon/radeon_llvm.h
> @@ -50,6 +50,11 @@ struct radeon_llvm_loop {
>         LLVMBasicBlockRef endloop_block;
>  };
>
> +struct radeon_llvm_array {
> +       struct tgsi_declaration_range range;
> +       LLVMValueRef alloca;
> +};
> +
>  struct radeon_llvm_context {
>         struct lp_build_tgsi_soa_context soa;
>
> @@ -96,7 +101,7 @@ struct radeon_llvm_context {
>         unsigned loop_depth;
>         unsigned loop_depth_max;
>
> -       struct tgsi_declaration_range *arrays;
> +       struct radeon_llvm_array *arrays;
>
>         LLVMValueRef main_fn;
>         LLVMTypeRef return_type;
> diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
> index 93bc307..cb35390 100644
> --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
> +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
> @@ -83,11 +83,25 @@ static LLVMValueRef emit_swizzle(
>
>  static struct tgsi_declaration_range
>  get_array_range(struct lp_build_tgsi_context *bld_base,
> -               unsigned File, const struct tgsi_ind_register *reg)
> +               unsigned File, unsigned reg_index,
> +               const struct tgsi_ind_register *reg)
>  {
>         struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
>
> -       if (File != TGSI_FILE_TEMPORARY || reg->ArrayID == 0 ||
> +       if (!reg) {
> +               unsigned i;
> +               unsigned num_arrays = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
> +               for (i = 0; i < num_arrays; i++) {
> +                       const struct tgsi_declaration_range *range =
> +                                               &ctx->arrays[i].range;
> +
> +                       if (reg_index >= range->First && reg_index <= range->Last) {
> +                               return ctx->arrays[i].range;
> +                       }
> +               }
> +       }
> +
> +       if (File != TGSI_FILE_TEMPORARY || !reg || reg->ArrayID == 0 ||
>             reg->ArrayID > bld_base->info->array_max[TGSI_FILE_TEMPORARY]) {
>                 struct tgsi_declaration_range range;
>                 range.First = 0;
> @@ -95,7 +109,32 @@ get_array_range(struct lp_build_tgsi_context *bld_base,
>                 return range;
>         }
>
> -       return ctx->arrays[reg->ArrayID - 1];
> +       return ctx->arrays[reg->ArrayID - 1].range;
> +}
> +
> +static LLVMValueRef get_alloca_for_array(
> +       struct lp_build_tgsi_context *bld_base,
> +       unsigned file,
> +       unsigned index) {
> +
> +       unsigned i;
> +       unsigned num_arrays;
> +       struct radeon_llvm_context *ctx = radeon_llvm_context(bld_base);
> +
> +       if (file != TGSI_FILE_TEMPORARY) {
> +               return NULL;
> +       }
> +
> +       num_arrays = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
> +       for (i = 0; i < num_arrays; i++) {
> +               const struct tgsi_declaration_range *range =
> +                                               &ctx->arrays[i].range;
> +
> +               if (index >= range->First && index <= range->Last) {
> +                       return ctx->arrays[i].alloca;
> +               }
> +       }
> +       return NULL;
>  }
>
>  static LLVMValueRef
> @@ -106,6 +145,9 @@ emit_array_index(
>  {
>         struct gallivm_state * gallivm = bld->bld_base.base.gallivm;
>
> +       if (!reg) {
> +               return lp_build_const_int32(gallivm, offset);
> +       }
>         LLVMValueRef addr = LLVMBuildLoad(gallivm->builder, bld->addr[reg->Index][reg->Swizzle], "");
>         return LLVMBuildAdd(gallivm->builder, addr, lp_build_const_int32(gallivm, offset), "");
>  }
> @@ -154,7 +196,7 @@ emit_array_fetch(
>                 tmp_reg.Register.Index = i + range.First;
>                 LLVMValueRef temp = radeon_llvm_emit_fetch(bld_base, &tmp_reg, type, swizzle);
>                 result = LLVMBuildInsertElement(builder, result, temp,
> -                       lp_build_const_int32(gallivm, i), "");
> +                       lp_build_const_int32(gallivm, i), "array_vector");
>         }
>         return result;
>  }
> @@ -169,13 +211,35 @@ load_value_from_array(
>         const struct tgsi_ind_register *reg_indirect)
>  {
>         struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
> -       LLVMBuilderRef builder = bld_base->base.gallivm->builder;
> -       struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_indirect);
> +       struct gallivm_state *gallivm = bld_base->base.gallivm;
> +       LLVMBuilderRef builder = gallivm->builder;
> +       struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_index, reg_indirect);
> +       LLVMValueRef index = emit_array_index(bld, reg_indirect, reg_index - range.First);
> +       LLVMValueRef array = get_alloca_for_array(bld_base, file, reg_index);
> +       LLVMValueRef ptr, val, indices[2];
> +
> +       if (!array) {
> +               /* Handle the case where the array is stored as a vector. */
> +               return LLVMBuildExtractElement(builder,
> +                               emit_array_fetch(bld_base, file, type, range, swizzle),
> +                               index, "");
> +       }
>
> -       return LLVMBuildExtractElement(builder,
> -                       emit_array_fetch(bld_base, file, type, range, swizzle),
> -                       emit_array_index(bld, reg_indirect, reg_index - range.First), "");
> +       index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, TGSI_NUM_CHANNELS), "");
> +       index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, swizzle), "");
> +       indices[0] = bld_base->uint_bld.zero;
> +       indices[1] = index;
> +       ptr = LLVMBuildGEP(builder, array, indices, 2, "");
> +       val = LLVMBuildLoad(builder, ptr, "");
> +       if (type == TGSI_TYPE_DOUBLE) {
> +               LLVMValueRef ptr_hi, val_hi;
> +               indices[0] = lp_build_const_int32(gallivm, 1);
> +               ptr_hi = LLVMBuildGEP(builder, ptr, indices, 1, "");
> +               val_hi = LLVMBuildLoad(builder, ptr_hi, "");
> +               val = radeon_llvm_emit_fetch_double(bld_base, val, val_hi);
>
> +       }
> +       return val;
>  }
>
>  static LLVMValueRef store_value_to_array(
> @@ -187,13 +251,26 @@ static LLVMValueRef store_value_to_array(
>         const struct tgsi_ind_register *reg_indirect)
>  {
>         struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
> -       LLVMBuilderRef builder = bld_base->base.gallivm->builder;
> -       struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_indirect);
> -
> -       return LLVMBuildInsertElement(builder,
> +       struct gallivm_state *gallivm = bld_base->base.gallivm;
> +       LLVMBuilderRef builder = gallivm->builder;
> +       struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_index, reg_indirect);
> +       LLVMValueRef index = emit_array_index(bld, reg_indirect, reg_index - range.First);
> +       LLVMValueRef array = get_alloca_for_array(bld_base, file, reg_index);
> +
> +       if (array) {
> +               LLVMValueRef indices[2];
> +               index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, TGSI_NUM_CHANNELS), "");
> +               index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, chan_index), "");
> +               indices[0] = bld_base->uint_bld.zero;
> +               indices[1] = index;
> +               LLVMValueRef pointer = LLVMBuildGEP(builder, array, indices, 2, "");
> +               LLVMBuildStore(builder, value, pointer);
> +               return NULL;
> +       } else {
> +               return LLVMBuildInsertElement(builder,
>                                 emit_array_fetch(bld_base, file, TGSI_TYPE_FLOAT, range, chan_index),
> -                               value,  emit_array_index(bld, reg_indirect, reg_index - range.First), "");
> -       return NULL;
> +                               value, index, "");
> +       }
>  }
>
>  LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
> @@ -217,8 +294,9 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
>         }
>
>         if (reg->Register.Indirect) {
> -               return load_value_from_array(bld_base, reg->Register.File, type,
> +               LLVMValueRef load = load_value_from_array(bld_base, reg->Register.File, type,
>                                 swizzle, reg->Register.Index, &reg->Indirect);
> +               return bitcast(bld_base, type, load);
>         }
>
>         switch(reg->Register.File) {
> @@ -257,6 +335,11 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
>                                                  LLVMBuildLoad(builder, ptr, ""),
>                                                  LLVMBuildLoad(builder, ptr2, ""));
>                 }
> +               LLVMValueRef array = get_alloca_for_array(bld_base, reg->Register.File, reg->Register.Index);
> +               if (array) {
> +                       return bitcast(bld_base, type, load_value_from_array(bld_base, reg->Register.File, type,
> +                                       swizzle, reg->Register.Index, NULL));
> +               }
>                 result = LLVMBuildLoad(builder, ptr, "");
>                 break;
>
> @@ -309,6 +392,7 @@ static void emit_declaration(
>         const struct tgsi_full_declaration *decl)
>  {
>         struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
> +       LLVMBuilderRef builder = bld_base->base.gallivm->builder;
>         unsigned first, last, i, idx;
>         switch(decl->Declaration.File) {
>         case TGSI_FILE_ADDRESS:
> @@ -326,13 +410,36 @@ static void emit_declaration(
>         }
>
>         case TGSI_FILE_TEMPORARY:
> +       {
> +               unsigned decl_size;
> +               first = decl->Range.First;
> +               last = decl->Range.Last;
> +               decl_size = 4 * ((last - first) + 1);
>                 if (decl->Declaration.Array) {
> +                       unsigned id = decl->Array.ArrayID - 1;
>                         if (!ctx->arrays) {
>                                 int size = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
> -                               ctx->arrays = MALLOC(sizeof(ctx->arrays[0]) * size);
> +                               ctx->arrays = CALLOC(size, sizeof(ctx->arrays[0]));
> +                       for (i = 0; i < size; ++i) {
> +                               assert(!ctx->arrays[i].alloca);}
>                         }
>
> -                       ctx->arrays[decl->Array.ArrayID - 1] = decl->Range;
> +                       ctx->arrays[id].range = decl->Range;
> +
> +                       /* If the array is more than 16 elements (each element
> +                        * is 32-bits), then store it in a vector.  Storing the
> +                        * array in a vector will causes the compiler to store
> +                        * the array in registers and access it using indirect
> +                        * addressing.  16 is number of vector elements that
> +                        * LLVM will store in a register.
> +                        * FIXME: We shouldn't need to do this.  LLVM should be
> +                        * smart enough to promote allocas int registers when
> +                        * profitable.
> +                        */
> +                       if (decl_size > 16) {
> +                               ctx->arrays[id].alloca = LLVMBuildAlloca(builder,
> +                                       LLVMArrayType(bld_base->base.vec_type, decl_size),"array");
> +                       }
>                 }
>                 first = decl->Range.First;
>                 last = decl->Range.Last;
> @@ -349,7 +456,7 @@ static void emit_declaration(
>                         }
>                 }
>                 break;
> -
> +       }
>         case TGSI_FILE_INPUT:
>         {
>                 unsigned idx;
> @@ -459,11 +566,16 @@ void radeon_llvm_emit_store(
>
>                 if (reg->Register.Indirect) {
>                         struct tgsi_declaration_range range = get_array_range(bld_base,
> -                               reg->Register.File, &reg->Indirect);
> +                               reg->Register.File, reg->Register.Index, &reg->Indirect);
>
>                         unsigned i, size = range.Last - range.First + 1;
> -                       LLVMValueRef array = store_value_to_array(bld_base, value, reg->Register.File, chan_index,
> -                                                                 reg->Register.Index, &reg->Indirect);
> +                       unsigned file = reg->Register.File;
> +                       unsigned reg_index = reg->Register.Index;
> +                       LLVMValueRef array = store_value_to_array(bld_base, value, file, chan_index,
> +                                                                 reg_index, &reg->Indirect);
> +                       if (get_alloca_for_array(bld_base, file, reg_index)) {
> +                               continue;
> +                       }
>                         for (i = 0; i < size; ++i) {
>                                 switch(reg->Register.File) {
>                                 case TGSI_FILE_OUTPUT:
> @@ -477,7 +589,7 @@ void radeon_llvm_emit_store(
>                                         break;
>
>                                 default:
> -                                       return;
> +                                       continue;
>                                 }
>                                 value = LLVMBuildExtractElement(builder, array,
>                                         lp_build_const_int32(gallivm, i), "");
> @@ -493,14 +605,23 @@ void radeon_llvm_emit_store(
>                                 break;
>
>                         case TGSI_FILE_TEMPORARY:
> +                       {
> +                               LLVMValueRef array;
>                                 if (reg->Register.Index >= ctx->temps_count)
>                                         continue;
> +                               array = get_alloca_for_array(bld_base, reg->Register.File, reg->Register.Index);
> +
> +                               if (array) {
> +                                       store_value_to_array(bld_base, value, reg->Register.File, chan_index, reg->Register.Index,
> +                                                               NULL);
> +                                       continue;
> +                               }
>                                 temp_ptr = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index];
>                                 if (dtype == TGSI_TYPE_DOUBLE)
>                                         temp_ptr2 = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index + 1];
>
>                                 break;
> -
> +                       }
>                         default:
>                                 return;
>                         }
> --
> 2.0.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev