[Mesa-dev] [PATCH 2/2] gallivm: only fetch pointers to constant buffers once

Roland Scheidegger sroland at vmware.com
Wed May 14 08:14:39 PDT 2014


Am 14.05.2014 13:43, schrieb Jose Fonseca:
> 
> 
> ----- Original Message -----
>> From: Roland Scheidegger <sroland at vmware.com>
>>
>> In 1d35f77228ad540a551a8e09e062b764a6e31f5e support for multiple constant
>> buffers was introduced. This meant we had another indirection, and we did
>> resolve the indirection for each constant buffer access. This looks very
>> reasonable since llvm can figure out if it's the same pointer, however it
>> turns out that this can cause llvm compilation time to go through the roof
>> and beyond (I've seen cases in excess of factor 100, e.g. from 50 ms to more
>> than 10 seconds (!)), with all the additional time spent in IR optimization
>> passes (and in the end all of it in DominatorTree::dominate()).
>> I've been unable to narrow it down a bit more (only some shaders seem
>> affected,
>> seemingly without much correlation to overall shader complexity or constant
>> usage) but it is easily avoidable by doing the buffer lookups themeselves
>> just
>> once (at constant buffer declaration time).
>> ---
>>  src/gallium/auxiliary/gallivm/lp_bld_tgsi.h     |   2 +
>>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 100
>>  +++++++++++++++---------
>>  2 files changed, 65 insertions(+), 37 deletions(-)
>>
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
>> b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
>> index ffd6e87..88ac3c9 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
>> @@ -437,6 +437,8 @@ struct lp_build_tgsi_soa_context
>>  
>>     LLVMValueRef consts_ptr;
>>     LLVMValueRef const_sizes_ptr;
>> +   LLVMValueRef consts[LP_MAX_TGSI_CONST_BUFFERS];
>> +   LLVMValueRef consts_sizes[LP_MAX_TGSI_CONST_BUFFERS];
>>     const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS];
>>     LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS];
>>  
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
>> b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
>> index 2b47fc2..0eaa020 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
>> @@ -1213,7 +1213,6 @@ emit_fetch_constant(
>>     LLVMBuilderRef builder = gallivm->builder;
>>     struct lp_build_context *uint_bld = &bld_base->uint_bld;
>>     unsigned dimension = 0;
>> -   LLVMValueRef dimension_index;
>>     LLVMValueRef consts_ptr;
>>     LLVMValueRef num_consts;
>>     LLVMValueRef res;
>> @@ -1227,11 +1226,8 @@ emit_fetch_constant(
>>        assert(dimension < LP_MAX_TGSI_CONST_BUFFERS);
>>     }
>>  
>> -   dimension_index = lp_build_const_int32(gallivm, dimension);
>> -   consts_ptr =
>> -      lp_build_array_get(gallivm, bld->consts_ptr, dimension_index);
>> -   num_consts =
>> -      lp_build_array_get(gallivm, bld->const_sizes_ptr, dimension_index);
>> +   consts_ptr = bld->consts[dimension];
>> +   num_consts = bld->consts_sizes[dimension];
>>  
>>     if (reg->Register.Indirect) {
>>        LLVMValueRef indirect_index;
>> @@ -2677,56 +2673,86 @@ lp_emit_declaration_soa(
>>     const unsigned last = decl->Range.Last;
>>     unsigned idx, i;
>>  
>> -   for (idx = first; idx <= last; ++idx) {
>> -      assert(last <= bld->bld_base.info->file_max[decl->Declaration.File]);
>> -      switch (decl->Declaration.File) {
>> -      case TGSI_FILE_TEMPORARY:
>> -         if (!(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY))) {
>> -            assert(idx < LP_MAX_INLINED_TEMPS);
>> +   assert(last <= bld->bld_base.info->file_max[decl->Declaration.File]);
>> +
>> +   switch (decl->Declaration.File) {
>> +   case TGSI_FILE_TEMPORARY:
>> +      if (!(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY))) {
>> +         assert(last < LP_MAX_INLINED_TEMPS);
>> +         for (idx = first; idx <= last; ++idx) {
>>              for (i = 0; i < TGSI_NUM_CHANNELS; i++)
>>                 bld->temps[idx][i] = lp_build_alloca(gallivm, vec_type,
>>                 "temp");
>>           }
>> -         break;
>> +      }
>> +      break;
>>  
>> -      case TGSI_FILE_OUTPUT:
>> -         if (!(bld->indirect_files & (1 << TGSI_FILE_OUTPUT))) {
>> +   case TGSI_FILE_OUTPUT:
>> +      if (!(bld->indirect_files & (1 << TGSI_FILE_OUTPUT))) {
>> +         for (idx = first; idx <= last; ++idx) {
>>              for (i = 0; i < TGSI_NUM_CHANNELS; i++)
>>                 bld->outputs[idx][i] = lp_build_alloca(gallivm,
>>                                                        vec_type, "output");
>>           }
>> -         break;
>> +      }
>> +      break;
>>  
>> -      case TGSI_FILE_ADDRESS:
>> -	 /* ADDR registers are only allocated with an integer LLVM IR type,
>> -	  * as they are guaranteed to always have integers.
>> -	  * XXX: Not sure if this exception is worthwhile (or the whole idea of
>> -	  * an ADDR register for that matter).
>> -	  */
>> +   case TGSI_FILE_ADDRESS:
>> +      /* ADDR registers are only allocated with an integer LLVM IR type,
>> +       * as they are guaranteed to always have integers.
>> +       * XXX: Not sure if this exception is worthwhile (or the whole idea of
>> +       * an ADDR register for that matter).
>> +       */
>> +      assert(last < LP_MAX_TGSI_ADDRS);
>> +      for (idx = first; idx <= last; ++idx) {
>>           assert(idx < LP_MAX_TGSI_ADDRS);
>>           for (i = 0; i < TGSI_NUM_CHANNELS; i++)
>>              bld->addr[idx][i] = lp_build_alloca(gallivm,
>>              bld_base->base.int_vec_type, "addr");
>> -         break;
>> +      }
>> +      break;
>>  
>> -      case TGSI_FILE_PREDICATE:
>> -         assert(idx < LP_MAX_TGSI_PREDS);
>> +   case TGSI_FILE_PREDICATE:
>> +      assert(last < LP_MAX_TGSI_PREDS);
>> +      for (idx = first; idx <= last; ++idx) {
>>           for (i = 0; i < TGSI_NUM_CHANNELS; i++)
>>              bld->preds[idx][i] = lp_build_alloca(gallivm, vec_type,
>>                                                   "predicate");
>> -         break;
>> +      }
>> +      break;
>>  
>> -      case TGSI_FILE_SAMPLER_VIEW:
>> -         /*
>> -          * The target stored here MUST match whatever there actually
>> -          * is in the set sampler views (what about return type?).
>> -          */
>> -         assert(idx < PIPE_MAX_SHADER_SAMPLER_VIEWS);
>> +   case TGSI_FILE_SAMPLER_VIEW:
>> +      /*
>> +       * The target stored here MUST match whatever there actually
>> +       * is in the set sampler views (what about return type?).
>> +       */
>> +      assert(last < PIPE_MAX_SHADER_SAMPLER_VIEWS);
>> +      for (idx = first; idx <= last; ++idx) {
>>           bld->sv[idx] = decl->SamplerView;
>> -         break;
>> -
>> -      default:
>> -         /* don't need to declare other vars */
>> -         break;
>>        }
>> +      break;
>> +
>> +   case TGSI_FILE_CONSTANT:
>> +   {
>> +      /*
>> +       * We could trivially fetch the per-buffer pointer when fetching the
>> +       * constant, relying on llvm to figure out it's always the same
>> pointer
>> +       * anyway. However, doing so results in a huge (more than factor of
>> 10)
>> +       * slowdown in llvm compilation times for some (but not all) shaders
>> +       * (more specifically, the IR optimization spends way more time in
>> +       * DominatorTree::dominates). At least with llvm versions 3.1, 3.3.
>> +       */
>> +      unsigned idx2D = decl->Dim.Index2D;
>> +      LLVMValueRef index2D = lp_build_const_int32(gallivm, idx2D);
>> +      assert(idx2D < LP_MAX_TGSI_CONST_BUFFERS);
>> +      bld->consts[idx2D] =
>> +         lp_build_array_get(gallivm, bld->consts_ptr, index2D);
>> +      bld->consts_sizes[idx2D] =
>> +         lp_build_array_get(gallivm, bld->const_sizes_ptr, index2D);
>> +   }
>> +      break;
>> +
>> +   default:
>> +      /* don't need to declare other vars */
>> +      break;
>>     }
>>  }
> 
> Looks good.
> 
> Reviewed-by: Jose Fonseca <jfonseca at vmware.com>
> 
> I wonder if there are other bits from the state that we should putting in variables at the top code block. 

I looked at it and nothing really stood out which would be an obvious
candidate. Of course, indirect access to registers works similarly but
that can't be changed (and I suspect this won't be an issue with compile
times there).
(Note that it's not actually at the top code block if you look at the
complete fs, which would require us to move that code to outside the fs
loop, that is outside lp_build_tgsi_soa and changing the interface
instead - doable but I don't know if there would be any benefits,
probably not.)

Roland


More information about the mesa-dev mailing list