[Mesa-dev] [PATCH 2/6] gallivm: optimize SoA AoS fallback fetch path a little

Tue Dec 20 21:04:20 UTC 2016

Am 20.12.2016 um 15:13 schrieb Jose Fonseca:
> On 12/12/16 00:11, sroland at vmware.com wrote:
>> From: Roland Scheidegger <sroland at vmware.com>
>>
>> We should do transpose, not extract/insert, at least with "sufficient"
>> amount
>> of channels (for 4 channels, extract/insert shuffles generated
>> otherwise look
>> truly terrifying). Albeit we shouldn't fallback to that so often in
>> any case.
>> ---
>>  src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 83
>> +++++++++++++++++++----
>>  1 file changed, 70 insertions(+), 13 deletions(-)
>>
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
>> b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
>> index 389bfa0..902c763 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
>> @@ -40,6 +40,39 @@
>>  #include "lp_bld_debug.h"
>>  #include "lp_bld_format.h"
>>  #include "lp_bld_arit.h"
>> +#include "lp_bld_pack.h"
>> +
>> +
>> +static void
>> +convert_to_soa(struct gallivm_state *gallivm,
>> +               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
>> +               LLVMValueRef dst_soa[4],
>> +               const struct lp_type soa_type)
>> +{
>> +   unsigned j, k;
>> +   struct lp_type aos_channel_type = soa_type;
>> +
>> +   LLVMValueRef aos_channels[4];
>> +   unsigned pixels_per_channel = soa_type.length / 4;
>> +
>> +   debug_assert((soa_type.length % 4) == 0);
>> +
>> +   aos_channel_type.length >>= 1;
>> +
>> +   for (j = 0; j < 4; ++j) {
>> +      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
>> +
>> +      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
>> +
>> +      for (k = 0; k < pixels_per_channel; ++k) {
>> +         channel[k] = src_aos[j + 4 * k];
>> +      }
>> +
>> +      aos_channels[j] = lp_build_concat(gallivm, channel,
>> aos_channel_type, pixels_per_channel);
>> +   }
>> +
>> +   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
>> +}
>>
>>
>>  void
>> @@ -48,9 +81,6 @@ lp_build_format_swizzle_soa(const struct
>> util_format_description *format_desc,
>>                              const LLVMValueRef *unswizzled,
>>                              LLVMValueRef swizzled_out[4])
>>  {
>> -   assert(PIPE_SWIZZLE_0 == (int)PIPE_SWIZZLE_0);
>> -   assert(PIPE_SWIZZLE_1 == (int)PIPE_SWIZZLE_1);
>> -
>>     if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
>>        enum pipe_swizzle swizzle;
>>        LLVMValueRef depth_or_stencil;
>> @@ -547,9 +577,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state
>> *gallivm,
>>     {
>>        unsigned k, chan;
>>        struct lp_type tmp_type;
>> +      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
>> +      boolean vec_transpose = FALSE;
>>
>>        if (gallivm_debug & GALLIVM_DEBUG_PERF) {
>> -         debug_printf("%s: scalar unpacking of %s\n",
>> +         debug_printf("%s: AoS fetch fallback for %s\n",
>>                        __FUNCTION__, format_desc->short_name);
>>        }
>>
>> @@ -560,12 +592,31 @@ lp_build_fetch_rgba_soa(struct gallivm_state
>> *gallivm,
>>           rgba_out[chan] = lp_build_undef(gallivm, type);
>>        }
>>
>> +      if (format_desc->nr_channels > 2 ||
>> +          format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
>> +         /*
>> +          * Note that vector transpose can be worse. This is because
>> +          * llvm will ensure the missing channels have the correct
>> +          * values, in particular typically 1.0 for the last channel
>> +          * (if they are used or not doesn't matter, usually llvm can't
>> +          * figure this out here probably due to the transpose).
>> +          * But with the extract/insert path, since those missing
>> elements
>> +          * were just directly inserted/extracted llvm can optimize this
>> +          * somewhat (though it still doesn't look great - and not for
>> +          * the compressed formats due to their external fetch funcs).
>> +          * So restrict to cases where we are sure it helps (albeit
>> +          * with 2 channels it MIGHT be worth it at least with AVX).
>> +          * In any case, this is just a bandaid, it does NOT replace
>> proper
>> +          * SoA format unpack.
>> +          */
>> +         vec_transpose = TRUE;
>> +      }
>> +
> 
> There's a burden in maintaining so many code paths -- it raises the
> difficulty bar next time we want to do an optimization --, so if this is
> just a little worse, or only affects the draw, I'd say it's better to
> always use vec_transpose.

It is quite a bit worse. Though actually that analysis was done when
even some single channel formats were using that path (definitely not
just draw, e.g. r16f texture sampling), for them it's worse than for 2
channel formats. That said, the further changes in the series make sure
we don't really hit this path anymore (for any plain formats at least)
so you're right it can go away. I'll nuke it.

Roland

> 
>>        /* loop over number of pixels */
>>        for(k = 0; k < type.length; ++k) {
>>           LLVMValueRef index = lp_build_const_int32(gallivm, k);
>>           LLVMValueRef offset_elem;
>>           LLVMValueRef i_elem, j_elem;
>> -         LLVMValueRef tmp;
>>
>>           offset_elem = LLVMBuildExtractElement(builder, offset,
>>                                                 index, "");
>> @@ -574,20 +625,26 @@ lp_build_fetch_rgba_soa(struct gallivm_state
>> *gallivm,
>>           j_elem = LLVMBuildExtractElement(builder, j, index, "");
>>
>>           /* Get a single float[4]={R,G,B,A} pixel */
>> -         tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
>> -                                       aligned, base_ptr, offset_elem,
>> -                                       i_elem, j_elem, cache);
>> +         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc,
>> tmp_type,
>> +                                                aligned, base_ptr,
>> offset_elem,
>> +                                                i_elem, j_elem, cache);
>>
>>           /*
>>            * Insert the AoS tmp value channels into the SoA result
>> vectors at
>>            * position = 'index'.
>>            */
>> -         for (chan = 0; chan < 4; ++chan) {
>> -            LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan),
>> -            tmp_chan = LLVMBuildExtractElement(builder, tmp,
>> chan_val, "");
>> -            rgba_out[chan] = LLVMBuildInsertElement(builder,
>> rgba_out[chan],
>> -                                                    tmp_chan, index,
>> "");
>> +         if (!vec_transpose) {
>> +            for (chan = 0; chan < 4; ++chan) {
>> +               LLVMValueRef chan_val = lp_build_const_int32(gallivm,
>> chan),
>> +               tmp_chan = LLVMBuildExtractElement(builder,
>> aos_fetch[k], chan_val, "");
>> +               rgba_out[chan] = LLVMBuildInsertElement(builder,
>> rgba_out[chan],
>> +                                                       tmp_chan,
>> index, "");
>> +            }
>>           }
>>        }
>> +      if (vec_transpose) {
>> +         convert_to_soa(gallivm, aos_fetch, rgba_out, type);
>> +      }
>> +
>>     }
>>  }
>>
> 
> 
> Jose