[Mesa-dev] [PATCH 2/6] gallivm: optimize SoA AoS fallback fetch path a little

Tue Dec 20 14:13:20 UTC 2016

On 12/12/16 00:11, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> We should do transpose, not extract/insert, at least with "sufficient" amount
> of channels (for 4 channels, extract/insert shuffles generated otherwise look
> truly terrifying). Albeit we shouldn't fallback to that so often in any case.
> ---
>  src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 83 +++++++++++++++++++----
>  1 file changed, 70 insertions(+), 13 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> index 389bfa0..902c763 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> @@ -40,6 +40,39 @@
>  #include "lp_bld_debug.h"
>  #include "lp_bld_format.h"
>  #include "lp_bld_arit.h"
> +#include "lp_bld_pack.h"
> +
> +
> +static void
> +convert_to_soa(struct gallivm_state *gallivm,
> +               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
> +               LLVMValueRef dst_soa[4],
> +               const struct lp_type soa_type)
> +{
> +   unsigned j, k;
> +   struct lp_type aos_channel_type = soa_type;
> +
> +   LLVMValueRef aos_channels[4];
> +   unsigned pixels_per_channel = soa_type.length / 4;
> +
> +   debug_assert((soa_type.length % 4) == 0);
> +
> +   aos_channel_type.length >>= 1;
> +
> +   for (j = 0; j < 4; ++j) {
> +      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
> +
> +      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
> +
> +      for (k = 0; k < pixels_per_channel; ++k) {
> +         channel[k] = src_aos[j + 4 * k];
> +      }
> +
> +      aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
> +   }
> +
> +   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
> +}
>
>
>  void
> @@ -48,9 +81,6 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
>                              const LLVMValueRef *unswizzled,
>                              LLVMValueRef swizzled_out[4])
>  {
> -   assert(PIPE_SWIZZLE_0 == (int)PIPE_SWIZZLE_0);
> -   assert(PIPE_SWIZZLE_1 == (int)PIPE_SWIZZLE_1);
> -
>     if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
>        enum pipe_swizzle swizzle;
>        LLVMValueRef depth_or_stencil;
> @@ -547,9 +577,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
>     {
>        unsigned k, chan;
>        struct lp_type tmp_type;
> +      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
> +      boolean vec_transpose = FALSE;
>
>        if (gallivm_debug & GALLIVM_DEBUG_PERF) {
> -         debug_printf("%s: scalar unpacking of %s\n",
> +         debug_printf("%s: AoS fetch fallback for %s\n",
>                        __FUNCTION__, format_desc->short_name);
>        }
>
> @@ -560,12 +592,31 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
>           rgba_out[chan] = lp_build_undef(gallivm, type);
>        }
>
> +      if (format_desc->nr_channels > 2 ||
> +          format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
> +         /*
> +          * Note that vector transpose can be worse. This is because
> +          * llvm will ensure the missing channels have the correct
> +          * values, in particular typically 1.0 for the last channel
> +          * (if they are used or not doesn't matter, usually llvm can't
> +          * figure this out here probably due to the transpose).
> +          * But with the extract/insert path, since those missing elements
> +          * were just directly inserted/extracted llvm can optimize this
> +          * somewhat (though it still doesn't look great - and not for
> +          * the compressed formats due to their external fetch funcs).
> +          * So restrict to cases where we are sure it helps (albeit
> +          * with 2 channels it MIGHT be worth it at least with AVX).
> +          * In any case, this is just a bandaid, it does NOT replace proper
> +          * SoA format unpack.
> +          */
> +         vec_transpose = TRUE;
> +      }
> +

There's a burden in maintaining so many code paths -- it raises the 
difficulty bar next time we want to do an optimization --, so if this is 
just a little worse, or only affects the draw, I'd say it's better to 
always use vec_transpose.

>        /* loop over number of pixels */
>        for(k = 0; k < type.length; ++k) {
>           LLVMValueRef index = lp_build_const_int32(gallivm, k);
>           LLVMValueRef offset_elem;
>           LLVMValueRef i_elem, j_elem;
> -         LLVMValueRef tmp;
>
>           offset_elem = LLVMBuildExtractElement(builder, offset,
>                                                 index, "");
> @@ -574,20 +625,26 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
>           j_elem = LLVMBuildExtractElement(builder, j, index, "");
>
>           /* Get a single float[4]={R,G,B,A} pixel */
> -         tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
> -                                       aligned, base_ptr, offset_elem,
> -                                       i_elem, j_elem, cache);
> +         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
> +                                                aligned, base_ptr, offset_elem,
> +                                                i_elem, j_elem, cache);
>
>           /*
>            * Insert the AoS tmp value channels into the SoA result vectors at
>            * position = 'index'.
>            */
> -         for (chan = 0; chan < 4; ++chan) {
> -            LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan),
> -            tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, "");
> -            rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
> -                                                    tmp_chan, index, "");
> +         if (!vec_transpose) {
> +            for (chan = 0; chan < 4; ++chan) {
> +               LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan),
> +               tmp_chan = LLVMBuildExtractElement(builder, aos_fetch[k], chan_val, "");
> +               rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
> +                                                       tmp_chan, index, "");
> +            }
>           }
>        }
> +      if (vec_transpose) {
> +         convert_to_soa(gallivm, aos_fetch, rgba_out, type);
> +      }
> +
>     }
>  }
>

Jose