[Mesa-dev] [PATCH 4/6] gallivm: provide soa fetch path handling formats with more than 32bit
Jose Fonseca
jfonseca at vmware.com
Tue Dec 20 14:23:15 UTC 2016
On 12/12/16 00:12, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> This previously always fell back to AoS conversion. Even for 4-float formats
> (which is the optimal case by far for that fallback case) this was suboptimal,
> since it meant the conversion couldn't be done with 256bit vectors. While this
> may still only be partly possible for some formats (unless there's AVX2
> support), at least the transpose can be done with half the unpacks
> (and before using the transpose for AoS fallbacks, it was worse still).
> With fewer than 4 channels, things quickly got far worse with the AoS
> fallback, even with 128bit vectors.
> The strategy is pretty much the same as the existing one for formats
> which fit into 32 bits, except there's now multiple vectors to be
> fetched (2 or 4 to be exact), which need to be shuffled first (if it's 4
> vectors, this amounts to a transpose, for 2 it's a bit different),
> then the unpack is done the same (with the exception that the shift
> of the channels is now modulo 32, and we need to select the right
> vector).
> In fact the most complex part about it is to get the shuffles right
> for separating into lo/hi parts for AVX/AVX2...
> This also makes use of the new ability of gather to use provided type
> information, which we abuse to outsmart llvm so we get decent shuffles,
> and to fetch 3x32bit vectors without having to ZExt the scalar.
> And just because we can, we handle double formats too, albeit they are
> a bit different (draw sometimes needs to handle that).
> ---
> src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 529 +++++++++++++++-------
> 1 file changed, 375 insertions(+), 154 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> index b3ea709..9550f26 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> @@ -31,6 +31,7 @@
> #include "util/u_format.h"
> #include "util/u_memory.h"
> #include "util/u_string.h"
> +#include "util/u_math.h"
>
> #include "lp_bld_type.h"
> #include "lp_bld_const.h"
> @@ -113,6 +114,166 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
> }
>
>
> +
> +static LLVMValueRef
> +lp_build_extract_soa_chan(struct lp_build_context *bld,
> + unsigned blockbits,
> + boolean srgb_chan,
> + struct util_format_channel_description chan_desc,
> + LLVMValueRef packed)
> +{
> + struct gallivm_state *gallivm = bld->gallivm;
> + LLVMBuilderRef builder = gallivm->builder;
> + struct lp_type type = bld->type;
> + LLVMValueRef input = packed;
> + const unsigned width = chan_desc.size;
> + const unsigned start = chan_desc.shift;
> + const unsigned stop = start + width;
> +
> + /* Decode the input vector component */
> +
> + switch(chan_desc.type) {
> + case UTIL_FORMAT_TYPE_VOID:
> + input = bld->undef;
> + break;
> +
> + case UTIL_FORMAT_TYPE_UNSIGNED:
> + /*
> + * Align the LSB
> + */
> + if (start) {
> + input = LLVMBuildLShr(builder, input,
> + lp_build_const_int_vec(gallivm, type, start), "");
> + }
> +
> + /*
> + * Zero the MSBs
> + */
> + if (stop < blockbits) {
> + unsigned mask = ((unsigned long long)1 << width) - 1;
> + input = LLVMBuildAnd(builder, input,
> + lp_build_const_int_vec(gallivm, type, mask), "");
> + }
> +
> + /*
> + * Type conversion
> + */
> + if (type.floating) {
> + if (srgb_chan) {
> + struct lp_type conv_type = lp_uint_type(type);
> + input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
> + }
> + else {
> + if(chan_desc.normalized)
> + input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
> + else
> + input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
> + }
> + }
> + else if (chan_desc.pure_integer) {
> + /* Nothing to do */
> + } else {
> + /* FIXME */
> + assert(0);
> + }
> + break;
> +
> + case UTIL_FORMAT_TYPE_SIGNED:
> + /*
> + * Align the sign bit first.
> + */
> + if (stop < type.width) {
> + unsigned bits = type.width - stop;
> + LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
> + input = LLVMBuildShl(builder, input, bits_val, "");
> + }
> +
> + /*
> + * Align the LSB (with an arithmetic shift to preserve the sign)
> + */
> + if (chan_desc.size < type.width) {
> + unsigned bits = type.width - chan_desc.size;
> + LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
> + input = LLVMBuildAShr(builder, input, bits_val, "");
> + }
> +
> + /*
> + * Type conversion
> + */
> + if (type.floating) {
> + input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
> + if (chan_desc.normalized) {
> + double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
> + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
> + input = LLVMBuildFMul(builder, input, scale_val, "");
> + /*
> + * The formula above will produce value below -1.0 for most negative
> + * value but everything seems happy with that hence disable for now.
> + */
> + if (0)
> + input = lp_build_max(bld, input,
> + lp_build_const_vec(gallivm, type, -1.0f));
> + }
> + }
> + else if (chan_desc.pure_integer) {
> + /* Nothing to do */
> + } else {
> + /* FIXME */
> + assert(0);
> + }
> + break;
> +
> + case UTIL_FORMAT_TYPE_FLOAT:
> + if (type.floating) {
> + if (chan_desc.size == 16) {
> + struct lp_type f16i_type = type;
> + f16i_type.width /= 2;
> + f16i_type.floating = 0;
> + if (start) {
> + input = LLVMBuildLShr(builder, input,
> + lp_build_const_int_vec(gallivm, type, start), "");
> + }
> + input = LLVMBuildTrunc(builder, input,
> + lp_build_vec_type(gallivm, f16i_type), "");
> + input = lp_build_half_to_float(gallivm, input);
> + } else {
> + assert(start == 0);
> + assert(stop == 32);
> + assert(type.width == 32);
> + }
> + input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
> + }
> + else {
> + /* FIXME */
> + assert(0);
> + input = bld->undef;
> + }
> + break;
> +
> + case UTIL_FORMAT_TYPE_FIXED:
> + if (type.floating) {
> + double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
> + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
> + input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
> + input = LLVMBuildFMul(builder, input, scale_val, "");
> + }
> + else {
> + /* FIXME */
> + assert(0);
> + input = bld->undef;
> + }
> + break;
> +
> + default:
> + assert(0);
> + input = bld->undef;
> + break;
> + }
> +
> + return input;
> +}
> +
> +
> /**
> * Unpack several pixels in SoA.
> *
> @@ -143,7 +304,6 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
> LLVMValueRef packed,
> LLVMValueRef rgba_out[4])
> {
> - LLVMBuilderRef builder = gallivm->builder;
> struct lp_build_context bld;
> LLVMValueRef inputs[4];
> unsigned chan;
> @@ -159,162 +319,19 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
>
> /* Decode the input vector components */
> for (chan = 0; chan < format_desc->nr_channels; ++chan) {
> - const unsigned width = format_desc->channel[chan].size;
> - const unsigned start = format_desc->channel[chan].shift;
> - const unsigned stop = start + width;
> - LLVMValueRef input;
> -
> - input = packed;
> -
> - switch(format_desc->channel[chan].type) {
> - case UTIL_FORMAT_TYPE_VOID:
> - input = lp_build_undef(gallivm, type);
> - break;
> -
> - case UTIL_FORMAT_TYPE_UNSIGNED:
> - /*
> - * Align the LSB
> - */
> -
> - if (start) {
> - input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), "");
> - }
> -
> - /*
> - * Zero the MSBs
> - */
> -
> - if (stop < format_desc->block.bits) {
> - unsigned mask = ((unsigned long long)1 << width) - 1;
> - input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), "");
> - }
> -
> - /*
> - * Type conversion
> - */
> -
> - if (type.floating) {
> - if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
> - if (format_desc->swizzle[3] == chan) {
> - input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
> - }
> - else {
> - struct lp_type conv_type = lp_uint_type(type);
> - input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
> - }
> - }
> - else {
> - if(format_desc->channel[chan].normalized)
> - input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
> - else
> - input = LLVMBuildSIToFP(builder, input,
> - lp_build_vec_type(gallivm, type), "");
> - }
> - }
> - else if (format_desc->channel[chan].pure_integer) {
> - /* Nothing to do */
> - } else {
> - /* FIXME */
> - assert(0);
> - }
> -
> - break;
> -
> - case UTIL_FORMAT_TYPE_SIGNED:
> - /*
> - * Align the sign bit first.
> - */
> -
> - if (stop < type.width) {
> - unsigned bits = type.width - stop;
> - LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
> - input = LLVMBuildShl(builder, input, bits_val, "");
> - }
> + struct util_format_channel_description chan_desc = format_desc->channel[chan];
> + boolean srgb_chan = FALSE;
>
> - /*
> - * Align the LSB (with an arithmetic shift to preserve the sign)
> - */
> -
> - if (format_desc->channel[chan].size < type.width) {
> - unsigned bits = type.width - format_desc->channel[chan].size;
> - LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
> - input = LLVMBuildAShr(builder, input, bits_val, "");
> - }
> -
> - /*
> - * Type conversion
> - */
> -
> - if (type.floating) {
> - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
> - if (format_desc->channel[chan].normalized) {
> - double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
> - LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
> - input = LLVMBuildFMul(builder, input, scale_val, "");
> - /* the formula above will produce value below -1.0 for most negative
> - * value but everything seems happy with that hence disable for now */
> - if (0)
> - input = lp_build_max(&bld, input,
> - lp_build_const_vec(gallivm, type, -1.0f));
> - }
> - }
> - else if (format_desc->channel[chan].pure_integer) {
> - /* Nothing to do */
> - } else {
> - /* FIXME */
> - assert(0);
> - }
> -
> - break;
> -
> - case UTIL_FORMAT_TYPE_FLOAT:
> - if (type.floating) {
> - if (format_desc->channel[chan].size == 16) {
> - struct lp_type f16i_type = type;
> - f16i_type.width /= 2;
> - f16i_type.floating = 0;
> - if (start) {
> - input = LLVMBuildLShr(builder, input,
> - lp_build_const_int_vec(gallivm, type, start), "");
> - }
> - input = LLVMBuildTrunc(builder, input,
> - lp_build_vec_type(gallivm, f16i_type), "");
> - input = lp_build_half_to_float(gallivm, input);
> - } else {
> - assert(start == 0);
> - assert(stop == 32);
> - assert(type.width == 32);
> - }
> - input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), "");
> - }
> - else {
> - /* FIXME */
> - assert(0);
> - input = lp_build_undef(gallivm, type);
> - }
> - break;
> -
> - case UTIL_FORMAT_TYPE_FIXED:
> - if (type.floating) {
> - double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
> - LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
> - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
> - input = LLVMBuildFMul(builder, input, scale_val, "");
> - }
> - else {
> - /* FIXME */
> - assert(0);
> - input = lp_build_undef(gallivm, type);
> - }
> - break;
> -
> - default:
> - assert(0);
> - input = lp_build_undef(gallivm, type);
> - break;
> + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
> + format_desc->swizzle[3] != chan) {
> + srgb_chan = TRUE;
> }
>
> - inputs[chan] = input;
> + inputs[chan] = lp_build_extract_soa_chan(&bld,
> + format_desc->block.bits,
> + srgb_chan,
> + chan_desc,
> + packed);
> }
>
> lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
> @@ -450,6 +467,210 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
> return;
> }
>
> +
> + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
> + (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
> + format_desc->block.width == 1 &&
> + format_desc->block.height == 1 &&
> + format_desc->block.bits > type.width &&
> + ((format_desc->block.bits <= type.width * type.length &&
> + format_desc->channel[0].size <= type.width) ||
> + (format_desc->channel[0].size == 64 &&
> + format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
> + type.floating)))
> + {
> + /*
> + * Similar to above, but the packed pixel is larger than what fits
> + * into an element of the destination format. The packed pixels will be
> + * shuffled into SoA vectors appropriately, and then the extraction will
> + * be done in parallel as much as possible.
> + * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
> + * the gathered vectors can be shuffled easily (even with avx).
> + * 64xn float -> 32xn float is handled too but it's a bit special as
> + * it does the conversion pre-shuffle.
> + */
> +
> + LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
> + struct lp_type fetch_type, gather_type = type;
> + unsigned num_gather, fetch_width, i, j;
> + struct lp_build_context bld;
> + boolean fp64 = format_desc->channel[0].size == 64;
> +
> + lp_build_context_init(&bld, gallivm, type);
> +
> + assert(type.width == 32);
> + assert(format_desc->block.bits > type.width);
> +
> + /*
> + * First, figure out fetch order.
> + */
> + fetch_width = util_next_power_of_two(format_desc->block.bits);
> + num_gather = fetch_width / type.width;
> + /*
> + * fp64 are treated like fp32 except we fetch twice wide values
> + * (as we shuffle after trunc). The shuffles for that work out
> + * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
> + * albeit we miss the potential opportunity for hw gather (as it
> + * only handles native size).
> + */
> + num_gather = fetch_width / type.width;
> + gather_type.width *= num_gather;
> + if (fp64) {
> + num_gather /= 2;
> + }
> + gather_type.length /= num_gather;
> +
> + for (i = 0; i < num_gather; i++) {
> + LLVMValueRef offsetr, shuf_vec;
> + if(num_gather == 4) {
> + for (j = 0; j < gather_type.length; j++) {
> + unsigned idx = i + 4*j;
> + shuffles[j] = lp_build_const_int32(gallivm, idx);
> + }
> + shuf_vec = LLVMConstVector(shuffles, gather_type.length);
> + offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
> +
> + }
> + else if (num_gather == 2) {
> + assert(num_gather == 2);
> + for (j = 0; j < gather_type.length; j++) {
> + unsigned idx = i*2 + (j%2) + (j/2)*4;
> + shuffles[j] = lp_build_const_int32(gallivm, idx);
> + }
> + shuf_vec = LLVMConstVector(shuffles, gather_type.length);
> + offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
> + }
> + else {
> + assert(num_gather == 1);
> + offsetr = offset;
> + }
> + if (gather_type.length == 1) {
> + LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
> + offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
> + }
> +
> + /*
> + * Determine whether to use float or int loads. This is mostly
> + * to outsmart the (stupid) llvm int/float shuffle logic, we
> + * don't really care much if the data is floats or ints...
> + * But llvm will refuse to use single float shuffle with int data
> + * and instead use 3 int shuffles instead, the code looks atrocious.
> + * (Note bitcasts often won't help, as llvm is too smart to be
> + * fooled by that.)
> + * Nobody cares about simd float<->int domain transition penalties,
> + * which usually don't even exist for shuffles anyway.
> + * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
> + * going into transpose, which is unpacks, so doesn't really matter
> + * much).
> + * With 2x32bit or 4x16bit fetch, we use float vec, since those
> + * go into the weird channel separation shuffle. With floats,
> + * this is (with 128bit vectors):
> + * - 2 movq, 2 movhpd, 2 shufps
> + * With ints it would be:
> + * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
> + * I've seen texture functions increase in code size by 15% just due
> + * to that (there's lots of such fetches in them...)
> + * (We could chose a different gather order to improve this somewhat
> + * for the int path, but it would basically just drop the blends,
> + * so the float path with this order really is optimal.)
> + * Albeit it is tricky sometimes llvm doesn't ignore the float->int
> + * casts so must avoid them until we're done with the float shuffle...
> + * 3x16bit formats (the same is also true for 3x8) are pretty bad but
> + * there's nothing we can do about them (we could overallocate by
> + * those couple bytes and use unaligned but pot sized load).
> + * Note that this is very much x86 specific. I don't know if this
> + * affect other archs at all.
> + */
> + if (num_gather > 1) {
> + /*
> + * We always want some float type here (with x86)
> + * due to shuffles being float ones afterwards (albeit for
> + * the num_gather == 4 case int should work fine too
> + * (unless there's some problems with avx but not avx2).
> + */
> + if (format_desc->channel[0].size == 64) {
> + fetch_type = lp_type_float_vec(64, gather_type.width);
> + } else {
> + fetch_type = lp_type_int_vec(32, gather_type.width);
> + }
> + }
> + else {
> + /* type doesn't matter much */
> + if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
> + (format_desc->channel[0].size == 32 ||
> + format_desc->channel[0].size == 64)) {
> + fetch_type = lp_type_float(gather_type.width);
> + } else {
> + fetch_type = lp_type_uint(gather_type.width);
> + }
> + }
> +
> + /* Now finally gather the values */
> + packed[i] = lp_build_gather(gallivm, gather_type.length,
> + format_desc->block.bits,
> + fetch_type, aligned,
> + base_ptr, offsetr, FALSE);
> + if (fp64) {
> + struct lp_type conv_type = type;
> + conv_type.width *= 2;
> + packed[i] = LLVMBuildBitCast(builder, packed[i],
> + lp_build_vec_type(gallivm, conv_type), "");
> + packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
> + }
> + }
> +
> + /* shuffle the gathered values to SoA */
> + if (num_gather == 2) {
> + for (i = 0; i < num_gather; i++) {
> + for (j = 0; j < type.length; j++) {
> + unsigned idx = (j%2)*2 + (j/4)*4 + i;
> + if ((j/2)%2)
> + idx += type.length;
> + shuffles[j] = lp_build_const_int32(gallivm, idx);
> + }
> + dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
> + LLVMConstVector(shuffles, type.length), "");
> + }
> + }
> + else if (num_gather == 4) {
> + lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
> + }
> + else {
> + assert(num_gather == 1);
> + dst[0] = packed[0];
> + }
> +
> + /*
> + * And finally unpack exactly as above, except that
> + * chan shift is adjusted and the right vector selected.
> + */
> + if (!fp64) {
> + for (i = 0; i < num_gather; i++) {
> + dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
> + }
> + for (i = 0; i < format_desc->nr_channels; i++) {
> + struct util_format_channel_description chan_desc = format_desc->channel[i];
> + unsigned blockbits = type.width;
> + unsigned vec_nr = chan_desc.shift / type.width;
> + chan_desc.shift %= type.width;
> +
> + output[i] = lp_build_extract_soa_chan(&bld,
> + blockbits,
> + FALSE,
> + chan_desc,
> + dst[vec_nr]);
> + }
> + }
> + else {
> + for (i = 0; i < format_desc->nr_channels; i++) {
> + output[i] = dst[i];
> + }
> + }
> +
> + lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
> + return;
> + }
> +
> if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
> format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
> /*
>
Looks good AFAICT.
Reviewed-by: Jose Fonseca <jfonseca at vmware.com>
More information about the mesa-dev
mailing list