[Mesa-dev] [PATCH 3/4] gallivm: optimize lp_build_unpack_arith_rgba_aos slightly
Jose Fonseca
jfonseca at vmware.com
Wed Jan 4 16:19:46 UTC 2017
On 21/12/16 04:01, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> This code uses a vector shift which has to be emulated on x86 unless
> there's AVX2. Luckily in some cases we can actually avoid the shift
> altogether, so do that.
> Also make sure we hit the fast lp_build_conv() path when applicable,
> albeit that's quite the hack...
> That said, this path is taken for AoS sampling of small unorm formats
> (smaller than rgba8), and it remains completely hopeless even with these
> changes, with or without AVX.
> (We should probably have code similar to that in the llvmpipe fs backend,
> using bit replication to extend to rgba8888 - the rounding is not quite
> 100% accurate, but if it's good enough there it should be good enough
> here as well.)
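
For reference, the bit replication mentioned above just copies a channel's
top bits into the vacated low bits when widening. A minimal scalar sketch
(plain C, purely illustrative - the helper name and channel order are made
up, this is not the llvmpipe code):

   #include <stdint.h>

   /* Widen rgb565 to rgba8888 via bit replication (illustrative only). */
   static uint32_t
   rgb565_to_rgba8888(uint16_t p)
   {
      uint32_t r5 = (p >> 11) & 0x1f;
      uint32_t g6 = (p >>  5) & 0x3f;
      uint32_t b5 =  p        & 0x1f;
      /* replicate the top bits into the vacated low bits: 5 -> 8, 6 -> 8 */
      uint32_t r8 = (r5 << 3) | (r5 >> 2);
      uint32_t g8 = (g6 << 2) | (g6 >> 4);
      uint32_t b8 = (b5 << 3) | (b5 >> 2);
      return 0xff000000u | (b8 << 16) | (g8 << 8) | r8;
   }

This tracks round(x * 255.0 / 31.0) closely but can be off by one for some
values - the rounding caveat mentioned above.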
> ---
> src/gallium/auxiliary/gallivm/lp_bld_format_aos.c | 116 ++++++++++++++++++----
> 1 file changed, 97 insertions(+), 19 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
> index 322e7b8..574bb64 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
> @@ -38,6 +38,7 @@
> #include "util/u_math.h"
> #include "util/u_pointer.h"
> #include "util/u_string.h"
> +#include "util/u_cpu_detect.h"
>
> #include "lp_bld_arit.h"
> #include "lp_bld_init.h"
> @@ -49,6 +50,7 @@
> #include "lp_bld_gather.h"
> #include "lp_bld_debug.h"
> #include "lp_bld_format.h"
> +#include "lp_bld_pack.h"
> #include "lp_bld_intr.h"
>
>
> @@ -156,6 +158,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
> LLVMValueRef shifts[4];
> LLVMValueRef masks[4];
> LLVMValueRef scales[4];
> + LLVMTypeRef vec32_type;
>
> boolean normalized;
> boolean needs_uitofp;
> @@ -171,19 +174,17 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
> * matches floating point size */
> assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
>
> + vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
> +
> /* Broadcast the packed value to all four channels
> * before: packed = BGRA
> * after: packed = {BGRA, BGRA, BGRA, BGRA}
> */
> - packed = LLVMBuildInsertElement(builder,
> - LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
> - packed,
> + packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
> LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
> "");
> - packed = LLVMBuildShuffleVector(builder,
> - packed,
> - LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
> - LLVMConstNull(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
> + packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
> + LLVMConstNull(vec32_type),
> "");
>
> /* Initialize vector constants */
> @@ -224,9 +225,40 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
> /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
> * into masked = {X, Y, Z, W}
> */
> - /* Note: we cannot do this shift on x86 natively until AVX2. */
> - shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
> - masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
> + if (desc->block.bits < 32 && normalized) {
> + /*
> + * Note: we cannot do the shift below on x86 natively until AVX2.
> + *
> + * Old llvm versions will resort to scalar extract/shift/insert,
> + * which is definitely terrible; new versions will just do
> + * several vector shifts and shuffle/blend the results together.
> + * We could turn this into a variable left shift plus a constant
> + * right shift, and llvm would then turn the variable left shift
> + * into a mul for us (albeit without sse41 the mul needs emulation
> + * too...). However, since we're going to do a float mul
> + * anyway, we just adjust that mul instead (plus the mask), skipping
> + * the shift completely.
> + * We could also use an extra mul when the format isn't normalized and
> + * we don't have AVX2 support, but don't bother for now. Unfortunately,
> + * this strategy doesn't work for 32-bit formats (such as rgb10a2, or even
> + * rgba8 if it ends up here), as that would require UIToFP, albeit that
> + * would be fixable with an easy 16-bit shuffle (unless there are channels
> + * crossing 16-bit boundaries).
> + */
> + for (i = 0; i < 4; ++i) {
> + if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
> + unsigned bits = desc->channel[i].size;
> + unsigned shift = desc->channel[i].shift;
> + unsigned long long mask = ((1ULL << bits) - 1) << shift;
> + scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
> + masks[i] = lp_build_const_int32(gallivm, mask);
> + }
> + }
> + masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
> + } else {
> + shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
> + masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
> + }
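
The trick is nice: with mask = ((1 << bits) - 1) << shift, the masked but
unshifted value divided by mask equals the channel value divided by
(1 << bits) - 1, so folding the shift into the float scale is exact. A
scalar sketch of the identity (illustrative C, not the vectorized code):

   #include <stdint.h>

   /* Unpack one normalized channel without any shift: since
    * (packed & mask) == channel << shift and
    * mask == ((1 << bits) - 1) << shift, a single mul by 1/mask
    * both cancels the shift and normalizes.
    */
   static float
   unpack_norm_channel(uint32_t packed, unsigned bits, unsigned shift)
   {
      uint32_t mask = (uint32_t)(((1ull << bits) - 1) << shift);
      return (float)(packed & mask) * (1.0f / mask);
   }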
>
> if (!needs_uitofp) {
> /* UIToFP can't be expressed in SSE2 */
> @@ -235,8 +267,10 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
> casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
> }
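
(Side note on the needs_uitofp split above: SSE2 only has a signed
int32->float conversion (cvtdq2ps), but whenever the masked value cannot
have bit 31 set, SIToFP and UIToFP agree anyway, e.g.:

   #include <assert.h>
   #include <stdint.h>

   int main(void)
   {
      /* any masked value below 2^31 converts identically either way */
      uint32_t masked = 0x7fffffffu;
      assert((float)(int32_t)masked == (float)masked);
      return 0;
   }

so the cheap signed conversion suffices for all sub-32-bit cases.)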
>
> - /* At this point 'casted' may be a vector of floats such as
> - * {255.0, 255.0, 255.0, 255.0}. Next, if the pixel values are normalized
> + /*
> + * At this point 'casted' may be a vector of floats such as
> + * {255.0, 255.0, 255.0, 255.0} (normalized values may additionally be
> + * multiplied by powers of two). Next, if the pixel values are normalized
> * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
> */
>
> @@ -392,6 +426,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
>
> if (format_matches_type(format_desc, type) &&
> format_desc->block.bits <= type.width * 4 &&
> + /* XXX this shouldn't be needed */
> util_is_power_of_two(format_desc->block.bits)) {
> LLVMValueRef packed;
> LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
> @@ -424,6 +459,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
> format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
> format_desc->block.width == 1 &&
> format_desc->block.height == 1 &&
> + /* XXX this shouldn't be needed */
> util_is_power_of_two(format_desc->block.bits) &&
> format_desc->block.bits <= 32 &&
> format_desc->is_bitmask &&
> @@ -433,8 +469,24 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
> !format_desc->channel[0].pure_integer) {
>
> LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
> - LLVMValueRef res;
> - unsigned k;
> + LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
> + struct lp_type conv_type;
> + unsigned k, num_conv_src, num_conv_dst;
> +
> + /*
> + * XXX: We end up here for AoS unorm8 sampling (if the format wasn't some
> + * 888(8) variant), i.e. for formats like rgb565. This is _really_
> + * suboptimal. Not only do we handle a single pixel at a time, but we also
> + * convert to float, do a normalizing mul and an un-normalizing mul, convert
> + * back to int, and finally pack down to 8 bits. At the end, throw in a
> + * couple of shifts/ands/ors for the AoS swizzle for good measure (rgb565
> + * is ok, but bgrx5551 isn't, for instance). (And if we're not extra careful,
> + * we get some pointless min/max too, for clamping values to range.)
> + * This is a disaster of epic proportions; simply forcing SoA sampling
> + * would be way faster (even when we don't have AVX support).
> + * We should make sure we cannot hit this code path for anything but
> + * single pixels.
> + */
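
To make the cost concrete, the chain complained about above amounts to
roughly this per-texel scalar equivalent (illustrative C, not the actual
generated code):

   #include <stdint.h>

   /* Rough scalar model of the slow rgb565 fetch path (sketch only). */
   static void
   fetch_rgb565_slow(uint16_t texel, uint8_t rgba[4])
   {
      /* unpack each channel to float and normalize */
      float r = (float)((texel >> 11) & 0x1f) * (1.0f / 31.0f);
      float g = (float)((texel >>  5) & 0x3f) * (1.0f / 63.0f);
      float b = (float)( texel        & 0x1f) * (1.0f / 31.0f);
      /* un-normalize again and convert back for the unorm8 destination */
      rgba[0] = (uint8_t)(r * 255.0f + 0.5f);
      rgba[1] = (uint8_t)(g * 255.0f + 0.5f);
      rgba[2] = (uint8_t)(b * 255.0f + 0.5f);
      rgba[3] = 255;
   }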
>
> /*
> * Unpack a pixel at a time into a <4 x float> RGBA vector
> @@ -464,12 +516,38 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
> __FUNCTION__, format_desc->short_name);
> }
>
> - lp_build_conv(gallivm,
> - lp_float32_vec4_type(),
> - type,
> - tmps, num_pixels, &res, 1);
> + conv_type = lp_float32_vec4_type();
> + num_conv_src = num_pixels;
> + num_conv_dst = 1;
> +
> + if (num_pixels % 8 == 0) {
> + lp_build_concat_n(gallivm, lp_float32_vec4_type(),
> + tmps, num_pixels, tmps, num_pixels / 2);
> + conv_type.length *= num_pixels / 4;
> + num_conv_src = 4 * num_pixels / 8;
> + if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
> + /*
> + * FIXME: The fast float->unorm path (which basically just
> + * skips the MIN/MAX, which are extremely pointless in any
> + * case) requires that there are 2 destinations...
> + * In any case, we really should make sure we don't hit this
> + * code with multiple pixels for unorm8 dst types; it's
> + * completely hopeless even if we do hit the right conversion.
> + */
> + type.length /= num_pixels / 4;
> + num_conv_dst = num_pixels / 4;
> + }
> + }
> +
> + lp_build_conv(gallivm, conv_type, type,
> + tmps, num_conv_src, res, num_conv_dst);
> +
> + if (num_pixels % 8 == 0 &&
> + (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
> + lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
> + }
>
> - return lp_build_format_swizzle_aos(format_desc, &bld, res);
> + return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
> }
>
> /* If all channels are of same type and we are not using half-floats */
>
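For the record, a worked trace of the conv-shape hack with num_pixels = 8:
the eight <4 x float> pixels are first concatenated into four <8 x float>
vectors (conv_type.length becomes 8, num_conv_src 4); for an unorm8
destination, type.length is divided by 2 and num_conv_dst becomes 2, so
lp_build_conv() sees a 4-source/2-destination conversion - the shape its
fast float->unorm8 path wants - and the two partial results are then
concatenated back into a single vector for the swizzle.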
Reviewed-by: Jose Fonseca <jfonseca at vmware.com>