[Mesa-dev] [PATCH 3/4] gallivm: optimize lp_build_unpack_arith_rgba_aos slightly
Jose Fonseca
jfonseca at vmware.com
Wed Jan 4 16:19:46 UTC 2017
On 21/12/16 04:01, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> This code uses a vector shift which has to be emulated on x86 unless
> there's AVX2. Luckily in some cases we can actually avoid the shift
> altogether, so do that.
> Also make sure we hit the fast lp_build_conv() path when applicable,
> albeit that's quite the hack...
> That said, this path is taken for AoS sampling of small unorm formats
> (smaller than rgba8), and it remains completely hopeless even with these
> changes, with or without AVX.
> (We should probably have code similar to that in the llvmpipe fs backend,
> using bit replication to extend to rgba8888 - the rounding is not quite
> 100% accurate, but if it's good enough there it should be good enough
> here as well.)
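
For reference, the bit replication mentioned above just copies a channel's
top bits into the vacated low bits when widening. A minimal scalar sketch
(plain C, purely illustrative - the helper name and channel order are made
up, this is not the llvmpipe code):

   #include <stdint.h>

   /* Widen rgb565 to rgba8888 via bit replication (illustrative only). */
   static uint32_t
   rgb565_to_rgba8888(uint16_t p)
   {
      uint32_t r5 = (p >> 11) & 0x1f;
      uint32_t g6 = (p >>  5) & 0x3f;
      uint32_t b5 =  p        & 0x1f;
      /* replicate the top bits into the vacated low bits: 5 -> 8, 6 -> 8 */
      uint32_t r8 = (r5 << 3) | (r5 >> 2);
      uint32_t g8 = (g6 << 2) | (g6 >> 4);
      uint32_t b8 = (b5 << 3) | (b5 >> 2);
      return 0xff000000u | (b8 << 16) | (g8 << 8) | r8;
   }

This tracks round(x * 255.0 / 31.0) closely but can be off by one for some
values - the rounding caveat mentioned above.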
> ---
> src/gallium/auxiliary/gallivm/lp_bld_format_aos.c | 116 ++++++++++++++++++----
> 1 file changed, 97 insertions(+), 19 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
> index 322e7b8..574bb64 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
> @@ -38,6 +38,7 @@
> #include "util/u_math.h"
> #include "util/u_pointer.h"
> #include "util/u_string.h"
> +#include "util/u_cpu_detect.h"
>
> #include "lp_bld_arit.h"
> #include "lp_bld_init.h"
> @@ -49,6 +50,7 @@
> #include "lp_bld_gather.h"
> #include "lp_bld_debug.h"
> #include "lp_bld_format.h"
> +#include "lp_bld_pack.h"
> #include "lp_bld_intr.h"
>
>
> @@ -156,6 +158,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
> LLVMValueRef shifts[4];
> LLVMValueRef masks[4];
> LLVMValueRef scales[4];
> + LLVMTypeRef vec32_type;
>
> boolean normalized;
> boolean needs_uitofp;
> @@ -171,19 +174,17 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
> * matches floating point size */
> assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
>
> + vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
> +
> /* Broadcast the packed value to all four channels
> * before: packed = BGRA
> * after: packed = {BGRA, BGRA, BGRA, BGRA}
> */
> - packed = LLVMBuildInsertElement(builder,
> - LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
> - packed,
> + packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
> LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
> "");
> - packed = LLVMBuildShuffleVector(builder,
> - packed,
> - LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
> - LLVMConstNull(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
> + packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
> + LLVMConstNull(vec32_type),
> "");
>
> /* Initialize vector constants */
> @@ -224,9 +225,40 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
> /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
> * into masked = {X, Y, Z, W}
> */
> - /* Note: we cannot do this shift on x86 natively until AVX2. */
> - shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
> - masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
> + if (desc->block.bits < 32 && normalized) {
> + /*
> + * Note: we cannot do the shift below on x86 natively until AVX2.
> + *
> + * Old llvm versions will resort to scalar extract/shift/insert,
> + * which is definitely terrible; new versions will just do
> + * several vector shifts and shuffle/blend the results together.
> + * We could turn this into a variable left shift plus a constant
> + * right shift, and llvm would then turn the variable left shift
> + * into a mul for us (albeit without sse41 the mul needs emulation
> + * too...). However, since we're going to do a float mul
> + * anyway, we just adjust that mul instead (plus the mask), skipping
> + * the shift completely.
> + * We could also use an extra mul when the format isn't normalized and
> + * we don't have AVX2 support, but don't bother for now. Unfortunately,
> + * this strategy doesn't work for 32-bit formats (such as rgb10a2, or even
> + * rgba8 if it ends up here), as that would require UIToFP, albeit that
> + * would be fixable with an easy 16-bit shuffle (unless there are channels
> + * crossing 16-bit boundaries).
> + */
> + for (i = 0; i < 4; ++i) {
> + if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
> + unsigned bits = desc->channel[i].size;
> + unsigned shift = desc->channel[i].shift;
> + unsigned long long mask = ((1ULL << bits) - 1) << shift;
> + scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
> + masks[i] = lp_build_const_int32(gallivm, mask);
> + }
> + }
> + masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
> + } else {
> + shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
> + masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
> + }
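
The trick is nice: with mask = ((1 << bits) - 1) << shift, the masked but
unshifted value divided by mask equals the channel value divided by
(1 << bits) - 1, so folding the shift into the float scale is exact. A
scalar sketch of the identity (illustrative C, not the vectorized code):

   #include <stdint.h>

   /* Unpack one normalized channel without any shift: since
    * (packed & mask) == channel << shift and
    * mask == ((1 << bits) - 1) << shift, a single mul by 1/mask
    * both cancels the shift and normalizes.
    */
   static float
   unpack_norm_channel(uint32_t packed, unsigned bits, unsigned shift)
   {
      uint32_t mask = (uint32_t)(((1ull << bits) - 1) << shift);
      return (float)(packed & mask) * (1.0f / mask);
   }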
>
> if (!needs_uitofp) {
> /* UIToFP can't be expressed in SSE2 */
> @@ -235,8 +267,10 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
> casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
> }
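
(Side note on the needs_uitofp split above: SSE2 only has a signed
int32->float conversion (cvtdq2ps), but whenever the masked value cannot
have bit 31 set, SIToFP and UIToFP agree anyway, e.g.:

   #include <assert.h>
   #include <stdint.h>

   int main(void)
   {
      /* any masked value below 2^31 converts identically either way */
      uint32_t masked = 0x7fffffffu;
      assert((float)(int32_t)masked == (float)masked);
      return 0;
   }

so the cheap signed conversion suffices for all sub-32-bit cases.)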
>
> - /* At this point 'casted' may be a vector of floats such as
> - * {255.0, 255.0, 255.0, 255.0}. Next, if the pixel values are normalized
> + /*
> + * At this point 'casted' may be a vector of floats such as
> + * {255.0, 255.0, 255.0, 255.0} (normalized values may additionally be
> + * multiplied by powers of two). Next, if the pixel values are normalized
> * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
> */
>
> @@ -392,6 +426,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
>
> if (format_matches_type(format_desc, type) &&
> format_desc->block.bits <= type.width * 4 &&
> + /* XXX this shouldn't be needed */
> util_is_power_of_two(format_desc->block.bits)) {
> LLVMValueRef packed;
> LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
> @@ -424,6 +459,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
> format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
> format_desc->block.width == 1 &&
> format_desc->block.height == 1 &&
> + /* XXX this shouldn't be needed */
> util_is_power_of_two(format_desc->block.bits) &&
> format_desc->block.bits <= 32 &&
> format_desc->is_bitmask &&
> @@ -433,8 +469,24 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
> !format_desc->channel[0].pure_integer) {
>
> LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
> - LLVMValueRef res;
> - unsigned k;
> + LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
> + struct lp_type conv_type;
> + unsigned k, num_conv_src, num_conv_dst;
> +
> + /*
> + * XXX: We end up here for AoS unorm8 sampling (if the format wasn't some
> + * 888(8) variant), i.e. for formats like rgb565. This is _really_
> + * suboptimal. Not only do we handle a single pixel at a time, but we also
> + * convert to float, do a normalizing mul and an un-normalizing mul, convert
> + * back to int, and finally pack down to 8 bits. At the end, throw in a
> + * couple of shifts/ands/ors for the AoS swizzle for good measure (rgb565
> + * is ok, but bgrx5551 isn't, for instance). (And if we're not extra careful,
> + * we get some pointless min/max too, for clamping values to range.)
> + * This is a disaster of epic proportions; simply forcing SoA sampling
> + * would be way faster (even when we don't have AVX support).
> + * We should make sure we cannot hit this code path for anything but
> + * single pixels.
> + */
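
To make the cost concrete, the chain complained about above amounts to
roughly this per-texel scalar equivalent (illustrative C, not the actual
generated code):

   #include <stdint.h>

   /* Rough scalar model of the slow rgb565 fetch path (sketch only). */
   static void
   fetch_rgb565_slow(uint16_t texel, uint8_t rgba[4])
   {
      /* unpack each channel to float and normalize */
      float r = (float)((texel >> 11) & 0x1f) * (1.0f / 31.0f);
      float g = (float)((texel >>  5) & 0x3f) * (1.0f / 63.0f);
      float b = (float)( texel        & 0x1f) * (1.0f / 31.0f);
      /* un-normalize again and convert back for the unorm8 destination */
      rgba[0] = (uint8_t)(r * 255.0f + 0.5f);
      rgba[1] = (uint8_t)(g * 255.0f + 0.5f);
      rgba[2] = (uint8_t)(b * 255.0f + 0.5f);
      rgba[3] = 255;
   }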
>
> /*
> * Unpack a pixel at a time into a <4 x float> RGBA vector
> @@ -464,12 +516,38 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
> __FUNCTION__, format_desc->short_name);
> }
>
> - lp_build_conv(gallivm,
> - lp_float32_vec4_type(),
> - type,
> - tmps, num_pixels, &res, 1);
> + conv_type = lp_float32_vec4_type();
> + num_conv_src = num_pixels;
> + num_conv_dst = 1;
> +
> + if (num_pixels % 8 == 0) {
> + lp_build_concat_n(gallivm, lp_float32_vec4_type(),
> + tmps, num_pixels, tmps, num_pixels / 2);
> + conv_type.length *= num_pixels / 4;
> + num_conv_src = 4 * num_pixels / 8;
> + if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
> + /*
> + * FIXME: The fast float->unorm path (which basically just
> + * skips the MIN/MAX, which are extremely pointless in any
> + * case) requires that there are 2 destinations...
> + * In any case, we really should make sure we don't hit this
> + * code with multiple pixels for unorm8 dst types; it's
> + * completely hopeless even if we do hit the right conversion.
> + */
> + type.length /= num_pixels / 4;
> + num_conv_dst = num_pixels / 4;
> + }
> + }
> +
> + lp_build_conv(gallivm, conv_type, type,
> + tmps, num_conv_src, res, num_conv_dst);
> +
> + if (num_pixels % 8 == 0 &&
> + (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
> + lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
> + }
>
> - return lp_build_format_swizzle_aos(format_desc, &bld, res);
> + return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
> }
>
> /* If all channels are of same type and we are not using half-floats */
>
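For the record, a worked trace of the conv-shape hack with num_pixels = 8:
the eight <4 x float> pixels are first concatenated into four <8 x float>
vectors (conv_type.length becomes 8, num_conv_src 4); for an unorm8
destination, type.length is divided by 2 and num_conv_dst becomes 2, so
lp_build_conv() sees a 4-source/2-destination conversion - the shape its
fast float->unorm8 path wants - and the two partial results are then
concatenated back into a single vector for the swizzle.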
Reviewed-by: Jose Fonseca <jfonseca at vmware.com>