[Mesa-dev] [PATCH] gallivm: Use native packs and unpacks for the lerps
Jose Fonseca
jfonseca at vmware.com
Tue Oct 18 13:30:14 UTC 2016
On 18/10/16 02:38, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> For the texturing packs, things looked pretty terrible. For every
> lerp, we were repacking the values, and while those look sort of cheap
> with 128bit, with 256bit we end up with 2 of them instead of just 1 but
> worse, plus 2 extracts too (the unpack, however, works fine with a
> single instruction, albeit only with llvm 3.8 - the vpmovzxbw).
>
> Ideally we'd use more clever pack for llvmpipe backend conversion too
> since we actually use the "wrong" shuffle (which is more work) when doing
> the fs twiddle just so we end up with the wrong order for being able to
> do native pack when converting from 2x8f -> 1x16b. But this requires some
> refactoring, since the untwiddle is separate from conversion.
>
> This is only used for avx2 256bit pack/unpack for now.
>
> Improves openarena scores by 8% or so, though overall it's still pretty
> disappointing how much faster 256bit vectors are even with avx2 (or
> rather, aren't...). And, of course, eliminating the needless
> packs/unpacks in the first place would eliminate most of that advantage
> (not quite all) from this patch.
> ---
> src/gallium/auxiliary/gallivm/lp_bld_arit.c | 14 +--
> src/gallium/auxiliary/gallivm/lp_bld_pack.c | 139 ++++++++++++++++++++++++++--
> src/gallium/auxiliary/gallivm/lp_bld_pack.h | 16 ++++
> 3 files changed, 156 insertions(+), 13 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> index f5cacc4..3ea0734 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> @@ -1046,14 +1046,14 @@ lp_build_mul(struct lp_build_context *bld,
> struct lp_type wide_type = lp_wider_type(type);
> LLVMValueRef al, ah, bl, bh, abl, abh, ab;
>
> - lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
> - lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
> + lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
> + lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
>
> /* PMULLW, PSRLW, PADDW */
> abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
> abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
>
> - ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
> + ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
>
> return ab;
> }
> @@ -1350,9 +1350,9 @@ lp_build_lerp(struct lp_build_context *bld,
>
> lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
>
> - lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
> - lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
> - lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
> + lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
> + lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
> + lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
>
> /*
> * Lerp both halves.
> @@ -1363,7 +1363,7 @@ lp_build_lerp(struct lp_build_context *bld,
> resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
> resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
>
> - res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
> + res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
> } else {
> res = lp_build_lerp_simple(bld, x, v0, v1, flags);
> }
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
> index b0e76e6..e8d4fcd 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
> @@ -346,10 +346,10 @@ lp_build_interleave2(struct gallivm_state *gallivm,
> */
> LLVMValueRef
> lp_build_interleave2_half(struct gallivm_state *gallivm,
> - struct lp_type type,
> - LLVMValueRef a,
> - LLVMValueRef b,
> - unsigned lo_hi)
> + struct lp_type type,
> + LLVMValueRef a,
> + LLVMValueRef b,
> + unsigned lo_hi)
> {
> if (type.length * type.width == 256) {
> LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
> @@ -359,11 +359,13 @@ lp_build_interleave2_half(struct gallivm_state *gallivm,
> }
> }
>
> +
> /**
> * Double the bit width.
> *
> * This will only change the number of bits the values are represented, not the
> * values themselves.
> + *
> */
> void
> lp_build_unpack2(struct gallivm_state *gallivm,
> @@ -394,6 +396,65 @@ lp_build_unpack2(struct gallivm_state *gallivm,
> #ifdef PIPE_ARCH_LITTLE_ENDIAN
> *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
> *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
> +
> +#else
> + *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
> + *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
> +#endif
> +
> + /* Cast the result into the new type (twice as wide) */
> +
> + dst_vec_type = lp_build_vec_type(gallivm, dst_type);
> +
> + *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
> + *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
> +}
> +
> +
> +/**
> + * Double the bit width, with an order which fits the cpu nicely.
> + *
> + * This will only change the number of bits used to represent the values, not
> + * the values themselves.
> + *
> + * The order of the results is not guaranteed, other than that it will match
> + * the corresponding lp_build_pack2_native call.
> + */
> +void
> +lp_build_unpack2_native(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + struct lp_type dst_type,
> + LLVMValueRef src,
> + LLVMValueRef *dst_lo,
> + LLVMValueRef *dst_hi)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef msb;
> + LLVMTypeRef dst_vec_type;
> +
> + assert(!src_type.floating);
> + assert(!dst_type.floating);
> + assert(dst_type.width == src_type.width * 2);
> + assert(dst_type.length * 2 == src_type.length);
> +
> + if(dst_type.sign && src_type.sign) {
> + /* Replicate the sign bit in the most significant bits */
> + msb = LLVMBuildAShr(builder, src,
> + lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
> + }
> + else
> + /* Most significant bits always zero */
> + msb = lp_build_zero(gallivm, src_type);
> +
> + /* Interleave bits */
> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
> + if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) {
> + *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0);
> + *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1);
> + } else {
> + *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
> + *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
> + }
> #else
> *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
> *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
> @@ -440,7 +501,8 @@ lp_build_unpack(struct gallivm_state *gallivm,
> tmp_type.length /= 2;
>
> for(i = num_tmps; i--; ) {
> - lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
> + lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0],
> + &dst[2*i + 1]);
> }
>
> src_type = tmp_type;
> @@ -605,6 +667,70 @@ lp_build_pack2(struct gallivm_state *gallivm,
> }
>
>
> +/**
> + * Non-interleaved native pack.
> + *
> + * Similar to lp_build_pack2, but the ordering of values is not
> + * guaranteed, other than that it will match lp_build_unpack2_native.
> + *
> + * In particular, with avx2, the lower and upper 128bits of the vectors will
> + * be packed independently, so that (with 32bit->16bit values)
> + * (LSB) (MSB)
> + * lo = l0 __ l1 __ l2 __ l3 __ l4 __ l5 __ l6 __ l7 __
> + * hi = h0 __ h1 __ h2 __ h3 __ h4 __ h5 __ h6 __ h7 __
> + * res = l0 l1 l2 l3 h0 h1 h2 h3 l4 l5 l6 l7 h4 h5 h6 h7
> + *
> + * This will only change the number of bits used to represent the values, not
> + * the values themselves.
> + *
> + * It is assumed the values are already clamped into the destination type range.
> + * Values outside that range will produce undefined results.
> + */
> +LLVMValueRef
> +lp_build_pack2_native(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + struct lp_type dst_type,
> + LLVMValueRef lo,
> + LLVMValueRef hi)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + struct lp_type intr_type = dst_type;
> + const char *intrinsic = NULL;
> +
> + assert(!src_type.floating);
> + assert(!dst_type.floating);
> + assert(src_type.width == dst_type.width * 2);
> + assert(src_type.length * 2 == dst_type.length);
> +
> + /* At this point we only have a special case for avx2 */
> + if (src_type.length * src_type.width == 256 &&
> + util_cpu_caps.has_avx2) {
> + switch(src_type.width) {
> + case 32:
> + if (dst_type.sign) {
> + intrinsic = "llvm.x86.avx2.packssdw";
> + } else {
> + intrinsic = "llvm.x86.avx2.packusdw";
> + }
> + break;
> + case 16:
> + if (dst_type.sign) {
> + intrinsic = "llvm.x86.avx2.packsswb";
> + } else {
> + intrinsic = "llvm.x86.avx2.packuswb";
> + }
> + break;
> + }
> + }
> + if (intrinsic) {
> + LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
> + return lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type,
> + lo, hi);
> + }
> + else {
> + return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
> + }
> +}
>
> /**
> * Non-interleaved pack and saturate.
> @@ -640,7 +766,8 @@ lp_build_packs2(struct gallivm_state *gallivm,
> if(clamp) {
> struct lp_build_context bld;
> unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
> - LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type, ((unsigned long long)1 << dst_bits) - 1);
> + LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type,
> + ((unsigned long long)1 << dst_bits) - 1);
> lp_build_context_init(&bld, gallivm, src_type);
> lo = lp_build_min(&bld, lo, dst_max);
> hi = lp_build_min(&bld, hi, dst_max);
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
> index 367fba1..3e07716 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
> @@ -74,6 +74,14 @@ lp_build_unpack2(struct gallivm_state *gallivm,
>
>
> void
> +lp_build_unpack2_native(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + struct lp_type dst_type,
> + LLVMValueRef src,
> + LLVMValueRef *dst_lo,
> + LLVMValueRef *dst_hi);
> +
> +void
> lp_build_unpack(struct gallivm_state *gallivm,
> struct lp_type src_type,
> struct lp_type dst_type,
> @@ -118,6 +126,14 @@ lp_build_pack2(struct gallivm_state *gallivm,
>
>
> LLVMValueRef
> +lp_build_pack2_native(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + struct lp_type dst_type,
> + LLVMValueRef lo,
> + LLVMValueRef hi);
> +
> +
> +LLVMValueRef
> lp_build_pack(struct gallivm_state *gallivm,
> struct lp_type src_type,
> struct lp_type dst_type,
>
Looks great!
Reviewed-by: Jose Fonseca <jfonseca at vmware.com>
More information about the mesa-dev
mailing list