[Mesa-dev] [PATCH] gallivm: Use native packs and unpacks for the lerps
Jose Fonseca
jfonseca at vmware.com
Tue Oct 18 13:30:14 UTC 2016
On 18/10/16 02:38, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> For the texturing packs, things looked pretty terrible. For every
> lerp, we were repacking the values, and while those look sort of cheap
> with 128bit, with 256bit we end up with 2 of them instead of just 1 but
> worse, plus 2 extracts too (the unpack, however, works fine with a
> single instruction, albeit only with llvm 3.8 - the vpmovzxbw).
>
> Ideally we'd use more clever pack for llvmpipe backend conversion too
> since we actually use the "wrong" shuffle (which is more work) when doing
> the fs twiddle just so we end up with the wrong order for being able to
> do native pack when converting from 2x8f -> 1x16b. But this requires some
> refactoring, since the untwiddle is separate from conversion.
>
> This is only used for avx2 256bit pack/unpack for now.
>
> Improves openarena scores by 8% or so, though overall it's still pretty
> disappointing how much faster 256bit vectors are even with avx2 (or
> rather, aren't...). And, of course, eliminating the needless
> packs/unpacks in the first place would eliminate most of that advantage
> (not quite all) from this patch.
> ---
> src/gallium/auxiliary/gallivm/lp_bld_arit.c | 14 +--
> src/gallium/auxiliary/gallivm/lp_bld_pack.c | 139 ++++++++++++++++++++++++++--
> src/gallium/auxiliary/gallivm/lp_bld_pack.h | 16 ++++
> 3 files changed, 156 insertions(+), 13 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> index f5cacc4..3ea0734 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> @@ -1046,14 +1046,14 @@ lp_build_mul(struct lp_build_context *bld,
> struct lp_type wide_type = lp_wider_type(type);
> LLVMValueRef al, ah, bl, bh, abl, abh, ab;
>
> - lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
> - lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
> + lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
> + lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
>
> /* PMULLW, PSRLW, PADDW */
> abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
> abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
>
> - ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
> + ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
>
> return ab;
> }
> @@ -1350,9 +1350,9 @@ lp_build_lerp(struct lp_build_context *bld,
>
> lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
>
> - lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
> - lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
> - lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
> + lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
> + lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
> + lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
>
> /*
> * Lerp both halves.
> @@ -1363,7 +1363,7 @@ lp_build_lerp(struct lp_build_context *bld,
> resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
> resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
>
> - res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
> + res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
> } else {
> res = lp_build_lerp_simple(bld, x, v0, v1, flags);
> }
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
> index b0e76e6..e8d4fcd 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
> @@ -346,10 +346,10 @@ lp_build_interleave2(struct gallivm_state *gallivm,
> */
> LLVMValueRef
> lp_build_interleave2_half(struct gallivm_state *gallivm,
> - struct lp_type type,
> - LLVMValueRef a,
> - LLVMValueRef b,
> - unsigned lo_hi)
> + struct lp_type type,
> + LLVMValueRef a,
> + LLVMValueRef b,
> + unsigned lo_hi)
> {
> if (type.length * type.width == 256) {
> LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
> @@ -359,11 +359,13 @@ lp_build_interleave2_half(struct gallivm_state *gallivm,
> }
> }
>
> +
> /**
> * Double the bit width.
> *
> * This will only change the number of bits the values are represented, not the
> * values themselves.
> + *
> */
> void
> lp_build_unpack2(struct gallivm_state *gallivm,
> @@ -394,6 +396,65 @@ lp_build_unpack2(struct gallivm_state *gallivm,
> #ifdef PIPE_ARCH_LITTLE_ENDIAN
> *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
> *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
> +
> +#else
> + *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
> + *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
> +#endif
> +
> + /* Cast the result into the new type (twice as wide) */
> +
> + dst_vec_type = lp_build_vec_type(gallivm, dst_type);
> +
> + *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
> + *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
> +}
> +
> +
> +/**
> + * Double the bit width, with an order which fits the cpu nicely.
> + *
> + * This will only change the number of bits used to represent the values, not
> + * the values themselves.
> + *
> + * The order of the results is not guaranteed, other than that it will match
> + * the corresponding lp_build_pack2_native call.
> + */
> +void
> +lp_build_unpack2_native(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + struct lp_type dst_type,
> + LLVMValueRef src,
> + LLVMValueRef *dst_lo,
> + LLVMValueRef *dst_hi)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef msb;
> + LLVMTypeRef dst_vec_type;
> +
> + assert(!src_type.floating);
> + assert(!dst_type.floating);
> + assert(dst_type.width == src_type.width * 2);
> + assert(dst_type.length * 2 == src_type.length);
> +
> + if(dst_type.sign && src_type.sign) {
> + /* Replicate the sign bit in the most significant bits */
> + msb = LLVMBuildAShr(builder, src,
> + lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
> + }
> + else
> + /* Most significant bits always zero */
> + msb = lp_build_zero(gallivm, src_type);
> +
> + /* Interleave bits */
> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
> + if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) {
> + *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0);
> + *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1);
> + } else {
> + *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
> + *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
> + }
> #else
> *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
> *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
> @@ -440,7 +501,8 @@ lp_build_unpack(struct gallivm_state *gallivm,
> tmp_type.length /= 2;
>
> for(i = num_tmps; i--; ) {
> - lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
> + lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0],
> + &dst[2*i + 1]);
> }
>
> src_type = tmp_type;
> @@ -605,6 +667,70 @@ lp_build_pack2(struct gallivm_state *gallivm,
> }
>
>
> +/**
> + * Non-interleaved native pack.
> + *
> + * Similar to lp_build_pack2, but the ordering of values is not
> + * guaranteed, other than that it will match lp_build_unpack2_native.
> + *
> + * In particular, with avx2, the lower and upper 128bits of the vectors will
> + * be packed independently, so that (with 32bit->16bit values)
> + * (LSB) (MSB)
> + * lo = l0 __ l1 __ l2 __ l3 __ l4 __ l5 __ l6 __ l7 __
> + * hi = h0 __ h1 __ h2 __ h3 __ h4 __ h5 __ h6 __ h7 __
> + * res = l0 l1 l2 l3 h0 h1 h2 h3 l4 l5 l6 l7 h4 h5 h6 h7
> + *
> + * This will only change the number of bits used to represent the values, not
> + * the values themselves.
> + *
> + * It is assumed the values are already clamped into the destination type range.
> + * Values outside that range will produce undefined results.
> + */
> +LLVMValueRef
> +lp_build_pack2_native(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + struct lp_type dst_type,
> + LLVMValueRef lo,
> + LLVMValueRef hi)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + struct lp_type intr_type = dst_type;
> + const char *intrinsic = NULL;
> +
> + assert(!src_type.floating);
> + assert(!dst_type.floating);
> + assert(src_type.width == dst_type.width * 2);
> + assert(src_type.length * 2 == dst_type.length);
> +
> + /* At this point we only have a special case for avx2 */
> + if (src_type.length * src_type.width == 256 &&
> + util_cpu_caps.has_avx2) {
> + switch(src_type.width) {
> + case 32:
> + if (dst_type.sign) {
> + intrinsic = "llvm.x86.avx2.packssdw";
> + } else {
> + intrinsic = "llvm.x86.avx2.packusdw";
> + }
> + break;
> + case 16:
> + if (dst_type.sign) {
> + intrinsic = "llvm.x86.avx2.packsswb";
> + } else {
> + intrinsic = "llvm.x86.avx2.packuswb";
> + }
> + break;
> + }
> + }
> + if (intrinsic) {
> + LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
> + return lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type,
> + lo, hi);
> + }
> + else {
> + return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
> + }
> +}
>
> /**
> * Non-interleaved pack and saturate.
> @@ -640,7 +766,8 @@ lp_build_packs2(struct gallivm_state *gallivm,
> if(clamp) {
> struct lp_build_context bld;
> unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
> - LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type, ((unsigned long long)1 << dst_bits) - 1);
> + LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type,
> + ((unsigned long long)1 << dst_bits) - 1);
> lp_build_context_init(&bld, gallivm, src_type);
> lo = lp_build_min(&bld, lo, dst_max);
> hi = lp_build_min(&bld, hi, dst_max);
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
> index 367fba1..3e07716 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
> @@ -74,6 +74,14 @@ lp_build_unpack2(struct gallivm_state *gallivm,
>
>
> void
> +lp_build_unpack2_native(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + struct lp_type dst_type,
> + LLVMValueRef src,
> + LLVMValueRef *dst_lo,
> + LLVMValueRef *dst_hi);
> +
> +void
> lp_build_unpack(struct gallivm_state *gallivm,
> struct lp_type src_type,
> struct lp_type dst_type,
> @@ -118,6 +126,14 @@ lp_build_pack2(struct gallivm_state *gallivm,
>
>
> LLVMValueRef
> +lp_build_pack2_native(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + struct lp_type dst_type,
> + LLVMValueRef lo,
> + LLVMValueRef hi);
> +
> +
> +LLVMValueRef
> lp_build_pack(struct gallivm_state *gallivm,
> struct lp_type src_type,
> struct lp_type dst_type,
>
Looks great!
Reviewed-by: Jose Fonseca <jfonseca at vmware.com>
More information about the mesa-dev
mailing list