Mesa (master): gallivm: Use native packs and unpacks for the lerps

Tue Oct 18 23:45:17 UTC 2016

Module: Mesa
Branch: master
Commit: 6f2f0daeb49e132f44ca9bf930049470a39c970f
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=6f2f0daeb49e132f44ca9bf930049470a39c970f

Author: Roland Scheidegger <sroland at vmware.com>
Date:   Tue Oct 18 03:37:37 2016 +0200

gallivm: Use native packs and unpacks for the lerps

For the texturing packs, things looked pretty terrible. For every
lerp, we were repacking the values, and while those look sort of cheap
with 128bit, with 256bit we end up with 2 of them instead of just 1 but
worse, plus 2 extracts too (the unpack, however, works fine with a
single instruction, albeit only with llvm 3.8 - the vpmovzxbw).

Ideally we'd use more clever pack for llvmpipe backend conversion too
since we actually use the "wrong" shuffle (which is more work) when doing
the fs twiddle just so we end up with the wrong order for being able to
do native pack when converting from 2x8f -> 1x16b. But this requires some
refactoring, since the untwiddle is separate from conversion.

This is only used for avx2 256bit pack/unpack for now.

Improves openarena scores by 8% or so, though overall it's still pretty
disappointing how much faster 256bit vectors are even with avx2 (or
rather, aren't...). And, of course, eliminating the needless
packs/unpacks in the first place would eliminate most of that advantage
(not quite all) from this patch.

Reviewed-by: Jose Fonseca <jfonseca at vmware.com>

---

 src/gallium/auxiliary/gallivm/lp_bld_arit.c |  14 +--
 src/gallium/auxiliary/gallivm/lp_bld_pack.c | 139 ++++++++++++++++++++++++++--
 src/gallium/auxiliary/gallivm/lp_bld_pack.h |  16 ++++
 3 files changed, 156 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index f5cacc4..3ea0734 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1046,14 +1046,14 @@ lp_build_mul(struct lp_build_context *bld,
       struct lp_type wide_type = lp_wider_type(type);
       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
 
-      lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
-      lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
+      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
+      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
 
       /* PMULLW, PSRLW, PADDW */
       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
 
-      ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
+      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
 
       return ab;
    }
@@ -1350,9 +1350,9 @@ lp_build_lerp(struct lp_build_context *bld,
 
       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
 
-      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
-      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
-      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
+      lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
+      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
+      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
 
       /*
        * Lerp both halves.
@@ -1363,7 +1363,7 @@ lp_build_lerp(struct lp_build_context *bld,
       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
 
-      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
+      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
    } else {
       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
    }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index b0e76e6..e8d4fcd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -346,10 +346,10 @@ lp_build_interleave2(struct gallivm_state *gallivm,
  */
 LLVMValueRef
 lp_build_interleave2_half(struct gallivm_state *gallivm,
-                     struct lp_type type,
-                     LLVMValueRef a,
-                     LLVMValueRef b,
-                     unsigned lo_hi)
+                          struct lp_type type,
+                          LLVMValueRef a,
+                          LLVMValueRef b,
+                          unsigned lo_hi)
 {
    if (type.length * type.width == 256) {
       LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
@@ -359,11 +359,13 @@ lp_build_interleave2_half(struct gallivm_state *gallivm,
    }
 }
 
+
 /**
  * Double the bit width.
  *
  * This will only change the number of bits the values are represented, not the
  * values themselves.
+ *
  */
 void
 lp_build_unpack2(struct gallivm_state *gallivm,
@@ -394,6 +396,65 @@ lp_build_unpack2(struct gallivm_state *gallivm,
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
    *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
    *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
+
+#else
+   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
+   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
+#endif
+
+   /* Cast the result into the new type (twice as wide) */
+
+   dst_vec_type = lp_build_vec_type(gallivm, dst_type);
+
+   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
+   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
+}
+
+
+/**
+ * Double the bit width, with an order which fits the cpu nicely.
+ *
+ * This will only change the number of bits the values are represented, not the
+ * values themselves.
+ *
+ * The order of the results is not guaranteed, other than it will match
+ * the corresponding lp_build_pack2_native call.
+ */
+void
+lp_build_unpack2_native(struct gallivm_state *gallivm,
+                        struct lp_type src_type,
+                        struct lp_type dst_type,
+                        LLVMValueRef src,
+                        LLVMValueRef *dst_lo,
+                        LLVMValueRef *dst_hi)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef msb;
+   LLVMTypeRef dst_vec_type;
+
+   assert(!src_type.floating);
+   assert(!dst_type.floating);
+   assert(dst_type.width == src_type.width * 2);
+   assert(dst_type.length * 2 == src_type.length);
+
+   if(dst_type.sign && src_type.sign) {
+      /* Replicate the sign bit in the most significant bits */
+      msb = LLVMBuildAShr(builder, src,
+               lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
+   }
+   else
+      /* Most significant bits always zero */
+      msb = lp_build_zero(gallivm, src_type);
+
+   /* Interleave bits */
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) {
+      *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0);
+      *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1);
+   } else {
+      *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
+      *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
+   }
 #else
    *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
    *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
@@ -440,7 +501,8 @@ lp_build_unpack(struct gallivm_state *gallivm,
       tmp_type.length /= 2;
 
       for(i = num_tmps; i--; ) {
-         lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
+         lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0],
+                          &dst[2*i + 1]);
       }
 
       src_type = tmp_type;
@@ -605,6 +667,70 @@ lp_build_pack2(struct gallivm_state *gallivm,
 }
 
 
+/**
+ * Non-interleaved native pack.
+ *
+ * Similar to lp_build_pack2, but the ordering of values is not
+ * guaranteed, other than it will match lp_build_unpack2_native.
+ *
+ * In particular, with avx2, the lower and upper 128bits of the vectors will
+ * be packed independently, so that (with 32bit->16bit values)
+ *         (LSB)                                       (MSB)
+ *   lo =   l0 __ l1 __ l2 __ l3 __ l4 __ l5 __ l6 __ l7 __
+ *   hi =   h0 __ h1 __ h2 __ h3 __ h4 __ h5 __ h6 __ h7 __
+ *   res =  l0 l1 l2 l3 h0 h1 h2 h3 l4 l5 l6 l7 h4 h5 h6 h7
+ *
+ * This will only change the number of bits the values are represented, not the
+ * values themselves.
+ *
+ * It is assumed the values are already clamped into the destination type range.
+ * Values outside that range will produce undefined results.
+ */
+LLVMValueRef
+lp_build_pack2_native(struct gallivm_state *gallivm,
+                      struct lp_type src_type,
+                      struct lp_type dst_type,
+                      LLVMValueRef lo,
+                      LLVMValueRef hi)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_type intr_type = dst_type;
+   const char *intrinsic = NULL;
+
+   assert(!src_type.floating);
+   assert(!dst_type.floating);
+   assert(src_type.width == dst_type.width * 2);
+   assert(src_type.length * 2 == dst_type.length);
+
+   /* At this point only have special case for avx2 */
+   if (src_type.length * src_type.width == 256 &&
+       util_cpu_caps.has_avx2) {
+      switch(src_type.width) {
+      case 32:
+         if (dst_type.sign) {
+            intrinsic = "llvm.x86.avx2.packssdw";
+         } else {
+            intrinsic = "llvm.x86.avx2.packusdw";
+         }
+         break;
+      case 16:
+         if (dst_type.sign) {
+            intrinsic = "llvm.x86.avx2.packsswb";
+         } else {
+            intrinsic = "llvm.x86.avx2.packuswb";
+         }
+         break;
+      }
+   }
+   if (intrinsic) {
+      LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
+      return lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type,
+                                       lo, hi);
+   }
+   else {
+      return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
+   }
+}
 
 /**
  * Non-interleaved pack and saturate.
@@ -640,7 +766,8 @@ lp_build_packs2(struct gallivm_state *gallivm,
    if(clamp) {
       struct lp_build_context bld;
       unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
-      LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type, ((unsigned long long)1 << dst_bits) - 1);
+      LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type,
+                                ((unsigned long long)1 << dst_bits) - 1);
       lp_build_context_init(&bld, gallivm, src_type);
       lo = lp_build_min(&bld, lo, dst_max);
       hi = lp_build_min(&bld, hi, dst_max);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index 367fba1..3e07716 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -74,6 +74,14 @@ lp_build_unpack2(struct gallivm_state *gallivm,
 
 
 void
+lp_build_unpack2_native(struct gallivm_state *gallivm,
+                        struct lp_type src_type,
+                        struct lp_type dst_type,
+                        LLVMValueRef src,
+                        LLVMValueRef *dst_lo,
+                        LLVMValueRef *dst_hi);
+
+void
 lp_build_unpack(struct gallivm_state *gallivm,
                 struct lp_type src_type,
                 struct lp_type dst_type,
@@ -118,6 +126,14 @@ lp_build_pack2(struct gallivm_state *gallivm,
 
 
 LLVMValueRef
+lp_build_pack2_native(struct gallivm_state *gallivm,
+                      struct lp_type src_type,
+                      struct lp_type dst_type,
+                      LLVMValueRef lo,
+                      LLVMValueRef hi);
+
+
+LLVMValueRef
 lp_build_pack(struct gallivm_state *gallivm,
               struct lp_type src_type,
               struct lp_type dst_type,