Mesa (master): gallivm: don't use saturated unsigned add/sub intrinsics for llvm 8.0

Fri Aug 24 06:01:34 UTC 2018

Module: Mesa
Branch: master
Commit: 8e1be9a34ac8ce6f115eaf2ab0d99b6a0ce37630
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=8e1be9a34ac8ce6f115eaf2ab0d99b6a0ce37630

Author: Roland Scheidegger <sroland at vmware.com>
Date:   Thu Aug 23 19:07:05 2018 +0200

gallivm: don't use saturated unsigned add/sub intrinsics for llvm 8.0

These have been removed. Unfortunately auto-upgrade doesn't work for
jit. (Worse, it seems we don't get a compilation error anymore when
compiling the shader, rather llvm will just do a call to a null
function in the jitted shaders making it difficult to detect when
intrinsics vanish.)

Luckily the signed ones are still there, I helped convincing llvm
removing them is a bad idea for now, since while the unsigned ones have
sort of agreed-upon simplest patterns to replace them with, this is not
the case for the signed ones, and they require _significantly_ more
complex patterns - to the point that the recognition is IMHO probably
unlikely to ever work reliably in practice (due to other optimizations
interfering). (Even for the relatively trivial unsigned patterns, llvm
already added test cases where recognition doesn't work, unsaturated
add followed by saturated add may produce atrocious code.)
Nevertheless, it seems there's a serious quest to squash all
cpu-specific intrinsics going on, so I'd expect patches to nuke them as
well to resurface.

Adapt the existing fallback code to match the simple patterns llvm uses
and hope for the best. I've verified with lp_test_blend that it does
produce the expected saturated assembly instructions. Though our
cmp/select build helpers don't use boolean masks, but it doesn't seem
to interfere with llvm's ability to recognize the pattern.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=106231
Reviewed-by: Jose Fonseca <jfonseca at vmware.com>

---

 src/gallium/auxiliary/gallivm/lp_bld_arit.c | 87 ++++++++++++++++++++---------
 1 file changed, 60 insertions(+), 27 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index e922474ef6..f348833206 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -557,23 +557,27 @@ lp_build_add(struct lp_build_context *bld,
       if (!type.floating && !type.fixed) {
          if (type.width * type.length == 128) {
             if (util_cpu_caps.has_sse2) {
-              if (type.width == 8)
-                intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
-              if (type.width == 16)
-                intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
+               if (type.width == 8)
+                 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
+                                         HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
+               if (type.width == 16)
+                 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
+                                         HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
             } else if (util_cpu_caps.has_altivec) {
-              if (type.width == 8)
-                 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
-              if (type.width == 16)
-                 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
+               if (type.width == 8)
+                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
+               if (type.width == 16)
+                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
             }
          }
          if (type.width * type.length == 256) {
             if (util_cpu_caps.has_avx2) {
-              if (type.width == 8)
-                intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
-              if (type.width == 16)
-                intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
+               if (type.width == 8)
+                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
+                                          HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
+               if (type.width == 16)
+                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
+                                          HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
             }
          }
       }
@@ -592,8 +596,6 @@ lp_build_add(struct lp_build_context *bld,
          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
-      } else {
-         a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
       }
    }
 
@@ -612,6 +614,24 @@ lp_build_add(struct lp_build_context *bld,
    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
       res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 
+   if (type.norm && !type.floating && !type.fixed) {
+      if (!type.sign) {
+         /*
+          * newer llvm versions no longer support the intrinsics, but recognize
+          * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
+          * code, it is important we match the pattern llvm uses (and pray llvm
+          * doesn't change it - and hope they decide on the same pattern for
+          * all backends supporting it...).
+          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
+          * interfere with llvm's ability to recognize the pattern but seems
+          * a bit brittle.
+          */
+         LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
+         res = lp_build_select(bld, overflowed,
+                               LLVMConstAllOnes(bld->int_vec_type), res);
+      }
+   }
+
    /* XXX clamp to floor of -1 or 0??? */
 
    return res;
@@ -858,23 +878,27 @@ lp_build_sub(struct lp_build_context *bld,
       if (!type.floating && !type.fixed) {
          if (type.width * type.length == 128) {
             if (util_cpu_caps.has_sse2) {
-              if (type.width == 8)
-                 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
-              if (type.width == 16)
-                 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
+               if (type.width == 8)
+                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
+                                          HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
+               if (type.width == 16)
+                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
+                                          HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
             } else if (util_cpu_caps.has_altivec) {
-              if (type.width == 8)
-                 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
-              if (type.width == 16)
-                 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
+               if (type.width == 8)
+                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
+               if (type.width == 16)
+                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
             }
          }
          if (type.width * type.length == 256) {
             if (util_cpu_caps.has_avx2) {
-              if (type.width == 8)
-                 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
-              if (type.width == 16)
-                 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
+               if (type.width == 8)
+                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
+                                          HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
+               if (type.width == 16)
+                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
+                                          HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
             }
          }
       }
@@ -894,7 +918,16 @@ lp_build_sub(struct lp_build_context *bld,
          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
       } else {
-         a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
+         /*
+          * This must match llvm pattern for saturated unsigned sub.
+          * (lp_build_max_simple actually does the job with its current
+          * definition but do it explicitly here.)
+          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
+          * interfere with llvm's ability to recognize the pattern but seems
+          * a bit brittle.
+          */
+         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
+         a = lp_build_select(bld, no_ov, a, b);
       }
    }