[Mesa-dev] [PATCH 1/2] gallivm: better support for fast rsqrt

sroland at vmware.com sroland at vmware.com
Wed Jul 10 17:21:31 PDT 2013


From: Roland Scheidegger <sroland at vmware.com>

We had to disable fast rsqrt before because it wasn't precise enough etc.
However in situations when we know we're not going to need more precision
we can still use a fast rsqrt (which can be several times faster than
the quite expensive sqrt). Hence introduce a new helper which does exactly
that - it is probably not useful calling it in some situations if there's
no fast rsqrt available so make it queryable if it's available too.
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c |   55 +++++++++++++++++++++++++++
 src/gallium/auxiliary/gallivm/lp_bld_arit.h |    7 ++++
 2 files changed, 62 insertions(+)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 08aec79..2e84543 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -2361,6 +2361,61 @@ lp_build_rsqrt(struct lp_build_context *bld,
    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
 }
 
+/**
+ * If there's a fast (inaccurate) rsqrt instruction available
+ * (caller may want to avoid to call rsqrt_fast if it's not available,
+ * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
+ * unavailable it would result in sqrt/div/mul so obviously
+ * much better to just call sqrt, skipping both div and mul).
+ */
+boolean
+lp_build_fast_rsqrt_available(struct lp_type type) {
+
+   assert(type.floating);
+
+   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
+      return true;
+   }
+   return false;
+}
+
+
+/**
+ * Generate 1/sqrt(a).
+ * Result is undefined for values < 0, infinity for +0.
+ * Precision is limited, only ~10 bits guaranteed
+ * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
+ */
+LLVMValueRef
+lp_build_rsqrt_fast(struct lp_build_context *bld,
+                    LLVMValueRef a)
+{
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   const struct lp_type type = bld->type;
+
+   assert(lp_check_value(type, a));
+
+   assert(type.floating);
+
+   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
+      const char *intrinsic = NULL;
+
+      if (type.length == 4) {
+         intrinsic = "llvm.x86.sse.rsqrt.ps";
+      }
+      else {
+         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
+      }
+      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+   }
+   else {
+      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
+   }
+   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
+}
+
 
 /**
  * Generate sin(a) using SSE2
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 966796c..d53e471 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -231,6 +231,13 @@ LLVMValueRef
 lp_build_rsqrt(struct lp_build_context *bld,
                LLVMValueRef a);
 
+boolean
+lp_build_fast_rsqrt_available(struct lp_type type);
+
+LLVMValueRef
+lp_build_rsqrt_fast(struct lp_build_context *bld,
+                    LLVMValueRef a);
+
 LLVMValueRef
 lp_build_cos(struct lp_build_context *bld,
              LLVMValueRef a);
-- 
1.7.9.5


More information about the mesa-dev mailing list