[Mesa-dev] [PATCH 1/2] gallivm: better support for fast rsqrt

Thu Jul 11 09:29:06 PDT 2013

Looks good. Thanks for the updates.

Jose

----- Original Message -----
> From: Roland Scheidegger <sroland at vmware.com>
> 
> We had to disable fast rsqrt before because it wasn't precise enough etc.
> However in situations when we know we're not going to need more precision
> we can still use a fast rsqrt (which can be several times faster than
> the quite expensive sqrt). Hence introduce a new helper which does exactly
> that. Calling it is probably not useful in some situations if there's
> no fast rsqrt available, so also make its availability queryable.
> 
> v2: use fast_rsqrt consistently instead of rsqrt_fast, fix indentation,
> let rsqrt use fast_rsqrt.
> ---
>  src/gallium/auxiliary/gallivm/lp_bld_arit.c |   75 +++++++++++++++++++++------
>  src/gallium/auxiliary/gallivm/lp_bld_arit.h |    7 +++
>  2 files changed, 66 insertions(+), 16 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> index c006ac5..fd7c22e 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> @@ -2306,19 +2306,14 @@ lp_build_rsqrt(struct lp_build_context *bld,
>     /*
>      * This should be faster but all denormals will end up as infinity.
>      */
> -   if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
> -        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
> +   if (0 && lp_build_fast_rsqrt_available(type)) {
>        const unsigned num_iterations = 1;
>        LLVMValueRef res;
>        unsigned i;
> -      const char *intrinsic = NULL;
>  
> -      if (type.length == 4) {
> -         intrinsic = "llvm.x86.sse.rsqrt.ps";
> -      }
> -      else {
> -         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
> -      }
> +      /* rsqrt(1.0) != 1.0 here */
> +      res = lp_build_fast_rsqrt(bld, a);
> +
>        if (num_iterations) {
>           /*
>            * Newton-Raphson will result in NaN instead of infinity for zero,
> @@ -2338,8 +2333,6 @@ lp_build_rsqrt(struct lp_build_context *bld,
>  
>           inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
>  
> -         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
> -
>           for (i = 0; i < num_iterations; ++i) {
>              res = lp_build_rsqrt_refine(bld, a, res);
>           }
> @@ -2350,11 +2343,6 @@ lp_build_rsqrt(struct lp_build_context *bld,
>           cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
>           res = lp_build_select(bld, cmp, bld->one, res);
>        }
> -      else {
> -         /* rsqrt(1.0) != 1.0 here */
> -         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
> -
> -      }
>  
>        return res;
>     }
> @@ -2362,6 +2350,61 @@ lp_build_rsqrt(struct lp_build_context *bld,
>     return lp_build_rcp(bld, lp_build_sqrt(bld, a));
>  }
>  
> +/**
> + * Return whether a fast (inaccurate) rsqrt instruction is available
> + * (caller may want to avoid calling fast_rsqrt if it's not available,
> + * e.g. for calculating x^0.5 it may do fast_rsqrt(x) * x but if
> + * unavailable it would result in sqrt/div/mul so obviously
> + * much better to just call sqrt, skipping both div and mul).
> + */
> +boolean
> +lp_build_fast_rsqrt_available(struct lp_type type)
> +{
> +
> +   assert(type.floating);
> +
> +   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
> +       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
> +      return true;
> +   }
> +   return false;
> +}
> +
> +
> +/**
> + * Generate 1/sqrt(a).
> + * Result is undefined for values < 0, infinity for +0.
> + * Precision is limited, only ~10 bits guaranteed
> + * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
> + */
> +LLVMValueRef
> +lp_build_fast_rsqrt(struct lp_build_context *bld,
> +                    LLVMValueRef a)
> +{
> +   LLVMBuilderRef builder = bld->gallivm->builder;
> +   const struct lp_type type = bld->type;
> +
> +   assert(lp_check_value(type, a));
> +
> +   assert(type.floating);
> +
> +   if (lp_build_fast_rsqrt_available(type)) {
> +      const char *intrinsic = NULL;
> +
> +      if (type.length == 4) {
> +         intrinsic = "llvm.x86.sse.rsqrt.ps";
> +      }
> +      else {
> +         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
> +      }
> +      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
> +   }
> +   else {
> +      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
> +   }
> +   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
> +}
> +
>  
>  /**
>   * Generate sin(a) using SSE2
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> index 966796c..920e339 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> @@ -231,6 +231,13 @@ LLVMValueRef
>  lp_build_rsqrt(struct lp_build_context *bld,
>                 LLVMValueRef a);
>  
> +boolean
> +lp_build_fast_rsqrt_available(struct lp_type type);
> +
> +LLVMValueRef
> +lp_build_fast_rsqrt(struct lp_build_context *bld,
> +                    LLVMValueRef a);
> +
>  LLVMValueRef
>  lp_build_cos(struct lp_build_context *bld,
>               LLVMValueRef a);
> --
> 1.7.9.5
>