[Mesa-dev] [PATCH] nir: Optimize double-precision lower_round_even()

Mon Jan 28 18:25:53 UTC 2019

I like it :-).
That said, there are some caveats, as discussed on IRC - in particular, for
GPUs which don't do round-to-nearest-even for ordinary fp64 math (or where
the rounding mode has been manually set to something else) it won't do the
right thing.
And if fast-math can be enabled, then it probably won't round at
all (at least I think it would be legal to eliminate the add/sub in this
case).
So I'm not entirely sure anymore if this can be used unconditionally.
But I can't really tell if those potential caveats actually matter, hence
Reviewed-by: Roland Scheidegger <sroland at vmware.com>

Am 28.01.19 um 18:31 schrieb Matt Turner:
> Use the trick of adding and then subtracting 2**52 (52 is the number of
> explicit mantissa bits a double-precision floating-point value has) to
> implement round-to-even.
> 
> Cuts the number of instructions on SKL of the piglit test
> fs-roundEven-double.shader_test from 109 to 21.
> ---
>  src/compiler/nir/nir_lower_double_ops.c | 56 ++++++-------------------
>  1 file changed, 12 insertions(+), 44 deletions(-)
> 
> diff --git a/src/compiler/nir/nir_lower_double_ops.c b/src/compiler/nir/nir_lower_double_ops.c
> index 4d4cdf635ea..054fce9c168 100644
> --- a/src/compiler/nir/nir_lower_double_ops.c
> +++ b/src/compiler/nir/nir_lower_double_ops.c
> @@ -392,50 +392,18 @@ lower_fract(nir_builder *b, nir_ssa_def *src)
>  static nir_ssa_def *
>  lower_round_even(nir_builder *b, nir_ssa_def *src)
>  {
> -   /* If fract(src) == 0.5, then we will have to decide the rounding direction.
> -    * We will do this by computing the mod(abs(src), 2) and testing if it
> -    * is < 1 or not.
> -    *
> -    * We compute mod(abs(src), 2) as:
> -    * abs(src) - 2.0 * floor(abs(src) / 2.0)
> -    */
> -   nir_ssa_def *two = nir_imm_double(b, 2.0);
> -   nir_ssa_def *abs_src = nir_fabs(b, src);
> -   nir_ssa_def *mod =
> -      nir_fsub(b,
> -               abs_src,
> -               nir_fmul(b,
> -                        two,
> -                        nir_ffloor(b,
> -                                   nir_fmul(b,
> -                                            abs_src,
> -                                            nir_imm_double(b, 0.5)))));
> -
> -   /*
> -    * If fract(src) != 0.5, then we round as floor(src + 0.5)
> -    *
> -    * If fract(src) == 0.5, then we have to check the modulo:
> -    *
> -    *   if it is < 1 we need a trunc operation so we get:
> -    *      0.5 -> 0,   -0.5 -> -0
> -    *      2.5 -> 2,   -2.5 -> -2
> -    *
> -    *   otherwise we need to check if src >= 0, in which case we need to round
> -    *   upwards, or not, in which case we need to round downwards so we get:
> -    *      1.5 -> 2,   -1.5 -> -2
> -    *      3.5 -> 4,   -3.5 -> -4
> -    */
> -   nir_ssa_def *fract = nir_ffract(b, src);
> -   return nir_bcsel(b,
> -                    nir_fne(b, fract, nir_imm_double(b, 0.5)),
> -                    nir_ffloor(b, nir_fadd(b, src, nir_imm_double(b, 0.5))),
> -                    nir_bcsel(b,
> -                              nir_flt(b, mod, nir_imm_double(b, 1.0)),
> -                              nir_ftrunc(b, src),
> -                              nir_bcsel(b,
> -                                        nir_fge(b, src, nir_imm_double(b, 0.0)),
> -                                        nir_fadd(b, src, nir_imm_double(b, 0.5)),
> -                                        nir_fsub(b, src, nir_imm_double(b, 0.5)))));
> +   /* Add and subtract 2**52 to round off any fractional bits. */
> +   nir_ssa_def *two52 = nir_imm_double(b, (double)(1ull << 52));
> +   nir_ssa_def *sign = nir_iand(b, nir_unpack_64_2x32_split_y(b, src),
> +                                nir_imm_int(b, 1ull << 31));
> +
> +   b->exact = true;
> +   nir_ssa_def *res = nir_fsub(b, nir_fadd(b, nir_fabs(b, src), two52), two52);
> +   b->exact = false;
> +
> +   return nir_bcsel(b, nir_flt(b, nir_fabs(b, src), two52),
> +                    nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, res),
> +                                           nir_ior(b, nir_unpack_64_2x32_split_y(b, res), sign)), src);
>  }
>  
>  static nir_ssa_def *
>