[Mesa-dev] [PATCHv2 6/8] nir/spirv/glsl450: Rewrite atan2 implementation to fix accuracy and handling of zero/infinity.

Tue Jan 31 05:03:39 UTC 2017

This patch is

Reviewed-by: Ian Romanick <ian.d.romanick at intel.com>

On 01/26/2017 02:52 PM, Francisco Jerez wrote:
> See "glsl: Rewrite atan2 implementation to fix accuracy and handling
> of zero/infinity." for the rationale, but note that the instruction
> count benefit discussed there is somewhat less important for the SPIRV
> implementation, because the current code already emitted no control
> flow instructions -- Still this saves us one hardware instruction per
> scalar component on Intel SKL hardware.
> 
> Fixes the following Vulkan CTS tests on Intel hardware:
> 
>     dEQP-VK.glsl.builtin.precision.atan2.highp_compute.scalar
>     dEQP-VK.glsl.builtin.precision.atan2.highp_compute.vec2
>     dEQP-VK.glsl.builtin.precision.atan2.highp_compute.vec3
>     dEQP-VK.glsl.builtin.precision.atan2.highp_compute.vec4
>     dEQP-VK.glsl.builtin.precision.atan2.mediump_compute.vec2
>     dEQP-VK.glsl.builtin.precision.atan2.mediump_compute.vec4
> 
> Note that most of the test-cases above expect IEEE-compliant handling
> of atan2(±∞, ±∞), which this patch doesn't explicitly handle, so
> except for the last two the test-cases above weren't expected to pass
> yet.  The reason they do is that the i965 back-end implementation of
> the NIR fmin and fmax instructions is not quite GLSL-compliant (it
> complies with IEEE 754 recommendations though), because fmin/fmax of a
> NaN and a non-NaN argument currently always return the non-NaN
> argument, which causes atan() to flush NaN to one and return the
> expected value.  The front-end should probably not be relying on this
> behavior for correctness though because other back-ends are likely to
> behave differently -- A follow-up patch will handle the atan2(±∞, ±∞)
> corner cases explicitly.
> 
> v2: Fix up argument scaling to take into account the range and
>     precision of exotic FP24 hardware.  Flip coordinate system for
>     arguments along the vertical line as if they were on the left
>     half-plane in order to avoid division by zero which may give
>     unspecified results on non-GLSL 4.1-capable hardware.  Sprinkle in
>     some more comments.
> ---
>  src/compiler/spirv/vtn_glsl450.c | 77 ++++++++++++++++++++++++++++------------
>  1 file changed, 55 insertions(+), 22 deletions(-)
> 
> diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c
> index 0d32fdd..8509f64 100644
> --- a/src/compiler/spirv/vtn_glsl450.c
> +++ b/src/compiler/spirv/vtn_glsl450.c
> @@ -302,28 +302,61 @@ build_atan(nir_builder *b, nir_ssa_def *y_over_x)
>  static nir_ssa_def *
>  build_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x)
>  {
> -   nir_ssa_def *zero = nir_imm_float(b, 0.0f);
> -
> -   /* If |x| >= 1.0e-8 * |y|: */
> -   nir_ssa_def *condition =
> -      nir_fge(b, nir_fabs(b, x),
> -              nir_fmul(b, nir_imm_float(b, 1.0e-8f), nir_fabs(b, y)));
> -
> -   /* Then...call atan(y/x) and fix it up: */
> -   nir_ssa_def *atan1 = build_atan(b, nir_fdiv(b, y, x));
> -   nir_ssa_def *r_then =
> -      nir_bcsel(b, nir_flt(b, x, zero),
> -                   nir_fadd(b, atan1,
> -                               nir_bcsel(b, nir_fge(b, y, zero),
> -                                            nir_imm_float(b, M_PIf),
> -                                            nir_imm_float(b, -M_PIf))),
> -                   atan1);
> -
> -   /* Else... */
> -   nir_ssa_def *r_else =
> -      nir_fmul(b, nir_fsign(b, y), nir_imm_float(b, M_PI_2f));
> -
> -   return nir_bcsel(b, condition, r_then, r_else);
> +   nir_ssa_def *zero = nir_imm_float(b, 0);
> +   nir_ssa_def *one = nir_imm_float(b, 1);
> +
> +   /* If we're on the left half-plane rotate the coordinates π/2 clock-wise
> +    * for the y=0 discontinuity to end up aligned with the vertical
> +    * discontinuity of atan(s/t) along t=0.  This also makes sure that we
> +    * don't attempt to divide by zero along the vertical line, which may give
> +    * unspecified results on non-GLSL 4.1-capable hardware.
> +    */
> +   nir_ssa_def *flip = nir_fge(b, zero, x);
> +   nir_ssa_def *s = nir_bcsel(b, flip, nir_fabs(b, x), y);
> +   nir_ssa_def *t = nir_bcsel(b, flip, y, nir_fabs(b, x));
> +
> +   /* If the magnitude of the denominator exceeds some huge value, scale down
> +    * the arguments in order to prevent the reciprocal operation from flushing
> +    * its result to zero, which would cause precision problems, and for s
> +    * infinite would cause us to return a NaN instead of the correct finite
> +    * value.
> +    *
> +    * If fmin and fmax are respectively the smallest and largest positive
> +    * normalized floating point values representable by the implementation,
> +    * the constants below should be in agreement with:
> +    *
> +    *    huge <= 1 / fmin
> +    *    scale <= 1 / fmin / fmax (for |t| >= huge)
> +    *
> +    * In addition scale should be a negative power of two in order to avoid
> +    * loss of precision.  The values chosen below should work for most usual
> +    * floating point representations with at least the dynamic range of ATI's
> +    * 24-bit representation.
> +    */
> +   nir_ssa_def *huge = nir_imm_float(b, 1e18f);
> +   nir_ssa_def *scale = nir_bcsel(b, nir_fge(b, nir_fabs(b, t), huge),
> +                                  nir_imm_float(b, 0.25), one);
> +   nir_ssa_def *rcp_scaled_t = nir_frcp(b, nir_fmul(b, t, scale));
> +   nir_ssa_def *s_over_t = nir_fmul(b, nir_fmul(b, s, scale), rcp_scaled_t);
> +
> +   /* Calculate the arctangent and fix up the result if we had flipped the
> +    * coordinate system.
> +    */
> +   nir_ssa_def *arc = nir_fadd(b, nir_fmul(b, nir_b2f(b, flip),
> +                                           nir_imm_float(b, M_PI_2f)),
> +                               build_atan(b, nir_fabs(b, s_over_t)));
> +
> +   /* Rather convoluted calculation of the sign of the result.  When x < 0 we
> +    * cannot use fsign because we need to be able to distinguish between
> +    * negative and positive zero.  We don't use bitwise arithmetic tricks for
> +    * consistency with the GLSL front-end.  When x >= 0 rcp_scaled_t will
> +    * always be non-negative so this won't be able to distinguish between
> +    * negative and positive zero, but we don't care because atan2 is
> +    * continuous along the whole positive y = 0 half-line, so it won't affect
> +    * the result significantly.
> +    */
> +   return nir_bcsel(b, nir_flt(b, nir_fmin(b, y, rcp_scaled_t), zero),
> +                    nir_fneg(b, arc), arc);
>  }
>  
>  static nir_ssa_def *
>