[Mesa-dev] [PATCH] util: treat denorm'ed floats like zero

Tue Jul 9 10:11:21 PDT 2013

Am 09.07.2013 07:18, schrieb Zack Rusin:
> the spec is very explicit about treatment of denorm floats and
> the behavior is exactly the same for them as it would be for -0 or
> +0. This makes our shading code match that behavior.
I think this warrants some closer mentioning of what spec this refers
to. As far as I know OpenGL doesn't imply anything about denorms.

> Signed-off-by: Zack Rusin <zackr at vmware.com>
> ---
>  src/gallium/auxiliary/draw/draw_pt.c        |    4 +++
>  src/gallium/auxiliary/gallivm/lp_bld_arit.c |    1 +
>  src/gallium/auxiliary/util/u_math.c         |   51 +++++++++++++++++++++++++++
>  src/gallium/auxiliary/util/u_math.h         |    7 ++++
>  src/gallium/drivers/llvmpipe/lp_rast.c      |    2 ++
>  5 files changed, 65 insertions(+)
> 
> diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
> index d2fe002..33c673f 100644
> --- a/src/gallium/auxiliary/draw/draw_pt.c
> +++ b/src/gallium/auxiliary/draw/draw_pt.c
> @@ -459,8 +459,10 @@ draw_vbo(struct draw_context *draw,
>     unsigned instance;
>     unsigned index_limit;
>     unsigned count;
> +   unsigned fpstate = util_fpstate_get();
>     struct pipe_draw_info resolved_info;
>  
> +   util_fpstate_set_denorms_to_zero(fpstate);
>     resolve_draw_info(info, &resolved_info);
>     info = &resolved_info;
>  
> @@ -518,6 +520,7 @@ draw_vbo(struct draw_context *draw,
>        if (index_limit == 0) {
>        /* one of the buffers is too small to do any valid drawing */
>           debug_warning("draw: VBO too small to draw anything\n");
> +         util_fpstate_set(fpstate);
>           return;
>        }
>     }
> @@ -558,4 +561,5 @@ draw_vbo(struct draw_context *draw,
>     if (draw->collect_statistics) {
>        draw->render->pipeline_statistics(draw->render, &draw->statistics);
>     }
> +   util_fpstate_set(fpstate);
>  }
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> index 08aec79..c006ac5 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> @@ -62,6 +62,7 @@
>  #include "lp_bld_debug.h"
>  #include "lp_bld_bitarit.h"
>  #include "lp_bld_arit.h"
> +#include "lp_bld_flow.h"
>  
>  
>  #define EXP_POLY_DEGREE 5
> diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c
> index 2811475..70ff710 100644
> --- a/src/gallium/auxiliary/util/u_math.c
> +++ b/src/gallium/auxiliary/util/u_math.c
> @@ -28,6 +28,7 @@
>  
>  
>  #include "util/u_math.h"
> +#include "util/u_cpu_detect.h"
>  
>  
>  /** 2^x, for x in [-1.0, 1.0) */
> @@ -70,4 +71,54 @@ util_init_math(void)
>     }
>  }
>  
> +/**
> + * Fetches the contents of the fpstate (mxcsr on x86) register.
> + *
> + * On platforms without support for it just returns 0.
> + */
> +unsigned
> +util_fpstate_get(void)
> +{
> +   unsigned mxcsr = 0;
> +
> +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
> +   if (util_cpu_caps.has_sse3) {
> +      mxcsr = __builtin_ia32_stmxcsr();
> +   }
> +#endif
>  
> +   return mxcsr;
> +}
> +
> +/**
> + * Make sure that the fp treats the denormalized floating
> + * point numbers as zero.
> + */
> +unsigned
> +util_fpstate_set_denorms_to_zero(unsigned current_mxcsr)
> +{
> +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
> +#define MXCSR_DAZ (1 << 6)	/* Enable denormals are zero mode */
> +#define MXCSR_FTZ (1 << 15)	/* Enable flush to zero mode */
> +   if (util_cpu_caps.has_sse3) {
> +      current_mxcsr |= MXCSR_DAZ | MXCSR_FTZ;
I think since FTZ is supported on all sse capable cpus this bit should
be set regardless of sse3. (Plus, most sse2 capable cpus support DAZ
except some P4, though it would require some more elaborate detection
which I'm not sure is worth it.)

> +      util_fpstate_set(current_mxcsr);
> +   }
> +#endif
> +   return current_mxcsr;
> +}
> +
> +/**
> + * Set the state of the fpstate (mxcsr on x86) register.
> + *
> + * On platforms without support for it's a noop.
> + */
> +void
> +util_fpstate_set(unsigned mxcsr)
> +{
> +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
> +   if (util_cpu_caps.has_sse3) {
> +      __builtin_ia32_ldmxcsr(mxcsr);
> +   }
> +#endif
> +}
> diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
> index 64d16cb..bc39488 100644
> --- a/src/gallium/auxiliary/util/u_math.h
> +++ b/src/gallium/auxiliary/util/u_math.h
> @@ -763,6 +763,13 @@ static INLINE int32_t util_signed_fixed(float value, unsigned frac_bits)
>     return (int32_t)(value * (1<<frac_bits));
>  }
>  
> +unsigned
> +util_fpstate_get(void);
> +unsigned
> +util_fpstate_set_denorms_to_zero(unsigned current_fpstate);
> +void
> +util_fpstate_set(unsigned fpstate);
> +
>  
>  
>  #ifdef __cplusplus
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
> index 871cc50..536f916 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast.c
> +++ b/src/gallium/drivers/llvmpipe/lp_rast.c
> @@ -751,6 +751,8 @@ static PIPE_THREAD_ROUTINE( thread_function, init_data )
>     struct lp_rasterizer_task *task = (struct lp_rasterizer_task *) init_data;
>     struct lp_rasterizer *rast = task->rast;
>     boolean debug = false;
> +   unsigned fpstate = util_fpstate_get();
> +   util_fpstate_set_denorms_to_zero(fpstate);
>  
>     while (1) {
>        /* wait for work */
> 

Otherwise looks good to me, though I'd like to see the ppc handling of
it to be the same.

Roland