[Mesa-dev] [PATCH] llvmpipe: fix blending with half-float formats

Tue Dec 10 07:23:28 PST 2013

Am 10.12.2013 04:57, schrieb Zack Rusin:
> The fact that we flush denorms to zero breaks our half-float
> conversion and blending. This patch enables denorms for
> blending. It's a little tricky due to the llvm bug that makes
> it incorrectly reorder the mxcsr intrinsics:
> http://llvm.org/bugs/show_bug.cgi?id=6393
> 
> Signed-off-by: Zack Rusin <zackr at vmware.com>
> ---
>  src/gallium/auxiliary/gallivm/lp_bld_arit.c | 67 +++++++++++++++++++++++++++++
>  src/gallium/auxiliary/gallivm/lp_bld_arit.h | 11 +++++
>  src/gallium/drivers/llvmpipe/lp_state_fs.c  | 31 ++++++++++---
>  3 files changed, 104 insertions(+), 5 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> index 70929e7..47e778c 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> @@ -64,6 +64,13 @@
>  #include "lp_bld_arit.h"
>  #include "lp_bld_flow.h"
>  
> +#if defined(PIPE_ARCH_SSE)
> +#include <xmmintrin.h>
> +/* This is defined in pmmintrin.h, but it can only be included when -msse3 is
> + * used, so just define it here to avoid further. */
> +#define _MM_DENORMALS_ZERO_MASK	0x0040
> +#endif
> +
>  
>  #define EXP_POLY_DEGREE 5
>  
> @@ -3489,3 +3496,63 @@ lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
>     return ret;
>  }
>  
> +
> +LLVMValueRef
> +lp_build_fpstate_get(struct gallivm_state *gallivm)
> +{
> +   if (util_cpu_caps.has_sse) {
> +      LLVMBuilderRef builder = gallivm->builder;
> +      LLVMValueRef mxcsr_ptr = lp_build_alloca(
> +         gallivm,
> +         LLVMInt32TypeInContext(gallivm->context),
> +         "mxcsr_ptr");
> +      lp_build_intrinsic(builder,
> +                         "llvm.x86.sse.stmxcsr",
> +                         LLVMVoidTypeInContext(gallivm->context),
> +                         &mxcsr_ptr, 1);
> +      return mxcsr_ptr;
> +   }
> +   return 0;
> +}
> +
> +void
> +lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
> +                                  boolean zero)
> +{
> +   if (util_cpu_caps.has_sse) {
> +      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
> +      int daz_ftz = _MM_FLUSH_ZERO_MASK;
> +
> +      LLVMBuilderRef builder = gallivm->builder;
> +      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
> +      LLVMValueRef mxcsr =
> +         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
> +
> +      if (util_cpu_caps.has_daz) {
> +         /* Enable denormals are zero mode */
> +         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
> +      }
> +      if (zero) {
> +         mxcsr = LLVMBuildOr(builder, mxcsr,
> +                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
> +      } else {
> +         mxcsr = LLVMBuildAnd(builder, mxcsr,
> +                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
> +      }
> +
> +      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
> +      lp_build_fpstate_set(gallivm, mxcsr_ptr);
> +   }
> +}
> +
> +void
> +lp_build_fpstate_set(struct gallivm_state *gallivm,
> +                     LLVMValueRef mxcsr_ptr)
> +{
> +   if (util_cpu_caps.has_sse) {
> +      lp_build_intrinsic(gallivm->builder,
> +                         "llvm.x86.sse.ldmxcsr",
> +                         LLVMVoidTypeInContext(gallivm->context),
> +                         &mxcsr_ptr, 1);
> +   }
> +}
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> index 75bf89e..9d29093 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> @@ -358,4 +358,15 @@ lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
>                         const struct lp_type type,
>                         LLVMValueRef x);
>  
> +
> +LLVMValueRef
> +lp_build_fpstate_get(struct gallivm_state *gallivm);
> +
> +void
> +lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
> +                                  boolean zero);
> +void
> +lp_build_fpstate_set(struct gallivm_state *gallivm,
> +                     LLVMValueRef mxcsr);
> +
>  #endif /* !LP_BLD_ARIT_H */
> diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
> index b5816e0..d0fdc80 100644
> --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
> +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
> @@ -1490,6 +1490,28 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>  
>     const boolean is_1d = variant->key.resource_1d;
>     unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
> +   LLVMValueRef fpstate = 0;
> +
> +   /* Get type from output format */
> +   lp_blend_type_from_format_desc(out_format_desc, &row_type);
> +   lp_mem_type_from_format_desc(out_format_desc, &dst_type);
> +
> +   /*
> +    * Technically this code should go into lp_build_smallfloat_to_float
> +    * and lp_build_float_to_smallfloat but due to the
> +    * http://llvm.org/bugs/show_bug.cgi?id=6393
> +    * llvm reorders the mxcsr intrinsics in a way that breaks the code.
> +    * So the ordering is important here and there shouldn't be any
> +    * llvm ir instructions in this function before
> +    * this, otherwise half-float format conversions won't work
> +    * (again due to llvm bug #6393).
> +    */
> +   if (dst_type.floating && dst_type.width == 16) {
This still forgets the packed float format (r11g11b10_float); the same
applies to the disabling code below, of course.

Otherwise looks good to me.

Roland

> +      /* We need to make sure that denorms are ok for half float
> +         conversions */
> +      fpstate = lp_build_fpstate_get(gallivm);
> +      lp_build_fpstate_set_denorms_zero(gallivm, FALSE);
> +   }
>  
>     mask_type = lp_int32_vec4_type();
>     mask_type.length = fs_type.length;
> @@ -1523,11 +1545,6 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>     undef_src_val = lp_build_undef(gallivm, fs_type);
>  #endif
>  
> -
> -   /* Get type from output format */
> -   lp_blend_type_from_format_desc(out_format_desc, &row_type);
> -   lp_mem_type_from_format_desc(out_format_desc, &dst_type);
> -
>     row_type.length = fs_type.length;
>     vector_width    = dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
>  
> @@ -1987,6 +2004,10 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>                               dst, dst_type, dst_count, dst_alignment);
>     }
>  
> +   if (dst_type.floating && dst_type.width == 16) {
> +      lp_build_fpstate_set(gallivm, fpstate);
> +   }
> +
>     if (do_branch) {
>        lp_build_mask_end(&mask_ctx);
>     }
>