[Mesa-dev] [PATCH] gallivm, llvmpipe: fix float->srgb conversion to handle NaNs

Wed Nov 13 12:21:24 PST 2013

The introduction and use of lp_build_clamp_zero_one_nanzero looks good.

I can't comment on the changes to the existing paths though, as they are too subtle for me. As long there are no regressions I'm good.

Jose

----- Original Message -----
> From: Roland Scheidegger <sroland at vmware.com>
> 
> d3d10 requires us to convert NaNs to zero for any float->int conversion.
> We don't really do that but mostly seems to work. In particular I suspect the
> very common float->unorm8 path only really passes because it relies on sse2
> pack intrinsics which just happen to work by luck for NaNs (float->int
> conversion in hw gives integer indeterminate value, which just happens to be
> -0x80000000 hence gets converted to zero in the end after pack intrinsics).
> However, float->srgb didn't get so lucky, because we need to clamp before
> blending and clamping resulted in NaN behavior being undefined (and actually
> got converted to 1.0 by clamping with sse2). Fix this by using a zero/one
> clamp
> with defined nan behavior as we can handle the NaN for free this way.
> I suspect there's more bugs lurking in this area (e.g. converting floats to
> snorm) as we don't really use defined NaN behavior everywhere but this seems
> to be good enough.
> While here respecify nan behavior modes a bit, in particular the
> return_second
> mode didn't really do what we wanted. From the caller's perspective, we
> really
> wanted to say we need the non-nan result, but we already know the second arg
> isn't a NaN. So we use this now instead, which means that cpu architectures
> which actually implement min/max by always returning non-nan (that is
> adhering
> to ieee754-2008 rules) don't need to bend over backwards for nothing.
> ---
>  src/gallium/auxiliary/gallivm/lp_bld_arit.c        |   44
>  +++++++++++++-------
>  src/gallium/auxiliary/gallivm/lp_bld_arit.h        |   12 ++++--
>  src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c |    2 +-
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c    |   11 ++---
>  src/gallium/drivers/llvmpipe/lp_state_fs.c         |    4 +-
>  5 files changed, 45 insertions(+), 28 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> index 00052ed..70929e7 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> @@ -123,8 +123,10 @@ lp_build_min_simple(struct lp_build_context *bld,
>        }
>     }
>     else if (type.floating && util_cpu_caps.has_altivec) {
> -      debug_printf("%s: altivec doesn't support nan behavior modes\n",
> -                   __FUNCTION__);
> +      if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
> +         debug_printf("%s: altivec doesn't support nan return nan
> behavior\n",
> +                      __FUNCTION__);
> +      }
>        if (type.width == 32 && type.length == 4) {
>           intrinsic = "llvm.ppc.altivec.vminfp";
>           intr_size = 128;
> @@ -159,8 +161,6 @@ lp_build_min_simple(struct lp_build_context *bld,
>        }
>     } else if (util_cpu_caps.has_altivec) {
>        intr_size = 128;
> -      debug_printf("%s: altivec doesn't support nan behavior modes\n",
> -                   __FUNCTION__);
>        if (type.width == 8) {
>           if (!type.sign) {
>              intrinsic = "llvm.ppc.altivec.vminub";
> @@ -191,7 +191,7 @@ lp_build_min_simple(struct lp_build_context *bld,
>         */
>        if (util_cpu_caps.has_sse && type.floating &&
>            nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
> -          nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
> +          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
>           LLVMValueRef isnan, max;
>           max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
>                                                     type,
> @@ -227,7 +227,7 @@ lp_build_min_simple(struct lp_build_context *bld,
>           return lp_build_select(bld, cond, a, b);
>        }
>           break;
> -      case GALLIVM_NAN_RETURN_SECOND:
> +      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
>           cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
>           return lp_build_select(bld, cond, a, b);
>        case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
> @@ -299,8 +299,10 @@ lp_build_max_simple(struct lp_build_context *bld,
>        }
>     }
>     else if (type.floating && util_cpu_caps.has_altivec) {
> -      debug_printf("%s: altivec doesn't support nan behavior modes\n",
> -                   __FUNCTION__);
> +      if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
> +         debug_printf("%s: altivec doesn't support nan return nan
> behavior\n",
> +                      __FUNCTION__);
> +      }
>        if (type.width == 32 || type.length == 4) {
>           intrinsic = "llvm.ppc.altivec.vmaxfp";
>           intr_size = 128;
> @@ -336,8 +338,6 @@ lp_build_max_simple(struct lp_build_context *bld,
>        }
>     } else if (util_cpu_caps.has_altivec) {
>       intr_size = 128;
> -     debug_printf("%s: altivec doesn't support nan behavior modes\n",
> -                  __FUNCTION__);
>       if (type.width == 8) {
>         if (!type.sign) {
>           intrinsic = "llvm.ppc.altivec.vmaxub";
> @@ -362,7 +362,7 @@ lp_build_max_simple(struct lp_build_context *bld,
>     if(intrinsic) {
>        if (util_cpu_caps.has_sse && type.floating &&
>            nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
> -          nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
> +          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
>           LLVMValueRef isnan, min;
>           min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
>                                                     type,
> @@ -398,7 +398,7 @@ lp_build_max_simple(struct lp_build_context *bld,
>           return lp_build_select(bld, cond, a, b);
>        }
>           break;
> -      case GALLIVM_NAN_RETURN_SECOND:
> +      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
>           cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
>           return lp_build_select(bld, cond, a, b);
>        case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
> @@ -1399,6 +1399,7 @@ lp_build_max_ext(struct lp_build_context *bld,
>  
>  /**
>   * Generate clamp(a, min, max)
> + * NaN behavior (for any of a, min, max) is undefined.
>   * Do checks for special cases.
>   */
>  LLVMValueRef
> @@ -1418,6 +1419,20 @@ lp_build_clamp(struct lp_build_context *bld,
>  
>  
>  /**
> + * Generate clamp(a, 0, 1)
> + * A NaN will get converted to zero.
> + */
> +LLVMValueRef
> +lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
> +                                LLVMValueRef a)
> +{
> +   a = lp_build_max_ext(bld, a, bld->zero,
> GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
> +   a = lp_build_min(bld, a, bld->one);
> +   return a;
> +}
> +
> +
> +/**
>   * Generate abs(a)
>   */
>  LLVMValueRef
> @@ -3029,9 +3044,8 @@ lp_build_exp2(struct lp_build_context *bld,
>     /* We want to preserve NaN and make sure than for exp2 if x > 128,
>      * the result is INF  and if it's smaller than -126.9 the result is 0 */
>     x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0),
>     x,
> -                        GALLIVM_NAN_RETURN_SECOND);
> -   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type,
> -126.99999), x,
> -                        GALLIVM_NAN_RETURN_SECOND);
> +                        GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
> +   x = lp_build_max(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
> x);
>  
>     /* ipart = floor(x) */
>     /* fpart = x - ipart */
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> index 49d4e2c..75bf89e 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> @@ -142,9 +142,11 @@ enum gallivm_nan_behavior {
>     GALLIVM_NAN_RETURN_NAN,
>     /* If one of the inputs is NaN, the other operand is returned */
>     GALLIVM_NAN_RETURN_OTHER,
> -   /* If one of the inputs is NaN, the second operand is returned.
> -    * In min/max it will be as fast as undefined with sse opcodes */
> -   GALLIVM_NAN_RETURN_SECOND
> +   /* If one of the inputs is NaN, the other operand is returned,
> +    * but we guarantee the second operand is not a NaN.
> +    * In min/max it will be as fast as undefined with sse opcodes,
> +    * and archs having native return_other can benefit too. */
> +   GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
>  };
>  
>  LLVMValueRef
> @@ -176,6 +178,10 @@ lp_build_clamp(struct lp_build_context *bld,
>                 LLVMValueRef max);
>  
>  LLVMValueRef
> +lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
> +                                LLVMValueRef a);
> +
> +LLVMValueRef
>  lp_build_abs(struct lp_build_context *bld,
>               LLVMValueRef a);
>  
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> index 2b1fe64..6645151 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> @@ -326,7 +326,7 @@ lp_build_float_to_srgb_packed(struct gallivm_state
> *gallivm,
>      * can't use lp_build_conv since we want to keep values as 32bit
>      * here so we can interleave with rgb to go from SoA->AoS.
>      */
> -   alpha = lp_build_clamp(&f32_bld, src[3], f32_bld.zero, f32_bld.one);
> +   alpha = lp_build_clamp_zero_one_nanzero(&f32_bld, src[3]);
>     alpha = lp_build_mul(&f32_bld, alpha,
>                          lp_build_const_vec(gallivm, src_type, 255.0f));
>     tmpsrgb[3] = lp_build_iround(&f32_bld, alpha);
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> index 5f81066..5fc47ed 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> @@ -1384,21 +1384,18 @@ emit_store_chan(
>        assert(dtype == TGSI_TYPE_FLOAT ||
>               dtype == TGSI_TYPE_UNTYPED);
>        value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
> -      value = lp_build_max_ext(float_bld, value, float_bld->zero,
> -                               GALLIVM_NAN_RETURN_SECOND);
> -      value = lp_build_min_ext(float_bld, value, float_bld->one,
> -                               GALLIVM_NAN_BEHAVIOR_UNDEFINED);
> +      value = lp_build_clamp_zero_one_nanzero(float_bld, value);
>        break;
>  
>     case TGSI_SAT_MINUS_PLUS_ONE:
>        assert(dtype == TGSI_TYPE_FLOAT ||
>               dtype == TGSI_TYPE_UNTYPED);
>        value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
> +      /* This will give -1.0 for NaN which is probably not what we want. */
>        value = lp_build_max_ext(float_bld, value,
>                                 lp_build_const_vec(gallivm, float_bld->type,
>                                 -1.0),
> -                               GALLIVM_NAN_RETURN_SECOND);
> -      value = lp_build_min_ext(float_bld, value, float_bld->one,
> -                               GALLIVM_NAN_BEHAVIOR_UNDEFINED);
> +                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
> +      value = lp_build_min(float_bld, value, float_bld->one);
>        break;
>  
>     default:
> diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c
> b/src/gallium/drivers/llvmpipe/lp_state_fs.c
> index 8223d2a..b5816e0 100644
> --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
> +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
> @@ -1760,11 +1760,11 @@ generate_unswizzled_blend(struct gallivm_state
> *gallivm,
>        assert(row_type.floating);
>        lp_build_context_init(&f32_bld, gallivm, row_type);
>        for (i = 0; i < src_count; i++) {
> -         src[i] = lp_build_clamp(&f32_bld, src[i], f32_bld.zero,
> f32_bld.one);
> +         src[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src[i]);
>        }
>        if (dual_source_blend) {
>           for (i = 0; i < src_count; i++) {
> -            src1[i] = lp_build_clamp(&f32_bld, src1[i], f32_bld.zero,
> f32_bld.one);
> +            src1[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src1[i]);
>           }
>        }
>        /* probably can't be different than row_type but better safe than
>        sorry... */
> --
> 1.7.9.5
>