[Mesa-dev] [PATCH] llvmpipe: fix snorm blending

Mon Nov 20 13:58:40 UTC 2017

On 18/11/17 05:34, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
> 
> The blend math gets a bit funky due to inverse blend factors being
> in range [0,2] rather than [-1,1], our normalized math can't really
> cover this.
> src_alpha_saturate blend factor has a similar problem too.
> (Note that piglit fbo-blending-formats test is mostly useless for
> anything but unorm formats, since not just all src/dst values are
> between [0,1], but the tests are crafted in a way that the results
> are between [0,1] too.)
> 
> v2: some formatting fixes, and fix a fairly obscure (to debug)
> issue with alpha-only formats (not related to snorm at all), where
> blend optimization would think it could simplify the blend equation
> if the blend factors were complementary, however was using the
> completely unrelated rgb blend factors instead of the alpha ones...
> ---
>   src/gallium/auxiliary/gallivm/lp_bld_arit.c     |  50 ++++-----
>   src/gallium/auxiliary/gallivm/lp_bld_arit.h     |   7 ++
>   src/gallium/drivers/llvmpipe/lp_bld_blend.c     | 130 ++++++++++++++++++++++--
>   src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c |  53 ++++++----
>   4 files changed, 187 insertions(+), 53 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> index a1edd34..321c6e4 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> @@ -541,38 +541,38 @@ lp_build_add(struct lp_build_context *bld,
>      assert(lp_check_value(type, a));
>      assert(lp_check_value(type, b));
>   
> -   if(a == bld->zero)
> +   if (a == bld->zero)
>         return b;
> -   if(b == bld->zero)
> +   if (b == bld->zero)
>         return a;
> -   if(a == bld->undef || b == bld->undef)
> +   if (a == bld->undef || b == bld->undef)
>         return bld->undef;
>   
> -   if(bld->type.norm) {
> +   if (type.norm) {
>         const char *intrinsic = NULL;
>   
> -      if(a == bld->one || b == bld->one)
> +      if (!type.sign && (a == bld->one || b == bld->one))
>           return bld->one;
>   
>         if (!type.floating && !type.fixed) {
>            if (type.width * type.length == 128) {
> -            if(util_cpu_caps.has_sse2) {
> -              if(type.width == 8)
> +            if (util_cpu_caps.has_sse2) {
> +              if (type.width == 8)
>                   intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
> -              if(type.width == 16)
> +              if (type.width == 16)
>                   intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
>               } else if (util_cpu_caps.has_altivec) {
> -              if(type.width == 8)
> +              if (type.width == 8)
>                    intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
> -              if(type.width == 16)
> +              if (type.width == 16)
>                    intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
>               }
>            }
>            if (type.width * type.length == 256) {
> -            if(util_cpu_caps.has_avx2) {
> -              if(type.width == 8)
> +            if (util_cpu_caps.has_avx2) {
> +              if (type.width == 8)
>                   intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
> -              if(type.width == 16)
> +              if (type.width == 16)
>                   intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
>               }
>            }
> @@ -842,38 +842,38 @@ lp_build_sub(struct lp_build_context *bld,
>      assert(lp_check_value(type, a));
>      assert(lp_check_value(type, b));
>   
> -   if(b == bld->zero)
> +   if (b == bld->zero)
>         return a;
> -   if(a == bld->undef || b == bld->undef)
> +   if (a == bld->undef || b == bld->undef)
>         return bld->undef;
> -   if(a == b)
> +   if (a == b)
>         return bld->zero;
>   
> -   if(bld->type.norm) {
> +   if (type.norm) {
>         const char *intrinsic = NULL;
>   
> -      if(b == bld->one)
> +      if (!type.sign && b == bld->one)
>           return bld->zero;
>   
>         if (!type.floating && !type.fixed) {
>            if (type.width * type.length == 128) {
>               if (util_cpu_caps.has_sse2) {
> -              if(type.width == 8)
> +              if (type.width == 8)
>                    intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
> -              if(type.width == 16)
> +              if (type.width == 16)
>                    intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
>               } else if (util_cpu_caps.has_altivec) {
> -              if(type.width == 8)
> +              if (type.width == 8)
>                    intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
> -              if(type.width == 16)
> +              if (type.width == 16)
>                    intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
>               }
>            }
>            if (type.width * type.length == 256) {
>               if (util_cpu_caps.has_avx2) {
> -              if(type.width == 8)
> +              if (type.width == 8)
>                    intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
> -              if(type.width == 16)
> +              if (type.width == 16)
>                    intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
>               }
>            }
> @@ -963,7 +963,7 @@ lp_build_sub(struct lp_build_context *bld,
>    * @sa Michael Herf, The "double blend trick", May 2000,
>    *     http://www.stereopsis.com/doubleblend.html
>    */
> -static LLVMValueRef
> +LLVMValueRef
>   lp_build_mul_norm(struct gallivm_state *gallivm,
>                     struct lp_type wide_type,
>                     LLVMValueRef a, LLVMValueRef b)
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> index 2a4137a..f5b2800 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
> @@ -71,6 +71,13 @@ lp_build_sub(struct lp_build_context *bld,
>                LLVMValueRef a,
>                LLVMValueRef b);
>   
> +
> +LLVMValueRef
> +lp_build_mul_norm(struct gallivm_state *gallivm,
> +                  struct lp_type wide_type,
> +                  LLVMValueRef a,
> +                  LLVMValueRef b);
> +
>   LLVMValueRef
>   lp_build_mul(struct lp_build_context *bld,
>                LLVMValueRef a,
> diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.c b/src/gallium/drivers/llvmpipe/lp_bld_blend.c
> index 1feb415..af03d6c 100644
> --- a/src/gallium/drivers/llvmpipe/lp_bld_blend.c
> +++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.c
> @@ -35,6 +35,7 @@
>   #include "gallivm/lp_bld_swizzle.h"
>   #include "gallivm/lp_bld_flow.h"
>   #include "gallivm/lp_bld_debug.h"
> +#include "gallivm/lp_bld_pack.h"
>   
>   #include "lp_bld_blend.h"
>   
> @@ -65,11 +66,11 @@ lp_build_blend_func_commutative(unsigned func)
>   boolean
>   lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
>   {
> -   if(rgb_func == alpha_func)
> +   if (rgb_func == alpha_func)
>         return FALSE;
> -   if(rgb_func == PIPE_BLEND_SUBTRACT && alpha_func == PIPE_BLEND_REVERSE_SUBTRACT)
> +   if (rgb_func == PIPE_BLEND_SUBTRACT && alpha_func == PIPE_BLEND_REVERSE_SUBTRACT)
>         return TRUE;
> -   if(rgb_func == PIPE_BLEND_REVERSE_SUBTRACT && alpha_func == PIPE_BLEND_SUBTRACT)
> +   if (rgb_func == PIPE_BLEND_REVERSE_SUBTRACT && alpha_func == PIPE_BLEND_SUBTRACT)
>         return TRUE;
>      return FALSE;
>   }
> @@ -86,6 +87,56 @@ lp_build_blend_factor_complementary(unsigned src_factor, unsigned dst_factor)
>   
>   
>   /**
> + * Whether this is a inverse blend factor
> + */
> +static inline boolean
> +is_inverse_factor(unsigned factor)
> +{
> +   return factor > 0x11;

I fear that if the pipe_blendfactor enum ever gets renumered this would 
silently fail.  It's better to add some sort of compile time assertion 
here, e.,

    STATIC_ASSERT(PIPE_BLENDFACTOR_ZERO = 0x11);

> +}
> +
> +
> +/**
> + * Calculates the (expanded to wider type) multiplication
> + * of 2 normalized numbers.
> + */
> +static void
> +lp_build_mul_norm_expand(struct lp_build_context *bld,
> +                         LLVMValueRef a, LLVMValueRef b,
> +                         LLVMValueRef *resl, LLVMValueRef *resh,
> +                         boolean signedness_differs)
> +{
> +   const struct lp_type type = bld->type;
> +   struct lp_type wide_type = lp_wider_type(type);
> +   struct lp_type wide_type2 = wide_type;
> +   struct lp_type type2 = type;
> +   LLVMValueRef al, ah, bl, bh;
> +
> +   assert(lp_check_value(type, a));
> +   assert(lp_check_value(type, b));
> +   assert(!type.floating && !type.fixed && type.norm);
> +
> +   if (a == bld->zero || b == bld->zero) {
> +      LLVMValueRef zero = LLVMConstNull(lp_build_vec_type(bld->gallivm, wide_type));
> +      *resl = zero;
> +      *resh = zero;
> +      return;
> +   }
> +
> +   if (signedness_differs) {
> +      type2.sign = !type.sign;
> +      wide_type2.sign = !wide_type2.sign;
> +   }
> +
> +   lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
> +   lp_build_unpack2_native(bld->gallivm, type2, wide_type2, b, &bl, &bh);
> +
> +   *resl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
> +   *resh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
> +}
> +
> +
> +/**
>    * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml
>    */
>   LLVMValueRef
> @@ -155,7 +206,7 @@ lp_build_blend(struct lp_build_context *bld,
>               } else {
>                  return lp_build_lerp(bld, dst_factor, src, dst, 0);
>               }
> -         } else if(bld->type.floating && func == PIPE_BLEND_SUBTRACT) {
> +         } else if (bld->type.floating && func == PIPE_BLEND_SUBTRACT) {
>               result = lp_build_add(bld, src, dst);
>   
>               if (factor_src < factor_dst) {
> @@ -165,7 +216,7 @@ lp_build_blend(struct lp_build_context *bld,
>                  result = lp_build_mul(bld, result, dst_factor);
>                  return lp_build_sub(bld, src, result);
>               }
> -         } else if(bld->type.floating && func == PIPE_BLEND_REVERSE_SUBTRACT) {
> +         } else if (bld->type.floating && func == PIPE_BLEND_REVERSE_SUBTRACT) {
>               result = lp_build_add(bld, src, dst);
>   
>               if (factor_src < factor_dst) {
> @@ -192,9 +243,72 @@ lp_build_blend(struct lp_build_context *bld,
>      if (optimise_only)
>         return NULL;
>   
> -   src_term = lp_build_mul(bld, src, src_factor);
> -   dst_term = lp_build_mul(bld, dst, dst_factor);
> -   return lp_build_blend_func(bld, func, src_term, dst_term);
> +   if ((bld->type.norm && bld->type.sign) &&
> +       (is_inverse_factor(factor_src) || is_inverse_factor(factor_dst))) {
> +      /*
> +       * With snorm blending, the inverse blend factors range from [0,2]
> +       * instead of [-1,1], 

 >
so the ordinary signed normalized arithmetic
> +       * doesn't quite work. Unpack must be unsigned, and the add/sub
> +       * must be done with wider type.
> +       * (Note that it's not quite obvious what the blend equation wrt to
> +       * clamping should actually be based on GL spec in this case, but
> +       * really the incoming src values are clamped to [-1,1] (the dst is
> +       * always clamped already), and then NO further clamping occurs until
> +       * the end.)
> +       */
> +      struct lp_build_context bldw;
> +      struct lp_type wide_type = lp_wider_type(bld->type);
> +      LLVMValueRef src_terml, src_termh, dst_terml, dst_termh;
> +      LLVMValueRef resl, resh;
> +
> +      /*
> +       * We don't need saturate math for the sub/add, since we have
> +       * x+1 bit numbers in x*2 wide type (result is x+2 bits).
> +       * (Doesn't really matter on x86 sse2 though as we use saturated
> +       * intrinsics.)
> +       */
> +      wide_type.norm = 0;
> +      lp_build_context_init(&bldw, bld->gallivm, wide_type);
> +
> +      /*
> +       * XXX This is a bit hackish. Note that -128 really should
> +       * be -1.0, the same as -127. However, we did not actually clamp
> +       * things anywhere (relying on pack intrinsics instead) therefore
> +       * we will get -128, and the inverted factor then 255. But the mul
> +       * can overflow in this case (rather the rounding fixups for the mul,
> +       * -128*255 will be positive).
> +       * So we clamp the src and dst up here but only when necessary (we
> +       * should do this before calculating blend factors but it's enough
> +       * for avoiding overflow).
> +       */
> +      if (is_inverse_factor(factor_src)) {
> +         src = lp_build_max(bld, src,
> +                            lp_build_const_vec(bld->gallivm, bld->type, -1.0));
> +      }
> +      if (is_inverse_factor(factor_dst)) {
> +         dst = lp_build_max(bld, dst,
> +                            lp_build_const_vec(bld->gallivm, bld->type, -1.0));
> +      }
> +
> +      lp_build_mul_norm_expand(bld, src, src_factor, &src_terml, &src_termh,
> +                               is_inverse_factor(factor_src) ? TRUE : FALSE);
> +      lp_build_mul_norm_expand(bld, dst, dst_factor, &dst_terml, &dst_termh,
> +                               is_inverse_factor(factor_dst) ? TRUE : FALSE);
> +      resl = lp_build_blend_func(&bldw, func, src_terml, dst_terml);
> +      resh = lp_build_blend_func(&bldw, func, src_termh, dst_termh);
> +
> +      /*
> +       * XXX pack2_native is not ok because the values have to be in dst
> +       * range. We need native pack though for the correct order on avx2.
> +       * Will break on everything not implementing clamping pack intrinsics
> +       * (i.e. everything but sse2 and altivec).
> +       */
> +      return lp_build_pack2_native(bld->gallivm, wide_type, bld->type, resl, resh);

Great catch!

Do you think we should just have a generic covert to float/sint32 for snorm?

We should also add piglit test cases for this, so this can never go 
underected again.  Particularly focusing on the hard edge-cases of 
llvmpipe you mention above.

Reviewed-by: Jose Fonseca <jfonseca at vmware.com>

Jose

> +   } else {
> +      src_term = lp_build_mul(bld, src, src_factor);
> +      dst_term = lp_build_mul(bld, dst, dst_factor);
> +      return lp_build_blend_func(bld, func, src_term, dst_term);
> +   }
>   }
>   
>   void
> diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
> index 45c5c2b..c16ef1a 100644
> --- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
> +++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
> @@ -112,22 +112,34 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
>      case PIPE_BLENDFACTOR_DST_ALPHA:
>         return bld->dst;
>      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
> -      if(alpha)
> +      if (alpha)
>            return bld->base.one;
>         else {
>            /*
>             * If there's no dst alpha the complement is zero but for unclamped
> -          * float inputs min can be non-zero (negative).
> +          * float inputs (or snorm inputs) min can be non-zero (negative).
>             */
> -         if (!bld->has_dst_alpha) {
> -            if (!bld->saturate)
> +         if (!bld->saturate) {
> +            if (!bld->has_dst_alpha) {
>                  bld->saturate = lp_build_min(&bld->base, src_alpha, bld->base.zero);
> -         }
> -         else {
> -            if(!bld->inv_dst)
> -               bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
> -            if(!bld->saturate)
> +            }
> +            else if (bld->base.type.norm && bld->base.type.sign) {
> +               /*
> +                * The complement/min totally doesn't work, since
> +                * the complement is in range [0,2] but the other
> +                * min input is [-1,1]. However, we can just clamp to 0
> +                * before doing the complement...
> +                */
> +               LLVMValueRef inv_dst;
> +               inv_dst = lp_build_max(&bld->base, bld->base.zero, bld->dst);
> +               inv_dst = lp_build_comp(&bld->base, inv_dst);
> +               bld->saturate = lp_build_min(&bld->base, src_alpha, inv_dst);
> +            } else {
> +               if (!bld->inv_dst) {
> +                  bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
> +               }
>                  bld->saturate = lp_build_min(&bld->base, src_alpha, bld->inv_dst);
> +            }
>            }
>            return bld->saturate;
>         }
> @@ -140,24 +152,24 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
>      case PIPE_BLENDFACTOR_SRC1_ALPHA:
>         return src1_alpha;
>      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
> -      if(!bld->inv_src)
> +      if (!bld->inv_src)
>            bld->inv_src = lp_build_comp(&bld->base, bld->src);
>         return bld->inv_src;
>      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
> -      if(!bld->inv_src_alpha)
> +      if (!bld->inv_src_alpha)
>            bld->inv_src_alpha = lp_build_comp(&bld->base, src_alpha);
>         return bld->inv_src_alpha;
>      case PIPE_BLENDFACTOR_INV_DST_COLOR:
>      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
> -      if(!bld->inv_dst)
> +      if (!bld->inv_dst)
>            bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
>         return bld->inv_dst;
>      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
> -      if(!bld->inv_const)
> +      if (!bld->inv_const)
>            bld->inv_const = lp_build_comp(&bld->base, bld->const_);
>         return bld->inv_const;
>      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
> -      if(!bld->inv_const_alpha)
> +      if (!bld->inv_const_alpha)
>            bld->inv_const_alpha = lp_build_comp(&bld->base, const_alpha);
>         return bld->inv_const_alpha;
>      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
> @@ -331,7 +343,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
>      bld.const_alpha = const_alpha;
>      bld.has_dst_alpha = FALSE;
>   
> -   /* Find the alpha channel if not provided seperately */
> +   /* Find the alpha channel if not provided separately */
>      if (!src_alpha) {
>         for (i = 0; i < 4; ++i) {
>            if (swizzle[i] == 3) {
> @@ -349,7 +361,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
>      }
>   
>      if (blend->logicop_enable) {
> -      if(!type.floating) {
> +      if (!type.floating) {
>            result = lp_build_logicop(gallivm->builder, blend->logicop_func, src, dst);
>         }
>         else {
> @@ -361,6 +373,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
>         boolean rgb_alpha_same = (state->rgb_src_factor == state->rgb_dst_factor &&
>                                   state->alpha_src_factor == state->alpha_dst_factor) ||
>                                  nr_channels == 1;
> +      boolean alpha_only = nr_channels == 1 && alpha_swizzle == PIPE_SWIZZLE_X;
>   
>         src_factor = lp_build_blend_factor(&bld, state->rgb_src_factor,
>                                            state->alpha_src_factor,
> @@ -374,8 +387,8 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
>   
>         result = lp_build_blend(&bld.base,
>                                 state->rgb_func,
> -                              state->rgb_src_factor,
> -                              state->rgb_dst_factor,
> +                              alpha_only ? state->alpha_src_factor : state->rgb_src_factor,
> +                              alpha_only ? state->alpha_dst_factor : state->rgb_dst_factor,
>                                 src,
>                                 dst,
>                                 src_factor,
> @@ -383,8 +396,8 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
>                                 rgb_alpha_same,
>                                 false);
>   
> -      if(state->rgb_func != state->alpha_func && nr_channels > 1 &&
> -                            alpha_swizzle != PIPE_SWIZZLE_NONE) {
> +      if (state->rgb_func != state->alpha_func && nr_channels > 1 &&
> +          alpha_swizzle != PIPE_SWIZZLE_NONE) {
>            LLVMValueRef alpha;
>   
>            alpha = lp_build_blend(&bld.base,
>