[Mesa-dev] [PATCH 2/2] gallivm: Fix lerping of (un)signed normalized numbers.
Brian Paul
brianp at vmware.com
Thu Dec 6 07:02:14 PST 2012
On 12/06/2012 06:20 AM, jfonseca at vmware.com wrote:
> From: José Fonseca <jfonseca at vmware.com>
>
> Several issues actually:
>
> - Fix a regression in the rescaling of unsigned normalized numbers
>   from [0, 255] to [0, 256]
>
> - Ensure we use signed shifts where appropriate (instead of
> unsigned shifts)
>
> - Refactor the code slightly -- move all the logic inside
> lp_build_lerp_simple().
>
> This change, plus an adjustment in the tolerance of signed normalized
> results in piglit fbo-blending-formats, fixes bug 57903.
> ---
> src/gallium/auxiliary/gallivm/lp_bld_arit.c | 92 +++++++++++++--------------
> 1 file changed, 43 insertions(+), 49 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> index 8b19ebd..d930f09 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> @@ -685,7 +685,7 @@ lp_build_sub(struct lp_build_context *bld,
> /**
> * Normalized multiplication.
> *
> - * There are several approaches here (using 8-bit normalized multiplication as
> + * There are several approaches for this (using 8-bit normalized multiplication as
> * an example):
> *
> * - alpha plus one
> @@ -694,7 +694,7 @@ lp_build_sub(struct lp_build_context *bld,
> *
> * a*b/255 ~= (a*(b + 1)) >> 8
> *
> - * which is the fastest method that satisfies the following OpenGL criteria
> + * which is the fastest method that satisfies the following OpenGL criteria of
> *
> * 0*0 = 0 and 255*255 = 255
> *
> @@ -710,7 +710,7 @@ lp_build_sub(struct lp_build_context *bld,
> *
> * note that just by itself it doesn't satisfy the OpenGL criteria, as
> * 255*255 = 254, so the special case b = 255 must be accounted for or roundoff
> - * must be used
> + * must be used.
> *
> * - geometric series plus rounding
> *
> @@ -719,7 +719,9 @@ lp_build_sub(struct lp_build_context *bld,
> *
> * t/255 ~= (t + (t >> 8) + 0x80) >> 8
> *
> - * achieving the exact results
> + * achieving the exact results.
> + *
> + *
> *
> * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
> * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
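
For reference, the rounding identity quoted above can be checked
exhaustively in scalar code. A minimal plain-C sketch (not part of the
patch; div255_round is a made-up name):

    #include <assert.h>
    #include <stdint.h>

    /* t/255 with rounding, shifts and adds only: (t + (t >> 8) + 0x80) >> 8 */
    static uint32_t div255_round(uint32_t t)
    {
       return (t + (t >> 8) + 0x80) >> 8;
    }

    int main(void)
    {
       /* compare against rounded division for every 8-bit product */
       for (uint32_t a = 0; a <= 255; a++)
          for (uint32_t b = 0; b <= 255; b++)
             assert(div255_round(a * b) == (a * b + 127) / 255);
       return 0;
    }

In particular 0*0 = 0 and 255*255 = 255 hold without any special casing.
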
> @@ -733,8 +735,7 @@ lp_build_mul_norm(struct gallivm_state *gallivm,
> {
> LLVMBuilderRef builder = gallivm->builder;
> struct lp_build_context bld;
> - unsigned bits;
> - LLVMValueRef shift;
> + unsigned n;
> LLVMValueRef half;
> LLVMValueRef ab;
>
> @@ -744,29 +745,28 @@ lp_build_mul_norm(struct gallivm_state *gallivm,
>
> lp_build_context_init(&bld, gallivm, wide_type);
>
> - bits = wide_type.width / 2;
> + n = wide_type.width / 2;
> if (wide_type.sign) {
> - --bits;
> + --n;
> }
>
> - shift = lp_build_const_int_vec(gallivm, wide_type, bits);
> + /*
> + * TODO: for 16-bit normalized SSE2 vectors we could consider using PMULHUW
> + * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
> + */
>
> -#if 0
> -
> - /* a*b/255 ~= (a*(b + 1)) >> 256 */
> - /* XXX: This would not work for signed types */
> - assert(!wide_type.sign);
> - b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallium, wide_type, 1), "");
> - ab = LLVMBuildMul(builder, a, b, "");
> + /*
> + * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
> + */
>
> -#else
> -
> - /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
> ab = LLVMBuildMul(builder, a, b, "");
> - ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, shift, ""), "");
> + ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
>
> - /* Add rounding term */
> - half = lp_build_const_int_vec(gallivm, wide_type, 1 << (bits - 1));
> + /*
> + * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
> + */
> +
> + half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
> if (wide_type.sign) {
> LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
> LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
> @@ -774,9 +774,8 @@ lp_build_mul_norm(struct gallivm_state *gallivm,
> }
> ab = LLVMBuildAdd(builder, ab, half, "");
>
> -#endif
> -
> - ab = LLVMBuildLShr(builder, ab, shift, "");
> + /* Final division */
> + ab = lp_build_shr_imm(&bld, ab, n);
>
> return ab;
> }
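
For reference, a scalar model of what the rewritten lp_build_mul_norm
computes (an illustrative sketch with made-up names; it assumes an
arithmetic right shift of negative values, which C leaves
implementation-defined but mainstream compilers provide):

    #include <stdint.h>

    /*
     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + sgn(a*b) * (1 << (n - 1))) >> n
     * unorm8: n = 8 (unsigned wide type); snorm8: n = 7 (sign bit excluded).
     */
    static int32_t mul_norm_scalar(int32_t a, int32_t b, unsigned n,
                                   int is_signed)
    {
       int32_t ab = a * b;
       ab += ab >> n;                   /* geometric series term */
       int32_t half = 1 << (n - 1);     /* rounding term */
       if (is_signed && ab < 0)
          half = -half;                 /* half = sgn(ab) * (1 << (n - 1)) */
       return (ab + half) >> n;         /* final division as a shift */
    }

The vector code selects between half and minus_half with the sign mask of
ab rather than branching. Note the signed case is not exact for all
inputs, hence the piglit tolerance adjustment mentioned in the commit
message.
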
> @@ -988,14 +987,28 @@ lp_build_lerp_simple(struct lp_build_context *bld,
>
> delta = lp_build_sub(bld, v1, v0);
>
> - res = lp_build_mul(bld, x, delta);
> -
> if (normalized) {
> - if (bld->type.sign) {
> - res = lp_build_shr_imm(bld, res, half_width - 1);
> - } else {
> + if (!bld->type.sign) {
> + /*
> + * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
> + * most significant bit to the least significant bit, so that
> + * later we can just divide by 2**n instead of 2**n - 1.
> + */
> + x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
> +
> + /* (x * delta) >> n */
> + res = lp_build_mul(bld, x, delta);
> res = lp_build_shr_imm(bld, res, half_width);
> + } else {
> + /*
> + * The rescaling trick above doesn't work for signed numbers, so
> + * use the 2**n - 1 division approximation in lp_build_mul_norm
> + * instead.
> + */
> + res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
> }
> + } else {
> + res = lp_build_mul(bld, x, delta);
> }
>
> res = lp_build_add(bld, v0, res);
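
In scalar terms, the new unsigned path amounts to the following (a
made-up sketch, again assuming an arithmetic right shift for the
possibly negative product):

    #include <stdint.h>

    /* Lerp two unorm8 values: fold the MSB of x into its LSB so that the
     * weight range becomes [0, 256] and the divide is an exact shift. */
    static uint8_t lerp_unorm8(uint8_t v0, uint8_t v1, uint8_t x)
    {
       int32_t xs = x + (x >> 7);          /* 0 -> 0, 255 -> 256 */
       int32_t delta = (int32_t)v1 - v0;   /* may be negative */
       return (uint8_t)(v0 + ((xs * delta) >> 8));
    }

With this, x = 0 yields exactly v0 and x = 255 yields exactly v1; the
signed path instead reuses the mul_norm approximation sketched above.
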
> @@ -1022,7 +1035,6 @@ lp_build_lerp(struct lp_build_context *bld,
> LLVMValueRef v0,
> LLVMValueRef v1)
> {
> - LLVMBuilderRef builder = bld->gallivm->builder;
> const struct lp_type type = bld->type;
> LLVMValueRef res;
>
> @@ -1034,8 +1046,6 @@ lp_build_lerp(struct lp_build_context *bld,
> struct lp_type wide_type;
> struct lp_build_context wide_bld;
> LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
> - unsigned bits;
> - LLVMValueRef shift;
>
> assert(type.length >= 2);
>
> @@ -1055,22 +1065,6 @@ lp_build_lerp(struct lp_build_context *bld,
> lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
>
> /*
> - * Scale x from [0, 255] to [0, 256]
> - */
> -
> - bits = type.width - 1;
> - if (type.sign) {
> - --bits;
> - }
> -
> - shift = lp_build_const_int_vec(bld->gallivm, wide_type, bits - 1);
> -
> - xl = lp_build_add(&wide_bld, xl,
> - LLVMBuildAShr(builder, xl, shift, ""));
> - xh = lp_build_add(&wide_bld, xh,
> - LLVMBuildAShr(builder, xh, shift, ""));
> -
> - /*
> * Lerp both halves.
> */
>
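
The rescale removed here is where the regression mentioned in the commit
message lived: for an unsigned 8-bit type, bits = type.width - 1 = 7, so
the shift constant bits - 1 comes out as 6 rather than 7 and the scale
overshoots. A quick scalar check (not from the patch):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
       uint32_t x = 255;
       assert(x + (x >> 6) == 258);  /* old shift: [0, 255] -> [0, 258] */
       assert(x + (x >> 7) == 256);  /* correct:   [0, 255] -> [0, 256] */
       return 0;
    }

With the logic moved into lp_build_lerp_simple(), the fold now happens in
one place for all callers.
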
Reviewed-by: Brian Paul <brianp at vmware.com>