[Mesa-dev] [PATCH 2/2] gallium/auxiliary: optimize rgb9e5 helper some more

Thu Aug 20 15:06:45 PDT 2015

Ping?

Am 09.08.2015 um 17:28 schrieb sroland at vmware.com:
> From: Roland Scheidegger <sroland at vmware.com>
> 
> I initially used this as a testing ground for investigating some compiler
> behavior (e.g. lrint calls etc.), and figured in the end that I could do
> much better, just for fun...
> This is mathematically equivalent, but uses some tricks to avoid
> doubles and also replaces some float math with ints. Good for another
> performance doubling or so. As a side note, some quick tests show that
> llvm's loop vectorizer would be able to properly vectorize this version
> (which it failed to do earlier due to doubles, producing a mess), giving
> another 3 times performance increase with sse2 (more with sse4.1), but this
> may not apply to mesa.
> ---
>  src/gallium/auxiliary/util/u_format_rgb9e5.h | 87 ++++++++++++++--------------
>  1 file changed, 42 insertions(+), 45 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h b/src/gallium/auxiliary/util/u_format_rgb9e5.h
> index d11bfa8..21feba7 100644
> --- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
> +++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
> @@ -74,62 +74,59 @@ typedef union {
>     } field;
>  } rgb9e5;
>  
> -static inline float rgb9e5_ClampRange(float x)
> -{
> -   if (x > 0.0f) {
> -      if (x >= MAX_RGB9E5) {
> -         return MAX_RGB9E5;
> -      } else {
> -         return x;
> -      }
> -   } else {
> -      /* NaN gets here too since comparisons with NaN always fail! */
> -      return 0.0f;
> -   }
> -}
>  
> -/* Ok, FloorLog2 is not correct for the denorm and zero values, but we
> -   are going to do a max of this value with the minimum rgb9e5 exponent
> -   that will hide these problem cases. */
> -static inline int rgb9e5_FloorLog2(float x)
> +static inline int rgb9e5_ClampRange(float x)
>  {
>     float754 f;
> -
> +   float754 max;
>     f.value = x;
> -   return (f.field.biasedexponent - 127);
> +   max.value = MAX_RGB9E5;
> +
> +   if (f.raw > 0x7f800000)
> +      /* catches negative values and NaNs */
> +      return 0;
> +   else if (f.raw >= max.raw)
> +      return max.raw;
> +   else
> +      return f.raw;
>  }
>  
>  static inline unsigned float3_to_rgb9e5(const float rgb[3])
>  {
>     rgb9e5 retval;
> -   float maxrgb;
> -   int rm, gm, bm;
> -   float rc, gc, bc;
> -   int exp_shared, maxm;
> +   int rm, gm, bm, exp_shared;
>     float754 revdenom = {0};
> -
> -   rc = rgb9e5_ClampRange(rgb[0]);
> -   gc = rgb9e5_ClampRange(rgb[1]);
> -   bc = rgb9e5_ClampRange(rgb[2]);
> -
> -   maxrgb = MAX3(rc, gc, bc);
> -   exp_shared = MAX2(-RGB9E5_EXP_BIAS - 1, rgb9e5_FloorLog2(maxrgb)) + 1 + RGB9E5_EXP_BIAS;
> +   float754 rc, bc, gc, maxrgb;
> +
> +   rc.raw = rgb9e5_ClampRange(rgb[0]);
> +   gc.raw = rgb9e5_ClampRange(rgb[1]);
> +   bc.raw = rgb9e5_ClampRange(rgb[2]);
> +   maxrgb.raw = MAX3(rc.raw, gc.raw, bc.raw);
> +
> +   /*
> +    * Compared to what the spec suggests, instead of conditionally adjusting
> +    * the exponent after the fact, do it here by doing the equivalent of +0.5:
> +    * the int add will spill over into the exponent in this case.
> +    */
> +   maxrgb.raw += maxrgb.raw & (1 << (23-9));
> +   exp_shared = MAX2((maxrgb.raw >> 23), -RGB9E5_EXP_BIAS - 1 + 127) +
> +                1 + RGB9E5_EXP_BIAS - 127;
> +   revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS -
> +                                          RGB9E5_MANTISSA_BITS) + 1;
>     assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
> -   assert(exp_shared >= 0);
> -   revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS);
> -
> -   maxm = (int) (maxrgb * revdenom.value + 0.5);
> -   if (maxm == MAX_RGB9E5_MANTISSA + 1) {
> -      revdenom.value *= 0.5f;
> -      exp_shared += 1;
> -      assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
> -   } else {
> -      assert(maxm <= MAX_RGB9E5_MANTISSA);
> -   }
> -
> -   rm = (int) (rc * revdenom.value + 0.5);
> -   gm = (int) (gc * revdenom.value + 0.5);
> -   bm = (int) (bc * revdenom.value + 0.5);
> +
> +   /*
> +    * The spec uses strict round-up behavior (d3d10 disagrees, but in any case
> +    * this must match what is done above for figuring out the exponent).
> +    * We avoid the doubles ((int) (rc * revdenom + 0.5)) by doing the rounding
> +    * ourselves (the exponent of revdenom was increased by 1, above).
> +    */
> +   rm = (int) (rc.value * revdenom.value);
> +   gm = (int) (gc.value * revdenom.value);
> +   bm = (int) (bc.value * revdenom.value);
> +   rm = (rm & 1) + (rm >> 1);
> +   gm = (gm & 1) + (gm >> 1);
> +   bm = (bm & 1) + (bm >> 1);
>  
>     assert(rm <= MAX_RGB9E5_MANTISSA);
>     assert(gm <= MAX_RGB9E5_MANTISSA);
>