[Mesa-dev] [PATCH 2/2] gallium/auxiliary: optimize rgb9e5 helper some more

Sun Aug 9 08:28:06 PDT 2015

From: Roland Scheidegger <sroland at vmware.com>

I used this as some testing ground for investigating some compiler
bits initially (e.g. lrint calls etc.), figured I could do much better
in the end just for fun...
This is mathematically equivalent, but uses some tricks to avoid
doubles and also replaces some float math with ints. Good for another
performance doubling or so. As a side note, some quick tests show that
llvm's loop vectorizer would be able to properly vectorize this version
(which it failed to do earlier due to doubles, producing a mess), giving
another 3 times performance increase with sse2 (more with sse4.1), but this
may not apply to mesa.
---
 src/gallium/auxiliary/util/u_format_rgb9e5.h | 87 ++++++++++++++--------------
 1 file changed, 42 insertions(+), 45 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h b/src/gallium/auxiliary/util/u_format_rgb9e5.h
index d11bfa8..21feba7 100644
--- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
+++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
@@ -74,62 +74,59 @@ typedef union {
    } field;
 } rgb9e5;
 
-static inline float rgb9e5_ClampRange(float x)
-{
-   if (x > 0.0f) {
-      if (x >= MAX_RGB9E5) {
-         return MAX_RGB9E5;
-      } else {
-         return x;
-      }
-   } else {
-      /* NaN gets here too since comparisons with NaN always fail! */
-      return 0.0f;
-   }
-}
 
-/* Ok, FloorLog2 is not correct for the denorm and zero values, but we
-   are going to do a max of this value with the minimum rgb9e5 exponent
-   that will hide these problem cases. */
-static inline int rgb9e5_FloorLog2(float x)
+static inline int rgb9e5_ClampRange(float x)
 {
    float754 f;
-
+   float754 max;
    f.value = x;
-   return (f.field.biasedexponent - 127);
+   max.value = MAX_RGB9E5;
+
+   if (f.raw > 0x7f800000)
+  /* catches neg, NaNs */
+      return 0;
+   else if (f.raw >= max.raw)
+      return max.raw;
+   else
+      return f.raw;
 }
 
 static inline unsigned float3_to_rgb9e5(const float rgb[3])
 {
    rgb9e5 retval;
-   float maxrgb;
-   int rm, gm, bm;
-   float rc, gc, bc;
-   int exp_shared, maxm;
+   int rm, gm, bm, exp_shared;
    float754 revdenom = {0};
-
-   rc = rgb9e5_ClampRange(rgb[0]);
-   gc = rgb9e5_ClampRange(rgb[1]);
-   bc = rgb9e5_ClampRange(rgb[2]);
-
-   maxrgb = MAX3(rc, gc, bc);
-   exp_shared = MAX2(-RGB9E5_EXP_BIAS - 1, rgb9e5_FloorLog2(maxrgb)) + 1 + RGB9E5_EXP_BIAS;
+   float754 rc, bc, gc, maxrgb;
+
+   rc.raw = rgb9e5_ClampRange(rgb[0]);
+   gc.raw = rgb9e5_ClampRange(rgb[1]);
+   bc.raw = rgb9e5_ClampRange(rgb[2]);
+   maxrgb.raw = MAX3(rc.raw, gc.raw, bc.raw);
+
+   /*
+    * Compared to what the spec suggests, instead of conditionally adjusting
+    * the exponent after the fact do it here by doing the equivalent of +0.5 -
+    * the int add will spill over into the exponent in this case.
+    */
+   maxrgb.raw += maxrgb.raw & (1 << (23-9));
+   exp_shared = MAX2((maxrgb.raw >> 23), -RGB9E5_EXP_BIAS - 1 + 127) +
+                1 + RGB9E5_EXP_BIAS - 127;
+   revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS -
+                                          RGB9E5_MANTISSA_BITS) + 1;
    assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
-   assert(exp_shared >= 0);
-   revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS);
-
-   maxm = (int) (maxrgb * revdenom.value + 0.5);
-   if (maxm == MAX_RGB9E5_MANTISSA + 1) {
-      revdenom.value *= 0.5f;
-      exp_shared += 1;
-      assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
-   } else {
-      assert(maxm <= MAX_RGB9E5_MANTISSA);
-   }
-
-   rm = (int) (rc * revdenom.value + 0.5);
-   gm = (int) (gc * revdenom.value + 0.5);
-   bm = (int) (bc * revdenom.value + 0.5);
+
+   /*
+    * The spec uses strict round-up behavior (d3d10 disagrees, but in any case
+    * must match what is done above for figuring out exponent).
+    * We avoid the doubles ((int) rc * revdenom + 0.5) by doing the rounding
+    * ourselves (revdenom was adjusted by +1, above).
+    */
+   rm = (int) (rc.value * revdenom.value);
+   gm = (int) (gc.value * revdenom.value);
+   bm = (int) (bc.value * revdenom.value);
+   rm = (rm & 1) + (rm >> 1);
+   gm = (gm & 1) + (gm >> 1);
+   bm = (bm & 1) + (bm >> 1);
 
    assert(rm <= MAX_RGB9E5_MANTISSA);
    assert(gm <= MAX_RGB9E5_MANTISSA);
-- 
1.9.1