[Mesa-dev] [PATCH 2/2] gallium/auxiliary: optimize rgb9e5 helper some more

Mon Aug 24 15:28:30 PDT 2015

If there are no piglit regressions:

Acked-by: Marek Olšák <marek.olsak at amd.com>

Marek

On Fri, Aug 21, 2015 at 12:06 AM, Roland Scheidegger <sroland at vmware.com> wrote:
> Ping?
>
> Am 09.08.2015 um 17:28 schrieb sroland at vmware.com:
>> From: Roland Scheidegger <sroland at vmware.com>
>>
>> I used this as some testing ground for investigating some compiler
>> bits initially (e.g. lrint calls etc.), figured I could do much better
>> in the end just for fun...
>> This is mathematically equivalent, but uses some tricks to avoid
>> doubles and also replaces some float math with ints. Good for another
>> performance doubling or so. As a side note, some quick tests show that
>> llvm's loop vectorizer would be able to properly vectorize this version
>> (which it failed to do earlier due to doubles, producing a mess), giving
>> another 3 times performance increase with sse2 (more with sse4.1), but this
>> may not apply to mesa.
>> ---
>>  src/gallium/auxiliary/util/u_format_rgb9e5.h | 87 ++++++++++++++--------------
>>  1 file changed, 42 insertions(+), 45 deletions(-)
>>
>> diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h b/src/gallium/auxiliary/util/u_format_rgb9e5.h
>> index d11bfa8..21feba7 100644
>> --- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
>> +++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
>> @@ -74,62 +74,59 @@ typedef union {
>>     } field;
>>  } rgb9e5;
>>
>> -static inline float rgb9e5_ClampRange(float x)
>> -{
>> -   if (x > 0.0f) {
>> -      if (x >= MAX_RGB9E5) {
>> -         return MAX_RGB9E5;
>> -      } else {
>> -         return x;
>> -      }
>> -   } else {
>> -      /* NaN gets here too since comparisons with NaN always fail! */
>> -      return 0.0f;
>> -   }
>> -}
>>
>> -/* Ok, FloorLog2 is not correct for the denorm and zero values, but we
>> -   are going to do a max of this value with the minimum rgb9e5 exponent
>> -   that will hide these problem cases. */
>> -static inline int rgb9e5_FloorLog2(float x)
>> +static inline int rgb9e5_ClampRange(float x)
>>  {
>>     float754 f;
>> -
>> +   float754 max;
>>     f.value = x;
>> -   return (f.field.biasedexponent - 127);
>> +   max.value = MAX_RGB9E5;
>> +
>> +   if (f.raw > 0x7f800000)
>> +  /* catches neg, NaNs */
>> +      return 0;
>> +   else if (f.raw >= max.raw)
>> +      return max.raw;
>> +   else
>> +      return f.raw;
>>  }
>>
>>  static inline unsigned float3_to_rgb9e5(const float rgb[3])
>>  {
>>     rgb9e5 retval;
>> -   float maxrgb;
>> -   int rm, gm, bm;
>> -   float rc, gc, bc;
>> -   int exp_shared, maxm;
>> +   int rm, gm, bm, exp_shared;
>>     float754 revdenom = {0};
>> -
>> -   rc = rgb9e5_ClampRange(rgb[0]);
>> -   gc = rgb9e5_ClampRange(rgb[1]);
>> -   bc = rgb9e5_ClampRange(rgb[2]);
>> -
>> -   maxrgb = MAX3(rc, gc, bc);
>> -   exp_shared = MAX2(-RGB9E5_EXP_BIAS - 1, rgb9e5_FloorLog2(maxrgb)) + 1 + RGB9E5_EXP_BIAS;
>> +   float754 rc, bc, gc, maxrgb;
>> +
>> +   rc.raw = rgb9e5_ClampRange(rgb[0]);
>> +   gc.raw = rgb9e5_ClampRange(rgb[1]);
>> +   bc.raw = rgb9e5_ClampRange(rgb[2]);
>> +   maxrgb.raw = MAX3(rc.raw, gc.raw, bc.raw);
>> +
>> +   /*
>> +    * Compared to what the spec suggests, instead of conditionally adjusting
>> +    * the exponent after the fact do it here by doing the equivalent of +0.5 -
>> +    * the int add will spill over into the exponent in this case.
>> +    */
>> +   maxrgb.raw += maxrgb.raw & (1 << (23-9));
>> +   exp_shared = MAX2((maxrgb.raw >> 23), -RGB9E5_EXP_BIAS - 1 + 127) +
>> +                1 + RGB9E5_EXP_BIAS - 127;
>> +   revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS -
>> +                                          RGB9E5_MANTISSA_BITS) + 1;
>>     assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
>> -   assert(exp_shared >= 0);
>> -   revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS);
>> -
>> -   maxm = (int) (maxrgb * revdenom.value + 0.5);
>> -   if (maxm == MAX_RGB9E5_MANTISSA + 1) {
>> -      revdenom.value *= 0.5f;
>> -      exp_shared += 1;
>> -      assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
>> -   } else {
>> -      assert(maxm <= MAX_RGB9E5_MANTISSA);
>> -   }
>> -
>> -   rm = (int) (rc * revdenom.value + 0.5);
>> -   gm = (int) (gc * revdenom.value + 0.5);
>> -   bm = (int) (bc * revdenom.value + 0.5);
>> +
>> +   /*
>> +    * The spec uses strict round-up behavior (d3d10 disagrees, but in any case
>> +    * must match what is done above for figuring out exponent).
>> +    * We avoid the doubles ((int) rc * revdenom + 0.5) by doing the rounding
>> +    * ourselves (revdenom was adjusted by +1, above).
>> +    */
>> +   rm = (int) (rc.value * revdenom.value);
>> +   gm = (int) (gc.value * revdenom.value);
>> +   bm = (int) (bc.value * revdenom.value);
>> +   rm = (rm & 1) + (rm >> 1);
>> +   gm = (gm & 1) + (gm >> 1);
>> +   bm = (bm & 1) + (bm >> 1);
>>
>>     assert(rm <= MAX_RGB9E5_MANTISSA);
>>     assert(gm <= MAX_RGB9E5_MANTISSA);
>>
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev