[Mesa-dev] [PATCH 2/2] gallium/auxiliary: optimize rgb9e5 helper some more
Marek Olšák
maraeo at gmail.com
Mon Aug 24 15:28:30 PDT 2015
If there are no piglit regressions:
Acked-by: Marek Olšák <marek.olsak at amd.com>
Marek
On Fri, Aug 21, 2015 at 12:06 AM, Roland Scheidegger <sroland at vmware.com> wrote:
> Ping?
>
> Am 09.08.2015 um 17:28 schrieb sroland at vmware.com:
>> From: Roland Scheidegger <sroland at vmware.com>
>>
>> I initially used this as a testing ground for investigating some compiler
>> bits (e.g. lrint calls etc.), and figured I could do much better
>> in the end, just for fun...
>> This is mathematically equivalent, but uses some tricks to avoid
>> doubles and also replaces some float math with ints. Good for another
>> performance doubling or so. As a side note, some quick tests show that
>> llvm's loop vectorizer would be able to properly vectorize this version
>> (which it failed to do earlier due to doubles, producing a mess), giving
>> another 3 times performance increase with sse2 (more with sse4.1), but this
>> may not apply to mesa.
>> ---
>> src/gallium/auxiliary/util/u_format_rgb9e5.h | 87 ++++++++++++++--------------
>> 1 file changed, 42 insertions(+), 45 deletions(-)
>>
>> diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h b/src/gallium/auxiliary/util/u_format_rgb9e5.h
>> index d11bfa8..21feba7 100644
>> --- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
>> +++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
>> @@ -74,62 +74,59 @@ typedef union {
>> } field;
>> } rgb9e5;
>>
>> -static inline float rgb9e5_ClampRange(float x)
>> -{
>> - if (x > 0.0f) {
>> - if (x >= MAX_RGB9E5) {
>> - return MAX_RGB9E5;
>> - } else {
>> - return x;
>> - }
>> - } else {
>> - /* NaN gets here too since comparisons with NaN always fail! */
>> - return 0.0f;
>> - }
>> -}
>>
>> -/* Ok, FloorLog2 is not correct for the denorm and zero values, but we
>> - are going to do a max of this value with the minimum rgb9e5 exponent
>> - that will hide these problem cases. */
>> -static inline int rgb9e5_FloorLog2(float x)
>> +static inline int rgb9e5_ClampRange(float x)
>> {
>> float754 f;
>> -
>> + float754 max;
>> f.value = x;
>> - return (f.field.biasedexponent - 127);
>> + max.value = MAX_RGB9E5;
>> +
>> + if (f.raw > 0x7f800000)
>> + /* catches neg, NaNs */
>> + return 0;
>> + else if (f.raw >= max.raw)
>> + return max.raw;
>> + else
>> + return f.raw;
>> }
>>
>> static inline unsigned float3_to_rgb9e5(const float rgb[3])
>> {
>> rgb9e5 retval;
>> - float maxrgb;
>> - int rm, gm, bm;
>> - float rc, gc, bc;
>> - int exp_shared, maxm;
>> + int rm, gm, bm, exp_shared;
>> float754 revdenom = {0};
>> -
>> - rc = rgb9e5_ClampRange(rgb[0]);
>> - gc = rgb9e5_ClampRange(rgb[1]);
>> - bc = rgb9e5_ClampRange(rgb[2]);
>> -
>> - maxrgb = MAX3(rc, gc, bc);
>> - exp_shared = MAX2(-RGB9E5_EXP_BIAS - 1, rgb9e5_FloorLog2(maxrgb)) + 1 + RGB9E5_EXP_BIAS;
>> + float754 rc, bc, gc, maxrgb;
>> +
>> + rc.raw = rgb9e5_ClampRange(rgb[0]);
>> + gc.raw = rgb9e5_ClampRange(rgb[1]);
>> + bc.raw = rgb9e5_ClampRange(rgb[2]);
>> + maxrgb.raw = MAX3(rc.raw, gc.raw, bc.raw);
>> +
>> + /*
>> + * Compared to what the spec suggests, instead of conditionally adjusting
>> + * the exponent after the fact, do it here by doing the equivalent of +0.5 -
>> + * the int add will spill over into the exponent in this case.
>> + */
>> + maxrgb.raw += maxrgb.raw & (1 << (23-9));
>> + exp_shared = MAX2((maxrgb.raw >> 23), -RGB9E5_EXP_BIAS - 1 + 127) +
>> + 1 + RGB9E5_EXP_BIAS - 127;
>> + revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS -
>> + RGB9E5_MANTISSA_BITS) + 1;
>> assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
>> - assert(exp_shared >= 0);
>> - revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS);
>> -
>> - maxm = (int) (maxrgb * revdenom.value + 0.5);
>> - if (maxm == MAX_RGB9E5_MANTISSA + 1) {
>> - revdenom.value *= 0.5f;
>> - exp_shared += 1;
>> - assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
>> - } else {
>> - assert(maxm <= MAX_RGB9E5_MANTISSA);
>> - }
>> -
>> - rm = (int) (rc * revdenom.value + 0.5);
>> - gm = (int) (gc * revdenom.value + 0.5);
>> - bm = (int) (bc * revdenom.value + 0.5);
>> +
>> + /*
>> + * The spec uses strict round-up behavior (d3d10 disagrees, but in any case
>> + * must match what is done above for figuring out exponent).
>> + * We avoid the doubles in (int) (rc * revdenom + 0.5) by doing the rounding
>> + * ourselves (revdenom's biased exponent was increased by 1, above).
>> + */
>> + rm = (int) (rc.value * revdenom.value);
>> + gm = (int) (gc.value * revdenom.value);
>> + bm = (int) (bc.value * revdenom.value);
>> + rm = (rm & 1) + (rm >> 1);
>> + gm = (gm & 1) + (gm >> 1);
>> + bm = (bm & 1) + (bm >> 1);
>>
>> assert(rm <= MAX_RGB9E5_MANTISSA);
>> assert(gm <= MAX_RGB9E5_MANTISSA);
>>
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev mailing list