[Mesa-dev] [PATCH 2/2] gallium/auxiliary: optimize rgb9e5 helper some more
Roland Scheidegger
sroland at vmware.com
Thu Aug 20 15:06:45 PDT 2015
Ping?
Am 09.08.2015 um 17:28 schrieb sroland at vmware.com:
> From: Roland Scheidegger <sroland at vmware.com>
>
> I used this as some testing ground for investigating some compiler
> bits initially (e.g. lrint calls etc.), figured I could do much better
> in the end just for fun...
> This is mathematically equivalent, but uses some tricks to avoid
> doubles and also replaces some float math with ints. Good for another
> performance doubling or so. As a side note, some quick tests show that
> llvm's loop vectorizer would be able to properly vectorize this version
> (which it failed to do earlier due to doubles, producing a mess), giving
> another 3 times performance increase with sse2 (more with sse4.1), but this
> may not apply to mesa.
> ---
> src/gallium/auxiliary/util/u_format_rgb9e5.h | 87 ++++++++++++++--------------
> 1 file changed, 42 insertions(+), 45 deletions(-)
>
> diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h b/src/gallium/auxiliary/util/u_format_rgb9e5.h
> index d11bfa8..21feba7 100644
> --- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
> +++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
> @@ -74,62 +74,59 @@ typedef union {
> } field;
> } rgb9e5;
>
> -static inline float rgb9e5_ClampRange(float x)
> -{
> - if (x > 0.0f) {
> - if (x >= MAX_RGB9E5) {
> - return MAX_RGB9E5;
> - } else {
> - return x;
> - }
> - } else {
> - /* NaN gets here too since comparisons with NaN always fail! */
> - return 0.0f;
> - }
> -}
>
> -/* Ok, FloorLog2 is not correct for the denorm and zero values, but we
> - are going to do a max of this value with the minimum rgb9e5 exponent
> - that will hide these problem cases. */
> -static inline int rgb9e5_FloorLog2(float x)
> +static inline int rgb9e5_ClampRange(float x)
> {
> float754 f;
> -
> + float754 max;
> f.value = x;
> - return (f.field.biasedexponent - 127);
> + max.value = MAX_RGB9E5;
> +
> + if (f.raw > 0x7f800000)
> + /* catches neg, NaNs */
> + return 0;
> + else if (f.raw >= max.raw)
> + return max.raw;
> + else
> + return f.raw;
> }
>
> static inline unsigned float3_to_rgb9e5(const float rgb[3])
> {
> rgb9e5 retval;
> - float maxrgb;
> - int rm, gm, bm;
> - float rc, gc, bc;
> - int exp_shared, maxm;
> + int rm, gm, bm, exp_shared;
> float754 revdenom = {0};
> -
> - rc = rgb9e5_ClampRange(rgb[0]);
> - gc = rgb9e5_ClampRange(rgb[1]);
> - bc = rgb9e5_ClampRange(rgb[2]);
> -
> - maxrgb = MAX3(rc, gc, bc);
> - exp_shared = MAX2(-RGB9E5_EXP_BIAS - 1, rgb9e5_FloorLog2(maxrgb)) + 1 + RGB9E5_EXP_BIAS;
> + float754 rc, bc, gc, maxrgb;
> +
> + rc.raw = rgb9e5_ClampRange(rgb[0]);
> + gc.raw = rgb9e5_ClampRange(rgb[1]);
> + bc.raw = rgb9e5_ClampRange(rgb[2]);
> + maxrgb.raw = MAX3(rc.raw, gc.raw, bc.raw);
> +
> + /*
> + * Compared to what the spec suggests, instead of conditionally adjusting
> + * the exponent after the fact do it here by doing the equivalent of +0.5 -
> + * the int add will spill over into the exponent in this case.
> + */
> + maxrgb.raw += maxrgb.raw & (1 << (23-9));
> + exp_shared = MAX2((maxrgb.raw >> 23), -RGB9E5_EXP_BIAS - 1 + 127) +
> + 1 + RGB9E5_EXP_BIAS - 127;
> + revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS -
> + RGB9E5_MANTISSA_BITS) + 1;
> assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
> - assert(exp_shared >= 0);
> - revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS);
> -
> - maxm = (int) (maxrgb * revdenom.value + 0.5);
> - if (maxm == MAX_RGB9E5_MANTISSA + 1) {
> - revdenom.value *= 0.5f;
> - exp_shared += 1;
> - assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
> - } else {
> - assert(maxm <= MAX_RGB9E5_MANTISSA);
> - }
> -
> - rm = (int) (rc * revdenom.value + 0.5);
> - gm = (int) (gc * revdenom.value + 0.5);
> - bm = (int) (bc * revdenom.value + 0.5);
> +
> + /*
> + * The spec uses strict round-up behavior (d3d10 disagrees, but in any case
> + * must match what is done above for figuring out exponent).
> + * We avoid the doubles ((int) rc * revdenom + 0.5) by doing the rounding
> + * ourselves (revdenom was adjusted by +1, above).
> + */
> + rm = (int) (rc.value * revdenom.value);
> + gm = (int) (gc.value * revdenom.value);
> + bm = (int) (bc.value * revdenom.value);
> + rm = (rm & 1) + (rm >> 1);
> + gm = (gm & 1) + (gm >> 1);
> + bm = (bm & 1) + (bm >> 1);
>
> assert(rm <= MAX_RGB9E5_MANTISSA);
> assert(gm <= MAX_RGB9E5_MANTISSA);
>
More information about the mesa-dev
mailing list