[Mesa-dev] [PATCH 2/2] gallivm: handle srgb-to-linear and linear-to-srgb conversions

Thu Jul 11 10:41:52 PDT 2013

----- Original Message -----
> Am 11.07.2013 18:54, schrieb Jose Fonseca:
> > ----- Original Message -----
> >> From: Roland Scheidegger <sroland at vmware.com>
> >>
> >> srgb-to-linear is using 3rd degree polynomial for now which should be
> >> _just_
> >> good enough. Reverse is using some rational polynomials and is quite
> >> accurate,
> >> though not hooked into llvmpipe's blend code yet and hence unused
> >> (untested).
> >> Using a table might also be an option (for srgb-to-linear especially).
> >> This does not enable any new features yet because EXT_texture_srgb was
> >> already
> >> supported via util_format fallbacks, but performance was lacking probably
> >> due
> >> to the external function call (the table used by the util_format_srgb code
> >> may
> >> not be all that much slower on its own).
> >> Some performance figures (taken from modified gloss, replaced both base
> >> and
> >> sphere texture to use GL_SRGB instead of GL_RGB, measured on 1Ghz Sandy
> >> Bridge,
> >> the numbers aren't terribly accurate):
> >>
> >> normal gloss, aos, 8-wide: 47 fps
> >> normal gloss, aos, 4-wide: 48 fps
> >>
> >> normal gloss, forced to soa, 8-wide: 48 fps
> >> normal gloss, forced to soa, 4-wide: 47 fps
> >>
> >> patched gloss, old code, soa, 8-wide: 21 fps
> >> patched gloss, old code, soa, 4-wide: 24 fps
> >>
> >> patched gloss, new code, soa, 8-wide: 41 fps
> >> patched gloss, new code, soa, 4-wide: 38 fps
> >>
> >> So there's a performance hit but it seems acceptable, certainly better
> >> than using the fallback.
> >> Note the new code only works for 4x8bit srgb formats, others (L8/L8A8)
> >> will
> >> continue to use the old util_format fallback, because I can't be bothered
> >> to write code for formats noone uses anyway (as decoding is done as part
> >> of
> >> lp_build_unpack_rgba_soa which can only handle block type width of 32).
> >> Compressed srgb formats should get their own path though eventually (it is
> >> going to be expensive in any case, first decompress, then convert).
> >> No piglit regressions.
> >> ---
> >>  src/gallium/auxiliary/Makefile.sources             |    1 +
> >>  src/gallium/auxiliary/gallivm/lp_bld_format.h      |   11 +
> >>  src/gallium/auxiliary/gallivm/lp_bld_format_soa.c  |   25 +-
> >>  src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c |  308
> >>  ++++++++++++++++++++
> >>  4 files changed, 339 insertions(+), 6 deletions(-)
> >>  create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> >>
> >> diff --git a/src/gallium/auxiliary/Makefile.sources
> >> b/src/gallium/auxiliary/Makefile.sources
> >> index 4751762..8cffeb0 100644
> >> --- a/src/gallium/auxiliary/Makefile.sources
> >> +++ b/src/gallium/auxiliary/Makefile.sources
> >> @@ -172,6 +172,7 @@ GALLIVM_SOURCES := \
> >>          gallivm/lp_bld_format_aos.c \
> >>          gallivm/lp_bld_format_aos_array.c \
> >>  	gallivm/lp_bld_format_float.c \
> >> +        gallivm/lp_bld_format_srgb.c \
> >>          gallivm/lp_bld_format_soa.c \
> >>          gallivm/lp_bld_format_yuv.c \
> >>          gallivm/lp_bld_gather.c \
> >> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h
> >> b/src/gallium/auxiliary/gallivm/lp_bld_format.h
> >> index 12a0318..744d002 100644
> >> --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
> >> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
> >> @@ -158,4 +158,15 @@ lp_build_rgb9e5_to_float(struct gallivm_state
> >> *gallivm,
> >>                           LLVMValueRef src,
> >>                           LLVMValueRef *dst);
> >>  
> >> +LLVMValueRef
> >> +lp_build_linear_to_srgb(struct gallivm_state *gallivm,
> >> +                        struct lp_type src_type,
> >> +                        LLVMValueRef src);
> >> +
> >> +LLVMValueRef
> >> +lp_build_srgb_to_linear(struct gallivm_state *gallivm,
> >> +                        struct lp_type src_type,
> >> +                        LLVMValueRef src);
> >> +
> >> +
> >>  #endif /* !LP_BLD_FORMAT_H */
> >> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> >> b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> >> index 4c6bd81..114ce03 100644
> >> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> >> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> >> @@ -163,11 +163,23 @@ lp_build_unpack_rgba_soa(struct gallivm_state
> >> *gallivm,
> >>            */
> >>  
> >>           if (type.floating) {
> >> -            if(format_desc->channel[chan].normalized)
> >> -               input = lp_build_unsigned_norm_to_float(gallivm, width,
> >> type,
> >> input);
> >> -            else
> >> -               input = LLVMBuildSIToFP(builder, input,
> >> -                                       lp_build_vec_type(gallivm, type),
> >> "");
> >> +            if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
> >> +               assert(width == 8);
> >> +               if (format_desc->swizzle[3] == chan) {
> >> +                  input = lp_build_unsigned_norm_to_float(gallivm, width,
> >> type, input);
> >> +               }
> >> +               else {
> >> +                  struct lp_type conv_type = lp_uint_type(type);
> >> +                  input = lp_build_srgb_to_linear(gallivm, conv_type,
> >> input);
> >> +               }
> >> +            }
> >> +            else {
> >> +               if(format_desc->channel[chan].normalized)
> >> +                  input = lp_build_unsigned_norm_to_float(gallivm, width,
> >> type, input);
> >> +               else
> >> +                  input = LLVMBuildSIToFP(builder, input,
> >> +                                          lp_build_vec_type(gallivm,
> >> type),
> >> "");
> >> +            }
> >>           }
> >>           else if (format_desc->channel[chan].pure_integer) {
> >>              /* Nothing to do */
> >> @@ -344,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
> >>  
> >>     if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
> >>         (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
> >> +        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
> >>          format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
> >>         format_desc->block.width == 1 &&
> >>         format_desc->block.height == 1 &&
> >> @@ -394,7 +407,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
> >>        packed = lp_build_gather(gallivm, type.length,
> >>                                 format_desc->block.bits,
> >>                                 type.width, base_ptr, offset,
> >> -			       FALSE);
> >> +                               FALSE);
> >>        if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
> >>           lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
> >>        }
> >> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> >> b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> >> new file mode 100644
> >> index 0000000..2422817
> >> --- /dev/null
> >> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> >> @@ -0,0 +1,308 @@
> >> +/**************************************************************************
> >> + *
> >> + * Copyright 2013 VMware, Inc.
> >> + * All Rights Reserved.
> >> + *
> >> + * Permission is hereby granted, free of charge, to any person obtaining
> >> a
> >> + * copy of this software and associated documentation files (the
> >> + * "Software"), to deal in the Software without restriction, including
> >> + * without limitation the rights to use, copy, modify, merge, publish,
> >> + * distribute, sub license, and/or sell copies of the Software, and to
> >> + * permit persons to whom the Software is furnished to do so, subject to
> >> + * the following conditions:
> >> + *
> >> + * The above copyright notice and this permission notice (including the
> >> + * next paragraph) shall be included in all copies or substantial
> >> portions
> >> + * of the Software.
> >> + *
> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> >> EXPRESS
> >> + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> >> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> >> NON-INFRINGEMENT.
> >> + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
> >> + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
> >> CONTRACT,
> >> + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
> >> + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
> >> + *
> >> +
> >> **************************************************************************/
> >> +
> >> +
> >> +/**
> >> + * @file
> >> + * Format conversion code for srgb formats.
> >> + *
> >> + * Functions for converting from srgb to linear and vice versa.
> >> + * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt:
> >> + *
> >> + * srgb->linear:
> >> + * cl = cs / 12.92,                 cs <= 0.04045
> >> + * cl = ((cs + 0.055)/1.055)^2.4,   cs >  0.04045
> >> + *
> >> + * linear->srgb:
> >> + * if (isnan(cl)) {
> >> + *    Map IEEE-754 Not-a-number to zero.
> >> + *    cs = 0.0;
> >> + * } else if (cl > 1.0) {
> >> + *    cs = 1.0;
> >> + * } else if (cl < 0.0) {
> >> + *    cs = 0.0;
> >> + * } else if (cl < 0.0031308) {
> >> + *    cs = 12.92 * cl;
> >> + * } else {
> >> + *    cs = 1.055 * pow(cl, 0.41666) - 0.055;
> >> + * }
> >> + *
> >> + * This does not need to be accurate, however at least for d3d10
> >> + *
> >> (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx):
> >> + * 1) For srgb->linear, it is required that the error on the srgb side is
> >> + *    not larger than 0.5f, which I interpret that if you map the value
> >> back
> >> + *    to srgb from linear using the ideal conversion, it would not be off
> >> by
> >> + *    more than 0.5f (that is, it would map to the same 8-bit integer
> >> value
> >> + *    as it was before conversion to linear).
> >> + * 2) linear->srgb is permitted 0.6f which luckily looks like quite a
> >> large
> >> + *    error is allowed.
> >> + * 3) Additionally, all srgb values converted to linear and back must
> >> result
> >> + *    in the same value as they were originally.
> >> + *
> >> + * @author Roland Scheidegger <sroland at vmware.com>
> >> + */
> >> +
> >> +
> >> +#include "util/u_debug.h"
> >> +
> >> +#include "lp_bld_type.h"
> >> +#include "lp_bld_const.h"
> >> +#include "lp_bld_arit.h"
> >> +#include "lp_bld_bitarit.h"
> >> +#include "lp_bld_logic.h"
> >> +#include "lp_bld_format.h"
> >> +
> >> +
> >> +
> >> +/**
> >> + * Convert srgb int values to linear float values.
> >> + * Several possibilities how to do this, e.g.
> >> + * - table
> >> + * - doing the pow() with int-to-float and float-to-int tricks
> >> + *
> >> (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
> >> + * - just using standard polynomial approximation
> >> + *   (3rd order polynomial is required for crappy but just sufficient
> >> accuracy)
> >> + *
> >> + * @param src   integer (vector) value(s) to convert
> >> + *              (8 bit values unpacked to 32 bit already).
> >> + */
> >> +LLVMValueRef
> >> +lp_build_srgb_to_linear(struct gallivm_state *gallivm,
> >> +                        struct lp_type src_type,
> >> +                        LLVMValueRef src)
> >> +{
> >> +   struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32);
> >> +   struct lp_build_context f32_bld;
> >> +   LLVMValueRef srcf, part_lin, part_pow, tmp, is_linear;
> >> +   LLVMValueRef lin_const, tmp_const, lin_thresh;
> >> +
> >> +   assert(src_type.width == 32);
> >> +   lp_build_context_init(&f32_bld, gallivm, f32_type);
> >> +
> >> +   /*
> >> +    * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) +
> >> 0.0023)
> >> +    * (found with octave polyfit and some magic as I couldn't get the
> >> error
> >> +    * function right). Using the above mentioned error function, the
> >> values
> >> stay
> >> +    * within +-0.35, except for the lowest values - hence tweaking linear
> >> segment
> >> +    * to cover the first 16 instead of the first 11 values (the error
> >> stays
> >> +    * just about acceptable there too).
> >> +    * Hence: lin = src > 15 ?
> >> +    *           (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023)
> >> :
> >> +    *           src / 12.6;
> >> +    * This function really only makes sense for vectors, should use LUT
> >> otherwise.
> >> +    * All in all (including float conversion) 10 instructions (with
> >> sse4.1),
> >> +    * 6 constants. Bad dependency chains though, but FMA should help
> >> (minus
> >> 3
> >> +    * instructions).
> > 
> > Please use lp_build_polynomial. It tries to avoid data dependency.
> > Furthermore, if we start using FMA, then it's less one place to update.
> Ok. Are you sure it's worth avoiding data dependency at the cost of extra
> instructions (the way I built the polynomial, it's 6 instructions, and with
> lp_build_polynomial it would be 7)? 

I'm not sure for this particular polynomial order (you could benchmark).  It did make a significant improvement for log2/exp2 's polynomials at the time James did this.

If it's not worth it, then lp_build_polynomial should do a straight polynomial for that order and lower.  But lp_build_polynomial should still be used no matter what.  The expectation being that lp_build_polynomial will emit the best code possible for any polynomial.

> I thought because r/g/b will be done in
> parallel anyway it wouldn't be much of an issue. Didn't measure it, though.
> I am actually not really sure if fma isn't already used, while this is
> an non-conformant optimization to optimize mul+add into fma some
> compilers do it by default anyway IIRC.

If you want to add a new flag lp_build_polynomial to force a straightforward polynomial expansion that's fine too

> >> +    */
> >> +   /* doing the 1/255 mul as part of the approximation */
> >> +   srcf = lp_build_int_to_float(&f32_bld, src);
> >> +   lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f *
> >> 255.0f));
> >> +   part_lin = lp_build_mul(&f32_bld, srcf, lin_const);
> >> +
> >> +   tmp_const = lp_build_const_vec(gallivm, f32_type, 0.3012f / (255.0f *
> >> 255.0f * 255.0f));
> >> +   tmp = lp_build_mul(&f32_bld, srcf, tmp_const);
> >> +   tmp_const = lp_build_const_vec(gallivm, f32_type, 0.6935f / (255.0f *
> >> 255.0f));
> >> +   tmp = lp_build_add(&f32_bld, tmp, tmp_const);
> >> +   tmp = lp_build_mul(&f32_bld, srcf, tmp);
> >> +   tmp_const = lp_build_const_vec(gallivm, f32_type, 0.0030f / 255.0f);
> >> +   tmp = lp_build_add(&f32_bld, tmp, tmp_const);
> >> +   tmp = lp_build_mul(&f32_bld, srcf, tmp);
> >> +   tmp_const = lp_build_const_vec(gallivm, f32_type, 0.0023f);
> >> +   part_pow = lp_build_add(&f32_bld, tmp, tmp_const);
> >> +
> >> +   lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f);
> >> +   is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL,
> >> srcf,
> >> lin_thresh);
> >> +   return lp_build_select(&f32_bld, is_linear, part_lin, part_pow);
> >> +}
> >> +
> >> +
> >> +/**
> >> + * Convert linear float values to srgb int values.
> >> + * Several possibilities how to do this, e.g.
> >> + * - use table (based on exponent/highest order mantissa bits) and do
> >> + *   linear interpolation (https://gist.github.com/rygorous/2203834)
> >> + * - Chebyshev polynomial
> >> + * - Approximation using reciprocals
> >> + * - using int-to-float and float-to-int tricks for pow()
> >> + *
> >> (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
> >> + *
> >> + * @param src   float (vector) value(s) to convert.
> >> + */
> >> +#if 0
> > 
> > I'd prefer you used if (0) instead of #if 0, so that we can ensure all
> > these variants staying working and buidling in the future.
> > 
> > Something like
> > 
> >   lp_build_linear_to_srgb_foo() {}
> >   lp_build_linear_to_srgb_boo() {}
> > 
> >   lp_build_linear_to_srgb() {
> >      if(0)
> >        return lp_build_linear_to_srgb_foo()
> >      else
> >        return lp_build_linear_to_srgb_boo()
> >    }
> > 
> > And you could probably refactor so that the linear part is only implemented
> > in one place.   Note I don't think that leaving multiple implementations
> > of this is bad, as the optimal way of doing this sort of things tends to
> > vary with introduction of newer processor generations.
> Ok. I did it that way because I only really planned for one method then
> figured the
> quite a bit simpler one should do, but of course didn't want to nuke the
> other one before I'm really sure it works... But when keeping both it
> certainly makes sense to actually compile them both.
> The more complex one cannot be faster though than the simpler regardless
> of cpu I'm quite sure. The dependency chains are similar as are the
> instructions used except that the complex version has an additional 4
> instructions (also in the longest dependency chain) more or less added
> on top of it.
> 
> >> +/* XXX remove this after verifying less accurate method is really good
> >> enough */
> >> +LLVMValueRef
> >> +lp_build_linear_to_srgb(struct gallivm_state *gallivm,
> >> +                        struct lp_type src_type,
> >> +                        LLVMValueRef src)
> >> +{
> >> +   LLVMBuilderRef builder = gallivm->builder;
> >> +   struct lp_build_context f32_bld;
> >> +   float exp_f = 2.0f/3.0f;
> >> +   float coeff_f = 0.62996f;
> >> +   LLVMValueRef pow_approx, coeff, x2, exponent, tmp, pow_1, pow_2,
> >> pow_final;
> >> +   LLVMValueRef lin_thresh, lin, lin_const, is_linear;
> >> +   struct lp_type int_type = lp_int_type(src_type);
> >> +
> >> +   lp_build_context_init(&f32_bld, gallivm, src_type);
> >> +
> >> +   src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);
> >> +
> >> +   /*
> >> +    * using int-to-float and float-to-int trick for pow().
> >> +    * This is much more accurate than necessary thanks to the correction,
> >> +    * but it most certainly makes no sense without rsqrt available.
> >> +    * Bonus points if you understand how this works...
> >> +    * All in all (including min/max clamp, conversion) 19 instructions.
> >> +    */
> >> +
> >> +   /*
> >> +    * First calculate approx x^8/12
> >> +    */
> >> +   exponent = lp_build_const_vec(gallivm, src_type, exp_f);
> >> +   coeff = lp_build_const_vec(gallivm, src_type,
> >> +                              exp2f(127 / exp_f - 127) * powf(coeff_f,
> >> 1.0
> > 
> >    1.0 -> 1.0f
> > 
> >> /exp_f));
> >> +
> >> +   /* premultiply src */
> >> +   tmp = lp_build_mul(&f32_bld, coeff, src);
> >> +   /* "log2" */
> >> +   tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm,
> >> int_type), "");
> >> +   tmp = lp_build_int_to_float(&f32_bld, tmp);
> >> +   /* multiply for pow */
> >> +   tmp = lp_build_mul(&f32_bld, tmp, exponent);
> >> +   /* "exp2" */
> >> +   pow_approx = lp_build_itrunc(&f32_bld, tmp);
> >> +   pow_approx = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm,
> >> src_type), "");
> >> +
> >> +   /*
> >> +    * Since that pow was inaccurate (like 3 bits, though each sqrt step
> >> would
> >> +    * give another bit), compensate the error (which is why we chose
> >> another
> >> +    * exponent in the first place).
> >> +    */
> >> +   /* x * x^(8/12) = x^(20/12) */
> >> +   pow_1 = lp_build_mul(&f32_bld, pow_approx, src);
> >> +   /* x * x * x^(-4/12) = x^(20/12) */
> >> +   tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
> >> +   x2 = lp_build_mul(&f32_bld, src, src);
> >> +   pow_2 = lp_build_mul(&f32_bld, x2, tmp);
> >> +
> >> +   /* average the values so the errors cancel out, compensate bias,
> >> +    * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0
> >> mul
> >> +    * for conversion to int in here */
> >> +   tmp = lp_build_add(&f32_bld, pow_1, pow_2);
> >> +   coeff = lp_build_const_vec(gallivm, src_type,
> >> +                              1.0f/(3.0f*coeff_f) * 0.999852f *
> >> powf(1.055f
> >> * 255.0f, 4.0f));
> >> +   pow_final = lp_build_mul(&f32_bld, tmp, coeff);
> >> +
> >> +   /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */
> >> +   pow_final = lp_build_fast_rsqrt(&f32_bld, pow_final);
> >> +   pow_final = lp_build_fast_rsqrt(&f32_bld, pow_final);
> >> +   pow_final = lp_build_add(&f32_bld, pow_final,
> >> +                            lp_build_const_vec(gallivm, src_type,
> >> -0.055f));
> >> +
> >> +   /* linear part is child's play */
> >> +   lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
> >> +   lin = lp_build_mul(&f32_bld, src, lin_const);
> >> +
> >> +   lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
> >> +   is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src,
> >> lin_thresh);
> >> +   tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);
> >> +
> >> +   f32_bld.type.sign = 0;
> >> +   return lp_build_iround(&f32_bld, tmp);
> >> +}
> >> +
> >> +#else
> >> +LLVMValueRef
> >> +lp_build_linear_to_srgb(struct gallivm_state *gallivm,
> >> +                        struct lp_type src_type,
> >> +                        LLVMValueRef src)
> >> +{
> >> +   struct lp_build_context f32_bld;
> >> +   LLVMValueRef pow_final, tmp, tmp1, tmp2, x05, x0375, a_const, b_const,
> >> c_const;
> >> +   LLVMValueRef lin_thresh, lin, lin_const, is_linear;
> >> +
> >> +   lp_build_context_init(&f32_bld, gallivm, src_type);
> >> +
> >> +   /*
> >> +    * using "rational polynomial" approximation here.
> >> +    * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
> >> +    * factoring in the 255.0 mul and the scaling mul.
> >> +    * (a is closer to actual value so has higher weight than b.)
> >> +    * Note: the constants are magic values. They were found empirically,
> >> +    * possibly could be improved but good enough (be VERY careful with
> >> +    * error metric if you'd want to tweak them, they also MUST fit with
> >> +    * the crappy polynomial above for srgb->linear since it is required
> >> +    * that each srgb value maps back to the same value).
> >> +    * This function has an error of max +-0.17 (and we'd only require
> >> +-0.6),
> >> +    * for the approximated srgb->linear values the error is naturally
> >> larger
> >> +    * (+-0.42) but still accurate enough (required +-0.5 essentially).
> >> +    * All in all (including min/max clamp, conversion) 15 instructions.
> >> +    * FMA would help (minus 2 instructions).
> >> +    */
> >> +
> >> +   if (lp_build_fast_rsqrt_available(src_type)) {
> >> +      tmp = lp_build_fast_rsqrt(&f32_bld, src);
> >> +      x05 = lp_build_mul(&f32_bld, src, tmp);
> >> +   }
> >> +   else {
> >> +      /*
> >> +       * I don't really expect this to be practical without rsqrt
> >> +       * but there's no reason for triple punishment so at least
> >> +       * save the otherwise resulting division and unnecessary mul...
> >> +       */
> >> +      x05 = lp_build_sqrt(&f32_bld, src);
> >> +   }
> >> +
> >> +   tmp = lp_build_mul(&f32_bld, x05, src);
> >> +   if (lp_build_fast_rsqrt_available(src_type)) {
> >> +      x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld,
> >> tmp));
> >> +   }
> >> +   else {
> >> +      x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
> >> +   }
> >> +
> >> +   a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 *
> >> 255.0f);
> >> +   b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 *
> >> 255.0f);
> >> +   c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);
> >> +
> >> +   tmp1 = lp_build_mul(&f32_bld, a_const, x0375);
> >> +   tmp2 = lp_build_mul(&f32_bld, b_const, x05);
> >> +   tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
> >> +   pow_final = lp_build_add(&f32_bld, tmp1, tmp2);
> > 
> > Again, please use lp_build_polynomial for polynomial construction.
> No that can't work here (only works for ordinary polynomials but not for
> something of the sort here which is a^0.5 + b^0.375 + c).

Right.

> > 
> >> +
> >> +   /* linear part */
> >> +   lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
> >> +   lin = lp_build_mul(&f32_bld, src, lin_const);
> >> +
> >> +   lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
> >> +   is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src,
> >> lin_thresh);
> >> +   tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);
> >> +
> >> +   f32_bld.type.sign = 0;
> >> +   return lp_build_iround(&f32_bld, tmp);
> >> +}
> >> +#endif
> > 
> > Otherwise looks good. I'm trusting you on the maths side though!
> The math is funky I don't even trust it! For some reason though it seems to
> give the right results :-).
> 
> 
> Roland
> 
> 
> > Jose
> > 
>