[Mesa-dev] [PATCH 2/2] gallivm: handle srgb-to-linear and linear-to-srgb conversions

Thu Jul 11 09:54:50 PDT 2013

----- Original Message -----
> From: Roland Scheidegger <sroland at vmware.com>
> 
> srgb-to-linear is using 3rd degree polynomial for now which should be _just_
> good enough. Reverse is using some rational polynomials and is quite
> accurate,
> though not hooked into llvmpipe's blend code yet and hence unused (untested).
> Using a table might also be an option (for srgb-to-linear especially).
> This does not enable any new features yet because EXT_texture_srgb was
> already
> supported via util_format fallbacks, but performance was lacking probably due
> to the external function call (the table used by the util_format_srgb code
> may
> not be all that much slower on its own).
> Some performance figures (taken from modified gloss, replaced both base and
> sphere texture to use GL_SRGB instead of GL_RGB, measured on 1Ghz Sandy
> Bridge,
> the numbers aren't terribly accurate):
> 
> normal gloss, aos, 8-wide: 47 fps
> normal gloss, aos, 4-wide: 48 fps
> 
> normal gloss, forced to soa, 8-wide: 48 fps
> normal gloss, forced to soa, 4-wide: 47 fps
> 
> patched gloss, old code, soa, 8-wide: 21 fps
> patched gloss, old code, soa, 4-wide: 24 fps
> 
> patched gloss, new code, soa, 8-wide: 41 fps
> patched gloss, new code, soa, 4-wide: 38 fps
> 
> So there's a performance hit but it seems acceptable, certainly better
> than using the fallback.
> Note the new code only works for 4x8bit srgb formats, others (L8/L8A8) will
> continue to use the old util_format fallback, because I can't be bothered
> to write code for formats no one uses anyway (as decoding is done as part of
> lp_build_unpack_rgba_soa which can only handle block type width of 32).
> Compressed srgb formats should get their own path though eventually (it is
> going to be expensive in any case, first decompress, then convert).
> No piglit regressions.
> ---
>  src/gallium/auxiliary/Makefile.sources             |    1 +
>  src/gallium/auxiliary/gallivm/lp_bld_format.h      |   11 +
>  src/gallium/auxiliary/gallivm/lp_bld_format_soa.c  |   25 +-
>  src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c |  308
>  ++++++++++++++++++++
>  4 files changed, 339 insertions(+), 6 deletions(-)
>  create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> 
> diff --git a/src/gallium/auxiliary/Makefile.sources
> b/src/gallium/auxiliary/Makefile.sources
> index 4751762..8cffeb0 100644
> --- a/src/gallium/auxiliary/Makefile.sources
> +++ b/src/gallium/auxiliary/Makefile.sources
> @@ -172,6 +172,7 @@ GALLIVM_SOURCES := \
>          gallivm/lp_bld_format_aos.c \
>          gallivm/lp_bld_format_aos_array.c \
>  	gallivm/lp_bld_format_float.c \
> +        gallivm/lp_bld_format_srgb.c \
>          gallivm/lp_bld_format_soa.c \
>          gallivm/lp_bld_format_yuv.c \
>          gallivm/lp_bld_gather.c \
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h
> b/src/gallium/auxiliary/gallivm/lp_bld_format.h
> index 12a0318..744d002 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
> @@ -158,4 +158,15 @@ lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
>                           LLVMValueRef src,
>                           LLVMValueRef *dst);
>  
> +LLVMValueRef
> +lp_build_linear_to_srgb(struct gallivm_state *gallivm,
> +                        struct lp_type src_type,
> +                        LLVMValueRef src);
> +
> +LLVMValueRef
> +lp_build_srgb_to_linear(struct gallivm_state *gallivm,
> +                        struct lp_type src_type,
> +                        LLVMValueRef src);
> +
> +
>  #endif /* !LP_BLD_FORMAT_H */
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> index 4c6bd81..114ce03 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> @@ -163,11 +163,23 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
>            */
>  
>           if (type.floating) {
> -            if(format_desc->channel[chan].normalized)
> -               input = lp_build_unsigned_norm_to_float(gallivm, width, type,
> input);
> -            else
> -               input = LLVMBuildSIToFP(builder, input,
> -                                       lp_build_vec_type(gallivm, type),
> "");
> +            if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
> +               assert(width == 8);
> +               if (format_desc->swizzle[3] == chan) {
> +                  input = lp_build_unsigned_norm_to_float(gallivm, width,
> type, input);
> +               }
> +               else {
> +                  struct lp_type conv_type = lp_uint_type(type);
> +                  input = lp_build_srgb_to_linear(gallivm, conv_type,
> input);
> +               }
> +            }
> +            else {
> +               if(format_desc->channel[chan].normalized)
> +                  input = lp_build_unsigned_norm_to_float(gallivm, width,
> type, input);
> +               else
> +                  input = LLVMBuildSIToFP(builder, input,
> +                                          lp_build_vec_type(gallivm, type),
> "");
> +            }
>           }
>           else if (format_desc->channel[chan].pure_integer) {
>              /* Nothing to do */
> @@ -344,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
>  
>     if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
>         (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
> +        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
>          format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
>         format_desc->block.width == 1 &&
>         format_desc->block.height == 1 &&
> @@ -394,7 +407,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
>        packed = lp_build_gather(gallivm, type.length,
>                                 format_desc->block.bits,
>                                 type.width, base_ptr, offset,
> -			       FALSE);
> +                               FALSE);
>        if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
>           lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
>        }
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> new file mode 100644
> index 0000000..2422817
> --- /dev/null
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> @@ -0,0 +1,308 @@
> +/**************************************************************************
> + *
> + * Copyright 2013 VMware, Inc.
> + * All Rights Reserved.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the
> + * "Software"), to deal in the Software without restriction, including
> + * without limitation the rights to use, copy, modify, merge, publish,
> + * distribute, sub license, and/or sell copies of the Software, and to
> + * permit persons to whom the Software is furnished to do so, subject to
> + * the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the
> + * next paragraph) shall be included in all copies or substantial portions
> + * of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
> + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
> + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
> + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
> + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
> + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
> + *
> + **************************************************************************/
> +
> +
> +/**
> + * @file
> + * Format conversion code for srgb formats.
> + *
> + * Functions for converting from srgb to linear and vice versa.
> + * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt:
> + *
> + * srgb->linear:
> + * cl = cs / 12.92,                 cs <= 0.04045
> + * cl = ((cs + 0.055)/1.055)^2.4,   cs >  0.04045
> + *
> + * linear->srgb:
> + * if (isnan(cl)) {
> + *    Map IEEE-754 Not-a-number to zero.
> + *    cs = 0.0;
> + * } else if (cl > 1.0) {
> + *    cs = 1.0;
> + * } else if (cl < 0.0) {
> + *    cs = 0.0;
> + * } else if (cl < 0.0031308) {
> + *    cs = 12.92 * cl;
> + * } else {
> + *    cs = 1.055 * pow(cl, 0.41666) - 0.055;
> + * }
> + *
> + * This does not need to be accurate, however at least for d3d10
> + *
> (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx):
> + * 1) For srgb->linear, it is required that the error on the srgb side is
> + *    not larger than 0.5f, which I interpret that if you map the value back
> + *    to srgb from linear using the ideal conversion, it would not be off by
> + *    more than 0.5f (that is, it would map to the same 8-bit integer value
> + *    as it was before conversion to linear).
> + * 2) linear->srgb is permitted 0.6f which luckily looks like quite a large
> + *    error is allowed.
> + * 3) Additionally, all srgb values converted to linear and back must result
> + *    in the same value as they were originally.
> + *
> + * @author Roland Scheidegger <sroland at vmware.com>
> + */
> +
> +
> +#include "util/u_debug.h"
> +
> +#include "lp_bld_type.h"
> +#include "lp_bld_const.h"
> +#include "lp_bld_arit.h"
> +#include "lp_bld_bitarit.h"
> +#include "lp_bld_logic.h"
> +#include "lp_bld_format.h"
> +
> +
> +
> +/**
> + * Convert srgb int values to linear float values.
> + * Several possibilities how to do this, e.g.
> + * - table
> + * - doing the pow() with int-to-float and float-to-int tricks
> + *
> (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
> + * - just using standard polynomial approximation
> + *   (3rd order polynomial is required for crappy but just sufficient
> accuracy)
> + *
> + * @param src   integer (vector) value(s) to convert
> + *              (8 bit values unpacked to 32 bit already).
> + */
> +LLVMValueRef
> +lp_build_srgb_to_linear(struct gallivm_state *gallivm,
> +                        struct lp_type src_type,
> +                        LLVMValueRef src)
> +{
> +   struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32);
> +   struct lp_build_context f32_bld;
> +   LLVMValueRef srcf, part_lin, part_pow, tmp, is_linear;
> +   LLVMValueRef lin_const, tmp_const, lin_thresh;
> +
> +   assert(src_type.width == 32);
> +   lp_build_context_init(&f32_bld, gallivm, f32_type);
> +
> +   /*
> +    * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) +
> 0.0023)
> +    * (found with octave polyfit and some magic as I couldn't get the error
> +    * function right). Using the above mentioned error function, the values
> stay
> +    * within +-0.35, except for the lowest values - hence tweaking linear
> segment
> +    * to cover the first 16 instead of the first 11 values (the error stays
> +    * just about acceptable there too).
> +    * Hence: lin = src > 15 ?
> +    *           (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023) :
> +    *           src / 12.6;
> +    * This function really only makes sense for vectors, should use LUT
> otherwise.
> +    * All in all (including float conversion) 10 instructions (with sse4.1),
> +    * 6 constants. Bad dependency chains though, but FMA should help (minus
> 3
> +    * instructions).

Please use lp_build_polynomial. It tries to avoid data dependency. Furthermore, if we start using FMA, then it's one less place to update.

> +    */
> +   /* doing the 1/255 mul as part of the approximation */
> +   srcf = lp_build_int_to_float(&f32_bld, src);
> +   lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f *
> 255.0f));
> +   part_lin = lp_build_mul(&f32_bld, srcf, lin_const);
> +
> +   tmp_const = lp_build_const_vec(gallivm, f32_type, 0.3012f / (255.0f *
> 255.0f * 255.0f));
> +   tmp = lp_build_mul(&f32_bld, srcf, tmp_const);
> +   tmp_const = lp_build_const_vec(gallivm, f32_type, 0.6935f / (255.0f *
> 255.0f));
> +   tmp = lp_build_add(&f32_bld, tmp, tmp_const);
> +   tmp = lp_build_mul(&f32_bld, srcf, tmp);
> +   tmp_const = lp_build_const_vec(gallivm, f32_type, 0.0030f / 255.0f);
> +   tmp = lp_build_add(&f32_bld, tmp, tmp_const);
> +   tmp = lp_build_mul(&f32_bld, srcf, tmp);
> +   tmp_const = lp_build_const_vec(gallivm, f32_type, 0.0023f);
> +   part_pow = lp_build_add(&f32_bld, tmp, tmp_const);
> +
> +   lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f);
> +   is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf,
> lin_thresh);
> +   return lp_build_select(&f32_bld, is_linear, part_lin, part_pow);
> +}
> +
> +
> +/**
> + * Convert linear float values to srgb int values.
> + * Several possibilities how to do this, e.g.
> + * - use table (based on exponent/highest order mantissa bits) and do
> + *   linear interpolation (https://gist.github.com/rygorous/2203834)
> + * - Chebyshev polynomial
> + * - Approximation using reciprocals
> + * - using int-to-float and float-to-int tricks for pow()
> + *
> (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
> + *
> + * @param src   float (vector) value(s) to convert.
> + */
> +#if 0

I'd prefer you used if (0) instead of #if 0, so that we can ensure all these variants keep working and building in the future.

Something like 

  lp_build_linear_to_srgb_foo() {}
  lp_build_linear_to_srgb_boo() {}

  lp_build_linear_to_srgb() {
     if(0) 
       return lp_build_linear_to_srgb_foo()
     else 
       return lp_build_linear_to_srgb_boo()
   }

And you could probably refactor so that the linear part is only implemented in one place. Note I don't think that leaving multiple implementations of this is bad, as the optimal way of doing this sort of thing tends to vary with the introduction of newer processor generations.

> +/* XXX remove this after verifying less accurate method is really good
> enough */
> +LLVMValueRef
> +lp_build_linear_to_srgb(struct gallivm_state *gallivm,
> +                        struct lp_type src_type,
> +                        LLVMValueRef src)
> +{
> +   LLVMBuilderRef builder = gallivm->builder;
> +   struct lp_build_context f32_bld;
> +   float exp_f = 2.0f/3.0f;
> +   float coeff_f = 0.62996f;
> +   LLVMValueRef pow_approx, coeff, x2, exponent, tmp, pow_1, pow_2,
> pow_final;
> +   LLVMValueRef lin_thresh, lin, lin_const, is_linear;
> +   struct lp_type int_type = lp_int_type(src_type);
> +
> +   lp_build_context_init(&f32_bld, gallivm, src_type);
> +
> +   src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);
> +
> +   /*
> +    * using int-to-float and float-to-int trick for pow().
> +    * This is much more accurate than necessary thanks to the correction,
> +    * but it most certainly makes no sense without rsqrt available.
> +    * Bonus points if you understand how this works...
> +    * All in all (including min/max clamp, conversion) 19 instructions.
> +    */
> +
> +   /*
> +    * First calculate approx x^8/12
> +    */
> +   exponent = lp_build_const_vec(gallivm, src_type, exp_f);
> +   coeff = lp_build_const_vec(gallivm, src_type,
> +                              exp2f(127 / exp_f - 127) * powf(coeff_f, 1.0

   1.0 -> 1.0f

> /exp_f));
> +
> +   /* premultiply src */
> +   tmp = lp_build_mul(&f32_bld, coeff, src);
> +   /* "log2" */
> +   tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm,
> int_type), "");
> +   tmp = lp_build_int_to_float(&f32_bld, tmp);
> +   /* multiply for pow */
> +   tmp = lp_build_mul(&f32_bld, tmp, exponent);
> +   /* "exp2" */
> +   pow_approx = lp_build_itrunc(&f32_bld, tmp);
> +   pow_approx = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm,
> src_type), "");
> +
> +   /*
> +    * Since that pow was inaccurate (like 3 bits, though each sqrt step
> would
> +    * give another bit), compensate the error (which is why we chose another
> +    * exponent in the first place).
> +    */
> +   /* x * x^(8/12) = x^(20/12) */
> +   pow_1 = lp_build_mul(&f32_bld, pow_approx, src);
> +   /* x * x * x^(-4/12) = x^(20/12) */
> +   tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
> +   x2 = lp_build_mul(&f32_bld, src, src);
> +   pow_2 = lp_build_mul(&f32_bld, x2, tmp);
> +
> +   /* average the values so the errors cancel out, compensate bias,
> +    * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0
> mul
> +    * for conversion to int in here */
> +   tmp = lp_build_add(&f32_bld, pow_1, pow_2);
> +   coeff = lp_build_const_vec(gallivm, src_type,
> +                              1.0f/(3.0f*coeff_f) * 0.999852f * powf(1.055f
> * 255.0f, 4.0f));
> +   pow_final = lp_build_mul(&f32_bld, tmp, coeff);
> +
> +   /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */
> +   pow_final = lp_build_fast_rsqrt(&f32_bld, pow_final);
> +   pow_final = lp_build_fast_rsqrt(&f32_bld, pow_final);
> +   pow_final = lp_build_add(&f32_bld, pow_final,
> +                            lp_build_const_vec(gallivm, src_type, -0.055f));
> +
> +   /* linear part is child's play */
> +   lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
> +   lin = lp_build_mul(&f32_bld, src, lin_const);
> +
> +   lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
> +   is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src,
> lin_thresh);
> +   tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);
> +
> +   f32_bld.type.sign = 0;
> +   return lp_build_iround(&f32_bld, tmp);
> +}
> +
> +#else
> +LLVMValueRef
> +lp_build_linear_to_srgb(struct gallivm_state *gallivm,
> +                        struct lp_type src_type,
> +                        LLVMValueRef src)
> +{
> +   struct lp_build_context f32_bld;
> +   LLVMValueRef pow_final, tmp, tmp1, tmp2, x05, x0375, a_const, b_const,
> c_const;
> +   LLVMValueRef lin_thresh, lin, lin_const, is_linear;
> +
> +   lp_build_context_init(&f32_bld, gallivm, src_type);
> +
> +   /*
> +    * using "rational polynomial" approximation here.
> +    * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
> +    * factoring in the 255.0 mul and the scaling mul.
> +    * (a is closer to actual value so has higher weight than b.)
> +    * Note: the constants are magic values. They were found empirically,
> +    * possibly could be improved but good enough (be VERY careful with
> +    * error metric if you'd want to tweak them, they also MUST fit with
> +    * the crappy polynomial above for srgb->linear since it is required
> +    * that each srgb value maps back to the same value).
> +    * This function has an error of max +-0.17 (and we'd only require
> +-0.6),
> +    * for the approximated srgb->linear values the error is naturally larger
> +    * (+-0.42) but still accurate enough (required +-0.5 essentially).
> +    * All in all (including min/max clamp, conversion) 15 instructions.
> +    * FMA would help (minus 2 instructions).
> +    */
> +
> +   if (lp_build_fast_rsqrt_available(src_type)) {
> +      tmp = lp_build_fast_rsqrt(&f32_bld, src);
> +      x05 = lp_build_mul(&f32_bld, src, tmp);
> +   }
> +   else {
> +      /*
> +       * I don't really expect this to be practical without rsqrt
> +       * but there's no reason for triple punishment so at least
> +       * save the otherwise resulting division and unnecessary mul...
> +       */
> +      x05 = lp_build_sqrt(&f32_bld, src);
> +   }
> +
> +   tmp = lp_build_mul(&f32_bld, x05, src);
> +   if (lp_build_fast_rsqrt_available(src_type)) {
> +      x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld,
> tmp));
> +   }
> +   else {
> +      x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
> +   }
> +
> +   a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 *
> 255.0f);
> +   b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 *
> 255.0f);
> +   c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);
> +
> +   tmp1 = lp_build_mul(&f32_bld, a_const, x0375);
> +   tmp2 = lp_build_mul(&f32_bld, b_const, x05);
> +   tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
> +   pow_final = lp_build_add(&f32_bld, tmp1, tmp2);

Again, please use lp_build_polynomial for polynomial construction.

> +
> +   /* linear part */
> +   lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
> +   lin = lp_build_mul(&f32_bld, src, lin_const);
> +
> +   lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
> +   is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src,
> lin_thresh);
> +   tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);
> +
> +   f32_bld.type.sign = 0;
> +   return lp_build_iround(&f32_bld, tmp);
> +}
> +#endif

Otherwise looks good. I'm trusting you on the maths side though!

Jose