[Mesa-dev] [PATCH 2/2] gallivm: handle srgb-to-linear and linear-to-srgb conversions
Jose Fonseca
jfonseca at vmware.com
Thu Jul 11 09:54:50 PDT 2013
----- Original Message -----
> From: Roland Scheidegger <sroland at vmware.com>
>
> srgb-to-linear is using 3rd degree polynomial for now which should be _just_
> good enough. Reverse is using some rational polynomials and is quite
> accurate,
> though not hooked into llvmpipe's blend code yet and hence unused (untested).
> Using a table might also be an option (for srgb-to-linear especially).
> This does not enable any new features yet because EXT_texture_srgb was
> already
> supported via util_format fallbacks, but performance was lacking probably due
> to the external function call (the table used by the util_format_srgb code
> may
> not be all that much slower on its own).
> Some performance figures (taken from modified gloss, replaced both base and
> sphere texture to use GL_SRGB instead of GL_RGB, measured on 1Ghz Sandy
> Bridge,
> the numbers aren't terribly accurate):
>
> normal gloss, aos, 8-wide: 47 fps
> normal gloss, aos, 4-wide: 48 fps
>
> normal gloss, forced to soa, 8-wide: 48 fps
> normal gloss, forced to soa, 4-wide: 47 fps
>
> patched gloss, old code, soa, 8-wide: 21 fps
> patched gloss, old code, soa, 4-wide: 24 fps
>
> patched gloss, new code, soa, 8-wide: 41 fps
> patched gloss, new code, soa, 4-wide: 38 fps
>
> So there's a performance hit but it seems acceptable, certainly better
> than using the fallback.
> Note the new code only works for 4x8bit srgb formats, others (L8/L8A8) will
> continue to use the old util_format fallback, because I can't be bothered
> to write code for formats no one uses anyway (as decoding is done as part of
> lp_build_unpack_rgba_soa which can only handle block type width of 32).
> Compressed srgb formats should get their own path though eventually (it is
> going to be expensive in any case, first decompress, then convert).
> No piglit regressions.
> ---
> src/gallium/auxiliary/Makefile.sources | 1 +
> src/gallium/auxiliary/gallivm/lp_bld_format.h | 11 +
> src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 25 +-
> src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c | 308
> ++++++++++++++++++++
> 4 files changed, 339 insertions(+), 6 deletions(-)
> create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
>
> diff --git a/src/gallium/auxiliary/Makefile.sources
> b/src/gallium/auxiliary/Makefile.sources
> index 4751762..8cffeb0 100644
> --- a/src/gallium/auxiliary/Makefile.sources
> +++ b/src/gallium/auxiliary/Makefile.sources
> @@ -172,6 +172,7 @@ GALLIVM_SOURCES := \
> gallivm/lp_bld_format_aos.c \
> gallivm/lp_bld_format_aos_array.c \
> gallivm/lp_bld_format_float.c \
> + gallivm/lp_bld_format_srgb.c \
> gallivm/lp_bld_format_soa.c \
> gallivm/lp_bld_format_yuv.c \
> gallivm/lp_bld_gather.c \
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h
> b/src/gallium/auxiliary/gallivm/lp_bld_format.h
> index 12a0318..744d002 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
> @@ -158,4 +158,15 @@ lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
> LLVMValueRef src,
> LLVMValueRef *dst);
>
> +LLVMValueRef
> +lp_build_linear_to_srgb(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + LLVMValueRef src);
> +
> +LLVMValueRef
> +lp_build_srgb_to_linear(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + LLVMValueRef src);
> +
> +
> #endif /* !LP_BLD_FORMAT_H */
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> index 4c6bd81..114ce03 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
> @@ -163,11 +163,23 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
> */
>
> if (type.floating) {
> - if(format_desc->channel[chan].normalized)
> - input = lp_build_unsigned_norm_to_float(gallivm, width, type,
> input);
> - else
> - input = LLVMBuildSIToFP(builder, input,
> - lp_build_vec_type(gallivm, type),
> "");
> + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
> + assert(width == 8);
> + if (format_desc->swizzle[3] == chan) {
> + input = lp_build_unsigned_norm_to_float(gallivm, width,
> type, input);
> + }
> + else {
> + struct lp_type conv_type = lp_uint_type(type);
> + input = lp_build_srgb_to_linear(gallivm, conv_type,
> input);
> + }
> + }
> + else {
> + if(format_desc->channel[chan].normalized)
> + input = lp_build_unsigned_norm_to_float(gallivm, width,
> type, input);
> + else
> + input = LLVMBuildSIToFP(builder, input,
> + lp_build_vec_type(gallivm, type),
> "");
> + }
> }
> else if (format_desc->channel[chan].pure_integer) {
> /* Nothing to do */
> @@ -344,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
>
> if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
> (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
> + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
> format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
> format_desc->block.width == 1 &&
> format_desc->block.height == 1 &&
> @@ -394,7 +407,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
> packed = lp_build_gather(gallivm, type.length,
> format_desc->block.bits,
> type.width, base_ptr, offset,
> - FALSE);
> + FALSE);
> if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
> lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
> }
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> new file mode 100644
> index 0000000..2422817
> --- /dev/null
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
> @@ -0,0 +1,308 @@
> +/**************************************************************************
> + *
> + * Copyright 2013 VMware, Inc.
> + * All Rights Reserved.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the
> + * "Software"), to deal in the Software without restriction, including
> + * without limitation the rights to use, copy, modify, merge, publish,
> + * distribute, sub license, and/or sell copies of the Software, and to
> + * permit persons to whom the Software is furnished to do so, subject to
> + * the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the
> + * next paragraph) shall be included in all copies or substantial portions
> + * of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
> + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
> + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
> + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
> + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
> + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
> + *
> + **************************************************************************/
> +
> +
> +/**
> + * @file
> + * Format conversion code for srgb formats.
> + *
> + * Functions for converting from srgb to linear and vice versa.
> + * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt:
> + *
> + * srgb->linear:
> + * cl = cs / 12.92, cs <= 0.04045
> + * cl = ((cs + 0.055)/1.055)^2.4, cs > 0.04045
> + *
> + * linear->srgb:
> + * if (isnan(cl)) {
> + * Map IEEE-754 Not-a-number to zero.
> + * cs = 0.0;
> + * } else if (cl > 1.0) {
> + * cs = 1.0;
> + * } else if (cl < 0.0) {
> + * cs = 0.0;
> + * } else if (cl < 0.0031308) {
> + * cs = 12.92 * cl;
> + * } else {
> + * cs = 1.055 * pow(cl, 0.41666) - 0.055;
> + * }
> + *
> + * This does not need to be accurate, however at least for d3d10
> + *
> (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx):
> + * 1) For srgb->linear, it is required that the error on the srgb side is
> + * not larger than 0.5f, which I interpret that if you map the value back
> + * to srgb from linear using the ideal conversion, it would not be off by
> + * more than 0.5f (that is, it would map to the same 8-bit integer value
> + * as it was before conversion to linear).
> + * 2) linear->srgb is permitted 0.6f which luckily looks like quite a large
> + * error is allowed.
> + * 3) Additionally, all srgb values converted to linear and back must result
> + * in the same value as they were originally.
> + *
> + * @author Roland Scheidegger <sroland at vmware.com>
> + */
> +
> +
> +#include "util/u_debug.h"
> +
> +#include "lp_bld_type.h"
> +#include "lp_bld_const.h"
> +#include "lp_bld_arit.h"
> +#include "lp_bld_bitarit.h"
> +#include "lp_bld_logic.h"
> +#include "lp_bld_format.h"
> +
> +
> +
> +/**
> + * Convert srgb int values to linear float values.
> + * Several possibilities how to do this, e.g.
> + * - table
> + * - doing the pow() with int-to-float and float-to-int tricks
> + *
> (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
> + * - just using standard polynomial approximation
> + * (3rd order polynomial is required for crappy but just sufficient
> accuracy)
> + *
> + * @param src integer (vector) value(s) to convert
> + * (8 bit values unpacked to 32 bit already).
> + */
> +LLVMValueRef
> +lp_build_srgb_to_linear(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + LLVMValueRef src)
> +{
> + struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32);
> + struct lp_build_context f32_bld;
> + LLVMValueRef srcf, part_lin, part_pow, tmp, is_linear;
> + LLVMValueRef lin_const, tmp_const, lin_thresh;
> +
> + assert(src_type.width == 32);
> + lp_build_context_init(&f32_bld, gallivm, f32_type);
> +
> + /*
> + * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) +
> 0.0023)
> + * (found with octave polyfit and some magic as I couldn't get the error
> + * function right). Using the above mentioned error function, the values
> stay
> + * within +-0.35, except for the lowest values - hence tweaking linear
> segment
> + * to cover the first 16 instead of the first 11 values (the error stays
> + * just about acceptable there too).
> + * Hence: lin = src > 15 ?
> + * (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023) :
> + * src / 12.6;
> + * This function really only makes sense for vectors, should use LUT
> otherwise.
> + * All in all (including float conversion) 10 instructions (with sse4.1),
> + * 6 constants. Bad dependency chains though, but FMA should help (minus
> 3
> + * instructions).
Please use lp_build_polynomial. It tries to avoid data dependencies. Furthermore, if we start using FMA, then it's one less place to update.
> + */
> + /* doing the 1/255 mul as part of the approximation */
> + srcf = lp_build_int_to_float(&f32_bld, src);
> + lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f *
> 255.0f));
> + part_lin = lp_build_mul(&f32_bld, srcf, lin_const);
> +
> + tmp_const = lp_build_const_vec(gallivm, f32_type, 0.3012f / (255.0f *
> 255.0f * 255.0f));
> + tmp = lp_build_mul(&f32_bld, srcf, tmp_const);
> + tmp_const = lp_build_const_vec(gallivm, f32_type, 0.6935f / (255.0f *
> 255.0f));
> + tmp = lp_build_add(&f32_bld, tmp, tmp_const);
> + tmp = lp_build_mul(&f32_bld, srcf, tmp);
> + tmp_const = lp_build_const_vec(gallivm, f32_type, 0.0030f / 255.0f);
> + tmp = lp_build_add(&f32_bld, tmp, tmp_const);
> + tmp = lp_build_mul(&f32_bld, srcf, tmp);
> + tmp_const = lp_build_const_vec(gallivm, f32_type, 0.0023f);
> + part_pow = lp_build_add(&f32_bld, tmp, tmp_const);
> +
> + lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f);
> + is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf,
> lin_thresh);
> + return lp_build_select(&f32_bld, is_linear, part_lin, part_pow);
> +}
> +
> +
> +/**
> + * Convert linear float values to srgb int values.
> + * Several possibilities how to do this, e.g.
> + * - use table (based on exponent/highest order mantissa bits) and do
> + * linear interpolation (https://gist.github.com/rygorous/2203834)
> + * - Chebyshev polynomial
> + * - Approximation using reciprocals
> + * - using int-to-float and float-to-int tricks for pow()
> + *
> (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
> + *
> + * @param src float (vector) value(s) to convert.
> + */
> +#if 0
I'd prefer you used if (0) instead of #if 0, so that we can ensure all these variants keep working and building in the future.
Something like
lp_build_linear_to_srgb_foo() {}
lp_build_linear_to_srgb_boo() {}
lp_build_linear_to_srgb() {
if(0)
return lp_build_linear_to_srgb_foo()
else
return lp_build_linear_to_srgb_boo()
}
And you could probably refactor so that the linear part is only implemented in one place. Note I don't think that leaving multiple implementations of this is bad, as the optimal way of doing this sort of thing tends to vary with the introduction of newer processor generations.
> +/* XXX remove this after verifying less accurate method is really good
> enough */
> +LLVMValueRef
> +lp_build_linear_to_srgb(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + LLVMValueRef src)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + struct lp_build_context f32_bld;
> + float exp_f = 2.0f/3.0f;
> + float coeff_f = 0.62996f;
> + LLVMValueRef pow_approx, coeff, x2, exponent, tmp, pow_1, pow_2,
> pow_final;
> + LLVMValueRef lin_thresh, lin, lin_const, is_linear;
> + struct lp_type int_type = lp_int_type(src_type);
> +
> + lp_build_context_init(&f32_bld, gallivm, src_type);
> +
> + src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);
> +
> + /*
> + * using int-to-float and float-to-int trick for pow().
> + * This is much more accurate than necessary thanks to the correction,
> + * but it most certainly makes no sense without rsqrt available.
> + * Bonus points if you understand how this works...
> + * All in all (including min/max clamp, conversion) 19 instructions.
> + */
> +
> + /*
> + * First calculate approx x^8/12
> + */
> + exponent = lp_build_const_vec(gallivm, src_type, exp_f);
> + coeff = lp_build_const_vec(gallivm, src_type,
> + exp2f(127 / exp_f - 127) * powf(coeff_f, 1.0
1.0 -> 1.0f
> /exp_f));
> +
> + /* premultiply src */
> + tmp = lp_build_mul(&f32_bld, coeff, src);
> + /* "log2" */
> + tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm,
> int_type), "");
> + tmp = lp_build_int_to_float(&f32_bld, tmp);
> + /* multiply for pow */
> + tmp = lp_build_mul(&f32_bld, tmp, exponent);
> + /* "exp2" */
> + pow_approx = lp_build_itrunc(&f32_bld, tmp);
> + pow_approx = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm,
> src_type), "");
> +
> + /*
> + * Since that pow was inaccurate (like 3 bits, though each sqrt step
> would
> + * give another bit), compensate the error (which is why we chose another
> + * exponent in the first place).
> + */
> + /* x * x^(8/12) = x^(20/12) */
> + pow_1 = lp_build_mul(&f32_bld, pow_approx, src);
> + /* x * x * x^(-4/12) = x^(20/12) */
> + tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
> + x2 = lp_build_mul(&f32_bld, src, src);
> + pow_2 = lp_build_mul(&f32_bld, x2, tmp);
> +
> + /* average the values so the errors cancel out, compensate bias,
> + * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0
> mul
> + * for conversion to int in here */
> + tmp = lp_build_add(&f32_bld, pow_1, pow_2);
> + coeff = lp_build_const_vec(gallivm, src_type,
> + 1.0f/(3.0f*coeff_f) * 0.999852f * powf(1.055f
> * 255.0f, 4.0f));
> + pow_final = lp_build_mul(&f32_bld, tmp, coeff);
> +
> + /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */
> + pow_final = lp_build_fast_rsqrt(&f32_bld, pow_final);
> + pow_final = lp_build_fast_rsqrt(&f32_bld, pow_final);
> + pow_final = lp_build_add(&f32_bld, pow_final,
> + lp_build_const_vec(gallivm, src_type, -0.055f));
> +
> + /* linear part is child's play */
> + lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
> + lin = lp_build_mul(&f32_bld, src, lin_const);
> +
> + lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
> + is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src,
> lin_thresh);
> + tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);
> +
> + f32_bld.type.sign = 0;
> + return lp_build_iround(&f32_bld, tmp);
> +}
> +
> +#else
> +LLVMValueRef
> +lp_build_linear_to_srgb(struct gallivm_state *gallivm,
> + struct lp_type src_type,
> + LLVMValueRef src)
> +{
> + struct lp_build_context f32_bld;
> + LLVMValueRef pow_final, tmp, tmp1, tmp2, x05, x0375, a_const, b_const,
> c_const;
> + LLVMValueRef lin_thresh, lin, lin_const, is_linear;
> +
> + lp_build_context_init(&f32_bld, gallivm, src_type);
> +
> + /*
> + * using "rational polynomial" approximation here.
> + * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
> + * factoring in the 255.0 mul and the scaling mul.
> + * (a is closer to actual value so has higher weight than b.)
> + * Note: the constants are magic values. They were found empirically,
> + * possibly could be improved but good enough (be VERY careful with
> + * error metric if you'd want to tweak them, they also MUST fit with
> + * the crappy polynomial above for srgb->linear since it is required
> + * that each srgb value maps back to the same value).
> + * This function has an error of max +-0.17 (and we'd only require
> +-0.6),
> + * for the approximated srgb->linear values the error is naturally larger
> + * (+-0.42) but still accurate enough (required +-0.5 essentially).
> + * All in all (including min/max clamp, conversion) 15 instructions.
> + * FMA would help (minus 2 instructions).
> + */
> +
> + if (lp_build_fast_rsqrt_available(src_type)) {
> + tmp = lp_build_fast_rsqrt(&f32_bld, src);
> + x05 = lp_build_mul(&f32_bld, src, tmp);
> + }
> + else {
> + /*
> + * I don't really expect this to be practical without rsqrt
> + * but there's no reason for triple punishment so at least
> + * save the otherwise resulting division and unnecessary mul...
> + */
> + x05 = lp_build_sqrt(&f32_bld, src);
> + }
> +
> + tmp = lp_build_mul(&f32_bld, x05, src);
> + if (lp_build_fast_rsqrt_available(src_type)) {
> + x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld,
> tmp));
> + }
> + else {
> + x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
> + }
> +
> + a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 *
> 255.0f);
> + b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 *
> 255.0f);
> + c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);
> +
> + tmp1 = lp_build_mul(&f32_bld, a_const, x0375);
> + tmp2 = lp_build_mul(&f32_bld, b_const, x05);
> + tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
> + pow_final = lp_build_add(&f32_bld, tmp1, tmp2);
Again, please use lp_build_polynomial for polynomial construction.
> +
> + /* linear part */
> + lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
> + lin = lp_build_mul(&f32_bld, src, lin_const);
> +
> + lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
> + is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src,
> lin_thresh);
> + tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);
> +
> + f32_bld.type.sign = 0;
> + return lp_build_iround(&f32_bld, tmp);
> +}
> +#endif
Otherwise looks good. I'm trusting you on the maths side though!
Jose
More information about the mesa-dev
mailing list