[Mesa-dev] [PATCH] llvmpipe: add EXT_packed_float render target format support
Roland Scheidegger
sroland at vmware.com
Thu Mar 21 17:07:29 PDT 2013
Ok, so before anyone else notices: ignore the rgb9e5 part.
The format isn't quite what I thought it was...
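For anyone wondering: unlike r11g11b10, rgb9e5 stores three 9-bit mantissas
without an implied leading 1 plus one shared 5-bit exponent (bias 15), so the
"rescaled float" trick used below doesn't apply as-is. Purely as an
illustration (the helper name is made up, this is not part of the patch), a
scalar decode per the shared-exponent definition looks roughly like this:

#include <stdint.h>
#include <math.h>

/* illustrative only: 9-bit mantissas (no implied 1), one shared 5-bit
   exponent with bias 15, value = mantissa * 2^(exp - 15 - 9) */
static void
rgb9e5_to_float_ref(uint32_t packed, float rgb[3])
{
   int expo = (packed >> 27) & 0x1f;
   float scale = ldexpf(1.0f, expo - 15 - 9);
   rgb[0] = (float)((packed >>  0) & 0x1ff) * scale;
   rgb[1] = (float)((packed >>  9) & 0x1ff) * scale;
   rgb[2] = (float)((packed >> 18) & 0x1ff) * scale;
}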
Roland
On 21.03.2013 23:28, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> Add new conversion code to handle r11g11b10 AoS to/from SoA float
> conversion, and also add code for converting rgb9e5 AoS to float SoA
> (which works pretty much the same as r11g11b10, except for the packing).
> (This code should also be used for texture sampling instead of
> relying on u_format conversion, but it isn't yet, so rgb9e5 is unused.)
> Unfortunately a crazy amount of hacks is necessary to get the conversion
> code running in llvmpipe's generate_unswizzled_blend, which isn't well
> suited for formats where the storage representation has nothing to do
> with what's needed for blending. Moreover, the conversion goes from
> packed AoS values, which is the storage format, to float SoA values,
> because that is much more natural for the conversion, and likewise from
> SoA values back to packed AoS values. The "blend" (which includes
> trivial things like the partial mask), however, works on AoS values, so
> incoming fs values go SoA->AoS, values from the destination go packed
> AoS->SoA->AoS, then the blend is done, then AoS->SoA->packed AoS. That
> probably isn't the most efficient way, though the shuffles are probably
> bearable.
>
> Passes piglit fbo-blending-formats (with the GL_EXT_packed_float parameter);
> Inf/NaN handling (which is where most of the complexity in the conversion
> actually comes from) still needs to be verified.
> ---
> src/gallium/auxiliary/gallivm/lp_bld_conv.c | 314 +++++++++++++++++++++++++++
> src/gallium/auxiliary/gallivm/lp_bld_conv.h | 14 ++
> src/gallium/drivers/llvmpipe/lp_screen.c | 6 +-
> src/gallium/drivers/llvmpipe/lp_state_fs.c | 126 +++++++++++
> 4 files changed, 458 insertions(+), 2 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
> index dc3649d..4fce1bc 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
> @@ -155,6 +155,320 @@ lp_build_bswap_vec(struct gallivm_state *gallivm,
>
>
> /**
> + * Convert float32 to a float-like value with fewer exponent and mantissa
> + * bits. The exponent is still biased, and the mantissa still has an implied 1,
> + * but there's no sign bit.
> + *
> + * @param src (vector) float value to convert
> + * @param mantissa_bits the number of mantissa bits
> + * @param exponent_bits the number of exponent bits
> + *
> + * Unlike float_to_half, an accurate method is used here.
> + * This implements round-towards-zero (trunc), hence too-large numbers get
> + * converted to the largest representable number, not infinity.
> + * Small numbers may get converted to denorms, depending on the normal
> + * float denorm handling of the cpu.
> + * Note that compared to the references below, we skip any rounding bias
> + * and do strict rounding towards zero (if I got the constants right...)
> + * - OpenGL allows rounding towards zero (though it's not preferred) and
> + * DX10 even seems to require it.
> + * Note that this will not try to pack the values in any way - they will
> + * look like "rescaled floats" (except for Inf/NaN), but are returned as
> + * (vector) int32.
> + *
> + * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
> + * ref https://gist.github.com/rygorous/2156668
> + */
> +static LLVMValueRef
> +lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm,
> + LLVMValueRef src,
> + unsigned mantissa_bits,
> + unsigned exponent_bits)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMTypeRef src_type = LLVMTypeOf(src);
> + LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
> + LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs;
> + LLVMValueRef isnan, isposinf, isnanorposinf, i32_qnanbit, nanorposinfnum;
> + unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
> + LLVMGetVectorSize(src_type) : 1;
> + struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
> + struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
> + struct lp_build_context f32_bld, i32_bld;
> + LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
> +
> + lp_build_context_init(&f32_bld, gallivm, f32_type);
> + lp_build_context_init(&i32_bld, gallivm, i32_type);
> +
> + i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
> + ((1 << exponent_bits) - 1) << 23);
> + i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
> +
> + /* "ordinary" number */
> + /* clamp to pos range (can still have sign bit if NaN but doesn't matter) */
> + clamped = lp_build_max(&f32_bld, src, zero);
> + clamped = LLVMBuildBitCast(builder, clamped, i32_bld.vec_type, "");
> + /* get rid of excess mantissa bits */
> + /* really not sure about that constant */
> + i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
> + ~((1 << (23 - mantissa_bits)) - 1));
> +
> + tmp = lp_build_and(&i32_bld, clamped, i32_roundmask);
> + tmp = LLVMBuildBitCast(builder, tmp, f32_bld.vec_type, "");
> + /* bias exponent (and denormalize if necessary) */
> + magic = lp_build_const_int_vec(gallivm, i32_type,
> + ((1 << (exponent_bits - 1)) - 1) << 23);
> + magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
> + normal = lp_build_mul(&f32_bld, tmp, magic);
> +
> + /* clamp to max value */
> + small_max = lp_build_const_int_vec(gallivm, i32_type,
> + (((1 << exponent_bits) - 2) << 23) |
> + (((1 << mantissa_bits) - 1) << (23 - mantissa_bits)));
> + small_max = LLVMBuildBitCast(builder, small_max, f32_bld.vec_type, "");
> + normal = lp_build_min(&f32_bld, normal, small_max);
> + normal = LLVMBuildBitCast(builder, normal, i32_bld.vec_type, "");
> +
> + /*
> + * handle nan/inf cases
> + * a little bit tricky since -Inf -> 0, +Inf -> +Inf, +-NaN -> +NaN
> + * Note that on a lucky day, we could simplify this a bit,
> + * by just using the max(src, zero) result - this will have -Inf
> + * clamped to 0, and MIGHT preserve the NaNs.
> + */
> + src_abs = lp_build_abs(&f32_bld, src);
> + src_abs = LLVMBuildBitCast(builder, src_abs, i32_bld.vec_type, "");
> + src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, "");
> + isnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER,
> + src_abs, i32_floatexpmask);
> + isposinf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_EQUAL,
> + src, i32_floatexpmask);
> + isnanorposinf = lp_build_or(&i32_bld, isnan, isposinf);
> + /* could also set more mantissa bits but need at least the highest mantissa bit */
> + i32_qnanbit = lp_build_const_vec(gallivm, i32_type, 1 << 22);
> + /* combine maxexp with qnanbit */
> + nanorposinfnum = lp_build_or(&i32_bld, i32_smallexpmask,
> + lp_build_and(&i32_bld, isnan, i32_qnanbit));
> +
> + return lp_build_select(&i32_bld, isnanorposinf, nanorposinfnum, normal);
> +}
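For reference, here is a scalar C sketch of what the above is meant to compute
(round towards zero, negatives and -Inf clamped to 0, +Inf kept, NaN kept as a
quiet NaN). It is only meant to illustrate the intended semantics - the helper
name is made up and, unlike the vectorized code, this simplified version flushes
too-small values to zero instead of possibly producing denorms:

#include <stdint.h>
#include <string.h>
#include <math.h>

/* illustrative scalar version: float32 -> unsigned small float with
   'm' mantissa and 'e' exponent bits, round-towards-zero */
static uint32_t
float_to_smallfloat_rtz_ref(float f, unsigned m, unsigned e)
{
   uint32_t bits, mant;
   int expo, new_expo, small_bias = (1 << (e - 1)) - 1;

   if (isnan(f))
      return (((1u << e) - 1) << m) | (1u << (m - 1));   /* quiet NaN */
   if (f <= 0.0f)
      return 0;                                          /* negatives and -Inf */

   memcpy(&bits, &f, sizeof bits);
   expo = (bits >> 23) & 0xff;
   mant = bits & 0x7fffff;

   if (expo == 0xff)
      return ((1u << e) - 1) << m;                       /* +Inf */

   new_expo = expo - 127 + small_bias;                   /* rebias exponent */
   if (new_expo >= (1 << e) - 1)                         /* too large: max finite */
      return (((1u << e) - 2) << m) | ((1u << m) - 1);
   if (new_expo <= 0)                                     /* too small (real code may give denorms) */
      return 0;

   return ((uint32_t)new_expo << m) | (mant >> (23 - m)); /* truncate mantissa */
}

The gallivm code above does the same thing branchlessly, on whole vectors, with
max/min/and/select, and leaves the result in "rescaled float" position (exponent
at bit 23) rather than packing it down.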
> +
> +
> +/**
> + * Convert a float-like value with fewer exponent and mantissa
> + * bits than a normal float32 to a float32. The mantissa of
> + * the source value is assumed to have an implied 1, and the exponent
> + * is biased. There are no negative values.
> + * The source value already is in "rescaled float" format, with the
> + * exponent starting at bit 23 (and the relevant mantissa bits immediately
> + * below that).
> + *
> + * @param src (vector) value to convert
> + * @param mantissa_bits the number of mantissa bits
> + * @param exponent_bits the number of exponent bits
> + *
> + * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
> + * ref https://gist.github.com/rygorous/2156668
> + */
> +static LLVMValueRef
> +lp_build_smallfloat_nosign_to_float(struct gallivm_state *gallivm,
> + LLVMValueRef src,
> + unsigned mantissa_bits,
> + unsigned exponent_bits)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMTypeRef src_type = LLVMTypeOf(src);
> + LLVMValueRef smallexpmask, i32_floatexpmask, magic;
> + LLVMValueRef wasinfnan, tmp, res;
> + unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
> + LLVMGetVectorSize(src_type) : 1;
> + struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
> + struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
> + struct lp_build_context f32_bld, i32_bld;
> +
> + lp_build_context_init(&f32_bld, gallivm, f32_type);
> + lp_build_context_init(&i32_bld, gallivm, i32_type);
> +
> + smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
> + ((1 << exponent_bits) - 1) << 23);
> + i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
> + /*
> + * magic number has exponent new exp bias + (new exp bias - old exp bias),
> + * mantissa is 0.
> + */
> + magic = lp_build_const_int_vec(gallivm, i32_type,
> + (255 - (1 << (exponent_bits - 1))) << 23);
> + magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
> +
> + /* adjust exponent and fix denorms */
> + res = lp_build_mul(&f32_bld, src, magic);
> +
> + /*
> + * if exp was max (== NaN or Inf) set new exp to max (keep mantissa),
> + * so a simple "or" will do (because exp adjust will leave mantissa intact)
> + */
> + /* use a float compare (better for 8-wide AVX without AVX2, otherwise an int compare should be used) */
> + smallexpmask = LLVMBuildBitCast(builder, smallexpmask, f32_bld.vec_type, "");
> + wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, src, smallexpmask);
> + res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, "");
> + tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan);
> + res = lp_build_or(&i32_bld, tmp, res);
> +
> + return LLVMBuildBitCast(builder, res, f32_bld.vec_type, "");
> +}
> +
> +
> +/**
> + * Convert rgba float SoA values to packed r11g11b10 values.
> + *
> + * @param src SoA float (vector) values to convert.
> + */
> +LLVMValueRef
> +lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
> + LLVMValueRef *src)
> +{
> + LLVMValueRef dst, rcomp, bcomp, gcomp, shift, mask;
> + struct lp_build_context i32_bld;
> + LLVMTypeRef src_type = LLVMTypeOf(*src);
> + unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
> + LLVMGetVectorSize(src_type) : 1;
> + struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
> +
> + lp_build_context_init(&i32_bld, gallivm, i32_type);
> +
> + /* "rescale" - this does the actual conversion except the packing */
> + rcomp = lp_build_float_to_smallfloat_nosign(gallivm, src[0], 6, 5);
> + gcomp = lp_build_float_to_smallfloat_nosign(gallivm, src[1], 6, 5);
> + bcomp = lp_build_float_to_smallfloat_nosign(gallivm, src[2], 5, 5);
> +
> + /* pack rescaled SoA floats to r11g11b10 AoS values */
> + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 6);
> + rcomp = lp_build_shr(&i32_bld, rcomp, shift);
> +
> + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 17);
> + mask = lp_build_const_int_vec(gallivm, i32_type, 0x7ff << 11);
> + gcomp = lp_build_shr(&i32_bld, gcomp, shift);
> + gcomp = lp_build_and(&i32_bld, gcomp, mask);
> +
> + shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23);
> + mask = lp_build_const_int_vec(gallivm, i32_type, 0x3ff << 22);
> + bcomp = lp_build_shl(&i32_bld, bcomp, shift);
> + bcomp = lp_build_and(&i32_bld, bcomp, mask);
> +
> + dst = lp_build_or(&i32_bld, rcomp, gcomp);
> + return lp_build_or(&i32_bld, dst, bcomp);
> +}
> +
> +
> +/**
> + * Convert packed float format (r11g11b10) value(s) to rgba float SoA values.
> + *
> + * @param src packed AoS r11g11b10 values (as (vector) int32)
> + * @param dst pointer to the SoA result values
> + */
> +void
> +lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
> + LLVMValueRef src,
> + LLVMValueRef *dst)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMTypeRef src_type = LLVMTypeOf(src);
> + LLVMValueRef rcomp, bcomp, gcomp, shift, mask;
> + unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
> + LLVMGetVectorSize(src_type) : 1;
> + struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
> + struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
> + struct lp_build_context i32_bld, f32_bld;
> +
> + lp_build_context_init(&i32_bld, gallivm, i32_type);
> + lp_build_context_init(&f32_bld, gallivm, f32_type);
> +
> + /* put mantissa/exp into "rescaled float" format */
> + mask = lp_build_const_int_vec(gallivm, i32_type, 0x7ff << (23 - 6));
> + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 6);
> + rcomp = lp_build_shl(&i32_bld, src, shift);
> + rcomp = lp_build_and(&i32_bld, rcomp, mask);
> + rcomp = LLVMBuildBitCast(builder, rcomp, f32_bld.vec_type, "");
> + dst[0] = lp_build_smallfloat_nosign_to_float(gallivm, rcomp, 6, 5);
> +
> + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 17);
> + gcomp = lp_build_shl(&i32_bld, src, shift);
> + gcomp = lp_build_and(&i32_bld, gcomp, mask);
> + gcomp = LLVMBuildBitCast(builder, gcomp, f32_bld.vec_type, "");
> + dst[1] = lp_build_smallfloat_nosign_to_float(gallivm, gcomp, 6, 5);
> +
> + shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23);
> + mask = lp_build_const_int_vec(gallivm, i32_type, 0x3ff << (23 - 5));
> + /* really logical shift but gets masked out anyway */
> + bcomp = lp_build_shr(&i32_bld, src, shift);
> + bcomp = lp_build_and(&i32_bld, bcomp, mask);
> + bcomp = LLVMBuildBitCast(builder, bcomp, f32_bld.vec_type, "");
> + dst[2] = lp_build_smallfloat_nosign_to_float(gallivm, bcomp, 5, 5);
> +
> + /* Just set alpha to one */
> + dst[3] = f32_bld.one;
> +}
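Again purely for illustration (made-up helper names, not part of the patch), the
scalar equivalent of this unpack is roughly:

#include <stdint.h>
#include <math.h>

/* illustrative scalar unpack of one unsigned small float channel:
   'm' mantissa bits, 'e' exponent bits, no sign bit */
static float
smallfloat_to_float_ref(uint32_t v, unsigned m, unsigned e)
{
   uint32_t expo = (v >> m) & ((1u << e) - 1);
   uint32_t mant = v & ((1u << m) - 1);
   int bias = (1 << (e - 1)) - 1;

   if (expo == (1u << e) - 1)                     /* Inf / NaN */
      return mant ? NAN : INFINITY;
   if (expo == 0)                                 /* zero / denorm */
      return ldexpf((float)mant, 1 - bias - (int)m);
   return ldexpf(1.0f + (float)mant / (float)(1u << m), (int)expo - bias);
}

static void
r11g11b10_to_float_ref(uint32_t packed, float rgba[4])
{
   rgba[0] = smallfloat_to_float_ref(packed & 0x7ff, 6, 5);
   rgba[1] = smallfloat_to_float_ref((packed >> 11) & 0x7ff, 6, 5);
   rgba[2] = smallfloat_to_float_ref(packed >> 22, 5, 5);
   rgba[3] = 1.0f;
}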
> +
> +
> +/**
> + * Convert shared exponent format (rgb9e5) value(s) to rgba float SoA values.
> + *
> + * @param src packed AoS rgb9e5 values (as (vector) int32)
> + * @param dst pointer to the SoA result values
> + */
> +void
> +lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
> + LLVMValueRef src,
> + LLVMValueRef *dst)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMTypeRef src_type = LLVMTypeOf(src);
> + LLVMValueRef rcomp, bcomp, gcomp, exp, shift, mask;
> + unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
> + LLVMGetVectorSize(src_type) : 1;
> + struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
> + struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
> + struct lp_build_context i32_bld, f32_bld;
> +
> + lp_build_context_init(&i32_bld, gallivm, i32_type);
> + lp_build_context_init(&f32_bld, gallivm, f32_type);
> +
> + /* extract exponent */
> + mask = lp_build_const_int_vec(gallivm, i32_type, 0x1f << 23);
> + shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23);
> + exp = lp_build_shl(&i32_bld, src, shift);
> + exp = lp_build_and(&i32_bld, exp, mask);
> +
> + /* put mantissa/exp into "rescaled float" format */
> + mask = lp_build_const_int_vec(gallivm, i32_type, 0x1ff << (23 - 9));
> + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 9);
> + rcomp = lp_build_shl(&i32_bld, src, shift);
> + rcomp = lp_build_and(&i32_bld, rcomp, mask);
> + rcomp = lp_build_or(&i32_bld, rcomp, exp);
> + rcomp = LLVMBuildBitCast(builder, rcomp, f32_bld.vec_type, "");
> + dst[0] = lp_build_smallfloat_nosign_to_float(gallivm, rcomp, 9, 5);
> +
> + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 18);
> + gcomp = lp_build_shl(&i32_bld, src, shift);
> + gcomp = lp_build_and(&i32_bld, gcomp, mask);
> + gcomp = lp_build_or(&i32_bld, gcomp, exp);
> + gcomp = LLVMBuildBitCast(builder, gcomp, f32_bld.vec_type, "");
> + dst[1] = lp_build_smallfloat_nosign_to_float(gallivm, gcomp, 9, 5);
> +
> + shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23);
> + /* really logical shift but gets masked out anyway */
> + bcomp = lp_build_shr(&i32_bld, src, shift);
> + bcomp = lp_build_and(&i32_bld, bcomp, mask);
> + bcomp = lp_build_or(&i32_bld, bcomp, exp);
> + bcomp = LLVMBuildBitCast(builder, bcomp, f32_bld.vec_type, "");
> + dst[2] = lp_build_smallfloat_nosign_to_float(gallivm, bcomp, 9, 5);
> +
> + /* Just set alpha to one */
> + dst[3] = f32_bld.one;
> +}
> +
> +
> +/**
> * Converts int16 half-float to float32
> * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i think?)
> * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
> index d7dfed8..d8bc294 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
> @@ -62,6 +62,20 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
> LLVMValueRef src);
>
> LLVMValueRef
> +lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
> + LLVMValueRef *src);
> +
> +void
> +lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
> + LLVMValueRef src,
> + LLVMValueRef *dst);
> +
> +void
> +lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
> + LLVMValueRef src,
> + LLVMValueRef *dst);
> +
> +LLVMValueRef
> lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
> struct lp_type src_type,
> unsigned dst_width,
> diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
> index 93e125d..ece7679 100644
> --- a/src/gallium/drivers/llvmpipe/lp_screen.c
> +++ b/src/gallium/drivers/llvmpipe/lp_screen.c
> @@ -321,7 +321,8 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
> if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB)
> return FALSE;
>
> - if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
> + if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN &&
> + format != PIPE_FORMAT_R11G11B10_FLOAT)
> return FALSE;
> assert(format_desc->block.width == 1);
> assert(format_desc->block.height == 1);
> @@ -329,7 +330,8 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
> if (format_desc->is_mixed)
> return FALSE;
>
> - if (!format_desc->is_array && !format_desc->is_bitmask)
> + if (!format_desc->is_array && !format_desc->is_bitmask &&
> + format != PIPE_FORMAT_R11G11B10_FLOAT)
> return FALSE;
>
> /*
> diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
> index d8369b4..953a5c1 100644
> --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
> +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
> @@ -972,6 +972,17 @@ lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
> unsigned i;
> unsigned chan;
>
> + if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
> + /* just make this a 32bit uint */
> + type->floating = false;
> + type->fixed = false;
> + type->sign = false;
> + type->norm = false;
> + type->width = 32;
> + type->length = 1;
> + return;
> + }
> +
> for (i = 0; i < 4; i++)
> if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
> break;
> @@ -1009,6 +1020,17 @@ lp_blend_type_from_format_desc(const struct util_format_description *format_desc
> unsigned i;
> unsigned chan;
>
> + if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
> + /* always use ordinary floats for blending */
> + type->floating = true;
> + type->fixed = false;
> + type->sign = true;
> + type->norm = false;
> + type->width = 32;
> + type->length = 4;
> + return;
> + }
> +
> for (i = 0; i < 4; i++)
> if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
> break;
> @@ -1122,6 +1144,48 @@ convert_to_blend_type(struct gallivm_state *gallivm,
> unsigned pixels = 16 / num_srcs;
> bool is_arith;
>
> + /*
> + * full custom path for packed floats - none of the later functions would do
> + * anything useful, and given the lp_type representation they can't be fixed.
> + */
> + if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
> + LLVMValueRef tmpsrc[4];
> + /*
> + * This is pretty suboptimal for this case; blending in SoA would be much
> + * better, since the conversion gets us SoA values, so we need to convert back.
> + */
> + assert(src_type.width == 32);
> + assert(dst_type.floating);
> + assert(dst_type.width == 32);
> + assert(dst_type.length % 4 == 0);
> + for (i = 0; i < 4; i++) {
> + tmpsrc[i] = src[i];
> + }
> + for (i = 0; i < num_srcs / 4; i++) {
> + LLVMValueRef tmpsoa[4];
> + LLVMValueRef tmps = tmpsrc[i];
> + if (num_srcs == 8) {
> + LLVMValueRef shuffles[8];
> + unsigned j;
> + /* fetch was 4 values but need 8-wide output values */
> + tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
> + /*
> + * for 8-wide, the aos transpose would give us the wrong order, not
> + * matching the incoming converted fs values and mask. ARGH.
> + */
> + for (j = 0; j < 4; j++) {
> + shuffles[j] = lp_build_const_int32(gallivm, j * 2);
> + shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
> + }
> + tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
> + LLVMConstVector(shuffles, 8), "");
> + }
> + lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
> + lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
> + }
> + return;
> + }
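(For what it's worth, for the 8-wide case the shuffle above works out to the
constant index vector <0,2,4,6,1,3,5,7>.)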
> +
> lp_mem_type_from_format_desc(src_fmt, &mem_type);
> lp_blend_type_from_format_desc(src_fmt, &blend_type);
>
> @@ -1225,6 +1289,47 @@ convert_from_blend_type(struct gallivm_state *gallivm,
> unsigned pixels = 16 / num_srcs;
> bool is_arith;
>
> + /*
> + * full custom path for packed floats - none of the later functions would do
> + * anything useful, and given the lp_type representation they can't be fixed.
> + */
> + if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
> + /*
> + * This is pretty suboptimal for this case; blending in SoA would be much
> + * better - we need to transpose the AoS values back to SoA values for
> + * conversion/packing.
> + */
> + assert(src_type.floating);
> + assert(src_type.width == 32);
> + assert(src_type.length % 4 == 0);
> + assert(dst_type.width == 32);
> + for (i = 0; i < num_srcs / 4; i++) {
> + LLVMValueRef tmpsoa[4], tmpdst;
> + lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
> + tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
> + if (num_srcs == 8) {
> + LLVMValueRef tmpaos, shuffles[8];
> + unsigned j;
> + /*
> + * for 8-wide, the aos transpose has given us the wrong order, not matching
> + * the output order. HMPF. Also need to split the output values manually.
> + */
> + for (j = 0; j < 4; j++) {
> + shuffles[j * 2] = lp_build_const_int32(gallivm, j);
> + shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
> + }
> + tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
> + LLVMConstVector(shuffles, 8), "");
> + src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
> + src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
> + }
> + else {
> + src[i] = tmpdst;
> + }
> + }
> + return;
> + }
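(The 8-wide shuffle here works out to <0,4,1,5,2,6,3,7>, i.e. the inverse of the
one used in convert_to_blend_type.)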
> +
> lp_mem_type_from_format_desc(src_fmt, &mem_type);
> lp_blend_type_from_format_desc(src_fmt, &blend_type);
>
> @@ -1532,6 +1637,17 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
> }
> }
>
> + if (out_format == PIPE_FORMAT_R11G11B10_FLOAT) {
> + /* the code above can't work for layout_other */
> + dst_channels = 4; /* HACK: this is really a fake 4, but we need it due to the transpose stuff later */
> + has_alpha = true;
> + swizzle[0] = 0;
> + swizzle[1] = 1;
> + swizzle[2] = 2;
> + swizzle[3] = 3;
> + pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
> + }
> +
> /* If 3 channels then pad to include alpha for 4 element transpose */
> if (dst_channels == 3 && !has_alpha) {
> for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
> @@ -1756,6 +1872,16 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>
> dst_type.length *= 16 / dst_count;
>
> + if (out_format == PIPE_FORMAT_R11G11B10_FLOAT) {
> + /*
> + * we need multiple values at once for the conversion, so we might as well
> + * load them vectorized here instead of concatenating later.
> + * (We still need concatenation later for 8-wide vectors.)
> + */
> + dst_count = block_height;
> + dst_type.length = block_width;
> + }
> +
> load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
> dst, dst_type, dst_count, dst_alignment);
>
>