[Mesa-dev] [PATCH] llvmpipe: add EXT_packed_float render target format support
Roland Scheidegger
sroland at vmware.com
Thu Mar 21 17:07:29 PDT 2013
Ok, so before anyone else notices: ignore the rgb9e5 part.
The format isn't quite what I thought it was...
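For anyone wondering: unlike r11g11b10, rgb9e5 stores three 9-bit mantissas
without an implied leading 1 plus one shared 5-bit exponent (bias 15), so the
"rescaled float" trick used below doesn't apply as-is. Purely as an
illustration (the helper name is made up, this is not part of the patch), a
scalar decode per the shared-exponent definition looks roughly like this:

#include <stdint.h>
#include <math.h>

/* illustrative only: 9-bit mantissas (no implied 1), one shared 5-bit
   exponent with bias 15, value = mantissa * 2^(exp - 15 - 9) */
static void
rgb9e5_to_float_ref(uint32_t packed, float rgb[3])
{
   int expo = (packed >> 27) & 0x1f;
   float scale = ldexpf(1.0f, expo - 15 - 9);
   rgb[0] = (float)((packed >>  0) & 0x1ff) * scale;
   rgb[1] = (float)((packed >>  9) & 0x1ff) * scale;
   rgb[2] = (float)((packed >> 18) & 0x1ff) * scale;
}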
Roland
On 21.03.2013 23:28, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> Add new conversion code to handle r11g11b10 AoS to/from SoA float
> conversion, and also add code for converting rgb9e5 AoS to float SoA
> (which works pretty much the same as r11g11b10, except for the packing).
> (This code should also be used for texture sampling instead of
> relying on u_format conversion, but it isn't yet, so rgb9e5 is unused.)
> Unfortunately a crazy amount of hacks is necessary to get the conversion
> code running in llvmpipe's generate_unswizzled_blend, which isn't well
> suited for formats where the storage representation has nothing to do
> with what's needed for blending. Moreover, the conversion goes from
> packed AoS values, which is the storage format, to float SoA values,
> because that is much more natural for the conversion, and likewise from
> SoA values back to packed AoS values. The "blend" (which includes
> trivial things like the partial mask), however, works on AoS values, so
> incoming fs values go SoA->AoS, values from the destination go packed
> AoS->SoA->AoS, then the blend is done, then AoS->SoA->packed AoS. That
> probably isn't the most efficient way, though the shuffles are probably
> bearable.
>
> Passes piglit fbo-blending-formats (with the GL_EXT_packed_float parameter);
> Inf/NaN handling (which is where most of the complexity in the conversion
> actually comes from) still needs to be verified.
> ---
> src/gallium/auxiliary/gallivm/lp_bld_conv.c | 314 +++++++++++++++++++++++++++
> src/gallium/auxiliary/gallivm/lp_bld_conv.h | 14 ++
> src/gallium/drivers/llvmpipe/lp_screen.c | 6 +-
> src/gallium/drivers/llvmpipe/lp_state_fs.c | 126 +++++++++++
> 4 files changed, 458 insertions(+), 2 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
> index dc3649d..4fce1bc 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
> @@ -155,6 +155,320 @@ lp_build_bswap_vec(struct gallivm_state *gallivm,
>
>
> /**
> + * Convert float32 to a float-like value with fewer exponent and mantissa
> + * bits. The exponent is still biased, and the mantissa still has an implied 1,
> + * but there's no sign bit.
> + *
> + * @param src (vector) float value to convert
> + * @param mantissa_bits the number of mantissa bits
> + * @param exponent_bits the number of exponent bits
> + *
> + * Unlike float_to_half, an accurate method is used here.
> + * This implements round-towards-zero (trunc), hence too-large numbers get
> + * converted to the largest representable number, not infinity.
> + * Small numbers may get converted to denorms, depending on the normal
> + * float denorm handling of the cpu.
> + * Note that compared to the references below, we skip any rounding bias
> + * and do strict rounding towards zero (if I got the constants right...)
> + * - OpenGL allows rounding towards zero (though it's not preferred) and
> + * DX10 even seems to require it.
> + * Note that this will not try to pack the values in any way - they will
> + * look like "rescaled floats" (except for Inf/NaN), but are returned as
> + * (vector) int32.
> + *
> + * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
> + * ref https://gist.github.com/rygorous/2156668
> + */
> +static LLVMValueRef
> +lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm,
> + LLVMValueRef src,
> + unsigned mantissa_bits,
> + unsigned exponent_bits)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMTypeRef src_type = LLVMTypeOf(src);
> + LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
> + LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs;
> + LLVMValueRef isnan, isposinf, isnanorposinf, i32_qnanbit, nanorposinfnum;
> + unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
> + LLVMGetVectorSize(src_type) : 1;
> + struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
> + struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
> + struct lp_build_context f32_bld, i32_bld;
> + LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
> +
> + lp_build_context_init(&f32_bld, gallivm, f32_type);
> + lp_build_context_init(&i32_bld, gallivm, i32_type);
> +
> + i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
> + ((1 << exponent_bits) - 1) << 23);
> + i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
> +
> + /* "ordinary" number */
> + /* clamp to pos range (can still have sign bit if NaN but doesn't matter) */
> + clamped = lp_build_max(&f32_bld, src, zero);
> + clamped = LLVMBuildBitCast(builder, clamped, i32_bld.vec_type, "");
> + /* get rid of excess mantissa bits */
> + /* really not sure about that constant */
> + i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
> + ~((1 << (23 - mantissa_bits)) - 1));
> +
> + tmp = lp_build_and(&i32_bld, clamped, i32_roundmask);
> + tmp = LLVMBuildBitCast(builder, tmp, f32_bld.vec_type, "");
> + /* bias exponent (and denormalize if necessary) */
> + magic = lp_build_const_int_vec(gallivm, i32_type,
> + ((1 << (exponent_bits - 1)) - 1) << 23);
> + magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
> + normal = lp_build_mul(&f32_bld, tmp, magic);
> +
> + /* clamp to max value */
> + small_max = lp_build_const_int_vec(gallivm, i32_type,
> + (((1 << exponent_bits) - 2) << 23) |
> + (((1 << mantissa_bits) - 1) << (23 - mantissa_bits)));
> + small_max = LLVMBuildBitCast(builder, small_max, f32_bld.vec_type, "");
> + normal = lp_build_min(&f32_bld, normal, small_max);
> + normal = LLVMBuildBitCast(builder, normal, i32_bld.vec_type, "");
> +
> + /*
> + * handle nan/inf cases
> + * a little bit tricky since -Inf -> 0, +Inf -> +Inf, +-NaN -> +NaN
> + * Note that on a lucky day, we could simplify this a bit,
> + * by just using the max(src, zero) result - this will have -Inf
> + * clamped to 0, and MIGHT preserve the NaNs.
> + */
> + src_abs = lp_build_abs(&f32_bld, src);
> + src_abs = LLVMBuildBitCast(builder, src_abs, i32_bld.vec_type, "");
> + src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, "");
> + isnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER,
> + src_abs, i32_floatexpmask);
> + isposinf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_EQUAL,
> + src, i32_floatexpmask);
> + isnanorposinf = lp_build_or(&i32_bld, isnan, isposinf);
> + /* could also set more mantissa bits but need at least the highest mantissa bit */
> + i32_qnanbit = lp_build_const_vec(gallivm, i32_type, 1 << 22);
> + /* combine maxexp with qnanbit */
> + nanorposinfnum = lp_build_or(&i32_bld, i32_smallexpmask,
> + lp_build_and(&i32_bld, isnan, i32_qnanbit));
> +
> + return lp_build_select(&i32_bld, isnanorposinf, nanorposinfnum, normal);
> +}
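For reference, here is a scalar C sketch of what the above is meant to compute
(round towards zero, negatives and -Inf clamped to 0, +Inf kept, NaN kept as a
quiet NaN). It is only meant to illustrate the intended semantics - the helper
name is made up and, unlike the vectorized code, this simplified version flushes
too-small values to zero instead of possibly producing denorms:

#include <stdint.h>
#include <string.h>
#include <math.h>

/* illustrative scalar version: float32 -> unsigned small float with
   'm' mantissa and 'e' exponent bits, round-towards-zero */
static uint32_t
float_to_smallfloat_rtz_ref(float f, unsigned m, unsigned e)
{
   uint32_t bits, mant;
   int expo, new_expo, small_bias = (1 << (e - 1)) - 1;

   if (isnan(f))
      return (((1u << e) - 1) << m) | (1u << (m - 1));   /* quiet NaN */
   if (f <= 0.0f)
      return 0;                                          /* negatives and -Inf */

   memcpy(&bits, &f, sizeof bits);
   expo = (bits >> 23) & 0xff;
   mant = bits & 0x7fffff;

   if (expo == 0xff)
      return ((1u << e) - 1) << m;                       /* +Inf */

   new_expo = expo - 127 + small_bias;                   /* rebias exponent */
   if (new_expo >= (1 << e) - 1)                         /* too large: max finite */
      return (((1u << e) - 2) << m) | ((1u << m) - 1);
   if (new_expo <= 0)                                     /* too small (real code may give denorms) */
      return 0;

   return ((uint32_t)new_expo << m) | (mant >> (23 - m)); /* truncate mantissa */
}

The gallivm code above does the same thing branchlessly, on whole vectors, with
max/min/and/select, and leaves the result in "rescaled float" position (exponent
at bit 23) rather than packing it down.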
> +
> +
> +/**
> + * Convert a float-like value with fewer exponent and mantissa
> + * bits than a normal float32 to a float32. The mantissa of
> + * the source value is assumed to have an implied 1, and the exponent
> + * is biased. There are no negative values.
> + * The source value already is in "rescaled float" format, with the
> + * exponent starting at bit 23 (and the relevant mantissa bits immediately
> + * below that).
> + *
> + * @param src (vector) value to convert
> + * @param mantissa_bits the number of mantissa bits
> + * @param exponent_bits the number of exponent bits
> + *
> + * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
> + * ref https://gist.github.com/rygorous/2156668
> + */
> +static LLVMValueRef
> +lp_build_smallfloat_nosign_to_float(struct gallivm_state *gallivm,
> + LLVMValueRef src,
> + unsigned mantissa_bits,
> + unsigned exponent_bits)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMTypeRef src_type = LLVMTypeOf(src);
> + LLVMValueRef smallexpmask, i32_floatexpmask, magic;
> + LLVMValueRef wasinfnan, tmp, res;
> + unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
> + LLVMGetVectorSize(src_type) : 1;
> + struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
> + struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
> + struct lp_build_context f32_bld, i32_bld;
> +
> + lp_build_context_init(&f32_bld, gallivm, f32_type);
> + lp_build_context_init(&i32_bld, gallivm, i32_type);
> +
> + smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
> + ((1 << exponent_bits) - 1) << 23);
> + i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
> + /*
> + * magic number has exponent new exp bias + (new exp bias - old exp bias),
> + * mantissa is 0.
> + */
> + magic = lp_build_const_int_vec(gallivm, i32_type,
> + (255 - (1 << (exponent_bits - 1))) << 23);
> + magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
> +
> + /* adjust exponent and fix denorms */
> + res = lp_build_mul(&f32_bld, src, magic);
> +
> + /*
> + * if exp was max (== NaN or Inf) set new exp to max (keep mantissa),
> + * so a simple "or" will do (because exp adjust will leave mantissa intact)
> + */
> + /* use a float compare (better for 8-wide AVX without AVX2, otherwise an int compare should be used) */
> + smallexpmask = LLVMBuildBitCast(builder, smallexpmask, f32_bld.vec_type, "");
> + wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, src, smallexpmask);
> + res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, "");
> + tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan);
> + res = lp_build_or(&i32_bld, tmp, res);
> +
> + return LLVMBuildBitCast(builder, res, f32_bld.vec_type, "");
> +}
> +
> +
> +/**
> + * Convert rgba float SoA values to packed r11g11b10 values.
> + *
> + * @param src SoA float (vector) values to convert.
> + */
> +LLVMValueRef
> +lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
> + LLVMValueRef *src)
> +{
> + LLVMValueRef dst, rcomp, bcomp, gcomp, shift, mask;
> + struct lp_build_context i32_bld;
> + LLVMTypeRef src_type = LLVMTypeOf(*src);
> + unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
> + LLVMGetVectorSize(src_type) : 1;
> + struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
> +
> + lp_build_context_init(&i32_bld, gallivm, i32_type);
> +
> + /* "rescale" - this does the actual conversion except the packing */
> + rcomp = lp_build_float_to_smallfloat_nosign(gallivm, src[0], 6, 5);
> + gcomp = lp_build_float_to_smallfloat_nosign(gallivm, src[1], 6, 5);
> + bcomp = lp_build_float_to_smallfloat_nosign(gallivm, src[2], 5, 5);
> +
> + /* pack rescaled SoA floats to r11g11b10 AoS values */
> + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 6);
> + rcomp = lp_build_shr(&i32_bld, rcomp, shift);
> +
> + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 17);
> + mask = lp_build_const_int_vec(gallivm, i32_type, 0x7ff << 11);
> + gcomp = lp_build_shr(&i32_bld, gcomp, shift);
> + gcomp = lp_build_and(&i32_bld, gcomp, mask);
> +
> + shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23);
> + mask = lp_build_const_int_vec(gallivm, i32_type, 0x3ff << 22);
> + bcomp = lp_build_shl(&i32_bld, bcomp, shift);
> + bcomp = lp_build_and(&i32_bld, bcomp, mask);
> +
> + dst = lp_build_or(&i32_bld, rcomp, gcomp);
> + return lp_build_or(&i32_bld, dst, bcomp);
> +}
> +
> +
> +/**
> + * Convert packed float format (r11g11b10) value(s) to rgba float SoA values.
> + *
> + * @param src packed AoS r11g11b10 values (as (vector) int32)
> + * @param dst pointer to the SoA result values
> + */
> +void
> +lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
> + LLVMValueRef src,
> + LLVMValueRef *dst)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMTypeRef src_type = LLVMTypeOf(src);
> + LLVMValueRef rcomp, bcomp, gcomp, shift, mask;
> + unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
> + LLVMGetVectorSize(src_type) : 1;
> + struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
> + struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
> + struct lp_build_context i32_bld, f32_bld;
> +
> + lp_build_context_init(&i32_bld, gallivm, i32_type);
> + lp_build_context_init(&f32_bld, gallivm, f32_type);
> +
> + /* put mantissa/exp into "rescaled float" format */
> + mask = lp_build_const_int_vec(gallivm, i32_type, 0x7ff << (23 - 6));
> + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 6);
> + rcomp = lp_build_shl(&i32_bld, src, shift);
> + rcomp = lp_build_and(&i32_bld, rcomp, mask);
> + rcomp = LLVMBuildBitCast(builder, rcomp, f32_bld.vec_type, "");
> + dst[0] = lp_build_smallfloat_nosign_to_float(gallivm, rcomp, 6, 5);
> +
> + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 17);
> + gcomp = lp_build_shl(&i32_bld, src, shift);
> + gcomp = lp_build_and(&i32_bld, gcomp, mask);
> + gcomp = LLVMBuildBitCast(builder, gcomp, f32_bld.vec_type, "");
> + dst[1] = lp_build_smallfloat_nosign_to_float(gallivm, gcomp, 6, 5);
> +
> + shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23);
> + mask = lp_build_const_int_vec(gallivm, i32_type, 0x3ff << (23 - 5));
> + /* really logical shift but gets masked out anyway */
> + bcomp = lp_build_shr(&i32_bld, src, shift);
> + bcomp = lp_build_and(&i32_bld, bcomp, mask);
> + bcomp = LLVMBuildBitCast(builder, bcomp, f32_bld.vec_type, "");
> + dst[2] = lp_build_smallfloat_nosign_to_float(gallivm, bcomp, 5, 5);
> +
> + /* Just set alpha to one */
> + dst[3] = f32_bld.one;
> +}
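Again purely for illustration (made-up helper names, not part of the patch), the
scalar equivalent of this unpack is roughly:

#include <stdint.h>
#include <math.h>

/* illustrative scalar unpack of one unsigned small float channel:
   'm' mantissa bits, 'e' exponent bits, no sign bit */
static float
smallfloat_to_float_ref(uint32_t v, unsigned m, unsigned e)
{
   uint32_t expo = (v >> m) & ((1u << e) - 1);
   uint32_t mant = v & ((1u << m) - 1);
   int bias = (1 << (e - 1)) - 1;

   if (expo == (1u << e) - 1)                     /* Inf / NaN */
      return mant ? NAN : INFINITY;
   if (expo == 0)                                 /* zero / denorm */
      return ldexpf((float)mant, 1 - bias - (int)m);
   return ldexpf(1.0f + (float)mant / (float)(1u << m), (int)expo - bias);
}

static void
r11g11b10_to_float_ref(uint32_t packed, float rgba[4])
{
   rgba[0] = smallfloat_to_float_ref(packed & 0x7ff, 6, 5);
   rgba[1] = smallfloat_to_float_ref((packed >> 11) & 0x7ff, 6, 5);
   rgba[2] = smallfloat_to_float_ref(packed >> 22, 5, 5);
   rgba[3] = 1.0f;
}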
> +
> +
> +/**
> + * Convert shared exponent format (rgb9e5) value(s) to rgba float SoA values.
> + *
> + * @param src packed AoS rgb9e5 values (as (vector) int32)
> + * @param dst pointer to the SoA result values
> + */
> +void
> +lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
> + LLVMValueRef src,
> + LLVMValueRef *dst)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMTypeRef src_type = LLVMTypeOf(src);
> + LLVMValueRef rcomp, bcomp, gcomp, exp, shift, mask;
> + unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
> + LLVMGetVectorSize(src_type) : 1;
> + struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
> + struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
> + struct lp_build_context i32_bld, f32_bld;
> +
> + lp_build_context_init(&i32_bld, gallivm, i32_type);
> + lp_build_context_init(&f32_bld, gallivm, f32_type);
> +
> + /* extract exponent */
> + mask = lp_build_const_int_vec(gallivm, i32_type, 0x1f << 23);
> + shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23);
> + exp = lp_build_shl(&i32_bld, src, shift);
> + exp = lp_build_and(&i32_bld, exp, mask);
> +
> + /* put mantissa/exp into "rescaled float" format */
> + mask = lp_build_const_int_vec(gallivm, i32_type, 0x1ff << (23 - 9));
> + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 9);
> + rcomp = lp_build_shl(&i32_bld, src, shift);
> + rcomp = lp_build_and(&i32_bld, rcomp, mask);
> + rcomp = lp_build_or(&i32_bld, rcomp, exp);
> + rcomp = LLVMBuildBitCast(builder, rcomp, f32_bld.vec_type, "");
> + dst[0] = lp_build_smallfloat_nosign_to_float(gallivm, rcomp, 9, 5);
> +
> + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 18);
> + gcomp = lp_build_shl(&i32_bld, src, shift);
> + gcomp = lp_build_and(&i32_bld, gcomp, mask);
> + gcomp = lp_build_or(&i32_bld, gcomp, exp);
> + gcomp = LLVMBuildBitCast(builder, gcomp, f32_bld.vec_type, "");
> + dst[1] = lp_build_smallfloat_nosign_to_float(gallivm, gcomp, 9, 5);
> +
> + shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23);
> + /* really logical shift but gets masked out anyway */
> + bcomp = lp_build_shr(&i32_bld, src, shift);
> + bcomp = lp_build_and(&i32_bld, bcomp, mask);
> + bcomp = lp_build_or(&i32_bld, bcomp, exp);
> + bcomp = LLVMBuildBitCast(builder, bcomp, f32_bld.vec_type, "");
> + dst[2] = lp_build_smallfloat_nosign_to_float(gallivm, bcomp, 9, 5);
> +
> + /* Just set alpha to one */
> + dst[3] = f32_bld.one;
> +}
> +
> +
> +/**
> * Converts int16 half-float to float32
> * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i think?)
> * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
> index d7dfed8..d8bc294 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
> @@ -62,6 +62,20 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
> LLVMValueRef src);
>
> LLVMValueRef
> +lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
> + LLVMValueRef *src);
> +
> +void
> +lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
> + LLVMValueRef src,
> + LLVMValueRef *dst);
> +
> +void
> +lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
> + LLVMValueRef src,
> + LLVMValueRef *dst);
> +
> +LLVMValueRef
> lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
> struct lp_type src_type,
> unsigned dst_width,
> diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
> index 93e125d..ece7679 100644
> --- a/src/gallium/drivers/llvmpipe/lp_screen.c
> +++ b/src/gallium/drivers/llvmpipe/lp_screen.c
> @@ -321,7 +321,8 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
> if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB)
> return FALSE;
>
> - if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
> + if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN &&
> + format != PIPE_FORMAT_R11G11B10_FLOAT)
> return FALSE;
> assert(format_desc->block.width == 1);
> assert(format_desc->block.height == 1);
> @@ -329,7 +330,8 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
> if (format_desc->is_mixed)
> return FALSE;
>
> - if (!format_desc->is_array && !format_desc->is_bitmask)
> + if (!format_desc->is_array && !format_desc->is_bitmask &&
> + format != PIPE_FORMAT_R11G11B10_FLOAT)
> return FALSE;
>
> /*
> diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
> index d8369b4..953a5c1 100644
> --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
> +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
> @@ -972,6 +972,17 @@ lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
> unsigned i;
> unsigned chan;
>
> + if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
> + /* just make this a 32bit uint */
> + type->floating = false;
> + type->fixed = false;
> + type->sign = false;
> + type->norm = false;
> + type->width = 32;
> + type->length = 1;
> + return;
> + }
> +
> for (i = 0; i < 4; i++)
> if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
> break;
> @@ -1009,6 +1020,17 @@ lp_blend_type_from_format_desc(const struct util_format_description *format_desc
> unsigned i;
> unsigned chan;
>
> + if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
> + /* always use ordinary floats for blending */
> + type->floating = true;
> + type->fixed = false;
> + type->sign = true;
> + type->norm = false;
> + type->width = 32;
> + type->length = 4;
> + return;
> + }
> +
> for (i = 0; i < 4; i++)
> if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
> break;
> @@ -1122,6 +1144,48 @@ convert_to_blend_type(struct gallivm_state *gallivm,
> unsigned pixels = 16 / num_srcs;
> bool is_arith;
>
> + /*
> + * full custom path for packed floats - none of the later functions would do
> + * anything useful, and given the lp_type representation they can't be fixed.
> + */
> + if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
> + LLVMValueRef tmpsrc[4];
> + /*
> + * This is pretty suboptimal for this case; blending in SoA would be much
> + * better, since the conversion gets us SoA values, so we need to convert back.
> + */
> + assert(src_type.width == 32);
> + assert(dst_type.floating);
> + assert(dst_type.width == 32);
> + assert(dst_type.length % 4 == 0);
> + for (i = 0; i < 4; i++) {
> + tmpsrc[i] = src[i];
> + }
> + for (i = 0; i < num_srcs / 4; i++) {
> + LLVMValueRef tmpsoa[4];
> + LLVMValueRef tmps = tmpsrc[i];
> + if (num_srcs == 8) {
> + LLVMValueRef shuffles[8];
> + unsigned j;
> + /* fetch was 4 values but need 8-wide output values */
> + tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
> + /*
> + * for 8-wide, the aos transpose would give us the wrong order, not
> + * matching the incoming converted fs values and mask. ARGH.
> + */
> + for (j = 0; j < 4; j++) {
> + shuffles[j] = lp_build_const_int32(gallivm, j * 2);
> + shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
> + }
> + tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
> + LLVMConstVector(shuffles, 8), "");
> + }
> + lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
> + lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
> + }
> + return;
> + }
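(For what it's worth, for the 8-wide case the shuffle above works out to the
constant index vector <0,2,4,6,1,3,5,7>.)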
> +
> lp_mem_type_from_format_desc(src_fmt, &mem_type);
> lp_blend_type_from_format_desc(src_fmt, &blend_type);
>
> @@ -1225,6 +1289,47 @@ convert_from_blend_type(struct gallivm_state *gallivm,
> unsigned pixels = 16 / num_srcs;
> bool is_arith;
>
> + /*
> + * full custom path for packed floats - none of the later functions would do
> + * anything useful, and given the lp_type representation they can't be fixed.
> + */
> + if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
> + /*
> + * This is pretty suboptimal for this case; blending in SoA would be much
> + * better - we need to transpose the AoS values back to SoA values for
> + * conversion/packing.
> + */
> + assert(src_type.floating);
> + assert(src_type.width == 32);
> + assert(src_type.length % 4 == 0);
> + assert(dst_type.width == 32);
> + for (i = 0; i < num_srcs / 4; i++) {
> + LLVMValueRef tmpsoa[4], tmpdst;
> + lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
> + tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
> + if (num_srcs == 8) {
> + LLVMValueRef tmpaos, shuffles[8];
> + unsigned j;
> + /*
> + * for 8-wide, the aos transpose has given us the wrong order, not matching
> + * the output order. HMPF. Also need to split the output values manually.
> + */
> + for (j = 0; j < 4; j++) {
> + shuffles[j * 2] = lp_build_const_int32(gallivm, j);
> + shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
> + }
> + tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
> + LLVMConstVector(shuffles, 8), "");
> + src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
> + src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
> + }
> + else {
> + src[i] = tmpdst;
> + }
> + }
> + return;
> + }
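(The 8-wide shuffle here works out to <0,4,1,5,2,6,3,7>, i.e. the inverse of the
one used in convert_to_blend_type.)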
> +
> lp_mem_type_from_format_desc(src_fmt, &mem_type);
> lp_blend_type_from_format_desc(src_fmt, &blend_type);
>
> @@ -1532,6 +1637,17 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
> }
> }
>
> + if (out_format == PIPE_FORMAT_R11G11B10_FLOAT) {
> + /* the code above can't work for layout_other */
> + dst_channels = 4; /* HACK: this is really a fake 4, but we need it due to the transpose stuff later */
> + has_alpha = true;
> + swizzle[0] = 0;
> + swizzle[1] = 1;
> + swizzle[2] = 2;
> + swizzle[3] = 3;
> + pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
> + }
> +
> /* If 3 channels then pad to include alpha for 4 element transpose */
> if (dst_channels == 3 && !has_alpha) {
> for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
> @@ -1756,6 +1872,16 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>
> dst_type.length *= 16 / dst_count;
>
> + if (out_format == PIPE_FORMAT_R11G11B10_FLOAT) {
> + /*
> + * we need multiple values at once for the conversion, so we might as well
> + * load them vectorized here instead of concatenating later.
> + * (We still need concatenation later for 8-wide vectors.)
> + */
> + dst_count = block_height;
> + dst_type.length = block_width;
> + }
> +
> load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
> dst, dst_type, dst_count, dst_alignment);
>
>