[Mesa-dev] [PATCH] llvmpipe: add EXT_packed_float render target format support

Thu Mar 21 19:31:45 PDT 2013

From: Roland Scheidegger <sroland at vmware.com>

New conversion code to handle conversion from/to r11g11b10 AoS to/from
SoA floats, and also add code for conversion from rgb9e5 AoS to float SoA
(which works pretty much the same as r11g11b10 except for the packing).
(This code should also be used for texture sampling instead of
relying on u_format conversion but it's not yet, so rgb9e5 is unused.)
Unfortunately a crazy amount of hacks is necessary to get the conversion
code running in llvmpipe's generate_unswizzled_blend, which isn't well
suited for formats where the storage representation has nothing to do
with what's needed for blending. The conversion goes from packed AoS
values (the storage format) to float SoA values, because this is much
more natural for the conversion, and likewise from SoA values to packed
AoS values - but the "blend" (which includes trivial things like partial
mask) works on AoS values. So incoming fs values will go SoA->AoS, values
from destination will go packed AoS->SoA->AoS, then do blend, then
AoS->SoA->packed AoS, which probably isn't the most efficient way, though
the shuffles are probably bearable.

Passes piglit fbo-blending-formats (with GL_EXT_packed_float parameter),
still need to verify Inf/NaNs (where most of the complexity in the
conversion comes from actually).

v2: drop the (very bogus) rgb9e5 part, and do component extraction
in the helper code for r11g11b10 to float conversion, making the code
slightly more compact (suggested by Jose), now that there are no other
callers left this works quite well. (Could do the same for the
opposite way but it's less than ideal there, final part of packing
needs to be done in caller anyway and there'd be another conditional.)
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c |  250 +++++++++++++++++++++++++++
 src/gallium/auxiliary/gallivm/lp_bld_conv.h |    9 +
 src/gallium/drivers/llvmpipe/lp_screen.c    |    6 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  126 ++++++++++++++
 4 files changed, 389 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index dc3649d..06d64c7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -155,6 +155,256 @@ lp_build_bswap_vec(struct gallivm_state *gallivm,
 
 
 /**
+ * Convert float32 to a float-like value with less exponent and mantissa
+ * bits. The exponent is still biased, and the mantissa still has an implied 1,
+ * but there's no sign bit.
+ *
+ * @param src             (vector) float value to convert
+ * @param mantissa_bits   the number of mantissa bits
+ * @param exponent_bits   the number of exponent bits
+ *
+ * Unlike float_to_half, the accurate method is used here.
+ * This implements round-towards-zero (trunc) hence too large numbers get
+ * converted to largest representable number, not infinity.
+ * Small numbers may get converted to denorms, depending on normal
+ * float denorm handling of the cpu.
+ * Note that compared to the references, below, we skip any rounding bias
+ * and do strict rounding towards zero (if I got the constants right...)
+ * - OpenGL allows rounding towards zero (though not preferred) and
+ * DX10 even seems to require it.
+ * Note that this will not try to pack the values somehow - they will
+ * look like "rescaled floats" (except for Inf/NaN) (but returned as
+ * (vector) int32).
+ *
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ * ref https://gist.github.com/rygorous/2156668
+ */
+static LLVMValueRef
+lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm,
+                                    LLVMValueRef src,
+                                    unsigned mantissa_bits,
+                                    unsigned exponent_bits)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef src_type = LLVMTypeOf(src);
+   LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
+   LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs;
+   LLVMValueRef isnan, isposinf, isnanorposinf, i32_qnanbit, nanorposinfnum;
+   unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+                        LLVMGetVectorSize(src_type) : 1;
+   struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
+   struct lp_build_context f32_bld, i32_bld;
+   LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
+
+   lp_build_context_init(&f32_bld, gallivm, f32_type);
+   lp_build_context_init(&i32_bld, gallivm, i32_type);
+
+   i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
+                                             ((1 << exponent_bits) - 1) << 23);
+   i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
+
+   /* "ordinary" number */
+   /* clamp to pos range (can still have sign bit if NaN but doesn't matter) */
+   clamped = lp_build_max(&f32_bld, src, zero);
+   clamped = LLVMBuildBitCast(builder, clamped, i32_bld.vec_type, "");
+   /* get rid of excess mantissa bits */
+   /* really not sure about that constant */
+   i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
+                                          ~((1 << (23 - mantissa_bits)) - 1));
+
+   tmp = lp_build_and(&i32_bld, clamped, i32_roundmask);
+   tmp = LLVMBuildBitCast(builder, tmp, f32_bld.vec_type, "");
+   /* bias exponent (and denormalize if necessary) */
+   magic = lp_build_const_int_vec(gallivm, i32_type,
+                                  ((1 << (exponent_bits - 1)) - 1) << 23);
+   magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
+   normal = lp_build_mul(&f32_bld, tmp, magic);
+
+   /* clamp to max value */
+   small_max = lp_build_const_int_vec(gallivm, i32_type,
+                                      (((1 << exponent_bits) - 2) << 23) |
+                                      (((1 << mantissa_bits) - 1) << (23 - mantissa_bits)));
+   small_max = LLVMBuildBitCast(builder, small_max, f32_bld.vec_type, "");
+   normal = lp_build_min(&f32_bld, normal, small_max);
+   normal = LLVMBuildBitCast(builder, normal, i32_bld.vec_type, "");
+
+   /*
+    * handle nan/inf cases
+    * a little bit tricky since -Inf -> 0, +Inf -> +Inf, +-Nan -> +Nan
+    * (on a lucky day we could just use the max(src, zero) result - it has
+    * -Inf clamped to 0, and MIGHT preserve the NaNs).
+    * isnan and isposinf can never both be set, so the masks are OR'ed.
+    */
+   src_abs = lp_build_abs(&f32_bld, src);
+   src_abs = LLVMBuildBitCast(builder, src_abs, i32_bld.vec_type, "");
+   src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, "");
+   isnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER,
+                            src_abs, i32_floatexpmask);
+   isposinf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_EQUAL,
+                               src, i32_floatexpmask);
+   isnanorposinf = lp_build_or(&i32_bld, isnan, isposinf);
+   /* could also set more mantissa bits but need at least the highest mantissa bit */
+   i32_qnanbit = lp_build_const_vec(gallivm, i32_type, 1 << 22);
+   /* combine maxexp with qnanbit */
+   nanorposinfnum = lp_build_or(&i32_bld, i32_smallexpmask,
+                                lp_build_and(&i32_bld, isnan, i32_qnanbit));
+
+   return lp_build_select(&i32_bld, isnanorposinf, nanorposinfnum, normal);
+}
+
+
+/**
+ * Convert rgba float SoA values to packed r11g11b10 values (src[3] is not read).
+ *
+ * @param src   SoA float (vector) values to convert; src[0..2] are r, g, b.
+ */
+LLVMValueRef
+lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
+                            LLVMValueRef *src)
+{
+   LLVMValueRef dst, rcomp, bcomp, gcomp, shift, mask;
+   struct lp_build_context i32_bld;
+   LLVMTypeRef src_type = LLVMTypeOf(*src);
+   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+                            LLVMGetVectorSize(src_type) : 1;
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
+
+   lp_build_context_init(&i32_bld, gallivm, i32_type);
+
+   /* "rescale" - this does the actual conversion except the packing */
+   rcomp = lp_build_float_to_smallfloat_nosign(gallivm, src[0], 6, 5);
+   gcomp = lp_build_float_to_smallfloat_nosign(gallivm, src[1], 6, 5);
+   bcomp = lp_build_float_to_smallfloat_nosign(gallivm, src[2], 5, 5);
+
+   /* pack rescaled SoA floats to AoS: r in bits 0-10, g in 11-21, b in 22-31 */
+   shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 6);
+   rcomp = lp_build_shr(&i32_bld, rcomp, shift);
+
+   shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 17);
+   mask = lp_build_const_int_vec(gallivm, i32_type, 0x7ff << 11);
+   gcomp = lp_build_shr(&i32_bld, gcomp, shift);
+   gcomp = lp_build_and(&i32_bld, gcomp, mask);
+
+   shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23);
+   mask = lp_build_const_int_vec(gallivm, i32_type, 0x3ff << 22);
+   bcomp = lp_build_shl(&i32_bld, bcomp, shift);
+   bcomp = lp_build_and(&i32_bld, bcomp, mask);
+
+   dst = lp_build_or(&i32_bld, rcomp, gcomp);
+   return lp_build_or(&i32_bld, dst, bcomp);
+}
+
+
+/**
+ * Convert a float-like value with less exponent and mantissa
+ * bits than a normal float32 to a float32. The mantissa of
+ * the source value is assumed to have an implied 1, and the exponent
+ * is biased. There are no negative values.
+ * The source value to extract must be in a 32bit int (vector).
+ * While this helper is generic, it is only ever going to be useful for
+ * r11g11b10 (no other common format exists with the same properties).
+ *
+ * @param src             (vector) value to convert
+ * @param mantissa_bits   the number of mantissa bits
+ * @param exponent_bits   the number of exponent bits
+ * @param mantissa_start  the bit start position of the packed component
+ *
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ * ref https://gist.github.com/rygorous/2156668
+ */
+static LLVMValueRef
+lp_build_smallfloat_nosign_to_float(struct gallivm_state *gallivm,
+                                    LLVMValueRef src,
+                                    unsigned mantissa_bits,
+                                    unsigned exponent_bits,
+                                    unsigned mantissa_start)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef src_type = LLVMTypeOf(src);
+   LLVMValueRef smallexpmask, i32_floatexpmask, magic;
+   LLVMValueRef wasinfnan, tmp, res, shift, mask;
+   unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+                        LLVMGetVectorSize(src_type) : 1;
+   unsigned exponent_start = mantissa_start + mantissa_bits;
+   struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
+   struct lp_build_context f32_bld, i32_bld;
+
+   lp_build_context_init(&f32_bld, gallivm, f32_type);
+   lp_build_context_init(&i32_bld, gallivm, i32_type);
+
+   /* extract the component to "float position" */
+   if (exponent_start < 23) {
+      shift = lp_build_const_int_vec(gallivm, i32_type, 23 - exponent_start);
+      src = lp_build_shl(&i32_bld, src, shift);
+   }
+   else {
+      shift = lp_build_const_int_vec(gallivm, i32_type, exponent_start - 23);
+      src = lp_build_shr(&i32_bld, src, shift);
+   }
+   mask = lp_build_const_int_vec(gallivm, i32_type,
+                                 ((1 << (mantissa_bits + exponent_bits)) - 1) <<
+                                 (23 - mantissa_bits));
+   src = lp_build_and(&i32_bld, src, mask);
+   src = LLVMBuildBitCast(builder, src, f32_bld.vec_type, "");
+
+   /* now do the actual scaling */
+   smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
+                                         ((1 << exponent_bits) - 1) << 23);
+   i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
+   /*
+    * magic number has exponent new exp bias + (new exp bias - old exp bias),
+    * mantissa is 0.
+    */
+   magic = lp_build_const_int_vec(gallivm, i32_type,
+                                  (255 - (1 << (exponent_bits - 1))) << 23);
+   magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
+
+   /* adjust exponent and fix denorms */
+   res = lp_build_mul(&f32_bld, src, magic);
+
+   /*
+    * if exp was max (== NaN or Inf) set new exp to max (keep mantissa),
+    * so a simple "or" will do (because exp adjust will leave mantissa intact)
+    */
+   /* float compare vs. the small exp mask (better for AVX 8-wide w/o AVX2, else use int) */
+   smallexpmask = LLVMBuildBitCast(builder, smallexpmask, f32_bld.vec_type, "");
+   wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, src, smallexpmask);
+   res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, "");
+   tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan);
+   res = lp_build_or(&i32_bld, tmp, res);
+
+   return LLVMBuildBitCast(builder, res, f32_bld.vec_type, "");
+}
+
+
+/**
+ * Unpack r11g11b10 packed float value(s) into four float SoA channels.
+ *
+ * @param src   packed AoS r11g11b10 values (scalar or vector of int32)
+ * @param dst   receives r, g, b in dst[0..2]; dst[3] (alpha) is set to one
+ */
+void
+lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
+                            LLVMValueRef src,
+                            LLVMValueRef *dst)
+{
+   LLVMTypeRef packed_type = LLVMTypeOf(src);
+   unsigned num_elems = 1;
+   struct lp_type float_type;
+   if (LLVMGetTypeKind(packed_type) == LLVMVectorTypeKind)
+      num_elems = LLVMGetVectorSize(packed_type);
+   float_type = lp_type_float_vec(32, 32 * num_elems);
+   /* r and g: 6 mantissa + 5 exponent bits; b: 5 + 5; no alpha stored */
+   dst[0] = lp_build_smallfloat_nosign_to_float(gallivm, src, 6, 5, 0);
+   dst[1] = lp_build_smallfloat_nosign_to_float(gallivm, src, 6, 5, 11);
+   dst[2] = lp_build_smallfloat_nosign_to_float(gallivm, src, 5, 5, 22);
+   dst[3] = lp_build_one(gallivm, float_type);
+}
+
+
+/**
  * Converts int16 half-float to float32
  * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i think?)
  * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
index d7dfed8..5bd6f4f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -62,6 +62,15 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
                        LLVMValueRef src);
 
 LLVMValueRef
+lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
+                            LLVMValueRef *src);
+
+void
+lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
+                            LLVMValueRef src,
+                            LLVMValueRef *dst);
+
+LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                         struct lp_type src_type,
                                         unsigned dst_width,
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 93e125d..ece7679 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -321,7 +321,8 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
       if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB)
          return FALSE;
 
-      if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+      if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN &&
+          format != PIPE_FORMAT_R11G11B10_FLOAT)
          return FALSE;
       assert(format_desc->block.width == 1);
       assert(format_desc->block.height == 1);
@@ -329,7 +330,8 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
       if (format_desc->is_mixed)
          return FALSE;
 
-      if (!format_desc->is_array && !format_desc->is_bitmask)
+      if (!format_desc->is_array && !format_desc->is_bitmask &&
+          format != PIPE_FORMAT_R11G11B10_FLOAT)
          return FALSE;
 
       /*
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index d8369b4..953a5c1 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -972,6 +972,17 @@ lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
    unsigned i;
    unsigned chan;
 
+   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      /* just make this a 32bit uint */
+      type->floating = false;
+      type->fixed = false;
+      type->sign = false;
+      type->norm = false;
+      type->width = 32;
+      type->length = 1;
+      return;
+   }
+
    for (i = 0; i < 4; i++)
       if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
          break;
@@ -1009,6 +1020,17 @@ lp_blend_type_from_format_desc(const struct util_format_description *format_desc
    unsigned i;
    unsigned chan;
 
+   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      /* always use ordinary floats for blending */
+      type->floating = true;
+      type->fixed = false;
+      type->sign = true;
+      type->norm = false;
+      type->width = 32;
+      type->length = 4;
+      return;
+   }
+
    for (i = 0; i < 4; i++)
       if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
          break;
@@ -1122,6 +1144,48 @@ convert_to_blend_type(struct gallivm_state *gallivm,
    unsigned pixels = 16 / num_srcs;
    bool is_arith;
 
+   /*
+    * full custom path for packed floats - none of the later functions would do
+    * anything useful, and given the lp_type representation they can't be fixed.
+    */
+   if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      LLVMValueRef tmpsrc[4];
+      /*
+       * This is pretty suboptimal: for this case blending in SoA would be much
+       * better, since conversion gets us SoA values which we must transpose back.
+       */
+      assert(src_type.width == 32);
+      assert(dst_type.floating);
+      assert(dst_type.width == 32);
+      assert(dst_type.length % 4 == 0);
+      for (i = 0; i < 4; i++) {
+         tmpsrc[i] = src[i];
+      }
+      for (i = 0; i < num_srcs / 4; i++) {
+         LLVMValueRef tmpsoa[4];
+         LLVMValueRef tmps = tmpsrc[i];
+         if (num_srcs == 8) {
+            LLVMValueRef shuffles[8];
+            unsigned j;
+            /* fetch was 4 values but need 8-wide output values */
+            tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
+            /*
+             * for 8-wide aos transpose would give us wrong order not matching
+             * incoming converted fs values and mask. ARGH.
+             */
+            for (j = 0; j < 4; j++) {
+               shuffles[j] = lp_build_const_int32(gallivm, j * 2);
+               shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
+            }
+            tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
+                                          LLVMConstVector(shuffles, 8), "");
+         }
+         lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
+         lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
+      }
+      return;
+   }
+
    lp_mem_type_from_format_desc(src_fmt, &mem_type);
    lp_blend_type_from_format_desc(src_fmt, &blend_type);
 
@@ -1225,6 +1289,47 @@ convert_from_blend_type(struct gallivm_state *gallivm,
    unsigned pixels = 16 / num_srcs;
    bool is_arith;
 
+   /*
+    * full custom path for packed floats - none of the later functions would do
+    * anything useful, and given the lp_type representation they can't be fixed.
+    */
+   if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      /*
+       * This is pretty suboptimal: for this case blending in SoA would be much
+       * better - we need to transpose the AoS values back to SoA values for
+       * conversion/packing.
+       */
+      assert(src_type.floating);
+      assert(src_type.width == 32);
+      assert(src_type.length % 4 == 0);
+      assert(dst_type.width == 32);
+      for (i = 0; i < num_srcs / 4; i++) {
+         LLVMValueRef tmpsoa[4], tmpdst;
+         lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
+         tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
+         if (num_srcs == 8) {
+            LLVMValueRef tmpaos, shuffles[8];
+            unsigned j;
+            /*
+             * for 8-wide aos transpose has given us wrong order not matching
+             * output order. HMPF. Also need to split the output values manually.
+             */
+            for (j = 0; j < 4; j++) {
+               shuffles[j * 2] = lp_build_const_int32(gallivm, j);
+               shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
+            }
+            tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
+                                            LLVMConstVector(shuffles, 8), "");
+            src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
+            src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
+         }
+         else {
+            src[i] = tmpdst;
+         }
+      }
+      return;
+   }
+
    lp_mem_type_from_format_desc(src_fmt, &mem_type);
    lp_blend_type_from_format_desc(src_fmt, &blend_type);
 
@@ -1532,6 +1637,17 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
       }
    }
 
+   if (out_format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      /* the code above can't work for layout_other */
+      dst_channels = 4; /* HACK: this is fake 4 really but need it due to transpose stuff later */
+      has_alpha = true;
+      swizzle[0] = 0;
+      swizzle[1] = 1;
+      swizzle[2] = 2;
+      swizzle[3] = 3;
+      pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
+   }
+
    /* If 3 channels then pad to include alpha for 4 element transpose */
    if (dst_channels == 3 && !has_alpha) {
       for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
@@ -1756,6 +1872,16 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
 
    dst_type.length *= 16 / dst_count;
 
+   if (out_format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      /*
+       * we need multiple values at once for the conversion, so can as well
+       * load them vectorized here too instead of concatenating later.
+       * (Still need concatenation later for 8-wide vectors).
+       */
+      dst_count = block_height;
+      dst_type.length = block_width;
+   }
+
    load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
                          dst, dst_type, dst_count, dst_alignment);
 
-- 
1.7.9.5