[Mesa-dev] [PATCH 2/2] llvmpipe: implement dual source blending

Fri Feb 8 08:36:36 PST 2013

Forgot to mention, this passes the piglit arb_blend_func_extended tests.
Oh and I forgot to remove the load_shader_output function in this patch
(this was an attempt to factor out some more code but ultimately I
reverted it as it made the code no bit simpler).

Roland

Am 08.02.2013 04:19, schrieb sroland at vmware.com:
> From: Roland Scheidegger <sroland at vmware.com>
> 
> link up the fs outputs and blend inputs, and make sure the second blend source
> is correctly loaded and converted (which is quite complex).
> There's a slight refactoring of the monster generate_unswizzled_blend()
> function where it makes sense to factor out alpha conversion (which needs
> to run twice for dual source blend).
> ---
>  src/gallium/drivers/llvmpipe/lp_bld_blend.h     |    1 +
>  src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c |   13 +-
>  src/gallium/drivers/llvmpipe/lp_screen.c        |    2 +-
>  src/gallium/drivers/llvmpipe/lp_state_fs.c      |  331 +++++++++++++++++------
>  src/gallium/drivers/llvmpipe/lp_test_blend.c    |    3 +-
>  5 files changed, 257 insertions(+), 93 deletions(-)
> 
> diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
> index 4bd2867..249a345 100644
> --- a/src/gallium/drivers/llvmpipe/lp_bld_blend.h
> +++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
> @@ -62,6 +62,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
>                     LLVMValueRef src,
>                     LLVMValueRef src_alpha,
>                     LLVMValueRef src1,
> +                   LLVMValueRef src1_alpha,
>                     LLVMValueRef dst,
>                     LLVMValueRef mask,
>                     LLVMValueRef const_,
> diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
> index 8e9e7fe..c4d04a2 100644
> --- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
> +++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
> @@ -70,6 +70,7 @@ struct lp_build_blend_aos_context
>     LLVMValueRef src;
>     LLVMValueRef src_alpha;
>     LLVMValueRef src1;
> +   LLVMValueRef src1_alpha;
>     LLVMValueRef dst;
>     LLVMValueRef const_;
>     LLVMValueRef const_alpha;
> @@ -94,6 +95,7 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
>                                   boolean alpha)
>  {
>     LLVMValueRef src_alpha = bld->src_alpha ? bld->src_alpha : bld->src;
> +   LLVMValueRef src1_alpha = bld->src1_alpha ? bld->src1_alpha : bld->src1;
>     LLVMValueRef const_alpha = bld->const_alpha ? bld->const_alpha : bld->const_;
>  
>     switch (factor) {
> @@ -123,8 +125,9 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
>     case PIPE_BLENDFACTOR_CONST_ALPHA:
>        return const_alpha;
>     case PIPE_BLENDFACTOR_SRC1_COLOR:
> -   case PIPE_BLENDFACTOR_SRC1_ALPHA:
>        return bld->src1;
> +   case PIPE_BLENDFACTOR_SRC1_ALPHA:
> +      return src1_alpha;
>     case PIPE_BLENDFACTOR_INV_SRC_COLOR:
>        if(!bld->inv_src)
>           bld->inv_src = lp_build_comp(&bld->base, bld->src);
> @@ -147,8 +150,9 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
>           bld->inv_const_alpha = lp_build_comp(&bld->base, const_alpha);
>        return bld->inv_const_alpha;
>     case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
> -   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
>        return lp_build_comp(&bld->base, bld->src1);
> +   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
> +      return lp_build_comp(&bld->base, src1_alpha);
>     default:
>        assert(0);
>        return bld->base.zero;
> @@ -265,10 +269,13 @@ lp_build_blend_factor(struct lp_build_blend_aos_context *bld,
>   * @param type          data type of the pixel vector
>   * @param rt            render target index
>   * @param src           blend src
> + * @param src_alpha     blend src alpha (if not included in src)
>   * @param src1          second blend src (for dual source blend)
> + * @param src1_alpha    second blend src alpha (if not included in src1)
>   * @param dst           blend dst
>   * @param mask          optional mask to apply to the blending result
>   * @param const_        const blend color
> + * @param const_alpha   const blend color alpha (if not included in const_)
>   * @param swizzle       swizzle values for RGBA
>   *
>   * @return the result of blending src and dst
> @@ -282,6 +289,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
>                     LLVMValueRef src,
>                     LLVMValueRef src_alpha,
>                     LLVMValueRef src1,
> +                   LLVMValueRef src1_alpha,
>                     LLVMValueRef dst,
>                     LLVMValueRef mask,
>                     LLVMValueRef const_,
> @@ -307,6 +315,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
>     bld.dst = dst;
>     bld.const_ = const_;
>     bld.src_alpha = src_alpha;
> +   bld.src1_alpha = src1_alpha;
>     bld.const_alpha = const_alpha;
>  
>     /* Find the alpha channel if not provided seperately */
> diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
> index f3bc516..b9c1567 100644
> --- a/src/gallium/drivers/llvmpipe/lp_screen.c
> +++ b/src/gallium/drivers/llvmpipe/lp_screen.c
> @@ -115,7 +115,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
>     case PIPE_CAP_SM3:
>        return 1;
>     case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
> -      return 0;
> +      return 1;
>     case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
>        return PIPE_MAX_SO_BUFFERS;
>     case PIPE_CAP_ANISOTROPIC_FILTER:
> diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
> index 2b31c14..c04fa5e 100644
> --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
> +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
> @@ -66,6 +66,7 @@
>  #include "util/u_dump.h"
>  #include "util/u_string.h"
>  #include "util/u_simple_list.h"
> +#include "util/u_dual_blend.h"
>  #include "os/os_time.h"
>  #include "pipe/p_shader_tokens.h"
>  #include "draw/draw_context.h"
> @@ -247,6 +248,8 @@ generate_fs(struct gallivm_state *gallivm,
>     boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 &&
>                              shader->info.base.num_inputs < 3 &&
>                              shader->info.base.num_instructions < 8);
> +   const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
> +                                     util_blend_state_is_dual(&key->blend, 0);
>     unsigned attrib;
>     unsigned chan;
>     unsigned cbuf;
> @@ -302,6 +305,12 @@ generate_fs(struct gallivm_state *gallivm,
>           color[cbuf][chan] = lp_build_alloca(gallivm, vec_type, "color");
>        }
>     }
> +   if (dual_source_blend) {
> +      assert(key->nr_cbufs <= 1);
> +      for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
> +         color[1][chan] = lp_build_alloca(gallivm, vec_type, "color1");
> +      }
> +   }
>  
>     /* do triangle edge testing */
>     if (partial_mask) {
> @@ -414,8 +423,9 @@ generate_fs(struct gallivm_state *gallivm,
>     /* Color write  */
>     for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
>     {
> -      if (shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR &&
> -          shader->info.base.output_semantic_index[attrib] < key->nr_cbufs)
> +      unsigned cbuf = shader->info.base.output_semantic_index[attrib];
> +      if ((shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR) &&
> +           ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)))
>        {
>           unsigned cbuf = shader->info.base.output_semantic_index[attrib];
>           for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
> @@ -478,6 +488,8 @@ generate_fs_loop(struct gallivm_state *gallivm,
>     boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 &&
>                              shader->info.base.num_inputs < 3 &&
>                              shader->info.base.num_instructions < 8);
> +   const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
> +                                     util_blend_state_is_dual(&key->blend, 0);
>     unsigned attrib;
>     unsigned chan;
>     unsigned cbuf;
> @@ -551,7 +563,15 @@ generate_fs_loop(struct gallivm_state *gallivm,
>                                                         num_loop, "color");
>        }
>     }
> -
> +   if (dual_source_blend) {
> +      assert(key->nr_cbufs <= 1);
> +      for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
> +         out_color[1][chan] = lp_build_array_alloca(gallivm,
> +                                                    lp_build_vec_type(gallivm,
> +                                                                      type),
> +                                                    num_loop, "color1");
> +      }
> +   }
>  
>  
>     /* 'mask' will control execution based on quad's pixel alive/killed state */
> @@ -656,10 +676,10 @@ generate_fs_loop(struct gallivm_state *gallivm,
>     /* Color write  */
>     for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
>     {
> -      if (shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR &&
> -          shader->info.base.output_semantic_index[attrib] < key->nr_cbufs)
> +      unsigned cbuf = shader->info.base.output_semantic_index[attrib];
> +      if ((shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR) &&
> +           ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)))
>        {
> -         unsigned cbuf = shader->info.base.output_semantic_index[attrib];
>           for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
>              if(outputs[attrib][chan]) {
>                 /* XXX: just initialize outputs to point at colors[] and
> @@ -1283,6 +1303,142 @@ convert_from_blend_type(struct gallivm_state *gallivm,
>     }
>  }
>  
> +static void
> +load_shader_output(struct gallivm_state *gallivm,
> +                   struct lp_type fs_type,
> +                   const unsigned char swizzle[TGSI_NUM_CHANNELS],
> +                   LLVMValueRef fs_out_color[TGSI_NUM_CHANNELS][4],
> +                   LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS],
> +                   LLVMValueRef* fs_mask,
> +                   LLVMValueRef* src_mask,
> +                   LLVMValueRef* src_alpha,
> +                   const unsigned alpha_channel,
> +                   const unsigned src_channels,
> +                   const bool has_alpha,
> +                   const unsigned num_fs,
> +                   const unsigned dst_channels)
> +{
> +   LLVMBuilderRef builder = gallivm->builder;
> +   unsigned i, j;
> +
> +   for (i = 0; i < num_fs; ++i) {
> +      /* Always load alpha for use in blending */
> +      LLVMValueRef alpha = LLVMBuildLoad(builder, fs_out_color[alpha_channel][i], "");
> +
> +      /* Load each channel */
> +      for (j = 0; j < dst_channels; ++j) {
> +         fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[swizzle[j]][i], "");
> +      }
> +
> +      /* If 3 channels then pad to include alpha for 4 element transpose */
> +      /*
> +       * XXX If we include that here maybe could actually use it instead of
> +       * separate alpha for blending?
> +       */
> +      if (dst_channels == 3 && !has_alpha) {
> +         fs_src[i][3] = alpha;
> +      }
> +
> +      /* We split the row_mask and row_alpha as we want 128bit interleave */
> +      if (fs_type.length == 8) {
> +         src_mask[i*2 + 0]  = lp_build_extract_range(gallivm, fs_mask[i], 0, src_channels);
> +         src_mask[i*2 + 1]  = lp_build_extract_range(gallivm, fs_mask[i], src_channels, src_channels);
> +
> +         src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
> +         src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
> +      } else {
> +         src_mask[i] = fs_mask[i];
> +         src_alpha[i] = alpha;
> +      }
> +   }
> +}
> +
> +
> +static void
> +convert_alpha(struct gallivm_state *gallivm,
> +              struct lp_type row_type,
> +              struct lp_type alpha_type,
> +              const unsigned block_size,
> +              const unsigned block_height,
> +              const unsigned src_count,
> +              const unsigned dst_channels,
> +              const bool pad_inline,
> +              LLVMValueRef* src_alpha)
> +{
> +   LLVMBuilderRef builder = gallivm->builder;
> +   unsigned i, j;
> +   unsigned length = row_type.length;
> +   row_type.length = alpha_type.length;
> +
> +   /* Twiddle the alpha to match pixels */
> +   lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, 4, src_alpha);
> +
> +   for (i = 0; i < 4; ++i) {
> +      lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1, &src_alpha[i], 1);
> +   }
> +
> +   alpha_type = row_type;
> +   row_type.length = length;
> +
> +   /* If only one channel we can only need the single alpha value per pixel */
> +   if (src_count == 1) {
> +      assert(dst_channels == 1);
> +
> +      lp_build_concat_n(gallivm, alpha_type, src_alpha, 4, src_alpha, src_count);
> +   } else {
> +      /* If there are more srcs than rows then we need to split alpha up */
> +      if (src_count > block_height) {
> +         for (i = src_count; i > 0; --i) {
> +            unsigned pixels = block_size / src_count;
> +            unsigned idx = i - 1;
> +
> +            src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4], (idx * pixels) % 4, pixels);
> +         }
> +      }
> +
> +      /* If there is a src for each pixel broadcast the alpha across whole row */
> +      if (src_count == block_size) {
> +         for (i = 0; i < src_count; ++i) {
> +            src_alpha[i] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, row_type), src_alpha[i]);
> +         }
> +      } else {
> +         unsigned pixels = block_size / src_count;
> +         unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
> +         unsigned alpha_span = 1;
> +         LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
> +
> +         /* Check if we need 2 src_alphas for our shuffles */
> +         if (pixels > alpha_type.length) {
> +            alpha_span = 2;
> +         }
> +
> +         /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
> +         for (j = 0; j < row_type.length; ++j) {
> +            if (j < pixels * channels) {
> +               shuffles[j] = lp_build_const_int32(gallivm, j / channels);
> +            } else {
> +               shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
> +            }
> +         }
> +
> +         for (i = 0; i < src_count; ++i) {
> +            unsigned idx1 = i, idx2 = i;
> +
> +            if (alpha_span > 1){
> +               idx1 *= alpha_span;
> +               idx2 = idx1 + 1;
> +            }
> +
> +            src_alpha[i] = LLVMBuildShuffleVector(builder,
> +                                                  src_alpha[idx1],
> +                                                  src_alpha[idx2],
> +                                                  LLVMConstVector(shuffles, row_type.length),
> +                                                  "");
> +         }
> +      }
> +   }
> +}
> +
>  
>  /**
>   * Generates the blend function for unswizzled colour buffers
> @@ -1296,7 +1452,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>                            unsigned int num_fs,
>                            struct lp_type fs_type,
>                            LLVMValueRef* fs_mask,
> -                          LLVMValueRef fs_out_color[TGSI_NUM_CHANNELS][4],
> +                          LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4],
>                            LLVMValueRef context_ptr,
>                            LLVMValueRef color_ptr,
>                            LLVMValueRef stride,
> @@ -1311,9 +1467,12 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>  
>     LLVMBuilderRef builder = gallivm->builder;
>     LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
> +   LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS];
>     LLVMValueRef src_alpha[4 * 4];
> +   LLVMValueRef src1_alpha[4 * 4];
>     LLVMValueRef src_mask[4 * 4];
>     LLVMValueRef src[4 * 4];
> +   LLVMValueRef src1[4 * 4];
>     LLVMValueRef dst[4 * 4];
>     LLVMValueRef blend_color;
>     LLVMValueRef blend_alpha;
> @@ -1323,14 +1482,13 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>     struct lp_build_mask_context mask_ctx;
>     struct lp_type mask_type;
>     struct lp_type blend_type;
> -   struct lp_type alpha_type;
>     struct lp_type row_type;
>     struct lp_type dst_type;
>  
>     unsigned char swizzle[TGSI_NUM_CHANNELS];
>     unsigned vector_width;
> +   unsigned src_channels = TGSI_NUM_CHANNELS;
>     unsigned dst_channels;
> -   unsigned src_channels;
>     unsigned dst_count;
>     unsigned src_count;
>     unsigned i, j;
> @@ -1341,8 +1499,9 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>  
>     bool pad_inline = is_arithmetic_format(out_format_desc);
>     bool has_alpha = false;
> +   const boolean dual_source_blend = variant->key.blend.rt[0].blend_enable &&
> +                                     util_blend_state_is_dual(&variant->key.blend, 0);
>  
> -   src_channels = TGSI_NUM_CHANNELS;
>     mask_type = lp_int32_vec4_type();
>     mask_type.length = fs_type.length;
>  
> @@ -1422,17 +1581,20 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>      */
>     for (i = 0; i < num_fs; ++i) {
>        /* Always load alpha for use in blending */
> -      LLVMValueRef alpha = LLVMBuildLoad(builder, fs_out_color[alpha_channel][i], "");
> +      LLVMValueRef alpha = LLVMBuildLoad(builder, fs_out_color[rt][alpha_channel][i], "");
>  
>        /* Load each channel */
>        for (j = 0; j < dst_channels; ++j) {
> -         fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[swizzle[j]][i], "");
> +         fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[rt][swizzle[j]][i], "");
>        }
>  
>        /* If 3 channels then pad to include alpha for 4 element transpose */
> +      /*
> +       * XXX If we include that here maybe could actually use it instead of
> +       * separate alpha for blending?
> +       */
>        if (dst_channels == 3 && !has_alpha) {
>           fs_src[i][3] = alpha;
> -         swizzle[3] = 3;
>        }
>  
>        /* We split the row_mask and row_alpha as we want 128bit interleave */
> @@ -1447,6 +1609,25 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>           src_alpha[i] = alpha;
>        }
>     }
> +   if (dual_source_blend) {
> +      /* same as above except different src/dst, skip masks and comments... */
> +      for (i = 0; i < num_fs; ++i) {
> +         LLVMValueRef alpha = LLVMBuildLoad(builder, fs_out_color[1][alpha_channel][i], "");
> +
> +         for (j = 0; j < dst_channels; ++j) {
> +            fs_src1[i][j] = LLVMBuildLoad(builder, fs_out_color[1][swizzle[j]][i], "");
> +         }
> +         if (dst_channels == 3 && !has_alpha) {
> +            fs_src1[i][3] = alpha;
> +         }
> +         if (fs_type.length == 8) {
> +            src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
> +            src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
> +         } else {
> +            src1_alpha[i] = alpha;
> +         }
> +      }
> +   }
>  
>     if (util_format_is_pure_integer(out_format)) {
>        /*
> @@ -1467,11 +1648,14 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>        }
>     }
>  
> -
>     /*
>      * Pixel twiddle from fragment shader order to memory order
>      */
>     src_count = generate_fs_twiddle(gallivm, fs_type, num_fs, dst_channels, fs_src, src, pad_inline);
> +   if (dual_source_blend) {
> +      generate_fs_twiddle(gallivm, fs_type, num_fs, dst_channels, fs_src1, src1, pad_inline);
> +   }
> +
>     src_channels = dst_channels < 3 ? dst_channels : 4;
>     if (src_count != num_fs * src_channels) {
>        unsigned ds = src_count / (num_fs * src_channels);
> @@ -1480,12 +1664,17 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>     }
>  
>     blend_type = row_type;
> -   alpha_type = fs_type;
> -   alpha_type.length = 4;
>     mask_type.length = 4;
>  
>     /* Convert src to row_type */
> -   src_count = lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
> +   if (dual_source_blend) {
> +      struct lp_type old_row_type = row_type;
> +      lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
> +      src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type, src1, src_count, src1);
> +   }
> +   else {
> +      src_count = lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
> +   }
>  
>     /* If the rows are not an SSE vector, combine them to become SSE size! */
>     if ((row_type.width * row_type.length) % 128) {
> @@ -1494,6 +1683,9 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>  
>        dst_count = src_count / (vector_width / bits);
>        combined = lp_build_concat_n(gallivm, row_type, src, src_count, src, dst_count);
> +      if (dual_source_blend) {
> +         lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count);
> +      }
>  
>        row_type.length *= combined;
>        src_count /= combined;
> @@ -1569,75 +1761,17 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>      * Alpha conversion
>      */
>     if (!has_alpha) {
> -      unsigned length = row_type.length;
> -      row_type.length = alpha_type.length;
> -
> -      /* Twiddle the alpha to match pixels */
> -      lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, 4, src_alpha);
> -
> -      for (i = 0; i < 4; ++i) {
> -         lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1, &src_alpha[i], 1);
> -      }
> -
> -      alpha_type = row_type;
> -      row_type.length = length;
> -
> -      /* If only one channel we can only need the single alpha value per pixel */
> -      if (src_count == 1) {
> -         assert(dst_channels == 1);
> -
> -         lp_build_concat_n(gallivm, alpha_type, src_alpha, 4, src_alpha, src_count);
> -      } else {
> -         /* If there are more srcs than rows then we need to split alpha up */
> -         if (src_count > block_height) {
> -            for (i = src_count; i > 0; --i) {
> -               unsigned pixels = block_size / src_count;
> -               unsigned idx = i - 1;
> -
> -               src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4], (idx * pixels) % 4, pixels);
> -            }
> -         }
> -
> -         /* If there is a src for each pixel broadcast the alpha across whole row */
> -         if (src_count == block_size) {
> -            for (i = 0; i < src_count; ++i) {
> -               src_alpha[i] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, row_type), src_alpha[i]);
> -            }
> -         } else {
> -            unsigned pixels = block_size / src_count;
> -            unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
> -            unsigned alpha_span = 1;
> -            LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
> -
> -            /* Check if we need 2 src_alphas for our shuffles */
> -            if (pixels > alpha_type.length) {
> -               alpha_span = 2;
> -            }
> -
> -            /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
> -            for (j = 0; j < row_type.length; ++j) {
> -               if (j < pixels * channels) {
> -                  shuffles[j] = lp_build_const_int32(gallivm, j / channels);
> -               } else {
> -                  shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
> -               }
> -            }
> -
> -            for (i = 0; i < src_count; ++i) {
> -               unsigned idx1 = i, idx2 = i;
> -
> -               if (alpha_span > 1){
> -                  idx1 *= alpha_span;
> -                  idx2 = idx1 + 1;
> -               }
> -
> -               src_alpha[i] = LLVMBuildShuffleVector(builder,
> -                                                     src_alpha[idx1],
> -                                                     src_alpha[idx2],
> -                                                     LLVMConstVector(shuffles, row_type.length),
> -                                                     "");
> -            }
> -         }
> +      struct lp_type alpha_type = fs_type;
> +      alpha_type.length = 4;
> +      convert_alpha(gallivm, row_type, alpha_type,
> +                    block_size, block_height,
> +                    src_count, dst_channels,
> +                    pad_inline, src_alpha);
> +      if (dual_source_blend) {
> +         convert_alpha(gallivm, row_type, alpha_type,
> +                       block_size, block_height,
> +                       src_count, dst_channels,
> +                       pad_inline, src1_alpha);
>        }
>     }
>  
> @@ -1693,7 +1827,8 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
>                                    rt,
>                                    src[i],
>                                    has_alpha ? NULL : src_alpha[i],
> -                                  NULL,
> +                                  src1[i],
> +                                  has_alpha ? NULL : src1_alpha[i],
>                                    dst[i],
>                                    partial_mask ? src_mask[i] : NULL,
>                                    blend_color,
> @@ -1788,6 +1923,8 @@ generate_fragment(struct llvmpipe_context *lp,
>     unsigned cbuf;
>     boolean cbuf0_write_all;
>     boolean try_loop = TRUE;
> +   const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
> +                                     util_blend_state_is_dual(&key->blend, 0);
>  
>     assert(lp_native_vector_width / 32 >= 4);
>  
> @@ -1951,10 +2088,17 @@ generate_fragment(struct llvmpipe_context *lp,
>                       mask_input,
>                       thread_data_ptr);
>  
> -         for (cbuf = 0; cbuf < key->nr_cbufs; cbuf++)
> +         for (cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
>              for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
>                 fs_out_color[cbuf][chan][i] =
>                    out_color[cbuf * !cbuf0_write_all][chan];
> +         }
> +         if (dual_source_blend) {
> +            /* only support one dual source blend target hence always use output 1 */
> +            for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
> +               fs_out_color[1][chan][i] =
> +                  out_color[1][chan];
> +         }
>        }
>     }
>     else {
> @@ -2024,6 +2168,15 @@ generate_fragment(struct llvmpipe_context *lp,
>                 fs_out_color[cbuf][chan][i] = ptr;
>              }
>           }
> +         if (dual_source_blend) {
> +            /* only support one dual source blend target hence always use output 1 */
> +            for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
> +               ptr = LLVMBuildGEP(builder,
> +                                  color_store[1][chan],
> +                                  &indexi, 1, "");
> +               fs_out_color[1][chan][i] = ptr;
> +            }
> +         }
>        }
>     }
>  
> @@ -2052,7 +2205,7 @@ generate_fragment(struct llvmpipe_context *lp,
>                               "");
>  
>        generate_unswizzled_blend(gallivm, cbuf, variant, key->cbuf_format[cbuf],
> -                                num_fs, fs_type, fs_mask, fs_out_color[cbuf],
> +                                num_fs, fs_type, fs_mask, fs_out_color,
>                                  context_ptr, color_ptr, stride, partial_mask, do_branch);
>     }
>  
> diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
> index 6faaedf..c3d2714 100644
> --- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
> +++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
> @@ -177,7 +177,8 @@ add_blend_test(struct gallivm_state *gallivm,
>     dst = LLVMBuildLoad(builder, dst_ptr, "dst");
>     con = LLVMBuildLoad(builder, const_ptr, "const");
>  
> -   res = lp_build_blend_aos(gallivm, blend, format, type, rt, src, NULL, src1, dst, NULL, con, NULL, swizzle, 4);
> +   res = lp_build_blend_aos(gallivm, blend, format, type, rt, src, NULL,
> +                            src1, NULL, dst, NULL, con, NULL, swizzle, 4);
>  
>     lp_build_name(res, "res");
>  
>