Mesa (master): vc4: Move all of our fixed function fragment color handling to NIR.

Fri Aug 14 18:41:52 UTC 2015

Module: Mesa
Branch: master
Commit: bf3c50fba221f216e38d3f60f89161ced4c684c0
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=bf3c50fba221f216e38d3f60f89161ced4c684c0

Author: Eric Anholt <eric at anholt.net>
Date:   Mon Apr 13 21:36:24 2015 -0700

vc4: Move all of our fixed function fragment color handling to NIR.

This massively reduces our dependency on VC4-specific optimization passes.

shader-db:
total uniforms in shared programs: 32077 -> 32067 (-0.03%)
uniforms in affected programs:     149 -> 139 (-6.71%)
total instructions in shared programs: 98208 -> 98182 (-0.03%)
instructions in affected programs:     2154 -> 2128 (-1.21%)

---

 src/gallium/drivers/vc4/Makefile.sources      |    1 +
 src/gallium/drivers/vc4/vc4_nir_lower_blend.c |  431 ++++++++++++++++++++++++
 src/gallium/drivers/vc4/vc4_nir_lower_io.c    |   24 +-
 src/gallium/drivers/vc4/vc4_program.c         |  440 ++++---------------------
 src/gallium/drivers/vc4/vc4_qir.h             |   16 +-
 src/gallium/drivers/vc4/vc4_uniforms.c        |   14 +-
 6 files changed, 538 insertions(+), 388 deletions(-)

diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index b09ffa6..6fb40c2 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -19,6 +19,7 @@ C_SOURCES := \
 	vc4_fence.c \
 	vc4_formats.c \
 	vc4_job.c \
+	vc4_nir_lower_blend.c \
 	vc4_nir_lower_io.c \
 	vc4_opt_algebraic.c \
 	vc4_opt_constant_folding.c \
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
new file mode 100644
index 0000000..a372a6c
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * Implements most of the fixed function fragment pipeline in shader code.
+ *
+ * VC4 doesn't have any hardware support for blending, alpha test, logic ops,
+ * or color mask.  Instead, you read the current contents of the destination
+ * from the tile buffer after having waited for the scoreboard (which is
+ * handled by vc4_qpu_emit.c), then do math using your output color and that
+ * destination value, and update the output color appropriately.
+ */
+
+/**
+ * Lowers fixed-function blending to a load of the destination color and a
+ * series of ALU operations before the store of the output.
+ */
+#include "util/u_format.h"
+#include "vc4_qir.h"
+#include "glsl/nir/nir_builder.h"
+#include "vc4_context.h"
+
+/** Emits a load of the previous fragment color from the tile buffer. */
+static nir_ssa_def *
+vc4_nir_get_dst_color(nir_builder *b)
+{
+        nir_intrinsic_instr *load =
+                nir_intrinsic_instr_create(b->shader,
+                                           nir_intrinsic_load_input);
+        load->num_components = 1;
+        load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT;
+        nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
+        nir_builder_instr_insert(b, &load->instr);
+        return &load->dest.ssa;
+}
+
+static  nir_ssa_def *
+vc4_nir_srgb_decode(nir_builder *b, nir_ssa_def *srgb)
+{
+        nir_ssa_def *is_low = nir_flt(b, srgb, nir_imm_float(b, 0.04045));
+        nir_ssa_def *low = nir_fmul(b, srgb, nir_imm_float(b, 1.0 / 12.92));
+        nir_ssa_def *high = nir_fpow(b,
+                                     nir_fmul(b,
+                                              nir_fadd(b, srgb,
+                                                       nir_imm_float(b, 0.055)),
+                                              nir_imm_float(b, 1.0 / 1.055)),
+                                     nir_imm_float(b, 2.4));
+
+        return nir_bcsel(b, is_low, low, high);
+}
+
+static  nir_ssa_def *
+vc4_nir_srgb_encode(nir_builder *b, nir_ssa_def *linear)
+{
+        nir_ssa_def *is_low = nir_flt(b, linear, nir_imm_float(b, 0.0031308));
+        nir_ssa_def *low = nir_fmul(b, linear, nir_imm_float(b, 12.92));
+        nir_ssa_def *high = nir_fsub(b,
+                                     nir_fmul(b,
+                                              nir_imm_float(b, 1.055),
+                                              nir_fpow(b,
+                                                       linear,
+                                                       nir_imm_float(b, 0.41666))),
+                                     nir_imm_float(b, 0.055));
+
+        return nir_bcsel(b, is_low, low, high);
+}
+
+static nir_ssa_def *
+vc4_blend_channel(nir_builder *b,
+                  nir_ssa_def **src,
+                  nir_ssa_def **dst,
+                  unsigned factor,
+                  int channel)
+{
+        switch(factor) {
+        case PIPE_BLENDFACTOR_ONE:
+                return nir_imm_float(b, 1.0);
+        case PIPE_BLENDFACTOR_SRC_COLOR:
+                return src[channel];
+        case PIPE_BLENDFACTOR_SRC_ALPHA:
+                return src[3];
+        case PIPE_BLENDFACTOR_DST_ALPHA:
+                return dst[3];
+        case PIPE_BLENDFACTOR_DST_COLOR:
+                return dst[channel];
+        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+                if (channel != 3) {
+                        return nir_fmin(b,
+                                        src[3],
+                                        nir_fsub(b,
+                                                 nir_imm_float(b, 1.0),
+                                                 dst[3]));
+                } else {
+                        return nir_imm_float(b, 1.0);
+                }
+        case PIPE_BLENDFACTOR_CONST_COLOR:
+                return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel);
+        case PIPE_BLENDFACTOR_CONST_ALPHA:
+                return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W);
+        case PIPE_BLENDFACTOR_ZERO:
+                return nir_imm_float(b, 0.0);
+        case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+                return nir_fsub(b, nir_imm_float(b, 1.0), src[channel]);
+        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+                return nir_fsub(b, nir_imm_float(b, 1.0), src[3]);
+        case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+                return nir_fsub(b, nir_imm_float(b, 1.0), dst[3]);
+        case PIPE_BLENDFACTOR_INV_DST_COLOR:
+                return nir_fsub(b, nir_imm_float(b, 1.0), dst[channel]);
+        case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+                return nir_fsub(b, nir_imm_float(b, 1.0),
+                                vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel));
+        case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+                return nir_fsub(b, nir_imm_float(b, 1.0),
+                                vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W));
+
+        default:
+        case PIPE_BLENDFACTOR_SRC1_COLOR:
+        case PIPE_BLENDFACTOR_SRC1_ALPHA:
+        case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+        case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+                /* Unsupported. */
+                fprintf(stderr, "Unknown blend factor %d\n", factor);
+                return nir_imm_float(b, 1.0);
+        }
+}
+
+static nir_ssa_def *
+vc4_blend_func(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst,
+               unsigned func)
+{
+        switch (func) {
+        case PIPE_BLEND_ADD:
+                return nir_fadd(b, src, dst);
+        case PIPE_BLEND_SUBTRACT:
+                return nir_fsub(b, src, dst);
+        case PIPE_BLEND_REVERSE_SUBTRACT:
+                return nir_fsub(b, dst, src);
+        case PIPE_BLEND_MIN:
+                return nir_fmin(b, src, dst);
+        case PIPE_BLEND_MAX:
+                return nir_fmax(b, src, dst);
+
+        default:
+                /* Unsupported. */
+                fprintf(stderr, "Unknown blend func %d\n", func);
+                return src;
+
+        }
+}
+
+static void
+vc4_do_blending(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result,
+                nir_ssa_def **src_color, nir_ssa_def **dst_color)
+{
+        struct pipe_rt_blend_state *blend = &c->fs_key->blend;
+
+        if (!blend->blend_enable) {
+                for (int i = 0; i < 4; i++)
+                        result[i] = src_color[i];
+                return;
+        }
+
+        /* Clamp the src color to [0, 1].  Dest is already clamped. */
+        for (int i = 0; i < 4; i++)
+                src_color[i] = nir_fsat(b, src_color[i]);
+
+        nir_ssa_def *src_blend[4], *dst_blend[4];
+        for (int i = 0; i < 4; i++) {
+                int src_factor = ((i != 3) ? blend->rgb_src_factor :
+                                  blend->alpha_src_factor);
+                int dst_factor = ((i != 3) ? blend->rgb_dst_factor :
+                                  blend->alpha_dst_factor);
+                src_blend[i] = nir_fmul(b, src_color[i],
+                                        vc4_blend_channel(b,
+                                                          src_color, dst_color,
+                                                          src_factor, i));
+                dst_blend[i] = nir_fmul(b, dst_color[i],
+                                        vc4_blend_channel(b,
+                                                          src_color, dst_color,
+                                                          dst_factor, i));
+        }
+
+        for (int i = 0; i < 4; i++) {
+                result[i] = vc4_blend_func(b, src_blend[i], dst_blend[i],
+                                           ((i != 3) ? blend->rgb_func :
+                                            blend->alpha_func));
+        }
+}
+
+static nir_ssa_def *
+vc4_logicop(nir_builder *b, int logicop_func,
+            nir_ssa_def *src, nir_ssa_def *dst)
+{
+        switch (logicop_func) {
+        case PIPE_LOGICOP_CLEAR:
+                return nir_imm_int(b, 0);
+        case PIPE_LOGICOP_NOR:
+                return nir_inot(b, nir_ior(b, src, dst));
+        case PIPE_LOGICOP_AND_INVERTED:
+                return nir_iand(b, nir_inot(b, src), dst);
+        case PIPE_LOGICOP_COPY_INVERTED:
+                return nir_inot(b, src);
+        case PIPE_LOGICOP_AND_REVERSE:
+                return nir_iand(b, src, nir_inot(b, dst));
+        case PIPE_LOGICOP_INVERT:
+                return nir_inot(b, dst);
+        case PIPE_LOGICOP_XOR:
+                return nir_ixor(b, src, dst);
+        case PIPE_LOGICOP_NAND:
+                return nir_inot(b, nir_iand(b, src, dst));
+        case PIPE_LOGICOP_AND:
+                return nir_iand(b, src, dst);
+        case PIPE_LOGICOP_EQUIV:
+                return nir_inot(b, nir_ixor(b, src, dst));
+        case PIPE_LOGICOP_NOOP:
+                return dst;
+        case PIPE_LOGICOP_OR_INVERTED:
+                return nir_ior(b, nir_inot(b, src), dst);
+        case PIPE_LOGICOP_OR_REVERSE:
+                return nir_ior(b, src, nir_inot(b, dst));
+        case PIPE_LOGICOP_OR:
+                return nir_ior(b, src, dst);
+        case PIPE_LOGICOP_SET:
+                return nir_imm_int(b, ~0);
+        default:
+                fprintf(stderr, "Unknown logic op %d\n", logicop_func);
+                /* FALLTHROUGH */
+        case PIPE_LOGICOP_COPY:
+                return src;
+        }
+}
+
+static nir_ssa_def *
+vc4_nir_pipe_compare_func(nir_builder *b, int func,
+                          nir_ssa_def *src0, nir_ssa_def *src1)
+{
+        switch (func) {
+        default:
+                fprintf(stderr, "Unknown compare func %d\n", func);
+                /* FALLTHROUGH */
+        case PIPE_FUNC_NEVER:
+                return nir_imm_int(b, 0);
+        case PIPE_FUNC_ALWAYS:
+                return nir_imm_int(b, ~0);
+        case PIPE_FUNC_EQUAL:
+                return nir_feq(b, src0, src1);
+        case PIPE_FUNC_NOTEQUAL:
+                return nir_fne(b, src0, src1);
+        case PIPE_FUNC_GREATER:
+                return nir_flt(b, src1, src0);
+        case PIPE_FUNC_GEQUAL:
+                return nir_fge(b, src0, src1);
+        case PIPE_FUNC_LESS:
+                return nir_flt(b, src0, src1);
+        case PIPE_FUNC_LEQUAL:
+                return nir_fge(b, src1, src0);
+        }
+}
+
+static void
+vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b,
+                                nir_ssa_def *alpha)
+{
+        if (!c->fs_key->alpha_test)
+                return;
+
+        nir_ssa_def *alpha_ref =
+                vc4_nir_get_state_uniform(b, QUNIFORM_ALPHA_REF);
+        nir_ssa_def *condition =
+                vc4_nir_pipe_compare_func(b, c->fs_key->alpha_test_func,
+                                          alpha, alpha_ref);
+
+        nir_intrinsic_instr *discard =
+                nir_intrinsic_instr_create(b->shader,
+                                           nir_intrinsic_discard_if);
+        discard->num_components = 1;
+        discard->src[0] = nir_src_for_ssa(nir_inot(b, condition));
+        nir_builder_instr_insert(b, &discard->instr);
+}
+
+static void
+vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
+                          nir_intrinsic_instr *intr)
+{
+        enum pipe_format color_format = c->fs_key->color_format;
+        const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
+
+        /* Pull out the float src/dst color components. */
+        nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b);
+        nir_ssa_def *dst_vec4 = nir_unpack_unorm_4x8(b, packed_dst_color);
+        nir_ssa_def *src_color[4], *unpacked_dst_color[4];
+        for (unsigned i = 0; i < 4; i++) {
+                src_color[i] = nir_swizzle(b, intr->src[0].ssa, &i, 1, false);
+                unpacked_dst_color[i] = nir_swizzle(b, dst_vec4, &i, 1, false);
+        }
+
+        /* Unswizzle the destination color. */
+        nir_ssa_def *dst_color[4];
+        for (unsigned i = 0; i < 4; i++) {
+                dst_color[i] = vc4_nir_get_swizzled_channel(b,
+                                                            unpacked_dst_color,
+                                                            format_swiz[i]);
+        }
+
+        vc4_nir_emit_alpha_test_discard(c, b, src_color[3]);
+
+        /* Turn dst color to linear. */
+        if (util_format_is_srgb(color_format)) {
+                for (int i = 0; i < 3; i++)
+                        dst_color[i] = vc4_nir_srgb_decode(b, dst_color[i]);
+        }
+
+        nir_ssa_def *blend_color[4];
+        vc4_do_blending(c, b, blend_color, src_color, dst_color);
+
+        /* sRGB encode the output color */
+        if (util_format_is_srgb(color_format)) {
+                for (int i = 0; i < 3; i++)
+                        blend_color[i] = vc4_nir_srgb_encode(b, blend_color[i]);
+        }
+
+        nir_ssa_def *swizzled_outputs[4];
+        for (int i = 0; i < 4; i++) {
+                swizzled_outputs[i] =
+                        vc4_nir_get_swizzled_channel(b, blend_color,
+                                                     format_swiz[i]);
+        }
+
+        nir_ssa_def *packed_color =
+                nir_pack_unorm_4x8(b,
+                                   nir_vec4(b,
+                                            swizzled_outputs[0],
+                                            swizzled_outputs[1],
+                                            swizzled_outputs[2],
+                                            swizzled_outputs[3]));
+
+        packed_color = vc4_logicop(b, c->fs_key->logicop_func,
+                                   packed_color, packed_dst_color);
+
+        /* If the bit isn't set in the color mask, then just return the
+         * original dst color, instead.
+         */
+        uint32_t colormask = 0xffffffff;
+        for (int i = 0; i < 4; i++) {
+                if (format_swiz[i] < 4 &&
+                    !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) {
+                        colormask &= ~(0xff << (i * 8));
+                }
+        }
+        packed_color = nir_ior(b,
+                               nir_iand(b, packed_color,
+                                        nir_imm_int(b, colormask)),
+                               nir_iand(b, packed_dst_color,
+                                        nir_imm_int(b, ~colormask)));
+
+        /* Turn the old vec4 output into a store of the packed color. */
+        nir_instr_rewrite_src(&intr->instr, &intr->src[0],
+                              nir_src_for_ssa(packed_color));
+        intr->num_components = 1;
+}
+
+static bool
+vc4_nir_lower_blend_block(nir_block *block, void *state)
+{
+        struct vc4_compile *c = state;
+
+        nir_foreach_instr(block, instr) {
+                if (instr->type != nir_instr_type_intrinsic)
+                        continue;
+                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+                if (intr->intrinsic != nir_intrinsic_store_output)
+                        continue;
+
+                nir_variable *output_var = NULL;
+                foreach_list_typed(nir_variable, var, node, &c->s->outputs) {
+                        if (var->data.driver_location == intr->const_index[0]) {
+                                output_var = var;
+                                break;
+                        }
+                }
+                assert(output_var);
+                unsigned semantic_name = output_var->data.location;
+
+                if (semantic_name != TGSI_SEMANTIC_COLOR)
+                        continue;
+
+                nir_function_impl *impl =
+                        nir_cf_node_get_function(&block->cf_node);
+                nir_builder b;
+                nir_builder_init(&b, impl);
+                nir_builder_insert_before_instr(&b, &intr->instr);
+                vc4_nir_lower_blend_instr(c, &b, intr);
+        }
+        return true;
+}
+
+void
+vc4_nir_lower_blend(struct vc4_compile *c)
+{
+        nir_foreach_overload(c->s, overload) {
+                if (overload->impl) {
+                        nir_foreach_block(overload->impl,
+                                          vc4_nir_lower_blend_block, c);
+
+                        nir_metadata_preserve(overload->impl,
+                                              nir_metadata_block_index |
+                                              nir_metadata_dominance);
+                }
+        }
+}
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index ffc120e..229d411 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -56,11 +56,14 @@ static void
 vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
                     nir_intrinsic_instr *intr)
 {
-        /* All TGSI-to-NIR inputs are vec4. */
-        assert(intr->num_components == 4);
-
         nir_builder_insert_before_instr(b, &intr->instr);
 
+        if (c->stage == QSTAGE_FRAG && intr->const_index[0] ==
+            VC4_NIR_TLB_COLOR_READ_INPUT) {
+                /* This doesn't need any lowering. */
+                return;
+        }
+
         nir_variable *input_var = NULL;
         foreach_list_typed(nir_variable, var, node, &c->s->inputs) {
                 if (var->data.driver_location == intr->const_index[0]) {
@@ -72,6 +75,9 @@ vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
         int semantic_name = input_var->data.location;
         int semantic_index = input_var->data.index;
 
+        /* All TGSI-to-NIR inputs are vec4. */
+        assert(intr->num_components == 4);
+
         /* Generate scalar loads equivalent to the original VEC4. */
         nir_ssa_def *dests[4];
         for (unsigned i = 0; i < intr->num_components; i++) {
@@ -145,6 +151,12 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
                 return;
         }
 
+        /* Color output is lowered by vc4_nir_lower_blend(). */
+        if (c->stage == QSTAGE_FRAG && semantic_name == TGSI_SEMANTIC_COLOR) {
+                intr->const_index[0] *= 4;
+                return;
+        }
+
         /* All TGSI-to-NIR outputs are VEC4. */
         assert(intr->num_components == 4);
 
@@ -170,7 +182,11 @@ static void
 vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b,
                       nir_intrinsic_instr *intr)
 {
-        /* All TGSI-to-NIR uniform loads are vec4. */
+        /* All TGSI-to-NIR uniform loads are vec4, but we may create dword
+         * loads in our lowering passes.
+         */
+        if (intr->num_components == 1)
+                return;
         assert(intr->num_components == 4);
 
         nir_builder_insert_before_instr(b, &intr->instr);
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index fb1726c..13c4721 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -123,6 +123,26 @@ nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
         return &intr->dest.ssa;
 }
 
+nir_ssa_def *
+vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
+{
+        switch (swiz) {
+        default:
+        case UTIL_FORMAT_SWIZZLE_NONE:
+                fprintf(stderr, "warning: unknown swizzle\n");
+                /* FALLTHROUGH */
+        case UTIL_FORMAT_SWIZZLE_0:
+                return nir_imm_float(b, 0.0);
+        case UTIL_FORMAT_SWIZZLE_1:
+                return nir_imm_float(b, 1.0);
+        case UTIL_FORMAT_SWIZZLE_X:
+        case UTIL_FORMAT_SWIZZLE_Y:
+        case UTIL_FORMAT_SWIZZLE_Z:
+        case UTIL_FORMAT_SWIZZLE_W:
+                return srcs[swiz];
+        }
+}
+
 static struct qreg *
 ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
 {
@@ -259,22 +279,6 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb)
 }
 
 static struct qreg
-qir_srgb_encode(struct vc4_compile *c, struct qreg linear)
-{
-        struct qreg low = qir_FMUL(c, linear, qir_uniform_f(c, 12.92));
-        struct qreg high = qir_FSUB(c,
-                                    qir_FMUL(c,
-                                             qir_uniform_f(c, 1.055),
-                                             qir_POW(c,
-                                                     linear,
-                                                     qir_uniform_f(c, 0.41666))),
-                                    qir_uniform_f(c, 0.055));
-
-        qir_SF(c, qir_FSUB(c, linear, qir_uniform_f(c, 0.0031308)));
-        return qir_SEL_X_Y_NS(c, low, high);
-}
-
-static struct qreg
 ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
 {
         struct qreg src0_hi = qir_SHR(c, src0,
@@ -834,6 +838,32 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 return;
         }
 
+        if (instr->op == nir_op_pack_unorm_4x8) {
+                struct qreg result;
+                for (int i = 0; i < 4; i++) {
+                        struct qreg src = ntq_get_src(c, instr->src[0].src,
+                                                      instr->src[0].swizzle[i]);
+                        if (i == 0)
+                                result = qir_PACK_8888_F(c, src);
+                        else
+                                result = qir_PACK_8_F(c, result, src, i);
+                }
+                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
+                *dest = result;
+                return;
+        }
+
+        if (instr->op == nir_op_unpack_unorm_4x8) {
+                struct qreg src = ntq_get_src(c, instr->src[0].src,
+                                              instr->src[0].swizzle[0]);
+                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
+                for (int i = 0; i < 4; i++) {
+                        if (instr->dest.write_mask & (1 << i))
+                                dest[i] = qir_UNPACK_8_F(c, src, i);
+                }
+                return;
+        }
+
         /* General case: We can just grab the one used channel per src. */
         struct qreg src[nir_op_infos[instr->op].num_inputs];
         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
@@ -1036,161 +1066,6 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         }
 }
 
-static struct qreg
-vc4_blend_channel(struct vc4_compile *c,
-                  struct qreg *dst,
-                  struct qreg *src,
-                  struct qreg val,
-                  unsigned factor,
-                  int channel)
-{
-        switch(factor) {
-        case PIPE_BLENDFACTOR_ONE:
-                return val;
-        case PIPE_BLENDFACTOR_SRC_COLOR:
-                return qir_FMUL(c, val, src[channel]);
-        case PIPE_BLENDFACTOR_SRC_ALPHA:
-                return qir_FMUL(c, val, src[3]);
-        case PIPE_BLENDFACTOR_DST_ALPHA:
-                return qir_FMUL(c, val, dst[3]);
-        case PIPE_BLENDFACTOR_DST_COLOR:
-                return qir_FMUL(c, val, dst[channel]);
-        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-                if (channel != 3) {
-                        return qir_FMUL(c,
-                                        val,
-                                        qir_FMIN(c,
-                                                 src[3],
-                                                 qir_FSUB(c,
-                                                          qir_uniform_f(c, 1.0),
-                                                          dst[3])));
-                } else {
-                        return val;
-                }
-        case PIPE_BLENDFACTOR_CONST_COLOR:
-                return qir_FMUL(c, val,
-                                qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR,
-                                            channel));
-        case PIPE_BLENDFACTOR_CONST_ALPHA:
-                return qir_FMUL(c, val,
-                                qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, 3));
-        case PIPE_BLENDFACTOR_ZERO:
-                return qir_uniform_f(c, 0.0);
-        case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-                return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                                 src[channel]));
-        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-                return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                                 src[3]));
-        case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-                return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                                 dst[3]));
-        case PIPE_BLENDFACTOR_INV_DST_COLOR:
-                return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                                 dst[channel]));
-        case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-                return qir_FMUL(c, val,
-                                qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                         qir_uniform(c,
-                                                     QUNIFORM_BLEND_CONST_COLOR,
-                                                     channel)));
-        case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-                return qir_FMUL(c, val,
-                                qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                         qir_uniform(c,
-                                                     QUNIFORM_BLEND_CONST_COLOR,
-                                                     3)));
-
-        default:
-        case PIPE_BLENDFACTOR_SRC1_COLOR:
-        case PIPE_BLENDFACTOR_SRC1_ALPHA:
-        case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-        case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-                /* Unsupported. */
-                fprintf(stderr, "Unknown blend factor %d\n", factor);
-                return val;
-        }
-}
-
-static struct qreg
-vc4_blend_func(struct vc4_compile *c,
-               struct qreg src, struct qreg dst,
-               unsigned func)
-{
-        switch (func) {
-        case PIPE_BLEND_ADD:
-                return qir_FADD(c, src, dst);
-        case PIPE_BLEND_SUBTRACT:
-                return qir_FSUB(c, src, dst);
-        case PIPE_BLEND_REVERSE_SUBTRACT:
-                return qir_FSUB(c, dst, src);
-        case PIPE_BLEND_MIN:
-                return qir_FMIN(c, src, dst);
-        case PIPE_BLEND_MAX:
-                return qir_FMAX(c, src, dst);
-
-        default:
-                /* Unsupported. */
-                fprintf(stderr, "Unknown blend func %d\n", func);
-                return src;
-
-        }
-}
-
-/**
- * Implements fixed function blending in shader code.
- *
- * VC4 doesn't have any hardware support for blending.  Instead, you read the
- * current contents of the destination from the tile buffer after having
- * waited for the scoreboard (which is handled by vc4_qpu_emit.c), then do
- * math using your output color and that destination value, and update the
- * output color appropriately.
- */
-static void
-vc4_blend(struct vc4_compile *c, struct qreg *result,
-          struct qreg *dst_color, struct qreg *src_color)
-{
-        struct pipe_rt_blend_state *blend = &c->fs_key->blend;
-
-        if (!blend->blend_enable) {
-                for (int i = 0; i < 4; i++)
-                        result[i] = src_color[i];
-                return;
-        }
-
-        for (int i = 0; i < 4; i++)
-                src_color[i] = qir_SAT(c, src_color[i]);
-
-        struct qreg src_blend[4], dst_blend[4];
-        for (int i = 0; i < 3; i++) {
-                src_blend[i] = vc4_blend_channel(c,
-                                                 dst_color, src_color,
-                                                 src_color[i],
-                                                 blend->rgb_src_factor, i);
-                dst_blend[i] = vc4_blend_channel(c,
-                                                 dst_color, src_color,
-                                                 dst_color[i],
-                                                 blend->rgb_dst_factor, i);
-        }
-        src_blend[3] = vc4_blend_channel(c,
-                                         dst_color, src_color,
-                                         src_color[3],
-                                         blend->alpha_src_factor, 3);
-        dst_blend[3] = vc4_blend_channel(c,
-                                         dst_color, src_color,
-                                         dst_color[3],
-                                         blend->alpha_dst_factor, 3);
-
-        for (int i = 0; i < 3; i++) {
-                result[i] = vc4_blend_func(c,
-                                           src_blend[i], dst_blend[i],
-                                           blend->rgb_func);
-        }
-        result[3] = vc4_blend_func(c,
-                                   src_blend[3], dst_blend[3],
-                                   blend->alpha_func);
-}
-
 static void
 clip_distance_discard(struct vc4_compile *c)
 {
@@ -1214,216 +1089,16 @@ clip_distance_discard(struct vc4_compile *c)
 }
 
 static void
-alpha_test_discard(struct vc4_compile *c)
-{
-        struct qreg src_alpha;
-        struct qreg alpha_ref = qir_uniform(c, QUNIFORM_ALPHA_REF, 0);
-
-        if (!c->fs_key->alpha_test)
-                return;
-
-        if (c->output_color_index != -1)
-                src_alpha = c->outputs[c->output_color_index + 3];
-        else
-                src_alpha = qir_uniform_f(c, 1.0);
-
-        if (c->discard.file == QFILE_NULL)
-                c->discard = qir_uniform_ui(c, 0);
-
-        switch (c->fs_key->alpha_test_func) {
-        case PIPE_FUNC_NEVER:
-                c->discard = qir_uniform_ui(c, ~0);
-                break;
-        case PIPE_FUNC_ALWAYS:
-                break;
-        case PIPE_FUNC_EQUAL:
-                qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
-                c->discard = qir_SEL_X_Y_ZS(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_NOTEQUAL:
-                qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
-                c->discard = qir_SEL_X_Y_ZC(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_GREATER:
-                qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
-                c->discard = qir_SEL_X_Y_NC(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_GEQUAL:
-                qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha));
-                c->discard = qir_SEL_X_Y_NS(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_LESS:
-                qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
-                c->discard = qir_SEL_X_Y_NS(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_LEQUAL:
-                qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha));
-                c->discard = qir_SEL_X_Y_NC(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        }
-}
-
-static struct qreg
-vc4_logicop(struct vc4_compile *c, struct qreg src, struct qreg dst)
-{
-        switch (c->fs_key->logicop_func) {
-        case PIPE_LOGICOP_CLEAR:
-                return qir_uniform_f(c, 0.0);
-        case PIPE_LOGICOP_NOR:
-                return qir_NOT(c, qir_OR(c, src, dst));
-        case PIPE_LOGICOP_AND_INVERTED:
-                return qir_AND(c, qir_NOT(c, src), dst);
-        case PIPE_LOGICOP_COPY_INVERTED:
-                return qir_NOT(c, src);
-        case PIPE_LOGICOP_AND_REVERSE:
-                return qir_AND(c, src, qir_NOT(c, dst));
-        case PIPE_LOGICOP_INVERT:
-                return qir_NOT(c, dst);
-        case PIPE_LOGICOP_XOR:
-                return qir_XOR(c, src, dst);
-        case PIPE_LOGICOP_NAND:
-                return qir_NOT(c, qir_AND(c, src, dst));
-        case PIPE_LOGICOP_AND:
-                return qir_AND(c, src, dst);
-        case PIPE_LOGICOP_EQUIV:
-                return qir_NOT(c, qir_XOR(c, src, dst));
-        case PIPE_LOGICOP_NOOP:
-                return dst;
-        case PIPE_LOGICOP_OR_INVERTED:
-                return qir_OR(c, qir_NOT(c, src), dst);
-        case PIPE_LOGICOP_OR_REVERSE:
-                return qir_OR(c, src, qir_NOT(c, dst));
-        case PIPE_LOGICOP_OR:
-                return qir_OR(c, src, dst);
-        case PIPE_LOGICOP_SET:
-                return qir_uniform_ui(c, ~0);
-        case PIPE_LOGICOP_COPY:
-        default:
-                return src;
-        }
-}
-
-/**
- * Applies the GL blending pipeline and returns the packed (8888) output
- * color.
- */
-static struct qreg
-blend_pipeline(struct vc4_compile *c)
-{
-        enum pipe_format color_format = c->fs_key->color_format;
-        const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
-        struct qreg tlb_read_color[4] = { c->undef, c->undef, c->undef, c->undef };
-        struct qreg dst_color[4] = { c->undef, c->undef, c->undef, c->undef };
-        struct qreg linear_dst_color[4] = { c->undef, c->undef, c->undef, c->undef };
-        struct qreg packed_dst_color = c->undef;
-
-        if (c->fs_key->blend.blend_enable ||
-            c->fs_key->blend.colormask != 0xf ||
-            c->fs_key->logicop_func != PIPE_LOGICOP_COPY) {
-                packed_dst_color = qir_TLB_COLOR_READ(c);
-                for (int i = 0; i < 4; i++)
-                        tlb_read_color[i] = qir_UNPACK_8_F(c,
-                                                           packed_dst_color, i);
-                for (int i = 0; i < 4; i++) {
-                        dst_color[i] = get_swizzled_channel(c,
-                                                            tlb_read_color,
-                                                            format_swiz[i]);
-                        if (util_format_is_srgb(color_format) && i != 3) {
-                                linear_dst_color[i] =
-                                        qir_srgb_decode(c, dst_color[i]);
-                        } else {
-                                linear_dst_color[i] = dst_color[i];
-                        }
-                }
-        }
-
-        struct qreg undef_array[4] = { c->undef, c->undef, c->undef, c->undef };
-        const struct qreg *output_colors = (c->output_color_index != -1 ?
-                                            c->outputs + c->output_color_index :
-                                            undef_array);
-        struct qreg blend_src_color[4];
-        for (int i = 0; i < 4; i++)
-                blend_src_color[i] = output_colors[i];
-
-        struct qreg blend_color[4];
-        vc4_blend(c, blend_color, linear_dst_color, blend_src_color);
-
-        if (util_format_is_srgb(color_format)) {
-                for (int i = 0; i < 3; i++)
-                        blend_color[i] = qir_srgb_encode(c, blend_color[i]);
-        }
-
-        /* Debug: Sometimes you're getting a black output and just want to see
-         * if the FS is getting executed at all.  Spam magenta into the color
-         * output.
-         */
-        if (0) {
-                blend_color[0] = qir_uniform_f(c, 1.0);
-                blend_color[1] = qir_uniform_f(c, 0.0);
-                blend_color[2] = qir_uniform_f(c, 1.0);
-                blend_color[3] = qir_uniform_f(c, 0.5);
-        }
-
-        struct qreg swizzled_outputs[4];
-        for (int i = 0; i < 4; i++) {
-                swizzled_outputs[i] = get_swizzled_channel(c, blend_color,
-                                                           format_swiz[i]);
-        }
-
-        struct qreg packed_color = c->undef;
-        for (int i = 0; i < 4; i++) {
-                if (swizzled_outputs[i].file == QFILE_NULL)
-                        continue;
-                if (packed_color.file == QFILE_NULL) {
-                        packed_color = qir_PACK_8888_F(c, swizzled_outputs[i]);
-                } else {
-                        packed_color = qir_PACK_8_F(c,
-                                                    packed_color,
-                                                    swizzled_outputs[i],
-                                                    i);
-                }
-        }
-
-        if (packed_color.file == QFILE_NULL)
-                packed_color = qir_uniform_ui(c, 0);
-
-        if (c->fs_key->logicop_func != PIPE_LOGICOP_COPY) {
-                packed_color = vc4_logicop(c, packed_color, packed_dst_color);
-        }
-
-        /* If the bit isn't set in the color mask, then just return the
-         * original dst color, instead.
-         */
-        uint32_t colormask = 0xffffffff;
-        for (int i = 0; i < 4; i++) {
-                if (format_swiz[i] < 4 &&
-                    !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) {
-                        colormask &= ~(0xff << (i * 8));
-                }
-        }
-        if (colormask != 0xffffffff) {
-                packed_color = qir_OR(c,
-                                      qir_AND(c, packed_color,
-                                              qir_uniform_ui(c, colormask)),
-                                      qir_AND(c, packed_dst_color,
-                                              qir_uniform_ui(c, ~colormask)));
-        }
-
-        return packed_color;
-}
-
-static void
 emit_frag_end(struct vc4_compile *c)
 {
         clip_distance_discard(c);
-        alpha_test_discard(c);
-        struct qreg color = blend_pipeline(c);
+
+        struct qreg color;
+        if (c->output_color_index != -1) {
+                color = c->outputs[c->output_color_index];
+        } else {
+                color = qir_uniform_ui(c, 0);
+        }
 
         if (c->discard.file != QFILE_NULL)
                 qir_TLB_DISCARD_SETUP(c, c->discard);
@@ -1839,8 +1514,11 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
 
         case nir_intrinsic_load_input:
                 assert(instr->num_components == 1);
-                *dest = c->inputs[instr->const_index[0]];
-
+                if (instr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) {
+                        *dest = qir_TLB_COLOR_READ(c);
+                } else {
+                        *dest = c->inputs[instr->const_index[0]];
+                }
                 break;
 
         case nir_intrinsic_store_output:
@@ -2052,6 +1730,8 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         c->s = tgsi_to_nir(tokens, &nir_options);
         nir_opt_global_to_local(c->s);
         nir_convert_to_ssa(c->s);
+        if (stage == QSTAGE_FRAG)
+                vc4_nir_lower_blend(c);
         vc4_nir_lower_io(c);
         nir_lower_idiv(c->s);
         nir_lower_load_const_to_scalar(c->s);
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 57e25de..cade795 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -39,6 +39,8 @@
 #include "vc4_screen.h"
 #include "pipe/p_state.h"
 
+struct nir_builder;
+
 enum qfile {
         QFILE_NULL,
         QFILE_TEMP,
@@ -242,7 +244,11 @@ enum quniform_contents {
 
         QUNIFORM_TEXTURE_BORDER_COLOR,
 
-        QUNIFORM_BLEND_CONST_COLOR,
+        QUNIFORM_BLEND_CONST_COLOR_X,
+        QUNIFORM_BLEND_CONST_COLOR_Y,
+        QUNIFORM_BLEND_CONST_COLOR_Z,
+        QUNIFORM_BLEND_CONST_COLOR_W,
+
         QUNIFORM_STENCIL,
 
         QUNIFORM_ALPHA_REF,
@@ -414,6 +420,11 @@ struct vc4_compile {
         uint32_t variant_id;
 };
 
+/* Special nir_load_input intrinsic index for loading the current TLB
+ * destination color.
+ */
+#define VC4_NIR_TLB_COLOR_READ_INPUT		2000000000
+
 /* Special offset for nir_load_uniform values to get a QUNIFORM_*
  * state-dependent value.
  */
@@ -458,9 +469,12 @@ bool qir_opt_cse(struct vc4_compile *c);
 bool qir_opt_dead_code(struct vc4_compile *c);
 bool qir_opt_small_immediates(struct vc4_compile *c);
 bool qir_opt_vpm_writes(struct vc4_compile *c);
+void vc4_nir_lower_blend(struct vc4_compile *c);
 void vc4_nir_lower_io(struct vc4_compile *c);
 nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
                                        enum quniform_contents contents);
+nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b,
+                                          nir_ssa_def **srcs, int swiz);
 void qir_lower_uniforms(struct vc4_compile *c);
 
 void qpu_schedule_instructions(struct vc4_compile *c);
diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
index 3bf6672..85d6998 100644
--- a/src/gallium/drivers/vc4/vc4_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -257,9 +257,14 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                                                          uinfo->data[i]));
                         break;
 
-                case QUNIFORM_BLEND_CONST_COLOR:
+                case QUNIFORM_BLEND_CONST_COLOR_X:
+                case QUNIFORM_BLEND_CONST_COLOR_Y:
+                case QUNIFORM_BLEND_CONST_COLOR_Z:
+                case QUNIFORM_BLEND_CONST_COLOR_W:
                         cl_aligned_f(&uniforms,
-                                     CLAMP(vc4->blend_color.color[uinfo->data[i]], 0, 1));
+                                     CLAMP(vc4->blend_color.color[uinfo->contents[i] -
+                                                                  QUNIFORM_BLEND_CONST_COLOR_X],
+                                           0, 1));
                         break;
 
                 case QUNIFORM_STENCIL:
@@ -321,7 +326,10 @@ vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
                         dirty |= VC4_DIRTY_TEXSTATE;
                         break;
 
-                case QUNIFORM_BLEND_CONST_COLOR:
+                case QUNIFORM_BLEND_CONST_COLOR_X:
+                case QUNIFORM_BLEND_CONST_COLOR_Y:
+                case QUNIFORM_BLEND_CONST_COLOR_Z:
+                case QUNIFORM_BLEND_CONST_COLOR_W:
                         dirty |= VC4_DIRTY_BLEND_COLOR;
                         break;