[Mesa-dev] [PATCH 07/10] i965: Replace brw_wm_* with dumping code into the fs_visitor.

Mon Sep 24 17:06:46 PDT 2012

On 09/22/2012 02:04 PM, Eric Anholt wrote:
> This makes a giant pile of code newly dead.  It also fixes TXB on newer
> chipsets, which has been totally broken (I now have a piglit test for that).
> It passes the same set of Ian's ARB_fragment_program tests.  It also improves
> high-settings ETQW performance by 3.2 +/- 1.9% (n=3), thanks to better
> optimization and having 8-wide along with 16-wide shaders.
> ---
>  src/mesa/drivers/dri/i965/Makefile.sources   |    1 +
>  src/mesa/drivers/dri/i965/brw_fs.cpp         |   36 +-
>  src/mesa/drivers/dri/i965/brw_fs.h           |   30 +-
>  src/mesa/drivers/dri/i965/brw_fs_emit.cpp    |   22 +-
>  src/mesa/drivers/dri/i965/brw_fs_fp.cpp      |  781 ++++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp |    3 +-
>  src/mesa/drivers/dri/i965/brw_wm.c           |   58 +-
>  src/mesa/drivers/dri/i965/brw_wm_state.c     |   19 +-
>  src/mesa/drivers/dri/i965/gen6_wm_state.c    |    8 +-
>  src/mesa/drivers/dri/i965/gen7_wm_state.c    |    8 +-
>  10 files changed, 857 insertions(+), 109 deletions(-)
>  create mode 100644 src/mesa/drivers/dri/i965/brw_fs_fp.cpp

I think the LIT code may be broken (comments inline), and one comment is
wrong.  Assuming you fix (or refute) those, then patches 1-8 are:
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>

I haven't read through 9 and 10 yet, but I plan to soon.

> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
> index 3715b0f..edc2376 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.sources
> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
> @@ -119,6 +119,7 @@ i965_CXX_FILES = \
>  	brw_fs_cse.cpp \
>  	brw_fs_copy_propagation.cpp \
>  	brw_fs_emit.cpp \
> +	brw_fs_fp.cpp \
>  	brw_fs_live_variables.cpp \
>  	brw_fs_visitor.cpp \
>  	brw_fs_channel_expressions.cpp \
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index fea5980..2701413 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -1999,11 +1999,15 @@ fs_visitor::run()
>        /* Generate FS IR for main().  (the visitor only descends into
>         * functions called "main").
>         */
> -      foreach_list(node, &*shader->ir) {
> -	 ir_instruction *ir = (ir_instruction *)node;
> -	 base_ir = ir;
> -	 this->result = reg_undef;
> -	 ir->accept(this);
> +      if (shader) {
> +         foreach_list(node, &*shader->ir) {
> +            ir_instruction *ir = (ir_instruction *)node;
> +            base_ir = ir;
> +            this->result = reg_undef;
> +            ir->accept(this);
> +         }
> +      } else {
> +         emit_fragment_program_code();
>        }
>        if (failed)
>  	 return false;
> @@ -2084,24 +2088,26 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
>     bool start_busy = false;
>     float start_time = 0;
>  
> -   if (!prog)
> -      return false;
> -
>     if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
>        start_busy = (intel->batch.last_bo &&
>                      drm_intel_bo_busy(intel->batch.last_bo));
>        start_time = get_time();
>     }
>  
> -   struct brw_shader *shader =
> -     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
> -   if (!shader)
> -      return false;
> +   struct brw_shader *shader = NULL;
> +   if (prog)
> +      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
>  
>     if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
> -      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
> -      _mesa_print_ir(shader->ir, NULL);
> -      printf("\n\n");
> +      if (shader) {
> +         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
> +         _mesa_print_ir(shader->ir, NULL);
> +         printf("\n\n");
> +      } else {
> +         printf("ARB_fragment_program %d ir for native fragment shader\n",
> +                c->fp->program.Base.Id);
> +         _mesa_print_program(&c->fp->program.Base);
> +      }
>     }
>  
>     /* Now the main event: Visit the shader IR and generate our FS IR for it.
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index 9cb9590..9fbb8e5 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -177,7 +177,7 @@ public:
>     /** @{
>      * Annotation for the generated IR.  One of the two can be set.
>      */
> -   ir_instruction *ir;
> +   const void *ir;
>     const char *annotation;
>     /** @} */
>  };
> @@ -325,6 +325,29 @@ public:
>     void emit_if_gen6(ir_if *ir);
>     void emit_unspill(fs_inst *inst, fs_reg reg, uint32_t spill_offset);
>  
> +   void emit_fragment_program_code();
> +   void setup_fp_regs();
> +   fs_reg get_fp_src_reg(const prog_src_register *src);
> +   fs_reg get_fp_dst_reg(const prog_dst_register *dst);
> +   void emit_fp_alu1(enum opcode opcode,
> +                     const struct prog_instruction *fpi,
> +                     fs_reg dst, fs_reg src);
> +   void emit_fp_alu2(enum opcode opcode,
> +                     const struct prog_instruction *fpi,
> +                     fs_reg dst, fs_reg src0, fs_reg src1);
> +   void emit_fp_scalar_write(const struct prog_instruction *fpi,
> +                             fs_reg dst, fs_reg src);
> +   void emit_fp_scalar_math(enum opcode opcode,
> +                            const struct prog_instruction *fpi,
> +                            fs_reg dst, fs_reg src);
> +
> +   void emit_fp_minmax(const struct prog_instruction *fpi,
> +                       fs_reg dst, fs_reg src0, fs_reg src1);
> +
> +   void emit_fp_sop(uint32_t conditional_mod,
> +                    const struct prog_instruction *fpi,
> +                    fs_reg dst, fs_reg src0, fs_reg src1, fs_reg one);
> +
>     void emit_color_write(int target, int index, int first_color_mrf);
>     void emit_fb_writes();
>     bool try_rewrite_rhs_to_dst(ir_assignment *ir,
> @@ -382,9 +405,12 @@ public:
>     int max_grf;
>     int urb_setup[FRAG_ATTRIB_MAX];
>  
> +   fs_reg *fp_temp_regs;
> +   fs_reg *fp_input_regs;
> +
>     /** @{ debug annotation info */
>     const char *current_annotation;
> -   ir_instruction *base_ir;
> +   const void *base_ir;
>     /** @} */
>  
>     bool failed;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> index e477a61..aa60ed5 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> @@ -726,11 +726,16 @@ fs_visitor::generate_code()
>  {
>     int last_native_insn_offset = p->next_insn_offset;
>     const char *last_annotation_string = NULL;
> -   ir_instruction *last_annotation_ir = NULL;
> +   const void *last_annotation_ir = NULL;
>  
>     if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
> -      printf("Native code for fragment shader %d (%d-wide dispatch):\n",
> -	     prog->Name, c->dispatch_width);
> +      if (shader) {
> +         printf("Native code for fragment shader %d (%d-wide dispatch):\n",
> +                prog->Name, c->dispatch_width);
> +      } else {
> +         printf("Native code for fragment program %d (%d-wide dispatch):\n",
> +                c->fp->program.Base.Id, c->dispatch_width);
> +      }
>     }
>  
>     fs_cfg *cfg = NULL;
> @@ -762,7 +767,16 @@ fs_visitor::generate_code()
>  	    last_annotation_ir = inst->ir;
>  	    if (last_annotation_ir) {
>  	       printf("   ");
> -	       last_annotation_ir->print();
> +               if (shader)
> +                  ((ir_instruction *)inst->ir)->print();
> +               else {
> +                  const prog_instruction *fpi;
> +                  fpi = (const prog_instruction *)inst->ir;
> +                  printf("%d: ", (int)(fpi - fp->Base.Instructions));
> +                  _mesa_fprint_instruction_opt(stdout,
> +                                               fpi,
> +                                               0, PROG_PRINT_DEBUG, NULL);
> +               }
>  	       printf("\n");
>  	    }
>  	 }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
> new file mode 100644
> index 0000000..48ec9a5
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
> @@ -0,0 +1,781 @@
> +/*
> + * Copyright © 2012 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +/** @file brw_fs_fp.cpp
> + *
> + * Implementation of the compiler for GL_ARB_fragment_program shaders on top
> + * of the GLSL compiler backend.
> + */
> +
> +#include "brw_context.h"
> +#include "brw_fs.h"
> +
> +static fs_reg
> +regoffset(fs_reg reg, int i)
> +{
> +   reg.reg_offset += i;
> +   return reg;
> +}
> +
> +void
> +fs_visitor::emit_fp_alu1(enum opcode opcode,
> +                         const struct prog_instruction *fpi,
> +                         fs_reg dst, fs_reg src)
> +{
> +   for (int i = 0; i < 4; i++) {
> +      if (fpi->DstReg.WriteMask & (1 << i))
> +         emit(opcode, regoffset(dst, i), regoffset(src, i));
> +   }
> +}
> +
> +void
> +fs_visitor::emit_fp_alu2(enum opcode opcode,
> +                         const struct prog_instruction *fpi,
> +                         fs_reg dst, fs_reg src0, fs_reg src1)
> +{
> +   for (int i = 0; i < 4; i++) {
> +      if (fpi->DstReg.WriteMask & (1 << i))
> +         emit(opcode, regoffset(dst, i),
> +              regoffset(src0, i), regoffset(src1, i));
> +   }
> +}
> +
> +void
> +fs_visitor::emit_fp_minmax(const prog_instruction *fpi,
> +                           fs_reg dst, fs_reg src0, fs_reg src1)
> +{
> +   uint32_t conditionalmod;
> +   if (fpi->Opcode == OPCODE_MIN)
> +      conditionalmod = BRW_CONDITIONAL_L;
> +   else
> +      conditionalmod = BRW_CONDITIONAL_GE;
> +
> +   for (int i = 0; i < 4; i++) {
> +      if (fpi->DstReg.WriteMask & (1 << i)) {
> +         emit_minmax(conditionalmod, regoffset(dst, i),
> +                     regoffset(src0, i), regoffset(src1, i));
> +      }
> +   }
> +}
> +
> +void
> +fs_visitor::emit_fp_sop(uint32_t conditional_mod,
> +                        const struct prog_instruction *fpi,
> +                        fs_reg dst, fs_reg src0, fs_reg src1,
> +                        fs_reg one)
> +{
> +   for (int i = 0; i < 4; i++) {
> +      if (fpi->DstReg.WriteMask & (1 << i)) {
> +         fs_inst *inst;
> +
> +         inst = emit(BRW_OPCODE_CMP, fs_reg(brw_null_reg()),
> +                     regoffset(src0, i), regoffset(src1, i));
> +         inst->conditional_mod = conditional_mod;
> +
> +         inst = emit(BRW_OPCODE_SEL, regoffset(dst, i), one, fs_reg(0.0f));
> +         inst->predicated = true;
> +      }
> +   }
> +}
> +
> +void
> +fs_visitor::emit_fp_scalar_write(const struct prog_instruction *fpi,
> +                                 fs_reg dst, fs_reg src)
> +{
> +   for (int i = 0; i < 4; i++) {
> +      if (fpi->DstReg.WriteMask & (1 << i))
> +         emit(BRW_OPCODE_MOV, regoffset(dst, i), src);
> +   }
> +}
> +
> +void
> +fs_visitor::emit_fp_scalar_math(enum opcode opcode,
> +                                const struct prog_instruction *fpi,
> +                                fs_reg dst, fs_reg src)
> +{
> +   fs_reg temp = fs_reg(this, glsl_type::float_type);
> +   emit_math(opcode, temp, src);
> +   emit_fp_scalar_write(fpi, dst, temp);
> +}
> +
> +void
> +fs_visitor::emit_fragment_program_code()
> +{
> +   setup_fp_regs();
> +
> +   fs_reg null = fs_reg(brw_null_reg());
> +
> +   /* Keep a reg with 0.0 around, for reuse use by emit_sop so that it can

"Keep a reg with 1.0 around, for reuse by emit_fp_sop"
                 ^^^ (not 0.0)                 ^^ (function name)
(This also drops the duplicated word in "for reuse use by".)

> +    * just be:
> +    *
> +    * sel.f0 dst 1.0 0.0
> +    *
> +    * instead of
> +    *
> +    * mov    dst 0.0
> +    * mov.f0 dst 1.0
> +    */
> +   fs_reg one = fs_reg(this, glsl_type::float_type);
> +   emit(BRW_OPCODE_MOV, one, fs_reg(1.0f));
> +
> +   for (unsigned int insn = 0; insn < fp->Base.NumInstructions; insn++) {
> +      const struct prog_instruction *fpi = &fp->Base.Instructions[insn];
> +      base_ir = fpi;
> +
> +      //_mesa_print_instruction(fpi);
> +
> +      fs_reg dst;
> +      fs_reg src[3];
> +
> +      /* We always emit into a temporary destination register to avoid
> +       * aliasing issues.
> +       */
> +      dst = fs_reg(this, glsl_type::vec4_type);
> +
> +      for (int i = 0; i < 3; i++)
> +         src[i] = get_fp_src_reg(&fpi->SrcReg[i]);
> +
> +      switch (fpi->Opcode) {
> +      case OPCODE_ABS:
> +         src[0].abs = true;
> +         src[0].negate = false;
> +         emit_fp_alu1(BRW_OPCODE_MOV, fpi, dst, src[0]);
> +         break;
> +
> +      case OPCODE_ADD:
> +         emit_fp_alu2(BRW_OPCODE_ADD, fpi, dst, src[0], src[1]);
> +         break;
> +
> +      case OPCODE_CMP:
> +         for (int i = 0; i < 4; i++) {
> +            if (fpi->DstReg.WriteMask & (1 << i)) {
> +               fs_inst *inst;
> +
> +               inst = emit(BRW_OPCODE_CMP, null,
> +                           regoffset(src[0], i), fs_reg(0.0f));
> +               inst->conditional_mod = BRW_CONDITIONAL_L;
> +
> +               inst = emit(BRW_OPCODE_SEL, regoffset(dst, i),
> +                           regoffset(src[1], i), regoffset(src[2], i));
> +               inst->predicated = true;
> +            }
> +         }
> +         break;
> +
> +      case OPCODE_COS:
> +         emit_fp_scalar_math(SHADER_OPCODE_COS, fpi, dst, src[0]);
> +         break;
> +
> +      case OPCODE_DP2:
> +      case OPCODE_DP3:
> +      case OPCODE_DP4:
> +      case OPCODE_DPH: {
> +         fs_reg mul = fs_reg(this, glsl_type::float_type);
> +         fs_reg acc = fs_reg(this, glsl_type::float_type);
> +         int count;
> +
> +         switch (fpi->Opcode) {
> +         case OPCODE_DP2: count = 2; break;
> +         case OPCODE_DP3: count = 3; break;
> +         case OPCODE_DP4: count = 4; break;
> +         case OPCODE_DPH: count = 3; break;
> +         default: assert(!"not reached"); count = 0; break;
> +         }
> +
> +         emit(BRW_OPCODE_MUL, acc,
> +              regoffset(src[0], 0), regoffset(src[1], 0));
> +         for (int i = 1; i < count; i++) {
> +            emit(BRW_OPCODE_MUL, mul,
> +                 regoffset(src[0], i), regoffset(src[1], i));
> +            emit(BRW_OPCODE_ADD, acc, acc, mul);
> +         }

Future optimization: MAD would be nice here, but that can be done later.

> +         if (fpi->Opcode == OPCODE_DPH)
> +            emit(BRW_OPCODE_ADD, acc, acc, regoffset(src[1], 3));
> +
> +         emit_fp_scalar_write(fpi, dst, acc);
> +         break;
> +      }
> +
> +      case OPCODE_DST:
> +         if (fpi->DstReg.WriteMask & WRITEMASK_X)
> +            emit(BRW_OPCODE_MOV, dst, fs_reg(1.0f));
> +         if (fpi->DstReg.WriteMask & WRITEMASK_Y) {
> +            emit(BRW_OPCODE_MUL, regoffset(dst, 1),
> +                 regoffset(src[0], 1), regoffset(src[1], 1));
> +         }
> +         if (fpi->DstReg.WriteMask & WRITEMASK_Z)
> +            emit(BRW_OPCODE_MOV, regoffset(dst, 2), regoffset(src[0], 2));
> +         if (fpi->DstReg.WriteMask & WRITEMASK_W)
> +            emit(BRW_OPCODE_MOV, regoffset(dst, 3), regoffset(src[1], 3));
> +         break;
> +
> +      case OPCODE_EX2:
> +         emit_fp_scalar_math(SHADER_OPCODE_EXP2, fpi, dst, src[0]);
> +         break;
> +
> +      case OPCODE_FLR:
> +         emit_fp_alu1(BRW_OPCODE_RNDD, fpi, dst, src[0]);
> +         break;
> +
> +      case OPCODE_FRC:
> +         emit_fp_alu1(BRW_OPCODE_FRC, fpi, dst, src[0]);
> +         break;
> +
> +      case OPCODE_KIL: {
> +         for (int i = 0; i < 4; i++) {
> +            /* In most cases the argument to a KIL will be something like
> +             * TEMP[0].wwww, so there's no point in checking whether .w is < 0
> +             * 4 times in a row.
> +             */
> +            if (i > 0 &&
> +                GET_SWZ(fpi->SrcReg[0].Swizzle, i) ==
> +                GET_SWZ(fpi->SrcReg[0].Swizzle, i - 1) &&
> +                ((fpi->SrcReg[0].Negate >> i) & 1) ==
> +                ((fpi->SrcReg[0].Negate >> (i - 1)) & 1)) {
> +               continue;
> +            }
> +
> +            fs_inst *inst = emit(BRW_OPCODE_CMP, null,
> +                                 regoffset(src[0], i), 0.0f);
> +            inst->conditional_mod = BRW_CONDITIONAL_L;
> +
> +            inst = emit(BRW_OPCODE_IF);
> +            inst->predicated = true;
> +            emit(FS_OPCODE_DISCARD);
> +            emit(BRW_OPCODE_ENDIF);
> +         }
> +         break;
> +      }
> +
> +      case OPCODE_LG2:
> +         emit_fp_scalar_math(SHADER_OPCODE_LOG2, fpi, dst, src[0]);
> +         break;
> +
> +      case OPCODE_LIT:
> +         /* From the ARB_fragment_program spec:
> +          *
> +          *      tmp = VectorLoad(op0);
> +          *      if (tmp.x < 0) tmp.x = 0;
> +          *      if (tmp.y < 0) tmp.y = 0;
> +          *      if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
> +          *      else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
> +          *      result.x = 1.0;
> +          *      result.y = tmp.x;
> +          *      result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
> +          *      result.w = 1.0;
> +          */
> +         if (fpi->DstReg.WriteMask & WRITEMASK_X)
> +            emit(BRW_OPCODE_MOV, regoffset(dst, 0), fs_reg(1.0f));
> +
> +         if (fpi->DstReg.WriteMask & WRITEMASK_YZ) {
> +            fs_inst *inst;
> +            inst = emit(BRW_OPCODE_CMP, null,
> +                        regoffset(src[0], 0), fs_reg(0.0f));
> +            inst->conditional_mod = BRW_CONDITIONAL_LE;
> +
> +            if (fpi->DstReg.WriteMask & WRITEMASK_Y) {
> +               emit(BRW_OPCODE_MOV, regoffset(dst, 1), regoffset(src[0], 0));
> +               inst = emit(BRW_OPCODE_MOV, regoffset(dst, 1), fs_reg(0.0f));
> +               inst->predicated = true;
> +            }
> +
> +            if (fpi->DstReg.WriteMask & WRITEMASK_Z) {
> +               emit_math(SHADER_OPCODE_POW, regoffset(dst, 2),
> +                         regoffset(src[0], 1), regoffset(src[0], 3));
> +
> +               inst = emit(BRW_OPCODE_MOV, regoffset(dst, 2), fs_reg(0.0f));
> +               inst->predicated = true;

This looks broken... don't you need to clamp the exponent (tmp.w) to
(-128, 128) before the POW, as in the spec pseudocode quoted above?

> +            }
> +         }
> +
> +         if (fpi->DstReg.WriteMask & WRITEMASK_W)
> +            emit(BRW_OPCODE_MOV, regoffset(dst, 3), fs_reg(1.0f));
> +
> +         break;
> +
> +      case OPCODE_LRP:
> +         for (int i = 0; i < 4; i++) {
> +            if (fpi->DstReg.WriteMask & (1 << i)) {
> +               fs_reg neg_src0 = regoffset(src[0], i);
> +               neg_src0.negate = !neg_src0.negate;
> +               fs_reg temp = fs_reg(this, glsl_type::float_type);
> +               fs_reg temp2 = fs_reg(this, glsl_type::float_type);
> +               emit(BRW_OPCODE_ADD, temp, neg_src0, fs_reg(1.0f));
> +               emit(BRW_OPCODE_MUL, temp, temp, regoffset(src[2], i));
> +               emit(BRW_OPCODE_MUL, temp2,
> +                    regoffset(src[0], i), regoffset(src[1], i));
> +               emit(BRW_OPCODE_ADD, regoffset(dst, i), temp, temp2);
> +            }
> +         }
> +         break;
> +
> +      case OPCODE_MAD:
> +         for (int i = 0; i < 4; i++) {
> +            if (fpi->DstReg.WriteMask & (1 << i)) {
> +               fs_reg temp = fs_reg(this, glsl_type::float_type);
> +               emit(BRW_OPCODE_MUL, temp,
> +                    regoffset(src[0], i), regoffset(src[1], i));
> +               emit(BRW_OPCODE_ADD, regoffset(dst, i),
> +                    temp, regoffset(src[2], i));
> +            }
> +         }
> +         break;

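Future optimization: this could be a single MAD instead of a separate MUL
and ADD (same as the dot-product case above), but that can be done later.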

> +
> +      case OPCODE_MAX:
> +         emit_fp_minmax(fpi, dst, src[0], src[1]);
> +         break;
> +
> +      case OPCODE_MOV:
> +         emit_fp_alu1(BRW_OPCODE_MOV, fpi, dst, src[0]);
> +         break;
> +
> +      case OPCODE_MIN:
> +         emit_fp_minmax(fpi, dst, src[0], src[1]);
> +         break;
> +
> +      case OPCODE_MUL:
> +         emit_fp_alu2(BRW_OPCODE_MUL, fpi, dst, src[0], src[1]);
> +         break;
> +
> +      case OPCODE_POW: {
> +         fs_reg temp = fs_reg(this, glsl_type::float_type);
> +         emit_math(SHADER_OPCODE_POW, temp, src[0], src[1]);
> +         emit_fp_scalar_write(fpi, dst, temp);
> +         break;
> +      }

I see.  You don't use emit_fp_scalar_math here because it doesn't take a
second argument.

> +
> +      case OPCODE_RCP:
> +         emit_fp_scalar_math(SHADER_OPCODE_RCP, fpi, dst, src[0]);
> +         break;
> +
> +      case OPCODE_RSQ:
> +         emit_fp_scalar_math(SHADER_OPCODE_RSQ, fpi, dst, src[0]);
> +         break;
> +
> +      case OPCODE_SCS:
> +         if (fpi->DstReg.WriteMask & WRITEMASK_X) {
> +            emit_math(SHADER_OPCODE_COS, regoffset(dst, 0),
> +                      regoffset(src[0], 0));
> +         }
> +
> +         if (fpi->DstReg.WriteMask & WRITEMASK_Y) {
> +            emit_math(SHADER_OPCODE_SIN, regoffset(dst, 1),
> +                      regoffset(src[0], 1));
> +         }
> +         break;

Future optimization: we could use the actual SINCOS math instruction
when asking for WRITEMASK_XY.  But I don't know how common that is.

> +      case OPCODE_SGE:
> +         emit_fp_sop(BRW_CONDITIONAL_GE, fpi, dst, src[0], src[1], one);
> +         break;
> +
> +      case OPCODE_SIN:
> +         emit_fp_scalar_math(SHADER_OPCODE_SIN, fpi, dst, src[0]);
> +         break;
> +
> +      case OPCODE_SLT:
> +         emit_fp_sop(BRW_CONDITIONAL_L, fpi, dst, src[0], src[1], one);
> +         break;
> +
> +      case OPCODE_SUB: {
> +         fs_reg neg_src1 = src[1];
> +         neg_src1.negate = !src[1].negate;
> +
> +         emit_fp_alu2(BRW_OPCODE_ADD, fpi, dst, src[0], neg_src1);
> +         break;
> +      }
> +
> +      case OPCODE_TEX:
> +      case OPCODE_TXB:
> +      case OPCODE_TXP: {
> +         /* We piggy-back on the GLSL IR support for texture setup.  To do so,
> +          * we have to cook up an ir_texture that has the coordinate field
> +          * with appropriate type, and shadow_comparitor set or not.  All the
> +          * other properties of ir_texture are passed in as arguments to the
> +          * emit_texture_gen* function.
> +          */
> +         ir_texture *ir = NULL;
> +
> +         fs_reg lod;
> +         fs_reg dpdy;
> +         fs_reg coordinate = src[0];
> +         fs_reg shadow_c;
> +
> +         switch (fpi->Opcode) {
> +         case OPCODE_TEX:
> +            ir = new(mem_ctx) ir_texture(ir_tex);
> +            break;
> +         case OPCODE_TXP: {
> +            ir = new(mem_ctx) ir_texture(ir_tex);
> +
> +            coordinate = fs_reg(this, glsl_type::vec3_type);
> +            fs_reg invproj = fs_reg(this, glsl_type::float_type);
> +            emit_math(SHADER_OPCODE_RCP, invproj, regoffset(src[0], 3));
> +            for (int i = 0; i < 3; i++) {
> +               emit(BRW_OPCODE_MUL, regoffset(coordinate, i),
> +                    regoffset(src[0], i), invproj);
> +            }
> +            break;
> +         }
> +         case OPCODE_TXB:
> +            ir = new(mem_ctx) ir_texture(ir_txb);
> +            lod = regoffset(src[0], 3);
> +            break;
> +         default:
> +            assert(!"not reached");
> +            break;
> +         }
> +
> +         const glsl_type *coordinate_type;
> +         switch (fpi->TexSrcTarget) {
> +         case TEXTURE_1D_INDEX:
> +            coordinate_type = glsl_type::float_type;
> +            break;
> +
> +         case TEXTURE_2D_INDEX:
> +         case TEXTURE_1D_ARRAY_INDEX:
> +         case TEXTURE_RECT_INDEX:
> +         case TEXTURE_EXTERNAL_INDEX:
> +            coordinate_type = glsl_type::vec2_type;
> +            break;
> +
> +         case TEXTURE_3D_INDEX:
> +         case TEXTURE_2D_ARRAY_INDEX:
> +            coordinate_type = glsl_type::vec3_type;
> +            break;
> +
> +         case TEXTURE_CUBE_INDEX: {
> +            coordinate_type = glsl_type::vec3_type;
> +
> +            fs_reg temp = fs_reg(this, glsl_type::float_type);
> +            fs_reg cubecoord = fs_reg(this, glsl_type::vec3_type);
> +            fs_reg abscoord = coordinate;
> +            abscoord.negate = false;
> +            abscoord.abs = true;
> +            emit_minmax(BRW_CONDITIONAL_GE, temp,
> +                        regoffset(abscoord, 0), regoffset(abscoord, 1));
> +            emit_minmax(BRW_CONDITIONAL_GE, temp,
> +                        temp, regoffset(abscoord, 2));
> +            emit_math(SHADER_OPCODE_RCP, temp, temp);
> +            for (int i = 0; i < 3; i++) {
> +               emit(BRW_OPCODE_MUL, regoffset(cubecoord, i),
> +                    regoffset(coordinate, i), temp);
> +            }
> +
> +            coordinate = cubecoord;
> +            break;
> +         }
> +
> +         default:
> +            assert(!"not reached");
> +            coordinate_type = glsl_type::vec2_type;
> +            break;
> +         }
> +
> +         ir_constant_data junk_data;
> +         ir->coordinate = new(mem_ctx) ir_constant(coordinate_type, &junk_data);
> +
> +         coordinate = rescale_texcoord(ir, coordinate,
> +                                       fpi->TexSrcTarget == TEXTURE_RECT_INDEX,
> +                                       fpi->TexSrcUnit, fpi->TexSrcUnit);
> +
> +         if (fpi->TexShadow) {
> +            shadow_c = regoffset(coordinate, 2);
> +            ir->shadow_comparitor = new(mem_ctx) ir_constant(0.0f);
> +         }
> +
> +         fs_inst *inst;
> +         if (intel->gen >= 7) {
> +            inst = emit_texture_gen7(ir, dst, coordinate, shadow_c, lod, dpdy);
> +         } else if (intel->gen >= 5) {
> +            inst = emit_texture_gen5(ir, dst, coordinate, shadow_c, lod, dpdy);
> +         } else {
> +            inst = emit_texture_gen4(ir, dst, coordinate, shadow_c, lod, dpdy);
> +         }
> +
> +         inst->sampler = fpi->TexSrcUnit;
> +         inst->shadow_compare = fpi->TexShadow;
> +
> +         /* Reuse the GLSL swizzle_result() handler. */
> +         swizzle_result(ir, dst, fpi->TexSrcUnit);
> +         dst = this->result;
> +
> +         break;
> +      }
> +
> +      case OPCODE_SWZ:
> +         /* Note that SWZ's extended swizzles are handled in the general
> +          * get_src_reg() code.
> +          */
> +         emit_fp_alu1(BRW_OPCODE_MOV, fpi, dst, src[0]);
> +         break;
> +
> +      case OPCODE_XPD:
> +         for (int i = 0; i < 3; i++) {
> +            if (fpi->DstReg.WriteMask & (1 << i)) {
> +               int i1 = (i + 1) % 3;
> +               int i2 = (i + 2) % 3;
> +
> +               fs_reg temp = fs_reg(this, glsl_type::float_type);
> +               fs_reg neg_src1_1 = regoffset(src[1], i1);
> +               neg_src1_1.negate = !neg_src1_1.negate;
> +               emit(BRW_OPCODE_MUL, temp,
> +                    regoffset(src[0], i2), neg_src1_1);
> +               emit(BRW_OPCODE_MUL, regoffset(dst, i),
> +                    regoffset(src[0], i1), regoffset(src[1], i2));
> +               emit(BRW_OPCODE_ADD, regoffset(dst, i),
> +                    regoffset(dst, i), temp);
> +            }
> +         }
> +         break;
> +
> +      case OPCODE_END:
> +         break;
> +
> +      default:
> +         _mesa_problem(ctx, "Unsupported opcode %s in fragment program\n",
> +                       _mesa_opcode_string(fpi->Opcode));
> +      }
> +
> +      /* To handle saturates, we emit a MOV with a saturate bit, which
> +       * optimization should fold into the preceding instructions when safe.
> +       */
> +      if (fpi->Opcode != OPCODE_END) {
> +         fs_reg real_dst = get_fp_dst_reg(&fpi->DstReg);
> +
> +         for (int i = 0; i < 4; i++) {
> +            if (fpi->DstReg.WriteMask & (1 << i)) {
> +               fs_inst *inst = emit(BRW_OPCODE_MOV,
> +                                    regoffset(real_dst, i),
> +                                    regoffset(dst, i));
> +               inst->saturate = fpi->SaturateMode;
> +            }
> +         }
> +      }
> +   }
> +
> +   /* Epilogue:
> +    *
> +    * Fragment depth has this strange convention of being the .z component of
> +    * a vec4.  emit_fb_write() wants to see a float value, instead.
> +    */
> +   this->current_annotation = "result.depth write";
> +   if (frag_depth.file != BAD_FILE) {
> +      fs_reg temp = fs_reg(this, glsl_type::float_type);
> +      emit(BRW_OPCODE_MOV, temp, regoffset(frag_depth, 2));
> +      frag_depth = temp;
> +   }
> +}
> +
> +void
> +fs_visitor::setup_fp_regs()
> +{
> +   /* PROGRAM_TEMPORARY */
> +   int num_temp = fp->Base.NumTemporaries;
> +   fp_temp_regs = rzalloc_array(mem_ctx, fs_reg, num_temp);
> +   for (int i = 0; i < num_temp; i++)
> +      fp_temp_regs[i] = fs_reg(this, glsl_type::vec4_type);
> +
> +   /* PROGRAM_STATE_VAR, PROGRAM_NAMED_PARAM, etc. */
> +   if (c->dispatch_width == 8) {
> +      for (unsigned p = 0;
> +           p < c->fp->program.Base.Parameters->NumParameters; p++) {
> +         for (unsigned int i = 0; i < 4; i++) {
> +            this->param_index[c->prog_data.nr_params] = p;
> +            this->param_offset[c->prog_data.nr_params] = i;
> +            c->prog_data.nr_params++;
> +         }
> +      }
> +   }
> +
> +   fp_input_regs = rzalloc_array(mem_ctx, fs_reg, FRAG_ATTRIB_MAX);
> +   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
> +      if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
> +         /* Make up a dummy instruction to reuse code for emitting
> +          * interpolation.
> +          */
> +         ir_variable *ir = new(mem_ctx) ir_variable(glsl_type::vec4_type,
> +                                                    "fp_input",
> +                                                    ir_var_in);
> +         ir->location = i;
> +
> +         this->current_annotation = ralloc_asprintf(ctx, "interpolate input %d",
> +                                                    i);
> +
> +         switch (i) {
> +         case FRAG_ATTRIB_WPOS:
> +            ir->pixel_center_integer = fp->PixelCenterInteger;
> +            ir->origin_upper_left = fp->OriginUpperLeft;
> +            fp_input_regs[i] = *emit_fragcoord_interpolation(ir);
> +            break;
> +         case FRAG_ATTRIB_FACE:
> +            fp_input_regs[i] = *emit_frontfacing_interpolation(ir);
> +            break;
> +         default:
> +            fp_input_regs[i] = *emit_general_interpolation(ir);
> +
> +            if (i == FRAG_ATTRIB_FOGC) {
> +               emit(BRW_OPCODE_MOV,
> +                    regoffset(fp_input_regs[i], 1), fs_reg(0.0f));
> +               emit(BRW_OPCODE_MOV,
> +                    regoffset(fp_input_regs[i], 2), fs_reg(0.0f));
> +               emit(BRW_OPCODE_MOV,
> +                    regoffset(fp_input_regs[i], 3), fs_reg(1.0f));
> +            }
> +
> +            break;
> +         }
> +
> +         this->current_annotation = NULL;
> +      }
> +   }
> +}
> +
> +fs_reg
> +fs_visitor::get_fp_dst_reg(const prog_dst_register *dst)
> +{
> +   switch (dst->File) {
> +   case PROGRAM_TEMPORARY:
> +      return fp_temp_regs[dst->Index];
> +
> +   case PROGRAM_OUTPUT:
> +      if (dst->Index == FRAG_RESULT_DEPTH) {
> +         if (frag_depth.file == BAD_FILE)
> +            frag_depth = fs_reg(this, glsl_type::vec4_type);
> +         return frag_depth;
> +      } else if (dst->Index == FRAG_RESULT_COLOR) {
> +         if (outputs[0].file == BAD_FILE) {
> +            outputs[0] = fs_reg(this, glsl_type::vec4_type);
> +            output_components[0] = 4;
> +
> +            /* Tell emit_fb_writes() to smear fragment.color across all the
> +             * color attachments.
> +             */
> +            for (int i = 1; i < c->key.nr_color_regions; i++) {
> +               outputs[i] = outputs[0];
> +               output_components[i] = output_components[0];
> +            }
> +         }
> +         return outputs[0];
> +      } else {
> +         int output_index = dst->Index - FRAG_RESULT_DATA0;
> +         if (outputs[output_index].file == BAD_FILE) {
> +            outputs[output_index] = fs_reg(this, glsl_type::vec4_type);
> +         }
> +         output_components[output_index] = 4;
> +         return outputs[output_index];
> +      }
> +
> +   case PROGRAM_UNDEFINED:
> +      return fs_reg();
> +
> +   default:
> +      _mesa_problem(ctx, "bad dst register file: %s\n",
> +                    _mesa_register_file_name((gl_register_file)dst->File));
> +      return fs_reg(this, glsl_type::vec4_type);
> +   }
> +}
> +
> +fs_reg
> +fs_visitor::get_fp_src_reg(const prog_src_register *src)
> +{
> +   struct gl_program_parameter_list *plist = c->fp->program.Base.Parameters;
> +
> +   fs_reg result;
> +
> +   assert(!src->Abs);
> +
> +   switch (src->File) {
> +   case PROGRAM_UNDEFINED:
> +      return fs_reg();
> +   case PROGRAM_TEMPORARY:
> +      result = fp_temp_regs[src->Index];
> +      break;
> +
> +   case PROGRAM_INPUT:
> +      result = fp_input_regs[src->Index];
> +      break;
> +
> +   case PROGRAM_STATE_VAR:
> +   case PROGRAM_UNIFORM:
> +   case PROGRAM_CONSTANT:
> +   case PROGRAM_NAMED_PARAM:
> +      /* We actually want to look at the type in the Parameters list for this,
> +       * because this lets us upload constant builtin uniforms as actual
> +       * constants.
> +       */
> +      switch (plist->Parameters[src->Index].Type) {
> +      case PROGRAM_NAMED_PARAM:
> +      case PROGRAM_CONSTANT: {
> +         result = fs_reg(this, glsl_type::vec4_type);
> +
> +         for (int i = 0; i < 4; i++) {
> +            emit(BRW_OPCODE_MOV, regoffset(result, i),
> +                 fs_reg(plist->ParameterValues[src->Index][i].f));
> +         }
> +         break;
> +      }
> +
> +      case PROGRAM_STATE_VAR:
> +      case PROGRAM_UNIFORM:
> +         result = fs_reg(UNIFORM, src->Index * 4);
> +         break;
> +
> +      default:
> +         _mesa_problem(ctx, "bad uniform src register file: %s\n",
> +                       _mesa_register_file_name((gl_register_file)src->File));
> +         return fs_reg(this, glsl_type::vec4_type);
> +      }
> +      break;
> +
> +   default:
> +      _mesa_problem(ctx, "bad src register file: %s\n",
> +                    _mesa_register_file_name((gl_register_file)src->File));
> +      return fs_reg(this, glsl_type::vec4_type);
> +   }
> +
> +   if (src->Swizzle != SWIZZLE_NOOP || src->Negate) {
> +      fs_reg unswizzled = result;
> +      result = fs_reg(this, glsl_type::vec4_type);
> +      for (int i = 0; i < 4; i++) {
> +         bool negate = src->Negate & (1 << i);
> +         /* The ZERO, ONE, and Negate options are only used for OPCODE_SWZ,
> +          * but it costs us nothing to support them.
> +          */
> +         int src_swiz = GET_SWZ(src->Swizzle, i);
> +         if (src_swiz == SWIZZLE_ZERO) {
> +            emit(BRW_OPCODE_MOV, regoffset(result, i), fs_reg(0.0f));
> +         } else if (src_swiz == SWIZZLE_ONE) {
> +            emit(BRW_OPCODE_MOV, regoffset(result, i),
> +                 negate ? fs_reg(-1.0f) : fs_reg(1.0f));
> +         } else {
> +            fs_reg src = regoffset(unswizzled, src_swiz);
> +            if (negate)
> +               src.negate = !src.negate;
> +            emit(BRW_OPCODE_MOV, regoffset(result, i), src);
> +         }
> +      }
> +   }
> +
> +   return result;
> +}
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index c8d976f..e89ad55 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -2240,8 +2240,7 @@ fs_visitor::fs_visitor(struct brw_wm_compile *c, struct gl_shader_program *prog,
>     this->c = c;
>     this->p = &c->func;
>     this->brw = p->brw;
> -   this->fp = (struct gl_fragment_program *)
> -      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
> +   this->fp = &c->fp->program;
>     this->prog = prog;
>     this->intel = &brw->intel;
>     this->ctx = &intel->ctx;
> diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
> index 995e8f3..47151f0 100644
> --- a/src/mesa/drivers/dri/i965/brw_wm.c
> +++ b/src/mesa/drivers/dri/i965/brw_wm.c
> @@ -85,46 +85,6 @@ GLuint brw_wm_is_scalar_result( GLuint opcode )
>     }
>  }
>  
> -
> -/**
> - * Do GPU code generation for non-GLSL shader.  non-GLSL shaders have
> - * no flow control instructions so we can more readily do SSA-style
> - * optimizations.
> - */
> -static void
> -brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
> -{
> -   /* Augment fragment program.  Add instructions for pre- and
> -    * post-fragment-program tasks such as interpolation and fogging.
> -    */
> -   brw_wm_pass_fp(c);
> -
> -   /* Translate to intermediate representation.  Build register usage
> -    * chains.
> -    */
> -   brw_wm_pass0(c);
> -
> -   /* Dead code removal.
> -    */
> -   brw_wm_pass1(c);
> -
> -   /* Register allocation.
> -    * Divide by two because we operate on 16 pixels at a time and require
> -    * two GRF entries for each logical shader register.
> -    */
> -   c->grf_limit = BRW_WM_MAX_GRF / 2;
> -
> -   brw_wm_pass2(c);
> -
> -   /* how many general-purpose registers are used */
> -   c->prog_data.reg_blocks = brw_register_blocks(c->max_wm_grf);
> -
> -   /* Emit GEN4 code.
> -    */
> -   brw_wm_emit(c);
> -}
> -
> -
>  /**
>   * Return a bitfield where bit n is set if barycentric interpolation mode n
>   * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader.
> @@ -356,23 +316,7 @@ bool do_wm_prog(struct brw_context *brw,
>        brw_compute_barycentric_interp_modes(brw, c->key.flat_shade,
>                                             &fp->program);
>  
> -   if (prog && prog->_LinkedShaders[MESA_SHADER_FRAGMENT]) {
> -      if (!brw_wm_fs_emit(brw, c, prog))
> -	 return false;
> -   } else {
> -      if (!c->instruction) {
> -	 c->instruction = rzalloc_array(c, struct brw_wm_instruction, BRW_WM_MAX_INSN);
> -	 c->prog_instructions = rzalloc_array(c, struct prog_instruction, BRW_WM_MAX_INSN);
> -	 c->vreg = rzalloc_array(c, struct brw_wm_value, BRW_WM_MAX_VREG);
> -	 c->refs = rzalloc_array(c, struct brw_wm_ref, BRW_WM_MAX_REF);
> -      }
> -
> -      /* Fallback for fixed function and ARB_fp shaders. */
> -      c->dispatch_width = 16;
> -      brw_wm_payload_setup(brw, c);
> -      brw_wm_non_glsl_emit(brw, c);
> -      c->prog_data.dispatch_width = 16;
> -   }
> +   brw_wm_fs_emit(brw, c, prog);
>  
>     /* Scratch space is used for register spilling */
>     if (c->last_scratch) {
> diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
> index dd67795..ea2dea9 100644
> --- a/src/mesa/drivers/dri/i965/brw_wm_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
> @@ -163,23 +163,8 @@ brw_upload_wm_unit(struct brw_context *brw)
>     /* _NEW_COLOR */
>     wm->wm5.program_uses_killpixel = fp->UsesKill || ctx->Color.AlphaEnabled;
>  
> -
> -   /* BRW_NEW_FRAGMENT_PROGRAM
> -    *
> -    * If using the fragment shader backend, the program is always
> -    * 8-wide.  If not, it's always 16.
> -    */
> -   if (ctx->Shader._CurrentFragmentProgram) {
> -      struct brw_shader *shader = (struct brw_shader *)
> -	 ctx->Shader._CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT];
> -
> -      if (shader != NULL && shader->ir != NULL) {
> -	 wm->wm5.enable_8_pix = 1;
> -	 if (brw->wm.prog_data->prog_offset_16)
> -	    wm->wm5.enable_16_pix = 1;
> -      }
> -   }
> -   if (!wm->wm5.enable_8_pix)
> +   wm->wm5.enable_8_pix = 1;
> +   if (brw->wm.prog_data->prog_offset_16)
>        wm->wm5.enable_16_pix = 1;
>  
>     wm->wm5.max_threads = brw->max_wm_threads - 1;
> diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
> index dd43528..bd28f97 100644
> --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
> @@ -151,13 +151,9 @@ upload_wm_state(struct brw_context *brw)
>     dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
>  
>     /* CACHE_NEW_WM_PROG */
> -   if (brw->wm.prog_data->dispatch_width == 8) {
> -      dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
> -      if (brw->wm.prog_data->prog_offset_16)
> -	 dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
> -   } else {
> +   dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
> +   if (brw->wm.prog_data->prog_offset_16)
>        dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
> -   }
>  
>     /* CACHE_NEW_WM_PROG | _NEW_COLOR */
>     if (brw->wm.prog_data->dual_src_blend &&
> diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
> index dc49a7d..e0c6911 100644
> --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
> @@ -196,13 +196,9 @@ upload_ps_state(struct brw_context *brw)
>     if (brw->fragment_program->Base.InputsRead != 0)
>        dw4 |= GEN7_PS_ATTRIBUTE_ENABLE;
>  
> -   if (brw->wm.prog_data->dispatch_width == 8) {
> -      dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
> -      if (brw->wm.prog_data->prog_offset_16)
> -	 dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
> -   } else {
> +   dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
> +   if (brw->wm.prog_data->prog_offset_16)
>        dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
> -   }
>  
>     dw5 |= (brw->wm.prog_data->first_curbe_grf <<
>  	   GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
>