[Mesa-dev] [PATCH 3/3] i965: Add writes_accumulator flag

Wed Apr 9 13:47:19 PDT 2014

From: Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>

Our hardware has an "accumulator" register, which can be used to store
intermediate results across multiple instructions.  Many instructions
can implicitly write a value to the accumulator in addition to their
normal destination register.  This is enabled by the "AccWrEn" flag.

This patch introduces a new flag, inst->writes_accumulator, which
allows us to express the AccWrEn notion in the IR.  It also creates an
ALU2_ACC macro to easily define emitters for instructions that
implicitly write the accumulator.

Previously, we only supported implicit accumulator writes from the
ADDC, SUBB, and MACH instructions.  We always enabled them on those
instructions, and left them disabled for other instructions.

To take advantage of the MAC (multiply-accumulate) instruction, we
need to be able to set AccWrEn on other types of instructions.

Reviewed-by: Matt Turner <mattst88 at gmail.com>
Signed-off-by: Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
---
I split out is_accumulator() into a separate patch, and made some
fixes to the scheduling code. Let me know if these changes look good
to you, JP. (Patch formatted with -U15 so that other sections of
the scheduling code are visible during review.)

 src/mesa/drivers/dri/i965/brw_fs.cpp               | 26 ++++++----
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp     |  7 +--
 .../drivers/dri/i965/brw_schedule_instructions.cpp | 58 ++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_shader.h             |  1 +
 src/mesa/drivers/dri/i965/brw_vec4.cpp             | 15 ++----
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp   |  7 +--
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp     | 17 +++++--
 7 files changed, 95 insertions(+), 36 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index e576545..0eece60 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -52,30 +52,32 @@ extern "C" {
 #include "glsl/glsl_types.h"
 
 void
 fs_inst::init()
 {
    memset(this, 0, sizeof(*this));
    this->conditional_mod = BRW_CONDITIONAL_NONE;
 
    this->dst = reg_undef;
    this->src[0] = reg_undef;
    this->src[1] = reg_undef;
    this->src[2] = reg_undef;
 
    /* This will be the case for almost all instructions. */
    this->regs_written = 1;
+
+   this->writes_accumulator = false;
 }
 
 fs_inst::fs_inst()
 {
    init();
    this->opcode = BRW_OPCODE_NOP;
 }
 
 fs_inst::fs_inst(enum opcode opcode)
 {
    init();
    this->opcode = opcode;
 }
 
 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
@@ -139,63 +141,72 @@ fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 
 #define ALU1(op)                                                        \
    fs_inst *                                                            \
    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
    {                                                                    \
       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
    }
 
 #define ALU2(op)                                                        \
    fs_inst *                                                            \
    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
    {                                                                    \
       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
    }
 
+#define ALU2_ACC(op)                                                    \
+   fs_inst *                                                            \
+   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
+   {                                                                    \
+      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
+      inst->writes_accumulator = true;                                  \
+      return inst;                                                      \
+   }
+
 #define ALU3(op)                                                        \
    fs_inst *                                                            \
    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
    {                                                                    \
       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
    }
 
 ALU1(NOT)
 ALU1(MOV)
 ALU1(FRC)
 ALU1(RNDD)
 ALU1(RNDE)
 ALU1(RNDZ)
 ALU2(ADD)
 ALU2(MUL)
-ALU2(MACH)
+ALU2_ACC(MACH)
 ALU2(AND)
 ALU2(OR)
 ALU2(XOR)
 ALU2(SHL)
 ALU2(SHR)
 ALU2(ASR)
 ALU3(LRP)
 ALU1(BFREV)
 ALU3(BFE)
 ALU2(BFI1)
 ALU3(BFI2)
 ALU1(FBH)
 ALU1(FBL)
 ALU1(CBIT)
 ALU3(MAD)
-ALU2(ADDC)
-ALU2(SUBB)
+ALU2_ACC(ADDC)
+ALU2_ACC(SUBB)
 ALU2(SEL)
 
 /** Gen4 predicated IF. */
 fs_inst *
 fs_visitor::IF(uint32_t predicate)
 {
    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
    inst->predicate = predicate;
    return inst;
 }
 
 /** Gen6 IF with embedded comparison. */
 fs_inst *
 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 {
@@ -2113,40 +2124,35 @@ fs_visitor::dead_code_eliminate()
 
          for (int i = 0; i < inst->regs_written; i++) {
             int var = live_intervals->var_from_vgrf[inst->dst.reg];
             assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
             if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
                dead = false;
                break;
             }
          }
 
          if (dead) {
             /* Don't dead code eliminate instructions that write to the
              * accumulator as a side-effect. Instead just set the destination
              * to the null register to free it.
              */
-            switch (inst->opcode) {
-            case BRW_OPCODE_ADDC:
-            case BRW_OPCODE_SUBB:
-            case BRW_OPCODE_MACH:
+            if (inst->writes_accumulator) {
                inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
-               break;
-            default:
+            } else {
                inst->remove();
                progress = true;
-               break;
             }
          }
       }
 
       pc++;
    }
 
    if (progress)
       invalidate_live_intervals();
 
    return progress;
 }
 
 struct dead_code_hash_key
 {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index e590bdf..1cf35b4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1399,56 +1399,55 @@ fs_generator::generate_code(exec_list *instructions, FILE *dump_file)
 	  * accumulator value, so now you can't check, for example,
 	  * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
 	  */
 	 assert(!inst->conditional_mod ||
 		inst->src[i].type != BRW_REGISTER_TYPE_UD ||
 		!inst->src[i].negate);
       }
       dst = brw_reg_from_fs_reg(&inst->dst);
 
       brw_set_conditionalmod(p, inst->conditional_mod);
       brw_set_predicate_control(p, inst->predicate);
       brw_set_predicate_inverse(p, inst->predicate_inverse);
       brw_set_flag_reg(p, 0, inst->flag_subreg);
       brw_set_saturate(p, inst->saturate);
       brw_set_mask_control(p, inst->force_writemask_all);
+      brw_set_acc_write_control(p, inst->writes_accumulator);
 
       if (inst->force_uncompressed || dispatch_width == 8) {
 	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       } else if (inst->force_sechalf) {
 	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
       } else {
 	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
       }
 
       switch (inst->opcode) {
       case BRW_OPCODE_MOV:
 	 brw_MOV(p, dst, src[0]);
 	 break;
       case BRW_OPCODE_ADD:
 	 brw_ADD(p, dst, src[0], src[1]);
 	 break;
       case BRW_OPCODE_MUL:
 	 brw_MUL(p, dst, src[0], src[1]);
 	 break;
       case BRW_OPCODE_AVG:
 	 brw_AVG(p, dst, src[0], src[1]);
 	 break;
       case BRW_OPCODE_MACH:
-	 brw_set_acc_write_control(p, 1);
 	 brw_MACH(p, dst, src[0], src[1]);
-	 brw_set_acc_write_control(p, 0);
 	 break;
 
       case BRW_OPCODE_MAD:
          assert(brw->gen >= 6);
 	 brw_set_access_mode(p, BRW_ALIGN_16);
          if (dispatch_width == 16 && !brw->is_haswell) {
 	    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 	    brw_MAD(p, dst, src[0], src[1], src[2]);
 	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 	    brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
 	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 	 } else {
 	    brw_MAD(p, dst, src[0], src[1], src[2]);
 	 }
 	 brw_set_access_mode(p, BRW_ALIGN_1);
@@ -1528,39 +1527,35 @@ fs_generator::generate_code(exec_list *instructions, FILE *dump_file)
          /* FBH only supports UD type for dst. */
          brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
          break;
       case BRW_OPCODE_FBL:
          assert(brw->gen >= 7);
          /* FBL only supports UD type for dst. */
          brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
          break;
       case BRW_OPCODE_CBIT:
          assert(brw->gen >= 7);
          /* CBIT only supports UD type for dst. */
          brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
          break;
       case BRW_OPCODE_ADDC:
          assert(brw->gen >= 7);
-         brw_set_acc_write_control(p, 1);
          brw_ADDC(p, dst, src[0], src[1]);
-         brw_set_acc_write_control(p, 0);
          break;
       case BRW_OPCODE_SUBB:
          assert(brw->gen >= 7);
-         brw_set_acc_write_control(p, 1);
          brw_SUBB(p, dst, src[0], src[1]);
-         brw_set_acc_write_control(p, 0);
          break;
 
       case BRW_OPCODE_BFE:
          assert(brw->gen >= 7);
          brw_set_access_mode(p, BRW_ALIGN_16);
          if (dispatch_width == 16 && !brw->is_haswell) {
             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
             brw_BFE(p, dst, src[0], src[1], src[2]);
             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
             brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
             brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
          } else {
             brw_BFE(p, dst, src[0], src[1], src[2]);
          }
          brw_set_access_mode(p, BRW_ALIGN_1);
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index a951459..3538da5 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -738,30 +738,31 @@ fs_instruction_scheduler::is_compressed(fs_inst *inst)
 	   !inst->force_uncompressed &&
 	   !inst->force_sechalf);
 }
 
 void
 fs_instruction_scheduler::calculate_deps()
 {
    /* Pre-register-allocation, this tracks the last write per VGRF (so
     * different reg_offsets within it can interfere when they shouldn't).
     * After register allocation, reg_offsets are gone and we track individual
     * GRF registers.
     */
    schedule_node *last_grf_write[grf_count];
    schedule_node *last_mrf_write[BRW_MAX_MRF];
    schedule_node *last_conditional_mod[2] = { NULL, NULL };
+   schedule_node *last_accumulator_write = NULL;
    /* Fixed HW registers are assumed to be separate from the virtual
     * GRFs, so they can be tracked separately.  We don't really write
     * to fixed GRFs much, so don't bother tracking them on a more
     * granular level.
     */
    schedule_node *last_fixed_grf_write = NULL;
    int reg_width = v->dispatch_width / 8;
 
    /* The last instruction always needs to still be the last
     * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
     * WHILE) and scheduling other things after it would disturb the
     * basic block, or it's FB_WRITE and we should do a better job at
     * dead code elimination anyway.
     */
    schedule_node *last = (schedule_node *)instructions.get_tail();
@@ -788,52 +789,58 @@ fs_instruction_scheduler::calculate_deps()
             } else {
                add_dep(last_grf_write[inst->src[i].reg], n);
             }
 	 } else if (inst->src[i].file == HW_REG &&
 		    (inst->src[i].fixed_hw_reg.file ==
 		     BRW_GENERAL_REGISTER_FILE)) {
 	    if (post_reg_alloc) {
                int size = reg_width;
                if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
                   size = 1;
                for (int r = 0; r < size; r++)
                   add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
             } else {
                add_dep(last_fixed_grf_write, n);
             }
+         } else if (inst->src[i].is_accumulator()) {
+            add_dep(last_accumulator_write, n);
 	 } else if (inst->src[i].file != BAD_FILE &&
 		    inst->src[i].file != IMM &&
 		    inst->src[i].file != UNIFORM) {
 	    assert(inst->src[i].file != MRF);
 	    add_barrier_deps(n);
 	 }
       }
 
       if (inst->base_mrf != -1) {
 	 for (int i = 0; i < inst->mlen; i++) {
 	    /* It looks like the MRF regs are released in the send
 	     * instruction once it's sent, not when the result comes
 	     * back.
 	     */
 	    add_dep(last_mrf_write[inst->base_mrf + i], n);
 	 }
       }
 
       if (inst->reads_flag()) {
 	 add_dep(last_conditional_mod[inst->flag_subreg], n);
       }
 
+      if (inst->reads_accumulator_implicitly()) {
+         add_dep(last_accumulator_write, n);
+      }
+
       /* write-after-write deps. */
       if (inst->dst.file == GRF) {
          if (post_reg_alloc) {
             for (int r = 0; r < inst->regs_written * reg_width; r++) {
                add_dep(last_grf_write[inst->dst.reg + r], n);
                last_grf_write[inst->dst.reg + r] = n;
             }
          } else {
             add_dep(last_grf_write[inst->dst.reg], n);
             last_grf_write[inst->dst.reg] = n;
          }
       } else if (inst->dst.file == MRF) {
 	 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
 
 	 add_dep(last_mrf_write[reg], n);
@@ -842,51 +849,60 @@ fs_instruction_scheduler::calculate_deps()
 	    if (inst->dst.reg & BRW_MRF_COMPR4)
 	       reg += 4;
 	    else
 	       reg++;
 	    add_dep(last_mrf_write[reg], n);
 	    last_mrf_write[reg] = n;
 	 }
       } else if (inst->dst.file == HW_REG &&
 		 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
          if (post_reg_alloc) {
             for (int r = 0; r < reg_width; r++)
                last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
          } else {
             last_fixed_grf_write = n;
          }
+      } else if (inst->dst.is_accumulator()) {
+         add_dep(last_accumulator_write, n);
+         last_accumulator_write = n;
       } else if (inst->dst.file != BAD_FILE) {
 	 add_barrier_deps(n);
       }
 
       if (inst->mlen > 0 && inst->base_mrf != -1) {
 	 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
 	    add_dep(last_mrf_write[inst->base_mrf + i], n);
 	    last_mrf_write[inst->base_mrf + i] = n;
 	 }
       }
 
       if (inst->writes_flag()) {
 	 add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
 	 last_conditional_mod[inst->flag_subreg] = n;
       }
+
+      if (inst->writes_accumulator) {
+         add_dep(last_accumulator_write, n);
+         last_accumulator_write = n;
+      }
    }
 
    /* bottom-to-top dependencies: WAR */
    memset(last_grf_write, 0, sizeof(last_grf_write));
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
    memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
+   last_accumulator_write = NULL;
    last_fixed_grf_write = NULL;
 
    exec_node *node;
    exec_node *prev;
    for (node = instructions.get_tail(), prev = node->prev;
 	!node->is_head_sentinel();
 	node = prev, prev = node->prev) {
       schedule_node *n = (schedule_node *)node;
       fs_inst *inst = (fs_inst *)n->inst;
 
       /* write-after-read deps. */
       for (int i = 0; i < 3; i++) {
 	 if (inst->src[i].file == GRF) {
             if (post_reg_alloc) {
                for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
@@ -894,52 +910,58 @@ fs_instruction_scheduler::calculate_deps()
             } else {
                add_dep(n, last_grf_write[inst->src[i].reg]);
             }
 	 } else if (inst->src[i].file == HW_REG &&
 		    (inst->src[i].fixed_hw_reg.file ==
 		     BRW_GENERAL_REGISTER_FILE)) {
 	    if (post_reg_alloc) {
                int size = reg_width;
                if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
                   size = 1;
                for (int r = 0; r < size; r++)
                   add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
             } else {
                add_dep(n, last_fixed_grf_write);
             }
+         } else if (inst->src[i].is_accumulator()) {
+            add_dep(n, last_accumulator_write);
          } else if (inst->src[i].file != BAD_FILE &&
 		    inst->src[i].file != IMM &&
 		    inst->src[i].file != UNIFORM) {
 	    assert(inst->src[i].file != MRF);
 	    add_barrier_deps(n);
 	 }
       }
 
       if (inst->base_mrf != -1) {
 	 for (int i = 0; i < inst->mlen; i++) {
 	    /* It looks like the MRF regs are released in the send
 	     * instruction once it's sent, not when the result comes
 	     * back.
 	     */
 	    add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
 	 }
       }
 
       if (inst->reads_flag()) {
 	 add_dep(n, last_conditional_mod[inst->flag_subreg]);
       }
 
+      if (inst->reads_accumulator_implicitly()) {
+         add_dep(n, last_accumulator_write);
+      }
+
       /* Update the things this instruction wrote, so earlier reads
        * can mark this as WAR dependency.
        */
       if (inst->dst.file == GRF) {
          if (post_reg_alloc) {
             for (int r = 0; r < inst->regs_written * reg_width; r++)
                last_grf_write[inst->dst.reg + r] = n;
          } else {
             last_grf_write[inst->dst.reg] = n;
          }
       } else if (inst->dst.file == MRF) {
 	 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
 
 	 last_mrf_write[reg] = n;
 
@@ -947,52 +969,59 @@ fs_instruction_scheduler::calculate_deps()
 	    if (inst->dst.reg & BRW_MRF_COMPR4)
 	       reg += 4;
 	    else
 	       reg++;
 
 	    last_mrf_write[reg] = n;
 	 }
       } else if (inst->dst.file == HW_REG &&
 		 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
          if (post_reg_alloc) {
             for (int r = 0; r < reg_width; r++)
                last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
          } else {
             last_fixed_grf_write = n;
          }
+      } else if (inst->dst.is_accumulator()) {
+         last_accumulator_write = n;
       } else if (inst->dst.file != BAD_FILE) {
 	 add_barrier_deps(n);
       }
 
       if (inst->mlen > 0 && inst->base_mrf != -1) {
 	 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
 	    last_mrf_write[inst->base_mrf + i] = n;
 	 }
       }
 
       if (inst->writes_flag()) {
 	 last_conditional_mod[inst->flag_subreg] = n;
       }
+
+      if (inst->writes_accumulator) {
+         last_accumulator_write = n;
+      }
    }
 }
 
 void
 vec4_instruction_scheduler::calculate_deps()
 {
    schedule_node *last_grf_write[grf_count];
    schedule_node *last_mrf_write[BRW_MAX_MRF];
    schedule_node *last_conditional_mod = NULL;
+   schedule_node *last_accumulator_write = NULL;
    /* Fixed HW registers are assumed to be separate from the virtual
     * GRFs, so they can be tracked separately.  We don't really write
     * to fixed GRFs much, so don't bother tracking them on a more
     * granular level.
     */
    schedule_node *last_fixed_grf_write = NULL;
 
    /* The last instruction always needs to still be the last instruction.
     * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling
     * other things after it would disturb the basic block, or it's the EOT
     * URB_WRITE and we should do a better job at dead code eliminating
     * anything that could have been scheduled after it.
     */
    schedule_node *last = (schedule_node *)instructions.get_tail();
    add_barrier_deps(last);
@@ -1004,146 +1033,175 @@ vec4_instruction_scheduler::calculate_deps()
    foreach_list(node, &instructions) {
       schedule_node *n = (schedule_node *)node;
       vec4_instruction *inst = (vec4_instruction *)n->inst;
 
       if (inst->has_side_effects())
          add_barrier_deps(n);
 
       /* read-after-write deps. */
       for (int i = 0; i < 3; i++) {
          if (inst->src[i].file == GRF) {
             add_dep(last_grf_write[inst->src[i].reg], n);
          } else if (inst->src[i].file == HW_REG &&
                     (inst->src[i].fixed_hw_reg.file ==
                      BRW_GENERAL_REGISTER_FILE)) {
             add_dep(last_fixed_grf_write, n);
+         } else if (inst->src[i].is_accumulator()) {
+            assert(last_accumulator_write);
+            add_dep(last_accumulator_write, n);
          } else if (inst->src[i].file != BAD_FILE &&
                     inst->src[i].file != IMM &&
                     inst->src[i].file != UNIFORM) {
             /* No reads from MRF, and ATTR is already translated away */
             assert(inst->src[i].file != MRF &&
                    inst->src[i].file != ATTR);
             add_barrier_deps(n);
          }
       }
 
       for (int i = 0; i < inst->mlen; i++) {
          /* It looks like the MRF regs are released in the send
           * instruction once it's sent, not when the result comes
           * back.
           */
          add_dep(last_mrf_write[inst->base_mrf + i], n);
       }
 
       if (inst->reads_flag()) {
          assert(last_conditional_mod);
          add_dep(last_conditional_mod, n);
       }
 
+      if (inst->reads_accumulator_implicitly()) {
+         assert(last_accumulator_write);
+         add_dep(last_accumulator_write, n);
+      }
+
       /* write-after-write deps. */
       if (inst->dst.file == GRF) {
          add_dep(last_grf_write[inst->dst.reg], n);
          last_grf_write[inst->dst.reg] = n;
       } else if (inst->dst.file == MRF) {
          add_dep(last_mrf_write[inst->dst.reg], n);
          last_mrf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == HW_REG &&
                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
          last_fixed_grf_write = n;
+      } else if (inst->dst.is_accumulator()) {
+         add_dep(last_accumulator_write, n);
+         last_accumulator_write = n;
       } else if (inst->dst.file != BAD_FILE) {
          add_barrier_deps(n);
       }
 
       if (inst->mlen > 0) {
          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
             add_dep(last_mrf_write[inst->base_mrf + i], n);
             last_mrf_write[inst->base_mrf + i] = n;
          }
       }
 
       if (inst->writes_flag()) {
          add_dep(last_conditional_mod, n, 0);
          last_conditional_mod = n;
       }
+
+      if (inst->writes_accumulator) {
+         add_dep(last_accumulator_write, n);
+         last_accumulator_write = n;
+      }
    }
 
    /* bottom-to-top dependencies: WAR */
    memset(last_grf_write, 0, sizeof(last_grf_write));
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
    last_conditional_mod = NULL;
+   last_accumulator_write = NULL;
    last_fixed_grf_write = NULL;
 
    exec_node *node;
    exec_node *prev;
    for (node = instructions.get_tail(), prev = node->prev;
         !node->is_head_sentinel();
         node = prev, prev = node->prev) {
       schedule_node *n = (schedule_node *)node;
       vec4_instruction *inst = (vec4_instruction *)n->inst;
 
       /* write-after-read deps. */
       for (int i = 0; i < 3; i++) {
          if (inst->src[i].file == GRF) {
             add_dep(n, last_grf_write[inst->src[i].reg]);
          } else if (inst->src[i].file == HW_REG &&
                     (inst->src[i].fixed_hw_reg.file ==
                      BRW_GENERAL_REGISTER_FILE)) {
             add_dep(n, last_fixed_grf_write);
+         } else if (inst->src[i].is_accumulator()) {
+            add_dep(n, last_accumulator_write);
          } else if (inst->src[i].file != BAD_FILE &&
                     inst->src[i].file != IMM &&
                     inst->src[i].file != UNIFORM) {
             assert(inst->src[i].file != MRF &&
                    inst->src[i].file != ATTR);
             add_barrier_deps(n);
          }
       }
 
       for (int i = 0; i < inst->mlen; i++) {
          /* It looks like the MRF regs are released in the send
           * instruction once it's sent, not when the result comes
           * back.
           */
          add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
       }
 
       if (inst->reads_flag()) {
          add_dep(n, last_conditional_mod);
       }
 
+      if (inst->reads_accumulator_implicitly()) {
+         add_dep(n, last_accumulator_write);
+      }
+
       /* Update the things this instruction wrote, so earlier reads
        * can mark this as WAR dependency.
        */
       if (inst->dst.file == GRF) {
          last_grf_write[inst->dst.reg] = n;
       } else if (inst->dst.file == MRF) {
          last_mrf_write[inst->dst.reg] = n;
       } else if (inst->dst.file == HW_REG &&
                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
          last_fixed_grf_write = n;
+      } else if (inst->dst.is_accumulator()) {
+         last_accumulator_write = n;
       } else if (inst->dst.file != BAD_FILE) {
          add_barrier_deps(n);
       }
 
       if (inst->mlen > 0) {
          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
             last_mrf_write[inst->base_mrf + i] = n;
          }
       }
 
       if (inst->writes_flag()) {
          last_conditional_mod = n;
       }
+
+      if (inst->writes_accumulator) {
+         last_accumulator_write = n;
+      }
    }
 }
 
 schedule_node *
 fs_instruction_scheduler::choose_instruction_to_schedule()
 {
    schedule_node *chosen = NULL;
 
    if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
       int chosen_time = 0;
 
       /* Of the instructions ready to execute or the closest to
        * being ready, choose the oldest one.
        */
       foreach_list(node, &instructions) {
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 9ef08e5..e730ed0 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -48,30 +48,31 @@ public:
    bool can_do_source_mods() const;
    bool can_do_saturate() const;
    bool reads_accumulator_implicitly() const;
 
    /**
     * True if the instruction has side effects other than writing to
     * its destination registers.  You are expected not to reorder or
     * optimize these out unless you know what you are doing.
     */
    bool has_side_effects() const;
 
    enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
 
    uint8_t predicate;
    bool predicate_inverse;
+   bool writes_accumulator; /**< instruction implicitly writes accumulator */
 };
 
 enum instruction_scheduler_mode {
    SCHEDULE_PRE,
    SCHEDULE_PRE_NON_LIFO,
    SCHEDULE_PRE_LIFO,
    SCHEDULE_POST,
 };
 
 class backend_visitor : public ir_visitor {
 protected:
 
    backend_visitor(struct brw_context *brw,
                    struct gl_shader_program *shader_prog,
                    struct gl_program *prog,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 38d2b93..acce0377 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -338,43 +338,36 @@ src_reg::equals(src_reg *r)
 	   imm.u == r->imm.u);
 }
 
 static bool
 try_eliminate_instruction(vec4_instruction *inst, int new_writemask,
                           const struct brw_context *brw)
 {
    if (inst->has_side_effects())
       return false;
 
    if (new_writemask == 0) {
       /* Don't dead code eliminate instructions that write to the
        * accumulator as a side-effect. Instead just set the destination
        * to the null register to free it.
        */
-      switch (inst->opcode) {
-      case BRW_OPCODE_ADDC:
-      case BRW_OPCODE_SUBB:
-      case BRW_OPCODE_MACH:
+      if (inst->writes_accumulator || inst->writes_flag()) {
          inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
-         break;
-      default:
-         if (inst->writes_flag()) {
-            inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
-         } else {
-            inst->remove();
-         }
+      } else {
+         inst->remove();
       }
+
       return true;
    } else if (inst->dst.writemask != new_writemask) {
       switch (inst->opcode) {
       case SHADER_OPCODE_TXF_CMS:
       case SHADER_OPCODE_GEN4_SCRATCH_READ:
       case VS_OPCODE_PULL_CONSTANT_LOAD:
       case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
          break;
       default:
          /* Do not set a writemask on Gen6 for math instructions, those are
           * executed using align1 mode that does not support a destination mask.
           */
          if (!(brw->gen == 6 && inst->is_math()) && !inst->is_tex()) {
             inst->dst.writemask = new_writemask;
             return true;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index a74514f..5f85d31 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -959,33 +959,31 @@ vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
             src[i] = stride(src[i], 4, 4, 1);
       }
    }
 
    switch (inst->opcode) {
    case BRW_OPCODE_MOV:
       brw_MOV(p, dst, src[0]);
       break;
    case BRW_OPCODE_ADD:
       brw_ADD(p, dst, src[0], src[1]);
       break;
    case BRW_OPCODE_MUL:
       brw_MUL(p, dst, src[0], src[1]);
       break;
    case BRW_OPCODE_MACH:
-      brw_set_acc_write_control(p, 1);
       brw_MACH(p, dst, src[0], src[1]);
-      brw_set_acc_write_control(p, 0);
       break;
 
    case BRW_OPCODE_MAD:
       assert(brw->gen >= 6);
       brw_MAD(p, dst, src[0], src[1], src[2]);
       break;
 
    case BRW_OPCODE_FRC:
       brw_FRC(p, dst, src[0]);
       break;
    case BRW_OPCODE_RNDD:
       brw_RNDD(p, dst, src[0]);
       break;
    case BRW_OPCODE_RNDE:
       brw_RNDE(p, dst, src[0]);
@@ -1065,39 +1063,35 @@ vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
       /* FBH only supports UD type for dst. */
       brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
       break;
    case BRW_OPCODE_FBL:
       assert(brw->gen >= 7);
       /* FBL only supports UD type for dst. */
       brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
       break;
    case BRW_OPCODE_CBIT:
       assert(brw->gen >= 7);
       /* CBIT only supports UD type for dst. */
       brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
       break;
    case BRW_OPCODE_ADDC:
       assert(brw->gen >= 7);
-      brw_set_acc_write_control(p, 1);
       brw_ADDC(p, dst, src[0], src[1]);
-      brw_set_acc_write_control(p, 0);
       break;
    case BRW_OPCODE_SUBB:
       assert(brw->gen >= 7);
-      brw_set_acc_write_control(p, 1);
       brw_SUBB(p, dst, src[0], src[1]);
-      brw_set_acc_write_control(p, 0);
       break;
 
    case BRW_OPCODE_BFE:
       assert(brw->gen >= 7);
       brw_BFE(p, dst, src[0], src[1], src[2]);
       break;
 
    case BRW_OPCODE_BFI1:
       assert(brw->gen >= 7);
       brw_BFI1(p, dst, src[0], src[1]);
       break;
    case BRW_OPCODE_BFI2:
       assert(brw->gen >= 7);
       brw_BFI2(p, dst, src[0], src[1], src[2]);
       break;
@@ -1305,30 +1299,31 @@ vec4_generator::generate_code(exec_list *instructions)
 	    if (last_annotation_string)
 	       fprintf(stderr, "   %s\n", last_annotation_string);
 	 }
       }
 
       for (unsigned int i = 0; i < 3; i++) {
 	 src[i] = inst->get_src(this->prog_data, i);
       }
       dst = inst->get_dst();
 
       brw_set_conditionalmod(p, inst->conditional_mod);
       brw_set_predicate_control(p, inst->predicate);
       brw_set_predicate_inverse(p, inst->predicate_inverse);
       brw_set_saturate(p, inst->saturate);
       brw_set_mask_control(p, inst->force_writemask_all);
+      brw_set_acc_write_control(p, inst->writes_accumulator);
 
       unsigned pre_emit_nr_insn = p->nr_insn;
 
       generate_vec4_instruction(inst, dst, src);
 
       if (inst->no_dd_clear || inst->no_dd_check) {
          assert(p->nr_insn == pre_emit_nr_insn + 1 ||
                 !"no_dd_check or no_dd_clear set for IR emitting more "
                 "than 1 instruction");
 
          struct brw_instruction *last = &p->store[pre_emit_nr_insn];
 
          if (inst->no_dd_clear)
             last->header.dependency_control |= BRW_DEPENDENCY_NOTCLEARED;
          if (inst->no_dd_check)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index edace10..3a76442 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -30,30 +30,31 @@ extern "C" {
 namespace brw {
 
 vec4_instruction::vec4_instruction(vec4_visitor *v,
 				   enum opcode opcode, dst_reg dst,
 				   src_reg src0, src_reg src1, src_reg src2)
 {
    this->opcode = opcode;
    this->dst = dst;
    this->src[0] = src0;
    this->src[1] = src1;
    this->src[2] = src2;
    this->saturate = false;
    this->force_writemask_all = false;
    this->no_dd_clear = false;
    this->no_dd_check = false;
+   this->writes_accumulator = false;
    this->conditional_mod = BRW_CONDITIONAL_NONE;
    this->sampler = 0;
    this->texture_offset = 0;
    this->target = 0;
    this->shadow_compare = false;
    this->ir = v->base_ir;
    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
    this->header_present = false;
    this->mlen = 0;
    this->base_mrf = 0;
    this->offset = 0;
    this->annotation = v->current_annotation;
 }
 
 vec4_instruction *
@@ -112,70 +113,80 @@ vec4_visitor::emit(enum opcode opcode)
    vec4_instruction *							\
    vec4_visitor::op(dst_reg dst, src_reg src0)				\
    {									\
       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
 					   src0);			\
    }
 
 #define ALU2(op)							\
    vec4_instruction *							\
    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)		\
    {									\
       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
 					   src0, src1);			\
    }
 
+#define ALU2_ACC(op)							\
+   vec4_instruction *							\
+   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)		\
+   {									\
+      vec4_instruction *inst = new(mem_ctx) vec4_instruction(this,     \
+                       BRW_OPCODE_##op, dst, src0, src1);		\
+      inst->writes_accumulator = true;                                 \
+      return inst;                                                     \
+   }
+
 #define ALU3(op)							\
    vec4_instruction *							\
    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
    {									\
       assert(brw->gen >= 6);						\
       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
 					   src0, src1, src2);		\
    }
 
 ALU1(NOT)
 ALU1(MOV)
 ALU1(FRC)
 ALU1(RNDD)
 ALU1(RNDE)
 ALU1(RNDZ)
 ALU1(F32TO16)
 ALU1(F16TO32)
 ALU2(ADD)
 ALU2(MUL)
-ALU2(MACH)
+ALU2_ACC(MACH)
 ALU2(AND)
 ALU2(OR)
 ALU2(XOR)
 ALU2(DP3)
 ALU2(DP4)
 ALU2(DPH)
 ALU2(SHL)
 ALU2(SHR)
 ALU2(ASR)
 ALU3(LRP)
 ALU1(BFREV)
 ALU3(BFE)
 ALU2(BFI1)
 ALU3(BFI2)
 ALU1(FBH)
 ALU1(FBL)
 ALU1(CBIT)
 ALU3(MAD)
-ALU2(ADDC)
-ALU2(SUBB)
+ALU2_ACC(ADDC)
+ALU2_ACC(SUBB)
 
 /** Gen4 predicated IF. */
 vec4_instruction *
 vec4_visitor::IF(uint32_t predicate)
 {
    vec4_instruction *inst;
 
    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
    inst->predicate = predicate;
 
    return inst;
 }
 
 /** Gen6 IF with embedded comparison. */
 vec4_instruction *
-- 
1.8.3.2