Mesa (master): i965: Move the GRF-to-MRF optimizations to brw_optimize.c.

Eric Anholt anholt at kemper.freedesktop.org
Mon Jul 26 20:08:39 UTC 2010


Module: Mesa
Branch: master
Commit: 22f839292f48a47601e1b97a7f4679018c42d0ed
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=22f839292f48a47601e1b97a7f4679018c42d0ed

Author: Eric Anholt <eric at anholt.net>
Date:   Mon Jul 26 12:41:39 2010 -0700

i965: Move the GRF-to-MRF optimizations to brw_optimize.c.

---

 src/mesa/drivers/dri/i965/brw_eu.h       |    2 +
 src/mesa/drivers/dri/i965/brw_optimize.c |  613 +++++++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_wm_emit.c  |  622 +-----------------------------
 3 files changed, 618 insertions(+), 619 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 31ff86c..bc15173 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -984,5 +984,7 @@ void brw_set_src1( struct brw_instruction *insn,
 
 /* brw_optimize.c */
 void brw_optimize(struct brw_compile *p);
+void brw_remove_duplicate_mrf_moves(struct brw_compile *p);
+void brw_remove_mrf_to_grf_moves(struct brw_compile *p);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_optimize.c b/src/mesa/drivers/dri/i965/brw_optimize.c
index a364b15..136dbbd 100644
--- a/src/mesa/drivers/dri/i965/brw_optimize.c
+++ b/src/mesa/drivers/dri/i965/brw_optimize.c
@@ -32,6 +32,619 @@
 #include "brw_defines.h"
 #include "brw_eu.h"
 
+#define BRW_MRF_NUM 16
+#define BRW_SIZE_OF_REG 32
+
+static INLINE
+GLboolean brw_is_arithmetic_inst(const struct brw_instruction *inst)
+{
+   switch (inst->header.opcode) {
+      case BRW_OPCODE_MOV:
+      case BRW_OPCODE_SEL:
+      case BRW_OPCODE_NOT:
+      case BRW_OPCODE_AND:
+      case BRW_OPCODE_OR:
+      case BRW_OPCODE_XOR:
+      case BRW_OPCODE_SHR:
+      case BRW_OPCODE_SHL:
+      case BRW_OPCODE_RSR:
+      case BRW_OPCODE_RSL:
+      case BRW_OPCODE_ADD:
+      case BRW_OPCODE_MUL:
+      case BRW_OPCODE_AVG:
+      case BRW_OPCODE_FRC:
+      case BRW_OPCODE_RNDU:
+      case BRW_OPCODE_RNDD:
+      case BRW_OPCODE_RNDE:
+      case BRW_OPCODE_RNDZ:
+      case BRW_OPCODE_MAC:
+      case BRW_OPCODE_MACH:
+      case BRW_OPCODE_LINE:
+         return GL_TRUE;
+      default:
+         return GL_FALSE;
+   }
+}
+
+static const struct {
+    char    *name;
+    int	    nsrc;
+    int	    ndst;
+} inst_opcode[128] = {
+    [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
+
+    [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
+};
+
+static const GLuint inst_stride[7] = {
+    [0] = 0,
+    [1] = 1,
+    [2] = 2,
+    [3] = 4,
+    [4] = 8,
+    [5] = 16,
+    [6] = 32
+};
+
+static const GLuint inst_type_size[8] = {
+    [0] = 4,
+    [1] = 4,
+    [2] = 2,
+    [3] = 2,
+    [4] = 1,
+    [5] = 1,
+    [7] = 4
+};
+
+#define BRW_MAX_OFFSET(x0,x1) ((x0) > (x1) ? (x0) : (x1)) /* larger of two byte offsets */
+#define BRW_MIN_OFFSET(x0,x1) ((x0) < (x1) ? (x0) : (x1)) /* smaller one; no trailing ';' */
+
+/* Tell whether `inst` may write any byte of the `size`-byte GRF range that
+ * starts at register `reg_index`. Indirect GRF destinations are conservatively
+ * reported as written. `gen` selects the SEND descriptor layout used to read
+ * the response length.
+ */
+static INLINE GLboolean
+brw_is_grf_written(const struct brw_instruction *inst,
+                   int reg_index, int size,
+                   int gen)
+{
+   if (inst_opcode[inst->header.opcode].ndst == 0)
+      return GL_FALSE;
+   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
+      if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE)
+         return GL_TRUE;
+   if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
+      return GL_FALSE;
+
+   const int reg_start = reg_index * BRW_SIZE_OF_REG;
+   const int reg_end = reg_start + size;
+   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
+   const int write_start = inst->bits1.da1.dest_reg_nr*BRW_SIZE_OF_REG
+                         + inst->bits1.da1.dest_subreg_nr;
+   int length, write_end;
+
+   /* SEND is specific: the written size is the response length */
+   if (inst->header.opcode == BRW_OPCODE_SEND) {
+      if (gen >= 5)
+         length = inst->bits3.generic_gen5.response_length*BRW_SIZE_OF_REG;
+      else
+         length = inst->bits3.generic.response_length*BRW_SIZE_OF_REG;
+   } else {
+      /* The stride field is an encoding (3 means 4 elements): decode it */
+      length = (1 << inst->header.execution_size) * type_size;
+      length *= inst_stride[inst->bits1.da1.dest_horiz_stride];
+   }
+
+   /* If the two intervals intersect, we overwrite the register */
+   write_end = write_start + length;
+   const int left = BRW_MAX_OFFSET(write_start, reg_start);
+   const int right = BRW_MIN_OFFSET(write_end, reg_end);
+   return left < right;
+}
+
+/* Tell whether `inst` may write any byte of the `size`-byte MRF range starting
+ * at register `reg_index`. Message registers get their own path because a
+ * compr4 destination writes mrf_{i} and mrf_{i+4}.
+ */
+static INLINE GLboolean
+brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size)
+{
+   const int reg_start = reg_index * BRW_SIZE_OF_REG;
+   const int reg_end = reg_start + size;
+
+   /* SEND may perform an implicit mov to a mrf register. Check it before the
+    * destination tests: the destination of a SEND need not be a MRF at all.
+    */
+   if (inst->header.opcode == BRW_OPCODE_SEND &&
+       inst->bits1.da1.src0_reg_file != 0) {
+      const int mrf_start = inst->header.destreg__conditionalmod;
+      const int write_start = mrf_start * BRW_SIZE_OF_REG;
+      const int write_end = write_start + BRW_SIZE_OF_REG;
+      const int left = BRW_MAX_OFFSET(write_start, reg_start);
+      const int right = BRW_MIN_OFFSET(write_end, reg_end);
+      if (left < right)
+         return GL_TRUE;
+   }
+
+   if (inst_opcode[inst->header.opcode].ndst == 0)
+      return GL_FALSE;
+   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
+      if (inst->bits1.ia1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE)
+         return GL_TRUE;
+   if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE)
+      return GL_FALSE;
+
+   const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f;
+   const int is_compr4 = inst->bits1.da1.dest_reg_nr & 0xf0;
+   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
+   /* The stride field is an encoding (3 means 4 elements): decode it */
+   const int stride = inst_stride[inst->bits1.da1.dest_horiz_stride];
+
+   /* We use compr4 with a size != 16 elements. Strange, we conservatively
+    * consider that we are writing the register.
+    */
+   if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16)
+      return GL_TRUE;
+
+   /* Here we write mrf_{i} and mrf_{i+4}: two times 8 elements */
+   if (is_compr4) {
+      const int length = 8 * type_size * stride;
+
+      /* First 8-way register */
+      const int write_start0 = mrf_index*BRW_SIZE_OF_REG
+                             + inst->bits1.da1.dest_subreg_nr;
+      const int write_end0 = write_start0 + length;
+
+      /* Second 8-way register */
+      const int write_start1 = (mrf_index+4)*BRW_SIZE_OF_REG
+                             + inst->bits1.da1.dest_subreg_nr;
+      const int write_end1 = write_start1 + length;
+
+      /* If the two intervals intersect, we overwrite the register */
+      const int left0 = BRW_MAX_OFFSET(write_start0, reg_start);
+      const int right0 = BRW_MIN_OFFSET(write_end0, reg_end);
+      const int left1 = BRW_MAX_OFFSET(write_start1, reg_start);
+      const int right1 = BRW_MIN_OFFSET(write_end1, reg_end);
+
+      return left0 < right0 || left1 < right1;
+   }
+
+   /* Plain destination: execution size * type size * stride bytes */
+   int length;
+   length = 1 << inst->header.execution_size;
+   length *= type_size;
+   length *= stride;
+
+   /* If the two intervals intersect, we write into the register */
+   const int write_start = inst->bits1.da1.dest_reg_nr*BRW_SIZE_OF_REG
+                         + inst->bits1.da1.dest_subreg_nr;
+   const int write_end = write_start + length;
+   const int left = BRW_MAX_OFFSET(write_start, reg_start);
+   const int right = BRW_MIN_OFFSET(write_end, reg_end);
+
+   return left < right;
+}
+
+/* Tell whether `inst` is a SEND whose message payload may overlap the
+ * `size`-byte MRF range starting at register `reg_index`. `gen` selects the
+ * descriptor layout used to read the message length.
+ */
+static INLINE GLboolean
+brw_is_mrf_read(const struct brw_instruction *inst,
+                int reg_index, int size, int gen)
+{
+   if (inst->header.opcode != BRW_OPCODE_SEND)
+      return GL_FALSE;
+   if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
+      return GL_TRUE;
+
+   const int reg_start = reg_index*BRW_SIZE_OF_REG;
+   const int reg_end = reg_start + size;
+
+   int length;
+   if (gen >= 5)
+      length = inst->bits3.generic_gen5.msg_length*BRW_SIZE_OF_REG;
+   else
+      length = inst->bits3.generic.msg_length*BRW_SIZE_OF_REG;
+
+   /* Treat the whole payload, msg_length registers from the message register
+    * on, as read, whether or not SEND fills the first one with an implicit mov.
+    * Counting that first register is conservative; skipping it when there is
+    * no implicit mov (src0_reg_file == 0) could hide a real read.
+    */
+   const int read_start = inst->header.destreg__conditionalmod*BRW_SIZE_OF_REG;
+   const int read_end = read_start + length;
+
+   const int left = BRW_MAX_OFFSET(read_start, reg_start);
+   const int right = BRW_MIN_OFFSET(read_end, reg_end);
+
+   return left < right;
+}
+
+/* Tell whether `inst` may read, through either source operand, any byte of
+ * the `size`-byte GRF range starting at register `reg_index`.
+ */
+static INLINE GLboolean
+brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size)
+{
+   int i, j;
+   if (inst_opcode[inst->header.opcode].nsrc == 0)
+      return GL_FALSE;
+
+   /* Look at first source. We must take into account register regions to
+    * monitor carefully the read. Note that we are a bit too conservative here
+    * since we do not take into account the fact that some complete registers
+    * may be skipped. A first source outside the GRF must not end the test:
+    * the second source still has to be examined.
+    */
+   if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT) {
+      if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE)
+         return GL_TRUE;
+   }
+   else if (inst->bits1.da1.src0_reg_file == BRW_GENERAL_REGISTER_FILE) {
+      const int reg_start = reg_index*BRW_SIZE_OF_REG;
+      const int reg_end = reg_start + size;
+
+      /* See if at least one of this element intersects the interval */
+      const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type];
+      const int elem_num = 1 << inst->header.execution_size;
+      const int width = 1 << inst->bits2.da1.src0_width;
+      const int row_num = elem_num >> inst->bits2.da1.src0_width;
+      const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride];
+      const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride];
+      int row_start = inst->bits2.da1.src0_reg_nr*BRW_SIZE_OF_REG
+                    + inst->bits2.da1.src0_subreg_nr;
+      for (j = 0; j < row_num; ++j) {
+         int elem_start = row_start;
+         for (i = 0; i < width; ++i) {
+            const int elem_end = elem_start + type_size;
+            const int left = elem_start > reg_start ? elem_start : reg_start;
+            const int right = elem_end < reg_end ? elem_end : reg_end;
+            if (left < right)
+               return GL_TRUE;
+            elem_start += hs;
+         }
+         row_start += vs;
+      }
+   }
+
+   /* Second src register */
+   if (inst_opcode[inst->header.opcode].nsrc >= 2) {
+      if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT)
+         if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE)
+            return GL_TRUE;
+      if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE)
+         return GL_FALSE;
+
+      const int reg_start = reg_index*BRW_SIZE_OF_REG;
+      const int reg_end = reg_start + size;
+
+      /* See if at least one of this element intersects the interval */
+      const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type];
+      const int elem_num = 1 << inst->header.execution_size;
+      const int width = 1 << inst->bits3.da1.src1_width;
+      const int row_num = elem_num >> inst->bits3.da1.src1_width;
+      const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride];
+      const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride];
+      int row_start = inst->bits3.da1.src1_reg_nr*BRW_SIZE_OF_REG
+                    + inst->bits3.da1.src1_subreg_nr;
+      for (j = 0; j < row_num; ++j) {
+         int elem_start = row_start;
+         for (i = 0; i < width; ++i) {
+            const int elem_end = elem_start + type_size;
+            const int left = elem_start > reg_start ? elem_start : reg_start;
+            const int right = elem_end < reg_end ? elem_end : reg_end;
+            if (left < right)
+               return GL_TRUE;
+            elem_start += hs;
+         }
+         row_start += vs;
+      }
+   }
+
+   return GL_FALSE;
+}
+
+static INLINE GLboolean
+brw_is_control_done(const struct brw_instruction *mov) {
+   return
+       mov->header.dependency_control != 0 ||
+       mov->header.thread_control != 0 ||
+       mov->header.mask_control != 0 ||
+       mov->header.saturate != 0 ||
+       mov->header.debug_control != 0;
+}
+
+static INLINE GLboolean
+brw_is_predicated(const struct brw_instruction *mov) {
+   return mov->header.predicate_control != 0;
+}
+
+/* Tell whether `mov` is a straight grf-to-mrf move: an unpredicated MOV with
+ * no control bits set, a direct MRF destination and a direct, unmodified GRF
+ * source region. On success the register numbers and the compr4 flag (high
+ * bits of the MRF number) are stored through the pointers. */
+static INLINE GLboolean
+brw_is_grf_to_mrf_mov(const struct brw_instruction *mov,
+                      int *mrf_index, int *grf_index, GLboolean *is_compr4)
+{
+   if (mov->header.opcode != BRW_OPCODE_MOV ||
+       brw_is_predicated(mov) ||
+       brw_is_control_done(mov))
+      return GL_FALSE;
+   if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT ||
+       mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE ||
+       mov->bits1.da1.dest_reg_type != 7 ||
+       mov->bits1.da1.dest_horiz_stride != 1 ||
+       mov->bits1.da1.dest_subreg_nr != 0)
+      return GL_FALSE;
+   if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
+       mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE ||
+       mov->bits1.da1.src0_reg_type != 7 ||
+       mov->bits2.da1.src0_width != 3 ||
+       mov->bits2.da1.src0_horiz_stride != 1 ||
+       mov->bits2.da1.src0_vert_stride != 4 ||
+       mov->bits2.da1.src0_subreg_nr != 0 ||
+       mov->bits2.da1.src0_abs != 0 ||
+       mov->bits2.da1.src0_negate != 0)
+      return GL_FALSE;
+
+   *grf_index = mov->bits2.da1.src0_reg_nr;
+   *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f;
+   *is_compr4 = (mov->bits1.da1.dest_reg_nr & 0xf0) != 0;
+   return GL_TRUE;
+}
+
+static INLINE GLboolean
+brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index)
+{
+   /* remark: no problem to predicate a SEL instruction */
+   if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) &&
+       brw_is_control_done(inst) == GL_FALSE &&
+       inst->header.execution_size == 4 &&
+       inst->header.access_mode == BRW_ALIGN_1 &&
+       inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT &&
+       inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE &&
+       inst->bits1.da1.dest_reg_type == 7 &&
+       inst->bits1.da1.dest_horiz_stride == 1 &&
+       inst->bits1.da1.dest_reg_nr == grf_index &&
+       inst->bits1.da1.dest_subreg_nr == 0 &&
+       brw_is_arithmetic_inst(inst))
+      return GL_TRUE;
+
+   return GL_FALSE;
+}
+
+static INLINE GLboolean
+brw_inst_are_equal(const struct brw_instruction *src0,
+                   const struct brw_instruction *src1)
+{
+   const GLuint *field0 = (GLuint *) src0;
+   const GLuint *field1 = (GLuint *) src1;
+   return field0[0] == field1[0] &&
+          field0[1] == field1[1] &&
+          field0[2] == field1[2] &&
+          field0[3] == field1[3];
+}
+
+static INLINE void
+brw_inst_copy(struct brw_instruction *dst,
+              const struct brw_instruction *src)
+{
+   GLuint *field_dst = (GLuint *) dst;
+   const GLuint *field_src = (GLuint *) src;
+   field_dst[0] = field_src[0];
+   field_dst[1] = field_src[1];
+   field_dst[2] = field_src[2];
+   field_dst[3] = field_src[3];
+}
+
+/* Compact p->store in place: drop every instruction whose removeInst[] flag
+ * is set, keep the others in their original order and update p->nr_insn.
+ */
+static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst)
+{
+   int to = 0, from;
+
+   for (from = 0; from < p->nr_insn; ++from) {
+      if (removeInst[from])
+         continue;
+      if (to != from)
+         brw_inst_copy(p->store + to, p->store + from);
+      to++;
+   }
+   /* `to` already counts the survivors: no second counting pass is needed */
+   p->nr_insn = to;
+}
+
+/* The gen code emitter generates a lot of duplicated grf-to-mrf moves.
+ * Here, we look for identical grf-to-mrf mov instructions and remove the later
+ * copies as long as none of the two operands has been written in between.
+ */
+void brw_remove_duplicate_mrf_moves(struct brw_compile *p)
+{
+   const int gen = p->brw->intel.gen;
+   int i, j;
+
+   /* This is only an optimization: skip it if the flag array is unavailable */
+   GLboolean *removeInst = calloc(p->nr_insn, sizeof(GLboolean));
+   if (removeInst == NULL)
+      return;
+   for (i = 0; i < p->nr_insn; i++) {
+      if (removeInst[i])
+         continue;
+      const struct brw_instruction *mov = p->store + i;
+      int mrf_index, grf_index;
+      GLboolean is_compr4;
+
+      /* Only consider _straight_ grf-to-mrf moves */
+      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
+         continue;
+
+      const int mrf_index0 = mrf_index;
+      const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1;
+      const int simd16_size = 2 * BRW_SIZE_OF_REG;
+
+      for (j = i + 1; j < p->nr_insn; j++) {
+         const struct brw_instruction *inst = p->store + j;
+         if (brw_inst_are_equal(mov, inst)) {
+            removeInst[j] = GL_TRUE;
+            continue;
+         }
+
+         if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
+             brw_is_mrf_written(inst, mrf_index0, BRW_SIZE_OF_REG) ||
+             brw_is_mrf_written(inst, mrf_index1, BRW_SIZE_OF_REG))
+            break;
+      }
+   }
+
+   brw_remove_inst(p, removeInst);
+   free(removeInst);
+}
+
+/* For each straight grf-to-mrf mov, look backwards for the instruction that
+ * computes the grf with a straight write. If the grf and the mrf are left alone
+ * between the two, and the grf is not read again before being overwritten,
+ * retarget that instruction to the mrf and remove the mov. Being only an
+ * optimization, the pass is skipped when the flag array cannot be allocated.
+ */
+void brw_remove_mrf_to_grf_moves(struct brw_compile *p)
+{
+   int i, j, prev;
+   struct brw_context *brw = p->brw;
+   const int gen = brw->intel.gen;
+   const int simd16_size = 2*BRW_SIZE_OF_REG;
+
+   GLboolean *removeInst = calloc(p->nr_insn, sizeof(GLboolean));
+   if (removeInst == NULL)
+      return;
+   for (i = 0; i < p->nr_insn; i++) {
+      if (removeInst[i])
+         continue;
+      struct brw_instruction *grf_inst = NULL;
+      const struct brw_instruction *mov = p->store + i;
+      int mrf_index, grf_index;
+      GLboolean is_compr4;
+      /* Only consider _straight_ grf-to-mrf moves */
+      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
+         continue;
+
+      /* Using compr4 enables a stride of 4 for this instruction */
+      const int mrf_index0 = mrf_index;
+      const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1;
+
+      /* Look where the register has been set */
+      prev = i;
+      GLboolean potential_remove = GL_FALSE;
+      while (prev--) {
+         /* If _one_ instruction writes the grf, we try to remove the mov */
+         struct brw_instruction *inst = p->store + prev;
+         if (brw_is_grf_straight_write(inst, grf_index)) {
+            potential_remove = GL_TRUE;
+            grf_inst = inst;
+            break;
+         }
+      }
+
+      if (potential_remove == GL_FALSE)
+         continue;
+      removeInst[i] = GL_TRUE;
+
+      /* Monitor first the section of code between the grf computation and the
+       * mov. Here nothing may read or write either the mrf or the grf register
+       */
+      for (j = prev + 1; j < i; ++j) {
+         struct brw_instruction *inst = p->store + j;
+         if (removeInst[j])
+            continue;
+         if (brw_is_grf_written(inst, grf_index, simd16_size, gen)   ||
+             brw_is_grf_read(inst, grf_index, simd16_size)           ||
+             brw_is_mrf_written(inst, mrf_index0, BRW_SIZE_OF_REG)   ||
+             brw_is_mrf_written(inst, mrf_index1, BRW_SIZE_OF_REG)   ||
+             brw_is_mrf_read(inst, mrf_index0, BRW_SIZE_OF_REG, gen) ||
+             brw_is_mrf_read(inst, mrf_index1, BRW_SIZE_OF_REG, gen)) {
+            removeInst[i] = GL_FALSE;
+            break;
+         }
+      }
+
+      /* After the mov, we can read or write the mrf. If the grf is overwritten,
+       * we are done
+       */
+      for (j = i + 1; j < p->nr_insn; ++j) {
+         struct brw_instruction *inst = p->store + j;
+         if (removeInst[j])
+            continue;
+         if (brw_is_grf_read(inst, grf_index, simd16_size)) {
+            removeInst[i] = GL_FALSE;
+            break;
+         }
+         if (brw_is_grf_straight_write(inst, grf_index))
+            break;
+      }
+
+      /* Note that with the top down traversal, we can safely patch the
+       * instruction that computes the grf so that it writes the mrf directly
+       */
+      if (removeInst[i]) {
+         grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file;
+         grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr;
+      }
+   }
+
+   brw_remove_inst(p, removeInst);
+   free(removeInst);
+}
+
 static GLboolean
 is_single_channel_dp4(struct brw_instruction *insn)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index d10e1c7..b09071f 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -1459,623 +1459,6 @@ static void spill_values( struct brw_wm_compile *c,
 	 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
 }
 
-#define BRW_MRF_NUM 16
-#define BRW_SIZE_OF_REG 32
-
-static INLINE
-GLboolean brw_is_arithmetic_inst(const struct brw_instruction *inst)
-{
-   switch (inst->header.opcode) {
-      case BRW_OPCODE_MOV:
-      case BRW_OPCODE_SEL:
-      case BRW_OPCODE_NOT:
-      case BRW_OPCODE_AND:
-      case BRW_OPCODE_OR:
-      case BRW_OPCODE_XOR:
-      case BRW_OPCODE_SHR:
-      case BRW_OPCODE_SHL:
-      case BRW_OPCODE_RSR:
-      case BRW_OPCODE_RSL:
-      case BRW_OPCODE_ADD:
-      case BRW_OPCODE_MUL:
-      case BRW_OPCODE_AVG:
-      case BRW_OPCODE_FRC:
-      case BRW_OPCODE_RNDU:
-      case BRW_OPCODE_RNDD:
-      case BRW_OPCODE_RNDE:
-      case BRW_OPCODE_RNDZ:
-      case BRW_OPCODE_MAC:
-      case BRW_OPCODE_MACH:
-      case BRW_OPCODE_LINE:
-         return GL_TRUE;
-      default:
-         return GL_FALSE;
-   }
-}
-
-static const struct {
-    char    *name;
-    int	    nsrc;
-    int	    ndst;
-} inst_opcode[128] = {
-    [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
-    [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
-    [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
-    [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
-    [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
-    [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
-    [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
-    [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
-
-    [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
-
-    [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
-
-    [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
-    [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
-    [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
-    [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
-    [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
-    [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
-    [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
-    [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
-    [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
-    [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
-    [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
-    [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
-    [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
-    [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
-    [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
-    [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
-    [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
-};
-
-static const GLuint inst_stride[7] = {
-    [0] = 0,
-    [1] = 1,
-    [2] = 2,
-    [3] = 4,
-    [4] = 8,
-    [5] = 16,
-    [6] = 32
-};
-
-static const GLuint inst_type_size[8] = {
-    [0] = 4,
-    [1] = 4,
-    [2] = 2,
-    [3] = 2,
-    [4] = 1,
-    [5] = 1,
-    [7] = 4
-};
-
-#define BRW_MAX_OFFSET(x0,x1) ((x0) > (x1) ? (x0) : (x1))
-#define BRW_MIN_OFFSET(x0,x1) ((x0) < (x1) ? (x0) : (x1));
-
-static INLINE GLboolean
-brw_is_grf_written(const struct brw_instruction *inst,
-                   int reg_index, int size,
-                   int gen)
-{
-   if (inst_opcode[inst->header.opcode].ndst == 0)
-      return GL_FALSE;
-
-   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
-      if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE)
-         return GL_TRUE;
-
-   if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
-      return GL_FALSE;
-
-   const int reg_start = reg_index * BRW_SIZE_OF_REG;
-   const int reg_end = reg_start + size;
-
-   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
-   const int write_start = inst->bits1.da1.dest_reg_nr*BRW_SIZE_OF_REG
-                         + inst->bits1.da1.dest_subreg_nr;
-   int length, write_end;
-
-   /* SEND is specific */
-   if (inst->header.opcode == BRW_OPCODE_SEND) {
-      if (gen >= 5)
-         length = inst->bits3.generic_gen5.response_length*BRW_SIZE_OF_REG;
-      else 
-         length = inst->bits3.generic.response_length*BRW_SIZE_OF_REG;
-   }
-   else {
-      length = 1 << inst->header.execution_size;
-      length *= type_size;
-      length *= inst->bits1.da1.dest_horiz_stride;
-   }
-
-   /* If the two intervals intersect, we overwrite the register */
-   write_end = write_start + length;
-   const int left = BRW_MAX_OFFSET(write_start, reg_start);
-   const int right = BRW_MIN_OFFSET(write_end, reg_end);
-
-   return left < right;
-}
-
-/* Specific path for message register since we need to handle the compr4 case */
-static INLINE GLboolean
-brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size)
-{
-   if (inst_opcode[inst->header.opcode].ndst == 0)
-      return GL_FALSE;
-
-   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
-      if (inst->bits1.ia1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE)
-         return GL_TRUE;
-
-   if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE)
-      return GL_FALSE;
-
-   const int reg_start = reg_index * BRW_SIZE_OF_REG;
-   const int reg_end = reg_start + size;
-
-   const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f;
-   const int is_compr4 = inst->bits1.da1.dest_reg_nr & 0xf0;
-   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
-
-   /* We use compr4 with a size != 16 elements. Strange, we conservatively
-    * consider that we are writing the register.
-    */
-   if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16)
-      return GL_TRUE;
-
-   GLboolean is_written = GL_FALSE;
-
-   /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */
-   if (is_compr4) {
-      const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride;
-
-      /* First 8-way register */
-      const int write_start0 = mrf_index*BRW_SIZE_OF_REG
-                             + inst->bits1.da1.dest_subreg_nr;
-      const int write_end0 = write_start0 + length;
-
-      /* Second 8-way register */
-      const int write_start1 = (mrf_index+4)*BRW_SIZE_OF_REG
-                             + inst->bits1.da1.dest_subreg_nr;
-      const int write_end1 = write_start1 + length;
-
-      /* If the two intervals intersect, we overwrite the register */
-      const int left0 = BRW_MAX_OFFSET(write_start0, reg_start);
-      const int right0 = BRW_MIN_OFFSET(write_end0, reg_end);
-      const int left1 = BRW_MAX_OFFSET(write_start1, reg_start);
-      const int right1 = BRW_MIN_OFFSET(write_end1, reg_end);
-
-      is_written = left0 < right0 || left1 < right1;
-   }
-   else {
-      int length;
-      length = 1 << inst->header.execution_size;
-      length *= type_size;
-      length *= inst->bits1.da1.dest_horiz_stride;
-
-      /* If the two intervals intersect, we write into the register */
-      const int write_start = inst->bits1.da1.dest_reg_nr*BRW_SIZE_OF_REG
-                            + inst->bits1.da1.dest_subreg_nr;
-      const int write_end = write_start + length;
-      const int left = BRW_MAX_OFFSET(write_start, reg_start);
-      const int right = BRW_MIN_OFFSET(write_end, reg_end);;
-
-      is_written = left < right;
-   }
-
-   /* SEND may perform an implicit mov to a mrf register */
-   if (is_written == GL_FALSE &&
-       inst->header.opcode == BRW_OPCODE_SEND &&
-       inst->bits1.da1.src0_reg_file != 0) {
-
-      const int mrf_start = inst->header.destreg__conditionalmod;
-      const int write_start = mrf_start * BRW_SIZE_OF_REG;
-      const int write_end = write_start + BRW_SIZE_OF_REG;
-      const int left = BRW_MAX_OFFSET(write_start, reg_start);
-      const int right = BRW_MIN_OFFSET(write_end, reg_end);;
-      is_written = left < right;
-   }
-
-   return is_written;
-}
-
-static INLINE GLboolean
-brw_is_mrf_read(const struct brw_instruction *inst,
-                int reg_index, int size, int gen)
-{
-   if (inst->header.opcode != BRW_OPCODE_SEND)
-      return GL_FALSE;
-   if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
-      return GL_TRUE;
-
-   const int reg_start = reg_index*BRW_SIZE_OF_REG;
-   const int reg_end = reg_start + size;
-
-   int length, read_start, read_end;
-   if (gen >= 5)
-      length = inst->bits3.generic_gen5.msg_length*BRW_SIZE_OF_REG;
-   else 
-      length = inst->bits3.generic.msg_length*BRW_SIZE_OF_REG;
-
-   /* Look if SEND uses an implicit mov. In that case, we read one less register
-    * (but we write it)
-    */
-   if (inst->bits1.da1.src0_reg_file != 0)
-      read_start = inst->header.destreg__conditionalmod;
-   else {
-      length--;
-      read_start = inst->header.destreg__conditionalmod + 1;
-   }
-   read_start *= BRW_SIZE_OF_REG;
-   read_end = read_start + length;
-
-   const int left = BRW_MAX_OFFSET(read_start, reg_start);
-   const int right = BRW_MIN_OFFSET(read_end, reg_end);
-
-   return left < right;
-}
-
-static INLINE GLboolean
-brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size)
-{
-   int i, j;
-   if (inst_opcode[inst->header.opcode].nsrc == 0)
-      return GL_FALSE;
-
-   /* Look at first source. We must take into account register regions to
-    * monitor carefully the read. Note that we are a bit too conservative here
-    * since we do not take into account the fact that some complete registers
-    * may be skipped
-    */
-   if (inst_opcode[inst->header.opcode].nsrc >= 1) {
-
-      if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
-         if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE)
-            return GL_TRUE;
-      if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE)
-         return GL_FALSE;
-
-      const int reg_start = reg_index*BRW_SIZE_OF_REG;
-      const int reg_end = reg_start + size;
-
-      /* See if at least one of this element intersects the interval */
-      const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type];
-      const int elem_num = 1 << inst->header.execution_size;
-      const int width = 1 << inst->bits2.da1.src0_width;
-      const int row_num = elem_num >> inst->bits2.da1.src0_width;
-      const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride];
-      const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride];
-      int row_start = inst->bits2.da1.src0_reg_nr*BRW_SIZE_OF_REG
-                    + inst->bits2.da1.src0_subreg_nr;
-      for (j = 0; j < row_num; ++j) {
-         int write_start = row_start;
-         for (i = 0; i < width; ++i) {
-            const int write_end = write_start + type_size;
-            const int left = write_start > reg_start ? write_start : reg_start;
-            const int right = write_end < reg_end ? write_end : reg_end;
-            if (left < right)
-               return GL_TRUE;
-            write_start += hs;
-         }
-         row_start += vs;
-      }
-   }
-
-   /* Second src register */
-   if (inst_opcode[inst->header.opcode].nsrc >= 2) {
-
-      if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT)
-         if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE)
-            return GL_TRUE;
-      if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE)
-         return GL_FALSE;
-
-      const int reg_start = reg_index*BRW_SIZE_OF_REG;
-      const int reg_end = reg_start + size;
-
-      /* See if at least one of this element intersects the interval */
-      const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type];
-      const int elem_num = 1 << inst->header.execution_size;
-      const int width = 1 << inst->bits3.da1.src1_width;
-      const int row_num = elem_num >> inst->bits3.da1.src1_width;
-      const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride];
-      const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride];
-      int row_start = inst->bits3.da1.src1_reg_nr*BRW_SIZE_OF_REG
-                    + inst->bits3.da1.src1_subreg_nr;
-      for (j = 0; j < row_num; ++j) {
-         int write_start = row_start;
-         for (i = 0; i < width; ++i) {
-            const int write_end = write_start + type_size;
-            const int left = write_start > reg_start ? write_start : reg_start;
-            const int right = write_end < reg_end ? write_end : reg_end;
-            if (left < right)
-               return GL_TRUE;
-            write_start += hs;
-         }
-         row_start += vs;
-      }
-   }
-
-   return GL_FALSE;
-}
-
-static INLINE GLboolean
-brw_is_control_done(const struct brw_instruction *mov) {
-   return
-       mov->header.dependency_control != 0 ||
-       mov->header.thread_control != 0 ||
-       mov->header.mask_control != 0 ||
-       mov->header.saturate != 0 ||
-       mov->header.debug_control != 0;
-}
-
-static INLINE GLboolean
-brw_is_predicated(const struct brw_instruction *mov) {
-   return mov->header.predicate_control != 0;
-}
-
-static INLINE GLboolean
-brw_is_grf_to_mrf_mov(const struct brw_instruction *mov,
-                      int *mrf_index,
-                      int *grf_index,
-                      GLboolean *is_compr4)
-{
-   if (brw_is_predicated(mov) ||
-       brw_is_control_done(mov) ||
-       mov->header.debug_control != 0)
-      return GL_FALSE;
-
-   if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT ||
-       mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE ||
-       mov->bits1.da1.dest_reg_type != 7 ||
-       mov->bits1.da1.dest_horiz_stride != 1 ||
-       mov->bits1.da1.dest_subreg_nr != 0)
-      return GL_FALSE;
-
-   if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
-       mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE ||
-       mov->bits1.da1.src0_reg_type != 7 ||
-       mov->bits2.da1.src0_width != 3 ||
-       mov->bits2.da1.src0_horiz_stride != 1 ||
-       mov->bits2.da1.src0_vert_stride != 4 ||
-       mov->bits2.da1.src0_subreg_nr != 0 ||
-       mov->bits2.da1.src0_abs != 0 ||
-       mov->bits2.da1.src0_negate != 0)
-      return GL_FALSE;
-
-   *grf_index = mov->bits2.da1.src0_reg_nr;
-   *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f;
-   *is_compr4 = (mov->bits1.da1.dest_reg_nr & 0xf0) != 0;
-   return GL_TRUE;
-}
-
-static INLINE GLboolean
-brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index)
-{
-   /* remark: no problem to predicate a SEL instruction */
-   if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) &&
-       brw_is_control_done(inst) == GL_FALSE &&
-       inst->header.execution_size == 4 &&
-       inst->header.access_mode == BRW_ALIGN_1 &&
-       inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT &&
-       inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE &&
-       inst->bits1.da1.dest_reg_type == 7 &&
-       inst->bits1.da1.dest_horiz_stride == 1 &&
-       inst->bits1.da1.dest_reg_nr == grf_index &&
-       inst->bits1.da1.dest_subreg_nr == 0 &&
-       brw_is_arithmetic_inst(inst))
-      return GL_TRUE;
-
-   return GL_FALSE;
-}
-
-static INLINE GLboolean
-brw_inst_are_equal(const struct brw_instruction *src0,
-                   const struct brw_instruction *src1)
-{
-   const GLuint *field0 = (GLuint *) src0;
-   const GLuint *field1 = (GLuint *) src1;
-   return field0[0] == field1[0] &&
-          field0[1] == field1[1] &&
-          field0[2] == field1[2] &&
-          field0[3] == field1[3];
-}
-
-static INLINE void
-brw_inst_copy(struct brw_instruction *dst,
-              const struct brw_instruction *src)
-{
-   GLuint *field_dst = (GLuint *) dst;
-   const GLuint *field_src = (GLuint *) src;
-   field_dst[0] = field_src[0];
-   field_dst[1] = field_src[1];
-   field_dst[2] = field_src[2];
-   field_dst[3] = field_src[3];
-}
-
-static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst)
-{
-   int i, nr_insn = 0, to = 0, from = 0;
-
-   for (from = 0; from < p->nr_insn; ++from) {
-      if (removeInst[from])
-         continue;
-      if(to != from)
-         brw_inst_copy(p->store + to, p->store + from);
-      to++;
-   }
-
-   for (i = 0; i < p->nr_insn; ++i)
-      if (removeInst[i] == GL_FALSE)
-         nr_insn++;
-   p->nr_insn = nr_insn;
-}
-
-/* The gen code emitter generates a lot of duplications in the mrf-to-grf moves.
- * Here, we monitor same mov mrf-to-grf instrutions and remove them as soon as
- * none of the two operands have been written
- */
-static void brw_remove_duplicate_mrf_moves(struct brw_wm_compile *c)
-{
-   struct brw_compile *p = &c->func;
-   const int gen = p->brw->intel.gen;
-   int i, j;
-
-   GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
-   for (i = 0; i < p->nr_insn; i++) {
-      if (removeInst[i])
-         continue;
-
-      const struct brw_instruction *mov = p->store + i;
-      int mrf_index, grf_index;
-      GLboolean is_compr4;
-
-      /* Only consider _straight_ grf-to-mrf moves */
-      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
-         continue;
-
-      const int mrf_index0 = mrf_index;
-      const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1;
-      const int simd16_size = 2 * BRW_SIZE_OF_REG;
-
-      for (j = i + 1; j < p->nr_insn; j++) {
-         const struct brw_instruction *inst = p->store + j;
-
-         if (brw_inst_are_equal(mov, inst)) {
-            removeInst[j] = GL_TRUE;
-            continue;
-         }
-
-         if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
-             brw_is_mrf_written(inst, mrf_index0, BRW_SIZE_OF_REG) ||
-             brw_is_mrf_written(inst, mrf_index1, BRW_SIZE_OF_REG))
-            break;
-      }
-   }
-
-   brw_remove_inst(p, removeInst);
-   free(removeInst);
-}
-
-static void brw_remove_mrf_to_grf_moves(struct brw_wm_compile *c)
-{
-   int i, j, prev;
-   struct brw_compile *p = &c->func;
-   struct brw_context *brw = p->brw;
-   const int gen = brw->intel.gen;
-   const int simd16_size = 2*BRW_SIZE_OF_REG;
-
-   if (c->dispatch_width != 16 || brw->has_compr4 == GL_FALSE)
-      return;
-
-   GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
-   assert(removeInst);
-
-   for (i = 0; i < p->nr_insn; i++) {
-      if (removeInst[i])
-         continue;
-
-      struct brw_instruction *grf_inst = NULL;
-      const struct brw_instruction *mov = p->store + i;
-      int mrf_index, grf_index;
-      GLboolean is_compr4;
-
-      /* Only consider _straight_ grf-to-mrf moves */
-      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
-         continue;
-
-      /* Using comp4 enables a stride of 4 for this instruction */
-      const int mrf_index0 = mrf_index;
-      const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1;
-
-      /* Look where the register has been set */
-      prev = i;
-      GLboolean potential_remove = GL_FALSE;
-      while (prev--) {
-
-         /* If _one_ instruction writes the grf, we try to remove the mov */
-         struct brw_instruction *inst = p->store + prev;
-         if (brw_is_grf_straight_write(inst, grf_index)) {
-            potential_remove = GL_TRUE;
-            grf_inst = inst;
-            break;
-         }
-
-      }
-
-      if (potential_remove == GL_FALSE)
-         continue;
-      removeInst[i] = GL_TRUE;
-
-      /* Monitor first the section of code between the grf computation and the
-       * mov. Here we cannot read or write both mrf and grf register
-       */
-      for (j = prev + 1; j < i; ++j) {
-         struct brw_instruction *inst = p->store + j;
-         if (removeInst[j])
-            continue;
-         if (brw_is_grf_written(inst, grf_index, simd16_size, gen)   ||
-             brw_is_grf_read(inst, grf_index, simd16_size)           ||
-             brw_is_mrf_written(inst, mrf_index0, BRW_SIZE_OF_REG)   ||
-             brw_is_mrf_written(inst, mrf_index1, BRW_SIZE_OF_REG)   ||
-             brw_is_mrf_read(inst, mrf_index0, BRW_SIZE_OF_REG, gen) ||
-             brw_is_mrf_read(inst, mrf_index1, BRW_SIZE_OF_REG, gen)) {
-            removeInst[i] = GL_FALSE;
-            break;
-         }
-      }
-
-      /* After the mov, we can read or write the mrf. If the grf is overwritten,
-       * we are done
-       */
-      for (j = i + 1; j < p->nr_insn; ++j) {
-         struct brw_instruction *inst = p->store + j;
-         if (removeInst[j])
-            continue;
-
-         if (brw_is_grf_read(inst, grf_index, simd16_size)) {
-            removeInst[i] = GL_FALSE;
-            break;
-         }
-
-         if (brw_is_grf_straight_write(inst, grf_index))
-            break;
-      }
-
-      /* Note that with the top down traversal, we can safely pacth the mov
-       * instruction
-       */
-      if (removeInst[i]) {
-         grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file;
-         grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr;
-      }
-   }
-
-   brw_remove_inst(p, removeInst);
-   free(removeInst);
-}
 
 /* Emit the fragment program instructions here.
  */
@@ -2331,8 +1714,9 @@ void brw_wm_emit( struct brw_wm_compile *c )
 
    /* Only properly tested on ILK */
    if (p->brw->intel.gen == 5) {
-     brw_remove_duplicate_mrf_moves(c);
-     brw_remove_mrf_to_grf_moves(c);
+     brw_remove_duplicate_mrf_moves(p);
+     if (c->dispatch_width == 16)
+	brw_remove_mrf_to_grf_moves(p);
    }
 
    if (INTEL_DEBUG & DEBUG_WM) {




More information about the mesa-commit mailing list