[Mesa-dev] [PATCH] i965: Improve (i.e. remove) some grf-to-mrf unnecessary moves

Sun Jul 25 21:30:19 PDT 2010

- Several routines directly analyze the grf-to-mrf moves from the Gen binary
  code. When it is possible, the mov is removed and the message register is
  directly written in the arithmetic instruction

- Also redundant mrf-to-grf moves are removed (frequently for example, when
  sampling many textures with the same uv)

- Code was tested with piglit, warsow and nexuiz on an Ironlake machine. No
  regression was found there

- Note that the optimizations are *deactivated* on Gen4 and Gen6 since I did
  test them properly yet. No reason there are bugs but who knows

- The optimizations are currently done in branch free programs *only*.
  Considering branches is more complicated and there are actually two paths:
  one for branch free programs and one for programs with branches

- Also some other optimizations should be done during the emission itself but
  considering that some code is shader between vertex shaders (AOS) and pixel
  shaders (SOA) and that we may have branches or not, it is pretty hard to both
  factorize the code and have one good set of strategies
---
 src/mesa/drivers/dri/i965/brw_wm_emit.c |  628 ++++++++++++++++++++++++++++++-
 1 files changed, 626 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index 323cfac..d10e1c7 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -1459,6 +1459,623 @@ static void spill_values( struct brw_wm_compile *c,
 	 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
 }
 
+#define BRW_MRF_NUM 16
+#define BRW_SIZE_OF_REG 32
+
+static INLINE
+GLboolean brw_is_arithmetic_inst(const struct brw_instruction *inst)
+{
+   switch (inst->header.opcode) {
+      case BRW_OPCODE_MOV:
+      case BRW_OPCODE_SEL:
+      case BRW_OPCODE_NOT:
+      case BRW_OPCODE_AND:
+      case BRW_OPCODE_OR:
+      case BRW_OPCODE_XOR:
+      case BRW_OPCODE_SHR:
+      case BRW_OPCODE_SHL:
+      case BRW_OPCODE_RSR:
+      case BRW_OPCODE_RSL:
+      case BRW_OPCODE_ADD:
+      case BRW_OPCODE_MUL:
+      case BRW_OPCODE_AVG:
+      case BRW_OPCODE_FRC:
+      case BRW_OPCODE_RNDU:
+      case BRW_OPCODE_RNDD:
+      case BRW_OPCODE_RNDE:
+      case BRW_OPCODE_RNDZ:
+      case BRW_OPCODE_MAC:
+      case BRW_OPCODE_MACH:
+      case BRW_OPCODE_LINE:
+         return GL_TRUE;
+      default:
+         return GL_FALSE;
+   }
+}
+
+static const struct {
+    char    *name;
+    int	    nsrc;
+    int	    ndst;
+} inst_opcode[128] = {
+    [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
+
+    [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
+};
+
+static const GLuint inst_stride[7] = {
+    [0] = 0,
+    [1] = 1,
+    [2] = 2,
+    [3] = 4,
+    [4] = 8,
+    [5] = 16,
+    [6] = 32
+};
+
+static const GLuint inst_type_size[8] = {
+    [0] = 4,
+    [1] = 4,
+    [2] = 2,
+    [3] = 2,
+    [4] = 1,
+    [5] = 1,
+    [7] = 4
+};
+
+#define BRW_MAX_OFFSET(x0,x1) ((x0) > (x1) ? (x0) : (x1))
+#define BRW_MIN_OFFSET(x0,x1) ((x0) < (x1) ? (x0) : (x1));
+
+static INLINE GLboolean
+brw_is_grf_written(const struct brw_instruction *inst,
+                   int reg_index, int size,
+                   int gen)
+{
+   if (inst_opcode[inst->header.opcode].ndst == 0)
+      return GL_FALSE;
+
+   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
+      if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE)
+         return GL_TRUE;
+
+   if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
+      return GL_FALSE;
+
+   const int reg_start = reg_index * BRW_SIZE_OF_REG;
+   const int reg_end = reg_start + size;
+
+   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
+   const int write_start = inst->bits1.da1.dest_reg_nr*BRW_SIZE_OF_REG
+                         + inst->bits1.da1.dest_subreg_nr;
+   int length, write_end;
+
+   /* SEND is specific */
+   if (inst->header.opcode == BRW_OPCODE_SEND) {
+      if (gen >= 5)
+         length = inst->bits3.generic_gen5.response_length*BRW_SIZE_OF_REG;
+      else 
+         length = inst->bits3.generic.response_length*BRW_SIZE_OF_REG;
+   }
+   else {
+      length = 1 << inst->header.execution_size;
+      length *= type_size;
+      length *= inst->bits1.da1.dest_horiz_stride;
+   }
+
+   /* If the two intervals intersect, we overwrite the register */
+   write_end = write_start + length;
+   const int left = BRW_MAX_OFFSET(write_start, reg_start);
+   const int right = BRW_MIN_OFFSET(write_end, reg_end);
+
+   return left < right;
+}
+
+/* Specific path for message register since we need to handle the compr4 case */
+static INLINE GLboolean
+brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size)
+{
+   if (inst_opcode[inst->header.opcode].ndst == 0)
+      return GL_FALSE;
+
+   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
+      if (inst->bits1.ia1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE)
+         return GL_TRUE;
+
+   if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE)
+      return GL_FALSE;
+
+   const int reg_start = reg_index * BRW_SIZE_OF_REG;
+   const int reg_end = reg_start + size;
+
+   const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f;
+   const int is_compr4 = inst->bits1.da1.dest_reg_nr & 0xf0;
+   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
+
+   /* We use compr4 with a size != 16 elements. Strange, we conservatively
+    * consider that we are writing the register.
+    */
+   if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16)
+      return GL_TRUE;
+
+   GLboolean is_written = GL_FALSE;
+
+   /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */
+   if (is_compr4) {
+      const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride;
+
+      /* First 8-way register */
+      const int write_start0 = mrf_index*BRW_SIZE_OF_REG
+                             + inst->bits1.da1.dest_subreg_nr;
+      const int write_end0 = write_start0 + length;
+
+      /* Second 8-way register */
+      const int write_start1 = (mrf_index+4)*BRW_SIZE_OF_REG
+                             + inst->bits1.da1.dest_subreg_nr;
+      const int write_end1 = write_start1 + length;
+
+      /* If the two intervals intersect, we overwrite the register */
+      const int left0 = BRW_MAX_OFFSET(write_start0, reg_start);
+      const int right0 = BRW_MIN_OFFSET(write_end0, reg_end);
+      const int left1 = BRW_MAX_OFFSET(write_start1, reg_start);
+      const int right1 = BRW_MIN_OFFSET(write_end1, reg_end);
+
+      is_written = left0 < right0 || left1 < right1;
+   }
+   else {
+      int length;
+      length = 1 << inst->header.execution_size;
+      length *= type_size;
+      length *= inst->bits1.da1.dest_horiz_stride;
+
+      /* If the two intervals intersect, we write into the register */
+      const int write_start = inst->bits1.da1.dest_reg_nr*BRW_SIZE_OF_REG
+                            + inst->bits1.da1.dest_subreg_nr;
+      const int write_end = write_start + length;
+      const int left = BRW_MAX_OFFSET(write_start, reg_start);
+      const int right = BRW_MIN_OFFSET(write_end, reg_end);;
+
+      is_written = left < right;
+   }
+
+   /* SEND may perform an implicit mov to a mrf register */
+   if (is_written == GL_FALSE &&
+       inst->header.opcode == BRW_OPCODE_SEND &&
+       inst->bits1.da1.src0_reg_file != 0) {
+
+      const int mrf_start = inst->header.destreg__conditionalmod;
+      const int write_start = mrf_start * BRW_SIZE_OF_REG;
+      const int write_end = write_start + BRW_SIZE_OF_REG;
+      const int left = BRW_MAX_OFFSET(write_start, reg_start);
+      const int right = BRW_MIN_OFFSET(write_end, reg_end);;
+      is_written = left < right;
+   }
+
+   return is_written;
+}
+
+static INLINE GLboolean
+brw_is_mrf_read(const struct brw_instruction *inst,
+                int reg_index, int size, int gen)
+{
+   if (inst->header.opcode != BRW_OPCODE_SEND)
+      return GL_FALSE;
+   if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
+      return GL_TRUE;
+
+   const int reg_start = reg_index*BRW_SIZE_OF_REG;
+   const int reg_end = reg_start + size;
+
+   int length, read_start, read_end;
+   if (gen >= 5)
+      length = inst->bits3.generic_gen5.msg_length*BRW_SIZE_OF_REG;
+   else 
+      length = inst->bits3.generic.msg_length*BRW_SIZE_OF_REG;
+
+   /* Look if SEND uses an implicit mov. In that case, we read one less register
+    * (but we write it)
+    */
+   if (inst->bits1.da1.src0_reg_file != 0)
+      read_start = inst->header.destreg__conditionalmod;
+   else {
+      length--;
+      read_start = inst->header.destreg__conditionalmod + 1;
+   }
+   read_start *= BRW_SIZE_OF_REG;
+   read_end = read_start + length;
+
+   const int left = BRW_MAX_OFFSET(read_start, reg_start);
+   const int right = BRW_MIN_OFFSET(read_end, reg_end);
+
+   return left < right;
+}
+
+static INLINE GLboolean
+brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size)
+{
+   int i, j;
+   if (inst_opcode[inst->header.opcode].nsrc == 0)
+      return GL_FALSE;
+
+   /* Look at first source. We must take into account register regions to
+    * monitor carefully the read. Note that we are a bit too conservative here
+    * since we do not take into account the fact that some complete registers
+    * may be skipped
+    */
+   if (inst_opcode[inst->header.opcode].nsrc >= 1) {
+
+      if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
+         if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE)
+            return GL_TRUE;
+      if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE)
+         return GL_FALSE;
+
+      const int reg_start = reg_index*BRW_SIZE_OF_REG;
+      const int reg_end = reg_start + size;
+
+      /* See if at least one of this element intersects the interval */
+      const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type];
+      const int elem_num = 1 << inst->header.execution_size;
+      const int width = 1 << inst->bits2.da1.src0_width;
+      const int row_num = elem_num >> inst->bits2.da1.src0_width;
+      const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride];
+      const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride];
+      int row_start = inst->bits2.da1.src0_reg_nr*BRW_SIZE_OF_REG
+                    + inst->bits2.da1.src0_subreg_nr;
+      for (j = 0; j < row_num; ++j) {
+         int write_start = row_start;
+         for (i = 0; i < width; ++i) {
+            const int write_end = write_start + type_size;
+            const int left = write_start > reg_start ? write_start : reg_start;
+            const int right = write_end < reg_end ? write_end : reg_end;
+            if (left < right)
+               return GL_TRUE;
+            write_start += hs;
+         }
+         row_start += vs;
+      }
+   }
+
+   /* Second src register */
+   if (inst_opcode[inst->header.opcode].nsrc >= 2) {
+
+      if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT)
+         if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE)
+            return GL_TRUE;
+      if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE)
+         return GL_FALSE;
+
+      const int reg_start = reg_index*BRW_SIZE_OF_REG;
+      const int reg_end = reg_start + size;
+
+      /* See if at least one of this element intersects the interval */
+      const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type];
+      const int elem_num = 1 << inst->header.execution_size;
+      const int width = 1 << inst->bits3.da1.src1_width;
+      const int row_num = elem_num >> inst->bits3.da1.src1_width;
+      const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride];
+      const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride];
+      int row_start = inst->bits3.da1.src1_reg_nr*BRW_SIZE_OF_REG
+                    + inst->bits3.da1.src1_subreg_nr;
+      for (j = 0; j < row_num; ++j) {
+         int write_start = row_start;
+         for (i = 0; i < width; ++i) {
+            const int write_end = write_start + type_size;
+            const int left = write_start > reg_start ? write_start : reg_start;
+            const int right = write_end < reg_end ? write_end : reg_end;
+            if (left < right)
+               return GL_TRUE;
+            write_start += hs;
+         }
+         row_start += vs;
+      }
+   }
+
+   return GL_FALSE;
+}
+
+static INLINE GLboolean
+brw_is_control_done(const struct brw_instruction *mov) {
+   return
+       mov->header.dependency_control != 0 ||
+       mov->header.thread_control != 0 ||
+       mov->header.mask_control != 0 ||
+       mov->header.saturate != 0 ||
+       mov->header.debug_control != 0;
+}
+
+static INLINE GLboolean
+brw_is_predicated(const struct brw_instruction *mov) {
+   return mov->header.predicate_control != 0;
+}
+
+static INLINE GLboolean
+brw_is_grf_to_mrf_mov(const struct brw_instruction *mov,
+                      int *mrf_index,
+                      int *grf_index,
+                      GLboolean *is_compr4)
+{
+   if (brw_is_predicated(mov) ||
+       brw_is_control_done(mov) ||
+       mov->header.debug_control != 0)
+      return GL_FALSE;
+
+   if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT ||
+       mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE ||
+       mov->bits1.da1.dest_reg_type != 7 ||
+       mov->bits1.da1.dest_horiz_stride != 1 ||
+       mov->bits1.da1.dest_subreg_nr != 0)
+      return GL_FALSE;
+
+   if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
+       mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE ||
+       mov->bits1.da1.src0_reg_type != 7 ||
+       mov->bits2.da1.src0_width != 3 ||
+       mov->bits2.da1.src0_horiz_stride != 1 ||
+       mov->bits2.da1.src0_vert_stride != 4 ||
+       mov->bits2.da1.src0_subreg_nr != 0 ||
+       mov->bits2.da1.src0_abs != 0 ||
+       mov->bits2.da1.src0_negate != 0)
+      return GL_FALSE;
+
+   *grf_index = mov->bits2.da1.src0_reg_nr;
+   *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f;
+   *is_compr4 = (mov->bits1.da1.dest_reg_nr & 0xf0) != 0;
+   return GL_TRUE;
+}
+
+static INLINE GLboolean
+brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index)
+{
+   /* remark: no problem to predicate a SEL instruction */
+   if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) &&
+       brw_is_control_done(inst) == GL_FALSE &&
+       inst->header.execution_size == 4 &&
+       inst->header.access_mode == BRW_ALIGN_1 &&
+       inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT &&
+       inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE &&
+       inst->bits1.da1.dest_reg_type == 7 &&
+       inst->bits1.da1.dest_horiz_stride == 1 &&
+       inst->bits1.da1.dest_reg_nr == grf_index &&
+       inst->bits1.da1.dest_subreg_nr == 0 &&
+       brw_is_arithmetic_inst(inst))
+      return GL_TRUE;
+
+   return GL_FALSE;
+}
+
+static INLINE GLboolean
+brw_inst_are_equal(const struct brw_instruction *src0,
+                   const struct brw_instruction *src1)
+{
+   const GLuint *field0 = (GLuint *) src0;
+   const GLuint *field1 = (GLuint *) src1;
+   return field0[0] == field1[0] &&
+          field0[1] == field1[1] &&
+          field0[2] == field1[2] &&
+          field0[3] == field1[3];
+}
+
+static INLINE void
+brw_inst_copy(struct brw_instruction *dst,
+              const struct brw_instruction *src)
+{
+   GLuint *field_dst = (GLuint *) dst;
+   const GLuint *field_src = (GLuint *) src;
+   field_dst[0] = field_src[0];
+   field_dst[1] = field_src[1];
+   field_dst[2] = field_src[2];
+   field_dst[3] = field_src[3];
+}
+
+static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst)
+{
+   int i, nr_insn = 0, to = 0, from = 0;
+
+   for (from = 0; from < p->nr_insn; ++from) {
+      if (removeInst[from])
+         continue;
+      if(to != from)
+         brw_inst_copy(p->store + to, p->store + from);
+      to++;
+   }
+
+   for (i = 0; i < p->nr_insn; ++i)
+      if (removeInst[i] == GL_FALSE)
+         nr_insn++;
+   p->nr_insn = nr_insn;
+}
+
+/* The gen code emitter generates a lot of duplications in the mrf-to-grf moves.
+ * Here, we monitor same mov mrf-to-grf instrutions and remove them as soon as
+ * none of the two operands have been written
+ */
+static void brw_remove_duplicate_mrf_moves(struct brw_wm_compile *c)
+{
+   struct brw_compile *p = &c->func;
+   const int gen = p->brw->intel.gen;
+   int i, j;
+
+   GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
+   for (i = 0; i < p->nr_insn; i++) {
+      if (removeInst[i])
+         continue;
+
+      const struct brw_instruction *mov = p->store + i;
+      int mrf_index, grf_index;
+      GLboolean is_compr4;
+
+      /* Only consider _straight_ grf-to-mrf moves */
+      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
+         continue;
+
+      const int mrf_index0 = mrf_index;
+      const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1;
+      const int simd16_size = 2 * BRW_SIZE_OF_REG;
+
+      for (j = i + 1; j < p->nr_insn; j++) {
+         const struct brw_instruction *inst = p->store + j;
+
+         if (brw_inst_are_equal(mov, inst)) {
+            removeInst[j] = GL_TRUE;
+            continue;
+         }
+
+         if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
+             brw_is_mrf_written(inst, mrf_index0, BRW_SIZE_OF_REG) ||
+             brw_is_mrf_written(inst, mrf_index1, BRW_SIZE_OF_REG))
+            break;
+      }
+   }
+
+   brw_remove_inst(p, removeInst);
+   free(removeInst);
+}
+
+static void brw_remove_mrf_to_grf_moves(struct brw_wm_compile *c)
+{
+   int i, j, prev;
+   struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
+   const int gen = brw->intel.gen;
+   const int simd16_size = 2*BRW_SIZE_OF_REG;
+
+   if (c->dispatch_width != 16 || brw->has_compr4 == GL_FALSE)
+      return;
+
+   GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
+   assert(removeInst);
+
+   for (i = 0; i < p->nr_insn; i++) {
+      if (removeInst[i])
+         continue;
+
+      struct brw_instruction *grf_inst = NULL;
+      const struct brw_instruction *mov = p->store + i;
+      int mrf_index, grf_index;
+      GLboolean is_compr4;
+
+      /* Only consider _straight_ grf-to-mrf moves */
+      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
+         continue;
+
+      /* Using comp4 enables a stride of 4 for this instruction */
+      const int mrf_index0 = mrf_index;
+      const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1;
+
+      /* Look where the register has been set */
+      prev = i;
+      GLboolean potential_remove = GL_FALSE;
+      while (prev--) {
+
+         /* If _one_ instruction writes the grf, we try to remove the mov */
+         struct brw_instruction *inst = p->store + prev;
+         if (brw_is_grf_straight_write(inst, grf_index)) {
+            potential_remove = GL_TRUE;
+            grf_inst = inst;
+            break;
+         }
+
+      }
+
+      if (potential_remove == GL_FALSE)
+         continue;
+      removeInst[i] = GL_TRUE;
+
+      /* Monitor first the section of code between the grf computation and the
+       * mov. Here we cannot read or write both mrf and grf register
+       */
+      for (j = prev + 1; j < i; ++j) {
+         struct brw_instruction *inst = p->store + j;
+         if (removeInst[j])
+            continue;
+         if (brw_is_grf_written(inst, grf_index, simd16_size, gen)   ||
+             brw_is_grf_read(inst, grf_index, simd16_size)           ||
+             brw_is_mrf_written(inst, mrf_index0, BRW_SIZE_OF_REG)   ||
+             brw_is_mrf_written(inst, mrf_index1, BRW_SIZE_OF_REG)   ||
+             brw_is_mrf_read(inst, mrf_index0, BRW_SIZE_OF_REG, gen) ||
+             brw_is_mrf_read(inst, mrf_index1, BRW_SIZE_OF_REG, gen)) {
+            removeInst[i] = GL_FALSE;
+            break;
+         }
+      }
+
+      /* After the mov, we can read or write the mrf. If the grf is overwritten,
+       * we are done
+       */
+      for (j = i + 1; j < p->nr_insn; ++j) {
+         struct brw_instruction *inst = p->store + j;
+         if (removeInst[j])
+            continue;
+
+         if (brw_is_grf_read(inst, grf_index, simd16_size)) {
+            removeInst[i] = GL_FALSE;
+            break;
+         }
+
+         if (brw_is_grf_straight_write(inst, grf_index))
+            break;
+      }
+
+      /* Note that with the top down traversal, we can safely pacth the mov
+       * instruction
+       */
+      if (removeInst[i]) {
+         grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file;
+         grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr;
+      }
+   }
+
+   brw_remove_inst(p, removeInst);
+   free(removeInst);
+}
 
 /* Emit the fragment program instructions here.
  */
@@ -1712,12 +2329,19 @@ void brw_wm_emit( struct brw_wm_compile *c )
 		      inst->dst[i]->spill_slot);
    }
 
+   /* Only properly tested on ILK */
+   if (p->brw->intel.gen == 5) {
+     brw_remove_duplicate_mrf_moves(c);
+     brw_remove_mrf_to_grf_moves(c);
+   }
+
    if (INTEL_DEBUG & DEBUG_WM) {
       int i;
 
-      printf("wm-native:\n");
-      for (i = 0; i < p->nr_insn; i++)
+     printf("wm-native:\n");
+     for (i = 0; i < p->nr_insn; i++)
 	 brw_disasm(stderr, &p->store[i], p->brw->intel.gen);
       printf("\n");
    }
 }
+
-- 
1.7.1