Mesa (master): i965/fs: Convert gen7 to using GRFs for texture messages.

Thu Oct 10 23:30:15 UTC 2013

Module: Mesa
Branch: master
Commit: 36fbe66d3a71df76fcb6f915846da4471b3a8442
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=36fbe66d3a71df76fcb6f915846da4471b3a8442

Author: Eric Anholt <eric at anholt.net>
Date:   Wed Oct  9 17:17:59 2013 -0700

i965/fs: Convert gen7 to using GRFs for texture messages.

Looking at Lightsmark's shaders, the way we used MRFs (or in gen7's
case, GRFs) was bad in a couple of ways.  One was that it prevented
compute-to-MRF for the common case of a texcoord that gets used
exactly once, but where the texcoord setup all gets emitted before the
texture calls (such as when it's a bare fragment shader input, which
gets interpolated before processing main()).  Another was that it
introduced a bunch of dependencies that constrained scheduling, and
forced waits for texture operations to be done before they are
required.  For example, we can now move the compute-to-MRF
interpolation for the second texture send down after the first send.

The downside is that this generally prevents
remove_duplicate_mrf_writes() from doing anything, whereas previously
it avoided work for the case of sampling from the same texcoord twice.
However, I suspect that most of the win that originally justified that
code was in avoiding the WAR stall on the first send, which this patch
also avoids, rather than the small cost of the extra instruction.  We
see instruction count regressions in shaders in unigine, yofrankie,
savage2, hon, and gstreamer.

Improves GLB2.7 performance by 0.633628% +/- 0.491809% (n=121/125, avg of
~66fps, outliers below 61 dropped).

Improves openarena performance by 1.01092% +/- 0.66897% (n=425).

No significant difference on Lightsmark (n=44).

v2: Squash in the fix for register unspilling for send-from-GRF, fixing a
    segfault in lightsmark.

Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
Acked-by: Matt Turner <mattst88 at gmail.com>

---

 src/mesa/drivers/dri/i965/brw_eu_emit.c            |    3 +-
 src/mesa/drivers/dri/i965/brw_fs.cpp               |   37 ++++++-
 src/mesa/drivers/dri/i965/brw_fs.h                 |    5 +-
 .../drivers/dri/i965/brw_fs_copy_propagation.cpp   |    3 +
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp     |   27 ++++-
 .../drivers/dri/i965/brw_fs_live_variables.cpp     |    9 +--
 src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp  |   47 +++++----
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp       |  110 ++++++++++----------
 .../drivers/dri/i965/brw_schedule_instructions.cpp |   36 ++++---
 9 files changed, 169 insertions(+), 108 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 33245d7..8efd679 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -2193,7 +2193,8 @@ void brw_SAMPLE(struct brw_compile *p,
    struct brw_context *brw = p->brw;
    struct brw_instruction *insn;
 
-   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+   if (msg_reg_nr != -1)
+      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
 
    insn = next_insn(p, BRW_OPCODE_SEND);
    insn->header.predicate_control = 0; /* XXX */
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index da31b3e..e5d6e4b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -355,7 +355,8 @@ fs_inst::is_send_from_grf()
    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
-            src[1].file == GRF));
+            src[1].file == GRF) ||
+           (is_tex() && src[0].file == GRF));
 }
 
 bool
@@ -436,6 +437,14 @@ fs_reg::equals(const fs_reg &r) const
            imm.u == r.imm.u);
 }
 
+fs_reg
+fs_reg::retype(uint32_t type)
+{
+   fs_reg result = *this;
+   result.type = type;
+   return result;
+}
+
 bool
 fs_reg::is_zero() const
 {
@@ -698,6 +707,18 @@ fs_inst::is_partial_write()
            this->force_sechalf);
 }
 
+int
+fs_inst::regs_read(fs_visitor *v, int arg)
+{
+   if (is_tex() && arg == 0 && src[0].file == GRF) {
+      if (v->dispatch_width == 16)
+	 return (mlen + 1) / 2;
+      else
+	 return mlen;
+   }
+   return 1;
+}
+
 /**
  * Returns how many MRFs an FS opcode will write over.
  *
@@ -710,6 +731,9 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    if (inst->mlen == 0)
       return 0;
 
+   if (inst->base_mrf == -1)
+      return 0;
+
    switch (inst->opcode) {
    case SHADER_OPCODE_RCP:
    case SHADER_OPCODE_RSQ:
@@ -2194,6 +2218,13 @@ fs_visitor::register_coalesce()
 	    break;
 	 }
 
+	 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
+	     scan_inst->src[0].file == GRF &&
+	     scan_inst->src[0].reg == inst->dst.reg) {
+	    interfered = true;
+	    break;
+	 }
+
 	 /* The accumulator result appears to get used for the
 	  * conditional modifier generation.  When negating a UD
 	  * value, there is a 33rd bit generated for the sign in the
@@ -2382,7 +2413,7 @@ fs_visitor::compute_to_mrf()
 	    }
 	 }
 
-	 if (scan_inst->mlen > 0) {
+	 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
 	    /* Found a SEND instruction, which means that there are
 	     * live values in MRFs from base_mrf to base_mrf +
 	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
@@ -2444,7 +2475,7 @@ fs_visitor::remove_duplicate_mrf_writes()
 	 last_mrf_move[inst->dst.reg] = NULL;
       }
 
-      if (inst->mlen > 0) {
+      if (inst->mlen > 0 && inst->base_mrf != -1) {
 	 /* Found a SEND instruction, which will include two or fewer
 	  * implied MRF writes.  We could do better here.
 	  */
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 360dbad..c78f9ae 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -78,6 +78,7 @@ public:
    bool is_zero() const;
    bool is_one() const;
    bool is_valid_3src() const;
+   fs_reg retype(uint32_t type);
 
    /** Register file: GRF, MRF, IMM. */
    enum register_file file;
@@ -145,6 +146,7 @@ public:
    bool overwrites_reg(const fs_reg &reg);
    bool is_send_from_grf();
    bool is_partial_write();
+   int regs_read(fs_visitor *v, int arg);
 
    fs_reg dst;
    fs_reg src[3];
@@ -354,7 +356,8 @@ public:
    void try_replace_with_sel();
    void emit_bool_to_cond_code(ir_rvalue *condition);
    void emit_if_gen6(ir_if *ir);
-   void emit_unspill(fs_inst *inst, fs_reg reg, uint32_t spill_offset);
+   void emit_unspill(fs_inst *inst, fs_reg reg, uint32_t spill_offset,
+                     int count);
 
    void emit_fragment_program_code();
    void setup_fp_regs();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index fb6fe18..7b90982 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -279,6 +279,9 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    if (entry->src.file == IMM)
       return false;
 
+   if (inst->regs_read(this, arg) > 1)
+      return false;
+
    if (inst->src[arg].file != entry->dst.file ||
        inst->src[arg].reg != entry->dst.reg ||
        inst->src[arg].reg_offset != entry->dst.reg_offset) {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index dbfbc11..4b668f1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -501,24 +501,43 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
       dst = vec16(dst);
    }
 
+   if (brw->gen >= 7 && inst->header_present && dispatch_width == 16) {
+      /* The send-from-GRF for 16-wide texturing with a header has an extra
+       * hardware register allocated to it, which we need to skip over (since
+       * our coordinates in the payload are in the even-numbered registers,
+       * and the header comes right before the first one).
+       */
+      assert(src.file == BRW_GENERAL_REGISTER_FILE);
+      src.nr++;
+   }
+
    /* Load the message header if present.  If there's a texture offset,
     * we need to set it up explicitly and load the offset bitfield.
     * Otherwise, we can use an implied move from g0 to the first message reg.
     */
    if (inst->texture_offset) {
+      struct brw_reg header_reg;
+
+      if (brw->gen >= 7) {
+         header_reg = src;
+      } else {
+         assert(inst->base_mrf != -1);
+         header_reg = retype(brw_message_reg(inst->base_mrf),
+                             BRW_REGISTER_TYPE_UD);
+      }
       brw_push_insn_state(p);
       brw_set_mask_control(p, BRW_MASK_DISABLE);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       /* Explicitly set up the message header by copying g0 to the MRF. */
-      brw_MOV(p, retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
-                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+      brw_MOV(p, header_reg, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 
       /* Then set the offset bits in DWord 2. */
-      brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
-                                     inst->base_mrf, 2), BRW_REGISTER_TYPE_UD),
+      brw_MOV(p, retype(brw_vec1_reg(header_reg.file,
+                                     header_reg.nr, 2), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(inst->texture_offset));
       brw_pop_insn_state(p);
    } else if (inst->header_present) {
+      assert(brw->gen < 7);
       /* Set up an implied move from g0 to the MRF. */
       src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
index 50aa7a6..b3026c2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
@@ -151,14 +151,7 @@ fs_live_variables::setup_def_use()
             if (reg.file != GRF)
                continue;
 
-            int regs_read = 1;
-            /* We don't know how many components are read in a send-from-grf,
-             * so just assume "all of them."
-             */
-            if (inst->is_send_from_grf())
-               regs_read = v->virtual_grf_sizes[reg.reg];
-
-            for (int i = 0; i < regs_read; i++) {
+            for (int j = 0; j < inst->regs_read(v, i); j++) {
                setup_one_read(block, inst, ip, reg);
                reg.reg_offset++;
             }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index f0f4ad9..157c9ae 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -512,19 +512,25 @@ fs_visitor::assign_regs()
 }
 
 void
-fs_visitor::emit_unspill(fs_inst *inst, fs_reg dst, uint32_t spill_offset)
+fs_visitor::emit_unspill(fs_inst *inst, fs_reg dst, uint32_t spill_offset,
+                         int count)
 {
-   fs_inst *unspill_inst = new(mem_ctx) fs_inst(FS_OPCODE_UNSPILL, dst);
-   unspill_inst->offset = spill_offset;
-   unspill_inst->ir = inst->ir;
-   unspill_inst->annotation = inst->annotation;
+   for (int i = 0; i < count; i++) {
+      fs_inst *unspill_inst = new(mem_ctx) fs_inst(FS_OPCODE_UNSPILL, dst);
+      unspill_inst->offset = spill_offset;
+      unspill_inst->ir = inst->ir;
+      unspill_inst->annotation = inst->annotation;
+
+      /* Choose a MRF that won't conflict with an MRF that's live across the
+       * spill.  Nothing else will make it up to MRF 14/15.
+       */
+      unspill_inst->base_mrf = 14;
+      unspill_inst->mlen = 1; /* header contains offset */
+      inst->insert_before(unspill_inst);
 
-   /* Choose a MRF that won't conflict with an MRF that's live across the
-    * spill.  Nothing else will make it up to MRF 14/15.
-    */
-   unspill_inst->base_mrf = 14;
-   unspill_inst->mlen = 1; /* header contains offset */
-   inst->insert_before(unspill_inst);
+      dst.reg_offset++;
+      spill_offset += REG_SIZE;
+   }
 }
 
 int
@@ -623,9 +629,14 @@ fs_visitor::spill_reg(int spill_reg)
       for (unsigned int i = 0; i < 3; i++) {
 	 if (inst->src[i].file == GRF &&
 	     inst->src[i].reg == spill_reg) {
-	    inst->src[i].reg = virtual_grf_alloc(1);
-	    emit_unspill(inst, inst->src[i],
-                         spill_offset + REG_SIZE * inst->src[i].reg_offset);
+            int regs_read = inst->regs_read(this, i);
+
+            inst->src[i].reg = virtual_grf_alloc(regs_read);
+            inst->src[i].reg_offset = 0;
+
+            emit_unspill(inst, inst->src[i],
+                         spill_offset + REG_SIZE * inst->src[i].reg_offset,
+                         regs_read);
 	 }
       }
 
@@ -641,12 +652,8 @@ fs_visitor::spill_reg(int spill_reg)
           * since we write back out all of the regs_written().
 	  */
 	 if (inst->predicate || inst->force_uncompressed || inst->force_sechalf) {
-            fs_reg unspill_reg = inst->dst;
-            for (int chan = 0; chan < inst->regs_written; chan++) {
-               emit_unspill(inst, unspill_reg,
-                            subset_spill_offset + REG_SIZE * chan);
-               unspill_reg.reg_offset++;
-            }
+            emit_unspill(inst, inst->dst, subset_spill_offset,
+                         inst->regs_written);
 	 }
 
 	 fs_reg spill_src = inst->dst;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 728567c..e659203 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1226,27 +1226,28 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
                               fs_reg shadow_c, fs_reg lod, fs_reg lod2,
                               fs_reg sample_index)
 {
-   int mlen = 0;
-   int base_mrf = 2;
    int reg_width = dispatch_width / 8;
    bool header_present = false;
    int offsets[3];
 
+   fs_reg payload = fs_reg(this, glsl_type::float_type);
+   fs_reg next = payload;
+
    if (ir->op == ir_tg4 || (ir->offset && ir->op != ir_txf)) {
-      /* * The offsets set up by the ir_texture visitor are in the
-       * m1 header, so we can't go headerless.
+      /* For general texture offsets (no txf workaround), we need a header to
+       * put them in.  Note that for 16-wide we're making space for two actual
+       * hardware registers here, so the emit will have to fix up for this.
        *
        * * ir4_tg4 needs to place its channel select in the header,
        * for interaction with ARB_texture_swizzle
        */
       header_present = true;
-      mlen++;
-      base_mrf--;
+      next.reg_offset++;
    }
 
    if (ir->shadow_comparitor) {
-      emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
-      mlen += reg_width;
+      emit(MOV(next, shadow_c));
+      next.reg_offset++;
    }
 
    /* Set up the LOD info */
@@ -1256,12 +1257,12 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    case ir_tg4:
       break;
    case ir_txb:
-      emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
-      mlen += reg_width;
+      emit(MOV(next, lod));
+      next.reg_offset++;
       break;
    case ir_txl:
-      emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
-      mlen += reg_width;
+      emit(MOV(next, lod));
+      next.reg_offset++;
       break;
    case ir_txd: {
       if (dispatch_width == 16)
@@ -1271,32 +1272,32 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
        */
       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen), coordinate));
+	 emit(MOV(next, coordinate));
 	 coordinate.reg_offset++;
-	 mlen += reg_width;
+	 next.reg_offset++;
 
          /* For cube map array, the coordinate is (u,v,r,ai) but there are
           * only derivatives for (u, v, r).
           */
          if (i < ir->lod_info.grad.dPdx->type->vector_elements) {
-            emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
+            emit(MOV(next, lod));
             lod.reg_offset++;
-            mlen += reg_width;
+            next.reg_offset++;
 
-            emit(MOV(fs_reg(MRF, base_mrf + mlen), lod2));
+            emit(MOV(next, lod2));
             lod2.reg_offset++;
-            mlen += reg_width;
+            next.reg_offset++;
          }
       }
       break;
    }
    case ir_txs:
-      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
-      mlen += reg_width;
+      emit(MOV(next.retype(BRW_REGISTER_TYPE_UD), lod));
+      next.reg_offset++;
       break;
    case ir_query_levels:
-      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), fs_reg(0)));
-      mlen += reg_width;
+      emit(MOV(next.retype(BRW_REGISTER_TYPE_UD), fs_reg(0u)));
+      next.reg_offset++;
       break;
    case ir_txf:
       /* It appears that the ld instruction used for txf does its
@@ -1314,40 +1315,37 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
       }
 
       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
-      emit(ADD(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D),
-               coordinate, offsets[0]));
+      emit(ADD(next.retype(BRW_REGISTER_TYPE_D), coordinate, offsets[0]));
       coordinate.reg_offset++;
-      mlen += reg_width;
+      next.reg_offset++;
 
-      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), lod));
-      mlen += reg_width;
+      emit(MOV(next.retype(BRW_REGISTER_TYPE_D), lod));
+      next.reg_offset++;
 
       for (int i = 1; i < ir->coordinate->type->vector_elements; i++) {
-	 emit(ADD(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D),
-                  coordinate, offsets[i]));
+	 emit(ADD(next.retype(BRW_REGISTER_TYPE_D), coordinate, offsets[i]));
 	 coordinate.reg_offset++;
-	 mlen += reg_width;
+	 next.reg_offset++;
       }
       break;
    case ir_txf_ms:
-      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), sample_index));
-      mlen += reg_width;
+      emit(MOV(next.retype(BRW_REGISTER_TYPE_UD), sample_index));
+      next.reg_offset++;
 
       /* constant zero MCS; we arrange to never actually have a compressed
        * multisample surface here for now. TODO: issue ld_mcs to get this first,
        * if we ever support texturing from compressed multisample surfaces
        */
-      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
-      mlen += reg_width;
+      emit(MOV(next.retype(BRW_REGISTER_TYPE_UD), fs_reg(0u)));
+      next.reg_offset++;
 
       /* there is no offsetting for this message; just copy in the integer
        * texture coordinates
        */
       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
-         emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D),
-                  coordinate));
+         emit(MOV(next.retype(BRW_REGISTER_TYPE_D), coordinate));
          coordinate.reg_offset++;
-         mlen += reg_width;
+         next.reg_offset++;
       }
       break;
    }
@@ -1355,32 +1353,37 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    /* Set up the coordinate (except for cases where it was done above) */
    if (ir->op != ir_txd && ir->op != ir_txs && ir->op != ir_txf && ir->op != ir_txf_ms && ir->op != ir_query_levels) {
       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen), coordinate));
+	 emit(MOV(next, coordinate));
 	 coordinate.reg_offset++;
-	 mlen += reg_width;
+	 next.reg_offset++;
       }
    }
 
    /* Generate the SEND */
    fs_inst *inst = NULL;
    switch (ir->op) {
-   case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst); break;
-   case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
-   case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst); break;
-   case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst); break;
-   case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst); break;
-   case ir_txf_ms: inst = emit(SHADER_OPCODE_TXF_MS, dst); break;
-   case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst); break;
-   case ir_query_levels: inst = emit(SHADER_OPCODE_TXS, dst); break;
-   case ir_lod: inst = emit(SHADER_OPCODE_LOD, dst); break;
-   case ir_tg4: inst = emit(SHADER_OPCODE_TG4, dst); break;
-   }
-   inst->base_mrf = base_mrf;
-   inst->mlen = mlen;
+   case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst, payload); break;
+   case ir_txb: inst = emit(FS_OPCODE_TXB, dst, payload); break;
+   case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst, payload); break;
+   case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst, payload); break;
+   case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst, payload); break;
+   case ir_txf_ms: inst = emit(SHADER_OPCODE_TXF_MS, dst, payload); break;
+   case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst, payload); break;
+   case ir_query_levels: inst = emit(SHADER_OPCODE_TXS, dst, payload); break;
+   case ir_lod: inst = emit(SHADER_OPCODE_LOD, dst, payload); break;
+   case ir_tg4: inst = emit(SHADER_OPCODE_TG4, dst, payload); break;
+   }
+   inst->base_mrf = -1;
+   if (reg_width == 2)
+      inst->mlen = next.reg_offset * reg_width - header_present;
+   else
+      inst->mlen = next.reg_offset * reg_width;
+
    inst->header_present = header_present;
    inst->regs_written = 4;
 
-   if (mlen > 11) {
+   virtual_grf_sizes[payload.reg] = next.reg_offset;
+   if (inst->mlen > 11) {
       fail("Message length >11 disallowed by hardware\n");
    }
 
@@ -1591,9 +1594,6 @@ fs_visitor::visit(ir_texture *ir)
                                lod, lod2);
    }
 
-   /* The header is set up by generate_tex() when necessary. */
-   inst->src[0] = reg_undef;
-
    if (ir->offset != NULL && ir->op != ir_txf)
       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
 
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 735ad93..b24c38c 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -569,7 +569,7 @@ fs_instruction_scheduler::calculate_deps()
       for (int i = 0; i < 3; i++) {
 	 if (inst->src[i].file == GRF) {
             if (post_reg_alloc) {
-               for (int r = 0; r < reg_width; r++)
+               for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
                   add_dep(last_grf_write[inst->src[i].reg + r], n);
             } else {
                add_dep(last_grf_write[inst->src[i].reg], n);
@@ -594,12 +594,14 @@ fs_instruction_scheduler::calculate_deps()
 	 }
       }
 
-      for (int i = 0; i < inst->mlen; i++) {
-	 /* It looks like the MRF regs are released in the send
-	  * instruction once it's sent, not when the result comes
-	  * back.
-	  */
-	 add_dep(last_mrf_write[inst->base_mrf + i], n);
+      if (inst->base_mrf != -1) {
+	 for (int i = 0; i < inst->mlen; i++) {
+	    /* It looks like the MRF regs are released in the send
+	     * instruction once it's sent, not when the result comes
+	     * back.
+	     */
+	    add_dep(last_mrf_write[inst->base_mrf + i], n);
+	 }
       }
 
       if (inst->predicate) {
@@ -642,7 +644,7 @@ fs_instruction_scheduler::calculate_deps()
 	 add_barrier_deps(n);
       }
 
-      if (inst->mlen > 0) {
+      if (inst->mlen > 0 && inst->base_mrf != -1) {
 	 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
 	    add_dep(last_mrf_write[inst->base_mrf + i], n);
 	    last_mrf_write[inst->base_mrf + i] = n;
@@ -677,7 +679,7 @@ fs_instruction_scheduler::calculate_deps()
       for (int i = 0; i < 3; i++) {
 	 if (inst->src[i].file == GRF) {
             if (post_reg_alloc) {
-               for (int r = 0; r < reg_width; r++)
+               for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
                   add_dep(n, last_grf_write[inst->src[i].reg + r]);
             } else {
                add_dep(n, last_grf_write[inst->src[i].reg]);
@@ -702,12 +704,14 @@ fs_instruction_scheduler::calculate_deps()
 	 }
       }
 
-      for (int i = 0; i < inst->mlen; i++) {
-	 /* It looks like the MRF regs are released in the send
-	  * instruction once it's sent, not when the result comes
-	  * back.
-	  */
-	 add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
+      if (inst->base_mrf != -1) {
+	 for (int i = 0; i < inst->mlen; i++) {
+	    /* It looks like the MRF regs are released in the send
+	     * instruction once it's sent, not when the result comes
+	     * back.
+	     */
+	    add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
+	 }
       }
 
       if (inst->predicate) {
@@ -749,7 +753,7 @@ fs_instruction_scheduler::calculate_deps()
 	 add_barrier_deps(n);
       }
 
-      if (inst->mlen > 0) {
+      if (inst->mlen > 0 && inst->base_mrf != -1) {
 	 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
 	    last_mrf_write[inst->base_mrf + i] = n;
 	 }