[Mesa-dev] [PATCH RFC 6/6] i965/gen7: emulate SIMD16 sample_d with dual SIMD8 sample_d

Mon Sep 30 01:27:50 PDT 2013

From: Chia-I Wu <olv at lunarg.com>

Add fs_visitor::emit_dual_texture_gen7 that emulates SIMD16 sample_d with dual
SIMD8 sample_d on gen7+.  Fix fs_generator::generate_tex to send SIMD8
messages when force_uncompressed or force_sechalf is set.

No piglit quick.tests regression on Ivy Bridge and Haswell.

With this change, I am seeing 6.76479% +/- 0.619064% (at 95.0% confidence)
improvement on Xonotic with Ultra effects.

Signed-off-by: Chia-I Wu <olv at lunarg.com>
---
 src/mesa/drivers/dri/i965/brw_fs.h           |   3 +
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 137 ++++++++++++++++++++++++++-
 2 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index c161e7d..82a0a7d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -335,6 +335,9 @@ public:
    fs_inst *emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
                               fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
                               fs_reg sample_index, int sampler);
+   void emit_dual_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
+                               fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
+                               fs_reg sample_index, int sampler);
    fs_inst *emit_texture(ir_texture *ir, fs_reg dst, int base_mrf, int mlen,
                          bool header_present, int regs_written, int sampler);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 6435a17..b9f97b6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1334,6 +1334,133 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    return emit_texture(ir, dst, base_mrf, mlen, header_present, 4, sampler);
 }
 
+/* Emulate a SIMD16 sampler message with dual SIMD8 sampler messages.  For
+ * now, and for practical reasons, only ir_txd is supported.
+ */
+void
+fs_visitor::emit_dual_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
+                                   fs_reg shadow_c, fs_reg lod, fs_reg lod2,
+                                   fs_reg sample_index, int sampler)
+{
+   /* not a SIMD16 txd: fall back to a single emit_texture_gen7() message */
+   if (dispatch_width != 16 || ir->op != ir_txd) {
+      emit_texture_gen7(ir, dst, coordinate, shadow_c,
+                        lod, lod2, sample_index, sampler);
+      return;
+   }
+
+   const int reg_width = 1;
+   int mlen = 0;
+   int base_mrf = 2;
+   bool header_present = false;
+   fs_reg temp = fs_reg(GRF, virtual_grf_alloc(4),
+         brw_type_for_base_type(ir->type));
+
+   emit(FS_OPCODE_OVERWRITE_DST, dst);
+   emit(FS_OPCODE_OVERWRITE_DST, temp);
+
+   for (int msg = 0; msg < 2; msg++) {
+      if (msg == 0)
+         push_force_uncompressed();
+      else
+         push_force_sechalf();
+
+      /* only txd is supported for now */
+      assert(ir->op == ir_txd);
+
+      if (ir->offset) {
+         /* The offsets set up by the ir_texture visitor are in the
+          * m1 header, so we can't go headerless.
+          */
+         header_present = true;
+         mlen++;
+         base_mrf--;
+      }
+
+      if (ir->shadow_comparitor) {
+         emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
+         mlen += reg_width;
+      }
+
+      /* Load the coordinate and its derivatives interleaved:
+       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+       */
+      fs_reg coord = coordinate, ddx = lod, ddy = lod2;
+      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
+         emit(MOV(fs_reg(MRF, base_mrf + mlen), coord));
+         coord.reg_offset++;
+         mlen += reg_width;
+
+         /* For cube map array, the coordinate is (u,v,r,ai) but there are
+          * only derivatives for (u, v, r).
+          */
+         if (i < ir->lod_info.grad.dPdx->type->vector_elements) {
+            emit(MOV(fs_reg(MRF, base_mrf + mlen), ddx));
+            ddx.reg_offset++;
+            mlen += reg_width;
+
+            emit(MOV(fs_reg(MRF, base_mrf + mlen), ddy));
+            ddy.reg_offset++;
+            mlen += reg_width;
+         }
+      }
+
+      if (mlen > 11) {
+         fail("Message length >11 disallowed by hardware\n");
+         break;
+      }
+
+      /* response length is 4, which is 2 vgrf */
+      emit_texture(ir, temp, base_mrf, mlen, header_present, 2, sampler);
+
+      if (msg == 0) {
+         /* move the first message's result from temp to dst */
+         for (int i = 0; i < 4; i++) {
+            fs_reg d = dst;
+            d.reg_offset += i;
+
+            fs_reg s = temp;
+            s.reg_offset += i / 2;
+            s.sechalf = (i % 2);
+
+            emit(MOV(d, s));
+         }
+
+         pop_force_uncompressed();
+
+         /* use non-overlapping MRF range for the second message if possible */
+         if (base_mrf + mlen * 2 < BRW_MAX_MRF)
+            base_mrf += mlen;
+
+         mlen = 0;
+
+         temp.reg_offset += 2;
+
+         coordinate.sechalf = true;
+         shadow_c.sechalf = true;
+         lod.sechalf = true;
+         lod2.sechalf = true;
+         sample_index.sechalf = true;
+      }
+      else {
+         /* move the second message's result from temp to the sechalf of dst */
+         for (int i = 0; i < 4; i++) {
+            fs_reg d = dst;
+            d.reg_offset += i;
+            d.sechalf = true;
+
+            fs_reg s = temp;
+            s.reg_offset += i / 2;
+            s.sechalf = (i % 2);
+
+            emit(MOV(d, s));
+         }
+
+         pop_force_sechalf();
+      }
+   }
+}
+
 fs_reg
 fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate,
                              bool is_rect, int sampler, int texunit)
@@ -1503,8 +1630,14 @@ fs_visitor::visit(ir_texture *ir)
    fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));
 
    if (brw->gen >= 7) {
-      emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
-                        lod, lod2, sample_index, sampler);
+      if (dispatch_width == 16 && ir->op == ir_txd) {
+         emit_dual_texture_gen7(ir, dst, coordinate, shadow_comparitor,
+                                lod, lod2, sample_index, sampler);
+      }
+      else {
+         emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
+                           lod, lod2, sample_index, sampler);
+      }
    } else if (brw->gen >= 5) {
       emit_texture_gen5(ir, dst, coordinate, shadow_comparitor,
                         lod, lod2, sample_index, sampler);
-- 
1.8.3.1