[Mesa-dev] [PATCHv3 3/3] i965/gen7: emulate SIMD16 sample_d with dual SIMD8 sample_d

Wed Oct 16 08:58:11 CEST 2013

From: Chia-I Wu <olv at lunarg.com>

Add fs_visitor::emit_dual_texture_gen7, which emulates SIMD16 sample_d with
dual SIMD8 sample_d messages on gen7+.

No piglit quick.tests regression on Ivy Bridge and Haswell.

Improved Xonotic with Ultra effects by 6.0209% +/- 0.396586% (N=11) on
Haswell.

v2: no change
v3: reworked because of texture-from-GRF changes

Signed-off-by: Chia-I Wu <olv at lunarg.com>
---
 src/mesa/drivers/dri/i965/brw_fs.h           |   3 +
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 117 ++++++++++++++++++++++++++-
 2 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index c2ba351..05bf39e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -347,6 +347,9 @@ public:
    fs_inst *emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
                               fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
                               fs_reg sample_index, int sampler);
+   void emit_dual_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
+                               fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
+                               fs_reg sample_index, int sampler);
    fs_inst *emit_texture(ir_texture *ir, fs_reg dst, fs_reg payload, int mlen,
                          bool header_present, int regs_written, int sampler);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index d164b04..19e3f1e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1394,6 +1394,114 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    return emit_texture(ir, dst, payload, mlen, header_present, 4, sampler);
 }
 
+/* Emulate a SIMD16 sampler message with dual SIMD8 sampler messages.  For
+ * now, and for practical reasons, only ir_txd is supported.
+ */
+void
+fs_visitor::emit_dual_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
+                                   fs_reg shadow_c, fs_reg lod, fs_reg lod2,
+                                   fs_reg sample_index, int sampler)
+{
+   /* not a SIMD16 txd: emit one regular message, nothing to split */
+   if (dispatch_width != 16 || ir->op != ir_txd) {
+      emit_texture_gen7(ir, dst, coordinate, shadow_c,
+                        lod, lod2, sample_index, sampler);
+      return;
+   }
+
+   fs_reg simd8_dst = fs_reg(GRF, virtual_grf_alloc(4),
+         brw_type_for_base_type(ir->type));
+   /* Step reg by one half: toggle sechalf, bumping reg_offset on wrap. */
+#define ADVANCE_HALF(reg) \
+   do { reg.reg_offset += reg.sechalf; reg.sechalf = !reg.sechalf; } while (0)
+
+   for (int msg = 0; msg < 2; msg++) {
+      bool header_present = false;
+      fs_reg payload = fs_reg(this, glsl_type::float_type);
+      fs_reg next = payload;
+
+      if (msg == 0)
+         push_force_uncompressed();
+      else
+         push_force_sechalf();
+
+      /* only txd is supported for now */
+      assert(ir->op == ir_txd);
+
+      if (ir->offset) {
+         /* Need the header to put texture offsets in; skip over its slot */
+         header_present = true;
+         ADVANCE_HALF(next);
+      }
+
+      if (ir->shadow_comparitor) {
+         emit(MOV(next, shadow_c));
+         ADVANCE_HALF(next);
+      }
+
+      /* Interleave each coordinate component with its derivatives:
+       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+       */
+      fs_reg coord = coordinate, ddx = lod, ddy = lod2;
+      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
+         emit(MOV(next, coord));
+         coord.reg_offset++;
+         ADVANCE_HALF(next);
+
+         /* For cube map array, the coordinate is (u,v,r,ai) but there are
+          * only derivatives for (u, v, r).
+          */
+         if (i < ir->lod_info.grad.dPdx->type->vector_elements) {
+            emit(MOV(next, ddx));
+            ddx.reg_offset++;
+            ADVANCE_HALF(next);
+
+            emit(MOV(next, ddy));
+            ddy.reg_offset++;
+            ADVANCE_HALF(next);
+         }
+      }
+      /* mlen counts halves: two per reg_offset step, plus one if sechalf */
+      int mlen = next.reg_offset * 2 + next.sechalf;
+      if (mlen > 11) {
+         fail("Message length >11 disallowed by hardware\n");
+         break; /* NOTE(review): exits without the matching pop_force_*() */
+      }
+
+      /* Message length is mlen and response length is 4.  In vgrf, that means
+       * (mlen + 1) / 2 registers for payload and 2 registers for writeback.
+       */
+      virtual_grf_sizes[payload.reg] = (mlen + 1) / 2;
+      emit_texture(ir, simd8_dst, payload, mlen, header_present, 2, sampler);
+
+      fs_reg d = dst, s = simd8_dst;
+      d.sechalf = (msg == 1);
+
+      /* swizzle the result to match SIMD16 writeback: one MOV per channel */
+      for (int i = 0; i < 4; i++) {
+         emit(MOV(d, s));
+         d.reg_offset++;
+         ADVANCE_HALF(s);
+      }
+
+      if (msg == 0) {
+         pop_force_uncompressed();
+
+         /* prepare for the second message: read the second half of inputs */
+         simd8_dst.reg_offset += 2;
+         coordinate.sechalf = true;
+         shadow_c.sechalf = true;
+         lod.sechalf = true;
+         lod2.sechalf = true;
+         sample_index.sechalf = true;
+      } else {
+         pop_force_sechalf();
+      }
+   }
+
+#undef ADVANCE_HALF
+}
+
 fs_reg
 fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate,
                              bool is_rect, int sampler, int texunit)
@@ -1586,8 +1694,13 @@ fs_visitor::visit(ir_texture *ir)
    fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));
 
    if (brw->gen >= 7) {
-      emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
-                        lod, lod2, sample_index, sampler);
+      if (dispatch_width == 16 && ir->op == ir_txd) {
+         emit_dual_texture_gen7(ir, dst, coordinate, shadow_comparitor,
+                                lod, lod2, sample_index, sampler);
+      } else {
+         emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
+                           lod, lod2, sample_index, sampler);
+      }
    } else if (brw->gen >= 5) {
       emit_texture_gen5(ir, dst, coordinate, shadow_comparitor,
                         lod, lod2, sample_index, sampler);
-- 
1.8.3.1