[Mesa-dev] [PATCH 40/95] i965/vec4: add a SIMD lowering pass

Tue Jul 19 10:40:37 UTC 2016

Generally, instructions in Align16 mode only ever write to a single
register and don't need anny form of SIMD splitting, that's why we
have never had a SIMD splitting pass in the vec4 backend. However,
double-precision instructions typically write 2 registers and in
some cases they run into certain hardware bugs and limitations
that we need to work around by splitting the instructions so we only
write to 1 register at a time.

This patch implements a basic SIMD splitting pass for this purpose.
Notice that it does not attempt to be as flexible and generic as the
FS version, because as I explain above, the cases where this needs
to act are more limited, so we take advantage of that to simplify
the implementation.

Because we only use double-precision instructions in Align16 mode
in gen7 (gen8+ is fully scalar and gens < 7 do not implement fp64)
the pass is restricted to act on gen7 hardware only.

For now the pass only handles the gen7 restriction where any
instruction that writes 2 registers also needs to read 2 registers.
This affects double-precision instructions reading uniforms, for
example. Later patches will extend the lowering pass adding a few
more cases.
---
 src/mesa/drivers/dri/i965/brw_ir_vec4.h |   1 +
 src/mesa/drivers/dri/i965/brw_vec4.cpp  | 100 +++++++++++++++++++++++++++++++-
 src/mesa/drivers/dri/i965/brw_vec4.h    |   2 +
 3 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
index 721772e..f66c093 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
@@ -167,6 +167,7 @@ public:
    unsigned sol_vertex; /**< gen6: used for setting dst index in SVB header */
 
    uint8_t exec_size;
+   uint8_t group;
 
    bool is_send_from_grf();
    unsigned regs_read(unsigned arg) const;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 8316691..829b7d3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1947,6 +1947,101 @@ vec4_visitor::convert_to_hw_regs()
    }
 }
 
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst.  The instruction will be left untouched by
+ * vec4_visitor::lower_simd_width() if the returned value matches the
+ * instruction's original execution size.
+ */
+static unsigned
+get_lowered_simd_width(const struct brw_device_info *devinfo,
+                       const vec4_instruction *inst)
+{
+   /* For now we only need to split some cases of double-precision instructions
+    * that write 2 registers. We only need to care about this in gen7 because
+    * that is the only hardware that implements fp64 in Align16.
+    */
+   if (devinfo->gen != 7 || inst->regs_written < 2)
+      return inst->exec_size;
+
+   unsigned lowered_width = MIN2(8, inst->exec_size);
+
+   /* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct
+    * Register Addressing:
+    *
+    *    "When destination spans two registers, the source MUST span two
+    *     registers."
+    */
+   for (unsigned i = 0; i < 3; i++) {
+      if (inst->src[i].file == BAD_FILE)
+         continue;
+      if (inst->regs_read(i) < 2)
+         lowered_width = MIN2(lowered_width, 4);
+   }
+
+   return lowered_width;
+}
+
+bool
+vec4_visitor::lower_simd_width()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      const unsigned lowered_width = get_lowered_simd_width(devinfo, inst);
+      assert(lowered_width <= inst->exec_size);
+      if (lowered_width == inst->exec_size)
+         continue;
+
+      /* For now we only support splitting 8-wide instructions into 4-wide */
+      assert(inst->exec_size == 8 && lowered_width == 4);
+
+      /* We always split so that each lowered instruction writes exactly to
+       * one register.
+       */
+      assert(inst->regs_written == inst->exec_size / lowered_width);
+
+      for (unsigned n = 0; n < inst->exec_size / lowered_width; n++)  {
+         dst_reg dst = offset(inst->dst, n);
+
+         src_reg srcs[3];
+         for (int i = 0; i < 3; i++) {
+            srcs[i] = inst->src[i];
+
+            if (srcs[i].file == BAD_FILE)
+               continue;
+
+            if (!is_uniform(srcs[i])) {
+               if (type_sz(srcs[i].type) == 8) {
+                  srcs[i] = offset(srcs[i], n);
+               } else {
+                  assert(lowered_width * n < 8);
+                  srcs[i].subnr += lowered_width * n;
+               }
+            }
+         }
+
+         vec4_instruction *linst = new(mem_ctx)
+            vec4_instruction(inst->opcode, dst, srcs[0], srcs[1], srcs[2]);
+         linst->exec_size = lowered_width;
+         linst->group = lowered_width * n;
+         linst->regs_written = 1;
+         linst->conditional_mod = inst->conditional_mod;
+         linst->predicate = inst->predicate;
+         linst->saturate = inst->saturate;
+         inst->insert_before(block, linst);
+      }
+
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 bool
 vec4_visitor::run()
 {
@@ -2002,9 +2097,12 @@ vec4_visitor::run()
       backend_shader::dump_instructions(filename);
    }
 
-   bool progress;
+   bool progress = false;
    int iteration = 0;
    int pass_num = 0;
+
+   OPT(lower_simd_width);
+
    do {
       progress = false;
       pass_num = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index cf7cdab..e4c4e91 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -160,6 +160,8 @@ public:
    void opt_schedule_instructions();
    void convert_to_hw_regs();
 
+   bool lower_simd_width();
+
    vec4_instruction *emit(vec4_instruction *inst);
 
    vec4_instruction *emit(enum opcode opcode);
-- 
2.7.4