[Mesa-dev] [PATCH 46/95] i965/vec4: add a scalarization pass for double-precision instructions

Tue Jul 19 10:40:43 UTC 2016

The hardware only supports 32-bit swizzles, which means that a swizzle
like XYZW only selects channels XY of a DF, making access to channels ZW
more difficult, especially considering the various regioning restrictions
imposed by the hardware. The combination of both things makes handling
random swizzles on DF operands rather difficult, as there are many
combinations that can't be represented at all, at least not without
some work and some level of instruction splitting.

Writemasks are 64-bit in general; however, XY and ZW writemasks also work
in 32-bit, which means these writemasks can't be represented natively,
adding to the complexity.

For now, we decided to try and simplify things as much as possible to
avoid dealing with all this from the get-go by adding a scalarization
pass that runs after the main optimization loop. By fully scalarizing
DF instructions in Align16 we avoid most of the complexity introduced
by the aforementioned hardware restrictions and have an easier path to
an initial fully functional version for the vector backend in Haswell.

Later, we can improve the implementation so we don't necessarily
scalarize everything, iteratively adding more complexity and building
on top of a framework that is already working. Curro drafted some ideas
for how this could be done here:
https://bugs.freedesktop.org/show_bug.cgi?id=92760#c82
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp | 99 ++++++++++++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_vec4.h   |  1 +
 2 files changed, 100 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 610c45d..6bbe5da 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -2051,6 +2051,103 @@ vec4_visitor::lower_simd_width()
    return progress;
 }
 
+/* Returns true when inst uses one of the double-precision helper opcodes
+ * listed below.  scalarize_df() skips these because they operate in
+ * Align1 mode.
+ */
+static bool
+is_align1_df(vec4_instruction *inst)
+{
+   return inst->opcode == VEC4_OPCODE_DOUBLE_TO_SINGLE ||
+          inst->opcode == VEC4_OPCODE_SINGLE_TO_DOUBLE ||
+          inst->opcode == VEC4_OPCODE_PICK_LOW_32BIT ||
+          inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT ||
+          inst->opcode == VEC4_OPCODE_SET_LOW_32BIT ||
+          inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT;
+   /* Any other opcode is not an Align1 DF helper. */
+}
+
+/* Predicate for a one-channel copy: BRW_PREDICATE_NORMAL maps to the
+ * ALIGN16_REPLICATE mode of the channel in writemask, others pass through.
+ */
+static brw_predicate
+scalarize_predicate(brw_predicate predicate, unsigned writemask)
+{
+   if (predicate == BRW_PREDICATE_NORMAL) {
+      if (writemask == WRITEMASK_X)
+         return BRW_PREDICATE_ALIGN16_REPLICATE_X;
+      if (writemask == WRITEMASK_Y)
+         return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+      if (writemask == WRITEMASK_Z)
+         return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+      if (writemask == WRITEMASK_W)
+         return BRW_PREDICATE_ALIGN16_REPLICATE_W;
+      unreachable("invalid writemask");
+   }
+   return predicate;
+}
+
+/* Splits every Align16 double-precision instruction into one instruction
+ * per enabled destination channel, with each source swizzle replicated to
+ * the component that channel read.  Returns true if anything was changed.
+ */
+bool
+vec4_visitor::scalarize_df()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      /* Skip DF instructions that operate in Align1 mode */
+      if (is_align1_df(inst))
+         continue;
+
+      /* Check if this is a double-precision instruction */
+      bool is_double = type_sz(inst->dst.type) == 8;
+      for (int arg = 0; !is_double && arg < 3; arg++) {
+         is_double = inst->src[arg].file != BAD_FILE &&
+                     type_sz(inst->src[arg].type) == 8;
+      }
+
+      if (!is_double)
+         continue;
+
+      /* Generate scalar instructions for each enabled channel */
+      for (unsigned chan = 0; chan < 4; chan++) {
+         unsigned chan_mask = 1 << chan;
+         if (!(inst->dst.writemask & chan_mask))
+            continue;
+
+         /* Clone the whole instruction rather than rebuilding it field by
+          * field, so state not listed here (e.g. predicate inversion) is kept.
+          */
+         vec4_instruction *scalar_inst =
+            new(mem_ctx) vec4_instruction(*inst);
+
+         /* Each source reads only the component this channel selected */
+         for (unsigned i = 0; i < 3; i++) {
+            unsigned swz = BRW_GET_SWZ(inst->src[i].swizzle, chan);
+            scalar_inst->src[i].swizzle = BRW_SWIZZLE4(swz, swz, swz, swz);
+         }
+
+         scalar_inst->dst.writemask = chan_mask;
+         if (inst->predicate != BRW_PREDICATE_NONE) {
+            scalar_inst->predicate =
+               scalarize_predicate(inst->predicate, chan_mask);
+         }
+         inst->insert_before(block, scalar_inst);
+      }
+
+      /* The per-channel copies replace the original instruction */
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 bool
 vec4_visitor::run()
 {
@@ -2148,6 +2245,8 @@ vec4_visitor::run()
    if (failed)
       return false;
 
+   OPT(scalarize_df);
+
    setup_payload();
 
    if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index e4c4e91..7abcc33 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -161,6 +161,7 @@ public:
    void convert_to_hw_regs();
 
    bool lower_simd_width();
+   bool scalarize_df();
 
    vec4_instruction *emit(vec4_instruction *inst);
 
-- 
2.7.4