[Mesa-dev] [PATCH v2 33/53] intel/fs: Emit LINE+MAC for LINTERP with unaligned coordinates

Mon May 28 17:01:06 UTC 2018

On g4x through Sandy Bridge, src1 (the coordinates) of the PLN
instruction is required to be an even register number.  When it's odd
(which can happen with SIMD32), we have to emit a LINE+MAC combination
instead.  Unfortunately, we can't just fall through to the gen4 case
because the input registers are still set up for PLN which lays out the
four src1 registers differently in SIMD16 than LINE.

v2 (Jason Ekstrand):
 - Take advantage of both accumulators and emit LINE LINE MAC MAC
 - Unify the gen4 and gen4x-6 cases using a loop
---
 src/intel/compiler/brw_fs_generator.cpp | 55 +++++++++++++++++++++++++--------
 src/intel/compiler/brw_shader.cpp       |  3 +-
 2 files changed, 44 insertions(+), 14 deletions(-)

diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 548a208..7943914 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -760,28 +760,57 @@ fs_generator::generate_linterp(fs_inst *inst,
       brw_pop_insn_state(p);
 
       return true;
-   } else if (devinfo->has_pln) {
+   } else if (devinfo->has_pln &&
+              (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
+      brw_PLN(p, dst, interp, delta_x);
+
+      return false;
+   } else {
       /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane":
        *
        *    "[DevSNB]:<src1> must be even register aligned.
        *
        * This restriction is lifted on Ivy Bridge.
+       *
+       * This means that we need to split PLN into LINE+MAC on-the-fly.
+       * Unfortunately, the inputs are laid out for PLN and not LIN+MAC so
+       * we have to split into SIMD8 pieces.  For gen4 (!has_pln), the
+       * coordinate registers are laid out differently so we leave it as a
+       * SIMD16 instruction.
        */
-      assert(devinfo->gen >= 7 || (delta_x.nr & 1) == 0);
-      brw_PLN(p, dst, interp, delta_x);
-
-      return false;
-   } else {
-      i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x);
-      i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y);
+      assert(inst->exec_size == 8 || inst->exec_size == 16);
+      assert(inst->group % 16 == 0);
 
-      brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod);
+      const unsigned lower_size = devinfo->has_pln ? 8 : inst->exec_size;
+      brw_set_default_exec_size(p, cvt(lower_size) - 1);
 
-      /* brw_set_default_saturate() is called before emitting instructions, so
-       * the saturate bit is set in each instruction, so we need to unset it on
-       * the first instruction.
+      /* Thanks to two accumulators, we can emit all the LINEs and then all
+       * the MACs.  This improves parallelism a bit.
        */
-      brw_inst_set_saturate(p->devinfo, i[0], false);
+      for (unsigned g = 0; g < lower_size / 8; g++) {
+         brw_inst *line = brw_LINE(p, brw_null_reg(), interp,
+                                   offset(delta_x, g * 2));
+         brw_inst_set_group(devinfo, line, inst->group + g * lower_size);
+
+         /* LINE writes the accumulator automatically on gen4-5.  On Sandy
+          * Bridge and later, we have to explicitly enable it.
+          */
+         if (devinfo->gen >= 6)
+            brw_inst_set_acc_wr_control(p->devinfo, line, true);
+
+         /* brw_set_default_saturate() is called before emitting
+          * instructions, so the saturate bit is set in each instruction,
+          * so we need to unset it on the LINE instructions.
+          */
+         brw_inst_set_saturate(p->devinfo, line, false);
+      }
+
+      for (unsigned g = 0; g < lower_size / 8; g++) {
+         brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1),
+                                 offset(delta_x, g * 2 + 1));
+         brw_inst_set_group(devinfo, mac, inst->group + g * lower_size);
+         brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod);
+      }
 
       return true;
    }
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
index dfd2c5c..8610c30 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -985,7 +985,8 @@ backend_instruction::writes_accumulator_implicitly(const struct gen_device_info
           (devinfo->gen < 6 &&
            ((opcode >= BRW_OPCODE_ADD && opcode < BRW_OPCODE_NOP) ||
             (opcode >= FS_OPCODE_DDX_COARSE && opcode <= FS_OPCODE_LINTERP))) ||
-          (opcode == FS_OPCODE_LINTERP && !devinfo->has_pln);
+          (opcode == FS_OPCODE_LINTERP &&
+           (!devinfo->has_pln || devinfo->gen <= 6));
 }
 
 bool
-- 
2.5.0.400.gff86faf