[Mesa-dev] [PATCH 13/25] i965/fs: Skip SIMD lowering destination zipping if possible.

Sat May 28 02:05:54 UTC 2016

Skipping the temporary allocation and copy instructions is easy (just
return dst), but the conditions used to find out whether the copy can
be optimized out safely without breaking the program are rather
complex: The destination must be exactly one component of at most the
execution width of the lowered instruction, and all source regions of
the instruction must be either fully disjoint from the destination or
be aligned with it group by group.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 81 ++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8eff27e..5d26c68 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -5054,6 +5054,78 @@ emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst,
 }
 
 /**
+ * Return true if splitting out the group of channels of instruction \p inst
+ * given by lbld.group() requires allocating a temporary for the destination
+ * of the lowered instruction and copying the data back to the original
+ * destination region.
+ *
+ * Note that the result of this function may be different for different
+ * channel groups in cases where the overlap between a source and destination
+ * region is partial.
+ */
+static inline bool
+needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
+{
+   /* If the instruction writes more than one component we'll have to shuffle
+    * the results of multiple lowered instructions in order to make sure that
+    * they end up arranged correctly in the original destination region.
+    */
+   if (inst->regs_written * REG_SIZE >
+       inst->dst.component_size(inst->exec_size))
+      return true;
+
+   /* If the lowered execution size is larger than the original the result of
+    * the instruction won't fit in the original destination, so we'll have to
+    * allocate a temporary in any case.
+    */
+   if (lbld.dispatch_width() > inst->exec_size)
+      return true;
+
+   for (unsigned i = 0; i < inst->sources; i++) {
+      /* Index of this lowered instruction and total number of lowered
+       * instructions.
+       */
+      const unsigned j = lbld.group() / lbld.dispatch_width();
+      const unsigned n = DIV_ROUND_UP(inst->exec_size, lbld.dispatch_width());
+
+      /* Specified channels from the source and destination regions. */
+      const fs_reg src = horiz_offset(inst->src[i], lbld.group());
+      const fs_reg dst = horiz_offset(inst->dst, lbld.group());
+
+      /* Lowered component size of the source and destination regions. */
+      const unsigned dsrc = inst->src[i].component_size(lbld.dispatch_width());
+      const unsigned ddst = inst->dst.component_size(lbld.dispatch_width());
+
+      /* If we already made a copy of the source for other reasons there won't
+       * be any overlap with the destination.
+       */
+      if (!is_periodic(inst->src[i], lbld.dispatch_width()) &&
+          inst->components_read(i) != 1)
+         continue;
+
+      /* We need a copy if the specified group of channels of the destination
+       * overlaps either the previous or subsequent groups of the source.  We
+       * don't care whether the destination of a given lowered instruction
+       * overlaps its own source because there's arguably no
+       * source/destination hazard for this opcode if the source and
+       * destination of the unlowered instruction were already overlapping.
+       *
+       * Note that it might be possible to avoid emitting a copy in more cases
+       * by making assumptions about the ordering in which the lowered
+       * instructions will be emitted (ugh!), but this has the advantage that
+       * the lowered instructions will be fully independent from each other
+       * regardless of the form of overlap so they can be scheduled freely.
+       */
+      if (regions_overlap(dst, ddst, inst->src[i], dsrc * j) ||
+          regions_overlap(dst, ddst, horiz_offset(src, lbld.dispatch_width()),
+                           dsrc * (n - j - 1)))
+         return true;
+   }
+
+   return false;
+}
+
+/**
  * Insert data from a packed temporary into the channel group given by
  * lbld.group() of the destination region of instruction \p inst and return
  * the temporary as result.  If any copy instructions are required they will
@@ -5073,6 +5145,8 @@ emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)
    const fs_reg dst = horiz_offset(inst->dst, lbld.group());
    const unsigned dst_size = inst->regs_written * REG_SIZE /
             inst->dst.component_size(inst->exec_size);
+
+   if (needs_dst_copy(lbld, inst)) {
    const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size);
 
    if (inst->predicate) {
@@ -5090,6 +5164,13 @@ emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)
           .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k));
 
    return tmp;
+
+   } else {
+      /* No need to allocate a temporary for the lowered instruction, just
+       * take the right group of channels from the original region.
+       */
+      return dst;
+   }
 }
 
 bool
-- 
2.7.3