[Mesa-dev] [PATCH v3 4/4] i965/vec4: Don't unspill the same register in consecutive instructions

Tue Aug 4 03:26:30 PDT 2015

If we have spilled/unspilled a register in the current instruction, avoid
emitting unspills for the same register in the same instruction or consecutive
instructions following the current one as long as they keep reading the spilled
register. This should allow us to avoid emitting costy unspills that come with
little benefit to register allocation.

Also, update evaluate_spill_costs so that we account for the saved unspills.
---
 .../drivers/dri/i965/brw_vec4_reg_allocate.cpp     | 129 +++++++++++++++++++--
 1 file changed, 121 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
index 617c988..fed5f4d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
@@ -264,6 +264,95 @@ vec4_visitor::reg_allocate()
    return true;
 }
 
+/**
+ * When we decide to spill a register, instead of blindly spilling every use,
+ * save unspills when the spill register is used (read) in consecutive
+ * instructions. This can potentially save a bunch of unspills that would
+ * have very little impact in register allocation anyway.
+ *
+ * Notice that we need to account for this behavior when spilling a register
+ * and when evaluating spilling costs. This function is designed so it can
+ * be called from both places and avoid repeating the logic.
+ *
+ *  - When we call this function from spill_reg, we pass in scratch_reg the
+ *    actual unspill/spill register that we want to reuse in the current
+ *    instruction.
+ *
+ *  - When we call this from evaluate_spill_costs, we pass the register for
+ *    which we are evaluating spilling costs.
+ *
+ * In either case, we check if the previous instructions read scratch_reg until
+ * we find an instruction that writes to it (in which case we can reuse
+ * scratch_reg as long as the writemask is compatible with the channels we need
+ * to read in the current instruction) or we hit an instruction that does not
+ * read scratch_reg at all. The latter can only happen when we call this from
+ * evaluate_spill_costs, and means that this is the point at which we first
+ * need the unspill this register for our current instruction. Since all our
+ * unspills read a full vec4, we know that in this case we will have all
+ * the channels available in scratch_reg and we can reuse it.
+ *
+ * In any other case, we can't reuse scratch_reg in the current instruction,
+ * meaning that we will need to unspill it.
+ */
+static bool
+can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,
+                           unsigned scratch_reg)
+{
+   assert(inst->src[i].file == GRF);
+
+   /* If the current instruction is already using scratch_reg in src[n] with
+    * n < i, then we know we can reuse it for src[i] too.
+    */
+   for (unsigned n = 0; n < i; n++) {
+      if (inst->src[n].file == GRF && inst->src[n].reg == scratch_reg)
+         return true;
+   }
+
+   bool prev_inst_read_scratch_reg = false;
+   vec4_instruction *prev_inst = (vec4_instruction *) inst->prev;
+   for (; !prev_inst->is_head_sentinel();
+        prev_inst = (vec4_instruction *) prev_inst->prev) {
+      /* If any previous instruction does not read from or write to scratch_reg
+       * inconditonally we cannot reuse scratch_reg
+       */
+      if (prev_inst->predicate && prev_inst->opcode != BRW_OPCODE_SEL)
+         return false;
+
+      /* If the previous instruction writes to scratch_reg then we can reuse
+       * it if the channels we wrote are compatible with our read mask.
+       */
+      if (prev_inst->dst.file == GRF && prev_inst->dst.reg == scratch_reg) {
+         return (brw_mask_for_swizzle(inst->src[i].swizzle) &
+                 ~prev_inst->dst.writemask) == 0;
+     }
+
+      if (prev_inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE ||
+          prev_inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ)
+         continue;
+
+      /* Check that the previous instruction reads scratch_reg, if so, continue
+       * looping. Otherwise it means that we got here from
+       * evaluate_spill_costs and this is the point at which we will emit an
+       * unspill for the register passed in scratch_reg, in which case we can
+       * only reuse it if all other instructions in between have read
+       * scratch_reg.
+       */
+      int n;
+      for (n = 0; n < 3; n++) {
+         if (prev_inst->src[n].file == GRF &&
+             prev_inst->src[n].reg == scratch_reg) {
+            prev_inst_read_scratch_reg = true;
+            break;
+         }
+      }
+      if (n == 3) {
+         return prev_inst_read_scratch_reg;
+      }
+   }
+
+   return false;
+}
+
 void
 vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
 {
@@ -281,9 +370,15 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       for (unsigned int i = 0; i < 3; i++) {
          if (inst->src[i].file == GRF) {
-            spill_costs[inst->src[i].reg] += loop_scale;
-            if (inst->src[i].reladdr)
-               no_spill[inst->src[i].reg] = true;
+            /* We will only unspill src[i] it it wasn't unspilled for the
+             * previous instruction, in which case we'll just reuse the scratch
+             * reg for this instruction.
+             */
+            if (!can_use_scratch_for_source(inst, i, inst->src[i].reg)) {
+               spill_costs[inst->src[i].reg] += loop_scale;
+               if (inst->src[i].reladdr)
+                  no_spill[inst->src[i].reg] = true;
+            }
          }
       }
 
@@ -342,19 +437,37 @@ vec4_visitor::spill_reg(int spill_reg_nr)
    unsigned int spill_offset = last_scratch++;
 
    /* Generate spill/unspill instructions for the objects being spilled. */
+   int scratch_reg = -1;
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      /* We don't spill registers used for scratch */
+      if (inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ ||
+          inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
+         continue;
+
       for (unsigned int i = 0; i < 3; i++) {
          if (inst->src[i].file == GRF && inst->src[i].reg == spill_reg_nr) {
-            src_reg spill_reg = inst->src[i];
-            inst->src[i].reg = alloc.allocate(1);
-            dst_reg temp = dst_reg(inst->src[i]);
-
-            emit_scratch_read(block, inst, temp, spill_reg, spill_offset);
+            if (scratch_reg == -1 ||
+                !can_use_scratch_for_source(inst, i, scratch_reg)) {
+               /* We need to unspill anyway so make sure we read the full vec4
+                * in any case. This way, the cached register can be reused
+                * for consecutive instructions that read different channels of
+                * the same vec4.
+                */
+               scratch_reg = alloc.allocate(1);
+               src_reg temp = inst->src[i];
+               temp.reg = scratch_reg;
+               temp.swizzle = BRW_SWIZZLE_XYZW;
+               emit_scratch_read(block, inst,
+                                 dst_reg(temp), inst->src[i], spill_offset);
+            }
+            assert(scratch_reg != -1);
+            inst->src[i].reg = scratch_reg;
          }
       }
 
       if (inst->dst.file == GRF && inst->dst.reg == spill_reg_nr) {
          emit_scratch_write(block, inst, spill_offset);
+         scratch_reg = inst->dst.reg;
       }
    }
 
-- 
1.9.1