Mesa (master): vc4: Reserve rb31 instead of r3 for raddr conflict spills.

Tue Dec 9 09:05:15 UTC 2014

Module: Mesa
Branch: master
Commit: 8420a956924c720b3c4932a577623f836758c21c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=8420a956924c720b3c4932a577623f836758c21c

Author: Eric Anholt <eric at anholt.net>
Date:   Mon Dec  8 16:52:53 2014 -0800

vc4: Reserve rb31 instead of r3 for raddr conflict spills.

This increases the cost of a raddr b conflict spill (save r3 to rb31, move
src1 to r3, then move rb31 back to r3 when done, instead of just moving src1
to r3), but on average, thanks to instruction pairing, it's more worthwhile
to have another accumulator.

total instructions in shared programs: 46428 -> 46171 (-0.55%)
instructions in affected programs:     38030 -> 37773 (-0.68%)

---

 src/gallium/drivers/vc4/vc4_qpu_emit.c          |   50 +++++++++++++++++++----
 src/gallium/drivers/vc4/vc4_register_allocate.c |    6 +--
 2 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 856f844..f2620c0 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -93,21 +93,41 @@ swap_file(struct qpu_reg *src)
  * In that case, we need to move one to a temporary that can be used in the
  * instruction, instead.
  */
-static void
+static bool
 fixup_raddr_conflict(struct vc4_compile *c,
-                     struct qpu_reg *src0, struct qpu_reg *src1)
+                     struct qpu_reg dst,
+                     struct qpu_reg *src0, struct qpu_reg *src1,
+                     bool r3_live)
 {
         if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
             src0->mux != src1->mux ||
             src0->addr == src1->addr) {
-                return;
+                return false;
         }
 
         if (swap_file(src0) || swap_file(src1))
-                return;
+                return false;
+
+        if (src0->mux == QPU_MUX_A) {
+                /* If we're conflicting over the A regfile, then we can just
+                 * use the reserved rb31.
+                 */
+                queue(c, qpu_a_MOV(qpu_rb(31), *src1));
+                *src1 = qpu_rb(31);
+                return false;
+        } else {
+                /* Otherwise, we need a non-B regfile.  So, we spill r3 out to
+                 * rb31, then store our desired value in r3, and tell the
+                 * caller to put rb31 back into r3 when we're done.
+                 */
+                if (r3_live)
+                        queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
+                queue(c, qpu_a_MOV(qpu_r3(), *src1));
+
+                *src1 = qpu_r3();
 
-        queue(c, qpu_a_MOV(qpu_r3(), *src1));
-        *src1 = qpu_r3();
+                return r3_live && dst.mux != QPU_MUX_R3;
+        }
 }
 
 void
@@ -118,6 +138,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t inputs_remaining = c->num_inputs;
         uint32_t vpm_read_fifo_count = 0;
         uint32_t vpm_read_offset = 0;
+        bool written_r3 = false;
+        bool needs_restore;
 
         make_empty_list(&c->qpu_inst_list);
 
@@ -416,8 +438,12 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         break;
 
                 case QOP_TEX_DIRECT:
-                        fixup_raddr_conflict(c, &src[0], &src[1]);
+                        needs_restore = fixup_raddr_conflict(c, dst,
+                                                             &src[0], &src[1],
+                                                             written_r3);
                         queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
+                        if (needs_restore)
+                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
                         break;
 
                 case QOP_TEX_RESULT:
@@ -477,7 +503,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         if (qir_get_op_nsrc(qinst->op) == 1)
                                 src[1] = src[0];
 
-                        fixup_raddr_conflict(c, &src[0], &src[1]);
+                        needs_restore = fixup_raddr_conflict(c, dst,
+                                                             &src[0], &src[1],
+                                                             written_r3);
 
                         if (translate[qinst->op].is_mul) {
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
@@ -488,8 +516,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                                     dst,
                                                     src[0], src[1]));
                         }
+                        if (needs_restore)
+                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
+
                         break;
                 }
+
+                if (dst.mux == QPU_MUX_R3)
+                        written_r3 = true;
         }
 
         qpu_schedule_instructions(c);
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index 3001900..85f29e5 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -117,10 +117,10 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
 
         vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
         for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
-                /* Reserve r3 for now, since we're using it for spilling-like
-                 * operations in vc4_qpu_emit.c
+                /* Reserve rb31 for spilling fixup_raddr_conflict() in
+                 * vc4_qpu_emit.c
                  */
-                if (vc4_regs[i].mux == QPU_MUX_R3)
+                if (vc4_regs[i].mux == QPU_MUX_B && vc4_regs[i].addr == 31)
                         continue;
 
                 /* R4 can't be written as a general purpose register. (it's