Mesa (master): broadcom/compiler: convert add to mul when possible to allow merge

Thu Mar 25 10:03:59 UTC 2021

Module: Mesa
Branch: master
Commit: 22a979be6516da89d1d1b3c0a8923c5d236b1abd
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=22a979be6516da89d1d1b3c0a8923c5d236b1abd

Author: Iago Toral Quiroga <itoral at igalia.com>
Date:   Thu Mar 18 13:03:01 2021 +0100

broadcom/compiler: convert add to mul when possible to allow merge

Integer add/sub can be implemented as either an add or a mul instruction,
but we always emit them as add instructions at the VIR level. We can use
this flexibility to make QPU scheduling more effective at instruction
merging: when we attempt to merge one of these with another add
instruction, we convert it to a mul instruction.

total instructions in shared programs: 13721549 -> 13691004 (-0.22%)
instructions in affected programs: 3340493 -> 3309948 (-0.91%)
helped: 12805
HURT: 1656
Instructions are helped.

total max-temps in shared programs: 2319528 -> 2319317 (<.01%)
max-temps in affected programs: 5285 -> 5074 (-3.99%)
helped: 195
HURT: 3
Max-temps are helped.

total sfu-stalls in shared programs: 31616 -> 31752 (0.43%)
sfu-stalls in affected programs: 469 -> 605 (29.00%)
helped: 52
HURT: 161
Sfu-stalls are HURT.

total inst-and-stalls in shared programs: 13753165 -> 13722756 (-0.22%)
inst-and-stalls in affected programs: 3340383 -> 3309974 (-0.91%)
helped: 12782
HURT: 1666
Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9769>

---

 src/broadcom/compiler/qpu_schedule.c | 95 +++++++++++++++++++++++++++++++++---
 1 file changed, 87 insertions(+), 8 deletions(-)

diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 317b7306d88..cd0015a62d3 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -820,6 +820,50 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
         return true;
 }
 
+static bool
+can_do_add_as_mul(enum v3d_qpu_add_op op)
+{
+        /* Integer ADD and SUB are the only add-ALU opcodes this helper
+         * accepts; add_op_as_mul_op() maps each one to its mul-ALU twin.
+         */
+        const bool is_int_add = (op == V3D_QPU_A_ADD);
+        const bool is_int_sub = (op == V3D_QPU_A_SUB);
+
+        return is_int_add || is_int_sub;
+}
+
+static enum v3d_qpu_mul_op
+add_op_as_mul_op(enum v3d_qpu_add_op op)
+{
+        /* Only the opcodes accepted by can_do_add_as_mul() are handled. */
+        if (op == V3D_QPU_A_ADD)
+                return V3D_QPU_M_ADD;
+
+        if (op == V3D_QPU_A_SUB)
+                return V3D_QPU_M_SUB;
+
+        unreachable("unexpected add opcode");
+}
+
+static void
+qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
+{
+        STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
+        assert(inst->alu.add.op != V3D_QPU_A_NOP);
+        assert(inst->alu.mul.op == V3D_QPU_M_NOP);
+        /* Move inst's add ALU op, in place, into its (empty) mul ALU slot. */
+        memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
+        inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
+        inst->alu.add.op = V3D_QPU_A_NOP;
+        /* The flags follow the op; clear each add flag with its own enum. */
+        inst->flags.mc = inst->flags.ac;
+        inst->flags.mpf = inst->flags.apf;
+        inst->flags.muf = inst->flags.auf;
+        inst->flags.ac = V3D_QPU_COND_NONE;
+        inst->flags.apf = V3D_QPU_PF_NONE;
+        inst->flags.auf = V3D_QPU_UF_NONE;
+}
+
 static bool
 qpu_merge_inst(const struct v3d_device_info *devinfo,
                struct v3d_qpu_instr *result,
@@ -837,17 +881,52 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
         struct v3d_qpu_instr merge = *a;
         const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;
 
+        struct v3d_qpu_instr mul_inst;
         if (b->alu.add.op != V3D_QPU_A_NOP) {
-                if (a->alu.add.op != V3D_QPU_A_NOP)
-                        return false;
-                merge.alu.add = b->alu.add;
+                if (a->alu.add.op == V3D_QPU_A_NOP) {
+                        merge.alu.add = b->alu.add;
+
+                        merge.flags.ac = b->flags.ac;
+                        merge.flags.apf = b->flags.apf;
+                        merge.flags.auf = b->flags.auf;
+
+                        add_instr = b;
+                        mul_instr = a;
+                }
+                /* If a's add op is used but its mul op is not, then see if we
+                 * can convert either a's add op or b's add op to a mul op
+                 * so we can merge.
+                 */
+                else if (a->alu.mul.op == V3D_QPU_M_NOP &&
+                         can_do_add_as_mul(b->alu.add.op)) {
+                        mul_inst = *b;
+                        qpu_convert_add_to_mul(&mul_inst);
 
-                merge.flags.ac = b->flags.ac;
-                merge.flags.apf = b->flags.apf;
-                merge.flags.auf = b->flags.auf;
+                        merge.alu.mul = mul_inst.alu.mul;
 
-                add_instr = b;
-                mul_instr = a;
+                        merge.flags.mc = b->flags.ac;
+                        merge.flags.mpf = b->flags.apf;
+                        merge.flags.muf = b->flags.auf;
+
+                        add_instr = a;
+                        mul_instr = &mul_inst;
+                } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
+                           can_do_add_as_mul(a->alu.add.op)) {
+                        mul_inst = *a;
+                        qpu_convert_add_to_mul(&mul_inst);
+
+                        merge = mul_inst;
+                        merge.alu.add = b->alu.add;
+
+                        merge.flags.ac = b->flags.ac;
+                        merge.flags.apf = b->flags.apf;
+                        merge.flags.auf = b->flags.auf;
+
+                        add_instr = b;
+                        mul_instr = &mul_inst;
+                } else {
+                        return false;
+                }
         }
 
         if (b->alu.mul.op != V3D_QPU_M_NOP) {