[Mesa-dev] [PATCH 3/4] nv50/ir: optimize imul/imad to xmads
Rhys Perry
pendingchaos02 at gmail.com
Wed Jun 13 22:02:51 UTC 2018
This hits the shader-db numbers a good bit, though a few xmads are much
faster than an imul or imad, and the cost is mitigated by the next commit,
which optimizes many multiplications by immediates into shorter and less
register-heavy instructions than the xmads.
total instructions in shared programs : 5256901 -> 5294693 (0.72%)
total gprs used in shared programs : 624328 -> 624962 (0.10%)
total shared used in shared programs : 360704 -> 360704 (0.00%)
total local used in shared programs : 20952 -> 21048 (0.46%)
        local  shared  gpr  inst  bytes
helped      0       0   39     0      0
hurt        1       0  334  2277   2277
Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
---
.../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 53 ++++++++++++++++++++++
1 file changed, 53 insertions(+)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index a43b481a01..84cb5eb04b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -2246,13 +2246,18 @@ AlgebraicOpt::visit(BasicBlock *bb)
// =============================================================================
// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
+// MUL(a, b) -> a few XMADs
+// MAD/FMA(a, b, c) -> a few XMADs
class LateAlgebraicOpt : public Pass
{
private:
virtual bool visit(Instruction *);
void handleADD(Instruction *);
+ void handleMULMAD(Instruction *); // rewrites eligible non-float MUL/MAD/FMA into a sequence of XMADs
bool tryADDToSHLADD(Instruction *);
+
+ BuildUtil bld; // builder handleMULMAD uses to create the replacement XMAD instructions
};
void
@@ -2312,6 +2317,49 @@ LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
return true;
}
+
+// MUL(a, b) -> a few XMADs (three are emitted, with an immediate 0 as the addend)
+// MAD/FMA(a, b, c) -> a few XMADs (three are emitted, with c as the addend)
+void
+LateAlgebraicOpt::handleMULMAD(Instruction *i)
+{
+ // TODO: handle NV50_IR_SUBOP_MUL_HIGH (for now any instruction with a non-zero subOp is skipped below)
+ if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32)) // target cannot do a U32 XMAD: leave i untouched
+ return;
+ if (isFloatType(i->dType) || typeSizeof(i->dType) != 4) // only non-float, 4-byte destination types are rewritten
+ return;
+ if (i->subOp || i->usesFlags() || i->flagsDef >= 0) // skip sub-operations and, presumably, anything that reads or writes condition flags
+ return;
+
+ assert(!i->src(0).mod); // the replacement XMADs do not carry over source modifiers, so none may be present
+ assert(!i->src(1).mod);
+ assert(i->op == OP_MUL ? 1 : !i->src(2).mod); // source 2 is only checked when i is not a plain MUL
+
+ bld.setPosition(i, true); // NOTE(review): presumably 'true' inserts the new instructions after i -- confirm against BuildUtil
+
+ Value *a = i->getSrc(0);
+ Value *b = i->getSrc(1);
+ Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2); // a plain MUL has no addend, so an immediate 0 stands in for it
+
+ Value *tmp0 = bld.getSSA(); // receives the result of the first XMAD
+ Value *tmp1 = bld.getSSA(); // receives the result of the second XMAD
+
+ Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c); // 1st XMAD: tmp0 from (b, a, c), no modifiers or subOp set
+ insn->setPredicate(i->cc, i->getPredicate()); // each replacement instruction inherits i's predicate and condition code
+
+ insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0)); // 2nd XMAD: tmp1 from (b, a, 0)
+ insn->setPredicate(i->cc, i->getPredicate());
+ insn->src(1).mod = NV50_IR_MOD_H1; // H1 on source 1 (a); presumably selects its upper 16 bits -- TODO confirm
+ insn->subOp = NV50_IR_SUBOP_XMAD_MRG;
+
+ insn = bld.mkOp3(OP_XMAD, TYPE_U32, i->getDef(0), b, tmp1, tmp0); // 3rd XMAD: writes i's original destination from (b, tmp1, tmp0)
+ insn->setPredicate(i->cc, i->getPredicate());
+ insn->src(0).mod = NV50_IR_MOD_H1; // H1 is set on both source 0 (b) and source 1 (tmp1) here
+ insn->src(1).mod = NV50_IR_MOD_H1;
+ insn->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
+
+ delete_Instruction(prog, i); // the third XMAD now defines i's result, so the original instruction is removed
+}
bool
LateAlgebraicOpt::visit(Instruction *i)
@@ -2320,6 +2368,11 @@ LateAlgebraicOpt::visit(Instruction *i)
case OP_ADD:
handleADD(i);
break;
+ case OP_MUL:
+ case OP_MAD:
+ case OP_FMA:
+ handleMULMAD(i);
+ break;
default:
break;
}
--
2.14.4
More information about the mesa-dev
mailing list