[Mesa-dev] [PATCH v3] nv50/ir: Split 64-bit integer MAD/MUL operations
Pierre Moreau
pierre.morrow at free.fr
Sun Oct 30 21:34:25 UTC 2016
Hardware does not support 64-bit integer MAD and MUL operations, so we need
to transform them into 32-bit operations.
Signed-off-by: Pierre Moreau <pierre.morrow at free.fr>
---
.../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 116 +++++++++++++++++++++
1 file changed, 116 insertions(+)
Tested with (the GPU result was compared to the CPU result):
* 0xfffffffffffffff3lu * 0xfffffffffffffff2lu + 0x8000000700000002lu
* 0xfffffffffffffff3lu * 0x8000000700000002lu + 0x8000000700000002lu
* 0x8000000100000003lu * 0xfffffffffffffff2lu + 0x8000000700000002lu
* 0x8000000100000003lu * 0x8000000700000002lu + 0x8000000700000002lu
* -523456791234l * 929835793793l + -100005793793l
* 523456791234l * 929835793793l + -100005793793l
* -523456791234l * -929835793793l + -100005793793l
* 523456791234l * -929835793793l + -100005793793l
v2:
* Completely re-write the patch, as it was completely flawed (Ilia Mirkin)
* Move pass prior to Register Allocation, as some temporaries need to
be created.
v3:
* Remove left-over prototype `split64Reg()`
* Remove redundant check for 64-bit destination type in `visit()` (Ilia Mirkin)
* Use the `func` attribute when calling split64MulMad (Ilia Mirkin)
* Change the check that the source and destination types are not float
types into an assert (Ilia Mirkin)
* Replace `typeSizeof(hTy)` with 4, as that is always its value, and add an
assert for it in `split64MulMad()`
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 0fb1a78..da6bbc4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -2234,6 +2234,121 @@ LateAlgebraicOpt::visit(Instruction *i)
// =============================================================================
+// Pass that rewrites 64-bit integer MUL and MAD instructions as sequences of
+// 32-bit operations. As the name says, it is meant to run before register
+// allocation, because the expansion creates new temporaries.
+class Split64BitOpPreRA : public Pass
+{
+private:
+   // Builder used to emit the replacement instructions.
+   BuildUtil bld;
+
+   // Per-basic-block entry point of the pass.
+   virtual bool visit(BasicBlock *bb);
+
+   // Expands a single MUL/MAD `insn`, using `halfTy` for the 32-bit halves.
+   void split64MulMad(Function *fn, Instruction *insn, DataType halfTy);
+};
+
+// Scans one basic block and replaces every 64-bit integer MUL/MAD with its
+// 32-bit expansion.
+//
+// Instructions whose opcode is not OP_MUL/OP_MAD, or whose dType is neither
+// TYPE_U64 nor TYPE_S64, are left untouched. Always returns true.
+bool
+Split64BitOpPreRA::visit(BasicBlock *bb)
+{
+   Instruction *next;
+
+   for (Instruction *i = bb->getEntry(); i; i = next) {
+      // split64MulMad() deletes `i`, so fetch its successor beforehand.
+      next = i->next;
+
+      if (i->op != OP_MAD && i->op != OP_MUL)
+         continue;
+
+      // 32-bit type used for the halves; signedness follows the 64-bit type.
+      DataType hTy;
+      switch (i->dType) {
+      case TYPE_U64: hTy = TYPE_U32; break;
+      case TYPE_S64: hTy = TYPE_S32; break;
+      default:
+         continue;
+      }
+
+      split64MulMad(func, i, hTy);
+   }
+
+   return true;
+}
+
+// Replaces the 64-bit integer MUL/MAD `i` with 32-bit instructions, merges
+// their two results into the original destination, and deletes `i`.
+//
+// fn  - only used to reach the Program when deleting `i`
+// i   - the OP_MUL or OP_MAD instruction to expand (asserted)
+// hTy - 4-byte type used for every emitted 32-bit operation (asserted)
+//
+// With `d = a * b (+ c)?` and each 64-bit value seen as a pair of 32-bit
+// halves, the result is:
+// * `d.low  = low(a.low * b.low) (+ c.low)?`
+// * `d.high = low(a.high * b.low) + low(a.low * b.high)
+//           + high(a.low * b.low) (+ c.high)?`
+//
+// d.high is accumulated in three steps:
+// * `cross1 = low(a.high * b.low) (+ c.high)?`
+// * `cross2 = low(a.low * b.high) + cross1`
+// * `d.high = high(a.low * b.low) + cross2`
+//
+// NOTE(review): the high(a.low * b.low) step is emitted with `hTy`, which is
+// a signed type when the caller passes TYPE_S32, and sources narrower than
+// 8 bytes get a zero high half whatever their signedness -- confirm that
+// both are what the hardware lowering needs.
+void
+Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
+{
+   assert(i->op == OP_MAD || i->op == OP_MUL);
+   assert(!isFloatType(i->dType) && !isFloatType(i->sType));
+   assert(typeSizeof(hTy) == 4);
+
+   const bool isMad = (i->op == OP_MAD);
+
+   bld.setPosition(i, true);
+
+   Value *const zero = bld.mkImm(0u);
+   Value *const carry = bld.getSSA(1, FILE_FLAGS);
+
+   // half[s][0] / half[s][1] hold the low / high 32 bits of source s.
+   // mkSplit puts the lower bits at index 0 and the higher bits at index 1.
+   // A source that is not 8 bytes wide is used directly as the low half, with
+   // the immediate 0 as its high half. The third source only exists for MAD.
+   Value *half[3][2] = { { NULL, NULL }, { NULL, NULL }, { NULL, NULL } };
+   const int numSrcs = isMad ? 3 : 2;
+   for (int s = 0; s < numSrcs; ++s) {
+      Value *const src = i->getSrc(s);
+      if (src->reg.size == 8) {
+         bld.mkSplit(half[s], 4, src);
+      } else {
+         half[s][0] = src;
+         half[s][1] = zero;
+      }
+   }
+   Value **const a = half[0];
+   Value **const b = half[1];
+   Value **const c = half[2];
+
+   // cross1 = low(a.high * b.low) (+ c.high)?
+   Value *const cross1 = bld.getSSA();
+   if (isMad)
+      bld.mkOp3(OP_MAD, hTy, cross1, a[1], b[0], c[1]);
+   else
+      bld.mkOp2(OP_MUL, hTy, cross1, a[1], b[0]);
+
+   // cross2 = low(a.low * b.high) + cross1
+   Value *const cross2 =
+      bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), a[0], b[1], cross1);
+
+   Value *const dLo = bld.getSSA();
+   Value *const dHi = bld.getSSA();
+
+   // d.low. For a MAD, the instruction also defines `carry`, which the high
+   // part consumes below. A MUL defines no flags: its low half involves no
+   // addition, and high(a.low * b.low) is added into d.high anyway.
+   if (isMad)
+      bld.mkOp3(OP_MAD, hTy, dLo, a[0], b[0], c[0])->setFlagsDef(1, carry);
+   else
+      bld.mkOp2(OP_MUL, hTy, dLo, a[0], b[0]);
+
+   // d.high = high(a.low * b.low) + cross2, plus the carry for a MAD.
+   Instruction *const mulHi = bld.mkOp3(OP_MAD, hTy, dHi, a[0], b[0], cross2);
+   mulHi->subOp = NV50_IR_SUBOP_MUL_HIGH;
+   if (isMad)
+      mulHi->setFlagsSrc(3, carry);
+
+   // Reassemble the 64-bit result into the original destination.
+   bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), dLo, dHi);
+
+   delete_Instruction(fn->getProgram(), i);
+}
+
+// =============================================================================
+
static inline void
updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
{
@@ -3539,6 +3654,7 @@ Program::optimizeSSA(int level)
RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
RUN_PASS(1, ConstantFolding, foldAll);
RUN_PASS(2, LateAlgebraicOpt, run);
+ RUN_PASS(1, Split64BitOpPreRA, run);
RUN_PASS(1, LoadPropagation, run);
RUN_PASS(1, IndirectPropagation, run);
RUN_PASS(2, MemoryOpt, run);
--
2.10.0
More information about the mesa-dev
mailing list