[Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations
Pierre Moreau
pierre.morrow at free.fr
Tue Oct 18 18:03:51 UTC 2016
Hello Ian,
Since I am working on a direct SPIR-V to NV50 IR translator, ultimately to be
used for OpenCL kernels, I will still need the patch for that work. (I even
wrote that patch because I needed it when handling 64-bit addresses. :-) )
But thanks for the heads-up!
Pierre
On 02:07 pm - Oct 17 2016, Ian Romanick wrote:
> I don't know if it will make this patch unnecessary, but I have a GLSL
> IR-level lowering pass for 64-bit multiplication. I'm going to send
> that out with the rest of the GL_ARB_gpu_shader_int64 series within the
> next day or so.
>
> On 10/15/2016 03:24 PM, Pierre Moreau wrote:
> > Hardware does not support 64-bit integer MAD and MUL operations, so we need
> > to transform them into 32-bit operations.
> >
> > Signed-off-by: Pierre Moreau <pierre.morrow at free.fr>
> > ---
> > .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 121 +++++++++++++++++++++
> > 1 file changed, 121 insertions(+)
> >
> > Tested with (the GPU result was compared to the CPU result):
> > * 0xfffffffffffffff3lu * 0xfffffffffffffff2lu + 0x8000000700000002lu
> > * 0xfffffffffffffff3lu * 0x8000000700000002lu + 0x8000000700000002lu
> > * 0x8000000100000003lu * 0xfffffffffffffff2lu + 0x8000000700000002lu
> > * 0x8000000100000003lu * 0x8000000700000002lu + 0x8000000700000002lu
> >
> > * -523456791234l * 929835793793l + -100005793793l
> > * 523456791234l * 929835793793l + -100005793793l
> > * -523456791234l * -929835793793l + -100005793793l
> > * 523456791234l * -929835793793l + -100005793793l
> >
> > v2:
> > * Completely re-write the patch, as it was completely flawed (Ilia Mirkin)
> > * Move pass prior to Register Allocation, as some temporaries need to
> > be created.
> >
> > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> > index d88bb34..a610eb5 100644
> > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> > @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i)
> >
> > // =============================================================================
> >
> > +// Split 64-bit MUL and MAD
> > +class Split64BitOpPreRA : public Pass
> > +{
> > +private:
> > + virtual bool visit(BasicBlock *);
> > + void split64BitReg(Function *, Instruction *, Instruction *,
> > + Instruction *, Value *, int);
> > + void split64MulMad(Function *, Instruction *, DataType);
> > +
> > + BuildUtil bld;
> > +};
> > +
> > +bool
> > +Split64BitOpPreRA::visit(BasicBlock *bb)
> > +{
> > + Instruction *i, *next;
> > + Modifier mod;
> > +
> > + for (i = bb->getEntry(); i; i = next) {
> > + next = i->next;
> > +
> > + if (typeSizeof(i->dType) != 8)
> > + continue;
> > +
> > + DataType hTy;
> > + switch (i->dType) {
> > + case TYPE_U64: hTy = TYPE_U32; break;
> > + case TYPE_S64: hTy = TYPE_S32; break;
> > + default:
> > + continue;
> > + }
> > +
> > + if (i->op == OP_MAD || i->op == OP_MUL)
> > + split64MulMad(bb->getFunction(), i, hTy);
> > + }
> > +
> > + return true;
> > +}
> > +
> > +void
> > +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
> > +{
> > + assert(i->op == OP_MAD || i->op == OP_MUL);
> > + if (isFloatType(i->dType) || isFloatType(i->sType))
> > + return;
> > +
> > + bld.setPosition(i, true);
> > +
> > + Value *zero = bld.mkImm(0u);
> > + Value *carry = bld.getSSA(1, FILE_FLAGS);
> > +
> > + // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
> > + // values (a, b and c might be 32-bit values), using 32-bit operations. This
> > + // gives the following operations:
> > + // * `d.low = low(a.low * b.low) (+ c.low)?`
> > + // * `d.high = low(a.high * b.low) + low(a.low * b.high)
> > + // + high(a.low * b.low) (+ c.high)?`
> > + //
> > + // To compute the high bits, we can split in the following operations:
> > + // * `tmp1 = low(a.high * b.low) (+ c.high)?`
> > + // * `tmp2 = low(a.low * b.high) + tmp1`
> > + // * `d.high = high(a.low * b.low) + tmp2`
> > + //
> > + // mkSplit put lower bits at index 0 and higher bits at index 1
> > +
> > + Value *op1[2];
> > + if (i->getSrc(0)->reg.size == 8)
> > + bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0));
> > + else {
> > + op1[0] = i->getSrc(0);
> > + op1[1] = zero;
> > + }
> > + Value *op2[2];
> > + if (i->getSrc(1)->reg.size == 8)
> > + bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1));
> > + else {
> > + op2[0] = i->getSrc(1);
> > + op2[1] = zero;
> > + }
> > +
> > + Value *op3[2] = { NULL, NULL };
> > + if (i->op == OP_MAD) {
> > + if (i->getSrc(2)->reg.size == 8)
> > + bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2));
> > + else {
> > + op3[0] = i->getSrc(2);
> > + op3[1] = zero;
> > + }
> > + }
> > +
> > + Value *tmpRes1Hi = bld.getSSA();
> > + if (i->op == OP_MAD)
> > + bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]);
> > + else
> > + bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]);
> > +
> > + Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], tmpRes1Hi);
> > +
> > + Value *def[2] = { bld.getSSA(), bld.getSSA() };
> > +
> > + // If it was a MAD, add the carry from the low bits
> > + // It is not needed if it was a MUL, since we added high(a.low * b.low) to
> > + // d.high
> > + if (i->op == OP_MAD)
> > + bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], op3[0])->setFlagsDef(1, carry);
> > + else
> > + bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]);
> > +
> > + Instruction *hiPart3 = bld.mkOp3(OP_MAD, hTy, def[1], op1[0], op2[0], tmpRes2Hi);
> > + hiPart3->subOp = NV50_IR_SUBOP_MUL_HIGH;
> > + if (i->op == OP_MAD)
> > + hiPart3->setFlagsSrc(3, carry);
> > +
> > + bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]);
> > +
> > + delete_Instruction(fn->getProgram(), i);
> > +}
> > +
> > +// =============================================================================
> > +
> > static inline void
> > updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
> > {
> > @@ -3523,6 +3643,7 @@ Program::optimizeSSA(int level)
> > RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
> > RUN_PASS(1, ConstantFolding, foldAll);
> > RUN_PASS(2, LateAlgebraicOpt, run);
> > + RUN_PASS(1, Split64BitOpPreRA, run);
> > RUN_PASS(1, LoadPropagation, run);
> > RUN_PASS(1, IndirectPropagation, run);
> > RUN_PASS(2, MemoryOpt, run);
> >
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 801 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20161018/9e7f9bbc/attachment.sig>
More information about the mesa-dev
mailing list