[Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations

Sat Oct 15 22:38:20 UTC 2016

On Sat, Oct 15, 2016 at 6:24 PM, Pierre Moreau <pierre.morrow at free.fr> wrote:
> Hardware does not support 64-bit integer MAD and MUL operations, so we need
> to transform them into 32-bit operations.
>
> Signed-off-by: Pierre Moreau <pierre.morrow at free.fr>
> ---
>  .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 121 +++++++++++++++++++++
>  1 file changed, 121 insertions(+)
>
> Tested with (the GPU result was compared to the CPU result):
> * 0xfffffffffffffff3lu * 0xfffffffffffffff2lu + 0x8000000700000002lu
> * 0xfffffffffffffff3lu * 0x8000000700000002lu + 0x8000000700000002lu
> * 0x8000000100000003lu * 0xfffffffffffffff2lu + 0x8000000700000002lu
> * 0x8000000100000003lu * 0x8000000700000002lu + 0x8000000700000002lu
>
> * -523456791234l * 929835793793l + -100005793793l
> *  523456791234l * 929835793793l + -100005793793l
> * -523456791234l * -929835793793l + -100005793793l
> *  523456791234l * -929835793793l + -100005793793l
>
> v2:
> * Completely re-write the patch, as it was completely flawed (Ilia Mirkin)
> * Move pass prior to Register Allocation, as some temporaries need to
>   be created.

In principle I like this approach. I don't remember what your old one
was, but this is good. I think that nearly all of our "legalize" step
items, including the gpu-family specific ones, need to be moved to
this type of pass.

>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> index d88bb34..a610eb5 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i)
>
>  // =============================================================================
>
> +// Split 64-bit MUL and MAD
> +class Split64BitOpPreRA : public Pass
> +{
> +private:
> +   virtual bool visit(BasicBlock *);
> +   void split64BitReg(Function *, Instruction *, Instruction *,
> +                      Instruction *, Value *, int);
> +   void split64MulMad(Function *, Instruction *, DataType);
> +
> +   BuildUtil bld;
> +};
> +
> +bool
> +Split64BitOpPreRA::visit(BasicBlock *bb)
> +{
> +   Instruction *i, *next;
> +   Modifier mod;
> +
> +   for (i = bb->getEntry(); i; i = next) {
> +      next = i->next;
> +
> +      if (typeSizeof(i->dType) != 8)
> +         continue;

Is this necessary? You exclusively operate on U64/S64 below.

> +
> +      DataType hTy;
> +      switch (i->dType) {
> +      case TYPE_U64: hTy = TYPE_U32; break;
> +      case TYPE_S64: hTy = TYPE_S32; break;
> +      default:
> +         continue;
> +      }
> +
> +      if (i->op == OP_MAD || i->op == OP_MUL)
> +         split64MulMad(bb->getFunction(), i, hTy);

There's an instance variable "func" (and "prog") you can use.

> +   }
> +
> +   return true;
> +}
> +
> +void
> +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
> +{
> +   assert(i->op == OP_MAD || i->op == OP_MUL);
> +   if (isFloatType(i->dType) || isFloatType(i->sType))
> +      return;

I'd make this into an assert. Given the checks before calling this
function, it can't really happen.

> +
> +   bld.setPosition(i, true);
> +
> +   Value *zero = bld.mkImm(0u);
> +   Value *carry = bld.getSSA(1, FILE_FLAGS);
> +
> +   // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
> +   // values (a, b and c might be 32-bit values), using 32-bit operations. This
> +   // gives the following operations:
> +   // * `d.low = low(a.low * b.low) (+ c.low)?`
> +   // * `d.high = low(a.high * b.low) + low(a.low * b.high)
> +   //           + high(a.low * b.low) (+ c.high)?`
> +   //
> +   // To compute the high bits, we can split in the following operations:
> +   // * `tmp1   = low(a.high * b.low) (+ c.high)?`
> +   // * `tmp2   = low(a.low * b.high) + tmp1`
> +   // * `d.high = high(a.low * b.low) + tmp2`
> +   //
> +   // mkSplit puts the lower bits at index 0 and the higher bits at index 1
> +
> +   Value *op1[2];
> +   if (i->getSrc(0)->reg.size == 8)
> +      bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0));
> +   else {
> +      op1[0] = i->getSrc(0);
> +      op1[1] = zero;
> +   }
> +   Value *op2[2];
> +   if (i->getSrc(1)->reg.size == 8)
> +      bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1));
> +   else {
> +      op2[0] = i->getSrc(1);
> +      op2[1] = zero;
> +   }
> +
> +   Value *op3[2] = { NULL, NULL };
> +   if (i->op == OP_MAD) {
> +      if (i->getSrc(2)->reg.size == 8)
> +         bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2));
> +      else {
> +         op3[0] = i->getSrc(2);
> +         op3[1] = zero;
> +      }
> +   }
> +
> +   Value *tmpRes1Hi = bld.getSSA();
> +   if (i->op == OP_MAD)
> +      bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]);
> +   else
> +      bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]);
> +
> +   Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], tmpRes1Hi);
> +
> +   Value *def[2] = { bld.getSSA(), bld.getSSA() };
> +
> +   // If it was a MAD, add the carry from the low bits
> +   // It is not needed if it was a MUL, since we added high(a.low * b.low) to
> +   // d.high
> +   if (i->op == OP_MAD)
> +      bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], op3[0])->setFlagsDef(1, carry);
> +   else
> +      bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]);
> +
> +   Instruction *hiPart3 = bld.mkOp3(OP_MAD, hTy, def[1], op1[0], op2[0], tmpRes2Hi);
> +   hiPart3->subOp = NV50_IR_SUBOP_MUL_HIGH;
> +   if (i->op == OP_MAD)
> +      hiPart3->setFlagsSrc(3, carry);
> +
> +   bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]);
> +
> +   delete_Instruction(fn->getProgram(), i);
> +}
> +
> +// =============================================================================
> +
>  static inline void
>  updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
>  {
> @@ -3523,6 +3643,7 @@ Program::optimizeSSA(int level)
>     RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
>     RUN_PASS(1, ConstantFolding, foldAll);
>     RUN_PASS(2, LateAlgebraicOpt, run);
> +   RUN_PASS(1, Split64BitOpPreRA, run);
>     RUN_PASS(1, LoadPropagation, run);
>     RUN_PASS(1, IndirectPropagation, run);
>     RUN_PASS(2, MemoryOpt, run);
> --
> 2.10.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev