[Mesa-dev] [PATCH v2 3/5] nv50/ir: optimize imul/imad to xmads

Wed Jul 18 18:29:09 UTC 2018

some nitpicks, but with those fixed:

Reviewed-by: Karol Herbst <kherbst at redhat.com>

On Wed, Jul 18, 2018 at 7:05 PM, Rhys Perry <pendingchaos02 at gmail.com> wrote:
> This hits the shader-db numbers a good bit, though a few xmads are way
> faster than an imul or imad, and the cost is mitigated by the next commit,
> which optimizes many multiplications by immediates into shorter and less
> register-heavy instructions than the xmads.
>
> total instructions in shared programs : 5256901 -> 5294693 (0.72%)
> total gprs used in shared programs    : 624328 -> 624962 (0.10%)
> total shared used in shared programs  : 360704 -> 360704 (0.00%)
> total local used in shared programs   : 20952 -> 21048 (0.46%)
>
>                 local     shared        gpr       inst      bytes
>     helped           0           0          39           0           0
>       hurt           1           0         334        2277        2277
>
> Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
> ---
>  .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 51 ++++++++++++++++++++++
>  .../nouveau/codegen/nv50_ir_target_gm107.cpp       |  1 -
>  2 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> index 5fc1fba970..14cc4b32d4 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> @@ -2291,13 +2291,18 @@ AlgebraicOpt::visit(BasicBlock *bb)
>  // =============================================================================
>
>  // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
> +// MUL(a, b) -> a few XMADs
> +// MAD/FMA(a, b, c) -> a few XMADs
>  class LateAlgebraicOpt : public Pass
>  {
>  private:
>     virtual bool visit(Instruction *);
>
>     void handleADD(Instruction *);
> +   void handleMULMAD(Instruction *);
>     bool tryADDToSHLADD(Instruction *);
> +
> +   BuildUtil bld;
>  };
>
>  void
> @@ -2357,6 +2362,47 @@ LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
>
>     return true;
>  }
> +
> +// MUL(a, b) -> a few XMADs
> +// MAD/FMA(a, b, c) -> a few XMADs
> +void
> +LateAlgebraicOpt::handleMULMAD(Instruction *i)
> +{
> +   // TODO: handle NV50_IR_SUBOP_MUL_HIGH
> +   if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32))
> +      return;
> +   if (isFloatType(i->dType) || typeSizeof(i->dType) != 4)
> +      return;
> +   if (i->subOp || i->usesFlags() || i->flagsDef >= 0)
> +      return;
> +
> +   assert(!i->src(0).mod);
> +   assert(!i->src(1).mod);
> +   assert(i->op == OP_MUL ? 1 : !i->src(2).mod);
> +
> +   bld.setPosition(i, true);
> +
> +   Value *a = i->getSrc(0);
> +   Value *b = i->getSrc(1);
> +   Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2);
> +
> +   Value *tmp0 = bld.getSSA();
> +   Value *tmp1 = bld.getSSA();
> +
> +   Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c);
> +   insn->setPredicate(i->cc, i->getPredicate());
> +
> +   insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0));
> +   insn->setPredicate(i->cc, i->getPredicate());
> +   insn->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1);
> +
> +   insn = bld.mkOp3(OP_XMAD, TYPE_U32, i->getDef(0), b, tmp1, tmp0);
> +   insn->setPredicate(i->cc, i->getPredicate());
> +   insn->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
> +   insn->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);
> +
> +   delete_Instruction(prog, i);

I think you can simply modify the current instruction in place instead; then
we don't need to set the predicate or create and delete an instruction:

i->op = OP_XMAD;
i->setSrc(0, b);
i->setSrc(1, tmp1);
i->setSrc(2, tmp0);
i->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
i->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);

> +}
>
>  bool
>  LateAlgebraicOpt::visit(Instruction *i)
> @@ -2365,6 +2411,11 @@ LateAlgebraicOpt::visit(Instruction *i)
>     case OP_ADD:
>        handleADD(i);
>        break;
> +   case OP_MUL:
> +   case OP_MAD:
> +   case OP_FMA:
> +      handleMULMAD(i);
> +      break;
>     default:
>        break;
>     }
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
> index f918fbfdd3..571d8a67c2 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
> @@ -165,7 +165,6 @@ TargetGM107::isBarrierRequired(const Instruction *insn) const
>        }
>        break;
>     case OPCLASS_ARITH:
> -      // TODO: IMUL/IMAD require barriers too, use of XMAD instead!
>        if ((insn->op == OP_MUL || insn->op == OP_MAD) &&
>            !isFloatType(insn->dType))
>           return true;
> --
> 2.14.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev