[Mesa-dev] [PATCH v3 3/4] nv50/ir: optimize imul/imad to xmads

Mon Jul 23 10:42:26 UTC 2018

This patch is:

Reviewed-by: Karol Herbst <kherbst at redhat.com>

I forgot to add that earlier.

On Mon, Jul 23, 2018 at 11:40 AM, Rhys Perry <pendingchaos02 at gmail.com> wrote:
> This hits the shader-db numbers a good bit, though a few xmads are way
> faster than an imul or imad, and the cost is mitigated by the next commit,
> which optimizes many multiplications by immediates into shorter and less
> register-heavy instructions than the xmads.
>
> total instructions in shared programs : 5787704 -> 5839715 (0.90%)
> total gprs used in shared programs    : 669878 -> 670553 (0.10%)
> total shared used in shared programs  : 548832 -> 548832 (0.00%)
> total local used in shared programs   : 21068 -> 21164 (0.46%)
>
>                 local     shared        gpr       inst      bytes
>     helped           0           0          39           0           0
>       hurt           1           0         365        3076        3076
>
> Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
> ---
>  .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 56 ++++++++++++++++++++++
>  .../nouveau/codegen/nv50_ir_target_gm107.cpp       |  1 -
>  2 files changed, 56 insertions(+), 1 deletion(-)
>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> index 6deea7a360..a6ddb284b8 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> @@ -2292,13 +2292,18 @@ AlgebraicOpt::visit(BasicBlock *bb)
>  // =============================================================================
>
>  // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
> +// MUL(a, b) -> a few XMADs
> +// MAD/FMA(a, b, c) -> a few XMADs
>  class LateAlgebraicOpt : public Pass
>  {
>  private:
>     virtual bool visit(Instruction *);
>
>     void handleADD(Instruction *);
> +   void handleMULMAD(Instruction *);
>     bool tryADDToSHLADD(Instruction *);
> +
> +   BuildUtil bld;
>  };
>
>  void
> @@ -2359,6 +2364,52 @@ LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
>     return true;
>  }
>
> +// MUL(a, b) -> a few XMADs
> +// MAD/FMA(a, b, c) -> a few XMADs
> +void
> +LateAlgebraicOpt::handleMULMAD(Instruction *i)
> +{
> +   // TODO: handle NV50_IR_SUBOP_MUL_HIGH
> +   if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32))
> +      return;
> +   if (isFloatType(i->dType) || typeSizeof(i->dType) != 4)
> +      return;
> +   if (i->subOp || i->usesFlags() || i->flagsDef >= 0)
> +      return;
> +
> +   assert(!i->src(0).mod);
> +   assert(!i->src(1).mod);
> +   assert(i->op == OP_MUL ? 1 : !i->src(2).mod);
> +
> +   bld.setPosition(i, false);
> +
> +   Value *a = i->getSrc(0);
> +   Value *b = i->getSrc(1);
> +   Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2);
> +
> +   Value *tmp0 = bld.getSSA();
> +   Value *tmp1 = bld.getSSA();
> +
> +   Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c);
> +   insn->setPredicate(i->cc, i->getPredicate());
> +
> +   insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0));
> +   insn->setPredicate(i->cc, i->getPredicate());
> +   insn->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1);
> +
> +   Value *pred = i->getPredicate();
> +   i->setPredicate(i->cc, NULL);
> +
> +   i->op = OP_XMAD;
> +   i->setSrc(0, b);
> +   i->setSrc(1, tmp1);
> +   i->setSrc(2, tmp0);
> +   i->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
> +   i->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);
> +
> +   i->setPredicate(i->cc, pred);
> +}
> +
>  bool
>  LateAlgebraicOpt::visit(Instruction *i)
>  {
> @@ -2366,6 +2417,11 @@ LateAlgebraicOpt::visit(Instruction *i)
>     case OP_ADD:
>        handleADD(i);
>        break;
> +   case OP_MUL:
> +   case OP_MAD:
> +   case OP_FMA:
> +      handleMULMAD(i);
> +      break;
>     default:
>        break;
>     }
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
> index bb1c234c43..edb823afb4 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
> @@ -166,7 +166,6 @@ TargetGM107::isBarrierRequired(const Instruction *insn) const
>        }
>        break;
>     case OPCLASS_ARITH:
> -      // TODO: IMUL/IMAD require barriers too, use of XMAD instead!
>        if ((insn->op == OP_MUL || insn->op == OP_MAD) &&
>            !isFloatType(insn->dType))
>           return true;
> --
> 2.14.4
>