[Mesa-dev] [PATCH v3 3/4] nv50/ir: optimize imul/imad to xmads
Rhys Perry
pendingchaos02 at gmail.com
Mon Jul 23 10:42:26 UTC 2018
This patch is:
Reviewed-by: Karol Herbst <kherbst at redhat.com>
forgot to add that
On Mon, Jul 23, 2018 at 11:40 AM, Rhys Perry <pendingchaos02 at gmail.com> wrote:
> This hits the shader-db numbers a good bit, though a few xmads are way
> faster than an imul or imad, and the cost is mitigated by the next commit,
> which optimizes many multiplications by immediates into shorter and less
> register-heavy instructions than the xmads.
>
> total instructions in shared programs : 5787704 -> 5839715 (0.90%)
> total gprs used in shared programs : 669878 -> 670553 (0.10%)
> total shared used in shared programs : 548832 -> 548832 (0.00%)
> total local used in shared programs : 21068 -> 21164 (0.46%)
>
> local shared gpr inst bytes
> helped 0 0 39 0 0
> hurt 1 0 365 3076 3076
>
> Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
> ---
> .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 56 ++++++++++++++++++++++
> .../nouveau/codegen/nv50_ir_target_gm107.cpp | 1 -
> 2 files changed, 56 insertions(+), 1 deletion(-)
>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> index 6deea7a360..a6ddb284b8 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> @@ -2292,13 +2292,18 @@ AlgebraicOpt::visit(BasicBlock *bb)
> // =============================================================================
>
> // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
> +// MUL(a, b) -> a few XMADs
> +// MAD/FMA(a, b, c) -> a few XMADs
> class LateAlgebraicOpt : public Pass
> {
> private:
> virtual bool visit(Instruction *);
>
> void handleADD(Instruction *);
> + void handleMULMAD(Instruction *);
> bool tryADDToSHLADD(Instruction *);
> +
> + BuildUtil bld;
> };
>
> void
> @@ -2359,6 +2364,52 @@ LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
> return true;
> }
>
> +// MUL(a, b) -> a few XMADs
> +// MAD/FMA(a, b, c) -> a few XMADs
> +void
> +LateAlgebraicOpt::handleMULMAD(Instruction *i)
> +{
> + // TODO: handle NV50_IR_SUBOP_MUL_HIGH
> + if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32))
> + return;
> + if (isFloatType(i->dType) || typeSizeof(i->dType) != 4)
> + return;
> + if (i->subOp || i->usesFlags() || i->flagsDef >= 0)
> + return;
> +
> + assert(!i->src(0).mod);
> + assert(!i->src(1).mod);
> + assert(i->op == OP_MUL ? 1 : !i->src(2).mod);
> +
> + bld.setPosition(i, false);
> +
> + Value *a = i->getSrc(0);
> + Value *b = i->getSrc(1);
> + Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2);
> +
> + Value *tmp0 = bld.getSSA();
> + Value *tmp1 = bld.getSSA();
> +
> + Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c);
> + insn->setPredicate(i->cc, i->getPredicate());
> +
> + insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0));
> + insn->setPredicate(i->cc, i->getPredicate());
> + insn->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1);
> +
> + Value *pred = i->getPredicate();
> + i->setPredicate(i->cc, NULL);
> +
> + i->op = OP_XMAD;
> + i->setSrc(0, b);
> + i->setSrc(1, tmp1);
> + i->setSrc(2, tmp0);
> + i->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
> + i->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);
> +
> + i->setPredicate(i->cc, pred);
> +}
> +
> bool
> LateAlgebraicOpt::visit(Instruction *i)
> {
> @@ -2366,6 +2417,11 @@ LateAlgebraicOpt::visit(Instruction *i)
> case OP_ADD:
> handleADD(i);
> break;
> + case OP_MUL:
> + case OP_MAD:
> + case OP_FMA:
> + handleMULMAD(i);
> + break;
> default:
> break;
> }
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
> index bb1c234c43..edb823afb4 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
> @@ -166,7 +166,6 @@ TargetGM107::isBarrierRequired(const Instruction *insn) const
> }
> break;
> case OPCLASS_ARITH:
> - // TODO: IMUL/IMAD require barriers too, use of XMAD instead!
> if ((insn->op == OP_MUL || insn->op == OP_MAD) &&
> !isFloatType(insn->dType))
> return true;
> --
> 2.14.4
>
More information about the mesa-dev
mailing list