[Nouveau] [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
Roland Scheidegger
sroland at vmware.com
Mon Feb 23 05:24:59 PST 2015
Does this give correct results for special floats (0, infs)?
We tried to improve (for single floats) x86 rcp in llvmpipe with
newton-raphson, but unfortunately not being able to give correct results
for these two cases (without even more additional code) meant it got all
disabled in the end (you can still see that code in the driver) since
the problems are at least as bad as those due to bad accuracy...
Roland
Am 23.02.2015 um 05:01 schrieb Ilia Mirkin:
> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
> ---
>
> Not sure how many steps are needed for the necessary accuracy. Just
> doing 2 because that seems like a reasonable number.
>
> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 ++++++++++++++++++++--
> 1 file changed, 39 insertions(+), 3 deletions(-)
>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> index 87e75e1..9767566 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
> bld.setPosition(i, false);
>
> // 1. Take the source and it up.
> - Value *src[2], *dst[2], *def = i->getDef(0);
> - bld.mkSplit(src, 4, i->getSrc(0));
> + Value *input = i->getSrc(0);
> + Value *src[2], *dst[2], *guess, *def = i->getDef(0);
> + bld.mkSplit(src, 4, input);
>
> // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
> dst[0] = bld.loadImm(NULL, 0);
> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>
> // 4. Recombine the two dst pieces back into the original destination.
> bld.setPosition(i, true);
> - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
> + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]);
> +
> + // 5. Perform 2 Newton-Raphson steps
> + if (i->op == OP_RCP) {
> + // RCP: x_{n+1} = 2 * x_n - input * x_n^2
> + Value *two = bld.getSSA(8);
> +
> + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f));
> +
> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
> + } else {
> + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2)
> + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
> + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f));
> + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f));
> +
> + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input);
> + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2)
> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
> + three_half));
> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
> + three_half));
> + }
> +
> + bld.mkMov(def, guess);
> }
>
> bool
>
More information about the Nouveau
mailing list