[Nouveau] [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
Ilia Mirkin
imirkin at alum.mit.edu
Mon Feb 23 12:23:28 PST 2015
Just to follow up, this is the code the nvidia blob emits for rsq().
The argument is in c1[0] + c1[4]. I _think_ that c3 contains:
PB: 0x7fffffff GF100_M2MF.DATA = 0x7fffffff
PB: 0x7ff00000 GF100_M2MF.DATA = 0x7ff00000
PB: 0x00000000 GF100_M2MF.DATA = 0
PB: 0x3fe00000 GF100_M2MF.DATA = 0x3fe00000
PB: 0x3ff00000 GF100_M2MF.DATA = 0x3ff00000
00000000: 10005de4 28004400 mov b32 $r1 c1[0x4]
00000008: 10109c03 68004c00 and b32 $r2 $r1 c3[0x4]
r2 = exponent bits
00000010: 00101c03 68004c00 and b32 $r0 $r1 c3[0x0]
r0 = non-sign bits (of upper word)
00000018: 1021dc03 1a8e4c00 set $p0 0x1 ne u32 $r2 c3[0x4]
p0 = exponent == 0x7ff (i.e. inf/nan)
00000020: 00001c43 68004400 or b32 $r0 $r0 c1[0x0]
r0 = all non-sign bits or'd
00000028: fc001c04 20000000 selp b32 $r0 $r0 0x0 $p0
if (exponent == 0x7ff)
r0 = all mantissa bits
else
r0 = 0
00000030: fc01dc23 190e0000 set $p0 0x1 eq s32 $r0 0x0
p0 = is input a nan (i.e. exponent = 0x7ff and mantissa bits set)
00000038: 00001de4 28004400 mov b32 $r0 c1[0x0]
00000040: 200081e7 40000001 $p0 bra allwarp 0x90
So all of the (not $p0) stuff happens for all non-nan's, including infinities.
r0d = input
00000048: 0000a1e2 19ff0000 (not $p0) mov b32 $r2 0x7fc00000
00000050: fc0121e4 28000000 (not $p0) mov b32 $r4 0x0
00000058: 1020a103 68004400 (not $p0) and b32 $r2 $r2 not c1[0x4]
00000060: 00216002 08004000 (not $p0) add b32 $r5 $r2 0x100000
00000068: 0450e203 5800c000 (not $p0) shr u32 $r3 $r5 wrap 0x1
00000070: 10002001 50000000 (not $p0) mul rn f64 $r0d $r0d $r4d
00000078: fc00a1e4 28000000 (not $p0) mov b32 $r2 0x0
00000080: 0030e002 087fe000 (not $p0) add b32 $r3 $r3 0x1ff80000
I tried decoding this, but it's some crazy business -- futzing with
the exponent, I think it's trying to compress more into the high
32-bits so that rsqrt64h below has more precision.
00000088: 00001de4 40000000 nop
00000090: fc011de4 28000000 B mov b32 $r4 0x0
00000098: 1c115c00 c8000000 rsqrt64h $r5 $r1
000000a0: 200081e7 40000001 $p0 bra allwarp 0xf0
000000a8: 00002001 5000cff8 (not $p0) mul rn f64 $r0d $r0d
0x3fe0000000000000
r0d = input * 0.5
000000b0: 1001a001 50000000 (not $p0) mul rn f64 $r6d $r0d $r4d
r6d = 0.5 * input * guess
000000b8: 3061a201 20088c00 (not $p0) fma rn f64 $r6d neg $r6d $r4d c3[0xc]
r6d = -0.5 * input * guess * guess + 0.5 (? actually
0.5000001190928742, i.e. 0x3fe000003ff00000... seems like a bug, but
maybe it'll only read it as a f32, not f64 like I'm assuming)
000000c0: 18412001 20080000 (not $p0) fma rn f64 $r4d $r4d $r6d $r4d
guess = guess * (0.5 - 0.5 * input * guess * guess) + guess
i.e. newton-raphson step. [why didn't they just throw in a 1.5 instead
of 0.5? who knows. maybe something to do with numeric stability. or
perhaps a bug in their const upload logic which got papered over with
the extra + guess.]
000000c8: 10002001 50000000 (not $p0) mul rn f64 $r0d $r0d $r4d
000000d0: 30002201 20088c00 (not $p0) fma rn f64 $r0d neg $r0d $r4d c3[0xc]
000000d8: 00402001 20080000 (not $p0) fma rn f64 $r0d $r4d $r0d $r4d
000000e0: 08012001 50000000 (not $p0) mul rn f64 $r4d $r0d $r2d
And this is another step.
Not sure why they're so careful to still go through the motions for
infinity and only skip for nan -- seems like they could just as well
skip it for inf as well, with less code. That's what I plan on doing.
On Mon, Feb 23, 2015 at 10:40 AM, Ilia Mirkin <imirkin at alum.mit.edu> wrote:
> Oh right. I think the NVIDIA blob executes those steps conditionally
> based on the upper bits not being 0x7ff (== infinity/nan). I should do
> the same thing here. [FWIW I was able to test the nv50 code last night
> and that one's a total fail for rcp/rsq... will need to port that over
> to my nvc0 and debug there.]
>
> On Mon, Feb 23, 2015 at 8:24 AM, Roland Scheidegger <sroland at vmware.com> wrote:
>> Does this give correct results for special floats (0, infs)?
>> We tried to improve (for single floats) x86 rcp in llvmpipe with
>> newton-raphson, but unfortunately not being able to give correct results
>> for these two cases (without even more additional code) meant it got all
>> disabled in the end (you can still see that code in the driver) since
>> the problems are at least as bad as those due to bad accuracy...
>>
>> Roland
>>
>> Am 23.02.2015 um 05:01 schrieb Ilia Mirkin:
>>> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
>>> ---
>>>
>>> Not sure how many steps are needed for the necessary accuracy. Just
>>> doing 2 because that seems like a reasonable number.
>>>
>>> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 ++++++++++++++++++++--
>>> 1 file changed, 39 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>>> index 87e75e1..9767566 100644
>>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>>> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>>> bld.setPosition(i, false);
>>>
>>> // 1. Take the source and it up.
>>> - Value *src[2], *dst[2], *def = i->getDef(0);
>>> - bld.mkSplit(src, 4, i->getSrc(0));
>>> + Value *input = i->getSrc(0);
>>> + Value *src[2], *dst[2], *guess, *def = i->getDef(0);
>>> + bld.mkSplit(src, 4, input);
>>>
>>> // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
>>> dst[0] = bld.loadImm(NULL, 0);
>>> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>>>
>>> // 4. Recombine the two dst pieces back into the original destination.
>>> bld.setPosition(i, true);
>>> - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
>>> + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]);
>>> +
>>> + // 5. Perform 2 Newton-Raphson steps
>>> + if (i->op == OP_RCP) {
>>> + // RCP: x_{n+1} = 2 * x_n - input * x_n^2
>>> + Value *two = bld.getSSA(8);
>>> +
>>> + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f));
>>> +
>>> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
>>> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
>>> + } else {
>>> + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2)
>>> + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
>>> + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f));
>>> + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f));
>>> +
>>> + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input);
>>> + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2)
>>> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
>>> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
>>> + three_half));
>>> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
>>> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
>>> + three_half));
>>> + }
>>> +
>>> + bld.mkMov(def, guess);
>>> }
>>>
>>> bool
>>>
>>
More information about the Nouveau
mailing list