[Nouveau] [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results

Mon Feb 23 12:23:28 PST 2015

Just to follow up, this is the code the nvidia blob emits for rsq().
The argument is in c1[0] + c1[4]. I _think_ that c3 contains:

PB: 0x7fffffff   GF100_M2MF.DATA = 0x7fffffff
PB: 0x7ff00000   GF100_M2MF.DATA = 0x7ff00000
PB: 0x00000000   GF100_M2MF.DATA = 0
PB: 0x3fe00000   GF100_M2MF.DATA = 0x3fe00000
PB: 0x3ff00000   GF100_M2MF.DATA = 0x3ff00000

00000000: 10005de4 28004400     mov b32 $r1 c1[0x4]
00000008: 10109c03 68004c00     and b32 $r2 $r1 c3[0x4]

r2 = exponent bits

00000010: 00101c03 68004c00     and b32 $r0 $r1 c3[0x0]

r0 = non-sign bits (of upper word)

00000018: 1021dc03 1a8e4c00     set $p0 0x1 ne u32 $r2 c3[0x4]

p0 = exponent == 0x7ff (i.e. inf/nan)

00000020: 00001c43 68004400     or b32 $r0 $r0 c1[0x0]

r0 = all non-sign bits or'd

00000028: fc001c04 20000000     selp b32 $r0 $r0 0x0 $p0

if (exponent == 0x7ff)
  r0 = all mantissa bits
else
  r0 = 0

00000030: fc01dc23 190e0000     set $p0 0x1 eq s32 $r0 0x0

p0 = is input a nan (i.e. exponent = 0x7ff and mantissa bits set)

00000038: 00001de4 28004400     mov b32 $r0 c1[0x0]
00000040: 200081e7 40000001     $p0 bra allwarp 0x90

So all of the (not $p0) stuff happens for all non-nan's, including infinities.

r0d = input

00000048: 0000a1e2 19ff0000     (not $p0) mov b32 $r2 0x7fc00000
00000050: fc0121e4 28000000     (not $p0) mov b32 $r4 0x0
00000058: 1020a103 68004400     (not $p0) and b32 $r2 $r2 not c1[0x4]
00000060: 00216002 08004000     (not $p0) add b32 $r5 $r2 0x100000
00000068: 0450e203 5800c000     (not $p0) shr u32 $r3 $r5 wrap 0x1
00000070: 10002001 50000000     (not $p0) mul rn f64 $r0d $r0d $r4d
00000078: fc00a1e4 28000000     (not $p0) mov b32 $r2 0x0
00000080: 0030e002 087fe000     (not $p0) add b32 $r3 $r3 0x1ff80000

I tried decoding this, but it's some crazy business -- futzing with
the exponent, I think it's trying to compress more into the high
32-bits so that rsqrt64h below has more precision.

00000088: 00001de4 40000000     nop
00000090: fc011de4 28000000   B mov b32 $r4 0x0
00000098: 1c115c00 c8000000     rsqrt64h $r5 $r1
000000a0: 200081e7 40000001     $p0 bra allwarp 0xf0
000000a8: 00002001 5000cff8     (not $p0) mul rn f64 $r0d $r0d
0x3fe0000000000000

r0d = input * 0.5

000000b0: 1001a001 50000000     (not $p0) mul rn f64 $r6d $r0d $r4d

r6d = 0.5 * input * guess

000000b8: 3061a201 20088c00     (not $p0) fma rn f64 $r6d neg $r6d $r4d c3[0xc]

r6d = -0.5 * input * guess * guess + 0.5 (? actually
0.5000001190928742, i.e. 0x3fe000003ff00000... seems like a bug, but
maybe it'll only read it as a f32, not f64 like I'm assuming)

000000c0: 18412001 20080000     (not $p0) fma rn f64 $r4d $r4d $r6d $r4d

guess = guess * (0.5 - 0.5 * input * guess * guess) + guess

i.e. newton-raphson step. [why didn't they just throw in a 1.5 instead
of 0.5? who knows. maybe something to do with numeric stability. or
perhaps a bug in their const upload logic which got papered over with
the extra + guess.]

000000c8: 10002001 50000000     (not $p0) mul rn f64 $r0d $r0d $r4d
000000d0: 30002201 20088c00     (not $p0) fma rn f64 $r0d neg $r0d $r4d c3[0xc]
000000d8: 00402001 20080000     (not $p0) fma rn f64 $r0d $r4d $r0d $r4d
000000e0: 08012001 50000000     (not $p0) mul rn f64 $r4d $r0d $r2d

And this is another step.

Not sure why they're so careful to still go through the motions for
infinity and only skip for nan -- seems like they could just as well
skip it for inf as well, with less code. That's what I plan on doing.

On Mon, Feb 23, 2015 at 10:40 AM, Ilia Mirkin <imirkin at alum.mit.edu> wrote:
> Oh right. I think the NVIDIA blob executes those steps conditionally
> based on the upper bits not being 0x7ff (== infinity/nan). I should do
> the same thing here. [FWIW I was able to test the nv50 code last night
> and that one's a total fail for rcp/rsq... will need to port that over
> to my nvc0 and debug there.]
>
> On Mon, Feb 23, 2015 at 8:24 AM, Roland Scheidegger <sroland at vmware.com> wrote:
>> Does this give correct results for special floats (0, infs)?
>> We tried to improve (for single floats) x86 rcp in llvmpipe with
>> newton-raphson, but unfortunately not being able to give correct results
>> for these two cases (without even more additional code) meant it got all
>> disabled in the end (you can still see that code in the driver) since
>> the problems are at least as bad as those due to bad accuracy...
>>
>> Roland
>>
>> Am 23.02.2015 um 05:01 schrieb Ilia Mirkin:
>>> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
>>> ---
>>>
>>> Not sure how many steps are needed for the necessary accuracy. Just
>>> doing 2 because that seems like a reasonable number.
>>>
>>>  .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp      | 42 ++++++++++++++++++++--
>>>  1 file changed, 39 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>>> index 87e75e1..9767566 100644
>>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>>> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>>>     bld.setPosition(i, false);
>>>
>>>     // 1. Take the source and it up.
>>> -   Value *src[2], *dst[2], *def = i->getDef(0);
>>> -   bld.mkSplit(src, 4, i->getSrc(0));
>>> +   Value *input = i->getSrc(0);
>>> +   Value *src[2], *dst[2], *guess, *def = i->getDef(0);
>>> +   bld.mkSplit(src, 4, input);
>>>
>>>     // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
>>>     dst[0] = bld.loadImm(NULL, 0);
>>> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>>>
>>>     // 4. Recombine the two dst pieces back into the original destination.
>>>     bld.setPosition(i, true);
>>> -   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
>>> +   guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]);
>>> +
>>> +   // 5. Perform 2 Newton-Raphson steps
>>> +   if (i->op == OP_RCP) {
>>> +      // RCP: x_{n+1} = 2 * x_n - input * x_n^2
>>> +      Value *two = bld.getSSA(8);
>>> +
>>> +      bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f));
>>> +
>>> +      guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
>>> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
>>> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
>>> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
>>> +      guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
>>> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
>>> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
>>> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
>>> +   } else {
>>> +      // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2)
>>> +      Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
>>> +      bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f));
>>> +      bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f));
>>> +
>>> +      half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input);
>>> +      // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2)
>>> +      guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
>>> +                         bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
>>> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
>>> +                                    three_half));
>>> +      guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
>>> +                         bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
>>> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
>>> +                                    three_half));
>>> +   }
>>> +
>>> +   bld.mkMov(def, guess);
>>>  }
>>>
>>>  bool
>>>
>>