[Nouveau] [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results

Mon Feb 23 07:40:05 PST 2015

Oh right. I think the NVIDIA blob executes those steps conditionally,
based on the upper (exponent) bits not being 0x7ff (== infinity/NaN). I
should do the same thing here. [FWIW I was able to test the nv50 code last
night, and that one fails completely for rcp/rsq... I will need to port
that over to my nvc0 and debug there.]

On Mon, Feb 23, 2015 at 8:24 AM, Roland Scheidegger <sroland at vmware.com> wrote:
> Does this give correct results for special floats (0, infs)?
> We tried to improve x86 rcp (for single floats) in llvmpipe with
> Newton-Raphson, but it could not give correct results for these two
> cases without even more additional code, so it all got disabled in the
> end (you can still see that code in the driver), since those problems
> are at least as bad as the ones caused by poor accuracy...
>
> Roland
>
> Am 23.02.2015 um 05:01 schrieb Ilia Mirkin:
>> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
>> ---
>>
>> Not sure how many steps are needed for the necessary accuracy. Just
>> doing 2 because that seems like a reasonable number.
>>
>>  .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp      | 42 ++++++++++++++++++++--
>>  1 file changed, 39 insertions(+), 3 deletions(-)
>>
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> index 87e75e1..9767566 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>>     bld.setPosition(i, false);
>>
>>     // 1. Take the source and it up.
>> -   Value *src[2], *dst[2], *def = i->getDef(0);
>> -   bld.mkSplit(src, 4, i->getSrc(0));
>> +   Value *input = i->getSrc(0);
>> +   Value *src[2], *dst[2], *guess, *def = i->getDef(0);
>> +   bld.mkSplit(src, 4, input);
>>
>>     // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
>>     dst[0] = bld.loadImm(NULL, 0);
>> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>>
>>     // 4. Recombine the two dst pieces back into the original destination.
>>     bld.setPosition(i, true);
>> -   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
>> +   guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]);
>> +
>> +   // 5. Perform 2 Newton-Raphson steps
>> +   if (i->op == OP_RCP) {
>> +      // RCP: x_{n+1} = 2 * x_n - input * x_n^2
>> +      Value *two = bld.getSSA(8);
>> +
>> +      bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f));
>> +
>> +      guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
>> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
>> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
>> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
>> +      guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
>> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
>> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
>> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
>> +   } else {
>> +      // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2)
>> +      Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
>> +      bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f));
>> +      bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f));
>> +
>> +      half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input);
>> +      // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2)
>> +      guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
>> +                         bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
>> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
>> +                                    three_half));
>> +      guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
>> +                         bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
>> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
>> +                                    three_half));
>> +   }
>> +
>> +   bld.mkMov(def, guess);
>>  }
>>
>>  bool
>>
>