[Mesa-dev] [RFC 1/3] gk110/ir: Add rcp f64 implementation

Sun Mar 5 16:33:26 UTC 2017

On Sun, Mar 5, 2017 at 10:34 AM, Boyan Ding <boyan.j.ding at gmail.com> wrote:
> Signed-off-by: Boyan Ding <boyan.j.ding at gmail.com>
> ---
>  src/gallium/drivers/nouveau/codegen/lib/gk110.asm  | 156 ++++++++++++++++++++-
>  .../drivers/nouveau/codegen/lib/gk110.asm.h        |  96 ++++++++++++-
>  2 files changed, 248 insertions(+), 4 deletions(-)
>
> diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
> index b9c05a04b9..871571e1c3 100644
> --- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
> +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
> @@ -83,11 +83,165 @@ gk110_div_s32:
>     $p0 sub b32 $r1 $r1 $r2
>     $p0 add b32 $r0 $r0 0x1
>     $p3 cvt s32 $r0 neg s32 $r0
> -   sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
> +   sched 0x04 0x2e 0x28 0x04 0x28 0x28 0x28
>     $p2 cvt s32 $r1 neg s32 $r1
>     ret
>
> +// RCP F64
> +//
> +// INPUT:   $r0d
> +// OUTPUT:  $r0d
> +// CLOBBER: $r2 - $r9, $p0
> +//
> +// The core of RCP and RSQ implementation is Newton-Raphson step, which is
> +// used to find successively better approximation from an imprecise initial
> +// value (single precision rcp in RCP and rsqrt64h in RSQ).
> +//
> +// The formula of Newton-Raphson step used in RCP(x) is:
> +//     RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
> +// The following code uses 2 FMAs for each step, and it will basically
> +// look like:
> +//     tmp = -src * RCP_{n} + 1
> +//     RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
> +//
>  gk110_rcp_f64:
> +   // Step1: classify input according to exponent and value, and calculate

Step 1 (note the space).

> +   // result for 0/inf/nan, $r2 holds the exponent value. The 0xb14 in ext
> +   // instruction means to extract 0xb bits starting from bit 0x14

You should assume that the reader is familiar with the nvidia ISA.
(Otherwise there will be a LOT of explaining to do.) However it would
be more valuable to note that this extracts the exponent bits, which
may not be apparent to someone who hasn't been staring at fp64 values
for the past 72 hours :)

> +   ext u32 $r2 $r1 0xb14
> +   add b32 $r3 $r2 0xffffffff
> +   joinat #rcp_L3
> +   // (exponent-1) > 0x7fd (unsigned) means exponent is either 0x7ff of 0.
> +   // There are three cases: nan, inf, and denorm (including 0)
> +   set b32 $p0 0x1 gt u32 $r3 0x7fd

This is extremely clever. I'm a big fan. But I didn't get it the first
time around. Perhaps I'm dumb, but I'd rather the comment be reworded
for fools like me. Something like:

"We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, Inf,
denorm, or 0). Do this by subtracting 1 from the exponent, which will
mean that it's > 0x7fd in those cases."

> +   // $r3: 0 for norms, 0x36 for denorms, -1 for others
> +   mov b32 $r3 0x0
> +   sched 0x2b 0x04 0x2d 0x2b 0x04 0x2b 0x28
> +   (not $p0) bra #rcp_L2
> +   // Nan/Inf/denorm goes here

And zero too, right? How about "Process all the special values: ... "

> +   mov b32 $r3 0xffffffff
> +   // A number is NaN if its abs value is greater than inf

Actually note that this is gtu, not gt. That means greater *or* unordered with.

> +   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
> +   (not $p0) bra #rcp_L4
> +   // NaN -> NaN

Perhaps say a few more words here? Not entirely apparent to me how
setting that bit makes it more of a NaN. (Wasn't it already a NaN in
the first place if we got here?)

> +   or b32 $r1 $r1 0x80000
> +   bra #rcp_L2
> +rcp_L4:

Better labels would be immensely useful. This one might be called
rcp_inf_or_denorm_or_zero.

> +   and b32 $r4 $r1 0x7ff00000
> +   sched 0x28 0x2b 0x04 0x28 0x2b 0x2d 0x2b
> +   // Other values with nonzero in exponent field should be inf
> +   set b32 $p0 0x1 eq s32 $r4 0x0
> +   $p0 bra #rcp_L5
> +   // +/-Inf -> +/-0
> +   xor b32 $r1 $r1 0x7ff00000
> +   mov b32 $r0 0x0
> +   bra #rcp_L2
> +rcp_L5:

rcp_denorm_or_zero

> +   set $p0 0x1 gtu f64 abs $r0d 0x0
> +   $p0 bra #rcp_L6
> +   // +/-0 -> +/-Inf
> +   sched 0x28 0x2b 0x20 0x28 0x2f 0x28 0x2b
> +   or b32 $r1 $r1 0x7ff00000
> +   bra #rcp_L2
> +rcp_L6:

rcp_denorm

> +   // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
> +   mul rn f64 $r0d $r0d 0x4350000000000000
> +   mov b32 $r3 0x36
> +rcp_L2:
> +   join nop

Why not fold this into the the sources? i.e.

join mov b32 $r3 0x36 right above, etc. (I can see how a structurizing
compiler might have generated such code, but we can do a bit
better...)

> +rcp_L3:
> +   // All numbers with -1 in $r3 have their result ready in $r0d, return them
> +   // others need further calculation
> +   set b32 $p0 0x1 lt s32 $r3 0x0
> +   $p0 bra #rcp_end
> +   sched 0x28 0x28 0x04 0x28 0x2b 0x04 0x28
> +   // Step 2: Before the real calculation goes on, renormalize the values to
> +   // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
> +   // result in $r6d. The exponent will be recovered later.
> +   ext u32 $r2 $r1 0xb14
> +   and b32 $r7 $r1 0x800fffff
> +   add b32 $r7 $r7 0x3ff00000
> +   mov b32 $r6 $r0
> +   // Step 3: Convert new value to float (no overflow will occur due to step
> +   // 2), calculate rcp and do newton-raphson step once
> +   cvt rz f32 $r5 f64 $r6d
> +   rcp f32 $r4 $r5
> +   mov b32 $r0 0xbf800000
> +   sched 0x28 0x28 0x2a 0x2b 0x2e 0x28 0x2e
> +   fma rn f32 $r5 $r4 $r5 $r0
> +   add ftz rn f32 $r5 neg $r5 neg 0x0

Is this different than "cvt ftz f32 $r5 neg $r5"? Also, why is this
not just folded into the fma? Is it just to flush this one value
(since denorm flushing == ftz, happens on input rather than on
output)?

> +   fma rn f32 $r0 $r4 $r5 $r4
> +   // Step 4: convert result $r0 back to double, do newton-raphson steps
> +   cvt f64 $r0d f32 $r0
> +   cvt f64 $r6d f64 neg $r6d
> +   mov b32 $r9 0x3ff00000
> +   mov b32 $r8 0x0

This is 1.0 right? I prefer

cvt f64 $r8d f32 0x3f800000

> +   sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
> +   // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d

This would be a good place to remind people of the equation. In fact,
I'd just move the equation down here.

> +   fma rn f64 $r4d $r6d $r0d $r8d
> +   fma rn f64 $r0d $r0d $r4d $r0d
> +   fma rn f64 $r4d $r6d $r0d $r8d
> +   fma rn f64 $r0d $r0d $r4d $r0d
> +   fma rn f64 $r4d $r6d $r0d $r8d
> +   fma rn f64 $r0d $r0d $r4d $r0d
> +   fma rn f64 $r4d $r6d $r0d $r8d
> +   sched 0x20 0x28 0x28 0x28 0x28 0x28 0x28
> +   fma rn f64 $r0d $r0d $r4d $r0d
> +   // Step 5: Exponent recovery and final processing
> +   // The exponent is recovered by adding what we added to the exponent.
> +   // Suppose we want to calculate rcp(x), but we have rcp(cx), then
> +   //     rcp(x) = c * rcp(cx)
> +   // The delta in exponent comes from two sources:
> +   //   1) The renormalization in step 2. The delta is:
> +   //      0x3ff - $r2
> +   //   2) (For the denorm input) The 2^54 we multiplied at rcp_L6, stored
> +   //      in $r3
> +   // These 2 sources are calculated in the first two lines below, and then
> +   // added to the exponent extracted from the result above.
> +   // Note that after processing, the new exponent may >= 0x7ff (inf)
> +   // or <= 0 (denorm). Those cases will be handled respectively below
> +   subr b32 $r2 $r2 0x3ff
> +   add b32 $r4 $r2 $r3
> +   ext u32 $r3 $r1 0xb14
> +   // New exponent in $r3
> +   add b32 $r3 $r3 $r4
> +   add b32 $r2 $r3 0xffffffff
> +   // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
> +   // (same logic as in step 1)
> +   set b32 $p0 0x1 lt u32 $r2 0x7fe
> +   sched 0x2b 0x28 0x2b 0x28 0x28 0x2b 0x20
> +   (not $p0) bra #rcp_L7
> +   // Norms: convert exponents back and return
> +   shl b32 $r4 $r4 clamp 0x14
> +   add b32 $r1 $r4 $r1
> +   bra #rcp_end
> +rcp_L7:

rcp_result_infinite_or_denorm

> +   // New exponent >= 0x7ff means that result is inf
> +   set b32 $p0 0x1 ge s32 $r3 0x7ff
> +   (not $p0) bra #rcp_L8
> +   // Infinity
> +   and b32 $r1 $r1 0x80000000
> +   sched 0x25 0x28 0x2b 0x23 0x25 0x28 0x23
> +   mov b32 $r0 0x0
> +   add b32 $r1 $r1 0x7ff00000
> +   bra #rcp_end
> +rcp_L8:
> +   // denorms, they only fall within a small range, can't be smaller than
> +   // 0x0004000000000000, which means if we set the exponent field to 1,

Huh?

>>> np.uint64(1).view('float64')
4.9406564584124654e-324

Did you mean that they have to be smaller than some value? Or is there
something else clever going on here?

> +   // we can get the final result by mutiplying it with 1/2 or 1/4. Decide
> +   // which one of the two is needed with exponent value, if not 0, 1/4 is
> +   // used, 1/2 otherwise
> +   set b32 $p0 0x1 ne u32 $r3 0x0
> +   and b32 $r1 $r1 0x800fffff
> +   $p0 mov b32 $r7 0x3fd00000
> +   (not $p0) mov b32 $r7 0x3fe00000
> +   sched 0x25 0x28 0x2c 0x2e 0x2e 0x00 0x00
> +   add b32 $r1 $r1 0x00100000
> +   mov b32 $r6 0x0
> +   mul rn f64 $r0d $r0d $r6d
> +rcp_end:
> +   ret
> +
>  gk110_rsq_f64:
>     ret
>
> diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
> index 8d00e2a224..ce937a71f9 100644
> --- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
> +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
> @@ -65,11 +65,101 @@ uint64_t gk110_builtin_code[] = {
>         0xe088000001000406,
>         0x4000000000800001,
>         0xe6010000000ce802,
> -       0x08b08010a010b810,
> +       0x08a0a0a010a0b810,
>         0xe60100000088e806,
>         0x19000000001c003c,
>  /* 0x0218: gk110_rcp_f64 */
> -/* 0x0218: gk110_rsq_f64 */
> +       0xc00000058a1c0409,
> +       0x407fffffff9c080d,
> +       0x1480000060000000,
> +       0xb3401c03fe9c0c1d,
> +       0xe4c03c007f9c000e,
> +       0x08a0ac10acb410ac,
> +       0x120000004c20003c,
> +       0x747fffffff9fc00e,
> +       0xb4601fff801c021d,
> +       0x120000000820003c,
> +       0x21000400001c0404,
> +       0x12000000381c003c,
> +/* 0x0278: rcp_L4 */
> +       0x203ff800001c0410,
> +       0x08acb4aca010aca0,
> +       0xb3281c00001c101d,
> +       0x120000000c00003c,
> +       0x223ff800001c0404,
> +       0xe4c03c007f9c0002,
> +       0x120000001c1c003c,
> +/* 0x02b0: rcp_L5 */
> +       0xb4601c00001c021d,
> +       0x120000000c00003c,
> +       0x08aca0bca080aca0,
> +       0x213ff800001c0404,
> +       0x12000000081c003c,
> +/* 0x02d8: rcp_L6 */
> +       0xc400021a801c0001,
> +       0x740000001b1fc00e,
> +/* 0x02e8: rcp_L2 */
> +       0x85800000005c3c02,
> +/* 0x02f0: rcp_L3 */
> +       0xb3181c00001c0c1d,
> +       0x12000000d000003c,
> +       0x08a010aca010a0a0,
> +       0xc00000058a1c0409,
> +       0x204007ffff9c041c,
> +       0x401ff800001c1c1d,
> +       0xe4c03c00001c001a,
> +       0xe5400c00031c3816,
> +       0x84000000021c1412,
> +       0x745fc000001fc002,
> +       0x08b8a0b8aca8a0a0,
> +       0xcc000000029c1016,
> +       0xcac88000001c1415,
> +       0xcc001000029c1002,
> +       0xe5400000001c2c02,
> +       0xe5410000031c3c1a,
> +       0x741ff800001fc026,
> +       0xe4c03c007f9c0022,
> +       0x08a4a4a4a4a4a4a4,
> +       0xdb802000001c1812,
> +       0xdb800000021c0002,
> +       0xdb802000001c1812,
> +       0xdb800000021c0002,
> +       0xdb802000001c1812,
> +       0xdb800000021c0002,
> +       0xdb802000001c1812,
> +       0x08a0a0a0a0a0a080,
> +       0xdb800000021c0002,
> +       0x48000001ff9c0809,
> +       0xe0800000019c0812,
> +       0xc00000058a1c040d,
> +       0xe0800000021c0c0e,
> +       0x407fffffff9c0c09,
> +       0xb3101c03ff1c081d,
> +       0x0880aca0a0aca0ac,
> +       0x120000000c20003c,
> +       0xc24000000a1c1011,
> +       0xe0800000009c1006,
> +       0x120000003c1c003c,
> +/* 0x0428: rcp_L7 */
> +       0xb3681c03ff9c0c1d,
> +       0x120000001420003c,
> +       0x20400000001c0404,
> +       0x088ca0948caca094,
> +       0xe4c03c007f9c0002,
> +       0x403ff800001c0405,
> +       0x12000000201c003c,
> +/* 0x0460: rcp_L8 */
> +       0xb3501c00001c0c1d,
> +       0x204007ffff9c0404,
> +       0x741fe8000003c01e,
> +       0x741ff0000023c01e,
> +       0x080000b8b8b0a094,
> +       0x40000800001c0405,
> +       0xe4c03c007f9c001a,
> +       0xe4000000031c0002,
> +/* 0x04a0: rcp_end */
> +       0x19000000001c003c,
> +/* 0x04a8: gk110_rsq_f64 */
>         0x19000000001c003c,
>  };
>
> @@ -77,5 +167,5 @@ uint64_t gk110_builtin_offsets[] = {
>         0x0000000000000000,
>         0x00000000000000f0,
>         0x0000000000000218,
> -       0x0000000000000218,
> +       0x00000000000004a8,
>  };
> --
> 2.12.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev