[Mesa-dev] [RFC 2/3] gk110/ir: Add rsq f64 implementation
Boyan Ding
boyan.j.ding at gmail.com
Sun Mar 5 15:34:57 UTC 2017
Signed-off-by: Boyan Ding <boyan.j.ding at gmail.com>
---
src/gallium/drivers/nouveau/codegen/lib/gk110.asm | 65 +++++++++++++++++++++-
.../drivers/nouveau/codegen/lib/gk110.asm.h | 40 ++++++++++++-
2 files changed, 103 insertions(+), 2 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
index 871571e1c3..fc99de31cc 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
@@ -235,14 +235,77 @@ rcp_L8:
and b32 $r1 $r1 0x800fffff
$p0 mov b32 $r7 0x3fd00000
(not $p0) mov b32 $r7 0x3fe00000
- sched 0x25 0x28 0x2c 0x2e 0x2e 0x00 0x00
+ sched 0x25 0x28 0x2c 0x2e 0x2a 0x20 0x27
add b32 $r1 $r1 0x00100000
mov b32 $r6 0x0
mul rn f64 $r0d $r0d $r6d
rcp_end:
ret
// RSQ F64: reciprocal square root of a 64-bit float, refined by Newton-Raphson
//
// INPUT: $r0d
// OUTPUT: $r0d
// CLOBBER: $r2 - $r9, $p0 - $p1
//
// The formula of Newton-Raphson step used in RSQ(x) is:
// RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
// In the code below, each step is written as:
// tmp1 = 0.5 * x * RSQ_{n}
// tmp2 = -RSQ_{n} * tmp1 + 0.5
// RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
//
gk110_rsq_f64:
+ // Before getting initial result rsqrt64h, two special cases should be
+ // handled first.
+ // 1. NaN: set the highest mantissa bit so that rsqrt64h is sure to
+ // recognize the value as NaN
+ set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 // $p0: x is NaN (constant is +inf)
+ $p0 or b32 $r1 $r1 0x00080000 // 0x00080000 = top mantissa bit in the high word
+ and b32 $r2 $r1 0x7fffffff // $r2 = high word of x with the sign bit cleared
+ sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
+ // 2. denorms: multiply them with 2^54 to make sure they become norms
+ // (rsq then shrinks by 2^-27, so the result is scaled by 2^27 at the end)
+ ext u32 $r3 $r1 0xb14 // $r3 = exponent field; presumably 0xb bits at offset 0x14 - confirm ext encoding
+ set b32 $p1 0x1 eq u32 $r3 0x0 // $p1: exponent field is 0 (denorm or zero)
+ or b32 $r2 $r0 $r2 // $r2 != 0 iff x is not +/-0
+ $p1 mul rn f64 $r0d $r0d 0x4350000000000000 // 0x4350000000000000 = 2^54
+ rsqrt64h f32 $r5 $r1 // $r5 = high word of the initial estimate RSQ_0
+ // rsqrt64h will give correct result for 0/inf/nan, the following logic
+ // checks whether the input is one of those (exponent is 0x7ff or all 0
+ // except for the sign bit)
+ set b32 $r6 ne u32 $r3 0x7ff // $r6 = mask, non-zero when exponent != 0x7ff (presumably all ones - confirm)
+ and b32 $r2 $r2 $r6 // $r2 != 0 now means x is none of 0/inf/nan
+ sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
+ set b32 $p0 0x1 ne u32 $r2 0x0 // $p0: input needs Newton-Raphson refinement
+ $p0 bra #rsq_norm
+ // For 0/inf/nan, make sure the sign bit agrees with input and return
+ and b32 $r1 $r1 0x80000000 // keep only the sign bit of x
+ mov b32 $r0 0x0 // low word of the result is zero
+ or b32 $r1 $r1 $r5 // merge the sign with rsqrt64h's high word
+ ret
+rsq_norm:
+ // For others, do 3 Newton-Raphson steps with the formula above
+ mov b32 $r4 0x0 // $r4d = RSQ_0: high word from rsqrt64h ($r5), low word zero
+ sched 0x20 0x28 0x29 0x29 0x29 0x29 0x29
+ mov b32 $r9 0x3fe00000 // with $r8 = 0 below, $r8d = 0.5
+ mov b32 $r8 0x0
+ mul rn f64 $r2d $r0d $r8d // $r2d = 0.5 * x, reused by every step
+ mul rn f64 $r0d $r2d $r4d // step 1: tmp1 = 0.5 * x * RSQ_0
+ fma rn f64 $r6d neg $r4d $r0d $r8d // step 1: tmp2 = -RSQ_0 * tmp1 + 0.5
+ fma rn f64 $r4d $r4d $r6d $r4d // step 1: RSQ_1 = RSQ_0 * tmp2 + RSQ_0
+ mul rn f64 $r0d $r2d $r4d // step 2: tmp1
+ sched 0x29 0x29 0x29 0x29 0x29 0x29 0x20
+ fma rn f64 $r6d neg $r4d $r0d $r8d // step 2: tmp2
+ fma rn f64 $r4d $r4d $r6d $r4d // step 2: RSQ_2
+ mul rn f64 $r0d $r2d $r4d // step 3: tmp1
+ fma rn f64 $r6d neg $r4d $r0d $r8d // step 3: tmp2
+ fma rn f64 $r4d $r4d $r6d $r4d // step 3: RSQ_3, the final estimate
+ // Multiply 2^27 to result for denorm input to recover
+ $p1 mul rn f64 $r4d $r4d 0x41a0000000000000 // 0x41a0000000000000 = 2^27
+ mov b32 $r1 $r5 // copy the result $r4d into $r0d: high word
+ sched 0x28 0x2e 0x00 0x00 0x00 0x00 0x00
+ mov b32 $r0 $r4 // copy the result $r4d into $r0d: low word
ret
.section #gk110_builtin_offsets
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
index ce937a71f9..d66a146692 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
@@ -153,13 +153,51 @@ uint64_t gk110_builtin_code[] = {
0x204007ffff9c0404,
0x741fe8000003c01e,
0x741ff0000023c01e,
- 0x080000b8b8b0a094,
+ 0x089c80a8b8b0a094,
0x40000800001c0405,
0xe4c03c007f9c001a,
0xe4000000031c0002,
/* 0x04a0: rcp_end */
0x19000000001c003c,
/* 0x04a8: gk110_rsq_f64 */
+ 0xb4601fff801c021d,
+ 0x2100040000000404,
+ 0x203fffffff9c0408,
+ 0x08a0a094b0a0809c,
+ 0xc00000058a1c040d,
+ 0xb3201c00001c0c3d,
+ 0xe2001000011c000a,
+ 0xc400021a80040001,
+ 0x84000000039c0416,
+ 0xb2d01c03ff9c0c19,
+ 0xe2000000031c080a,
+ 0x08a0b8a09c80aca0,
+ 0xb3501c00001c081d,
+ 0x120000001000003c,
+ 0x20400000001c0404,
+ 0xe4c03c007f9c0002,
+ 0xe2001000029c0406,
+ 0x19000000001c003c,
+/* 0x0538: rsq_norm */
+ 0xe4c03c007f9c0012,
+ 0x08a4a4a4a4a4a080,
+ 0x741ff000001fc026,
+ 0xe4c03c007f9c0022,
+ 0xe4000000041c000a,
+ 0xe4000000021c0802,
+ 0xdb882000001c101a,
+ 0xdb801000031c1012,
+ 0xe4000000021c0802,
+ 0x0880a4a4a4a4a4a4,
+ 0xdb882000001c101a,
+ 0xdb801000031c1012,
+ 0xe4000000021c0802,
+ 0xdb882000001c101a,
+ 0xdb801000031c1012,
+ 0xc400020d00041011,
+ 0xe4c03c00029c0006,
+ 0x080000000000b8a0,
+ 0xe4c03c00021c0002,
0x19000000001c003c,
};
--
2.12.0
More information about the mesa-dev
mailing list