[Mesa-dev] [PATCH 10/13] gm107/ir: add fp64 rsq

Karol Herbst kherbst at redhat.com
Sun Jul 15 18:15:50 UTC 2018


Signed-off-by: Karol Herbst <kherbst at redhat.com>
---
 .../drivers/nouveau/codegen/lib/gm107.asm     | 78 ++++++++++++++++++-
 .../drivers/nouveau/codegen/lib/gm107.asm.h   | 51 +++++++++++-
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp |  2 +-
 3 files changed, 128 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
index 595d9dc5d41..faee0218d18 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
@@ -269,8 +269,84 @@ rcp_result_denorm:
 rcp_end:
    ret
 
+// RSQ F64
+//
+// INPUT:   $r0d
+// OUTPUT:  $r0d
+// CLOBBER: $r2 - $r9, $p0 - $p1
+//
 gm107_rsq_f64:
-   sched (st 0x0) (st 0x0) (st 0x0)
+   // Before computing the initial estimate with rsqrt64h, two special cases
+   // must be handled first.
+   // 1. NaN: set the highest mantissa bit so that rsqrt64h is guaranteed to
+   //    recognize the input as NaN.
+   sched (st 0xd wr 0x0 wt 0x3f) (st 0xd wt 0x1) (st 0xd)
+   dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1
+   $p0 lop32i or $r1 $r1 0x00080000
+   lop32i and $r2 $r1 0x7fffffff
+   // 2. Denorms and small normal values: using the original value would
+   //    lose precision either in rsqrt64h or in the first Newton-Raphson
+   //    step below. Use 2 as the threshold for the exponent field and scale
+   //    the input by 2^54 if its exponent is <= 2. (The result is multiplied
+   //    by 2^27 at the end to compensate.)
+   sched (st 0xd) (st 0xd) (st 0xd)
+   bfe u32 $r3 $r1 0xb14
+   isetp le u32 and $p1 1 $r3 0x2 1
+   lop or 1 $r2 $r0 $r2
+   sched (st 0xd wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xd)
+   $p1 dmul $r0 $r0 0x4350000000000000
+   mufu rsq64h $r5 $r1
+   // rsqrt64h already gives the correct result for 0/inf/nan. The following
+   // logic checks whether the input is one of those (exponent is 0x7ff, or
+   // all bits are 0 except for the sign bit).
+   iset ne u32 and $r6 $r3 0x7ff 1
+   sched (st 0xd) (st 0xd) (st 0xd)
+   lop and 1 $r2 $r2 $r6
+   isetp ne u32 and $p0 1 $r2 0x0 1
+   $p0 bra #rsq_norm
+   // For 0/inf/nan, make sure the sign bit matches the input's, then return
+   sched (st 0xd) (st 0xd) (st 0xd wt 0x1)
+   lop32i and $r1 $r1 0x80000000
+   mov $r0 0x0 0xf
+   lop or 1 $r1 $r1 $r5
+   sched (st 0xd) (st 0xf) (st 0xf)
+   ret
+   nop 0
+   nop 0
+rsq_norm:
+   // For all other inputs, do 4 Newton-Raphson steps using the formula:
+   //     RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
+   // In the code below, each step is written as:
+   //     tmp1 = 0.5 * x * RSQ_{n}
+   //     tmp2 = -RSQ_{n} * tmp1 + 0.5
+   //     RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
+   sched (st 0xd) (st 0xd wr 0x1) (st 0xd wr 0x1 rd 0x0 wt 0x3)
+   mov $r4 0x0 0xf
+   // 0x3f000000: 1/2
+   f2f f32 f64 $r8 0x3f000000
+   dmul $r2 $r0 $r8
+   sched (st 0xd wr 0x0 wt 0x3) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
+   dmul $r0 $r2 $r4
+   dfma $r6 $r0 neg $r4 $r8
+   dfma $r4 $r4 $r6 $r4
+   sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
+   dmul $r0 $r2 $r4
+   dfma $r6 $r0 neg $r4 $r8
+   dfma $r4 $r4 $r6 $r4
+   sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
+   dmul $r0 $r2 $r4
+   dfma $r6 $r0 neg $r4 $r8
+   dfma $r4 $r4 $r6 $r4
+   sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
+   dmul $r0 $r2 $r4
+   dfma $r6 $r0 neg $r4 $r8
+   dfma $r4 $r4 $r6 $r4
+   // For small inputs, multiply the result by 2^27 to undo the 2^54 scaling
+   sched (st 0xd wr 0x0 wt 0x1) (st 0xd wt 0x1) (st 0xd)
+   $p1 dmul $r4 $r4 0x41a0000000000000
+   mov $r1 $r5 0xf
+   mov $r0 $r4 0xf
+   sched (st 0xd) (st 0xf) (st 0xf)
    ret
    nop 0
    nop 0
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
index 206d01bde83..8eb27bbac99 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
@@ -182,7 +182,56 @@ uint64_t gm107_builtin_code[] = {
 /* 0x0558: rcp_end */
 	0xe32000000007000f,
 /* 0x0560: gm107_rsq_f64 */
-	0x001f8000fc0007e0,
+	0x001fb401fda1ff0d,
+	0x368c03fff0070087,
+	0x0420008000000101,
+	0x0407fffffff70102,
+	0x001fb400fda007ed,
+	0x38000000b1470103,
+	0x366603800027030f,
+	0x5c47020000270002,
+	0x001fb401e1a0070d,
+	0x3880004350010000,
+	0x5080000000770105,
+	0x365a03807ff70306,
+	0x001fb400fda007ed,
+	0x5c47000000670202,
+	0x5b6a03800ff70207,
+	0xe24000000400000f,
+	0x003fb400fda007ed,
+	0x0408000000070101,
+	0x5c9807800ff70000,
+	0x5c47020000570101,
+	0x001fbc00fde007ed,
+	0xe32000000007000f,
+	0x50b0000000070f00,
+	0x50b0000000070f00,
+/* 0x0620: rsq_norm */
+	0x0060b400e5a007ed,
+	0x5c9807800ff70004,
+	0x38a8003f00070b08,
+	0x5c80000000870002,
+	0x003c3401e1a01f0d,
+	0x5c80000000470200,
+	0x5b71040000470006,
+	0x5b70020000670404,
+	0x003c3401e1a00f0d,
+	0x5c80000000470200,
+	0x5b71040000470006,
+	0x5b70020000670404,
+	0x003c3401e1a00f0d,
+	0x5c80000000470200,
+	0x5b71040000470006,
+	0x5b70020000670404,
+	0x003c3401e1a00f0d,
+	0x5c80000000470200,
+	0x5b71040000470006,
+	0x5b70020000670404,
+	0x001fb401fda00f0d,
+	0x38800041a0010404,
+	0x5c98078000570001,
+	0x5c98078000470000,
+	0x001fbc00fde007ed,
 	0xe32000000007000f,
 	0x50b0000000070f00,
 	0x50b0000000070f00,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index d5318f179f2..a50e56436ba 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -127,7 +127,7 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
    bld.mkSplit(src, 4, i->getSrc(0));
 
    int chip = prog->getTarget()->getChipset();
-   if (chip >= NVISA_GK104_CHIPSET && (i->op == OP_RCP || chip < NVISA_GM107_CHIPSET)) {
+   if (chip >= NVISA_GK104_CHIPSET) {
       handleRCPRSQLib(i, src);
       return;
    }
-- 
2.17.1



More information about the mesa-dev mailing list