[Mesa-dev] [PATCH] gm107/ir/lib: use xmad for imul/imad

Rhys Perry pendingchaos02 at gmail.com
Mon Jul 16 14:25:41 UTC 2018


This seems to be about twice as fast, at the cost of larger code, one
more register used and clobbered ($r4), and assembly that is a little
more difficult to read.

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
---
 src/gallium/drivers/nouveau/codegen/lib/gm107.asm  | 92 ++++++++++++++-------
 .../drivers/nouveau/codegen/lib/gm107.asm.h        | 94 +++++++++++++++-------
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp      |  2 +-
 3 files changed, 128 insertions(+), 60 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
index 7ee5f8fc65..d7db3ee7fb 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
@@ -7,36 +7,56 @@
 //
 // INPUT:   $r0: dividend, $r1: divisor
 // OUTPUT:  $r0: result, $r1: modulus
-// CLOBBER: $r2 - $r3, $p0 - $p1
-// SIZE:    22 / 14 * 8 bytes
+// CLOBBER: $r2 - $r4, $p0 - $p1
+//
+// xmad $r4 s1 s0 s2
+// xmad mrg d0 s1 h1 s0 $r255
+// xmad psl cbcc d0 h1 s1 h1 d0 $r4
+// (the three xmad above) computes d0 = s0 * s1 + s2, clobbering $r4
 //
 gm107_div_u32:
    sched (st 0xd wr 0x0 wt 0x3f) (st 0x1 wt 0x1) (st 0x6)
    flo u32 $r2 $r1
    lop xor 1 $r2 $r2 0x1f
    mov $r3 0x1 0xf
-   sched (st 0x1) (st 0xf wr 0x0) (st 0x6 wr 0x0 wt 0x1)
+   sched (st 0x1) (st 0xf wr 0x0) (st 0x1 wt 0x1)
    shl $r2 $r3 $r2
    i2i u32 u32 $r1 neg $r1
-   imul u32 u32 $r3 $r1 $r2
-   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
+   xmad $r4 $r2 $r1 $r255
+   sched (st 0x6) (st 0x6) (st 0x6 wr 0x0)
+   xmad mrg $r3 $r2 h1 $r1 $r255
+   xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4
    imad u32 u32 hi $r2 $r2 $r3 $r2
-   imul u32 u32 $r3 $r1 $r2
+   sched (st 0x1 wt 0x1) (st 0x6) (st 0x6)
+   xmad $r4 $r2 $r1 $r255
+   xmad mrg $r3 $r2 h1 $r1 $r255
+   xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4
+   sched (st 0x6 wr 0x0) (st 0x1 wt 0x1) (st 0x6)
    imad u32 u32 hi $r2 $r2 $r3 $r2
-   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
-   imul u32 u32 $r3 $r1 $r2
+   xmad $r4 $r2 $r1 $r255
+   xmad mrg $r3 $r2 h1 $r1 $r255
+   sched (st 0x6) (st 0x6 wr 0x0 wt 0x1) (st 0x1 wt 0x1)
+   xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4
    imad u32 u32 hi $r2 $r2 $r3 $r2
-   imul u32 u32 $r3 $r1 $r2
-   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 rd 0x1 wt 0x1)
+   xmad $r4 $r2 $r1 $r255
+   sched (st 0x6) (st 0x6) (st 0x6 wr 0x0)
+   xmad mrg $r3 $r2 h1 $r1 $r255
+   xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4
    imad u32 u32 hi $r2 $r2 $r3 $r2
-   imul u32 u32 $r3 $r1 $r2
+   sched (st 0x1 wt 0x1) (st 0x6) (st 0x6)
+   xmad $r4 $r2 $r1 $r255
+   xmad mrg $r3 $r2 h1 $r1 $r255
+   xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4
+   sched (st 0x6 wr 0x0 rd 0x1) (st 0x6 wt 0x2) (st 0x6 wr 0x0 rd 0x1 wt 0x1)
    imad u32 u32 hi $r2 $r2 $r3 $r2
-   sched (st 0x6 wt 0x2) (st 0x6 wr 0x0 rd 0x1 wt 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x2)
    mov $r3 $r0 0xf
    imul u32 u32 hi $r0 $r0 $r2
+   sched (st 0xf wr 0x0 rd 0x1 wt 0x2) (st 0x1 wt 0x3) (st 0x6)
    i2i u32 u32 $r2 neg $r1
-   sched (st 0x6 wr 0x0 wt 0x3) (st 0xd wt 0x1) (st 0x1)
-   imad u32 u32 $r1 $r1 $r0 $r3
+   xmad $r4 $r0 $r1 $r3
+   xmad mrg $r1 $r0 h1 $r1 $r255
+   sched (st 0x6) (st 0xd) (st 0x1)
+   xmad psl cbcc $r1 h1 $r0 h1 $r1 $r4
    isetp ge u32 and $p0 1 $r1 $r2 1
    $p0 iadd $r1 $r1 neg $r2
    sched (st 0x5) (st 0xd) (st 0x1)
@@ -52,7 +72,7 @@ gm107_div_u32:
 //
 // INPUT:   $r0: dividend, $r1: divisor
 // OUTPUT:  $r0: result, $r1: modulus
-// CLOBBER: $r2 - $r3, $p0 - $p3
+// CLOBBER: $r2 - $r4, $p0 - $p3
 //
 gm107_div_s32:
    sched (st 0xd wt 0x3f) (st 0x1) (st 0x1 wr 0x0)
@@ -63,35 +83,51 @@ gm107_div_s32:
    i2i s32 s32 $r1 abs $r1
    flo u32 $r2 $r1
    lop xor 1 $r2 $r2 0x1f
-   sched (st 0x6) (st 0x1) (st 0xf wr 0x1)
+   sched (st 0x6) (st 0x1) (st 0xf wr 0x0)
    mov $r3 0x1 0xf
    shl $r2 $r3 $r2
    i2i u32 u32 $r1 neg $r1
-   sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
-   imul u32 u32 $r3 $r1 $r2
+   sched (st 0x1 wt 0x1) (st 0x6) (st 0x6)
+   xmad $r4 $r2 $r1 $r255
+   xmad mrg $r3 $r2 h1 $r1 $r255
+   xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4
+   sched (st 0x6 wr 0x0) (st 0x1 wt 0x1) (st 0x6)
    imad u32 u32 hi $r2 $r2 $r3 $r2
-   imul u32 u32 $r3 $r1 $r2
-   sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
+   xmad $r4 $r2 $r1 $r255
+   xmad mrg $r3 $r2 h1 $r1 $r255
+   sched (st 0x6) (st 0x6 wr 0x0) (st 0x1 wt 0x1)
+   xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4
    imad u32 u32 hi $r2 $r2 $r3 $r2
-   imul u32 u32 $r3 $r1 $r2
+   xmad $r4 $r2 $r1 $r255
+   sched (st 0x6) (st 0x6) (st 0x6 wr 0x0 wt 0x1)
+   xmad mrg $r3 $r2 h1 $r1 $r255
+   xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4
    imad u32 u32 hi $r2 $r2 $r3 $r2
-   sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
-   imul u32 u32 $r3 $r1 $r2
+   sched (st 0x1 wt 0x1) (st 0x6) (st 0x6)
+   xmad $r4 $r2 $r1 $r255
+   xmad mrg $r3 $r2 h1 $r1 $r255
+   xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4
+   sched (st 0x6 wr 0x0) (st 0x1 wt 0x1) (st 0x6)
    imad u32 u32 hi $r2 $r2 $r3 $r2
-   imul u32 u32 $r3 $r1 $r2
-   sched (st 0x6 wr 0x1 rd 0x2 wt 0x2) (st 0x2 wt 0x5) (st 0x6 wr 0x0 rd 0x1 wt 0x2)
+   xmad $r4 $r2 $r1 $r255
+   xmad mrg $r3 $r2 h1 $r1 $r255
+   sched (st 0x6) (st 0x6 wr 0x0 rd 0x1) (st 0x6 wt 0x2)
+   xmad psl cbcc $r3 h1 $r2 h1 $r3 $r4
    imad u32 u32 hi $r2 $r2 $r3 $r2
    mov $r3 $r0 0xf
+   sched (st 0x6 wr 0x0 rd 0x1 wt 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x2) (st 0x1 wt 0x3)
    imul u32 u32 hi $r0 $r0 $r2
-   sched (st 0xf wr 0x1 rd 0x2 wt 0x2) (st 0x6 wr 0x0 wt 0x5) (st 0xd wt 0x3)
    i2i u32 u32 $r2 neg $r1
-   imad u32 u32 $r1 $r1 $r0 $r3
+   xmad $r4 $r0 $r1 $r3
+   sched (st 0x6) (st 0x6) (st 0xd)
+   xmad mrg $r1 $r0 h1 $r1 $r255
+   xmad psl cbcc $r1 h1 $r0 h1 $r1 $r4
    isetp ge u32 and $p0 1 $r1 $r2 1
    sched (st 0x1) (st 0x5) (st 0xd)
    $p0 iadd $r1 $r1 neg $r2
    $p0 iadd $r0 $r0 0x1
    $p0 isetp ge u32 and $p0 1 $r1 $r2 1
-   sched (st 0x1) (st 0x2) (st 0xf wr 0x0)
+   sched (st 0x1) (st 0x1) (st 0xf wr 0x0)
    $p0 iadd $r1 $r1 neg $r2
    $p0 iadd $r0 $r0 0x1
    $p3 i2i s32 s32 $r0 neg $r0
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
index 65c93f7ae8..ec8316b72d 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
@@ -4,28 +4,44 @@ uint64_t gm107_builtin_code[] = {
 	0x5c30000000170002,
 	0x3847040001f70202,
 	0x3898078000170003,
-	0x003c1800e1e007e1,
+	0x003f8400e1e007e1,
 	0x5c48000000270302,
 	0x5ce0200000170a01,
-	0x5c38000000270103,
-	0x003c1801e0c00f06,
+	0x5b007f8000170204,
+	0x001c1800fcc007e6,
+	0x5b007fa800170203,
+	0x5b30021800370203,
 	0x5a40010000370202,
-	0x5c38000000270103,
+	0x001f9800fcc00fe1,
+	0x5b007f8000170204,
+	0x5b007fa800170203,
+	0x5b30021800370203,
+	0x001f9801fc200706,
 	0x5a40010000370202,
-	0x003c1801e0c00f06,
-	0x5c38000000270103,
+	0x5b007f8000170204,
+	0x5b007fa800170203,
+	0x003f8401e0c007e6,
+	0x5b30021800370203,
 	0x5a40010000370202,
-	0x5c38000000270103,
-	0x00241801e0c00f06,
+	0x5b007f8000170204,
+	0x001c1800fcc007e6,
+	0x5b007fa800170203,
+	0x5b30021800370203,
 	0x5a40010000370202,
-	0x5c38000000270103,
+	0x001f9800fcc00fe1,
+	0x5b007f8000170204,
+	0x5b007fa800170203,
+	0x5b30021800370203,
+	0x00241802fcc00106,
 	0x5a40010000370202,
-	0x00443c0120c017e6,
 	0x5c98078000070003,
 	0x5c38008000270000,
+	0x001f9803fc20110f,
 	0x5ce0200000170a02,
-	0x001f8401fda01f06,
-	0x5a00018000070101,
+	0x5b00018000170004,
+	0x5b007fa800170001,
+	0x001f8400fda007e6,
+	0x5b30021800170001,
 	0x5b6c038000270107,
 	0x5c11000000200101,
 	0x001f8400fda007e5,
@@ -36,7 +52,7 @@ uint64_t gm107_builtin_code[] = {
 	0x3810000000100000,
 	0xe32000000007000f,
 	0x50b0000000070f00,
-/* 0x0120: gm107_div_s32 */
+/* 0x01a0: gm107_div_s32 */
 	0x001c0400fc21ffed,
 	0x5b6303800ff70017,
 	0x5b6341000ff7011f,
@@ -45,35 +61,51 @@ uint64_t gm107_builtin_code[] = {
 	0x5ce2000000173a01,
 	0x5c30000000170002,
 	0x3847040001f70202,
-	0x001cbc00fc2007e6,
+	0x001c3c00fc2007e6,
 	0x3898078000170003,
 	0x5c48000000270302,
 	0x5ce0200000170a01,
-	0x005c9802e4c01726,
-	0x5c38000000270103,
+	0x001f9800fcc00fe1,
+	0x5b007f8000170204,
+	0x5b007fa800170203,
+	0x5b30021800370203,
+	0x001f9801fc200706,
 	0x5a40010000370202,
-	0x5c38000000270103,
-	0x005c9802e4c01726,
+	0x5b007f8000170204,
+	0x5b007fa800170203,
+	0x003f8400e0c007e6,
+	0x5b30021800370203,
 	0x5a40010000370202,
-	0x5c38000000270103,
+	0x5b007f8000170204,
+	0x003c1800fcc007e6,
+	0x5b007fa800170203,
+	0x5b30021800370203,
 	0x5a40010000370202,
-	0x005c9802e4c01726,
-	0x5c38000000270103,
+	0x001f9800fcc00fe1,
+	0x5b007f8000170204,
+	0x5b007fa800170203,
+	0x5b30021800370203,
+	0x001f9801fc200706,
 	0x5a40010000370202,
-	0x5c38000000270103,
-	0x00441805fc401226,
+	0x5b007f8000170204,
+	0x5b007fa800170203,
+	0x005f980020c007e6,
+	0x5b30021800370203,
 	0x5a40010000370202,
 	0x5c98078000070003,
+	0x007f840221e00906,
 	0x5c38008000270000,
-	0x007fb405e0c0122f,
 	0x5ce0200000170a02,
-	0x5a00018000070101,
+	0x5b00018000170004,
+	0x001fb400fcc007e6,
+	0x5b007fa800170001,
+	0x5b30021800170001,
 	0x5b6c038000270107,
 	0x001fb400fca007e1,
 	0x5c11000000200101,
 	0x3810000000100000,
 	0x5b6c038000200107,
-	0x001c3c00fc4007e1,
+	0x001c3c00fc2007e1,
 	0x5c11000000200101,
 	0x3810000000100000,
 	0x5ce0200000033a00,
@@ -81,8 +113,8 @@ uint64_t gm107_builtin_code[] = {
 	0x5ce0200000123a01,
 	0xe32000000007000f,
 	0x50b0000000070f00,
-/* 0x0280: gm107_rcp_f64 */
-/* 0x0280: gm107_rsq_f64 */
+/* 0x0380: gm107_rcp_f64 */
+/* 0x0380: gm107_rsq_f64 */
 	0x001f8000fc0007e0,
 	0xe32000000007000f,
 	0x50b0000000070f00,
@@ -91,7 +123,7 @@ uint64_t gm107_builtin_code[] = {
 
 uint64_t gm107_builtin_offsets[] = {
 	0x0000000000000000,
-	0x0000000000000120,
-	0x0000000000000280,
-	0x0000000000000280,
+	0x00000000000001a0,
+	0x0000000000000380,
+	0x0000000000000380,
 };
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 597dcdffbe..d51ebbe258 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -74,7 +74,7 @@ NVC0LegalizeSSA::handleDIV(Instruction *i)
    }
    call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
    bld.mkMovFromReg(i->getDef(0), i->op == OP_DIV ? 0 : 1);
-   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
+   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0x1e : 0x1d, 2);
    bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
 
    call->fixed = 1;
-- 
2.14.4



More information about the mesa-dev mailing list