[PATCH 1/2] drm/amdkfd: Replace gfx10 trap handler with correct branch

Cornwall, Jay Jay.Cornwall at amd.com
Wed Jul 24 17:27:45 UTC 2019


Previously submitted code was taken from an incorrect branch and
was non-functional.

Cc: Oak Zeng <oak.zeng at amd.com>
Signed-off-by: Jay Cornwall <jay.cornwall at amd.com>
---
 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h     |  578 +++---
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm | 1978 +++++++++-----------
 2 files changed, 1235 insertions(+), 1321 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 4275940..beabf16 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -680,24 +680,47 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
 };
 
 static const uint32_t cwsr_trap_gfx10_hex[] = {
-	0xbf820001, 0xbf82012e,
-	0xb0804004, 0xb970f802,
-	0x8a708670, 0xb971f803,
-	0x8771ff71, 0x00000400,
-	0xbf850008, 0xb971f803,
-	0x8771ff71, 0x000001ff,
-	0xbf850001, 0x806c846c,
+	0xbf820001, 0xbf8201b7,
+	0xb0804004, 0xb978f802,
+	0x8a788678, 0xb971f803,
+	0x876eff71, 0x00000400,
+	0xbf850033, 0x876eff71,
+	0x00000100, 0xbf840002,
+	0x8878ff78, 0x00002000,
+	0x8a77ff77, 0xff000000,
+	0xb96ef807, 0x876fff6e,
+	0x02000000, 0x8f6f866f,
+	0x88776f77, 0x876fff6e,
+	0x003f8000, 0x8f6f896f,
+	0x88776f77, 0x8a6eff6e,
+	0x023f8000, 0xb9eef807,
+	0xb970f812, 0xb971f813,
+	0x8ff08870, 0xf4051bb8,
+	0xfa000000, 0xbf8cc07f,
+	0xf4051c38, 0xfa000008,
+	0xbf8cc07f, 0x87ee6e6e,
+	0xbf840001, 0xbe80206e,
+	0xb971f803, 0x8771ff71,
+	0x000001ff, 0xbf850002,
+	0x806c846c, 0x826d806d,
+	0x876dff6d, 0x0000ffff,
+	0x906e8977, 0x876fff6e,
+	0x003f8000, 0x906e8677,
+	0x876eff6e, 0x02000000,
+	0x886e6f6e, 0xb9eef807,
+	0x87fe7e7e, 0x87ea6a6a,
+	0xb9f8f802, 0xbe80226c,
+	0xb971f803, 0x8771ff71,
+	0x00000100, 0xbf840006,
+	0xbef60380, 0xb9f60203,
 	0x876dff6d, 0x0000ffff,
-	0xbe80226c, 0xb971f803,
-	0x8771ff71, 0x00000100,
-	0xbf840006, 0xbef60380,
-	0xb9f60203, 0x876dff6d,
-	0x0000ffff, 0x80ec886c,
-	0x82ed806d, 0xbef60380,
-	0xb9f60283, 0xb973f816,
-	0xb9762c07, 0x8f769c76,
-	0x886d766d, 0xb97603c7,
-	0x8f769b76, 0x886d766d,
+	0x80ec886c, 0x82ed806d,
+	0xbef60380, 0xb9f60283,
+	0xb972f816, 0xb9762c07,
+	0x8f769a76, 0x886d766d,
+	0xb97603c7, 0x8f769976,
+	0x886d766d, 0xb9760647,
+	0x8f769876, 0x886d766d,
 	0xb976f807, 0x8776ff76,
 	0x00007fff, 0xb9f6f807,
 	0xbeee037e, 0xbeef037f,
@@ -706,276 +729,339 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0xbef4037e, 0x8775ff7f,
 	0x0000ffff, 0x8875ff75,
 	0x00040000, 0xbef60380,
-	0xbef703ff, 0x00807fac,
+	0xbef703ff, 0x10807fac,
 	0x8776ff7f, 0x08000000,
 	0x90768376, 0x88777677,
 	0x8776ff7f, 0x70000000,
 	0x90768176, 0x88777677,
 	0xbefb037c, 0xbefa0380,
-	0xb97202dc, 0x8872727f,
-	0xbefe03c1, 0x877c8172,
+	0xb97302dc, 0x8f739973,
+	0x8873737f, 0xb97a2a05,
+	0x807a817a, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850002, 0x8f7a897a,
+	0xbf820001, 0x8f7a8a7a,
+	0xb9761e06, 0x8f768a76,
+	0x807a767a, 0x807aff7a,
+	0x00000200, 0xbef60384,
+	0xbef603ff, 0x01000000,
+	0xbefe037c, 0xbefc037a,
+	0xf4611efa, 0xf8000000,
+	0x807a847a, 0xbefc037e,
+	0xbefe037c, 0xbefc037a,
+	0xf4611b3a, 0xf8000000,
+	0x807a847a, 0xbefc037e,
+	0xbefe037c, 0xbefc037a,
+	0xf4611b7a, 0xf8000000,
+	0x807a847a, 0xbefc037e,
+	0xbefe037c, 0xbefc037a,
+	0xf4611bba, 0xf8000000,
+	0x807a847a, 0xbefc037e,
+	0xbefe037c, 0xbefc037a,
+	0xf4611bfa, 0xf8000000,
+	0x807a847a, 0xbefc037e,
+	0xbefe037c, 0xbefc037a,
+	0xf4611e3a, 0xf8000000,
+	0x807a847a, 0xbefc037e,
+	0xb971f803, 0xbefe037c,
+	0xbefc037a, 0xf4611c7a,
+	0xf8000000, 0x807a847a,
+	0xbefc037e, 0xbefe037c,
+	0xbefc037a, 0xf4611cba,
+	0xf8000000, 0x807a847a,
+	0xbefc037e, 0xb97bf801,
+	0xbefe037c, 0xbefc037a,
+	0xf4611efa, 0xf8000000,
+	0x807a847a, 0xbefc037e,
+	0x8776ff7f, 0x04000000,
+	0xbeef0380, 0x886f6f76,
+	0xb97a2a05, 0x807a817a,
+	0x907c9973, 0x877c817c,
 	0xbf06817c, 0xbf850002,
-	0xbeff0380, 0xbf820001,
-	0xbeff03c1, 0xb9712a05,
-	0x80718171, 0x8f718271,
-	0x877c8172, 0xbf06817c,
-	0xbf85000d, 0x8f768771,
+	0x8f7a897a, 0xbf820001,
+	0x8f7a8a7a, 0xb9761e06,
+	0x8f768a76, 0x807a767a,
+	0x8f7682ff, 0x0000006a,
+	0xbef603ff, 0x01000000,
+	0xbef20374, 0x80747a74,
+	0x82758075, 0xbefc0380,
+	0xbf800000, 0xbe802f00,
+	0xbe822f02, 0xbe842f04,
+	0xbe862f06, 0xbe882f08,
+	0xbe8a2f0a, 0xbe8c2f0c,
+	0xbe8e2f0e, 0xf469003a,
+	0xfa000000, 0xf469013a,
+	0xfa000010, 0xf469023a,
+	0xfa000020, 0xf469033a,
+	0xfa000030, 0x8074c074,
+	0x82758075, 0x807c907c,
+	0xbf0aff7c, 0x00000060,
+	0xbf85ffea, 0xbe802f00,
+	0xbe822f02, 0xbe842f04,
+	0xbe862f06, 0xbe882f08,
+	0xf469003a, 0xfa000000,
+	0xf469013a, 0xfa000010,
+	0xf465023a, 0xfa000020,
+	0x8074c074, 0x82758075,
+	0xbef40372, 0xbefa0380,
+	0xbefe03c1, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850002, 0xbeff0380,
+	0xbf820002, 0xbeff03c1,
+	0xbf82000b, 0xbef603ff,
+	0x01000000, 0xe0704000,
+	0x7a5d0000, 0xe0704080,
+	0x7a5d0100, 0xe0704100,
+	0x7a5d0200, 0xe0704180,
+	0x7a5d0300, 0xbf82000a,
 	0xbef603ff, 0x01000000,
-	0xbefc0380, 0x7e008700,
-	0xe0704000, 0x7a5d0000,
-	0x807c817c, 0x807aff7a,
-	0x00000080, 0xbf0a717c,
-	0xbf85fff8, 0xbf82001b,
-	0x8f768871, 0xbef603ff,
-	0x01000000, 0xbefc0380,
-	0x7e008700, 0xe0704000,
-	0x7a5d0000, 0x807c817c,
-	0x807aff7a, 0x00000100,
-	0xbf0a717c, 0xbf85fff8,
-	0xb9711e06, 0x8771c171,
-	0xbf84000c, 0x8f718371,
-	0x80717c71, 0xbefe03c1,
-	0xbeff0380, 0x7e008700,
 	0xe0704000, 0x7a5d0000,
-	0x807c817c, 0x807aff7a,
-	0x00000080, 0xbf0a717c,
-	0xbf85fff8, 0xbf8a0000,
-	0x8776ff72, 0x04000000,
-	0xbf84002b, 0xbefe03c1,
-	0x877c8172, 0xbf06817c,
+	0xe0704100, 0x7a5d0100,
+	0xe0704200, 0x7a5d0200,
+	0xe0704300, 0x7a5d0300,
+	0xbefe03c1, 0x907c9973,
+	0x877c817c, 0xbf06817c,
 	0xbf850002, 0xbeff0380,
 	0xbf820001, 0xbeff03c1,
 	0xb9714306, 0x8771c171,
-	0xbf840021, 0x8f718671,
+	0xbf840046, 0xbf8a0000,
+	0x8776ff6f, 0x04000000,
+	0xbf840042, 0x8f718671,
 	0x8f718271, 0xbef60371,
+	0xb97a2a05, 0x807a817a,
+	0x907c9973, 0x877c817c,
+	0xbf06817c, 0xbf850002,
+	0x8f7a897a, 0xbf820001,
+	0x8f7a8a7a, 0xb9761e06,
+	0x8f768a76, 0x807a767a,
+	0x807aff7a, 0x00000200,
+	0x807aff7a, 0x00000080,
 	0xbef603ff, 0x01000000,
 	0xd7650000, 0x000100c1,
 	0xd7660000, 0x000200c1,
-	0x16000084, 0x877c8172,
-	0xbf06817c, 0xbefc0380,
-	0xbf85000a, 0x807cff7c,
-	0x00000080, 0x807aff7a,
-	0x00000080, 0xd5250000,
-	0x0001ff00, 0x00000080,
-	0xbf0a717c, 0xbf85fff7,
-	0xbf820009, 0x807cff7c,
-	0x00000100, 0x807aff7a,
-	0x00000100, 0xd5250000,
-	0x0001ff00, 0x00000100,
-	0xbf0a717c, 0xbf85fff7,
-	0x877c8172, 0xbf06817c,
-	0xbf850003, 0x8f7687ff,
-	0x0000006a, 0xbf820002,
-	0x8f7688ff, 0x0000006a,
-	0xbef603ff, 0x01000000,
-	0x877c8172, 0xbf06817c,
-	0xbefc0380, 0xbf800000,
-	0xbf85000b, 0xbe802e00,
-	0x7e000200, 0xe0704000,
-	0x7a5d0000, 0x807aff7a,
-	0x00000080, 0x807c817c,
-	0xbf0aff7c, 0x0000006a,
-	0xbf85fff6, 0xbf82000a,
-	0xbe802e00, 0x7e000200,
-	0xe0704000, 0x7a5d0000,
-	0x807aff7a, 0x00000100,
-	0x807c817c, 0xbf0aff7c,
-	0x0000006a, 0xbf85fff6,
-	0xbef60384, 0xbef603ff,
-	0x01000000, 0x877c8172,
-	0xbf06817c, 0xbf850030,
-	0x7e00027b, 0xe0704000,
-	0x7a5d0000, 0x807aff7a,
-	0x00000080, 0x7e00026c,
-	0xe0704000, 0x7a5d0000,
-	0x807aff7a, 0x00000080,
-	0x7e00026d, 0xe0704000,
-	0x7a5d0000, 0x807aff7a,
-	0x00000080, 0x7e00026e,
-	0xe0704000, 0x7a5d0000,
-	0x807aff7a, 0x00000080,
-	0x7e00026f, 0xe0704000,
-	0x7a5d0000, 0x807aff7a,
-	0x00000080, 0x7e000270,
+	0x16000084, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbefc0380, 0xbf850012,
+	0xbe8303ff, 0x00000080,
+	0xbf800000, 0xbf800000,
+	0xbf800000, 0xd8d80000,
+	0x01000000, 0xbf8c0000,
+	0xe0704000, 0x7a5d0100,
+	0x807c037c, 0x807a037a,
+	0xd5250000, 0x0001ff00,
+	0x00000080, 0xbf0a717c,
+	0xbf85fff4, 0xbf820011,
+	0xbe8303ff, 0x00000100,
+	0xbf800000, 0xbf800000,
+	0xbf800000, 0xd8d80000,
+	0x01000000, 0xbf8c0000,
+	0xe0704000, 0x7a5d0100,
+	0x807c037c, 0x807a037a,
+	0xd5250000, 0x0001ff00,
+	0x00000100, 0xbf0a717c,
+	0xbf85fff4, 0xbefe03c1,
+	0x907c9973, 0x877c817c,
+	0xbf06817c, 0xbf850004,
+	0xbefa03ff, 0x00000200,
+	0xbeff0380, 0xbf820003,
+	0xbefa03ff, 0x00000400,
+	0xbeff03c1, 0xb9712a05,
+	0x80718171, 0x8f718271,
+	0x907c9973, 0x877c817c,
+	0xbf06817c, 0xbf850018,
+	0x8f768771, 0xbef603ff,
+	0x01000000, 0xbefc0384,
+	0xbf0a717c, 0xbf840038,
+	0x7e008700, 0x7e028701,
+	0x7e048702, 0x7e068703,
 	0xe0704000, 0x7a5d0000,
-	0x807aff7a, 0x00000080,
-	0xb971f803, 0x7e000271,
+	0xe0704080, 0x7a5d0100,
+	0xe0704100, 0x7a5d0200,
+	0xe0704180, 0x7a5d0300,
+	0x807c847c, 0x807aff7a,
+	0x00000200, 0xbf0a717c,
+	0xbf85ffef, 0xbf820026,
+	0x8f768871, 0xbef603ff,
+	0x01000000, 0xbefc0384,
+	0xbf0a717c, 0xbf840020,
+	0x7e008700, 0x7e028701,
+	0x7e048702, 0x7e068703,
 	0xe0704000, 0x7a5d0000,
+	0xe0704100, 0x7a5d0100,
+	0xe0704200, 0x7a5d0200,
+	0xe0704300, 0x7a5d0300,
+	0x807c847c, 0x807aff7a,
+	0x00000400, 0xbf0a717c,
+	0xbf85ffef, 0xb9711e06,
+	0x8771c171, 0xbf84000c,
+	0x8f718371, 0x80717c71,
+	0xbefe03c1, 0xbeff0380,
+	0x7e008700, 0xe0704000,
+	0x7a5d0000, 0x807c817c,
 	0x807aff7a, 0x00000080,
-	0x7e000273, 0xe0704000,
-	0x7a5d0000, 0x807aff7a,
-	0x00000080, 0xb97bf801,
-	0x7e00027b, 0xe0704000,
-	0x7a5d0000, 0x807aff7a,
-	0x00000080, 0xbf82002f,
-	0x7e00027b, 0xe0704000,
-	0x7a5d0000, 0x807aff7a,
-	0x00000100, 0x7e00026c,
-	0xe0704000, 0x7a5d0000,
-	0x807aff7a, 0x00000100,
-	0x7e00026d, 0xe0704000,
-	0x7a5d0000, 0x807aff7a,
-	0x00000100, 0x7e00026e,
-	0xe0704000, 0x7a5d0000,
-	0x807aff7a, 0x00000100,
-	0x7e00026f, 0xe0704000,
-	0x7a5d0000, 0x807aff7a,
-	0x00000100, 0x7e000270,
-	0xe0704000, 0x7a5d0000,
-	0x807aff7a, 0x00000100,
-	0xb971f803, 0x7e000271,
-	0xe0704000, 0x7a5d0000,
-	0x807aff7a, 0x00000100,
-	0x7e000273, 0xe0704000,
-	0x7a5d0000, 0x807aff7a,
-	0x00000100, 0xb97bf801,
-	0x7e00027b, 0xe0704000,
-	0x7a5d0000, 0x807aff7a,
-	0x00000100, 0xbf820119,
-	0xbef4037e, 0x8775ff7f,
-	0x0000ffff, 0x8875ff75,
-	0x00040000, 0xbef60380,
-	0xbef703ff, 0x00807fac,
-	0x8772ff7f, 0x08000000,
-	0x90728372, 0x88777277,
-	0x8772ff7f, 0x70000000,
-	0x90728172, 0x88777277,
-	0xb97902dc, 0x8879797f,
-	0xbef80380, 0xbefe03c1,
-	0x877c8179, 0xbf06817c,
+	0xbf0a717c, 0xbf85fff8,
+	0xbf82013d, 0xbef4037e,
+	0x8775ff7f, 0x0000ffff,
+	0x8875ff75, 0x00040000,
+	0xbef60380, 0xbef703ff,
+	0x10807fac, 0x8772ff7f,
+	0x08000000, 0x90728372,
+	0x88777277, 0x8772ff7f,
+	0x70000000, 0x90728172,
+	0x88777277, 0xb97302dc,
+	0x8f739973, 0x8873737f,
+	0x8772ff7f, 0x04000000,
+	0xbf840036, 0xbefe03c1,
+	0x907c9973, 0x877c817c,
+	0xbf06817c, 0xbf850002,
+	0xbeff0380, 0xbf820001,
+	0xbeff03c1, 0xb96f4306,
+	0x876fc16f, 0xbf84002b,
+	0x8f6f866f, 0x8f6f826f,
+	0xbef6036f, 0xb9782a05,
+	0x80788178, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850002, 0x8f788978,
+	0xbf820001, 0x8f788a78,
+	0xb9721e06, 0x8f728a72,
+	0x80787278, 0x8078ff78,
+	0x00000200, 0x8078ff78,
+	0x00000080, 0xbef603ff,
+	0x01000000, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbefc0380, 0xbf850009,
+	0xe0310000, 0x781d0000,
+	0x807cff7c, 0x00000080,
+	0x8078ff78, 0x00000080,
+	0xbf0a6f7c, 0xbf85fff8,
+	0xbf820008, 0xe0310000,
+	0x781d0000, 0x807cff7c,
+	0x00000100, 0x8078ff78,
+	0x00000100, 0xbf0a6f7c,
+	0xbf85fff8, 0xbef80380,
+	0xbefe03c1, 0x907c9973,
+	0x877c817c, 0xbf06817c,
 	0xbf850002, 0xbeff0380,
 	0xbf820001, 0xbeff03c1,
 	0xb96f2a05, 0x806f816f,
-	0x8f6f826f, 0x877c8179,
-	0xbf06817c, 0xbf850013,
-	0x8f76876f, 0xbef603ff,
-	0x01000000, 0xbef20378,
-	0x8078ff78, 0x00000080,
-	0xbefc0381, 0xe0304000,
-	0x785d0000, 0xbf8c3f70,
-	0x7e008500, 0x807c817c,
-	0x8078ff78, 0x00000080,
-	0xbf0a6f7c, 0xbf85fff7,
-	0xe0304000, 0x725d0000,
-	0xbf820023, 0x8f76886f,
+	0x8f6f826f, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850022, 0x8f76876f,
 	0xbef603ff, 0x01000000,
 	0xbef20378, 0x8078ff78,
-	0x00000100, 0xbefc0381,
+	0x00000200, 0xbefc0384,
 	0xe0304000, 0x785d0000,
+	0xe0304080, 0x785d0100,
+	0xe0304100, 0x785d0200,
+	0xe0304180, 0x785d0300,
 	0xbf8c3f70, 0x7e008500,
-	0x807c817c, 0x8078ff78,
-	0x00000100, 0xbf0a6f7c,
-	0xbf85fff7, 0xb96f1e06,
-	0x876fc16f, 0xbf84000e,
-	0x8f6f836f, 0x806f7c6f,
-	0xbefe03c1, 0xbeff0380,
-	0xe0304000, 0x785d0000,
-	0xbf8c3f70, 0x7e008500,
-	0x807c817c, 0x8078ff78,
-	0x00000080, 0xbf0a6f7c,
-	0xbf85fff7, 0xbeff03c1,
+	0x7e028501, 0x7e048502,
+	0x7e068503, 0x807c847c,
+	0x8078ff78, 0x00000200,
+	0xbf0a6f7c, 0xbf85ffee,
 	0xe0304000, 0x725d0000,
-	0x8772ff79, 0x04000000,
-	0xbf840020, 0xbefe03c1,
-	0x877c8179, 0xbf06817c,
-	0xbf850002, 0xbeff0380,
-	0xbf820001, 0xbeff03c1,
-	0xb96f4306, 0x876fc16f,
-	0xbf840016, 0x8f6f866f,
-	0x8f6f826f, 0xbef6036f,
+	0xe0304080, 0x725d0100,
+	0xe0304100, 0x725d0200,
+	0xe0304180, 0x725d0300,
+	0xbf820032, 0x8f76886f,
 	0xbef603ff, 0x01000000,
-	0x877c8172, 0xbf06817c,
-	0xbefc0380, 0xbf850007,
-	0x807cff7c, 0x00000080,
-	0x8078ff78, 0x00000080,
-	0xbf0a6f7c, 0xbf85fffa,
-	0xbf820006, 0x807cff7c,
-	0x00000100, 0x8078ff78,
-	0x00000100, 0xbf0a6f7c,
-	0xbf85fffa, 0x877c8179,
-	0xbf06817c, 0xbf850003,
-	0x8f7687ff, 0x0000006a,
-	0xbf820002, 0x8f7688ff,
-	0x0000006a, 0xbef603ff,
-	0x01000000, 0x877c8179,
-	0xbf06817c, 0xbf850012,
-	0xf4211cba, 0xf0000000,
+	0xbef20378, 0x8078ff78,
+	0x00000400, 0xbefc0384,
+	0xe0304000, 0x785d0000,
+	0xe0304100, 0x785d0100,
+	0xe0304200, 0x785d0200,
+	0xe0304300, 0x785d0300,
+	0xbf8c3f70, 0x7e008500,
+	0x7e028501, 0x7e048502,
+	0x7e068503, 0x807c847c,
+	0x8078ff78, 0x00000400,
+	0xbf0a6f7c, 0xbf85ffee,
+	0xb96f1e06, 0x876fc16f,
+	0xbf84000e, 0x8f6f836f,
+	0x806f7c6f, 0xbefe03c1,
+	0xbeff0380, 0xe0304000,
+	0x785d0000, 0xbf8c3f70,
+	0x7e008500, 0x807c817c,
 	0x8078ff78, 0x00000080,
-	0xbefc0381, 0xf421003a,
-	0xf0000000, 0x8078ff78,
-	0x00000080, 0xbf8cc07f,
-	0xbe803000, 0xbf800000,
-	0x807c817c, 0xbf0aff7c,
-	0x0000006a, 0xbf85fff5,
-	0xbe800372, 0xbf820011,
-	0xf4211cba, 0xf0000000,
-	0x8078ff78, 0x00000100,
-	0xbefc0381, 0xf421003a,
-	0xf0000000, 0x8078ff78,
-	0x00000100, 0xbf8cc07f,
-	0xbe803000, 0xbf800000,
-	0x807c817c, 0xbf0aff7c,
-	0x0000006a, 0xbf85fff5,
-	0xbe800372, 0xbef60384,
+	0xbf0a6f7c, 0xbf85fff7,
+	0xbeff03c1, 0xe0304000,
+	0x725d0000, 0xe0304080,
+	0x725d0100, 0xe0304100,
+	0x725d0200, 0xe0304180,
+	0x725d0300, 0xb9782a05,
+	0x80788178, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850002, 0x8f788978,
+	0xbf820001, 0x8f788a78,
+	0xb9721e06, 0x8f728a72,
+	0x80787278, 0x8078ff78,
+	0x00000200, 0x80f8ff78,
+	0x00000058, 0x80f88878,
+	0x8f7682ff, 0x0000006a,
+	0xbef603ff, 0x01000000,
+	0xbefc03ff, 0x0000006a,
+	0xf425003a, 0xf0000000,
+	0x80f8a078, 0xbf8cc07f,
+	0x80fc827c, 0xbf800000,
+	0xbe803100, 0xf42d003a,
+	0xf0000000, 0x80f8c078,
+	0xbf8cc07f, 0x80fc887c,
+	0xbf800000, 0xbe803100,
+	0xbe823102, 0xbe843104,
+	0xbe863106, 0xf431003a,
+	0xf0000000, 0x80f8c078,
+	0xbf8cc07f, 0x80fc907c,
+	0xbf800000, 0xbe803100,
+	0xbe823102, 0xbe843104,
+	0xbe863106, 0xbe883108,
+	0xbe8a310a, 0xbe8c310c,
+	0xbe8e310e, 0xbf06807c,
+	0xbf84fff0, 0xb9782a05,
+	0x80788178, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850002, 0x8f788978,
+	0xbf820001, 0x8f788a78,
+	0xb9721e06, 0x8f728a72,
+	0x80787278, 0x8078ff78,
+	0x00000200, 0xbef60384,
 	0xbef603ff, 0x01000000,
-	0x877c8179, 0xbf06817c,
-	0xbf850025, 0xf4211bfa,
-	0xf0000000, 0x8078ff78,
-	0x00000080, 0xf4211b3a,
-	0xf0000000, 0x8078ff78,
-	0x00000080, 0xf4211b7a,
-	0xf0000000, 0x8078ff78,
-	0x00000080, 0xf4211eba,
-	0xf0000000, 0x8078ff78,
-	0x00000080, 0xf4211efa,
-	0xf0000000, 0x8078ff78,
-	0x00000080, 0xf4211c3a,
-	0xf0000000, 0x8078ff78,
-	0x00000080, 0xf4211c7a,
-	0xf0000000, 0x8078ff78,
-	0x00000080, 0xf4211cfa,
-	0xf0000000, 0x8078ff78,
-	0x00000080, 0xf4211e7a,
-	0xf0000000, 0x8078ff78,
-	0x00000080, 0xbf820024,
 	0xf4211bfa, 0xf0000000,
-	0x8078ff78, 0x00000100,
-	0xf4211b3a, 0xf0000000,
-	0x8078ff78, 0x00000100,
+	0x80788478, 0xf4211b3a,
+	0xf0000000, 0x80788478,
 	0xf4211b7a, 0xf0000000,
-	0x8078ff78, 0x00000100,
-	0xf4211eba, 0xf0000000,
-	0x8078ff78, 0x00000100,
+	0x80788478, 0xf4211eba,
+	0xf0000000, 0x80788478,
 	0xf4211efa, 0xf0000000,
-	0x8078ff78, 0x00000100,
-	0xf4211c3a, 0xf0000000,
-	0x8078ff78, 0x00000100,
+	0x80788478, 0xf4211c3a,
+	0xf0000000, 0x80788478,
 	0xf4211c7a, 0xf0000000,
-	0x8078ff78, 0x00000100,
+	0x80788478, 0xf4211e7a,
+	0xf0000000, 0x80788478,
 	0xf4211cfa, 0xf0000000,
-	0x8078ff78, 0x00000100,
-	0xf4211e7a, 0xf0000000,
-	0x8078ff78, 0x00000100,
-	0xbf8cc07f, 0x876dff6d,
+	0x80788478, 0xbf8cc07f,
+	0xbef2036d, 0x876dff72,
 	0x0000ffff, 0xbefc036f,
 	0xbefe037a, 0xbeff037b,
 	0x876f71ff, 0x000003ff,
-	0xb9ef4803, 0xb9f3f816,
+	0xb9ef4803, 0xb9f9f816,
 	0x876f71ff, 0xfffff800,
 	0x906f8b6f, 0xb9efa2c3,
-	0xb9f9f801, 0x876fff6d,
-	0xf0000000, 0x906f9c6f,
-	0x8f6f906f, 0xbef20380,
-	0x88726f72, 0x876fff6d,
-	0x08000000, 0x906f9b6f,
-	0x8f6f8f6f, 0x88726f72,
-	0x876fff70, 0x00800000,
-	0x906f976f, 0xb9f2f807,
-	0xb9f0f802, 0xbf8a0000,
-	0xbe80226c, 0xbf810000,
+	0xb9f3f801, 0x876fff72,
+	0xfc000000, 0x906f9a6f,
+	0x8f6f906f, 0xbef30380,
+	0x88736f73, 0x876fff72,
+	0x02000000, 0x906f996f,
+	0x8f6f8f6f, 0x88736f73,
+	0x876fff72, 0x01000000,
+	0x906f986f, 0x8f6f996f,
+	0x88736f73, 0x876fff70,
+	0x00800000, 0x906f976f,
+	0xb9f3f807, 0x87fe7e7e,
+	0x87ea6a6a, 0xb9f0f802,
+	0xbf8a0000, 0xbe80226c,
+	0xbf810000, 0xbf9f0000,
 	0xbf9f0000, 0xbf9f0000,
 	0xbf9f0000, 0xbf9f0000,
-	0xbf9f0000, 0x00000000,
 };
 static const uint32_t cwsr_trap_arcturus_hex[] = {
 	0xbf820001, 0xbf8202c4,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
index f20e463..261e054 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -20,1105 +20,933 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
+var SQ_WAVE_STATUS_INST_ATC_SHIFT		= 23
+var SQ_WAVE_STATUS_INST_ATC_MASK		= 0x00800000
+var SQ_WAVE_STATUS_SPI_PRIO_MASK		= 0x00000006
+var SQ_WAVE_STATUS_HALT_MASK			= 0x2000
+
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT		= 12
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE		= 9
+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT		= 8
+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE		= 6
+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT		= 24
+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE		= 4
+var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT	= 24
+var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE	= 4
+var SQ_WAVE_IB_STS2_WAVE64_SHIFT		= 11
+var SQ_WAVE_IB_STS2_WAVE64_SIZE			= 1
+
+var SQ_WAVE_TRAPSTS_SAVECTX_MASK		= 0x400
+var SQ_WAVE_TRAPSTS_EXCE_MASK			= 0x1FF
+var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT		= 10
+var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK		= 0x100
+var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT		= 8
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK		= 0x3FF
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT		= 0x0
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE		= 10
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK		= 0xFFFFF800
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT		= 11
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE		= 21
+var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK		= 0x800
+
+var SQ_WAVE_IB_STS_RCNT_SHIFT			= 16
+var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT		= 15
+var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT		= 25
+var SQ_WAVE_IB_STS_REPLAY_W64H_SIZE		= 1
+var SQ_WAVE_IB_STS_REPLAY_W64H_MASK		= 0x02000000
+var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE		= 1
+var SQ_WAVE_IB_STS_RCNT_SIZE			= 6
+var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK	= 0x003F8000
+var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG	= 0x00007FFF
+
+var SQ_BUF_RSRC_WORD1_ATC_SHIFT			= 24
+var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT		= 27
+
+// bits [31:24] unused by SPI debug data
+var TTMP11_SAVE_REPLAY_W64H_SHIFT		= 31
+var TTMP11_SAVE_REPLAY_W64H_MASK		= 0x80000000
+var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT		= 24
+var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK		= 0x7F000000
+
+// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
+// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
+var S_SAVE_BUF_RSRC_WORD1_STRIDE		= 0x00040000
+var S_SAVE_BUF_RSRC_WORD3_MISC			= 0x10807FAC
+
+var S_SAVE_SPI_INIT_ATC_MASK			= 0x08000000
+var S_SAVE_SPI_INIT_ATC_SHIFT			= 27
+var S_SAVE_SPI_INIT_MTYPE_MASK			= 0x70000000
+var S_SAVE_SPI_INIT_MTYPE_SHIFT			= 28
+var S_SAVE_SPI_INIT_FIRST_WAVE_MASK		= 0x04000000
+var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT		= 26
+
+var S_SAVE_PC_HI_RCNT_SHIFT			= 26
+var S_SAVE_PC_HI_RCNT_MASK			= 0xFC000000
+var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT		= 25
+var S_SAVE_PC_HI_FIRST_REPLAY_MASK		= 0x02000000
+var S_SAVE_PC_HI_REPLAY_W64H_SHIFT		= 24
+var S_SAVE_PC_HI_REPLAY_W64H_MASK		= 0x01000000
+
+var s_sgpr_save_num				= 106
+
+var s_save_spi_init_lo				= exec_lo
+var s_save_spi_init_hi				= exec_hi
+var s_save_pc_lo				= ttmp0
+var s_save_pc_hi				= ttmp1
+var s_save_exec_lo				= ttmp2
+var s_save_exec_hi				= ttmp3
+var s_save_status				= ttmp12
+var s_save_trapsts				= ttmp5
+var s_save_xnack_mask				= ttmp6
+var s_wave_size					= ttmp7
+var s_save_buf_rsrc0				= ttmp8
+var s_save_buf_rsrc1				= ttmp9
+var s_save_buf_rsrc2				= ttmp10
+var s_save_buf_rsrc3				= ttmp11
+var s_save_mem_offset				= ttmp14
+var s_save_alloc_size				= s_save_trapsts
+var s_save_tmp					= s_save_buf_rsrc2
+var s_save_m0					= ttmp15
+
+var S_RESTORE_BUF_RSRC_WORD1_STRIDE		= S_SAVE_BUF_RSRC_WORD1_STRIDE
+var S_RESTORE_BUF_RSRC_WORD3_MISC		= S_SAVE_BUF_RSRC_WORD3_MISC
+
+var S_RESTORE_SPI_INIT_ATC_MASK			= 0x08000000
+var S_RESTORE_SPI_INIT_ATC_SHIFT		= 27
+var S_RESTORE_SPI_INIT_MTYPE_MASK		= 0x70000000
+var S_RESTORE_SPI_INIT_MTYPE_SHIFT		= 28
+var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK		= 0x04000000
+var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT		= 26
+var S_WAVE_SIZE					= 25
+
+var S_RESTORE_PC_HI_RCNT_SHIFT			= S_SAVE_PC_HI_RCNT_SHIFT
+var S_RESTORE_PC_HI_RCNT_MASK			= S_SAVE_PC_HI_RCNT_MASK
+var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT		= S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+var S_RESTORE_PC_HI_FIRST_REPLAY_MASK		= S_SAVE_PC_HI_FIRST_REPLAY_MASK
+
+var s_restore_spi_init_lo			= exec_lo
+var s_restore_spi_init_hi			= exec_hi
+var s_restore_mem_offset			= ttmp12
+var s_restore_alloc_size			= ttmp3
+var s_restore_tmp				= ttmp6
+var s_restore_mem_offset_save			= s_restore_tmp
+var s_restore_m0				= s_restore_alloc_size
+var s_restore_mode				= ttmp7
+var s_restore_pc_lo				= ttmp0
+var s_restore_pc_hi				= ttmp1
+var s_restore_exec_lo				= ttmp14
+var s_restore_exec_hi				= ttmp15
+var s_restore_status				= ttmp4
+var s_restore_trapsts				= ttmp5
+var s_restore_xnack_mask			= ttmp13
+var s_restore_buf_rsrc0				= ttmp8
+var s_restore_buf_rsrc1				= ttmp9
+var s_restore_buf_rsrc2				= ttmp10
+var s_restore_buf_rsrc3				= ttmp11
+var s_restore_size				= ttmp7
 
 shader main
+	asic(DEFAULT)
+	type(CS)
+	wave_size(32)
 
-asic(DEFAULT)
-
-type(CS)
-
-wave_size(32)
-/*************************************************************************/
-/*					control on how to run the shader					 */
-/*************************************************************************/
-//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run)
-var EMU_RUN_HACK					=	0
-var EMU_RUN_HACK_RESTORE_NORMAL		=	0
-var EMU_RUN_HACK_SAVE_NORMAL_EXIT	=	0
-var	EMU_RUN_HACK_SAVE_SINGLE_WAVE	=	0
-var EMU_RUN_HACK_SAVE_FIRST_TIME	= 	0					//for interrupted restore in which the first save is through EMU_RUN_HACK
-var SAVE_LDS						= 	0
-var WG_BASE_ADDR_LO					=   0x9000a000
-var WG_BASE_ADDR_HI					=	0x0
-var WAVE_SPACE						=	0x9000				//memory size that each wave occupies in workgroup state mem, increase from 5000 to 9000 for more SGPR need to be saved
-var CTX_SAVE_CONTROL				=	0x0
-var CTX_RESTORE_CONTROL				=	CTX_SAVE_CONTROL
-var SIM_RUN_HACK					=	0					//any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run)
-var	SGPR_SAVE_USE_SQC				=	0					//use SQC D$ to do the write
-var USE_MTBUF_INSTEAD_OF_MUBUF		=	0					//need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC)
-var SWIZZLE_EN						=	0					//whether we use swizzled buffer addressing
-var SAVE_RESTORE_HWID_DDID          =   0
-var RESTORE_DDID_IN_SGPR18          =   0
-/**************************************************************************/
-/*                     	variables							              */
-/**************************************************************************/
-var SQ_WAVE_STATUS_INST_ATC_SHIFT  = 23
-var SQ_WAVE_STATUS_INST_ATC_MASK   = 0x00800000
-var SQ_WAVE_STATUS_SPI_PRIO_MASK   = 0x00000006
-
-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT	= 12
-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE		= 9
-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT	= 8
-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE	= 6
-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT	= 24
-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE	= 4						//FIXME	 sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
-var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT    = 24
-var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE     = 4
-var SQ_WAVE_IB_STS2_WAVE64_SHIFT        = 11
-var SQ_WAVE_IB_STS2_WAVE64_SIZE         = 1
-
-var	SQ_WAVE_TRAPSTS_SAVECTX_MASK	=	0x400
-var SQ_WAVE_TRAPSTS_EXCE_MASK       =   0x1FF          			// Exception mask
-var	SQ_WAVE_TRAPSTS_SAVECTX_SHIFT	=	10					
-var	SQ_WAVE_TRAPSTS_MEM_VIOL_MASK	=	0x100					
-var	SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT	=	8		
-var	SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK 	=	0x3FF
-var	SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT 	=	0x0
-var	SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE 	=	10
-var	SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK 	=	0xFFFFF800	
-var	SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT 	=	11
-var	SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE 	=	21	
-
-var SQ_WAVE_IB_STS_RCNT_SHIFT			=	16					//FIXME
-var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT	=	15					//FIXME
-var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE    =   1                   //FIXME
-var SQ_WAVE_IB_STS_RCNT_SIZE            =   6                   //FIXME
-var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG	= 0x00007FFF	//FIXME
- 
-var	SQ_BUF_RSRC_WORD1_ATC_SHIFT		=	24
-var	SQ_BUF_RSRC_WORD3_MTYPE_SHIFT	=	27
-
-
-/*      Save        */
-var	S_SAVE_BUF_RSRC_WORD1_STRIDE		=	0x00040000  		//stride is 4 bytes 
-var	S_SAVE_BUF_RSRC_WORD3_MISC			= 	0x00807FAC			//SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE			
-
-var	S_SAVE_SPI_INIT_ATC_MASK			=	0x08000000			//bit[27]: ATC bit
-var	S_SAVE_SPI_INIT_ATC_SHIFT			=	27
-var	S_SAVE_SPI_INIT_MTYPE_MASK			=	0x70000000			//bit[30:28]: Mtype
-var	S_SAVE_SPI_INIT_MTYPE_SHIFT			=	28
-var	S_SAVE_SPI_INIT_FIRST_WAVE_MASK		=	0x04000000			//bit[26]: FirstWaveInTG
-var	S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT	=	26
-
-var S_SAVE_PC_HI_RCNT_SHIFT				=	28					//FIXME	 check with Brian to ensure all fields other than PC[47:0] can be used
-var S_SAVE_PC_HI_RCNT_MASK				=   0xF0000000			//FIXME
-var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT		=	27					//FIXME
-var S_SAVE_PC_HI_FIRST_REPLAY_MASK		=	0x08000000			//FIXME
-
-var	s_save_spi_init_lo				=	exec_lo
-var s_save_spi_init_hi				=	exec_hi
-
-var	s_save_pc_lo			=	ttmp0			//{TTMP1, TTMP0} = {3��h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
-var	s_save_pc_hi			=	ttmp1			
-var s_save_exec_lo			=	ttmp2
-var s_save_exec_hi			= 	ttmp3			
-var	s_save_status			=	ttmp4			
-var	s_save_trapsts			=	ttmp5			//not really used until the end of the SAVE routine
-var s_wave_size         	=	ttmp6           //ttmp6 is not needed now, since it's only 32bit xnack mask, now use it to determine wave32 or wave64 in EMU_HACK
-var s_save_xnack_mask	    =	ttmp7
-var	s_save_buf_rsrc0		=	ttmp8
-var	s_save_buf_rsrc1		=	ttmp9
-var	s_save_buf_rsrc2		=	ttmp10
-var	s_save_buf_rsrc3		=	ttmp11
-
-var s_save_mem_offset		= 	ttmp14
-var s_sgpr_save_num         =   106                     //in gfx10, all sgpr must be saved
-var s_save_alloc_size		=	s_save_trapsts			//conflict
-var s_save_tmp              =   s_save_buf_rsrc2       	//shared with s_save_buf_rsrc2  (conflict: should not use mem access with s_save_tmp at the same time)
-var s_save_m0				=	ttmp15					
-
-/*      Restore     */
-var	S_RESTORE_BUF_RSRC_WORD1_STRIDE			=	S_SAVE_BUF_RSRC_WORD1_STRIDE 
-var	S_RESTORE_BUF_RSRC_WORD3_MISC			= 	S_SAVE_BUF_RSRC_WORD3_MISC		 
-
-var	S_RESTORE_SPI_INIT_ATC_MASK			    =	0x08000000			//bit[27]: ATC bit
-var	S_RESTORE_SPI_INIT_ATC_SHIFT			=	27
-var	S_RESTORE_SPI_INIT_MTYPE_MASK			=	0x70000000			//bit[30:28]: Mtype
-var	S_RESTORE_SPI_INIT_MTYPE_SHIFT			=	28
-var	S_RESTORE_SPI_INIT_FIRST_WAVE_MASK		=	0x04000000			//bit[26]: FirstWaveInTG
-var	S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT	    =	26
-
-var S_RESTORE_PC_HI_RCNT_SHIFT				=	S_SAVE_PC_HI_RCNT_SHIFT
-var S_RESTORE_PC_HI_RCNT_MASK				=   S_SAVE_PC_HI_RCNT_MASK
-var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT		=	S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
-var S_RESTORE_PC_HI_FIRST_REPLAY_MASK		=	S_SAVE_PC_HI_FIRST_REPLAY_MASK
-
-var s_restore_spi_init_lo                   =   exec_lo
-var s_restore_spi_init_hi                   =   exec_hi
-
-var s_restore_mem_offset		= 	ttmp12
-var s_restore_alloc_size		=	ttmp3
-var s_restore_tmp           	=   ttmp6
-var s_restore_mem_offset_save	= 	s_restore_tmp 		//no conflict
-
-var s_restore_m0			=	s_restore_alloc_size	//no conflict			
-
-var s_restore_mode			=  	ttmp13
-var s_restore_hwid1         =  ttmp2
-var s_restore_ddid          =  s_restore_hwid1
-var	s_restore_pc_lo		    =	ttmp0			
-var	s_restore_pc_hi		    =	ttmp1
-var s_restore_exec_lo		=	ttmp14
-var s_restore_exec_hi		= 	ttmp15
-var	s_restore_status	    =	ttmp4			
-var	s_restore_trapsts	    =	ttmp5
-//var s_restore_xnack_mask_lo	=	xnack_mask_lo
-//var s_restore_xnack_mask_hi	=	xnack_mask_hi
-var s_restore_xnack_mask    =   ttmp7
-var	s_restore_buf_rsrc0		=	ttmp8
-var	s_restore_buf_rsrc1		=	ttmp9
-var	s_restore_buf_rsrc2		=	ttmp10
-var	s_restore_buf_rsrc3		=	ttmp11
-var s_restore_size         	=	ttmp13                  //ttmp13 has no conflict
-
-/**************************************************************************/
-/*                     	trap handler entry points			              */
-/**************************************************************************/
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) 					//hack to use trap_id for determining save/restore
-		//FIXME VCCZ un-init assertion s_getreg_b32  	s_save_status, hwreg(HW_REG_STATUS)			//save STATUS since we will change SCC
-		s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 				//change SCC
-    	s_cmp_eq_u32 s_save_tmp, 0x007e0000  						//Save: trap_id = 0x7e. Restore: trap_id = 0x7f.  
-    	s_cbranch_scc0 L_JUMP_TO_RESTORE							//do not need to recover STATUS here  since we are going to RESTORE
-		//FIXME  s_setreg_b32 	hwreg(HW_REG_STATUS), 	s_save_status		//need to recover STATUS since we are going to SAVE	
-		s_branch L_SKIP_RESTORE 									//NOT restore, SAVE actually
-	else	
-		s_branch L_SKIP_RESTORE 									//NOT restore. might be a regular trap or save
-    end
+	s_branch	L_SKIP_RESTORE						//NOT restore. might be a regular trap or save
 
 L_JUMP_TO_RESTORE:
-    s_branch L_RESTORE												//restore
+	s_branch	L_RESTORE
 
 L_SKIP_RESTORE:
-	
-	s_getreg_b32  	s_save_status, hwreg(HW_REG_STATUS)								//save STATUS since we will change SCC
-    s_andn2_b32		s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK      //check whether this is for save
-	s_getreg_b32  	s_save_trapsts, hwreg(HW_REG_TRAPSTS)    		 				
-	s_and_b32		s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK	//check whether this is for save  
-	s_cbranch_scc1	L_SAVE															//this is the operation for save
-
-    // *********    Handle non-CWSR traps       *******************
-    if (!EMU_RUN_HACK)
-		s_getreg_b32     s_save_trapsts, hwreg(HW_REG_TRAPSTS)
-		s_and_b32        s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
-		s_cbranch_scc1  L_EXCP_CASE   // Exception, jump back to the shader program directly.
-		s_add_u32    ttmp0, ttmp0, 4   // S_TRAP case, add 4 to ttmp0 
-		
-		L_EXCP_CASE:
-		s_and_b32    ttmp1, ttmp1, 0xFFFF
-		s_rfe_b64    [ttmp0, ttmp1]
-	end
-    // *********        End handling of non-CWSR traps   *******************
-
-/**************************************************************************/
-/*                     	save routine						              */
-/**************************************************************************/
-
-L_SAVE:	
-	
+	s_getreg_b32	s_save_status, hwreg(HW_REG_STATUS)			//save STATUS since we will change SCC
+	s_andn2_b32	s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK
+	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+	s_and_b32	ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK	//check whether this is for save
+	s_cbranch_scc1	L_SAVE
+
+	// If STATUS.MEM_VIOL is asserted then halt the wave to prevent
+	// the exception raising again and blocking context save.
+	s_and_b32	ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+	s_cbranch_scc0	L_FETCH_2ND_TRAP
+	s_or_b32	s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
+
+L_FETCH_2ND_TRAP:
+	// Preserve and clear scalar XNACK state before issuing scalar loads.
+	// Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
+	// unused space ttmp11[31:24].
+	s_andn2_b32	ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)
+	s_getreg_b32	ttmp2, hwreg(HW_REG_IB_STS)
+	s_and_b32	ttmp3, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
+	s_lshl_b32	ttmp3, ttmp3, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
+	s_or_b32	ttmp11, ttmp11, ttmp3
+	s_and_b32	ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
+	s_lshl_b32	ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
+	s_or_b32	ttmp11, ttmp11, ttmp3
+	s_andn2_b32	ttmp2, ttmp2, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
+	s_setreg_b32	hwreg(HW_REG_IB_STS), ttmp2
+
+	// Read second-level TBA/TMA from first-level TMA and jump if available.
+	// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
+	// ttmp12 holds SQ_WAVE_STATUS
+	s_getreg_b32	ttmp4, hwreg(HW_REG_SHADER_TMA_LO)
+	s_getreg_b32	ttmp5, hwreg(HW_REG_SHADER_TMA_HI)
+	s_lshl_b64	[ttmp4, ttmp5], [ttmp4, ttmp5], 0x8
+	s_load_dwordx2	[ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1		// second-level TBA
+	s_waitcnt	lgkmcnt(0)
+	s_load_dwordx2	[ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1		// second-level TMA
+	s_waitcnt	lgkmcnt(0)
+	s_and_b64	[ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
+	s_cbranch_scc0	L_NO_NEXT_TRAP						// second-level trap handler not been set
+	s_setpc_b64	[ttmp2, ttmp3]						// jump to second-level trap handler
+
+L_NO_NEXT_TRAP:
+	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+	s_and_b32	s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK
+	s_cbranch_scc1	L_EXCP_CASE						// Exception, jump back to the shader program directly.
+	s_add_u32	ttmp0, ttmp0, 4						// S_TRAP case, add 4 to ttmp0
+	s_addc_u32	ttmp1, ttmp1, 0
+L_EXCP_CASE:
+	s_and_b32	ttmp1, ttmp1, 0xFFFF
+
+	// Restore SQ_WAVE_IB_STS.
+	s_lshr_b32	ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
+	s_and_b32	ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
+	s_lshr_b32	ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
+	s_and_b32	ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
+	s_or_b32	ttmp2, ttmp2, ttmp3
+	s_setreg_b32	hwreg(HW_REG_IB_STS), ttmp2
+
+	// Restore SQ_WAVE_STATUS.
+	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
+	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32
+	s_setreg_b32	hwreg(HW_REG_STATUS), s_save_status
+
+	s_rfe_b64	[ttmp0, ttmp1]
+
+L_SAVE:
 	//check whether there is mem_viol
-	s_getreg_b32  	s_save_trapsts, hwreg(HW_REG_TRAPSTS)    		 				
-	s_and_b32		s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK			
+	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+	s_and_b32	s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
 	s_cbranch_scc0	L_NO_PC_REWIND
-    
+
 	//if so, need rewind PC assuming GDS operation gets NACKed
-	s_mov_b32       s_save_tmp, 0															//clear mem_viol bit
-	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp	//clear mem_viol bit 
-	s_and_b32 		s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
-	s_sub_u32 		s_save_pc_lo, s_save_pc_lo, 8             //pc[31:0]-8
-	s_subb_u32 		s_save_pc_hi, s_save_pc_hi, 0x0			  // -scc
+	s_mov_b32	s_save_tmp, 0
+	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp	//clear mem_viol bit
+	s_and_b32	s_save_pc_hi, s_save_pc_hi, 0x0000ffff			//pc[47:32]
+	s_sub_u32	s_save_pc_lo, s_save_pc_lo, 8				//pc[31:0]-8
+	s_subb_u32	s_save_pc_hi, s_save_pc_hi, 0x0
 
 L_NO_PC_REWIND:
-    s_mov_b32       s_save_tmp, 0															//clear saveCtx bit
-	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp		//clear saveCtx bit   
-
-	//s_mov_b32		s_save_xnack_mask_lo,	xnack_mask_lo									//save XNACK_MASK  
-	//s_mov_b32		s_save_xnack_mask_hi,	xnack_mask_hi
-    s_getreg_b32	s_save_xnack_mask,  hwreg(HW_REG_SHADER_XNACK_MASK)  
-	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)					//save RCNT
-	s_lshl_b32		s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
-	s_or_b32		s_save_pc_hi, s_save_pc_hi, s_save_tmp
-	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)	//save FIRST_REPLAY
-	s_lshl_b32		s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
-	s_or_b32		s_save_pc_hi, s_save_pc_hi, s_save_tmp
-	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS)										//clear RCNT and FIRST_REPLAY in IB_STS
-	s_and_b32		s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
+	s_mov_b32	s_save_tmp, 0
+	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp	//clear saveCtx bit
+
+	s_getreg_b32	s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
+	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)
+	s_lshl_b32	s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
+	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp
+	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)
+	s_lshl_b32	s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp
+	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT, SQ_WAVE_IB_STS_REPLAY_W64H_SIZE)
+	s_lshl_b32	s_save_tmp, s_save_tmp, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
+	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp
+	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS)			//clear RCNT and FIRST_REPLAY and REPLAY_W64H in IB_STS
+	s_and_b32	s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
 
 	s_setreg_b32	hwreg(HW_REG_IB_STS), s_save_tmp
-    
-	/*		inform SPI the readiness and wait for SPI's go signal */
-	s_mov_b32		s_save_exec_lo,	exec_lo													//save EXEC and use EXEC for the go signal from SPI
-	s_mov_b32		s_save_exec_hi,	exec_hi
-	s_mov_b64		exec, 	0x0																//clear EXEC to get ready to receive
-	if (EMU_RUN_HACK)
-	
-	else
-		s_sendmsg	sendmsg(MSG_SAVEWAVE)													//send SPI a message and wait for SPI's write to EXEC  
-	end
-
-  L_SLEEP:		
-	s_sleep 0x2
-	
-	if (EMU_RUN_HACK)
-																							
-	else
-		s_cbranch_execz	L_SLEEP                                                         
-	end
-
-
-	/*      setup Resource Contants    */
-	if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))	
-		//calculate wd_addr using absolute thread id 
-		v_readlane_b32 s_save_tmp, v9, 0
-        //determine it is wave32 or wave64
-        s_getreg_b32 	s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
-        s_cmp_eq_u32    s_wave_size, 0
-        s_cbranch_scc1  L_SAVE_WAVE32
-        s_lshr_b32 s_save_tmp, s_save_tmp, 6 //SAVE WAVE64
-        s_branch    L_SAVE_CON
-    L_SAVE_WAVE32:
-        s_lshr_b32 s_save_tmp, s_save_tmp, 5 //SAVE WAVE32
-    L_SAVE_CON:
-		s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
-		s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
-		s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
-		s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL		
-	else
-	end
-	if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
-		s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
-		s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
-		s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL		
-	else
-	end
-	
-	
-	s_mov_b32		s_save_buf_rsrc0, 	s_save_spi_init_lo														//base_addr_lo
-	s_and_b32		s_save_buf_rsrc1, 	s_save_spi_init_hi, 0x0000FFFF											//base_addr_hi
-	s_or_b32		s_save_buf_rsrc1, 	s_save_buf_rsrc1,  S_SAVE_BUF_RSRC_WORD1_STRIDE
-    s_mov_b32       s_save_buf_rsrc2,   0                                               						//NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
-	s_mov_b32		s_save_buf_rsrc3, 	S_SAVE_BUF_RSRC_WORD3_MISC
-	s_and_b32		s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK		
-	s_lshr_b32		s_save_tmp,  		s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)			//get ATC bit into position
-	s_or_b32		s_save_buf_rsrc3, 	s_save_buf_rsrc3,  s_save_tmp											//or ATC
-	s_and_b32		s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK		
-	s_lshr_b32		s_save_tmp,  		s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)		//get MTYPE bits into position
-	s_or_b32		s_save_buf_rsrc3, 	s_save_buf_rsrc3,  s_save_tmp											//or MTYPE	
-	
-	s_mov_b32		s_save_m0,			m0																	//save M0
-	
-	/* 		global mem offset			*/
-	s_mov_b32		s_save_mem_offset, 	0x0																		//mem offset initial value = 0
-    s_getreg_b32 	s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //get wave_save_size
-    s_or_b32        s_wave_size, s_save_spi_init_hi,    s_wave_size                                             //share s_wave_size with exec_hi
-
-    /*      	save VGPRs	    */
-	//////////////////////////////
-  L_SAVE_VGPR:
-  
- 	s_mov_b32		exec_lo, 0xFFFFFFFF 											//need every thread from now on
-    s_and_b32       m0, s_wave_size, 1
-    s_cmp_eq_u32    m0, 1  
-    s_cbranch_scc1  L_ENABLE_SAVE_VGPR_EXEC_HI   
-    s_mov_b32		exec_hi, 0x00000000
-    s_branch        L_SAVE_VGPR_NORMAL
-  L_ENABLE_SAVE_VGPR_EXEC_HI:
-	s_mov_b32		exec_hi, 0xFFFFFFFF
-  L_SAVE_VGPR_NORMAL:	
-	s_getreg_b32 	s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) 					//vpgr_size
-	//for wave32 and wave64, the num of vgpr function is the same?
-    s_add_u32 		s_save_alloc_size, s_save_alloc_size, 1
-	s_lshl_b32 		s_save_alloc_size, s_save_alloc_size, 2 						//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)   //FIXME for GFX, zero is possible
-    //determine it is wave32 or wave64
-    s_and_b32       m0, s_wave_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_cbranch_scc1  L_SAVE_VGPR_WAVE64
-
-    //zhenxu added it for save vgpr for wave32
-	s_lshl_b32		s_save_buf_rsrc2,  s_save_alloc_size, 7							//NUM_RECORDS in bytes (32 threads*4)
-	if (SWIZZLE_EN)
-		s_add_u32		s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
-	else
-		s_mov_b32		s_save_buf_rsrc2,  0x1000000								//NUM_RECORDS in bytes
-	end
-	
-    s_mov_b32 		m0, 0x0 														//VGPR initial index value =0
-	//s_set_gpr_idx_on  m0, 0x1														//M0[7:0] = M0[7:0] and M0[15:12] = 0x1
-    //s_add_u32		s_save_alloc_size, s_save_alloc_size, 0x1000					//add 0x1000 since we compare m0 against it later, doesn't need this in gfx10
-
-  L_SAVE_VGPR_WAVE32_LOOP: 										
-	v_movrels_b32 		v0, v0															//v0 = v[0+m0]	
-	    
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)       
-		tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
-		buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-	end
-
-    s_add_u32		m0, m0, 1														//next vgpr index
-	s_add_u32		s_save_mem_offset, s_save_mem_offset, 128						//every buffer_store_dword does 128 bytes
-	s_cmp_lt_u32 	m0,	s_save_alloc_size 											//scc = (m0 < s_save_alloc_size) ? 1 : 0
-	s_cbranch_scc1 	L_SAVE_VGPR_WAVE32_LOOP												//VGPR save is complete?
-    s_branch    L_SAVE_LDS
-    //save vgpr for wave32 ends
-
-  L_SAVE_VGPR_WAVE64:
-	s_lshl_b32		s_save_buf_rsrc2,  s_save_alloc_size, 8							//NUM_RECORDS in bytes (64 threads*4)
-	if (SWIZZLE_EN)
-		s_add_u32		s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
-	else
-		s_mov_b32		s_save_buf_rsrc2,  0x1000000								//NUM_RECORDS in bytes
-	end
-	
-    s_mov_b32 		m0, 0x0 														//VGPR initial index value =0
-	//s_set_gpr_idx_on  m0, 0x1														//M0[7:0] = M0[7:0] and M0[15:12] = 0x1
-    //s_add_u32		s_save_alloc_size, s_save_alloc_size, 0x1000					//add 0x1000 since we compare m0 against it later, doesn't need this in gfx10
-
-  L_SAVE_VGPR_WAVE64_LOOP: 										
-	v_movrels_b32 		v0, v0															//v0 = v[0+m0]	
-	    
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)       
-		tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
-		buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-	end
-
-    s_add_u32		m0, m0, 1														//next vgpr index
-	s_add_u32		s_save_mem_offset, s_save_mem_offset, 256						//every buffer_store_dword does 256 bytes
-	s_cmp_lt_u32 	m0,	s_save_alloc_size 											//scc = (m0 < s_save_alloc_size) ? 1 : 0
-	s_cbranch_scc1 	L_SAVE_VGPR_WAVE64_LOOP												//VGPR save is complete?
-	//s_set_gpr_idx_off
-    //
-    //Below part will be the save shared vgpr part (new for gfx10)
-    s_getreg_b32 	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) 			//shared_vgpr_size
-    s_and_b32		s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF				//shared_vgpr_size is zero?
-    s_cbranch_scc0	L_SAVE_LDS													    //no shared_vgpr used? jump to L_SAVE_LDS
-    s_lshl_b32 		s_save_alloc_size, s_save_alloc_size, 3 						//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
-    //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
-    //save shared_vgpr will start from the index of m0
-    s_add_u32       s_save_alloc_size, s_save_alloc_size, m0
-    s_mov_b32		exec_lo, 0xFFFFFFFF
-    s_mov_b32		exec_hi, 0x00000000
-    L_SAVE_SHARED_VGPR_WAVE64_LOOP: 										
-	v_movrels_b32 		v0, v0															//v0 = v[0+m0]	
-	buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-    s_add_u32		m0, m0, 1														//next vgpr index
-	s_add_u32		s_save_mem_offset, s_save_mem_offset, 128						//every buffer_store_dword does 256 bytes
-	s_cmp_lt_u32 	m0,	s_save_alloc_size 											//scc = (m0 < s_save_alloc_size) ? 1 : 0
-	s_cbranch_scc1 	L_SAVE_SHARED_VGPR_WAVE64_LOOP									//SHARED_VGPR save is complete?
-    
-	/*      	save LDS	    */
-	//////////////////////////////
-  L_SAVE_LDS:
-
-    //Only check the first wave need LDS
-	/*      the first wave in the threadgroup    */
-	s_barrier																		//FIXME  not performance-optimal "LDS is used? wait for other waves in the same TG" 
-	s_and_b32		s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK								//exec is still used here
-	s_cbranch_scc0	L_SAVE_SGPR
-	
-	s_mov_b32		exec_lo, 0xFFFFFFFF 											//need every thread from now on
-    s_and_b32       m0, s_wave_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_cbranch_scc1  L_ENABLE_SAVE_LDS_EXEC_HI   
-    s_mov_b32		exec_hi, 0x00000000
-    s_branch        L_SAVE_LDS_NORMAL
-  L_ENABLE_SAVE_LDS_EXEC_HI:
-	s_mov_b32		exec_hi, 0xFFFFFFFF
-  L_SAVE_LDS_NORMAL:	
-	s_getreg_b32 	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) 			//lds_size
-	s_and_b32		s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF				//lds_size is zero?
-	s_cbranch_scc0	L_SAVE_SGPR														//no lds used? jump to L_SAVE_VGPR
-	s_lshl_b32 		s_save_alloc_size, s_save_alloc_size, 6 						//LDS size in dwords = lds_size * 64dw
-	s_lshl_b32 		s_save_alloc_size, s_save_alloc_size, 2 						//LDS size in bytes
-	s_mov_b32		s_save_buf_rsrc2,  s_save_alloc_size  							//NUM_RECORDS in bytes
-	if (SWIZZLE_EN)
-		s_add_u32		s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
-	else
-		s_mov_b32		s_save_buf_rsrc2,  0x1000000								//NUM_RECORDS in bytes
-	end
-
-    //load 0~63*4(byte address) to vgpr v15
-    v_mbcnt_lo_u32_b32 v0, -1, 0
-    v_mbcnt_hi_u32_b32 v0, -1, v0
-    v_mul_u32_u24 v0, 4, v0
-
-    s_and_b32       m0, s_wave_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_mov_b32 		m0, 0x0
-    s_cbranch_scc1  L_SAVE_LDS_LOOP_W64
-
-  L_SAVE_LDS_LOOP_W32:									
-	if (SAVE_LDS)
-    ds_read_b32 v1, v0
-    s_waitcnt 0														    //ensure data ready
-    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-	//buffer_store_lds_dword	s_save_buf_rsrc0, s_save_mem_offset lds:1               //save lds to memory doesn't exist in 10
-	end
-	s_add_u32		m0, m0, 128															//every buffer_store_lds does 128 bytes
-	s_add_u32		s_save_mem_offset, s_save_mem_offset, 128							//mem offset increased by 128 bytes
-    v_add_nc_u32    v0, v0, 128
-	s_cmp_lt_u32	m0, s_save_alloc_size												//scc=(m0 < s_save_alloc_size) ? 1 : 0
-	s_cbranch_scc1  L_SAVE_LDS_LOOP_W32													//LDS save is complete?
-    s_branch        L_SAVE_SGPR
-
-  L_SAVE_LDS_LOOP_W64:									
-	if (SAVE_LDS)
-    ds_read_b32 v1, v0
-    s_waitcnt 0														    //ensure data ready
-    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-	//buffer_store_lds_dword	s_save_buf_rsrc0, s_save_mem_offset lds:1               //save lds to memory doesn't exist in 10
-	end
-	s_add_u32		m0, m0, 256															//every buffer_store_lds does 256 bytes
-	s_add_u32		s_save_mem_offset, s_save_mem_offset, 256							//mem offset increased by 256 bytes
-    v_add_nc_u32    v0, v0, 256
-	s_cmp_lt_u32	m0, s_save_alloc_size												//scc=(m0 < s_save_alloc_size) ? 1 : 0
-	s_cbranch_scc1  L_SAVE_LDS_LOOP_W64													//LDS save is complete?
-   
-	
-	/*      	save SGPRs	    */
-	//////////////////////////////
-	//s_getreg_b32 	s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) 				//spgr_size
-	//s_add_u32 		s_save_alloc_size, s_save_alloc_size, 1
-	//s_lshl_b32 		s_save_alloc_size, s_save_alloc_size, 4 						//Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value) 
-	//s_lshl_b32 		s_save_alloc_size, s_save_alloc_size, 3 						//In gfx10, Number of SGPRs = (sgpr_size + 1) * 8   (non-zero value) 
-  L_SAVE_SGPR:
-    //need to look at it is wave32 or wave64
-    s_and_b32       m0, s_wave_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_cbranch_scc1  L_SAVE_SGPR_VMEM_WAVE64
-    if (SGPR_SAVE_USE_SQC)
-		s_lshl_b32		s_save_buf_rsrc2,	s_sgpr_save_num, 2					//NUM_RECORDS in bytes
-    else
-        s_lshl_b32		s_save_buf_rsrc2,	s_sgpr_save_num, 7					//NUM_RECORDS in bytes (32 threads)
-    end
-    s_branch    L_SAVE_SGPR_CONT    
-  L_SAVE_SGPR_VMEM_WAVE64:
-	if (SGPR_SAVE_USE_SQC)
-		s_lshl_b32		s_save_buf_rsrc2,	s_sgpr_save_num, 2					//NUM_RECORDS in bytes 
-	else
-		s_lshl_b32		s_save_buf_rsrc2,	s_sgpr_save_num, 8					//NUM_RECORDS in bytes (64 threads)
-	end
-  L_SAVE_SGPR_CONT:
-	if (SWIZZLE_EN)
-		s_add_u32		s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
-	else
-		s_mov_b32		s_save_buf_rsrc2,  0x1000000								//NUM_RECORDS in bytes
-	end
-	
-	//s_mov_b32 		m0, 0x0 														//SGPR initial index value =0		
-    //s_nop           0x0                                                             //Manually inserted wait states
-	
-    s_and_b32       m0, s_wave_size, 1
-    s_cmp_eq_u32    m0, 1
-    
-    s_mov_b32 		m0, 0x0 														//SGPR initial index value =0		
-    s_nop           0x0                                                             //Manually inserted wait states
-
-    s_cbranch_scc1  L_SAVE_SGPR_LOOP_WAVE64
-
-  L_SAVE_SGPR_LOOP_WAVE32: 										
-	s_movrels_b32 	s0, s0 															//s0 = s[0+m0]
-    //zhenxu, adding one more argument to save sgpr function, this is only for vmem, using sqc is not change    
-	write_sgpr_to_mem_wave32(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)							//PV: the best performance should be using s_buffer_store_dwordx4
-	s_add_u32		m0, m0, 1														//next sgpr index
-	s_cmp_lt_u32 	m0, s_sgpr_save_num 											//scc = (m0 < s_sgpr_save_num) ? 1 : 0
-	s_cbranch_scc1 	L_SAVE_SGPR_LOOP_WAVE32												//SGPR save is complete?
-    s_branch    L_SAVE_HWREG
-
-  L_SAVE_SGPR_LOOP_WAVE64: 										
-	s_movrels_b32 	s0, s0 															//s0 = s[0+m0]
-    //zhenxu, adding one more argument to save sgpr function, this is only for vmem, using sqc is not change    
-	write_sgpr_to_mem_wave64(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)							//PV: the best performance should be using s_buffer_store_dwordx4
-	s_add_u32		m0, m0, 1														//next sgpr index
-	s_cmp_lt_u32 	m0, s_sgpr_save_num 											//scc = (m0 < s_sgpr_save_num) ? 1 : 0
-	s_cbranch_scc1 	L_SAVE_SGPR_LOOP_WAVE64												//SGPR save is complete?
-
-	
-	/* 		save HW registers	*/
-	//////////////////////////////
-  L_SAVE_HWREG:
-    s_mov_b32		s_save_buf_rsrc2, 0x4								//NUM_RECORDS	in bytes
-	if (SWIZZLE_EN)
-		s_add_u32		s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
-	else
-		s_mov_b32		s_save_buf_rsrc2,  0x1000000								//NUM_RECORDS in bytes
-	end
-
-    s_and_b32       m0, s_wave_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_cbranch_scc1  L_SAVE_HWREG_WAVE64
-	
-	write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)					//M0
-
-	if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))      
-		s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
-		s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0			//carry bit over
-	end
-
-	write_sgpr_to_mem_wave32(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)					//PC
-	write_sgpr_to_mem_wave32(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
-	write_sgpr_to_mem_wave32(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)				//EXEC
-	write_sgpr_to_mem_wave32(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
-	write_sgpr_to_mem_wave32(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)				//STATUS 
-	
-	//s_save_trapsts conflicts with s_save_alloc_size
-	s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
-	write_sgpr_to_mem_wave32(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)				//TRAPSTS
-	
-	//write_sgpr_to_mem_wave32(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)			//XNACK_MASK_LO
-	write_sgpr_to_mem_wave32(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)			//XNACK_MASK_HI
-	
-	//use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
-	s_getreg_b32 	s_save_m0, hwreg(HW_REG_MODE)																						//MODE
-	write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
-    if(SAVE_RESTORE_HWID_DDID)
-    s_getreg_b32 	s_save_m0, hwreg(HW_REG_HW_ID1)																						//HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave
-    write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
-    end
-    s_branch   L_S_PGM_END_SAVED
-
-  L_SAVE_HWREG_WAVE64:
-    write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)					//M0
-
-	if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))      
-		s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
-		s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0			//carry bit over
-	end
-
-	write_sgpr_to_mem_wave64(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)					//PC
-	write_sgpr_to_mem_wave64(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
-	write_sgpr_to_mem_wave64(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)				//EXEC
-	write_sgpr_to_mem_wave64(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
-	write_sgpr_to_mem_wave64(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)				//STATUS 
-	
-	//s_save_trapsts conflicts with s_save_alloc_size
-	s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
-	write_sgpr_to_mem_wave64(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)				//TRAPSTS
-	
-	//write_sgpr_to_mem_wave64(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)			//XNACK_MASK_LO
-	write_sgpr_to_mem_wave64(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)			//XNACK_MASK_HI
-	
-	//use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
-	s_getreg_b32 	s_save_m0, hwreg(HW_REG_MODE)																						//MODE
-	write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
-
-
-    if(SAVE_RESTORE_HWID_DDID)
-    s_getreg_b32 	s_save_m0, hwreg(HW_REG_HW_ID1)																						//HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave
-    write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
-
-	/* 		save DDID	*/
-	//////////////////////////////
-  L_SAVE_DDID:
-    //EXEC has been saved, no vector inst following
-    s_mov_b32	exec_lo, 0x80000000    //Set MSB to 1. Cleared when draw index is returned
-    s_sendmsg sendmsg(MSG_GET_DDID)
-
-  L_WAIT_DDID_LOOP:    
-    s_nop		7			// sleep a bit
-    s_bitcmp0_b32 exec_lo, 31	// test to see if MSB is cleared, meaning done
-    s_cbranch_scc0	L_WAIT_DDID_LOOP
-
-    s_mov_b32	s_save_m0, exec_lo
-
-
-    s_mov_b32		s_save_buf_rsrc2, 0x4								//NUM_RECORDS	in bytes
-	if (SWIZZLE_EN)
-		s_add_u32		s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
-	else
-		s_mov_b32		s_save_buf_rsrc2,  0x1000000								//NUM_RECORDS in bytes
-	end
-    s_and_b32       m0, s_wave_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_cbranch_scc1  L_SAVE_DDID_WAVE64
-
-    write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) 
-
-  L_SAVE_DDID_WAVE64:
-    write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) 
-
-    end
-   
-  L_S_PGM_END_SAVED:
-	/*     S_PGM_END_SAVED  */    							//FIXME  graphics ONLY
-	if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))	
-		s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
-		s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
-		s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0			//carry bit over
-		s_rfe_b64 s_save_pc_lo                              //Return to the main shader program
-	else
-	end
-
-	
-    s_branch	L_END_PGM
-	
-
-				
-/**************************************************************************/
-/*                     	restore routine						              */
-/**************************************************************************/
+
+	/* inform SPI the readiness and wait for SPI's go signal */
+	s_mov_b32	s_save_exec_lo, exec_lo					//save EXEC and use EXEC for the go signal from SPI
+	s_mov_b32	s_save_exec_hi, exec_hi
+	s_mov_b64	exec, 0x0						//clear EXEC to get ready to receive
+
+	s_sendmsg	sendmsg(MSG_SAVEWAVE)					//send SPI a message and wait for SPI's write to EXEC
+
+L_SLEEP:
+	// sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause
+	// SQ hang, since the 7,8th wave could not get arbit to exec inst, while
+	// other waves are stuck into the sleep-loop and waiting for wrexec!=0
+	s_sleep		0x2
+	s_cbranch_execz	L_SLEEP
+
+	/* setup Resource Contants */
+	s_mov_b32	s_save_buf_rsrc0, s_save_spi_init_lo			//base_addr_lo
+	s_and_b32	s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF	//base_addr_hi
+	s_or_b32	s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
+	s_mov_b32	s_save_buf_rsrc2, 0					//NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
+	s_mov_b32	s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
+	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
+	s_lshr_b32	s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
+	s_or_b32	s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp		//or ATC
+	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
+	s_lshr_b32	s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
+	s_or_b32	s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp		//or MTYPE
+
+	s_mov_b32	s_save_m0, m0
+
+	/* global mem offset */
+	s_mov_b32	s_save_mem_offset, 0x0
+	s_getreg_b32	s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
+	s_lshl_b32	s_wave_size, s_wave_size, S_WAVE_SIZE
+	s_or_b32	s_wave_size, s_save_spi_init_hi, s_wave_size		//share s_wave_size with exec_hi, it's at bit25
+
+	/* save HW registers */
+
+L_SAVE_HWREG:
+	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
+	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
+	get_svgpr_size_bytes(s_save_tmp)
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
+
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
+	write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
+	write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
+	write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
+	write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
+	write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)
+
+	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+	write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset)
+	write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)
+
+	s_getreg_b32	s_save_m0, hwreg(HW_REG_MODE)
+	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
+
+	/* the first wave in the threadgroup */
+	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
+	s_mov_b32	s_save_exec_hi, 0x0
+	s_or_b32	s_save_exec_hi, s_save_tmp, s_save_exec_hi		// save first wave bit to s_save_exec_hi.bits[26]
+
+	/* save SGPRs */
+	// Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
+
+	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
+	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
+	get_svgpr_size_bytes(s_save_tmp)
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
+	s_mov_b32	s_save_xnack_mask, s_save_buf_rsrc0
+	s_add_u32	s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
+	s_addc_u32	s_save_buf_rsrc1, s_save_buf_rsrc1, 0
+
+	s_mov_b32	m0, 0x0							//SGPR initial index value =0
+	s_nop		0x0							//Manually inserted wait states
+L_SAVE_SGPR_LOOP:
+	// SGPR is allocated in 16 SGPR granularity
+	s_movrels_b64	s0, s0							//s0 = s[0+m0], s1 = s[1+m0]
+	s_movrels_b64	s2, s2							//s2 = s[2+m0], s3 = s[3+m0]
+	s_movrels_b64	s4, s4							//s4 = s[4+m0], s5 = s[5+m0]
+	s_movrels_b64	s6, s6							//s6 = s[6+m0], s7 = s[7+m0]
+	s_movrels_b64	s8, s8							//s8 = s[8+m0], s9 = s[9+m0]
+	s_movrels_b64	s10, s10						//s10 = s[10+m0], s11 = s[11+m0]
+	s_movrels_b64	s12, s12						//s12 = s[12+m0], s13 = s[13+m0]
+	s_movrels_b64	s14, s14						//s14 = s[14+m0], s15 = s[15+m0]
+
+	write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
+	s_add_u32	m0, m0, 16						//next sgpr index
+	s_cmp_lt_u32	m0, 96							//scc = (m0 < first 96 SGPR) ? 1 : 0
+	s_cbranch_scc1	L_SAVE_SGPR_LOOP					//first 96 SGPR save is complete?
+
+	//save the rest 10 SGPR
+	s_movrels_b64	s0, s0							//s0 = s[0+m0], s1 = s[1+m0]
+	s_movrels_b64	s2, s2							//s2 = s[2+m0], s3 = s[3+m0]
+	s_movrels_b64	s4, s4							//s4 = s[4+m0], s5 = s[5+m0]
+	s_movrels_b64	s6, s6							//s6 = s[6+m0], s7 = s[7+m0]
+	s_movrels_b64	s8, s8							//s8 = s[8+m0], s9 = s[9+m0]
+	write_10sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
+
+	// restore s_save_buf_rsrc0,1
+	s_mov_b32	s_save_buf_rsrc0, s_save_xnack_mask
+
+	/* save first 4 VGPR, then LDS save could use   */
+	// each wave will alloc 4 vgprs at least...
+
+	s_mov_b32	s_save_mem_offset, 0
+ 	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
+	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_ENABLE_SAVE_4VGPR_EXEC_HI
+	s_mov_b32	exec_hi, 0x00000000
+	s_branch	L_SAVE_4VGPR_WAVE32
+L_ENABLE_SAVE_4VGPR_EXEC_HI:
+	s_mov_b32	exec_hi, 0xFFFFFFFF
+	s_branch	L_SAVE_4VGPR_WAVE64
+L_SAVE_4VGPR_WAVE32:
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// VGPR Allocated in 4-GPR granularity
+
+	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
+	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
+	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
+	s_branch	L_SAVE_LDS
+
+L_SAVE_4VGPR_WAVE64:
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// VGPR Allocated in 4-GPR granularity
+
+	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+
+	/* save LDS */
+
+L_SAVE_LDS:
+	// Change EXEC to all threads...
+	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
+	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_ENABLE_SAVE_LDS_EXEC_HI
+	s_mov_b32	exec_hi, 0x00000000
+	s_branch	L_SAVE_LDS_NORMAL
+L_ENABLE_SAVE_LDS_EXEC_HI:
+	s_mov_b32	exec_hi, 0xFFFFFFFF
+L_SAVE_LDS_NORMAL:
+	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
+	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	//lds_size is zero?
+	s_cbranch_scc0	L_SAVE_LDS_DONE						//no lds used? jump to L_SAVE_DONE
+
+	s_barrier								//LDS is used? wait for other waves in the same TG
+	s_and_b32	s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
+	s_cbranch_scc0	L_SAVE_LDS_DONE
+
+	// first wave do LDS save;
+
+	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 6			//LDS size in dwords = lds_size * 64dw
+	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2			//LDS size in bytes
+	s_mov_b32	s_save_buf_rsrc2, s_save_alloc_size			//NUM_RECORDS in bytes
+
+	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
+	//
+	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
+	get_svgpr_size_bytes(s_save_tmp)
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
+
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	//load 0~63*4(byte address) to vgpr v0
+	v_mbcnt_lo_u32_b32	v0, -1, 0
+	v_mbcnt_hi_u32_b32	v0, -1, v0
+	v_mul_u32_u24	v0, 4, v0
+
+	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_mov_b32	m0, 0x0
+	s_cbranch_scc1	L_SAVE_LDS_W64
+
+L_SAVE_LDS_W32:
+	s_mov_b32	s3, 128
+	s_nop		0
+	s_nop		0
+	s_nop		0
+L_SAVE_LDS_LOOP_W32:
+	ds_read_b32	v1, v0
+	s_waitcnt	0
+	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+
+	s_add_u32	m0, m0, s3						//every buffer_store_lds does 256 bytes
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
+	v_add_nc_u32	v0, v0, 128						//mem offset increased by 128 bytes
+	s_cmp_lt_u32	m0, s_save_alloc_size					//scc=(m0 < s_save_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_SAVE_LDS_LOOP_W32					//LDS save is complete?
+
+	s_branch	L_SAVE_LDS_DONE
+
+L_SAVE_LDS_W64:
+	s_mov_b32	s3, 256
+	s_nop		0
+	s_nop		0
+	s_nop		0
+L_SAVE_LDS_LOOP_W64:
+	ds_read_b32	v1, v0
+	s_waitcnt	0
+	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+
+	s_add_u32	m0, m0, s3						//every buffer_store_lds does 256 bytes
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
+	v_add_nc_u32	v0, v0, 256						//mem offset increased by 256 bytes
+	s_cmp_lt_u32	m0, s_save_alloc_size					//scc=(m0 < s_save_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_SAVE_LDS_LOOP_W64					//LDS save is complete?
+
+L_SAVE_LDS_DONE:
+	/* save VGPRs  - set the Rest VGPRs */
+L_SAVE_VGPR:
+	// VGPR SR memory offset: 0
+	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
+	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_ENABLE_SAVE_VGPR_EXEC_HI
+	s_mov_b32	s_save_mem_offset, (0+128*4)				// for the rest VGPRs
+	s_mov_b32	exec_hi, 0x00000000
+	s_branch	L_SAVE_VGPR_NORMAL
+L_ENABLE_SAVE_VGPR_EXEC_HI:
+	s_mov_b32	s_save_mem_offset, (0+256*4)				// for the rest VGPRs
+	s_mov_b32	exec_hi, 0xFFFFFFFF
+L_SAVE_VGPR_NORMAL:
+	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
+	s_add_u32	s_save_alloc_size, s_save_alloc_size, 1
+	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2			//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
+	//determine it is wave32 or wave64
+	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_SAVE_VGPR_WAVE64
+
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// VGPR Allocated in 4-GPR granularity
+
+	// VGPR store using dw burst
+	s_mov_b32	m0, 0x4							//VGPR initial index value =4
+	s_cmp_lt_u32	m0, s_save_alloc_size
+	s_cbranch_scc0	L_SAVE_VGPR_END
+
+L_SAVE_VGPR_W32_LOOP:
+	v_movrels_b32	v0, v0							//v0 = v[0+m0]
+	v_movrels_b32	v1, v1							//v1 = v[1+m0]
+	v_movrels_b32	v2, v2							//v2 = v[2+m0]
+	v_movrels_b32	v3, v3							//v3 = v[3+m0]
+
+	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
+	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
+	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
+
+	s_add_u32	m0, m0, 4						//next vgpr index
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128*4		//every buffer_store_dword does 128 bytes
+	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_SAVE_VGPR_W32_LOOP					//VGPR save is complete?
+
+	s_branch	L_SAVE_VGPR_END
+
+L_SAVE_VGPR_WAVE64:
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// VGPR store using dw burst
+	s_mov_b32	m0, 0x4							//VGPR initial index value =4
+	s_cmp_lt_u32	m0, s_save_alloc_size
+	s_cbranch_scc0	L_SAVE_VGPR_END
+
+L_SAVE_VGPR_W64_LOOP:
+	v_movrels_b32	v0, v0							//v0 = v[0+m0]
+	v_movrels_b32	v1, v1							//v1 = v[1+m0]
+	v_movrels_b32	v2, v2							//v2 = v[2+m0]
+	v_movrels_b32	v3, v3							//v3 = v[3+m0]
+
+	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+
+	s_add_u32	m0, m0, 4						//next vgpr index
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, 256*4		//every buffer_store_dword does 256 bytes
+	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_SAVE_VGPR_W64_LOOP					//VGPR save is complete?
+
+	//Below part will be the save shared vgpr part (new for gfx10)
+	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
+	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	//shared_vgpr_size is zero?
+	s_cbranch_scc0	L_SAVE_VGPR_END						//no shared_vgpr used? jump to L_SAVE_LDS
+	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 3			//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
+	//m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
+	//save shared_vgpr will start from the index of m0
+	s_add_u32	s_save_alloc_size, s_save_alloc_size, m0
+	s_mov_b32	exec_lo, 0xFFFFFFFF
+	s_mov_b32	exec_hi, 0x00000000
+L_SAVE_SHARED_VGPR_WAVE64_LOOP:
+	v_movrels_b32	v0, v0							//v0 = v[0+m0]
+	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+	s_add_u32	m0, m0, 1						//next vgpr index
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128
+	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_SAVE_SHARED_VGPR_WAVE64_LOOP				//SHARED_VGPR save is complete?
+
+L_SAVE_VGPR_END:
+	s_branch	L_END_PGM
 
 L_RESTORE:
-    /*      Setup Resource Contants    */
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
-		//calculate wd_addr using absolute thread id
-		v_readlane_b32 s_restore_tmp, v9, 0
-        //determine it is wave32 or wave64
-        s_getreg_b32 	s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //change to ttmp13
-        s_cmp_eq_u32    s_restore_size, 0
-        s_cbranch_scc1  L_RESTORE_WAVE32
-        s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 //SAVE WAVE64
-        s_branch    L_RESTORE_CON
-    L_RESTORE_WAVE32:
-        s_lshr_b32 s_restore_tmp, s_restore_tmp, 5 //SAVE WAVE32
-    L_RESTORE_CON:
-		s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
-		s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
-		s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
-		s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL	
-	else
-	end
-	
-    s_mov_b32		s_restore_buf_rsrc0, 	s_restore_spi_init_lo															//base_addr_lo
-	s_and_b32		s_restore_buf_rsrc1, 	s_restore_spi_init_hi, 0x0000FFFF												//base_addr_hi
-	s_or_b32		s_restore_buf_rsrc1, 	s_restore_buf_rsrc1,  S_RESTORE_BUF_RSRC_WORD1_STRIDE
-    s_mov_b32       s_restore_buf_rsrc2,   	0                                               								//NUM_RECORDS initial value = 0 (in bytes)
-	s_mov_b32		s_restore_buf_rsrc3, 	S_RESTORE_BUF_RSRC_WORD3_MISC
-	s_and_b32		s_restore_tmp,         	s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK		
-	s_lshr_b32		s_restore_tmp,  		s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)		//get ATC bit into position
-	s_or_b32		s_restore_buf_rsrc3, 	s_restore_buf_rsrc3,  s_restore_tmp												//or ATC
-	s_and_b32		s_restore_tmp,         	s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK		
-	s_lshr_b32		s_restore_tmp,  		s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)	//get MTYPE bits into position
-	s_or_b32		s_restore_buf_rsrc3, 	s_restore_buf_rsrc3,  s_restore_tmp												//or MTYPE
-    //determine it is wave32 or wave64
-    s_getreg_b32 	s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
-    s_or_b32        s_restore_size, s_restore_spi_init_hi,    s_restore_size                                             //share s_wave_size with exec_hi
-	
-	/* 		global mem offset			*/
-	s_mov_b32		s_restore_mem_offset, 0x0								//mem offset initial value = 0
-
-        /*      	restore VGPRs	    */
-	//////////////////////////////
-  L_RESTORE_VGPR:
-  
- 	s_mov_b32		exec_lo, 0xFFFFFFFF 													//need every thread from now on   //be consistent with SAVE although can be moved ahead
-    s_and_b32       m0, s_restore_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_cbranch_scc1  L_ENABLE_RESTORE_VGPR_EXEC_HI   
-    s_mov_b32		exec_hi, 0x00000000
-    s_branch        L_RESTORE_VGPR_NORMAL
-  L_ENABLE_RESTORE_VGPR_EXEC_HI:
-	s_mov_b32		exec_hi, 0xFFFFFFFF
-  L_RESTORE_VGPR_NORMAL:	
-	s_getreg_b32 	s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) 	//vpgr_size
-	s_add_u32 		s_restore_alloc_size, s_restore_alloc_size, 1
-	s_lshl_b32 		s_restore_alloc_size, s_restore_alloc_size, 2 							//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
-    //determine it is wave32 or wave64
-    s_and_b32       m0, s_restore_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_cbranch_scc1  L_RESTORE_VGPR_WAVE64
-
-    s_lshl_b32		s_restore_buf_rsrc2,  s_restore_alloc_size, 7						    //NUM_RECORDS in bytes (32 threads*4)
-	if (SWIZZLE_EN)
-		s_add_u32		s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
-	else
-		s_mov_b32		s_restore_buf_rsrc2,  0x1000000										//NUM_RECORDS in bytes
-	end	
-
-	s_mov_b32		s_restore_mem_offset_save, s_restore_mem_offset							// restore start with v1, v0 will be the last
-	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 128
-    s_mov_b32 		m0, 1 																	//VGPR initial index value = 1
-	//s_set_gpr_idx_on  m0, 0x8																//M0[7:0] = M0[7:0] and M0[15:12] = 0x8
-    //s_add_u32		s_restore_alloc_size, s_restore_alloc_size, 0x8000						//add 0x8000 since we compare m0 against it later, might not need this in gfx10	
-
-  L_RESTORE_VGPR_WAVE32_LOOP: 										
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)       
-		tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
-		buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset	slc:1 glc:1	
-	end
-	s_waitcnt		vmcnt(0)																//ensure data ready
-	v_movreld_b32		v0, v0																	//v[0+m0] = v0
-    s_add_u32		m0, m0, 1																//next vgpr index
-	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 128							//every buffer_load_dword does 128 bytes
-	s_cmp_lt_u32 	m0,	s_restore_alloc_size 												//scc = (m0 < s_restore_alloc_size) ? 1 : 0
-	s_cbranch_scc1 	L_RESTORE_VGPR_WAVE32_LOOP														//VGPR restore (except v0) is complete?
-	//s_set_gpr_idx_off
-																							/* VGPR restore on v0 */
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)       
-		tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
-		buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save	slc:1 glc:1	
-	end
-
-    s_branch    L_RESTORE_LDS
-
-  L_RESTORE_VGPR_WAVE64:
-    s_lshl_b32		s_restore_buf_rsrc2,  s_restore_alloc_size, 8						    //NUM_RECORDS in bytes (64 threads*4)
-	if (SWIZZLE_EN)
-		s_add_u32		s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
-	else
-		s_mov_b32		s_restore_buf_rsrc2,  0x1000000										//NUM_RECORDS in bytes
-	end	
-
-	s_mov_b32		s_restore_mem_offset_save, s_restore_mem_offset							// restore start with v1, v0 will be the last
-	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 256
-    s_mov_b32 		m0, 1 																	//VGPR initial index value = 1
-  L_RESTORE_VGPR_WAVE64_LOOP: 										
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)       
-		tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
-		buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset	slc:1 glc:1	
-	end
-	s_waitcnt		vmcnt(0)																//ensure data ready
-	v_movreld_b32		v0, v0																	//v[0+m0] = v0
-    s_add_u32		m0, m0, 1																//next vgpr index
-	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 256							//every buffer_load_dword does 256 bytes
-	s_cmp_lt_u32 	m0,	s_restore_alloc_size 												//scc = (m0 < s_restore_alloc_size) ? 1 : 0
-	s_cbranch_scc1 	L_RESTORE_VGPR_WAVE64_LOOP														//VGPR restore (except v0) is complete?
-	//s_set_gpr_idx_off
-    //
-    //Below part will be the restore shared vgpr part (new for gfx10)
-    s_getreg_b32 	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) 			//shared_vgpr_size
-    s_and_b32		s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF				//shared_vgpr_size is zero?
-    s_cbranch_scc0	L_RESTORE_V0													    //no shared_vgpr used? jump to L_SAVE_LDS
-    s_lshl_b32 		s_restore_alloc_size, s_restore_alloc_size, 3 						//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
-    //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
-    //restore shared_vgpr will start from the index of m0
-    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, m0
-    s_mov_b32		exec_lo, 0xFFFFFFFF
-    s_mov_b32		exec_hi, 0x00000000
-    L_RESTORE_SHARED_VGPR_WAVE64_LOOP: 
-    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset	slc:1 glc:1
-    s_waitcnt		vmcnt(0)																//ensure data ready
-	v_movreld_b32		v0, v0																	//v[0+m0] = v0
-    s_add_u32		m0, m0, 1																//next vgpr index
-	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 128							//every buffer_load_dword does 256 bytes
-	s_cmp_lt_u32 	m0,	s_restore_alloc_size 												//scc = (m0 < s_restore_alloc_size) ? 1 : 0
-	s_cbranch_scc1 	L_RESTORE_SHARED_VGPR_WAVE64_LOOP														//VGPR restore (except v0) is complete?
-
-    s_mov_b32 exec_hi, 0xFFFFFFFF                                                           //restore back exec_hi before restoring V0!!
-	
-    /* VGPR restore on v0 */
-  L_RESTORE_V0:
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)       
-		tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
-		buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save	slc:1 glc:1	
-	end
-
-
-    /*      	restore LDS	    */
-	//////////////////////////////
-  L_RESTORE_LDS:
-
-    //Only need to check the first wave    
-	/*      the first wave in the threadgroup    */
-	s_and_b32		s_restore_tmp, s_restore_size, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK			
-	s_cbranch_scc0	L_RESTORE_SGPR
-	
-    s_mov_b32		exec_lo, 0xFFFFFFFF 													//need every thread from now on   //be consistent with SAVE although can be moved ahead
-    s_and_b32       m0, s_restore_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_cbranch_scc1  L_ENABLE_RESTORE_LDS_EXEC_HI   
-    s_mov_b32		exec_hi, 0x00000000
-    s_branch        L_RESTORE_LDS_NORMAL
-  L_ENABLE_RESTORE_LDS_EXEC_HI:
-	s_mov_b32		exec_hi, 0xFFFFFFFF
-  L_RESTORE_LDS_NORMAL:	
-	s_getreg_b32 	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) 				//lds_size
-	s_and_b32		s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF					//lds_size is zero?
-	s_cbranch_scc0	L_RESTORE_SGPR															//no lds used? jump to L_RESTORE_VGPR
-	s_lshl_b32 		s_restore_alloc_size, s_restore_alloc_size, 6 							//LDS size in dwords = lds_size * 64dw
-	s_lshl_b32 		s_restore_alloc_size, s_restore_alloc_size, 2 							//LDS size in bytes
-	s_mov_b32		s_restore_buf_rsrc2,	s_restore_alloc_size							//NUM_RECORDS in bytes
-	if (SWIZZLE_EN)
-		s_add_u32		s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
-	else
-		s_mov_b32		s_restore_buf_rsrc2,  0x1000000										//NUM_RECORDS in bytes
-	end
-
-    s_and_b32       m0, s_wave_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_mov_b32 		m0, 0x0
-    s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64
-
-  L_RESTORE_LDS_LOOP_W32:									
-	if (SAVE_LDS)
-	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
-    s_waitcnt 0
-	end
-    s_add_u32		m0, m0, 128																//every buffer_load_dword does 256 bytes
-	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 128						//mem offset increased by 256 bytes
-	s_cmp_lt_u32	m0, s_restore_alloc_size												//scc=(m0 < s_restore_alloc_size) ? 1 : 0
-	s_cbranch_scc1  L_RESTORE_LDS_LOOP_W32														//LDS restore is complete?
-    s_branch        L_RESTORE_SGPR
-
-  L_RESTORE_LDS_LOOP_W64:									
-	if (SAVE_LDS)
-	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
-    s_waitcnt 0
-	end
-    s_add_u32		m0, m0, 256																//every buffer_load_dword does 256 bytes
-	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 256							//mem offset increased by 256 bytes
-	s_cmp_lt_u32	m0, s_restore_alloc_size												//scc=(m0 < s_restore_alloc_size) ? 1 : 0
-	s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64														//LDS restore is complete?
-
-	
-    /*      	restore SGPRs	    */
-	//////////////////////////////
-	//s_getreg_b32 	s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) 				//spgr_size
-	//s_add_u32 		s_restore_alloc_size, s_restore_alloc_size, 1
-	//s_lshl_b32 		s_restore_alloc_size, s_restore_alloc_size, 4 							//Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
-	//s_lshl_b32 		s_restore_alloc_size, s_restore_alloc_size, 3 							//Number of SGPRs = (sgpr_size + 1) * 8   (non-zero value)
-  L_RESTORE_SGPR:
-    //need to look at it is wave32 or wave64
-    s_and_b32       m0, s_restore_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_cbranch_scc1  L_RESTORE_SGPR_VMEM_WAVE64
-	if (SGPR_SAVE_USE_SQC)
-		s_lshl_b32		s_restore_buf_rsrc2,	s_sgpr_save_num, 2						//NUM_RECORDS in bytes 
-	else
-        s_lshl_b32		s_restore_buf_rsrc2,	s_sgpr_save_num, 7						//NUM_RECORDS in bytes (32 threads)
-    end
-    s_branch        L_RESTORE_SGPR_CONT
-  L_RESTORE_SGPR_VMEM_WAVE64:
-    if (SGPR_SAVE_USE_SQC)
-		s_lshl_b32		s_restore_buf_rsrc2,	s_sgpr_save_num, 2						//NUM_RECORDS in bytes 
-	else
-		s_lshl_b32		s_restore_buf_rsrc2,	s_sgpr_save_num, 8						//NUM_RECORDS in bytes (64 threads)
-	end
-
-  L_RESTORE_SGPR_CONT:
-	if (SWIZZLE_EN)
-		s_add_u32		s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
-	else
-		s_mov_b32		s_restore_buf_rsrc2,  0x1000000										//NUM_RECORDS in bytes
-	end
-
-    s_and_b32       m0, s_restore_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_cbranch_scc1  L_RESTORE_SGPR_WAVE64
-
-    read_sgpr_from_mem_wave32(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)		//save s0 to s_restore_tmp
-	s_mov_b32 		m0, 0x1
-
-  L_RESTORE_SGPR_LOOP_WAVE32:
-    read_sgpr_from_mem_wave32(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)															//PV: further performance improvement can be made
-	s_waitcnt		lgkmcnt(0)																//ensure data ready
-	s_movreld_b32 	s0, s0                                                                  //s[0+m0] = s0
-    s_nop 0                                                                                 // hazard SALU M0=> S_MOVREL
-	s_add_u32		m0, m0, 1																//next sgpr index
-	s_cmp_lt_u32 	m0, s_sgpr_save_num												//scc = (m0 < s_restore_alloc_size) ? 1 : 0
-	s_cbranch_scc1 	L_RESTORE_SGPR_LOOP_WAVE32														//SGPR restore (except s0) is complete?
-	s_mov_b32		s0, s_restore_tmp															/* SGPR restore on s0 */
-    s_branch        L_RESTORE_HWREG
-  
-  L_RESTORE_SGPR_WAVE64:
-	read_sgpr_from_mem_wave64(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)		//save s0 to s_restore_tmp
-	s_mov_b32 		m0, 0x1																				//SGPR initial index value =1	//go on with with s1
-	
-  L_RESTORE_SGPR_LOOP_WAVE64: 																					
-	read_sgpr_from_mem_wave64(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)															//PV: further performance improvement can be made
-	s_waitcnt		lgkmcnt(0)																//ensure data ready
-	s_movreld_b32 	s0, s0                                                                  //s[0+m0] = s0
-    s_nop 0                                                                                 // hazard SALU M0=> S_MOVREL
-	s_add_u32		m0, m0, 1																//next sgpr index
-	s_cmp_lt_u32 	m0, s_sgpr_save_num												//scc = (m0 < s_restore_alloc_size) ? 1 : 0
-	s_cbranch_scc1 	L_RESTORE_SGPR_LOOP_WAVE64														//SGPR restore (except s0) is complete?
-	s_mov_b32		s0, s_restore_tmp															/* SGPR restore on s0 */
-
-	
-    /* 		restore HW registers	*/
-	//////////////////////////////
-  L_RESTORE_HWREG:
-    s_mov_b32		s_restore_buf_rsrc2, 0x4												//NUM_RECORDS	in bytes
-	if (SWIZZLE_EN)
-		s_add_u32		s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
-	else
-		s_mov_b32		s_restore_buf_rsrc2,  0x1000000										//NUM_RECORDS in bytes
-	end
-
-    s_and_b32       m0, s_restore_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_cbranch_scc1  L_RESTORE_HWREG_WAVE64
-
-    read_sgpr_from_mem_wave32(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//M0
-	read_sgpr_from_mem_wave32(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//PC
-	read_sgpr_from_mem_wave32(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
-	read_sgpr_from_mem_wave32(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//EXEC
-	read_sgpr_from_mem_wave32(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
-	read_sgpr_from_mem_wave32(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//STATUS
-	read_sgpr_from_mem_wave32(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//TRAPSTS
-    //read_sgpr_from_mem_wave32(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//XNACK_MASK_LO
-	//read_sgpr_from_mem_wave32(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//XNACK_MASK_HI
-    read_sgpr_from_mem_wave32(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//XNACK_MASK
-	read_sgpr_from_mem_wave32(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//MODE
-    if(SAVE_RESTORE_HWID_DDID)
-    read_sgpr_from_mem_wave32(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//HW_ID1
-    end
-    s_branch        L_RESTORE_HWREG_FINISH
-
-  L_RESTORE_HWREG_WAVE64:
-	read_sgpr_from_mem_wave64(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//M0
-	read_sgpr_from_mem_wave64(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//PC
-	read_sgpr_from_mem_wave64(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
-	read_sgpr_from_mem_wave64(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//EXEC
-	read_sgpr_from_mem_wave64(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
-	read_sgpr_from_mem_wave64(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//STATUS
-	read_sgpr_from_mem_wave64(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//TRAPSTS
-    //read_sgpr_from_mem_wave64(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//XNACK_MASK_LO
-	//read_sgpr_from_mem_wave64(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//XNACK_MASK_HI
-    read_sgpr_from_mem_wave64(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//XNACK_MASK
-	read_sgpr_from_mem_wave64(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//MODE
-    if(SAVE_RESTORE_HWID_DDID)
-    read_sgpr_from_mem_wave64(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//HW_ID1
-    end
-  L_RESTORE_HWREG_FINISH:
-	s_waitcnt		lgkmcnt(0)																						//from now on, it is safe to restore STATUS and IB_STS
-  
-
-
-    if(SAVE_RESTORE_HWID_DDID)
-  L_RESTORE_DDID:
-    s_mov_b32      m0, s_restore_hwid1                                                      //virture ttrace support: The save-context handler records the SE/SA/WGP/SIMD/wave of the original wave
-    s_ttracedata                                                                            //and then can output it as SHADER_DATA to ttrace on restore to provide a correlation across the save-restore
-                                    
-    s_mov_b32		s_restore_buf_rsrc2, 0x4												//NUM_RECORDS	in bytes
-	if (SWIZZLE_EN)
-		s_add_u32		s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
-	else
-		s_mov_b32		s_restore_buf_rsrc2,  0x1000000										//NUM_RECORDS in bytes
-	end
-
-    s_and_b32       m0, s_restore_size, 1
-    s_cmp_eq_u32    m0, 1
-    s_cbranch_scc1  L_RESTORE_DDID_WAVE64
-
-    read_sgpr_from_mem_wave32(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)	
-    s_branch        L_RESTORE_DDID_FINISH
-  L_RESTORE_DDID_WAVE64:
-    read_sgpr_from_mem_wave64(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)	
-
-  L_RESTORE_DDID_FINISH:
-    s_waitcnt		lgkmcnt(0)
-    //s_mov_b32      m0, s_restore_ddid
-    //s_ttracedata   
-    if (RESTORE_DDID_IN_SGPR18)
-        s_mov_b32   s18, s_restore_ddid
-	end	
-    
-    end   
-
-	s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff    	//pc[47:32]        //Do it here in order not to affect STATUS
-
-	//for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
-	if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
-		s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8            //pc[31:0]+8	  //two back-to-back s_trap are used (first for save and second for restore)
-		s_addc_u32	s_restore_pc_hi, s_restore_pc_hi, 0x0		 //carry bit over
-	end	
-	if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))	      
-		s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4            //pc[31:0]+4     // save is hack through s_trap but restore is normal
-		s_addc_u32	s_restore_pc_hi, s_restore_pc_hi, 0x0		 //carry bit over
-	end
-	
-	s_mov_b32 		m0, 		s_restore_m0
-	s_mov_b32 		exec_lo, 	s_restore_exec_lo
-	s_mov_b32 		exec_hi, 	s_restore_exec_hi
-	
-	s_and_b32		s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
+	/* Setup Resource Contants */
+	s_mov_b32	s_restore_buf_rsrc0, s_restore_spi_init_lo		//base_addr_lo
+	s_and_b32	s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF	//base_addr_hi
+	s_or_b32	s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
+	s_mov_b32	s_restore_buf_rsrc2, 0					//NUM_RECORDS initial value = 0 (in bytes)
+	s_mov_b32	s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
+	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
+	s_lshr_b32	s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
+	s_or_b32	s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp	//or ATC
+	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
+	s_lshr_b32	s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
+	s_or_b32	s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp	//or MTYPE
+	//determine it is wave32 or wave64
+	s_getreg_b32	s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
+	s_lshl_b32	s_restore_size, s_restore_size, S_WAVE_SIZE
+	s_or_b32	s_restore_size, s_restore_spi_init_hi, s_restore_size
+
+	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
+	s_cbranch_scc0	L_RESTORE_VGPR
+
+	/* restore LDS */
+L_RESTORE_LDS:
+	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
+	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_ENABLE_RESTORE_LDS_EXEC_HI
+	s_mov_b32	exec_hi, 0x00000000
+	s_branch	L_RESTORE_LDS_NORMAL
+L_ENABLE_RESTORE_LDS_EXEC_HI:
+	s_mov_b32	exec_hi, 0xFFFFFFFF
+L_RESTORE_LDS_NORMAL:
+	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
+	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//lds_size is zero?
+	s_cbranch_scc0	L_RESTORE_VGPR						//no lds used? jump to L_RESTORE_VGPR
+	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 6		//LDS size in dwords = lds_size * 64dw
+	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//LDS size in bytes
+	s_mov_b32	s_restore_buf_rsrc2, s_restore_alloc_size		//NUM_RECORDS in bytes
+
+	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
+	//
+	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
+	get_svgpr_size_bytes(s_restore_tmp)
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()
+
+	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_mov_b32	m0, 0x0
+	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64
+
+L_RESTORE_LDS_LOOP_W32:
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
+	s_add_u32	m0, m0, 128						// 128 DW
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128		//mem offset increased by 128DW
+	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc=(m0 < s_restore_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W32					//LDS restore is complete?
+	s_branch	L_RESTORE_VGPR
+
+L_RESTORE_LDS_LOOP_W64:
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
+	s_add_u32	m0, m0, 256						// 256 DW
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256		//mem offset increased by 256DW
+	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc=(m0 < s_restore_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64					//LDS restore is complete?
+
+	/* restore VGPRs */
+L_RESTORE_VGPR:
+	// VGPR SR memory offset : 0
+	s_mov_b32	s_restore_mem_offset, 0x0
+ 	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
+	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_ENABLE_RESTORE_VGPR_EXEC_HI
+	s_mov_b32	exec_hi, 0x00000000
+	s_branch	L_RESTORE_VGPR_NORMAL
+L_ENABLE_RESTORE_VGPR_EXEC_HI:
+	s_mov_b32	exec_hi, 0xFFFFFFFF
+L_RESTORE_VGPR_NORMAL:
+	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
+	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, 1
+	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
+	//determine it is wave32 or wave64
+	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64
+
+	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// VGPR load using dw burst
+	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore start with v1, v0 will be the last
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4
+	s_mov_b32	m0, 4							//VGPR initial index value = 4
+
+L_RESTORE_VGPR_WAVE32_LOOP:
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128
+	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2
+	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3
+	s_waitcnt	vmcnt(0)
+	v_movreld_b32	v0, v0							//v[0+m0] = v0
+	v_movreld_b32	v1, v1
+	v_movreld_b32	v2, v2
+	v_movreld_b32	v3, v3
+	s_add_u32	m0, m0, 4						//next vgpr index
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4	//every buffer_load_dword does 128 bytes
+	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_RESTORE_VGPR_WAVE32_LOOP				//VGPR restore (except v0) is complete?
+
+	/* VGPR restore on v0 */
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
+	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
+	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3
+
+	s_branch	L_RESTORE_SGPR
+
+L_RESTORE_VGPR_WAVE64:
+	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// VGPR load using dw burst
+	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore start with v4, v0 will be the last
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4
+	s_mov_b32	m0, 4							//VGPR initial index value = 4
+
+L_RESTORE_VGPR_WAVE64_LOOP:
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
+	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
+	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
+	s_waitcnt	vmcnt(0)
+	v_movreld_b32	v0, v0							//v[0+m0] = v0
+	v_movreld_b32	v1, v1
+	v_movreld_b32	v2, v2
+	v_movreld_b32	v3, v3
+	s_add_u32	m0, m0, 4						//next vgpr index
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4	//every buffer_load_dword does 256 bytes
+	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64_LOOP				//VGPR restore (except v0) is complete?
+
+	//Below part will be the restore shared vgpr part (new for gfx10)
+	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)	//shared_vgpr_size
+	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//shared_vgpr_size is zero?
+	s_cbranch_scc0	L_RESTORE_V0						//no shared_vgpr used?
+	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 3		//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
+	//m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
+	//restore shared_vgpr will start from the index of m0
+	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, m0
+	s_mov_b32	exec_lo, 0xFFFFFFFF
+	s_mov_b32	exec_hi, 0x00000000
+L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+	s_waitcnt	vmcnt(0)
+	v_movreld_b32	v0, v0							//v[0+m0] = v0
+	s_add_u32	m0, m0, 1						//next vgpr index
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128
+	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_RESTORE_SHARED_VGPR_WAVE64_LOOP			//VGPR restore (except v0) is complete?
+
+	s_mov_b32	exec_hi, 0xFFFFFFFF					//restore back exec_hi before restoring V0!!
+
+	/* VGPR restore on v0 */
+L_RESTORE_V0:
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
+	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
+	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3
+
+	/* restore SGPRs */
+	//will be 2+8+16*6
+	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
+L_RESTORE_SGPR:
+	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
+	get_svgpr_size_bytes(s_restore_tmp)
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
+	s_sub_u32	s_restore_mem_offset, s_restore_mem_offset, 22*4	//s106~s127 is not saved
+	s_sub_u32	s_restore_mem_offset, s_restore_mem_offset, 2*4		// restore SGPR from S[n] to S[0], by 2 sgprs group
+
+	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	s_mov_b32	m0, s_sgpr_save_num
+
+	read_2sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+
+	s_waitcnt	lgkmcnt(0)
+
+	s_sub_u32	m0, m0, 2						// Restore from S[n] to S[0]
+	s_nop		0							// hazard SALU M0=> S_MOVREL
+
+	s_movreld_b64	s0, s0							//s[0+m0] = s0
+
+	read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+	s_waitcnt	lgkmcnt(0)
+
+	s_sub_u32	m0, m0, 8						// Restore from S[n] to S[0]
+	s_nop		0							// hazard SALU M0=> S_MOVREL
+
+	s_movreld_b64	s0, s0							//s[0+m0] = s0
+	s_movreld_b64	s2, s2
+	s_movreld_b64	s4, s4
+	s_movreld_b64	s6, s6
+
+ L_RESTORE_SGPR_LOOP:
+	read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+	s_waitcnt	lgkmcnt(0)
+
+	s_sub_u32	m0, m0, 16						// Restore from S[n] to S[0]
+	s_nop		0							// hazard SALU M0=> S_MOVREL
+
+	s_movreld_b64	s0, s0							//s[0+m0] = s0
+	s_movreld_b64	s2, s2
+	s_movreld_b64	s4, s4
+	s_movreld_b64	s6, s6
+	s_movreld_b64	s8, s8
+	s_movreld_b64	s10, s10
+	s_movreld_b64	s12, s12
+	s_movreld_b64	s14, s14
+
+	s_cmp_eq_u32	m0, 0							//scc = (m0 < s_sgpr_save_num) ? 1 : 0
+	s_cbranch_scc0	L_RESTORE_SGPR_LOOP
+
+	/* restore HW registers */
+L_RESTORE_HWREG:
+	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
+	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
+	get_svgpr_size_bytes(s_restore_tmp)
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
+
+	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
+
+	s_waitcnt	lgkmcnt(0)						//from now on, it is safe to restore STATUS and IB_STS
+
+	s_mov_b32	s_restore_tmp, s_restore_pc_hi
+	s_and_b32	s_restore_pc_hi, s_restore_tmp, 0x0000ffff		//pc[47:32] //Do it here in order not to affect STATUS
+
+	s_mov_b32	m0, s_restore_m0
+	s_mov_b32	exec_lo, s_restore_exec_lo
+	s_mov_b32	exec_hi, s_restore_exec_hi
+
+	s_and_b32	s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
 	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
-    s_setreg_b32    hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask         //restore xnack_mask
-	s_and_b32		s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
-	s_lshr_b32		s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
+	s_setreg_b32	hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
+	s_and_b32	s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
+	s_lshr_b32	s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
 	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
-	//s_setreg_b32 	hwreg(HW_REG_TRAPSTS), 	s_restore_trapsts      //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
-	s_setreg_b32 	hwreg(HW_REG_MODE), 	s_restore_mode
-	//reuse s_restore_m0 as a temp register
-	s_and_b32		s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
-	s_lshr_b32		s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
-	s_lshl_b32		s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
-	s_mov_b32		s_restore_tmp, 0x0																				//IB_STS is zero
-	s_or_b32		s_restore_tmp, s_restore_tmp, s_restore_m0
-	s_and_b32		s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
-	s_lshr_b32		s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
-	s_lshl_b32		s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
-	s_or_b32		s_restore_tmp, s_restore_tmp, s_restore_m0
-    s_and_b32       s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK 
-    s_lshr_b32		s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
-	s_setreg_b32 	hwreg(HW_REG_IB_STS), 	s_restore_tmp
-	s_setreg_b32 	hwreg(HW_REG_STATUS), 	s_restore_status
-
-	s_barrier													//barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time
-	
-	
-//	s_rfe_b64 s_restore_pc_lo                              		//Return to the main shader program and resume execution
-    s_rfe_b64  s_restore_pc_lo            // s_restore_m0[0] is used to set STATUS.inst_atc 
-
-
-/**************************************************************************/
-/*                     	the END								              */
-/**************************************************************************/	
-L_END_PGM:	
+	s_setreg_b32	hwreg(HW_REG_MODE), s_restore_mode
+	s_and_b32	s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_RCNT_MASK
+	s_lshr_b32	s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
+	s_lshl_b32	s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
+	s_mov_b32	s_restore_mode, 0x0
+	s_or_b32	s_restore_mode, s_restore_mode, s_restore_m0
+	s_and_b32	s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_FIRST_REPLAY_MASK
+	s_lshr_b32	s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+	s_lshl_b32	s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
+	s_or_b32	s_restore_mode, s_restore_mode, s_restore_m0
+	s_and_b32	s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_REPLAY_W64H_MASK
+	s_lshr_b32	s_restore_m0, s_restore_m0, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
+	s_lshl_b32	s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT
+	s_or_b32	s_restore_mode, s_restore_mode, s_restore_m0
+
+	s_and_b32	s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
+	s_lshr_b32	s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
+	s_setreg_b32 	hwreg(HW_REG_IB_STS), s_restore_mode
+
+	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
+	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32
+	s_setreg_b32	hwreg(HW_REG_STATUS), s_restore_status			// SCC is included, which is changed by previous salu
+
+	s_barrier								//barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG
+
+	s_rfe_b64	s_restore_pc_lo						//Return to the main shader program and resume execution
+
+L_END_PGM:
 	s_endpgm
-	
-end	
-
-
-/**************************************************************************/
-/*                     	the helper functions							  */
-/**************************************************************************/
-function write_sgpr_to_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
-	if (use_sqc)
-		s_mov_b32 exec_lo, m0					//assuming exec_lo is not needed anymore from this point on
-		s_mov_b32 m0, s_mem_offset
-		s_buffer_store_dword s, s_rsrc, m0		glc:1	
-		s_add_u32		s_mem_offset, s_mem_offset, 4
-		s_mov_b32	m0, exec_lo
-    elsif (use_mtbuf)
-        v_mov_b32	v0,	s
-        tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-		s_add_u32		s_mem_offset, s_mem_offset, 128
-    else 
-        v_mov_b32	v0,	s
-		buffer_store_dword	v0, v0, s_rsrc, s_mem_offset	slc:1 glc:1
-        s_add_u32		s_mem_offset, s_mem_offset, 128
-	end
 end
 
-function write_sgpr_to_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
-	if (use_sqc)
-		s_mov_b32 exec_lo, m0					//assuming exec_lo is not needed anymore from this point on
-		s_mov_b32 m0, s_mem_offset
-		s_buffer_store_dword s, s_rsrc, m0		glc:1	
-		s_add_u32		s_mem_offset, s_mem_offset, 4
-		s_mov_b32	m0, exec_lo
-    elsif (use_mtbuf)
-        v_mov_b32	v0,	s
-        tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-		s_add_u32		s_mem_offset, s_mem_offset, 256
-    else 
-        v_mov_b32	v0,	s
-		buffer_store_dword	v0, v0, s_rsrc, s_mem_offset	slc:1 glc:1
-        s_add_u32		s_mem_offset, s_mem_offset, 256
-	end
+function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
+	s_mov_b32	exec_lo, m0
+	s_mov_b32	m0, s_mem_offset
+	s_buffer_store_dword	s, s_rsrc, m0 glc:1
+	s_add_u32	s_mem_offset, s_mem_offset, 4
+	s_mov_b32	m0, exec_lo
+end
+
+
+function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
+	s_buffer_store_dwordx4	s[0], s_rsrc, 0 glc:1
+	s_buffer_store_dwordx4	s[4], s_rsrc, 16 glc:1
+	s_buffer_store_dwordx4	s[8], s_rsrc, 32 glc:1
+	s_buffer_store_dwordx4	s[12], s_rsrc, 48 glc:1
+	s_add_u32	s_rsrc[0], s_rsrc[0], 4*16
+	s_addc_u32	s_rsrc[1], s_rsrc[1], 0x0
+end
+
+function write_10sgpr_to_mem(s, s_rsrc, s_mem_offset)
+	s_buffer_store_dwordx4	s[0], s_rsrc, 0 glc:1
+	s_buffer_store_dwordx4	s[4], s_rsrc, 16 glc:1
+	s_buffer_store_dwordx2	s[8], s_rsrc, 32 glc:1
+	s_add_u32	s_rsrc[0], s_rsrc[0], 4*16
+	s_addc_u32	s_rsrc[1], s_rsrc[1], 0x0
+end
+
+
+function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
+	s_buffer_load_dword	s, s_rsrc, s_mem_offset glc:1
+	s_add_u32	s_mem_offset, s_mem_offset, 4
 end
 
-function read_sgpr_from_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc)
-	s_buffer_load_dword s, s_rsrc, s_mem_offset		glc:1
-	if (use_sqc)
-		s_add_u32		s_mem_offset, s_mem_offset, 4
-	else
-        s_add_u32		s_mem_offset, s_mem_offset, 128
-	end
+function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
+	s_buffer_load_dwordx16	s, s_rsrc, s_mem_offset glc:1
+	s_sub_u32	s_mem_offset, s_mem_offset, 4*16
 end
 
-function read_sgpr_from_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc)
-	s_buffer_load_dword s, s_rsrc, s_mem_offset		glc:1
-	if (use_sqc)
-		s_add_u32		s_mem_offset, s_mem_offset, 4
-	else
-        s_add_u32		s_mem_offset, s_mem_offset, 256
-	end
+function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
+	s_buffer_load_dwordx8	s, s_rsrc, s_mem_offset glc:1
+	s_sub_u32	s_mem_offset, s_mem_offset, 4*16
 end
 
+function read_2sgpr_from_mem(s, s_rsrc, s_mem_offset)
+	s_buffer_load_dwordx2 s, s_rsrc, s_mem_offset glc:1
+	s_sub_u32	s_mem_offset, s_mem_offset, 4*8
+end
+
+
+function get_lds_size_bytes(s_lds_size_byte)
+	s_getreg_b32	s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
+	s_lshl_b32	s_lds_size_byte, s_lds_size_byte, 8			//LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW
+end
+
+function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
+	s_getreg_b32	s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
+	s_add_u32	s_vgpr_size_byte, s_vgpr_size_byte, 1
+	s_lshr_b32	m0, s_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_ENABLE_SHIFT_W64
+	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+7)		//Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4   (non-zero value)
+	s_branch	L_SHIFT_DONE
+L_ENABLE_SHIFT_W64:
+	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+8)		//Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4   (non-zero value)
+L_SHIFT_DONE:
+end
+
+function get_svgpr_size_bytes(s_svgpr_size_byte)
+	s_getreg_b32	s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
+	s_lshl_b32	s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
+end
+
+function get_sgpr_size_bytes
+	return 512
+end
+
+function get_hwreg_size_bytes
+	return 128
+end
-- 
2.7.4



More information about the amd-gfx mailing list