[PATCH] drm/amdkfd: Use SQC when TCP would fail in gfx10.1 context save
Laurent Morichetti
laurent.morichetti at amd.com
Fri Feb 23 22:08:30 UTC 2024
Similarly to gfx9, gfx10.1 drops vector stores when an xnack error is
raised. To work around this issue, use scalar stores instead of vector
stores when trapsts.xnack_error == 1.
Signed-off-by: Laurent Morichetti <laurent.morichetti at amd.com>
---
.../gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 543 ++++++++++++------
.../amd/amdkfd/cwsr_trap_handler_gfx10.asm | 156 ++++-
2 files changed, 530 insertions(+), 169 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 2e9b64edb8d2..5a0308d26b53 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -678,7 +678,7 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
};
static const uint32_t cwsr_trap_nv1x_hex[] = {
- 0xbf820001, 0xbf8201f5,
+ 0xbf820001, 0xbf820394,
0xb0804004, 0xb978f802,
0x8a78ff78, 0x00020006,
0xb97bf803, 0x876eff78,
@@ -769,13 +769,90 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x877c817c, 0xbf06817c,
0xbf850002, 0xbeff0380,
0xbf820002, 0xbeff03c1,
- 0xbf82000b, 0xbef603ff,
- 0x01000000, 0xe0704000,
- 0x705d0000, 0xe0704080,
- 0x705d0100, 0xe0704100,
- 0x705d0200, 0xe0704180,
- 0x705d0300, 0xbf82000a,
- 0xbef603ff, 0x01000000,
+ 0xbf820058, 0xbef603ff,
+ 0x01000000, 0xb97af803,
+ 0x8a7a7aff, 0x10000000,
+ 0xbf850049, 0xbe840380,
+ 0xd7600000, 0x00000900,
+ 0x80048104, 0xd7600001,
+ 0x00000900, 0x80048104,
+ 0xd7600002, 0x00000900,
+ 0x80048104, 0xd7600003,
+ 0x00000900, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06a004,
+ 0xbf84ffef, 0xbe840380,
+ 0xd7600000, 0x00000901,
+ 0x80048104, 0xd7600001,
+ 0x00000901, 0x80048104,
+ 0xd7600002, 0x00000901,
+ 0x80048104, 0xd7600003,
+ 0x00000901, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06a004,
+ 0xbf84ffef, 0xbe840380,
+ 0xd7600000, 0x00000902,
+ 0x80048104, 0xd7600001,
+ 0x00000902, 0x80048104,
+ 0xd7600002, 0x00000902,
+ 0x80048104, 0xd7600003,
+ 0x00000902, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06a004,
+ 0xbf84ffef, 0xbe840380,
+ 0xd7600000, 0x00000903,
+ 0x80048104, 0xd7600001,
+ 0x00000903, 0x80048104,
+ 0xd7600002, 0x00000903,
+ 0x80048104, 0xd7600003,
+ 0x00000903, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06a004,
+ 0xbf84ffef, 0xbf820060,
+ 0xe0704000, 0x705d0000,
+ 0xe0704080, 0x705d0100,
+ 0xe0704100, 0x705d0200,
+ 0xe0704180, 0x705d0300,
+ 0xbf820057, 0xbef603ff,
+ 0x01000000, 0xb97af803,
+ 0x8a7a7aff, 0x10000000,
+ 0xbf850049, 0xbe840380,
+ 0xd7600000, 0x00000900,
+ 0x80048104, 0xd7600001,
+ 0x00000900, 0x80048104,
+ 0xd7600002, 0x00000900,
+ 0x80048104, 0xd7600003,
+ 0x00000900, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06c004,
+ 0xbf84ffef, 0xbe840380,
+ 0xd7600000, 0x00000901,
+ 0x80048104, 0xd7600001,
+ 0x00000901, 0x80048104,
+ 0xd7600002, 0x00000901,
+ 0x80048104, 0xd7600003,
+ 0x00000901, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06c004,
+ 0xbf84ffef, 0xbe840380,
+ 0xd7600000, 0x00000902,
+ 0x80048104, 0xd7600001,
+ 0x00000902, 0x80048104,
+ 0xd7600002, 0x00000902,
+ 0x80048104, 0xd7600003,
+ 0x00000902, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06c004,
+ 0xbf84ffef, 0xbe840380,
+ 0xd7600000, 0x00000903,
+ 0x80048104, 0xd7600001,
+ 0x00000903, 0x80048104,
+ 0xd7600002, 0x00000903,
+ 0x80048104, 0xd7600003,
+ 0x00000903, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06c004,
+ 0xbf84ffef, 0xbf820008,
0xe0704000, 0x705d0000,
0xe0704100, 0x705d0100,
0xe0704200, 0x705d0200,
@@ -855,9 +932,9 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0xbf850002, 0xbeff0380,
0xbf820001, 0xbeff03c1,
0xb97b4306, 0x877bc17b,
- 0xbf840044, 0xbf8a0000,
+ 0xbf840086, 0xbf8a0000,
0x877aff6d, 0x80000000,
- 0xbf840040, 0x8f7b867b,
+ 0xbf840082, 0x8f7b867b,
0x8f7b827b, 0xbef6037b,
0xb9703a05, 0x80708170,
0xbf0d9973, 0xbf850002,
@@ -871,16 +948,49 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0xd7660000, 0x000200c1,
0x16000084, 0x907c9973,
0x877c817c, 0xbf06817c,
- 0xbefc0380, 0xbf850012,
- 0xbe8303ff, 0x00000080,
+ 0xbefc0380, 0xbf850033,
+ 0xb97af803, 0x8a7a7aff,
+ 0x10000000, 0xbf85001d,
+ 0xd8d80000, 0x01000000,
+ 0xbf8c0000, 0xbe840380,
+ 0xd7600000, 0x00000901,
+ 0x80048104, 0xd7600001,
+ 0x00000901, 0x80048104,
+ 0xd7600002, 0x00000901,
+ 0x80048104, 0xd7600003,
+ 0x00000901, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06a004,
+ 0xbf84ffef, 0x807cff7c,
+ 0x00000080, 0xd5250000,
+ 0x0001ff00, 0x00000080,
+ 0xbf0a7b7c, 0xbf85ffe4,
+ 0xbf820044, 0xbe8303ff,
+ 0x00000080, 0xbf800000,
0xbf800000, 0xbf800000,
- 0xbf800000, 0xd8d80000,
+ 0xd8d80000, 0x01000000,
+ 0xbf8c0000, 0xe0704000,
+ 0x705d0100, 0x807c037c,
+ 0x80700370, 0xd5250000,
+ 0x0001ff00, 0x00000080,
+ 0xbf0a7b7c, 0xbf85fff4,
+ 0xbf820032, 0xb97af803,
+ 0x8a7a7aff, 0x10000000,
+ 0xbf85001d, 0xd8d80000,
0x01000000, 0xbf8c0000,
- 0xe0704000, 0x705d0100,
- 0x807c037c, 0x80700370,
+ 0xbe840380, 0xd7600000,
+ 0x00000901, 0x80048104,
+ 0xd7600001, 0x00000901,
+ 0x80048104, 0xd7600002,
+ 0x00000901, 0x80048104,
+ 0xd7600003, 0x00000901,
+ 0x80048104, 0xf469003a,
+ 0xe0000000, 0x80709070,
+ 0xbf06c004, 0xbf84ffef,
+ 0x807cff7c, 0x00000100,
0xd5250000, 0x0001ff00,
- 0x00000080, 0xbf0a7b7c,
- 0xbf85fff4, 0xbf820011,
+ 0x00000100, 0xbf0a7b7c,
+ 0xbf85ffe4, 0xbf820011,
0xbe8303ff, 0x00000100,
0xbf800000, 0xbf800000,
0xbf800000, 0xd8d80000,
@@ -898,10 +1008,52 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0xbeff03c1, 0xb97b3a05,
0x807b817b, 0x8f7b827b,
0x907c9973, 0x877c817c,
- 0xbf06817c, 0xbf850017,
+ 0xbf06817c, 0xbf85006b,
0xbef603ff, 0x01000000,
0xbefc0384, 0xbf0a7b7c,
- 0xbf840037, 0x7e008700,
+ 0xbf8400fa, 0xb97af803,
+ 0x8a7a7aff, 0x10000000,
+ 0xbf850050, 0x7e008700,
+ 0x7e028701, 0x7e048702,
+ 0x7e068703, 0xbe840380,
+ 0xd7600000, 0x00000900,
+ 0x80048104, 0xd7600001,
+ 0x00000900, 0x80048104,
+ 0xd7600002, 0x00000900,
+ 0x80048104, 0xd7600003,
+ 0x00000900, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06a004,
+ 0xbf84ffef, 0xbe840380,
+ 0xd7600000, 0x00000901,
+ 0x80048104, 0xd7600001,
+ 0x00000901, 0x80048104,
+ 0xd7600002, 0x00000901,
+ 0x80048104, 0xd7600003,
+ 0x00000901, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06a004,
+ 0xbf84ffef, 0xbe840380,
+ 0xd7600000, 0x00000902,
+ 0x80048104, 0xd7600001,
+ 0x00000902, 0x80048104,
+ 0xd7600002, 0x00000902,
+ 0x80048104, 0xd7600003,
+ 0x00000902, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06a004,
+ 0xbf84ffef, 0xbe840380,
+ 0xd7600000, 0x00000903,
+ 0x80048104, 0xd7600001,
+ 0x00000903, 0x80048104,
+ 0xd7600002, 0x00000903,
+ 0x80048104, 0xd7600003,
+ 0x00000903, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06a004,
+ 0xbf84ffef, 0x807c847c,
+ 0xbf0a7b7c, 0xbf85ffb1,
+ 0xbf8200a6, 0x7e008700,
0x7e028701, 0x7e048702,
0x7e068703, 0xe0704000,
0x705d0000, 0xe0704080,
@@ -910,9 +1062,51 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x705d0300, 0x807c847c,
0x8070ff70, 0x00000200,
0xbf0a7b7c, 0xbf85ffef,
- 0xbf820025, 0xbef603ff,
+ 0xbf820094, 0xbef603ff,
0x01000000, 0xbefc0384,
- 0xbf0a7b7c, 0xbf840011,
+ 0xbf0a7b7c, 0xbf840065,
+ 0xb97af803, 0x8a7a7aff,
+ 0x10000000, 0xbf850050,
+ 0x7e008700, 0x7e028701,
+ 0x7e048702, 0x7e068703,
+ 0xbe840380, 0xd7600000,
+ 0x00000900, 0x80048104,
+ 0xd7600001, 0x00000900,
+ 0x80048104, 0xd7600002,
+ 0x00000900, 0x80048104,
+ 0xd7600003, 0x00000900,
+ 0x80048104, 0xf469003a,
+ 0xe0000000, 0x80709070,
+ 0xbf06c004, 0xbf84ffef,
+ 0xbe840380, 0xd7600000,
+ 0x00000901, 0x80048104,
+ 0xd7600001, 0x00000901,
+ 0x80048104, 0xd7600002,
+ 0x00000901, 0x80048104,
+ 0xd7600003, 0x00000901,
+ 0x80048104, 0xf469003a,
+ 0xe0000000, 0x80709070,
+ 0xbf06c004, 0xbf84ffef,
+ 0xbe840380, 0xd7600000,
+ 0x00000902, 0x80048104,
+ 0xd7600001, 0x00000902,
+ 0x80048104, 0xd7600002,
+ 0x00000902, 0x80048104,
+ 0xd7600003, 0x00000902,
+ 0x80048104, 0xf469003a,
+ 0xe0000000, 0x80709070,
+ 0xbf06c004, 0xbf84ffef,
+ 0xbe840380, 0xd7600000,
+ 0x00000903, 0x80048104,
+ 0xd7600001, 0x00000903,
+ 0x80048104, 0xd7600002,
+ 0x00000903, 0x80048104,
+ 0xd7600003, 0x00000903,
+ 0x80048104, 0xf469003a,
+ 0xe0000000, 0x80709070,
+ 0xbf06c004, 0xbf84ffef,
+ 0x807c847c, 0xbf0a7b7c,
+ 0xbf85ffb1, 0xbf82003b,
0x7e008700, 0x7e028701,
0x7e048702, 0x7e068703,
0xe0704000, 0x705d0000,
@@ -922,179 +1116,192 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x807c847c, 0x8070ff70,
0x00000400, 0xbf0a7b7c,
0xbf85ffef, 0xb97b1e06,
- 0x877bc17b, 0xbf84000c,
+ 0x877bc17b, 0xbf840027,
0x8f7b837b, 0x807b7c7b,
0xbefe03c1, 0xbeff0380,
- 0x7e008700, 0xe0704000,
- 0x705d0000, 0x807c817c,
- 0x8070ff70, 0x00000080,
- 0xbf0a7b7c, 0xbf85fff8,
- 0xbf820144, 0xbef4037e,
- 0x8775ff7f, 0x0000ffff,
- 0x8875ff75, 0x00040000,
- 0xbef60380, 0xbef703ff,
- 0x10807fac, 0xb97202dc,
- 0x8f729972, 0x876eff7f,
- 0x04000000, 0xbf840034,
+ 0xb97af803, 0x8a7a7aff,
+ 0x10000000, 0xbf850017,
+ 0x7e008700, 0xbe840380,
+ 0xd7600000, 0x00000900,
+ 0x80048104, 0xd7600001,
+ 0x00000900, 0x80048104,
+ 0xd7600002, 0x00000900,
+ 0x80048104, 0xd7600003,
+ 0x00000900, 0x80048104,
+ 0xf469003a, 0xe0000000,
+ 0x80709070, 0xbf06c004,
+ 0xbf84ffef, 0x807c817c,
+ 0xbf0a7b7c, 0xbf85ffea,
+ 0xbf820008, 0x7e008700,
+ 0xe0704000, 0x705d0000,
+ 0x807c817c, 0x8070ff70,
+ 0x00000080, 0xbf0a7b7c,
+ 0xbf85fff8, 0xbf820144,
+ 0xbef4037e, 0x8775ff7f,
+ 0x0000ffff, 0x8875ff75,
+ 0x00040000, 0xbef60380,
+ 0xbef703ff, 0x10807fac,
+ 0xb97202dc, 0x8f729972,
+ 0x876eff7f, 0x04000000,
+ 0xbf840034, 0xbefe03c1,
+ 0x907c9972, 0x877c817c,
+ 0xbf06817c, 0xbf850002,
+ 0xbeff0380, 0xbf820001,
+ 0xbeff03c1, 0xb96f4306,
+ 0x876fc16f, 0xbf840029,
+ 0x8f6f866f, 0x8f6f826f,
+ 0xbef6036f, 0xb9783a05,
+ 0x80788178, 0xbf0d9972,
+ 0xbf850002, 0x8f788978,
+ 0xbf820001, 0x8f788a78,
+ 0xb96e1e06, 0x8f6e8a6e,
+ 0x80786e78, 0x8078ff78,
+ 0x00000200, 0x8078ff78,
+ 0x00000080, 0xbef603ff,
+ 0x01000000, 0x907c9972,
+ 0x877c817c, 0xbf06817c,
+ 0xbefc0380, 0xbf850009,
+ 0xe0310000, 0x781d0000,
+ 0x807cff7c, 0x00000080,
+ 0x8078ff78, 0x00000080,
+ 0xbf0a6f7c, 0xbf85fff8,
+ 0xbf820008, 0xe0310000,
+ 0x781d0000, 0x807cff7c,
+ 0x00000100, 0x8078ff78,
+ 0x00000100, 0xbf0a6f7c,
+ 0xbf85fff8, 0xbef80380,
0xbefe03c1, 0x907c9972,
0x877c817c, 0xbf06817c,
0xbf850002, 0xbeff0380,
0xbf820001, 0xbeff03c1,
- 0xb96f4306, 0x876fc16f,
- 0xbf840029, 0x8f6f866f,
- 0x8f6f826f, 0xbef6036f,
- 0xb9783a05, 0x80788178,
- 0xbf0d9972, 0xbf850002,
- 0x8f788978, 0xbf820001,
- 0x8f788a78, 0xb96e1e06,
- 0x8f6e8a6e, 0x80786e78,
+ 0xb96f3a05, 0x806f816f,
+ 0x8f6f826f, 0x907c9972,
+ 0x877c817c, 0xbf06817c,
+ 0xbf850024, 0xbef603ff,
+ 0x01000000, 0xbeee0378,
0x8078ff78, 0x00000200,
- 0x8078ff78, 0x00000080,
- 0xbef603ff, 0x01000000,
- 0x907c9972, 0x877c817c,
- 0xbf06817c, 0xbefc0380,
- 0xbf850009, 0xe0310000,
- 0x781d0000, 0x807cff7c,
- 0x00000080, 0x8078ff78,
- 0x00000080, 0xbf0a6f7c,
- 0xbf85fff8, 0xbf820008,
- 0xe0310000, 0x781d0000,
- 0x807cff7c, 0x00000100,
- 0x8078ff78, 0x00000100,
- 0xbf0a6f7c, 0xbf85fff8,
- 0xbef80380, 0xbefe03c1,
- 0x907c9972, 0x877c817c,
- 0xbf06817c, 0xbf850002,
- 0xbeff0380, 0xbf820001,
- 0xbeff03c1, 0xb96f3a05,
- 0x806f816f, 0x8f6f826f,
- 0x907c9972, 0x877c817c,
- 0xbf06817c, 0xbf850024,
- 0xbef603ff, 0x01000000,
- 0xbeee0378, 0x8078ff78,
- 0x00000200, 0xbefc0384,
- 0xbf0a6f7c, 0xbf840050,
+ 0xbefc0384, 0xbf0a6f7c,
+ 0xbf840050, 0xe0304000,
+ 0x785d0000, 0xe0304080,
+ 0x785d0100, 0xe0304100,
+ 0x785d0200, 0xe0304180,
+ 0x785d0300, 0xbf8c3f70,
+ 0x7e008500, 0x7e028501,
+ 0x7e048502, 0x7e068503,
+ 0x807c847c, 0x8078ff78,
+ 0x00000200, 0xbf0a6f7c,
+ 0xbf85ffee, 0xe0304000,
+ 0x6e5d0000, 0xe0304080,
+ 0x6e5d0100, 0xe0304100,
+ 0x6e5d0200, 0xe0304180,
+ 0x6e5d0300, 0xbf8c3f70,
+ 0xbf820034, 0xbef603ff,
+ 0x01000000, 0xbeee0378,
+ 0x8078ff78, 0x00000400,
+ 0xbefc0384, 0xbf0a6f7c,
+ 0xbf840012, 0xe0304000,
+ 0x785d0000, 0xe0304100,
+ 0x785d0100, 0xe0304200,
+ 0x785d0200, 0xe0304300,
+ 0x785d0300, 0xbf8c3f70,
+ 0x7e008500, 0x7e028501,
+ 0x7e048502, 0x7e068503,
+ 0x807c847c, 0x8078ff78,
+ 0x00000400, 0xbf0a6f7c,
+ 0xbf85ffee, 0xb96f1e06,
+ 0x876fc16f, 0xbf84000e,
+ 0x8f6f836f, 0x806f7c6f,
+ 0xbefe03c1, 0xbeff0380,
0xe0304000, 0x785d0000,
- 0xe0304080, 0x785d0100,
- 0xe0304100, 0x785d0200,
- 0xe0304180, 0x785d0300,
0xbf8c3f70, 0x7e008500,
- 0x7e028501, 0x7e048502,
- 0x7e068503, 0x807c847c,
- 0x8078ff78, 0x00000200,
- 0xbf0a6f7c, 0xbf85ffee,
+ 0x807c817c, 0x8078ff78,
+ 0x00000080, 0xbf0a6f7c,
+ 0xbf85fff7, 0xbeff03c1,
0xe0304000, 0x6e5d0000,
- 0xe0304080, 0x6e5d0100,
- 0xe0304100, 0x6e5d0200,
- 0xe0304180, 0x6e5d0300,
- 0xbf8c3f70, 0xbf820034,
- 0xbef603ff, 0x01000000,
- 0xbeee0378, 0x8078ff78,
- 0x00000400, 0xbefc0384,
- 0xbf0a6f7c, 0xbf840012,
- 0xe0304000, 0x785d0000,
- 0xe0304100, 0x785d0100,
- 0xe0304200, 0x785d0200,
- 0xe0304300, 0x785d0300,
- 0xbf8c3f70, 0x7e008500,
- 0x7e028501, 0x7e048502,
- 0x7e068503, 0x807c847c,
- 0x8078ff78, 0x00000400,
- 0xbf0a6f7c, 0xbf85ffee,
- 0xb96f1e06, 0x876fc16f,
- 0xbf84000e, 0x8f6f836f,
- 0x806f7c6f, 0xbefe03c1,
- 0xbeff0380, 0xe0304000,
- 0x785d0000, 0xbf8c3f70,
- 0x7e008500, 0x807c817c,
- 0x8078ff78, 0x00000080,
- 0xbf0a6f7c, 0xbf85fff7,
- 0xbeff03c1, 0xe0304000,
- 0x6e5d0000, 0xe0304100,
- 0x6e5d0100, 0xe0304200,
- 0x6e5d0200, 0xe0304300,
- 0x6e5d0300, 0xbf8c3f70,
+ 0xe0304100, 0x6e5d0100,
+ 0xe0304200, 0x6e5d0200,
+ 0xe0304300, 0x6e5d0300,
+ 0xbf8c3f70, 0xb9783a05,
+ 0x80788178, 0xbf0d9972,
+ 0xbf850002, 0x8f788978,
+ 0xbf820001, 0x8f788a78,
+ 0xb96e1e06, 0x8f6e8a6e,
+ 0x80786e78, 0x8078ff78,
+ 0x00000200, 0x80f8ff78,
+ 0x00000050, 0xbef603ff,
+ 0x01000000, 0xbefc03ff,
+ 0x0000006c, 0x80f89078,
+ 0xf429003a, 0xf0000000,
+ 0xbf8cc07f, 0x80fc847c,
+ 0xbf800000, 0xbe803100,
+ 0xbe823102, 0x80f8a078,
+ 0xf42d003a, 0xf0000000,
+ 0xbf8cc07f, 0x80fc887c,
+ 0xbf800000, 0xbe803100,
+ 0xbe823102, 0xbe843104,
+ 0xbe863106, 0x80f8c078,
+ 0xf431003a, 0xf0000000,
+ 0xbf8cc07f, 0x80fc907c,
+ 0xbf800000, 0xbe803100,
+ 0xbe823102, 0xbe843104,
+ 0xbe863106, 0xbe883108,
+ 0xbe8a310a, 0xbe8c310c,
+ 0xbe8e310e, 0xbf06807c,
+ 0xbf84fff0, 0xba80f801,
+ 0x00000000, 0xbf8a0000,
0xb9783a05, 0x80788178,
0xbf0d9972, 0xbf850002,
0x8f788978, 0xbf820001,
0x8f788a78, 0xb96e1e06,
0x8f6e8a6e, 0x80786e78,
0x8078ff78, 0x00000200,
- 0x80f8ff78, 0x00000050,
0xbef603ff, 0x01000000,
- 0xbefc03ff, 0x0000006c,
- 0x80f89078, 0xf429003a,
- 0xf0000000, 0xbf8cc07f,
- 0x80fc847c, 0xbf800000,
- 0xbe803100, 0xbe823102,
- 0x80f8a078, 0xf42d003a,
- 0xf0000000, 0xbf8cc07f,
- 0x80fc887c, 0xbf800000,
- 0xbe803100, 0xbe823102,
- 0xbe843104, 0xbe863106,
- 0x80f8c078, 0xf431003a,
- 0xf0000000, 0xbf8cc07f,
- 0x80fc907c, 0xbf800000,
- 0xbe803100, 0xbe823102,
- 0xbe843104, 0xbe863106,
- 0xbe883108, 0xbe8a310a,
- 0xbe8c310c, 0xbe8e310e,
- 0xbf06807c, 0xbf84fff0,
- 0xba80f801, 0x00000000,
- 0xbf8a0000, 0xb9783a05,
- 0x80788178, 0xbf0d9972,
- 0xbf850002, 0x8f788978,
- 0xbf820001, 0x8f788a78,
- 0xb96e1e06, 0x8f6e8a6e,
- 0x80786e78, 0x8078ff78,
- 0x00000200, 0xbef603ff,
- 0x01000000, 0xf4211bfa,
+ 0xf4211bfa, 0xf0000000,
+ 0x80788478, 0xf4211b3a,
0xf0000000, 0x80788478,
- 0xf4211b3a, 0xf0000000,
- 0x80788478, 0xf4211b7a,
+ 0xf4211b7a, 0xf0000000,
+ 0x80788478, 0xf4211c3a,
0xf0000000, 0x80788478,
- 0xf4211c3a, 0xf0000000,
- 0x80788478, 0xf4211c7a,
+ 0xf4211c7a, 0xf0000000,
+ 0x80788478, 0xf4211eba,
0xf0000000, 0x80788478,
- 0xf4211eba, 0xf0000000,
- 0x80788478, 0xf4211efa,
+ 0xf4211efa, 0xf0000000,
+ 0x80788478, 0xf4211e7a,
0xf0000000, 0x80788478,
- 0xf4211e7a, 0xf0000000,
- 0x80788478, 0xf4211cfa,
+ 0xf4211cfa, 0xf0000000,
+ 0x80788478, 0xf4211bba,
0xf0000000, 0x80788478,
+ 0xbf8cc07f, 0xb9eef814,
0xf4211bba, 0xf0000000,
0x80788478, 0xbf8cc07f,
- 0xb9eef814, 0xf4211bba,
- 0xf0000000, 0x80788478,
- 0xbf8cc07f, 0xb9eef815,
- 0xbefc036f, 0xbefe0370,
- 0xbeff0371, 0x876f7bff,
- 0x000003ff, 0xb9ef4803,
- 0xb9f9f816, 0x876f7bff,
- 0xfffff800, 0x906f8b6f,
- 0xb9efa2c3, 0xb9f3f801,
- 0xb96e3a05, 0x806e816e,
- 0xbf0d9972, 0xbf850002,
- 0x8f6e896e, 0xbf820001,
- 0x8f6e8a6e, 0xb96f1e06,
- 0x8f6f8a6f, 0x806e6f6e,
- 0x806eff6e, 0x00000200,
- 0x806e746e, 0x826f8075,
- 0x876fff6f, 0x0000ffff,
- 0xf4091c37, 0xfa000050,
- 0xf4091d37, 0xfa000060,
- 0xf4011e77, 0xfa000074,
- 0xbf8cc07f, 0x906e8977,
- 0x876fff6e, 0x003f8000,
- 0x906e8677, 0x876eff6e,
- 0x02000000, 0x886e6f6e,
- 0xb9eef807, 0x876dff6d,
- 0x0000ffff, 0x87fe7e7e,
- 0x87ea6a6a, 0xb9faf802,
- 0xbe80226c, 0xbf9b0000,
+ 0xb9eef815, 0xbefc036f,
+ 0xbefe0370, 0xbeff0371,
+ 0x876f7bff, 0x000003ff,
+ 0xb9ef4803, 0xb9f9f816,
+ 0x876f7bff, 0xfffff800,
+ 0x906f8b6f, 0xb9efa2c3,
+ 0xb9f3f801, 0xb96e3a05,
+ 0x806e816e, 0xbf0d9972,
+ 0xbf850002, 0x8f6e896e,
+ 0xbf820001, 0x8f6e8a6e,
+ 0xb96f1e06, 0x8f6f8a6f,
+ 0x806e6f6e, 0x806eff6e,
+ 0x00000200, 0x806e746e,
+ 0x826f8075, 0x876fff6f,
+ 0x0000ffff, 0xf4091c37,
+ 0xfa000050, 0xf4091d37,
+ 0xfa000060, 0xf4011e77,
+ 0xfa000074, 0xbf8cc07f,
+ 0x906e8977, 0x876fff6e,
+ 0x003f8000, 0x906e8677,
+ 0x876eff6e, 0x02000000,
+ 0x886e6f6e, 0xb9eef807,
+ 0x876dff6d, 0x0000ffff,
+ 0x87fe7e7e, 0x87ea6a6a,
+ 0xb9faf802, 0xbe80226c,
+ 0xbf9b0000, 0xbf9f0000,
0xbf9f0000, 0xbf9f0000,
0xbf9f0000, 0xbf9f0000,
- 0xbf9f0000, 0x00000000,
};
static const uint32_t cwsr_trap_arcturus_hex[] = {
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
index 7568ff3af978..e1aaa5ce0784 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -44,6 +44,7 @@
#define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO)
#define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO)
#define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO)
+#define SAVE_AFTER_XNACK_ERROR (HAVE_XNACK && !NO_SQC_STORE) // workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger
var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised
@@ -86,6 +87,7 @@ var SQ_WAVE_TRAPSTS_WAVE_START_MASK = 0x20000
var SQ_WAVE_TRAPSTS_WAVE_END_MASK = 0x40000
var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK = 0x100000
#endif
+var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK = 0x10000000
var SQ_WAVE_MODE_EXCP_EN_SHIFT = 12
var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT = 19
@@ -475,6 +477,16 @@ L_SAVE_4VGPR_WAVE32:
// VGPR Allocated in 4-GPR granularity
+#if SAVE_AFTER_XNACK_ERROR
+ check_if_tcp_store_ok()
+ s_cbranch_scc1 L_SAVE_FIRST_VGPRS32_WITH_TCP
+
+ write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
+ s_branch L_SAVE_HWREG
+
+L_SAVE_FIRST_VGPRS32_WITH_TCP:
+#endif
+
#if !NO_SQC_STORE
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
#endif
@@ -488,6 +500,16 @@ L_SAVE_4VGPR_WAVE64:
// VGPR Allocated in 4-GPR granularity
+#if SAVE_AFTER_XNACK_ERROR
+ check_if_tcp_store_ok()
+ s_cbranch_scc1 L_SAVE_FIRST_VGPRS64_WITH_TCP
+
+ write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
+ s_branch L_SAVE_HWREG
+
+L_SAVE_FIRST_VGPRS64_WITH_TCP:
+#endif
+
#if !NO_SQC_STORE
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
#endif
@@ -660,6 +682,26 @@ L_SAVE_LDS_NORMAL:
s_cbranch_scc1 L_SAVE_LDS_W64
L_SAVE_LDS_W32:
+#if SAVE_AFTER_XNACK_ERROR
+ check_if_tcp_store_ok()
+ s_cbranch_scc1 L_SAVE_LDS_WITH_TCP_W32
+
+L_SAVE_LDS_LOOP_SQC_W32:
+ ds_read_b32 v1, v0
+ s_waitcnt 0
+
+ write_vgprs_to_mem_with_sqc_w32(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)
+
+ s_add_u32 m0, m0, 128 //every buffer_store_lds does 128 bytes
+ v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_LDS_LOOP_SQC_W32 //LDS save is complete?
+
+ s_branch L_SAVE_LDS_DONE
+
+L_SAVE_LDS_WITH_TCP_W32:
+#endif
+
s_mov_b32 s3, 128
s_nop 0
s_nop 0
@@ -669,7 +711,7 @@ L_SAVE_LDS_LOOP_W32:
s_waitcnt 0
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
- s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
+ s_add_u32 m0, m0, s3 //every buffer_store_lds does 128 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
@@ -678,6 +720,26 @@ L_SAVE_LDS_LOOP_W32:
s_branch L_SAVE_LDS_DONE
L_SAVE_LDS_W64:
+#if SAVE_AFTER_XNACK_ERROR
+ check_if_tcp_store_ok()
+ s_cbranch_scc1 L_SAVE_LDS_WITH_TCP_W64
+
+L_SAVE_LDS_LOOP_SQC_W64:
+ ds_read_b32 v1, v0
+ s_waitcnt 0
+
+ write_vgprs_to_mem_with_sqc_w64(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)
+
+ s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes
+ v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_LDS_LOOP_SQC_W64 //LDS save is complete?
+
+ s_branch L_SAVE_LDS_DONE
+
+L_SAVE_LDS_WITH_TCP_W64:
+#endif
+
s_mov_b32 s3, 256
s_nop 0
s_nop 0
@@ -727,6 +789,25 @@ L_SAVE_VGPR_NORMAL:
s_cmp_lt_u32 m0, s_save_alloc_size
s_cbranch_scc0 L_SAVE_VGPR_END
+#if SAVE_AFTER_XNACK_ERROR
+ check_if_tcp_store_ok()
+ s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP
+
+L_SAVE_VGPR_LOOP_SQC_W32:
+ v_movrels_b32 v0, v0 //v0 = v[0+m0]
+ v_movrels_b32 v1, v1 //v1 = v[1+m0]
+ v_movrels_b32 v2, v2 //v2 = v[2+m0]
+ v_movrels_b32 v3, v3 //v3 = v[3+m0]
+
+ write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
+
+ s_add_u32 m0, m0, 4
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC_W32
+
+ s_branch L_SAVE_VGPR_END
+#endif
+
L_SAVE_VGPR_W32_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
v_movrels_b32 v1, v1 //v1 = v[1+m0]
@@ -753,6 +834,25 @@ L_SAVE_VGPR_WAVE64:
s_cmp_lt_u32 m0, s_save_alloc_size
s_cbranch_scc0 L_SAVE_SHARED_VGPR
+#if SAVE_AFTER_XNACK_ERROR
+ check_if_tcp_store_ok()
+ s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP
+
+L_SAVE_VGPR_LOOP_SQC_W64:
+ v_movrels_b32 v0, v0 //v0 = v[0+m0]
+ v_movrels_b32 v1, v1 //v1 = v[1+m0]
+ v_movrels_b32 v2, v2 //v2 = v[2+m0]
+ v_movrels_b32 v3, v3 //v3 = v[3+m0]
+
+ write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
+
+ s_add_u32 m0, m0, 4
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC_W64
+
+ s_branch L_SAVE_VGPR_END
+#endif
+
L_SAVE_VGPR_W64_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
v_movrels_b32 v1, v1 //v1 = v[1+m0]
@@ -780,6 +880,23 @@ L_SAVE_SHARED_VGPR:
s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
s_mov_b32 exec_lo, 0xFFFFFFFF
s_mov_b32 exec_hi, 0x00000000
+
+#if SAVE_AFTER_XNACK_ERROR
+ check_if_tcp_store_ok()
+ s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP
+
+L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC:
+ v_movrels_b32 v0, v0
+
+ write_vgprs_to_mem_with_sqc_w64(v0, 1, s_save_buf_rsrc0, s_save_mem_offset)
+
+ s_add_u32 m0, m0, 1
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC
+
+ s_branch L_SAVE_VGPR_END
+#endif
+
L_SAVE_SHARED_VGPR_WAVE64_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
@@ -1190,6 +1307,43 @@ function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset glc:1
end
+#if SAVE_AFTER_XNACK_ERROR
+function check_if_tcp_store_ok
+ // If TRAPSTS.XNACK_ERROR=1 then TCP stores will fail.
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS)
+ s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp
+
+L_TCP_STORE_CHECK_DONE:
+end
+
+function write_vgpr_to_mem_with_sqc(vgpr, n_lanes, s_rsrc, s_mem_offset)
+ s_mov_b32 s4, 0
+
+L_WRITE_VGPR_LANE_LOOP:
+ for var lane = 0; lane < 4; ++lane
+ v_readlane_b32 s[lane], vgpr, s4
+ s_add_u32 s4, s4, 1
+ end
+
+ s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1
+
+ s_add_u32 s_mem_offset, s_mem_offset, 0x10
+ s_cmp_eq_u32 s4, n_lanes
+ s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP
+end
+
+function write_vgprs_to_mem_with_sqc_w32(vgpr0, n_vgprs, s_rsrc, s_mem_offset)
+ for var vgpr = 0; vgpr < n_vgprs; ++vgpr
+ write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 32, s_rsrc, s_mem_offset)
+ end
+end
+
+function write_vgprs_to_mem_with_sqc_w64(vgpr0, n_vgprs, s_rsrc, s_mem_offset)
+ for var vgpr = 0; vgpr < n_vgprs; ++vgpr
+ write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 64, s_rsrc, s_mem_offset)
+ end
+end
+#endif
function get_lds_size_bytes(s_lds_size_byte)
s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
base-commit: 420b2460a743b320868f30e407d4c4685958ea2c
prerequisite-patch-id: 9415524d1e3a92ea7fba9d636f2d34746ca9ef95
--
2.25.1
More information about the amd-gfx
mailing list