After a Vega 56/64 GPU hang I am unable to reboot the system
Mikhail Gavrilov
mikhail.v.gavrilov at gmail.com
Thu Jan 10 15:22:56 UTC 2019
On Thu, 10 Jan 2019 at 00:36, Mikhail Gavrilov
<mikhail.v.gavrilov at gmail.com> wrote:
>
> All new one logs attached here.
>
> Thanks.
>
> P.S. This time I had to terminate command `./umr -O verbose,follow -R
> gfx[.] > gfx.log 2>&1` cause it tried to write log infinitely.
> I also had to terminate command `./umr -O verbose,follow -R gfx[.] >
> gfx.log 2>&1` cause it stuck for a long time.
>
>
It became clear why umr got stuck at the gfx dump. I ran umr under gdb and
got a segfault at the same point where umr got stuck earlier.
Tom, are you there? Could you take a look at the attached backtrace?
--
Best Regards,
Mike Gavrilov.
-------------- next part --------------
pgm[1 at 0x80010e520500 + 0x12c4] = 0x043a0f18 v_sub_f32_e32 v29, v24, v7
pgm[1 at 0x80010e520500 + 0x12c8] = 0x0a3c3d0d v_mul_f32_e32 v30, v13, v30
pgm[1 at 0x80010e520500 + 0x12cc] = 0x2c3c3b11 v_mac_f32_e32 v30, v17, v29
pgm[1 at 0x80010e520500 + 0x12d0] = 0x0238391e v_add_f32_e32 v28, v30, v28
pgm[1 at 0x80010e520500 + 0x12d4] = 0xd105801c v_mul_f32_e64 v28, v21, v28 clamp
pgm[1 at 0x80010e520500 + 0x12d8] = 0x00023915 ;;
pgm[1 at 0x80010e520500 + 0x12dc] = 0xd1c1001e v_mad_f32 v30, v28, v13, v6
pgm[1 at 0x80010e520500 + 0x12e0] = 0x041a1b1c ;;
pgm[1 at 0x80010e520500 + 0x12e4] = 0xd1c1001d v_mad_f32 v29, v28, v17, v7
pgm[1 at 0x80010e520500 + 0x12e8] = 0x041e231c ;;
pgm[1 at 0x80010e520500 + 0x12ec] = 0x0432331e v_sub_f32_e32 v25, v30, v25
pgm[1 at 0x80010e520500 + 0x12f0] = 0xd1c1001c v_mad_f32 v28, v28, v10, v3
pgm[1 at 0x80010e520500 + 0x12f4] = 0x040e151c ;;
pgm[1 at 0x80010e520500 + 0x12f8] = 0x0430311d v_sub_f32_e32 v24, v29, v24
pgm[1 at 0x80010e520500 + 0x12fc] = 0x0a323319 v_mul_f32_e32 v25, v25, v25
pgm[1 at 0x80010e520500 + 0x1300] = 0x0434351c v_sub_f32_e32 v26, v28, v26
pgm[1 at 0x80010e520500 + 0x1304] = 0x2c323118 v_mac_f32_e32 v25, v24, v24
pgm[1 at 0x80010e520500 + 0x1308] = 0x2c32351a v_mac_f32_e32 v25, v26, v26
pgm[1 at 0x80010e520500 + 0x130c] = 0x7c8c331b v_cmp_ge_f32_e32 vcc, v27, v25
pgm[1 at 0x80010e520500 + 0x1310] = 0xd100001e v_cndmask_b32_e64 v30, 0, -1, vcc
pgm[1 at 0x80010e520500 + 0x1314] = 0x01a98280 ;;
pgm[1 at 0x80010e520500 + 0x1318] = 0x87fe007e s_or_b64 exec, exec, s[0:1]
pgm[1 at 0x80010e520500 + 0x131c] = 0x26302e9f v_and_b32_e32 v24, 31, v23
pgm[1 at 0x80010e520500 + 0x1320] = 0xd1120018 v_lshlrev_b32_e64 v24, v24, 1
pgm[1 at 0x80010e520500 + 0x1324] = 0x00010318 ;;
pgm[1 at 0x80010e520500 + 0x1328] = 0x7e32570e v_not_b32_e32 v25, v14
pgm[1 at 0x80010e520500 + 0x132c] = 0x26303318 v_and_b32_e32 v24, v24, v25
pgm[1 at 0x80010e520500 + 0x1330] = 0x7d9a3c80 v_cmp_ne_u32_e32 vcc, 0, v30
pgm[1 at 0x80010e520500 + 0x1334] = 0x00363080 v_cndmask_b32_e32 v27, 0, v24, vcc
pgm[1 at 0x80010e520500 + 0x1338] = 0x2a30370e v_xor_b32_e32 v24, v14, v27
pgm[1 at 0x80010e520500 + 0x133c] = 0x682e2e81 v_add_u32_e32 v23, 1, v23
pgm[1 at 0x80010e520500 + 0x1340] = 0x682c2c90 v_add_u32_e32 v22, 16, v22
pgm[1 at 0x80010e520500 + 0x1344] = 0x7e320280 v_mov_b32_e32 v25, 0
pgm[1 at 0x80010e520500 + 0x1348] = 0xbf820000 s_branch 0
pgm[1 at 0x80010e520500 + 0x134c] = 0x7d9a3280 v_cmp_ne_u32_e32 vcc, 0, v25
pgm[1 at 0x80010e520500 + 0x1350] = 0x86ea6a7e s_and_b64 vcc, exec, vcc
pgm[1 at 0x80010e520500 + 0x1354] = 0xbf86fd70 s_cbranch_vccz 64880
pgm[1 at 0x80010e520500 + 0x1358] = 0xb0038000 s_movk_i32 s3, 0x8000
pgm[1 at 0x80010e520500 + 0x135c] = 0xc00e0001 s_load_dwordx8 s[0:7], s[2:3], 0x0
pgm[1 at 0x80010e520500 + 0x1360] = 0x00000000 ;;
pgm[1 at 0x80010e520500 + 0x1364] = 0x7e1e030e v_mov_b32_e32 v15, v14
pgm[1 at 0x80010e520500 + 0x1368] = 0x7e20030e v_mov_b32_e32 v16, v14
pgm[1 at 0x80010e520500 + 0x136c] = 0x7e22030e v_mov_b32_e32 v17, v14
pgm[1 at 0x80010e520500 + 0x1370] = 0xbf8cc07f s_waitcnt lgkmcnt(0)
pgm[1 at 0x80010e520500 + 0x1374] = 0xf0201f00 image_store v[14:17], v0, s[0:7] dmask:0xf unorm
pgm[1 at 0x80010e520500 + 0x1378] = 0x00000e00 ;;
pgm[1 at 0x80010e520500 + 0x137c] = 0xbf810000 s_endpgm
End of disassembly.
Disassembly of shader 1 at 0x80010010b400 of length 161292 bytes from IB[1 at 0x80010e555000 + 0x1078]
Program received signal SIGSEGV, Segmentation fault.
0x0000000000494220 in llvm::MCAssembler::getAtom(llvm::MCSymbol const&) const [clone .cold.240] ()
(gdb) thread apply all bt full
Thread 1 (Thread 0x7ffff7a20740 (LWP 5975)):
#0 0x0000000000494220 in llvm::MCAssembler::getAtom(llvm::MCSymbol const&) const [clone .cold.240] ()
No symbol table info available.
#1 0x00007fff3fffffff in ?? ()
No symbol table info available.
#2 0x0000000000000902 in ?? ()
No symbol table info available.
#3 0x0000000000533a41 in llvm::AMDGPUDisassembler::decodeOperand_VReg_96(unsigned int) const ()
No symbol table info available.
#4 0x0000000000537243 in DecodeSReg_128RegisterClass(llvm::MCInst&, unsigned int, unsigned long, void const*) [clone .isra.60] ()
No symbol table info available.
#5 0x00000000005464b2 in llvm::MCDisassembler::DecodeStatus llvm::decodeToMCInst<unsigned long>(llvm::MCDisassembler::DecodeStatus, unsigned int, unsigned long, llvm::MCInst&, unsigned long, void const*, bool&) ()
No symbol table info available.
#6 0x000000000054e1ad in llvm::AMDGPUDisassembler::tryDecodeInst(unsigned char const*, llvm::MCInst&, unsigned long, unsigned long) const ()
No symbol table info available.
#7 0x000000000054ec0f in llvm::AMDGPUDisassembler::getInstruction(llvm::MCInst&, unsigned long&, llvm::ArrayRef<unsigned char>, unsigned long, llvm::raw_ostream&, llvm::raw_ostream&) const ()
No symbol table info available.
#8 0x00000000007d0a5e in LLVMDisasmInstruction ()
No symbol table info available.
#9 0x00000000004c24b1 in umr_llvm_disasm (disasm_text=0x7ffff7952010, PC=140741784417280, inst_bytes=161292, inst=0x4599e30 "\202", asic=<optimized out>) at /home/mikhail/packaging-work/umr/src/lib/umr_llvm_disasm.c:81
i = 4532
tmp = "\tv_madak_f32 v255, v255, v255, 0xfffffff0\000\071\070, attr49.z\000isn't aligned 3\000\061\065 offen offset:2521 glc\000\061 glc tfe ; Error: unknown operand encoding 125\000oding 125\000\000\000\200\000\000\000\321\002\000\000\340\023\000\000v\002\000\000\000\000\000\000N\000\000\000\000\000\000\000\t\000\000\000\000\000\000\000\274\000\000\000}\000\000\000"...
cpuname = <optimized out>
disasm_ref = 0x454a080
x = 18128
z = <optimized out>
n = <optimized out>
disasm_ref = <optimized out>
x = <optimized out>
z = <optimized out>
i = <optimized out>
n = <optimized out>
tmp = <optimized out>
cpuname = <optimized out>
#10 umr_llvm_disasm (asic=<optimized out>, inst=0x4599e30 "\202", inst_bytes=161292, PC=140741784417280, disasm_text=0x7ffff7952010) at /home/mikhail/packaging-work/umr/src/lib/umr_llvm_disasm.c:38
disasm_ref = <optimized out>
x = <optimized out>
z = <optimized out>
i = <optimized out>
n = <optimized out>
tmp = <optimized out>
cpuname = <optimized out>
#11 0x00000000004c26d1 in umr_vm_disasm (asic=asic at entry=0x1c08a50, vmid=1, addr=140741784417280, PC=PC at entry=0, size=<optimized out>, start_offset=start_offset at entry=0, wd=0x1c0b6b0) at /home/mikhail/packaging-work/umr/src/lib/umr_llvm_disasm.c:170
opcodes = 0x4599e30
x = <optimized out>
y = <optimized out>
--Type <RET> for more, q to quit, c to continue without paging--c
nwave = <optimized out>
wavehits = 0
opcode_strs = 0x7ffff7952010
pwd = <optimized out>
r = 0
#12 0x00000000004b998f in umr_dump_shaders (asic=asic at entry=0x1c08a50, decoder=decoder at entry=0x1c0b5d0, wd=wd at entry=0x1c0b6b0) at /home/mikhail/packaging-work/umr/src/lib/dump_ib.c:88
pshader = <optimized out>
shader = 0x1c094c0
#13 0x00000000004b056e in umr_read_ring (asic=asic at entry=0x1c08a50, ringpath=<optimized out>) at /home/mikhail/packaging-work/umr/src/app/ring_read.c:141
ringname = "gfx", '\000' <repeats 28 times>
from = ".", '\000' <repeats 30 times>
to = '\000' <repeats 31 times>
use_decoder = <optimized out>
enable_decoder = 1
wptr = 3072
rptr = 2044
drv_wptr = 3072
ringsize = 8192
start = 3076
end = 3072
value = <optimized out>
ring_data = <optimized out>
decoder = {pm = 4, src = {addr = 0, vmid = 0, ib_addr = 0}, pm4 = {cur_opcode = 34, pkt_type = 3, n_words = 4, cur_word = 0, control = 1074790400, next_ib_state = {ib_addr_lo = 0, ib_addr_hi = 0, ib_size = 0, ib_vmid = 0, tally = 0}, next_write_mem = {type = 570425344, addr_lo = 4082, addr_hi = 0, value = 0}, nop = {pktlen = 0, pkttype = 0, magic = 0, str = 0x0}}, sdma = {cur_opcode = 4294967295, cur_sub_opcode = 0, n_words = 0, cur_word = 0, header_dw = 0, next_write_mem = 0, next_ib_state = {ib_addr_lo = 0, ib_addr_hi = 0, csa_addr_lo = 0, csa_addr_hi = 0, ib_size = 0, ib_vmid = 0}}, next_ib = 0x1c0b410, next_ib_info = {ib_addr = 0, vm_base_addr = 0, vmid = 0, size = 0, addr = 768}, shader = 0x0}
pdecoder = 0x1c0b5d0
ppdecoder = <optimized out>
wd = 0x1c0b6b0
#14 0x0000000000496c68 in main (argc=<optimized out>, argv=<optimized out>) at /home/mikhail/packaging-work/umr/src/app/main.c:339
i = 3
j = <optimized out>
k = <optimized out>
l = <optimized out>
asic = 0x1c08a50
blockname = <optimized out>
str = <optimized out>
str2 = <optimized out>
asicname = "\000\000\000\000\004", '\000' <repeats 19 times>, "\001", '\000' <repeats 11 times>, "\004", '\000' <repeats 19 times>, "\a", '\000' <repeats 11 times>, "\004", '\000' <repeats 31 times>, "\004", '\000' <repeats 31 times>, "\004", '\000' <repeats 31 times>, "\004", '\000' <repeats 31 times>, "\004", '\000' <repeats 31 times>...
ipname = '\000' <repeats 56 times>, "-options", '\000' <repeats 120 times>, "\037", '\000' <repeats 31 times>...
regname = "\000\000\000\000\000 ", '\000' <repeats 18 times>, "\017\004", '\000' <repeats 11 times>, " ", '\000' <repeats 18 times>, "\220\377\377\377\377\377\377\377", '\000' <repeats 16 times>, "\031", '\000' <repeats 15 times>, "\a\000\000\000\000\000\000\000\037\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\030\220\275\001\000\000\000\000P\000\000\000\000\000\000\000\220\377\377\377\377\377\377\377\000\000\000\000\000\000\000\000\003\000\000\000w\000\000\000[\000\000\000\060", '\000' <repeats 27 times>, "n\000\000\000|", '\000' <repeats 19 times>...
req = {tv_sec = 0, tv_nsec = 7312272888393198945}
(gdb)
(gdb)
(gdb)
More information about the amd-gfx
mailing list