After a Vega 56/64 GPU hang I am unable to reboot the system

Mikhail Gavrilov mikhail.v.gavrilov at gmail.com
Thu Jan 10 15:22:56 UTC 2019


On Thu, 10 Jan 2019 at 00:36, Mikhail Gavrilov
<mikhail.v.gavrilov at gmail.com> wrote:
>
> All the new logs are attached here.
>
> Thanks.
>
> P.S. This time I had to terminate the command `./umr -O verbose,follow -R
> gfx[.] > gfx.log 2>&1` because it tried to write to the log indefinitely.
> I also had to terminate the command `./umr -O verbose,follow -R gfx[.] >
> gfx.log 2>&1` because it was stuck for a long time.
>
>

It became clear why umr got stuck at the gfx dump. I ran umr under gdb and
got a segfault at the point where umr had been getting stuck earlier.
Tom, are you there? Could you take a look at the attached backtrace?


--
Best Regards,
Mike Gavrilov.
-------------- next part --------------
   pgm[1 at 0x80010e520500 + 0x12c4] = 0x043a0f18		v_sub_f32_e32 v29, v24, v7                                 	
   pgm[1 at 0x80010e520500 + 0x12c8] = 0x0a3c3d0d		v_mul_f32_e32 v30, v13, v30                                	
   pgm[1 at 0x80010e520500 + 0x12cc] = 0x2c3c3b11		v_mac_f32_e32 v30, v17, v29                                	
   pgm[1 at 0x80010e520500 + 0x12d0] = 0x0238391e		v_add_f32_e32 v28, v30, v28                                	
   pgm[1 at 0x80010e520500 + 0x12d4] = 0xd105801c		v_mul_f32_e64 v28, v21, v28 clamp                          	
   pgm[1 at 0x80010e520500 + 0x12d8] = 0x00023915	;;                                                          	
   pgm[1 at 0x80010e520500 + 0x12dc] = 0xd1c1001e		v_mad_f32 v30, v28, v13, v6                                	
   pgm[1 at 0x80010e520500 + 0x12e0] = 0x041a1b1c	;;                                                          	
   pgm[1 at 0x80010e520500 + 0x12e4] = 0xd1c1001d		v_mad_f32 v29, v28, v17, v7                                	
   pgm[1 at 0x80010e520500 + 0x12e8] = 0x041e231c	;;                                                          	
   pgm[1 at 0x80010e520500 + 0x12ec] = 0x0432331e		v_sub_f32_e32 v25, v30, v25                                	
   pgm[1 at 0x80010e520500 + 0x12f0] = 0xd1c1001c		v_mad_f32 v28, v28, v10, v3                                	
   pgm[1 at 0x80010e520500 + 0x12f4] = 0x040e151c	;;                                                          	
   pgm[1 at 0x80010e520500 + 0x12f8] = 0x0430311d		v_sub_f32_e32 v24, v29, v24                                	
   pgm[1 at 0x80010e520500 + 0x12fc] = 0x0a323319		v_mul_f32_e32 v25, v25, v25                                	
   pgm[1 at 0x80010e520500 + 0x1300] = 0x0434351c		v_sub_f32_e32 v26, v28, v26                                	
   pgm[1 at 0x80010e520500 + 0x1304] = 0x2c323118		v_mac_f32_e32 v25, v24, v24                                	
   pgm[1 at 0x80010e520500 + 0x1308] = 0x2c32351a		v_mac_f32_e32 v25, v26, v26                                	
   pgm[1 at 0x80010e520500 + 0x130c] = 0x7c8c331b		v_cmp_ge_f32_e32 vcc, v27, v25                             	
   pgm[1 at 0x80010e520500 + 0x1310] = 0xd100001e		v_cndmask_b32_e64 v30, 0, -1, vcc                          	
   pgm[1 at 0x80010e520500 + 0x1314] = 0x01a98280	;;                                                          	
   pgm[1 at 0x80010e520500 + 0x1318] = 0x87fe007e		s_or_b64 exec, exec, s[0:1]                                	
   pgm[1 at 0x80010e520500 + 0x131c] = 0x26302e9f		v_and_b32_e32 v24, 31, v23                                 	
   pgm[1 at 0x80010e520500 + 0x1320] = 0xd1120018		v_lshlrev_b32_e64 v24, v24, 1                              	
   pgm[1 at 0x80010e520500 + 0x1324] = 0x00010318	;;                                                          	
   pgm[1 at 0x80010e520500 + 0x1328] = 0x7e32570e		v_not_b32_e32 v25, v14                                     	
   pgm[1 at 0x80010e520500 + 0x132c] = 0x26303318		v_and_b32_e32 v24, v24, v25                                	
   pgm[1 at 0x80010e520500 + 0x1330] = 0x7d9a3c80		v_cmp_ne_u32_e32 vcc, 0, v30                               	
   pgm[1 at 0x80010e520500 + 0x1334] = 0x00363080		v_cndmask_b32_e32 v27, 0, v24, vcc                         	
   pgm[1 at 0x80010e520500 + 0x1338] = 0x2a30370e		v_xor_b32_e32 v24, v14, v27                                	
   pgm[1 at 0x80010e520500 + 0x133c] = 0x682e2e81		v_add_u32_e32 v23, 1, v23                                  	
   pgm[1 at 0x80010e520500 + 0x1340] = 0x682c2c90		v_add_u32_e32 v22, 16, v22                                 	
   pgm[1 at 0x80010e520500 + 0x1344] = 0x7e320280		v_mov_b32_e32 v25, 0                                       	
   pgm[1 at 0x80010e520500 + 0x1348] = 0xbf820000		s_branch 0                                                 	
   pgm[1 at 0x80010e520500 + 0x134c] = 0x7d9a3280		v_cmp_ne_u32_e32 vcc, 0, v25                               	
   pgm[1 at 0x80010e520500 + 0x1350] = 0x86ea6a7e		s_and_b64 vcc, exec, vcc                                   	
   pgm[1 at 0x80010e520500 + 0x1354] = 0xbf86fd70		s_cbranch_vccz 64880                                       	
   pgm[1 at 0x80010e520500 + 0x1358] = 0xb0038000		s_movk_i32 s3, 0x8000                                      	
   pgm[1 at 0x80010e520500 + 0x135c] = 0xc00e0001		s_load_dwordx8 s[0:7], s[2:3], 0x0                         	
   pgm[1 at 0x80010e520500 + 0x1360] = 0x00000000	;;                                                          	
   pgm[1 at 0x80010e520500 + 0x1364] = 0x7e1e030e		v_mov_b32_e32 v15, v14                                     	
   pgm[1 at 0x80010e520500 + 0x1368] = 0x7e20030e		v_mov_b32_e32 v16, v14                                     	
   pgm[1 at 0x80010e520500 + 0x136c] = 0x7e22030e		v_mov_b32_e32 v17, v14                                     	
   pgm[1 at 0x80010e520500 + 0x1370] = 0xbf8cc07f		s_waitcnt lgkmcnt(0)                                       	
   pgm[1 at 0x80010e520500 + 0x1374] = 0xf0201f00		image_store v[14:17], v0, s[0:7] dmask:0xf unorm           	
   pgm[1 at 0x80010e520500 + 0x1378] = 0x00000e00	;;                                                          	
   pgm[1 at 0x80010e520500 + 0x137c] = 0xbf810000		s_endpgm                                                   	
End of disassembly.

Disassembly of shader 1 at 0x80010010b400 of length 161292 bytes from IB[1 at 0x80010e555000 + 0x1078]

Program received signal SIGSEGV, Segmentation fault.
0x0000000000494220 in llvm::MCAssembler::getAtom(llvm::MCSymbol const&) const [clone .cold.240] ()
(gdb) thread apply all bt full

Thread 1 (Thread 0x7ffff7a20740 (LWP 5975)):
#0  0x0000000000494220 in llvm::MCAssembler::getAtom(llvm::MCSymbol const&) const [clone .cold.240] ()
No symbol table info available.
#1  0x00007fff3fffffff in ?? ()
No symbol table info available.
#2  0x0000000000000902 in ?? ()
No symbol table info available.
#3  0x0000000000533a41 in llvm::AMDGPUDisassembler::decodeOperand_VReg_96(unsigned int) const ()
No symbol table info available.
#4  0x0000000000537243 in DecodeSReg_128RegisterClass(llvm::MCInst&, unsigned int, unsigned long, void const*) [clone .isra.60] ()
No symbol table info available.
#5  0x00000000005464b2 in llvm::MCDisassembler::DecodeStatus llvm::decodeToMCInst<unsigned long>(llvm::MCDisassembler::DecodeStatus, unsigned int, unsigned long, llvm::MCInst&, unsigned long, void const*, bool&) ()
No symbol table info available.
#6  0x000000000054e1ad in llvm::AMDGPUDisassembler::tryDecodeInst(unsigned char const*, llvm::MCInst&, unsigned long, unsigned long) const ()
No symbol table info available.
#7  0x000000000054ec0f in llvm::AMDGPUDisassembler::getInstruction(llvm::MCInst&, unsigned long&, llvm::ArrayRef<unsigned char>, unsigned long, llvm::raw_ostream&, llvm::raw_ostream&) const ()
No symbol table info available.
#8  0x00000000007d0a5e in LLVMDisasmInstruction ()
No symbol table info available.
#9  0x00000000004c24b1 in umr_llvm_disasm (disasm_text=0x7ffff7952010, PC=140741784417280, inst_bytes=161292, inst=0x4599e30 "\202", asic=<optimized out>) at /home/mikhail/packaging-work/umr/src/lib/umr_llvm_disasm.c:81
        i = 4532
        tmp = "\tv_madak_f32 v255, v255, v255, 0xfffffff0\000\071\070, attr49.z\000isn't aligned 3\000\061\065 offen offset:2521 glc\000\061 glc tfe ; Error: unknown operand encoding 125\000oding 125\000\000\000\200\000\000\000\321\002\000\000\340\023\000\000v\002\000\000\000\000\000\000N\000\000\000\000\000\000\000\t\000\000\000\000\000\000\000\274\000\000\000}\000\000\000"...
        cpuname = <optimized out>
        disasm_ref = 0x454a080
        x = 18128
        z = <optimized out>
        n = <optimized out>
        disasm_ref = <optimized out>
        x = <optimized out>
        z = <optimized out>
        i = <optimized out>
        n = <optimized out>
        tmp = <optimized out>
        cpuname = <optimized out>
#10 umr_llvm_disasm (asic=<optimized out>, inst=0x4599e30 "\202", inst_bytes=161292, PC=140741784417280, disasm_text=0x7ffff7952010) at /home/mikhail/packaging-work/umr/src/lib/umr_llvm_disasm.c:38
        disasm_ref = <optimized out>
        x = <optimized out>
        z = <optimized out>
        i = <optimized out>
        n = <optimized out>
        tmp = <optimized out>
        cpuname = <optimized out>
#11 0x00000000004c26d1 in umr_vm_disasm (asic=asic@entry=0x1c08a50, vmid=1, addr=140741784417280, PC=PC@entry=0, size=<optimized out>, start_offset=start_offset@entry=0, wd=0x1c0b6b0) at /home/mikhail/packaging-work/umr/src/lib/umr_llvm_disasm.c:170
        opcodes = 0x4599e30
        x = <optimized out>
        y = <optimized out>
--Type <RET> for more, q to quit, c to continue without paging--c
        nwave = <optimized out>
        wavehits = 0
        opcode_strs = 0x7ffff7952010
        pwd = <optimized out>
        r = 0
#12 0x00000000004b998f in umr_dump_shaders (asic=asic@entry=0x1c08a50, decoder=decoder@entry=0x1c0b5d0, wd=wd@entry=0x1c0b6b0) at /home/mikhail/packaging-work/umr/src/lib/dump_ib.c:88
        pshader = <optimized out>
        shader = 0x1c094c0
#13 0x00000000004b056e in umr_read_ring (asic=asic@entry=0x1c08a50, ringpath=<optimized out>) at /home/mikhail/packaging-work/umr/src/app/ring_read.c:141
        ringname = "gfx", '\000' <repeats 28 times>
        from = ".", '\000' <repeats 30 times>
        to = '\000' <repeats 31 times>
        use_decoder = <optimized out>
        enable_decoder = 1
        wptr = 3072
        rptr = 2044
        drv_wptr = 3072
        ringsize = 8192
        start = 3076
        end = 3072
        value = <optimized out>
        ring_data = <optimized out>
        decoder = {pm = 4, src = {addr = 0, vmid = 0, ib_addr = 0}, pm4 = {cur_opcode = 34, pkt_type = 3, n_words = 4, cur_word = 0, control = 1074790400, next_ib_state = {ib_addr_lo = 0, ib_addr_hi = 0, ib_size = 0, ib_vmid = 0, tally = 0}, next_write_mem = {type = 570425344, addr_lo = 4082, addr_hi = 0, value = 0}, nop = {pktlen = 0, pkttype = 0, magic = 0, str = 0x0}}, sdma = {cur_opcode = 4294967295, cur_sub_opcode = 0, n_words = 0, cur_word = 0, header_dw = 0, next_write_mem = 0, next_ib_state = {ib_addr_lo = 0, ib_addr_hi = 0, csa_addr_lo = 0, csa_addr_hi = 0, ib_size = 0, ib_vmid = 0}}, next_ib = 0x1c0b410, next_ib_info = {ib_addr = 0, vm_base_addr = 0, vmid = 0, size = 0, addr = 768}, shader = 0x0}
        pdecoder = 0x1c0b5d0
        ppdecoder = <optimized out>
        wd = 0x1c0b6b0
#14 0x0000000000496c68 in main (argc=<optimized out>, argv=<optimized out>) at /home/mikhail/packaging-work/umr/src/app/main.c:339
        i = 3
        j = <optimized out>
        k = <optimized out>
        l = <optimized out>
        asic = 0x1c08a50
        blockname = <optimized out>
        str = <optimized out>
        str2 = <optimized out>
        asicname = "\000\000\000\000\004", '\000' <repeats 19 times>, "\001", '\000' <repeats 11 times>, "\004", '\000' <repeats 19 times>, "\a", '\000' <repeats 11 times>, "\004", '\000' <repeats 31 times>, "\004", '\000' <repeats 31 times>, "\004", '\000' <repeats 31 times>, "\004", '\000' <repeats 31 times>, "\004", '\000' <repeats 31 times>...
        ipname = '\000' <repeats 56 times>, "-options", '\000' <repeats 120 times>, "\037", '\000' <repeats 31 times>...
        regname = "\000\000\000\000\000 ", '\000' <repeats 18 times>, "\017\004", '\000' <repeats 11 times>, " ", '\000' <repeats 18 times>, "\220\377\377\377\377\377\377\377", '\000' <repeats 16 times>, "\031", '\000' <repeats 15 times>, "\a\000\000\000\000\000\000\000\037\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\030\220\275\001\000\000\000\000P\000\000\000\000\000\000\000\220\377\377\377\377\377\377\377\000\000\000\000\000\000\000\000\003\000\000\000w\000\000\000[\000\000\000\060", '\000' <repeats 27 times>, "n\000\000\000|", '\000' <repeats 19 times>...
        req = {tv_sec = 0, tv_nsec = 7312272888393198945}
(gdb) 
(gdb) 
(gdb) 


More information about the amd-gfx mailing list