[PATCH 1/2] drm/amdgpu: fix gpu reset crash

Mon Apr 24 09:47:22 UTC 2017

Am 24.04.2017 um 11:40 schrieb Chunming Zhou:
> [  413.687439] BUG: unable to handle kernel NULL pointer dereference at 0000000000000548
> [  413.687479] IP: [<ffffffff8109b175>] to_live_kthread+0x5/0x60
> [  413.687507] PGD 1efd12067
> [  413.687519] PUD 1efd11067
> [  413.687531] PMD 0
>
> [  413.687543] Oops: 0000 [#1] SMP
> [  413.687557] Modules linked in: amdgpu(OE) ttm(OE) drm_kms_helper(E) drm(E) i2c_algo_bit(E) fb_sys_fops(E) syscopyarea(E) sysfillrect(E) sysimgblt(E) rpcsec_gss_krb5(E) nfsv4(E) nfs(E) fscache(E) snd_hda_codec_realtek(E) snd_hda_codec_generic(E) snd_hda_codec_hdmi(E) snd_hda_intel(E) eeepc_wmi(E) snd_hda_codec(E) asus_wmi(E) snd_hda_core(E) sparse_keymap(E) snd_hwdep(E) video(E) snd_pcm(E) snd_seq_midi(E) joydev(E) snd_seq_midi_event(E) snd_rawmidi(E) snd_seq(E) snd_seq_device(E) snd_timer(E) kvm(E) irqbypass(E) crct10dif_pclmul(E) snd(E) crc32_pclmul(E) ghash_clmulni_intel(E) soundcore(E) aesni_intel(E) aes_x86_64(E) lrw(E) gf128mul(E) glue_helper(E) ablk_helper(E) cryptd(E) shpchp(E) serio_raw(E) i2c_piix4(E) 8250_dw(E) i2c_designware_platform(E) i2c_designware_core(E) mac_hid(E) binfmt_misc(E)
> [  413.687894]  parport_pc(E) ppdev(E) lp(E) parport(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E) sunrpc(E) autofs4(E) hid_generic(E) usbhid(E) hid(E) psmouse(E) ahci(E) r8169(E) mii(E) libahci(E) wmi(E)
> [  413.687989] CPU: 13 PID: 1134 Comm: kworker/13:2 Tainted: G           OE   4.9.0-custom #4
> [  413.688019] Hardware name: System manufacturer System Product Name/PRIME B350-PLUS, BIOS 0606 04/06/2017
> [  413.688089] Workqueue: events amd_sched_job_timedout [amdgpu]
> [  413.688116] task: ffff88020f9657c0 task.stack: ffffc90001a88000
> [  413.688139] RIP: 0010:[<ffffffff8109b175>]  [<ffffffff8109b175>] to_live_kthread+0x5/0x60
> [  413.688171] RSP: 0018:ffffc90001a8bd60  EFLAGS: 00010282
> [  413.688191] RAX: ffff88020f0073f8 RBX: ffff88020f000000 RCX: 0000000000000000
> [  413.688217] RDX: 0000000000000001 RSI: ffff88020f9670c0 RDI: 0000000000000000
> [  413.688243] RBP: ffffc90001a8bd78 R08: 0000000000000000 R09: 0000000000001000
> [  413.688269] R10: 0000006051b11a82 R11: 0000000000000001 R12: 0000000000000000
> [  413.688295] R13: ffff88020f002770 R14: ffff88020f004838 R15: ffff8801b23c2c60
> [  413.688321] FS:  0000000000000000(0000) GS:ffff88021ef40000(0000) knlGS:0000000000000000
> [  413.688352] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  413.688373] CR2: 0000000000000548 CR3: 00000001efd0f000 CR4: 00000000003406e0
> [  413.688399] Stack:
> [  413.688407]  ffffffff8109b304 ffff88020f000000 0000000000000070 ffffc90001a8bdf0
> [  413.688439]  ffffffffa05ce29d ffffffffa052feb7 ffffffffa07b5820 ffffc90001a8bda0
> [  413.688470]  ffffffff00000018 ffff8801bb88f060 0000000001a8bdb8 ffff88021ef59280
> [  413.688502] Call Trace:
> [  413.688514]  [<ffffffff8109b304>] ? kthread_park+0x14/0x60
> [  413.688555]  [<ffffffffa05ce29d>] amdgpu_gpu_reset+0x7d/0x670 [amdgpu]
> [  413.688589]  [<ffffffffa052feb7>] ? drm_printk+0x97/0xa0 [drm]
> [  413.688643]  [<ffffffffa0698136>] amdgpu_job_timedout+0x46/0x50 [amdgpu]
> [  413.688700]  [<ffffffffa06969e7>] amd_sched_job_timedout+0x17/0x20 [amdgpu]
> [  413.688727]  [<ffffffff81095493>] process_one_work+0x153/0x3f0
> [  413.688751]  [<ffffffff81095c5b>] worker_thread+0x12b/0x4b0
> [  413.688773]  [<ffffffff8100392e>] ? do_syscall_64+0x6e/0x180
> [  413.688795]  [<ffffffff81095b30>] ? rescuer_thread+0x350/0x350
> [  413.688818]  [<ffffffff8100392e>] ? do_syscall_64+0x6e/0x180
> [  413.688839]  [<ffffffff8109b423>] kthread+0xd3/0xf0
> [  413.688858]  [<ffffffff8109b350>] ? kthread_park+0x60/0x60
> [  413.688881]  [<ffffffff817e1ee5>] ret_from_fork+0x25/0x30
> [  413.688901] Code: 25 40 d3 00 00 48 8b 80 48 05 00 00 48 89 e5 5d 48 8b 40 c8 48 c1 e8 02 83 e0 01 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 <48> 8b b7 48 05 00 00 55 48 89 e5 48 85 f6 74 31 8b 97 f8 18 00
> [  413.689045] RIP  [<ffffffff8109b175>] to_live_kthread+0x5/0x60
> [  413.689064]  RSP <ffffc90001a8bd60>
> [  413.689076] CR2: 0000000000000548
> [  413.697985] ---[ end trace 0a314a64821f84e9 ]---
>
> The root cause is that some rings, such as the KIQ ring, do not have a
> scheduler thread, so ring->sched.thread is NULL when amdgpu_gpu_reset()
> tries to park it.
>
> Change-Id: I420e84add9cdd9a7fd1f9921b8a5d0afa3dd2058
> Signed-off-by: Chunming Zhou <David1.Zhou at amd.com>

Reviewed-by: Christian König <christian.koenig at amd.com> for both.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++---
>   1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 9993085..168a9de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2675,7 +2675,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   		struct amdgpu_ring *ring = adev->rings[i];
>   
> -		if (!ring)
> +		if (!ring || !ring->sched.thread)
>   			continue;
>   		kcl_kthread_park(ring->sched.thread);
>   		amd_sched_hw_job_reset(&ring->sched);
> @@ -2770,7 +2770,8 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   		}
>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   			struct amdgpu_ring *ring = adev->rings[i];
> -			if (!ring)
> +
> +			if (!ring || !ring->sched.thread)
>   				continue;
>   
>   			amd_sched_job_recovery(&ring->sched);
> @@ -2779,7 +2780,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   	} else {
>   		dev_err(adev->dev, "asic resume failed (%d).\n", r);
>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> -			if (adev->rings[i]) {
> +			if (adev->rings[i] && adev->rings[i]->sched.thread) {
>   				kcl_kthread_unpark(adev->rings[i]->sched.thread);
>   			}
>   		}