[PATCH] drm/amdkfd: fix NULL pointer dereference

vitaly.prosyak at amd.com vitaly.prosyak at amd.com
Sat Apr 13 18:07:06 UTC 2024


From: Vitaly Prosyak <vitaly.prosyak at amd.com>

[  +0.006038] BUG: kernel NULL pointer dereference, address: 0000000000000028
[  +0.006969] #PF: supervisor read access in kernel mode
[  +0.005139] #PF: error_code(0x0000) - not-present page
[  +0.005139] PGD 0 P4D 0
[  +0.002530] Oops: 0000 [#1] PREEMPT SMP NOPTI
[  +0.004356] CPU: 11 PID: 12625 Comm: kworker/11:0 Tainted: G        W          6.7.0+ #2
[  +0.008097] Hardware name: ASUS System Product Name/Pro WS WRX80E-SAGE SE WIFI II, BIOS 1302 12/08/2023
[  +0.009398] Workqueue: events evict_process_worker [amdgpu]
[  +0.005750] RIP: 0010:evict_process_worker+0x2f/0x460 [amdgpu]
[  +0.005991] Code: 55 48 89 e5 41 57 41 56 4c 8d b7 a8 fc ff ff 41 55 41 54 53 48 89 fb 48 83 ec 10 0f 1f 44 00 00 48 8b 43 f8 8b 93 b0 00 00 00 <48> 3b 50 28 0f 85 50 03 00 00 48 8d 7b 58 e8 ee be cb bf 48 8b 05
[  +0.018791] RSP: 0018:ffffc90009a2be10 EFLAGS: 00010282
[  +0.005226] RAX: 0000000000000000 RBX: ffff888197ffc358 RCX: 0000000000000000
[  +0.007140] RDX: 0000000000000a1b RSI: 0000000000000000 RDI: ffff888197ffc358
[  +0.007139] RBP: ffffc90009a2be48 R08: 0000000000000000 R09: 0000000000000000
[  +0.007139] R10: 0000000000000000 R11: 0000000000000000 R12: ffff888197ffc358
[  +0.007139] R13: ffff888100153a00 R14: ffff888197ffc000 R15: ffff888100153a05
[  +0.007137] FS:  0000000000000000(0000) GS:ffff889facac0000(0000) knlGS:0000000000000000
[  +0.008094] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  +0.005747] CR2: 0000000000000028 CR3: 000000010d1fc001 CR4: 0000000000770ef0
[  +0.007138] PKRU: 55555554
[  +0.002702] Call Trace:
[  +0.002443]  <TASK>
[  +0.002096]  ? show_regs+0x72/0x90
[  +0.003402]  ? __die+0x25/0x80
[  +0.003052]  ? page_fault_oops+0x154/0x4c0
[  +0.004099]  ? do_user_addr_fault+0x30e/0x6e0
[  +0.004357]  ? psi_group_change+0x237/0x520
[  +0.004185]  ? exc_page_fault+0x84/0x1b0
[  +0.003926]  ? asm_exc_page_fault+0x27/0x30
[  +0.004187]  ? evict_process_worker+0x2f/0x460 [amdgpu]
[  +0.005377]  process_one_work+0x17b/0x360
[  +0.004011]  ? __pfx_worker_thread+0x10/0x10
[  +0.004269]  worker_thread+0x307/0x430
[  +0.003748]  ? __pfx_worker_thread+0x10/0x10
[  +0.004268]  kthread+0xf7/0x130
[  +0.003142]  ? __pfx_kthread+0x10/0x10
[  +0.003749]  ret_from_fork+0x46/0x70
[  +0.003573]  ? __pfx_kthread+0x10/0x10
[  +0.003747]  ret_from_fork_asm+0x1b/0x30
[  +0.003924]  </TASK>

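When running stress tests, the eviction fence (p->ef) can be NULL by the
time evict_process_worker() runs. Reading p->ef->seqno to compare it with
last_eviction_seqno then dereferences a NULL pointer, as shown above.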

Check p->ef before comparing sequence numbers, and avoid calling
dma_fence_signal() and dma_fence_put() with a NULL fence instead of
relying on parameter checking in the DMA fence API.

Cc: Alex Deucher <alexander.deucher at amd.com>
Cc: Christian Koenig <christian.koenig at amd.com>
Cc: Xiaogang Chen <xiaogang.chen at amd.com>
Cc: Felix Kuehling <felix.kuehling at amd.com>
Signed-off-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index eb380296017d..a15fae1c398a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -2118,7 +2118,7 @@ static void evict_process_worker(struct work_struct *work)
 	 */
 	p = container_of(dwork, struct kfd_process, eviction_work);
 	trace_kfd_evict_process_worker_start(p);
-	WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
+	WARN_ONCE(p->ef && p->last_eviction_seqno != p->ef->seqno,
 		  "Eviction fence mismatch\n");
 
 	/* Narrow window of overlap between restore and evict work
@@ -2134,9 +2134,11 @@ static void evict_process_worker(struct work_struct *work)
 	pr_debug("Started evicting pasid 0x%x\n", p->pasid);
 	ret = kfd_process_evict_queues(p, false, KFD_QUEUE_EVICTION_TRIGGER_TTM);
 	if (!ret) {
-		dma_fence_signal(p->ef);
-		dma_fence_put(p->ef);
-		p->ef = NULL;
+		if (p->ef) {
+			dma_fence_signal(p->ef);
+			dma_fence_put(p->ef);
+			p->ef = NULL;
+		}
 
 		if (!kfd_process_unmap_doorbells_if_idle(p))
 			kfd_process_schedule_restore(p);
-- 
2.25.1



More information about the amd-gfx mailing list