[PATCH 1/5] drm/amdkfd: svm range restore work deadlock when process exit
Philip Yang
Philip.Yang at amd.com
Fri Jan 14 20:32:52 UTC 2022
kfd_process_notifier_release flush svm_range_restore_work may cause
deadlock, because svm_range_restore_work calls
svm_range_list_lock_and_flush_work which takes mmap write lock. Move
flush svm_range_restore_work to kfd_process_wq_release to avoid
deadlock. Then svm_range_restore_work needs taking mm ref.
Signed-off-by: Philip Yang <Philip.Yang at amd.com>
Reported-by: Ruili Ji <ruili.ji at amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 1 -
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 14 ++++++++------
2 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d1145da5348f..74f162887d3b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1150,7 +1150,6 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work);
- cancel_delayed_work_sync(&p->svms.restore_work);
mutex_lock(&p->mutex);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index f2805ba74c80..37b3191615b6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1643,13 +1643,13 @@ static void svm_range_restore_work(struct work_struct *work)
pr_debug("restore svm ranges\n");
- /* kfd_process_notifier_release destroys this worker thread. So during
- * the lifetime of this thread, kfd_process and mm will be valid.
- */
p = container_of(svms, struct kfd_process, svms);
- mm = p->mm;
- if (!mm)
+ /* Avoid mm is gone when svm_range_validate_and_map ranges */
+ mm = get_task_mm(p->lead_thread);
+ if (!mm) {
+ pr_debug("svms 0x%p process mm gone\n", svms);
return;
+ }
svm_range_list_lock_and_flush_work(svms, mm);
mutex_lock(&svms->lock);
@@ -1703,6 +1703,7 @@ static void svm_range_restore_work(struct work_struct *work)
out_reschedule:
mutex_unlock(&svms->lock);
mmap_write_unlock(mm);
+ mmput(mm);
/* If validation failed, reschedule another attempt */
if (evicted_ranges) {
@@ -2830,6 +2831,8 @@ void svm_range_list_fini(struct kfd_process *p)
pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
+ cancel_delayed_work_sync(&p->svms.restore_work);
+
/* Ensure list work is finished before process is destroyed */
flush_work(&p->svms.deferred_list_work);
@@ -2840,7 +2843,6 @@ void svm_range_list_fini(struct kfd_process *p)
atomic_inc(&p->svms.drain_pagefaults);
svm_range_drain_retry_fault(&p->svms);
-
list_for_each_entry_safe(prange, next, &p->svms.list, list) {
svm_range_unlink(prange);
svm_range_remove_notifier(prange);
--
2.17.1
More information about the amd-gfx
mailing list