[PATCH 1/3] drm/amdkfd: process exit and retry fault race
Felix Kuehling
felix.kuehling at amd.com
Wed Nov 17 23:18:48 UTC 2021
On 2021-11-16 10:43 p.m., Philip Yang wrote:
> The kfd process mmu release notifier callback drains retry faults to
> ensure no retry fault comes in after the kfd process is removed from the
> hash table; otherwise the svm page fault handler fails to recover the
> fault and dumps a GPU vm fault log.
>
> Draining retry faults needs to flush the restore page fault work to wait
> for the last fault to be handled, because IH dispatch advances rptr first
> and then calls restore_pages, so restore_pages may still be handling the
> last fault while amdgpu_ih_has_checkpoint_processed already returns true.
This fixes the problem, but it will result in waiting longer than
necessary because the worker only finishes when the IH ring is empty.
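To make the window concrete, this is roughly the ordering in the IH dispatch
path that the commit message describes (a paraphrased, standalone sketch with
illustrative types and names, not the real amdgpu code):

/* Illustrative sketch: a checkpoint keyed on rptr can pass before the
 * handler for the checkpointed entry has finished.
 */
#include <stdbool.h>
#include <stdio.h>

struct ih_ring { unsigned int rptr, wptr, checkpoint; };

static bool checkpoint_processed(const struct ih_ring *ih)
{
	/* conceptually what amdgpu_ih_has_checkpoint_processed tests */
	return ih->rptr >= ih->checkpoint;
}

static void handle_retry_fault(unsigned int entry)
{
	printf("recovering fault for IH entry %u\n", entry);
}

static void dispatch_one(struct ih_ring *ih)
{
	unsigned int entry = ih->rptr;

	ih->rptr++;			/* decode advances rptr first...      */

	if (checkpoint_processed(ih))	/* ...so a concurrent checkpoint wait */
		printf("checkpoint passed, handler still pending\n");

	handle_retry_fault(entry);	/* ...returns before this completes   */
}

int main(void)
{
	struct ih_ring ih = { .rptr = 0, .wptr = 1, .checkpoint = 1 };

	dispatch_one(&ih);
	return 0;
}

So the checkpoint wait alone is not enough, which is why the patch adds the
flush_work(); the downside is the over-waiting noted above.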
>
> restore_pages cannot call mmget because mmput may call the mmu notifier
> release and cause a deadlock.
See my comment inline.
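For reference, the deadlock that paragraph refers to looks roughly like this
(a simplified, annotated fragment of the old restore_pages pattern; it assumes
the retry fault is being processed in the ih1 worker, which is exactly what
svm_range_drain_retry_fault now flushes):

	mm = get_task_mm(p->lead_thread);
	if (!mm)
		return -ESRCH;

	/* ... recover the fault under mmap_read_lock(mm) ... */

	/*
	 * If this drops the last mm_users reference, mmput() ends up in
	 * exit_mmap() -> mmu_notifier_release() ->
	 * kfd_process_notifier_release() -> svm_range_drain_retry_fault(),
	 * which flush_work()s the very IH worker this code is running in.
	 */
	mmput(mm);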
>
> Refactor the deferred list work to call mmget and take the mmap write
> lock once to handle all ranges, to avoid the mm going away while
> inserting the mmu notifier.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +++
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 69 ++++++++++++------------
> drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 1 +
> 3 files changed, 41 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index d4c8a6948a9f..8b4b045d5c92 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1143,6 +1143,12 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
> if (WARN_ON(p->mm != mm))
> return;
>
> + /*
> + * Ensure no retry fault comes in afterwards, as the page fault handler
> + * will not find the kfd process to take the mm lock and recover the fault.
> + */
> + svm_range_drain_retry_fault(&p->svms);
> +
> mutex_lock(&kfd_processes_mutex);
> hash_del_rcu(&p->kfd_processes);
> mutex_unlock(&kfd_processes_mutex);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 88360f23eb61..c1f367934428 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1953,9 +1953,10 @@ svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange)
> }
> }
>
> -static void svm_range_drain_retry_fault(struct svm_range_list *svms)
> +void svm_range_drain_retry_fault(struct svm_range_list *svms)
> {
> struct kfd_process_device *pdd;
> + struct amdgpu_device *adev;
> struct kfd_process *p;
> uint32_t i;
>
> @@ -1967,9 +1968,11 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
> continue;
>
> pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
> + adev = pdd->dev->adev;
> + amdgpu_ih_wait_on_checkpoint_process(adev, &adev->irq.ih1);
>
> - amdgpu_ih_wait_on_checkpoint_process(pdd->dev->adev,
> - &pdd->dev->adev->irq.ih1);
> + /* Wait for the last page fault to be handled */
> + flush_work(&adev->irq.ih1_work);
> pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
> }
> }
> @@ -1979,43 +1982,43 @@ static void svm_range_deferred_list_work(struct work_struct *work)
> struct svm_range_list *svms;
> struct svm_range *prange;
> struct mm_struct *mm;
> + struct kfd_process *p;
>
> svms = container_of(work, struct svm_range_list, deferred_list_work);
> pr_debug("enter svms 0x%p\n", svms);
>
> + p = container_of(svms, struct kfd_process, svms);
> + mm = p->mm;
> +
> + /* Take mm->mm_users to avoid the mm going away when inserting the mmu notifier */
> + if (!mm || !mmget_not_zero(mm)) {
get_task_mm would be safer than relying on p->mm. I regret ever adding
that to the process structure.
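In other words, something along these lines at the top of the worker (a sketch
of that suggestion only, mirroring the pattern the old restore_pages code used):

	/* Pin the mm through the lead thread instead of trusting the cached
	 * p->mm pointer.
	 */
	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_debug("svms 0x%p process mm gone\n", svms);
		return;
	}

	/* ... process the deferred list as below ... */

	mmput(mm);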
> + pr_debug("svms 0x%p process mm gone\n", svms);
> + return;
> + }
> +retry:
> + mmap_write_lock(mm);
> +
> + /* Checking for the need to drain retry faults must be inside
> + * mmap write lock to serialize with munmap notifiers.
> + */
> + if (unlikely(READ_ONCE(svms->drain_pagefaults))) {
> + WRITE_ONCE(svms->drain_pagefaults, false);
> + mmap_write_unlock(mm);
> + svm_range_drain_retry_fault(svms);
> + goto retry;
> + }
> +
> spin_lock(&svms->deferred_list_lock);
> while (!list_empty(&svms->deferred_range_list)) {
> prange = list_first_entry(&svms->deferred_range_list,
> struct svm_range, deferred_list);
> + list_del_init(&prange->deferred_list);
> spin_unlock(&svms->deferred_list_lock);
> +
> pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
> prange->start, prange->last, prange->work_item.op);
>
> - mm = prange->work_item.mm;
> -retry:
> - mmap_write_lock(mm);
> mutex_lock(&svms->lock);
> -
> - /* Checking for the need to drain retry faults must be in
> - * mmap write lock to serialize with munmap notifiers.
> - *
> - * Remove from deferred_list must be inside mmap write lock,
> - * otherwise, svm_range_list_lock_and_flush_work may hold mmap
> - * write lock, and continue because deferred_list is empty, then
> - * deferred_list handle is blocked by mmap write lock.
> - */
> - spin_lock(&svms->deferred_list_lock);
> - if (unlikely(svms->drain_pagefaults)) {
> - svms->drain_pagefaults = false;
> - spin_unlock(&svms->deferred_list_lock);
> - mutex_unlock(&svms->lock);
> - mmap_write_unlock(mm);
> - svm_range_drain_retry_fault(svms);
> - goto retry;
> - }
> - list_del_init(&prange->deferred_list);
> - spin_unlock(&svms->deferred_list_lock);
> -
> mutex_lock(&prange->migrate_mutex);
> while (!list_empty(&prange->child_list)) {
> struct svm_range *pchild;
> @@ -2031,12 +2034,13 @@ static void svm_range_deferred_list_work(struct work_struct *work)
>
> svm_range_handle_list_op(svms, prange);
> mutex_unlock(&svms->lock);
> - mmap_write_unlock(mm);
>
> spin_lock(&svms->deferred_list_lock);
> }
> spin_unlock(&svms->deferred_list_lock);
>
> + mmap_write_unlock(mm);
> + mmput(mm);
> pr_debug("exit svms 0x%p\n", svms);
> }
>
> @@ -2600,12 +2604,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>
> pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
>
> - mm = get_task_mm(p->lead_thread);
> - if (!mm) {
> - pr_debug("svms 0x%p failed to get mm\n", svms);
> - r = -ESRCH;
> - goto out;
> - }
> + /* mm is available because kfd_process_notifier_release drains faults */
This is not a valid assumption because the mm_users count is 0 when the
notifier_release runs. So you can't rely on the mm being usable here
while you're draining faults in notifier_release.
A better way to avoid the deadlock would be to drain faults not in
notifier_release, but in kfd_process_wq_release.
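Roughly like this (a placement sketch only, untested, assuming
svm_range_drain_retry_fault is exported as this patch already does):

static void kfd_process_wq_release(struct work_struct *work)
{
	struct kfd_process *p = container_of(work, struct kfd_process,
					     release_work);

	/*
	 * Draining here, from the release worker, means the mmu notifier
	 * release callback no longer has to flush the IH worker, so an
	 * mmput() in the fault handler can no longer deadlock against it.
	 */
	svm_range_drain_retry_fault(&p->svms);

	/* ... existing teardown (svm_range_list_fini(), etc.) ... */
}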
Regards,
Felix
> + mm = p->mm;
>
> mmap_read_lock(mm);
> retry_write_locked:
> @@ -2708,7 +2708,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>
> svm_range_count_fault(adev, p, gpuidx);
>
> - mmput(mm);
> out:
> kfd_unref_process(p);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index 6dc91c33e80f..0a8bcdb3dddf 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -189,6 +189,7 @@ void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm,
> struct kfd_process_device *
> svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev);
> void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_struct *mm);
> +void svm_range_drain_retry_fault(struct svm_range_list *svms);
>
> /* SVM API and HMM page migration work together, device memory type
> * is initialized to not 0 when page migration register device memory.