[PATCH 1/3] drm/amdkfd: process exit and retry fault race

Felix Kuehling felix.kuehling at amd.com
Wed Nov 17 23:18:48 UTC 2021


On 2021-11-16 10:43 p.m., Philip Yang wrote:
> The kfd process mmu release notifier callback drains retry faults to
> ensure no retry fault comes in after the kfd process is removed from the
> hash table; otherwise the svm page fault handler will fail to recover the
> fault and will dump a GPU vm fault log.
>
> Draining retry faults needs to flush the restore page fault work to wait
> until the last fault is handled, because the IH dispatcher increases rptr
> first and then calls restore_pages, so restore_pages may still be handling
> the last fault while amdgpu_ih_has_checkpoint_processed already returns
> true.

This fixes the problem, but it will result in waiting longer than 
necessary because the worker only finishes when the IH ring is empty.
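
For reference, a minimal sketch of the drain sequence this patch adds,
condensed from the hunk below (names are taken from the diff; the helper
name and the omission of the per-GPU loop are mine):

static void drain_retry_fault_one_gpu(struct amdgpu_device *adev)
{
	/* Wait until the IH ring has processed past a checkpoint taken now. */
	amdgpu_ih_wait_on_checkpoint_process(adev, &adev->irq.ih1);

	/*
	 * The dispatcher advances rptr before calling restore_pages, so the
	 * checkpoint can look processed while the last fault is still being
	 * handled.  flush_work() closes that window, but it only returns once
	 * ih1_work has drained the whole ring, hence the extra waiting.
	 */
	flush_work(&adev->irq.ih1_work);
}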


>
> restore_pages cannot call mmget because the matching mmput may call the
> mmu notifier release and cause a deadlock.

See my comment inline.
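
For context, a rough sketch of the deadlock the commit message alludes to,
assuming restore_pages runs from the ih1 interrupt worker (my reading of the
call chain, not copied from the code):

	/*
	 *   ih1_work
	 *     svm_range_restore_pages()
	 *       mmput(mm)                       // drops the last mm_users ref
	 *         mmu_notifier_release()
	 *           kfd_process_notifier_release()
	 *             svm_range_drain_retry_fault()
	 *               flush_work(&adev->irq.ih1_work)  // waits on the very
	 *                                                // worker we run in
	 */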


>
> Refactor the deferred list work to call mmget and take the mmap write lock
> to handle all ranges, avoiding the mm going away while inserting the mmu
> notifier.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c |  6 +++
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 69 ++++++++++++------------
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.h     |  1 +
>   3 files changed, 41 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index d4c8a6948a9f..8b4b045d5c92 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1143,6 +1143,12 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
>   	if (WARN_ON(p->mm != mm))
>   		return;
>   
> +	/*
> +	 * Ensure no retry fault comes in afterwards, as page fault handler will
> +	 * not find kfd process and take mm lock to recover fault.
> +	 */
> +	svm_range_drain_retry_fault(&p->svms);
> +
>   	mutex_lock(&kfd_processes_mutex);
>   	hash_del_rcu(&p->kfd_processes);
>   	mutex_unlock(&kfd_processes_mutex);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 88360f23eb61..c1f367934428 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1953,9 +1953,10 @@ svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange)
>   	}
>   }
>   
> -static void svm_range_drain_retry_fault(struct svm_range_list *svms)
> +void svm_range_drain_retry_fault(struct svm_range_list *svms)
>   {
>   	struct kfd_process_device *pdd;
> +	struct amdgpu_device *adev;
>   	struct kfd_process *p;
>   	uint32_t i;
>   
> @@ -1967,9 +1968,11 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
>   			continue;
>   
>   		pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
> +		adev = pdd->dev->adev;
> +		amdgpu_ih_wait_on_checkpoint_process(adev, &adev->irq.ih1);
>   
> -		amdgpu_ih_wait_on_checkpoint_process(pdd->dev->adev,
> -						     &pdd->dev->adev->irq.ih1);
> +		/* Wait for the last page fault is handled */
> +		flush_work(&adev->irq.ih1_work);
>   		pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
>   	}
>   }
> @@ -1979,43 +1982,43 @@ static void svm_range_deferred_list_work(struct work_struct *work)
>   	struct svm_range_list *svms;
>   	struct svm_range *prange;
>   	struct mm_struct *mm;
> +	struct kfd_process *p;
>   
>   	svms = container_of(work, struct svm_range_list, deferred_list_work);
>   	pr_debug("enter svms 0x%p\n", svms);
>   
> +	p = container_of(svms, struct kfd_process, svms);
> +	mm = p->mm;
> +
> +	/* Take mm->mm_users to avoid mm is gone when inserting mmu notifier */
> +	if (!mm || !mmget_not_zero(mm)) {

get_task_mm would be safer than relying on p->mm. I regret ever adding 
that to the process structure.
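
A minimal sketch of that alternative for the deferred worker, reusing the
get_task_mm(p->lead_thread) pattern this patch removes from
svm_range_restore_pages further down (illustrative only):

	p = container_of(svms, struct kfd_process, svms);
	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_debug("svms 0x%p process mm gone\n", svms);
		return;
	}

	/* ... handle the deferred list under mmap_write_lock(mm) ... */

	mmput(mm);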


> +		pr_debug("svms 0x%p process mm gone\n", svms);
> +		return;
> +	}
> +retry:
> +	mmap_write_lock(mm);
> +
> +	/* Checking for the need to drain retry faults must be inside
> +	 * mmap write lock to serialize with munmap notifiers.
> +	 */
> +	if (unlikely(READ_ONCE(svms->drain_pagefaults))) {
> +		WRITE_ONCE(svms->drain_pagefaults, false);
> +		mmap_write_unlock(mm);
> +		svm_range_drain_retry_fault(svms);
> +		goto retry;
> +	}
> +
>   	spin_lock(&svms->deferred_list_lock);
>   	while (!list_empty(&svms->deferred_range_list)) {
>   		prange = list_first_entry(&svms->deferred_range_list,
>   					  struct svm_range, deferred_list);
> +		list_del_init(&prange->deferred_list);
>   		spin_unlock(&svms->deferred_list_lock);
> +
>   		pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
>   			 prange->start, prange->last, prange->work_item.op);
>   
> -		mm = prange->work_item.mm;
> -retry:
> -		mmap_write_lock(mm);
>   		mutex_lock(&svms->lock);
> -
> -		/* Checking for the need to drain retry faults must be in
> -		 * mmap write lock to serialize with munmap notifiers.
> -		 *
> -		 * Remove from deferred_list must be inside mmap write lock,
> -		 * otherwise, svm_range_list_lock_and_flush_work may hold mmap
> -		 * write lock, and continue because deferred_list is empty, then
> -		 * deferred_list handle is blocked by mmap write lock.
> -		 */
> -		spin_lock(&svms->deferred_list_lock);
> -		if (unlikely(svms->drain_pagefaults)) {
> -			svms->drain_pagefaults = false;
> -			spin_unlock(&svms->deferred_list_lock);
> -			mutex_unlock(&svms->lock);
> -			mmap_write_unlock(mm);
> -			svm_range_drain_retry_fault(svms);
> -			goto retry;
> -		}
> -		list_del_init(&prange->deferred_list);
> -		spin_unlock(&svms->deferred_list_lock);
> -
>   		mutex_lock(&prange->migrate_mutex);
>   		while (!list_empty(&prange->child_list)) {
>   			struct svm_range *pchild;
> @@ -2031,12 +2034,13 @@ static void svm_range_deferred_list_work(struct work_struct *work)
>   
>   		svm_range_handle_list_op(svms, prange);
>   		mutex_unlock(&svms->lock);
> -		mmap_write_unlock(mm);
>   
>   		spin_lock(&svms->deferred_list_lock);
>   	}
>   	spin_unlock(&svms->deferred_list_lock);
>   
> +	mmap_write_unlock(mm);
> +	mmput(mm);
>   	pr_debug("exit svms 0x%p\n", svms);
>   }
>   
> @@ -2600,12 +2604,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>   
>   	pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
>   
> -	mm = get_task_mm(p->lead_thread);
> -	if (!mm) {
> -		pr_debug("svms 0x%p failed to get mm\n", svms);
> -		r = -ESRCH;
> -		goto out;
> -	}
> +	/* mm is available because kfd_process_notifier_release drain fault */

This is not a valid assumption because the mm_users count is 0 when the 
notifier_release runs. So you can't rely on the mm being usable here 
while you're draining faults in notifier_release.

A better way to avoid the deadlock would be to drain faults not in 
notifier_release, but in kfd_process_wq_release.

Regards,
   Felix
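
A rough sketch of what that could look like, purely illustrative (the
release_work field name and the placement inside the real
kfd_process_wq_release are assumptions on my part):

static void kfd_process_wq_release(struct work_struct *work)
{
	struct kfd_process *p = container_of(work, struct kfd_process,
					     release_work);

	/*
	 * This runs from the process release worker, not from
	 * mmu_notifier_release, so it cannot be reached via mmput() in the
	 * IH worker and flushing ih1_work cannot self-deadlock.
	 */
	svm_range_drain_retry_fault(&p->svms);

	/* ... existing process teardown continues here ... */
}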


> +	mm = p->mm;
>   
>   	mmap_read_lock(mm);
>   retry_write_locked:
> @@ -2708,7 +2708,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>   
>   	svm_range_count_fault(adev, p, gpuidx);
>   
> -	mmput(mm);
>   out:
>   	kfd_unref_process(p);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index 6dc91c33e80f..0a8bcdb3dddf 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -189,6 +189,7 @@ void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm,
>   struct kfd_process_device *
>   svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev);
>   void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_struct *mm);
> +void svm_range_drain_retry_fault(struct svm_range_list *svms);
>   
>   /* SVM API and HMM page migration work together, device memory type
>    * is initialized to not 0 when page migration register device memory.
