[PATCH V2 1/1] drm/amdkfd: evict svm bo worker handle error

Felix Kuehling felix.kuehling at amd.com
Tue Mar 15 15:16:36 UTC 2022


Am 2022-03-15 um 11:05 schrieb philip yang:
> Migrate vram to ram may fail to find the vma if process is exiting
> and vma is removed, evict svm bo worker sets prange->svm_bo to NULL
> and warn svm_bo ref count != 1 only if migrating vram to ram
> successfully.
>
> Signed-off-by: Philip Yang<Philip.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 28 +++++++++++++++++++-----
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 21 +++++++++++-------
>   2 files changed, 36 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> index 7f689094be5a..7187417273f9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> @@ -638,6 +638,23 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>   	return r;
>   }
>   
> +/**
> + * svm_migrate_vma_to_ram - migrate range inside one vma from device to system
> + *
> + * @adev: amdgpu device to migrate from
> + * @prange: svm range structure
> + * @vma: vm_area_struct that range [start, end] belongs to
> + * @start: range start virtual address in pages
> + * @end: range end virtual address in pages
> + *
> + * Context: Process context, caller hold mmap read lock, svms lock except calling
> + *          from svm_range_evict_svm_bo_worker, prange->migrate_mutex

This doesn't make a lot of sense to me. This function either requires 
that svms->lock is held, or it doesn't. If svm_range_evict_svm_bo_worker 
functions without holding that lock, it means this function doesn't 
depend on it. I'd just remove the "svms lock ..." from the comment. Also 
remove it from the comment above svm_migrate_vram_to_ram for consistency.

With that fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


> + *
> + * Return:
> + *   0 - success with all pages migrated
> + *   negative values - indicate error
> + *   positive values - partial migration, number of pages not migrated
> + */
>   static long
>   svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>   		       struct vm_area_struct *vma, uint64_t start, uint64_t end)
> @@ -709,8 +726,6 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>   		pdd = svm_range_get_pdd_by_adev(prange, adev);
>   		if (pdd)
>   			WRITE_ONCE(pdd->page_out, pdd->page_out + cpages);
> -
> -		return upages;
>   	}
>   	return r ? r : upages;
>   }
> @@ -759,13 +774,16 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm)
>   		unsigned long next;
>   
>   		vma = find_vma(mm, addr);
> -		if (!vma || addr < vma->vm_start)
> +		if (!vma || addr < vma->vm_start) {
> +			pr_debug("failed to find vma for prange %p\n", prange);
> +			r = -EFAULT;
>   			break;
> +		}
>   
>   		next = min(vma->vm_end, end);
>   		r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next);
>   		if (r < 0) {
> -			pr_debug("failed %ld to migrate\n", r);
> +			pr_debug("failed %ld to migrate prange %p\n", r, prange);
>   			break;
>   		} else {
>   			upages += r;
> @@ -773,7 +791,7 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm)
>   		addr = next;
>   	}
>   
> -	if (!upages) {
> +	if (r >= 0 && !upages) {
>   		svm_range_vram_node_free(prange);
>   		prange->actual_loc = 0;
>   	}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 509d915cbe69..3b8856b4cece 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -3155,6 +3155,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
>   	struct svm_range_bo *svm_bo;
>   	struct kfd_process *p;
>   	struct mm_struct *mm;
> +	int r = 0;
>   
>   	svm_bo = container_of(work, struct svm_range_bo, eviction_work);
>   	if (!svm_bo_ref_unless_zero(svm_bo))
> @@ -3170,7 +3171,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
>   
>   	mmap_read_lock(mm);
>   	spin_lock(&svm_bo->list_lock);
> -	while (!list_empty(&svm_bo->range_list)) {
> +	while (!list_empty(&svm_bo->range_list) && !r) {
>   		struct svm_range *prange =
>   				list_first_entry(&svm_bo->range_list,
>   						struct svm_range, svm_bo_list);
> @@ -3184,15 +3185,18 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
>   
>   		mutex_lock(&prange->migrate_mutex);
>   		do {
> -			svm_migrate_vram_to_ram(prange,
> +			r = svm_migrate_vram_to_ram(prange,
>   						svm_bo->eviction_fence->mm);
> -		} while (prange->actual_loc && --retries);
> -		WARN(prange->actual_loc, "Migration failed during eviction");
> +		} while (!r && prange->actual_loc && --retries);
>   
> -		mutex_lock(&prange->lock);
> -		prange->svm_bo = NULL;
> -		mutex_unlock(&prange->lock);
> +		if (!r && prange->actual_loc)
> +			pr_info_once("Migration failed during eviction");
>   
> +		if (!prange->actual_loc) {
> +			mutex_lock(&prange->lock);
> +			prange->svm_bo = NULL;
> +			mutex_unlock(&prange->lock);
> +		}
>   		mutex_unlock(&prange->migrate_mutex);
>   
>   		spin_lock(&svm_bo->list_lock);
> @@ -3201,10 +3205,11 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
>   	mmap_read_unlock(mm);
>   
>   	dma_fence_signal(&svm_bo->eviction_fence->base);
> +
>   	/* This is the last reference to svm_bo, after svm_range_vram_node_free
>   	 * has been called in svm_migrate_vram_to_ram
>   	 */
> -	WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
> +	WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
>   	svm_range_bo_unref(svm_bo);
>   }
>   
> -- 2.35.1


More information about the amd-gfx mailing list