[PATCH 1/1] drm/amdkfd: evict svm bo worker handle error
Felix Kuehling
felix.kuehling at amd.com
Mon Mar 14 19:58:30 UTC 2022
On 2022-03-14 at 10:50, Philip Yang wrote:
> Migrating vram to ram may fail to find the vma if the process is
> exiting and the vma has already been removed. Have the evict svm bo
> worker set prange->svm_bo to NULL and warn about svm_bo ref count != 1
> only if migrating vram to ram succeeded.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 27 +++++++++++++++++++-----
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 20 ++++++++++--------
> 2 files changed, 33 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> index 7f689094be5a..c8aef2fe459b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> @@ -638,6 +638,22 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
> return r;
> }
>
> +/**
> + * svm_migrate_vma_to_ram - migrate range inside one vma from device to system
> + *
> + * @adev: amdgpu device to migrate from
> + * @prange: svm range structure
> + * @vma: vm_area_struct that range [start, end] belongs to
> + * @start: range start virtual address in pages
> + * @end: range end virtual address in pages
> + *
> + * Context: Process context, caller must hold mmap read lock, svms lock, prange lock
> + *
> + * Return:
> + * 0 - success with all pages migrated
> + * negative values - indicate error
> + * positive values - partial migration, number of pages not migrated
> + */
> static long
> svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
> struct vm_area_struct *vma, uint64_t start, uint64_t end)
> @@ -709,8 +725,6 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
> pdd = svm_range_get_pdd_by_adev(prange, adev);
> if (pdd)
> WRITE_ONCE(pdd->page_out, pdd->page_out + cpages);
> -
> - return upages;
> }
> return r ? r : upages;
> }
> @@ -759,13 +773,16 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm)
> unsigned long next;
>
> vma = find_vma(mm, addr);
> - if (!vma || addr < vma->vm_start)
> + if (!vma || addr < vma->vm_start) {
> + pr_debug("failed to find vma for prange %p\n", prange);
> + r = -EFAULT;
> break;
> + }
>
> next = min(vma->vm_end, end);
> r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next);
> if (r < 0) {
> - pr_debug("failed %ld to migrate\n", r);
> + pr_debug("failed %ld to migrate prange %p\n", r, prange);
> break;
> } else {
> upages += r;
> @@ -773,7 +790,7 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm)
> addr = next;
> }
>
> - if (!upages) {
> + if (r >= 0 && !upages) {
> svm_range_vram_node_free(prange);
> prange->actual_loc = 0;
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 509d915cbe69..215943424c06 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -3155,6 +3155,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
> struct svm_range_bo *svm_bo;
> struct kfd_process *p;
> struct mm_struct *mm;
> + int r = 0;
>
> svm_bo = container_of(work, struct svm_range_bo, eviction_work);
> if (!svm_bo_ref_unless_zero(svm_bo))
> @@ -3170,7 +3171,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
>
> mmap_read_lock(mm);
> spin_lock(&svm_bo->list_lock);
> - while (!list_empty(&svm_bo->range_list)) {
> + while (!list_empty(&svm_bo->range_list) && !r) {
> struct svm_range *prange =
> list_first_entry(&svm_bo->range_list,
> struct svm_range, svm_bo_list);
> @@ -3184,15 +3185,15 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
>
> mutex_lock(&prange->migrate_mutex);
> do {
> - svm_migrate_vram_to_ram(prange,
> + r = svm_migrate_vram_to_ram(prange,
> svm_bo->eviction_fence->mm);
> - } while (prange->actual_loc && --retries);
> - WARN(prange->actual_loc, "Migration failed during eviction");
> -
> - mutex_lock(&prange->lock);
> - prange->svm_bo = NULL;
> - mutex_unlock(&prange->lock);
> + } while (!r && prange->actual_loc && --retries);
I think it would still be good to have a WARN here for cases other than
process termination. Is there a way to distinguish the process
termination case from the error code alone? Maybe we could even avoid
the retry in that case.
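
For example (an untested sketch on my side, not something this patch has
to adopt): instead of inferring the exit case from the error code, the
worker could take its own mm reference at the top of
svm_range_evict_svm_bo_worker() and treat an mmget_not_zero() failure as
process termination:

	mm = svm_bo->eviction_fence->mm;
	if (!mmget_not_zero(mm)) {
		/* mm_users is already zero, so the process is exiting
		 * and its VMAs may be gone.  Skip the migration retry
		 * loop and the WARN, just drop our svm_bo reference.
		 */
		svm_range_bo_unref(svm_bo);
		return;
	}

	/* ... existing per-prange migration loop; the WARN can then
	 * stay for genuine migration failures ...
	 */

	mmput(mm);

That way a failure from svm_migrate_vram_to_ram() while the mm is still
live would remain a real error worth warning about.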
Other than that, this patch is a good improvement on the error handling.
Regards,
Felix
>
> + if (!prange->actual_loc) {
> + mutex_lock(&prange->lock);
> + prange->svm_bo = NULL;
> + mutex_unlock(&prange->lock);
> + }
> mutex_unlock(&prange->migrate_mutex);
>
> spin_lock(&svm_bo->list_lock);
> @@ -3201,10 +3202,11 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
> mmap_read_unlock(mm);
>
> dma_fence_signal(&svm_bo->eviction_fence->base);
> +
> /* This is the last reference to svm_bo, after svm_range_vram_node_free
> * has been called in svm_migrate_vram_to_ram
> */
> - WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
> + WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
> svm_range_bo_unref(svm_bo);
> }
>