<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<p><br>
</p>
<div class="moz-cite-prefix">On 2022-03-14 3:58 p.m., Felix Kuehling
wrote:<br>
</div>
<blockquote type="cite" cite="mid:486b326f-8842-5de4-07ad-7ab1f209132e@amd.com">Am
2022-03-14 um 10:50 schrieb Philip Yang:
<br>
<blockquote type="cite">Migrate vram to ram may fail to find the
vma if process is exiting
<br>
and vma is removed, evict svm bo worker sets prange->svm_bo
to NULL
<br>
and warn svm_bo ref count != 1 only if migrating vram to ram
<br>
successfully.
<br>
<br>
Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
<br>
---
<br>
drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 27
+++++++++++++++++++-----
<br>
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 20
++++++++++--------
<br>
2 files changed, 33 insertions(+), 14 deletions(-)
<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
<br>
index 7f689094be5a..c8aef2fe459b 100644
<br>
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
<br>
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
<br>
@@ -638,6 +638,22 @@ svm_migrate_copy_to_ram(struct
amdgpu_device *adev, struct svm_range *prange,
<br>
return r;
<br>
}
<br>
+/**
<br>
+ * svm_migrate_vma_to_ram - migrate range inside one vma from
device to system
<br>
+ *
<br>
+ * @adev: amdgpu device to migrate from
<br>
+ * @prange: svm range structure
<br>
+ * @vma: vm_area_struct that range [start, end] belongs to
<br>
+ * @start: range start virtual address in pages
<br>
+ * @end: range end virtual address in pages
<br>
+ *
<br>
+ * Context: Process context, caller hold mmap read lock, svms
lock, prange lock
<br>
+ *
<br>
+ * Return:
<br>
+ * 0 - success with all pages migrated
<br>
+ * negative values - indicate error
<br>
+ * positive values - partial migration, number of pages not
migrated
<br>
+ */
<br>
static long
<br>
svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct
svm_range *prange,
<br>
struct vm_area_struct *vma, uint64_t start,
uint64_t end)
<br>
@@ -709,8 +725,6 @@ svm_migrate_vma_to_ram(struct amdgpu_device
*adev, struct svm_range *prange,
<br>
pdd = svm_range_get_pdd_by_adev(prange, adev);
<br>
if (pdd)
<br>
WRITE_ONCE(pdd->page_out, pdd->page_out +
cpages);
<br>
-
<br>
- return upages;
<br>
}
<br>
return r ? r : upages;
<br>
}
<br>
@@ -759,13 +773,16 @@ int svm_migrate_vram_to_ram(struct
svm_range *prange, struct mm_struct *mm)
<br>
unsigned long next;
<br>
vma = find_vma(mm, addr);
<br>
- if (!vma || addr < vma->vm_start)
<br>
+ if (!vma || addr < vma->vm_start) {
<br>
+ pr_debug("failed to find vma for prange %p\n",
prange);
<br>
+ r = -EFAULT;
<br>
break;
<br>
+ }
<br>
next = min(vma->vm_end, end);
<br>
r = svm_migrate_vma_to_ram(adev, prange, vma, addr,
next);
<br>
if (r < 0) {
<br>
- pr_debug("failed %ld to migrate\n", r);
<br>
+ pr_debug("failed %ld to migrate prange %p\n", r,
prange);
<br>
break;
<br>
} else {
<br>
upages += r;
<br>
@@ -773,7 +790,7 @@ int svm_migrate_vram_to_ram(struct svm_range
*prange, struct mm_struct *mm)
<br>
addr = next;
<br>
}
<br>
- if (!upages) {
<br>
+ if (r >= 0 && !upages) {
<br>
svm_range_vram_node_free(prange);
<br>
prange->actual_loc = 0;
<br>
}
<br>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
<br>
index 509d915cbe69..215943424c06 100644
<br>
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
<br>
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
<br>
@@ -3155,6 +3155,7 @@ static void
svm_range_evict_svm_bo_worker(struct work_struct *work)
<br>
struct svm_range_bo *svm_bo;
<br>
struct kfd_process *p;
<br>
struct mm_struct *mm;
<br>
+ int r = 0;
<br>
svm_bo = container_of(work, struct svm_range_bo,
eviction_work);
<br>
if (!svm_bo_ref_unless_zero(svm_bo))
<br>
@@ -3170,7 +3171,7 @@ static void
svm_range_evict_svm_bo_worker(struct work_struct *work)
<br>
mmap_read_lock(mm);
<br>
spin_lock(&svm_bo->list_lock);
<br>
- while (!list_empty(&svm_bo->range_list)) {
<br>
+ while (!list_empty(&svm_bo->range_list) &&
!r) {
<br>
struct svm_range *prange =
<br>
list_first_entry(&svm_bo->range_list,
<br>
struct svm_range, svm_bo_list);
<br>
@@ -3184,15 +3185,15 @@ static void
svm_range_evict_svm_bo_worker(struct work_struct *work)
<br>
mutex_lock(&prange->migrate_mutex);
<br>
do {
<br>
- svm_migrate_vram_to_ram(prange,
<br>
+ r = svm_migrate_vram_to_ram(prange,
<br>
svm_bo->eviction_fence->mm);
<br>
- } while (prange->actual_loc && --retries);
<br>
- WARN(prange->actual_loc, "Migration failed during
eviction");
<br>
-
<br>
- mutex_lock(&prange->lock);
<br>
- prange->svm_bo = NULL;
<br>
- mutex_unlock(&prange->lock);
<br>
+ } while (!r && prange->actual_loc &&
--retries);
<br>
</blockquote>
<br>
I think it would still be good to have a WARN here for other cases
than process termination. Is there a way to distinguish the
process termination case from the error code? Maybe we could even
avoid the retry in this case.
<br>
<br>
</blockquote>
<p>It was bug that prange->actual_loc set to 0 even if vma not
found, that's why this WARN never trigger. With this fix, the WARN
is kind of duplicate with below WARN_ONCE. Change this to
pr_info_once to help debug, as below WARN_ONCE is real critical
issue to notify TTM to alloc VRAM from BO while svm_bo ref count
is not 1.</p>
<p>retry is avoided if r return error code.</p>
<p>Regards,</p>
<p>Philip<br>
</p>
<blockquote type="cite" cite="mid:486b326f-8842-5de4-07ad-7ab1f209132e@amd.com">Other than
that, this patch is a good improvement on the error handling.
<br>
<br>
Regards,
<br>
Felix
<br>
<br>
<br>
<blockquote type="cite"> + if (!prange->actual_loc) {
<br>
+ mutex_lock(&prange->lock);
<br>
+ prange->svm_bo = NULL;
<br>
+ mutex_unlock(&prange->lock);
<br>
+ }
<br>
mutex_unlock(&prange->migrate_mutex);
<br>
spin_lock(&svm_bo->list_lock);
<br>
@@ -3201,10 +3202,11 @@ static void
svm_range_evict_svm_bo_worker(struct work_struct *work)
<br>
mmap_read_unlock(mm);
<br>
dma_fence_signal(&svm_bo->eviction_fence->base);
<br>
+
<br>
/* This is the last reference to svm_bo, after
svm_range_vram_node_free
<br>
* has been called in svm_migrate_vram_to_ram
<br>
*/
<br>
- WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was
not the last reference\n");
<br>
+ WARN_ONCE(!r && kref_read(&svm_bo->kref) !=
1, "This was not the last reference\n");
<br>
svm_range_bo_unref(svm_bo);
<br>
}
<br>
</blockquote>
</blockquote>
</body>
</html>