[PATCH] drm/amdgpu: check vm bo eviction valuable at last

Qiang Yu qiang.yu at amd.com
Thu Feb 17 09:04:40 UTC 2022


Workstation application ANSA/META get this error dmesg:
[drm:amdgpu_gem_va_ioctl [amdgpu]] *ERROR* Couldn't update BO_VA (-16)

This is caused by:
1. create a 256MB buffer in invisible VRAM
2. CPU map the buffer and access it causes vm_fault and try to move
   it to visible VRAM
3. force visible VRAM space and traverse all VRAM bos to check if
   evicting this bo is valuable
4. when checking a VM bo (in invisible VRAM), amdgpu_vm_evictable()
   will set amdgpu_vm->evicting, but latter due to not in visible
   VRAM, won't really evict it so not add it to amdgpu_vm->evicted
5. before next CS to clear the amdgpu_vm->evicting, user VM ops
   ioctl will pass amdgpu_vm_ready() (check amdgpu_vm->evicted)
   but fail in amdgpu_vm_bo_update_mapping() (check
   amdgpu_vm->evicting) and get this error log

This error won't affect functionality as next CS will finish the
waiting VM ops. But we'd better make the amdgpu_vm->evicting
correctly reflact the vm status and clear the error log.

Signed-off-by: Qiang Yu <qiang.yu at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 85 ++++++++++++++-----------
 1 file changed, 47 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 5a32ee66d8c8..88a27911054f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1306,45 +1306,11 @@ uint64_t amdgpu_ttm_tt_pte_flags(struct amdgpu_device *adev, struct ttm_tt *ttm,
 	return flags;
 }
 
-/*
- * amdgpu_ttm_bo_eviction_valuable - Check to see if we can evict a buffer
- * object.
- *
- * Return true if eviction is sensible. Called by ttm_mem_evict_first() on
- * behalf of ttm_bo_mem_force_space() which tries to evict buffer objects until
- * it can find space for a new object and by ttm_bo_force_list_clean() which is
- * used to clean out a memory space.
- */
-static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
-					    const struct ttm_place *place)
+static bool amdgpu_ttm_mem_eviction_valuable(struct ttm_buffer_object *bo,
+					     const struct ttm_place *place)
 {
 	unsigned long num_pages = bo->resource->num_pages;
 	struct amdgpu_res_cursor cursor;
-	struct dma_resv_list *flist;
-	struct dma_fence *f;
-	int i;
-
-	/* Swapout? */
-	if (bo->resource->mem_type == TTM_PL_SYSTEM)
-		return true;
-
-	if (bo->type == ttm_bo_type_kernel &&
-	    !amdgpu_vm_evictable(ttm_to_amdgpu_bo(bo)))
-		return false;
-
-	/* If bo is a KFD BO, check if the bo belongs to the current process.
-	 * If true, then return false as any KFD process needs all its BOs to
-	 * be resident to run successfully
-	 */
-	flist = dma_resv_shared_list(bo->base.resv);
-	if (flist) {
-		for (i = 0; i < flist->shared_count; ++i) {
-			f = rcu_dereference_protected(flist->shared[i],
-				dma_resv_held(bo->base.resv));
-			if (amdkfd_fence_check_mm(f, current->mm))
-				return false;
-		}
-	}
 
 	switch (bo->resource->mem_type) {
 	case AMDGPU_PL_PREEMPT:
@@ -1377,10 +1343,53 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
 		return false;
 
 	default:
-		break;
+		return ttm_bo_eviction_valuable(bo, place);
 	}
+}
 
-	return ttm_bo_eviction_valuable(bo, place);
+/*
+ * amdgpu_ttm_bo_eviction_valuable - Check to see if we can evict a buffer
+ * object.
+ *
+ * Return true if eviction is sensible. Called by ttm_mem_evict_first() on
+ * behalf of ttm_bo_mem_force_space() which tries to evict buffer objects until
+ * it can find space for a new object and by ttm_bo_force_list_clean() which is
+ * used to clean out a memory space.
+ */
+static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
+					    const struct ttm_place *place)
+{
+	struct dma_resv_list *flist;
+	struct dma_fence *f;
+	int i;
+
+	/* Swapout? */
+	if (bo->resource->mem_type == TTM_PL_SYSTEM)
+		return true;
+
+	/* If bo is a KFD BO, check if the bo belongs to the current process.
+	 * If true, then return false as any KFD process needs all its BOs to
+	 * be resident to run successfully
+	 */
+	flist = dma_resv_shared_list(bo->base.resv);
+	if (flist) {
+		for (i = 0; i < flist->shared_count; ++i) {
+			f = rcu_dereference_protected(flist->shared[i],
+				dma_resv_held(bo->base.resv));
+			if (amdkfd_fence_check_mm(f, current->mm))
+				return false;
+		}
+	}
+
+	/* Check by different mem type. */
+	if (!amdgpu_ttm_mem_eviction_valuable(bo, place))
+		return false;
+
+	/* VM bo should be checked at last because it will mark VM evicting. */
+	if (bo->type == ttm_bo_type_kernel)
+		return amdgpu_vm_evictable(ttm_to_amdgpu_bo(bo));
+
+	return true;
 }
 
 static void amdgpu_ttm_vram_mm_access(struct amdgpu_device *adev, loff_t pos,
-- 
2.25.1



More information about the dri-devel mailing list