[PATCH] drm/amdgpu: Fix error handling in amdgpu_amdkfd_gpuvm_free_memory_of_gpu

Ramesh Errabolu Ramesh.Errabolu at amd.com
Thu Jun 9 17:17:27 UTC 2022


Following error conditions are fixed:
    Ensure calls to delete a list element are safe
    Unpin MMIO and DOORBELL BOs only after map count goes to zero
    Print a warning message if unreserving VMs encounters an error

Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu at amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 38 ++++++++++---------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index a1de900ba677..78b3efecc2f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1013,14 +1013,14 @@ static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
 	mutex_unlock(&process_info->lock);
 }
 
-static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,
+static void remove_kgd_mem_from_validate_list(struct kgd_mem *mem,
 		struct amdkfd_process_info *process_info)
 {
 	struct ttm_validate_buffer *bo_list_entry;
 
 	bo_list_entry = &mem->validate_list;
 	mutex_lock(&process_info->lock);
-	list_del(&bo_list_entry->head);
+	list_del_init(&bo_list_entry->head);
 	mutex_unlock(&process_info->lock);
 }
 
@@ -1796,7 +1796,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
 
 allocate_init_user_pages_failed:
 err_pin_bo:
-	remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);
+	remove_kgd_mem_from_validate_list(*mem, avm->process_info);
 	drm_vma_node_revoke(&gobj->vma_node, drm_priv);
 err_node_allow:
 	/* Don't unreserve system mem limit twice */
@@ -1825,20 +1825,12 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
 	unsigned long bo_size = mem->bo->tbo.base.size;
 	struct kfd_mem_attachment *entry, *tmp;
 	struct bo_vm_reservation_context ctx;
-	struct ttm_validate_buffer *bo_list_entry;
 	unsigned int mapped_to_gpu_memory;
 	int ret;
 	bool is_imported = false;
 
 	mutex_lock(&mem->lock);
 
-	/* Unpin MMIO/DOORBELL BO's that were pinned during allocation */
-	if (mem->alloc_flags &
-	    (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
-	     KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
-		amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo);
-	}
-
 	mapped_to_gpu_memory = mem->mapped_to_gpu_memory;
 	is_imported = mem->is_imported;
 	mutex_unlock(&mem->lock);
@@ -1852,11 +1844,10 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
 		return -EBUSY;
 	}
 
-	/* Make sure restore workers don't access the BO any more */
-	bo_list_entry = &mem->validate_list;
-	mutex_lock(&process_info->lock);
-	list_del(&bo_list_entry->head);
-	mutex_unlock(&process_info->lock);
+	/* Make sure restore workers don't access the BO any more
+	 * Ensure removal of BO from validate list is reentrant safe
+	 */
+	remove_kgd_mem_from_validate_list(mem, process_info);
 
 	/* No more MMU notifiers */
 	amdgpu_mn_unregister(mem->bo);
@@ -1878,7 +1869,18 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
 	list_for_each_entry_safe(entry, tmp, &mem->attachments, list)
 		kfd_mem_detach(entry);
 
+	/* Return success even in case of error */
 	ret = unreserve_bo_and_vms(&ctx, false, false);
+	if (unlikely(ret)) {
+		WARN_ONCE(ret, "Error in unreserving BO and associated VMs");
+		ret = 0;
+	}
+
+	/* Unpin MMIO/DOORBELL BO's that were pinned during allocation */
+	if (mem->alloc_flags &
+	    (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
+	     KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))
+		amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo);
 
 	/* Free the sync object */
 	amdgpu_sync_free(&mem->sync);
@@ -2814,7 +2816,7 @@ int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem
 bo_reservation_failure:
 	mutex_unlock(&(*mem)->process_info->lock);
 	amdgpu_sync_free(&(*mem)->sync);
-	remove_kgd_mem_from_kfd_bo_list(*mem, process_info);
+	remove_kgd_mem_from_validate_list(*mem, process_info);
 	amdgpu_bo_unref(&gws_bo);
 	mutex_destroy(&(*mem)->lock);
 	kfree(*mem);
@@ -2832,7 +2834,7 @@ int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem)
 	/* Remove BO from process's validate list so restore worker won't touch
 	 * it anymore
 	 */
-	remove_kgd_mem_from_kfd_bo_list(kgd_mem, process_info);
+	remove_kgd_mem_from_validate_list(kgd_mem, process_info);
 
 	ret = amdgpu_bo_reserve(gws_bo, false);
 	if (unlikely(ret)) {
-- 
2.35.1



More information about the amd-gfx mailing list