[PATCH v2] drm/amdgpu: Fix the dead lock issue.

Emily Deng Emily.Deng at amd.com
Mon Sep 10 09:34:17 UTC 2018


It will ramdomly have the dead lock issue when test TDR:
1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock
2. amdgpu_bo_create locked the bo's resv lock
3. amdgpu_bo_create_shadow is waiting for the shadow_list_lock
4. amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv
lock.

v2:
   Make a local copy of the list

Signed-off-by: Emily Deng <Emily.Deng at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 +
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index acfc63e..2b9f597 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3006,6 +3006,9 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
 	long r = 1;
 	int i = 0;
 	long tmo;
+	struct list_head local_shadow_list;
+
+	INIT_LIST_HEAD(&local_shadow_list);
 
 	if (amdgpu_sriov_runtime(adev))
 		tmo = msecs_to_jiffies(8000);
@@ -3013,8 +3016,15 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
 		tmo = msecs_to_jiffies(100);
 
 	DRM_INFO("recover vram bo from shadow start\n");
+
 	mutex_lock(&adev->shadow_list_lock);
 	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+		amdgpu_bo_ref(bo);
+		list_add_tail(&bo->copy_shadow_list, &local_shadow_list);
+	}
+	mutex_unlock(&adev->shadow_list_lock);
+
+	list_for_each_entry_safe(bo, tmp, &local_shadow_list, copy_shadow_list) {
 		next = NULL;
 		amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
 		if (fence) {
@@ -3033,8 +3043,8 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
 
 		dma_fence_put(fence);
 		fence = next;
+		amdgpu_bo_unref(&bo);
 	}
-	mutex_unlock(&adev->shadow_list_lock);
 
 	if (fence) {
 		r = dma_fence_wait_timeout(fence, false, tmo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index 907fdf4..cfee16c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -103,6 +103,7 @@ struct amdgpu_bo {
 		struct list_head	mn_list;
 		struct list_head	shadow_list;
 	};
+        struct list_head                copy_shadow_list;
 
 	struct kgd_mem                  *kfd_bo;
 };
-- 
2.7.4



More information about the amd-gfx mailing list