[PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v3)

StDenis, Tom Tom.StDenis at amd.com
Mon Jan 7 17:31:48 UTC 2019


v2: Move locks around in other functions so that this
function can stand on its own.  Also only hold the hive
specific lock for add/remove device instead of the driver
global lock so you can't add/remove devices in parallel from
one hive.

v3: add reset_lock

Signed-off-by: Tom St Denis <tom.stdenis at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  6 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 41 ++++++++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  5 +--
 3 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 39d5d058b2c7..1a558dc41ba6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3525,9 +3525,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 * by different nodes. No point also since the one node already executing
 	 * reset will also reset all the other nodes in the hive.
 	 */
-	hive = amdgpu_get_xgmi_hive(adev);
+	hive = amdgpu_get_xgmi_hive(adev, 0);
 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
-	    !mutex_trylock(&hive->hive_lock))
+	    !mutex_trylock(&hive->reset_lock))
 		return 0;
 
 	/* Start with adev pre asic reset first for soft reset check.*/
@@ -3606,7 +3606,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	}
 
 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
-		mutex_unlock(&hive->hive_lock);
+		mutex_unlock(&hive->reset_lock);
 
 	if (r)
 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 8a8bc60cb6b4..9e98ab8b1525 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -40,26 +40,40 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
 	return &hive->device_list;
 }
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
 {
 	int i;
 	struct amdgpu_hive_info *tmp;
 
 	if (!adev->gmc.xgmi.hive_id)
 		return NULL;
+
+	mutex_lock(&xgmi_mutex);
+
 	for (i = 0 ; i < hive_count; ++i) {
 		tmp = &xgmi_hives[i];
-		if (tmp->hive_id == adev->gmc.xgmi.hive_id)
+		if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
+			if (lock)
+				mutex_lock(&tmp->hive_lock);
+			mutex_unlock(&xgmi_mutex);
 			return tmp;
+		}
 	}
-	if (i >= AMDGPU_MAX_XGMI_HIVE)
+	if (i >= AMDGPU_MAX_XGMI_HIVE) {
+		mutex_unlock(&xgmi_mutex);
 		return NULL;
+	}
 
 	/* initialize new hive if not exist */
 	tmp = &xgmi_hives[hive_count++];
 	tmp->hive_id = adev->gmc.xgmi.hive_id;
 	INIT_LIST_HEAD(&tmp->device_list);
 	mutex_init(&tmp->hive_lock);
+	mutex_init(&tmp->reset_lock);
+	if (lock)
+		mutex_lock(&tmp->hive_lock);
+
+	mutex_unlock(&xgmi_mutex);
 
 	return tmp;
 }
@@ -111,8 +125,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		return ret;
 	}
 
-	mutex_lock(&xgmi_mutex);
-	hive = amdgpu_get_xgmi_hive(adev);
+	/* find hive and take lock */
+	hive = amdgpu_get_xgmi_hive(adev, 1);
 	if (!hive)
 		goto exit;
 
@@ -142,8 +156,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 			break;
 	}
 
+	mutex_unlock(&hive->hive_lock);
 exit:
-	mutex_unlock(&xgmi_mutex);
 	return ret;
 }
 
@@ -154,15 +168,14 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 	if (!adev->gmc.xgmi.supported)
 		return;
 
-	mutex_lock(&xgmi_mutex);
-
-	hive = amdgpu_get_xgmi_hive(adev);
+	hive = amdgpu_get_xgmi_hive(adev, 1);
 	if (!hive)
-		goto exit;
+		return;
 
-	if (!(hive->number_devices--))
+	if (!(hive->number_devices--)) {
 		mutex_destroy(&hive->hive_lock);
-
-exit:
-	mutex_unlock(&xgmi_mutex);
+		mutex_destroy(&hive->reset_lock);
+	} else {
+		mutex_unlock(&hive->hive_lock);
+	}
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 6151eb9c8ad3..14bc60664159 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -29,10 +29,11 @@ struct amdgpu_hive_info {
 	struct list_head	device_list;
 	struct psp_xgmi_topology_info	topology_info;
 	int number_devices;
-	struct mutex hive_lock;
+	struct mutex hive_lock,
+		     reset_lock;
 };
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
 void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
-- 
2.17.2



More information about the amd-gfx mailing list