[PATCH] drm/amdgpu: annotate a false positive locking dependency
Dennis Li
Dennis.Li at amd.com
Thu Aug 6 01:24:04 UTC 2020
[ 264.483189] ======================================================
[ 264.483487] WARNING: possible circular locking dependency detected
[ 264.483781] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G OE
[ 264.484076] ------------------------------------------------------
[ 264.484370] kworker/39:1/567 is trying to acquire lock:
[ 264.484663] ffffffffc15df4b0 (mgpu_info.mutex){+.+.}, at: amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu]
[ 264.485081]
but task is already holding lock:
[ 264.485670] ffff965fd31647a0 (&adev->reset_sem){++++}, at: amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]
[ 264.486074]
which lock already depends on the new lock.
[ 264.487043]
the existing dependency chain (in reverse order) is:
[ 264.487710]
-> #3 (&adev->reset_sem){++++}:
[ 264.488400] down_write+0x49/0x120
[ 264.488783] amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]
[ 264.489179] amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[ 264.489544] process_one_work+0x29e/0x630
[ 264.489910] worker_thread+0x3c/0x3f0
[ 264.490279] kthread+0x12f/0x150
[ 264.490649] ret_from_fork+0x3a/0x50
[ 264.491020]
-> #2 (&tmp->hive_lock){+.+.}:
[ 264.491764] __mutex_lock+0x95/0xa20
[ 264.492137] mutex_lock_nested+0x1b/0x20
[ 264.492553] amdgpu_get_xgmi_hive+0x352/0x400 [amdgpu]
[ 264.492972] amdgpu_xgmi_add_device+0xb8/0x460 [amdgpu]
[ 264.493387] amdgpu_device_init+0x12fb/0x1e10 [amdgpu]
[ 264.493807] amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
[ 264.494226] amdgpu_pci_probe+0x11d/0x200 [amdgpu]
[ 264.494617] local_pci_probe+0x47/0xa0
[ 264.494998] work_for_cpu_fn+0x1a/0x30
[ 264.495369] process_one_work+0x29e/0x630
[ 264.495746] worker_thread+0x22b/0x3f0
[ 264.496124] kthread+0x12f/0x150
[ 264.496504] ret_from_fork+0x3a/0x50
[ 264.496876]
-> #1 (xgmi_mutex){+.+.}:
[ 264.497596] __mutex_lock+0x95/0xa20
[ 264.497954] mutex_lock_nested+0x1b/0x20
[ 264.498346] amdgpu_get_xgmi_hive+0x38/0x400 [amdgpu]
[ 264.498741] amdgpu_xgmi_set_pstate+0x10/0x20 [amdgpu]
[ 264.499126] amdgpu_device_ip_late_init+0x219/0x230 [amdgpu]
[ 264.499506] amdgpu_device_init+0x1401/0x1e10 [amdgpu]
[ 264.499886] amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
[ 264.500264] amdgpu_pci_probe+0x11d/0x200 [amdgpu]
[ 264.500608] local_pci_probe+0x47/0xa0
[ 264.500945] work_for_cpu_fn+0x1a/0x30
[ 264.501276] process_one_work+0x29e/0x630
[ 264.501603] worker_thread+0x22b/0x3f0
[ 264.501927] kthread+0x12f/0x150
[ 264.502239] ret_from_fork+0x3a/0x50
[ 264.502541]
-> #0 (mgpu_info.mutex){+.+.}:
[ 264.503126] __lock_acquire+0x13ec/0x16e0
[ 264.503411] lock_acquire+0xb8/0x1c0
[ 264.503693] __mutex_lock+0x95/0xa20
[ 264.504019] mutex_lock_nested+0x1b/0x20
[ 264.504354] amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu]
[ 264.504691] amdgpu_device_gpu_recover+0x360/0x1030 [amdgpu]
[ 264.505029] amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[ 264.505334] process_one_work+0x29e/0x630
[ 264.505617] worker_thread+0x3c/0x3f0
[ 264.505897] kthread+0x12f/0x150
[ 264.506176] ret_from_fork+0x3a/0x50
[ 264.506453]
other info that might help us debug this:
[ 264.507267] Chain exists of:
mgpu_info.mutex --> &tmp->hive_lock --> &adev->reset_sem
[ 264.508102] Possible unsafe locking scenario:
[ 264.508664]        CPU0                    CPU1
[ 264.508945]        ----                    ----
[ 264.509221]   lock(&adev->reset_sem);
[ 264.509524]                                lock(&tmp->hive_lock);
[ 264.509818]                                lock(&adev->reset_sem);
[ 264.510111]   lock(mgpu_info.mutex);
[ 264.510401]
*** DEADLOCK ***
[ 264.511224] 4 locks held by kworker/39:1/567:
[ 264.511499] #0: ffff961ff5c1d348 ((wq_completion)events){+.+.}, at: process_one_work+0x21f/0x630
[ 264.511793] #1: ffffafa90e233e58 ((work_completion)(&con->recovery_work)){+.+.}, at: process_one_work+0x21f/0x630
[ 264.512100] #2: ffffffffc16245d0 (&tmp->hive_lock){+.+.}, at: amdgpu_device_gpu_recover+0xb0/0x1030 [amdgpu]
[ 264.512450] #3: ffff965fd31647a0 (&adev->reset_sem){++++}, at: amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]
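The cycle boils down to a lock-order inversion between the probe-time
chain and the recovery path. The standalone sketch below shows just that
inversion (illustration only, not driver code: pthread mutexes stand in
for mgpu_info.mutex, hive->hive_lock and adev->reset_sem, and the
chain-establishing call sites are collapsed into one thread):

/*
 * Thread A mimics the paths that establish
 * mgpu_info.mutex --> hive_lock --> reset_sem; thread B mimics the
 * recovery path that takes mgpu_info.mutex while already holding
 * reset_sem, closing the cycle.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t mgpu  = PTHREAD_MUTEX_INITIALIZER; /* mgpu_info.mutex  */
static pthread_mutex_t hive  = PTHREAD_MUTEX_INITIALIZER; /* hive->hive_lock  */
static pthread_mutex_t reset = PTHREAD_MUTEX_INITIALIZER; /* adev->reset_sem  */

static void *probe_path(void *unused)
{
	pthread_mutex_lock(&mgpu);
	pthread_mutex_lock(&hive);
	pthread_mutex_lock(&reset);	/* mgpu --> hive --> reset */
	pthread_mutex_unlock(&reset);
	pthread_mutex_unlock(&hive);
	pthread_mutex_unlock(&mgpu);
	return NULL;
}

static void *recovery_path(void *unused)
{
	pthread_mutex_lock(&reset);
	usleep(1000);			/* widen the race window */
	pthread_mutex_lock(&mgpu);	/* reset --> mgpu closes the cycle */
	pthread_mutex_unlock(&mgpu);
	pthread_mutex_unlock(&reset);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, probe_path, NULL);
	pthread_create(&b, NULL, recovery_path, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("lucky run: the ordering is still unsound\n");
	return 0;
}

Built with gcc -pthread and run under ThreadSanitizer
(-fsanitize=thread), the inversion is reported as a potential deadlock
even on runs that happen not to hang.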
The dependency arises because amdgpu_get_xgmi_hive() acquires
hive->hive_lock while already holding xgmi_mutex. Remove the locking of
hive->hive_lock from amdgpu_get_xgmi_hive() to break that dependency on
xgmi_mutex; the callers that actually need the hive lock
(amdgpu_xgmi_add_device() and amdgpu_xgmi_remove_device()) now take it
explicitly after the lookup returns.
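
With the change, call sites that need the hive lock follow this pattern
(a rough sketch distilled from the amdgpu_xgmi_add_device() hunk below;
error paths trimmed):

	struct amdgpu_hive_info *hive;

	hive = amdgpu_get_xgmi_hive(adev);	/* takes only xgmi_mutex internally */
	if (!hive)
		return -EINVAL;

	mutex_lock(&hive->hive_lock);		/* now explicit in the caller */
	/* ... modify hive->device_list, topology, etc. ... */
	mutex_unlock(&hive->hive_lock);

This keeps hive_lock out of the xgmi_mutex nesting, so lockdep no longer
sees an edge between the two.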
Signed-off-by: Dennis Li <Dennis.Li at amd.com>
Change-Id: I2d9d80ee23f9f9ac6ce9e1b9e5e1b2b3530f5bdd
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62ecac97fbd2..6c572db42d92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2840,7 +2840,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 {
 	struct amdgpu_device *adev =
 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
-	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
 	/* It's a bug to not have a hive within this function */
 	if (WARN_ON(!hive))
@@ -4283,7 +4283,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 * We always reset all schedulers for device and all devices for XGMI
 	 * hive so that should take care of them too.
 	 */
-	hive = amdgpu_get_xgmi_hive(adev, false);
+	hive = amdgpu_get_xgmi_hive(adev);
 	if (hive) {
 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5680f7eafcb1..5cd42740461c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1514,7 +1514,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 	struct amdgpu_device *remote_adev = NULL;
 	struct amdgpu_device *adev = ras->adev;
 	struct list_head device_list, *device_list_handle = NULL;
-	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
 	/* Build list of devices to query RAS related errors */
 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 67a756f4337b..19fd5ce3e857 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -336,7 +336,7 @@ static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
 {
 	int i;
 	struct amdgpu_hive_info *tmp;
@@ -349,8 +349,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 	for (i = 0 ; i < hive_count; ++i) {
 		tmp = &xgmi_hives[i];
 		if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
-			if (lock)
-				mutex_lock(&tmp->hive_lock);
 			mutex_unlock(&xgmi_mutex);
 			return tmp;
 		}
@@ -374,9 +372,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 	mutex_init(&tmp->hive_lock);
 	atomic_set(&tmp->in_reset, 0);
 	task_barrier_init(&tmp->tb);
-
-	if (lock)
-		mutex_lock(&tmp->hive_lock);
 	tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
 	tmp->hi_req_gpu = NULL;
 	/*
@@ -392,7 +387,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
 {
 	int ret = 0;
-	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 	struct amdgpu_device *request_adev = hive->hi_req_gpu ?
 		hive->hi_req_gpu : adev;
 	bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
@@ -515,7 +510,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
 	}
 
-	hive = amdgpu_get_xgmi_hive(adev, 1);
+	hive = amdgpu_get_xgmi_hive(adev);
 	if (!hive) {
 		ret = -EINVAL;
 		dev_err(adev->dev,
@@ -524,6 +519,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		goto exit;
 	}
 
+	mutex_lock(&hive->hive_lock);
+
 	top_info = &adev->psp.xgmi_context.top_info;
 	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
@@ -587,10 +584,11 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 	if (!adev->gmc.xgmi.supported)
 		return -EINVAL;
 
-	hive = amdgpu_get_xgmi_hive(adev, 1);
+	hive = amdgpu_get_xgmi_hive(adev);
 	if (!hive)
 		return -EINVAL;
 
+	mutex_lock(&hive->hive_lock);
 	task_barrier_rem_task(&hive->tb);
 	amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
 	mutex_unlock(&hive->hive_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 61720cd4a1ee..ae3249c1aafe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -51,7 +51,7 @@ struct amdgpu_pcs_ras_field {
 	uint32_t pcs_err_shift;
 };
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
 int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
--
2.17.1