[PATCH] drm/amdgpu: annotate a false positive locking dependency
Felix Kuehling
felix.kuehling at amd.com
Thu Aug 6 02:54:36 UTC 2020
The commit headline is misleading. An annotation would be something like
replacing mutex_lock with mutex_lock_nested. You're not annotating
anything; you're actually changing the locking.
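Just to illustrate the distinction: a real lockdep annotation leaves the
locking unchanged and only tells lockdep how to classify the acquisition,
e.g. something like this (generic example only, not a suggestion for this
patch):

	/* Annotation: same lock, same ordering, but tell lockdep this
	 * is a legitimate nested acquisition within the lock class.
	 */
	mutex_lock_nested(&tmp->hive_lock, SINGLE_DEPTH_NESTING);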
On 2020-08-05 at 9:24 p.m., Dennis Li wrote:
> [ 264.483189] ======================================================
> [ 264.483487] WARNING: possible circular locking dependency detected
> [ 264.483781] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G OE
> [ 264.484076] ------------------------------------------------------
> [ 264.484370] kworker/39:1/567 is trying to acquire lock:
> [ 264.484663] ffffffffc15df4b0 (mgpu_info.mutex){+.+.}, at: amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu]
> [ 264.485081]
> but task is already holding lock:
> [ 264.485670] ffff965fd31647a0 (&adev->reset_sem){++++}, at: amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]
> [ 264.486074]
> which lock already depends on the new lock.
[snip]
> Remove the lock(&hive->hive_lock) out of amdgpu_get_xgmi_hive,
> to disable its locking dependency on xgmi_mutex.
Why is this safe? I think the idea with this was that xgmi_mutex
protects the global xgmi_hives array, and also protects XGMI hives while
they are being added to or removed from it. Taking the hive_lock before
dropping the xgmi_mutex would ensure that the hive is always protected
by one of those locks.
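For reference, the pre-patch lookup did that hand-over-hand handoff
roughly like this (condensed from the hit path of the old
amdgpu_get_xgmi_hive below):

	mutex_lock(&xgmi_mutex);	/* protects the xgmi_hives array */
	tmp = &xgmi_hives[i];		/* entry matching adev's hive_id */
	if (lock)
		mutex_lock(&tmp->hive_lock);	/* taken first ... */
	mutex_unlock(&xgmi_mutex);	/* ... so the hive is never unprotected */
	return tmp;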
I think you're opening up a small potential race condition now, where an
XGMI hive can be removed with amdgpu_xgmi_remove_device between
amdgpu_get_xgmi_hive and locking it. So you can't guarantee that the
hive returned by amdgpu_get_xgmi_hive is valid any more.
I think this is not a big deal, because the amdgpu_hive_info structures
are part of the global array and never freed. So there is no risk of
dereferencing an invalid pointer. The only risk is that you're getting
an amdgpu_hive_info structure for a hive that no longer exists. You
could add a simple check for hive->number_devices == 0 after taking the
hive_lock. In amdgpu_xgmi_remove_device you'd want to avoid decrementing
the device counter again in this case and return an error instead. In
amdgpu_xgmi_add_device, you will have to ensure a new XGMI hive is
created by calling amdgpu_get_xgmi_hive again, or just return an error,
because someone removing a device while it is concurrently being added
is very unexpected.
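In code, the check I mean would look roughly like this (untested sketch;
-ENODEV is just a placeholder error code):

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive)
		return -EINVAL;

	mutex_lock(&hive->hive_lock);
	if (hive->number_devices == 0) {
		/* The hive was torn down between the lookup and taking
		 * hive_lock. Don't touch the device counter; in
		 * amdgpu_xgmi_add_device either retry
		 * amdgpu_get_xgmi_hive (which creates a new hive) or
		 * bail out here as well.
		 */
		mutex_unlock(&hive->hive_lock);
		return -ENODEV;
	}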
Regards,
Felix
>
> Signed-off-by: Dennis Li <Dennis.Li at amd.com>
> Change-Id: I2d9d80ee23f9f9ac6ce9e1b9e5e1b2b3530f5bdd
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 62ecac97fbd2..6c572db42d92 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2840,7 +2840,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
> {
> struct amdgpu_device *adev =
> container_of(__work, struct amdgpu_device, xgmi_reset_work);
> - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
> + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
>
> /* It's a bug to not have a hive within this function */
> if (WARN_ON(!hive))
> @@ -4283,7 +4283,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> * We always reset all schedulers for device and all devices for XGMI
> * hive so that should take care of them too.
> */
> - hive = amdgpu_get_xgmi_hive(adev, false);
> + hive = amdgpu_get_xgmi_hive(adev);
> if (hive) {
> if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
> DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 5680f7eafcb1..5cd42740461c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1514,7 +1514,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
> struct amdgpu_device *remote_adev = NULL;
> struct amdgpu_device *adev = ras->adev;
> struct list_head device_list, *device_list_handle = NULL;
> - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
> + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
>
> /* Build list of devices to query RAS related errors */
> if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 67a756f4337b..19fd5ce3e857 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -336,7 +336,7 @@ static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
>
>
>
> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
> {
> int i;
> struct amdgpu_hive_info *tmp;
> @@ -349,8 +349,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
> for (i = 0 ; i < hive_count; ++i) {
> tmp = &xgmi_hives[i];
> if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
> - if (lock)
> - mutex_lock(&tmp->hive_lock);
> mutex_unlock(&xgmi_mutex);
> return tmp;
> }
> @@ -374,9 +372,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
> mutex_init(&tmp->hive_lock);
> atomic_set(&tmp->in_reset, 0);
> task_barrier_init(&tmp->tb);
> -
> - if (lock)
> - mutex_lock(&tmp->hive_lock);
> tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
> tmp->hi_req_gpu = NULL;
> /*
> @@ -392,7 +387,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
> int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
> {
> int ret = 0;
> - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
> + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
> struct amdgpu_device *request_adev = hive->hi_req_gpu ?
> hive->hi_req_gpu : adev;
> bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
> @@ -515,7 +510,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
> adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
> }
>
> - hive = amdgpu_get_xgmi_hive(adev, 1);
> + hive = amdgpu_get_xgmi_hive(adev);
> if (!hive) {
> ret = -EINVAL;
> dev_err(adev->dev,
> @@ -524,6 +519,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
> goto exit;
> }
>
> + mutex_lock(&hive->hive_lock);
> +
> top_info = &adev->psp.xgmi_context.top_info;
>
> list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
> @@ -587,10 +584,11 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
> if (!adev->gmc.xgmi.supported)
> return -EINVAL;
>
> - hive = amdgpu_get_xgmi_hive(adev, 1);
> + hive = amdgpu_get_xgmi_hive(adev);
> if (!hive)
> return -EINVAL;
>
> + mutex_lock(&hive->hive_lock);
> task_barrier_rem_task(&hive->tb);
> amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
> mutex_unlock(&hive->hive_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 61720cd4a1ee..ae3249c1aafe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -51,7 +51,7 @@ struct amdgpu_pcs_ras_field {
> uint32_t pcs_err_shift;
> };
>
> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
> int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
> int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
> int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);