[PATCH] drm/amdgpu: race issue when jobs on 2 ring timeout
Andrey Grodzovsky
Andrey.Grodzovsky at amd.com
Wed Jan 20 16:39:54 UTC 2021
On 1/20/21 9:12 AM, Horace Chen wrote:
> Fix a racing issue when jobs on 2 rings timeout simultaneously.
>
> If 2 rings timed out at the same time, the
> amdgpu_device_gpu_recover will be reentered. Then the
> adev->gmc.xgmi.head will be grabbed by 2 local linked lists,
> which may cause a wild pointer issue when iterating.
>
> lock the device early to prevent the node from being added to 2
> different lists.
>
> also increase karma for the skipped job since that job has also
> timed out and should be marked guilty.
>
> Signed-off-by: Horace Chen <horace.chen at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 70 +++++++++++++++++++---
> 1 file changed, 61 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 4d434803fb49..d59d3182ac2d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4460,6 +4460,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> up_write(&adev->reset_sem);
> }
>
> +/*
> + * to lock up a list of amdgpu devices in a hive safely; if this is not
> + * a hive with multiple nodes, it will be the same as amdgpu_device_lock_adev.
> + *
> + * unlock won't require a roll back.
> + */
> +static bool amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> +{
> + struct amdgpu_device *tmp_adev = NULL;
> +
> + if (adev->gmc.xgmi.num_physical_nodes > 1) {
> + if (!hive) {
> + dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
> + return false;
> + }
> + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> + if (!amdgpu_device_lock_adev(tmp_adev, hive))
> + goto roll_back;
> + }
> + return true;
> + } else {
> + return amdgpu_device_lock_adev(adev, hive);
> + }
> +roll_back:
> + if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
> + /*
> + * If the lock iteration breaks in the middle of a hive,
> + * it may mean there is a race issue,
> + * or a hive device locked up independently.
> + * We may or may not be in trouble,
> + * so try to roll back the locks and give out a warning.
> + */
> + dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
> + list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> + amdgpu_device_unlock_adev(tmp_adev);
> + }
> + }
> + return false;
> +}
> +
> static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
> {
> struct pci_dev *p = NULL;
> @@ -4573,11 +4613,32 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
> job ? job->base.id : -1, hive->hive_id);
> amdgpu_put_xgmi_hive(hive);
> + if (job)
> + drm_sched_increase_karma(&job->base);
> return 0;
> }
> mutex_lock(&hive->hive_lock);
> }
>
> + /*
> + * lock the device before we try to operate on the linked list;
> + * if we didn't get the device lock, don't touch the linked list since
> + * others may be iterating over it.
> + */
> + if (!amdgpu_device_lock_hive_adev(adev, hive)) {
> + dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
> + job ? job->base.id : -1);
> +
> + if (adev->gmc.xgmi.num_physical_nodes > 1 && !hive)
> + r = -ENODEV;
> + else
> + r = 0;
You can just change amdgpu_device_lock_hive_adev return type to int instead
of code duplication, maybe returning EAGAIN for actual locking failure.
Andrey
> + /* even though we skipped this reset, we still need to mark the job as guilty */
> + if (job)
> + drm_sched_increase_karma(&job->base);
> + goto skip_recovery;
> + }
> +
> /*
> * Build list of devices to reset.
> * In case we are in XGMI hive mode, resort the device list
> @@ -4585,8 +4646,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> */
> INIT_LIST_HEAD(&device_list);
> if (adev->gmc.xgmi.num_physical_nodes > 1) {
> - if (!hive)
> - return -ENODEV;
> if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
> list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
> device_list_handle = &hive->device_list;
> @@ -4597,13 +4656,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>
> /* block all schedulers and reset given job's ring */
> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> - if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
> - dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
> - job ? job->base.id : -1);
> - r = 0;
> - goto skip_recovery;
> - }
> -
> /*
> * Try to put the audio codec into suspend state
> * before gpu reset started.
More information about the amd-gfx
mailing list