[PATCH] drm/amdgpu: race issue when jobs on 2 ring timeout
Andrey Grodzovsky
Andrey.Grodzovsky at amd.com
Thu Jan 21 18:16:17 UTC 2021
Looks good to me
Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
Andrey
On 1/21/21 5:21 AM, Horace Chen wrote:
> Fix a racing issue when jobs on 2 rings timeout simultaneously.
>
> If 2 rings timed out at the same time, the
> amdgpu_device_gpu_recover will be reentered. Then the
> adev->gmc.xgmi.head will be grabbed by 2 local linked list,
> which may cause wild pointer issue in iterating.
>
> lock the device early to prevent the node from being added to 2
> different lists.
>
> also increase karma for the skipped job since the job is also
> timed out and should be guilty.
>
> Signed-off-by: Horace Chen <horace.chen at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 69 ++++++++++++++++++----
> 1 file changed, 59 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 82fc392e4296..702e577be5e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4459,6 +4459,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> up_write(&adev->reset_sem);
> }
>
> +/*
> + * to lock a list of amdgpu devices in a hive safely; if it is not a hive
> + * with multiple nodes, it will behave similarly to amdgpu_device_lock_adev.
> + *
> + * unlock won't require a roll back.
> + */
> +static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
> +{
> + struct amdgpu_device *tmp_adev = NULL;
> +
> + if (adev->gmc.xgmi.num_physical_nodes > 1) {
> + if (!hive) {
> + dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
> + return -ENODEV;
> + }
> + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> + if (!amdgpu_device_lock_adev(tmp_adev, hive))
> + goto roll_back;
> + }
> + } else if (!amdgpu_device_lock_adev(adev, hive))
> + return -EAGAIN;
> +
> + return 0;
> +roll_back:
> + if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
> + /*
> + * if the lock iteration breaks in the middle of a hive,
> + * it may mean there is a race issue,
> + * or that a hive device locked up independently.
> + * we may or may not be in trouble, so try to roll back
> + * the lock and give out a warning.
> + */
> + dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
> + list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> + amdgpu_device_unlock_adev(tmp_adev);
> + }
> + }
> + return -EAGAIN;
> +}
> +
> static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
> {
> struct pci_dev *p = NULL;
> @@ -4572,11 +4612,29 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
> job ? job->base.id : -1, hive->hive_id);
> amdgpu_put_xgmi_hive(hive);
> + if (job)
> + drm_sched_increase_karma(&job->base);
> return 0;
> }
> mutex_lock(&hive->hive_lock);
> }
>
> + /*
> + * lock the device before we try to operate on the linked list.
> + * if we didn't get the device lock, don't touch the linked list since
> + * others may be iterating over it.
> + */
> + r = amdgpu_device_lock_hive_adev(adev, hive);
> + if (r) {
> + dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
> + job ? job->base.id : -1);
> +
> + /* even if we skipped this reset, we still need to mark the job as guilty */
> + if (job)
> + drm_sched_increase_karma(&job->base);
> + goto skip_recovery;
> + }
> +
> /*
> * Build list of devices to reset.
> * In case we are in XGMI hive mode, resort the device list
> @@ -4584,8 +4642,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> */
> INIT_LIST_HEAD(&device_list);
> if (adev->gmc.xgmi.num_physical_nodes > 1) {
> - if (!hive)
> - return -ENODEV;
> if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
> list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
> device_list_handle = &hive->device_list;
> @@ -4596,13 +4652,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>
> /* block all schedulers and reset given job's ring */
> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> - if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
> - dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
> - job ? job->base.id : -1);
> - r = 0;
> - goto skip_recovery;
> - }
> -
> /*
> * Try to put the audio codec into suspend state
> * before gpu reset started.
> @@ -4740,7 +4789,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> amdgpu_put_xgmi_hive(hive);
> }
>
> - if (r)
> + if (r && r != -EAGAIN)
> dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
> return r;
> }
More information about the amd-gfx
mailing list