[PATCH] drm/amdgpu: Effective health check before reset

Tue Jul 29 07:37:55 UTC 2025

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> -----Original Message-----
> From: Sun, Ce(Overlord) <Ce.Sun at amd.com>
> Sent: Tuesday, July 29, 2025 3:01 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>; Sun, Ce(Overlord) <Ce.Sun at amd.com>
> Subject: [PATCH] drm/amdgpu: Effective health check before reset
>
> Move amdgpu_device_health_check into amdgpu_device_gpu_recover to ensure
> that whether the device is present can be checked before reset.
>
> The reason is:
> 1. During the dpc event, the device where the dpc event occurs is not
> present on the bus.
> 2. When both a dpc event and an ATHUB event occur simultaneously, the dpc
> thread holds the reset domain lock when detecting the error, and the gpu
> recover thread acquires the hive lock. The device is simultaneously in the
> states of amdgpu_ras_in_recovery and occurs_dpc, so the gpu recover thread
> will not go to amdgpu_device_health_check. It waits for the reset domain
> lock held by the dpc thread, but the dpc thread has not released the reset
> domain lock. In the dpc callback slot_reset, the dpc thread needs to obtain
> the hive lock, but the hive lock is held by the gpu recover thread at this
> time. So a deadlock occurs.
>
> Signed-off-by: Ce Sun <cesun102 at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 26 +++++++---------------
>  1 file changed, 8 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 343155f5375c..efe98ffb679a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6128,12 +6128,11 @@ static int amdgpu_device_health_check(struct
> list_head *device_list_handle)
>       return ret;
>  }
>
> -static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
> +static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
>                                         struct list_head *device_list,
>                                         struct amdgpu_hive_info *hive)
>  {
>       struct amdgpu_device *tmp_adev = NULL;
> -     int r;
>
>       /*
>        * Build list of devices to reset.
> @@ -6153,14 +6152,6 @@ static int amdgpu_device_recovery_prepare(struct
> amdgpu_device *adev,
>       } else {
>               list_add_tail(&adev->reset_list, device_list);
>       }
> -
> -     if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
> -             r = amdgpu_device_health_check(device_list);
> -             if (r)
> -                     return r;
> -     }
> -
> -     return 0;
>  }
>
>  static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
> @@ -6453,8 +6444,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device
> *adev,
>       reset_context->hive = hive;
>       INIT_LIST_HEAD(&device_list);
>
> -     if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
> -             goto end_reset;
> +     amdgpu_device_recovery_prepare(adev, &device_list, hive);
> +
> +     if (!amdgpu_sriov_vf(adev)) {
> +             r = amdgpu_device_health_check(&device_list);
> +             if (r)
> +                     goto end_reset;
> +     }
>
>       /* We need to lock reset domain only once both for XGMI and single device */
>       amdgpu_device_recovery_get_reset_lock(adev, &device_list);
> @@ -6952,12 +6948,6 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
>       int r = 0, i;
>       u32 memsize;
>
> -     /* PCI error slot reset should be skipped During RAS recovery */
> -     if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
> -         amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
> -         amdgpu_ras_in_recovery(adev))
> -             return PCI_ERS_RESULT_RECOVERED;
> -
>       dev_info(adev->dev, "PCI error: slot reset callback!!\n");
>
>       memset(&reset_context, 0, sizeof(reset_context));
> --
> 2.34.1