[PATCH] drm/amdgpu: Effective health check before reset

Tue Jul 29 06:20:21 UTC 2025

[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Sun, Ce(Overlord) <Ce.Sun at amd.com>
> Sent: Tuesday, July 29, 2025 12:16 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>; Sun, Ce(Overlord) <Ce.Sun at amd.com>
> Subject: [PATCH] drm/amdgpu: Effective health check before reset
>
> Move amdgpu_device_health_check into amdgpu_device_gpu_recover to ensure
> that the presence of the device can be checked before reset.
>
> Signed-off-by: Ce Sun <cesun102 at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 25 +++++++---------------
>  1 file changed, 8 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2659e3ebbe49..176712225037 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6129,12 +6129,11 @@ static int amdgpu_device_health_check(struct
> list_head *device_list_handle)
>       return ret;
>  }
>
> -static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
> +static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
>                                         struct list_head *device_list,
>                                         struct amdgpu_hive_info *hive)
>  {
>       struct amdgpu_device *tmp_adev = NULL;
> -     int r;
>
>       /*
>        * Build list of devices to reset.
> @@ -6155,13 +6154,6 @@ static int amdgpu_device_recovery_prepare(struct
> amdgpu_device *adev,
>               list_add_tail(&adev->reset_list, device_list);
>       }
>
> -     if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
> -             r = amdgpu_device_health_check(device_list);
> -             if (r)
> -                     return r;
> -     }
> -
> -     return 0;
>  }
>
>  static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
> @@ -6449,8 +6441,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device
> *adev,
>       reset_context->hive = hive;
>       INIT_LIST_HEAD(&device_list);
>
> -     if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
> -             goto end_reset;
> +     amdgpu_device_recovery_prepare(adev, &device_list, hive);
> +
> +     if (!amdgpu_sriov_vf(adev)) {

[Tao] So the condition is also changed: the !adev->pcie_reset_ctx.occurs_dpc
check is dropped, and only !amdgpu_sriov_vf(adev) remains.
Please describe the issue you'd like to fix in more detail.

> +             r = amdgpu_device_health_check(&device_list);
> +             if (r)
> +                     goto end_reset;
> +     }
>
>       /* We need to lock reset domain only once both for XGMI and single device */
>       amdgpu_device_recovery_get_reset_lock(adev, &device_list);
> @@ -6956,12 +6953,6 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
>       int r = 0, i;
>       u32 memsize;
>
> -     /* PCI error slot reset should be skipped During RAS recovery */
> -     if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
> -         amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
> -         amdgpu_ras_in_recovery(adev))
> -             return PCI_ERS_RESULT_RECOVERED;
> -
>       dev_info(adev->dev, "PCI error: slot reset callback!!\n");
>
>       memset(&reset_context, 0, sizeof(reset_context));
> --
> 2.34.1