[PATCH] drm/amdgpu: Do a basic health check before reset

Wed Mar 13 09:48:29 UTC 2024

[AMD Official Use Only - General]

Reviewed-by: Asad Kamal <asad.kamal at amd.com>

Thanks & Regards
Asad

-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar at amd.com>
Sent: Wednesday, March 13, 2024 3:12 PM
To: amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>; Kamal, Asad <Asad.Kamal at amd.com>
Subject: [PATCH] drm/amdgpu: Do a basic health check before reset

Check if the device is still present on the bus before trying to recover. In some hang situations the device itself may have been lost from the bus.

Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e9454e6e4cb..b37113b79483 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5536,6 +5536,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)

 }

+/* Return 0 if every listed device answers config reads, -ENODEV otherwise. */
+static int amdgpu_device_health_check(struct list_head *device_list_handle)
+{
+       struct amdgpu_device *tmp_adev;
+       int ret = 0;
+       u32 status;
+
+       list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+               pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
+               if (PCI_POSSIBLE_ERROR(status)) { /* all-ones: no response */
+                       dev_err(tmp_adev->dev, "device lost from bus!\n");
+                       ret = -ENODEV;
+               }
+       }
+       return ret;
+}
+
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -5607,6 +5624,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                device_list_handle = &device_list;
        }

+       if (!amdgpu_sriov_vf(adev)) {
+               r = amdgpu_device_health_check(device_list_handle);
+               if (r)
+                       goto end_reset;
+       }
+
        /* We need to lock reset domain only once both for XGMI and single device */
        tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
                                    reset_list);
@@ -5772,6 +5795,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                                            reset_list);
        amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

+end_reset:
        if (hive) {
                mutex_unlock(&hive->hive_lock);
                amdgpu_put_xgmi_hive(hive);
--
2.25.1