[PATCH] drm/amdgpu: Effective health check before reset
Ce Sun
cesun102 at amd.com
Tue Jul 29 04:15:45 UTC 2025
Move amdgpu_device_health_check into amdgpu_device_gpu_recover to
ensure that if the device is present can be checked before reset
Signed-off-by: Ce Sun <cesun102 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 25 +++++++---------------
1 file changed, 8 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2659e3ebbe49..176712225037 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6129,12 +6129,11 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
return ret;
}
-static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
+static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
struct list_head *device_list,
struct amdgpu_hive_info *hive)
{
struct amdgpu_device *tmp_adev = NULL;
- int r;
/*
* Build list of devices to reset.
@@ -6155,13 +6154,6 @@ static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
list_add_tail(&adev->reset_list, device_list);
}
- if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
- r = amdgpu_device_health_check(device_list);
- if (r)
- return r;
- }
-
- return 0;
}
static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
@@ -6449,8 +6441,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
reset_context->hive = hive;
INIT_LIST_HEAD(&device_list);
- if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
- goto end_reset;
+ amdgpu_device_recovery_prepare(adev, &device_list, hive);
+
+ if (!amdgpu_sriov_vf(adev)) {
+ r = amdgpu_device_health_check(&device_list);
+ if (r)
+ goto end_reset;
+ }
/* We need to lock reset domain only once both for XGMI and single device */
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
@@ -6956,12 +6953,6 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
int r = 0, i;
u32 memsize;
- /* PCI error slot reset should be skipped During RAS recovery */
- if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
- amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
- amdgpu_ras_in_recovery(adev))
- return PCI_ERS_RESULT_RECOVERED;
-
dev_info(adev->dev, "PCI error: slot reset callback!!\n");
memset(&reset_context, 0, sizeof(reset_context));
--
2.34.1
More information about the amd-gfx
mailing list