[PATCH 1/2] drm/amdgpu: Add fatal error detected flag
Kamal, Asad
Asad.Kamal at amd.com
Thu Feb 22 13:23:56 UTC 2024
[AMD Official Use Only - General]
Series is Reviewed-by: Asad Kamal <asad.kamal at amd.com>
Thanks & Regards
Asad
-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar at amd.com>
Sent: Thursday, February 22, 2024 3:47 PM
To: amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>; Joshi, Mukul <Mukul.Joshi at amd.com>; Kamal, Asad <Asad.Kamal at amd.com>
Subject: [PATCH 1/2] drm/amdgpu: Add fatal error detected flag
For a RAS error that needs a full reset to recover, set the fatal error status. Clear the status once the device is reset.
Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 32 ++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 6 ++++
3 files changed, 39 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1ef892bea488..d475c54c0a08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5308,6 +5308,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* post card */
+ amdgpu_ras_set_fed(tmp_adev, false);
r = amdgpu_device_asic_init(tmp_adev);
if (r) {
dev_warn(tmp_adev->dev, "asic atom init failed!"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 46f3d1013e8c..2c94de305c69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2439,6 +2439,18 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ /* For any RAS error that needs a full reset to
+ * recover, set the fatal error status
+ */
+ if (hive) {
+ list_for_each_entry(remote_adev,
+ &hive->device_list,
+ gmc.xgmi.head)
+ amdgpu_ras_set_fed(remote_adev,
+ true);
+ } else {
+ amdgpu_ras_set_fed(adev, true);
+ }
psp_fatal_error_recovery_quirk(&adev->psp);
}
}
@@ -3440,6 +3452,26 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
return 0;
}
+bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev) {
+ struct amdgpu_ras *ras;
+
+ ras = amdgpu_ras_get_context(adev);
+ if (!ras)
+ return false;
+
+ return atomic_read(&ras->fed);
+}
+
+void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status) {
+ struct amdgpu_ras *ras;
+
+ ras = amdgpu_ras_get_context(adev);
+ if (ras)
+ atomic_set(&ras->fed, !!status);
+}
+
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) {
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index d10e5bb0e52f..e0f8ce9d8440 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -477,6 +477,8 @@ struct amdgpu_ras {
wait_queue_head_t page_retirement_wq;
struct mutex page_retirement_lock;
atomic_t page_retirement_req_cnt;
+ /* Fatal error detected flag */
+ atomic_t fed;
};
struct ras_fs_data {
@@ -873,4 +875,8 @@ void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info,
void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
struct ras_err_addr *mca_err_addr);
+
+void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status); bool
+amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
+
#endif
--
2.25.1
More information about the amd-gfx
mailing list