[PATCH 5/5] drm/amdgpu: add ras fatal flag to distinguish fatal error reset
Zhang, Hawking
Hawking.Zhang at amd.com
Wed Jun 5 08:21:50 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
Series is
Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Friday, May 31, 2024 18:49
To: amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 5/5] drm/amdgpu: add ras fatal flag to distinguish fatal error reset
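Add a RAS fatal flag (per device and per XGMI hive) to distinguish a fatal error reset from other RAS recovery. Check it in mode1 reset.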
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 32 ++++++++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 1 +
.../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 +-
.../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 2 +-
.../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +-
6 files changed, 37 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2071e30d7e56..97b770ba6424 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2451,6 +2451,26 @@ bool amdgpu_ras_in_recovery(struct amdgpu_device *adev)
return false;
}
+bool amdgpu_ras_in_fatal(struct amdgpu_device *adev) {
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ int hive_ras_fatal = 0;
+
+ if (!amdgpu_ras_in_recovery(adev))
+ return false;
+
+ if (hive) {
+ hive_ras_fatal = atomic_read(&hive->ras_fatal);
+ amdgpu_put_xgmi_hive(hive);
+ }
+
+ if (ras && (atomic_read(&ras->in_fatal) || hive_ras_fatal))
+ return true;
+
+ return false;
+}
+
static void amdgpu_ras_do_recovery(struct work_struct *work) {
struct amdgpu_ras *ras =
@@ -2462,6 +2482,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
if (hive) {
atomic_set(&hive->ras_recovery, 1);
+ if (atomic_read(&ras->in_fatal))
+ atomic_set(&hive->ras_fatal, 1);
/* If any device which is part of the hive received RAS fatal
* error interrupt, set fatal error status on all. This
@@ -2526,8 +2548,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
}
atomic_set(&ras->in_recovery, 0);
+ atomic_set(&ras->in_fatal, 0);
if (hive) {
atomic_set(&hive->ras_recovery, 0);
+ atomic_set(&hive->ras_fatal, 0);
amdgpu_put_xgmi_hive(hive);
}
}
@@ -2982,6 +3006,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
mutex_init(&con->recovery_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
+ atomic_set(&con->in_fatal, 0);
con->eeprom_control.bad_channel_bitmap = 0;
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
@@ -4006,8 +4031,13 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal)
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
}
- if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
+ if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) {
+ if (fatal)
+ atomic_set(&ras->in_fatal, 1);
+
amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
+ }
+
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index ed5793458a70..444a7fb7fbe3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -489,6 +489,7 @@ struct amdgpu_ras {
/* gpu recovery */
struct work_struct recovery_work;
atomic_t in_recovery;
+ atomic_t in_fatal;
struct amdgpu_device *adev;
/* error handler data */
struct ras_err_handler_data *eh_data;
@@ -953,6 +954,7 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
pasid_notify pasid_fn, void *data, uint32_t reset);
bool amdgpu_ras_in_recovery(struct amdgpu_device *adev);
+bool amdgpu_ras_in_fatal(struct amdgpu_device *adev);
__printf(3, 4)
void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index a3bfc16de6d4..a6d6272a4ec6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -44,6 +44,7 @@ struct amdgpu_hive_info {
struct amdgpu_reset_domain *reset_domain;
atomic_t ras_recovery;
+ atomic_t ras_fatal;
struct ras_event_manager event_mgr;
};
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 04533f99f1e3..a850e7b29d9d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1876,7 +1876,7 @@ static int aldebaran_mode1_reset(struct smu_context *smu)
/* fatal error triggered by ras, PMFW supports the flag
from 68.44.0 */
if ((smu->smc_fw_version >= 0x00442c00) &&
- amdgpu_ras_in_recovery(adev))
+ amdgpu_ras_in_fatal(adev))
fatal_err = 1;
param |= (fatal_err << 16);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index d1766a603bb9..d6c6c9a08e9d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -2788,7 +2788,7 @@ static void smu_v13_0_0_set_mode1_reset_param(struct smu_context *smu,
struct amdgpu_device *adev = smu->adev;
if ((smu->smc_fw_version >= supported_version) &&
- amdgpu_ras_in_recovery(adev))
+ amdgpu_ras_in_fatal(adev))
/* Set RAS fatal error reset flag */
*param = 1 << 16;
else
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index c1d7528a6dc8..4434872bbe2e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2580,7 +2580,7 @@ static int smu_v13_0_6_mode1_reset(struct smu_context *smu)
param = SMU_RESET_MODE_1;
/* fatal error triggered by ras, PMFW supports the flag */
- if (amdgpu_ras_in_recovery(adev))
+ if (amdgpu_ras_in_fatal(adev))
fatal_err = 1;
param |= (fatal_err << 16);
--
2.34.1
More information about the amd-gfx
mailing list