[PATCH V2 4/5] drm/amdgpu: add interface to check mca umc status
YiPeng Chai
YiPeng.Chai at amd.com
Thu Jan 18 06:42:56 UTC 2024
Add interface to check mca umc status.
Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 12 ++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 4 +++-
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 20 +++++++++++++++++++
.../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 6 +++---
5 files changed, 38 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index fde20857b3dd..65ed8bb5c120 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -27,6 +27,16 @@
#include "umc/umc_6_7_0_offset.h"
#include "umc/umc_6_7_0_sh_mask.h"
+static bool amdgpu_mca_is_deferred_error(struct amdgpu_device *adev,
+ uint64_t mc_status)
+{
+ if (adev->umc.ras->check_ecc_err_status)
+ return adev->umc.ras->check_ecc_err_status(adev,
+ AMDGPU_MCA_ERROR_TYPE_DE, &mc_status);
+
+ return false;
+}
+
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr,
unsigned long *error_count)
@@ -257,7 +267,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
amdgpu_ras_error_statistic_ue_count(err_data,
&mcm_info, &err_addr, (uint64_t)count);
else {
- if (!!(MCA_REG__STATUS__DEFERRED(entry->regs[MCA_REG_IDX_STATUS])))
+ if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS]))
amdgpu_ras_error_statistic_de_count(err_data,
&mcm_info, &err_addr, (uint64_t)count);
else
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index b399f1b62887..b964110ed1e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -65,6 +65,7 @@ enum amdgpu_mca_ip {
enum amdgpu_mca_error_type {
AMDGPU_MCA_ERROR_TYPE_UE = 0,
AMDGPU_MCA_ERROR_TYPE_CE,
+ AMDGPU_MCA_ERROR_TYPE_DE,
};
struct amdgpu_mca_ras_block {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index de2dc1853636..83199296ed10 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -21,7 +21,7 @@
#ifndef __AMDGPU_UMC_H__
#define __AMDGPU_UMC_H__
#include "amdgpu_ras.h"
-
+#include "amdgpu_mca.h"
/*
* (addr / 256) * 4096, the higher 26 bits in ErrorAddr
* is the index of 4KB block
@@ -64,6 +64,8 @@ struct amdgpu_umc_ras {
void *ras_error_status);
void (*ecc_info_query_ras_error_address)(struct amdgpu_device *adev,
void *ras_error_status);
+ bool (*check_ecc_err_status)(struct amdgpu_device *adev,
+ enum amdgpu_mca_error_type type, void *ras_error_status);
/* support different eeprom table version for different asic */
void (*set_eeprom_table_version)(struct amdgpu_ras_eeprom_table_header *hdr);
};
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index fa2168f1d3bf..1e8e97d72f1e 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -425,6 +425,25 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
}
}
+static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
+ enum amdgpu_mca_error_type type, void *ras_error_status)
+{
+ uint64_t mc_umc_status = *(uint64_t *)ras_error_status;
+
+ switch (type) {
+ case AMDGPU_MCA_ERROR_TYPE_UE:
+ return umc_v12_0_is_uncorrectable_error(adev, mc_umc_status);
+ case AMDGPU_MCA_ERROR_TYPE_CE:
+ return umc_v12_0_is_correctable_error(adev, mc_umc_status);
+ case AMDGPU_MCA_ERROR_TYPE_DE:
+ return umc_v12_0_is_deferred_error(adev, mc_umc_status);
+ default:
+ return false;
+ }
+
+ return false;
+}
+
static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev)
{
amdgpu_umc_loop_channels(adev,
@@ -510,5 +529,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
.ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
.ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
+ .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
};
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..67fc01e0f9c6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2557,9 +2557,9 @@ static int mca_umc_mca_get_err_count(const struct mca_ras_info *mca_ras, struct
return 0;
}
- if ((type == AMDGPU_MCA_ERROR_TYPE_UE && umc_v12_0_is_uncorrectable_error(adev, status0)) ||
- (type == AMDGPU_MCA_ERROR_TYPE_CE && (umc_v12_0_is_correctable_error(adev, status0) ||
- umc_v12_0_is_deferred_error(adev, status0))))
+ if (umc_v12_0_is_deferred_error(adev, status0) ||
+ umc_v12_0_is_uncorrectable_error(adev, status0) ||
+ umc_v12_0_is_correctable_error(adev, status0))
*count = 1;
return 0;
--
2.34.1
More information about the amd-gfx
mailing list