[PATCH] drm/amdgpu: Parse all deferred errors with UMC aca handle
Zhang, Hawking
Hawking.Zhang at amd.com
Tue Mar 25 06:41:56 UTC 2025
[AMD Official Use Only - AMD Internal Distribution Only]
Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
Regards,
Hawking
-----Original Message-----
From: Liu, Xiang(Dean) <Xiang.Liu at amd.com>
Sent: Monday, March 24, 2025 22:14
To: amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Liu, Xiang(Dean) <Xiang.Liu at amd.com>
Subject: [PATCH] drm/amdgpu: Parse all deferred errors with UMC aca handle
We should only increase the deferred errors in UMC block.
Signed-off-by: Xiang Liu <xiang.liu at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 8 -------- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 8 +++----- drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 2 +- drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 2 +- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 2 +-
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 3 ++-
drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 2 +-
9 files changed, 14 insertions(+), 19 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index dc47f5fd4ea1..b4ad163f42a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -195,6 +195,10 @@ static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, {
const struct aca_bank_ops *bank_ops = handle->bank_ops;
+ /* Parse all deferred errors with UMC aca handle */
+ if (ACA_BANK_ERR_IS_DEFFERED(bank))
+ return handle->hwip == ACA_HWIP_TYPE_UMC;
+
if (!aca_bank_hwip_is_matched(bank, handle->hwip))
return false;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
index 6b180f1b33fd..38c88897e1ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
@@ -80,14 +80,6 @@ struct ras_query_context;
(ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS]))
-#define ACA_BANK_ERR_CE_DE_DECODE(bank) \
- (ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
- ACA_ERROR_TYPE_CE)
-
-#define ACA_BANK_ERR_UE_DE_DECODE(bank) \
- (ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
- ACA_ERROR_TYPE_UE)
-
enum aca_reg_idx {
ACA_REG_IDX_CTL = 0,
ACA_REG_IDX_STATUS = 1,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index be426542c1ae..af62688d34bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1134,7 +1134,7 @@ static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_ban
break;
case ACA_SMU_TYPE_CE:
count = ext_error_code == 6 ? count : 0ULL;
- bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
+ bank->aca_err_type = ACA_ERROR_TYPE_CE;
ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, count);
break;
default:
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index e84238336fb6..ff77f59a1499 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -893,15 +893,13 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
switch (type) {
case ACA_SMU_TYPE_UE:
- bank->aca_err_type = ACA_BANK_ERR_UE_DE_DECODE(bank);
+ bank->aca_err_type = ACA_ERROR_TYPE_UE;
ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, 1ULL);
break;
case ACA_SMU_TYPE_CE:
- bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
+ bank->aca_err_type = ACA_ERROR_TYPE_CE;
ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
- (bank->aca_err_type == ACA_ERROR_TYPE_CE) ?
- ACA_REG__MISC0__ERRCNT(misc0) :
- 1);
+ ACA_REG__MISC0__ERRCNT(misc0));
break;
default:
return -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index a758f0889d29..41afabd812d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -1290,7 +1290,7 @@ static int jpeg_v4_0_3_aca_bank_parser(struct aca_handle *handle, struct aca_ban
1ULL);
break;
case ACA_SMU_TYPE_CE:
- bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
+ bank->aca_err_type = ACA_ERROR_TYPE_CE;
ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
ACA_REG__MISC0__ERRCNT(misc0));
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index a54e7b929295..84cde1239ee4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -751,7 +751,7 @@ static int mmhub_v1_8_aca_bank_parser(struct aca_handle *handle, struct aca_bank
1ULL);
break;
case ACA_SMU_TYPE_CE:
- bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
+ bank->aca_err_type = ACA_ERROR_TYPE_CE;
ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
ACA_REG__MISC0__ERRCNT(misc0));
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 2c659470946c..c1c59637bd51 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -2397,7 +2397,7 @@ static int sdma_v4_4_2_aca_bank_parser(struct aca_handle *handle, struct aca_ban
1ULL);
break;
case ACA_SMU_TYPE_CE:
- bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
+ bank->aca_err_type = ACA_ERROR_TYPE_CE;
ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
ACA_REG__MISC0__ERRCNT(misc0));
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 74f57b2d30a5..0e404c074975 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -85,7 +85,8 @@ bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_sta
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
- (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1));
+ ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1) ||
+ (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Poison) ==
+1)));
}
bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index 20f47947e894..1e4ec8f07896 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -1940,7 +1940,7 @@ static int vcn_v4_0_3_aca_bank_parser(struct aca_handle *handle, struct aca_bank
1ULL);
break;
case ACA_SMU_TYPE_CE:
- bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
+ bank->aca_err_type = ACA_ERROR_TYPE_CE;
ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
ACA_REG__MISC0__ERRCNT(misc0));
break;
--
2.34.1
More information about the amd-gfx
mailing list