[PATCH] drm/amdgpu: Parse all deferred errors with UMC aca handle

Zhang, Hawking Hawking.Zhang at amd.com
Tue Mar 25 06:41:56 UTC 2025


[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>

Regards,
Hawking
-----Original Message-----
From: Liu, Xiang(Dean) <Xiang.Liu at amd.com>
Sent: Monday, March 24, 2025 22:14
To: amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Liu, Xiang(Dean) <Xiang.Liu at amd.com>
Subject: [PATCH] drm/amdgpu: Parse all deferred errors with UMC aca handle

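Deferred errors should only be counted in the UMC block. Parse every bank
that reports a deferred error with the UMC ACA handle, and stop decoding
deferred errors in the other IP blocks' bank parsers.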

Signed-off-by: Xiang Liu <xiang.liu at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c  | 4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h  | 8 --------
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c  | 8 +++-----
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c  | 2 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c   | 3 ++-
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c  | 2 +-
 9 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index dc47f5fd4ea1..b4ad163f42a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -195,6 +195,10 @@ static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
 {
        const struct aca_bank_ops *bank_ops = handle->bank_ops;

+       /* Parse all deferred errors with UMC aca handle */
+       if (ACA_BANK_ERR_IS_DEFFERED(bank))
+               return handle->hwip == ACA_HWIP_TYPE_UMC;
+
        if (!aca_bank_hwip_is_matched(bank, handle->hwip))
                return false;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
index 6b180f1b33fd..38c88897e1ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
@@ -80,14 +80,6 @@ struct ras_query_context;
        (ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
         ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS]))

-#define ACA_BANK_ERR_CE_DE_DECODE(bank)                             \
-       (ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
-                                         ACA_ERROR_TYPE_CE)
-
-#define ACA_BANK_ERR_UE_DE_DECODE(bank)                             \
-       (ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
-                                         ACA_ERROR_TYPE_UE)
-
 enum aca_reg_idx {
        ACA_REG_IDX_CTL                 = 0,
        ACA_REG_IDX_STATUS              = 1,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index be426542c1ae..af62688d34bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1134,7 +1134,7 @@ static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_ban
                break;
        case ACA_SMU_TYPE_CE:
                count = ext_error_code == 6 ? count : 0ULL;
-               bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
+               bank->aca_err_type = ACA_ERROR_TYPE_CE;
                ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, count);
                break;
        default:
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index e84238336fb6..ff77f59a1499 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -893,15 +893,13 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,

        switch (type) {
        case ACA_SMU_TYPE_UE:
-               bank->aca_err_type = ACA_BANK_ERR_UE_DE_DECODE(bank);
+               bank->aca_err_type = ACA_ERROR_TYPE_UE;
                ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, 1ULL);
                break;
        case ACA_SMU_TYPE_CE:
-               bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
+               bank->aca_err_type = ACA_ERROR_TYPE_CE;
                ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
-                       (bank->aca_err_type == ACA_ERROR_TYPE_CE) ?
-                               ACA_REG__MISC0__ERRCNT(misc0) :
-                               1);
+                                                    ACA_REG__MISC0__ERRCNT(misc0));
                break;
        default:
                return -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index a758f0889d29..41afabd812d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -1290,7 +1290,7 @@ static int jpeg_v4_0_3_aca_bank_parser(struct aca_handle *handle, struct aca_ban
                                                     1ULL);
                break;
        case ACA_SMU_TYPE_CE:
-               bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
+               bank->aca_err_type = ACA_ERROR_TYPE_CE;
                ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
                                                     ACA_REG__MISC0__ERRCNT(misc0));
                break;
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index a54e7b929295..84cde1239ee4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -751,7 +751,7 @@ static int mmhub_v1_8_aca_bank_parser(struct aca_handle *handle, struct aca_bank
                                                     1ULL);
                break;
        case ACA_SMU_TYPE_CE:
-               bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
+               bank->aca_err_type = ACA_ERROR_TYPE_CE;
                ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
                                                     ACA_REG__MISC0__ERRCNT(misc0));
                break;
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 2c659470946c..c1c59637bd51 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -2397,7 +2397,7 @@ static int sdma_v4_4_2_aca_bank_parser(struct aca_handle *handle, struct aca_ban
                                                     1ULL);
                break;
        case ACA_SMU_TYPE_CE:
-               bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
+               bank->aca_err_type = ACA_ERROR_TYPE_CE;
                ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
                                                     ACA_REG__MISC0__ERRCNT(misc0));
                break;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 74f57b2d30a5..0e404c074975 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -85,7 +85,8 @@ bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_sta

        return (amdgpu_ras_is_poison_mode_supported(adev) &&
                (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
-               (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1));
+               ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1) ||
+               (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Poison) == 1)));
 }

 bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index 20f47947e894..1e4ec8f07896 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -1940,7 +1940,7 @@ static int vcn_v4_0_3_aca_bank_parser(struct aca_handle *handle, struct aca_bank
                                                     1ULL);
                break;
        case ACA_SMU_TYPE_CE:
-               bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);
+               bank->aca_err_type = ACA_ERROR_TYPE_CE;
                ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
                                                     ACA_REG__MISC0__ERRCNT(misc0));
                break;
--
2.34.1



More information about the amd-gfx mailing list