[PATCH 4/4 v3] drm/amdgpu: Correct the loss of aca bank reg info

Fri Aug 15 08:57:11 UTC 2025

[AMD Official Use Only - AMD Internal Distribution Only]

+
+               bank_count = amdgpu_aca_get_bank_count(adev);
+               if (bank_count) {
+                       total_bank_count += bank_count;
+                       amdgpu_aca_clear_bank_count(adev);

I seem to have misunderstood the usage of 'bank_count' before. If it is only used as a marker, it is best not to record it in ACA, as that seems unreasonable.
If you only want to confirm whether new ACA banks are available, you can record the bank sequence number outside of ACA and check whether the new sequence number is larger than the saved one.

Best Regards,
Kevin

-----Original Message-----
From: Sun, Ce(Overlord) <Ce.Sun at amd.com>
Sent: Friday, August 15, 2025 12:12
To: amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>; Sun, Ce(Overlord) <Ce.Sun at amd.com>
Subject: [PATCH 4/4 v3] drm/amdgpu: Correct the loss of aca bank reg info

Poll the ACA bank count to ensure that valid ACA bank reg info can be obtained.

v2: add a corresponding delay before sending the msg to SMU to query MCA bank info.
(Stanley)

v3: fix the issue that the loop cannot exit. (Thomas)

Signed-off-by: Ce Sun <cesun102 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 65 +++++++++++++------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 +-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 12 +++--
 3 files changed, 44 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 31850a47a41f..9ccc1fbca14f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -122,7 +122,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)

-#define MAX_UMC_POISON_POLLING_TIME_ASYNC  300  //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC  50

 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms

@@ -131,6 +131,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 #define BYPASS_ALLOCATED_ADDRESS        0x0
 #define BYPASS_INITIALIZATION_ADDRESS   0x1

+#define MAX_BANK_COUNT 12
+
 enum amdgpu_ras_retire_page_reservation {
        AMDGPU_RAS_RETIRE_PAGE_RESERVED,
        AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -3306,8 +3308,8 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
        mutex_init(&ecc_log->lock);

        INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
-       ecc_log->de_queried_count = 0;
-       ecc_log->prev_de_queried_count = 0;
+       ecc_log->consumption_de_count = 0;
+       ecc_log->creation_de_count = 0;
 }

 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
@@ -3326,8 +3328,8 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
        mutex_unlock(&ecc_log->lock);

        mutex_destroy(&ecc_log->lock);
-       ecc_log->de_queried_count = 0;
-       ecc_log->prev_de_queried_count = 0;
+       ecc_log->consumption_de_count = 0;
+       ecc_log->creation_de_count = 0;
 }
 #endif

@@ -3381,49 +3383,48 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
                                uint32_t poison_creation_count)
 {
        int ret = 0;
-       struct ras_ecc_log_info *ecc_log;
        struct ras_query_if info;
-       uint32_t timeout = 0;
+       struct ras_ecc_log_info *ecc_log;
+       uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-       uint64_t de_queried_count;
-       uint32_t new_detect_count, total_detect_count;
-       uint32_t need_query_count = poison_creation_count;
+       uint64_t creation_de_count = 0;
+       uint64_t consumption_de_count = 0;
        enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
+       uint64_t bank_count = 0;
+       uint64_t total_bank_count = 0;

        memset(&info, 0, sizeof(info));
        info.head.block = AMDGPU_RAS_BLOCK__UMC;
-
        ecc_log = &ras->umc_ecc_log;
-       total_detect_count = 0;
+
        do {
                ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
                if (ret)
                        return ret;
-
-               de_queried_count = ecc_log->de_queried_count;
-               if (de_queried_count > ecc_log->prev_de_queried_count) {
-                       new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
-                       ecc_log->prev_de_queried_count = de_queried_count;
-                       timeout = 0;
+               creation_de_count = ecc_log->creation_de_count;
+               consumption_de_count = ecc_log->consumption_de_count;
+
+               bank_count = amdgpu_aca_get_bank_count(adev);
+               if (bank_count) {
+                       total_bank_count += bank_count;
+                       amdgpu_aca_clear_bank_count(adev);
+                       timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
                } else {
-                       new_detect_count = 0;
+                       --timeout;
+                       msleep(20);
                }

-               if (new_detect_count) {
-                       total_detect_count += new_detect_count;
-               } else {
-                       if (!timeout && need_query_count)
-                               timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
+               if (creation_de_count && consumption_de_count)
+                       break;

-                       if (timeout) {
-                               if (!--timeout)
-                                       break;
-                               msleep(1);
-                       }
-               }
-       } while (total_detect_count < need_query_count);
+               if (total_bank_count >= MAX_BANK_COUNT)
+                       break;
+       } while (timeout);
+
+       ecc_log->creation_de_count = 0;
+       ecc_log->consumption_de_count = 0;

-       if (total_detect_count)
+       if (consumption_de_count)
                schedule_delayed_work(&ras->page_retirement_dwork, 0);

        return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 6265dac0e1c0..b4eb427409ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -492,8 +492,8 @@ struct ras_ecc_err {
 struct ras_ecc_log_info {
        struct mutex lock;
        struct radix_tree_root de_page_tree;
-       uint64_t        de_queried_count;
-       uint64_t        prev_de_queried_count;
+       uint64_t consumption_de_count;
+       uint64_t creation_de_count;
 };

 struct amdgpu_ras {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index e590cbdd8de9..11b99095efd3 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -536,8 +536,14 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
        hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
        mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);

-       if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0))
+       /* only creation/consumption defer error can access here.
+        * MCA_UMC_HWID_V12_0/MCA_UMC_MCATYPE_V12_0.
+        * It is the hwid/mactype of the consumption defer error
+        * */
+       if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0)) {
+               con->umc_ecc_log.creation_de_count++;
                return 0;
+       }

        if (!status)
                return 0;
@@ -582,7 +588,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
        ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
        if (ret) {
                if (ret == -EEXIST)
-                       con->umc_ecc_log.de_queried_count++;
+                       con->umc_ecc_log.consumption_de_count++;
                else
                        dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);

@@ -590,7 +596,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
                return ret;
        }

-       con->umc_ecc_log.de_queried_count++;
+       con->umc_ecc_log.consumption_de_count++;

        memset(page_pfn, 0, sizeof(page_pfn));
        count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
--
2.34.1