[PATCH 3/3] drm/amdgpu: Correct the loss of aca bank reg info

Wed Aug 13 09:48:17 UTC 2025

[AMD Official Use Only - AMD Internal Distribution Only]

This issue is caused by a delay in the SMU polling the MCA bank info; the driver can add a corresponding delay before sending the message to the SMU to query the MCA bank info.

Regards,
Stanley
From: Sun, Ce(Overlord) <Ce.Sun at amd.com>
Sent: Tuesday, August 12, 2025 7:10 PM
To: Chai, Thomas <YiPeng.Chai at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
Subject: Re: [PATCH 3/3] drm/amdgpu: Correct the loss of aca bank reg info


[AMD Official Use Only - AMD Internal Distribution Only]

-----Original Message-----
From: Sun, Ce(Overlord) <Ce.Sun at amd.com<mailto:Ce.Sun at amd.com>>
Sent: Tuesday, August 12, 2025 3:35 PM
To: amd-gfx at lists.freedesktop.org<mailto:amd-gfx at lists.freedesktop.org>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com<mailto:Tao.Zhou1 at amd.com>>; Yang, Stanley <Stanley.Yang at amd.com<mailto:Stanley.Yang at amd.com>>; Zhang, Hawking <Hawking.Zhang at amd.com<mailto:Hawking.Zhang at amd.com>>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com<mailto:KevinYang.Wang at amd.com>>; Chai, Thomas <YiPeng.Chai at amd.com<mailto:YiPeng.Chai at amd.com>>; Sun, Ce(Overlord) <Ce.Sun at amd.com<mailto:Ce.Sun at amd.com>>
Subject: [PATCH 3/3] drm/amdgpu: Correct the loss of aca bank reg info

Poll the ACA bank count to ensure that valid ACA bank register info can be obtained.

Signed-off-by: Ce Sun <cesun102 at amd.com<mailto:cesun102 at amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 46 +++++++------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  2 --
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  |  7 ----
 3 files changed, 13 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f00a9e0c9c47..ad8ad08f0f33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -122,7 +122,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)

-#define MAX_UMC_POISON_POLLING_TIME_ASYNC  300  //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC  50  //ms

 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms

@@ -3317,8 +3317,6 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
        mutex_init(&ecc_log->lock);

        INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
-       ecc_log->de_queried_count = 0;
-       ecc_log->prev_de_queried_count = 0;
 }

 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -3337,8 +3335,6 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
        mutex_unlock(&ecc_log->lock);

        mutex_destroy(&ecc_log->lock);
-       ecc_log->de_queried_count = 0;
-       ecc_log->prev_de_queried_count = 0;
 }

 static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, @@ -3386,49 +3382,33 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
                                uint32_t poison_creation_count)
 {
        int ret = 0;
-       struct ras_ecc_log_info *ecc_log;
        struct ras_query_if info;
-       uint32_t timeout = 0;
+       uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-       uint64_t de_queried_count;
-       uint32_t new_detect_count, total_detect_count;
-       uint32_t need_query_count = poison_creation_count;
        enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
+       uint64_t prev_de_queried_count = 0;
+       uint64_t bank_count = 0;

        memset(&info, 0, sizeof(info));
        info.head.block = AMDGPU_RAS_BLOCK__UMC;

-       ecc_log = &ras->umc_ecc_log;
-       total_detect_count = 0;
        do {
                ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
                if (ret)
                        return ret;

-               de_queried_count = ecc_log->de_queried_count;
-               if (de_queried_count > ecc_log->prev_de_queried_count) {
-                       new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
-                       ecc_log->prev_de_queried_count = de_queried_count;
-                       timeout = 0;
+               bank_count = amdgpu_aca_get_bank_count(adev);

[Thomas] Is bank_count only used for UMC deferred errors, or does it also include UMC CE/DE and other RAS block bank errors?
[Ce,Sun] Hi Thomas, thank you for your review.
Yes, it includes UMC CE/DE and other RAS block bank errors here. If many bank errors were stuck earlier, we will read out all of the ones that were stuck earlier, with bank count=12.
                  The amdgpu_ras_poison_creation_handler function is used to handle UMC deferred errors; it does not cover UMC CE or other RAS block bank errors.
[Ce,Sun] As mentioned earlier, if many UMC CEs are stuck and were not handled earlier, they will be reported through amdgpu_ras_poison_creation_handler. I think how to handle or parse them is up to the ACA parser; our aim is still to read out all of the bank reg info. If UMC CE and other RAS block bank errors got stuck earlier, I think it should be normal behavior to parse out all the UMC CE and other RAS block bank errors that were not reported earlier through the creation/consumption interrupt.

+               if (bank_count) {
+                       prev_de_queried_count = bank_count;
+                       amdgpu_aca_clear_bank_count(adev);
+                       timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
                } else {
-                       new_detect_count = 0;
-               }
-
-               if (new_detect_count) {
-                       total_detect_count += new_detect_count;
-               } else {
-                       if (!timeout && need_query_count)
-                               timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
-
-                       if (timeout) {
-                               if (!--timeout)
-                                       break;
-                               msleep(1);
-                       }
+                       --timeout;
+                       msleep(1);
                }
-       } while (total_detect_count < need_query_count);
+       } while (timeout);

-       if (total_detect_count)
+       if (prev_de_queried_count)
                schedule_delayed_work(&ras->page_retirement_dwork, 0);

        if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index ff63020f9c6c..132b45a362c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -492,8 +492,6 @@ struct ras_ecc_err {  struct ras_ecc_log_info {

________________________________
From: Chai, Thomas <YiPeng.Chai at amd.com<mailto:YiPeng.Chai at amd.com>>
Sent: Tuesday, August 12, 2025 4:33 PM
To: Sun, Ce(Overlord) <Ce.Sun at amd.com<mailto:Ce.Sun at amd.com>>; amd-gfx at lists.freedesktop.org<mailto:amd-gfx at lists.freedesktop.org> <amd-gfx at lists.freedesktop.org<mailto:amd-gfx at lists.freedesktop.org>>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com<mailto:Tao.Zhou1 at amd.com>>; Yang, Stanley <Stanley.Yang at amd.com<mailto:Stanley.Yang at amd.com>>; Zhang, Hawking <Hawking.Zhang at amd.com<mailto:Hawking.Zhang at amd.com>>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com<mailto:KevinYang.Wang at amd.com>>
Subject: RE: [PATCH 3/3] drm/amdgpu: Correct the loss of aca bank reg info

[AMD Official Use Only - AMD Internal Distribution Only]

-----Original Message-----
From: Sun, Ce(Overlord) <Ce.Sun at amd.com<mailto:Ce.Sun at amd.com>>
Sent: Tuesday, August 12, 2025 3:35 PM
To: amd-gfx at lists.freedesktop.org<mailto:amd-gfx at lists.freedesktop.org>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com<mailto:Tao.Zhou1 at amd.com>>; Yang, Stanley <Stanley.Yang at amd.com<mailto:Stanley.Yang at amd.com>>; Zhang, Hawking <Hawking.Zhang at amd.com<mailto:Hawking.Zhang at amd.com>>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com<mailto:KevinYang.Wang at amd.com>>; Chai, Thomas <YiPeng.Chai at amd.com<mailto:YiPeng.Chai at amd.com>>; Sun, Ce(Overlord) <Ce.Sun at amd.com<mailto:Ce.Sun at amd.com>>
Subject: [PATCH 3/3] drm/amdgpu: Correct the loss of aca bank reg info

Poll the ACA bank count to ensure that valid ACA bank register info can be obtained.

Signed-off-by: Ce Sun <cesun102 at amd.com<mailto:cesun102 at amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 46 +++++++------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  2 --
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  |  7 ----
 3 files changed, 13 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f00a9e0c9c47..ad8ad08f0f33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -122,7 +122,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)

-#define MAX_UMC_POISON_POLLING_TIME_ASYNC  300  //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC  50  //ms

 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms

@@ -3317,8 +3317,6 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
        mutex_init(&ecc_log->lock);

        INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
-       ecc_log->de_queried_count = 0;
-       ecc_log->prev_de_queried_count = 0;
 }

 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -3337,8 +3335,6 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
        mutex_unlock(&ecc_log->lock);

        mutex_destroy(&ecc_log->lock);
-       ecc_log->de_queried_count = 0;
-       ecc_log->prev_de_queried_count = 0;
 }

 static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, @@ -3386,49 +3382,33 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
                                uint32_t poison_creation_count)
 {
        int ret = 0;
-       struct ras_ecc_log_info *ecc_log;
        struct ras_query_if info;
-       uint32_t timeout = 0;
+       uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-       uint64_t de_queried_count;
-       uint32_t new_detect_count, total_detect_count;
-       uint32_t need_query_count = poison_creation_count;
        enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
+       uint64_t prev_de_queried_count = 0;
+       uint64_t bank_count = 0;

        memset(&info, 0, sizeof(info));
        info.head.block = AMDGPU_RAS_BLOCK__UMC;

-       ecc_log = &ras->umc_ecc_log;
-       total_detect_count = 0;
        do {
                ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
                if (ret)
                        return ret;

-               de_queried_count = ecc_log->de_queried_count;
-               if (de_queried_count > ecc_log->prev_de_queried_count) {
-                       new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
-                       ecc_log->prev_de_queried_count = de_queried_count;
-                       timeout = 0;
+               bank_count = amdgpu_aca_get_bank_count(adev);

[Thomas] Is bank_count only used for UMC deferred errors, or does it also include UMC CE/DE and other RAS block bank errors?
                  The amdgpu_ras_poison_creation_handler function is used to handle UMC deferred errors; it does not cover UMC CE or other RAS block bank errors.

+               if (bank_count) {
+                       prev_de_queried_count = bank_count;
+                       amdgpu_aca_clear_bank_count(adev);
+                       timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
                } else {
-                       new_detect_count = 0;
-               }
-
-               if (new_detect_count) {
-                       total_detect_count += new_detect_count;
-               } else {
-                       if (!timeout && need_query_count)
-                               timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
-
-                       if (timeout) {
-                               if (!--timeout)
-                                       break;
-                               msleep(1);
-                       }
+                       --timeout;
+                       msleep(1);
                }
-       } while (total_detect_count < need_query_count);
+       } while (timeout);

-       if (total_detect_count)
+       if (prev_de_queried_count)
                schedule_delayed_work(&ras->page_retirement_dwork, 0);

        if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index ff63020f9c6c..132b45a362c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -492,8 +492,6 @@ struct ras_ecc_err {  struct ras_ecc_log_info {
        struct mutex lock;
        struct radix_tree_root de_page_tree;
-       uint64_t        de_queried_count;
-       uint64_t        prev_de_queried_count;
 };

 struct ras_critical_region {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index e590cbdd8de9..8dbffe4d22d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -581,17 +581,10 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,

        ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
        if (ret) {
-               if (ret == -EEXIST)
-                       con->umc_ecc_log.de_queried_count++;
-               else
-                       dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
-
                kfree(ecc_err);
                return ret;
        }

-       con->umc_ecc_log.de_queried_count++;
-
        memset(page_pfn, 0, sizeof(page_pfn));
        count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
                                pa_addr,
--
2.34.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20250813/4437f218/attachment-0001.htm>