[PATCH 3/3 v2] drm/amdgpu: Correct the loss of aca bank reg info
Chai, Thomas
YiPeng.Chai at amd.com
Thu Aug 14 03:05:44 UTC 2025
[AMD Official Use Only - AMD Internal Distribution Only]
-----Original Message-----
From: Sun, Ce(Overlord) <Ce.Sun at amd.com>
Sent: Wednesday, August 13, 2025 6:36 PM
To: amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>; Sun, Ce(Overlord) <Ce.Sun at amd.com>
Subject: [PATCH 3/3 v2] drm/amdgpu: Correct the loss of aca bank reg info
Poll the ACA bank count to ensure that valid ACA bank reg info can be obtained.
v2: add a corresponding delay before sending the message to SMU to query MCA bank info.
(Stanley)
Signed-off-by: Ce Sun <cesun102 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 44 +++++++------------------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 --
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 7 +---
4 files changed, 14 insertions(+), 41 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 92c2370831b3..2beaf30ccb96 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -877,7 +877,7 @@ size_t amdgpu_aca_get_bank_count(struct amdgpu_device *adev)
void amdgpu_aca_clear_bank_count(struct amdgpu_device *adev) {
- atomic64_set(&aca->bank_count, 0);
+ atomic64_set(&adev->aca.bank_count, 0);
}
#if defined(CONFIG_DEBUG_FS)
static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 185b9e538f98..23f583492bfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3306,8 +3306,6 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
mutex_init(&ecc_log->lock);
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
- ecc_log->de_queried_count = 0;
- ecc_log->prev_de_queried_count = 0;
}
static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
@@ -3326,8 +3324,6 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
mutex_unlock(&ecc_log->lock);
mutex_destroy(&ecc_log->lock);
- ecc_log->de_queried_count = 0;
- ecc_log->prev_de_queried_count = 0;
}
#endif
@@ -3381,49 +3377,33 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
uint32_t poison_creation_count)
{
int ret = 0;
- struct ras_ecc_log_info *ecc_log;
struct ras_query_if info;
- uint32_t timeout = 0;
+ uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
- uint64_t de_queried_count;
- uint32_t new_detect_count, total_detect_count;
- uint32_t need_query_count = poison_creation_count;
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
+ uint64_t prev_de_queried_count = 0;
+ uint64_t bank_count = 0;
memset(&info, 0, sizeof(info));
info.head.block = AMDGPU_RAS_BLOCK__UMC;
- ecc_log = &ras->umc_ecc_log;
- total_detect_count = 0;
do {
ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
if (ret)
return ret;
- de_queried_count = ecc_log->de_queried_count;
- if (de_queried_count > ecc_log->prev_de_queried_count) {
- new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
- ecc_log->prev_de_queried_count = de_queried_count;
- timeout = 0;
+ bank_count = amdgpu_aca_get_bank_count(adev);
+ if (bank_count) {
+ prev_de_queried_count = bank_count;
+ amdgpu_aca_clear_bank_count(adev);
+ timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
} else {
- new_detect_count = 0;
- }
-
- if (new_detect_count) {
- total_detect_count += new_detect_count;
- } else {
- if (!timeout && need_query_count)
- timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
-
- if (timeout) {
- if (!--timeout)
- break;
- msleep(1);
- }
+ --timeout;
+ msleep(1);
}
- } while (total_detect_count < need_query_count);
+ } while (timeout);
[Thomas] As discussed offline, this code can cause a system hang under stress testing.
- if (total_detect_count)
+ if (prev_de_queried_count)
schedule_delayed_work(&ras->page_retirement_dwork, 0);
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 7f10a7402160..df93791eb645 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -492,8 +492,6 @@ struct ras_ecc_err {
struct ras_ecc_log_info {
struct mutex lock;
struct radix_tree_root de_page_tree;
- uint64_t de_queried_count;
- uint64_t prev_de_queried_count;
};
struct amdgpu_ras {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index e590cbdd8de9..b3bdcf70df2f 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -581,17 +581,12 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
if (ret) {
- if (ret == -EEXIST)
- con->umc_ecc_log.de_queried_count++;
- else
+ if (ret != -EEXIST)
dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
-
kfree(ecc_err);
return ret;
}
- con->umc_ecc_log.de_queried_count++;
-
memset(page_pfn, 0, sizeof(page_pfn));
count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
pa_addr,
--
2.34.1
More information about the amd-gfx
mailing list