<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
<meta name="Generator" content="Microsoft Word 15 (filtered medium)">
<!--[if !mso]><style>v\:* {behavior:url(#default#VML);}
o\:* {behavior:url(#default#VML);}
w\:* {behavior:url(#default#VML);}
.shape {behavior:url(#default#VML);}
</style><![endif]--><style><!--
/* Font Definitions */
@font-face
{font-family:"Cambria Math";
panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
{font-family:DengXian;
panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
{font-family:Aptos;}
@font-face
{font-family:"\@DengXian";
panose-1:2 1 6 0 3 1 1 1 1 1;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin:0in;
font-size:12.0pt;
font-family:"Aptos",sans-serif;}
a:link, span.MsoHyperlink
{mso-style-priority:99;
color:#467886;
text-decoration:underline;}
span.EmailStyle20
{mso-style-type:personal-reply;
font-family:"Aptos",sans-serif;
color:windowtext;}
.MsoChpDefault
{mso-style-type:export-only;
font-size:10.0pt;
mso-ligatures:none;}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
</head>
<body lang="EN-US" link="#467886" vlink="#96607D" style="word-wrap:break-word">
<p style="font-family:Calibri;font-size:10pt;color:#0000FF;margin:5pt;font-style:normal;font-weight:normal;text-decoration:none;" align="Left">
[AMD Official Use Only - AMD Internal Distribution Only]<br>
</p>
<br>
<div>
<div class="WordSection1">
<p class="MsoNormal">This issue is caused by a delay in the SMU's polling of MCA bank info; the driver can add a corresponding delay before sending the message to the SMU to query the MCA bank info.<o:p></o:p></p>
<p class="MsoNormal"><o:p> </o:p></p>
<p class="MsoNormal">Regards,<o:p></o:p></p>
<p class="MsoNormal">Stanley<o:p></o:p></p>
<div style="border:none;border-left:solid blue 1.5pt;padding:0in 0in 0in 4.0pt">
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in">
<p class="MsoNormal"><b><span style="font-size:11.0pt;font-family:'Calibri',sans-serif">From:</span></b><span style="font-size:11.0pt;font-family:'Calibri',sans-serif"> Sun, Ce(Overlord) &lt;Ce.Sun@amd.com&gt;
<br>
<b>Sent:</b> Tuesday, August 12, 2025 7:10 PM<br>
<b>To:</b> Chai, Thomas &lt;YiPeng.Chai@amd.com&gt;; amd-gfx@lists.freedesktop.org<br>
<b>Cc:</b> Zhou1, Tao &lt;Tao.Zhou1@amd.com&gt;; Yang, Stanley &lt;Stanley.Yang@amd.com&gt;; Zhang, Hawking &lt;Hawking.Zhang@amd.com&gt;; Wang, Yang(Kevin) &lt;KevinYang.Wang@amd.com&gt;<br>
<b>Subject:</b> Re: [PATCH 3/3] drm/amdgpu: Correct the loss of aca bank reg info<o:p></o:p></span></p>
</div>
</div>
<p class="MsoNormal"><o:p> </o:p></p>
<p style="margin:5.0pt"><span style="font-size:10.0pt;font-family:'Calibri',sans-serif;color:blue">[AMD Official Use Only - AMD Internal Distribution Only]<o:p></o:p></span></p>
<p class="MsoNormal"><o:p> </o:p></p>
<div>
<div>
<p class="MsoNormal"><span style="color:black">-----Original Message-----<br>
From: Sun, Ce(Overlord) <<a href="mailto:Ce.Sun@amd.com">Ce.Sun@amd.com</a>><br>
Sent: Tuesday, August 12, 2025 3:35 PM<br>
To: <a href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a><br>
Cc: Zhou1, Tao <<a href="mailto:Tao.Zhou1@amd.com">Tao.Zhou1@amd.com</a>>; Yang, Stanley <<a href="mailto:Stanley.Yang@amd.com">Stanley.Yang@amd.com</a>>; Zhang, Hawking <<a href="mailto:Hawking.Zhang@amd.com">Hawking.Zhang@amd.com</a>>; Wang, Yang(Kevin) <<a href="mailto:KevinYang.Wang@amd.com">KevinYang.Wang@amd.com</a>>;
Chai, Thomas <<a href="mailto:YiPeng.Chai@amd.com">YiPeng.Chai@amd.com</a>>; Sun, Ce(Overlord) <<a href="mailto:Ce.Sun@amd.com">Ce.Sun@amd.com</a>><br>
Subject: [PATCH 3/3] drm/amdgpu: Correct the loss of aca bank reg info<br>
<br>
By polling, poll ACA bank count to ensure that valid ACA bank reg info can be obtained<br>
<br>
Signed-off-by: Ce Sun <<a href="mailto:cesun102@amd.com">cesun102@amd.com</a>><br>
---<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 46 +++++++------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 -- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 7 ----<br>
3 files changed, 13 insertions(+), 42 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c<br>
index f00a9e0c9c47..ad8ad08f0f33 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c<br>
@@ -122,7 +122,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)<br>
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */<br>
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)<br>
<br>
-#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms<br>
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC 50 //ms<br>
<br>
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms<br>
<br>
@@ -3317,8 +3317,6 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)<br>
mutex_init(&ecc_log->lock);<br>
<br>
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);<br>
- ecc_log->de_queried_count = 0;<br>
- ecc_log->prev_de_queried_count = 0;<br>
}<br>
<br>
static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -3337,8 +3335,6 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)<br>
mutex_unlock(&ecc_log->lock);<br>
<br>
mutex_destroy(&ecc_log->lock);<br>
- ecc_log->de_queried_count = 0;<br>
- ecc_log->prev_de_queried_count = 0;<br>
}<br>
<br>
static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, @@ -3386,49 +3382,33 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,<br>
uint32_t poison_creation_count)<br>
{<br>
int ret = 0;<br>
- struct ras_ecc_log_info *ecc_log;<br>
struct ras_query_if info;<br>
- uint32_t timeout = 0;<br>
+ uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;<br>
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);<br>
- uint64_t de_queried_count;<br>
- uint32_t new_detect_count, total_detect_count;<br>
- uint32_t need_query_count = poison_creation_count;<br>
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;<br>
+ uint64_t prev_de_queried_count = 0;<br>
+ uint64_t bank_count = 0;<br>
<br>
memset(&info, 0, sizeof(info));<br>
info.head.block = AMDGPU_RAS_BLOCK__UMC;<br>
<br>
- ecc_log = &ras->umc_ecc_log;<br>
- total_detect_count = 0;<br>
do {<br>
ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);<br>
if (ret)<br>
return ret;<br>
<br>
- de_queried_count = ecc_log->de_queried_count;<br>
- if (de_queried_count > ecc_log->prev_de_queried_count) {<br>
- new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;<br>
- ecc_log->prev_de_queried_count = de_queried_count;<br>
- timeout = 0;<br>
+ bank_count = amdgpu_aca_get_bank_count(adev);<br>
<br>
[Thomas] Does bank_count only use for umc deferred error or include umc ce de and other ras block bank error?<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="color:black;background:yellow">[Ce,Sun]Hi Thomas,thank you for your review.</span><span style="color:black"><o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="color:black;background:yellow">Yes, this includes umc ce, de and other ras block bank errors. If many bank errors got stuck earlier, we will read out all of the ones that got stuck earlier, with bank count=12.</span><span style="color:black"><o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="color:black"> The amdgpu_ras_poison_creation_handler function is used to handle UMC deferred error. not include umc ce and other ras block bank error.<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="color:black;background:yellow">[Ce,Sun] As mentioned earlier, if a lot of umc ce errors are stuck and were not handled earlier, they will be reported through amdgpu_ras_poison_creation_handler. I think how to handle or parse them is done by
the aca parser. Our aim is still to read out all the bank reg info. If umc ce and other ras block bank errors got stuck earlier, I think it should be normal behavior to parse out all the umc ce and other ras block bank errors that were not reported earlier
through the creation/consumption interrupt.</span><span style="color:black"><o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="color:black"><br>
+ if (bank_count) {<br>
+ prev_de_queried_count = bank_count;<br>
+ amdgpu_aca_clear_bank_count(adev);<br>
+ timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;<br>
} else {<br>
- new_detect_count = 0;<br>
- }<br>
-<br>
- if (new_detect_count) {<br>
- total_detect_count += new_detect_count;<br>
- } else {<br>
- if (!timeout && need_query_count)<br>
- timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;<br>
-<br>
- if (timeout) {<br>
- if (!--timeout)<br>
- break;<br>
- msleep(1);<br>
- }<br>
+ --timeout;<br>
+ msleep(1);<br>
}<br>
- } while (total_detect_count < need_query_count);<br>
+ } while (timeout);<br>
<br>
- if (total_detect_count)<br>
+ if (prev_de_queried_count)<br>
schedule_delayed_work(&ras->page_retirement_dwork, 0);<br>
<br>
if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h<br>
index ff63020f9c6c..132b45a362c2 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h<br>
@@ -492,8 +492,6 @@ struct ras_ecc_err { struct ras_ecc_log_info {<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="color:black"><o:p> </o:p></span></p>
</div>
<div class="MsoNormal" align="center" style="text-align:center">
<hr size="2" width="98%" align="center">
</div>
<div id="divRplyFwdMsg">
<p class="MsoNormal"><b><span style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:black">From:</span></b><span style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:black"> Chai, Thomas &lt;<a href="mailto:YiPeng.Chai@amd.com">YiPeng.Chai@amd.com</a>&gt;<br>
<b>Sent:</b> Tuesday, August 12, 2025 4:33 PM<br>
<b>To:</b> Sun, Ce(Overlord) <<a href="mailto:Ce.Sun@amd.com">Ce.Sun@amd.com</a>>;
<a href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a> <<a href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a>><br>
<b>Cc:</b> Zhou1, Tao <<a href="mailto:Tao.Zhou1@amd.com">Tao.Zhou1@amd.com</a>>; Yang, Stanley <<a href="mailto:Stanley.Yang@amd.com">Stanley.Yang@amd.com</a>>; Zhang, Hawking <<a href="mailto:Hawking.Zhang@amd.com">Hawking.Zhang@amd.com</a>>; Wang, Yang(Kevin)
<<a href="mailto:KevinYang.Wang@amd.com">KevinYang.Wang@amd.com</a>><br>
<b>Subject:</b> RE: [PATCH 3/3] drm/amdgpu: Correct the loss of aca bank reg info</span>
<o:p></o:p></p>
<div>
<p class="MsoNormal"> <o:p></o:p></p>
</div>
</div>
<div>
<div>
<p class="MsoNormal" style="margin-bottom:12.0pt"><a name="BM_BEGIN"></a><span style="font-size:11.0pt;font-family:'Times New Roman',serif">[AMD Official Use Only - AMD Internal Distribution Only]<br>
<br>
-----Original Message-----<br>
From: Sun, Ce(Overlord) <<a href="mailto:Ce.Sun@amd.com">Ce.Sun@amd.com</a>><br>
Sent: Tuesday, August 12, 2025 3:35 PM<br>
To: <a href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a><br>
Cc: Zhou1, Tao <<a href="mailto:Tao.Zhou1@amd.com">Tao.Zhou1@amd.com</a>>; Yang, Stanley <<a href="mailto:Stanley.Yang@amd.com">Stanley.Yang@amd.com</a>>; Zhang, Hawking <<a href="mailto:Hawking.Zhang@amd.com">Hawking.Zhang@amd.com</a>>; Wang, Yang(Kevin) <<a href="mailto:KevinYang.Wang@amd.com">KevinYang.Wang@amd.com</a>>;
Chai, Thomas <<a href="mailto:YiPeng.Chai@amd.com">YiPeng.Chai@amd.com</a>>; Sun, Ce(Overlord) <<a href="mailto:Ce.Sun@amd.com">Ce.Sun@amd.com</a>><br>
Subject: [PATCH 3/3] drm/amdgpu: Correct the loss of aca bank reg info<br>
<br>
By polling, poll ACA bank count to ensure that valid ACA bank reg info can be obtained<br>
<br>
Signed-off-by: Ce Sun <<a href="mailto:cesun102@amd.com">cesun102@amd.com</a>><br>
---<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 46 +++++++------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 -- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 7 ----<br>
3 files changed, 13 insertions(+), 42 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c<br>
index f00a9e0c9c47..ad8ad08f0f33 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c<br>
@@ -122,7 +122,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)<br>
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */<br>
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)<br>
<br>
-#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms<br>
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC 50 //ms<br>
<br>
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms<br>
<br>
@@ -3317,8 +3317,6 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)<br>
mutex_init(&ecc_log->lock);<br>
<br>
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);<br>
- ecc_log->de_queried_count = 0;<br>
- ecc_log->prev_de_queried_count = 0;<br>
}<br>
<br>
static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -3337,8 +3335,6 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)<br>
mutex_unlock(&ecc_log->lock);<br>
<br>
mutex_destroy(&ecc_log->lock);<br>
- ecc_log->de_queried_count = 0;<br>
- ecc_log->prev_de_queried_count = 0;<br>
}<br>
<br>
static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, @@ -3386,49 +3382,33 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,<br>
uint32_t poison_creation_count)<br>
{<br>
int ret = 0;<br>
- struct ras_ecc_log_info *ecc_log;<br>
struct ras_query_if info;<br>
- uint32_t timeout = 0;<br>
+ uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;<br>
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);<br>
- uint64_t de_queried_count;<br>
- uint32_t new_detect_count, total_detect_count;<br>
- uint32_t need_query_count = poison_creation_count;<br>
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;<br>
+ uint64_t prev_de_queried_count = 0;<br>
+ uint64_t bank_count = 0;<br>
<br>
memset(&info, 0, sizeof(info));<br>
info.head.block = AMDGPU_RAS_BLOCK__UMC;<br>
<br>
- ecc_log = &ras->umc_ecc_log;<br>
- total_detect_count = 0;<br>
do {<br>
ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);<br>
if (ret)<br>
return ret;<br>
<br>
- de_queried_count = ecc_log->de_queried_count;<br>
- if (de_queried_count > ecc_log->prev_de_queried_count) {<br>
- new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;<br>
- ecc_log->prev_de_queried_count = de_queried_count;<br>
- timeout = 0;<br>
+ bank_count = amdgpu_aca_get_bank_count(adev);<br>
<br>
[Thomas] Does bank_count only use for umc deferred error or include umc ce de and other ras block bank error?<br>
The amdgpu_ras_poison_creation_handler function is used to handle UMC deferred error. not include umc ce and other ras block bank error.<br>
<br>
+ if (bank_count) {<br>
+ prev_de_queried_count = bank_count;<br>
+ amdgpu_aca_clear_bank_count(adev);<br>
+ timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;<br>
} else {<br>
- new_detect_count = 0;<br>
- }<br>
-<br>
- if (new_detect_count) {<br>
- total_detect_count += new_detect_count;<br>
- } else {<br>
- if (!timeout && need_query_count)<br>
- timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;<br>
-<br>
- if (timeout) {<br>
- if (!--timeout)<br>
- break;<br>
- msleep(1);<br>
- }<br>
+ --timeout;<br>
+ msleep(1);<br>
}<br>
- } while (total_detect_count < need_query_count);<br>
+ } while (timeout);<br>
<br>
- if (total_detect_count)<br>
+ if (prev_de_queried_count)<br>
schedule_delayed_work(&ras->page_retirement_dwork, 0);<br>
<br>
if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h<br>
index ff63020f9c6c..132b45a362c2 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h<br>
@@ -492,8 +492,6 @@ struct ras_ecc_err { struct ras_ecc_log_info {<br>
struct mutex lock;<br>
struct radix_tree_root de_page_tree;<br>
- uint64_t de_queried_count;<br>
- uint64_t prev_de_queried_count;<br>
};<br>
<br>
struct ras_critical_region {<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c<br>
index e590cbdd8de9..8dbffe4d22d1 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c<br>
@@ -581,17 +581,10 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,<br>
<br>
ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);<br>
if (ret) {<br>
- if (ret == -EEXIST)<br>
- con->umc_ecc_log.de_queried_count++;<br>
- else<br>
- dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);<br>
-<br>
kfree(ecc_err);<br>
return ret;<br>
}<br>
<br>
- con->umc_ecc_log.de_queried_count++;<br>
-<br>
memset(page_pfn, 0, sizeof(page_pfn));<br>
count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,<br>
pa_addr,<br>
--<br>
2.34.1</span><span style="font-family:'Times New Roman',serif"><o:p></o:p></span></p>
</div>
</div>
</div>
</div>
</div>
</div>
</body>
</html>