[PATCH V2 1/4] drm/amdgpu: add variable to record the deferred error number read by driver
Chai, Thomas
YiPeng.Chai at amd.com
Fri Jun 21 08:52:33 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
prevd_queried_count and de_queried_count are used to accurately count the number of DE lost after driver receives a large number of poison creation interrupts.
Since amdgpu_ras_query_error_status can be called by page_retirment_thread, xxx_err_count sysfs and gpu recovery,
using local variable to save the old de_queried_count before calling amdgpu_ras_query_error_status in page_retirment_thread will be inaccurate.
-----------------
Best Regards,
Thomas
-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang at amd.com>
Sent: Friday, June 21, 2024 2:37 PM
To: Chai, Thomas <YiPeng.Chai at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Li, Candice <Candice.Li at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>
Subject: RE: [PATCH V2 1/4] drm/amdgpu: add variable to record the deferred error number read by driver
[AMD Official Use Only - AMD Internal Distribution Only]
Shall we make pre_de_queried_count to be local variable? Others look good to me
Regards,
Hawking
-----Original Message-----
From: Chai, Thomas <YiPeng.Chai at amd.com>
Sent: Thursday, June 20, 2024 13:40
To: amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Li, Candice <Candice.Li at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
Subject: [PATCH V2 1/4] drm/amdgpu: add variable to record the deferred error number read by driver
Add variable to record the deferred error number read by driver.
Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 62 ++++++++++++++++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 +- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 4 +-
3 files changed, 48 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 86cb97d2155b..f674e34037b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -120,7 +120,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
-#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
@@ -2804,7 +2804,8 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key));
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
- ecc_log->de_updated = false;
+ ecc_log->de_queried_count = 0;
+ ecc_log->prev_de_queried_count = 0;
}
static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -2823,7 +2824,8 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
mutex_unlock(&ecc_log->lock);
mutex_destroy(&ecc_log->lock);
- ecc_log->de_updated = false;
+ ecc_log->de_queried_count = 0;
+ ecc_log->prev_de_queried_count = 0;
}
#endif
@@ -2856,40 +2858,64 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
mutex_unlock(&con->umc_ecc_log.lock);
}
-static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
- uint32_t timeout_ms)
+static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+ uint32_t poison_creation_count)
{
int ret = 0;
struct ras_ecc_log_info *ecc_log;
struct ras_query_if info;
- uint32_t timeout = timeout_ms;
+ uint32_t timeout = 0;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ uint64_t de_queried_count;
+ uint32_t new_detect_count, total_detect_count;
+ uint32_t need_query_count = poison_creation_count;
+ bool query_data_timeout = false;
memset(&info, 0, sizeof(info));
info.head.block = AMDGPU_RAS_BLOCK__UMC;
ecc_log = &ras->umc_ecc_log;
- ecc_log->de_updated = false;
+ total_detect_count = 0;
do {
ret = amdgpu_ras_query_error_status(adev, &info);
- if (ret) {
- dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret);
- return;
+ if (ret)
+ return ret;
+
+ de_queried_count = ecc_log->de_queried_count;
+ if (de_queried_count > ecc_log->prev_de_queried_count) {
+ new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
+ ecc_log->prev_de_queried_count = de_queried_count;
+ timeout = 0;
+ } else {
+ new_detect_count = 0;
}
- if (timeout && !ecc_log->de_updated) {
- msleep(1);
- timeout--;
+ if (new_detect_count) {
+ total_detect_count += new_detect_count;
+ } else {
+ if (!timeout && need_query_count)
+ timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
+
+ if (timeout) {
+ if (!--timeout) {
+ query_data_timeout = true;
+ break;
+ }
+ msleep(1);
+ }
}
- } while (timeout && !ecc_log->de_updated);
+ } while (total_detect_count < need_query_count);
- if (timeout_ms && !timeout) {
- dev_warn(adev->dev, "Can't find deferred error\n");
- return;
+ if (query_data_timeout) {
+ dev_warn(adev->dev, "Can't find deferred error! count: %u\n",
+ (need_query_count - total_detect_count));
+ return -ENOENT;
}
- if (!ret)
+ if (total_detect_count)
schedule_delayed_work(&ras->page_retirement_dwork, 0);
+
+ return 0;
}
static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 83437fef9df5..748bbac666e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -469,7 +469,8 @@ struct ras_ecc_log_info {
struct mutex lock;
siphash_key_t ecc_key;
struct radix_tree_root de_page_tree;
- bool de_updated;
+ uint64_t de_queried_count;
+ uint64_t prev_de_queried_count;
};
struct amdgpu_ras {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6d6350f220b0..0faa21d8a7b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -557,7 +557,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
if (ret) {
if (ret == -EEXIST)
- con->umc_ecc_log.de_updated = true;
+ con->umc_ecc_log.de_queried_count++;
else
dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
@@ -566,7 +566,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
return ret;
}
- con->umc_ecc_log.de_updated = true;
+ con->umc_ecc_log.de_queried_count++;
return 0;
}
--
2.34.1
More information about the amd-gfx
mailing list