[PATCH 08/15] drm/amdgpu: Add delay work to retire bad pages
YiPeng Chai
YiPeng.Chai at amd.com
Thu Apr 18 02:58:29 UTC 2024
Add delay work to retire bad pages.
Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 36 ++++++++++++++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 +++
4 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 702229abe7ee..c1f146d3e28d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms
+#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
+
enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2766,6 +2768,30 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
ecc_log->de_updated = false;
}
+static void amdgpu_ras_do_page_retirement(struct work_struct *work)
+{
+ struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
+ page_retirement_dwork.work);
+ struct amdgpu_device *adev = con->adev;
+ struct ras_err_data err_data;
+
+ if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery))
+ return;
+
+ amdgpu_ras_error_data_init(&err_data);
+
+ amdgpu_umc_handle_bad_pages(adev, &err_data);
+
+ amdgpu_ras_error_data_fini(&err_data);
+
+ mutex_lock(&con->umc_ecc_log.lock);
+ if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
+ UMC_ECC_NEW_DETECTED_TAG))
+ schedule_delayed_work(&con->page_retirement_dwork,
+ msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
+ mutex_unlock(&con->umc_ecc_log.lock);
+}
+
static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
enum amdgpu_ras_block ras_block, uint32_t timeout_ms)
{
@@ -2804,7 +2830,12 @@ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
uint32_t timeout)
{
- amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ int ret;
+
+ ret = amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
+ if (!ret)
+ schedule_delayed_work(&con->page_retirement_dwork, 0);
}
static int amdgpu_ras_page_retirement_thread(void *param)
@@ -2919,6 +2950,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
}
+ INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement);
amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
#ifdef CONFIG_X86_MCE_AMD
#ifdef HAVE_SMCA_UMC_V2
@@ -2967,6 +2999,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
cancel_work_sync(&con->recovery_work);
+ cancel_delayed_work_sync(&con->page_retirement_dwork);
+
amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
mutex_lock(&con->recovery_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 634654cf2634..cb5a0f31d201 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -537,6 +537,7 @@ struct amdgpu_ras {
struct mutex page_rsv_lock;
DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
struct ras_ecc_log_info umc_ecc_log;
+ struct delayed_work page_retirement_dwork;
/* Fatal error detected flag */
atomic_t fed;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 0f2d765c4e2d..2bd88218c20e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -89,7 +89,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
return ret;
}
-static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index c83d24097c5c..2d08d076f7c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -133,4 +133,7 @@ int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
uint64_t *pfns, int len, uint64_t *val);
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
+
+void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+ void *ras_error_status);
#endif
--
2.34.1
More information about the amd-gfx
mailing list