[PATCH 2/2] drm/amdgpu: timely save bad pages to eeprom after gpu ras reset is complete

Wed Jul 3 08:41:27 UTC 2024

The problem case is as follows:
1. GPU A triggers a gpu ras reset, and GPU A drives
   GPU B to also perform a gpu ras reset.
2. After gpu B ras reset started, gpu B queried a DE
   data. Since the DE data was queried in the ras reset
   thread instead of the page retirement thread, bad
   page retirement work would not be triggered. Then
   even if all gpu resets are completed, the bad pages
   will be cached in RAM until GPU B's bad page retirement
   work is triggered again and then saved to eeprom.

This patch can save the bad pages to eeprom in time after gpu
ras reset is complete.

Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +++++++++++++-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  |  6 ++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1b6f5b26957b..b6e047a354a2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2844,8 +2844,20 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
 	struct ras_err_data err_data;
 	unsigned long err_cnt;
 
-	if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev))
+	if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
+		int ret;
+
+		mutex_lock(&con->umc_ecc_log.lock);
+		ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
+					UMC_ECC_NEW_DETECTED_TAG);
+		mutex_unlock(&con->umc_ecc_log.lock);
+
+		/* If gpu reset is not completed, schedule delayed work again */
+		if (ret)
+			schedule_delayed_work(&con->page_retirement_dwork,
+				msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL * 3));
 		return;
+	}
 
 	amdgpu_ras_error_data_init(&err_data);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 0faa21d8a7b4..7bdba5532adb 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -29,6 +29,7 @@
 #include "mp/mp_13_0_6_sh_mask.h"
 
 #define MAX_ECC_NUM_PER_RETIREMENT  32
+#define DELAYED_TIME_FOR_GPU_RESET  1000  //ms
 
 static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
 					    uint32_t node_inst,
@@ -568,6 +569,11 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
 
 	con->umc_ecc_log.de_queried_count++;
 
+	/* Try to retire the bad pages detected after gpu ras reset started */
+	if (amdgpu_ras_in_recovery(adev))
+		schedule_delayed_work(&con->page_retirement_dwork,
+			msecs_to_jiffies(DELAYED_TIME_FOR_GPU_RESET));
+
 	return 0;
 }
 
-- 
2.34.1