[PATCH 2/2] drm/amdgpu: Do bad page retirement for deferred errors

Candice Li candice.li at amd.com
Wed Jan 10 08:38:56 UTC 2024


Needs to do bad page retirement for deferred errors.

Signed-off-by: Candice Li <candice.li at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 848df7acdd3210..df61df7e9b155f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -93,6 +93,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	int ret = 0;
+	unsigned long err_count;
 
 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
@@ -147,16 +148,17 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 	}
 
 	/* only uncorrectable error needs gpu reset */
-	if (err_data->ue_count) {
-		dev_info(adev->dev, "%ld uncorrectable hardware errors "
-				"detected in UMC block\n",
-				err_data->ue_count);
+	if (err_data->ue_count || err_data->de_count) {
+		dev_info(adev->dev, "%ld uncorrectable hardware errors and "
+				"%ld deferred hardware errors detected in UMC block\n",
+				err_data->ue_count, err_data->de_count);
 
+		err_count = err_data->ue_count + err_data->de_count;
 		if ((amdgpu_bad_page_threshold != 0) &&
 			err_data->err_addr_cnt) {
 			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
 						err_data->err_addr_cnt);
-			amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count));
+			amdgpu_ras_save_bad_pages(adev, &err_count);
 
 			amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
 
-- 
2.25.1



More information about the amd-gfx mailing list