[PATCH 2/2] drm/amdgpu: add bad_page_threshold check in ras_eeprom_check_err

Tao Zhou tao.zhou1 at amd.com
Tue Feb 21 08:29:23 UTC 2023


bad_page_threshold controls page retirement behavior and it should be
also checked.

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 9d370465b08d..c88123896fe8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -417,7 +417,8 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-	if (!__is_ras_eeprom_supported(adev))
+	if (!__is_ras_eeprom_supported(adev) ||
+	    !amdgpu_bad_page_threshold)
 		return false;
 
 	/* skip check eeprom table for VEGA20 Gaming */
@@ -428,10 +429,19 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
 			return false;
 
 	if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
-		dev_warn(adev->dev, "This GPU is in BAD status.");
-		dev_warn(adev->dev, "Please retire it or set a larger "
-			 "threshold value when reloading driver.\n");
-		return true;
+		if (amdgpu_bad_page_threshold == -1) {
+			dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
+				con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
+			dev_warn(adev->dev,
+				"But GPU can be operated due to bad_page_threshold = -1.\n");
+			return false;
+		} else if (amdgpu_bad_page_threshold > 0 ||
+		    amdgpu_bad_page_threshold == -2) {
+			dev_warn(adev->dev, "This GPU is in BAD status.");
+			dev_warn(adev->dev, "Please retire it or set a larger "
+				 "threshold value when reloading driver.\n");
+			return true;
+		}
 	}
 
 	return false;
-- 
2.35.1



More information about the amd-gfx mailing list