[PATCH] drm/amdgpu: change usage definition of amdgpu_bad_page_threshold

ganglxie ganglxie at amd.com
Thu Jun 12 08:19:43 UTC 2025


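When amdgpu_bad_page_threshold == -1, the driver won't write the BADG
table header or the RMA status once the bad page threshold is reached.
When amdgpu_bad_page_threshold == -2, the driver will write the BADG
table header and the RMA status, the same as for a user-defined threshold.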

Signed-off-by: ganglxie <ganglxie at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  2 +-
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 26 ++++++++-----------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9dfef13babfe..a1b97d516a27 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3161,7 +3161,7 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
 	 *      which is intended for debugging purpose.
 	 * -2:  Threshold is determined by a formula
 	 *      that assumes 1 bad page per 100M of local memory.
-	 *      Driver will continue runtime services when threhold is reached.
+	 *      Driver will halt runtime services when this threshold is reached.
 	 * 0 < threshold < max number of bad page records in EEPROM,
 	 *      A user-defined threshold is set
 	 *      Driver will halt runtime services when this custom threshold is reached.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 2ddedf476542..50a6e975addb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -568,8 +568,7 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
 		if (con->eeprom_control.ras_num_bad_pages > con->bad_page_cnt_threshold)
 			dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
 				 con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
-		if ((amdgpu_bad_page_threshold == -1) ||
-		    (amdgpu_bad_page_threshold == -2)) {
+		if (amdgpu_bad_page_threshold == -1) {
 			dev_warn(adev->dev,
 				 "Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n");
 			return false;
@@ -763,18 +762,16 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
 		dev_warn(adev->dev,
 			"Saved bad pages %d reaches threshold value %d\n",
 			control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
-		control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
-		if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
-			control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
-			control->tbl_rai.health_percent = 0;
-		}
-
-		if ((amdgpu_bad_page_threshold != -1) &&
-		    (amdgpu_bad_page_threshold != -2))
+		if (amdgpu_bad_page_threshold != -1) {
+			control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
+			if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
+				control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
+				control->tbl_rai.health_percent = 0;
+			}
 			ras->is_rma = true;
-
-		/* ignore the -ENOTSUPP return value */
-		amdgpu_dpm_send_rma_reason(adev);
+			/* ignore the -ENOTSUPP return value */
+			amdgpu_dpm_send_rma_reason(adev);
+		}
 	}
 
 	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
@@ -1508,8 +1505,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
 			dev_warn(adev->dev,
 				"RAS records:%d exceed threshold:%d\n",
 				control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
-			if ((amdgpu_bad_page_threshold == -1) ||
-			    (amdgpu_bad_page_threshold == -2)) {
+			if ((amdgpu_bad_page_threshold == -1)) {
 				res = 0;
 				dev_warn(adev->dev,
 					 "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
-- 
2.34.1



More information about the amd-gfx mailing list