[PATCH 2/2] drm/amdgpu: add bad_page_threshold check in ras_eeprom_check_err
Tao Zhou
tao.zhou1 at amd.com
Tue Feb 21 08:29:23 UTC 2023
bad_page_threshold controls page retirement behavior and it should be
also checked.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
.../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 20 ++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 9d370465b08d..c88123896fe8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -417,7 +417,8 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- if (!__is_ras_eeprom_supported(adev))
+ if (!__is_ras_eeprom_supported(adev) ||
+ !amdgpu_bad_page_threshold)
return false;
/* skip check eeprom table for VEGA20 Gaming */
@@ -428,10 +429,19 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
return false;
if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
- dev_warn(adev->dev, "This GPU is in BAD status.");
- dev_warn(adev->dev, "Please retire it or set a larger "
- "threshold value when reloading driver.\n");
- return true;
+ if (amdgpu_bad_page_threshold == -1) {
+ dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
+ con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
+ dev_warn(adev->dev,
+ "But GPU can be operated due to bad_page_threshold = -1.\n");
+ return false;
+ } else if (amdgpu_bad_page_threshold > 0 ||
+ amdgpu_bad_page_threshold == -2) {
+ dev_warn(adev->dev, "This GPU is in BAD status.");
+ dev_warn(adev->dev, "Please retire it or set a larger "
+ "threshold value when reloading driver.\n");
+ return true;
+ }
}
return false;
--
2.35.1
More information about the amd-gfx
mailing list