[PATCH] drm/amdgpu: Update usage for bad page threshold
Hawking Zhang
Hawking.Zhang at amd.com
Wed Jan 22 11:11:56 UTC 2025
The driver's behavior varies based on
the configuration of amdgpu_bad_page_threshold setting
Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 54 ++++++++++---------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +-
.../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 28 +++++-----
4 files changed, 46 insertions(+), 40 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 99d884e6763a..87ea2e2a062f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -983,7 +983,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0644);
* result in the GPU entering bad status when the number of total
* faulty pages by ECC exceeds the threshold value.
*/
-MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore threshold (default value), 0 = disable bad page retirement, -2 = driver sets threshold)");
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore threshold (default value), 0 = disable bad page retirement, -2 = threshold determined by a formula, 0 < threshold < max records, user-defined threshold)");
module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 960476e6124b..5676ffe5c43a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3071,35 +3071,35 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
/*
- * Justification of value bad_page_cnt_threshold in ras structure
+ * amdgpu_bad_page_threshold is used to config
+ * the threshold for the number of bad pages.
+ * -1: Threshold is set to default value
+ * Driver will issue a warning message when threshold is reached
+ * and continue runtime services.
+ * 0: Disable bad page retirement
+ * Driver will not retire bad pages
+ * which is intended for debugging purpose.
+ * -2: Threshold is determined by a formula
+ * that assumes 1 bad page per 100M of local memory.
+ * Driver will continue runtime services when threhold is reached.
+ * 0 < threshold < max number of bad page records in EEPROM,
+ * A user-defined threshold is set
+ * Driver will halt runtime services when this custom threshold is reached.
*
- * Generally, 0 <= amdgpu_bad_page_threshold <= max record length
- * in eeprom or amdgpu_bad_page_threshold == -2, introduce two
- * scenarios accordingly.
- *
- * Bad page retirement enablement:
- * - If amdgpu_bad_page_threshold = -2,
- * bad_page_cnt_threshold = typical value by formula.
- *
- * - When the value from user is 0 < amdgpu_bad_page_threshold <
- * max record length in eeprom, use it directly.
- *
- * Bad page retirement disablement:
- * - If amdgpu_bad_page_threshold = 0, bad page retirement
- * functionality is disabled, and bad_page_cnt_threshold will
- * take no effect.
*/
+ if (amdgpu_bad_page_threshold == -2) {
+ u64 val = adev->gmc.mc_vram_size;
- if (amdgpu_bad_page_threshold < 0) {
- u64 val = adev->gmc.mc_vram_size;
+ do_div(val, RAS_BAD_PAGE_COVER);
+ con->bad_page_cnt_threshold = min(lower_32_bits(val),
+ max_count);
+ } else if (amdgpu_bad_page_threshold == -1) {
+ con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4;
+ } else {
+ con->bad_page_cnt_threshold = min_t(int, max_count,
+ amdgpu_bad_page_threshold);
+ }
- do_div(val, RAS_BAD_PAGE_COVER);
- con->bad_page_cnt_threshold = min(lower_32_bits(val),
- max_count);
- } else {
- con->bad_page_cnt_threshold = min_t(int, max_count,
- amdgpu_bad_page_threshold);
- }
}
#ifdef HAVE_KFIFO_PUT_NON_POINTER
@@ -3852,8 +3852,10 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 2):
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 12):
+ con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT;
+ break;
case IP_VERSION(13, 0, 14):
- con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
+ con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1);
break;
default:
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 82db986c36a0..cc4586581dba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -65,7 +65,7 @@ struct amdgpu_iv_entry;
/* Reserve 8 physical dram row for possible retirement.
* In worst cases, it will lose 8 * 2MB memory in vram domain */
-#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20)
+#define AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT (16ULL << 20)
/* The high three bits indicates socketid */
#define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 0d824f016916..c05ef519deb7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -758,7 +758,8 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
control->tbl_rai.health_percent = 0;
}
- if (amdgpu_bad_page_threshold != -1)
+ if ((amdgpu_bad_page_threshold != -1) &&
+ (amdgpu_bad_page_threshold != -2))
ras->is_rma = true;
/* ignore the -ENOTSUPP return value */
@@ -1428,8 +1429,9 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
res = __verify_ras_table_checksum(control);
if (res)
- DRM_ERROR("RAS table incorrect checksum or error:%d\n",
- res);
+ dev_err(adev->dev,
+ "RAS table incorrect checksum or error:%d\n",
+ res);
/* Warn if we are at 90% of the threshold or above
*/
@@ -1447,8 +1449,9 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
res = __verify_ras_table_checksum(control);
if (res) {
- dev_err(adev->dev, "RAS Table incorrect checksum or error:%d\n",
- res);
+ dev_err(adev->dev,
+ "RAS Table incorrect checksum or error:%d\n",
+ res);
return -EINVAL;
}
if (ras->bad_page_cnt_threshold > control->ras_num_bad_pages) {
@@ -1466,17 +1469,18 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
res = amdgpu_ras_eeprom_correct_header_tag(control,
RAS_TABLE_HDR_VAL);
} else {
- dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
+ dev_warn(adev->dev,
+ "RAS records:%d exceed threshold:%d\n",
control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
- if (amdgpu_bad_page_threshold == -1) {
- dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
+ if ((amdgpu_bad_page_threshold == -1) ||
+ (amdgpu_bad_page_threshold == -2)) {
res = 0;
+ dev_warn(adev->dev,
+ "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
} else {
ras->is_rma = true;
- dev_err(adev->dev,
- "RAS records:%d exceed threshold:%d, "
- "GPU will not be initialized. Replace this GPU or increase the threshold",
- control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
+ dev_warn(adev->dev,
+ "User defined threshold is set, runtime service will be halt when threshold is reached\n");
}
}
} else {
--
2.17.1
More information about the amd-gfx
mailing list