[PATCH v2] drm/amdgpu: Update usage for bad page threshold
Zhou1, Tao
Tao.Zhou1 at amd.com
Wed Jan 22 11:40:30 UTC 2025
[AMD Official Use Only - AMD Internal Distribution Only]
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Wednesday, January 22, 2025 7:38 PM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH v2] drm/amdgpu: Update usage for bad page threshold
>
> The driver's behavior varies based on
> the configuration of the amdgpu_bad_page_threshold setting.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 54 ++++++++++---------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +-
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 41 +++++++-------
> 4 files changed, 53 insertions(+), 46 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 99d884e6763a..87ea2e2a062f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -983,7 +983,7 @@ module_param_named(reset_method,
> amdgpu_reset_method, int, 0644);
> * result in the GPU entering bad status when the number of total
> * faulty pages by ECC exceeds the threshold value.
> */
> -MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore
> threshold (default value), 0 = disable bad page retirement, -2 = driver sets
> threshold)");
> +MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore
> +threshold (default value), 0 = disable bad page retirement, -2 =
> +threshold determined by a formula, 0 < threshold < max records,
> +user-defined threshold)");
> module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int,
> 0444);
>
> MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to
> setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)"); diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 960476e6124b..5676ffe5c43a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3071,35 +3071,35 @@ static void amdgpu_ras_validate_threshold(struct
> amdgpu_device *adev,
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>
> /*
> - * Justification of value bad_page_cnt_threshold in ras structure
> + * amdgpu_bad_page_threshold is used to configure
> + * the threshold for the number of bad pages.
> + * -1: Threshold is set to default value
> + * Driver will issue a warning message when threshold is reached
> + * and continue runtime services.
> + * 0: Disable bad page retirement
> + * Driver will not retire bad pages
> + * which is intended for debugging purpose.
> + * -2: Threshold is determined by a formula
> + * that assumes 1 bad page per 100M of local memory.
> + * Driver will continue runtime services when threshold is reached.
> + * 0 < threshold < max number of bad page records in EEPROM,
> + * A user-defined threshold is set
> + * Driver will halt runtime services when this custom threshold is
> reached.
> *
> - * Generally, 0 <= amdgpu_bad_page_threshold <= max record length
> - * in eeprom or amdgpu_bad_page_threshold == -2, introduce two
> - * scenarios accordingly.
> - *
> - * Bad page retirement enablement:
> - * - If amdgpu_bad_page_threshold = -2,
> - * bad_page_cnt_threshold = typical value by formula.
> - *
> - * - When the value from user is 0 < amdgpu_bad_page_threshold <
> - * max record length in eeprom, use it directly.
> - *
> - * Bad page retirement disablement:
> - * - If amdgpu_bad_page_threshold = 0, bad page retirement
> - * functionality is disabled, and bad_page_cnt_threshold will
> - * take no effect.
> */
> + if (amdgpu_bad_page_threshold == -2) {
> + u64 val = adev->gmc.mc_vram_size;
>
> - if (amdgpu_bad_page_threshold < 0) {
> - u64 val = adev->gmc.mc_vram_size;
> + do_div(val, RAS_BAD_PAGE_COVER);
> + con->bad_page_cnt_threshold = min(lower_32_bits(val),
> + max_count);
> + } else if (amdgpu_bad_page_threshold == -1) {
> + con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >>
> 21) << 4;
> + } else {
> + con->bad_page_cnt_threshold = min_t(int, max_count,
> + amdgpu_bad_page_threshold);
> + }
>
> - do_div(val, RAS_BAD_PAGE_COVER);
> - con->bad_page_cnt_threshold = min(lower_32_bits(val),
> - max_count);
> - } else {
> - con->bad_page_cnt_threshold = min_t(int, max_count,
> - amdgpu_bad_page_threshold);
> - }
> }
>
> #ifdef HAVE_KFIFO_PUT_NON_POINTER
> @@ -3852,8 +3852,10 @@ static void amdgpu_ras_init_reserved_vram_size(struct
> amdgpu_device *adev)
> case IP_VERSION(13, 0, 2):
> case IP_VERSION(13, 0, 6):
> case IP_VERSION(13, 0, 12):
> + con->reserved_pages_in_bytes =
> AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT;
> + break;
> case IP_VERSION(13, 0, 14):
> - con->reserved_pages_in_bytes =
> AMDGPU_RAS_RESERVED_VRAM_SIZE;
> + con->reserved_pages_in_bytes =
> (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT
> +<< 1);
> break;
> default:
> break;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 82db986c36a0..cc4586581dba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -65,7 +65,7 @@ struct amdgpu_iv_entry;
>
> /* Reserve 8 physical dram row for possible retirement.
> * In worst cases, it will lose 8 * 2MB memory in vram domain */
> -#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20)
> +#define AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT (16ULL << 20)
> /* The high three bits indicates socketid */ #define
> AMDGPU_RAS_GET_FEATURES(val) ((val) &
> ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 0d824f016916..0e3e7127c11e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -558,16 +558,17 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct
> amdgpu_device *adev)
> return false;
>
> if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
> - if (amdgpu_bad_page_threshold == -1) {
> + if (con->eeprom_control.ras_num_bad_pages >
> +con->bad_page_cnt_threshold)
> dev_warn(adev->dev, "RAS records:%d exceed
> threshold:%d",
> - con->eeprom_control.ras_num_bad_pages, con-
> >bad_page_cnt_threshold);
> + con->eeprom_control.ras_num_bad_pages, con-
> >bad_page_cnt_threshold);
> + if ((amdgpu_bad_page_threshold == -1) ||
> + (amdgpu_bad_page_threshold == -2)) {
> dev_warn(adev->dev,
> - "But GPU can be operated due to bad_page_threshold
> = -1.\n");
> + "Please consult AMD Service Action Guide (SAG) for
> appropriate
> +service procedures.\n");
> return false;
> } else {
> - dev_warn(adev->dev, "This GPU is in BAD status.");
> - dev_warn(adev->dev, "Please retire it or set a larger "
> - "threshold value when reloading driver.\n");
> + dev_warn(adev->dev,
> + "Please consider adjusting the customized
> threshold.\n");
> return true;
> }
> }
> @@ -758,7 +759,8 @@ amdgpu_ras_eeprom_update_header(struct
> amdgpu_ras_eeprom_control *control)
> control->tbl_rai.health_percent = 0;
> }
>
> - if (amdgpu_bad_page_threshold != -1)
> + if ((amdgpu_bad_page_threshold != -1) &&
> + (amdgpu_bad_page_threshold != -2))
> ras->is_rma = true;
>
> /* ignore the -ENOTSUPP return value */ @@ -1428,8 +1430,9 @@
> int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
>
> res = __verify_ras_table_checksum(control);
> if (res)
> - DRM_ERROR("RAS table incorrect checksum or error:%d\n",
> - res);
> + dev_err(adev->dev,
> + "RAS table incorrect checksum or error:%d\n",
> + res);
>
> /* Warn if we are at 90% of the threshold or above
> */
> @@ -1447,8 +1450,9 @@ int amdgpu_ras_eeprom_check(struct
> amdgpu_ras_eeprom_control *control)
>
> res = __verify_ras_table_checksum(control);
> if (res) {
> - dev_err(adev->dev, "RAS Table incorrect checksum or
> error:%d\n",
> - res);
> + dev_err(adev->dev,
> + "RAS Table incorrect checksum or error:%d\n",
> + res);
> return -EINVAL;
> }
> if (ras->bad_page_cnt_threshold > control->ras_num_bad_pages)
> { @@ -1466,17 +1470,18 @@ int amdgpu_ras_eeprom_check(struct
> amdgpu_ras_eeprom_control *control)
> res = amdgpu_ras_eeprom_correct_header_tag(control,
>
> RAS_TABLE_HDR_VAL);
> } else {
> - dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
> + dev_warn(adev->dev,
> + "RAS records:%d exceed threshold:%d\n",
> control->ras_num_bad_pages, ras-
> >bad_page_cnt_threshold);
> - if (amdgpu_bad_page_threshold == -1) {
> - dev_warn(adev->dev, "GPU will be initialized due to
> bad_page_threshold = -1.");
> + if ((amdgpu_bad_page_threshold == -1) ||
> + (amdgpu_bad_page_threshold == -2)) {
> res = 0;
> + dev_warn(adev->dev,
> + "Please consult AMD Service Action Guide
> (SAG) for appropriate
> +service procedures\n");
> } else {
> ras->is_rma = true;
> - dev_err(adev->dev,
> - "RAS records:%d exceed threshold:%d, "
> - "GPU will not be initialized. Replace this GPU
> or increase the threshold",
> - control->ras_num_bad_pages, ras-
> >bad_page_cnt_threshold);
> + dev_warn(adev->dev,
> + "User defined threshold is set, runtime service
> will be halt when
> +threshold is reached\n");
> }
> }
> } else {
> --
> 2.17.1
More information about the amd-gfx
mailing list