[PATCH] drm/amdgpu: change usage definition of amdgpu_bad_page_threshold

Thu Jun 12 11:19:41 UTC 2025

[AMD Official Use Only - AMD Internal Distribution Only]

There are 3 changes in this patch; for the first and last changes, it would be better to get Hawking's confirmation.

Tao

> -----Original Message-----
> From: Xie, Patrick <Gangliang.Xie at amd.com>
> Sent: Thursday, June 12, 2025 4:20 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>; Xie, Patrick <Gangliang.Xie at amd.com>
> Subject: [PATCH] drm/amdgpu: change usage definition of
> amdgpu_bad_page_threshold
>
> When amdgpu_bad_page_threshold == -1, the driver won't write BADG and RMA.
> When amdgpu_bad_page_threshold == -2, the driver will write BADG and RMA.
>
> Signed-off-by: ganglxie <ganglxie at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  2 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 26 ++++++++-----------
>  2 files changed, 12 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 9dfef13babfe..a1b97d516a27 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3161,7 +3161,7 @@ static void amdgpu_ras_validate_threshold(struct
> amdgpu_device *adev,
>        *      which is intended for debugging purpose.
>        * -2:  Threshold is determined by a formula
>        *      that assumes 1 bad page per 100M of local memory.
> -      *      Driver will continue runtime services when threhold is reached.
> +      *      Driver will halt runtime services when this custom threshold is reached.
>        * 0 < threshold < max number of bad page records in EEPROM,
>        *      A user-defined threshold is set
>        *      Driver will halt runtime services when this custom threshold is reached.
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 2ddedf476542..50a6e975addb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -568,8 +568,7 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct
> amdgpu_device *adev)
>               if (con->eeprom_control.ras_num_bad_pages > con-
> >bad_page_cnt_threshold)
>                       dev_warn(adev->dev, "RAS records:%d exceed
> threshold:%d",
>                                con->eeprom_control.ras_num_bad_pages, con-
> >bad_page_cnt_threshold);
> -             if ((amdgpu_bad_page_threshold == -1) ||
> -                 (amdgpu_bad_page_threshold == -2)) {
> +             if (amdgpu_bad_page_threshold == -1) {
>                       dev_warn(adev->dev,
>                                "Please consult AMD Service Action Guide (SAG) for
> appropriate service procedures.\n");
>                       return false;
> @@ -763,18 +762,16 @@ amdgpu_ras_eeprom_update_header(struct
> amdgpu_ras_eeprom_control *control)
>               dev_warn(adev->dev,
>                       "Saved bad pages %d reaches threshold value %d\n",
>                       control->ras_num_bad_pages, ras-
> >bad_page_cnt_threshold);
> -             control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
> -             if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
> -                     control->tbl_rai.rma_status =
> GPU_RETIRED__ECC_REACH_THRESHOLD;
> -                     control->tbl_rai.health_percent = 0;
> -             }
> -
> -             if ((amdgpu_bad_page_threshold != -1) &&
> -                 (amdgpu_bad_page_threshold != -2))
> +             if (amdgpu_bad_page_threshold != -1) {
> +                     control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
> +                     if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
> +                             control->tbl_rai.rma_status =
> GPU_RETIRED__ECC_REACH_THRESHOLD;
> +                             control->tbl_rai.health_percent = 0;
> +                     }
>                       ras->is_rma = true;
> -
> -             /* ignore the -ENOTSUPP return value */
> -             amdgpu_dpm_send_rma_reason(adev);
> +                     /* ignore the -ENOTSUPP return value */
> +                     amdgpu_dpm_send_rma_reason(adev);
> +             }
>       }
>
>       if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) @@ -1508,8
> +1505,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control
> *control)
>                       dev_warn(adev->dev,
>                               "RAS records:%d exceed threshold:%d\n",
>                               control->ras_num_bad_pages, ras-
> >bad_page_cnt_threshold);
> -                     if ((amdgpu_bad_page_threshold == -1) ||
> -                         (amdgpu_bad_page_threshold == -2)) {
> +                     if ((amdgpu_bad_page_threshold == -1)) {
>                               res = 0;
>                               dev_warn(adev->dev,
>                                        "Please consult AMD Service Action Guide
> (SAG) for appropriate service procedures\n");
> --
> 2.34.1