[PATCH] drm/amdgpu: refine ras error injection when eeprom initialization failed

Zhou1, Tao Tao.Zhou1 at amd.com
Fri Jun 27 10:23:23 UTC 2025


[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Xie, Patrick <Gangliang.Xie at amd.com>
> Sent: Friday, June 27, 2025 5:37 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Xie, Patrick <Gangliang.Xie at amd.com>
> Subject: [PATCH] drm/amdgpu: refine ras error injection when eeprom initialization
> failed
>
> When EEPROM initialization fails, we still support RAS error injection and reserve bad
> pages, but do not save the bad pages to the EEPROM.
>
> Signed-off-by: ganglxie <ganglxie at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 22 ++++++++++++++-----
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  2 ++
>  2 files changed, 18 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 514b56e5d8ba..d24567787f9e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3009,6 +3009,15 @@ int amdgpu_ras_save_bad_pages(struct
> amdgpu_device *adev,
>               return 0;
>       }
>
> +     if (!con->eeprom_control.is_eeprom_valid) {
> +             dev_err(adev->dev,

[Tao] Since we return 0 here, it's better to use dev_warn. Other than this, the patch is:

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> +                     "Failed to save EEPROM table data because of EEPROM
> data corruption!");
> +             if (new_cnt)
> +                     *new_cnt = 0;
> +
> +             return 0;
> +     }
> +
>       mutex_lock(&con->recovery_lock);
>       control = &con->eeprom_control;
>       data = con->eh_data;
> @@ -3502,8 +3511,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device
> *adev)
>
>       control = &con->eeprom_control;
>       ret = amdgpu_ras_eeprom_init(control);
> -     if (ret)
> -             return ret;
> +     control->is_eeprom_valid = !ret;
>
>       if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
>               control->ras_num_pa_recs = control->ras_num_recs;
> @@ -3512,10 +3520,12 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
>           adev->umc.ras->get_retire_flip_bits)
>               adev->umc.ras->get_retire_flip_bits(adev);
>
> -     if (control->ras_num_recs) {
> +     if (control->ras_num_recs && control->is_eeprom_valid) {
>               ret = amdgpu_ras_load_bad_pages(adev);
> -             if (ret)
> -                     return ret;
> +             if (ret) {
> +                     control->is_eeprom_valid = false;
> +                     return 0;
> +             }
>
>               amdgpu_dpm_send_hbm_bad_pages_num(
>                       adev, control->ras_num_bad_pages);
> @@ -3534,7 +3544,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device
> *adev)
>                                       dev_warn(adev->dev, "Failed to format RAS
> EEPROM data in V3 version!\n");
>       }
>
> -     return ret;
> +     return 0;
>  }
>
>  int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index ec6d7ea37ad0..35c69ac3dbeb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -114,6 +114,8 @@ struct amdgpu_ras_eeprom_control {
>       /* Record channel info which occurred bad pages
>        */
>       u32 bad_channel_bitmap;
> +
> +     bool is_eeprom_valid;
>  };
>
>  /*
> --
> 2.34.1



More information about the amd-gfx mailing list