[PATCH] drm/amdgpu: refine ras error injection when eeprom initialization failed
Zhou1, Tao
Tao.Zhou1 at amd.com
Fri Jun 27 10:23:23 UTC 2025
[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: Xie, Patrick <Gangliang.Xie at amd.com>
> Sent: Friday, June 27, 2025 5:37 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Xie, Patrick <Gangliang.Xie at amd.com>
> Subject: [PATCH] drm/amdgpu: refine ras error injection when eeprom initialization
> failed
>
> When EEPROM initialization fails, still support RAS error injection and reserve bad
> pages, but do not save the bad pages to the EEPROM.
>
> Signed-off-by: ganglxie <ganglxie at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 ++++++++++++++-----
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 2 ++
> 2 files changed, 18 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 514b56e5d8ba..d24567787f9e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3009,6 +3009,15 @@ int amdgpu_ras_save_bad_pages(struct
> amdgpu_device *adev,
> return 0;
> }
>
> + if (!con->eeprom_control.is_eeprom_valid) {
> + dev_err(adev->dev,
[Tao] Since we return 0 here, it's better to use dev_warn. Other than this, the patch is:
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> + "Failed to save EEPROM table data because of EEPROM
> data corruption!");
> + if (new_cnt)
> + *new_cnt = 0;
> +
> + return 0;
> + }
> +
> mutex_lock(&con->recovery_lock);
> control = &con->eeprom_control;
> data = con->eh_data;
> @@ -3502,8 +3511,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device
> *adev)
>
> control = &con->eeprom_control;
> ret = amdgpu_ras_eeprom_init(control);
> - if (ret)
> - return ret;
> + control->is_eeprom_valid = !ret;
>
> if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
> control->ras_num_pa_recs = control->ras_num_recs;
> @@ -3512,10 +3520,12 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
> adev->umc.ras->get_retire_flip_bits)
> adev->umc.ras->get_retire_flip_bits(adev);
>
> - if (control->ras_num_recs) {
> + if (control->ras_num_recs && control->is_eeprom_valid) {
> ret = amdgpu_ras_load_bad_pages(adev);
> - if (ret)
> - return ret;
> + if (ret) {
> + control->is_eeprom_valid = false;
> + return 0;
> + }
>
> amdgpu_dpm_send_hbm_bad_pages_num(
> adev, control->ras_num_bad_pages);
> @@ -3534,7 +3544,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device
> *adev)
> dev_warn(adev->dev, "Failed to format RAS
> EEPROM data in V3 version!\n");
> }
>
> - return ret;
> + return 0;
> }
>
> int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index ec6d7ea37ad0..35c69ac3dbeb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -114,6 +114,8 @@ struct amdgpu_ras_eeprom_control {
> /* Record channel info which occurred bad pages
> */
> u32 bad_channel_bitmap;
> +
> + bool is_eeprom_valid;
> };
>
> /*
> --
> 2.34.1
More information about the amd-gfx mailing list