[PATCH 1/2] drm/amdgpu: add RAS is_rma flag
Yang, Stanley
Stanley.Yang at amd.com
Thu May 23 13:56:35 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao Zhou
> Sent: Thursday, May 23, 2024 6:02 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 1/2] drm/amdgpu: add RAS is_rma flag
>
> Set the flag to true if the number of bad pages reaches the threshold.
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++----
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++++++----
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 +--
> 4 files changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index ecce022c657b..934dfb2bf9e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2940,7 +2940,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev)
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> struct ras_err_handler_data **data;
> u32 max_eeprom_records_count = 0;
> - bool exc_err_limit = false;
> int ret;
>
> if (!con || amdgpu_sriov_vf(adev))
> @@ -2977,12 +2976,12 @@ int amdgpu_ras_recovery_init(struct
> amdgpu_device *adev)
> */
> if (adev->gmc.xgmi.pending_reset)
> return 0;
> - ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
> + ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
> /*
> * This calling fails when exc_err_limit is true or
> * ret != 0.
> */
> - if (exc_err_limit || ret)
> + if (con->is_rma || ret)
> goto free;
>
> if (con->eeprom_control.ras_num_recs) { @@ -3033,7 +3032,7 @@ int
> amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> * Except error threshold exceeding case, other failure cases in this
> * function would not fail amdgpu driver init.
> */
> - if (!exc_err_limit)
> + if (!con->is_rma)
> ret = 0;
> else
> ret = -EINVAL;
[Stanley]: Should the device service be stopped if the device enters the RMA state at runtime? The amdgpu_ras_recovery_init function is only called while the driver is being loaded.
Regards,
Stanley
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index d06c01b978cd..437c58c85639 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -521,6 +521,7 @@ struct amdgpu_ras {
> bool update_channel_flag;
> /* Record status of smu mca debug mode */
> bool is_aca_debug_mode;
> + bool is_rma;
>
> /* Record special requirements of gpu reset caller */
> uint32_t gpu_reset_flags;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 9b789dcc2bd1..eae0a555df3c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct
> amdgpu_ras_eeprom_control *control)
> control->tbl_rai.health_percent = 0;
> }
>
> + if (amdgpu_bad_page_threshold != -1)
> + ras->is_rma = true;
> +
> /* ignore the -ENOTSUPP return value */
> amdgpu_dpm_send_rma_reason(adev);
> }
> @@ -1321,8 +1324,7 @@ static int __read_table_ras_info(struct
> amdgpu_ras_eeprom_control *control)
> return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res; }
>
> -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> - bool *exceed_err_limit)
> +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
> {
> struct amdgpu_device *adev = to_amdgpu_device(control);
> unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; @@ -1330,7
> +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control
> *control,
> struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> int res;
>
> - *exceed_err_limit = false;
> + ras->is_rma = false;
>
> if (!__is_ras_eeprom_supported(adev))
> return 0;
> @@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct
> amdgpu_ras_eeprom_control *control,
> dev_warn(adev->dev, "GPU will be initialized
> due to bad_page_threshold = -1.");
> res = 0;
> } else {
> - *exceed_err_limit = true;
> + ras->is_rma = true;
> dev_err(adev->dev,
> "RAS records:%d exceed threshold:%d, "
> "GPU will not be initialized. Replace this
> GPU or increase the threshold", diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index 6dfd667f3013..b9ebda577797 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -129,8 +129,7 @@ struct eeprom_table_record {
> unsigned char mcumc_id;
> } __packed;
>
> -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> - bool *exceed_err_limit);
> +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
>
> int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control
> *control);
>
> --
> 2.34.1
More information about the amd-gfx
mailing list