[PATCH 1/2] drm/amdgpu: add RAS is_rma flag
Zhou1, Tao
Tao.Zhou1 at amd.com
Mon May 27 03:39:05 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: Yang, Stanley <Stanley.Yang at amd.com>
> Sent: Thursday, May 23, 2024 9:57 PM
> To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: RE: [PATCH 1/2] drm/amdgpu: add RAS is_rma flag
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> > -----Original Message-----
> > From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao
> > Zhou
> > Sent: Thursday, May 23, 2024 6:02 PM
> > To: amd-gfx at lists.freedesktop.org
> > Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> > Subject: [PATCH 1/2] drm/amdgpu: add RAS is_rma flag
> >
> > Set the flag to true if the number of bad pages reaches the threshold.
> >
> > Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++----
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++++++----
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 +--
> > 4 files changed, 11 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index ecce022c657b..934dfb2bf9e5 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -2940,7 +2940,6 @@ int amdgpu_ras_recovery_init(struct
> > amdgpu_device
> > *adev)
> > struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> > struct ras_err_handler_data **data;
> > u32 max_eeprom_records_count = 0;
> > - bool exc_err_limit = false;
> > int ret;
> >
> > if (!con || amdgpu_sriov_vf(adev)) @@ -2977,12 +2976,12 @@ int
> > amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> > */
> > if (adev->gmc.xgmi.pending_reset)
> > return 0;
> > - ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
> > + ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
> > /*
> > * This calling fails when exc_err_limit is true or
> > * ret != 0.
> > */
> > - if (exc_err_limit || ret)
> > + if (con->is_rma || ret)
> > goto free;
> >
> > if (con->eeprom_control.ras_num_recs) { @@ -3033,7 +3032,7 @@
> > int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> > * Except error threshold exceeding case, other failure cases in this
> > * function would not fail amdgpu driver init.
> > */
> > - if (!exc_err_limit)
> > + if (!con->is_rma)
> > ret = 0;
> > else
> > ret = -EINVAL;
>
> [Stanley]: Should we stop device service if the device enters the RMA state at
> runtime? The amdgpu_ras_recovery_init function is only called while the driver
> is being loaded.
[Tao] Yes, I plan to stop service in the resume stage after mode-1 if a run-time RMA is reported. However, I have no environment to verify the design right now, so this remains a TODO for the time being.
>
> Regards,
> Stanley
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index d06c01b978cd..437c58c85639 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -521,6 +521,7 @@ struct amdgpu_ras {
> > bool update_channel_flag;
> > /* Record status of smu mca debug mode */
> > bool is_aca_debug_mode;
> > + bool is_rma;
> >
> > /* Record special requirements of gpu reset caller */
> > uint32_t gpu_reset_flags;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > index 9b789dcc2bd1..eae0a555df3c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > @@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct
> > amdgpu_ras_eeprom_control *control)
> > control->tbl_rai.health_percent = 0;
> > }
> >
> > + if (amdgpu_bad_page_threshold != -1)
> > + ras->is_rma = true;
> > +
> > /* ignore the -ENOTSUPP return value */
> > amdgpu_dpm_send_rma_reason(adev);
> > }
> > @@ -1321,8 +1324,7 @@ static int __read_table_ras_info(struct
> > amdgpu_ras_eeprom_control *control)
> > return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res; }
> >
> > -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> > - bool *exceed_err_limit)
> > +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
> > {
> > struct amdgpu_device *adev = to_amdgpu_device(control);
> > unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; @@ -1330,7
> > +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control
> > *control,
> > struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> > int res;
> >
> > - *exceed_err_limit = false;
> > + ras->is_rma = false;
> >
> > if (!__is_ras_eeprom_supported(adev))
> > return 0;
> > @@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct
> > amdgpu_ras_eeprom_control *control,
> > dev_warn(adev->dev, "GPU will be
> > initialized due to bad_page_threshold = -1.");
> > res = 0;
> > } else {
> > - *exceed_err_limit = true;
> > + ras->is_rma = true;
> > dev_err(adev->dev,
> > "RAS records:%d exceed threshold:%d, "
> > "GPU will not be initialized.
> > Replace this GPU or increase the threshold", diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > index 6dfd667f3013..b9ebda577797 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > @@ -129,8 +129,7 @@ struct eeprom_table_record {
> > unsigned char mcumc_id;
> > } __packed;
> >
> > -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
> > - bool *exceed_err_limit);
> > +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control
> > +*control);
> >
> > int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control
> > *control);
> >
> > --
> > 2.34.1
>
More information about the amd-gfx
mailing list