[PATCH] drm/amdgpu: Update EEPROM RAS table for mismatched table version

Zhou1, Tao Tao.Zhou1 at amd.com
Fri Mar 29 07:32:16 UTC 2024


[AMD Official Use Only - General]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Candice Li
> Sent: Wednesday, March 27, 2024 2:16 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Li, Candice <Candice.Li at amd.com>
> Subject: [PATCH] drm/amdgpu: Update EEPROM RAS table for mismatched table
> version
>
> Update table version and restore bad page records to EEPROM RAS table for
> mismatched table version case. Otherwise force to reset the table.
>
> Signed-off-by: Candice Li <candice.li at amd.com>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 88 ++++++++++++++++---
>  1 file changed, 78 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 06a62a8a992e9b..42d0ef2f512474 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -1319,6 +1319,37 @@ static int __read_table_ras_info(struct
> amdgpu_ras_eeprom_control *control)
>       return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;  }
>
> +static bool amdgpu_ras_eeprom_table_version_validate(struct
> +amdgpu_ras_eeprom_control *control) {
> +     struct amdgpu_device *adev = to_amdgpu_device(control);
> +     struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
> +
> +     switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
> +     case IP_VERSION(8, 10, 0):
> +     case IP_VERSION(12, 0, 0):
> +             return hdr->version == RAS_TABLE_VER_V2_1;
> +     default:
> +             return hdr->version == RAS_TABLE_VER_V1;
> +     }
> +}
> +
> +static void amdgpu_ras_update_eeprom_control(struct
> +amdgpu_ras_eeprom_table_header *hdr) {
> +     struct amdgpu_ras_eeprom_control *control =
> +             container_of(hdr, struct amdgpu_ras_eeprom_control, tbl_hdr);
> +
> +     if (hdr->version == RAS_TABLE_VER_V2_1) {
> +             control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
> +             control->ras_record_offset = RAS_RECORD_START_V2_1;
> +             control->ras_max_record_count =
> RAS_MAX_RECORD_COUNT_V2_1;
> +     } else {
> +             control->ras_num_recs = RAS_NUM_RECS(hdr);
> +             control->ras_record_offset = RAS_RECORD_START;
> +             control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
> +     }
> +     control->ras_fri = RAS_OFFSET_TO_INDEX(control,
> +hdr->first_rec_offset); }
> +
>  int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
>                          bool *exceed_err_limit)
>  {
> @@ -1326,7 +1357,9 @@ int amdgpu_ras_eeprom_init(struct
> amdgpu_ras_eeprom_control *control,
>       unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
>       struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
>       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -     int res;
> +     int res, res1;
> +     struct eeprom_table_record *bps;
> +     u32 num_recs;
>
>       *exceed_err_limit = false;
>
> @@ -1355,16 +1388,51 @@ int amdgpu_ras_eeprom_init(struct
> amdgpu_ras_eeprom_control *control,
>
>       __decode_table_header_from_buf(hdr, buf);
>
> -     if (hdr->version == RAS_TABLE_VER_V2_1) {
> -             control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
> -             control->ras_record_offset = RAS_RECORD_START_V2_1;
> -             control->ras_max_record_count =
> RAS_MAX_RECORD_COUNT_V2_1;
> -     } else {
> -             control->ras_num_recs = RAS_NUM_RECS(hdr);
> -             control->ras_record_offset = RAS_RECORD_START;
> -             control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
> +     amdgpu_ras_update_eeprom_control(hdr);
> +
> +     if (!amdgpu_ras_eeprom_table_version_validate(control)) {
> +             num_recs = control->ras_num_recs;
> +             if (num_recs && amdgpu_bad_page_threshold) {
> +                     /* Save bad page records existed in EEPROM */
> +                     bps = kcalloc(num_recs, sizeof(*bps), GFP_KERNEL);
> +                     if (!bps)
> +                             return -ENOMEM;
> +
> +                     res1 = amdgpu_ras_eeprom_read(control, bps,
> num_recs);
> +                     if (res1)
> +                             dev_warn(adev->dev, "Fail to load EEPROM
> table, force to reset
> +it.");
> +
> +                     res = amdgpu_ras_eeprom_reset_table(control);
> +                     if (res) {
> +                             dev_err(adev->dev, "Failed to create a new
> EEPROM table.");
> +                             kfree(bps);
> +                             return res < 0 ? res : 0;
> +                     }
> +
> +                     if (!res1) {
> +                             /* Update the EEPROM table with correct table
> version and
> +                              * original bad page records
> +                              */
> +                             amdgpu_ras_update_eeprom_control(hdr);
> +                             res = amdgpu_ras_eeprom_append(control, bps,
> num_recs);
> +
> +                             if (res) {
> +                                     dev_warn(adev->dev, "Fail to update
> EEPROM table, force to reset it.");
> +                                     res =
> amdgpu_ras_eeprom_reset_table(control);

[Tao] I think the reset here can be dropped. Apart from this, the patch is:

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

It would be better to also get Stanley's Reviewed-by.

> +                             }
> +                     }
> +
> +                     kfree(bps);
> +             } else
> +                     res = amdgpu_ras_eeprom_reset_table(control);
> +
> +             if (res) {
> +                     dev_err(adev->dev, "Failed to reset EEPROM table.");
> +                     return res < 0 ? res : 0;
> +             }
> +
> +             amdgpu_ras_update_eeprom_control(hdr);
>       }
> -     control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
>
>       if (hdr->header == RAS_TABLE_HDR_VAL) {
>               DRM_DEBUG_DRIVER("Found existing EEPROM table with %d
> records",
> --
> 2.25.1



More information about the amd-gfx mailing list