[PATCH] Add support for leagcy records in eeprom format V3

Zhou1, Tao Tao.Zhou1 at amd.com
Tue Apr 29 08:52:48 UTC 2025


[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Xie, Patrick <Gangliang.Xie at amd.com>
> Sent: Tuesday, April 29, 2025 3:01 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Xie, Patrick <Gangliang.Xie at amd.com>
> Subject: [PATCH] Add support for leagcy records in eeprom format V3
>
> After eeprom records format upgrades to V3, records that have 'address == 0' should
> be supported in NPS1
>
> Signed-off-by: ganglxie <ganglxie at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 73 ++++++++++++++++---------
>  1 file changed, 48 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index e85143acf3a2..4f4e12452ecc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2881,6 +2881,17 @@ static int __amdgpu_ras_convert_rec_from_rom(struct
> amdgpu_device *adev,
>       save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
>       bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
>
> +     /* if address== 0, the rec must be a PA rec under NPS1 */
> +     if (bps->address == 0) {
> +             if (nps == AMDGPU_NPS1_PARTITION_MODE) {
> +                     if (amdgpu_umc_pages_in_a_row(adev, err_data,
> +                             bps->retired_page <<
> AMDGPU_GPU_PAGE_SHIFT))
> +                             return -EINVAL;
> +                     goto out;
> +             } else {
> +                     return -EOPNOTSUPP;
> +             }
> +     }

[Tao] better to add an empty line here.

>       if (save_nps == nps) {

[Tao] is possible that both "bps->address == 0" and "(save_nps == nps " are true?

>               if (amdgpu_umc_pages_in_a_row(adev, err_data,
>                               bps->retired_page <<
> AMDGPU_GPU_PAGE_SHIFT)) @@ -2889,6 +2900,8 @@ static int
> __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
>               if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
>                       return -EINVAL;
>       }
> +
> +out:
>       return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
>                                                                       adev-
> >umc.retire_unit);
>  }
> @@ -2903,7 +2916,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device
> *adev,
>                       &adev->psp.ras_context.ras->eeprom_control;
>       enum amdgpu_memory_partition nps =
> AMDGPU_NPS1_PARTITION_MODE;
>       int ret = 0;
> -     uint32_t i;
> +     uint32_t i = 0;
>
>       if (!con || !con->eh_data || !bps || pages <= 0)
>               return 0;
> @@ -2924,28 +2937,31 @@ int amdgpu_ras_add_bad_pages(struct
> amdgpu_device *adev,
>       mutex_lock(&con->recovery_lock);
>
>       if (from_rom) {
> -             for (i = 0; i < pages; i++) {
> -                     if (control->ras_num_recs - i >= adev->umc.retire_unit) {
> -                             if ((bps[i].address == bps[i + 1].address) &&
> -                                 (bps[i].mem_channel == bps[i + 1].mem_channel)) {
> -                                     //deal with retire_unit records a time
> -                                     ret =
> __amdgpu_ras_convert_rec_array_from_rom(adev,
> -                                                                     &bps[i],
> &err_data, nps);
> -                                     if (ret)
> -                                             goto free;
> -                                     i += (adev->umc.retire_unit - 1);
> +             /* there is no pa recs in V3, so skip pa recs processing */
> +             if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
> +                     for (i = 0; i < pages; i++) {
> +                             if (control->ras_num_recs - i >= adev->umc.retire_unit)
> {
> +                                     if ((bps[i].address == bps[i + 1].address) &&
> +                                             (bps[i].mem_channel == bps[i +
> 1].mem_channel)) {
> +                                             /* deal with retire_unit records a time */
> +                                             ret =
> __amdgpu_ras_convert_rec_array_from_rom(adev,
> +                                                                             &bps[i],
> &err_data, nps);
> +                                             if (ret)
> +                                                     control->ras_num_bad_pages -
> = adev->umc.retire_unit;
> +                                             i += (adev->umc.retire_unit - 1);
> +                                     } else {
> +                                             break;
> +                                     }
>                               } else {
>                                       break;
>                               }
> -                     } else {
> -                             break;
>                       }
>               }
>               for (; i < pages; i++) {
>                       ret = __amdgpu_ras_convert_rec_from_rom(adev,
>                               &bps[i], &err_data, nps);
>                       if (ret)
> -                             goto free;
> +                             control->ras_num_bad_pages -= adev-
> >umc.retire_unit;
>               }
>       } else {
>               ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages); @@ -
> 3040,21 +3056,28 @@ static int amdgpu_ras_load_bad_pages(struct
> amdgpu_device *adev)
>               dev_err(adev->dev, "Failed to load EEPROM table records!");
>       } else {
>               if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
> -                     for (i = 0; i < control->ras_num_recs; i++) {
> -                             if ((control->ras_num_recs - i) >= adev-
> >umc.retire_unit) {
> -                                     if ((bps[i].address == bps[i + 1].address) &&
> -                                             (bps[i].mem_channel == bps[i +
> 1].mem_channel)) {
> -                                             control->ras_num_pa_recs += adev-
> >umc.retire_unit;
> -                                             i += (adev->umc.retire_unit - 1);
> +                     /*In V3, there is no pa recs, and some cases(when
> address==0) may be parsed
> +                     as pa recs, so add verion check to avoid it.
> +                     */
> +                     if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
> +                             for (i = 0; i < control->ras_num_recs; i++) {
> +                                     if ((control->ras_num_recs - i) >= adev-
> >umc.retire_unit) {
> +                                             if ((bps[i].address == bps[i +
> 1].address) &&
> +                                                     (bps[i].mem_channel == bps[i +
> 1].mem_channel)) {
> +                                                     control->ras_num_pa_recs +=
> adev->umc.retire_unit;
> +                                                     i += (adev->umc.retire_unit - 1);
> +                                             } else {
> +                                                     control->ras_num_mca_recs +=
> +                                                                             (control-
> >ras_num_recs - i);
> +                                                     break;
> +                                             }
>                                       } else {
> -                                             control->ras_num_mca_recs +=
> -                                                                     (control-
> >ras_num_recs - i);
> +                                             control->ras_num_mca_recs +=
> (control->ras_num_recs - i);
>                                               break;
>                                       }
> -                             } else {
> -                                     control->ras_num_mca_recs += (control-
> >ras_num_recs - i);
> -                                     break;
>                               }
> +                     } else {
> +                             control->ras_num_mca_recs = control-
> >ras_num_recs;
>                       }
>               }
>
> --
> 2.34.1



More information about the amd-gfx mailing list