[PATCH] Refine RAS bad page records counting and parsing in eeprom V3

Wed Apr 30 14:29:33 UTC 2025

On Wed, Apr 30, 2025 at 12:13 AM ganglxie <ganglxie at amd.com> wrote:

Please fix the patch title.  Please add a drm/amdgpu prefix.  E.g.,

drm/amdgpu: Refine RAS bad page records counting and parsing in eeprom V3

>
> there is only MCA records in V3, no need to care about PA records.
> recalculate the value of ras_num_bad_pages when parsing failed and
> go on with the left records instead of quit.
>
> Signed-off-by: ganglxie <ganglxie at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 61 +++++++++++++++----------
>  1 file changed, 36 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index e85143acf3a2..32f41f151c82 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2889,6 +2889,7 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
>                 if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
>                         return -EINVAL;
>         }
> +
>         return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
>                                                                         adev->umc.retire_unit);
>  }
> @@ -2903,7 +2904,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>                         &adev->psp.ras_context.ras->eeprom_control;
>         enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
>         int ret = 0;
> -       uint32_t i;
> +       uint32_t i = 0;
>
>         if (!con || !con->eh_data || !bps || pages <= 0)
>                 return 0;
> @@ -2924,28 +2925,31 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>         mutex_lock(&con->recovery_lock);
>
>         if (from_rom) {
> -               for (i = 0; i < pages; i++) {
> -                       if (control->ras_num_recs - i >= adev->umc.retire_unit) {
> -                               if ((bps[i].address == bps[i + 1].address) &&
> -                                   (bps[i].mem_channel == bps[i + 1].mem_channel)) {
> -                                       //deal with retire_unit records a time
> -                                       ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
> -                                                                       &bps[i], &err_data, nps);
> -                                       if (ret)
> -                                               goto free;
> -                                       i += (adev->umc.retire_unit - 1);
> +               /* there is no pa recs in V3, so skip pa recs processing */
> +               if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
> +                       for (i = 0; i < pages; i++) {
> +                               if (control->ras_num_recs - i >= adev->umc.retire_unit) {
> +                                       if ((bps[i].address == bps[i + 1].address) &&
> +                                               (bps[i].mem_channel == bps[i + 1].mem_channel)) {
> +                                               /* deal with retire_unit records a time */
> +                                               ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
> +                                                                               &bps[i], &err_data, nps);
> +                                               if (ret)
> +                                                       control->ras_num_bad_pages -= adev->umc.retire_unit;
> +                                               i += (adev->umc.retire_unit - 1);
> +                                       } else {
> +                                               break;
> +                                       }
>                                 } else {
>                                         break;
>                                 }
> -                       } else {
> -                               break;
>                         }
>                 }
>                 for (; i < pages; i++) {
>                         ret = __amdgpu_ras_convert_rec_from_rom(adev,
>                                 &bps[i], &err_data, nps);
>                         if (ret)
> -                               goto free;
> +                               control->ras_num_bad_pages -= adev->umc.retire_unit;
>                 }
>         } else {
>                 ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
> @@ -3040,21 +3044,28 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
>                 dev_err(adev->dev, "Failed to load EEPROM table records!");
>         } else {
>                 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
> -                       for (i = 0; i < control->ras_num_recs; i++) {
> -                               if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
> -                                       if ((bps[i].address == bps[i + 1].address) &&
> -                                               (bps[i].mem_channel == bps[i + 1].mem_channel)) {
> -                                               control->ras_num_pa_recs += adev->umc.retire_unit;
> -                                               i += (adev->umc.retire_unit - 1);
> +                       /*In V3, there is no pa recs, and some cases(when address==0) may be parsed
> +                       as pa recs, so add verion check to avoid it.
> +                       */
> +                       if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
> +                               for (i = 0; i < control->ras_num_recs; i++) {
> +                                       if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
> +                                               if ((bps[i].address == bps[i + 1].address) &&
> +                                                       (bps[i].mem_channel == bps[i + 1].mem_channel)) {
> +                                                       control->ras_num_pa_recs += adev->umc.retire_unit;
> +                                                       i += (adev->umc.retire_unit - 1);
> +                                               } else {
> +                                                       control->ras_num_mca_recs +=
> +                                                                               (control->ras_num_recs - i);
> +                                                       break;
> +                                               }
>                                         } else {
> -                                               control->ras_num_mca_recs +=
> -                                                                       (control->ras_num_recs - i);
> +                                               control->ras_num_mca_recs += (control->ras_num_recs - i);
>                                                 break;
>                                         }
> -                               } else {
> -                                       control->ras_num_mca_recs += (control->ras_num_recs - i);
> -                                       break;
>                                 }
> +                       } else {
> +                               control->ras_num_mca_recs = control->ras_num_recs;
>                         }
>                 }
>
> --
> 2.34.1
>