[PATCH] drm/amdgpu: Save PA of bad pages for old asics
Zhou1, Tao
Tao.Zhou1 at amd.com
Wed Mar 12 07:17:56 UTC 2025
[AMD Official Use Only - AMD Internal Distribution Only]
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> -----Original Message-----
> From: Xie, Patrick <Gangliang.Xie at amd.com>
> Sent: Wednesday, March 12, 2025 2:16 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Xie, Patrick <Gangliang.Xie at amd.com>
> Subject: [PATCH] drm/amdgpu: Save PA of bad pages for old asics
>
> For old ASICs that do not support MCA address translation, we just save the PA (physical address) of bad pages.
>
> Signed-off-by: ganglxie <ganglxie at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 24 ++++++++++++++++---
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 9 +++++--
> 2 files changed, 28 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 285e3aa2bb2f..7cf8a3036828 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2836,6 +2836,13 @@ static int
> __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
>
> save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) &
> UMC_NPS_MASK;
>
> + /*old asics just have pa in eeprom*/
> + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
> + memcpy(err_data->err_addr, bps,
> + sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
> + goto out;
> + }
> +
> for (i = 0; i < adev->umc.retire_unit; i++)
> bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
>
> @@ -2858,6 +2865,7 @@ static int
> __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
> }
> }
>
> +out:
> return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev-
> >umc.retire_unit); }
>
> @@ -2981,14 +2989,24 @@ int amdgpu_ras_save_bad_pages(struct
> amdgpu_device *adev,
>
> /* only new entries are saved */
> if (save_count > 0) {
> - for (i = 0; i < unit_num; i++) {
> + /*old asics only save pa to eeprom like before*/
> + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) <
> 12) {
> if (amdgpu_ras_eeprom_append(control,
> - &data->bps[bad_page_num + i * adev-
> >umc.retire_unit],
> - 1)) {
> + &data->bps[bad_page_num], save_count)) {
> dev_err(adev->dev, "Failed to save EEPROM table
> data!");
> return -EIO;
> }
> + } else {
> + for (i = 0; i < unit_num; i++) {
> + if (amdgpu_ras_eeprom_append(control,
> + &data->bps[bad_page_num +
> + i * adev->umc.retire_unit], 1)) {
> + dev_err(adev->dev, "Failed to save EEPROM
> table data!");
> + return -EIO;
> + }
> + }
> }
> +
> dev_info(adev->dev, "Saved %d pages to EEPROM table.\n",
> save_count);
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 09a6f8bc1a5a..3597ecd9baca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -727,9 +727,14 @@ amdgpu_ras_eeprom_append_table(struct
> amdgpu_ras_eeprom_control *control,
> - control->ras_fri)
> % control->ras_max_record_count;
>
> - control->ras_num_mca_recs += num;
> - control->ras_num_bad_pages += num * adev->umc.retire_unit;
> + /*old asics only save pa to eeprom like before*/
> + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12)
> + control->ras_num_pa_recs += num;
> + else
> + control->ras_num_mca_recs += num;
>
> + control->ras_num_bad_pages = control->ras_num_pa_recs +
> + control->ras_num_mca_recs * adev->umc.retire_unit;
> Out:
> kfree(buf);
> return res;
> --
> 2.34.1
More information about the amd-gfx
mailing list