[PATCH] Refine RAS bad page records counting and parsing in eeprom V3
Zhou1, Tao
Tao.Zhou1 at amd.com
Wed Apr 30 07:41:25 UTC 2025
[AMD Official Use Only - AMD Internal Distribution Only]
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> -----Original Message-----
> From: Xie, Patrick <Gangliang.Xie at amd.com>
> Sent: Wednesday, April 30, 2025 12:05 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Xie, Patrick <Gangliang.Xie at amd.com>
> Subject: [PATCH] Refine RAS bad page records counting and parsing in eeprom V3
>
> there is only MCA records in V3, no need to care about PA records.
> recalculate the value of ras_num_bad_pages when parsing failed and go on with the
> left records instead of quit.
>
> Signed-off-by: ganglxie <ganglxie at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 61 +++++++++++++++----------
> 1 file changed, 36 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index e85143acf3a2..32f41f151c82 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2889,6 +2889,7 @@ static int __amdgpu_ras_convert_rec_from_rom(struct
> amdgpu_device *adev,
> if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
> return -EINVAL;
> }
> +
> return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
> adev-
> >umc.retire_unit);
> }
> @@ -2903,7 +2904,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device
> *adev,
> &adev->psp.ras_context.ras->eeprom_control;
> enum amdgpu_memory_partition nps =
> AMDGPU_NPS1_PARTITION_MODE;
> int ret = 0;
> - uint32_t i;
> + uint32_t i = 0;
>
> if (!con || !con->eh_data || !bps || pages <= 0)
> return 0;
> @@ -2924,28 +2925,31 @@ int amdgpu_ras_add_bad_pages(struct
> amdgpu_device *adev,
> mutex_lock(&con->recovery_lock);
>
> if (from_rom) {
> - for (i = 0; i < pages; i++) {
> - if (control->ras_num_recs - i >= adev->umc.retire_unit) {
> - if ((bps[i].address == bps[i + 1].address) &&
> - (bps[i].mem_channel == bps[i + 1].mem_channel)) {
> - //deal with retire_unit records a time
> - ret =
> __amdgpu_ras_convert_rec_array_from_rom(adev,
> - &bps[i],
> &err_data, nps);
> - if (ret)
> - goto free;
> - i += (adev->umc.retire_unit - 1);
> + /* there is no pa recs in V3, so skip pa recs processing */
> + if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
> + for (i = 0; i < pages; i++) {
> + if (control->ras_num_recs - i >= adev->umc.retire_unit)
> {
> + if ((bps[i].address == bps[i + 1].address) &&
> + (bps[i].mem_channel == bps[i +
> 1].mem_channel)) {
> + /* deal with retire_unit records a time */
> + ret =
> __amdgpu_ras_convert_rec_array_from_rom(adev,
> + &bps[i],
> &err_data, nps);
> + if (ret)
> + control->ras_num_bad_pages -
> = adev->umc.retire_unit;
> + i += (adev->umc.retire_unit - 1);
> + } else {
> + break;
> + }
> } else {
> break;
> }
> - } else {
> - break;
> }
> }
> for (; i < pages; i++) {
> ret = __amdgpu_ras_convert_rec_from_rom(adev,
> &bps[i], &err_data, nps);
> if (ret)
> - goto free;
> + control->ras_num_bad_pages -= adev-
> >umc.retire_unit;
> }
> } else {
> ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages); @@ -
> 3040,21 +3044,28 @@ static int amdgpu_ras_load_bad_pages(struct
> amdgpu_device *adev)
> dev_err(adev->dev, "Failed to load EEPROM table records!");
> } else {
> if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
> - for (i = 0; i < control->ras_num_recs; i++) {
> - if ((control->ras_num_recs - i) >= adev-
> >umc.retire_unit) {
> - if ((bps[i].address == bps[i + 1].address) &&
> - (bps[i].mem_channel == bps[i +
> 1].mem_channel)) {
> - control->ras_num_pa_recs += adev-
> >umc.retire_unit;
> - i += (adev->umc.retire_unit - 1);
> + /*In V3, there is no pa recs, and some cases(when
> address==0) may be parsed
> + as pa recs, so add verion check to avoid it.
> + */
> + if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
> + for (i = 0; i < control->ras_num_recs; i++) {
> + if ((control->ras_num_recs - i) >= adev-
> >umc.retire_unit) {
> + if ((bps[i].address == bps[i +
> 1].address) &&
> + (bps[i].mem_channel == bps[i +
> 1].mem_channel)) {
> + control->ras_num_pa_recs +=
> adev->umc.retire_unit;
> + i += (adev->umc.retire_unit - 1);
> + } else {
> + control->ras_num_mca_recs +=
> + (control-
> >ras_num_recs - i);
> + break;
> + }
> } else {
> - control->ras_num_mca_recs +=
> - (control-
> >ras_num_recs - i);
> + control->ras_num_mca_recs +=
> (control->ras_num_recs - i);
> break;
> }
> - } else {
> - control->ras_num_mca_recs += (control-
> >ras_num_recs - i);
> - break;
> }
> + } else {
> + control->ras_num_mca_recs = control-
> >ras_num_recs;
> }
> }
>
> --
> 2.34.1
More information about the amd-gfx
mailing list