[PATCH] drm/amdgpu: handle old RAS eeprom data in non-nps1 mode

Zhang, Hawking Hawking.Zhang at amd.com
Wed Apr 30 09:12:21 UTC 2025


[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>

Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Wednesday, April 30, 2025 16:39
To: amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH] drm/amdgpu: handle old RAS eeprom data in non-nps1 mode

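Some old RAS eeprom records do not store the MCA address. For such records, get the MCA address from the PA as recorded in nps1 mode, then convert that MCA address to the PA in the specific nps mode in use.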

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 22 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  2 ++
 3 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a5a853894ab0..a9d2e7fb3e47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2889,8 +2889,20 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
                                bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
                        return -EINVAL;
        } else {
-               if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
-                       return -EINVAL;
+               if (bps->address) {
+                       if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
+                               return -EINVAL;
+               } else {
+                       /* for specific old eeprom data, mca address is not stored,
+                        * calc it from pa
+                        */
+                       if (amdgpu_umc_pa2mca(adev, bps->retired_page,
+                               &(bps->address), AMDGPU_NPS1_PARTITION_MODE))
+                               return -EINVAL;
+
+                       if (amdgpu_ras_mca2pa(adev, bps, err_data))
+                               return -EOPNOTSUPP;
+               }
        }
        return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
                                                                        adev->umc.retire_unit);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 8adceeee298b..6337b6406006 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -565,3 +565,25 @@ int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,

        return 0;
 }
+
+int amdgpu_umc_pa2mca(struct amdgpu_device *adev,
+               uint64_t pa, uint64_t *mca, enum amdgpu_memory_partition nps)
+{
+       struct ta_ras_query_address_input addr_in = {0};   /* fields not set below stay zero */
+       struct ta_ras_query_address_output addr_out = {0}; /* never read back stack garbage */
+       int ret;
+
+       /* PA to MCA query: nps, the mode the pa belongs to, is OR'ed in from bit 58 */
+       addr_in.pa.pa = pa | ((uint64_t)nps << 58);
+       addr_in.addr_type = TA_RAS_PA_TO_MCA;
+       ret = psp_ras_query_address(&adev->psp, &addr_in, &addr_out);
+       if (ret) {
+               dev_warn(adev->dev, "Failed to query RAS MCA address for 0x%llx\n", pa);
+
+               return ret;     /* *mca is left untouched on failure */
+       }
+
+       *mca = addr_out.ma.err_addr;    /* hand the translated MCA address to the caller */
+
+       return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 78a8b8654573..d6929d6f64f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -187,4 +187,6 @@ int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
                        uint64_t err_addr, uint32_t ch, uint32_t umc,
                        uint32_t node, uint32_t socket,
                        struct ta_ras_query_address_output *addr_out, bool dump_addr);
+int amdgpu_umc_pa2mca(struct amdgpu_device *adev,
+               uint64_t pa, uint64_t *mca, enum amdgpu_memory_partition nps);
 #endif
--
2.34.1



More information about the amd-gfx mailing list