[PATCH 13/21] drm/amdgpu: support to find RAS bad pages via old TA
Tao Zhou
tao.zhou1 at amd.com
Tue Nov 19 06:35:56 UTC 2024
Old versions of the RAS TA do not support converting an MCA address stored in
the EEPROM to a physical address (PA). When running with an old RAS TA, find all
bad pages in one memory row by PA instead. This approach is only suitable for NPS1 mode.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 26 ++++++++++++++++++++++---
1 file changed, 23 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c0a011a59d59..681b756f6428 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2703,9 +2703,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *err_rec;
struct amdgpu_ras_eeprom_control *control =
&adev->psp.ras_context.ras->eeprom_control;
+ enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
int ret = 0;
uint32_t i, j, loop_cnt = 1;
- bool is_mca_add = true;
+ bool is_mca_add = true, find_pages_per_pa = false;
if (!con || !con->eh_data || !bps || pages <= 0)
return 0;
@@ -2736,12 +2737,31 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
}
loop_cnt = adev->umc.retire_unit;
+ if (adev->gmc.gmc_funcs->query_mem_partition_mode)
+ nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
}
for (i = 0; i < pages; i++) {
if (is_mca_add) {
- if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data))
- goto free;
+ if (!find_pages_per_pa) {
+ if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
+ if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
+ /* may use old RAS TA, use PA to find pages in one row */
+ if (amdgpu_umc_pages_in_a_row(adev, &err_data,
+ bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ goto free;
+ else
+ find_pages_per_pa = true;
+ } else {
+ /* unsupported cases */
+ goto free;
+ }
+ }
+ } else {
+ if (amdgpu_umc_pages_in_a_row(adev, &err_data,
+ bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ goto free;
+ }
err_rec = err_data.err_addr;
} else {
--
2.34.1
More information about the amd-gfx
mailing list