[PATCH 21/23] drm/amdgpu: parse legacy RAS bad page mixed with new data in various NPS modes
Tao Zhou
tao.zhou1 at amd.com
Fri Nov 8 11:14:21 UTC 2024
All legacy RAS bad pages are generated in NPS1 mode, but new bad pages can be
generated in any NPS mode, so we can't use the retired_page stored in eeprom
directly in non-NPS1 mode, even for legacy data. We need to take different
actions for different data; new data can be distinguished from old data by
the UMC_CHANNEL_IDX_V2 flag.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 45 +++++++++++++++++++------
1 file changed, 35 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index bc4a5db2793a..324c71d99175 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2758,12 +2758,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
is_mca_add = false;
}
- mutex_lock(&con->recovery_lock);
- data = con->eh_data;
- if (!data)
- goto out;
-
- if (is_mca_add) {
+ if (from_rom) {
err_data.err_addr =
kcalloc(adev->umc.retire_unit,
sizeof(struct eeprom_table_record), GFP_KERNEL);
@@ -2774,11 +2769,17 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
goto out;
}
+ err_rec = err_data.err_addr;
loop_cnt = adev->umc.retire_unit;
if (adev->gmc.gmc_funcs->query_mem_partition_mode)
nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
}
+ mutex_lock(&con->recovery_lock);
+ data = con->eh_data;
+ if (!data)
+ goto free;
+
for (i = 0; i < pages; i++) {
if (is_mca_add) {
if (!find_pages_per_pa) {
@@ -2800,10 +2801,34 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
goto free;
}
-
- err_rec = err_data.err_addr;
} else {
- err_rec = &bps[i];
+ if (from_rom && !find_pages_per_pa) {
+ if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
+ /* bad page in any NPS mode in eeprom */
+ if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data))
+ goto free;
+ } else {
+ /* legacy bad page in eeprom, generated only in NPS1 mode */
+ if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
+ /* old RAS TA or ASICs which don't support converting the address
+ * via mca address
+ */
+ if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
+ find_pages_per_pa = true;
+ err_rec = &bps[i];
+ loop_cnt = 1;
+ } else {
+ /* non-nps1 mode, old RAS TA can't support it */
+ goto free;
+ }
+ }
+ }
+
+ if (!find_pages_per_pa)
+ i += (adev->umc.retire_unit - 1);
+ } else {
+ err_rec = &bps[i];
+ }
}
for (j = 0; j < loop_cnt; j++) {
@@ -2827,7 +2852,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
}
free:
- if (is_mca_add)
+ if (from_rom)
kfree(err_data.err_addr);
out:
mutex_unlock(&con->recovery_lock);
--
2.34.1
More information about the amd-gfx
mailing list