[PATCH 2/3] drm/amdgpu: Refine bad page adding
ganglxie
ganglxie at amd.com
Fri Feb 21 03:18:35 UTC 2025
bad page adding can be simpler with nps info
Signed-off-by: ganglxie <ganglxie at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 196 +++++++++++++-----------
1 file changed, 105 insertions(+), 91 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5420e2d6d244..439841a2d1c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2801,20 +2801,101 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
return -EINVAL;
}
+static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, int count)
+{
+ int j;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_err_handler_data *data = con->eh_data;
+
+ for (j = 0; j < count; j++) {
+ if (amdgpu_ras_check_bad_page_unlock(con,
+ bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ continue;
+
+ if (!data->space_left &&
+ amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+ return -ENOMEM;
+ }
+
+ amdgpu_ras_reserve_page(adev, bps[j].retired_page);
+
+ memcpy(&data->bps[data->count], &(bps[j]),
+ sizeof(struct eeprom_table_record));
+ data->count++;
+ data->space_left--;
+ }
+
+ return 0;
+}
+
+static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, struct ras_err_data *err_data,
+ enum amdgpu_memory_partition nps)
+{
+ int i = 0;
+ int ret = 0;
+ enum amdgpu_memory_partition save_nps;
+
+ save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
+
+ for (i = 0; i < adev->umc.retire_unit; i++)
+ bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+
+ if (save_nps) {
+ if (save_nps == nps) {
+ if (amdgpu_umc_pages_in_a_row(adev, err_data,
+ bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ return -EINVAL;
+ } else {
+ if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
+ return -EINVAL;
+ }
+ } else {
+ if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
+ if (nps == AMDGPU_NPS1_PARTITION_MODE)
+ memcpy(err_data->err_addr, bps,
+ sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
+ else
+ return -EOPNOTSUPP;
+ }
+ }
+
+ return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit);
+}
+
+static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, struct ras_err_data *err_data,
+ enum amdgpu_memory_partition nps)
+{
+ enum amdgpu_memory_partition save_nps;
+
+ save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
+ bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+
+ if (save_nps == nps) {
+ if (amdgpu_umc_pages_in_a_row(adev, err_data,
+ bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ return -EINVAL;
+ } else {
+ if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
+ return -EINVAL;
+ }
+ return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
+ adev->umc.retire_unit);
+}
+
/* it deal with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages, bool from_rom)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- struct ras_err_handler_data *data;
struct ras_err_data err_data;
- struct eeprom_table_record *err_rec;
struct amdgpu_ras_eeprom_control *control =
&adev->psp.ras_context.ras->eeprom_control;
enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
int ret = 0;
- uint32_t i, j, loop_cnt = 1;
- bool find_pages_per_pa = false;
+ uint32_t i;
if (!con || !con->eh_data || !bps || pages <= 0)
return 0;
@@ -2825,108 +2906,41 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
sizeof(struct eeprom_table_record), GFP_KERNEL);
if (!err_data.err_addr) {
dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
- err_rec = err_data.err_addr;
- loop_cnt = adev->umc.retire_unit;
if (adev->gmc.gmc_funcs->query_mem_partition_mode)
nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
}
mutex_lock(&con->recovery_lock);
- data = con->eh_data;
- if (!data) {
- /* Returning 0 as the absence of eh_data is acceptable */
- goto free;
- }
-
- for (i = 0; i < pages; i++) {
- if (from_rom &&
- control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
- if (!find_pages_per_pa) {
- if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
- if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
- /* may use old RAS TA, use PA to find pages in
- * one row
- */
- if (amdgpu_umc_pages_in_a_row(adev, &err_data,
- bps[i].retired_page <<
- AMDGPU_GPU_PAGE_SHIFT)) {
- ret = -EINVAL;
- goto free;
- } else {
- find_pages_per_pa = true;
- }
- } else {
- /* unsupported cases */
- ret = -EOPNOTSUPP;
- goto free;
- }
- }
- } else {
- if (amdgpu_umc_pages_in_a_row(adev, &err_data,
- bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
- ret = -EINVAL;
- goto free;
- }
- }
- } else {
- if (from_rom && !find_pages_per_pa) {
- if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
- /* bad page in any NPS mode in eeprom */
- if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
- ret = -EINVAL;
+
+ if (from_rom) {
+ for (i = 0; i < pages; i++) {
+ if (control->ras_num_recs - i >= adev->umc.retire_unit) {
+ if ((bps[i].address == bps[i + 1].address) &&
+ (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+ //deal with retire_unit records a time
+ ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
+ &bps[i], &err_data, nps);
+ if (ret)
goto free;
- }
+ i += (adev->umc.retire_unit - 1);
} else {
- /* legacy bad page in eeprom, generated only in
- * NPS1 mode
- */
- if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
- /* old RAS TA or ASICs which don't support to
- * convert addrss via mca address
- */
- if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
- find_pages_per_pa = true;
- err_rec = &bps[i];
- loop_cnt = 1;
- } else {
- /* non-nps1 mode, old RAS TA
- * can't support it
- */
- ret = -EOPNOTSUPP;
- goto free;
- }
- }
+ break;
}
-
- if (!find_pages_per_pa)
- i += (adev->umc.retire_unit - 1);
} else {
- err_rec = &bps[i];
+ break;
}
}
-
- for (j = 0; j < loop_cnt; j++) {
- if (amdgpu_ras_check_bad_page_unlock(con,
- err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
- continue;
-
- if (!data->space_left &&
- amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
- ret = -ENOMEM;
+ for (; i < pages; i++) {
+ ret = __amdgpu_ras_convert_rec_from_rom(adev,
+ &bps[i], &err_data, nps);
+ if (ret)
goto free;
- }
-
- amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
-
- memcpy(&data->bps[data->count], &(err_rec[j]),
- sizeof(struct eeprom_table_record));
- data->count++;
- data->space_left--;
}
+ } else {
+ ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
}
free:
--
2.34.1
More information about the amd-gfx
mailing list