[PATCH] drm/amdgpu: Save nps to eeprom and refine code
ganglxie
ganglxie at amd.com
Thu Feb 20 06:16:37 UTC 2025
add nps info into eeprom records, and refine adding bad page
logic based on nps info.
Signed-off-by: ganglxie <ganglxie at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 244 +++++++++---------
.../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 25 +-
.../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 20 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 7 +
4 files changed, 153 insertions(+), 143 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5420e2d6d244..3945dda54b3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2801,20 +2801,101 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
return -EINVAL;
}
+static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, int count)
+{
+ int j;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_err_handler_data *data = con->eh_data;
+
+ for (j = 0; j < count; j++) {
+ if (amdgpu_ras_check_bad_page_unlock(con,
+ bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ continue;
+
+ if (!data->space_left &&
+ amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+ return -ENOMEM;
+ }
+
+ amdgpu_ras_reserve_page(adev, bps[j].retired_page);
+
+ memcpy(&data->bps[data->count], &(bps[j]),
+ sizeof(struct eeprom_table_record));
+ data->count++;
+ data->space_left--;
+ }
+
+ return 0;
+}
+
+static int __amdgpu_ras_convert_unit_rec_from_rom(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, struct ras_err_data *err_data,
+ enum amdgpu_memory_partition nps)
+{
+ int i = 0;
+ int ret = 0;
+ enum amdgpu_memory_partition save_nps;
+
+ save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
+
+ for (i = 0; i < adev->umc.retire_unit; i++)
+ bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+
+ if (save_nps) {
+ if (save_nps == nps) {
+ if (amdgpu_umc_pages_in_a_row(adev, err_data,
+ bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ return -EINVAL;
+ } else {
+ if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
+ return -EINVAL;
+ }
+ } else {
+ if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
+ if (nps == AMDGPU_NPS1_PARTITION_MODE)
+ memcpy(err_data->err_addr, bps,
+ sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
+ else
+ return -EOPNOTSUPP;
+ }
+ }
+
+ return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit);
+}
+
+static int __amdgpu_ras_convert_single_rec_from_rom(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, struct ras_err_data *err_data,
+ enum amdgpu_memory_partition nps)
+{
+ enum amdgpu_memory_partition save_nps;
+
+ save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
+ bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+
+ if (save_nps == nps) {
+ if (amdgpu_umc_pages_in_a_row(adev, err_data,
+ bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ return -EINVAL;
+ } else {
+ if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
+ return -EINVAL;
+ }
+ return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
+ adev->umc.retire_unit);
+}
+
/* it deal with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages, bool from_rom)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- struct ras_err_handler_data *data;
struct ras_err_data err_data;
- struct eeprom_table_record *err_rec;
struct amdgpu_ras_eeprom_control *control =
&adev->psp.ras_context.ras->eeprom_control;
enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
int ret = 0;
- uint32_t i, j, loop_cnt = 1;
- bool find_pages_per_pa = false;
+ uint32_t i;
if (!con || !con->eh_data || !bps || pages <= 0)
return 0;
@@ -2825,108 +2906,41 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
sizeof(struct eeprom_table_record), GFP_KERNEL);
if (!err_data.err_addr) {
dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
- err_rec = err_data.err_addr;
- loop_cnt = adev->umc.retire_unit;
if (adev->gmc.gmc_funcs->query_mem_partition_mode)
nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
}
mutex_lock(&con->recovery_lock);
- data = con->eh_data;
- if (!data) {
- /* Returning 0 as the absence of eh_data is acceptable */
- goto free;
- }
-
- for (i = 0; i < pages; i++) {
- if (from_rom &&
- control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
- if (!find_pages_per_pa) {
- if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
- if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
- /* may use old RAS TA, use PA to find pages in
- * one row
- */
- if (amdgpu_umc_pages_in_a_row(adev, &err_data,
- bps[i].retired_page <<
- AMDGPU_GPU_PAGE_SHIFT)) {
- ret = -EINVAL;
- goto free;
- } else {
- find_pages_per_pa = true;
- }
- } else {
- /* unsupported cases */
- ret = -EOPNOTSUPP;
- goto free;
- }
- }
- } else {
- if (amdgpu_umc_pages_in_a_row(adev, &err_data,
- bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
- ret = -EINVAL;
- goto free;
- }
- }
- } else {
- if (from_rom && !find_pages_per_pa) {
- if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
- /* bad page in any NPS mode in eeprom */
- if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
- ret = -EINVAL;
+
+ if (from_rom) {
+ for (i = 0; i < pages; i++) {
+ if (control->ras_num_recs - i >= adev->umc.retire_unit) {
+ if ((bps[i].address == bps[i + 1].address) &&
+ (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+ //deal retire_unit records a time
+ ret = __amdgpu_ras_convert_unit_rec_from_rom(adev,
+ &bps[i], &err_data, nps);
+ if (ret)
goto free;
- }
+ i += (adev->umc.retire_unit - 1);
} else {
- /* legacy bad page in eeprom, generated only in
- * NPS1 mode
- */
- if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
- /* old RAS TA or ASICs which don't support to
- * convert addrss via mca address
- */
- if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
- find_pages_per_pa = true;
- err_rec = &bps[i];
- loop_cnt = 1;
- } else {
- /* non-nps1 mode, old RAS TA
- * can't support it
- */
- ret = -EOPNOTSUPP;
- goto free;
- }
- }
+ break;
}
-
- if (!find_pages_per_pa)
- i += (adev->umc.retire_unit - 1);
} else {
- err_rec = &bps[i];
+ break;
}
}
-
- for (j = 0; j < loop_cnt; j++) {
- if (amdgpu_ras_check_bad_page_unlock(con,
- err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
- continue;
-
- if (!data->space_left &&
- amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
- ret = -ENOMEM;
+ for (; i < pages; i++) {
+ ret = __amdgpu_ras_convert_single_rec_from_rom(adev,
+ &bps[i], &err_data, nps);
+ if (ret)
goto free;
- }
-
- amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
-
- memcpy(&data->bps[data->count], &(err_rec[j]),
- sizeof(struct eeprom_table_record));
- data->count++;
- data->space_left--;
}
+ } else {
+ ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
}
free:
@@ -2971,24 +2985,14 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
/* only new entries are saved */
if (save_count > 0) {
- if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA) {
+ for (i = 0; i < unit_num; i++) {
if (amdgpu_ras_eeprom_append(control,
- &data->bps[control->ras_num_recs],
- save_count)) {
+ &data->bps[bad_page_num + i * adev->umc.retire_unit],
+ 1)) {
dev_err(adev->dev, "Failed to save EEPROM table data!");
return -EIO;
}
- } else {
- for (i = 0; i < unit_num; i++) {
- if (amdgpu_ras_eeprom_append(control,
- &data->bps[bad_page_num + i * adev->umc.retire_unit],
- 1)) {
- dev_err(adev->dev, "Failed to save EEPROM table data!");
- return -EIO;
- }
- }
}
-
dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
}
@@ -3005,6 +3009,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
&adev->psp.ras_context.ras->eeprom_control;
struct eeprom_table_record *bps;
int ret;
+ int i = 0;
/* no bad page record, skip eeprom access */
if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
@@ -3018,13 +3023,23 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
if (ret) {
dev_err(adev->dev, "Failed to load EEPROM table records!");
} else {
- if (control->ras_num_recs > 1 &&
- adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
- if ((bps[0].address == bps[1].address) &&
- (bps[0].mem_channel == bps[1].mem_channel))
- control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
- else
- control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
+ if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
+ for (i = 0; i < control->ras_num_recs; i++) {
+ if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
+ if ((bps[i].address == bps[i + 1].address) &&
+ (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+ control->ras_num_pa_recs += adev->umc.retire_unit;
+ i += (adev->umc.retire_unit - 1);
+ } else {
+ control->ras_num_mca_recs +=
+ (control->ras_num_recs - i);
+ break;
+ }
+ } else {
+ control->ras_num_mca_recs += (control->ras_num_recs - i);
+ break;
+ }
+ }
}
ret = amdgpu_ras_eeprom_check(control);
@@ -3438,12 +3453,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
return ret;
if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
- control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
-
- /* default status is MCA storage */
- if (control->ras_num_recs <= 1 &&
- adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
- control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
+ control->ras_num_pa_recs = control->ras_num_recs;
if (control->ras_num_recs) {
ret = amdgpu_ras_load_bad_pages(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 83b54efcaa87..ab27cecb5519 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -727,11 +727,9 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
- control->ras_fri)
% control->ras_max_record_count;
- if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA)
- control->ras_num_bad_pages = control->ras_num_recs;
- else
- control->ras_num_bad_pages =
- control->ras_num_recs * adev->umc.retire_unit;
+ control->ras_num_mca_recs += num;
+ control->ras_num_bad_pages += num * adev->umc.retire_unit;
+
Out:
kfree(buf);
return res;
@@ -852,6 +850,7 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
{
struct amdgpu_device *adev = to_amdgpu_device(control);
int res, i;
+ uint64_t nps = AMDGPU_NPS1_PARTITION_MODE;
if (!__is_ras_eeprom_supported(adev))
return 0;
@@ -865,9 +864,12 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
return -EINVAL;
}
+ if (adev->gmc.gmc_funcs->query_mem_partition_mode)
+ nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
+
/* set the new channel index flag */
for (i = 0; i < num; i++)
- record[i].retired_page |= UMC_CHANNEL_IDX_V2;
+ record[i].retired_page |= (nps << UMC_NPS_SHIFT);
mutex_lock(&control->ras_tbl_mutex);
@@ -881,7 +883,7 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
/* clear channel index flag, the flag is only saved on eeprom */
for (i = 0; i < num; i++)
- record[i].retired_page &= ~UMC_CHANNEL_IDX_V2;
+ record[i].retired_page &= ~(nps << UMC_NPS_SHIFT);
return res;
}
@@ -1392,6 +1394,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
}
control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
+ control->ras_num_mca_recs = 0;
+ control->ras_num_pa_recs = 0;
return 0;
}
@@ -1412,11 +1416,8 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
if (!__get_eeprom_i2c_addr(adev, control))
return -EINVAL;
- if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA)
- control->ras_num_bad_pages = control->ras_num_recs;
- else
- control->ras_num_bad_pages =
- control->ras_num_recs * adev->umc.retire_unit;
+ control->ras_num_bad_pages = control->ras_num_pa_recs +
+ control->ras_num_mca_recs * adev->umc.retire_unit;
if (hdr->header == RAS_TABLE_HDR_VAL) {
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index 81d55cb7b397..13f7eda9a696 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -43,19 +43,6 @@ enum amdgpu_ras_eeprom_err_type {
AMDGPU_RAS_EEPROM_ERR_COUNT,
};
-/*
- * one UMC MCA address could map to multiply physical address (PA),
- * such as 1:16, we use eeprom_table_record.address to store MCA
- * address and use eeprom_table_record.retired_page to save PA.
- *
- * AMDGPU_RAS_EEPROM_REC_PA: one record store one PA
- * AMDGPU_RAS_EEPROM_REC_MCA: one record store one MCA address
- */
-enum amdgpu_ras_eeprom_rec_type {
- AMDGPU_RAS_EEPROM_REC_PA,
- AMDGPU_RAS_EEPROM_REC_MCA,
-};
-
struct amdgpu_ras_eeprom_table_header {
uint32_t header;
uint32_t version;
@@ -100,6 +87,12 @@ struct amdgpu_ras_eeprom_control {
*/
u32 ras_num_bad_pages;
+ /* Number of records store mca address */
+ u32 ras_num_mca_recs;
+
+ /* Number of records store physical address */
+ u32 ras_num_pa_recs;
+
/* First record index to read, 0-based.
* Range is [0, num_recs-1]. This is
* an absolute index, starting right after
@@ -120,7 +113,6 @@ struct amdgpu_ras_eeprom_control {
/* Record channel info which occurred bad pages
*/
u32 bad_channel_bitmap;
- enum amdgpu_ras_eeprom_rec_type rec_type;
};
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index a4a7e61817aa..857693bcd8d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -71,6 +71,13 @@
*/
#define UMC_CHANNEL_IDX_V2 BIT_ULL(47)
+/*
+ * save nps value to eeprom_table_record.retired_page[47:40],
+ * the channel index flag above will be retired.
+ */
+#define UMC_NPS_SHIFT 40
+#define UMC_NPS_MASK 0xffULL
+
typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst,
uint32_t umc_inst, uint32_t ch_inst, void *data);
--
2.34.1
More information about the amd-gfx
mailing list