[PATCH] drm/amdgpu: Update EEPROM RAS table for mismatched table version
Candice Li
candice.li at amd.com
Wed Mar 27 06:15:44 UTC 2024
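When the EEPROM RAS table version does not match the version expected
for the device, rewrite the table with the correct version and restore
the existing bad page records to it. Otherwise (for example, when there
are no records to restore, or reading or re-appending them fails),
force a reset of the table.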
Signed-off-by: Candice Li <candice.li at amd.com>
---
.../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 88 ++++++++++++++++---
1 file changed, 78 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 06a62a8a992e9b..42d0ef2f512474 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1319,6 +1319,37 @@ static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control)
return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
}
+static bool amdgpu_ras_eeprom_table_version_validate(struct amdgpu_ras_eeprom_control *control)
+{
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+
+ switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
+ case IP_VERSION(8, 10, 0):
+ case IP_VERSION(12, 0, 0):
+ return hdr->version == RAS_TABLE_VER_V2_1;
+ default:
+ return hdr->version == RAS_TABLE_VER_V1;
+ }
+}
+
+static void amdgpu_ras_update_eeprom_control(struct amdgpu_ras_eeprom_table_header *hdr)
+{
+ struct amdgpu_ras_eeprom_control *control =
+ container_of(hdr, struct amdgpu_ras_eeprom_control, tbl_hdr);
+
+ if (hdr->version == RAS_TABLE_VER_V2_1) {
+ control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
+ control->ras_record_offset = RAS_RECORD_START_V2_1;
+ control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
+ } else {
+ control->ras_num_recs = RAS_NUM_RECS(hdr);
+ control->ras_record_offset = RAS_RECORD_START;
+ control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+ }
+ control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
+}
+
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
bool *exceed_err_limit)
{
@@ -1326,7 +1357,9 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
- int res;
+ int res, res1;
+ struct eeprom_table_record *bps;
+ u32 num_recs;
*exceed_err_limit = false;
@@ -1355,16 +1388,51 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
__decode_table_header_from_buf(hdr, buf);
- if (hdr->version == RAS_TABLE_VER_V2_1) {
- control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
- control->ras_record_offset = RAS_RECORD_START_V2_1;
- control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
- } else {
- control->ras_num_recs = RAS_NUM_RECS(hdr);
- control->ras_record_offset = RAS_RECORD_START;
- control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+ amdgpu_ras_update_eeprom_control(hdr);
+
+ if (!amdgpu_ras_eeprom_table_version_validate(control)) {
+ num_recs = control->ras_num_recs;
+ if (num_recs && amdgpu_bad_page_threshold) {
+ /* Save bad page records existed in EEPROM */
+ bps = kcalloc(num_recs, sizeof(*bps), GFP_KERNEL);
+ if (!bps)
+ return -ENOMEM;
+
+ res1 = amdgpu_ras_eeprom_read(control, bps, num_recs);
+ if (res1)
+ dev_warn(adev->dev, "Fail to load EEPROM table, force to reset it.");
+
+ res = amdgpu_ras_eeprom_reset_table(control);
+ if (res) {
+ dev_err(adev->dev, "Failed to create a new EEPROM table.");
+ kfree(bps);
+ return res < 0 ? res : 0;
+ }
+
+ if (!res1) {
+ /* Update the EEPROM table with correct table version and
+ * original bad page records
+ */
+ amdgpu_ras_update_eeprom_control(hdr);
+ res = amdgpu_ras_eeprom_append(control, bps, num_recs);
+
+ if (res) {
+ dev_warn(adev->dev, "Fail to update EEPROM table, force to reset it.");
+ res = amdgpu_ras_eeprom_reset_table(control);
+ }
+ }
+
+ kfree(bps);
+ } else
+ res = amdgpu_ras_eeprom_reset_table(control);
+
+ if (res) {
+ dev_err(adev->dev, "Failed to reset EEPROM table.");
+ return res < 0 ? res : 0;
+ }
+
+ amdgpu_ras_update_eeprom_control(hdr);
}
- control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
if (hdr->header == RAS_TABLE_HDR_VAL) {
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
--
2.25.1
More information about the amd-gfx mailing list