[PATCH] drm/amdgpu: Update EEPROM RAS table for mismatched table version

Candice Li candice.li at amd.com
Wed Mar 27 06:15:44 UTC 2024


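When the table version stored in the EEPROM RAS table does not match the
version expected for the device, recreate the table with the correct
version and restore the existing bad page records to it. If the records
cannot be read back or re-appended, or there is nothing to restore, force
a reset of the table instead.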

Signed-off-by: Candice Li <candice.li at amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 88 ++++++++++++++++---
 1 file changed, 78 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 06a62a8a992e9b..42d0ef2f512474 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1319,6 +1319,37 @@ static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control)
 	return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
 }
 
+static bool amdgpu_ras_eeprom_table_version_validate(struct amdgpu_ras_eeprom_control *control)
+{ /* Return true when control->tbl_hdr.version matches the version tied to this UMC IP. */
+	struct amdgpu_device *adev = to_amdgpu_device(control);
+	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+
+	switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) { /* select by UMC IP version */
+	case IP_VERSION(8, 10, 0):
+	case IP_VERSION(12, 0, 0):
+		return hdr->version == RAS_TABLE_VER_V2_1; /* UMC 8.10.0 / 12.0.0: must be v2.1 */
+	default:
+		return hdr->version == RAS_TABLE_VER_V1; /* any other UMC version: must be v1 */
+	}
+}
+
+static void amdgpu_ras_update_eeprom_control(struct amdgpu_ras_eeprom_table_header *hdr)
+{ /* Recompute the control fields that are derived from the table header @hdr. */
+	struct amdgpu_ras_eeprom_control *control =
+		container_of(hdr, struct amdgpu_ras_eeprom_control, tbl_hdr); /* @hdr must be a control's embedded tbl_hdr */
+
+	if (hdr->version == RAS_TABLE_VER_V2_1) { /* v2.1 header: use the *_V2_1 macros */
+		control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
+		control->ras_record_offset = RAS_RECORD_START_V2_1;
+		control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
+	} else { /* every other version value falls back to the non-v2.1 macros */
+		control->ras_num_recs = RAS_NUM_RECS(hdr);
+		control->ras_record_offset = RAS_RECORD_START;
+		control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+	}
+	control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset); /* presumably the first-record index; macro not shown here */
+}
+
 int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
 			   bool *exceed_err_limit)
 {
@@ -1326,7 +1357,9 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
 	unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
 	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-	int res;
+	int res, res1;
+	struct eeprom_table_record *bps;
+	u32 num_recs;
 
 	*exceed_err_limit = false;
 
@@ -1355,16 +1388,51 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
 
 	__decode_table_header_from_buf(hdr, buf);
 
-	if (hdr->version == RAS_TABLE_VER_V2_1) {
-		control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
-		control->ras_record_offset = RAS_RECORD_START_V2_1;
-		control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
-	} else {
-		control->ras_num_recs = RAS_NUM_RECS(hdr);
-		control->ras_record_offset = RAS_RECORD_START;
-		control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+	amdgpu_ras_update_eeprom_control(hdr);
+
+	if (!amdgpu_ras_eeprom_table_version_validate(control)) {
+		num_recs = control->ras_num_recs;
+		if (num_recs && amdgpu_bad_page_threshold) {
+			/* Save bad page records existed in EEPROM */
+			bps = kcalloc(num_recs, sizeof(*bps), GFP_KERNEL);
+			if (!bps)
+				return -ENOMEM;
+
+			res1 = amdgpu_ras_eeprom_read(control, bps, num_recs);
+			if (res1)
+				dev_warn(adev->dev, "Fail to load EEPROM table, force to reset it.");
+
+			res = amdgpu_ras_eeprom_reset_table(control);
+			if (res) {
+				dev_err(adev->dev, "Failed to create a new EEPROM table.");
+				kfree(bps);
+				return res < 0 ? res : 0;
+			}
+
+			if (!res1) {
+				/* Update the EEPROM table with correct table version and
+				 * original bad page records
+				 */
+				amdgpu_ras_update_eeprom_control(hdr);
+				res = amdgpu_ras_eeprom_append(control, bps, num_recs);
+
+				if (res) {
+					dev_warn(adev->dev, "Fail to update EEPROM table, force to reset it.");
+					res = amdgpu_ras_eeprom_reset_table(control);
+				}
+			}
+
+			kfree(bps);
+		} else
+			res = amdgpu_ras_eeprom_reset_table(control);
+
+		if (res) {
+			dev_err(adev->dev, "Failed to reset EEPROM table.");
+			return res < 0 ? res : 0;
+		}
+
+		amdgpu_ras_update_eeprom_control(hdr);
 	}
-	control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
 
 	if (hdr->header == RAS_TABLE_HDR_VAL) {
 		DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
-- 
2.25.1



More information about the amd-gfx mailing list