[PATCH 3/5] drm/amdgpu: conduct bad gpu check during bootup/reset

Wed Jul 22 03:14:13 UTC 2020

During boot up, when init eeprom, RAS will check if the BAD
GPU tag is saved or not. And will break boot up and notify user
to RMA it.

At the meanwhile, when saved bad page count to eeprom exceeds
threshold, one ras recovery will be triggered immediately, and
bad gpu check will be executed and reset will fail as well to
remind user.

User could set bad_page_threshold = 0 as module parameter when
probing driver to disable the bad feature check.

Signed-off-by: Guchun Chen <guchun.chen at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 21 ++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 25 +++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       | 16 +++-
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 87 ++++++++++++++++++-
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  6 +-
 5 files changed, 142 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2662cd7c8685..d529bf7917f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2065,7 +2065,9 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 	 * Note: theoretically, this should be called before all vram allocations
 	 * to protect retired page from abusing
 	 */
-	amdgpu_ras_recovery_init(adev);
+	r = amdgpu_ras_recovery_init(adev);
+	if (r)
+		goto init_failed;
 
 	if (adev->gmc.xgmi.num_physical_nodes > 1)
 		amdgpu_xgmi_add_device(adev);
@@ -4133,8 +4135,20 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 
 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
 
-				/* must succeed. */
-				amdgpu_ras_resume(tmp_adev);
+				/*
+				 * The CPU is BAD once faulty pages by ECC has
+				 * reached the threshold, and ras recovery is
+				 * scheduled. So add one check here to prevent
+				 * recovery if it's one BAD GPU, and remind
+				 * user to RMA this GPU.
+				 */
+				if (!amdgpu_ras_check_bad_gpu(tmp_adev)) {
+					/* must succeed. */
+					amdgpu_ras_resume(tmp_adev);
+				} else {
+					r = -EINVAL;
+					goto out;
+				}
 
 				/* Update PSP FW topology after reset */
 				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
@@ -4142,7 +4156,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 			}
 		}
 
-
 out:
 		if (!r) {
 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e3d67d85c55f..818d4154e4a3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -62,8 +62,6 @@ const char *ras_block_string[] = {
 #define ras_err_str(i) (ras_error_string[ffs(i)])
 #define ras_block_str(i) (ras_block_string[i])
 
-#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		1
-#define AMDGPU_RAS_FLAG_INIT_NEED_RESET		2
 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
 
 /* inject address is 52 bits */
@@ -1817,6 +1815,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data;
 	uint32_t max_eeprom_records_len = 0;
+	bool bad_gpu = false;
 	int ret;
 
 	if (con)
@@ -1838,9 +1837,14 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
 	amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
 
-	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
-	if (ret)
+	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &bad_gpu);
+	/*
+	 * we only fail this calling and halt booting when bad_gpu is true.
+	 */
+	if (bad_gpu) {
+		ret = -EINVAL;
 		goto free;
+	}
 
 	if (con->eeprom_control.num_recs) {
 		ret = amdgpu_ras_load_bad_pages(adev);
@@ -2189,3 +2193,16 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
 
 	return false;
 }
+
+bool amdgpu_ras_check_bad_gpu(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	bool bad_gpu = false;
+
+	if (con && (con->bad_page_cnt_threshold != 0xFFFFFFFF))
+		amdgpu_ras_eeprom_check_bad_gpu(&con->eeprom_control,
+						&bad_gpu);
+
+	/* We are only interested in variable bad_gpu. */
+	return bad_gpu;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 4672649a9293..d7a363b37feb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -31,6 +31,10 @@
 #include "ta_ras_if.h"
 #include "amdgpu_ras_eeprom.h"
 
+#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		(0x1 << 0)
+#define AMDGPU_RAS_FLAG_INIT_NEED_RESET		(0x1 << 1)
+#define AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV	(0x1 << 2)
+
 enum amdgpu_ras_block {
 	AMDGPU_RAS_BLOCK__UMC = 0,
 	AMDGPU_RAS_BLOCK__SDMA,
@@ -493,6 +497,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev);
 unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 		bool is_ce);
 
+bool amdgpu_ras_check_bad_gpu(struct amdgpu_device *adev);
+
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 		struct eeprom_table_record *bps, int pages);
@@ -503,10 +509,14 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-	/* save bad page to eeprom before gpu reset,
-	 * i2c may be unstable in gpu reset
+	/*
+	 * Save bad page to eeprom before gpu reset, i2c may be unstable
+	 * in gpu reset.
+	 *
+	 * Also, exclude the case when ras recovery issuer is
+	 * eerprom page write.
 	 */
-	if (in_task())
+	if (!(ras->flags & AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV) && in_task())
 		amdgpu_ras_reserve_bad_pages(adev);
 
 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index a2c982b1eac6..96b63c026bad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -46,6 +46,9 @@
 #define EEPROM_TABLE_HDR_VAL 0x414d4452
 #define EEPROM_TABLE_VER 0x00010000
 
+/* Bad GPU tag ‘BADG’ */
+#define EEPROM_TABLE_HDR_BAD 0x42414447
+
 /* Assume 2 Mbit size */
 #define EEPROM_SIZE_BYTES 256000
 #define EEPROM_PAGE__SIZE_BYTES 256
@@ -238,12 +241,14 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
 
 }
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
+			bool *bad_gpu)
 {
 	int ret = 0;
 	struct amdgpu_device *adev = to_amdgpu_device(control);
 	unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 };
 	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 	struct i2c_msg msg = {
 			.addr	= 0,
 			.flags	= I2C_M_RD,
@@ -251,6 +256,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 			.buf	= buff,
 	};
 
+	*bad_gpu = false;
+
 	/* Verify i2c adapter is initialized */
 	if (!adev->pm.smu_i2c.algo)
 		return -ENOENT;
@@ -279,6 +286,11 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 		DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
 				 control->num_recs);
 
+	} else if ((ras->bad_page_cnt_threshold != 0xFFFFFFFF) && (
+			hdr->header == EEPROM_TABLE_HDR_BAD)) {
+		*bad_gpu = true;
+		DRM_ERROR("Detect BAD GPU TAG in eeprom table and "
+				"GPU shall be RMAed.\n");
 	} else {
 		DRM_INFO("Creating new EEPROM table");
 
@@ -375,6 +387,44 @@ static uint32_t __correct_eeprom_dest_address(uint32_t curr_address)
 	return curr_address;
 }
 
+int amdgpu_ras_eeprom_check_bad_gpu(struct amdgpu_ras_eeprom_control *control,
+					bool *bad_gpu)
+{
+	struct amdgpu_device *adev = to_amdgpu_device(control);
+	unsigned char buff[EEPROM_ADDRESS_SIZE +
+			EEPROM_TABLE_HEADER_SIZE] = { 0 };
+	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+	struct i2c_msg msg = {
+			.addr = control->i2c_address,
+			.flags = I2C_M_RD,
+			.len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
+			.buf = buff,
+	};
+	int ret;
+
+	*bad_gpu = false;
+
+	/* read EEPROM table header */
+	mutex_lock(&control->tbl_mutex);
+	ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
+	if (ret < 1) {
+		dev_err(adev->dev, "Failed to read EEPROM table header.\n");
+	goto err;
+	}
+
+	__decode_table_header_from_buff(hdr, &buff[2]);
+
+	if (hdr->header == EEPROM_TABLE_HDR_BAD) {
+		dev_warn(adev->dev, "Current GPU is BAD and should be RMAed.\n");
+		*bad_gpu = true;
+	}
+
+	ret = 0;
+err:
+	mutex_unlock(&control->tbl_mutex);
+	return ret;
+}
+
 int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
 					    struct eeprom_table_record *records,
 					    bool write,
@@ -383,8 +433,10 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
 	int i, ret = 0;
 	struct i2c_msg *msgs, *msg;
 	unsigned char *buffs, *buff;
+	bool sched_ras_recovery = false;
 	struct eeprom_table_record *record;
 	struct amdgpu_device *adev = to_amdgpu_device(control);
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
 	if (adev->asic_type != CHIP_VEGA20 && adev->asic_type != CHIP_ARCTURUS)
 		return 0;
@@ -402,6 +454,25 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
 		goto free_buff;
 	}
 
+	/*
+	 * If saved bad pages number exceeds the bad page threshod for
+	 * the whole VRAM, update table header to mark one BAD GPU and
+	 * schedule one ras recovery after eeprom write is done, this
+	 * can avoid the missing for latest records.
+	 *
+	 * This new header will be picked up and checked in the bootup by
+	 * ras recovery, which may break bootup process to notify user this
+	 * GPU is bad and to RMA(Return Merchandise Authorization) this GPU.
+	 */
+	if (write && (ras->bad_page_cnt_threshold != 0xFFFFFFFF) &&
+		((control->num_recs + num) >= ras->bad_page_cnt_threshold)) {
+		dev_warn(adev->dev,
+			"Saved bad pages(%d) reaches threshold value(%d).\n",
+			control->num_recs + num, ras->bad_page_cnt_threshold);
+			control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD;
+		sched_ras_recovery = true;
+	}
+
 	/* In case of overflow just start from beginning to not lose newest records */
 	if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > EEPROM_SIZE_BYTES))
 		control->next_addr = EEPROM_RECORD_START;
@@ -482,6 +553,20 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
 		__update_tbl_checksum(control, records, num, old_hdr_byte_sum);
 
 		__update_table_header(control, buffs);
+
+		if (sched_ras_recovery) {
+			/*
+			 * Before scheduling ras recovery, assert the related
+			 * flag first, which shall bypass common bad page
+			 * reservation execution in amdgpu_ras_reset_gpu.
+			 */
+			amdgpu_ras_get_context(adev)->flags |=
+				AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV;
+
+			dev_warn(adev->dev, "Conduct ras recovery due to bad "
+					"page threshold reached.\n");
+			amdgpu_ras_reset_gpu(adev);
+		}
 	} else if (!__validate_tbl_checksum(control, records, num)) {
 		DRM_WARN("EEPROM Table checksum mismatch!");
 		/* TODO Uncomment when EEPROM read/write is relliable */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index b272840cb069..4ccd139248a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -77,9 +77,13 @@ struct eeprom_table_record {
 	unsigned char mcumc_id;
 }__attribute__((__packed__));
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
+				bool *bad_gpu);
 int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
 
+int amdgpu_ras_eeprom_check_bad_gpu(struct amdgpu_ras_eeprom_control *control,
+				bool *bad_gpu);
+
 int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
 					    struct eeprom_table_record *records,
 					    bool write,
-- 
2.17.1