[PATCH Review 4/4] query umc error info from ecc_table

Stanley.Yang Stanley.Yang at amd.com
Wed Nov 17 10:11:32 UTC 2021


if smu support ECCTABLE, driver can message smu to get ecc_table
then query umc error info from ECCTABLE
apply pmfw version check to ensure backward compatibility

Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 42 ++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  7 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       | 71 +++++++++++++------
 drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h       |  1 +
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 12 ++++
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  4 ++
 6 files changed, 107 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 90f0db3b4f65..6b0f2ba1e420 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -888,6 +888,38 @@ void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
 	}
 }
 
+static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	/*
+	 * choosing right query method according to
+	 * whether smu support query error information
+	 */
+	if ((ras->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
+			!smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc))) {
+
+		if (adev->umc.ras_funcs &&
+			adev->umc.ras_funcs->message_smu_query_ras_error_count)
+			adev->umc.ras_funcs->message_smu_query_ras_error_count(adev, err_data);
+
+		if (adev->umc.ras_funcs &&
+			adev->umc.ras_funcs->message_smu_query_ras_error_address)
+			adev->umc.ras_funcs->message_smu_query_ras_error_address(adev, err_data);
+	} else {
+		if (adev->umc.ras_funcs &&
+			adev->umc.ras_funcs->query_ras_error_count)
+			adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
+
+		/* umc query_ras_error_address is also responsible for clearing
+		 * error status
+		 */
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_address)
+			adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
+	}
+}
+
 /* query/inject/cure begin */
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 				  struct ras_query_if *info)
@@ -901,15 +933,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 
 	switch (info->head.block) {
 	case AMDGPU_RAS_BLOCK__UMC:
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_count)
-			adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
-		/* umc query_ras_error_address is also responsible for clearing
-		 * error status
-		 */
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_address)
-			adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
+		amdgpu_ras_get_ecc_info(adev, &err_data);
 		break;
 	case AMDGPU_RAS_BLOCK__SDMA:
 		if (adev->sdma.funcs->query_ras_error_count) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bcbf3264d92f..3f0de0cc8403 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -322,6 +322,12 @@ struct ras_common_if {
 
 #define MAX_UMC_CHANNEL_NUM 32
 
+/*
+ * SMU support ECCTABLE since version 68.42.0,
+ * use this to decide query umc error info method
+ */
+#define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
+
 struct ecc_info_per_ch {
 	uint16_t ce_count_lo_chip;
 	uint16_t ce_count_hi_chip;
@@ -375,6 +381,7 @@ struct amdgpu_ras {
 
 	/* record umc error info queried from smu */
 	struct umc_ecc_info umc_ecc;
+	uint32_t smu_version;
 };
 
 struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 0c7c56a91b25..2c3e97c9410b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -97,28 +97,57 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-	if (adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->query_ras_error_count)
-	    adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
 
-	if (adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->query_ras_error_address &&
-	    adev->umc.max_ras_err_cnt_per_query) {
-		err_data->err_addr =
-			kcalloc(adev->umc.max_ras_err_cnt_per_query,
-				sizeof(struct eeprom_table_record), GFP_KERNEL);
-
-		/* still call query_ras_error_address to clear error status
-		 * even NOMEM error is encountered
-		 */
-		if(!err_data->err_addr)
-			dev_warn(adev->dev, "Failed to alloc memory for "
-					"umc error address record!\n");
-
-		/* umc query_ras_error_address is also responsible for clearing
-		 * error status
-		 */
-		adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
+	if ((con->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
+			!smu_get_ecc_info(&adev->smu, (void *)&(con->umc_ecc))) {
+
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->message_smu_query_ras_error_count)
+		    adev->umc.ras_funcs->message_smu_query_ras_error_count(adev, ras_error_status);
+
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->message_smu_query_ras_error_address &&
+		    adev->umc.max_ras_err_cnt_per_query) {
+			err_data->err_addr =
+				kcalloc(adev->umc.max_ras_err_cnt_per_query,
+					sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+			/* still call query_ras_error_address to clear error status
+			 * even NOMEM error is encountered
+			 */
+			if(!err_data->err_addr)
+				dev_warn(adev->dev, "Failed to alloc memory for "
+						"umc error address record!\n");
+
+			/* umc query_ras_error_address is also responsible for clearing
+			 * error status
+			 */
+			adev->umc.ras_funcs->message_smu_query_ras_error_address(adev, ras_error_status);
+		}
+	} else {
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_count)
+		    adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
+
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_address &&
+		    adev->umc.max_ras_err_cnt_per_query) {
+			err_data->err_addr =
+				kcalloc(adev->umc.max_ras_err_cnt_per_query,
+					sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+			/* still call query_ras_error_address to clear error status
+			 * even NOMEM error is encountered
+			 */
+			if(!err_data->err_addr)
+				dev_warn(adev->dev, "Failed to alloc memory for "
+						"umc error address record!\n");
+
+			/* umc query_ras_error_address is also responsible for clearing
+			 * error status
+			 */
+			adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
+		}
 	}
 
 	/* only uncorrectable error needs gpu reset */
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index ea65de0160c3..7a06021a58f0 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -1404,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu, bool enable);
 
 int smu_wait_for_event(struct amdgpu_device *adev, enum smu_event_type event,
 		       uint64_t event_arg);
+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);
 
 #endif
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 01168b8955bf..6340c079f35e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -3072,6 +3072,18 @@ int smu_set_light_sbr(struct smu_context *smu, bool enable)
 	return ret;
 }
 
+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc)
+{
+	int ret = -1;
+
+	if (smu->ppt_funcs &&
+		smu->ppt_funcs->get_ecc_info)
+		ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc);
+
+	return ret;
+
+}
+
 static int smu_get_prv_buffer_details(void *handle, void **addr, size_t *size)
 {
 	struct smu_context *smu = handle;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 55421ea622fb..55ef10ca684a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -200,11 +200,15 @@ int smu_v13_0_check_fw_version(struct smu_context *smu)
 	uint16_t smu_major;
 	uint8_t smu_minor, smu_debug;
 	int ret = 0;
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(smu->adev);
 
 	ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
 	if (ret)
 		return ret;
 
+	/* record smu interface version, help umc query error method */
+	ras->smu_version = smu_version;
+
 	smu_major = (smu_version >> 16) & 0xffff;
 	smu_minor = (smu_version >> 8) & 0xff;
 	smu_debug = (smu_version >> 0) & 0xff;
-- 
2.17.1



More information about the amd-gfx mailing list