[PATCH Review 3/4] drm/amdgpu: add message smu to get ecc_table

Stanley.Yang Stanley.Yang at amd.com
Wed Nov 17 10:11:31 UTC 2021


support ECC TABLE message, this table include unc ras error count
and error address

Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
 drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h       |  7 ++++
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 38 +++++++++++++++++++
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  2 +
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c        | 24 ++++++++++++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h        |  3 ++
 5 files changed, 74 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index 3557f4e7fc30..ea65de0160c3 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -324,6 +324,7 @@ enum smu_table_id
 	SMU_TABLE_OVERDRIVE,
 	SMU_TABLE_I2C_COMMANDS,
 	SMU_TABLE_PACE,
+	SMU_TABLE_ECCINFO,
 	SMU_TABLE_COUNT,
 };
 
@@ -340,6 +341,7 @@ struct smu_table_context
 	void				*max_sustainable_clocks;
 	struct smu_bios_boot_up_values	boot_values;
 	void                            *driver_pptable;
+	void                            *ecc_table;
 	struct smu_table		tables[SMU_TABLE_COUNT];
 	/*
 	 * The driver table is just a staging buffer for
@@ -1261,6 +1263,11 @@ struct pptable_funcs {
 	 *										of SMUBUS table.
 	 */
 	int (*send_hbm_bad_pages_num)(struct smu_context *smu, uint32_t size);
+
+	/**
+	 * @get_ecc_table:  message SMU to get ECC INFO table.
+	 */
+	ssize_t (*get_ecc_info)(struct smu_context *smu, void *table);
 };
 
 typedef enum {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index f835d86cc2f5..5e4ba0e14a91 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -190,6 +190,7 @@ static const struct cmn2asic_mapping aldebaran_table_map[SMU_TABLE_COUNT] = {
 	TAB_MAP(SMU_METRICS),
 	TAB_MAP(DRIVER_SMU_CONFIG),
 	TAB_MAP(I2C_COMMANDS),
+	TAB_MAP(ECCINFO),
 };
 
 static const uint8_t aldebaran_throttler_map[] = {
@@ -223,6 +224,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
 	SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS, sizeof(SwI2cRequest_t),
 		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
 
+	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
+		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+
 	smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
 	if (!smu_table->metrics_table)
 		return -ENOMEM;
@@ -235,6 +239,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
 		return -ENOMEM;
 	}
 
+	smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size, GFP_KERNEL);
+	if (!smu_table->ecc_table)
+		return -ENOMEM;
+
 	return 0;
 }
 
@@ -1765,6 +1773,35 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
 	return sizeof(struct gpu_metrics_v1_3);
 }
 
+static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
+					 void *table)
+{
+	struct smu_table_context *smu_table = &smu->smu_table;
+	EccInfoTable_t ecc_table;
+	struct ecc_info_per_ch *ecc_info_per_channel = NULL;
+	int i, ret = 0;
+	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
+
+	ret = smu_cmn_get_ecc_info_table(smu,
+					&ecc_table);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+		ecc_info_per_channel = &(eccinfo->ecc[i]);
+		ecc_info_per_channel->ce_count_lo_chip =
+			ecc_table.EccInfo[i].ce_count_lo_chip;
+		ecc_info_per_channel->ce_count_hi_chip =
+			ecc_table.EccInfo[i].ce_count_hi_chip;
+		ecc_info_per_channel->mca_umc_status =
+			ecc_table.EccInfo[i].mca_umc_status;
+		ecc_info_per_channel->mca_umc_addr =
+			ecc_table.EccInfo[i].mca_umc_addr;
+	}
+
+	return ret;
+}
+
 static int aldebaran_mode1_reset(struct smu_context *smu)
 {
 	u32 smu_version, fatal_err, param;
@@ -1967,6 +2004,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
 	.i2c_init = aldebaran_i2c_control_init,
 	.i2c_fini = aldebaran_i2c_control_fini,
 	.send_hbm_bad_pages_num = aldebaran_smu_send_hbm_bad_page_num,
+	.get_ecc_info = aldebaran_get_ecc_info,
 };
 
 void aldebaran_set_ppt_funcs(struct smu_context *smu)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 4d96099a9bb1..55421ea622fb 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
 	kfree(smu_table->hardcode_pptable);
 	smu_table->hardcode_pptable = NULL;
 
+	kfree(smu_table->ecc_table);
 	kfree(smu_table->metrics_table);
 	kfree(smu_table->watermarks_table);
+	smu_table->ecc_table = NULL;
 	smu_table->metrics_table = NULL;
 	smu_table->watermarks_table = NULL;
 	smu_table->metrics_time = 0;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 843d2cbfc71d..e229c9b09d80 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -983,6 +983,30 @@ int smu_cmn_get_metrics_table(struct smu_context *smu,
 	return ret;
 }
 
+int smu_cmn_get_ecc_info_table(struct smu_context *smu,
+				     void *ecc_table)
+{
+	struct smu_table_context *smu_table= &smu->smu_table;
+	uint32_t table_size =
+		smu_table->tables[SMU_TABLE_ECCINFO].size;
+	int ret = 0;
+
+	ret = smu_cmn_update_table(smu,
+			       SMU_TABLE_ECCINFO,
+			       0,
+			       smu_table->ecc_table,
+			       false);
+	if (ret) {
+		dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
+		return ret;
+	}
+
+	if (ecc_table)
+		memcpy(ecc_table, smu_table->ecc_table, table_size);
+
+	return 0;
+}
+
 void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, uint8_t crev)
 {
 	struct metrics_table_header *header = (struct metrics_table_header *)table;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
index beea03810bca..0adc5451373b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
@@ -105,6 +105,9 @@ int smu_cmn_get_metrics_table(struct smu_context *smu,
 			      void *metrics_table,
 			      bool bypass_cache);
 
+int smu_cmn_get_ecc_info_table(struct smu_context *smu,
+			      void *table);
+
 void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, uint8_t crev);
 
 int smu_cmn_set_mp1_state(struct smu_context *smu,
-- 
2.17.1



More information about the amd-gfx mailing list