[PATCH Review 3/4] drm/amdgpu: add message smu to get ecc_table v2

Lazar, Lijo lijo.lazar at amd.com
Thu Nov 18 11:32:47 UTC 2021



On 11/18/2021 3:03 PM, Stanley.Yang wrote:
> support ECC TABLE message, this table include umc ras error count
> and error address
> 
> v2:
>      add smu version check to query whether support ecctable
>      call smu_cmn_update_table to get ecctable directly
> 
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h       |  8 +++
>   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 14 ++++
>   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 70 +++++++++++++++++++
>   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  2 +
>   4 files changed, 94 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> index 3557f4e7fc30..7a06021a58f0 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> @@ -324,6 +324,7 @@ enum smu_table_id
>   	SMU_TABLE_OVERDRIVE,
>   	SMU_TABLE_I2C_COMMANDS,
>   	SMU_TABLE_PACE,
> +	SMU_TABLE_ECCINFO,
>   	SMU_TABLE_COUNT,
>   };
>   
> @@ -340,6 +341,7 @@ struct smu_table_context
>   	void				*max_sustainable_clocks;
>   	struct smu_bios_boot_up_values	boot_values;
>   	void                            *driver_pptable;
> +	void                            *ecc_table;
>   	struct smu_table		tables[SMU_TABLE_COUNT];
>   	/*
>   	 * The driver table is just a staging buffer for
> @@ -1261,6 +1263,11 @@ struct pptable_funcs {
>   	 *										of SMUBUS table.
>   	 */
>   	int (*send_hbm_bad_pages_num)(struct smu_context *smu, uint32_t size);
> +
> +	/**
> +	 * @get_ecc_table:  message SMU to get ECC INFO table.
> +	 */
> +	ssize_t (*get_ecc_info)(struct smu_context *smu, void *table);
>   };
>   
>   typedef enum {
> @@ -1397,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu, bool enable);
>   
>   int smu_wait_for_event(struct amdgpu_device *adev, enum smu_event_type event,
>   		       uint64_t event_arg);
> +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);
>   
>   #endif
>   #endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 01168b8955bf..fd3b6b460b12 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -3072,6 +3072,20 @@ int smu_set_light_sbr(struct smu_context *smu, bool enable)
>   	return ret;
>   }
>   
> +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc)
> +{
> +	int ret = -EOPNOTSUPP;
> +
> +	mutex_lock(&smu->mutex);
> +	if (smu->ppt_funcs &&
> +		smu->ppt_funcs->get_ecc_info)
> +		ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc);
> +	mutex_unlock(&smu->mutex);
> +
> +	return ret;
> +
> +}
> +
>   static int smu_get_prv_buffer_details(void *handle, void **addr, size_t *size)
>   {
>   	struct smu_context *smu = handle;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index f835d86cc2f5..4c21609ccea5 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -78,6 +78,12 @@
>   
>   #define smnPCIE_ESM_CTRL			0x111003D0
>   
> +/*
> + * SMU support ECCTABLE since version 68.42.0,
> + * use this to check ECCTALE feature whether support
> + */
> +#define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
> +
>   static const struct smu_temperature_range smu13_thermal_policy[] =
>   {
>   	{-273150,  99000, 99000, -273150, 99000, 99000, -273150, 99000, 99000},
> @@ -190,6 +196,7 @@ static const struct cmn2asic_mapping aldebaran_table_map[SMU_TABLE_COUNT] = {
>   	TAB_MAP(SMU_METRICS),
>   	TAB_MAP(DRIVER_SMU_CONFIG),
>   	TAB_MAP(I2C_COMMANDS),
> +	TAB_MAP(ECCINFO),
>   };
>   
>   static const uint8_t aldebaran_throttler_map[] = {
> @@ -223,6 +230,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
>   	SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS, sizeof(SwI2cRequest_t),
>   		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
>   
> +	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
> +		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> +
>   	smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
>   	if (!smu_table->metrics_table)
>   		return -ENOMEM;
> @@ -235,6 +245,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
>   		return -ENOMEM;
>   	}
>   
> +	smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size, GFP_KERNEL);
> +	if (!smu_table->ecc_table)
> +		return -ENOMEM;
> +
>   	return 0;
>   }
>   
> @@ -1765,6 +1779,61 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
>   	return sizeof(struct gpu_metrics_v1_3);
>   }
>   
> +static int aldebaran_check_ecc_table_support(struct smu_context *smu)
> +{
> +	uint32_t if_version = 0xff, smu_version = 0xff;
> +	int ret = 0;
> +
> +	ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
> +	if (ret)
> +		ret = -EOPNOTSUPP;	// return not support if failed get smu_version
> +
> +	if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
> +		ret = -EOPNOTSUPP;
> +
> +	return ret;
> +}
> +
> +static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
> +					 void *table)
> +{
> +	struct smu_table_context *smu_table = &smu->smu_table;
> +	EccInfoTable_t *ecc_table = NULL;
> +	struct ecc_info_per_ch *ecc_info_per_channel = NULL;
> +	int i, ret = 0;
> +	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
> +

Missed to ask last time. Since umc_ecc_info is a common struct, do you 
also want to pass back the number of channels having data?

Now this struct can hold max of 32 channel data. Let's say if the same 
interface is going to be used on another ASIC X having only 16 channels. 
Then the callback for ASIC X fills data only for 16 channels. Or, you 
expect that to be taken care at the caller side?

Thanks,
Lijo

> +	ret = aldebaran_check_ecc_table_support(smu);
> +	if (ret)
> +		return ret;
> +
> +	ret = smu_cmn_update_table(smu,
> +			       SMU_TABLE_ECCINFO,
> +			       0,
> +			       smu_table->ecc_table,
> +			       false);
> +	if (ret) {
> +		dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
> +		return ret;
> +	}
> +
> +	ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> +
> +	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> +		ecc_info_per_channel = &(eccinfo->ecc[i]);
> +		ecc_info_per_channel->ce_count_lo_chip =
> +			ecc_table->EccInfo[i].ce_count_lo_chip;
> +		ecc_info_per_channel->ce_count_hi_chip =
> +			ecc_table->EccInfo[i].ce_count_hi_chip;
> +		ecc_info_per_channel->mca_umc_status =
> +			ecc_table->EccInfo[i].mca_umc_status;
> +		ecc_info_per_channel->mca_umc_addr =
> +			ecc_table->EccInfo[i].mca_umc_addr;
> +	}
> +
> +	return ret;
> +}
> +
>   static int aldebaran_mode1_reset(struct smu_context *smu)
>   {
>   	u32 smu_version, fatal_err, param;
> @@ -1967,6 +2036,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
>   	.i2c_init = aldebaran_i2c_control_init,
>   	.i2c_fini = aldebaran_i2c_control_fini,
>   	.send_hbm_bad_pages_num = aldebaran_smu_send_hbm_bad_page_num,
> +	.get_ecc_info = aldebaran_get_ecc_info,
>   };
>   
>   void aldebaran_set_ppt_funcs(struct smu_context *smu)
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> index 4d96099a9bb1..55421ea622fb 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> @@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
>   	kfree(smu_table->hardcode_pptable);
>   	smu_table->hardcode_pptable = NULL;
>   
> +	kfree(smu_table->ecc_table);
>   	kfree(smu_table->metrics_table);
>   	kfree(smu_table->watermarks_table);
> +	smu_table->ecc_table = NULL;
>   	smu_table->metrics_table = NULL;
>   	smu_table->watermarks_table = NULL;
>   	smu_table->metrics_time = 0;
> 


More information about the amd-gfx mailing list