[PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable

Lazar, Lijo lijo.lazar at amd.com
Mon May 23 09:12:12 UTC 2022



On 5/23/2022 1:47 PM, Stanley.Yang wrote:
> SMU add a new variable mca_ceumc_addr to record
> umc correctable error address in EccInfo table,
> driver side add ecctable_v2 to support this feature
> 
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |   1 +
>   drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |   2 +
>   .../inc/pmfw_if/smu13_driver_if_aldebaran.h   |  15 +++
>   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 101 ++++++++++++++----
>   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |   2 +
>   5 files changed, 98 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index b9a6fac2b8b2..28e603243b67 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -328,6 +328,7 @@ struct ecc_info_per_ch {
>   	uint16_t ce_count_hi_chip;
>   	uint64_t mca_umc_status;
>   	uint64_t mca_umc_addr;
> +	uint64_t mca_ceumc_addr;
>   };
>   
>   struct umc_ecc_info {
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index a6a7b6c33683..9f7257ada437 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -322,6 +322,7 @@ enum smu_table_id
>   	SMU_TABLE_PACE,
>   	SMU_TABLE_ECCINFO,
>   	SMU_TABLE_COMBO_PPTABLE,
> +	SMU_TABLE_ECCINFO_V2,

Hi Stanley,

This is not the right approach. Need to ask FW team to fix this. There 
shouldn't be any new id with each version of table. You may check Sienna 
Cichlid smu metrics table as an example and ask FW team to follow 
something similar. I don't see 68.55 being released, so it's not late 
anyway. We don't need to keep defining pointers in table context per 
version of ECC table.

Thanks,
Lijo

>   	SMU_TABLE_COUNT,
>   };
>   
> @@ -340,6 +341,7 @@ struct smu_table_context
>   	void				*driver_pptable;
>   	void				*combo_pptable;
>   	void                            *ecc_table;
> +	void                            *ecc_table_v2;	// adapt to smu support record mca_ceumc_addr
>   	void				*driver_smu_config_table;
>   	struct smu_table		tables[SMU_TABLE_COUNT];
>   	/*
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> index 0f67c56c2863..2868604eff49 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> @@ -522,6 +522,21 @@ typedef struct {
>   	EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
>   } EccInfoTable_t;
>   
> +typedef struct {
> +	uint64_t mca_umc_status;
> +	uint64_t mca_umc_addr;
> +	uint64_t mca_ceumc_addr;
> +
> +	uint16_t ce_count_lo_chip;
> +	uint16_t ce_count_hi_chip;
> +
> +	uint32_t eccPadding;
> +} EccInfo_t_v2;
> +
> +typedef struct {
> +	EccInfo_t_v2  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> +} EccInfoTable_t_v2;
> +
>   // These defines are used with the following messages:
>   // SMC_MSG_TransferTableDram2Smu
>   // SMC_MSG_TransferTableSmu2Dram
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index 38af648cb857..e58df9490cec 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -82,6 +82,12 @@
>    */
>   #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
>   
> +/*
> + * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0,
> + * use this to check mca_ceumc_addr record whether support
> + */
> +#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
> +
>   /*
>    * SMU support BAD CHENNEL info MSG since version 68.51.00,
>    * use this to check ECCTALE feature whether support
> @@ -239,6 +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
>   	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
>   		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
>   
> +	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2, sizeof(EccInfoTable_t_v2),
> +			PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> +
>   	smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
>   	if (!smu_table->metrics_table)
>   		return -ENOMEM;
> @@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
>   	if (!smu_table->ecc_table)
>   		return -ENOMEM;
>   
> +	smu_table->ecc_table_v2 = kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL);
> +	if (!smu_table->ecc_table_v2)
> +		return -ENOMEM;;
> +
>   	return 0;
>   }
>   
> @@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
>   	return sizeof(struct gpu_metrics_v1_3);
>   }
>   
> -static int aldebaran_check_ecc_table_support(struct smu_context *smu)
> +static int aldebaran_check_ecc_table_support(struct smu_context *smu,
> +		int *ecctable_version)
>   {
>   	uint32_t if_version = 0xff, smu_version = 0xff;
>   	int ret = 0;
> @@ -1815,6 +1829,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)
>   
>   	if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
>   		ret = -EOPNOTSUPP;
> +	else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
> +			smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
> +		*ecctable_version = 1;
> +	else
> +		*ecctable_version = 2;
>   
>   	return ret;
>   }
> @@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>   {
>   	struct smu_table_context *smu_table = &smu->smu_table;
>   	EccInfoTable_t *ecc_table = NULL;
> +	EccInfoTable_t_v2 *ecc_table_v2 = NULL;
>   	struct ecc_info_per_ch *ecc_info_per_channel = NULL;
>   	int i, ret = 0;
> +	int table_version = 0;
>   	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
>   
> -	ret = aldebaran_check_ecc_table_support(smu);
> +	ret = aldebaran_check_ecc_table_support(smu, &table_version);
>   	if (ret)
>   		return ret;
>   
> -	ret = smu_cmn_update_table(smu,
> -			       SMU_TABLE_ECCINFO,
> -			       0,
> -			       smu_table->ecc_table,
> -			       false);
> -	if (ret) {
> -		dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
> -		return ret;
> -	}
> +	if (table_version == 1) {
> +		ret = smu_cmn_update_table(smu,
> +				       SMU_TABLE_ECCINFO,
> +				       0,
> +				       smu_table->ecc_table,
> +				       false);
> +		if (ret) {
> +			dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
> +			return ret;
> +		}
> +
> +		ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> +
> +		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> +			ecc_info_per_channel = &(eccinfo->ecc[i]);
> +			ecc_info_per_channel->ce_count_lo_chip =
> +				ecc_table->EccInfo[i].ce_count_lo_chip;
> +			ecc_info_per_channel->ce_count_hi_chip =
> +				ecc_table->EccInfo[i].ce_count_hi_chip;
> +			ecc_info_per_channel->mca_umc_status =
> +				ecc_table->EccInfo[i].mca_umc_status;
> +			ecc_info_per_channel->mca_umc_addr =
> +				ecc_table->EccInfo[i].mca_umc_addr;
> +		}
> +	} else if (table_version == 2) {
> +		/* still use SMU_TABLE_ECC_INFO index,
> +		 * smu 68.55.0 add mca_ceumc_addr variable
> +		 * in EccInfo_t struct to report correctable
> +		 * error address and the table_id is not changed
> +		 */
> +		ret = smu_cmn_update_table(smu,
> +				       SMU_TABLE_ECCINFO,
> +				       0,
> +				       smu_table->ecc_table_v2,
> +					   false);
>   
> -	ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> -
> -	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> -		ecc_info_per_channel = &(eccinfo->ecc[i]);
> -		ecc_info_per_channel->ce_count_lo_chip =
> -			ecc_table->EccInfo[i].ce_count_lo_chip;
> -		ecc_info_per_channel->ce_count_hi_chip =
> -			ecc_table->EccInfo[i].ce_count_hi_chip;
> -		ecc_info_per_channel->mca_umc_status =
> -			ecc_table->EccInfo[i].mca_umc_status;
> -		ecc_info_per_channel->mca_umc_addr =
> -			ecc_table->EccInfo[i].mca_umc_addr;
> +		if (ret) {
> +			dev_info(smu->adev->dev, "Failed to export SMU ecc table_v2!\n");
> +			return ret;
> +		}
> +
> +		ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2;
> +
> +		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> +			ecc_info_per_channel = &(eccinfo->ecc[i]);
> +			ecc_info_per_channel->ce_count_lo_chip =
> +				ecc_table_v2->EccInfo[i].ce_count_lo_chip;
> +			ecc_info_per_channel->ce_count_hi_chip =
> +				ecc_table_v2->EccInfo[i].ce_count_hi_chip;
> +			ecc_info_per_channel->mca_umc_status =
> +				ecc_table_v2->EccInfo[i].mca_umc_status;
> +			ecc_info_per_channel->mca_umc_addr =
> +				ecc_table_v2->EccInfo[i].mca_umc_addr;
> +			ecc_info_per_channel->mca_ceumc_addr =
> +				ecc_table_v2->EccInfo[i].mca_ceumc_addr;
> +		}
>   	}
>   
>   	return ret;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> index ae6321af9d88..af2d84a16f3e 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> @@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
>   	kfree(smu_table->hardcode_pptable);
>   	smu_table->hardcode_pptable = NULL;
>   
> +	kfree(smu_table->ecc_table_v2);
>   	kfree(smu_table->ecc_table);
>   	kfree(smu_table->metrics_table);
>   	kfree(smu_table->watermarks_table);
> +	smu_table->ecc_table_v2 = NULL;
>   	smu_table->ecc_table = NULL;
>   	smu_table->metrics_table = NULL;
>   	smu_table->watermarks_table = NULL;
> 


More information about the amd-gfx mailing list