[PATCH Review v3 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable

Lazar, Lijo lijo.lazar at amd.com
Wed May 25 12:33:31 UTC 2022



On 5/25/2022 11:40 AM, Stanley.Yang wrote:
> SMU adds a new variable, mca_ceumc_addr, to record the
> UMC correctable error address in the EccInfo table;
> the driver side adds EccInfo_V2_t to support this feature.
> 
> Changed from V1:
> 	remove ecc_table_v2 and the unnecessary table id; define a union that
> 	includes EccInfo_t and EccInfo_V2_t.
> 
> Changed from V2:
> 	sync patch version
> 
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>

Reviewed-by: Lijo Lazar <lijo.lazar at amd.com>

Thanks,
Lijo

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  1 +
>   .../inc/pmfw_if/smu13_driver_if_aldebaran.h   | 16 +++++-
>   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 53 ++++++++++++++-----
>   3 files changed, 57 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index b9a6fac2b8b2..28e603243b67 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -328,6 +328,7 @@ struct ecc_info_per_ch {
>   	uint16_t ce_count_hi_chip;
>   	uint64_t mca_umc_status;
>   	uint64_t mca_umc_addr;
> +	uint64_t mca_ceumc_addr;
>   };
>   
>   struct umc_ecc_info {
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> index 0f67c56c2863..6f92038470ec 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> @@ -519,7 +519,21 @@ typedef struct {
>   } EccInfo_t;
>   
>   typedef struct {
> -	EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> +	uint64_t mca_umc_status;
> +	uint64_t mca_umc_addr;
> +	uint64_t mca_ceumc_addr;
> +
> +	uint16_t ce_count_lo_chip;
> +	uint16_t ce_count_hi_chip;
> +
> +	uint32_t eccPadding;
> +} EccInfo_V2_t;
> +
> +typedef struct {
> +	union {
> +		EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> +		EccInfo_V2_t EccInfo_V2[ALDEBARAN_UMC_CHANNEL_NUM];
> +	};
>   } EccInfoTable_t;
>   
>   // These defines are used with the following messages:
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index 38af648cb857..9cdfeea58085 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -82,6 +82,12 @@
>    */
>   #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
>   
> +/*
> + * SMU supports mca_ceumc_addr in ECCTABLE since version 68.55.0;
> + * use this to check whether the mca_ceumc_addr record is supported
> + */
> +#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
> +
>   /*
>    * SMU support BAD CHENNEL info MSG since version 68.51.00,
>    * use this to check ECCTALE feature whether support
> @@ -1802,7 +1808,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
>   	return sizeof(struct gpu_metrics_v1_3);
>   }
>   
> -static int aldebaran_check_ecc_table_support(struct smu_context *smu)
> +static int aldebaran_check_ecc_table_support(struct smu_context *smu,
> +		int *ecctable_version)
>   {
>   	uint32_t if_version = 0xff, smu_version = 0xff;
>   	int ret = 0;
> @@ -1815,6 +1822,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)
>   
>   	if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
>   		ret = -EOPNOTSUPP;
> +	else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
> +			smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
> +		*ecctable_version = 1;
> +	else
> +		*ecctable_version = 2;
>   
>   	return ret;
>   }
> @@ -1826,9 +1838,10 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>   	EccInfoTable_t *ecc_table = NULL;
>   	struct ecc_info_per_ch *ecc_info_per_channel = NULL;
>   	int i, ret = 0;
> +	int table_version = 0;
>   	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
>   
> -	ret = aldebaran_check_ecc_table_support(smu);
> +	ret = aldebaran_check_ecc_table_support(smu, &table_version);
>   	if (ret)
>   		return ret;
>   
> @@ -1844,16 +1857,32 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>   
>   	ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
>   
> -	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> -		ecc_info_per_channel = &(eccinfo->ecc[i]);
> -		ecc_info_per_channel->ce_count_lo_chip =
> -			ecc_table->EccInfo[i].ce_count_lo_chip;
> -		ecc_info_per_channel->ce_count_hi_chip =
> -			ecc_table->EccInfo[i].ce_count_hi_chip;
> -		ecc_info_per_channel->mca_umc_status =
> -			ecc_table->EccInfo[i].mca_umc_status;
> -		ecc_info_per_channel->mca_umc_addr =
> -			ecc_table->EccInfo[i].mca_umc_addr;
> +	if (table_version == 1) {
> +		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> +			ecc_info_per_channel = &(eccinfo->ecc[i]);
> +			ecc_info_per_channel->ce_count_lo_chip =
> +				ecc_table->EccInfo[i].ce_count_lo_chip;
> +			ecc_info_per_channel->ce_count_hi_chip =
> +				ecc_table->EccInfo[i].ce_count_hi_chip;
> +			ecc_info_per_channel->mca_umc_status =
> +				ecc_table->EccInfo[i].mca_umc_status;
> +			ecc_info_per_channel->mca_umc_addr =
> +				ecc_table->EccInfo[i].mca_umc_addr;
> +		}
> +	} else if (table_version == 2) {
> +		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> +			ecc_info_per_channel = &(eccinfo->ecc[i]);
> +			ecc_info_per_channel->ce_count_lo_chip =
> +				ecc_table->EccInfo_V2[i].ce_count_lo_chip;
> +			ecc_info_per_channel->ce_count_hi_chip =
> +				ecc_table->EccInfo_V2[i].ce_count_hi_chip;
> +			ecc_info_per_channel->mca_umc_status =
> +				ecc_table->EccInfo_V2[i].mca_umc_status;
> +			ecc_info_per_channel->mca_umc_addr =
> +				ecc_table->EccInfo_V2[i].mca_umc_addr;
> +			ecc_info_per_channel->mca_ceumc_addr =
> +				ecc_table->EccInfo_V2[i].mca_ceumc_addr;
> +		}
>   	}
>   
>   	return ret;
> 


More information about the amd-gfx mailing list