[PATCH Review v3 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable
Lazar, Lijo
lijo.lazar at amd.com
Wed May 25 12:33:31 UTC 2022
On 5/25/2022 11:40 AM, Stanley.Yang wrote:
> The SMU adds a new variable, mca_ceumc_addr, to record the
> UMC correctable error address in the EccInfo table;
> the driver side adds EccInfo_V2_t to support this feature.
>
> Changed from V1:
> remove ecc_table_v2 and the unnecessary table id; define a union that
> includes EccInfo_t and EccInfo_V2_t.
>
> Changed from V2:
> sync patch version
>
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar at amd.com>
Thanks,
Lijo
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
> .../inc/pmfw_if/smu13_driver_if_aldebaran.h | 16 +++++-
> .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 53 ++++++++++++++-----
> 3 files changed, 57 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index b9a6fac2b8b2..28e603243b67 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -328,6 +328,7 @@ struct ecc_info_per_ch {
> uint16_t ce_count_hi_chip;
> uint64_t mca_umc_status;
> uint64_t mca_umc_addr;
> + uint64_t mca_ceumc_addr;
> };
>
> struct umc_ecc_info {
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> index 0f67c56c2863..6f92038470ec 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> @@ -519,7 +519,21 @@ typedef struct {
> } EccInfo_t;
>
> typedef struct {
> - EccInfo_t EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> + uint64_t mca_umc_status;
> + uint64_t mca_umc_addr;
> + uint64_t mca_ceumc_addr;
> +
> + uint16_t ce_count_lo_chip;
> + uint16_t ce_count_hi_chip;
> +
> + uint32_t eccPadding;
> +} EccInfo_V2_t;
> +
> +typedef struct {
> + union {
> + EccInfo_t EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> + EccInfo_V2_t EccInfo_V2[ALDEBARAN_UMC_CHANNEL_NUM];
> + };
> } EccInfoTable_t;
>
> // These defines are used with the following messages:
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index 38af648cb857..9cdfeea58085 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -82,6 +82,12 @@
> */
> #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
>
> +/*
> + * SMU supports mca_ceumc_addr in ECCTABLE since version 68.55.0;
> + * use this to check whether the mca_ceumc_addr record is supported
> + */
> +#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
> +
> /*
> * SMU support BAD CHENNEL info MSG since version 68.51.00,
> * use this to check ECCTALE feature whether support
> @@ -1802,7 +1808,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
> return sizeof(struct gpu_metrics_v1_3);
> }
>
> -static int aldebaran_check_ecc_table_support(struct smu_context *smu)
> +static int aldebaran_check_ecc_table_support(struct smu_context *smu,
> + int *ecctable_version)
> {
> uint32_t if_version = 0xff, smu_version = 0xff;
> int ret = 0;
> @@ -1815,6 +1822,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)
>
> if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
> ret = -EOPNOTSUPP;
> + else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
> + smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
> + *ecctable_version = 1;
> + else
> + *ecctable_version = 2;
>
> return ret;
> }
> @@ -1826,9 +1838,10 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
> EccInfoTable_t *ecc_table = NULL;
> struct ecc_info_per_ch *ecc_info_per_channel = NULL;
> int i, ret = 0;
> + int table_version = 0;
> struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
>
> - ret = aldebaran_check_ecc_table_support(smu);
> + ret = aldebaran_check_ecc_table_support(smu, &table_version);
> if (ret)
> return ret;
>
> @@ -1844,16 +1857,32 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>
> ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
>
> - for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> - ecc_info_per_channel = &(eccinfo->ecc[i]);
> - ecc_info_per_channel->ce_count_lo_chip =
> - ecc_table->EccInfo[i].ce_count_lo_chip;
> - ecc_info_per_channel->ce_count_hi_chip =
> - ecc_table->EccInfo[i].ce_count_hi_chip;
> - ecc_info_per_channel->mca_umc_status =
> - ecc_table->EccInfo[i].mca_umc_status;
> - ecc_info_per_channel->mca_umc_addr =
> - ecc_table->EccInfo[i].mca_umc_addr;
> + if (table_version == 1) {
> + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> + ecc_info_per_channel = &(eccinfo->ecc[i]);
> + ecc_info_per_channel->ce_count_lo_chip =
> + ecc_table->EccInfo[i].ce_count_lo_chip;
> + ecc_info_per_channel->ce_count_hi_chip =
> + ecc_table->EccInfo[i].ce_count_hi_chip;
> + ecc_info_per_channel->mca_umc_status =
> + ecc_table->EccInfo[i].mca_umc_status;
> + ecc_info_per_channel->mca_umc_addr =
> + ecc_table->EccInfo[i].mca_umc_addr;
> + }
> + } else if (table_version == 2) {
> + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> + ecc_info_per_channel = &(eccinfo->ecc[i]);
> + ecc_info_per_channel->ce_count_lo_chip =
> + ecc_table->EccInfo_V2[i].ce_count_lo_chip;
> + ecc_info_per_channel->ce_count_hi_chip =
> + ecc_table->EccInfo_V2[i].ce_count_hi_chip;
> + ecc_info_per_channel->mca_umc_status =
> + ecc_table->EccInfo_V2[i].mca_umc_status;
> + ecc_info_per_channel->mca_umc_addr =
> + ecc_table->EccInfo_V2[i].mca_umc_addr;
> + ecc_info_per_channel->mca_ceumc_addr =
> + ecc_table->EccInfo_V2[i].mca_ceumc_addr;
> + }
> }
>
> return ret;
>
More information about the amd-gfx
mailing list