回复: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable

Yang, Stanley Stanley.Yang at amd.com
Mon May 23 10:11:01 UTC 2022


[AMD Official Use Only - General]

Hi Lijo,
+ at Joo, Maria

> -----邮件原件-----
> 发件人: Lazar, Lijo <Lijo.Lazar at amd.com>
> 发送时间: Monday, May 23, 2022 5:12 PM
> 收件人: Yang, Stanley <Stanley.Yang at amd.com>; amd-
> gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1,
> Tao <Tao.Zhou1 at amd.com>; Quan, Evan <Evan.Quan at amd.com>
> 主题: Re: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in
> ecctable
>
>
>
> On 5/23/2022 1:47 PM, Stanley.Yang wrote:
> > SMU add a new variable mca_ceumc_addr to record umc correctable error
> > address in EccInfo table, driver side add ecctable_v2 to support this
> > feature
> >
> > Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |   1 +
> >   drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |   2 +
> >   .../inc/pmfw_if/smu13_driver_if_aldebaran.h   |  15 +++
> >   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 101 ++++++++++++++----
> >   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |   2 +
> >   5 files changed, 98 insertions(+), 23 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index b9a6fac2b8b2..28e603243b67 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -328,6 +328,7 @@ struct ecc_info_per_ch {
> >     uint16_t ce_count_hi_chip;
> >     uint64_t mca_umc_status;
> >     uint64_t mca_umc_addr;
> > +   uint64_t mca_ceumc_addr;
> >   };
> >
> >   struct umc_ecc_info {
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > index a6a7b6c33683..9f7257ada437 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > @@ -322,6 +322,7 @@ enum smu_table_id
> >     SMU_TABLE_PACE,
> >     SMU_TABLE_ECCINFO,
> >     SMU_TABLE_COMBO_PPTABLE,
> > +   SMU_TABLE_ECCINFO_V2,
>
> Hi Stanley,
>
> This is not the right approach. Need to ask FW team to fix this. There shouldn't
> be any new id with each version of table. You may check Sienna Cichlid smu
> metrics table as an example and ask FW team to follow something similar. I
> don't see 68.55 being released, so it's not late anyway. We don't need to keep
> defining pointers in table context per version of ECC table.
>
> Thanks,
> Lijo
[Yang, Stanley] : There is not enough padding space in ecc_table, you can check below struct,
the new added variable is uint32_t type, I think smu can't add uint32_t type in ecc_table directly without change ecc_tabe size.
If you have any better approach, we can discuss a better method to complete it.
512 typedef struct {
513     uint64_t mca_umc_status;
514     uint64_t mca_umc_addr;
515     uint16_t ce_count_lo_chip;
516     uint16_t ce_count_hi_chip;
517
518     uint32_t eccPadding;
519 } EccInfo_t;

Thanks,
Stanley
>
> >     SMU_TABLE_COUNT,
> >   };
> >
> > @@ -340,6 +341,7 @@ struct smu_table_context
> >     void                            *driver_pptable;
> >     void                            *combo_pptable;
> >     void                            *ecc_table;
> > +   void                            *ecc_table_v2;  // adapt to smu support record
> mca_ceumc_addr
> >     void                            *driver_smu_config_table;
> >     struct smu_table                tables[SMU_TABLE_COUNT];
> >     /*
> > diff --git
> >
> a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> >
> b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> > index 0f67c56c2863..2868604eff49 100644
> > ---
> >
> a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> > +++
> b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebar
> > +++ an.h
> > @@ -522,6 +522,21 @@ typedef struct {
> >     EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> >   } EccInfoTable_t;
> >
> > +typedef struct {
> > +   uint64_t mca_umc_status;
> > +   uint64_t mca_umc_addr;
> > +   uint64_t mca_ceumc_addr;
> > +
> > +   uint16_t ce_count_lo_chip;
> > +   uint16_t ce_count_hi_chip;
> > +
> > +   uint32_t eccPadding;
> > +} EccInfo_t_v2;
> > +
> > +typedef struct {
> > +   EccInfo_t_v2  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> > +} EccInfoTable_t_v2;
> > +
> >   // These defines are used with the following messages:
> >   // SMC_MSG_TransferTableDram2Smu
> >   // SMC_MSG_TransferTableSmu2Dram
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > index 38af648cb857..e58df9490cec 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > @@ -82,6 +82,12 @@
> >    */
> >   #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
> >
> > +/*
> > + * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0,
> > + * use this to check mca_ceumc_addr record whether support  */
> > +#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
> > +
> >   /*
> >    * SMU support BAD CHENNEL info MSG since version 68.51.00,
> >    * use this to check ECCTALE feature whether support @@ -239,6
> > +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
> >     SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
> >                    PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> >
> > +   SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2,
> sizeof(EccInfoTable_t_v2),
> > +                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> > +
> >     smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
> >     if (!smu_table->metrics_table)
> >             return -ENOMEM;
> > @@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context
> *smu)
> >     if (!smu_table->ecc_table)
> >             return -ENOMEM;
> >
> > +   smu_table->ecc_table_v2 =
> kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL);
> > +   if (!smu_table->ecc_table_v2)
> > +           return -ENOMEM;;
> > +
> >     return 0;
> >   }
> >
> > @@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct
> smu_context *smu,
> >     return sizeof(struct gpu_metrics_v1_3);
> >   }
> >
> > -static int aldebaran_check_ecc_table_support(struct smu_context *smu)
> > +static int aldebaran_check_ecc_table_support(struct smu_context *smu,
> > +           int *ecctable_version)
> >   {
> >     uint32_t if_version = 0xff, smu_version = 0xff;
> >     int ret = 0;
> > @@ -1815,6 +1829,11 @@ static int
> > aldebaran_check_ecc_table_support(struct smu_context *smu)
> >
> >     if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
> >             ret = -EOPNOTSUPP;
> > +   else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
> > +                   smu_version <
> SUPPORT_ECCTABLE_V2_SMU_VERSION)
> > +           *ecctable_version = 1;
> > +   else
> > +           *ecctable_version = 2;
> >
> >     return ret;
> >   }
> > @@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct
> smu_context *smu,
> >   {
> >     struct smu_table_context *smu_table = &smu->smu_table;
> >     EccInfoTable_t *ecc_table = NULL;
> > +   EccInfoTable_t_v2 *ecc_table_v2 = NULL;
> >     struct ecc_info_per_ch *ecc_info_per_channel = NULL;
> >     int i, ret = 0;
> > +   int table_version = 0;
> >     struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
> >
> > -   ret = aldebaran_check_ecc_table_support(smu);
> > +   ret = aldebaran_check_ecc_table_support(smu, &table_version);
> >     if (ret)
> >             return ret;
> >
> > -   ret = smu_cmn_update_table(smu,
> > -                          SMU_TABLE_ECCINFO,
> > -                          0,
> > -                          smu_table->ecc_table,
> > -                          false);
> > -   if (ret) {
> > -           dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
> > -           return ret;
> > -   }
> > +   if (table_version == 1) {
> > +           ret = smu_cmn_update_table(smu,
> > +                                  SMU_TABLE_ECCINFO,
> > +                                  0,
> > +                                  smu_table->ecc_table,
> > +                                  false);
> > +           if (ret) {
> > +                   dev_info(smu->adev->dev, "Failed to export SMU ecc
> table!\n");
> > +                   return ret;
> > +           }
> > +
> > +           ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> > +
> > +           for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> > +                   ecc_info_per_channel = &(eccinfo->ecc[i]);
> > +                   ecc_info_per_channel->ce_count_lo_chip =
> > +                           ecc_table->EccInfo[i].ce_count_lo_chip;
> > +                   ecc_info_per_channel->ce_count_hi_chip =
> > +                           ecc_table->EccInfo[i].ce_count_hi_chip;
> > +                   ecc_info_per_channel->mca_umc_status =
> > +                           ecc_table->EccInfo[i].mca_umc_status;
> > +                   ecc_info_per_channel->mca_umc_addr =
> > +                           ecc_table->EccInfo[i].mca_umc_addr;
> > +           }
> > +   } else if (table_version == 2) {
> > +           /* still use SMU_TABLE_ECC_INFO index,
> > +            * smu 68.55.0 add mca_ceumc_addr variable
> > +            * in EccInfo_t struct to report correctable
> > +            * error address and the table_id is not changed
> > +            */
> > +           ret = smu_cmn_update_table(smu,
> > +                                  SMU_TABLE_ECCINFO,
> > +                                  0,
> > +                                  smu_table->ecc_table_v2,
> > +                                      false);
> >
> > -   ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> > -
> > -   for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> > -           ecc_info_per_channel = &(eccinfo->ecc[i]);
> > -           ecc_info_per_channel->ce_count_lo_chip =
> > -                   ecc_table->EccInfo[i].ce_count_lo_chip;
> > -           ecc_info_per_channel->ce_count_hi_chip =
> > -                   ecc_table->EccInfo[i].ce_count_hi_chip;
> > -           ecc_info_per_channel->mca_umc_status =
> > -                   ecc_table->EccInfo[i].mca_umc_status;
> > -           ecc_info_per_channel->mca_umc_addr =
> > -                   ecc_table->EccInfo[i].mca_umc_addr;
> > +           if (ret) {
> > +                   dev_info(smu->adev->dev, "Failed to export SMU ecc
> table_v2!\n");
> > +                   return ret;
> > +           }
> > +
> > +           ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2;
> > +
> > +           for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> > +                   ecc_info_per_channel = &(eccinfo->ecc[i]);
> > +                   ecc_info_per_channel->ce_count_lo_chip =
> > +                           ecc_table_v2->EccInfo[i].ce_count_lo_chip;
> > +                   ecc_info_per_channel->ce_count_hi_chip =
> > +                           ecc_table_v2->EccInfo[i].ce_count_hi_chip;
> > +                   ecc_info_per_channel->mca_umc_status =
> > +                           ecc_table_v2->EccInfo[i].mca_umc_status;
> > +                   ecc_info_per_channel->mca_umc_addr =
> > +                           ecc_table_v2->EccInfo[i].mca_umc_addr;
> > +                   ecc_info_per_channel->mca_ceumc_addr =
> > +                           ecc_table_v2->EccInfo[i].mca_ceumc_addr;
> > +           }
> >     }
> >
> >     return ret;
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > index ae6321af9d88..af2d84a16f3e 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > @@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context
> *smu)
> >     kfree(smu_table->hardcode_pptable);
> >     smu_table->hardcode_pptable = NULL;
> >
> > +   kfree(smu_table->ecc_table_v2);
> >     kfree(smu_table->ecc_table);
> >     kfree(smu_table->metrics_table);
> >     kfree(smu_table->watermarks_table);
> > +   smu_table->ecc_table_v2 = NULL;
> >     smu_table->ecc_table = NULL;
> >     smu_table->metrics_table = NULL;
> >     smu_table->watermarks_table = NULL;
> >


More information about the amd-gfx mailing list