回复: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable
Yang, Stanley
Stanley.Yang at amd.com
Mon May 23 10:11:01 UTC 2022
[AMD Official Use Only - General]
Hi Lijo,
+ at Joo, Maria
> -----邮件原件-----
> 发件人: Lazar, Lijo <Lijo.Lazar at amd.com>
> 发送时间: Monday, May 23, 2022 5:12 PM
> 收件人: Yang, Stanley <Stanley.Yang at amd.com>; amd-
> gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1,
> Tao <Tao.Zhou1 at amd.com>; Quan, Evan <Evan.Quan at amd.com>
> 主题: Re: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in
> ecctable
>
>
>
> On 5/23/2022 1:47 PM, Stanley.Yang wrote:
> > SMU add a new variable mca_ceumc_addr to record umc correctable error
> > address in EccInfo table, driver side add ecctable_v2 to support this
> > feature
> >
> > Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
> > drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 2 +
> > .../inc/pmfw_if/smu13_driver_if_aldebaran.h | 15 +++
> > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 101 ++++++++++++++----
> > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 2 +
> > 5 files changed, 98 insertions(+), 23 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index b9a6fac2b8b2..28e603243b67 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -328,6 +328,7 @@ struct ecc_info_per_ch {
> > uint16_t ce_count_hi_chip;
> > uint64_t mca_umc_status;
> > uint64_t mca_umc_addr;
> > + uint64_t mca_ceumc_addr;
> > };
> >
> > struct umc_ecc_info {
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > index a6a7b6c33683..9f7257ada437 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > @@ -322,6 +322,7 @@ enum smu_table_id
> > SMU_TABLE_PACE,
> > SMU_TABLE_ECCINFO,
> > SMU_TABLE_COMBO_PPTABLE,
> > + SMU_TABLE_ECCINFO_V2,
>
> Hi Stanley,
>
> This is not the right approach. Need to ask FW team to fix this. There shouldn't
> be any new id with each version of table. You may check Sienna Cichlid smu
> metrics table as an example and ask FW team to follow something similar. I
> don't see 68.55 being released, so it's not late anyway. We don't need to keep
> defining pointers in table context per version of ECC table.
>
> Thanks,
> Lijo
[Yang, Stanley] : There is not enough padding space in ecc_table, you can check below struct,
the new added variable is uint32_t type, I think smu can't add uint32_t type in ecc_table directly without change ecc_tabe size.
If you have any better approach, we can discuss a better method to complete it.
512 typedef struct {
513 uint64_t mca_umc_status;
514 uint64_t mca_umc_addr;
515 uint16_t ce_count_lo_chip;
516 uint16_t ce_count_hi_chip;
517
518 uint32_t eccPadding;
519 } EccInfo_t;
Thanks,
Stanley
>
> > SMU_TABLE_COUNT,
> > };
> >
> > @@ -340,6 +341,7 @@ struct smu_table_context
> > void *driver_pptable;
> > void *combo_pptable;
> > void *ecc_table;
> > + void *ecc_table_v2; // adapt to smu support record
> mca_ceumc_addr
> > void *driver_smu_config_table;
> > struct smu_table tables[SMU_TABLE_COUNT];
> > /*
> > diff --git
> >
> a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> >
> b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> > index 0f67c56c2863..2868604eff49 100644
> > ---
> >
> a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
> > +++
> b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebar
> > +++ an.h
> > @@ -522,6 +522,21 @@ typedef struct {
> > EccInfo_t EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> > } EccInfoTable_t;
> >
> > +typedef struct {
> > + uint64_t mca_umc_status;
> > + uint64_t mca_umc_addr;
> > + uint64_t mca_ceumc_addr;
> > +
> > + uint16_t ce_count_lo_chip;
> > + uint16_t ce_count_hi_chip;
> > +
> > + uint32_t eccPadding;
> > +} EccInfo_t_v2;
> > +
> > +typedef struct {
> > + EccInfo_t_v2 EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
> > +} EccInfoTable_t_v2;
> > +
> > // These defines are used with the following messages:
> > // SMC_MSG_TransferTableDram2Smu
> > // SMC_MSG_TransferTableSmu2Dram
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > index 38af648cb857..e58df9490cec 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > @@ -82,6 +82,12 @@
> > */
> > #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
> >
> > +/*
> > + * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0,
> > + * use this to check mca_ceumc_addr record whether support */
> > +#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
> > +
> > /*
> > * SMU support BAD CHENNEL info MSG since version 68.51.00,
> > * use this to check ECCTALE feature whether support @@ -239,6
> > +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
> > SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
> > PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> >
> > + SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2,
> sizeof(EccInfoTable_t_v2),
> > + PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> > +
> > smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
> > if (!smu_table->metrics_table)
> > return -ENOMEM;
> > @@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context
> *smu)
> > if (!smu_table->ecc_table)
> > return -ENOMEM;
> >
> > + smu_table->ecc_table_v2 =
> kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL);
> > + if (!smu_table->ecc_table_v2)
> > + return -ENOMEM;;
> > +
> > return 0;
> > }
> >
> > @@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct
> smu_context *smu,
> > return sizeof(struct gpu_metrics_v1_3);
> > }
> >
> > -static int aldebaran_check_ecc_table_support(struct smu_context *smu)
> > +static int aldebaran_check_ecc_table_support(struct smu_context *smu,
> > + int *ecctable_version)
> > {
> > uint32_t if_version = 0xff, smu_version = 0xff;
> > int ret = 0;
> > @@ -1815,6 +1829,11 @@ static int
> > aldebaran_check_ecc_table_support(struct smu_context *smu)
> >
> > if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
> > ret = -EOPNOTSUPP;
> > + else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
> > + smu_version <
> SUPPORT_ECCTABLE_V2_SMU_VERSION)
> > + *ecctable_version = 1;
> > + else
> > + *ecctable_version = 2;
> >
> > return ret;
> > }
> > @@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct
> smu_context *smu,
> > {
> > struct smu_table_context *smu_table = &smu->smu_table;
> > EccInfoTable_t *ecc_table = NULL;
> > + EccInfoTable_t_v2 *ecc_table_v2 = NULL;
> > struct ecc_info_per_ch *ecc_info_per_channel = NULL;
> > int i, ret = 0;
> > + int table_version = 0;
> > struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
> >
> > - ret = aldebaran_check_ecc_table_support(smu);
> > + ret = aldebaran_check_ecc_table_support(smu, &table_version);
> > if (ret)
> > return ret;
> >
> > - ret = smu_cmn_update_table(smu,
> > - SMU_TABLE_ECCINFO,
> > - 0,
> > - smu_table->ecc_table,
> > - false);
> > - if (ret) {
> > - dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
> > - return ret;
> > - }
> > + if (table_version == 1) {
> > + ret = smu_cmn_update_table(smu,
> > + SMU_TABLE_ECCINFO,
> > + 0,
> > + smu_table->ecc_table,
> > + false);
> > + if (ret) {
> > + dev_info(smu->adev->dev, "Failed to export SMU ecc
> table!\n");
> > + return ret;
> > + }
> > +
> > + ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> > +
> > + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> > + ecc_info_per_channel = &(eccinfo->ecc[i]);
> > + ecc_info_per_channel->ce_count_lo_chip =
> > + ecc_table->EccInfo[i].ce_count_lo_chip;
> > + ecc_info_per_channel->ce_count_hi_chip =
> > + ecc_table->EccInfo[i].ce_count_hi_chip;
> > + ecc_info_per_channel->mca_umc_status =
> > + ecc_table->EccInfo[i].mca_umc_status;
> > + ecc_info_per_channel->mca_umc_addr =
> > + ecc_table->EccInfo[i].mca_umc_addr;
> > + }
> > + } else if (table_version == 2) {
> > + /* still use SMU_TABLE_ECC_INFO index,
> > + * smu 68.55.0 add mca_ceumc_addr variable
> > + * in EccInfo_t struct to report correctable
> > + * error address and the table_id is not changed
> > + */
> > + ret = smu_cmn_update_table(smu,
> > + SMU_TABLE_ECCINFO,
> > + 0,
> > + smu_table->ecc_table_v2,
> > + false);
> >
> > - ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> > -
> > - for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> > - ecc_info_per_channel = &(eccinfo->ecc[i]);
> > - ecc_info_per_channel->ce_count_lo_chip =
> > - ecc_table->EccInfo[i].ce_count_lo_chip;
> > - ecc_info_per_channel->ce_count_hi_chip =
> > - ecc_table->EccInfo[i].ce_count_hi_chip;
> > - ecc_info_per_channel->mca_umc_status =
> > - ecc_table->EccInfo[i].mca_umc_status;
> > - ecc_info_per_channel->mca_umc_addr =
> > - ecc_table->EccInfo[i].mca_umc_addr;
> > + if (ret) {
> > + dev_info(smu->adev->dev, "Failed to export SMU ecc
> table_v2!\n");
> > + return ret;
> > + }
> > +
> > + ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2;
> > +
> > + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> > + ecc_info_per_channel = &(eccinfo->ecc[i]);
> > + ecc_info_per_channel->ce_count_lo_chip =
> > + ecc_table_v2->EccInfo[i].ce_count_lo_chip;
> > + ecc_info_per_channel->ce_count_hi_chip =
> > + ecc_table_v2->EccInfo[i].ce_count_hi_chip;
> > + ecc_info_per_channel->mca_umc_status =
> > + ecc_table_v2->EccInfo[i].mca_umc_status;
> > + ecc_info_per_channel->mca_umc_addr =
> > + ecc_table_v2->EccInfo[i].mca_umc_addr;
> > + ecc_info_per_channel->mca_ceumc_addr =
> > + ecc_table_v2->EccInfo[i].mca_ceumc_addr;
> > + }
> > }
> >
> > return ret;
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > index ae6321af9d88..af2d84a16f3e 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > @@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context
> *smu)
> > kfree(smu_table->hardcode_pptable);
> > smu_table->hardcode_pptable = NULL;
> >
> > + kfree(smu_table->ecc_table_v2);
> > kfree(smu_table->ecc_table);
> > kfree(smu_table->metrics_table);
> > kfree(smu_table->watermarks_table);
> > + smu_table->ecc_table_v2 = NULL;
> > smu_table->ecc_table = NULL;
> > smu_table->metrics_table = NULL;
> > smu_table->watermarks_table = NULL;
> >
More information about the amd-gfx
mailing list