RE: RE: [PATCH Review 3/4] drm/amdgpu: add message smu to get ecc_table
Yang, Stanley
Stanley.Yang at amd.com
Thu Nov 18 04:26:56 UTC 2021
[AMD Official Use Only]
> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar at amd.com>
> Sent: Thursday, November 18, 2021 12:04 PM
> To: Yang, Stanley <Stanley.Yang at amd.com>; amd-
> gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>;
> Clements, John <John.Clements at amd.com>; Quan, Evan
> <Evan.Quan at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
> Subject: Re: RE: [PATCH Review 3/4] drm/amdgpu: add message smu to get
> ecc_table
>
>
>
> On 11/18/2021 9:07 AM, Yang, Stanley wrote:
> > [AMD Official Use Only]
> >
> >
> >
> >> -----Original Message-----
> >> From: Lazar, Lijo <Lijo.Lazar at amd.com>
> >> Sent: Wednesday, November 17, 2021 7:24 PM
> >> To: Yang, Stanley <Stanley.Yang at amd.com>; amd-
> >> gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>;
> >> Clements, John <John.Clements at amd.com>; Quan, Evan
> >> <Evan.Quan at amd.com>; Wang, Yang(Kevin)
> <KevinYang.Wang at amd.com>
> >> Subject: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get
> >> ecc_table
> >>
> >>
> >>
> >> On 11/17/2021 3:41 PM, Stanley.Yang wrote:
> >>> support the ECC TABLE message; this table includes the uncorrectable
> >>> RAS error count and error addresses
> >>>
> >>> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> >>> ---
> >>> drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h | 7 ++++
> >>> .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 38 +++++++++++++++++++
> >>> .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 2 +
> >>> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 24 ++++++++++++
> >>> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h | 3 ++
> >>> 5 files changed, 74 insertions(+)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> >>> b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> >>> index 3557f4e7fc30..ea65de0160c3 100644
> >>> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> >>> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> >>> @@ -324,6 +324,7 @@ enum smu_table_id
> >>> SMU_TABLE_OVERDRIVE,
> >>> SMU_TABLE_I2C_COMMANDS,
> >>> SMU_TABLE_PACE,
> >>> + SMU_TABLE_ECCINFO,
> >>> SMU_TABLE_COUNT,
> >>> };
> >>>
> >>> @@ -340,6 +341,7 @@ struct smu_table_context
> >>> void *max_sustainable_clocks;
> >>> struct smu_bios_boot_up_values boot_values;
> >>> void *driver_pptable;
> >>> + void *ecc_table;
> >>> struct smu_table tables[SMU_TABLE_COUNT];
> >>> /*
> >>> 	 * The driver table is just a staging buffer for
> >>> @@ -1261,6 +1263,11 @@ struct pptable_funcs {
> >>> 	 * @send_hbm_bad_pages_num: message SMU to update bad page number
> >>> 	 *                          of SMUBUS table.
> >>> 	 */
> >>> 	int (*send_hbm_bad_pages_num)(struct smu_context *smu, uint32_t size);
> >>> +
> >>> + /**
> >>> +	 * @get_ecc_info: message SMU to get ECC INFO table.
> >>> + */
> >>> + ssize_t (*get_ecc_info)(struct smu_context *smu, void *table);
> >>> };
> >>>
> >>> typedef enum {
> >>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> >>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> >>> index f835d86cc2f5..5e4ba0e14a91 100644
> >>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> >>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> >>> @@ -190,6 +190,7 @@ static const struct cmn2asic_mapping aldebaran_table_map[SMU_TABLE_COUNT] = {
> >>> TAB_MAP(SMU_METRICS),
> >>> TAB_MAP(DRIVER_SMU_CONFIG),
> >>> TAB_MAP(I2C_COMMANDS),
> >>> + TAB_MAP(ECCINFO),
> >>> };
> >>>
> >>> static const uint8_t aldebaran_throttler_map[] = {
> >>> @@ -223,6 +224,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
> >>> 	SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS, sizeof(SwI2cRequest_t),
> >>> 		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> >>>
> >>> +	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
> >>> +		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> >>> +
> >>> 	smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
> >>> if (!smu_table->metrics_table)
> >>> return -ENOMEM;
> >>> @@ -235,6 +239,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
> >>> return -ENOMEM;
> >>> }
> >>>
> >>> +	smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size, GFP_KERNEL);
> >>> + if (!smu_table->ecc_table)
> >>> + return -ENOMEM;
> >>> +
> >>> return 0;
> >>> }
> >>>
> >>> @@ -1765,6 +1773,35 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
> >>> return sizeof(struct gpu_metrics_v1_3);
> >>> }
> >>>
> >>> +static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
> >>> + void *table)
> >>> +{
> >>> + struct smu_table_context *smu_table = &smu->smu_table;
> >>> + EccInfoTable_t ecc_table;
> >>> + struct ecc_info_per_ch *ecc_info_per_channel = NULL;
> >>> + int i, ret = 0;
> >>> + struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
> >>> +
> >>> + ret = smu_cmn_get_ecc_info_table(smu,
> >>> + &ecc_table);
> >>> + if (ret)
> >>> + return ret;
> >>> +
> >>> + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> >>> + ecc_info_per_channel = &(eccinfo->ecc[i]);
> >>> + ecc_info_per_channel->ce_count_lo_chip =
> >>> + ecc_table.EccInfo[i].ce_count_lo_chip;
> >>> + ecc_info_per_channel->ce_count_hi_chip =
> >>> + ecc_table.EccInfo[i].ce_count_hi_chip;
> >>> + ecc_info_per_channel->mca_umc_status =
> >>> + ecc_table.EccInfo[i].mca_umc_status;
> >>> + ecc_info_per_channel->mca_umc_addr =
> >>> + ecc_table.EccInfo[i].mca_umc_addr;
> >>> + }
> >>> +
> >>> + return ret;
> >>> +}
> >>> +
> >>> static int aldebaran_mode1_reset(struct smu_context *smu)
> >>> {
> >>> 	u32 smu_version, fatal_err, param;
> >>> @@ -1967,6 +2004,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
> >>> .i2c_init = aldebaran_i2c_control_init,
> >>> .i2c_fini = aldebaran_i2c_control_fini,
> >>> 	.send_hbm_bad_pages_num = aldebaran_smu_send_hbm_bad_page_num,
> >>> + .get_ecc_info = aldebaran_get_ecc_info,
> >>> };
> >>>
> >>> void aldebaran_set_ppt_funcs(struct smu_context *smu)
> >>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> >>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> >>> index 4d96099a9bb1..55421ea622fb 100644
> >>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> >>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> >>> @@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
> >>> kfree(smu_table->hardcode_pptable);
> >>> smu_table->hardcode_pptable = NULL;
> >>>
> >>> + kfree(smu_table->ecc_table);
> >>> kfree(smu_table->metrics_table);
> >>> kfree(smu_table->watermarks_table);
> >>> + smu_table->ecc_table = NULL;
> >>> smu_table->metrics_table = NULL;
> >>> smu_table->watermarks_table = NULL;
> >>> smu_table->metrics_time = 0;
> >>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> >>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> >>> index 843d2cbfc71d..e229c9b09d80 100644
> >>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> >>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> >>> @@ -983,6 +983,30 @@ int smu_cmn_get_metrics_table(struct smu_context *smu,
> >>> return ret;
> >>> }
> >>>
> >>> +int smu_cmn_get_ecc_info_table(struct smu_context *smu,
> >>> + void *ecc_table)
> >>> +{
> >>> +	struct smu_table_context *smu_table = &smu->smu_table;
> >>> +	uint32_t table_size = smu_table->tables[SMU_TABLE_ECCINFO].size;
> >>> + int ret = 0;
> >>> +
> >>> + ret = smu_cmn_update_table(smu,
> >>> + SMU_TABLE_ECCINFO,
> >>> + 0,
> >>> + smu_table->ecc_table,
> >>> + false);
> >>> + if (ret) {
> >>> +		dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
> >>> + return ret;
> >>> + }
> >>> +
> >>> + if (ecc_table)
> >>> + memcpy(ecc_table, smu_table->ecc_table, table_size);
> >>
> >> This copy to another buffer is redundant. You may use ecc_table
> >> directly in the callback; then this method itself looks unnecessary.
> >> Instead of calling smu_cmn_get_ecc_info_table(), call
> >> smu_cmn_update_table() and copy directly from ecc_table.
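> >> A rough sketch of what I mean (untested, just reusing the names
> >> already in this patch):
> >>
> >> static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
> >> 				      void *table)
> >> {
> >> 	struct smu_table_context *smu_table = &smu->smu_table;
> >> 	EccInfoTable_t *ecc_table;
> >> 	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
> >> 	int i, ret;
> >>
> >> 	/* fetch the table from SMU straight into the staging buffer */
> >> 	ret = smu_cmn_update_table(smu, SMU_TABLE_ECCINFO, 0,
> >> 				   smu_table->ecc_table, false);
> >> 	if (ret)
> >> 		return ret;
> >>
> >> 	/* copy per-channel data out of the staging buffer directly */
> >> 	ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> >> 	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> >> 		eccinfo->ecc[i].ce_count_lo_chip =
> >> 			ecc_table->EccInfo[i].ce_count_lo_chip;
> >> 		eccinfo->ecc[i].ce_count_hi_chip =
> >> 			ecc_table->EccInfo[i].ce_count_hi_chip;
> >> 		eccinfo->ecc[i].mca_umc_status =
> >> 			ecc_table->EccInfo[i].mca_umc_status;
> >> 		eccinfo->ecc[i].mca_umc_addr =
> >> 			ecc_table->EccInfo[i].mca_umc_addr;
> >> 	}
> >>
> >> 	return 0;
> >> }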
> > [Yang, Stanley] This design is meant to protect ecc_table in the future
> > if multiple threads call smu_cmn_get_ecc_info_table() at the same time;
> > a mutex lock should be added, just like the metrics table handling, if
> > it becomes necessary. The current test case is simple, so I didn't do
> > that yet.
> This is not like the metrics table use case. RAS error harvesting is not a
> multithreaded case. The error registers are cleared after reading, so I
> thought it's always expected to be one user at a time. Besides, I don't
> know of a case where the driver needs to report errors from multiple
> threads.
[Yang, Stanley] It is not about RAS error harvesting. Considering the debugfs
node umc_error_cnt and the sysfs node umc_error_cnt, is there any mechanism
that ensures only one user thread reads them at a time?
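If we do need to guard it later, a minimal sketch of what I have in mind,
following the metrics table pattern (the ecc_lock below is hypothetical, it
is not part of this patch):

	/*
	 * Hypothetical ecc_lock mutex in struct smu_context, mirroring
	 * the metrics_lock used by smu_cmn_get_metrics_table().
	 */
	mutex_lock(&smu->ecc_lock);
	ret = smu_cmn_update_table(smu,
				   SMU_TABLE_ECCINFO,
				   0,
				   smu_table->ecc_table,
				   false);
	if (!ret && ecc_table)
		memcpy(ecc_table, smu_table->ecc_table, table_size);
	mutex_unlock(&smu->ecc_lock);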
>
> Thanks,
> Lijo
> >
> >>
> >> Thanks,
> >> Lijo
> >>
> >>> +
> >>> + return 0;
> >>> +}
> >>> +
> >>> void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, uint8_t crev)
> >>> {
> >>> 	struct metrics_table_header *header = (struct metrics_table_header *)table;
> >>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> >>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> >>> index beea03810bca..0adc5451373b 100644
> >>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> >>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> >>> @@ -105,6 +105,9 @@ int smu_cmn_get_metrics_table(struct smu_context *smu,
> >>> void *metrics_table,
> >>> bool bypass_cache);
> >>>
> >>> +int smu_cmn_get_ecc_info_table(struct smu_context *smu,
> >>> + void *table);
> >>> +
> >>> void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, uint8_t crev);
> >>>
> >>> int smu_cmn_set_mp1_state(struct smu_context *smu,
> >>>