[PATCH Review 3/4] drm/amdgpu: add message smu to get ecc_table
Lazar, Lijo
lijo.lazar at amd.com
Wed Nov 17 11:23:43 UTC 2021
On 11/17/2021 3:41 PM, Stanley.Yang wrote:
> support ECC TABLE message, this table include unc ras error count
> and error address
>
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h | 7 ++++
> .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 38 +++++++++++++++++++
> .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 2 +
> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 24 ++++++++++++
> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h | 3 ++
> 5 files changed, 74 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> index 3557f4e7fc30..ea65de0160c3 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> @@ -324,6 +324,7 @@ enum smu_table_id
> SMU_TABLE_OVERDRIVE,
> SMU_TABLE_I2C_COMMANDS,
> SMU_TABLE_PACE,
> + SMU_TABLE_ECCINFO,
> SMU_TABLE_COUNT,
> };
>
> @@ -340,6 +341,7 @@ struct smu_table_context
> void *max_sustainable_clocks;
> struct smu_bios_boot_up_values boot_values;
> void *driver_pptable;
> + void *ecc_table;
> struct smu_table tables[SMU_TABLE_COUNT];
> /*
> * The driver table is just a staging buffer for
> @@ -1261,6 +1263,11 @@ struct pptable_funcs {
> * of SMUBUS table.
> */
> int (*send_hbm_bad_pages_num)(struct smu_context *smu, uint32_t size);
> +
> + /**
> + * @get_ecc_table: message SMU to get ECC INFO table.
> + */
> + ssize_t (*get_ecc_info)(struct smu_context *smu, void *table);
> };
>
> typedef enum {
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index f835d86cc2f5..5e4ba0e14a91 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -190,6 +190,7 @@ static const struct cmn2asic_mapping aldebaran_table_map[SMU_TABLE_COUNT] = {
> TAB_MAP(SMU_METRICS),
> TAB_MAP(DRIVER_SMU_CONFIG),
> TAB_MAP(I2C_COMMANDS),
> + TAB_MAP(ECCINFO),
> };
>
> static const uint8_t aldebaran_throttler_map[] = {
> @@ -223,6 +224,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
> SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS, sizeof(SwI2cRequest_t),
> PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
>
> + SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
> + PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> +
> smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
> if (!smu_table->metrics_table)
> return -ENOMEM;
> @@ -235,6 +239,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
> return -ENOMEM;
> }
>
> + smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size, GFP_KERNEL);
> + if (!smu_table->ecc_table)
> + return -ENOMEM;
> +
> return 0;
> }
>
> @@ -1765,6 +1773,35 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
> return sizeof(struct gpu_metrics_v1_3);
> }
>
> +static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
> + void *table)
> +{
> + struct smu_table_context *smu_table = &smu->smu_table;
> + EccInfoTable_t ecc_table;
> + struct ecc_info_per_ch *ecc_info_per_channel = NULL;
> + int i, ret = 0;
> + struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
> +
> + ret = smu_cmn_get_ecc_info_table(smu,
> + &ecc_table);
> + if (ret)
> + return ret;
> +
> + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> + ecc_info_per_channel = &(eccinfo->ecc[i]);
> + ecc_info_per_channel->ce_count_lo_chip =
> + ecc_table.EccInfo[i].ce_count_lo_chip;
> + ecc_info_per_channel->ce_count_hi_chip =
> + ecc_table.EccInfo[i].ce_count_hi_chip;
> + ecc_info_per_channel->mca_umc_status =
> + ecc_table.EccInfo[i].mca_umc_status;
> + ecc_info_per_channel->mca_umc_addr =
> + ecc_table.EccInfo[i].mca_umc_addr;
> + }
> +
> + return ret;
> +}
> +
> static int aldebaran_mode1_reset(struct smu_context *smu)
> {
> u32 smu_version, fatal_err, param;
> @@ -1967,6 +2004,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
> .i2c_init = aldebaran_i2c_control_init,
> .i2c_fini = aldebaran_i2c_control_fini,
> .send_hbm_bad_pages_num = aldebaran_smu_send_hbm_bad_page_num,
> + .get_ecc_info = aldebaran_get_ecc_info,
> };
>
> void aldebaran_set_ppt_funcs(struct smu_context *smu)
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> index 4d96099a9bb1..55421ea622fb 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> @@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
> kfree(smu_table->hardcode_pptable);
> smu_table->hardcode_pptable = NULL;
>
> + kfree(smu_table->ecc_table);
> kfree(smu_table->metrics_table);
> kfree(smu_table->watermarks_table);
> + smu_table->ecc_table = NULL;
> smu_table->metrics_table = NULL;
> smu_table->watermarks_table = NULL;
> smu_table->metrics_time = 0;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index 843d2cbfc71d..e229c9b09d80 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -983,6 +983,30 @@ int smu_cmn_get_metrics_table(struct smu_context *smu,
> return ret;
> }
>
> +int smu_cmn_get_ecc_info_table(struct smu_context *smu,
> + void *ecc_table)
> +{
> + struct smu_table_context *smu_table= &smu->smu_table;
> + uint32_t table_size =
> + smu_table->tables[SMU_TABLE_ECCINFO].size;
> + int ret = 0;
> +
> + ret = smu_cmn_update_table(smu,
> + SMU_TABLE_ECCINFO,
> + 0,
> + smu_table->ecc_table,
> + false);
> + if (ret) {
> + dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
> + return ret;
> + }
> +
> + if (ecc_table)
> + memcpy(ecc_table, smu_table->ecc_table, table_size);
This copy to another buffer is redundant. You may use ecc_table directly
in the callback, then this method itself looks unnecessary. Instead of
calling smu_cmn_get_ecc_info_table(), call smu_cmn_update_table() and
copy directly from ecc_table.
Thanks,
Lijo
> +
> + return 0;
> +}
> +
> void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, uint8_t crev)
> {
> struct metrics_table_header *header = (struct metrics_table_header *)table;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> index beea03810bca..0adc5451373b 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> @@ -105,6 +105,9 @@ int smu_cmn_get_metrics_table(struct smu_context *smu,
> void *metrics_table,
> bool bypass_cache);
>
> +int smu_cmn_get_ecc_info_table(struct smu_context *smu,
> + void *table);
> +
> void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, uint8_t crev);
>
> int smu_cmn_set_mp1_state(struct smu_context *smu,
>
More information about the amd-gfx
mailing list