Re: Re: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get ecc_table

Lazar, Lijo lijo.lazar at amd.com
Thu Nov 18 04:58:37 UTC 2021



On 11/18/2021 9:56 AM, Yang, Stanley wrote:
> [AMD Official Use Only]
> 
> 
> 
>> -----Original Message-----
>> From: Lazar, Lijo <Lijo.Lazar at amd.com>
>> Sent: Thursday, November 18, 2021 12:04 PM
>> To: Yang, Stanley <Stanley.Yang at amd.com>; amd-
>> gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>;
>> Clements, John <John.Clements at amd.com>; Quan, Evan
>> <Evan.Quan at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
>> Subject: Re: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get
>> ecc_table
>>
>>
>>
>> On 11/18/2021 9:07 AM, Yang, Stanley wrote:
>>> [AMD Official Use Only]
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Lazar, Lijo <Lijo.Lazar at amd.com>
>>>> Sent: Wednesday, November 17, 2021 7:24 PM
>>>> To: Yang, Stanley <Stanley.Yang at amd.com>; amd-
>>>> gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>;
>>>> Clements, John <John.Clements at amd.com>; Quan, Evan
>>>> <Evan.Quan at amd.com>; Wang, Yang(Kevin)
>>>> <KevinYang.Wang at amd.com>
>>>> Subject: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get
>>>> ecc_table
>>>>
>>>>
>>>>
>>>> On 11/17/2021 3:41 PM, Stanley.Yang wrote:
>>>>> support the ECC TABLE message; this table includes the uncorrectable
>>>>> RAS error count and error address
>>>>>
>>>>> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h       |  7 ++++
>>>>>     .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 38
>>>> +++++++++++++++++++
>>>>>     .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  2 +
>>>>>     drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c        | 24 ++++++++++++
>>>>>     drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h        |  3 ++
>>>>>     5 files changed, 74 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>>> b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>>> index 3557f4e7fc30..ea65de0160c3 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>>> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>>> @@ -324,6 +324,7 @@ enum smu_table_id
>>>>>     	SMU_TABLE_OVERDRIVE,
>>>>>     	SMU_TABLE_I2C_COMMANDS,
>>>>>     	SMU_TABLE_PACE,
>>>>> +	SMU_TABLE_ECCINFO,
>>>>>     	SMU_TABLE_COUNT,
>>>>>     };
>>>>>
>>>>> @@ -340,6 +341,7 @@ struct smu_table_context
>>>>>     	void				*max_sustainable_clocks;
>>>>>     	struct smu_bios_boot_up_values	boot_values;
>>>>>     	void                            *driver_pptable;
>>>>> +	void                            *ecc_table;
>>>>>     	struct smu_table		tables[SMU_TABLE_COUNT];
>>>>>     	/*
>>>>>     	 * The driver table is just a staging buffer for @@ -1261,6
>>>>> +1263,11 @@ struct pptable_funcs {
>>>>>     	 *
>>>> 		of SMUBUS table.
>>>>>     	 */
>>>>>     	int (*send_hbm_bad_pages_num)(struct smu_context *smu,
>>>> uint32_t
>>>>> size);
>>>>> +
>>>>> +	/**
>>>>> +	 * @get_ecc_info: message SMU to get the ECC INFO table.
>>>>> +	 */
>>>>> +	ssize_t (*get_ecc_info)(struct smu_context *smu, void *table);
>>>>>     };
>>>>>
>>>>>     typedef enum {
>>>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>>> index f835d86cc2f5..5e4ba0e14a91 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>>> @@ -190,6 +190,7 @@ static const struct cmn2asic_mapping
>>>> aldebaran_table_map[SMU_TABLE_COUNT] = {
>>>>>     	TAB_MAP(SMU_METRICS),
>>>>>     	TAB_MAP(DRIVER_SMU_CONFIG),
>>>>>     	TAB_MAP(I2C_COMMANDS),
>>>>> +	TAB_MAP(ECCINFO),
>>>>>     };
>>>>>
>>>>>     static const uint8_t aldebaran_throttler_map[] = { @@ -223,6
>>>>> +224,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
>>>>>     	SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS,
>>>> sizeof(SwI2cRequest_t),
>>>>>     		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
>>>>>
>>>>> +	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO,
>>>> sizeof(EccInfoTable_t),
>>>>> +		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
>>>>> +
>>>>>     	smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t),
>>>> GFP_KERNEL);
>>>>>     	if (!smu_table->metrics_table)
>>>>>     		return -ENOMEM;
>>>>> @@ -235,6 +239,10 @@ static int aldebaran_tables_init(struct
>>>>> smu_context
>>>> *smu)
>>>>>     		return -ENOMEM;
>>>>>     	}
>>>>>
>>>>> +	smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size,
>>>> GFP_KERNEL);
>>>>> +	if (!smu_table->ecc_table)
>>>>> +		return -ENOMEM;
>>>>> +
>>>>>     	return 0;
>>>>>     }
>>>>>
>>>>> @@ -1765,6 +1773,35 @@ static ssize_t
>>>>> aldebaran_get_gpu_metrics(struct
>>>> smu_context *smu,
>>>>>     	return sizeof(struct gpu_metrics_v1_3);
>>>>>     }
>>>>>
>>>>> +static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>>>>> +					 void *table)
>>>>> +{
>>>>> +	struct smu_table_context *smu_table = &smu->smu_table;
>>>>> +	EccInfoTable_t ecc_table;
>>>>> +	struct ecc_info_per_ch *ecc_info_per_channel = NULL;
>>>>> +	int i, ret = 0;
>>>>> +	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
>>>>> +
>>>>> +	ret = smu_cmn_get_ecc_info_table(smu,
>>>>> +					&ecc_table);
>>>>> +	if (ret)
>>>>> +		return ret;
>>>>> +
>>>>> +	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
>>>>> +		ecc_info_per_channel = &(eccinfo->ecc[i]);
>>>>> +		ecc_info_per_channel->ce_count_lo_chip =
>>>>> +			ecc_table.EccInfo[i].ce_count_lo_chip;
>>>>> +		ecc_info_per_channel->ce_count_hi_chip =
>>>>> +			ecc_table.EccInfo[i].ce_count_hi_chip;
>>>>> +		ecc_info_per_channel->mca_umc_status =
>>>>> +			ecc_table.EccInfo[i].mca_umc_status;
>>>>> +		ecc_info_per_channel->mca_umc_addr =
>>>>> +			ecc_table.EccInfo[i].mca_umc_addr;
>>>>> +	}
>>>>> +
>>>>> +	return ret;
>>>>> +}
>>>>> +
>>>>>     static int aldebaran_mode1_reset(struct smu_context *smu)
>>>>>     {
>>>>>     	u32 smu_version, fatal_err, param; @@ -1967,6 +2004,7 @@ static
>>>>> const struct pptable_funcs
>>>> aldebaran_ppt_funcs = {
>>>>>     	.i2c_init = aldebaran_i2c_control_init,
>>>>>     	.i2c_fini = aldebaran_i2c_control_fini,
>>>>>     	.send_hbm_bad_pages_num =
>>>> aldebaran_smu_send_hbm_bad_page_num,
>>>>> +	.get_ecc_info = aldebaran_get_ecc_info,
>>>>>     };
>>>>>
>>>>>     void aldebaran_set_ppt_funcs(struct smu_context *smu) diff --git
>>>>> a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>>>>> index 4d96099a9bb1..55421ea622fb 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>>>>> @@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct
>>>> smu_context *smu)
>>>>>     	kfree(smu_table->hardcode_pptable);
>>>>>     	smu_table->hardcode_pptable = NULL;
>>>>>
>>>>> +	kfree(smu_table->ecc_table);
>>>>>     	kfree(smu_table->metrics_table);
>>>>>     	kfree(smu_table->watermarks_table);
>>>>> +	smu_table->ecc_table = NULL;
>>>>>     	smu_table->metrics_table = NULL;
>>>>>     	smu_table->watermarks_table = NULL;
>>>>>     	smu_table->metrics_time = 0;
>>>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> index 843d2cbfc71d..e229c9b09d80 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> @@ -983,6 +983,30 @@ int smu_cmn_get_metrics_table(struct
>>>> smu_context *smu,
>>>>>     	return ret;
>>>>>     }
>>>>>
>>>>> +int smu_cmn_get_ecc_info_table(struct smu_context *smu,
>>>>> +				     void *ecc_table)
>>>>> +{
>>>>> +	struct smu_table_context *smu_table= &smu->smu_table;
>>>>> +	uint32_t table_size =
>>>>> +		smu_table->tables[SMU_TABLE_ECCINFO].size;
>>>>> +	int ret = 0;
>>>>> +
>>>>> +	ret = smu_cmn_update_table(smu,
>>>>> +			       SMU_TABLE_ECCINFO,
>>>>> +			       0,
>>>>> +			       smu_table->ecc_table,
>>>>> +			       false);
>>>>> +	if (ret) {
>>>>> +		dev_info(smu->adev->dev, "Failed to export SMU ecc
>>>> table!\n");
>>>>> +		return ret;
>>>>> +	}
>>>>> +
>>>>> +	if (ecc_table)
>>>>> +		memcpy(ecc_table, smu_table->ecc_table, table_size);
>>>>
>>>> This copy to another buffer is redundant. You may use ecc_table directly
>>>> in the callback; then this method itself looks unnecessary. Instead of
>>>> calling smu_cmn_get_ecc_info_table(), call smu_cmn_update_table() and
>>>> copy directly from ecc_table.
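>>>>
>>>> Something along these lines (a rough sketch only, reusing the
>>>> smu_cmn_update_table() call from your patch; untested):
>>>>
>>>> 	static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>>>> 					      void *table)
>>>> 	{
>>>> 		struct smu_table_context *smu_table = &smu->smu_table;
>>>> 		struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
>>>> 		EccInfoTable_t *ecc_table;
>>>> 		int i, ret;
>>>>
>>>> 		/* Fetch the ECC INFO table from SMU into the staging buffer */
>>>> 		ret = smu_cmn_update_table(smu, SMU_TABLE_ECCINFO, 0,
>>>> 					   smu_table->ecc_table, false);
>>>> 		if (ret) {
>>>> 			dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
>>>> 			return ret;
>>>> 		}
>>>>
>>>> 		/* Copy per-channel data straight out of the staging buffer */
>>>> 		ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
>>>> 		for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
>>>> 			eccinfo->ecc[i].ce_count_lo_chip =
>>>> 				ecc_table->EccInfo[i].ce_count_lo_chip;
>>>> 			eccinfo->ecc[i].ce_count_hi_chip =
>>>> 				ecc_table->EccInfo[i].ce_count_hi_chip;
>>>> 			eccinfo->ecc[i].mca_umc_status =
>>>> 				ecc_table->EccInfo[i].mca_umc_status;
>>>> 			eccinfo->ecc[i].mca_umc_addr =
>>>> 				ecc_table->EccInfo[i].mca_umc_addr;
>>>> 		}
>>>>
>>>> 		return 0;
>>>> 	}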
>>> [Yang, Stanley] This design is meant to protect ecc_table in the future in
>>> case multiple threads call smu_cmn_get_ecc_info_table() at the same time;
>>> if necessary it should take a mutex, just like the metrics table handling.
>>> The current test case is simple, so I did not do that.
>> This is not like the metrics table use case. RAS error harvesting is not a
>> multithreaded case. The error registers are cleared after reading, so I
>> thought it is always expected to be one user at a time. Besides, I don't
>> know of a case where the driver needs to report errors from multiple threads.
> 
> [Yang, Stanley] Not RAS error harvesting; considering the debugfs node umc_error_cnt and the sysfs node umc_error_cnt, is there any mechanism that ensures users read them from only one thread at a time?

I see. For that, I suggest protecting smu_get_ecc_info() with
	mutex_lock(&smu->mutex);

We usually do that for swsmu APIs, and it seems reasonable for the
sysfs/debugfs cases.
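
A minimal sketch of that pattern (the exact signature of smu_get_ecc_info()
is assumed here, mirroring the other swsmu wrappers):

	int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc)
	{
		int ret = -EOPNOTSUPP;

		mutex_lock(&smu->mutex);

		if (smu->ppt_funcs && smu->ppt_funcs->get_ecc_info)
			ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc);

		mutex_unlock(&smu->mutex);

		return ret;
	}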

Thanks,
Lijo

>>
>> Thanks,
>> Lijo
>>>
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>> +
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>>     void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev,
>>>>> uint8_t
>>>> crev)
>>>>>     {
>>>>>     	struct metrics_table_header *header = (struct
>>>>> metrics_table_header *)table; diff --git
>>>>> a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>>> index beea03810bca..0adc5451373b 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>>> @@ -105,6 +105,9 @@ int smu_cmn_get_metrics_table(struct
>>>> smu_context *smu,
>>>>>     			      void *metrics_table,
>>>>>     			      bool bypass_cache);
>>>>>
>>>>> +int smu_cmn_get_ecc_info_table(struct smu_context *smu,
>>>>> +			      void *table);
>>>>> +
>>>>>     void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev,
>>>>> uint8_t crev);
>>>>>
>>>>>     int smu_cmn_set_mp1_state(struct smu_context *smu,
>>>>>

