RE: 回复: [PATCH Review 3/4] drm/amdgpu: add message smu to get ecc_table v2

Thu Nov 18 14:43:54 UTC 2021

[AMD Official Use Only]

Series looks good to me.

Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>

Regards,
Hawking

-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar at amd.com> 
Sent: Thursday, November 18, 2021 22:41
To: Yang, Stanley <Stanley.Yang at amd.com>; amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Clements, John <John.Clements at amd.com>; Quan, Evan <Evan.Quan at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
Subject: Re: 回复: [PATCH Review 3/4] drm/amdgpu: add message smu to get ecc_table v2

On 11/18/2021 6:05 PM, Yang, Stanley wrote:
> [AMD Official Use Only]
> 
> 
> 
>> -----邮件原件-----
>> 发件人: Lazar, Lijo <Lijo.Lazar at amd.com>
>> 发送时间: Thursday, November 18, 2021 7:33 PM
>> 收件人: Yang, Stanley <Stanley.Yang at amd.com>; amd- 
>> gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; 
>> Clements, John <John.Clements at amd.com>; Quan, Evan 
>> <Evan.Quan at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
>> 主题: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get 
>> ecc_table v2
>>
>>
>>
>> On 11/18/2021 3:03 PM, Stanley.Yang wrote:
>>> support ECC TABLE message, this table include umc ras error count 
>>> and error address
>>>
>>> v2:
>>>       add smu version check to query whether support ecctable
>>>       call smu_cmn_update_table to get ecctable directly
>>>
>>> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h       |  8 +++
>>>    drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 14 ++++
>>>    .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 70
>> +++++++++++++++++++
>>>    .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  2 +
>>>    4 files changed, 94 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>> b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>> index 3557f4e7fc30..7a06021a58f0 100644
>>> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>> @@ -324,6 +324,7 @@ enum smu_table_id
>>>    	SMU_TABLE_OVERDRIVE,
>>>    	SMU_TABLE_I2C_COMMANDS,
>>>    	SMU_TABLE_PACE,
>>> +	SMU_TABLE_ECCINFO,
>>>    	SMU_TABLE_COUNT,
>>>    };
>>>
>>> @@ -340,6 +341,7 @@ struct smu_table_context
>>>    	void				*max_sustainable_clocks;
>>>    	struct smu_bios_boot_up_values	boot_values;
>>>    	void                            *driver_pptable;
>>> +	void                            *ecc_table;
>>>    	struct smu_table		tables[SMU_TABLE_COUNT];
>>>    	/*
>>>    	 * The driver table is just a staging buffer for @@ -1261,6
>>> +1263,11 @@ struct pptable_funcs {
>>>    	 *
>> 		of SMUBUS table.
>>>    	 */
>>>    	int (*send_hbm_bad_pages_num)(struct smu_context *smu,
>> uint32_t
>>> size);
>>> +
>>> +	/**
>>> +	 * @get_ecc_table:  message SMU to get ECC INFO table.
>>> +	 */
>>> +	ssize_t (*get_ecc_info)(struct smu_context *smu, void *table);
>>>    };
>>>
>>>    typedef enum {
>>> @@ -1397,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu, 
>>> bool enable);
>>>
>>>    int smu_wait_for_event(struct amdgpu_device *adev, enum
>> smu_event_type event,
>>>    		       uint64_t event_arg);
>>> +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);
>>>
>>>    #endif
>>>    #endif
>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>> index 01168b8955bf..fd3b6b460b12 100644
>>> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>> @@ -3072,6 +3072,20 @@ int smu_set_light_sbr(struct smu_context 
>>> *smu,
>> bool enable)
>>>    	return ret;
>>>    }
>>>
>>> +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc) {
>>> +	int ret = -EOPNOTSUPP;
>>> +
>>> +	mutex_lock(&smu->mutex);
>>> +	if (smu->ppt_funcs &&
>>> +		smu->ppt_funcs->get_ecc_info)
>>> +		ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc);
>>> +	mutex_unlock(&smu->mutex);
>>> +
>>> +	return ret;
>>> +
>>> +}
>>> +
>>>    static int smu_get_prv_buffer_details(void *handle, void **addr, 
>>> size_t
>> *size)
>>>    {
>>>    	struct smu_context *smu = handle; diff --git 
>>> a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>> index f835d86cc2f5..4c21609ccea5 100644
>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>> @@ -78,6 +78,12 @@
>>>
>>>    #define smnPCIE_ESM_CTRL			0x111003D0
>>>
>>> +/*
>>> + * SMU support ECCTABLE since version 68.42.0,
>>> + * use this to check ECCTALE feature whether support  */ #define 
>>> +SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
>>> +
>>>    static const struct smu_temperature_range smu13_thermal_policy[] =
>>>    {
>>>    	{-273150,  99000, 99000, -273150, 99000, 99000, -273150, 99000, 
>>> 99000}, @@ -190,6 +196,7 @@ static const struct cmn2asic_mapping
>> aldebaran_table_map[SMU_TABLE_COUNT] = {
>>>    	TAB_MAP(SMU_METRICS),
>>>    	TAB_MAP(DRIVER_SMU_CONFIG),
>>>    	TAB_MAP(I2C_COMMANDS),
>>> +	TAB_MAP(ECCINFO),
>>>    };
>>>
>>>    static const uint8_t aldebaran_throttler_map[] = { @@ -223,6 
>>> +230,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
>>>    	SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS,
>> sizeof(SwI2cRequest_t),
>>>    		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
>>>
>>> +	SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO,
>> sizeof(EccInfoTable_t),
>>> +		       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
>>> +
>>>    	smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t),
>> GFP_KERNEL);
>>>    	if (!smu_table->metrics_table)
>>>    		return -ENOMEM;
>>> @@ -235,6 +245,10 @@ static int aldebaran_tables_init(struct 
>>> smu_context
>> *smu)
>>>    		return -ENOMEM;
>>>    	}
>>>
>>> +	smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size,
>> GFP_KERNEL);
>>> +	if (!smu_table->ecc_table)
>>> +		return -ENOMEM;
>>> +
>>>    	return 0;
>>>    }
>>>
>>> @@ -1765,6 +1779,61 @@ static ssize_t 
>>> aldebaran_get_gpu_metrics(struct
>> smu_context *smu,
>>>    	return sizeof(struct gpu_metrics_v1_3);
>>>    }
>>>
>>> +static int aldebaran_check_ecc_table_support(struct smu_context 
>>> +*smu) {
>>> +	uint32_t if_version = 0xff, smu_version = 0xff;
>>> +	int ret = 0;
>>> +
>>> +	ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
>>> +	if (ret)
>>> +		ret = -EOPNOTSUPP;	// return not support if failed get

Nitpick - comment style

>> smu_version
>>> +
>>> +	if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
>>> +		ret = -EOPNOTSUPP;
>>> +
>>> +	return ret;
>>> +}
>>> +
>>> +static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>>> +					 void *table)
>>> +{
>>> +	struct smu_table_context *smu_table = &smu->smu_table;
>>> +	EccInfoTable_t *ecc_table = NULL;
>>> +	struct ecc_info_per_ch *ecc_info_per_channel = NULL;
>>> +	int i, ret = 0;
>>> +	struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
>>> +
>>
>> Missed to ask last time. Since umc_ecc_info is a common struct, do 
>> you also want to pass back the number of channels having data?
>>
>> Now this struct can hold max of 32 channel data. Let's say if the 
>> same interface is going to be used on another ASIC X having only 16 channels.
>> Then the callback for ASIC X fills data only for 16 channels. Or, you 
>> expect that to be taken care at the caller side?
> 
> [Yang, Stanley] : If ASIC X have only 16 channels, the callback only fill data for 16 channels, and caller side also need consider its own channel number to handle with umc_ecc_info.
> 

Thanks for the details. With the nitpick above and Evan's comments on the patch subject addressed, patches 1 and 3 are

Reviewed-by: Lijo Lazar <lijo.lazar at amd.com>

2 and 4 look good to me. Hawking or John should a take look though.

Thanks,
Lijo

>>
>> Thanks,
>> Lijo
>>
>>> +	ret = aldebaran_check_ecc_table_support(smu);
>>> +	if (ret)
>>> +		return ret;
>>> +
>>> +	ret = smu_cmn_update_table(smu,
>>> +			       SMU_TABLE_ECCINFO,
>>> +			       0,
>>> +			       smu_table->ecc_table,
>>> +			       false);
>>> +	if (ret) {
>>> +		dev_info(smu->adev->dev, "Failed to export SMU ecc
>> table!\n");
>>> +		return ret;
>>> +	}
>>> +
>>> +	ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
>>> +
>>> +	for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
>>> +		ecc_info_per_channel = &(eccinfo->ecc[i]);
>>> +		ecc_info_per_channel->ce_count_lo_chip =
>>> +			ecc_table->EccInfo[i].ce_count_lo_chip;
>>> +		ecc_info_per_channel->ce_count_hi_chip =
>>> +			ecc_table->EccInfo[i].ce_count_hi_chip;
>>> +		ecc_info_per_channel->mca_umc_status =
>>> +			ecc_table->EccInfo[i].mca_umc_status;
>>> +		ecc_info_per_channel->mca_umc_addr =
>>> +			ecc_table->EccInfo[i].mca_umc_addr;
>>> +	}
>>> +
>>> +	return ret;
>>> +}
>>> +
>>>    static int aldebaran_mode1_reset(struct smu_context *smu)
>>>    {
>>>    	u32 smu_version, fatal_err, param; @@ -1967,6 +2036,7 @@ static 
>>> const struct pptable_funcs
>> aldebaran_ppt_funcs = {
>>>    	.i2c_init = aldebaran_i2c_control_init,
>>>    	.i2c_fini = aldebaran_i2c_control_fini,
>>>    	.send_hbm_bad_pages_num =
>> aldebaran_smu_send_hbm_bad_page_num,
>>> +	.get_ecc_info = aldebaran_get_ecc_info,
>>>    };
>>>
>>>    void aldebaran_set_ppt_funcs(struct smu_context *smu) diff --git 
>>> a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>>> index 4d96099a9bb1..55421ea622fb 100644
>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>>> @@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct
>> smu_context *smu)
>>>    	kfree(smu_table->hardcode_pptable);
>>>    	smu_table->hardcode_pptable = NULL;
>>>
>>> +	kfree(smu_table->ecc_table);
>>>    	kfree(smu_table->metrics_table);
>>>    	kfree(smu_table->watermarks_table);
>>> +	smu_table->ecc_table = NULL;
>>>    	smu_table->metrics_table = NULL;
>>>    	smu_table->watermarks_table = NULL;
>>>    	smu_table->metrics_time = 0;
>>>