Re: 回复: 回复: [PATCH Review 3/4] drm/amdgpu: add message smu to get ecc_table
Lazar, Lijo
lijo.lazar at amd.com
Thu Nov 18 04:58:37 UTC 2021
On 11/18/2021 9:56 AM, Yang, Stanley wrote:
> [AMD Official Use Only]
>
>
>
>> -----邮件原件-----
>> 发件人: Lazar, Lijo <Lijo.Lazar at amd.com>
>> 发送时间: Thursday, November 18, 2021 12:04 PM
>> 收件人: Yang, Stanley <Stanley.Yang at amd.com>; amd-
>> gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>;
>> Clements, John <John.Clements at amd.com>; Quan, Evan
>> <Evan.Quan at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
>> 主题: Re: 回复: [PATCH Review 3/4] drm/amdgpu: add message smu to get
>> ecc_table
>>
>>
>>
>> On 11/18/2021 9:07 AM, Yang, Stanley wrote:
>>> [AMD Official Use Only]
>>>
>>>
>>>
>>>> -----邮件原件-----
>>>> 发件人: Lazar, Lijo <Lijo.Lazar at amd.com>
>>>> 发送时间: Wednesday, November 17, 2021 7:24 PM
>>>> 收件人: Yang, Stanley <Stanley.Yang at amd.com>; amd-
>>>> gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>;
>>>> Clements, John <John.Clements at amd.com>; Quan, Evan
>>>> <Evan.Quan at amd.com>; Wang, Yang(Kevin)
>> <KevinYang.Wang at amd.com>
>>>> 主题: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get
>>>> ecc_table
>>>>
>>>>
>>>>
>>>> On 11/17/2021 3:41 PM, Stanley.Yang wrote:
>>>>> support ECC TABLE message, this table include unc ras error count
>>>>> and error address
>>>>>
>>>>> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
>>>>> ---
>>>>> drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h | 7 ++++
>>>>> .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 38
>>>> +++++++++++++++++++
>>>>> .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 2 +
>>>>> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 24 ++++++++++++
>>>>> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h | 3 ++
>>>>> 5 files changed, 74 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>>> b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>>> index 3557f4e7fc30..ea65de0160c3 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>>> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
>>>>> @@ -324,6 +324,7 @@ enum smu_table_id
>>>>> SMU_TABLE_OVERDRIVE,
>>>>> SMU_TABLE_I2C_COMMANDS,
>>>>> SMU_TABLE_PACE,
>>>>> + SMU_TABLE_ECCINFO,
>>>>> SMU_TABLE_COUNT,
>>>>> };
>>>>>
>>>>> @@ -340,6 +341,7 @@ struct smu_table_context
>>>>> void *max_sustainable_clocks;
>>>>> struct smu_bios_boot_up_values boot_values;
>>>>> void *driver_pptable;
>>>>> + void *ecc_table;
>>>>> struct smu_table tables[SMU_TABLE_COUNT];
>>>>> /*
>>>>> * The driver table is just a staging buffer for @@ -1261,6
>>>>> +1263,11 @@ struct pptable_funcs {
>>>>> *
>>>> of SMUBUS table.
>>>>> */
>>>>> int (*send_hbm_bad_pages_num)(struct smu_context *smu,
>>>> uint32_t
>>>>> size);
>>>>> +
>>>>> + /**
>>>>> + * @get_ecc_table: message SMU to get ECC INFO table.
>>>>> + */
>>>>> + ssize_t (*get_ecc_info)(struct smu_context *smu, void *table);
>>>>> };
>>>>>
>>>>> typedef enum {
>>>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>>> index f835d86cc2f5..5e4ba0e14a91 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>>>>> @@ -190,6 +190,7 @@ static const struct cmn2asic_mapping
>>>> aldebaran_table_map[SMU_TABLE_COUNT] = {
>>>>> TAB_MAP(SMU_METRICS),
>>>>> TAB_MAP(DRIVER_SMU_CONFIG),
>>>>> TAB_MAP(I2C_COMMANDS),
>>>>> + TAB_MAP(ECCINFO),
>>>>> };
>>>>>
>>>>> static const uint8_t aldebaran_throttler_map[] = { @@ -223,6
>>>>> +224,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
>>>>> SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS,
>>>> sizeof(SwI2cRequest_t),
>>>>> PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
>>>>>
>>>>> + SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO,
>>>> sizeof(EccInfoTable_t),
>>>>> + PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
>>>>> +
>>>>> smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t),
>>>> GFP_KERNEL);
>>>>> if (!smu_table->metrics_table)
>>>>> return -ENOMEM;
>>>>> @@ -235,6 +239,10 @@ static int aldebaran_tables_init(struct
>>>>> smu_context
>>>> *smu)
>>>>> return -ENOMEM;
>>>>> }
>>>>>
>>>>> + smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size,
>>>> GFP_KERNEL);
>>>>> + if (!smu_table->ecc_table)
>>>>> + return -ENOMEM;
>>>>> +
>>>>> return 0;
>>>>> }
>>>>>
>>>>> @@ -1765,6 +1773,35 @@ static ssize_t
>>>>> aldebaran_get_gpu_metrics(struct
>>>> smu_context *smu,
>>>>> return sizeof(struct gpu_metrics_v1_3);
>>>>> }
>>>>>
>>>>> +static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
>>>>> + void *table)
>>>>> +{
>>>>> + struct smu_table_context *smu_table = &smu->smu_table;
>>>>> + EccInfoTable_t ecc_table;
>>>>> + struct ecc_info_per_ch *ecc_info_per_channel = NULL;
>>>>> + int i, ret = 0;
>>>>> + struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
>>>>> +
>>>>> + ret = smu_cmn_get_ecc_info_table(smu,
>>>>> + &ecc_table);
>>>>> + if (ret)
>>>>> + return ret;
>>>>> +
>>>>> + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
>>>>> + ecc_info_per_channel = &(eccinfo->ecc[i]);
>>>>> + ecc_info_per_channel->ce_count_lo_chip =
>>>>> + ecc_table.EccInfo[i].ce_count_lo_chip;
>>>>> + ecc_info_per_channel->ce_count_hi_chip =
>>>>> + ecc_table.EccInfo[i].ce_count_hi_chip;
>>>>> + ecc_info_per_channel->mca_umc_status =
>>>>> + ecc_table.EccInfo[i].mca_umc_status;
>>>>> + ecc_info_per_channel->mca_umc_addr =
>>>>> + ecc_table.EccInfo[i].mca_umc_addr;
>>>>> + }
>>>>> +
>>>>> + return ret;
>>>>> +}
>>>>> +
>>>>> static int aldebaran_mode1_reset(struct smu_context *smu)
>>>>> {
>>>>> u32 smu_version, fatal_err, param; @@ -1967,6 +2004,7 @@ static
>>>>> const struct pptable_funcs
>>>> aldebaran_ppt_funcs = {
>>>>> .i2c_init = aldebaran_i2c_control_init,
>>>>> .i2c_fini = aldebaran_i2c_control_fini,
>>>>> .send_hbm_bad_pages_num =
>>>> aldebaran_smu_send_hbm_bad_page_num,
>>>>> + .get_ecc_info = aldebaran_get_ecc_info,
>>>>> };
>>>>>
>>>>> void aldebaran_set_ppt_funcs(struct smu_context *smu) diff --git
>>>>> a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>>>>> index 4d96099a9bb1..55421ea622fb 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>>>>> @@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct
>>>> smu_context *smu)
>>>>> kfree(smu_table->hardcode_pptable);
>>>>> smu_table->hardcode_pptable = NULL;
>>>>>
>>>>> + kfree(smu_table->ecc_table);
>>>>> kfree(smu_table->metrics_table);
>>>>> kfree(smu_table->watermarks_table);
>>>>> + smu_table->ecc_table = NULL;
>>>>> smu_table->metrics_table = NULL;
>>>>> smu_table->watermarks_table = NULL;
>>>>> smu_table->metrics_time = 0;
>>>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> index 843d2cbfc71d..e229c9b09d80 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>>>>> @@ -983,6 +983,30 @@ int smu_cmn_get_metrics_table(struct
>>>> smu_context *smu,
>>>>> return ret;
>>>>> }
>>>>>
>>>>> +int smu_cmn_get_ecc_info_table(struct smu_context *smu,
>>>>> + void *ecc_table)
>>>>> +{
>>>>> + struct smu_table_context *smu_table= &smu->smu_table;
>>>>> + uint32_t table_size =
>>>>> + smu_table->tables[SMU_TABLE_ECCINFO].size;
>>>>> + int ret = 0;
>>>>> +
>>>>> + ret = smu_cmn_update_table(smu,
>>>>> + SMU_TABLE_ECCINFO,
>>>>> + 0,
>>>>> + smu_table->ecc_table,
>>>>> + false);
>>>>> + if (ret) {
>>>>> + dev_info(smu->adev->dev, "Failed to export SMU ecc
>>>> table!\n");
>>>>> + return ret;
>>>>> + }
>>>>> +
>>>>> + if (ecc_table)
>>>>> + memcpy(ecc_table, smu_table->ecc_table, table_size);
>>>>
>>>> This copy to another buffer is redundant. You may use ecc_table
>>>> directly in the callback, then this method itself looks unnecessary.
>>>> Instead of calling smu_cmn_get_ecc_info_table(), call
>>>> smu_cmn_update_table() and copy directly from ecc_table.
>>> [Yang, Stanley] This design consider to protect ecc_table in further if multi-
>> thread call smu_cmn_get_ecc_info_table same time, it should add mutex
>> lock just like metrics table handle if it is necessary, but now test case is simple
>> I didn't do that.
>> This is not like a metrics table use case. RAS error harvesting is not a
>> multithread case. The error registers are cleared after reading, so I thought
>> it's always expected to be one user at a time. Besides, I don't know if there is
>> a case where driver needs to report errors from multiple threads.
>
> [Yang, Stanley] not ras error harvesting, considering debugfs node file umc_error_cnt and sysfs node file umc_error_cnt, is there any mechanism ensure user read them only one thread on time?
I see. For that, suggest to protect smu_get_ecc_info() with
mutex_lock(&smu->mutex);
Usually we do that for swsmu APIs and that seems reasonable for
sys/debugfs cases.
Thanks,
Lijo
>>
>> Thanks,
>> Lijo
>>>
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>> +
>>>>> + return 0;
>>>>> +}
>>>>> +
>>>>> void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev,
>>>>> uint8_t
>>>> crev)
>>>>> {
>>>>> struct metrics_table_header *header = (struct
>>>>> metrics_table_header *)table; diff --git
>>>>> a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>>> index beea03810bca..0adc5451373b 100644
>>>>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
>>>>> @@ -105,6 +105,9 @@ int smu_cmn_get_metrics_table(struct
>>>> smu_context *smu,
>>>>> void *metrics_table,
>>>>> bool bypass_cache);
>>>>>
>>>>> +int smu_cmn_get_ecc_info_table(struct smu_context *smu,
>>>>> + void *table);
>>>>> +
>>>>> void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev,
>>>>> uint8_t crev);
>>>>>
>>>>> int smu_cmn_set_mp1_state(struct smu_context *smu,
>>>>>
More information about the amd-gfx
mailing list