回复: [PATCH Review 1/1] drm/amdgpu: support send bad channel info to smu
Yang, Stanley
Stanley.Yang at amd.com
Thu Mar 3 03:21:54 UTC 2022
> -----邮件原件-----
> 发件人: Zhou1, Tao <Tao.Zhou1 at amd.com>
> 发送时间: Wednesday, March 2, 2022 3:45 PM
> 收件人: Yang, Stanley <Stanley.Yang at amd.com>; amd-
> gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>;
> Joo, Maria <Maria.Joo at amd.com>
> 抄送: Yang, Stanley <Stanley.Yang at amd.com>
> 主题: RE: [PATCH Review 1/1] drm/amdgpu: support send bad channel info
> to smu
>
> [AMD Official Use Only]
>
>
>
> > -----Original Message-----
> > From: Stanley.Yang <Stanley.Yang at amd.com>
> > Sent: Tuesday, March 1, 2022 9:30 PM
> > To: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> > <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Joo,
> Maria
> > <Maria.Joo at amd.com>
> > Cc: Yang, Stanley <Stanley.Yang at amd.com>
> > Subject: [PATCH Review 1/1] drm/amdgpu: support send bad channel info
> > to smu
> >
> > Message SMU bad channel information bitmap to update OOB table
> >
> > Change-Id: I49a79af64d5263c28db059ecb8b8405a471431b4
> > Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++
> > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 25 ++++++++++-
> > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 4 ++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 +++
> > drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 12 ++++++
> > drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 1 +
> > drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 10 +++++
> > drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 7 +++
> > .../pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h | 3 +-
> > drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 +-
> > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 43
> +++++++++++++++++++
> > 12 files changed, 119 insertions(+), 4 deletions(-)
>
> [Tao] It's better to split the patch into two parts, one for amdgpu and one for
> pm.
[Yang, Stanley] : yeah, it makes sense, will update.
>
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index d3875618ebf5..f9104f99eb9c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct
> > amdgpu_device
> > *adev)
> > mutex_init(&con->recovery_lock);
> > INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
> > atomic_set(&con->in_recovery, 0);
> > + con->eeprom_control.bad_channel_bitmap = 0;
> >
> > max_eeprom_records_count =
> > amdgpu_ras_eeprom_max_record_count();
> > amdgpu_ras_validate_threshold(adev,
> max_eeprom_records_count); @@
> > -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> > *adev)
> > goto free;
> >
> > amdgpu_dpm_send_hbm_bad_pages_num(adev, con-
> > >eeprom_control.ras_num_recs);
> > +
> > + if (con->update_channel_flag == true) {
> [Tao] It can be simplified to "if (con->update_channel_flag)"
[Yang, Stanley] : Yeah, both "if (con->update_channel_flag)" and "if (con->update_channel_flag == true)" are feasible.
>
> > + amdgpu_dpm_send_hbm_bad_channel_flag(adev,
> con-
> > >eeprom_control.bad_channel_bitmap);
>
> [Tao] Do we need to check the return status of the function and stop
> recovery_init if it fails?
[Yang, Stanley] : No, it doesn't affect the RAS recovery process even if the message to the SMU fails.
>
> > + con->update_channel_flag = false;
> > + }
> > }
> >
> > #ifdef CONFIG_X86_MCE_AMD
> > @@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
> > goto release_con;
> > }
> >
> > + con->update_channel_flag = false;
> > con->features = 0;
> > INIT_LIST_HEAD(&con->head);
> > /* Might need get this flag from vbios. */ diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index 7cddaad90d6d..9314fde81e68 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -374,6 +374,9 @@ struct amdgpu_ras {
> >
> > /* record umc error info queried from smu */
> > struct umc_ecc_info umc_ecc;
> > +
> > + /* Indicates smu whether need update bad channel info */
> > + bool update_channel_flag;
> > };
> >
> > struct ras_fs_data {
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > index 2b844a5aafdb..ad5d8667756d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > @@ -265,6 +265,7 @@ int amdgpu_ras_eeprom_reset_table(struct
> > amdgpu_ras_eeprom_control *control) {
> > struct amdgpu_device *adev = to_amdgpu_device(control);
> > struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
> > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> > u8 csum;
> > int res;
> >
> > @@ -285,6 +286,10 @@ int amdgpu_ras_eeprom_reset_table(struct
> > amdgpu_ras_eeprom_control *control)
> >
> > amdgpu_dpm_send_hbm_bad_pages_num(adev, control-
> > >ras_num_recs);
> >
> > + control->bad_channel_bitmap = 0;
> > + amdgpu_dpm_send_hbm_bad_channel_flag(adev, control-
> > >bad_channel_bitmap);
> > + con->update_channel_flag = false;
> > +
> > amdgpu_ras_debugfs_set_ret_size(control);
> >
> > mutex_unlock(&control->ras_tbl_mutex);
> > @@ -418,6 +423,7 @@ amdgpu_ras_eeprom_append_table(struct
> > amdgpu_ras_eeprom_control *control,
> > struct eeprom_table_record *record,
> > const u32 num)
> > {
> > + struct amdgpu_ras *con =
> > +amdgpu_ras_get_context(to_amdgpu_device(control));
> > u32 a, b, i;
> > u8 *buf, *pp;
> > int res;
> > @@ -429,9 +435,16 @@ amdgpu_ras_eeprom_append_table(struct
> > amdgpu_ras_eeprom_control *control,
> > /* Encode all of them in one go.
> > */
> > pp = buf;
> > - for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
> > + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
> > __encode_table_record_to_buf(control, &record[i], pp);
> >
> > + /* update bad channel bitmap */
> > + if (!(control->bad_channel_bitmap & (1 <<
> > record[i].mem_channel))) {
> > + control->bad_channel_bitmap |= 1 <<
> > record[i].mem_channel;
> > + con->update_channel_flag = true;
> > + }
> > + }
> > +
> > /* a, first record index to write into.
> > * b, last record index to write into.
> > * a = first index to read (fri) + number of records in the table,
> > @@ -
> > 684,6 +697,7 @@ int amdgpu_ras_eeprom_read(struct
> > amdgpu_ras_eeprom_control *control,
> > const u32 num)
> > {
> > struct amdgpu_device *adev = to_amdgpu_device(control);
> > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> > int i, res;
> > u8 *buf, *pp;
> > u32 g0, g1;
> > @@ -751,8 +765,15 @@ int amdgpu_ras_eeprom_read(struct
> > amdgpu_ras_eeprom_control *control,
> > /* Read up everything? Then transform.
> > */
> > pp = buf;
> > - for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
> > + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
> > __decode_table_record_from_buf(control, &record[i], pp);
> > +
> > + /* update bad channel bitmap */
> > + if (!(control->bad_channel_bitmap & (1 <<
> > record[i].mem_channel))) {
> > + control->bad_channel_bitmap |= 1 <<
> > record[i].mem_channel;
> > + con->update_channel_flag = true;
> > + }
> > + }
> > Out:
> > kfree(buf);
> > mutex_unlock(&control->ras_tbl_mutex);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > index 6bb00578bfbb..54d9bfe0881d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > @@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control {
> > /* Protect table access via this mutex.
> > */
> > struct mutex ras_tbl_mutex;
> > +
> > + /* Record channel info which occurred bad pages
> > + */
> > + u32 bad_channel_bitmap;
> > };
> >
> > /*
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > index 85da6cbaf3b7..aad3c8b4c810 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > @@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct
> > amdgpu_device *adev,
> > amdgpu_ras_save_bad_pages(adev);
> >
> > amdgpu_dpm_send_hbm_bad_pages_num(adev,
> con-
> > >eeprom_control.ras_num_recs);
> > +
> > + if (con->update_channel_flag == true) {
> > +
> > amdgpu_dpm_send_hbm_bad_channel_flag(adev, con-
> > >eeprom_control.bad_channel_bitmap);
> > + con->update_channel_flag = false;
> > + }
> > }
> >
> > if (reset)
> > diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> > b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> > index 1d63f1e8884c..9a892d6d1d7a 100644
> > --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> > +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> > @@ -507,6 +507,18 @@ int
> amdgpu_dpm_send_hbm_bad_pages_num(struct
> > amdgpu_device *adev, uint32_t size)
> > return ret;
> > }
> >
> > +int amdgpu_dpm_send_hbm_bad_channel_flag(struct amdgpu_device
> *adev,
> > +uint32_t size) {
> > + struct smu_context *smu = adev->powerplay.pp_handle;
> > + int ret = 0;
> > +
> > + mutex_lock(&adev->pm.mutex);
> > + ret = smu_send_hbm_bad_channel_flag(smu, size);
> > + mutex_unlock(&adev->pm.mutex);
> > +
> > + return ret;
> > +}
> > +
> > int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev,
> > enum pp_clock_type type,
> > uint32_t *min,
> > diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> > b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> > index ddfa55b59d02..3e78b3057277 100644
> > --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> > +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> > @@ -412,6 +412,7 @@ void amdgpu_dpm_enable_jpeg(struct
> amdgpu_device
> > *adev, bool enable); int amdgpu_pm_load_smu_firmware(struct
> > amdgpu_device *adev, uint32_t *smu_version); int
> > amdgpu_dpm_handle_passthrough_sbr(struct amdgpu_device *adev,
> bool
> > enable); int amdgpu_dpm_send_hbm_bad_pages_num(struct
> amdgpu_device
> > *adev, uint32_t size);
> > +int amdgpu_dpm_send_hbm_bad_channel_flag(struct amdgpu_device
> *adev,
> > +uint32_t size);
> > int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev,
> > enum pp_clock_type type,
> > uint32_t *min,
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > index 7e79a67bb8ef..f1544755d8b4 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > @@ -3052,3 +3052,13 @@ int smu_send_hbm_bad_pages_num(struct
> > smu_context *smu, uint32_t size)
> >
> > return ret;
> > }
> > +
> > +int smu_send_hbm_bad_channel_flag(struct smu_context *smu,
> uint32_t
> > +size) {
> > + int ret = 0;
> > +
> > + if (smu->ppt_funcs && smu->ppt_funcs-
> >send_hbm_bad_channel_flag)
> > + ret = smu->ppt_funcs->send_hbm_bad_channel_flag(smu,
> size);
> > +
> > + return ret;
> > +}
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > index fbef3ab8d487..ef57b6089c69 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > @@ -1292,6 +1292,12 @@ struct pptable_funcs {
> > * @set_config_table: Apply the input DriverSmuConfig table settings.
> > */
> > int (*set_config_table)(struct smu_context *smu, struct
> > config_table_setting *table);
> > +
> > + /**
> > + * @sned_hbm_bad_channel_flag: message SMU to update bad
> > channel info
> > + *
> > of SMUBUS table.
> > + */
> > + int (*send_hbm_bad_channel_flag)(struct smu_context *smu,
> uint32_t
> > +size);
> > };
> >
> > typedef enum {
> > @@ -1428,5 +1434,6 @@ int smu_get_ecc_info(struct smu_context *smu,
> > void *umc_ecc); int smu_stb_collect_info(struct smu_context *smu,
> > void *buff, uint32_t size); void amdgpu_smu_stb_debug_fs_init(struct
> > amdgpu_device *adev); int smu_send_hbm_bad_pages_num(struct
> > smu_context *smu, uint32_t size);
> > +int smu_send_hbm_bad_channel_flag(struct smu_context *smu,
> uint32_t
> > +size);
> > #endif
> > #endif
> > diff --git
> > a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h
> > b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h
> > index ab66a4b9e438..0f498baf6838 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h
> > @@ -103,7 +103,8 @@
> > #define PPSMC_MSG_GfxDriverResetRecovery 0x42
> > #define PPSMC_MSG_BoardPowerCalibration 0x43
> > #define PPSMC_MSG_HeavySBR 0x45
> > -#define PPSMC_Message_Count 0x46
> > +#define PPSMC_MSG_SetBadHBMPagesRetiredFlagsPerChannel 0x46
> > +#define PPSMC_Message_Count 0x47
> >
> >
> > //PPSMC Reset Types
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> > b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> > index d787c3b9fc52..9f6f306eeca0 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> > @@ -232,7 +232,8 @@
> > __SMU_DUMMY_MAP(ForceGfxVid), \
> > __SMU_DUMMY_MAP(Spare0), \
> > __SMU_DUMMY_MAP(UnforceGfxVid), \
> > - __SMU_DUMMY_MAP(HeavySBR),
> > + __SMU_DUMMY_MAP(HeavySBR), \
> > + __SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel),
> >
> > #undef __SMU_DUMMY_MAP
> > #define __SMU_DUMMY_MAP(type) SMU_MSG_##type
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > index 890acc4e2cb8..e5e249968244 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > @@ -82,6 +82,12 @@
> > */
> > #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
> >
> > +/*
> > + * SMU support BAD CHENNEL info MSG since version 68.51.00,
> > + * use this to check ECCTALE feature whether support */ #define
> > +SUPPORT_BAD_CHANNEL_INFO_MSG_VERSION 0x00443300
> > +
> > static const struct smu_temperature_range smu13_thermal_policy[] = {
> > {-273150, 99000, 99000, -273150, 99000, 99000, -273150, 99000,
> > 99000}, @@ -140,6 +146,7 @@ static const struct cmn2asic_msg_mapping
> > aldebaran_message_map[SMU_MSG_MAX_COUNT
> > MSG_MAP(GfxDriverResetRecovery,
> > PPSMC_MSG_GfxDriverResetRecovery, 0),
> > MSG_MAP(BoardPowerCalibration,
> > PPSMC_MSG_BoardPowerCalibration, 0),
> > MSG_MAP(HeavySBR, PPSMC_MSG_HeavySBR,
> > 0),
> > + MSG_MAP(SetBadHBMPagesRetiredFlagsPerChannel,
> > PPSMC_MSG_SetBadHBMPagesRetiredFlagsPerChannel, 0),
> > };
> >
> > static const struct cmn2asic_mapping
> aldebaran_clk_map[SMU_CLK_COUNT]
> > = { @@ -1997,6 +2004,41 @@ static int
> > aldebaran_smu_send_hbm_bad_page_num(struct smu_context *smu,
> > return ret;
> > }
> >
> > +static int aldebaran_check_bad_channel_info_support(struct
> > +smu_context
> > +*smu) {
> > + uint32_t if_version = 0xff, smu_version = 0xff;
> > + int ret = 0;
> > +
> > + ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
> > + if (ret) {
> > + /* return not support if failed get smu_version */
> > + ret = -EOPNOTSUPP;
> > + }
> > +
> > + if (smu_version < SUPPORT_BAD_CHANNEL_INFO_MSG_VERSION)
> > + ret = -EOPNOTSUPP;
> > +
> > + return ret;
> > +}
> > +
> > +static int aldebaran_send_hbm_bad_channel_flag(struct smu_context
> *smu,
> > + uint32_t size)
> > +{
> > + int ret = 0;
> > +
> > + ret = aldebaran_check_bad_channel_info_support(smu);
> > + if (ret)
> > + return ret;
> > +
> > + /* message SMU to update the bad channel info on SMUBUS */
> > + ret = smu_cmn_send_smc_msg_with_param(smu,
> > SMU_MSG_SetBadHBMPagesRetiredFlagsPerChannel, size, NULL);
> > + if (ret)
> > + dev_err(smu->adev->dev, "[%s] failed to message SMU to
> > update HBM bad channel info\n",
> > + __func__);
> > +
> > + return ret;
> > +}
> > +
> > static const struct pptable_funcs aldebaran_ppt_funcs = {
> > /* init dpm */
> > .get_allowed_feature_mask =
> aldebaran_get_allowed_feature_mask,
> > @@ -2062,6 +2104,7 @@ static const struct pptable_funcs
> > aldebaran_ppt_funcs = {
> > .i2c_fini = aldebaran_i2c_control_fini,
> > .send_hbm_bad_pages_num =
> > aldebaran_smu_send_hbm_bad_page_num,
> > .get_ecc_info = aldebaran_get_ecc_info,
> > + .send_hbm_bad_channel_flag =
> > aldebaran_send_hbm_bad_channel_flag,
> > };
> >
> > void aldebaran_set_ppt_funcs(struct smu_context *smu)
> > --
> > 2.17.1
More information about the amd-gfx
mailing list