[PATCH Review 1/1] drm/amdgpu: support send bad channel info to smu
Stanley.Yang
Stanley.Yang at amd.com
Tue Mar 1 13:30:22 UTC 2022
Message SMU bad channel information bitmap to update OOB table
Change-Id: I49a79af64d5263c28db059ecb8b8405a471431b4
Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++
.../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 25 ++++++++++-
.../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 4 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 +++
drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 12 ++++++
drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 1 +
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 10 +++++
drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 7 +++
.../pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h | 3 +-
drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 +-
.../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 43 +++++++++++++++++++
12 files changed, 119 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d3875618ebf5..f9104f99eb9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
mutex_init(&con->recovery_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
+ con->eeprom_control.bad_channel_bitmap = 0;
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
@@ -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
goto free;
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
+
+ if (con->update_channel_flag == true) {
+ amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
+ con->update_channel_flag = false;
+ }
}
#ifdef CONFIG_X86_MCE_AMD
@@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
+ con->update_channel_flag = false;
con->features = 0;
INIT_LIST_HEAD(&con->head);
/* Might need get this flag from vbios. */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 7cddaad90d6d..9314fde81e68 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -374,6 +374,9 @@ struct amdgpu_ras {
/* record umc error info queried from smu */
struct umc_ecc_info umc_ecc;
+
+ /* Indicates smu whether need update bad channel info */
+ bool update_channel_flag;
};
struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 2b844a5aafdb..ad5d8667756d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -265,6 +265,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
u8 csum;
int res;
@@ -285,6 +286,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
+ control->bad_channel_bitmap = 0;
+ amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
+ con->update_channel_flag = false;
+
amdgpu_ras_debugfs_set_ret_size(control);
mutex_unlock(&control->ras_tbl_mutex);
@@ -418,6 +423,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
struct eeprom_table_record *record,
const u32 num)
{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
u32 a, b, i;
u8 *buf, *pp;
int res;
@@ -429,9 +435,16 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
/* Encode all of them in one go.
*/
pp = buf;
- for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
+ for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
__encode_table_record_to_buf(control, &record[i], pp);
+ /* update bad channel bitmap */
+ if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
+ control->bad_channel_bitmap |= 1 << record[i].mem_channel;
+ con->update_channel_flag = true;
+ }
+ }
+
/* a, first record index to write into.
* b, last record index to write into.
* a = first index to read (fri) + number of records in the table,
@@ -684,6 +697,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
const u32 num)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int i, res;
u8 *buf, *pp;
u32 g0, g1;
@@ -751,8 +765,15 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
/* Read up everything? Then transform.
*/
pp = buf;
- for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
+ for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
__decode_table_record_from_buf(control, &record[i], pp);
+
+ /* update bad channel bitmap */
+ if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
+ control->bad_channel_bitmap |= 1 << record[i].mem_channel;
+ con->update_channel_flag = true;
+ }
+ }
Out:
kfree(buf);
mutex_unlock(&control->ras_tbl_mutex);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index 6bb00578bfbb..54d9bfe0881d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control {
/* Protect table access via this mutex.
*/
struct mutex ras_tbl_mutex;
+
+ /* Record channel info which occurred bad pages
+ */
+ u32 bad_channel_bitmap;
};
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 85da6cbaf3b7..aad3c8b4c810 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
amdgpu_ras_save_bad_pages(adev);
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
+
+ if (con->update_channel_flag == true) {
+ amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
+ con->update_channel_flag = false;
+ }
}
if (reset)
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index 1d63f1e8884c..9a892d6d1d7a 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -507,6 +507,18 @@ int amdgpu_dpm_send_hbm_bad_pages_num(struct amdgpu_device *adev, uint32_t size)
return ret;
}
+int amdgpu_dpm_send_hbm_bad_channel_flag(struct amdgpu_device *adev, uint32_t size)
+{
+ struct smu_context *smu = adev->powerplay.pp_handle;
+ int ret = 0;
+
+ mutex_lock(&adev->pm.mutex);
+ ret = smu_send_hbm_bad_channel_flag(smu, size);
+ mutex_unlock(&adev->pm.mutex);
+
+ return ret;
+}
+
int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev,
enum pp_clock_type type,
uint32_t *min,
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
index ddfa55b59d02..3e78b3057277 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
@@ -412,6 +412,7 @@ void amdgpu_dpm_enable_jpeg(struct amdgpu_device *adev, bool enable);
int amdgpu_pm_load_smu_firmware(struct amdgpu_device *adev, uint32_t *smu_version);
int amdgpu_dpm_handle_passthrough_sbr(struct amdgpu_device *adev, bool enable);
int amdgpu_dpm_send_hbm_bad_pages_num(struct amdgpu_device *adev, uint32_t size);
+int amdgpu_dpm_send_hbm_bad_channel_flag(struct amdgpu_device *adev, uint32_t size);
int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev,
enum pp_clock_type type,
uint32_t *min,
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 7e79a67bb8ef..f1544755d8b4 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -3052,3 +3052,13 @@ int smu_send_hbm_bad_pages_num(struct smu_context *smu, uint32_t size)
return ret;
}
+
+int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t size)
+{
+ int ret = 0;
+
+ if (smu->ppt_funcs && smu->ppt_funcs->send_hbm_bad_channel_flag)
+ ret = smu->ppt_funcs->send_hbm_bad_channel_flag(smu, size);
+
+ return ret;
+}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index fbef3ab8d487..ef57b6089c69 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -1292,6 +1292,12 @@ struct pptable_funcs {
* @set_config_table: Apply the input DriverSmuConfig table settings.
*/
int (*set_config_table)(struct smu_context *smu, struct config_table_setting *table);
+
+ /**
+ * @sned_hbm_bad_channel_flag: message SMU to update bad channel info
+ * of SMUBUS table.
+ */
+ int (*send_hbm_bad_channel_flag)(struct smu_context *smu, uint32_t size);
};
typedef enum {
@@ -1428,5 +1434,6 @@ int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);
int smu_stb_collect_info(struct smu_context *smu, void *buff, uint32_t size);
void amdgpu_smu_stb_debug_fs_init(struct amdgpu_device *adev);
int smu_send_hbm_bad_pages_num(struct smu_context *smu, uint32_t size);
+int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t size);
#endif
#endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h
index ab66a4b9e438..0f498baf6838 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h
@@ -103,7 +103,8 @@
#define PPSMC_MSG_GfxDriverResetRecovery 0x42
#define PPSMC_MSG_BoardPowerCalibration 0x43
#define PPSMC_MSG_HeavySBR 0x45
-#define PPSMC_Message_Count 0x46
+#define PPSMC_MSG_SetBadHBMPagesRetiredFlagsPerChannel 0x46
+#define PPSMC_Message_Count 0x47
//PPSMC Reset Types
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
index d787c3b9fc52..9f6f306eeca0 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
@@ -232,7 +232,8 @@
__SMU_DUMMY_MAP(ForceGfxVid), \
__SMU_DUMMY_MAP(Spare0), \
__SMU_DUMMY_MAP(UnforceGfxVid), \
- __SMU_DUMMY_MAP(HeavySBR),
+ __SMU_DUMMY_MAP(HeavySBR), \
+ __SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel),
#undef __SMU_DUMMY_MAP
#define __SMU_DUMMY_MAP(type) SMU_MSG_##type
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 890acc4e2cb8..e5e249968244 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -82,6 +82,12 @@
*/
#define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
+/*
+ * SMU support BAD CHENNEL info MSG since version 68.51.00,
+ * use this to check ECCTALE feature whether support
+ */
+#define SUPPORT_BAD_CHANNEL_INFO_MSG_VERSION 0x00443300
+
static const struct smu_temperature_range smu13_thermal_policy[] =
{
{-273150, 99000, 99000, -273150, 99000, 99000, -273150, 99000, 99000},
@@ -140,6 +146,7 @@ static const struct cmn2asic_msg_mapping aldebaran_message_map[SMU_MSG_MAX_COUNT
MSG_MAP(GfxDriverResetRecovery, PPSMC_MSG_GfxDriverResetRecovery, 0),
MSG_MAP(BoardPowerCalibration, PPSMC_MSG_BoardPowerCalibration, 0),
MSG_MAP(HeavySBR, PPSMC_MSG_HeavySBR, 0),
+ MSG_MAP(SetBadHBMPagesRetiredFlagsPerChannel, PPSMC_MSG_SetBadHBMPagesRetiredFlagsPerChannel, 0),
};
static const struct cmn2asic_mapping aldebaran_clk_map[SMU_CLK_COUNT] = {
@@ -1997,6 +2004,41 @@ static int aldebaran_smu_send_hbm_bad_page_num(struct smu_context *smu,
return ret;
}
+static int aldebaran_check_bad_channel_info_support(struct smu_context *smu)
+{
+ uint32_t if_version = 0xff, smu_version = 0xff;
+ int ret = 0;
+
+ ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
+ if (ret) {
+ /* return not support if failed get smu_version */
+ ret = -EOPNOTSUPP;
+ }
+
+ if (smu_version < SUPPORT_BAD_CHANNEL_INFO_MSG_VERSION)
+ ret = -EOPNOTSUPP;
+
+ return ret;
+}
+
+static int aldebaran_send_hbm_bad_channel_flag(struct smu_context *smu,
+ uint32_t size)
+{
+ int ret = 0;
+
+ ret = aldebaran_check_bad_channel_info_support(smu);
+ if (ret)
+ return ret;
+
+ /* message SMU to update the bad channel info on SMUBUS */
+ ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetBadHBMPagesRetiredFlagsPerChannel, size, NULL);
+ if (ret)
+ dev_err(smu->adev->dev, "[%s] failed to message SMU to update HBM bad channel info\n",
+ __func__);
+
+ return ret;
+}
+
static const struct pptable_funcs aldebaran_ppt_funcs = {
/* init dpm */
.get_allowed_feature_mask = aldebaran_get_allowed_feature_mask,
@@ -2062,6 +2104,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
.i2c_fini = aldebaran_i2c_control_fini,
.send_hbm_bad_pages_num = aldebaran_smu_send_hbm_bad_page_num,
.get_ecc_info = aldebaran_get_ecc_info,
+ .send_hbm_bad_channel_flag = aldebaran_send_hbm_bad_channel_flag,
};
void aldebaran_set_ppt_funcs(struct smu_context *smu)
--
2.17.1
More information about the amd-gfx
mailing list