[PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable
Wang, Yang(Kevin)
KevinYang.Wang at amd.com
Mon May 23 08:48:48 UTC 2022
[AMD Official Use Only - General]
________________________________
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> on behalf of Stanley.Yang <Stanley.Yang at amd.com>
Sent: Monday, May 23, 2022 4:17 PM
To: amd-gfx at lists.freedesktop.org <amd-gfx at lists.freedesktop.org>; Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Quan, Evan <Evan.Quan at amd.com>; Lazar, Lijo <Lijo.Lazar at amd.com>
Cc: Yang, Stanley <Stanley.Yang at amd.com>
Subject: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in ecctable
SMU add a new variable mca_ceumc_addr to record
umc correctable error address in EccInfo table,
driver side add ecctable_v2 to support this feature
Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 2 +
.../inc/pmfw_if/smu13_driver_if_aldebaran.h | 15 +++
.../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 101 ++++++++++++++----
.../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 2 +
5 files changed, 98 insertions(+), 23 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b9a6fac2b8b2..28e603243b67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -328,6 +328,7 @@ struct ecc_info_per_ch {
uint16_t ce_count_hi_chip;
uint64_t mca_umc_status;
uint64_t mca_umc_addr;
+ uint64_t mca_ceumc_addr;
};
struct umc_ecc_info {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index a6a7b6c33683..9f7257ada437 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -322,6 +322,7 @@ enum smu_table_id
SMU_TABLE_PACE,
SMU_TABLE_ECCINFO,
SMU_TABLE_COMBO_PPTABLE,
+ SMU_TABLE_ECCINFO_V2,
SMU_TABLE_COUNT,
};
@@ -340,6 +341,7 @@ struct smu_table_context
void *driver_pptable;
void *combo_pptable;
void *ecc_table;
+ void *ecc_table_v2; // adapt to smu support record mca_ceumc_addr
void *driver_smu_config_table;
struct smu_table tables[SMU_TABLE_COUNT];
/*
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
index 0f67c56c2863..2868604eff49 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
@@ -522,6 +522,21 @@ typedef struct {
EccInfo_t EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
} EccInfoTable_t;
+typedef struct {
+ uint64_t mca_umc_status;
+ uint64_t mca_umc_addr;
+ uint64_t mca_ceumc_addr;
+
+ uint16_t ce_count_lo_chip;
+ uint16_t ce_count_hi_chip;
+
+ uint32_t eccPadding;
+} EccInfo_t_v2;
+
+typedef struct {
+ EccInfo_t_v2 EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
+} EccInfoTable_t_v2;
+
// These defines are used with the following messages:
// SMC_MSG_TransferTableDram2Smu
// SMC_MSG_TransferTableSmu2Dram
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 38af648cb857..e58df9490cec 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -82,6 +82,12 @@
*/
#define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
+/*
+ * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0,
+ * use this to check mca_ceumc_addr record whether support
+ */
+#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
+
/*
* SMU support BAD CHENNEL info MSG since version 68.51.00,
* use this to check ECCTALE feature whether support
@@ -239,6 +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+ SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2, sizeof(EccInfoTable_t_v2),
+ PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+
[kevin]:
this table mapping is not needed, the reason as below.
smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
if (!smu_table->metrics_table)
return -ENOMEM;
@@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
if (!smu_table->ecc_table)
return -ENOMEM;
+ smu_table->ecc_table_v2 = kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL);
+ if (!smu_table->ecc_table_v2)
+ return -ENOMEM;;
+
[kevin]:
add eccinfo table v2 is not needed for this case, this table is only used store table data from pmfw to driver,
you can create a large enough table which can save ecc table data directly.
e.g:
size = max(sizeof(EccInfoTable_t_v2), sizeof(EccInfoTable_t));
smu_table->ecc_table = kzalloc(size, GFP_KERNEL);
Best Regards,
Kevin
return 0;
}
@@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
return sizeof(struct gpu_metrics_v1_3);
}
-static int aldebaran_check_ecc_table_support(struct smu_context *smu)
+static int aldebaran_check_ecc_table_support(struct smu_context *smu,
+ int *ecctable_version)
{
uint32_t if_version = 0xff, smu_version = 0xff;
int ret = 0;
@@ -1815,6 +1829,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)
if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
ret = -EOPNOTSUPP;
+ else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
+ smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
+ *ecctable_version = 1;
+ else
+ *ecctable_version = 2;
return ret;
}
@@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
{
struct smu_table_context *smu_table = &smu->smu_table;
EccInfoTable_t *ecc_table = NULL;
+ EccInfoTable_t_v2 *ecc_table_v2 = NULL;
struct ecc_info_per_ch *ecc_info_per_channel = NULL;
int i, ret = 0;
+ int table_version = 0;
struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
- ret = aldebaran_check_ecc_table_support(smu);
+ ret = aldebaran_check_ecc_table_support(smu, &table_version);
if (ret)
return ret;
- ret = smu_cmn_update_table(smu,
- SMU_TABLE_ECCINFO,
- 0,
- smu_table->ecc_table,
- false);
- if (ret) {
- dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
- return ret;
- }
+ if (table_version == 1) {
+ ret = smu_cmn_update_table(smu,
+ SMU_TABLE_ECCINFO,
+ 0,
+ smu_table->ecc_table,
+ false);
+ if (ret) {
+ dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
+ return ret;
+ }
+
+ ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
+
+ for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+ ecc_info_per_channel = &(eccinfo->ecc[i]);
+ ecc_info_per_channel->ce_count_lo_chip =
+ ecc_table->EccInfo[i].ce_count_lo_chip;
+ ecc_info_per_channel->ce_count_hi_chip =
+ ecc_table->EccInfo[i].ce_count_hi_chip;
+ ecc_info_per_channel->mca_umc_status =
+ ecc_table->EccInfo[i].mca_umc_status;
+ ecc_info_per_channel->mca_umc_addr =
+ ecc_table->EccInfo[i].mca_umc_addr;
+ }
+ } else if (table_version == 2) {
+ /* still use SMU_TABLE_ECC_INFO index,
+ * smu 68.55.0 add mca_ceumc_addr variable
+ * in EccInfo_t struct to report correctable
+ * error address and the table_id is not changed
+ */
+ ret = smu_cmn_update_table(smu,
+ SMU_TABLE_ECCINFO,
+ 0,
+ smu_table->ecc_table_v2,
+ false);
- ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
-
- for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
- ecc_info_per_channel = &(eccinfo->ecc[i]);
- ecc_info_per_channel->ce_count_lo_chip =
- ecc_table->EccInfo[i].ce_count_lo_chip;
- ecc_info_per_channel->ce_count_hi_chip =
- ecc_table->EccInfo[i].ce_count_hi_chip;
- ecc_info_per_channel->mca_umc_status =
- ecc_table->EccInfo[i].mca_umc_status;
- ecc_info_per_channel->mca_umc_addr =
- ecc_table->EccInfo[i].mca_umc_addr;
+ if (ret) {
+ dev_info(smu->adev->dev, "Failed to export SMU ecc table_v2!\n");
+ return ret;
+ }
+
+ ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2;
+
+ for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+ ecc_info_per_channel = &(eccinfo->ecc[i]);
+ ecc_info_per_channel->ce_count_lo_chip =
+ ecc_table_v2->EccInfo[i].ce_count_lo_chip;
+ ecc_info_per_channel->ce_count_hi_chip =
+ ecc_table_v2->EccInfo[i].ce_count_hi_chip;
+ ecc_info_per_channel->mca_umc_status =
+ ecc_table_v2->EccInfo[i].mca_umc_status;
+ ecc_info_per_channel->mca_umc_addr =
+ ecc_table_v2->EccInfo[i].mca_umc_addr;
+ ecc_info_per_channel->mca_ceumc_addr =
+ ecc_table_v2->EccInfo[i].mca_ceumc_addr;
+ }
}
return ret;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index ae6321af9d88..af2d84a16f3e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
kfree(smu_table->hardcode_pptable);
smu_table->hardcode_pptable = NULL;
+ kfree(smu_table->ecc_table_v2);
kfree(smu_table->ecc_table);
kfree(smu_table->metrics_table);
kfree(smu_table->watermarks_table);
+ smu_table->ecc_table_v2 = NULL;
smu_table->ecc_table = NULL;
smu_table->metrics_table = NULL;
smu_table->watermarks_table = NULL;
--
2.17.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20220523/e520bfa2/attachment-0001.htm>
More information about the amd-gfx
mailing list