[PATCH Review 4/4] query umc error info from ecc_table
Stanley.Yang
Stanley.Yang at amd.com
Wed Nov 17 10:11:32 UTC 2021
If the SMU supports ECCTABLE, the driver can message the SMU to get the ecc_table
and then query UMC error info from ECCTABLE.
Apply a PMFW version check to ensure backward compatibility.
Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 42 ++++++++---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 7 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 71 +++++++++++++------
drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h | 1 +
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 12 ++++
.../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 4 ++
6 files changed, 107 insertions(+), 30 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 90f0db3b4f65..6b0f2ba1e420 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -888,6 +888,38 @@ void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
}
}
+static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
+{
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+ /*
+ * Pick the query method: use the SMU ECC table when the recorded SMU
+ * version supports it and smu_get_ecc_info() succeeds, else query umc.
+ */
+ if ((ras->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
+ !smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc))) {
+
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->message_smu_query_ras_error_count)
+ adev->umc.ras_funcs->message_smu_query_ras_error_count(adev, err_data);
+
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->message_smu_query_ras_error_address)
+ adev->umc.ras_funcs->message_smu_query_ras_error_address(adev, err_data);
+ } else {
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_count)
+ adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
+
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_address)
+ adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
+ }
+}
+
/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info)
@@ -901,15 +933,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__UMC:
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_count)
- adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
- /* umc query_ras_error_address is also responsible for clearing
- * error status
- */
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_address)
- adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
+ amdgpu_ras_get_ecc_info(adev, &err_data);
break;
case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->query_ras_error_count) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bcbf3264d92f..3f0de0cc8403 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -322,6 +322,12 @@ struct ras_common_if {
#define MAX_UMC_CHANNEL_NUM 32
+/*
+ * SMU supports ECCTABLE since version 68.42.0;
+ * use this to decide which umc error info query method to use
+ */
+#define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
+
struct ecc_info_per_ch {
uint16_t ce_count_lo_chip;
uint16_t ce_count_hi_chip;
@@ -375,6 +381,7 @@ struct amdgpu_ras {
/* record umc error info queried from smu */
struct umc_ecc_info umc_ecc;
+ uint32_t smu_version;
};
struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 0c7c56a91b25..2c3e97c9410b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -97,28 +97,57 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_count)
- adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_address &&
- adev->umc.max_ras_err_cnt_per_query) {
- err_data->err_addr =
- kcalloc(adev->umc.max_ras_err_cnt_per_query,
- sizeof(struct eeprom_table_record), GFP_KERNEL);
-
- /* still call query_ras_error_address to clear error status
- * even NOMEM error is encountered
- */
- if(!err_data->err_addr)
- dev_warn(adev->dev, "Failed to alloc memory for "
- "umc error address record!\n");
-
- /* umc query_ras_error_address is also responsible for clearing
- * error status
- */
- adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
+ if ((con->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
+ !smu_get_ecc_info(&adev->smu, (void *)&(con->umc_ecc))) {
+
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->message_smu_query_ras_error_count)
+ adev->umc.ras_funcs->message_smu_query_ras_error_count(adev, ras_error_status);
+
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->message_smu_query_ras_error_address &&
+ adev->umc.max_ras_err_cnt_per_query) {
+ err_data->err_addr =
+ kcalloc(adev->umc.max_ras_err_cnt_per_query,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+ /* still call query_ras_error_address to clear error status
+ * even NOMEM error is encountered
+ */
+ if(!err_data->err_addr)
+ dev_warn(adev->dev, "Failed to alloc memory for "
+ "umc error address record!\n");
+
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
+ adev->umc.ras_funcs->message_smu_query_ras_error_address(adev, ras_error_status);
+ }
+ } else {
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_count)
+ adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
+
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_address &&
+ adev->umc.max_ras_err_cnt_per_query) {
+ err_data->err_addr =
+ kcalloc(adev->umc.max_ras_err_cnt_per_query,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+ /* still call query_ras_error_address to clear error status
+ * even NOMEM error is encountered
+ */
+ if(!err_data->err_addr)
+ dev_warn(adev->dev, "Failed to alloc memory for "
+ "umc error address record!\n");
+
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
+ adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
+ }
}
/* only uncorrectable error needs gpu reset */
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index ea65de0160c3..7a06021a58f0 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -1404,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu, bool enable);
int smu_wait_for_event(struct amdgpu_device *adev, enum smu_event_type event,
uint64_t event_arg);
+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);
#endif
#endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 01168b8955bf..6340c079f35e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -3072,6 +3072,18 @@ int smu_set_light_sbr(struct smu_context *smu, bool enable)
return ret;
}
+/*
+ * Forward the request to ppt_funcs->get_ecc_info. Returns that hook's
+ * result, or -EOPNOTSUPP when no such hook is registered.
+ */
+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc)
+{
+ if (smu->ppt_funcs && smu->ppt_funcs->get_ecc_info)
+ return smu->ppt_funcs->get_ecc_info(smu, umc_ecc);
+
+ return -EOPNOTSUPP;
+}
+
static int smu_get_prv_buffer_details(void *handle, void **addr, size_t *size)
{
struct smu_context *smu = handle;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 55421ea622fb..55ef10ca684a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -200,11 +200,15 @@ int smu_v13_0_check_fw_version(struct smu_context *smu)
uint16_t smu_major;
uint8_t smu_minor, smu_debug;
int ret = 0;
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(smu->adev);
ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
if (ret)
return ret;
+ /* record smu firmware version to help choose the umc error query method */
+ ras->smu_version = smu_version;
+
smu_major = (smu_version >> 16) & 0xffff;
smu_minor = (smu_version >> 8) & 0xff;
smu_debug = (smu_version >> 0) & 0xff;
--
2.17.1
More information about the amd-gfx
mailing list