[PATCH Review 4/4] query umc error info from ecc_table v2
Stanley.Yang
Stanley.Yang at amd.com
Thu Nov 18 09:33:44 UTC 2021
if smu support ECCTABLE, driver can message smu to get ecc_table
then query umc error info from ECCTABLE
v2:
optimize source code makes logical more reasonable
Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 42 +++++++++++----
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 72 +++++++++++++++++--------
2 files changed, 83 insertions(+), 31 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 90f0db3b4f65..3a4e483cd5e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -888,6 +888,38 @@ void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
}
}
+static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
+{
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ int ret = 0;
+
+ /*
+ * choosing right query method according to
+ * whether smu support query error information
+ */
+ ret = smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc));
+ if (ret == -EOPNOTSUPP) {
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_count)
+ adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
+
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_address)
+ adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
+ } else if (!ret) {
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->ecc_info_query_ras_error_count)
+ adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, err_data);
+
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->ecc_info_query_ras_error_address)
+ adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, err_data);
+ }
+}
+
/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info)
@@ -901,15 +933,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__UMC:
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_count)
- adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
- /* umc query_ras_error_address is also responsible for clearing
- * error status
- */
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_address)
- adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
+ amdgpu_ras_get_ecc_info(adev, &err_data);
break;
case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->query_ras_error_count) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 0c7c56a91b25..2b37b1c293b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -95,30 +95,58 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ int ret = 0;
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_count)
- adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
-
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_address &&
- adev->umc.max_ras_err_cnt_per_query) {
- err_data->err_addr =
- kcalloc(adev->umc.max_ras_err_cnt_per_query,
- sizeof(struct eeprom_table_record), GFP_KERNEL);
-
- /* still call query_ras_error_address to clear error status
- * even NOMEM error is encountered
- */
- if(!err_data->err_addr)
- dev_warn(adev->dev, "Failed to alloc memory for "
- "umc error address record!\n");
-
- /* umc query_ras_error_address is also responsible for clearing
- * error status
- */
- adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
+ ret = smu_get_ecc_info(&adev->smu, (void *)&(con->umc_ecc));
+ if (ret == -EOPNOTSUPP) {
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_count)
+ adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
+
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_address &&
+ adev->umc.max_ras_err_cnt_per_query) {
+ err_data->err_addr =
+ kcalloc(adev->umc.max_ras_err_cnt_per_query,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+ /* still call query_ras_error_address to clear error status
+ * even NOMEM error is encountered
+ */
+ if(!err_data->err_addr)
+ dev_warn(adev->dev, "Failed to alloc memory for "
+ "umc error address record!\n");
+
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
+ adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
+ }
+ } else if (!ret) {
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->ecc_info_query_ras_error_count)
+ adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, ras_error_status);
+
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->ecc_info_query_ras_error_address &&
+ adev->umc.max_ras_err_cnt_per_query) {
+ err_data->err_addr =
+ kcalloc(adev->umc.max_ras_err_cnt_per_query,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+ /* still call query_ras_error_address to clear error status
+ * even NOMEM error is encountered
+ */
+ if(!err_data->err_addr)
+ dev_warn(adev->dev, "Failed to alloc memory for "
+ "umc error address record!\n");
+
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
+ adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, ras_error_status);
+ }
}
/* only uncorrectable error needs gpu reset */
--
2.17.1
More information about the amd-gfx
mailing list