[PATCH 2/6] drm/amdgpu: add ras_err_info to identify RAS error source
Zhang, Hawking
Hawking.Zhang at amd.com
Thu Oct 12 13:18:37 UTC 2023
[AMD Official Use Only - General]
+ dev_info(adev->dev, "Total %ld uncorrectable hardware errors detected in %s block\n",
+ ras_mgr->err_data.ue_count, blk_name);
Please remove "Total" from the kernel message. With that addressed, the series is
Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Yang Wang
Sent: Thursday, October 12, 2023 21:14
To: amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
Subject: [PATCH 2/6] drm/amdgpu: add ras_err_info to identify RAS error source
introduced "ras_err_info" to better identify a RAS ERROR source.
NOTE:
For legacy chips, keep the original RAS error print format.
v1:
RAS errors may come from different dies during a RAS error query, therefore, need a new data structure to identify the source of RAS ERROR.
v2:
- use new data structure 'amdgpu_smuio_mcm_config_info' instead of
ras_err_id (in v1 patch)
- refine ras error dump function name
- refine ras error dump log format
Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 289 ++++++++++++++++++----
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 28 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h | 5 +
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 27 +-
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 7 +-
drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 7 +-
6 files changed, 310 insertions(+), 53 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5fb57419ef77..2522d29806dc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -152,8 +152,9 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address) {
- struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct ras_err_data err_data;
struct eeprom_table_record err_rec;
+ int ret;
if ((address >= adev->gmc.mc_vram_size) ||
(address >= RAS_UMC_INJECT_ADDR_LIMIT)) { @@ -170,6 +171,10 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
return 0;
}
+ ret = amdgpu_ras_error_data_init(&err_data);
+ if (ret)
+ return ret;
+
memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
err_data.err_addr = &err_rec;
amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0); @@ -180,6 +185,8 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
amdgpu_ras_save_bad_pages(adev, NULL);
}
+ amdgpu_ras_error_data_fini(&err_data);
+
dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
dev_warn(adev->dev, "Clear EEPROM:\n");
dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
@@ -1015,17 +1022,113 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
}
}
+static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, struct ras_query_if *query_if,
+ struct ras_err_data *err_data, bool is_ue) {
+ struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
+ const char *blk_name = get_ras_block_str(&query_if->head);
+ struct amdgpu_smuio_mcm_config_info *mcm_info;
+ struct ras_err_node *err_node;
+ struct ras_err_info *err_info;
+
+ if (is_ue)
+ dev_info(adev->dev, "Total %ld uncorrectable hardware errors detected in %s block\n",
+ ras_mgr->err_data.ue_count, blk_name);
+ else
+ dev_info(adev->dev, "Total %ld correctable hardware errors detected in %s block\n",
+ ras_mgr->err_data.ue_count, blk_name);
+
+ for_each_ras_error(err_node, err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
+ if (is_ue && err_info->ue_count) {
+ dev_info(adev->dev, "socket: %d, die: %d, "
+ "%lld uncorrectable hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->ue_count,
+ blk_name);
+ } else if (!is_ue && err_info->ce_count) {
+ dev_info(adev->dev, "socket: %d, die: %d, "
+ "%lld correctable hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->ue_count,
+ blk_name);
+ }
+ }
+}
+
+static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, struct ras_query_if *query_if,
+ struct ras_err_data *err_data) {
+ struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
+ const char *blk_name = get_ras_block_str(&query_if->head);
+
+ if (err_data->ce_count) {
+ if (!list_empty(&err_data->err_node_list)) {
+ amdgpu_ras_error_print_error_data(adev, query_if, err_data, false);
+ } else if (!adev->aid_mask &&
+ adev->smuio.funcs &&
+ adev->smuio.funcs->get_socket_id &&
+ adev->smuio.funcs->get_die_id) {
+ dev_info(adev->dev, "socket: %d, die: %d "
+ "%ld correctable hardware errors "
+ "detected in %s block, no user "
+ "action is needed.\n",
+ adev->smuio.funcs->get_socket_id(adev),
+ adev->smuio.funcs->get_die_id(adev),
+ ras_mgr->err_data.ce_count,
+ blk_name);
+ } else {
+ dev_info(adev->dev, "%ld correctable hardware errors "
+ "detected in %s block, no user "
+ "action is needed.\n",
+ ras_mgr->err_data.ce_count,
+ blk_name);
+ }
+ }
+
+ if (err_data->ue_count) {
+ if (!list_empty(&err_data->err_node_list)) {
+ amdgpu_ras_error_print_error_data(adev, query_if, err_data, true);
+ } else if (!adev->aid_mask &&
+ adev->smuio.funcs &&
+ adev->smuio.funcs->get_socket_id &&
+ adev->smuio.funcs->get_die_id) {
+ dev_info(adev->dev, "socket: %d, die: %d "
+ "%ld uncorrectable hardware errors "
+ "detected in %s block\n",
+ adev->smuio.funcs->get_socket_id(adev),
+ adev->smuio.funcs->get_die_id(adev),
+ ras_mgr->err_data.ue_count,
+ blk_name);
+ } else {
+ dev_info(adev->dev, "%ld uncorrectable hardware errors "
+ "detected in %s block\n",
+ ras_mgr->err_data.ue_count,
+ blk_name);
+ }
+ }
+
+}
+
/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info)
{
struct amdgpu_ras_block_object *block_obj = NULL;
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
- struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct ras_err_data err_data;
+ int ret;
if (!obj)
return -EINVAL;
+ ret = amdgpu_ras_error_data_init(&err_data);
+ if (ret)
+ return ret;
+
if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
amdgpu_ras_get_ecc_info(adev, &err_data);
} else {
@@ -1033,7 +1136,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
get_ras_block_str(&info->head));
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_fini_err_data;
}
if (block_obj->hw_ops->query_ras_error_count)
@@ -1053,48 +1157,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
info->ue_count = obj->err_data.ue_count;
info->ce_count = obj->err_data.ce_count;
- if (err_data.ce_count) {
- if (!adev->aid_mask &&
- adev->smuio.funcs &&
- adev->smuio.funcs->get_socket_id &&
- adev->smuio.funcs->get_die_id) {
- dev_info(adev->dev, "socket: %d, die: %d "
- "%ld correctable hardware errors "
- "detected in %s block, no user "
- "action is needed.\n",
- adev->smuio.funcs->get_socket_id(adev),
- adev->smuio.funcs->get_die_id(adev),
- obj->err_data.ce_count,
- get_ras_block_str(&info->head));
- } else {
- dev_info(adev->dev, "%ld correctable hardware errors "
- "detected in %s block, no user "
- "action is needed.\n",
- obj->err_data.ce_count,
- get_ras_block_str(&info->head));
- }
- }
- if (err_data.ue_count) {
- if (!adev->aid_mask &&
- adev->smuio.funcs &&
- adev->smuio.funcs->get_socket_id &&
- adev->smuio.funcs->get_die_id) {
- dev_info(adev->dev, "socket: %d, die: %d "
- "%ld uncorrectable hardware errors "
- "detected in %s block\n",
- adev->smuio.funcs->get_socket_id(adev),
- adev->smuio.funcs->get_die_id(adev),
- obj->err_data.ue_count,
- get_ras_block_str(&info->head));
- } else {
- dev_info(adev->dev, "%ld uncorrectable hardware errors "
- "detected in %s block\n",
- obj->err_data.ue_count,
- get_ras_block_str(&info->head));
- }
- }
+ amdgpu_ras_error_generate_report(adev, info, &err_data);
- return 0;
+out_fini_err_data:
+ amdgpu_ras_error_data_fini(&err_data);
+
+ return ret;
}
int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, @@ -1744,12 +1812,16 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
struct amdgpu_iv_entry *entry)
{
struct ras_ih_data *data = &obj->ih_data;
- struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct ras_err_data err_data;
int ret;
if (!data->cb)
return;
+ ret = amdgpu_ras_error_data_init(&err_data);
+ if (ret)
+ return;
+
/* Let IP handle its data, maybe we need get the output
* from the callback to update the error type/count, etc
*/
@@ -1766,6 +1838,8 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
obj->err_data.ue_count += err_data.ue_count;
obj->err_data.ce_count += err_data.ce_count;
}
+
+ amdgpu_ras_error_data_fini(&err_data);
}
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) @@ -3383,3 +3457,128 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
WREG32(err_status_hi_offset, 0);
}
}
+
+int amdgpu_ras_error_data_init(struct ras_err_data *err_data) {
+ memset(err_data, 0, sizeof(*err_data));
+
+ INIT_LIST_HEAD(&err_data->err_node_list);
+
+ return 0;
+}
+
+static void amdgpu_ras_error_node_release(struct ras_err_node
+*err_node) {
+ if (!err_node)
+ return;
+
+ list_del(&err_node->node);
+ kvfree(err_node);
+}
+
+void amdgpu_ras_error_data_fini(struct ras_err_data *err_data) {
+ struct ras_err_node *err_node, *tmp;
+
+ list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) {
+ amdgpu_ras_error_node_release(err_node);
+ list_del(&err_node->node);
+ }
+}
+
+static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data,
+ struct amdgpu_smuio_mcm_config_info *mcm_info) {
+ struct ras_err_node *err_node;
+ struct amdgpu_smuio_mcm_config_info *ref_id;
+
+ if (!err_data || !mcm_info)
+ return NULL;
+
+ for_each_ras_error(err_node, err_data) {
+ ref_id = &err_node->err_info.mcm_info;
+ if ((mcm_info->socket_id >= 0 && mcm_info->socket_id != ref_id->socket_id) ||
+ (mcm_info->die_id >= 0 && mcm_info->die_id != ref_id->die_id))
+ continue;
+
+ return err_node;
+ }
+
+ return NULL;
+}
+
+static struct ras_err_node *amdgpu_ras_error_node_new(void) {
+ struct ras_err_node *err_node;
+
+ err_node = kvzalloc(sizeof(*err_node), GFP_KERNEL);
+ if (!err_node)
+ return NULL;
+
+ INIT_LIST_HEAD(&err_node->node);
+
+ return err_node;
+}
+
+static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
+ struct amdgpu_smuio_mcm_config_info *mcm_info) {
+ struct ras_err_node *err_node;
+
+ err_node = amdgpu_ras_error_find_node_by_id(err_data, mcm_info);
+ if (err_node)
+ return &err_node->err_info;
+
+ err_node = amdgpu_ras_error_node_new();
+ if (!err_node)
+ return NULL;
+
+ memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
+
+ err_data->err_list_count++;
+ list_add_tail(&err_node->node, &err_data->err_node_list);
+
+ return &err_node->err_info;
+}
+
+int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
+ struct amdgpu_smuio_mcm_config_info *mcm_info, uint64_t count) {
+ struct ras_err_info *err_info;
+
+ if (!err_data || !mcm_info)
+ return -EINVAL;
+
+ if (!count)
+ return 0;
+
+ err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+ if (!err_info)
+ return -EINVAL;
+
+ err_info->ue_count += count;
+ err_data->ue_count += count;
+
+ return 0;
+}
+
+int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
+ struct amdgpu_smuio_mcm_config_info *mcm_info, uint64_t count) {
+ struct ras_err_info *err_info;
+
+ if (!err_data || !mcm_info)
+ return -EINVAL;
+
+ if (!count)
+ return 0;
+
+ err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+ if (!err_info)
+ return -EINVAL;
+
+ err_info->ce_count += count;
+ err_data->ce_count += count;
+
+ return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 46b0dcf846dc..d6a37d5b9809 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -28,6 +28,7 @@
#include <linux/list.h>
#include "ta_ras_if.h"
#include "amdgpu_ras_eeprom.h"
+#include "amdgpu_smuio.h"
struct amdgpu_iv_entry;
@@ -443,13 +444,30 @@ struct ras_fs_data {
char debugfs_name[32];
};
+
+struct ras_err_info {
+ struct amdgpu_smuio_mcm_config_info mcm_info;
+ uint64_t ce_count;
+ uint64_t ue_count;
+};
+
+struct ras_err_node {
+ struct list_head node;
+ struct ras_err_info err_info;
+};
+
struct ras_err_data {
unsigned long ue_count;
unsigned long ce_count;
unsigned long err_addr_cnt;
struct eeprom_table_record *err_addr;
+ uint32_t err_list_count;
+ struct list_head err_node_list;
};
+#define for_each_ras_error(err_node, err_data) \
+ list_for_each_entry(err_node, &(err_data)->err_node_list, node)
+
struct ras_err_handler_data {
/* point to bad page records array */
struct eeprom_table_record *bps;
@@ -773,4 +791,14 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
const struct amdgpu_ras_err_status_reg_entry *reg_list,
uint32_t reg_list_size,
uint32_t instance);
+
+int amdgpu_ras_error_data_init(struct ras_err_data *err_data); void
+amdgpu_ras_error_data_fini(struct ras_err_data *err_data); int
+amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
+ struct amdgpu_smuio_mcm_config_info *mcm_info, uint64_t count);
+int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
+ struct amdgpu_smuio_mcm_config_info *mcm_info, uint64_t count);
+
#endif
+
+
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h
index 5910d50ac74d..ff4435181055 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h
@@ -30,6 +30,11 @@ enum amdgpu_pkg_type {
AMDGPU_PKG_TYPE_UNKNOWN,
};
+struct amdgpu_smuio_mcm_config_info {
+ int socket_id;
+ int die_id;
+};
+
struct amdgpu_smuio_funcs {
u32 (*get_rom_index_offset)(struct amdgpu_device *adev);
u32 (*get_rom_data_offset)(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 24fcc9a2e422..f74347cc087a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -45,8 +45,12 @@ static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst) {
- struct ras_err_data err_data = {0, 0, 0, NULL};
- int ret = AMDGPU_RAS_FAIL;
+ struct ras_err_data err_data;
+ int ret;
+
+ ret = amdgpu_ras_error_data_init(&err_data);
+ if (ret)
+ return ret;
err_data.err_addr =
kcalloc(adev->umc.max_ras_err_cnt_per_query,
@@ -54,7 +58,8 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
if (!err_data.err_addr) {
dev_warn(adev->dev,
"Failed to alloc memory for umc error record in MCA notifier!\n");
- return AMDGPU_RAS_FAIL;
+ ret = AMDGPU_RAS_FAIL;
+ goto out_fini_err_data;
}
/*
@@ -63,7 +68,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
ch_inst, umc_inst);
if (ret)
- goto out;
+ goto out_free_err_addr;
if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr, @@ -71,8 +76,12 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
amdgpu_ras_save_bad_pages(adev, NULL);
}
-out:
+out_free_err_addr:
kfree(err_data.err_addr);
+
+out_fini_err_data:
+ amdgpu_ras_error_data_fini(&err_data);
+
return ret;
}
@@ -182,18 +191,24 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
}
if (!amdgpu_sriov_vf(adev)) {
- struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct ras_err_data err_data;
struct ras_common_if head = {
.block = AMDGPU_RAS_BLOCK__UMC,
};
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+ ret = amdgpu_ras_error_data_init(&err_data);
+ if (ret)
+ return ret;
+
ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
if (ret == AMDGPU_RAS_SUCCESS && obj) {
obj->err_data.ue_count += err_data.ue_count;
obj->err_data.ce_count += err_data.ce_count;
}
+
+ amdgpu_ras_error_data_fini(&err_data);
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
adev->virt.ops->ras_poison_handler(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 7d6d7734dbec..6d24c84924cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -365,9 +365,12 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device {
uint32_t bif_doorbell_intr_cntl;
struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
- struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct ras_err_data err_data;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ if (amdgpu_ras_error_data_init(&err_data))
+ return;
+
if (adev->asic_type == CHIP_ALDEBARAN)
bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL_ALDE);
else
@@ -418,6 +421,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
*/
amdgpu_ras_reset_gpu(adev);
}
+
+ amdgpu_ras_error_data_fini(&err_data);
}
static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index cc034a740605..eccb006e78aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -560,9 +560,12 @@ static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device {
uint32_t bif_doorbell_intr_cntl;
struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
- struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct ras_err_data err_data;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ if (amdgpu_ras_error_data_init(&err_data))
+ return;
+
bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
if (REG_GET_FIELD(bif_doorbell_intr_cntl,
@@ -607,6 +610,8 @@ static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device
*/
amdgpu_ras_reset_gpu(adev);
}
+
+ amdgpu_ras_error_data_fini(&err_data);
}
static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev)
--
2.34.1
More information about the amd-gfx
mailing list