[PATCH 2/4] drm/amdgpu: add amdgpu MCA bank dispatch function support
Yang Wang
kevinyang.wang@amd.com
Tue Apr 23 08:26:38 UTC 2024
- Refine the MCA driver code.
- Centralize the MCA bank dispatch logic (a rough flow sketch follows the diffstat below).
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 97 ++++++++++++++-----------
1 file changed, 55 insertions(+), 42 deletions(-)
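Not part of the patch, for review convenience only: with this change the RAS
logging path roughly becomes the following (simplified sketch based on the diff
below, error handling slightly condensed):

  int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev,
                                   enum amdgpu_ras_block blk,
                                   enum amdgpu_mca_error_type type,
                                   struct ras_err_data *err_data,
                                   struct ras_query_context *qctx)
  {
          struct mca_bank_set mca_set;
          int ret;

          amdgpu_mca_bank_set_init(&mca_set);

          /* collect the valid banks for this error type; each entry is
           * dumped via amdgpu_mca_smu_mca_bank_dump() while it is collected */
          ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, qctx);
          if (!ret)
                  /* centralized dispatch: parse the per-bank error counts
                   * into err_data (UE, CE or deferred) */
                  ret = amdgpu_mca_dispatch_mca_set(adev, blk, type,
                                                    &mca_set, err_data);

          amdgpu_mca_bank_set_release(&mca_set);

          return ret;
  }

The debugfs mca_dump_show() path reuses the same helper: it builds a
mca_bank_set with amdgpu_mca_smu_get_mca_set() and walks the resulting list
with mca_dump_entry(), instead of fetching entries one by one.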
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 67c208861994..859d594c02a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -267,7 +267,8 @@ static int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_
return mca_funcs->mca_get_mca_entry(adev, type, idx, entry);
}
-static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set)
+static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set,
+ struct ras_query_context *qctx)
{
struct mca_bank_entry entry;
uint32_t count = 0, i;
@@ -287,6 +288,8 @@ static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_mc
return ret;
amdgpu_mca_bank_set_add_entry(mca_set, &entry);
+
+ amdgpu_mca_smu_mca_bank_dump(adev, i, &entry, qctx);
}
return 0;
@@ -306,36 +309,36 @@ static int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum
return mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, count);
}
-int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
- struct ras_err_data *err_data, struct ras_query_context *qctx)
+static int amdgpu_mca_dispatch_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
+ struct mca_bank_set *mca_set, struct ras_err_data *err_data)
{
+ struct ras_err_addr err_addr;
struct amdgpu_smuio_mcm_config_info mcm_info;
- struct ras_err_addr err_addr = {0};
- struct mca_bank_set mca_set;
struct mca_bank_node *node;
struct mca_bank_entry *entry;
uint32_t count;
- int ret, i = 0;
+ int ret;
- amdgpu_mca_bank_set_init(&mca_set);
+ if (!mca_set)
+ return -EINVAL;
- ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set);
- if (ret)
- goto out_mca_release;
+ if (!mca_set->nr_entries)
+ return 0;
- list_for_each_entry(node, &mca_set.list, node) {
+ list_for_each_entry(node, &mca_set->list, node) {
entry = &node->entry;
- amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx);
-
count = 0;
ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count);
if (ret)
- goto out_mca_release;
+ return ret;
if (!count)
continue;
+ memset(&mcm_info, 0, sizeof(mcm_info));
+ memset(&err_addr, 0, sizeof(err_addr));
+
mcm_info.socket_id = entry->info.socket_id;
mcm_info.die_id = entry->info.aid;
@@ -345,19 +348,36 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR];
}
- if (type == AMDGPU_MCA_ERROR_TYPE_UE)
+ if (type == AMDGPU_MCA_ERROR_TYPE_UE) {
amdgpu_ras_error_statistic_ue_count(err_data,
- &mcm_info, &err_addr, (uint64_t)count);
- else {
+ &mcm_info, &err_addr, (uint64_t)count);
+ } else {
if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS]))
amdgpu_ras_error_statistic_de_count(err_data,
- &mcm_info, &err_addr, (uint64_t)count);
+ &mcm_info, &err_addr, (uint64_t)count);
else
amdgpu_ras_error_statistic_ce_count(err_data,
- &mcm_info, &err_addr, (uint64_t)count);
+ &mcm_info, &err_addr, (uint64_t)count);
}
}
+ return 0;
+}
+
+int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
+ struct ras_err_data *err_data, struct ras_query_context *qctx)
+{
+ struct mca_bank_set mca_set;
+ int ret;
+
+ amdgpu_mca_bank_set_init(&mca_set);
+
+ ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, qctx);
+ if (ret)
+ goto out_mca_release;
+
+ ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_set, err_data);
+
out_mca_release:
amdgpu_mca_bank_set_release(&mca_set);
@@ -402,36 +422,29 @@ static void mca_dump_entry(struct seq_file *m, struct mca_bank_entry *entry)
static int mca_dump_show(struct seq_file *m, enum amdgpu_mca_error_type type)
{
struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
- struct mca_bank_entry *entry;
- uint32_t count = 0;
- int i, ret;
+ struct mca_bank_node *node;
+ struct mca_bank_set mca_set;
+ struct ras_query_context qctx;
+ int ret;
- ret = amdgpu_mca_smu_get_valid_mca_count(adev, type, &count);
+ amdgpu_mca_bank_set_init(&mca_set);
+
+ qctx.event_id = 0ULL;
+ ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, &qctx);
if (ret)
- return ret;
+ goto err_free_mca_set;
seq_printf(m, "amdgpu smu %s valid mca count: %d\n",
- type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE", count);
-
- if (!count)
- return 0;
-
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- return -ENOMEM;
-
- for (i = 0; i < count; i++) {
- memset(entry, 0, sizeof(*entry));
+ type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE", mca_set.nr_entries);
- ret = amdgpu_mca_smu_get_mca_entry(adev, type, i, entry);
- if (ret)
- goto err_free_entry;
+ if (!mca_set.nr_entries)
+ goto err_free_mca_set;
- mca_dump_entry(m, entry);
- }
+ list_for_each_entry(node, &mca_set.list, node)
+ mca_dump_entry(m, &node->entry);
-err_free_entry:
- kfree(entry);
+err_free_mca_set:
+ amdgpu_mca_bank_set_release(&mca_set);
return ret;
}
--
2.34.1