[PATCH 3/3] drm/amdgpu: add MCA smu cache support
Yang Wang
kevinyang.wang at amd.com
Mon Apr 22 09:35:36 UTC 2024
v1:
because SMU CE valid mca bank will be cleared after reading,
this patch adds mca cache at the driver level to ensure that the mca bank is not lost.
v2:
refine amdgpu_mca_init/fini/reset() function name.
v3:
add mca_cache.lock support
only add CE bank to mca bank cache.
Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 94 ++++++++++++++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 20 ++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++
3 files changed, 116 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 68b0e3608b9c..db4640016fe2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -183,6 +183,29 @@ static int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mc
return 0;
}
+static int amdgpu_mca_bank_set_merge(struct mca_bank_set *mca_set, struct mca_bank_set *new)
+{
+ struct mca_bank_node *node;
+
+ list_for_each_entry(node, &new->list, node)
+ amdgpu_mca_bank_set_add_entry(mca_set, &node->entry);
+
+ return 0;
+}
+
+static int amdgpu_mca_bank_set_remove_node(struct mca_bank_set *mca_set, struct mca_bank_node *node)
+{
+ if (!node)
+ return -EINVAL;
+
+ list_del(&node->node);
+ kvfree(node);
+
+ mca_set->nr_entries--;
+
+ return 0;
+}
+
static void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set)
{
struct mca_bank_node *node, *tmp;
@@ -200,6 +223,41 @@ void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_m
mca->mca_funcs = mca_funcs;
}
+int amdgpu_mca_init(struct amdgpu_device *adev)
+{
+ struct amdgpu_mca *mca = &adev->mca;
+ struct mca_bank_cache *mca_cache;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) {
+ mca_cache = &mca->mca_caches[i];
+ mutex_init(&mca_cache->lock);
+ amdgpu_mca_bank_set_init(&mca_cache->mca_set);
+ }
+
+ return 0;
+}
+
+void amdgpu_mca_fini(struct amdgpu_device *adev)
+{
+ struct amdgpu_mca *mca = &adev->mca;
+ struct mca_bank_cache *mca_cache;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) {
+ mca_cache = &mca->mca_caches[i];
+ amdgpu_mca_bank_set_release(&mca_cache->mca_set);
+ mutex_destroy(&mca_cache->lock);
+ }
+}
+
+int amdgpu_mca_reset(struct amdgpu_device *adev)
+{
+ amdgpu_mca_fini(adev);
+
+ return amdgpu_mca_init(adev);
+}
+
int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
{
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
@@ -314,7 +372,7 @@ static int amdgpu_mca_dispatch_mca_set(struct amdgpu_device *adev, enum amdgpu_r
{
struct ras_err_addr err_addr;
struct amdgpu_smuio_mcm_config_info mcm_info;
- struct mca_bank_node *node;
+ struct mca_bank_node *node, *tmp;
struct mca_bank_entry *entry;
uint32_t count;
int ret;
@@ -322,7 +380,7 @@ static int amdgpu_mca_dispatch_mca_set(struct amdgpu_device *adev, enum amdgpu_r
if (!mca_set)
return -EINVAL;
- list_for_each_entry(node, &mca_set->list, node) {
+ list_for_each_entry_safe(node, tmp, &mca_set->list, node) {
entry = &node->entry;
count = 0;
@@ -356,15 +414,30 @@ static int amdgpu_mca_dispatch_mca_set(struct amdgpu_device *adev, enum amdgpu_r
amdgpu_ras_error_statistic_ce_count(err_data,
&mcm_info, &err_addr, (uint64_t)count);
}
+
+ amdgpu_mca_bank_set_remove_node(mca_set, node);
}
return 0;
}
+static int amdgpu_mca_add_mca_set_to_cache(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *new)
+{
+ struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type];
+ int ret;
+
+ mutex_lock(&mca_cache->lock);
+ ret = amdgpu_mca_bank_set_merge(&mca_cache->mca_set, new);
+ mutex_unlock(&mca_cache->lock);
+
+ return ret;
+}
+
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
struct ras_err_data *err_data, struct ras_query_context *qctx)
{
struct mca_bank_set mca_set;
+ struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type];
int ret;
amdgpu_mca_bank_set_init(&mca_set);
@@ -374,6 +447,21 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
goto out_mca_release;
ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_set, err_data);
+ if (ret)
+ goto out_mca_release;
+
+ /* add remain mca bank to mca cache */
+ if (type == AMDGPU_MCA_ERROR_TYPE_CE && mca_set.nr_entries) {
+ ret = amdgpu_mca_add_mca_set_to_cache(adev, type, &mca_set);
+ if (ret)
+ goto out_mca_release;
+
+ /* dispatch mca set again if mca cache has valid data */
+ mutex_lock(&mca_cache->lock);
+ if (mca_cache->mca_set.nr_entries)
+ ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_cache->mca_set, err_data);
+ mutex_unlock(&mca_cache->lock);
+ }
out_mca_release:
amdgpu_mca_bank_set_release(&mca_set);
@@ -440,6 +528,8 @@ static int mca_dump_show(struct seq_file *m, enum amdgpu_mca_error_type type)
list_for_each_entry(node, &mca_set.list, node)
mca_dump_entry(m, &node->entry);
+ amdgpu_mca_add_mca_set_to_cache(adev, type, &mca_set);
+
err_free_mca_set:
amdgpu_mca_bank_set_release(&mca_set);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index 4d0a0f91c375..96fc632b38fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -66,6 +66,7 @@ enum amdgpu_mca_error_type {
AMDGPU_MCA_ERROR_TYPE_UE = 0,
AMDGPU_MCA_ERROR_TYPE_CE,
AMDGPU_MCA_ERROR_TYPE_DE,
+ AMDGPU_MCA_ERROR_TYPE_COUNT,
};
struct amdgpu_mca_ras_block {
@@ -77,11 +78,22 @@ struct amdgpu_mca_ras {
struct amdgpu_mca_ras_block *ras;
};
+struct mca_bank_set {
+ int nr_entries;
+ struct list_head list;
+};
+
+struct mca_bank_cache {
+ struct mca_bank_set mca_set;
+ struct mutex lock;
+};
+
struct amdgpu_mca {
struct amdgpu_mca_ras mp0;
struct amdgpu_mca_ras mp1;
struct amdgpu_mca_ras mpio;
const struct amdgpu_mca_smu_funcs *mca_funcs;
+ struct mca_bank_cache mca_caches[AMDGPU_MCA_ERROR_TYPE_DE];
};
enum mca_reg_idx {
@@ -113,11 +125,6 @@ struct mca_bank_node {
struct list_head node;
};
-struct mca_bank_set {
- int nr_entries;
- struct list_head list;
-};
-
struct amdgpu_mca_smu_funcs {
int max_ue_count;
int max_ce_count;
@@ -149,6 +156,9 @@ int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev);
void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs);
+int amdgpu_mca_init(struct amdgpu_device *adev);
+void amdgpu_mca_fini(struct amdgpu_device *adev);
+int amdgpu_mca_reset(struct amdgpu_device *adev);
int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable);
int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, uint32_t *total);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 46b7f0c5cd8a..23766bdffe5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3426,6 +3426,13 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
amdgpu_ras_set_aca_debug_mode(adev, false);
} else {
+ if (amdgpu_in_reset(adev))
+ r = amdgpu_mca_reset(adev);
+ else
+ r = amdgpu_mca_init(adev);
+ if (r)
+ return r;
+
amdgpu_ras_set_mca_debug_mode(adev, false);
}
@@ -3498,6 +3505,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
if (amdgpu_aca_is_enabled(adev))
amdgpu_aca_fini(adev);
+ else
+ amdgpu_mca_fini(adev);
WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared");
--
2.34.1
More information about the amd-gfx
mailing list