[PATCH 1/4] drm/amdgpu: add amdgpu smu mca dump feature support
Yang Wang
kevinyang.wang at amd.com
Wed Sep 13 09:31:57 UTC 2023
add amdgpu smu mca dump feature support.
Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 68 +++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 57 +++++++++++++++++++++
2 files changed, 125 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 8d9ff9e151de..9fa88ae81b12 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -142,3 +142,71 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev)
return 0;
}
+
+void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs)
+{
+ struct amdgpu_mca *mca = &adev->mca;
+
+ mca->mca_funcs = mca_funcs;
+}
+
+int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
+{
+ const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+
+ if (mca_funcs && mca_funcs->mca_set_debug_mode)
+ return mca_funcs->mca_set_debug_mode(adev, enable);
+
+ return -ENOTSUPP;
+}
+
+int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count)
+{
+ const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+
+ if (!count)
+ return -EINVAL;
+
+ if (mca_funcs && mca_funcs->mca_get_valid_mca_count)
+ return mca_funcs->mca_get_valid_mca_count(adev, type, count);
+
+ return -ENOTSUPP;
+}
+
+int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, uint32_t *count)
+{
+ const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+ if (!count)
+ return -EINVAL;
+
+ if (mca_funcs && mca_funcs->mca_get_error_count)
+ return mca_funcs->mca_get_error_count(adev, blk, type, count);
+
+ return -ENOTSUPP;
+}
+
+int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev,enum amdgpu_mca_error_type type, int idx, struct mca_bank_entry *entry)
+{
+ const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+ int count;
+
+ switch (type) {
+ case AMDGPU_MCA_ERROR_TYPE_UE:
+ count = mca_funcs->max_ue_count;
+ break;
+ case AMDGPU_MCA_ERROR_TYPE_CE:
+ count = mca_funcs->max_ce_count;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (idx >= count)
+ return -EINVAL;
+
+ if (mca_funcs && mca_funcs->mca_get_mca_entry)
+ return mca_funcs->mca_get_mca_entry(adev, type, idx, entry);
+
+ return -ENOTSUPP;
+}
+
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index 997a073e2409..6915ae0d5b92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -21,6 +21,26 @@
#ifndef __AMDGPU_MCA_H__
#define __AMDGPU_MCA_H__
+#include "amdgpu_ras.h"
+
+#define MCA_MAX_REGS_COUNT (16)
+
+enum amdgpu_mca_ip {
+ AMDGPU_MCA_IP_UNKNOW = -1,
+ AMDGPU_MCA_IP_PSP = 0,
+ AMDGPU_MCA_IP_SDMA,
+ AMDGPU_MCA_IP_GC,
+ AMDGPU_MCA_IP_SMU,
+ AMDGPU_MCA_IP_MP5,
+ AMDGPU_MCA_IP_UMC,
+ AMDGPU_MCA_IP_COUNT,
+};
+
+enum amdgpu_mca_error_type {
+ AMDGPU_MCA_ERROR_TYPE_UE = 0,
+ AMDGPU_MCA_ERROR_TYPE_CE,
+};
+
struct amdgpu_mca_ras_block {
struct amdgpu_ras_block_object ras_block;
};
@@ -34,6 +54,36 @@ struct amdgpu_mca {
struct amdgpu_mca_ras mp0;
struct amdgpu_mca_ras mp1;
struct amdgpu_mca_ras mpio;
+ const struct amdgpu_mca_smu_funcs *mca_funcs;
+};
+
+struct mca_bank_info {
+ int socket_id;
+ int aid;
+ int hwid;
+ int mcatype;
+};
+
+struct mca_bank_entry {
+ int idx;
+ enum amdgpu_mca_error_type type;
+ enum amdgpu_mca_ip ip;
+ struct mca_bank_info info;
+ uint64_t regs[MCA_MAX_REGS_COUNT];
+};
+
+struct amdgpu_mca_smu_funcs {
+ int max_ue_count;
+ int max_ce_count;
+ int (*mca_set_debug_mode)(struct amdgpu_device *adev, bool enable);
+ int (*mca_get_error_count)(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+ enum amdgpu_mca_error_type type, uint32_t *count);
+ int (*mca_get_valid_mca_count)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
+ uint32_t *count);
+ int (*mca_get_mca_entry)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
+ int idx, struct mca_bank_entry *entry);
+ int (*mca_get_ras_mca_idx_array)(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+ enum amdgpu_mca_error_type type, int *idx_array, int *idx_array_size);
};
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
@@ -53,4 +103,11 @@ void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev);
+
+void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs);
+int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable);
+int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count);
+int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, uint32_t *count);
+int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev,enum amdgpu_mca_error_type type, int idx, struct mca_bank_entry *entry);
+
#endif
--
2.34.1
More information about the amd-gfx
mailing list