[PATCH 11/14] drm/amdgpu: add xgmi v6.4.0 ACA support
Zhang, Hawking
Hawking.Zhang at amd.com
Wed Jan 3 11:33:39 UTC 2024
[AMD Official Use Only - General]
+ if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
+ (type == ACA_ERROR_TYPE_CE && ext_error_code == 6)) {
+ report->type = type;
+ report->count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
+ }
Gentle reminder that we should be able to extend the error logging to all the pcs errors. Just read back the config registers so we know which error is configured to UE and which error is configured to CE.
Regards,
Hawking
-----Original Message-----
From: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
Sent: Wednesday, January 3, 2024 16:02
To: amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
Subject: [PATCH 11/14] drm/amdgpu: add xgmi v6.4.0 ACA support
add xgmi v6.4.0 ACA driver support
Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 63 +++++++++++++++++++++++-
1 file changed, 62 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index a6c88f2fe6e5..61208ca94442 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1035,15 +1035,76 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
return 0;
}
+static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
+ struct aca_bank_report *report, void *data) {
+ struct amdgpu_device *adev = handle->adev;
+ const char *error_str;
+ u64 status;
+ int ret, ext_error_code;
+
+ ret = aca_bank_info_decode(bank, &report->info);
+ if (ret)
+ return ret;
+
+ status = bank->regs[MCA_REG_IDX_STATUS];
+ ext_error_code = MCA_REG__STATUS__ERRORCODEEXT(status);
+
+ error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
+ xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
+ if (error_str)
+ dev_info(adev->dev, "%s detected\n", error_str);
+
+ if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
+ (type == ACA_ERROR_TYPE_CE && ext_error_code == 6)) {
+ report->type = type;
+ report->count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
+ }
+
+ return 0;
+}
+
+static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = {
+ .aca_bank_generate_report = xgmi_v6_4_0_aca_bank_generate_report,
+};
+
+static const struct aca_info xgmi_v6_4_0_aca_info = {
+ .hwip = ACA_HWIP_TYPE_PCS_XGMI,
+ .mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK,
+ .bank_ops = &xgmi_v6_4_0_aca_bank_ops, };
+
static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) {
+ int r;
+
if (!adev->gmc.xgmi.supported ||
adev->gmc.xgmi.num_physical_nodes == 0)
return 0;
amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);
- return amdgpu_ras_block_late_init(adev, ras_block);
+ r = amdgpu_ras_block_late_init(adev, ras_block);
+ if (r)
+ return r;
+
+ switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+ case IP_VERSION(6, 4, 0):
+ r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL,
+ &xgmi_v6_4_0_aca_info, NULL);
+ if (r)
+ goto late_fini;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+
+late_fini:
+ amdgpu_ras_block_late_fini(adev, ras_block);
+
+ return r;
}
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
--
2.34.1
More information about the amd-gfx
mailing list