[PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt
Zhang, Hawking
Hawking.Zhang at amd.com
Wed Dec 7 15:25:18 UTC 2022
[AMD Official Use Only - General]
I suggest split the patch into two
One is adding ras poison handler for mxgpu ai products, similar as patch #2, including
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 +
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 6 ++++
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 1 +
The other is adding common umc poison handling path for sriov, including
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 44 ++++++++++++++----------
Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Wednesday, December 7, 2022 18:04
To: amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Wan, Gavin <Gavin.Wan at amd.com>; Chander, Vignesh <Vignesh.Chander at amd.com>; Yu, David <David.Yu at amd.com>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt
PF will do page retirement, reset VF and inform VF to reserve RAS bad pages.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 44 ++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 +
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 6 ++++
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 1 +
4 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index f76c19fc0392..1c7fcb4f2380 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -169,25 +169,33 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) {
int ret = AMDGPU_RAS_SUCCESS;
- if (!adev->gmc.xgmi.connected_to_cpu) {
- struct ras_err_data err_data = {0, 0, 0, NULL};
- struct ras_common_if head = {
- .block = AMDGPU_RAS_BLOCK__UMC,
- };
- struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
-
- ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
-
- if (ret == AMDGPU_RAS_SUCCESS && obj) {
- obj->err_data.ue_count += err_data.ue_count;
- obj->err_data.ce_count += err_data.ce_count;
+ if (!amdgpu_sriov_vf(adev)) {
+ if (!adev->gmc.xgmi.connected_to_cpu) {
+ struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct ras_common_if head = {
+ .block = AMDGPU_RAS_BLOCK__UMC,
+ };
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+
+ ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+
+ if (ret == AMDGPU_RAS_SUCCESS && obj) {
+ obj->err_data.ue_count += err_data.ue_count;
+ obj->err_data.ce_count += err_data.ce_count;
+ }
+ } else if (reset) {
+ /* MCA poison handler is only responsible for GPU reset,
+ * let MCA notifier do page retirement.
+ */
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+ amdgpu_ras_reset_gpu(adev);
}
- } else if (reset) {
- /* MCA poison handler is only responsible for GPU reset,
- * let MCA notifier do page retirement.
- */
- kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
- amdgpu_ras_reset_gpu(adev);
+ } else {
+ if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
+ adev->virt.ops->ras_poison_handler(adev);
+ else
+ dev_warn(adev->dev,
+ "No ras_poison_handler interface in SRIOV!\n");
}
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 2b9d806e23af..b9e9480448af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -88,6 +88,7 @@ struct amdgpu_virt_ops {
int (*wait_reset)(struct amdgpu_device *adev);
void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req,
u32 data1, u32 data2, u32 data3);
+ void (*ras_poison_handler)(struct amdgpu_device *adev);
};
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 12906ba74462..63725b2ebc03 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -404,6 +404,11 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA); }
+static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev) {
+ xgpu_ai_send_access_requests(adev, IDH_RAS_POISON); }
+
const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
.req_full_gpu = xgpu_ai_request_full_gpu_access,
.rel_full_gpu = xgpu_ai_release_full_gpu_access,
@@ -411,4 +416,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
.wait_reset = NULL,
.trans_msg = xgpu_ai_mailbox_trans_msg,
.req_init_data = xgpu_ai_request_init_data,
+ .ras_poison_handler = xgpu_ai_ras_poison_handler,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
index fa7e13e0459e..0136bd059f68 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
@@ -39,6 +39,7 @@ enum idh_request {
IDH_LOG_VF_ERROR = 200,
IDH_READY_TO_RESET = 201,
+ IDH_RAS_POISON = 202,
};
enum idh_event {
--
2.35.1
More information about the amd-gfx
mailing list