[PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt

Zhang, Hawking Hawking.Zhang at amd.com
Wed Dec 7 15:25:18 UTC 2022


[AMD Official Use Only - General]

I suggest splitting the patch into two.

One is adding the RAS poison handler for mxgpu ai products, similar to patch #2, including

drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  1 +
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c    |  6 ++++
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h    |  1 +

The other is adding the common UMC poison handling path for SRIOV (a rough sketch of that piece follows below), including

drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  | 44 ++++++++++++++----------
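
For illustration only, the SRIOV-specific behaviour that the second patch would carry can be read as the small sketch below; the helper name is hypothetical and not part of the posted patch, the body just mirrors the new else branch in amdgpu_umc_poison_handler():

/* Sketch only: SRIOV path notifies the PF through the new virt op instead
 * of doing local page retirement.
 */
static void example_sriov_umc_poison_notify(struct amdgpu_device *adev)
{
        if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
                adev->virt.ops->ras_poison_handler(adev);
        else
                dev_warn(adev->dev,
                         "No ras_poison_handler interface in SRIOV!\n");
}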
Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Wednesday, December 7, 2022 18:04
To: amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Wan, Gavin <Gavin.Wan at amd.com>; Chander, Vignesh <Vignesh.Chander at amd.com>; Yu, David <David.Yu at amd.com>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 1/6] drm/amdgpu: inform PF if VF receives RAS poison interrupt

The PF will do page retirement, reset the VF and inform the VF to reserve RAS bad pages.
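
For context, a minimal sketch of the VF-side flow this patch implements; the caller name below is made up for illustration, only amdgpu_umc_poison_handler() and its reset argument come from the diff:

/* Hypothetical caller, for illustration only: when the VF consumes a RAS
 * poison event it calls the common UMC handler; under SRIOV that handler
 * now just notifies the PF, which retires the bad pages, resets the VF and
 * tells it to reserve those pages.
 */
static void example_vf_poison_consumption(struct amdgpu_device *adev)
{
        /* the reset argument is not used on the SRIOV path; the PF decides
         * when to reset the VF
         */
        amdgpu_umc_poison_handler(adev, false);
}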

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  | 44 ++++++++++++++----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c    |  6 ++++
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h    |  1 +
 4 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index f76c19fc0392..1c7fcb4f2380 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -169,25 +169,33 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 {
        int ret = AMDGPU_RAS_SUCCESS;

-       if (!adev->gmc.xgmi.connected_to_cpu) {
-               struct ras_err_data err_data = {0, 0, 0, NULL};
-               struct ras_common_if head = {
-                       .block = AMDGPU_RAS_BLOCK__UMC,
-               };
-               struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
-
-               ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
-
-               if (ret == AMDGPU_RAS_SUCCESS && obj) {
-                       obj->err_data.ue_count += err_data.ue_count;
-                       obj->err_data.ce_count += err_data.ce_count;
+       if (!amdgpu_sriov_vf(adev)) {
+               if (!adev->gmc.xgmi.connected_to_cpu) {
+                       struct ras_err_data err_data = {0, 0, 0, NULL};
+                       struct ras_common_if head = {
+                               .block = AMDGPU_RAS_BLOCK__UMC,
+                       };
+                       struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+
+                       ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+
+                       if (ret == AMDGPU_RAS_SUCCESS && obj) {
+                               obj->err_data.ue_count += err_data.ue_count;
+                               obj->err_data.ce_count += err_data.ce_count;
+                       }
+               } else if (reset) {
+                       /* MCA poison handler is only responsible for GPU reset,
+                        * let MCA notifier do page retirement.
+                        */
+                       kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+                       amdgpu_ras_reset_gpu(adev);
                }
-       } else if (reset) {
-               /* MCA poison handler is only responsible for GPU reset,
-                * let MCA notifier do page retirement.
-                */
-               kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-               amdgpu_ras_reset_gpu(adev);
+       } else {
+               if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
+                       adev->virt.ops->ras_poison_handler(adev);
+               else
+                       dev_warn(adev->dev,
+                               "No ras_poison_handler interface in SRIOV!\n");
        }

        return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 2b9d806e23af..b9e9480448af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -88,6 +88,7 @@ struct amdgpu_virt_ops {
        int (*wait_reset)(struct amdgpu_device *adev);
        void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req,
                          u32 data1, u32 data2, u32 data3);
+       void (*ras_poison_handler)(struct amdgpu_device *adev);
 };

 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 12906ba74462..63725b2ebc03 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -404,6 +404,11 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
        return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA);
 }

+static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev)
+{
+       xgpu_ai_send_access_requests(adev, IDH_RAS_POISON);
+}
+
 const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
        .req_full_gpu   = xgpu_ai_request_full_gpu_access,
        .rel_full_gpu   = xgpu_ai_release_full_gpu_access,
@@ -411,4 +416,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
        .wait_reset = NULL,
        .trans_msg = xgpu_ai_mailbox_trans_msg,
        .req_init_data  = xgpu_ai_request_init_data,
+       .ras_poison_handler = xgpu_ai_ras_poison_handler,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
index fa7e13e0459e..0136bd059f68 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
@@ -39,6 +39,7 @@ enum idh_request {

        IDH_LOG_VF_ERROR       = 200,
        IDH_READY_TO_RESET      = 201,
+       IDH_RAS_POISON          = 202,
 };

 enum idh_event {
--
2.35.1


