[PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption

Zhang, Hawking Hawking.Zhang at amd.com
Thu Apr 25 03:00:53 UTC 2024


[AMD Official Use Only - General]

+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t pasid,
+                       pasid_notify pasid_fn, void *data, uint32_t reset);

So we ultimately switch to the above poison consumption handler for all the existing v9 adapters, right? If so, we should be able to make this function backwards compatible. I'm wondering if we can just change the existing amdgpu_amdkfd_ras_poison_consumption_handler instead of adding a new one.

The name pasid_poison_consumption_handler is a little bit confusing.

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai at amd.com>
Sent: Thursday, April 18, 2024 10:59
To: amd-gfx at lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Li, Candice <Candice.Li at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
Subject: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption

Prepare to handle pasid poison consumption.

Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  9 ++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  5 +++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       | 20 ++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h       |  3 +++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  3 ++-
 5 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 66753940bb4d..287ce431901c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -759,10 +759,17 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
        return amdgpu_ras_get_fed_status(adev);
 }

+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
+                               enum amdgpu_ras_block block, uint16_t pasid,
+                               pasid_notify pasid_fn, void *data, uint32_t reset)
+{
+       amdgpu_umc_pasid_poison_handler(adev, block, pasid, pasid_fn, data, reset);
+}
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
        enum amdgpu_ras_block block, uint32_t reset)
 {
-       amdgpu_umc_poison_handler(adev, block, reset);
+       amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, reset);
 }

 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index ad50c7bbc326..54e15994d02b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -401,6 +401,11 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
                                struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
                        enum amdgpu_ras_block block, uint32_t reset);
+
+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t pasid,
+                       pasid_notify pasid_fn, void *data, uint32_t reset);
+
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index dcda3d24bee3..8ebbca9e2e22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -252,8 +252,9 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
        return 0;
 }

-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-                       enum amdgpu_ras_block block, uint32_t reset)
+int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t pasid,
+                       pasid_notify pasid_fn, void *data, uint32_t reset)
 {
        int ret = AMDGPU_RAS_SUCCESS;

@@ -291,16 +292,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,

                        amdgpu_ras_error_data_fini(&err_data);
                } else {
-                       if (reset) {
-                               amdgpu_umc_bad_page_polling_timeout(adev,
-                                                       reset, MAX_UMC_POISON_POLLING_TIME_SYNC);
-                       } else {
                                struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

+                               amdgpu_ras_put_poison_req(adev,
+                                       block, pasid, pasid_fn, data, reset);
+
                                atomic_inc(&con->page_retirement_req_cnt);

                                wake_up(&con->page_retirement_wq);
-                       }
                }
        } else {
                if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
@@ -313,6 +312,13 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
        return ret;
 }

+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint32_t reset)
+{
+       return amdgpu_umc_pasid_poison_handler(adev,
+                               block, 0, NULL, NULL, reset);
+}
+
 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
                void *ras_error_status,
                struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 9e77e6d48e3b..5f50c69c3cec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -106,6 +106,9 @@ int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
 int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
                        enum amdgpu_ras_block block, uint32_t reset);
+int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t pasid,
+                       pasid_notify pasid_fn, void *data, uint32_t reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
                struct amdgpu_irq_src *source,
                struct amdgpu_iv_entry *entry);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4..6bf4bbc3cffa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -201,7 +201,8 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                        "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
                        client_id);

-       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
+       amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev,
+               block, pasid, NULL, NULL, reset);
 }

 static bool context_id_expected(struct kfd_dev *dev)
--
2.34.1



More information about the amd-gfx mailing list