[PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison

Zhang, Hawking Hawking.Zhang at amd.com
Mon Dec 13 10:43:01 UTC 2021


[AMD Official Use Only]

                } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-                       kfd_signal_poison_consumed_event(dev, pasid);
-                       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+                       event_interrupt_poison_consumption(dev, pasid);
                        return;
                }

SDMA shouldn't go through the same handler; it needs a separate one (i.e. one that re-initializes the RB).

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Friday, December 10, 2021 19:15
To: amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; thomas.chai at amd.com; Kuehling, Felix <Felix.Kuehling at amd.com>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison

CP supports unmapping queues in a reset mode, which destroys only the specified queue without affecting others.
Replacing the whole GPU reset with this queue reset mode for RAS poison consumption saves a lot of time, and we can still fall back to the GPU reset solution if the queue reset fails.

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  6 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  3 ++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 14 ++++++++++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 27 ++++++++++++++++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 ++
 6 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 46cf48b3904a..0bf09a94d944 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -721,13 +721,13 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
        return adev->have_atomics_support;
 }

-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev)
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device
+*adev, bool reset)
 {
        struct ras_err_data err_data = {0, 0, 0, NULL};

        /* CPU MCA will handle page retirement if connected_to_cpu is 1 */
        if (!adev->gmc.xgmi.connected_to_cpu)
-               amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
-       else
+               amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+       else if (reset)
                amdgpu_amdkfd_gpu_reset(adev);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index fcbc8a9c9e06..61f899e54fd5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -296,7 +296,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
                                      uint64_t *mmap_offset);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
                                struct tile_config *config);
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
+                               bool reset);
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
 void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 01a2cc3928ac..095b2e0822aa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1476,6 +1476,20 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
        return retval;
 }

+/*
+ * Unmap the queues selected by @pasid after RAS poison consumption.
+ *
+ * Takes the dqm lock around unmap_queues_cpsch() and returns that call's
+ * result unchanged.
+ * NOTE(review): the trailing 'true' presumably selects the queue reset mode
+ * described in the commit message - confirm against unmap_queues_cpsch().
+ */
+int unmap_queues_cpsch_poison(struct device_queue_manager *dqm,
+                             uint32_t pasid)
+{
+       int status;
+
+       dqm_lock(dqm);
+       status = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
+                                   pasid, true);
+       dqm_unlock(dqm);
+
+       return status;
+}
+
 /* dqm->lock mutex has to be locked before calling this function */
 static int execute_queues_cpsch(struct device_queue_manager *dqm,
                                enum kfd_unmap_queues_filter filter,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 499fc0ea387f..c52869133159 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -219,6 +219,7 @@ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
 unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
 unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
 unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm);
+int unmap_queues_cpsch_poison(struct device_queue_manager *dqm,
+uint32_t pasid);

 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index deb64168c9e8..2863bb9e5bca 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -89,6 +89,27 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
 #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20

+/*
+ * Handle a RAS poison consumption interrupt for the process identified by
+ * @pasid: unmap that process's queues, signal the poison-consumed event,
+ * then call the poison consumption handler. The handler is asked for a GPU
+ * reset only when unmapping the queues failed.
+ */
+static void event_interrupt_poison_consumption(struct kfd_dev *dev,
+                               uint16_t pasid)
+{
+       int ret;
+       struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+       /* no process is bound to this pasid: nothing to mark or unmap */
+       if (!p)
+               return;
+
+       /*
+        * All queues of a process are unmapped at once, so poison only has
+        * to be handled the first time it is reported for that process.
+        */
+       if (atomic_read(&p->poison)) {
+               kfd_unref_process(p);
+               return;
+       }
+
+       atomic_set(&p->poison, 1);
+       /*
+        * NOTE(review): kfd_lookup_process_by_pasid() is expected to return
+        * a counted reference; p is not used past this point, so drop it.
+        */
+       kfd_unref_process(p);
+
+       ret = unmap_queues_cpsch_poison(dev->dqm, pasid);
+       kfd_signal_poison_consumed_event(dev, pasid);
+
+       /*
+        * If resetting the queue succeeded, do page retirement without a
+        * GPU reset; if it failed, fall back to the GPU reset solution.
+        */
+       if (!ret)
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+       else
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+}
+
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
                                        const uint32_t *ih_ring_entry,
                                        uint32_t *patched_ihre,
@@ -230,8 +251,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                                        sq_intr_err);
                                if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
                                        sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
-                                       kfd_signal_poison_consumed_event(dev, pasid);
-                                       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+                                       event_interrupt_poison_consumption(dev, pasid);
                                        return;
                                }
                                break;
@@ -252,8 +272,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                if (source_id == SOC15_INTSRC_SDMA_TRAP) {
                        kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
                } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-                       kfd_signal_poison_consumed_event(dev, pasid);
-                       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+                       event_interrupt_poison_consumption(dev, pasid);
                        return;
                }
        } else if (client_id == SOC15_IH_CLIENTID_VMC ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 0c3f911e3bf4..ea68f3b3a4e9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -856,6 +856,8 @@ struct kfd_process {
        struct svm_range_list svms;

        bool xnack_enabled;
+
+       atomic_t poison;
 };

 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
--
2.17.1



More information about the amd-gfx mailing list