[PATCH] amd/amdkfd: add ras page retirement handling for sq/sdma (v2)

Zhou1, Tao Tao.Zhou1 at amd.com
Tue Sep 28 03:22:31 UTC 2021


[AMD Official Use Only]

Ping...
________________________________
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Friday, September 24, 2021 4:37 PM
To: amd-gfx at lists.freedesktop.org <amd-gfx at lists.freedesktop.org>; Zhang, Hawking <Hawking.Zhang at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>; Joshi, Mukul <Mukul.Joshi at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH] amd/amdkfd: add ras page retirement handling for sq/sdma (v2)

In ras poison mode, page retirement will be handled by the irq handler of the
module which consumes corrupted data.

v2: rename ras_process_cb to ras_poison_consumption_handler.
    move the handler's implementation from ASIC specific file to common
file.

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c      | 14 ++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h      |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c |  4 ++--
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 1d41c2c00623..7bc4248a8d49 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -31,6 +31,8 @@
 #include <linux/dma-buf.h>
 #include "amdgpu_xgmi.h"
 #include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu_ras.h"
+#include "amdgpu_umc.h"

 /* Total memory size in system memory and all GPU VRAM. Used to
  * estimate worst case amount of memory to reserve for page tables
@@ -780,3 +782,15 @@ bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)

         return adev->have_atomics_support;
 }
+
+int amdgpu_amdkfd_ras_poison_consumption_handler(struct kgd_dev *kgd)
+{
+       struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+       struct ras_err_data err_data = {0, 0, 0, NULL};
+
+       /* CPU MCA will handle it if connected_to_cpu is 1 */
+       if (!adev->gmc.xgmi.connected_to_cpu)
+               return amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
+       else
+               return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3bc52b2c604f..d118e1dc273d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -290,6 +290,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
                                       uint64_t *mmap_offset);
 int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
                                 struct tile_config *config);
+int amdgpu_amdkfd_ras_poison_consumption_handler(struct kgd_dev *kgd);
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
 void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 12d91e53556c..543e7ea75593 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -231,7 +231,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                                 if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
                                         sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
                                         kfd_signal_poison_consumed_event(dev, pasid);
-                                       amdgpu_amdkfd_gpu_reset(dev->kgd);
+                                       amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd);
                                         return;
                                 }
                                 break;
@@ -253,7 +253,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                         kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
                 } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
                         kfd_signal_poison_consumed_event(dev, pasid);
-                       amdgpu_amdkfd_gpu_reset(dev->kgd);
+                       amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd);
                         return;
                 }
         } else if (client_id == SOC15_IH_CLIENTID_VMC ||
--
2.17.1

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20210928/eb9335f8/attachment-0001.htm>


More information about the amd-gfx mailing list