<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>
</head>
<body dir="ltr">
<p style="font-family:Arial;font-size:10pt;color:#0000FF;margin:5pt;" align="Left">
[AMD Official Use Only]<br>
</p>
<br>
<div>
<div style="font-family: Calibri, Arial, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
Ping...<br>
</div>
<div id="appendonsend"></div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> Zhou1, Tao &lt;Tao.Zhou1@amd.com&gt;<br>
<b>Sent:</b> Friday, September 24, 2021 4:37 PM<br>
<b>To:</b> amd-gfx@lists.freedesktop.org &lt;amd-gfx@lists.freedesktop.org&gt;; Zhang, Hawking &lt;Hawking.Zhang@amd.com&gt;; Kuehling, Felix &lt;Felix.Kuehling@amd.com&gt;; Joshi, Mukul &lt;Mukul.Joshi@amd.com&gt;; Yang, Stanley &lt;Stanley.Yang@amd.com&gt;<br>
<b>Cc:</b> Zhou1, Tao &lt;Tao.Zhou1@amd.com&gt;<br>
<b>Subject:</b> [PATCH] amd/amdkfd: add ras page retirement handling for sq/sdma (v2)</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt;">
<div class="PlainText">In ras poison mode, page retirement will be handled by the irq handler of the<br>
module which consumes corrupted data.<br>
<br>
v2: rename ras_process_cb to ras_poison_consumption_handler.<br>
    move the handler's implementation from ASIC specific file to common<br>
file.<br>
<br>
Signed-off-by: Tao Zhou &lt;tao.zhou1@amd.com&gt;<br>
---<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c      | 14 ++++++++++++++<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h      |  1 +<br>
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c |  4 ++--<br>
 3 files changed, 17 insertions(+), 2 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c<br>
index 1d41c2c00623..7bc4248a8d49 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c<br>
@@ -31,6 +31,8 @@<br>
 #include &lt;linux/dma-buf.h&gt;<br>
 #include "amdgpu_xgmi.h"<br>
 #include &lt;uapi/linux/kfd_ioctl.h&gt;<br>
+#include "amdgpu_ras.h"<br>
+#include "amdgpu_umc.h"<br>
 <br>
 /* Total memory size in system memory and all GPU VRAM. Used to<br>
  * estimate worst case amount of memory to reserve for page tables<br>
@@ -780,3 +782,15 @@ bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)<br>
 <br>
         return adev->have_atomics_support;<br>
 }<br>
+<br>
+int amdgpu_amdkfd_ras_poison_consumption_handler(struct kgd_dev *kgd)<br>
+{<br>
+       struct amdgpu_device *adev = (struct amdgpu_device *)kgd;<br>
+       struct ras_err_data err_data = {0, 0, 0, NULL};<br>
+<br>
+       /* CPU MCA will handle it if connected_to_cpu is 1 */<br>
+       if (!adev->gmc.xgmi.connected_to_cpu)<br>
+               return amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);<br>
+       else<br>
+               return 0;<br>
+}<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h<br>
index 3bc52b2c604f..d118e1dc273d 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h<br>
@@ -290,6 +290,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,<br>
                                       uint64_t *mmap_offset);<br>
 int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,<br>
                                 struct tile_config *config);<br>
+int amdgpu_amdkfd_ras_poison_consumption_handler(struct kgd_dev *kgd);<br>
 #if IS_ENABLED(CONFIG_HSA_AMD)<br>
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);<br>
 void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,<br>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c<br>
index 12d91e53556c..543e7ea75593 100644<br>
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c<br>
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c<br>
@@ -231,7 +231,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,<br>
                                 if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&<br>
                                         sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {<br>
                                         kfd_signal_poison_consumed_event(dev, pasid);<br>
-                                       amdgpu_amdkfd_gpu_reset(dev->kgd);<br>
+                                       amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd);<br>
                                         return;<br>
                                 }<br>
                                 break;<br>
@@ -253,7 +253,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,<br>
                         kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);<br>
                 } else if (source_id == SOC15_INTSRC_SDMA_ECC) {<br>
                         kfd_signal_poison_consumed_event(dev, pasid);<br>
-                       amdgpu_amdkfd_gpu_reset(dev->kgd);<br>
+                       amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd);<br>
                         return;<br>
                 }<br>
         } else if (client_id == SOC15_IH_CLIENTID_VMC ||<br>
-- <br>
2.17.1<br>
<br>
</div>
</span></font></div>
</div>
</body>
</html>