[PATCH] amd/amdkfd: add ras page retirement handling for sq/sdma interrupt

Zhou1, Tao Tao.Zhou1 at amd.com
Tue Aug 24 09:28:19 UTC 2021


[AMD Official Use Only]

Hi Hawking,

GPU reset will also be called in dev->kfd2kgd->ras_process_cb; this patch adds page retirement handling before the GPU reset.
unmap_queue mode (reset or preemption) is another story; I'll write a new patch after unmap_queue reset mode becomes functional.

I have another patch which adds a poison mode flag in amdgpu_ras; page retirement will be skipped in the DF irq handler if poison mode is true. The patch will be sent out after the RAS TA supports querying the poison status.

Regards,
Tao
________________________________
From: Zhang, Hawking <Hawking.Zhang at amd.com>
Sent: Tuesday, August 24, 2021 5:10 PM
To: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org <amd-gfx at lists.freedesktop.org>; Yang, Stanley <Stanley.Yang at amd.com>; Clements, John <John.Clements at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>
Subject: RE: [PATCH] amd/amdkfd: add ras page retirement handling for sq/sdma interrupt

[AMD Official Use Only]

How about we add a new member in the ras context (amdgpu_ras) to indicate the poison consumption handling mode/approach? That way, we can initialize that member per ASIC.

Regards,
Hawking

-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Zhang, Hawking
Sent: Tuesday, August 24, 2021 17:04
To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org; Yang, Stanley <Stanley.Yang at amd.com>; Clements, John <John.Clements at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>
Subject: RE: [PATCH] amd/amdkfd: add ras page retirement handling for sq/sdma interrupt

[AMD Official Use Only]

Hi Tao,

This will break the mode 2 reset solution, right? But we have to keep the mode 2 reset solution as the default one for now. I think we need a new interface to allow KFD to switch between the unmap_queue and mode 2 reset solutions.

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Tuesday, August 24, 2021 16:43
To: amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Clements, John <John.Clements at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH] amd/amdkfd: add ras page retirement handling for sq/sdma interrupt

In ras poison mode, page retirement will be handled by the irq handler of the module which consumes corrupted data.

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c    | 13 ++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c     | 10 ++++++++--
 drivers/gpu/drm/amd/include/kgd_kfd_interface.h     |  1 +
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 46cd4ee6bafb..eb5e9c1b1073 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -23,6 +23,16 @@
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_amdkfd_arcturus.h"
 #include "amdgpu_amdkfd_gfx_v9.h"
+#include "amdgpu_ras.h"
+#include "amdgpu_umc.h"
+
+int kgd_aldebaran_ras_process_cb(struct kgd_dev *kgd)
+{
+       struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+       struct ras_err_data err_data = {0, 0, 0, NULL};
+
+       return amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
+}

 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
         .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
@@ -44,5 +54,6 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
         .get_atc_vmid_pasid_mapping_info =
                                 kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
         .set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
-       .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
+       .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
+       .ras_process_cb = kgd_aldebaran_ras_process_cb
 };
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 12d91e53556c..851b5120927a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -231,7 +231,10 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                                 if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
                                         sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
                                         kfd_signal_poison_consumed_event(dev, pasid);
-                                       amdgpu_amdkfd_gpu_reset(dev->kgd);
+                                       if (dev->kfd2kgd->ras_process_cb)
+                                               dev->kfd2kgd->ras_process_cb(dev->kgd);
+                                       else
+                                               amdgpu_amdkfd_gpu_reset(dev->kgd);
                                         return;
                                 }
                                 break;
@@ -253,7 +256,10 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                         kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
                 } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
                         kfd_signal_poison_consumed_event(dev, pasid);
-                       amdgpu_amdkfd_gpu_reset(dev->kgd);
+                       if (dev->kfd2kgd->ras_process_cb)
+                               dev->kfd2kgd->ras_process_cb(dev->kgd);
+                       else
+                               amdgpu_amdkfd_gpu_reset(dev->kgd);
                         return;
                 }
         } else if (client_id == SOC15_IH_CLIENTID_VMC ||
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index c84bd7b2cf59..9e6525871ad4 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -301,6 +301,7 @@ struct kfd2kgd_calls {
                         int *max_waves_per_cu);
         void (*program_trap_handler_settings)(struct kgd_dev *kgd,
                         uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr);
+       int (*ras_process_cb)(struct kgd_dev *kgd);
 };

 #endif  /* KGD_KFD_INTERFACE_H_INCLUDED */
--
2.17.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20210824/1e15113a/attachment-0001.htm>


More information about the amd-gfx mailing list