[PATCH] amd/amdkfd: add ras page retirement handling for sq/sdma
Tao Zhou
tao.zhou1 at amd.com
Thu Sep 23 10:05:29 UTC 2021
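In RAS poison mode, page retirement is handled by the IRQ handler of the
module that consumes the corrupted data. The SQ and SDMA poison-consumption
interrupt paths therefore call a new optional ras_process_cb callback
instead of triggering a GPU reset.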
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
.../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 17 ++++++++++++++++-
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 6 ++++--
drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 2 ++
3 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 46cd4ee6bafb..27fc4e52aba9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -23,6 +23,20 @@
#include "amdgpu_amdkfd.h"
#include "amdgpu_amdkfd_arcturus.h"
#include "amdgpu_amdkfd_gfx_v9.h"
+#include "amdgpu_ras.h"
+#include "amdgpu_umc.h"
+
+int kgd_aldebaran_ras_process_cb(struct kgd_dev *kgd)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+ struct ras_err_data err_data = {0, 0, 0, NULL};
+
+ /* cpu mca will handle it if connected_to_cpu is 1 */
+ if (!adev->gmc.xgmi.connected_to_cpu)
+ return amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
+ else
+ return 0;
+}
const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
@@ -44,5 +58,6 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.get_atc_vmid_pasid_mapping_info =
kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
- .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
+ .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
+ .ras_process_cb = kgd_aldebaran_ras_process_cb
};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 12d91e53556c..4a48b78f918e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -231,7 +231,8 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
kfd_signal_poison_consumed_event(dev, pasid);
- amdgpu_amdkfd_gpu_reset(dev->kgd);
+ if (dev->kfd2kgd->ras_process_cb)
+ dev->kfd2kgd->ras_process_cb(dev->kgd);
return;
}
break;
@@ -253,7 +254,8 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
kfd_signal_poison_consumed_event(dev, pasid);
- amdgpu_amdkfd_gpu_reset(dev->kgd);
+ if (dev->kfd2kgd->ras_process_cb)
+ dev->kfd2kgd->ras_process_cb(dev->kgd);
return;
}
} else if (client_id == SOC15_IH_CLIENTID_VMC ||
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index c84bd7b2cf59..828131415901 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -301,6 +301,8 @@ struct kfd2kgd_calls {
int *max_waves_per_cu);
void (*program_trap_handler_settings)(struct kgd_dev *kgd,
uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr);
+
+ int (*ras_process_cb)(struct kgd_dev *kgd);
};
#endif /* KGD_KFD_INTERFACE_H_INCLUDED */
--
2.17.1
More information about the amd-gfx
mailing list