[PATCH 07/25] drm/amdkfd: Handle VM faults in KFD
Felix Kuehling
Felix.Kuehling at amd.com
Thu Jul 12 02:32:50 UTC 2018
From: shaoyunl <Shaoyun.Liu at amd.com>
1. Pre-GFX9 the amdgpu ISR saves the vm-fault status and address per
per-vmid. amdkfd needs to get the information from amdgpu through the
new get_vm_fault_info interface. On GFX9 and later, all the required
information is in the IH ring
2. amdkfd unmaps all queues from the faulting process and create new
run-list without the guilty process
3. amdkfd notifies the runtime of the vm fault trap via EVENT_TYPE_MEMORY
Signed-off-by: shaoyun liu <shaoyun.liu at amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
---
drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 25 ++++++++++++---
drivers/gpu/drm/amd/amdkfd/cik_int.h | 2 ++
.../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 17 ++++++++++
drivers/gpu/drm/amd/amdkfd/kfd_events.c | 37 ++++++++++++++++++++++
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 18 +++++++++--
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 +++
include/uapi/linux/kfd_ioctl.h | 2 +-
7 files changed, 98 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 49df6c7..cc33870 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -48,18 +48,19 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
- ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE;
+ ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
+ ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT;
}
static void cik_event_interrupt_wq(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
{
- unsigned int pasid;
const struct cik_ih_ring_entry *ihre =
(const struct cik_ih_ring_entry *)ih_ring_entry;
uint32_t context_id = ihre->data & 0xfffffff;
-
- pasid = (ihre->ring_id & 0xffff0000) >> 16;
+ unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8;
+ unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16;
if (pasid == 0)
return;
@@ -72,6 +73,22 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
kfd_signal_event_interrupt(pasid, context_id & 0xff, 8);
else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
kfd_signal_hw_exception_event(pasid);
+ else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
+ struct kfd_vm_fault_info info;
+
+ kfd_process_vm_fault(dev->dqm, pasid);
+
+ memset(&info, 0, sizeof(info));
+ dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
+ if (!info.page_addr && !info.status)
+ return;
+
+ if (info.vmid == vmid)
+ kfd_signal_vm_fault_event(dev, pasid, &info);
+ else
+ kfd_signal_vm_fault_event(dev, pasid, NULL);
+ }
}
const struct kfd_event_interrupt_class event_interrupt_class_cik = {
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h
index 109298b..a2079a0 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_int.h
+++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h
@@ -37,6 +37,8 @@ struct cik_ih_ring_entry {
#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6
#define CIK_INTSRC_SDMA_TRAP 0xE0
#define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF
+#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92
+#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f2f81d2..44fc203 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1684,6 +1684,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
kfree(dqm);
}
+int kfd_process_vm_fault(struct device_queue_manager *dqm,
+ unsigned int pasid)
+{
+ struct kfd_process_device *pdd;
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ int ret = 0;
+
+ if (!p)
+ return -EINVAL;
+ pdd = kfd_get_process_device_data(dqm->dev, p);
+ if (pdd)
+ ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+ kfd_unref_process(p);
+
+ return ret;
+}
+
#if defined(CONFIG_DEBUG_FS)
static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 3d5a833..b58a0e6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -963,3 +963,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
mutex_unlock(&p->event_mutex);
kfd_unref_process(p);
}
+
+void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+ struct kfd_vm_fault_info *info)
+{
+ struct kfd_event *ev;
+ uint32_t id;
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_hsa_memory_exception_data memory_exception_data;
+
+ if (!p)
+ return; /* Presumably process exited. */
+ memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+ memory_exception_data.gpu_id = dev->id;
+ memory_exception_data.failure.imprecise = 1;
+ /* Set failure reason */
+ if (info) {
+ memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
+ memory_exception_data.failure.NotPresent =
+ info->prot_valid ? 1 : 0;
+ memory_exception_data.failure.NoExecute =
+ info->prot_exec ? 1 : 0;
+ memory_exception_data.failure.ReadOnly =
+ info->prot_write ? 1 : 0;
+ memory_exception_data.failure.imprecise = 0;
+ }
+ mutex_lock(&p->event_mutex);
+
+ id = KFD_FIRST_NONSIGNAL_EVENT_ID;
+ idr_for_each_entry_continue(&p->event_idr, ev, id)
+ if (ev->type == KFD_EVENT_TYPE_MEMORY) {
+ ev->memory_exception_data = memory_exception_data;
+ set_event(ev);
+ }
+
+ mutex_unlock(&p->event_mutex);
+ kfd_unref_process(p);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 37029ba..d6b64e6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -57,7 +57,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
source_id == SOC15_INTSRC_SDMA_TRAP ||
source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
- source_id == SOC15_INTSRC_CP_BAD_OPCODE;
+ source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
+ client_id == SOC15_IH_CLIENTID_VMC ||
+ client_id == SOC15_IH_CLIENTID_UTCL2;
}
static void event_interrupt_wq_v9(struct kfd_dev *dev,
@@ -82,7 +84,19 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
kfd_signal_hw_exception_event(pasid);
else if (client_id == SOC15_IH_CLIENTID_VMC ||
client_id == SOC15_IH_CLIENTID_UTCL2) {
- /* TODO */
+ struct kfd_vm_fault_info info = {0};
+ uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+
+ info.vmid = vmid;
+ info.mc_id = client_id;
+ info.page_addr = ih_ring_entry[4] |
+ (uint64_t)(ih_ring_entry[5] & 0xf) << 32;
+ info.prot_valid = ring_id & 0x08;
+ info.prot_read = ring_id & 0x10;
+ info.prot_write = ring_id & 0x20;
+
+ kfd_process_vm_fault(dev->dqm, pasid);
+ kfd_signal_vm_fault_event(dev, pasid, &info);
}
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 5e3990b..91a3368 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -838,6 +838,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm);
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
enum kfd_queue_type type);
void kernel_queue_uninit(struct kernel_queue *kq);
+int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
/* Process Queue Manager */
struct process_queue_node {
@@ -964,6 +965,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
uint64_t *event_page_offset, uint32_t *event_slot_index);
int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
+void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+ struct kfd_vm_fault_info *info);
+
void kfd_flush_tlb(struct kfd_process_device *pdd);
int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index b4f5073..46a54ab 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -219,7 +219,7 @@ struct kfd_memory_exception_failure {
__u32 NotPresent; /* Page not present or supervisor privilege */
__u32 ReadOnly; /* Write access to a read-only page */
__u32 NoExecute; /* Execute access to a page marked NX */
- __u32 pad;
+ __u32 imprecise; /* Can't determine the exact fault address */
};
/* memory exception data*/
--
2.7.4
More information about the amd-gfx
mailing list