[PATCH 5/9] drm/amdkfd: Add memory exception handling
Oded Gabbay
oded.gabbay at amd.com
Tue Apr 21 05:47:22 PDT 2015
From: Alexey Skidanov <Alexey.Skidanov at amd.com>
This patch adds Peripheral Page Request (PPR) failure processing and reporting.
Bad address or pointer to a system memory block with inappropriate
read/write permission cause such PPR failure during a user queue
processing. PPR request handling is done by IOMMU driver notifying
AMDKFD module on PPR failure.
The process triggering a PPR failure will be notified by appropriate event
or SIGTERM signal will be sent to it.
Signed-off-by: Alexey Skidanov <Alexey.Skidanov at amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay at amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 29 ++++++
drivers/gpu/drm/amd/amdkfd/kfd_events.c | 154 ++++++++++++++++++++++++++++++--
drivers/gpu/drm/amd/amdkfd/kfd_events.h | 7 ++
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +-
4 files changed, 186 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 4c03169..52cab0f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -182,6 +182,32 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
kfd_unbind_process_from_device(dev, pasid);
}
+/*
+ * This function called by IOMMU driver on PPR failure
+ */
+static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
+ unsigned long address, u16 flags)
+{
+ struct kfd_dev *dev;
+
+ dev_warn(kfd_device,
+ "Invalid PPR device %x:%x.%x pasid %d address 0x%lX flags 0x%X",
+ PCI_BUS_NUM(pdev->devfn),
+ PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn),
+ pasid,
+ address,
+ flags);
+
+ dev = kfd_device_by_pci_dev(pdev);
+ BUG_ON(dev == NULL);
+
+ kfd_signal_iommu_event(dev, pasid, address,
+ flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC);
+
+ return AMD_IOMMU_INV_PRI_RSP_INVALID;
+}
+
bool kgd2kfd_device_init(struct kfd_dev *kfd,
const struct kgd2kfd_shared_resources *gpu_resources)
{
@@ -251,6 +277,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
}
amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
iommu_pasid_shutdown_callback);
+ amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
kfd->dqm = device_queue_manager_init(kfd);
if (!kfd->dqm) {
@@ -316,6 +343,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
if (kfd->init_complete) {
kfd->dqm->ops.stop(kfd->dqm);
amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
+ amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
amd_iommu_free_device(kfd->pdev);
}
}
@@ -335,6 +363,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
return -ENXIO;
amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
iommu_pasid_shutdown_callback);
+ amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
kfd->dqm->ops.start(kfd->dqm);
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 984251d..ac60fb2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -30,6 +30,7 @@
#include <linux/memory.h>
#include "kfd_priv.h"
#include "kfd_events.h"
+#include <linux/device.h>
#define SIGNAL_EVENT_LIMIT 256
@@ -47,6 +48,10 @@ struct kfd_event_waiter {
/* Transitions to true when the event this belongs to is signaled. */
bool activated;
+
+ /* Event */
+ struct kfd_event *event;
+ uint32_t input_index;
};
/*
@@ -586,14 +591,17 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
}
static int init_event_waiter(struct kfd_process *p,
- struct kfd_event_waiter *waiter,
- uint32_t event_id)
+ struct kfd_event_waiter *waiter,
+ uint32_t event_id,
+ uint32_t input_index)
{
struct kfd_event *ev = lookup_event_by_id(p, event_id);
if (!ev)
return -EINVAL;
+ waiter->event = ev;
+ waiter->input_index = input_index;
waiter->activated = ev->signaled;
ev->signaled = ev->signaled && !ev->auto_reset;
@@ -620,6 +628,37 @@ static bool test_event_condition(bool all, uint32_t num_events,
return activated_count == num_events;
}
+/*
+ * Copy event specific data, if defined.
+ * Currently only memory exception events have additional data to copy to user
+ */
+static bool copy_signaled_event_data(uint32_t num_events,
+ struct kfd_event_waiter *event_waiters,
+ struct kfd_event_data __user *data)
+{
+ struct kfd_hsa_memory_exception_data *src, *dst;
+ struct kfd_event_waiter *waiter;
+ struct kfd_event *event;
+ uint32_t i;
+
+ for (i = 0; i < num_events; i++) {
+ waiter = &event_waiters[i];
+ event = waiter->event;
+ if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) {
+ dst = &data[waiter->input_index].memory_exception_data;
+ src = &event->memory_exception_data;
+ if (copy_to_user(dst, src,
+ sizeof(struct kfd_hsa_memory_exception_data)))
+ return false;
+ }
+ }
+
+ return true;
+
+}
+
+
+
static long user_timeout_to_jiffies(uint32_t user_timeout_ms)
{
if (user_timeout_ms == KFD_EVENT_TIMEOUT_IMMEDIATE)
@@ -649,10 +688,12 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters)
}
int kfd_wait_on_events(struct kfd_process *p,
- uint32_t num_events, const uint32_t __user *event_ids,
+ uint32_t num_events, void __user *data,
bool all, uint32_t user_timeout_ms,
enum kfd_event_wait_result *wait_result)
{
+ struct kfd_event_data __user *events =
+ (struct kfd_event_data __user *) data;
uint32_t i;
int ret = 0;
struct kfd_event_waiter *event_waiters = NULL;
@@ -667,13 +708,14 @@ int kfd_wait_on_events(struct kfd_process *p,
}
for (i = 0; i < num_events; i++) {
- uint32_t event_id;
+ struct kfd_event_data event_data;
- ret = get_user(event_id, &event_ids[i]);
- if (ret)
+ if (copy_from_user(&event_data, &events[i],
+ sizeof(struct kfd_event_data)))
goto fail;
- ret = init_event_waiter(p, &event_waiters[i], event_id);
+ ret = init_event_waiter(p, &event_waiters[i],
+ event_data.event_id, i);
if (ret)
goto fail;
}
@@ -700,7 +742,11 @@ int kfd_wait_on_events(struct kfd_process *p,
}
if (test_event_condition(all, num_events, event_waiters)) {
- *wait_result = KFD_WAIT_COMPLETE;
+ if (copy_signaled_event_data(num_events,
+ event_waiters, events))
+ *wait_result = KFD_WAIT_COMPLETE;
+ else
+ *wait_result = KFD_WAIT_ERROR;
break;
}
@@ -773,3 +819,95 @@ int radeon_kfd_event_mmap(struct kfd_process *p,
return remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE,
vma->vm_page_prot);
}
+
+/*
+ * Assumes that p->event_mutex is held and of course
+ * that p is not going away (current or locked).
+ */
+static void lookup_events_by_type_and_signal(struct kfd_process *p,
+ int type, void *event_data)
+{
+ struct kfd_hsa_memory_exception_data *ev_data;
+ struct kfd_event *ev;
+ int bkt;
+ bool send_signal = true;
+
+ ev_data = (struct kfd_hsa_memory_exception_data *) event_data;
+
+ hash_for_each(p->events, bkt, ev, events)
+ if (ev->type == type) {
+ send_signal = false;
+ dev_dbg(kfd_device,
+ "Event found: id %X type %d",
+ ev->event_id, ev->type);
+ set_event(ev);
+ if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data)
+ ev->memory_exception_data = *ev_data;
+ }
+
+ /* Send SIGTERM no event of type "type" has been found*/
+ if (send_signal) {
+ dev_warn(kfd_device,
+ "Sending SIGTERM to HSA Process with PID %d ",
+ p->lead_thread->pid);
+ send_sig(SIGTERM, p->lead_thread, 0);
+ }
+}
+
+void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
+ unsigned long address, bool is_write_requested,
+ bool is_execute_requested)
+{
+ struct kfd_hsa_memory_exception_data memory_exception_data;
+ struct vm_area_struct *vma;
+
+ /*
+ * Because we are called from arbitrary context (workqueue) as opposed
+ * to process context, kfd_process could attempt to exit while we are
+ * running so the lookup function returns a locked process.
+ */
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+ if (!p)
+ return; /* Presumably process exited. */
+
+ memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+
+ down_read(&p->mm->mmap_sem);
+ vma = find_vma(p->mm, address);
+
+ memory_exception_data.gpu_id = dev->id;
+ memory_exception_data.va = address;
+ /* Set failure reason */
+ memory_exception_data.failure.NotPresent = true;
+ memory_exception_data.failure.NoExecute = false;
+ memory_exception_data.failure.ReadOnly = false;
+ if (vma) {
+ if (vma->vm_start > address) {
+ memory_exception_data.failure.NotPresent = true;
+ memory_exception_data.failure.NoExecute = false;
+ memory_exception_data.failure.ReadOnly = false;
+ } else {
+ memory_exception_data.failure.NotPresent = false;
+ if (is_write_requested && !(vma->vm_flags & VM_WRITE))
+ memory_exception_data.failure.ReadOnly = true;
+ else
+ memory_exception_data.failure.ReadOnly = false;
+ if (is_execute_requested && !(vma->vm_flags & VM_EXEC))
+ memory_exception_data.failure.NoExecute = true;
+ else
+ memory_exception_data.failure.NoExecute = false;
+ }
+ }
+
+ up_read(&p->mm->mmap_sem);
+
+ mutex_lock(&p->event_mutex);
+
+ /* Lookup events by type and signal them */
+ lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_MEMORY,
+ &memory_exception_data);
+
+ mutex_unlock(&p->event_mutex);
+ mutex_unlock(&p->mutex);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
index d9b5b38..691cf85 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
@@ -28,6 +28,7 @@
#include <linux/types.h>
#include <linux/list.h>
#include "kfd_priv.h"
+#include <uapi/linux/kfd_ioctl.h>
#define KFD_EVENT_ID_NONSIGNAL_MASK 0x80000000U
#define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK
@@ -61,6 +62,11 @@ struct kfd_event {
struct signal_page *signal_page;
unsigned int signal_slot_index;
uint64_t __user *user_signal_address;
+
+ /* type specific data */
+ union {
+ struct kfd_hsa_memory_exception_data memory_exception_data;
+ };
};
#define KFD_EVENT_TIMEOUT_IMMEDIATE 0
@@ -69,6 +75,7 @@ struct kfd_event {
/* Matching HSA_EVENTTYPE */
#define KFD_EVENT_TYPE_SIGNAL 0
#define KFD_EVENT_TYPE_DEBUG 5
+#define KFD_EVENT_TYPE_MEMORY 8
extern void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
uint32_t valid_id_bits);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 7f6ea96..8ecc033 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -694,11 +694,14 @@ void kfd_event_free_process(struct kfd_process *p);
int radeon_kfd_event_mmap(struct kfd_process *process,
struct vm_area_struct *vma);
int kfd_wait_on_events(struct kfd_process *p,
- uint32_t num_events, const uint32_t __user *event_ids,
+ uint32_t num_events, void __user *data,
bool all, uint32_t user_timeout_ms,
enum kfd_event_wait_result *wait_result);
void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
uint32_t valid_id_bits);
+void kfd_signal_iommu_event(struct kfd_dev *dev,
+ unsigned int pasid, unsigned long address,
+ bool is_write_requested, bool is_execute_requested);
int kfd_set_event(struct kfd_process *p, uint32_t event_id);
int kfd_reset_event(struct kfd_process *p, uint32_t event_id);
int kfd_event_create(struct file *devkfd, struct kfd_process *p,
--
1.9.1
More information about the dri-devel
mailing list