[PATCH v6] drm/amdkfd: Provide SMI events watch

Sat Apr 18 03:09:25 UTC 2020

[AMD Public Use]

Now I understand what you mean by stack overflow. Thank you for the link. I didn't know about the kernel stack size of a thread. I learned something new again today :)

Regards,
Amber

-----Original Message-----
From: Kuehling, Felix <Felix.Kuehling at amd.com> 
Sent: Friday, April 17, 2020 10:19 PM
To: Lin, Amber <Amber.Lin at amd.com>; amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH v6] drm/amdkfd: Provide SMI events watch

Am 2020-04-17 um 9:48 p.m. schrieb Amber Lin:
>
>
> On 2020-04-17 6:31 p.m., Felix Kuehling wrote:
>> Am 2020-04-17 um 4:07 p.m. schrieb Amber Lin:
>>> When the compute is malfunctioning or performance drops, the system 
>>> admin will use the SMI (System Management Interface) tool to 
>>> monitor/diagnose what went wrong. This patch provides an event 
>>> watch interface for user space to register devices and subscribe 
>>> to the events they are interested in.
>>> After
>>> registering, the user can use the anonymous file descriptor's poll 
>>> function with a wait time specified and wait for events to happen. 
>>> Once an event happens, the user can use read() to retrieve 
>>> information related to the event.
>>>
>>> VM fault event is done in this patch.
>>>
>>> v2: - remove UNREGISTER and add event ENABLE/DISABLE
>>>      - correct kfifo usage
>>>      - move event message API to kfd_ioctl.h
>>> v3: send the event msg in text rather than in binary
>>> v4: support multiple clients
>>> v5: move events enablement from ioctl to fd write
>>>
>>> Signed-off-by: Amber Lin <Amber.Lin at amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/Makefile              |   1 +
>>>   drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  18 ++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   7 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |   4 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 215
>>> +++++++++++++++++++++++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  29 +++
>>>   include/uapi/linux/kfd_ioctl.h                   |  16 +-
>>>   9 files changed, 293 insertions(+), 1 deletion(-)
>>>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile
>>> b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> index 6147462..e1e4115 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
>>> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> @@ -53,6 +53,7 @@ AMDKFD_FILES    := $(AMDKFD_PATH)/kfd_module.o \
>>>           $(AMDKFD_PATH)/kfd_int_process_v9.o \
>>>           $(AMDKFD_PATH)/kfd_dbgdev.o \
>>>           $(AMDKFD_PATH)/kfd_dbgmgr.o \
>>> +        $(AMDKFD_PATH)/kfd_smi_events.o \
>>>           $(AMDKFD_PATH)/kfd_crat.o
>>>     ifneq ($(CONFIG_AMD_IOMMU_V2),)
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> index 9f59ba9..24b4717 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> @@ -24,6 +24,7 @@
>>>   #include "kfd_events.h"
>>>   #include "cik_int.h"
>>>   #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>     static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>>>                       const uint32_t *ih_ring_entry, @@ -107,6 
>>> +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>>>           ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>>>           struct kfd_vm_fault_info info;
>>>   +        kfd_smi_event_update_vmfault(dev, pasid);
>>>           kfd_process_vm_fault(dev->dqm, pasid);
>>>             memset(&info, 0, sizeof(info)); diff --git 
>>> a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index f8fa03a..2baaaec 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -39,6 +39,7 @@
>>>   #include "kfd_device_queue_manager.h"
>>>   #include "kfd_dbgmgr.h"
>>>   #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>     static long kfd_ioctl(struct file *, unsigned int, unsigned 
>>> long);
>>>   static int kfd_open(struct inode *, struct file *); @@ -1732,6 
>>> +1733,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
>>>       return r;
>>>   }
>>>   +/* Handle requests for watching SMI events */
>>> +static int kfd_ioctl_smi_events(struct file *filep,
>>> +                struct kfd_process *p, void *data) {
>>> +    struct kfd_ioctl_smi_events_args *args = data;
>>> +    struct kfd_dev *dev;
>>> +
>>> +    dev = kfd_device_by_id(args->gpuid);
>>> +    if (!dev)
>>> +        return -EINVAL;
>>> +
>>> +    return kfd_smi_event_open(dev, &args->anon_fd); }
>>> +
>>>   #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
>>>       [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = 
>>> _flags, \
>>>                   .cmd_drv = 0, .name = #ioctl} @@ -1827,6 +1842,9 
>>> @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>>>         AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>>>               kfd_ioctl_alloc_queue_gws, 0),
>>> +
>>> +    AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
>>> +            kfd_ioctl_smi_events, 0),
>>>   };
>>>     #define AMDKFD_CORE_IOCTL_COUNT    ARRAY_SIZE(amdkfd_ioctls) 
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index 0491ab2..2c030c2 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd)
>>>       return ret;
>>>   }
>>>   +static void kfd_smi_init(struct kfd_dev *dev) {
>>> +    INIT_LIST_HEAD(&dev->smi_clients);
>>> +    spin_lock_init(&dev->smi_lock); }
>>> +
>>>   bool kgd2kfd_device_init(struct kfd_dev *kfd,
>>>                struct drm_device *ddev,
>>>                const struct kgd2kfd_shared_resources *gpu_resources) 
>>> @@ -700,6 +705,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>>>           goto kfd_topology_add_device_error;
>>>       }
>>>   +    kfd_smi_init(kfd);
>>> +
>>>       kfd->init_complete = true;
>>>       dev_info(kfd_device, "added device %x:%x\n", 
>>> kfd->pdev->vendor,
>>>            kfd->pdev->device);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> index e05d75e..151e83e 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> @@ -24,6 +24,7 @@
>>>   #include "kfd_events.h"
>>>   #include "soc15_int.h"
>>>   #include "kfd_device_queue_manager.h"
>>> +#include "kfd_smi_events.h"
>>>     static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>>>                       const uint32_t *ih_ring_entry, @@ -117,6 
>>> +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>>>           info.prot_read  = ring_id & 0x10;
>>>           info.prot_write = ring_id & 0x20;
>>>   +        kfd_smi_event_update_vmfault(dev, pasid);
>>>           kfd_process_vm_fault(dev->dqm, pasid);
>>>           kfd_signal_vm_fault_event(dev, pasid, &info);
>>>       }
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 43b888b..dc873b0 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -309,6 +309,10 @@ struct kfd_dev {
>>>         /* Global GWS resource shared b/t processes*/
>>>       void *gws;
>>> +
>>> +    /* Clients watching SMI events */
>>> +    struct list_head smi_clients;
>>> +    spinlock_t smi_lock;
>>>   };
>>>     enum kfd_mempool {
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> new file mode 100644
>>> index 0000000..137aec8
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> @@ -0,0 +1,215 @@
>>> +/*
>>> + * Copyright 2020 Advanced Micro Devices, Inc.
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person
>>> obtaining a
>>> + * copy of this software and associated documentation files (the
>>> "Software"),
>>> + * to deal in the Software without restriction, including without
>>> limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute,
>>> sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to
>>> whom the
>>> + * Software is furnished to do so, subject to the following
>>> conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be
>>> included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>>> EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>>> MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
>>> EVENT SHALL
>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
>>> DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>>> OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
>>> USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#include <linux/poll.h>
>>> +#include <linux/wait.h>
>>> +#include <linux/anon_inodes.h>
>>> +#include <uapi/linux/kfd_ioctl.h>
>>> +#include "amdgpu_vm.h"
>>> +#include "kfd_priv.h"
>>> +#include "kfd_smi_events.h"
>>> +
>>> +struct kfd_smi_client {
>>> +    struct list_head list;
>>> +    struct kfifo fifo;
>>> +    wait_queue_head_t wait_queue;
>>> +    /* events enabled */
>>> +    uint64_t events;
>>> +    struct kfd_dev *dev;
>>> +    spinlock_t lock;
>>> +};
>>> +
>>> +#define MAX_KFIFO_SIZE    1024
>>> +
>>> +static __poll_t kfd_smi_ev_poll(struct file *, struct
>>> poll_table_struct *);
>>> +static ssize_t kfd_smi_ev_read(struct file *, char __user *,
>>> size_t, loff_t *);
>>> +static ssize_t kfd_smi_ev_write(struct file *, const char __user *,
>>> size_t,
>>> +                loff_t *);
>>> +static int kfd_smi_ev_release(struct inode *, struct file *);
>>> +
>>> +static const char kfd_smi_name[] = "kfd_smi_ev";
>>> +
>>> +static const struct file_operations kfd_smi_ev_fops = {
>>> +    .owner = THIS_MODULE,
>>> +    .poll = kfd_smi_ev_poll,
>>> +    .read = kfd_smi_ev_read,
>>> +    .write = kfd_smi_ev_write,
>>> +    .release = kfd_smi_ev_release
>>> +};
>>> +
>>> +static __poll_t kfd_smi_ev_poll(struct file *filep,
>>> +                struct poll_table_struct *wait) {
>>> +    __poll_t mask;
>>> +    struct kfd_smi_client *client = filep->private_data;
>>> +
>>> +    poll_wait(filep, &client->wait_queue, wait);
>>> +
>>> +    spin_lock(&client->lock);
>>> +    mask = kfifo_is_empty(&client->fifo) ? 0: POLLIN | POLLRDNORM;
>>> +    spin_unlock(&client->lock);
>>> +
>>> +    return mask;
>>> +}
>>> +
>>> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user 
>>> +*user,
>>> +                   size_t size, loff_t *offset) {
>>> +    int ret, to_copy;
>>> +    struct kfd_smi_client *client = filep->private_data;
>>> +    unsigned char buf[MAX_KFIFO_SIZE];
>> If you grow the MAX_KFIFO_SIZE in the future, this will lead to a 
>> stack overflow. It's OK for now. Maybe add a 
>> BUILD_BUG_ON(MAX_KFIFO_SIZE >
>> 1024) to make it obvious for anyone who wants to grow the FIFO size 
>> later.
>>
> I have changed kfifo_alloc to use MAX_KFIFO_SIZE (not hard coded 1024) 
> when changing kfifo_to_user to kfifo_out, so this doesn't overflow.
> When people grow the fifo size, they change MAX_KFIFO_SIZE definition.

The problem is that buf is on the stack. The kernel stack is quite limited in size (see https://www.kernel.org/doc/html/latest/x86/kernel-stacks.html). On
x86_64 it's 8KB. So if MAX_KFIFO_SIZE grows significantly, it will overflow the small kernel stack. This is not obvious to someone changing the value of MAX_KFIFO_SIZE. Therefore I recommend adding a BUILD_BUG_ON, so that this problem will be obvious when the value of MAX_KFIFO_SIZE is changed.

>>> +
>>> +    /* kfifo_to_user can sleep so we can't use spinlock protection
>>> around
>>> +     * it. Instead, we kfifo out as spinlocked then copy them to
>>> the user.
>>> +     */
>>> +    spin_lock(&client->lock);
>>> +    to_copy = kfifo_len(&client->fifo);
>>> +    if (!to_copy) {
>>> +        spin_unlock(&client->lock);
>>> +        return -EAGAIN;
>>> +    }
>>> +    to_copy = to_copy > size ? size : to_copy;
>> You could use min(size, to_copy) here. But you also need to check 
>> against sizeof(buf), otherwise you risk overwriting your stack. Maybe 
>> min3(size, sizeof(buf), to_copy).
>>
> Same reason as above, kfifo_len should never return greater than 
> MAX_KFIFO_SIZE, so min(size, to_copy) should be good enough.

OK, that's a good argument. But again, this will break if the buf size ever gets decoupled from the FIFO size, potentially resulting in an exploitable kernel security bug. It's an easy mistake to make, and easy to miss in a code review that doesn't take a lot of context into account. I'd rather protect against that by explicitly checking the size of the access against the allocated buffer size.

>>> +    ret = kfifo_out(&client->fifo, buf, to_copy);
>>> +    spin_unlock(&client->lock);
>>> +    if (ret <= 0)
>>> +        return -EAGAIN;
>>> +
>>> +    ret = copy_to_user(user, buf, to_copy);
>>> +    if (ret) {
>>> +        pr_debug("smi-events: fail to send msg (%i) (%i)\n",
>>> +            ret, to_copy);
>>> +        return ret;
>>> +    }
>>> +
>>> +    return to_copy;
>>> +}
>>> +
>>> +static ssize_t kfd_smi_ev_write(struct file *filep, const char
>>> __user *user,
>>> +                size_t size, loff_t *offset) {
>>> +    struct kfd_smi_client *client = filep->private_data;
>>> +    uint64_t events;
>>> +
>>> +    if (!access_ok(user, size) || size < sizeof(events))
>>> +        return -EFAULT;
>>> +    if (copy_from_user(&events, user, sizeof(events)))
>>> +        return -EFAULT;
>>> +
>>> +    WRITE_ONCE(client->events, events);
>>> +
>>> +    return sizeof(events);
>>> +}
>>> +
>>> +static int kfd_smi_ev_release(struct inode *inode, struct file 
>>> +*filep) {
>>> +    struct kfd_smi_client *client = filep->private_data;
>>> +    struct kfd_dev *dev = client->dev;
>>> +
>>> +    spin_lock(&dev->smi_lock);
>>> +    list_del_rcu(&client->list);
>>> +    spin_unlock(&dev->smi_lock);
>>> +
>>> +    synchronize_rcu();
>>> +    kfifo_free(&client->fifo);
>>> +    kfree(client);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t 
>>> +pasid) {
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
>>> +    struct amdgpu_task_info task_info;
>>> +    /* VmFault msg = (hex)uint32_pid(8) + :(1) + task name(16) = 25 
>>> +*/
>>> +    /* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 
>>> +43
>>> +     */
>>> +    char fifo_in[43];
>>> +    struct kfd_smi_client *client;
>>> +
>>> +    rcu_read_lock();
>>> +    if (list_empty(&dev->smi_clients)){
>>> +        rcu_read_unlock();
>>> +        return;
>>> +    }
>>> +    rcu_read_unlock();
>> I don't think you need an rcu_read_lock for this atomic list_empty 
>> check.
> I'll remove them.
>>
>>> +
>>> +    amdgpu_vm_get_task_info(adev, pasid, &task_info);
>>> +    snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
>>> +        task_info.pid, task_info.task_name);
>>> +
>>> +    rcu_read_lock();
>>> +
>>> +    list_for_each_entry_rcu(client, &dev->smi_clients, list) {
>>> +        if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
>>> +            continue;
>>> +        spin_lock(&client->lock);
>>> +        if (kfifo_avail(&client->fifo) < sizeof(fifo_in)) {
>>> +            spin_unlock(&client->lock);
>>> +            rcu_read_unlock();
>>> +            pr_debug("smi_event(vmfault): no space left\n");
>>> +            return;
>> If one client is overflowing, that should not stop event delivery to 
>> other clients. That would allow one client to stage a denial of 
>> service attack on other clients simply by never reading its events. 
>> So exiting here is wrong. You should just continue with the next client.
> Right. My mistake. I'll change it.
>>
>>> +        }
>>> +        kfifo_in(&client->fifo, fifo_in, sizeof(fifo_in));
>>> +        wake_up_all(&client->wait_queue);
>>> +        spin_unlock(&client->lock);
>>> +    }
>>> +
>>> +    rcu_read_unlock();
>>> +}
>>> +
>>> +int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) {
>>> +    struct kfd_smi_client *client;
>>> +    int ret;
>>> +
>>> +    client = kzalloc(sizeof(struct kfd_smi_client), GFP_KERNEL);
>>> +    if (!client)
>>> +        return -ENOMEM;
>>> +    INIT_LIST_HEAD(&client->list);
>>> +
>>> +    ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL);
>>> +    if (ret) {
>>> +        kfree(client);
>>> +        return ret;
>>> +    }
>>> +
>>> +    ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void
>>> *)client,
>>> +                   O_RDWR);
>>> +    if (ret < 0) {
>>> +        kfifo_free(&client->fifo);
>>> +        kfree(client);
>>> +        *fd = 0;
>> You don't need to overwrite *fd here. If the function fails, it 
>> should not write its output parameter.
> I just feel that I shouldn't leave args.anon_fd as-is if the user 
> didn't initialize it as an invalid number.

Ideally a function that fails should have no side effects. It should be as if the function was never called at all. That simplifies further error handling in the calling function. It won't have to guess whether it needs to undo any partial work done by the failed function. Output parameters such as *fd are a kind of side effect.

This is a trivial example, but applying this rule as consistently as possible can help with error handling in more complex scenarios.

Thanks,
  Felix

>>
>>
>>> +        return ret;
>>> +    }
>>> +    *fd = ret;
>>> +
>>> +    init_waitqueue_head(&client->wait_queue);
>>> +    spin_lock_init(&client->lock);
>>> +    client->events = 0;
>>> +    client->dev = dev;
>>> +
>>> +    spin_lock(&dev->smi_lock);
>>> +    list_add_rcu(&client->list, &dev->smi_clients);
>>> +    spin_unlock(&dev->smi_lock);
>>> +
>>> +    return 0;
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> new file mode 100644
>>> index 0000000..a9cb218
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> @@ -0,0 +1,29 @@
>>> +/*
>>> + * Copyright 2020 Advanced Micro Devices, Inc.
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person
>>> obtaining a
>>> + * copy of this software and associated documentation files (the
>>> "Software"),
>>> + * to deal in the Software without restriction, including without
>>> limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute,
>>> sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to
>>> whom the
>>> + * Software is furnished to do so, subject to the following
>>> conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be
>>> included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>>> EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>>> MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
>>> EVENT SHALL
>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
>>> DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>>> OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
>>> USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
>>> +#define KFD_SMI_EVENTS_H_INCLUDED
>>> +
>>> +int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd); void 
>>> +kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t
>>> pasid);
>>> +
>>> +#endif
>>> diff --git a/include/uapi/linux/kfd_ioctl.h 
>>> b/include/uapi/linux/kfd_ioctl.h index 4f66764..ad33c18 100644
>>> --- a/include/uapi/linux/kfd_ioctl.h
>>> +++ b/include/uapi/linux/kfd_ioctl.h
>> As we discussed, we should bump the KFD_IOCTL_MINOR_VERSION for this 
>> change. Skip version 1.2 and go straight to 1.3 because version 1.2 
>> is being used by the DKMS builds.
>>
>> Regards,
>>    Felix
> Yes I do plan to have a 2nd patch following this one once you are 
> happy with my patch :)
>> @@ -442,6 +442,17 @@ struct kfd_ioctl_import_dmabuf_args {
>>       __u32 dmabuf_fd;    /* to KFD */
>>   };
>>   +/*
>> + * KFD SMI(System Management Interface) events  */
>> +/* Event type (defined by bitmask) */ #define KFD_SMI_EVENT_VMFAULT     
>> +0x0000000000000001
>> +
>> +struct kfd_ioctl_smi_events_args {
>> +    __u32 gpuid;    /* to KFD */
>> +    __u32 anon_fd;    /* from KFD */ };
>> +
>>   /* Register offset inside the remapped mmio page
>>    */
>>   enum kfd_mmio_remap {
>> @@ -546,7 +557,10 @@ enum kfd_mmio_remap {
>>   #define AMDKFD_IOC_ALLOC_QUEUE_GWS        \
>>           AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>>   +#define AMDKFD_IOC_SMI_EVENTS            \
>> +        AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
>> +
>>   #define AMDKFD_COMMAND_START        0x01 -#define 
>> AMDKFD_COMMAND_END        0x1F
>> +#define AMDKFD_COMMAND_END        0x20
>>     #endif
>