[PATCH v2] drm/amdkfd: Provide SMI events watch

Fri Apr 3 15:38:24 UTC 2020

Further thinking about it, I'll use struct kfd_smi_msg_header. Instead 
of using struct kfd_smi_msg_vmfault, it's a description about the event. 
This way we make it generic to all events.

On 2020-04-03 9:38 a.m., Amber Lin wrote:
> Thanks Felix. I'll make changes accordingly but please pay attention 
> to my last reply inline.
>
> On 2020-04-02 7:51 p.m., Felix Kuehling wrote:
>> On 2020-04-02 4:46 p.m., Amber Lin wrote:
>>> When the compute is malfunctioning or performance drops, the system 
>>> admin
>>> will use SMI (System Management Interface) tool to 
>>> monitor/diagnostic what
>>> went wrong. This patch provides an event watch interface for the user
>>> space to register events they are interested. After the event is
>>> registered, the user can use annoymous file descriptor's poll function
>>> with wait-time specified to wait for the event to happen. Once the 
>>> event
>>> happens, the user can use read() to retrieve information related to the
>>> event.
>>>
>>> VM fault event is done in this patch.
>>>
>>> v2: - remove UNREGISTER and add event ENABLE/DISABLE
>>>      - correct kfifo usage
>>>      - move event message API to kfd_ioctl.h
>>>
>>> Signed-off-by: Amber Lin <Amber.Lin at amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/Makefile              |   3 +-
>>>   drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  30 ++++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   1 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  12 ++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 177 
>>> +++++++++++++++++++++++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  31 ++++
>>>   include/uapi/linux/kfd_ioctl.h                   |  30 +++-
>>>   9 files changed, 286 insertions(+), 2 deletions(-)
>>>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
>>> b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> index 6147462..cc98b4a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
>>> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> @@ -53,7 +53,8 @@ AMDKFD_FILES    := $(AMDKFD_PATH)/kfd_module.o \
>>>           $(AMDKFD_PATH)/kfd_int_process_v9.o \
>>>           $(AMDKFD_PATH)/kfd_dbgdev.o \
>>>           $(AMDKFD_PATH)/kfd_dbgmgr.o \
>>> -        $(AMDKFD_PATH)/kfd_crat.o
>>> +        $(AMDKFD_PATH)/kfd_crat.o \
>>> +        $(AMDKFD_PATH)/kfd_smi_events.o
>>>     ifneq ($(CONFIG_AMD_IOMMU_V2),)
>>>   AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
>>> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> index 9f59ba9..24b4717 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> @@ -24,6 +24,7 @@
>>>   #include "kfd_events.h"
>>>   #include "cik_int.h"
>>>   #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>     static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>>>                       const uint32_t *ih_ring_entry,
>>> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct 
>>> kfd_dev *dev,
>>>           ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>>>           struct kfd_vm_fault_info info;
>>>   +        kfd_smi_event_update_vmfault(dev, pasid);
>>>           kfd_process_vm_fault(dev->dqm, pasid);
>>>             memset(&info, 0, sizeof(info));
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index f8fa03a..591ac28 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -39,6 +39,7 @@
>>>   #include "kfd_device_queue_manager.h"
>>>   #include "kfd_dbgmgr.h"
>>>   #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>     static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>>>   static int kfd_open(struct inode *, struct file *);
>>> @@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file 
>>> *filep, struct kfd_process *p,
>>>       return ret;
>>>   }
>>>   +/* Handle requests for watching SMI events */
>>> +static int kfd_ioctl_smi_events(struct file *filep,
>>> +                struct kfd_process *p, void *data)
>>> +{
>>> +    struct kfd_ioctl_smi_events_args *args = data;
>>> +    struct kfd_dev *dev;
>>> +
>>> +    dev = kfd_device_by_id(args->gpu_id);
>>> +    if (!dev)
>>> +        return -EINVAL;
>>> +
>>> +    switch (args->op) {
>>> +    case KFD_SMI_EVENTS_REGISTER:
>>> +        /* register the device */
>>> +        return kfd_smi_event_register(dev, &args->data);
>>> +    case KFD_SMI_EVENTS_ENABLE:
>>> +        /* subscribe events to the device */
>>> +        return kfd_smi_event_enable(dev, args->events);
>>> +    case KFD_SMI_EVENTS_DISABLE:
>>> +        /* unsubscribe events */
>>> +        return kfd_smi_event_disable(dev, args->events);
>>> +    }
>>> +
>>> +    return -EINVAL;
>>> +}
>>> +
>>>   bool kfd_dev_is_large_bar(struct kfd_dev *dev)
>>>   {
>>>       struct kfd_local_mem_info mem_info;
>>> @@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc 
>>> amdkfd_ioctls[] = {
>>>         AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>>>               kfd_ioctl_alloc_queue_gws, 0),
>>> +
>>> +    AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
>>> +            kfd_ioctl_smi_events, 0),
>>>   };
>>>     #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index 0491ab2..6ac6f31 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
>>>       kfd->device_info = device_info;
>>>       kfd->pdev = pdev;
>>>       kfd->init_complete = false;
>>> +    kfd->smi_events.registered = false;
>>>       kfd->kfd2kgd = f2g;
>>>       atomic_set(&kfd->compute_profile, 0);
>>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> index e05d75e..151e83e 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> @@ -24,6 +24,7 @@
>>>   #include "kfd_events.h"
>>>   #include "soc15_int.h"
>>>   #include "kfd_device_queue_manager.h"
>>> +#include "kfd_smi_events.h"
>>>     static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>>>                       const uint32_t *ih_ring_entry,
>>> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev 
>>> *dev,
>>>           info.prot_read  = ring_id & 0x10;
>>>           info.prot_write = ring_id & 0x20;
>>>   +        kfd_smi_event_update_vmfault(dev, pasid);
>>>           kfd_process_vm_fault(dev->dqm, pasid);
>>>           kfd_signal_vm_fault_event(dev, pasid, &info);
>>>       }
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 43b888b..b10b665 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -223,6 +223,15 @@ struct kfd_vmid_info {
>>>       uint32_t vmid_num_kfd;
>>>   };
>>>   +struct kfd_smi_events {
>>> +    /* device is registered to watch events */
>>> +    bool registered;
>>> +    /* events enabled */
>>> +    uint32_t events;
>>
>> This should be 64-bit to allow for more future expansion. It doesn't 
>> matter as much here, because you can always change the internal 
>> header later. But in the ioctl API we can't change it later, so we 
>> should define it as 64-bit from the start.
>>
> Ok, I'll change back to 64 bits. I was thinking to use "data" as group 
> if we have to run out of bits anyway, but with further thinking, we 
> can squish several events into one since it's event "type", not "ID". 
> 64 will be quite sufficient and no need to worry about running out of it.
Ignore what I wrote about squishing events. I was thinking the output 
format.
>>
>>> +    DECLARE_KFIFO_PTR(fifo, uint32_t);
>>> +    wait_queue_head_t wait_queue;
>>> +};
>>> +
>>>   struct kfd_dev {
>>>       struct kgd_dev *kgd;
>>>   @@ -309,6 +318,9 @@ struct kfd_dev {
>>>         /* Global GWS resource shared b/t processes*/
>>>       void *gws;
>>> +
>>> +    /* if this device is in SMI events watch */
>>> +    struct kfd_smi_events smi_events;
>>>   };
>>>     enum kfd_mempool {
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> new file mode 100644
>>> index 0000000..aab4dac
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> @@ -0,0 +1,177 @@
>>> +/*
>>> + * Copyright 2020 Advanced Micro Devices, Inc.
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person 
>>> obtaining a
>>> + * copy of this software and associated documentation files (the 
>>> "Software"),
>>> + * to deal in the Software without restriction, including without 
>>> limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, 
>>> sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to 
>>> whom the
>>> + * Software is furnished to do so, subject to the following 
>>> conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be 
>>> included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>>> EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>>> MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO 
>>> EVENT SHALL
>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
>>> DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
>>> OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
>>> USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#include <linux/poll.h>
>>> +#include <linux/wait.h>
>>> +#include <linux/anon_inodes.h>
>>> +#include <uapi/linux/kfd_ioctl.h>
>>> +#include "amdgpu_vm.h"
>>> +#include "kfd_priv.h"
>>> +#include "kfd_smi_events.h"
>>> +
>>> +static DEFINE_MUTEX(kfd_smi_mutex);
>>> +struct mutex *kfd_get_smi_mutex(void)
>>> +{
>>> +    return &kfd_smi_mutex;
>>> +}
>>
>> Why do you need a function for this. Just reference kfd_smi_mutex 
>> directly below. But since the fifo is per device, should there also 
>> be one mutex per device?
>>
> I'm not sure how to create mutex for smi_event so I referenced the 
> debugger code.
>>> +
>>> +static __poll_t kfd_smi_ev_poll(struct file *, struct 
>>> poll_table_struct *);
>>> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, 
>>> size_t, loff_t *);
>>> +static int kfd_smi_ev_release(struct inode *, struct file *);
>>> +
>>> +static uint32_t smi_fifo_len;
>>
>> This could be statically initialized rather than doing it in a 
>> function below because this can be calculated at compile time. It's 
>> also constant:
>>
>> static const uint32_t smi_fifo_len = 32 * (sizeof(struct 
>> kfd_smi_msg_vmfault)/sizeof(uint32_t) + 1);
>>
>> Or you could probably even make is a #define.
>>
> True. I'll change it.
>>
>>> +static const char kfd_smi_name[] = "kfd_smi_ev";
>>> +
>>> +static const struct file_operations kfd_smi_ev_fops = {
>>> +    .owner = THIS_MODULE,
>>> +    .poll = kfd_smi_ev_poll,
>>> +    .read = kfd_smi_ev_read,
>>> +    .release = kfd_smi_ev_release
>>> +};
>>> +
>>> +static __poll_t kfd_smi_ev_poll(struct file *filep,
>>> +                struct poll_table_struct *wait)
>>> +{
>>> +    __poll_t mask;
>>> +    struct kfd_dev *dev = filep->private_data;
>>> +
>>> +    poll_wait(filep, &dev->smi_events.wait_queue, wait);
>>> +    mask = kfifo_is_empty(&dev->smi_events.fifo) ? 0: POLLIN | 
>>> POLLRDNORM;
>>> +
>>> +    return mask;
>>> +}
>>> +
>>> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
>>> +                   size_t size, loff_t *offset)
>>> +{
>>> +    int ret, copied = 0;
>>> +    struct kfd_dev *dev = filep->private_data;
>>> +
>>> +    mutex_lock(kfd_get_smi_mutex());
>>> +    ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied);
>>> +    mutex_unlock(kfd_get_smi_mutex());
>>> +    if (ret || !copied) {
>>> +        pr_debug("smi-events: fail to send msg (%i) (%i)\n",
>>> +            ret, copied);
>>> +        return ret ? ret : -EAGAIN;
>>> +    }
>>> +
>>> +    return copied;
>>> +}
>>> +
>>> +static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
>>> +{
>>> +    struct kfd_dev *dev = filep->private_data;
>>> +
>>> +    dev->smi_events.events = 0;
>>> +    kfifo_free(&dev->smi_events.fifo);
>>
>> This can cause race conditions. You should do this under the fifo 
>> mutex. Also set dev->smi_events.registered = false to allow a new 
>> registration after this.
>>
> Good catch. I'll add them.
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>>> +{
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
>>> +    struct amdgpu_task_info task_info;
>>> +    struct kfd_smi_msg_vmfault msg;
>>> +    uint32_t *fifo_in = (uint32_t *)&msg;
>>> +    uint32_t fifo_in_len = sizeof(msg) / sizeof(uint32_t);
>>> +
>>> +    if (!(dev->smi_events.events & KFD_SMI_EVENT_VMFAULT))
>>> +        return;
>>
>> You need to do this while holding the fifo mutex, otherwise the fifo 
>> can be destroyed before you access it below.
>>
> Ok
>>
>>> +
>>> +    amdgpu_vm_get_task_info(adev, pasid, &task_info);
>>> +    msg.pid = task_info.pid;
>>> +    strncpy(msg.task_name, task_info.task_name, 16);
>>> +
>>> +    mutex_lock(kfd_get_smi_mutex());
>>> +    if ((fifo_in_len + 1) >
>>> +            (smi_fifo_len - kfifo_len(&dev->smi_events.fifo))) {
>>
>> You could use kfifo_avail to get the space available in the fifo.
>>
> Ah. Thanks.
>>
>>> +        pr_err("smi_event: no space left for vmfault event\n");
>>> +        mutex_unlock(kfd_get_smi_mutex());
>>> +        return;
>>> +    }
>>> +    /* Always send the event type first to function as a header */
>>> +    kfifo_put(&dev->smi_events.fifo, KFD_SMI_EVENT_VMFAULT);
>>> +    /* Then the msg as the payload. Event type reveals the payload 
>>> size. */
>>> +    kfifo_in(&dev->smi_events.fifo, fifo_in, fifo_in_len);
>>> +    mutex_unlock(kfd_get_smi_mutex());
>>> +
>>> +    wake_up_all(&dev->smi_events.wait_queue);
>>> +}
>>> +
>>> +int kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events)
>>> +{
>>> +    mutex_lock(kfd_get_smi_mutex());
>>> +    dev->smi_events.events &= ~events;
>>> +    mutex_unlock(kfd_get_smi_mutex());
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +int kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events)
>>> +{
>>> +    /* If the user didn't register SMI events for this device, 
>>> kfifo is not
>>> +     * created to report events.
>>> +     */
>>> +    if (!dev->smi_events.registered)
>>> +        return -EINVAL;
>>> +
>>> +    mutex_lock(kfd_get_smi_mutex());
>>> +    dev->smi_events.events |= events;
>>> +    mutex_unlock(kfd_get_smi_mutex());
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void assign_fifo_len(void)
>>> +{
>>> +    /* When a new event is added into support and this new event 
>>> msg uses
>>> +     * more memory, replace the msg struct here.
>>> +     */
>>> +    uint32_t max_msg = sizeof(struct 
>>> kfd_smi_msg_vmfault)/sizeof(uint32_t);
>>> +    /* +1 for the event type in front of event message. up to 32 
>>> entries */
>>> +    smi_fifo_len = (++max_msg) * 32;
>>
>> See above. This could be statically initialized.
>>
>>
>>> +}
>>> +
>>> +int kfd_smi_event_register(struct kfd_dev *dev, int *fd)
>>> +{
>>> +    int ret = 0;
>>
>> This function should return failure if the event was already 
>> registered. You can only allow one FD to be created per device, 
>> because that FD owns the FIFO.
>>
>> This entire function must be under the fifo mutex.
>>
> Ok
>>
>>> +
>>> +    if (!smi_fifo_len)
>>> +        assign_fifo_len();
>>> +
>>> +    ret = kfifo_alloc(&dev->smi_events.fifo, smi_fifo_len, 
>>> GFP_KERNEL);
>>> +    if (ret) {
>>> +        pr_err("smi_event: fail to allocate kfifo\n");
>>> +        return ret;
>>> +    }
>>> +    init_waitqueue_head(&dev->smi_events.wait_queue);
>>> +    dev->smi_events.events = 0;
>>> +    dev->smi_events.registered = true;
>>> +
>>> +    ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
>>> +                (void *)dev, 0);
>>> +    *fd = ret;
>>> +
>>> +    return ret;
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> new file mode 100644
>>> index 0000000..e10bb5d
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> @@ -0,0 +1,31 @@
>>> +/*
>>> + * Copyright 2020 Advanced Micro Devices, Inc.
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person 
>>> obtaining a
>>> + * copy of this software and associated documentation files (the 
>>> "Software"),
>>> + * to deal in the Software without restriction, including without 
>>> limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, 
>>> sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to 
>>> whom the
>>> + * Software is furnished to do so, subject to the following 
>>> conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be 
>>> included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>>> EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>>> MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO 
>>> EVENT SHALL
>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
>>> DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
>>> OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
>>> USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
>>> +#define KFD_SMI_EVENTS_H_INCLUDED
>>> +
>>> +int kfd_smi_event_register(struct kfd_dev *dev, int *fd);
>>> +int kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events);
>>> +int kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events);
>>> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t 
>>> pasid);
>>> +
>>> +#endif
>>> diff --git a/include/uapi/linux/kfd_ioctl.h 
>>> b/include/uapi/linux/kfd_ioctl.h
>>> index 4f66764..dc9309e 100644
>>> --- a/include/uapi/linux/kfd_ioctl.h
>>> +++ b/include/uapi/linux/kfd_ioctl.h
>>> @@ -442,6 +442,31 @@ struct kfd_ioctl_import_dmabuf_args {
>>>       __u32 dmabuf_fd;    /* to KFD */
>>>   };
>>>   +/*
>>> + * KFD SMI(System Management Interface) events
>>> + */
>>> +enum kfd_smi_events_op {
>>> +    KFD_SMI_EVENTS_REGISTER = 1,
>>> +    KFD_SMI_EVENTS_ENABLE,
>>> +    KFD_SMI_EVENTS_DISABLE
>>> +};
>>> +
>>> +/* Event type (defined by bitmask) */
>>> +#define KFD_SMI_EVENT_VMFAULT     0x00000001
>>
>> Since you can only have 64 events, you only need one byte. You have 
>> plenty of space in the header dword to add the message size. That 
>> would make it easier to parse the events and allow variable size 
>> events in the future. Then you should define the header as a struct. 
>> Something like this:
>>
>> struct kfd_smi_msg_header {
>>     __u8 type;
>>     __u8 pad;
>>     __u16 size;
>> };
>>
> It sounds a good idea, but I don't see how it improves the parsing... 
> The user reads the content based on the event type. My test code on 
> the user space is like this:
>
> read(fd, &event, sizeof(uint32_t)); //will change uint64_t
>
> switch (event) {
>
> case event_A:
>
>     read(fd, &struct_event_A_variable, sizeof(struct event_A));
>
> case event_B:
>
>     read(fd, &struct_event_B_variable, sizeof(struct event_B));
>
> Not exactly like that, such as read() is done in a separate function, 
> but you see the idea.
>
> This is what I mean by the event type tells the size itself. If I 
> provide the header you suggested, the user still needs to decide which 
> struct to use.... Unless we don't use struct for the message.  I can 
> change the output to become event:description_of_the event where 
> description is a string. The header you suggest will apply to this 
> method pretty well.
>
As stated in the beginning, I'll use header+description where 
description is a string whose size rounds up to power of 4 
(sizeof(uint32_t)).
>>
>>> +
>>> +struct kfd_ioctl_smi_events_args {
>>> +    __u32 op;       /* to KFD */
>>> +    __u32 events;   /* to KFD */
>>
>> I thought this should be 64-bit.
>>
>>
>>> +    __u32 gpu_id;   /* to KFD */
>>> +    __u32 data;    /* from KFD */
>>> +};
>>> +
>>> +/* Message to user space for each event */
>>> +struct kfd_smi_msg_vmfault {
>>
>> If you define the header like suggested above, you could include it 
>> here and at the start of all message structures. Might simplify the 
>> code a little:
>>
>>     struct kfd_smi_msg_header header;
>>
>> Regards,
>>   Felix
>>
>>> +    uint32_t pid;
>>> +    char task_name[16];
>>> +};
>>> +
>>>   /* Register offset inside the remapped mmio page
>>>    */
>>>   enum kfd_mmio_remap {
>>> @@ -546,7 +571,10 @@ enum kfd_mmio_remap {
>>>   #define AMDKFD_IOC_ALLOC_QUEUE_GWS        \
>>>           AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>>>   +#define AMDKFD_IOC_SMI_EVENTS            \
>>> +        AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
>>> +
>>>   #define AMDKFD_COMMAND_START        0x01
>>> -#define AMDKFD_COMMAND_END        0x1F
>>> +#define AMDKFD_COMMAND_END        0x20
>>>     #endif