[PATCH v2] drm/amdkfd: Provide SMI events watch
Kuehling, Felix
Felix.Kuehling at amd.com
Fri Apr 3 19:01:45 UTC 2020
[AMD Official Use Only - Internal Distribution Only]
So are you saying you'll make the event descriptions text rather than binary?
If you switch to a text format, I wouldn't use a binary header. Rather I'd make it a text format completely. You could use one line per event, that makes it easy to use something like fgets to read a line (event) at a time in user mode.
Each line could still start with an event identifier, but it would be text rather than a binary. And you don’t need the size if you define "\n" as delimiter between events.
Regards,
Felix
-----Original Message-----
From: Lin, Amber <Amber.Lin at amd.com>
Sent: Friday, April 3, 2020 11:38
To: Kuehling, Felix <Felix.Kuehling at amd.com>; amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH v2] drm/amdkfd: Provide SMI events watch
Further thinking about it, I'll use struct kfd_smi_msg_header. Instead of using struct kfd_smi_msg_vmfault, it's a description about the event.
This way we make it generic to all events.
On 2020-04-03 9:38 a.m., Amber Lin wrote:
> Thanks Felix. I'll make changes accordingly but please pay attention
> to my last reply inline.
>
> On 2020-04-02 7:51 p.m., Felix Kuehling wrote:
>> On 2020-04-02 4:46 p.m., Amber Lin wrote:
>>> When the compute is malfunctioning or performance drops, the system
>>> admin will use SMI (System Management Interface) tool to
>>> monitor/diagnostic what went wrong. This patch provides an event
>>> watch interface for the user space to register events they are
>>> interested. After the event is registered, the user can use
>>> annoymous file descriptor's poll function with wait-time specified
>>> to wait for the event to happen. Once the event happens, the user
>>> can use read() to retrieve information related to the event.
>>>
>>> VM fault event is done in this patch.
>>>
>>> v2: - remove UNREGISTER and add event ENABLE/DISABLE
>>> - correct kfifo usage
>>> - move event message API to kfd_ioctl.h
>>>
>>> Signed-off-by: Amber Lin <Amber.Lin at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdkfd/Makefile | 3 +-
>>> drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 +
>>> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 ++++
>>> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 +
>>> drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 +
>>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 12 ++
>>> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 177
>>> +++++++++++++++++++++++
>>> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 31 ++++
>>> include/uapi/linux/kfd_ioctl.h | 30 +++-
>>> 9 files changed, 286 insertions(+), 2 deletions(-)
>>> create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile
>>> b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> index 6147462..cc98b4a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
>>> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
>>> $(AMDKFD_PATH)/kfd_int_process_v9.o \
>>> $(AMDKFD_PATH)/kfd_dbgdev.o \
>>> $(AMDKFD_PATH)/kfd_dbgmgr.o \
>>> - $(AMDKFD_PATH)/kfd_crat.o
>>> + $(AMDKFD_PATH)/kfd_crat.o \
>>> + $(AMDKFD_PATH)/kfd_smi_events.o
>>> ifneq ($(CONFIG_AMD_IOMMU_V2),)
>>> AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git
>>> a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> index 9f59ba9..24b4717 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> @@ -24,6 +24,7 @@
>>> #include "kfd_events.h"
>>> #include "cik_int.h"
>>> #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>> static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>>> const uint32_t *ih_ring_entry, @@ -107,6
>>> +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>>> ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>>> struct kfd_vm_fault_info info;
>>> + kfd_smi_event_update_vmfault(dev, pasid);
>>> kfd_process_vm_fault(dev->dqm, pasid);
>>> memset(&info, 0, sizeof(info)); diff --git
>>> a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index f8fa03a..591ac28 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -39,6 +39,7 @@
>>> #include "kfd_device_queue_manager.h"
>>> #include "kfd_dbgmgr.h"
>>> #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>> static long kfd_ioctl(struct file *, unsigned int, unsigned
>>> long);
>>> static int kfd_open(struct inode *, struct file *); @@ -1243,6
>>> +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep,
>>> struct kfd_process *p,
>>> return ret;
>>> }
>>> +/* Handle requests for watching SMI events */
>>> +static int kfd_ioctl_smi_events(struct file *filep,
>>> + struct kfd_process *p, void *data) {
>>> + struct kfd_ioctl_smi_events_args *args = data;
>>> + struct kfd_dev *dev;
>>> +
>>> + dev = kfd_device_by_id(args->gpu_id);
>>> + if (!dev)
>>> + return -EINVAL;
>>> +
>>> + switch (args->op) {
>>> + case KFD_SMI_EVENTS_REGISTER:
>>> + /* register the device */
>>> + return kfd_smi_event_register(dev, &args->data);
>>> + case KFD_SMI_EVENTS_ENABLE:
>>> + /* subscribe events to the device */
>>> + return kfd_smi_event_enable(dev, args->events);
>>> + case KFD_SMI_EVENTS_DISABLE:
>>> + /* unsubscribe events */
>>> + return kfd_smi_event_disable(dev, args->events);
>>> + }
>>> +
>>> + return -EINVAL;
>>> +}
>>> +
>>> bool kfd_dev_is_large_bar(struct kfd_dev *dev)
>>> {
>>> struct kfd_local_mem_info mem_info; @@ -1827,6 +1854,9 @@
>>> static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>>> AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>>> kfd_ioctl_alloc_queue_gws, 0),
>>> +
>>> + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
>>> + kfd_ioctl_smi_events, 0),
>>> };
>>> #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff
>>> --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index 0491ab2..6ac6f31 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev
>>> *kgd,
>>> kfd->device_info = device_info;
>>> kfd->pdev = pdev;
>>> kfd->init_complete = false;
>>> + kfd->smi_events.registered = false;
>>> kfd->kfd2kgd = f2g;
>>> atomic_set(&kfd->compute_profile, 0);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> index e05d75e..151e83e 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> @@ -24,6 +24,7 @@
>>> #include "kfd_events.h"
>>> #include "soc15_int.h"
>>> #include "kfd_device_queue_manager.h"
>>> +#include "kfd_smi_events.h"
>>> static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>>> const uint32_t *ih_ring_entry, @@ -117,6
>>> +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>>> info.prot_read = ring_id & 0x10;
>>> info.prot_write = ring_id & 0x20;
>>> + kfd_smi_event_update_vmfault(dev, pasid);
>>> kfd_process_vm_fault(dev->dqm, pasid);
>>> kfd_signal_vm_fault_event(dev, pasid, &info);
>>> }
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 43b888b..b10b665 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -223,6 +223,15 @@ struct kfd_vmid_info {
>>> uint32_t vmid_num_kfd;
>>> };
>>> +struct kfd_smi_events {
>>> + /* device is registered to watch events */
>>> + bool registered;
>>> + /* events enabled */
>>> + uint32_t events;
>>
>> This should be 64-bit to allow for more future expansion. It doesn't
>> matter as much here, because you can always change the internal
>> header later. But in the ioctl API we can't change it later, so we
>> should define it as 64-bit from the start.
>>
> Ok, I'll change back to 64 bits. I was thinking to use "data" as group
> if we have to run out of bits anyway, but with further thinking, we
> can squish several events into one since it's event "type", not "ID".
> 64 will be quite sufficient and no need to worry about running out of it.
Ignore what I wrote about squishing events. I was thinking the output format.
>>
>>> + DECLARE_KFIFO_PTR(fifo, uint32_t);
>>> + wait_queue_head_t wait_queue;
>>> +};
>>> +
>>> struct kfd_dev {
>>> struct kgd_dev *kgd;
>>> @@ -309,6 +318,9 @@ struct kfd_dev {
>>> /* Global GWS resource shared b/t processes*/
>>> void *gws;
>>> +
>>> + /* if this device is in SMI events watch */
>>> + struct kfd_smi_events smi_events;
>>> };
>>> enum kfd_mempool {
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> new file mode 100644
>>> index 0000000..aab4dac
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> @@ -0,0 +1,177 @@
>>> +/*
>>> + * Copyright 2020 Advanced Micro Devices, Inc.
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person
>>> obtaining a
>>> + * copy of this software and associated documentation files (the
>>> "Software"),
>>> + * to deal in the Software without restriction, including without
>>> limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute,
>>> sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to
>>> whom the
>>> + * Software is furnished to do so, subject to the following
>>> conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be
>>> included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>>> EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>>> MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
>>> EVENT SHALL
>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
>>> DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>>> OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
>>> USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#include <linux/poll.h>
>>> +#include <linux/wait.h>
>>> +#include <linux/anon_inodes.h>
>>> +#include <uapi/linux/kfd_ioctl.h>
>>> +#include "amdgpu_vm.h"
>>> +#include "kfd_priv.h"
>>> +#include "kfd_smi_events.h"
>>> +
>>> +static DEFINE_MUTEX(kfd_smi_mutex); struct mutex
>>> +*kfd_get_smi_mutex(void) {
>>> + return &kfd_smi_mutex;
>>> +}
>>
>> Why do you need a function for this. Just reference kfd_smi_mutex
>> directly below. But since the fifo is per device, should there also
>> be one mutex per device?
>>
> I'm not sure how to create mutex for smi_event so I referenced the
> debugger code.
>>> +
>>> +static __poll_t kfd_smi_ev_poll(struct file *, struct
>>> poll_table_struct *);
>>> +static ssize_t kfd_smi_ev_read(struct file *, char __user *,
>>> size_t, loff_t *);
>>> +static int kfd_smi_ev_release(struct inode *, struct file *);
>>> +
>>> +static uint32_t smi_fifo_len;
>>
>> This could be statically initialized rather than doing it in a
>> function below because this can be calculated at compile time. It's
>> also constant:
>>
>> static const uint32_t smi_fifo_len = 32 * (sizeof(struct
>> kfd_smi_msg_vmfault)/sizeof(uint32_t) + 1);
>>
>> Or you could probably even make is a #define.
>>
> True. I'll change it.
>>
>>> +static const char kfd_smi_name[] = "kfd_smi_ev";
>>> +
>>> +static const struct file_operations kfd_smi_ev_fops = {
>>> + .owner = THIS_MODULE,
>>> + .poll = kfd_smi_ev_poll,
>>> + .read = kfd_smi_ev_read,
>>> + .release = kfd_smi_ev_release
>>> +};
>>> +
>>> +static __poll_t kfd_smi_ev_poll(struct file *filep,
>>> + struct poll_table_struct *wait) {
>>> + __poll_t mask;
>>> + struct kfd_dev *dev = filep->private_data;
>>> +
>>> + poll_wait(filep, &dev->smi_events.wait_queue, wait);
>>> + mask = kfifo_is_empty(&dev->smi_events.fifo) ? 0: POLLIN |
>>> POLLRDNORM;
>>> +
>>> + return mask;
>>> +}
>>> +
>>> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user
>>> +*user,
>>> + size_t size, loff_t *offset) {
>>> + int ret, copied = 0;
>>> + struct kfd_dev *dev = filep->private_data;
>>> +
>>> + mutex_lock(kfd_get_smi_mutex());
>>> + ret = kfifo_to_user(&dev->smi_events.fifo, user, size,
>>> +&copied);
>>> + mutex_unlock(kfd_get_smi_mutex());
>>> + if (ret || !copied) {
>>> + pr_debug("smi-events: fail to send msg (%i) (%i)\n",
>>> + ret, copied);
>>> + return ret ? ret : -EAGAIN;
>>> + }
>>> +
>>> + return copied;
>>> +}
>>> +
>>> +static int kfd_smi_ev_release(struct inode *inode, struct file
>>> +*filep) {
>>> + struct kfd_dev *dev = filep->private_data;
>>> +
>>> + dev->smi_events.events = 0;
>>> + kfifo_free(&dev->smi_events.fifo);
>>
>> This can cause race conditions. You should do this under the fifo
>> mutex. Also set dev->smi_events.registered = false to allow a new
>> registration after this.
>>
> Good catch. I'll add them.
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t
>>> +pasid) {
>>> + struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
>>> + struct amdgpu_task_info task_info;
>>> + struct kfd_smi_msg_vmfault msg;
>>> + uint32_t *fifo_in = (uint32_t *)&msg;
>>> + uint32_t fifo_in_len = sizeof(msg) / sizeof(uint32_t);
>>> +
>>> + if (!(dev->smi_events.events & KFD_SMI_EVENT_VMFAULT))
>>> + return;
>>
>> You need to do this while holding the fifo mutex, otherwise the fifo
>> can be destroyed before you access it below.
>>
> Ok
>>
>>> +
>>> + amdgpu_vm_get_task_info(adev, pasid, &task_info);
>>> + msg.pid = task_info.pid;
>>> + strncpy(msg.task_name, task_info.task_name, 16);
>>> +
>>> + mutex_lock(kfd_get_smi_mutex());
>>> + if ((fifo_in_len + 1) >
>>> + (smi_fifo_len - kfifo_len(&dev->smi_events.fifo))) {
>>
>> You could use kfifo_avail to get the space available in the fifo.
>>
> Ah. Thanks.
>>
>>> + pr_err("smi_event: no space left for vmfault event\n");
>>> + mutex_unlock(kfd_get_smi_mutex());
>>> + return;
>>> + }
>>> + /* Always send the event type first to function as a header */
>>> + kfifo_put(&dev->smi_events.fifo, KFD_SMI_EVENT_VMFAULT);
>>> + /* Then the msg as the payload. Event type reveals the payload
>>> size. */
>>> + kfifo_in(&dev->smi_events.fifo, fifo_in, fifo_in_len);
>>> + mutex_unlock(kfd_get_smi_mutex());
>>> +
>>> + wake_up_all(&dev->smi_events.wait_queue);
>>> +}
>>> +
>>> +int kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events) {
>>> + mutex_lock(kfd_get_smi_mutex());
>>> + dev->smi_events.events &= ~events;
>>> + mutex_unlock(kfd_get_smi_mutex());
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +int kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events) {
>>> + /* If the user didn't register SMI events for this device,
>>> kfifo is not
>>> + * created to report events.
>>> + */
>>> + if (!dev->smi_events.registered)
>>> + return -EINVAL;
>>> +
>>> + mutex_lock(kfd_get_smi_mutex());
>>> + dev->smi_events.events |= events;
>>> + mutex_unlock(kfd_get_smi_mutex());
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static void assign_fifo_len(void)
>>> +{
>>> + /* When a new event is added into support and this new event
>>> msg uses
>>> + * more memory, replace the msg struct here.
>>> + */
>>> + uint32_t max_msg = sizeof(struct
>>> kfd_smi_msg_vmfault)/sizeof(uint32_t);
>>> + /* +1 for the event type in front of event message. up to 32
>>> entries */
>>> + smi_fifo_len = (++max_msg) * 32;
>>
>> See above. This could be statically initialized.
>>
>>
>>> +}
>>> +
>>> +int kfd_smi_event_register(struct kfd_dev *dev, int *fd) {
>>> + int ret = 0;
>>
>> This function should return failure if the event was already
>> registered. You can only allow one FD to be created per device,
>> because that FD owns the FIFO.
>>
>> This entire function must be under the fifo mutex.
>>
> Ok
>>
>>> +
>>> + if (!smi_fifo_len)
>>> + assign_fifo_len();
>>> +
>>> + ret = kfifo_alloc(&dev->smi_events.fifo, smi_fifo_len,
>>> GFP_KERNEL);
>>> + if (ret) {
>>> + pr_err("smi_event: fail to allocate kfifo\n");
>>> + return ret;
>>> + }
>>> + init_waitqueue_head(&dev->smi_events.wait_queue);
>>> + dev->smi_events.events = 0;
>>> + dev->smi_events.registered = true;
>>> +
>>> + ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
>>> + (void *)dev, 0);
>>> + *fd = ret;
>>> +
>>> + return ret;
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> new file mode 100644
>>> index 0000000..e10bb5d
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> @@ -0,0 +1,31 @@
>>> +/*
>>> + * Copyright 2020 Advanced Micro Devices, Inc.
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person
>>> obtaining a
>>> + * copy of this software and associated documentation files (the
>>> "Software"),
>>> + * to deal in the Software without restriction, including without
>>> limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute,
>>> sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to
>>> whom the
>>> + * Software is furnished to do so, subject to the following
>>> conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be
>>> included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>>> EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>>> MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
>>> EVENT SHALL
>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
>>> DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>>> OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
>>> USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
>>> +#define KFD_SMI_EVENTS_H_INCLUDED
>>> +
>>> +int kfd_smi_event_register(struct kfd_dev *dev, int *fd); int
>>> +kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events); int
>>> +kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events); void
>>> +kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t
>>> pasid);
>>> +
>>> +#endif
>>> diff --git a/include/uapi/linux/kfd_ioctl.h
>>> b/include/uapi/linux/kfd_ioctl.h index 4f66764..dc9309e 100644
>>> --- a/include/uapi/linux/kfd_ioctl.h
>>> +++ b/include/uapi/linux/kfd_ioctl.h
>>> @@ -442,6 +442,31 @@ struct kfd_ioctl_import_dmabuf_args {
>>> __u32 dmabuf_fd; /* to KFD */
>>> };
>>> +/*
>>> + * KFD SMI(System Management Interface) events */ enum
>>> +kfd_smi_events_op {
>>> + KFD_SMI_EVENTS_REGISTER = 1,
>>> + KFD_SMI_EVENTS_ENABLE,
>>> + KFD_SMI_EVENTS_DISABLE
>>> +};
>>> +
>>> +/* Event type (defined by bitmask) */ #define KFD_SMI_EVENT_VMFAULT
>>> +0x00000001
>>
>> Since you can only have 64 events, you only need one byte. You have
>> plenty of space in the header dword to add the message size. That
>> would make it easier to parse the events and allow variable size
>> events in the future. Then you should define the header as a struct.
>> Something like this:
>>
>> struct kfd_smi_msg_header {
>> __u8 type;
>> __u8 pad;
>> __u16 size;
>> };
>>
> It sounds a good idea, but I don't see how it improves the parsing...
> The user reads the content based on the event type. My test code on
> the user space is like this:
>
> read(fd, &event, sizeof(uint32_t)); //will change uint64_t
>
> switch (event) {
>
> case event_A:
>
> read(fd, &struct_event_A_variable, sizeof(struct event_A));
>
> case event_B:
>
> read(fd, &struct_event_B_variable, sizeof(struct event_B));
>
> Not exactly like that, such as read() is done in a separate function,
> but you see the idea.
>
> This is what I mean by the event type tells the size itself. If I
> provide the header you suggested, the user still needs to decide which
> struct to use.... Unless we don't use struct for the message. I can
> change the output to become event:description_of_the event where
> description is a string. The header you suggest will apply to this
> method pretty well.
>
As stated in the beginning, I'll use header+description where description is a string whose size rounds up to power of 4 (sizeof(uint32_t)).
>>
>>> +
>>> +struct kfd_ioctl_smi_events_args {
>>> + __u32 op; /* to KFD */
>>> + __u32 events; /* to KFD */
>>
>> I thought this should be 64-bit.
>>
>>
>>> + __u32 gpu_id; /* to KFD */
>>> + __u32 data; /* from KFD */
>>> +};
>>> +
>>> +/* Message to user space for each event */ struct
>>> +kfd_smi_msg_vmfault {
>>
>> If you define the header like suggested above, you could include it
>> here and at the start of all message structures. Might simplify the
>> code a little:
>>
>> struct kfd_smi_msg_header header;
>>
>> Regards,
>> Felix
>>
>>> + uint32_t pid;
>>> + char task_name[16];
>>> +};
>>> +
>>> /* Register offset inside the remapped mmio page
>>> */
>>> enum kfd_mmio_remap {
>>> @@ -546,7 +571,10 @@ enum kfd_mmio_remap {
>>> #define AMDKFD_IOC_ALLOC_QUEUE_GWS \
>>> AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>>> +#define AMDKFD_IOC_SMI_EVENTS \
>>> + AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
>>> +
>>> #define AMDKFD_COMMAND_START 0x01 -#define
>>> AMDKFD_COMMAND_END 0x1F
>>> +#define AMDKFD_COMMAND_END 0x20
>>> #endif
More information about the amd-gfx
mailing list