[PATCH] drm/amdkfd: Provide SMI events watch

Felix Kuehling felix.kuehling at amd.com
Wed Apr 1 20:54:14 UTC 2020


Am 2020-04-01 um 9:10 a.m. schrieb Amber Lin:
>
> Thanks Felix for the review. I have a better understanding of how
> kfifo works now and have changed my code quite a bit. Couple of
> questions below inline regarding the gpu_id and data arguments.
>
Replies inline ...


> Thanks.
>
> Amber
>
> On 2020-03-26 4:53 p.m., Felix Kuehling wrote:
>>
>> Hi Amber,
>>
>> I see that this is based on the debugger event code. Jon and I are
>> just working through some issues with that code. The lessons from
>> that will need to be applied to this as well. But I think we can
>> define your API to simplify this a bit.
>>
>> The basic problem is, that we have one Fifo in the kfd_device, but
>> potentially multiple file descriptors referring to it. For the event
>> interface I think we can enforce only a single file descriptor per
>> device. If there is already one, your register call can fail. See
>> more comments inline.
>>
>> On 2020-03-17 13:57, Amber Lin wrote:
>>> When the compute is malfunctioning or performance drops, the system admin
>>> will use SMI (System Management Interface) tool to monitor/diagnostic what
>>> went wrong. This patch provides an event watch interface for the user
>>> space to register events they are interested. After the event is
>>> registered, the user can use annoymous file descriptor's pull function
>>
>> pull -> poll
>>
> Thank you for spotting the typo. I’ll change that.
>
>>> with wait-time specified to wait for the event to happen. Once the event
>>> happens, the user can use read() to retrieve information related to the
>>> event.
>>>
>>> VM fault event is done in this patch.
>>>
>>> Signed-off-by: Amber Lin <Amber.Lin at amd.com>
>>> ---
>>>  drivers/gpu/drm/amd/amdkfd/Makefile              |   3 +-
>>>  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>>>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  38 ++++++
>>>  drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   1 +
>>>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>>>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  10 ++
>>>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 143 +++++++++++++++++++++++
>>>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  41 +++++++
>>>  include/uapi/linux/kfd_ioctl.h                   |  27 ++++-
>>>  9 files changed, 265 insertions(+), 2 deletions(-)
>>>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> index 6147462..cc98b4a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
>>> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> @@ -53,7 +53,8 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
>>>  		$(AMDKFD_PATH)/kfd_int_process_v9.o \
>>>  		$(AMDKFD_PATH)/kfd_dbgdev.o \
>>>  		$(AMDKFD_PATH)/kfd_dbgmgr.o \
>>> -		$(AMDKFD_PATH)/kfd_crat.o
>>> +		$(AMDKFD_PATH)/kfd_crat.o \
>>> +		$(AMDKFD_PATH)/kfd_smi_events.o
>>>  
>>>  ifneq ($(CONFIG_AMD_IOMMU_V2),)
>>>  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> index 9f59ba9..24b4717 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> @@ -24,6 +24,7 @@
>>>  #include "kfd_events.h"
>>>  #include "cik_int.h"
>>>  #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>  
>>>  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>>>  					const uint32_t *ih_ring_entry,
>>> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>>>  		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>>>  		struct kfd_vm_fault_info info;
>>>  
>>> +		kfd_smi_event_update_vmfault(dev, pasid);
>>>  		kfd_process_vm_fault(dev->dqm, pasid);
>>>  
>>>  		memset(&info, 0, sizeof(info));
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index f8fa03a..8e92956 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -39,6 +39,7 @@
>>>  #include "kfd_device_queue_manager.h"
>>>  #include "kfd_dbgmgr.h"
>>>  #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>  
>>>  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>>>  static int kfd_open(struct inode *, struct file *);
>>> @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
>>>  	return ret;
>>>  }
>>>  
>>> +/* Handle requests for watching SMI events */
>>> +static int kfd_ioctl_smi_events(struct file *filep,
>>> +				struct kfd_process *p, void *data)
>>> +{
>>> +	struct kfd_ioctl_smi_events_args *args = data;
>>> +	struct kfd_dev *dev;
>>> +	int ret = 0;
>>> +
>>> +	dev = kfd_device_by_id(args->gpu_id);
>>> +	if (!dev)
>>> +		return -EINVAL;
>>> +
>>> +	switch (args->op) {
>>> +	case KFD_SMI_EVENTS_REGISTER:
>>> +		ret = kfd_smi_event_register(dev, args->events);
>>> +		if (ret >= 0) {
>>> +			/* When the registration is successful, it returns the
>>> +			 * annoymous inode. Pass it to the user in data1
>>> +			 */
>>> +			args->data1 = ret;
>>> +			ret = 0;
>>
>> You could return the file descriptor as the return value. On success
>> it returns a positive fd. On failure it returns a negative error code.
>>
> I'm thinking to have a consistent return value regardless of the
> argument content. When args->op is not REGISTER, the return value as 0
> is success.

OK.


>>> +		}
>>> +		break;
>>> +	case KFD_SMI_EVENTS_UNREGISTER:
>>> +		kfd_smi_event_unregister(dev, args->events);
>>
>> Register seems to do two things: create a file descriptor, and
>> subscribe to specific events. This unregister function only affects
>> the subscription but not the file descriptor. I'd suggest a cleaner API:
>>
>>   * Register: creates the file descriptor
>>   * Subscribe: enables/disables subscription to specific events
>>
>> The unregistration is done implicitly by closing the file descriptor,
>> so there is no explicit ioctl API call for this.
>>
> Make sense. I've changed this.
>>
>>> +		break;
>>> +	default:
>>> +		ret = -EINVAL;
>>> +		break;
>>> +	}
>>> +
>>> +	return ret;
>>> +}
>>> +
>>>  bool kfd_dev_is_large_bar(struct kfd_dev *dev)
>>>  {
>>>  	struct kfd_local_mem_info mem_info;
>>> @@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>>>  
>>>  	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>>>  			kfd_ioctl_alloc_queue_gws, 0),
>>> +
>>> +	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
>>> +			kfd_ioctl_smi_events, 0),
>>>  };
>>>  
>>>  #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index 7866cd06..450368c 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
>>>  	kfd->device_info = device_info;
>>>  	kfd->pdev = pdev;
>>>  	kfd->init_complete = false;
>>> +	kfd->smi_events.events = 0;
>>>  	kfd->kfd2kgd = f2g;
>>>  	atomic_set(&kfd->compute_profile, 0);
>>>  
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> index e05d75e..151e83e 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> @@ -24,6 +24,7 @@
>>>  #include "kfd_events.h"
>>>  #include "soc15_int.h"
>>>  #include "kfd_device_queue_manager.h"
>>> +#include "kfd_smi_events.h"
>>>  
>>>  static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>>>  					const uint32_t *ih_ring_entry,
>>> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>>>  		info.prot_read  = ring_id & 0x10;
>>>  		info.prot_write = ring_id & 0x20;
>>>  
>>> +		kfd_smi_event_update_vmfault(dev, pasid);
>>>  		kfd_process_vm_fault(dev->dqm, pasid);
>>>  		kfd_signal_vm_fault_event(dev, pasid, &info);
>>>  	}
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 43b888b..fdb51de 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -223,6 +223,13 @@ struct kfd_vmid_info {
>>>  	uint32_t vmid_num_kfd;
>>>  };
>>>  
>>> +struct kfd_smi_events {
>>> +	uint64_t events;
>>> +	struct kfifo fifo;
>>> +	wait_queue_head_t wait_queue;
>>> +	uint32_t max_events;
>>> +};
>>> +
>>>  struct kfd_dev {
>>>  	struct kgd_dev *kgd;
>>>  
>>> @@ -309,6 +316,9 @@ struct kfd_dev {
>>>  
>>>  	/* Global GWS resource shared b/t processes*/
>>>  	void *gws;
>>> +
>>> +	/* if this device is in SMI events watch */
>>> +	struct kfd_smi_events smi_events;
>>>  };
>>>  
>>>  enum kfd_mempool {
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> new file mode 100644
>>> index 0000000..ba9d036
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> @@ -0,0 +1,143 @@
>>> +/*
>>> + * Copyright 2020 Advanced Micro Devices, Inc.
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person obtaining a
>>> + * copy of this software and associated documentation files (the "Software"),
>>> + * to deal in the Software without restriction, including without limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to whom the
>>> + * Software is furnished to do so, subject to the following conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#include <linux/poll.h>
>>> +#include <linux/wait.h>
>>> +#include <linux/anon_inodes.h>
>>> +#include <uapi/linux/kfd_ioctl.h>
>>> +#include "amdgpu_vm.h"
>>> +#include "kfd_priv.h"
>>> +#include "kfd_smi_events.h"
>>> +
>>> +static DEFINE_MUTEX(kfd_smi_mutex);
>>> +
>>> +struct mutex *kfd_get_smi_mutex(void)
>>> +{
>>> +	return &kfd_smi_mutex;
>>> +}
>>> +
>>> +static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
>>> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
>>> +static int kfd_smi_ev_release(struct inode *, struct file *);
>>> +
>>> +static const char kfd_smi_name[] = "kfd_smi_ev";
>>> +
>>> +static const struct file_operations kfd_smi_ev_fops = {
>>> +	.owner = THIS_MODULE,
>>> +	.poll = kfd_smi_ev_poll,
>>> +	.read = kfd_smi_ev_read,
>>> +	.release = kfd_smi_ev_release
>>> +};
>>> +
>>> +static __poll_t kfd_smi_ev_poll(struct file *filep,
>>> +				struct poll_table_struct *wait)
>>> +{
>>> +	struct kfd_dev *dev = filep->private_data;
>>> +	struct kfd_smi_events *ev = &dev->smi_events;
>>> +
>>> +	__poll_t mask = 0;
>>> +
>>> +	poll_wait(filep, &ev->wait_queue, wait);
>>> +	mask |= !kfifo_is_empty(&ev->fifo) ? POLLIN | POLLRDNORM : mask;
>>> +
>>> +	return mask;
>>> +}
>>> +
>>> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
>>> +			       size_t size, loff_t *offset)
>>> +{
>>> +	int ret, copied = 0;
>>> +	struct kfd_dev *dev = filep->private_data;
>>> +
>>> +	ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied);
>>> +	if (ret || !copied) {
>>> +		pr_debug("kfd smi-events: Fail to read fd (%i) (%i)\n",
>>> +				ret, copied);
>>> +		return ret ? ret : -EAGAIN;
>>> +	}
>>> +
>>> +	return copied;
>>> +}
>>> +
>>> +static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
>>> +{
>>> +	struct kfd_dev *dev = filep->private_data;
>>> +
>>> +	kfifo_free(&dev->smi_events.fifo);
>>> +	return 0;
>>> +}
>>> +
>>> +void kfd_smi_event_update_vmfault(struct kfd_dev *kdev, uint16_t pasid)
>>> +{
>>> +	struct kfd_smi_vmfault_fifo fifo_out;
>>> +	struct amdgpu_device *adev = (struct amdgpu_device *)kdev->kgd;
>>> +	struct amdgpu_task_info task_info;
>>> +
>>> +	if (!kdev->smi_events.events)
>>> +		return;
>>
>> This condition is redundant.
>>
> Removed it. It was from my original plan to handle multiple events in
> one function but after realizing not do-able and changing the function
> name, I forgot to remove this check.
>>
>>> +
>>> +	if (!(kdev->smi_events.events & KFD_SMI_EV_VMFAULT))
>>> +		return;
>>> +
>>> +	memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>>> +	amdgpu_vm_get_task_info(adev, pasid, &task_info);
>>> +
>>> +	fifo_out.group = 0;
>>> +	fifo_out.event = KFD_SMI_EV_VMFAULT;
>>> +	fifo_out.gpu_id = kdev->id;
>>> +	fifo_out.pid = task_info.pid;
>>> +	strcpy(fifo_out.task_name, task_info.task_name);
>>> +	kfifo_in(&kdev->smi_events.fifo, &fifo_out, sizeof(fifo_out));
>>> +	wake_up_all(&kdev->smi_events.wait_queue);
>>> +}
>>> +
>>> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events)
>>> +{
>>> +	mutex_lock(kfd_get_smi_mutex());
>>> +	dev->smi_events.events &= ~events;
>>> +	mutex_unlock(kfd_get_smi_mutex());
>>> +}
>>> +
>>> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events)
>>> +{
>>> +	int ret;
>>> +
>>> +	mutex_lock(kfd_get_smi_mutex());
>>> +	dev->smi_events.events |= events;
>>> +	mutex_unlock(kfd_get_smi_mutex());
>>> +
>>> +	/* We use the lower 32 bits for now. Each bit represents one event. If
>>> +	 * featured events are increased to more than 32, we'll use the upper
>>> +	 * bits as groups so the total number of events can be up to 32*32.
>>> +	 */
>>> +	dev->smi_events.max_events = 32;
>>
>> I don't understand the explanation above. It seems to refer to the
>> event subscription mechanism. But you use this as the size of the
>> fifo. That's the size in bytes. Your struct kfd_smi_vmfault_fifo is
>> bigger than that, so even a single entry will overflow your FIFO queue.
>>
> I’m afraid we’ll eventually need to support more than 64 events, so I
> want to reserve the upper 32 bits to serve as “group”. Once we reach
> 32 events, 0x00000000_80000000 will be the last event in group 0, and
> 0x00000001_00000001 is the first event in group 1. This way we can
> expand the 64 bits to support more than 64 events.
>
It seems you can only use one group at a time. I don't see how you can
enable events in different groups at the same time using this method. At
least internally you'll need a bitmask that's wide enough to represent
all event types. In the subscribe/unsubscribe API you can use different
ways to encode the events in a more compact way because you can make
multiple calls to enable/disable events.


> This seems too complicated to explain in code. I'll just use the
> events parameter as event IDs. Once the featured events grows to more
> than 64, we can use "data" in the argument to express group when the
> op is ENABLE/DISABLE events.
>
>>> +	ret = kfifo_alloc(&dev->smi_events.fifo, dev->smi_events.max_events,
>>> +			 GFP_KERNEL);
>>> +	if (ret) {
>>> +		pr_err("fail to allocate kfifo\n");
>>> +		return ret;
>>> +	}
>>> +	init_waitqueue_head(&dev->smi_events.wait_queue);
>>> +
>>> +	return anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
>>> +				(void *)dev, 0);
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> new file mode 100644
>>> index 0000000..2e320d3
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> @@ -0,0 +1,41 @@
>>> +/*
>>> + * Copyright 2020 Advanced Micro Devices, Inc.
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person obtaining a
>>> + * copy of this software and associated documentation files (the "Software"),
>>> + * to deal in the Software without restriction, including without limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to whom the
>>> + * Software is furnished to do so, subject to the following conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
>>> +#define KFD_SMI_EVENTS_H_INCLUDED
>>> +
>>> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events);
>>> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events);
>>> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
>>> +
>>> +/* All FIFO must start with "uint32_t group" and "uint32_t event" so the user
>>> + * can read the first 8 bytes to determine the next read length.
>>
>> Note about terminology: FIFO refers to a fifo queue. I think you're
>> using it to describe an entry or record in the FIFO.
>>
>> I don't know why you need to split group and event? The event mask is
>> only 64-bit, so there can be no more than 64 events.
>>
> I've removed group
>>
>>> + */
>>> +struct kfd_smi_vmfault_fifo {
>>> +	uint32_t group;
>>> +	uint32_t event;
>>> +	unsigned int gpu_id;
>>
>> The gpu_id is redundant because the file descriptor used to read the
>> events is associated with a kfd_dev (GPU). If you want to have a
>> single fifo, you should change the register API to not require a
>> gpu_id and the fifo should be global, not a member of the kfd_dev.
>>
> True
>>
>>> +	pid_t pid;
>>> +	char task_name[TASK_COMM_LEN];
>>> +};
>>
>> This needs to be part of the user API because usermode needs to parse
>> this structure. So it should be defined in kfd_ioctl.h. In your
>> proposed API I think the size of the FIFO entry is implied by the
>> event type. It would be cleaner to separate the FIFO entries into a
>> header and payload. The header would be the same for all events and
>> doesn't need to be duplicated in each event structure. It would
>> contain the event type and size (to allow variable size events). The
>> payload would depend on the event type.
>>
> I'll change this
>>
>>> +
>>> +#endif
>>> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
>>> index 4f66764..6ce7c69 100644
>>> --- a/include/uapi/linux/kfd_ioctl.h
>>> +++ b/include/uapi/linux/kfd_ioctl.h
>>> @@ -442,6 +442,28 @@ struct kfd_ioctl_import_dmabuf_args {
>>>  	__u32 dmabuf_fd;	/* to KFD */
>>>  };
>>>  
>>> +/*
>>> + * KFD SMI(System Management Interface) events
>>> + */
>>> +enum kfd_smi_events_op {
>>> +	KFD_SMI_EVENTS_REGISTER = 1,
>>> +	KFD_SMI_EVENTS_UNREGISTER
>>> +};
>>> +
>>> +/* Event ID (mask) */
>>> +#define KFD_SMI_EV_VMFAULT     0x00000001
>>> +
>>> +struct kfd_ioctl_smi_events_args {
>>> +	__u32 op;       /* to KFD */
>>> +	/* upper 32 bits: group. lower 32 bits: event IDs */
>>> +	__u64 events;   /* to KFD */
>>> +	__u32 gpu_id;   /* to KFD */
>>> +	pid_t pid;      /* to KFD */
>>> +	__u32 data1;    /* from KFD */
>>> +	__u32 data2;
>>> +	__u32 data3;
>>
>> This looks like you copied it from the debug API.  pid, data1-3 are
>> not used by your API. I think gpu_id should not be used either if you
>> want the event FIFO to be global.
>>
>> Regards,
>>   Felix
>>
> gpu_id is for the user space to tell KFD which device it wants to
> watch events. I'll use separate fd/fifo for each device. I want to
> reserve some extra parameters in case I need to feedback extra
> information for possible future events. Further thinking of it, that
> will be handled in the anonymous fd's output, not here. I'll remove
> data 2 and 3 and rename data1 as data. I want to use it for anon_fd in
> REGISTRATION and for group in DISABLE/ENABLE when we need to support
> more than 64 events.

OK.

Regards,
  Felix

>>> +};
>>> +
>>>  /* Register offset inside the remapped mmio page
>>>   */
>>>  enum kfd_mmio_remap {
>>> @@ -546,7 +568,10 @@ enum kfd_mmio_remap {
>>>  #define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
>>>  		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>>>  
>>> +#define AMDKFD_IOC_SMI_EVENTS			\
>>> +		AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
>>> +
>>>  #define AMDKFD_COMMAND_START		0x01
>>> -#define AMDKFD_COMMAND_END		0x1F
>>> +#define AMDKFD_COMMAND_END		0x20
>>>  
>>>  #endif
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20200401/e9c6e345/attachment-0001.htm>


More information about the amd-gfx mailing list