[PATCH 1/2] drm/amdkfd: Document and define SVM event tracing macro
Chen, Xiaogang
xiaogang.chen at amd.com
Thu Feb 15 17:54:38 UTC 2024
On 2/15/2024 9:18 AM, Philip Yang wrote:
>
> Document how to use the SMI system management interface to receive SVM
> events.
>
> Define SVM event message string format macros that user mode can pass
> to sscanf to parse the events. Add them to the uAPI header file to make
> it obvious that changing them changes the uAPI in the future.
>
> No functional changes.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 51 +++++++-------
> include/uapi/linux/kfd_ioctl.h | 77 ++++++++++++++++++++-
> 2 files changed, 102 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index d9953c2b2661..85465eb303a9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -225,15 +225,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
> event = KFD_SMI_EVENT_GPU_PRE_RESET;
> ++(dev->reset_seq_num);
> }
> - kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num);
> + kfd_smi_event_add(0, dev, event,
> + KFD_EVENT_FMT_UPDATE_GPU_RESET(dev->reset_seq_num));
> }
>
> void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
> uint64_t throttle_bitmask)
> {
> - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n",
> - throttle_bitmask,
> - amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
> + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE,
> + KFD_EVENT_FMT_UPDATE_THERMAL_THROTTLING(throttle_bitmask,
> + amdgpu_dpm_get_thermal_throttling_counter(dev->adev)));
> }
>
> void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
> @@ -246,8 +247,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
> if (!task_info.pid)
> return;
>
> - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
> - task_info.pid, task_info.task_name);
> + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT,
> + KFD_EVENT_FMT_VMFAULT(task_info.pid, task_info.task_name));
> }
>
> void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
> @@ -255,16 +256,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
> ktime_t ts)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START,
> - "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
> - address, node->id, write_fault ? 'W' : 'R');
> + KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid,
> + address, node->id, write_fault ? 'W' : 'R'));
> }
>
> void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid,
> unsigned long address, bool migration)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END,
> - "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
> - pid, address, node->id, migration ? 'M' : 'U');
> + KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(),
> + pid, address, node->id, migration ? 'M' : 'U'));
> }
>
> void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
> @@ -274,9 +275,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
> uint32_t trigger)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START,
> - "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n",
> - ktime_get_boottime_ns(), pid, start, end - start,
> - from, to, prefetch_loc, preferred_loc, trigger);
> + KFD_EVENT_FMT_MIGRATE_START(ktime_get_boottime_ns(),
> + pid, start, end - start, from, to, prefetch_loc,
> + preferred_loc, trigger));
> }
>
> void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
> @@ -284,24 +285,23 @@ void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
> uint32_t from, uint32_t to, uint32_t trigger)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
> - "%lld -%d @%lx(%lx) %x->%x %d\n",
> - ktime_get_boottime_ns(), pid, start, end - start,
> - from, to, trigger);
> + KFD_EVENT_FMT_MIGRATE_END(ktime_get_boottime_ns(), pid,
> + start, end - start, from, to, trigger));
> }
>
> void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
> uint32_t trigger)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION,
> - "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
> - node->id, trigger);
> + KFD_EVENT_FMT_QUEUE_EVICTION(ktime_get_boottime_ns(),
> + pid, node->id, trigger));
> }
>
> void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_RESTORE,
> - "%lld -%d %x\n", ktime_get_boottime_ns(), pid,
> - node->id);
> + KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(),
> + pid, node->id));
> }
>
> void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
> @@ -317,9 +317,10 @@ void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
> struct kfd_process_device *pdd = p->pdds[i];
>
> kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
> - KFD_SMI_EVENT_QUEUE_RESTORE,
> - "%lld -%d %x %c\n", ktime_get_boottime_ns(),
> - p->lead_thread->pid, pdd->dev->id, 'R');
> + KFD_SMI_EVENT_QUEUE_RESTORE_RESCHEDULED,
> + KFD_EVENT_FMT_QUEUE_RESTORE_RESCHEDULED(
> + ktime_get_boottime_ns(), p->lead_thread->pid,
> + pdd->dev->id, 'R'));
> }
> kfd_unref_process(p);
> }
> @@ -329,8 +330,8 @@ void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid,
> uint32_t trigger)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_UNMAP_FROM_GPU,
> - "%lld -%d @%lx(%lx) %x %d\n", ktime_get_boottime_ns(),
> - pid, address, last - address + 1, node->id, trigger);
> + KFD_EVENT_FMT_UNMAP_FROM_GPU(ktime_get_boottime_ns(),
> + pid, address, last - address + 1, node->id, trigger));
> }
>
> int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd)
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 9ce46edc62a5..430c01f4148b 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -523,7 +523,8 @@ enum kfd_smi_event {
> KFD_SMI_EVENT_PAGE_FAULT_END = 8,
> KFD_SMI_EVENT_QUEUE_EVICTION = 9,
> KFD_SMI_EVENT_QUEUE_RESTORE = 10,
> - KFD_SMI_EVENT_UNMAP_FROM_GPU = 11,
> + KFD_SMI_EVENT_QUEUE_RESTORE_RESCHEDULED = 11,
> + KFD_SMI_EVENT_UNMAP_FROM_GPU = 12,
Why change KFD_SMI_EVENT_UNMAP_FROM_GPU from 11 to 12? That breaks the
existing API with user space; for example, the Thunk uses
HSA_SMI_EVENT_UNMAP_FROM_GPU = 11.
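
One possible way to add the new event without shifting the existing value,
as a sketch (assuming the new entry can simply be appended to the end of
the enum):

    KFD_SMI_EVENT_QUEUE_EVICTION = 9,
    KFD_SMI_EVENT_QUEUE_RESTORE = 10,
    KFD_SMI_EVENT_UNMAP_FROM_GPU = 11,            /* existing uAPI value kept */
    KFD_SMI_EVENT_QUEUE_RESTORE_RESCHEDULED = 12, /* new event appended */
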
Regards
Xiaogang
> /*
> * max event number, as a flag bit to get events from all processes,
> @@ -564,6 +565,80 @@ struct kfd_ioctl_smi_events_args {
> __u32 anon_fd; /* from KFD */
> };
>
> +/*
> + * SVM event tracing via the SMI system management interface
> + *
> + * To open an event file descriptor
> + * use ioctl AMDKFD_IOC_SMI_EVENTS, pass in a gpuid, and get back an anonymous
> + * file descriptor to receive SMI events.
> + * If called with sudo permission, the file descriptor can be used to receive
> + * SVM events from all processes; otherwise it only receives SVM events from
> + * the same process.
> + *
> + * To enable an SVM event
> + * write a KFD_SMI_EVENT_MASK_FROM_INDEX(event) bitmap mask to the event file
> + * descriptor to start recording the event to the kfifo; combine bitmap masks
> + * for multiple events. A new event mask overwrites the previous event mask.
> + * The KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS) bit requires
> + * sudo permission to receive SVM events from all processes.
> + *
> + * To receive events
> + * the application can poll the file descriptor to wait for events, then read
> + * events from the file into a buffer. Each event is a one-line string message,
> + * starting with the event id, followed by the event-specific information.
> + *
> + * To decode event information
> + * the following event format string macros can be used with sscanf to decode
> + * the event-specific information.
> + * event triggers: the reason the event was generated, defined as enums for
> + *                 unmap, eviction and migrate events
> + * node, from, to, prefetch_loc, preferred_loc: GPU ID, or 0 for system memory.
> + * addr: user mode address, in pages
> + * size: in pages
> + * pid: the process ID that generated the event
> + * ns: timestamp with nanosecond resolution, which starts at system boot time
> + *     but stops during suspend
> + * migrate_update: 'M' if the GPU page is recovered by migration, 'U' if
> + *                 recovered by page table update
> + * rescheduled: 'R' if queue restore failed and was rescheduled to try again
> + * rw: 'W' for a write page fault, 'R' for a read page fault
> + */
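
A minimal user-mode sketch of the open/enable steps described above. The
kfd_open_smi_events() helper and the raw 64-bit write of the event mask are
assumptions for illustration, not part of this patch; error handling is
trimmed:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/kfd_ioctl.h>

    /* Open an SMI event fd for one GPU and enable page fault events.
     * Assumes /dev/kfd is already open as kfd_fd and gpuid was read
     * from the topology sysfs. */
    static int kfd_open_smi_events(int kfd_fd, uint32_t gpuid)
    {
            struct kfd_ioctl_smi_events_args args = { .gpuid = gpuid };
            uint64_t mask;

            if (ioctl(kfd_fd, AMDKFD_IOC_SMI_EVENTS, &args) < 0)
                    return -1;

            /* Enable the events of interest; the mask is assumed to be
             * written as a raw 64-bit value. */
            mask = KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_PAGE_FAULT_START) |
                   KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_PAGE_FAULT_END);
            if (write(args.anon_fd, &mask, sizeof(mask)) != sizeof(mask)) {
                    close(args.anon_fd);
                    return -1;
            }
            return args.anon_fd;
    }
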
> +#define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num)\
> + "%x\n", (reset_seq_num)
> +
> +#define KFD_EVENT_FMT_UPDATE_THERMAL_THROTTLING(bitmask, counter)\
> + "%llx:%llx\n", (bitmask), (counter)
> +
> +#define KFD_EVENT_FMT_VMFAULT(pid, task_name)\
> + "%x:%s\n", (pid), (task_name)
> +
> +#define KFD_EVENT_FMT_PAGEFAULT_START(ns, pid, addr, node, rw)\
> + "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (rw)
> +
> +#define KFD_EVENT_FMT_PAGEFAULT_END(ns, pid, addr, node, migrate_update)\
> + "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (migrate_update)
> +
> +#define KFD_EVENT_FMT_MIGRATE_START(ns, pid, start, size, from, to, prefetch_loc,\
> + preferred_loc, migrate_trigger)\
> + "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", (ns), (pid), (start), (size),\
> + (from), (to), (prefetch_loc), (preferred_loc), (migrate_trigger)
> +
> +#define KFD_EVENT_FMT_MIGRATE_END(ns, pid, start, size, from, to, migrate_trigger)\
> + "%lld -%d @%lx(%lx) %x->%x %d\n", (ns), (pid), (start), (size),\
> + (from), (to), (migrate_trigger)
> +
> +#define KFD_EVENT_FMT_QUEUE_EVICTION(ns, pid, node, evict_trigger)\
> + "%lld -%d %x %d\n", (ns), (pid), (node), (evict_trigger)
> +
> +#define KFD_EVENT_FMT_QUEUE_RESTORE(ns, pid, node)\
> + "%lld -%d %x\n", (ns), (pid), (node)
> +
> +#define KFD_EVENT_FMT_QUEUE_RESTORE_RESCHEDULED(ns, pid, node, rescheduled)\
> + "%lld -%d %x %c\n", (ns), (pid), (node), (rescheduled)
> +
> +#define KFD_EVENT_FMT_UNMAP_FROM_GPU(ns, pid, addr, size, node, unmap_trigger)\
> + "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\
> + (node), (unmap_trigger)
> +
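
These macros bundle the format string and the argument list, so user mode can
pass pointers and hand the expansion straight to sscanf. A hedged sketch of a
reader loop (the leading hex event id plus space separator is assumed from the
"starting with the event id" description above, and each read() is assumed to
return one complete event line):

    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <linux/kfd_ioctl.h>

    static void kfd_read_smi_events(int smi_fd)
    {
            struct pollfd pfd = { .fd = smi_fd, .events = POLLIN };
            char buf[512];

            while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
                    ssize_t len = read(smi_fd, buf, sizeof(buf) - 1);
                    unsigned int event;
                    int consumed = 0;

                    if (len <= 0)
                            break;
                    buf[len] = '\0';

                    /* Event id prefix, then the event specific payload. */
                    if (sscanf(buf, "%x %n", &event, &consumed) != 1)
                            continue;

                    if (event == KFD_SMI_EVENT_PAGE_FAULT_START) {
                            long long ns;
                            int pid;
                            unsigned long addr;
                            unsigned int node;
                            char rw;

                            /* The macro expands to the format string plus the
                             * pointer arguments, so it also works for sscanf. */
                            if (sscanf(buf + consumed,
                                       KFD_EVENT_FMT_PAGEFAULT_START(&ns, &pid,
                                               &addr, &node, &rw)) == 5)
                                    printf("fault @0x%lx pid %d %c\n",
                                           addr, pid, rw);
                    }
            }
    }
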
> /**************************************************************************************************
> * CRIU IOCTLs (Checkpoint Restore In Userspace)
> *
> --
> 2.35.1
>