[PATCH v2 1/4] drm/amdkfd: Document and define SVM events message macro
James Zhu
jamesz at amd.com
Thu Aug 22 14:32:44 UTC 2024
On 2024-07-30 16:15, Philip Yang wrote:
> Document how to use SMI system management interface to enable and
> receive SVM events. Document SVM event triggers.
>
> Define SVM events message string format macro that could be used by user
> mode for sscanf to parse the event. Add it to uAPI header file to make
> it obvious that changing it in future is a uAPI change.
>
> No functional changes.
>
> Signed-off-by: Philip Yang<Philip.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 45 +++++----
> include/uapi/linux/kfd_ioctl.h | 100 +++++++++++++++++---
> 2 files changed, 109 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index ea6a8e43bd5b..de8b9abf7afc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -235,17 +235,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
> amdgpu_reset_get_desc(reset_context, reset_cause,
> sizeof(reset_cause));
>
> - kfd_smi_event_add(0, dev, event, "%x %s\n",
> - dev->reset_seq_num,
> - reset_cause);
> + kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET(
> + dev->reset_seq_num, reset_cause));
> }
>
> void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
> uint64_t throttle_bitmask)
> {
> - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n",
> + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, KFD_EVENT_FMT_THERMAL_THROTTLING(
> throttle_bitmask,
> - amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
> + amdgpu_dpm_get_thermal_throttling_counter(dev->adev)));
> }
>
> void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
> @@ -256,8 +255,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
> if (task_info) {
> /* Report VM faults from user applications, not retry from kernel */
> if (task_info->pid)
> - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
> - task_info->pid, task_info->task_name);
> + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT(
> + task_info->pid, task_info->task_name));
> amdgpu_vm_put_task_info(task_info);
> }
> }
> @@ -267,16 +266,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
> ktime_t ts)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START,
> - "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
> - address, node->id, write_fault ? 'W' : 'R');
> + KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid,
> + address, node->id, write_fault ? 'W' : 'R'));
> }
>
> void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid,
> unsigned long address, bool migration)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END,
> - "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
> - pid, address, node->id, migration ? 'M' : 'U');
> + KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(),
> + pid, address, node->id, migration ? 'M' : 'U'));
> }
>
> void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
> @@ -286,9 +285,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
> uint32_t trigger)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START,
> - "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n",
> + KFD_EVENT_FMT_MIGRATE_START(
> ktime_get_boottime_ns(), pid, start, end - start,
> - from, to, prefetch_loc, preferred_loc, trigger);
> + from, to, prefetch_loc, preferred_loc, trigger));
> }
>
> void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
> @@ -296,24 +295,24 @@ void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
> uint32_t from, uint32_t to, uint32_t trigger)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
> - "%lld -%d @%lx(%lx) %x->%x %d\n",
> + KFD_EVENT_FMT_MIGRATE_END(
> ktime_get_boottime_ns(), pid, start, end - start,
> - from, to, trigger);
> + from, to, trigger));
> }
>
> void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
> uint32_t trigger)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION,
> - "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
> - node->id, trigger);
> + KFD_EVENT_FMT_QUEUE_EVICTION(ktime_get_boottime_ns(), pid,
> + node->id, trigger));
> }
>
> void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_RESTORE,
> - "%lld -%d %x\n", ktime_get_boottime_ns(), pid,
> - node->id);
> + KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), pid,
> + node->id, 0));
> }
>
> void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
> @@ -330,8 +329,8 @@ void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
>
> kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
> KFD_SMI_EVENT_QUEUE_RESTORE,
> - "%lld -%d %x %c\n", ktime_get_boottime_ns(),
> - p->lead_thread->pid, pdd->dev->id, 'R');
> + KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(),
> + p->lead_thread->pid, pdd->dev->id, 'R'));
> }
> kfd_unref_process(p);
> }
> @@ -341,8 +340,8 @@ void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid,
> uint32_t trigger)
> {
> kfd_smi_event_add(pid, node, KFD_SMI_EVENT_UNMAP_FROM_GPU,
> - "%lld -%d @%lx(%lx) %x %d\n", ktime_get_boottime_ns(),
> - pid, address, last - address + 1, node->id, trigger);
> + KFD_EVENT_FMT_UNMAP_FROM_GPU(ktime_get_boottime_ns(),
> + pid, address, last - address + 1, node->id, trigger));
> }
>
> int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd)
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 71a7ce5f2d4c..c94182ad8fb8 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -540,26 +540,29 @@ enum kfd_smi_event {
> KFD_SMI_EVENT_ALL_PROCESS = 64
> };
>
> +/* The reason of the page migration event */
> enum KFD_MIGRATE_TRIGGERS {
> - KFD_MIGRATE_TRIGGER_PREFETCH,
> - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
> - KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
> - KFD_MIGRATE_TRIGGER_TTM_EVICTION
> + KFD_MIGRATE_TRIGGER_PREFETCH, /* Prefetch to GPU */
[JZ] Could it be prefetched to system RAM also?
> + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, /* GPU page fault recover */
> + KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, /* CPU page fault recover */
> + KFD_MIGRATE_TRIGGER_TTM_EVICTION /* TTM eviction */
> };
>
> +/* The reason of user queue eviction event */
> enum KFD_QUEUE_EVICTION_TRIGGERS {
> - KFD_QUEUE_EVICTION_TRIGGER_SVM,
> - KFD_QUEUE_EVICTION_TRIGGER_USERPTR,
> - KFD_QUEUE_EVICTION_TRIGGER_TTM,
> - KFD_QUEUE_EVICTION_TRIGGER_SUSPEND,
> - KFD_QUEUE_EVICTION_CRIU_CHECKPOINT,
> - KFD_QUEUE_EVICTION_CRIU_RESTORE
> + KFD_QUEUE_EVICTION_TRIGGER_SVM, /* SVM buffer migration */
> + KFD_QUEUE_EVICTION_TRIGGER_USERPTR, /* userptr movement */
> + KFD_QUEUE_EVICTION_TRIGGER_TTM, /* TTM move buffer */
> + KFD_QUEUE_EVICTION_TRIGGER_SUSPEND, /* GPU suspend */
> + KFD_QUEUE_EVICTION_CRIU_CHECKPOINT, /* CRIU checkpoint */
> + KFD_QUEUE_EVICTION_CRIU_RESTORE /* CRIU restore */
> };
>
> +/* The reason of unmap buffer from GPU event */
> enum KFD_SVM_UNMAP_TRIGGERS {
> - KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY,
> - KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE,
> - KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU
> + KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY, /* MMU notifier CPU buffer movement */
> + KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE,/* MMU notifier page migration */
> + KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU /* Unmap to free the buffer */
> };
>
> #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
> @@ -570,6 +573,77 @@ struct kfd_ioctl_smi_events_args {
> __u32 anon_fd; /* from KFD */
> };
>
> +/*
> + * SVM event tracing via SMI system management interface
> + *
> + * Open event file descriptor
> + * use ioctl AMDKFD_IOC_SMI_EVENTS, pass in gpuid and return an anonymous file
> + * descriptor to receive SMI events.
> + * If calling with sudo permission, then file descriptor can be used to receive
> + * SVM events from all processes, otherwise, to only receive SVM events of same
> + * process.
> + *
> + * To enable the SVM event
> + * Write event file descriptor with KFD_SMI_EVENT_MASK_FROM_INDEX(event) bitmap
> + * mask to start recording the event to the kfifo, use bitmap mask combination
> + * for multiple events. New event mask will overwrite the previous event mask.
> + * KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS) bit requires sudo
> + * permission to receive SVM events from all processes.
> + *
> + * To receive the event
> + * Application can poll file descriptor to wait for the events, then read event
> + * from the file into a buffer. Each event is one line string message, starting
> + * with the event id, then the event specific information.
> + *
> + * To decode event information
> + * The following event format string macro can be used with sscanf to decode
> + * the specific event information.
> + * event triggers: the reason to generate the event, defined as enum for unmap,
> + * eviction and migrate events.
> + * node, from, to, prefetch_loc, preferred_loc: GPU ID, or 0 for system memory.
> + * addr: user mode address, in pages
> + * size: in pages
> + * pid: the process ID to generate the event
> + * ns: timestamp in nanosecond-resolution, starts at system boot time but
> + * stops during suspend
> + * migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for update
> + * rw: 'W' for write page fault, 'R' for read page fault
> + * rescheduled: 'R' if the queue restore failed and rescheduled to try again
> + */
> +#define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\
> + "%x %s\n", (reset_seq_num), (reset_cause)
> +
> +#define KFD_EVENT_FMT_THERMAL_THROTTLING(bitmask, counter)\
> + "%llx:%llx\n", (bitmask), (counter)
> +
> +#define KFD_EVENT_FMT_VMFAULT(pid, task_name)\
> + "%x:%s\n", (pid), (task_name)
> +
> +#define KFD_EVENT_FMT_PAGEFAULT_START(ns, pid, addr, node, rw)\
> + "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (rw)
> +
> +#define KFD_EVENT_FMT_PAGEFAULT_END(ns, pid, addr, node, migrate_update)\
> + "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (migrate_update)
> +
> +#define KFD_EVENT_FMT_MIGRATE_START(ns, pid, start, size, from, to, prefetch_loc,\
> + preferred_loc, migrate_trigger)\
> + "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", (ns), (pid), (start), (size),\
> + (from), (to), (prefetch_loc), (preferred_loc), (migrate_trigger)
> +
> +#define KFD_EVENT_FMT_MIGRATE_END(ns, pid, start, size, from, to, migrate_trigger)\
> + "%lld -%d @%lx(%lx) %x->%x %d\n", (ns), (pid), (start), (size),\
> + (from), (to), (migrate_trigger)
> +
> +#define KFD_EVENT_FMT_QUEUE_EVICTION(ns, pid, node, evict_trigger)\
> + "%lld -%d %x %d\n", (ns), (pid), (node), (evict_trigger)
> +
> +#define KFD_EVENT_FMT_QUEUE_RESTORE(ns, pid, node, rescheduled)\
> + "%lld -%d %x %c\n", (ns), (pid), (node), (rescheduled)
> +
> +#define KFD_EVENT_FMT_UNMAP_FROM_GPU(ns, pid, addr, size, node, unmap_trigger)\
> + "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\
> + (node), (unmap_trigger)
> +
> /**************************************************************************************************
> * CRIU IOCTLs (Checkpoint Restore In Userspace)
> *
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240822/701b707e/attachment-0001.htm>
More information about the amd-gfx
mailing list