[PATCH v2 1/4] drm/amdkfd: Document and define SVM events message macro

James Zhu jamesz at amd.com
Thu Aug 22 14:32:44 UTC 2024


On 2024-07-30 16:15, Philip Yang wrote:
> Document how to use SMI system management interface to enable and
> receive SVM events. Document SVM event triggers.
>
> Define SVM event message string format macros that can be used by user
> mode with sscanf to parse the events. Add them to the uAPI header file to
> make it obvious that changing them in the future changes the uAPI.
>
> No functional changes.
>
> Signed-off-by: Philip Yang<Philip.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  45 +++++----
>   include/uapi/linux/kfd_ioctl.h              | 100 +++++++++++++++++---
>   2 files changed, 109 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index ea6a8e43bd5b..de8b9abf7afc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -235,17 +235,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
>   		amdgpu_reset_get_desc(reset_context, reset_cause,
>   				      sizeof(reset_cause));
>   
> -	kfd_smi_event_add(0, dev, event, "%x %s\n",
> -			  dev->reset_seq_num,
> -			  reset_cause);
> +	kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET(
> +			  dev->reset_seq_num, reset_cause));
>   }
>   
>   void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
>   					     uint64_t throttle_bitmask)
>   {
> -	kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n",
> +	kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, KFD_EVENT_FMT_THERMAL_THROTTLING(
>   			  throttle_bitmask,
> -			  amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
> +			  amdgpu_dpm_get_thermal_throttling_counter(dev->adev)));
>   }
>   
>   void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
> @@ -256,8 +255,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
>   	if (task_info) {
>   		/* Report VM faults from user applications, not retry from kernel */
>   		if (task_info->pid)
> -			kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
> -					 task_info->pid, task_info->task_name);
> +			kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT(
> +					  task_info->pid, task_info->task_name));
>   		amdgpu_vm_put_task_info(task_info);
>   	}
>   }
> @@ -267,16 +266,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
>   				    ktime_t ts)
>   {
>   	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START,
> -			  "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
> -			  address, node->id, write_fault ? 'W' : 'R');
> +			  KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid,
> +			  address, node->id, write_fault ? 'W' : 'R'));
>   }
>   
>   void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid,
>   				  unsigned long address, bool migration)
>   {
>   	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END,
> -			  "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
> -			  pid, address, node->id, migration ? 'M' : 'U');
> +			  KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(),
> +			  pid, address, node->id, migration ? 'M' : 'U'));
>   }
>   
>   void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
> @@ -286,9 +285,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
>   				   uint32_t trigger)
>   {
>   	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START,
> -			  "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n",
> +			  KFD_EVENT_FMT_MIGRATE_START(
>   			  ktime_get_boottime_ns(), pid, start, end - start,
> -			  from, to, prefetch_loc, preferred_loc, trigger);
> +			  from, to, prefetch_loc, preferred_loc, trigger));
>   }
>   
>   void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
> @@ -296,24 +295,24 @@ void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
>   				 uint32_t from, uint32_t to, uint32_t trigger)
>   {
>   	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
> -			  "%lld -%d @%lx(%lx) %x->%x %d\n",
> +			  KFD_EVENT_FMT_MIGRATE_END(
>   			  ktime_get_boottime_ns(), pid, start, end - start,
> -			  from, to, trigger);
> +			  from, to, trigger));
>   }
>   
>   void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
>   				  uint32_t trigger)
>   {
>   	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION,
> -			  "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
> -			  node->id, trigger);
> +			  KFD_EVENT_FMT_QUEUE_EVICTION(ktime_get_boottime_ns(), pid,
> +			  node->id, trigger));
>   }
>   
>   void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid)
>   {
>   	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_RESTORE,
> -			  "%lld -%d %x\n", ktime_get_boottime_ns(), pid,
> -			  node->id);
> +			  KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), pid,
> +			  node->id, 0));
>   }
>   
>   void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
> @@ -330,8 +329,8 @@ void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
>   
>   		kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
>   				  KFD_SMI_EVENT_QUEUE_RESTORE,
> -				  "%lld -%d %x %c\n", ktime_get_boottime_ns(),
> -				  p->lead_thread->pid, pdd->dev->id, 'R');
> +				  KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(),
> +				  p->lead_thread->pid, pdd->dev->id, 'R'));
>   	}
>   	kfd_unref_process(p);
>   }
> @@ -341,8 +340,8 @@ void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid,
>   				  uint32_t trigger)
>   {
>   	kfd_smi_event_add(pid, node, KFD_SMI_EVENT_UNMAP_FROM_GPU,
> -			  "%lld -%d @%lx(%lx) %x %d\n", ktime_get_boottime_ns(),
> -			  pid, address, last - address + 1, node->id, trigger);
> +			  KFD_EVENT_FMT_UNMAP_FROM_GPU(ktime_get_boottime_ns(),
> +			  pid, address, last - address + 1, node->id, trigger));
>   }
>   
>   int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd)
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 71a7ce5f2d4c..c94182ad8fb8 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -540,26 +540,29 @@ enum kfd_smi_event {
>   	KFD_SMI_EVENT_ALL_PROCESS = 64
>   };
>   
> +/* The reason of the page migration event */
>   enum KFD_MIGRATE_TRIGGERS {
> -	KFD_MIGRATE_TRIGGER_PREFETCH,
> -	KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
> -	KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
> -	KFD_MIGRATE_TRIGGER_TTM_EVICTION
> +	KFD_MIGRATE_TRIGGER_PREFETCH,		/* Prefetch to GPU */
[JZ] Could it also be prefetched to system RAM?
> +	KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,	/* GPU page fault recover */
> +	KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,	/* CPU page fault recover */
> +	KFD_MIGRATE_TRIGGER_TTM_EVICTION	/* TTM eviction */
>   };
>   
> +/* The reason of user queue eviction event */
>   enum KFD_QUEUE_EVICTION_TRIGGERS {
> -	KFD_QUEUE_EVICTION_TRIGGER_SVM,
> -	KFD_QUEUE_EVICTION_TRIGGER_USERPTR,
> -	KFD_QUEUE_EVICTION_TRIGGER_TTM,
> -	KFD_QUEUE_EVICTION_TRIGGER_SUSPEND,
> -	KFD_QUEUE_EVICTION_CRIU_CHECKPOINT,
> -	KFD_QUEUE_EVICTION_CRIU_RESTORE
> +	KFD_QUEUE_EVICTION_TRIGGER_SVM,		/* SVM buffer migration */
> +	KFD_QUEUE_EVICTION_TRIGGER_USERPTR,	/* userptr movement */
> +	KFD_QUEUE_EVICTION_TRIGGER_TTM,		/* TTM move buffer */
> +	KFD_QUEUE_EVICTION_TRIGGER_SUSPEND,	/* GPU suspend */
> +	KFD_QUEUE_EVICTION_CRIU_CHECKPOINT,	/* CRIU checkpoint */
> +	KFD_QUEUE_EVICTION_CRIU_RESTORE		/* CRIU restore */
>   };
>   
> +/* The reason of unmap buffer from GPU event */
>   enum KFD_SVM_UNMAP_TRIGGERS {
> -	KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY,
> -	KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE,
> -	KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU
> +	KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY,	/* MMU notifier CPU buffer movement */
> +	KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE,/* MMU notifier page migration */
> +	KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU	/* Unmap to free the buffer */
>   };
>   
>   #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
> @@ -570,6 +573,77 @@ struct kfd_ioctl_smi_events_args {
>   	__u32 anon_fd;	/* from KFD */
>   };
>   
> +/*
> + * SVM event tracing via SMI system management interface
> + *
> + * Open event file descriptor
> + *    use ioctl AMDKFD_IOC_SMI_EVENTS, pass in gpuid and return an anonymous file
> + *    descriptor to receive SMI events.
> + *    If calling with sudo permission, then file descriptor can be used to receive
> + *    SVM events from all processes, otherwise, to only receive SVM events of same
> + *    process.
> + *
> + * To enable the SVM event
> + *    Write event file descriptor with KFD_SMI_EVENT_MASK_FROM_INDEX(event) bitmap
> + *    mask to start recording the event to the kfifo, use bitmap mask combination
> + *    for multiple events. New event mask will overwrite the previous event mask.
> + *    KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS) bit requires sudo
> + *    permission to receive SVM events from all processes.
> + *
> + * To receive the event
> + *    Application can poll file descriptor to wait for the events, then read event
> + *    from the file into a buffer. Each event is one line string message, starting
> + *    with the event id, then the event specific information.
> + *
> + * To decode event information
> + *    The following event format string macro can be used with sscanf to decode
> + *    the specific event information.
> + *    event triggers: the reason to generate the event, defined as enum for unmap,
> + *    eviction and migrate events.
> + *    node, from, to, prefetch_loc, preferred_loc: GPU ID, or 0 for system memory.
> + *    addr: user mode address, in pages
> + *    size: in pages
> + *    pid: the process ID to generate the event
> + *    ns: timestamp in nanosecond-resolution, starts at system boot time but
> + *        stops during suspend
> + *    migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for update
> + *    rw: 'W' for write page fault, 'R' for read page fault
> + *    rescheduled: 'R' if the queue restore failed and rescheduled to try again
> + */
> +#define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\
> +		"%x %s\n", (reset_seq_num), (reset_cause)
> +
> +#define KFD_EVENT_FMT_THERMAL_THROTTLING(bitmask, counter)\
> +		"%llx:%llx\n", (bitmask), (counter)
> +
> +#define KFD_EVENT_FMT_VMFAULT(pid, task_name)\
> +		"%x:%s\n", (pid), (task_name)
> +
> +#define KFD_EVENT_FMT_PAGEFAULT_START(ns, pid, addr, node, rw)\
> +		"%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (rw)
> +
> +#define KFD_EVENT_FMT_PAGEFAULT_END(ns, pid, addr, node, migrate_update)\
> +		"%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (migrate_update)
> +
> +#define KFD_EVENT_FMT_MIGRATE_START(ns, pid, start, size, from, to, prefetch_loc,\
> +		preferred_loc, migrate_trigger)\
> +		"%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", (ns), (pid), (start), (size),\
> +		(from), (to), (prefetch_loc), (preferred_loc), (migrate_trigger)
> +
> +#define KFD_EVENT_FMT_MIGRATE_END(ns, pid, start, size, from, to, migrate_trigger)\
> +		"%lld -%d @%lx(%lx) %x->%x %d\n", (ns), (pid), (start), (size),\
> +		(from), (to), (migrate_trigger)
> +
> +#define KFD_EVENT_FMT_QUEUE_EVICTION(ns, pid, node, evict_trigger)\
> +		"%lld -%d %x %d\n", (ns), (pid), (node), (evict_trigger)
> +
> +#define KFD_EVENT_FMT_QUEUE_RESTORE(ns, pid, node, rescheduled)\
> +		"%lld -%d %x %c\n", (ns), (pid), (node), (rescheduled)
> +
> +#define KFD_EVENT_FMT_UNMAP_FROM_GPU(ns, pid, addr, size, node, unmap_trigger)\
> +		"%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\
> +		(node), (unmap_trigger)
> +
>   /**************************************************************************************************
>    * CRIU IOCTLs (Checkpoint Restore In Userspace)
>    *
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240822/701b707e/attachment-0001.htm>


More information about the amd-gfx mailing list