<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2024-02-15 12:54, Chen, Xiaogang
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:22db7ab1-c48c-c982-8c67-f2418becdda9@amd.com">
      <br>
      On 2/15/2024 9:18 AM, Philip Yang wrote:
      <br>
      <blockquote type="cite">Caution: This message originated from an
        External Source. Use proper caution when opening attachments,
        clicking links, or responding.
        <br>
        <br>
        <br>
        Document how to use SMI system management interface to receive
        SVM
        <br>
        events.
        <br>
        <br>
        Define SVM event message string format macros that can be used by
        user
        <br>
        mode for sscanf to parse the event. Add it to uAPI header file
        to make
        <br>
        it obvious that changing it changes the uAPI in the future.
        <br>
        <br>
        No functional changes.
        <br>
        <br>
        Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com">&lt;Philip.Yang@amd.com&gt;</a>
        <br>
        ---
        <br>
          drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 51
        +++++++-------
        <br>
          include/uapi/linux/kfd_ioctl.h              | 77
        ++++++++++++++++++++-
        <br>
          2 files changed, 102 insertions(+), 26 deletions(-)
        <br>
        <br>
        diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
        b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
        <br>
        index d9953c2b2661..85465eb303a9 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
        <br>
        @@ -225,15 +225,16 @@ void kfd_smi_event_update_gpu_reset(struct
        kfd_node *dev, bool post_reset)
        <br>
                         event = KFD_SMI_EVENT_GPU_PRE_RESET;
        <br>
                         ++(dev->reset_seq_num);
        <br>
                 }
        <br>
        -       kfd_smi_event_add(0, dev, event, "%x\n",
        dev->reset_seq_num);
        <br>
        +       kfd_smi_event_add(0, dev, event,
        <br>
        +                        
        KFD_EVENT_FMT_UPDATE_GPU_RESET(dev->reset_seq_num));
        <br>
          }
        <br>
        <br>
          void kfd_smi_event_update_thermal_throttling(struct kfd_node
        *dev,
        <br>
                                                      uint64_t
        throttle_bitmask)
        <br>
          {
        <br>
        -       kfd_smi_event_add(0, dev,
        KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n",
        <br>
        -                         throttle_bitmask,
        <br>
        -                        
        amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
        <br>
        +       kfd_smi_event_add(0, dev,
        KFD_SMI_EVENT_THERMAL_THROTTLE,
        <br>
        +                        
        KFD_EVENT_FMT_UPDATE_THERMAL_THROTTLING(throttle_bitmask,
        <br>
        +                        
        amdgpu_dpm_get_thermal_throttling_counter(dev->adev)));
        <br>
          }
        <br>
        <br>
          void kfd_smi_event_update_vmfault(struct kfd_node *dev,
        uint16_t pasid)
        <br>
        @@ -246,8 +247,8 @@ void kfd_smi_event_update_vmfault(struct
        kfd_node *dev, uint16_t pasid)
        <br>
                 if (!task_info.pid)
        <br>
                         return;
        <br>
        <br>
        -       kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT,
        "%x:%s\n",
        <br>
        -                         task_info.pid, task_info.task_name);
        <br>
        +       kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT,
        <br>
        +                         KFD_EVENT_FMT_VMFAULT(task_info.pid,
        task_info.task_name));
        <br>
          }
        <br>
        <br>
          void kfd_smi_event_page_fault_start(struct kfd_node *node,
        pid_t pid,
        <br>
        @@ -255,16 +256,16 @@ void kfd_smi_event_page_fault_start(struct
        kfd_node *node, pid_t pid,
        <br>
                                             ktime_t ts)
        <br>
          {
        <br>
                 kfd_smi_event_add(pid, node,
        KFD_SMI_EVENT_PAGE_FAULT_START,
        <br>
        -                         "%lld -%d @%lx(%x) %c\n",
        ktime_to_ns(ts), pid,
        <br>
        -                         address, node->id, write_fault ?
        'W' : 'R');
        <br>
        +                        
        KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid,
        <br>
        +                         address, node->id, write_fault ?
        'W' : 'R'));
        <br>
          }
        <br>
        <br>
          void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t
        pid,
        <br>
                                           unsigned long address, bool
        migration)
        <br>
          {
        <br>
                 kfd_smi_event_add(pid, node,
        KFD_SMI_EVENT_PAGE_FAULT_END,
        <br>
        -                         "%lld -%d @%lx(%x) %c\n",
        ktime_get_boottime_ns(),
        <br>
        -                         pid, address, node->id, migration ?
        'M' : 'U');
        <br>
        +                        
        KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(),
        <br>
        +                         pid, address, node->id, migration ?
        'M' : 'U'));
        <br>
          }
        <br>
        <br>
          void kfd_smi_event_migration_start(struct kfd_node *node,
        pid_t pid,
        <br>
        @@ -274,9 +275,9 @@ void kfd_smi_event_migration_start(struct
        kfd_node *node, pid_t pid,
        <br>
                                            uint32_t trigger)
        <br>
          {
        <br>
                 kfd_smi_event_add(pid, node,
        KFD_SMI_EVENT_MIGRATE_START,
        <br>
        -                         "%lld -%d @%lx(%lx) %x->%x %x:%x
        %d\n",
        <br>
        -                         ktime_get_boottime_ns(), pid, start,
        end - start,
        <br>
        -                         from, to, prefetch_loc, preferred_loc,
        trigger);
        <br>
        +                        
        KFD_EVENT_FMT_MIGRATE_START(ktime_get_boottime_ns(),
        <br>
        +                         pid, start, end - start, from, to,
        prefetch_loc,
        <br>
        +                         preferred_loc, trigger));
        <br>
          }
        <br>
        <br>
          void kfd_smi_event_migration_end(struct kfd_node *node, pid_t
        pid,
        <br>
        @@ -284,24 +285,23 @@ void kfd_smi_event_migration_end(struct
        kfd_node *node, pid_t pid,
        <br>
                                          uint32_t from, uint32_t to,
        uint32_t trigger)
        <br>
          {
        <br>
                 kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
        <br>
        -                         "%lld -%d @%lx(%lx) %x->%x %d\n",
        <br>
        -                         ktime_get_boottime_ns(), pid, start,
        end - start,
        <br>
        -                         from, to, trigger);
        <br>
        +                        
        KFD_EVENT_FMT_MIGRATE_END(ktime_get_boottime_ns(), pid,
        <br>
        +                         start, end - start, from, to,
        trigger));
        <br>
          }
        <br>
        <br>
          void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t
        pid,
        <br>
                                           uint32_t trigger)
        <br>
          {
        <br>
                 kfd_smi_event_add(pid, node,
        KFD_SMI_EVENT_QUEUE_EVICTION,
        <br>
        -                         "%lld -%d %x %d\n",
        ktime_get_boottime_ns(), pid,
        <br>
        -                         node->id, trigger);
        <br>
        +                        
        KFD_EVENT_FMT_QUEUE_EVICTION(ktime_get_boottime_ns(),
        <br>
        +                         pid, node->id, trigger));
        <br>
          }
        <br>
        <br>
          void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t
        pid)
        <br>
          {
        <br>
                 kfd_smi_event_add(pid, node,
        KFD_SMI_EVENT_QUEUE_RESTORE,
        <br>
        -                         "%lld -%d %x\n",
        ktime_get_boottime_ns(), pid,
        <br>
        -                         node->id);
        <br>
        +                        
        KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(),
        <br>
        +                         pid, node->id));
        <br>
          }
        <br>
        <br>
          void kfd_smi_event_queue_restore_rescheduled(struct mm_struct
        *mm)
        <br>
        @@ -317,9 +317,10 @@ void
        kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
        <br>
                         struct kfd_process_device *pdd = p->pdds[i];
        <br>
        <br>
                         kfd_smi_event_add(p->lead_thread->pid,
        pdd->dev,
        <br>
        -                                 KFD_SMI_EVENT_QUEUE_RESTORE,
        <br>
        -                                 "%lld -%d %x %c\n",
        ktime_get_boottime_ns(),
        <br>
        -                                 p->lead_thread->pid,
        pdd->dev->id, 'R');
        <br>
        +                                
        KFD_SMI_EVENT_QUEUE_RESTORE_RESCHEDULED,
        <br>
        +                                
        KFD_EVENT_FMT_QUEUE_RESTORE_RESCHEDULED(
        <br>
        +                                 ktime_get_boottime_ns(),
        p->lead_thread->pid,
        <br>
        +                                 pdd->dev->id, 'R'));
        <br>
                 }
        <br>
                 kfd_unref_process(p);
        <br>
          }
        <br>
        @@ -329,8 +330,8 @@ void kfd_smi_event_unmap_from_gpu(struct
        kfd_node *node, pid_t pid,
        <br>
                                           uint32_t trigger)
        <br>
          {
        <br>
                 kfd_smi_event_add(pid, node,
        KFD_SMI_EVENT_UNMAP_FROM_GPU,
        <br>
        -                         "%lld -%d @%lx(%lx) %x %d\n",
        ktime_get_boottime_ns(),
        <br>
        -                         pid, address, last - address + 1,
        node->id, trigger);
        <br>
        +                        
        KFD_EVENT_FMT_UNMAP_FROM_GPU(ktime_get_boottime_ns(),
        <br>
        +                         pid, address, last - address + 1,
        node->id, trigger));
        <br>
          }
        <br>
        <br>
          int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd)
        <br>
        diff --git a/include/uapi/linux/kfd_ioctl.h
        b/include/uapi/linux/kfd_ioctl.h
        <br>
        index 9ce46edc62a5..430c01f4148b 100644
        <br>
        --- a/include/uapi/linux/kfd_ioctl.h
        <br>
        +++ b/include/uapi/linux/kfd_ioctl.h
        <br>
        @@ -523,7 +523,8 @@ enum kfd_smi_event {
        <br>
                 KFD_SMI_EVENT_PAGE_FAULT_END = 8,
        <br>
                 KFD_SMI_EVENT_QUEUE_EVICTION = 9,
        <br>
                 KFD_SMI_EVENT_QUEUE_RESTORE = 10,
        <br>
        -       KFD_SMI_EVENT_UNMAP_FROM_GPU = 11,
        <br>
        +       KFD_SMI_EVENT_QUEUE_RESTORE_RESCHEDULED = 11,
        <br>
        +       KFD_SMI_EVENT_UNMAP_FROM_GPU = 12,
        <br>
      </blockquote>
      <br>
      Why change KFD_SMI_EVENT_UNMAP_FROM_GPU from 11 to 12? That breaks
      the existing API with user space; e.g., the thunk uses
      HSA_SMI_EVENT_UNMAP_FROM_GPU = 11.
      <br>
    </blockquote>
    <p>Yes, this does break the existing event ID API. Patch 2/2 changes
      the migrate end message format, which also breaks the existing
      API. I will send a v2 patch that adds new event IDs for both the
      restore rescheduled event and the migrate failure end event.</p>
    <p>Thanks,</p>
    <p>Philip<br>
    </p>
    <blockquote type="cite" cite="mid:22db7ab1-c48c-c982-8c67-f2418becdda9@amd.com">
      <br>
      Regards
      <br>
      <br>
      Xiaogang
      <br>
      <br>
      <blockquote type="cite">         /*
        <br>
                  * max event number, as a flag bit to get events from
        all processes,
        <br>
        @@ -564,6 +565,80 @@ struct kfd_ioctl_smi_events_args {
        <br>
                 __u32 anon_fd;  /* from KFD */
        <br>
          };
        <br>
        <br>
        +/*
        <br>
        + * SVM event tracing via SMI system management interface
        <br>
        + *
        <br>
        + * Open event file descriptor
        <br>
        + *    use ioctl AMDKFD_IOC_SMI_EVENTS, pass in gpuid and return
        an anonymous file
        <br>
        + *    descriptor to receive SMI events.
        <br>
        + *    If calling with sudo permission, then file descriptor can
        be used to receive
        <br>
        + *    SVM events from all processes, otherwise, to only receive
        SVM events of same
        <br>
        + *    process.
        <br>
        + *
        <br>
        + * To enable the SVM event
        <br>
        + *    Write event file descriptor with
        KFD_SMI_EVENT_MASK_FROM_INDEX(event) bitmap
        <br>
        + *    mask to start record the event to the kfifo, use bitmap
        mask combination
        <br>
        + *    for multiple events. New event mask will overwrite the
        previous event mask.
        <br>
        + *    KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS)
        bit requires sudo
        <br>
        + *    permisson to receive SVM events from all process.
        <br>
        + *
        <br>
        + * To receive the event
        <br>
        + *    Application can poll file descriptor to wait for the
        events, then read event
        <br>
        + *    from the file into a buffer. Each event is one line
        string message, starting
        <br>
        + *    with the event id, then the event specific information.
        <br>
        + *
        <br>
        + * To decode event information
        <br>
        + *    The following event format string macro can be used with
        sscanf to decode
        <br>
        + *    the specific event information.
        <br>
        + *    event triggers: the reason to generate the event, defined
        as enum for unmap,
        <br>
        + *    eviction and migrate events.
        <br>
        + *    node, from, to, prefetch_loc, preferred_loc: GPU ID, or 0
        for system memory.
        <br>
        + *    addr: user mode address, in pages
        <br>
        + *    size: in pages
        <br>
        + *    pid: the process ID to generate the event
        <br>
        + *    ns: timestamp in nanosecond-resolution, starts at system
        boot time but
        <br>
        + *        stops during suspend
        <br>
        + *    migrate_update: the GPU page is recovered by 'M' for
        migrate, 'U' for update
        <br>
        + *    rescheduled: 'R' if the queue restore failed and
        rescheduled to try again
        <br>
        + *    rw: 'W' for write page fault, 'R' for read page fault
        <br>
        + */
        <br>
        +#define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num)\
        <br>
        +               "%x\n", (reset_seq_num)
        <br>
        +
        <br>
        +#define KFD_EVENT_FMT_UPDATE_THERMAL_THROTTLING(bitmask,
        counter)\
        <br>
        +               "%llx:%llx\n", (bitmask), (counter)
        <br>
        +
        <br>
        +#define KFD_EVENT_FMT_VMFAULT(pid, task_name)\
        <br>
        +               "%x:%s\n", (pid), (task_name)
        <br>
        +
        <br>
        +#define KFD_EVENT_FMT_PAGEFAULT_START(ns, pid, addr, node, rw)\
        <br>
        +               "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr),
        (node), (rw)
        <br>
        +
        <br>
        +#define KFD_EVENT_FMT_PAGEFAULT_END(ns, pid, addr, node,
        migrate_update)\
        <br>
        +               "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr),
        (node), (migrate_update)
        <br>
        +
        <br>
        +#define KFD_EVENT_FMT_MIGRATE_START(ns, pid, start, size, from,
        to, prefetch_loc,\
        <br>
        +               preferred_loc, migrate_trigger)\
        <br>
        +               "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", (ns),
        (pid), (start), (size),\
        <br>
        +               (from), (to), (prefetch_loc), (preferred_loc),
        (migrate_trigger)
        <br>
        +
        <br>
        +#define KFD_EVENT_FMT_MIGRATE_END(ns, pid, start, size, from,
        to, migrate_trigger)\
        <br>
        +               "%lld -%d @%lx(%lx) %x->%x %d\n", (ns),
        (pid), (start), (size),\
        <br>
        +               (from), (to), (migrate_trigger)
        <br>
        +
        <br>
        +#define KFD_EVENT_FMT_QUEUE_EVICTION(ns, pid, node,
        evict_trigger)\
        <br>
        +               "%lld -%d %x %d\n", (ns), (pid), (node),
        (evict_trigger)
        <br>
        +
        <br>
        +#define KFD_EVENT_FMT_QUEUE_RESTORE(ns, pid, node)\
        <br>
        +               "%lld -%d %x\n", (ns), (pid), (node)
        <br>
        +
        <br>
        +#define KFD_EVENT_FMT_QUEUE_RESTORE_RESCHEDULED(ns, pid, node,
        rescheduled)\
        <br>
        +               "%lld -%d %x %c\n", (ns), (pid), (node),
        (rescheduled)
        <br>
        +
        <br>
        +#define KFD_EVENT_FMT_UNMAP_FROM_GPU(ns, pid, addr, size, node,
        unmap_trigger)\
        <br>
        +               "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid),
        (addr), (size),\
        <br>
        +               (node), (unmap_trigger)
        <br>
        +
        <br>
         
/**************************************************************************************************<br>
           * CRIU IOCTLs (Checkpoint Restore In Userspace)
        <br>
           *
        <br>
        --
        <br>
        2.35.1
        <br>
        <br>
      </blockquote>
    </blockquote>
  </body>
</html>