[PATCH 27/29] drm/amdkfd: add debug queue snapshot operation
Kim, Jonathan
Jonathan.Kim at amd.com
Fri Dec 2 19:13:04 UTC 2022
[Public]
> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling at amd.com>
> Sent: November 30, 2022 6:55 PM
> To: Kim, Jonathan <Jonathan.Kim at amd.com>; amd-
> gfx at lists.freedesktop.org
> Subject: Re: [PATCH 27/29] drm/amdkfd: add debug queue snapshot
> operation
>
>
> On 2022-10-31 12:23, Jonathan Kim wrote:
> > Allow the debugger to get a snapshot of a specified number of queues
> > containing various queue property information that is copied to the
> > debugger.
> >
> > Since the debugger doesn't know how many queues exist at any given
> time,
> > allow the debugger to pass the requested number of snapshots as 0 to get
> > the actual number of potential snapshots to use for a subsequent snapshot
> > request for actual information.
> >
> > To prevent future ABI breakage, pass in the requested entry_size.
> > The KFD will return its own entry_size in case the debugger still wants to
> > log the information in a core dump on sizing failure.
> >
> > Also allow the debugger to clear exceptions when doing a snapshot.
> >
> > v2: change buf_size arg to num_queues for clarity.
> > fix minimum entry size calculation.
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
>
> Two nit-picks inline.
>
>
> > ---
> > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 6 +++
> > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 41
> +++++++++++++++++++
> > .../drm/amd/amdkfd/kfd_device_queue_manager.h | 4 ++
> > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +++
> > .../amd/amdkfd/kfd_process_queue_manager.c | 40
> ++++++++++++++++++
> > 5 files changed, 96 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > index 2c8f107237ee..cea393350980 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > @@ -2961,6 +2961,12 @@ static int kfd_ioctl_set_debug_trap(struct file
> *filep, struct kfd_process *p, v
> > &args->query_exception_info.info_size);
> > break;
> > case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> > + r = pqm_get_queue_snapshot(&target->pqm,
> > + args->queue_snapshot.exception_mask,
> > + (void __user *)args-
> >queue_snapshot.snapshot_buf_ptr,
> > + &args->queue_snapshot.num_queues,
> > + &args->queue_snapshot.entry_size);
> > + break;
> > case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> > pr_warn("Debug op %i not supported yet\n", args->op);
> > r = -EACCES;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index 589efbefc8dc..51f8c5676c56 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -2950,6 +2950,47 @@ int suspend_queues(struct kfd_process *p,
> > return total_suspended;
> > }
> >
> > +static uint32_t set_queue_type_for_user(struct queue_properties
> *q_props)
> > +{
> > + switch (q_props->type) {
> > + case KFD_QUEUE_TYPE_COMPUTE:
> > + return q_props->format == KFD_QUEUE_FORMAT_PM4
> > + ? KFD_IOC_QUEUE_TYPE_COMPUTE
> > + :
> KFD_IOC_QUEUE_TYPE_COMPUTE_AQL;
> > + case KFD_QUEUE_TYPE_SDMA:
> > + return KFD_IOC_QUEUE_TYPE_SDMA;
> > + case KFD_QUEUE_TYPE_SDMA_XGMI:
> > + return KFD_IOC_QUEUE_TYPE_SDMA_XGMI;
> > + default:
> > + WARN_ONCE(true, "queue type not recognized!");
> > + return 0xffffffff;
> > + };
> > +}
> > +
> > +void set_queue_snapshot_entry(struct device_queue_manager *dqm,
> > + struct queue *q,
> > + uint64_t exception_clear_mask,
> > + struct kfd_queue_snapshot_entry *qss_entry)
>
> The dqm parameter is not needed. The function can get this from
> q->device->dqm. It's also only needed for dqm locking. I'm not sure
> that's even necessary. Aren't the event_mutex and target process mutex
> held by the caller enough to protect the exception_status and other
> queue properties?
I can't really remember why we took the device (dqm) lock in the experimental phase, to be honest, but I think you're right.
The process event lock should protect event status on interrupt writes.
The process lock should protect everything else (property updates/destruction etc).
Thanks,
Jon
>
>
> > +{
> > + dqm_lock(dqm);
> > +
> > + qss_entry->ring_base_address = q->properties.queue_address;
> > + qss_entry->write_pointer_address = (uint64_t)q-
> >properties.write_ptr;
> > + qss_entry->read_pointer_address = (uint64_t)q-
> >properties.read_ptr;
> > + qss_entry->ctx_save_restore_address =
> > + q-
> >properties.ctx_save_restore_area_address;
> > + qss_entry->ctx_save_restore_area_size =
> > + q->properties.ctx_save_restore_area_size;
> > + qss_entry->exception_status = q->properties.exception_status;
> > + qss_entry->queue_id = q->properties.queue_id;
> > + qss_entry->gpu_id = q->device->id;
> > + qss_entry->ring_size = (uint32_t)q->properties.queue_size;
> > + qss_entry->queue_type = set_queue_type_for_user(&q-
> >properties);
> > + q->properties.exception_status &= ~exception_clear_mask;
> > +
> > + dqm_unlock(dqm);
> > +}
> > +
> > int debug_lock_and_unmap(struct device_queue_manager *dqm)
> > {
> > int r;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > index 12643528684c..094705b932fc 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > @@ -297,6 +297,10 @@ int resume_queues(struct kfd_process *p,
> > bool resume_all_queues,
> > uint32_t num_queues,
> > uint32_t *usr_queue_id_array);
> > +void set_queue_snapshot_entry(struct device_queue_manager *dqm,
> > + struct queue *q,
> > + uint64_t exception_clear_mask,
> > + struct kfd_queue_snapshot_entry *qss_entry);
> > int debug_lock_and_unmap(struct device_queue_manager *dqm);
> > int debug_map_and_unlock(struct device_queue_manager *dqm);
> > int debug_refresh_runlist(struct device_queue_manager *dqm);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index aee4fe20e676..ebd701143981 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -1302,6 +1302,11 @@ int pqm_get_wave_state(struct
> process_queue_manager *pqm,
> > void __user *ctl_stack,
> > u32 *ctl_stack_used_size,
> > u32 *save_area_used_size);
> > +int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
> > + uint64_t exception_clear_mask,
> > + struct kfd_queue_snapshot_entry __user *buf,
> > + int *num_qss_entries,
> > + uint32_t *entry_size);
> >
> > int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
> > uint64_t fence_value,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > index 15db83c9a585..30df1046c30b 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > @@ -569,6 +569,46 @@ int pqm_get_wave_state(struct
> process_queue_manager *pqm,
> > save_area_used_size);
> > }
> >
> > +int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
> > + uint64_t exception_clear_mask,
> > + struct kfd_queue_snapshot_entry __user *buf,
> > + int *num_qss_entries,
> > + uint32_t *entry_size)
> > +{
> > + struct process_queue_node *pqn;
> > + uint32_t tmp_entry_size = *entry_size, tmp_qss_entries =
> *num_qss_entries;
> > + int r;
> > +
> > + *num_qss_entries = 0;
> > + if (!(*entry_size))
> > + return -EINVAL;
> > +
> > + *entry_size = min_t(size_t, *entry_size, sizeof(struct
> kfd_queue_snapshot_entry));
> > + mutex_lock(&pqm->process->event_mutex);
> > +
> > + list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
> > + if (!pqn->q)
> > + continue;
> > +
> > + if (*num_qss_entries < tmp_qss_entries) {
> > + struct kfd_queue_snapshot_entry src = {0};
>
> It's safer to use memset here. This initialization may not initialize
> padding, so it doesn't guarantee that no uninitialized data leaks from
> kernel mode to user mode.
>
> Regards,
> Felix
>
>
> > +
> > + set_queue_snapshot_entry(pqn->q->device->dqm,
> > + pqn->q, exception_clear_mask,
> &src);
> > +
> > + if (copy_to_user(buf, &src, *entry_size)) {
> > + r = -EFAULT;
> > + break;
> > + }
> > + buf += tmp_entry_size;
> > + }
> > + *num_qss_entries += 1;
> > + }
> > +
> > + mutex_unlock(&pqm->process->event_mutex);
> > + return r;
> > +}
> > +
> > static int get_queue_data_sizes(struct kfd_process_device *pdd,
> > struct queue *q,
> > uint32_t *mqd_size,
-------------- next part --------------
A non-text attachment was scrubbed...
Name: winmail.dat
Type: application/ms-tnef
Size: 19417 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20221202/540e0bc6/attachment-0001.bin>
More information about the amd-gfx
mailing list