[PATCH 2/2] drm/amdkfd: support the debugger during per-queue reset
Kim, Jonathan
Jonathan.Kim at amd.com
Fri Jul 26 19:36:03 UTC 2024
[Public]
> -----Original Message-----
> From: Alex Deucher <alexdeucher at gmail.com>
> Sent: Friday, July 26, 2024 2:57 PM
> To: Kim, Jonathan <Jonathan.Kim at amd.com>
> Cc: amd-gfx at lists.freedesktop.org; Kuehling, Felix
> <Felix.Kuehling at amd.com>; Deucher, Alexander
> <Alexander.Deucher at amd.com>
> Subject: Re: [PATCH 2/2] drm/amdkfd: support the debugger during per-
> queue reset
>
> Caution: This message originated from an External Source. Use proper caution
> when opening attachments, clicking links, or responding.
>
>
> On Fri, Jul 26, 2024 at 11:40 AM Jonathan Kim <Jonathan.Kim at amd.com>
> wrote:
> >
> > In order to allow ROCm GDB to handle reset queues, raise an
> > EC_QUEUE_RESET exception so that the debugger can subscribe and
> > query this exception.
> >
> > Reset queues should still be considered suspendable with a status
> > flag of KFD_DBG_QUEUE_RESET_MASK.
> > However they should not be resumable since user space will no longer
> > be able to access reset queues.
>
> Do you have a link to the proposed debugger code which uses this?
Not yet. + Laurent/Lancelot for awareness.
Jon
>
> Alex
>
> >
> > v2: move per-queue reset flag to this patch
> > rebase based on patch 1 changes
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> > ---
> > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 31
> ++++++++++++++++---
> > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
> > include/uapi/linux/kfd_ioctl.h | 4 +++
> > 3 files changed, 31 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index e335703eff84..cb7b5bbf5c40 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -164,6 +164,10 @@ static void kfd_hws_hang(struct
> device_queue_manager *dqm)
> > struct kfd_process_device *pdd = qpd_to_pdd(qpd);
> >
> > pdd->has_reset_queue = true;
> > + q->properties.is_reset = true;
> > + kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET),
> > + q->process, q->device, q->doorbell_id,
> > + false, NULL, 0);
> > }
> > }
> >
> > @@ -986,7 +990,7 @@ static int suspend_single_queue(struct
> device_queue_manager *dqm,
> > {
> > bool is_new;
> >
> > - if (q->properties.is_suspended)
> > + if (q->properties.is_suspended || q->properties.is_reset)
> > return 0;
> >
> > pr_debug("Suspending PASID %u queue [%i]\n",
> > @@ -1007,6 +1011,9 @@ static int suspend_single_queue(struct
> device_queue_manager *dqm,
> > if (dqm->dev->kfd->shared_resources.enable_mes) {
> > int r = remove_queue_mes(dqm, q, &pdd->qpd);
> >
> > + if (q->properties.is_reset)
> > + return 0;
> > +
> > if (r)
> > return r;
> > }
> > @@ -1967,10 +1974,14 @@ static void set_queue_as_reset(struct
> device_queue_manager *dqm, struct queue *q
> > q->properties.queue_id, q->process->pasid);
> >
> > pdd->has_reset_queue = true;
> > + q->properties.is_reset = true;
> > if (q->properties.is_active) {
> > q->properties.is_active = false;
> > decrement_queue_count(dqm, qpd, q);
> > }
> > +
> > + kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q->device,
> > + q->doorbell_id, false, NULL, 0);
> > }
> >
> > static int detect_queue_hang(struct device_queue_manager *dqm)
> > @@ -3037,7 +3048,8 @@ int resume_queues(struct kfd_process *p,
> > queue_ids[q_idx] &=
> > ~KFD_DBG_QUEUE_INVALID_MASK;
> > } else {
> > - queue_ids[q_idx] |=
> > + queue_ids[q_idx] |= q->properties.is_reset ?
> > + KFD_DBG_QUEUE_RESET_MASK :
> > KFD_DBG_QUEUE_ERROR_MASK;
> > break;
> > }
> > @@ -3072,7 +3084,7 @@ int resume_queues(struct kfd_process *p,
> > queue_ids);
> >
> > /* mask queue as error on resume fail */
> > - if (q_idx != QUEUE_NOT_FOUND)
> > + if (q_idx != QUEUE_NOT_FOUND && !q->properties.is_reset)
> > queue_ids[q_idx] |=
> > KFD_DBG_QUEUE_ERROR_MASK;
> > }
> > @@ -3119,6 +3131,7 @@ int suspend_queues(struct kfd_process *p,
> > struct qcm_process_device *qpd = &pdd->qpd;
> > struct queue *q;
> > int r, per_device_suspended = 0;
> > + bool has_queue_reset_fail = false;
> >
> > mutex_lock(&p->event_mutex);
> > dqm_lock(dqm);
> > @@ -3135,6 +3148,9 @@ int suspend_queues(struct kfd_process *p,
> >
> > if (!err) {
> > queue_ids[q_idx] &=
> ~KFD_DBG_QUEUE_INVALID_MASK;
> > + if (q->properties.is_reset)
> > + queue_ids[q_idx] |= KFD_DBG_QUEUE_RESET_MASK;
> > +
> > if (exception_clear_mask && is_mes)
> > q->properties.exception_status &=
> > ~exception_clear_mask;
> > @@ -3176,13 +3192,18 @@ int suspend_queues(struct kfd_process *p,
> > continue;
> >
> > /* mask queue as error on suspend fail */
> > - if (r)
> > + if (r && !q->properties.is_reset) {
> > + has_queue_reset_fail = true;
> > queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
> > - else if (exception_clear_mask)
> > + } else if (exception_clear_mask) {
> > q->properties.exception_status &=
> > ~exception_clear_mask;
> > + }
> > }
> >
> > + if (!has_queue_reset_fail)
> > + total_suspended += per_device_suspended;
> > +
> > dqm_unlock(dqm);
> > mutex_unlock(&p->event_mutex);
> > amdgpu_device_flush_hdp(dqm->dev->adev, NULL);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index 892a85408c09..192e3102c152 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -504,6 +504,7 @@ struct queue_properties {
> > bool is_being_destroyed;
> > bool is_active;
> > bool is_gws;
> > + bool is_reset;
> > uint32_t pm4_target_xcc;
> > bool is_dbg_wa;
> > bool is_user_cu_masked;
> > diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> > index 285a36601dc9..4713f9a6796e 100644
> > --- a/include/uapi/linux/kfd_ioctl.h
> > +++ b/include/uapi/linux/kfd_ioctl.h
> > @@ -875,6 +875,7 @@ enum kfd_dbg_trap_exception_code {
> > EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
> > EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
> > EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
> > + EC_QUEUE_RESET = 29,
> > EC_QUEUE_PREEMPTION_ERROR = 30,
> > EC_QUEUE_NEW = 31,
> > /* per device */
> > @@ -907,6 +908,7 @@ enum kfd_dbg_trap_exception_code {
> >
> KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID
> ) | \
> >
> KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) | \
> >
> KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED) | \
> > + KFD_EC_MASK(EC_QUEUE_RESET) | \
> > KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) | \
> > KFD_EC_MASK(EC_QUEUE_NEW))
> > #define KFD_EC_MASK_DEVICE
> (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) | \
> > @@ -997,8 +999,10 @@ struct kfd_queue_snapshot_entry {
> > };
> >
> > /* Queue status return for suspend/resume */
> > +#define KFD_DBG_QUEUE_RESET_BIT 29
> > #define KFD_DBG_QUEUE_ERROR_BIT 30
> > #define KFD_DBG_QUEUE_INVALID_BIT 31
> > +#define KFD_DBG_QUEUE_RESET_MASK (1 << KFD_DBG_QUEUE_RESET_BIT)
> > #define KFD_DBG_QUEUE_ERROR_MASK (1 << KFD_DBG_QUEUE_ERROR_BIT)
> > #define KFD_DBG_QUEUE_INVALID_MASK (1 << KFD_DBG_QUEUE_INVALID_BIT)
> >
> > --
> > 2.34.1
> >
More information about the amd-gfx
mailing list