[PATCH 2/2] drm/amdkfd: support the debugger during per-queue reset

Kim, Jonathan Jonathan.Kim at amd.com
Tue Jul 30 23:23:21 UTC 2024


[Public]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling at amd.com>
> Sent: Tuesday, July 30, 2024 6:17 PM
> To: Kim, Jonathan <Jonathan.Kim at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>
> Subject: Re: [PATCH 2/2] drm/amdkfd: support the debugger during per-
> queue reset
>
>
>
> On 2024-07-26 11:30, Jonathan Kim wrote:
> > In order to allow ROCm GDB to handle reset queues, raise an
> > EC_QUEUE_RESET exception so that the debugger can subscribe and
> > query this exception.
> >
> > Reset queues should still be considered suspendable with a status
> > flag of KFD_DBG_QUEUE_RESET_MASK.
> > However they should not be resumable since user space will no longer
> > be able to access reset queues.
> >
> > v2: move per-queue reset flag to this patch
> > rebase based on patch 1 changes
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> > ---
> >  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 31
> ++++++++++++++++---
> >  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  1 +
> >  include/uapi/linux/kfd_ioctl.h                |  4 +++
> >  3 files changed, 31 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index e335703eff84..cb7b5bbf5c40 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -164,6 +164,10 @@ static void kfd_hws_hang(struct
> device_queue_manager *dqm)
> >                     struct kfd_process_device *pdd = qpd_to_pdd(qpd);
> >
> >                     pdd->has_reset_queue = true;
> > +                   q->properties.is_reset = true;
> > +                   kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET),
> > +                                    q->process, q->device, q-
> >doorbell_id,
> > +                                    false, NULL, 0);
> >             }
> >     }
> >
> > @@ -986,7 +990,7 @@ static int suspend_single_queue(struct
> device_queue_manager *dqm,
> >  {
> >     bool is_new;
> >
> > -   if (q->properties.is_suspended)
> > +   if (q->properties.is_suspended || q->properties.is_reset)
> >             return 0;
> >
> >     pr_debug("Suspending PASID %u queue [%i]\n",
> > @@ -1007,6 +1011,9 @@ static int suspend_single_queue(struct
> device_queue_manager *dqm,
> >             if (dqm->dev->kfd->shared_resources.enable_mes) {
> >                     int r = remove_queue_mes(dqm, q, &pdd->qpd);
> >
> > +                   if (q->properties.is_reset)
> > +                           return 0;
> > +
> >                     if (r)
> >                             return r;
> >             }
> > @@ -1967,10 +1974,14 @@ static void set_queue_as_reset(struct
> device_queue_manager *dqm, struct queue *q
> >            q->properties.queue_id, q->process->pasid);
> >
> >     pdd->has_reset_queue = true;
> > +   q->properties.is_reset = true;
> >     if (q->properties.is_active) {
> >             q->properties.is_active = false;
> >             decrement_queue_count(dqm, qpd, q);
> >     }
> > +
> > +   kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q-
> >device,
> > +                    q->doorbell_id, false, NULL, 0);
> >  }
> >
> >  static int detect_queue_hang(struct device_queue_manager *dqm)
> > @@ -3037,7 +3048,8 @@ int resume_queues(struct kfd_process *p,
> >                                             queue_ids[q_idx] &=
> >
>       ~KFD_DBG_QUEUE_INVALID_MASK;
> >                                     } else {
> > -                                           queue_ids[q_idx] |=
> > +                                           queue_ids[q_idx] |= q-
> >properties.is_reset ?
> > +
>       KFD_DBG_QUEUE_RESET_MASK :
> >
>       KFD_DBG_QUEUE_ERROR_MASK;
> >                                             break;
> >                                     }
> > @@ -3072,7 +3084,7 @@ int resume_queues(struct kfd_process *p,
> >                                                     queue_ids);
> >
> >                                     /* mask queue as error on resume fail
> */
> > -                                   if (q_idx != QUEUE_NOT_FOUND)
> > +                                   if (q_idx != QUEUE_NOT_FOUND
> && !q->properties.is_reset)
> >                                             queue_ids[q_idx] |=
> >
>       KFD_DBG_QUEUE_ERROR_MASK;
> >                             }
> > @@ -3119,6 +3131,7 @@ int suspend_queues(struct kfd_process *p,
> >             struct qcm_process_device *qpd = &pdd->qpd;
> >             struct queue *q;
> >             int r, per_device_suspended = 0;
> > +           bool has_queue_reset_fail = false;
> >
> >             mutex_lock(&p->event_mutex);
> >             dqm_lock(dqm);
> > @@ -3135,6 +3148,9 @@ int suspend_queues(struct kfd_process *p,
> >
> >                             if (!err) {
> >                                     queue_ids[q_idx] &=
> ~KFD_DBG_QUEUE_INVALID_MASK;
> > +                                   if (q->properties.is_reset)
> > +                                           queue_ids[q_idx] |=
> KFD_DBG_QUEUE_RESET_MASK;
> > +
> >                                     if (exception_clear_mask && is_mes)
> >                                             q-
> >properties.exception_status &=
> >
>       ~exception_clear_mask;
> > @@ -3176,13 +3192,18 @@ int suspend_queues(struct kfd_process *p,
> >                             continue;
> >
> >                     /* mask queue as error on suspend fail */
> > -                   if (r)
> > +                   if (r && !q->properties.is_reset) {
> > +                           has_queue_reset_fail = true;
> >                             queue_ids[q_idx] |=
> KFD_DBG_QUEUE_ERROR_MASK;
> > -                   else if (exception_clear_mask)
> > +                   } else if (exception_clear_mask) {
> >                             q->properties.exception_status &=
> >
>       ~exception_clear_mask;
> > +                   }
> >             }
> >
> > +           if (!has_queue_reset_fail)
> > +                   total_suspended += per_device_suspended;
> > +
> >             dqm_unlock(dqm);
> >             mutex_unlock(&p->event_mutex);
> >             amdgpu_device_flush_hdp(dqm->dev->adev, NULL);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index 892a85408c09..192e3102c152 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -504,6 +504,7 @@ struct queue_properties {
> >     bool is_being_destroyed;
> >     bool is_active;
> >     bool is_gws;
> > +   bool is_reset;
> >     uint32_t pm4_target_xcc;
> >     bool is_dbg_wa;
> >     bool is_user_cu_masked;
> > diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> > index 285a36601dc9..4713f9a6796e 100644
> > --- a/include/uapi/linux/kfd_ioctl.h
> > +++ b/include/uapi/linux/kfd_ioctl.h
> > @@ -875,6 +875,7 @@ enum kfd_dbg_trap_exception_code {
> >     EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
> >     EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
> >     EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
> > +   EC_QUEUE_RESET = 29,
> >     EC_QUEUE_PREEMPTION_ERROR = 30,
>
> Do we really need to distinguish between preemption errors and resets? I
> mean, both are caused by preemption errors. The only difference is, that in the
> "reset" case we succeeded in resetting only the affected queues.
>
> >     EC_QUEUE_NEW = 31,
> >     /* per device */
> > @@ -907,6 +908,7 @@ enum kfd_dbg_trap_exception_code {
> >
> KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID
> ) |   \
> >
> KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |      \
> >
> KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED)       |       \
> > +                            KFD_EC_MASK(EC_QUEUE_RESET)    |
>       \
> >
> KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR)        |       \
> >                              KFD_EC_MASK(EC_QUEUE_NEW))
> >  #define KFD_EC_MASK_DEVICE
>       (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) |          \
> > @@ -997,8 +999,10 @@ struct kfd_queue_snapshot_entry {
> >  };
> >
> >  /* Queue status return for suspend/resume */
> > +#define KFD_DBG_QUEUE_RESET_BIT            29
> >  #define KFD_DBG_QUEUE_ERROR_BIT            30
> >  #define KFD_DBG_QUEUE_INVALID_BIT  31
> > +#define KFD_DBG_QUEUE_RESET_MASK   (1 <<
> KFD_DBG_QUEUE_RESET_BIT)
>
> Same as above. If the distinction is not necessary, we don't need to change the
> user mode API. Just report queue resets as generic preemption errors.

Ok.  Let me think more about this.  Right now preemption errors don't get counted as suspended queues so we'd end up changing the behaviour of the existing API anyways because the request is for reset queues to be counted as suspended so that the debugger knows to inspect them.
That being said, maybe there's a need to distinguish generic errors when a gpu reset fails versus when things actual reset whether per-adapter or per-queue.

Thanks,

Jon

>
> Regards,
>   Felix
>
> >  #define KFD_DBG_QUEUE_ERROR_MASK   (1 <<
> KFD_DBG_QUEUE_ERROR_BIT)
> >  #define KFD_DBG_QUEUE_INVALID_MASK (1 <<
> KFD_DBG_QUEUE_INVALID_BIT)
> >


More information about the amd-gfx mailing list