[PATCH v2 5/9] drm/xe: Add dependency scheduler for GT TLB invalidations to bind queues
Matthew Brost
matthew.brost at intel.com
Tue Jul 15 22:01:59 UTC 2025
On Tue, Jul 15, 2025 at 03:53:27PM -0600, Summers, Stuart wrote:
> On Tue, 2025-07-15 at 14:52 -0700, Matthew Brost wrote:
> > On Tue, Jul 15, 2025 at 03:45:55PM -0600, Summers, Stuart wrote:
> > > On Tue, 2025-07-15 at 14:44 -0700, Matthew Brost wrote:
> > > > On Tue, Jul 15, 2025 at 03:34:29PM -0600, Summers, Stuart wrote:
> > > > > On Wed, 2025-07-02 at 16:42 -0700, Matthew Brost wrote:
> > > > > > Add a generic dependency scheduler for GT TLB invalidations,
> > > > > > used
> > > > > > to
> > > > > > schedule jobs that issue GT TLB invalidations to bind queues.
> > > > > >
> > > > > > v2:
> > > > > > - Use shared GT TLB invalidation queue for dep scheduler
> > > > > > - Break allocation of dep scheduler into its own function
> > > > > > - Add define for max number of TLB invalidations
> > > > > > - Skip media if not present
> > > > > >
> > > > > > Suggested-by: Thomas Hellström
> > > > > > <thomas.hellstrom at linux.intel.com>
> > > > > > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> > > > > > ---
> > > > > > drivers/gpu/drm/xe/xe_exec_queue.c       | 48 ++++++++++++++++++++++++
> > > > > > drivers/gpu/drm/xe/xe_exec_queue_types.h | 13 +++++++
> > > > > > 2 files changed, 61 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > > index fee22358cc09..7aaf669cf5fc 100644
> > > > > > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > > > > @@ -12,6 +12,7 @@
> > > > > > #include <drm/drm_file.h>
> > > > > > #include <uapi/drm/xe_drm.h>
> > > > > >
> > > > > > +#include "xe_dep_scheduler.h"
> > > > > > #include "xe_device.h"
> > > > > > #include "xe_gt.h"
> > > > > > #include "xe_hw_engine_class_sysfs.h"
> > > > > > @@ -39,6 +40,12 @@ static int exec_queue_user_extensions(struct xe_device *xe, struct xe_exec_queue
> > > > > >
> > > > > > static void __xe_exec_queue_free(struct xe_exec_queue *q)
> > > > > > {
> > > > > > + int i;
> > > > > > +
> > > > > > + for (i = 0; i < XE_EXEC_QUEUE_TLB_INVAL_COUNT; ++i)
> > > > > > + if (q->tlb_inval[i].dep_scheduler)
> > > > > > + xe_dep_scheduler_fini(q->tlb_inval[i].dep_scheduler);
> > > > > > +
> > > > > > if (xe_exec_queue_uses_pxp(q))
> > > > > > xe_pxp_exec_queue_remove(gt_to_xe(q->gt)->pxp, q);
> > > > > > if (q->vm)
> > > > > > @@ -50,6 +57,39 @@ static void __xe_exec_queue_free(struct xe_exec_queue *q)
> > > > > > kfree(q);
> > > > > > }
> > > > > >
> > > > > > +static int alloc_dep_schedulers(struct xe_device *xe, struct xe_exec_queue *q)
> > > > > > +{
> > > > > > + struct xe_tile *tile = gt_to_tile(q->gt);
> > > > > > + int i;
> > > > > > +
> > > > > > + for (i = 0; i < XE_EXEC_QUEUE_TLB_INVAL_COUNT; ++i) {
> > > > > > + struct xe_dep_scheduler *dep_scheduler;
> > > > > > + struct xe_gt *gt;
> > > > > > + struct workqueue_struct *wq;
> > > > > > +
> > > > > > + if (i == XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT)
> > > > > > + gt = tile->primary_gt;
> > > > > > + else
> > > > > > + gt = tile->media_gt;
> > > > > > +
> > > > > > + if (!gt)
> > > > > > + continue;
> > > > > > +
> > > > > > + wq = gt->tlb_invalidation.job_wq;
> > > > > > +
> > > > > > +#define MAX_TLB_INVAL_JOBS 16 /* Picking a reasonable value */
> > > > >
> > > > > So if we exceed this number, does that mean we won't get an
> > > > > invalidation of that range before a context switch to a new
> > > > > queue?
> >
> > Sorry missed this.
> >
> > No. This is just the maximum number of TLB invalidations, on this
> > queue, which can be in flight to the hardware at once. If
> > MAX_TLB_INVAL_JOBS is exceeded, the scheduler just holds jobs until
> > prior ones complete and more job credits become available.
> >
> > Context switching and other queues are not really related here. Think
> > of this as a FIFO in which only MAX_TLB_INVAL_JOBS can be outstanding
> > at a time.
>
> Got it. So basically summarizing: if I were to theoretically submit 17
> TLB invalidations at once (before hardware completed any of them),
> followed by a submission from userspace on a new context, the scheduler
> would still accept all of these, but would only submit the first 16 to
> hardware. It would then submit the 17th once a hardware slot became
> available, and finally that new user context. Is that right?
Yes, exactly. TLB invalidations from a bind or unbind resolve to a
fence, which is installed in the dma-resv slots and returned to users
via sync objects. The dma-resv slots are used to ensure that KMD-issued
binds are complete before scheduling a user submission. Meanwhile, the
user should use the output DRM syncobj from a bind/unbind as input to a
submission if there is a dependency.
This is all part of the existing flow; this series only changes how TLB
invalidations are scheduled and how the fences are generated.
Matt
>
> Thanks,
> Stuart
>
> >
> > Matt
> >
> > > > >
> > > > > > + dep_scheduler = xe_dep_scheduler_create(xe, wq, q->name,
> > > > > > + MAX_TLB_INVAL_JOBS);
> > > > > > + if (IS_ERR(dep_scheduler))
> > > > > > + return PTR_ERR(dep_scheduler);
> > > > > > +
> > > > > > + q->tlb_inval[i].dep_scheduler = dep_scheduler;
> > > > > > + }
> > > > > > +#undef MAX_TLB_INVAL_JOBS
> > > > > > +
> > > > > > + return 0;
> > > > > > +}
> > > > > > +
> > > > > > static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe,
> > > > > > struct xe_vm *vm,
> > > > > > u32 logical_mask,
> > > > > > @@ -94,6 +134,14 @@ static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe,
> > > > > > else
> > > > > > q->sched_props.priority = XE_EXEC_QUEUE_PRIORITY_NORMAL;
> > > > >
> > > > > We want these to be high priority right? I.e. if another user
> > > > > creates a
> > > > > queue and submits at high priority can we get stale data on
> > > > > those
> > > > > executions if this one is blocked behind that high priority
> > > > > one?
> > > >
> > > > I'm struggling with exactly what you are asking here.
> > > >
> > > > q->sched_props.priority is the priority of the exec queue being
> > > > created and is existing code. In the case of bind queues (i.e.,
> > > > ones that call alloc_dep_schedulers) this maps to the priority of
> > > > the GuC context which runs bind jobs on a hardware copy engine.
> > > >
> > > > dep_schedulers do not have a priority, at least not one that is
> > > > used in any way. A dep_scheduler is just a software queue which
> > > > runs TLB invalidations once the bind jobs complete.
> > > >
> > > > Wrt high priority queues passing lower priority ones, that isn't
> > > > possible if there is a dependency chain. The driver holds any jobs
> > > > in the KMD until all dependencies are resolved - high priority
> > > > queues just get serviced by the GuC ahead of low priority queues
> > > > when both have runnable jobs.
> > > >
> > > > TL;DR nothing in this patch changes anything wrt priorities.
> > >
> > > You're right I think I read this wrong somehow :(, ignore my
> > > comment!
> > >
> > > Still the minor question above about the max number of dependency
> > > jobs
> > > when you have time.
> > >
> > > Thanks,
> > > Stuart
> > >
> > > >
> > > > Matt
> > > >
> > > > >
> > > > > Thanks,
> > > > > Stuart
> > > > >
> > > > > >
> > > > > > + if (q->flags & (EXEC_QUEUE_FLAG_MIGRATE | EXEC_QUEUE_FLAG_VM)) {
> > > > > > + err = alloc_dep_schedulers(xe, q);
> > > > > > + if (err) {
> > > > > > + __xe_exec_queue_free(q);
> > > > > > + return ERR_PTR(err);
> > > > > > + }
> > > > > > + }
> > > > > > +
> > > > > > if (vm)
> > > > > > q->vm = xe_vm_get(vm);
> > > > > >
> > > > > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > > > index abdf4a57e6e2..ba443a497b38 100644
> > > > > > --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > > > +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > > > > > @@ -134,6 +134,19 @@ struct xe_exec_queue {
> > > > > > struct list_head link;
> > > > > > } lr;
> > > > > >
> > > > > > +#define XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT 0
> > > > > > +#define XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT 1
> > > > > > +#define XE_EXEC_QUEUE_TLB_INVAL_COUNT (XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT + 1)
> > > > > > +
> > > > > > + /** @tlb_inval: TLB invalidations exec queue state */
> > > > > > + struct {
> > > > > > + /**
> > > > > > + * @tlb_inval.dep_scheduler: The TLB invalidation
> > > > > > + * dependency scheduler
> > > > > > + */
> > > > > > + struct xe_dep_scheduler *dep_scheduler;
> > > > > > + } tlb_inval[XE_EXEC_QUEUE_TLB_INVAL_COUNT];
> > > > > > +
> > > > > > /** @pxp: PXP info tracking */
> > > > > > struct {
> > > > > > /** @pxp.type: PXP session type used by this queue */
> > > > >
> > >
>