[PATCH 2/2] drm/xe: Use topology to determine page fault queue size

Summers, Stuart stuart.summers at intel.com
Thu Jul 11 17:26:11 UTC 2024


On Tue, 2024-07-09 at 22:53 +0000, Summers, Stuart wrote:
> On Tue, 2024-07-09 at 20:52 +0000, Matthew Brost wrote:
> > On Tue, Jul 09, 2024 at 06:41:00PM +0000, Stuart Summers wrote:
> > > Currently the page fault queue size is hard coded. However
> > > the hardware supports faulting for each EU and each CS.
> > > For some applications running on hardware with a large
> > > number of EUs and CSs, this can result in an overflow of
> > > the page fault queue.
> > > 
> > > Add a small calculation to determine the page fault queue
> > > size based on the number of EUs and CSs in the platform as
> > > determined by fuses.
> > > 
> > > Signed-off-by: Stuart Summers <stuart.summers at intel.com>
> > > ---
> > >  drivers/gpu/drm/xe/xe_gt_pagefault.c | 54 +++++++++++++++++++++-------
> > >  drivers/gpu/drm/xe/xe_gt_types.h     |  9 +++--
> > >  2 files changed, 49 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > index ded8e3c33a82..8f7f68032805 100644
> > > --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > @@ -295,7 +295,7 @@ static bool get_pagefault(struct pf_queue *pf_queue, struct pagefault *pf)
> > >                         PFD_VIRTUAL_ADDR_LO_SHIFT;
> > >  
> > >                 pf_queue->tail = (pf_queue->tail + PF_MSG_LEN_DW) %
> > > -                       PF_QUEUE_NUM_DW;
> > > +                       pf_queue->pf_queue_num_dw;
> > >                 ret = true;
> > >         }
> > >         spin_unlock_irq(&pf_queue->lock);
> > > @@ -307,7 +307,8 @@ static bool pf_queue_full(struct pf_queue *pf_queue)
> > >  {
> > >         lockdep_assert_held(&pf_queue->lock);
> > >  
> > > -       return CIRC_SPACE(pf_queue->head, pf_queue->tail, PF_QUEUE_NUM_DW) <=
> > > +       return CIRC_SPACE(pf_queue->head, pf_queue->tail,
> > > +                         pf_queue->pf_queue_num_dw) <=
> > >                 PF_MSG_LEN_DW;
> > >  }
> > >  
> > > @@ -320,22 +321,23 @@ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
> > >         u32 asid;
> > >         bool full;
> > >  
> > > -       /*
> > > -        * The below logic doesn't work unless PF_QUEUE_NUM_DW % PF_MSG_LEN_DW == 0
> > > -        */
> > > -       BUILD_BUG_ON(PF_QUEUE_NUM_DW % PF_MSG_LEN_DW);
> > > -
> > >         if (unlikely(len != PF_MSG_LEN_DW))
> > >                 return -EPROTO;
> > >  
> > >         asid = FIELD_GET(PFD_ASID, msg[1]);
> > >         pf_queue = gt->usm.pf_queue + (asid % NUM_PF_QUEUE);
> > >  
> > > +       /*
> > > +        * The below logic doesn't work unless PF_QUEUE_NUM_DW % PF_MSG_LEN_DW == 0
> > > +        */
> > > +       xe_gt_assert(gt, !(pf_queue->pf_queue_num_dw % PF_MSG_LEN_DW));
> > > +
> > >         spin_lock_irqsave(&pf_queue->lock, flags);
> > >         full = pf_queue_full(pf_queue);
> > >         if (!full) {
> > >                 memcpy(pf_queue->data + pf_queue->head, msg, len * sizeof(u32));
> > > -               pf_queue->head = (pf_queue->head + len) % PF_QUEUE_NUM_DW;
> > > +               pf_queue->head = (pf_queue->head + len) %
> > > +                       pf_queue->pf_queue_num_dw;
> > >                 queue_work(gt->usm.pf_wq, &pf_queue->worker);
> > >         } else {
> > >                 drm_warn(&xe->drm, "PF Queue full, shouldn't be possible");
> > > @@ -402,26 +404,54 @@ static void pagefault_fini(void *arg)
> > >  {
> > >         struct xe_gt *gt = arg;
> > >         struct xe_device *xe = gt_to_xe(gt);
> > > +       int i;
> > >  
> > >         if (!xe->info.has_usm)
> > >                 return;
> > >  
> > >         destroy_workqueue(gt->usm.acc_wq);
> > >         destroy_workqueue(gt->usm.pf_wq);
> > > +
> > > +       for (i = 0; i < NUM_PF_QUEUE; ++i)
> > > +               kfree(gt->usm.pf_queue[i].data);
> > > +}
> > > +
> > > +static int xe_alloc_pf_queue(struct xe_gt *gt, struct pf_queue *pf_queue)
> > > +{
> > > +       xe_dss_mask_t all_dss;
> > > +       int num_dss, num_eus;
> > > +
> > > +       bitmap_or(all_dss, gt->fuse_topo.g_dss_mask, gt->fuse_topo.c_dss_mask,
> > > +                 XE_MAX_DSS_FUSE_BITS);
> > > +
> > > +       num_dss = bitmap_weight(all_dss, XE_MAX_DSS_FUSE_BITS);
> > > +       num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss,
> > > +                               XE_MAX_EU_FUSE_BITS) * num_dss;
> > > +
> > > +       /* user can issue separate page faults per EU and per CS */
> > > +       pf_queue->pf_queue_num_dw =
> > > +               ALIGN(num_eus + XE_NUM_HW_ENGINES, PF_MSG_LEN_DW);
> > 
> > This doesn't look correct.
> > 
> > It should be (num_eus + XE_NUM_HW_ENGINES) * PF_MSG_LEN_DW, right?
> 
> Yeah thanks for the correction here, you're right.
> 
> > 
> > Also I believe we need similar math when sizing the G2H queue too,
> > as PFs show up in that queue first before getting sunk into this
> > queue.

Hey Matt,

So on second thought, do we really need to make this change in the ctb?
For the PF queue it makes sense - each queue might be doing some memory
operation which takes time. So presumably all of the queues could be
active at once.
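
To put rough numbers on it (purely hypothetical fuse values, just for
illustration): with 64 DSS enabled and 8 EUs per DSS, that's 512 EUs,
so the corrected math gives (512 + XE_NUM_HW_ENGINES) * PF_MSG_LEN_DW
DWs per queue - well beyond the old fixed PF_QUEUE_NUM_DW of 128 - and
in the worst case all of those faults could be outstanding at once.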

On the other hand, the ctb is processing messages and handing them off
quickly, agnostic to whether they are page faults or something else.

IMO we should keep the ctb sizing separate and increase it only if the
memory traffic there is causing unnecessary backpressure.
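
As for the sizing fix itself, below is roughly the shape I'm thinking
for v2 - a sketch only, not the final patch. It folds in your
(num_eus + XE_NUM_HW_ENGINES) * PF_MSG_LEN_DW correction, sizes the
kzalloc() in bytes rather than DWs, and adds a -ENOMEM check since the
function already returns int:

static int xe_alloc_pf_queue(struct xe_gt *gt, struct pf_queue *pf_queue)
{
	xe_dss_mask_t all_dss;
	int num_dss, num_eus;

	bitmap_or(all_dss, gt->fuse_topo.g_dss_mask, gt->fuse_topo.c_dss_mask,
		  XE_MAX_DSS_FUSE_BITS);

	num_dss = bitmap_weight(all_dss, XE_MAX_DSS_FUSE_BITS);
	num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss,
				XE_MAX_EU_FUSE_BITS) * num_dss;

	/*
	 * User can issue separate page faults per EU and per CS. The
	 * result is already a multiple of PF_MSG_LEN_DW, so no ALIGN()
	 * is needed.
	 */
	pf_queue->pf_queue_num_dw =
		(num_eus + XE_NUM_HW_ENGINES) * PF_MSG_LEN_DW;

	pf_queue->gt = gt;
	/* kzalloc() takes a size in bytes, not DWs */
	pf_queue->data = kzalloc(pf_queue->pf_queue_num_dw * sizeof(u32),
				 GFP_KERNEL);
	if (!pf_queue->data)
		return -ENOMEM;

	spin_lock_init(&pf_queue->lock);
	INIT_WORK(&pf_queue->worker, pf_queue_work_func);

	return 0;
}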

Thanks,
Stuart

> 
> Ah you had made that comment the first time around and I missed it :(
> 
> Let me clean that up and get back.
> 
> Thanks,
> Stuart
> 
> > 
> > Matt
> > 
> > > +
> > > +       pf_queue->gt = gt;
> > > +       pf_queue->data = kzalloc(pf_queue->pf_queue_num_dw, GFP_KERNEL);
> > > +       spin_lock_init(&pf_queue->lock);
> > > +       INIT_WORK(&pf_queue->worker, pf_queue_work_func);
> > > +
> > > +       return 0;
> > >  }
> > >  
> > >  int xe_gt_pagefault_init(struct xe_gt *gt)
> > >  {
> > >         struct xe_device *xe = gt_to_xe(gt);
> > > -       int i;
> > > +       int i, ret = 0;
> > >  
> > >         if (!xe->info.has_usm)
> > >                 return 0;
> > >  
> > >         for (i = 0; i < NUM_PF_QUEUE; ++i) {
> > > -               gt->usm.pf_queue[i].gt = gt;
> > > -               spin_lock_init(&gt->usm.pf_queue[i].lock);
> > > -               INIT_WORK(&gt->usm.pf_queue[i].worker, pf_queue_work_func);
> > > +               ret = xe_alloc_pf_queue(gt, &gt->usm.pf_queue[i]);
> > > +               if (ret)
> > > +                       return ret;
> > >         }
> > >         for (i = 0; i < NUM_ACC_QUEUE; ++i) {
> > >                 gt->usm.acc_queue[i].gt = gt;
> > > diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> > > index ccb4f7b2bf15..ec67d6ae3226 100644
> > > --- a/drivers/gpu/drm/xe/xe_gt_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> > > @@ -272,9 +272,14 @@ struct xe_gt {
> > >                 struct pf_queue {
> > >                         /** @usm.pf_queue.gt: back pointer to GT */
> > >                         struct xe_gt *gt;
> > > -#define PF_QUEUE_NUM_DW        128
> > >                         /** @usm.pf_queue.data: data in the page fault queue */
> > > -                       u32 data[PF_QUEUE_NUM_DW];
> > > +                       u32 *data;
> > > +                       /**
> > > +                        * @usm.pf_queue_num_dw: number of DWORDS in the page
> > > +                        * fault queue. Dynamically calculated based on the number
> > > +                        * of compute resources available.
> > > +                        */
> > > +                       u32 pf_queue_num_dw;
> > >                         /**
> > >                          * @usm.pf_queue.tail: tail pointer in DWs for page fault queue,
> > >                          * moved by worker which processes faults (consumer).
> > > -- 
> > > 2.34.1
> > > 
> 


