[PATCH 2/2] drm/xe: Use topology to determine page fault queue size
Matthew Brost
matthew.brost at intel.com
Tue Jul 16 17:11:20 UTC 2024
On Thu, Jul 11, 2024 at 01:36:12PM -0600, Summers, Stuart wrote:
> On Thu, 2024-07-11 at 18:32 +0000, Matthew Brost wrote:
> > On Thu, Jul 11, 2024 at 05:36:29PM +0000, Stuart Summers wrote:
> > > Currently the page fault queue size is hard coded. However
> > > the hardware supports faulting for each EU and each CS.
> > > For some applications running on hardware with a large
> > > number of EUs and CSs, this can result in an overflow of
> > > the page fault queue.
> > >
> > > Add a small calculation to determine the page fault queue
> > > size based on the number of EUs and CSs in the platform as
> > > determined by fuses.
> > >
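(As a rough sanity check on the math here, with purely illustrative
numbers: 32 DSS fused in with 16 EUs per DSS gives num_eus = 512, so
each queue ends up at (512 + XE_NUM_HW_ENGINES) * PF_MSG_LEN_DW DWs --
a few KiB, versus the fixed 128 DWs today.)
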
> >
> > Missing is the G2H queue size update. Adding this patch by itself
> > kinda makes this problem worse: if the G2H queue is overflowed by
> > page faults it is basically a silent bug unless we look at the GuC
> > log. That really is a bug in the GuC interface which should be fixed
> > (e.g. the G2H queue should always reserve one slot to communicate a
> > G2H overflow message). Anyway, to stay on topic, can you include an
> > update to the G2H size as part of this series?
>
> Hey Matt,
>
> I had replied to the last series, sorry for the confusion!
>
> I'm not convinced we need to update the g2h. The main reason we have
> all of these simultaneous queues on the page fault side is each of them
> can be copying data around in a worker thread while the other queues
> are in use. The g2h should just receive and copy to this queue and be
> re-used, right?
>
> Or are you thinking we'll hit a scenario where GuC writes all page
> faults to the ctb at once and we overflow? But in that case what really
> is the right calculation? Is it possible that an application could
> simultaneously access N number of bad memory addresses resulting in N
> number (where N is greater than the g2h size) of cat faults?

Yes, I want to avoid overflowing the ctb.
>
> To me this really should be defined by the specification, where we
> know the maximum number of writes GuC can perform to the g2h before it
> is read and recycled by the KMD.
You are right, the interface as defined is a bit unsafe. They are
working to fix this (it has been at least a few years since this issue
was identified), but in the meantime we just need to size the G2H
buffer to avoid overflows. Maybe it is easiest to statically increase
the G2H buffer for now (i.e. make it 64k, 128k, or something) instead
of adding a calculation there.
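
Concretely, something like the below is what I have in mind -- a sketch
only, the macro names on the '-' side are from memory of xe_guc_ct.c
and the 128k is just a starting point that would need justification:

--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@
 #define CTB_H2G_BUFFER_SIZE	(SZ_4K)
-#define CTB_G2H_BUFFER_SIZE	(4 * CTB_H2G_BUFFER_SIZE)
+/* Oversize G2H so a burst of page faults cannot overflow the CTB */
+#define CTB_G2H_BUFFER_SIZE	(SZ_128K)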
Matt
>
> Thanks,
> Stuart
>
> >
> > Matt
> >
> > > Signed-off-by: Stuart Summers <stuart.summers at intel.com>
> > > ---
> > >  drivers/gpu/drm/xe/xe_gt_pagefault.c | 54 +++++++++++++++++++++-------
> > >  drivers/gpu/drm/xe/xe_gt_types.h     |  9 +++--
> > >  2 files changed, 49 insertions(+), 14 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > index b2a7fa55bd18..6bfc60c0274a 100644
> > > --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > @@ -287,7 +287,7 @@ static bool get_pagefault(struct pf_queue *pf_queue, struct pagefault *pf)
> > >  			PFD_VIRTUAL_ADDR_LO_SHIFT;
> > >  
> > >  		pf_queue->tail = (pf_queue->tail + PF_MSG_LEN_DW) %
> > > -			PF_QUEUE_NUM_DW;
> > > +			pf_queue->pf_queue_num_dw;
> > >  		ret = true;
> > >  	}
> > >  	spin_unlock_irq(&pf_queue->lock);
> > > @@ -299,7 +299,8 @@ static bool pf_queue_full(struct pf_queue *pf_queue)
> > >  {
> > >  	lockdep_assert_held(&pf_queue->lock);
> > >  
> > > -	return CIRC_SPACE(pf_queue->head, pf_queue->tail, PF_QUEUE_NUM_DW) <=
> > > +	return CIRC_SPACE(pf_queue->head, pf_queue->tail,
> > > +			  pf_queue->pf_queue_num_dw) <=
> > >  		PF_MSG_LEN_DW;
> > >  }
> > >  
> > > @@ -312,22 +313,23 @@ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
> > >  	u32 asid;
> > >  	bool full;
> > >  
> > > -	/*
> > > -	 * The below logic doesn't work unless PF_QUEUE_NUM_DW % PF_MSG_LEN_DW == 0
> > > -	 */
> > > -	BUILD_BUG_ON(PF_QUEUE_NUM_DW % PF_MSG_LEN_DW);
> > > -
> > >  	if (unlikely(len != PF_MSG_LEN_DW))
> > >  		return -EPROTO;
> > >  
> > >  	asid = FIELD_GET(PFD_ASID, msg[1]);
> > >  	pf_queue = gt->usm.pf_queue + (asid % NUM_PF_QUEUE);
> > >  
> > > +	/*
> > > +	 * The below logic doesn't work unless PF_QUEUE_NUM_DW % PF_MSG_LEN_DW == 0
> > > +	 */
> > > +	xe_gt_assert(gt, !(pf_queue->pf_queue_num_dw % PF_MSG_LEN_DW));
> > > +
> > >  	spin_lock_irqsave(&pf_queue->lock, flags);
> > >  	full = pf_queue_full(pf_queue);
> > >  	if (!full) {
> > >  		memcpy(pf_queue->data + pf_queue->head, msg, len * sizeof(u32));
> > > -		pf_queue->head = (pf_queue->head + len) % PF_QUEUE_NUM_DW;
> > > +		pf_queue->head = (pf_queue->head + len) %
> > > +			pf_queue->pf_queue_num_dw;
> > >  		queue_work(gt->usm.pf_wq, &pf_queue->worker);
> > >  	} else {
> > >  		drm_warn(&xe->drm, "PF Queue full, shouldn't be possible");
> > > @@ -386,26 +388,54 @@ static void pagefault_fini(void *arg)
> > >  {
> > >  	struct xe_gt *gt = arg;
> > >  	struct xe_device *xe = gt_to_xe(gt);
> > > +	int i;
> > >  
> > >  	if (!xe->info.has_usm)
> > >  		return;
> > >  
> > >  	destroy_workqueue(gt->usm.acc_wq);
> > >  	destroy_workqueue(gt->usm.pf_wq);
> > > +
> > > +	for (i = 0; i < NUM_PF_QUEUE; ++i)
> > > +		kfree(gt->usm.pf_queue[i].data);
> > > +}
> > > +
> > > +static int xe_alloc_pf_queue(struct xe_gt *gt, struct pf_queue *pf_queue)
> > > +{
> > > +	xe_dss_mask_t all_dss;
> > > +	int num_dss, num_eus;
> > > +
> > > +	bitmap_or(all_dss, gt->fuse_topo.g_dss_mask, gt->fuse_topo.c_dss_mask,
> > > +		  XE_MAX_DSS_FUSE_BITS);
> > > +
> > > +	num_dss = bitmap_weight(all_dss, XE_MAX_DSS_FUSE_BITS);
> > > +	num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss,
> > > +				XE_MAX_EU_FUSE_BITS) * num_dss;
> > > +
> > > +	/* user can issue separate page faults per EU and per CS */
> > > +	pf_queue->pf_queue_num_dw =
> > > +		(num_eus + XE_NUM_HW_ENGINES) * PF_MSG_LEN_DW;
> > > +
> > > +	pf_queue->gt = gt;
> > > +	pf_queue->data = kzalloc(pf_queue->pf_queue_num_dw, GFP_KERNEL);
> > > +	spin_lock_init(&pf_queue->lock);
> > > +	INIT_WORK(&pf_queue->worker, pf_queue_work_func);
> > > +
> > > +	return 0;
> > >  }
> > >  
> > >  int xe_gt_pagefault_init(struct xe_gt *gt)
> > >  {
> > >  	struct xe_device *xe = gt_to_xe(gt);
> > > -	int i;
> > > +	int i, ret = 0;
> > >  
> > >  	if (!xe->info.has_usm)
> > >  		return 0;
> > >  
> > >  	for (i = 0; i < NUM_PF_QUEUE; ++i) {
> > > -		gt->usm.pf_queue[i].gt = gt;
> > > -		spin_lock_init(&gt->usm.pf_queue[i].lock);
> > > -		INIT_WORK(&gt->usm.pf_queue[i].worker, pf_queue_work_func);
> > > +		ret = xe_alloc_pf_queue(gt, &gt->usm.pf_queue[i]);
> > > +		if (ret)
> > > +			return ret;
> > >  	}
> > >  	for (i = 0; i < NUM_ACC_QUEUE; ++i) {
> > >  		gt->usm.acc_queue[i].gt = gt;
> > > diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> > > index 38a0d0e178c8..263a604b76b7 100644
> > > --- a/drivers/gpu/drm/xe/xe_gt_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> > > @@ -233,9 +233,14 @@ struct xe_gt {
> > >  		struct pf_queue {
> > >  			/** @usm.pf_queue.gt: back pointer to GT */
> > >  			struct xe_gt *gt;
> > > -#define PF_QUEUE_NUM_DW	128
> > >  			/** @usm.pf_queue.data: data in the page fault queue */
> > > -			u32 data[PF_QUEUE_NUM_DW];
> > > +			u32 *data;
> > > +			/**
> > > +			 * @usm.pf_queue_num_dw: number of DWORDS in the page
> > > +			 * fault queue. Dynamically calculated based on the number
> > > +			 * of compute resources available.
> > > +			 */
> > > +			u32 pf_queue_num_dw;
> > >  			/**
> > >  			 * @usm.pf_queue.tail: tail pointer in DWs for page fault queue,
> > >  			 * moved by worker which processes faults (consumer).
> > > --
> > > 2.34.1
> > >
>