[Intel-xe] [2/2] drm/xe: Fix pagefault and access counter worker functions

Wed Nov 1 14:32:53 UTC 2023

On Tue, 2023-10-31 at 21:53 -0700, Welty, Brian wrote:
> 
> 
> On 10/31/2023 3:53 PM, Summers, Stuart wrote:
> > On Tue, 2023-10-31 at 15:15 -0700, Brian Welty wrote:
> > > When processing G2H messages for pagefault or access counters, we
> > > queue a
> > > work item and call queue_work(). This fails if the worker thread
> > > is
> > > already
> > > queued to run.
> > > The expectation is that the worker function will process more
> > > than a
> > > single
> > > item, and is updated here to continue executing until all queued
> > > work
> > > items
> > > are handled.
> > > 
> > > This resolves issues seen with several igt at xe_exec_fault_mode
> > > subtests.
> > > 
> > > Signed-off-by: Brian Welty <brian.welty at intel.com>
> > > ---
> > >   drivers/gpu/drm/xe/xe_gt_pagefault.c | 66 +++++++++++++--------
> > > -----
> > > --
> > >   1 file changed, 30 insertions(+), 36 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > index ab6daebbd77c..63fba83c0839 100644
> > > --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > > @@ -276,10 +276,10 @@ static void print_pagefault(struct
> > > xe_device
> > > *xe, struct pagefault *pf)
> > >   
> > >   #define PF_MSG_LEN_DW  4
> > >   
> > > -static int get_pagefault(struct pf_queue *pf_queue, struct
> > > pagefault
> > > *pf)
> > > +static bool get_pagefault(struct pf_queue *pf_queue, struct
> > > pagefault *pf)
> > >   {
> > >          const struct xe_guc_pagefault_desc *desc;
> > > -       int ret = 0;
> > > +       bool ret = false;
> > >   
> > >          spin_lock_irq(&pf_queue->lock);
> > >          if (pf_queue->head != pf_queue->tail) {
> > > @@ -303,8 +303,7 @@ static int get_pagefault(struct pf_queue
> > > *pf_queue, struct pagefault *pf)
> > >   
> > >                  pf_queue->head = (pf_queue->head +
> > > PF_MSG_LEN_DW) %
> > >                          PF_QUEUE_NUM_DW;
> > > -       } else {

Yeah ok I'm just blind. I missed that you dropped the else condition
here...

Ok no problem from me, looks great. Thanks for the patch!

Reviewed-by: Stuart Summers <stuart.summers at intel.com>

> > > -               ret = -1;
> > > +               ret = true;
> > >          }
> > >          spin_unlock_irq(&pf_queue->lock);
> > >   
> > > @@ -357,29 +356,27 @@ static void pf_queue_work_func(struct
> > > work_struct *w)
> > >          struct pagefault pf = {};
> > >          int ret;
> > >   
> > > -       ret = get_pagefault(pf_queue, &pf);
> > > -       if (ret)
> > > -               return;
> > > -
> > > -       ret = handle_pagefault(gt, &pf);
> > > -       if (unlikely(ret)) {
> > > -               print_pagefault(xe, &pf);
> > > -               pf.fault_unsuccessful = 1;
> > > -               drm_dbg(&xe->drm, "Fault response: Unsuccessful
> > > %d\n", ret);
> > > -       }
> > > +       while (get_pagefault(pf_queue, &pf)) {
> > 
> > Sorry maybe I'm missing... don't we want this to be
> > while(!get_pagefault())
> > 
> 
> That would be true if get_pagefault() was still identical to the
> current
> code in drm-xe-next....  but it's less readable that way.
> So I changed it. get_pagefault() now returns true if it dequeued
> a work item.  And false if it did not.
> 
> Make sense now?
> 
> 
> > ?
> > 
> > -Stuart
> > 
> > > +               ret = handle_pagefault(gt, &pf);
> > > +               if (unlikely(ret)) {
> > > +                       print_pagefault(xe, &pf);
> > > +                       pf.fault_unsuccessful = 1;
> > > +                       drm_dbg(&xe->drm, "Fault response:
> > > Unsuccessful %d\n", ret);
> > > +               }
> > >   
> > > -       reply.dw0 = FIELD_PREP(PFR_VALID, 1) |
> > > -               FIELD_PREP(PFR_SUCCESS, pf.fault_unsuccessful) |
> > > -               FIELD_PREP(PFR_REPLY, PFR_ACCESS) |
> > > -               FIELD_PREP(PFR_DESC_TYPE, FAULT_RESPONSE_DESC) |
> > > -               FIELD_PREP(PFR_ASID, pf.asid);
> > > +               reply.dw0 = FIELD_PREP(PFR_VALID, 1) |
> > > +                       FIELD_PREP(PFR_SUCCESS,
> > > pf.fault_unsuccessful) |
> > > +                       FIELD_PREP(PFR_REPLY, PFR_ACCESS) |
> > > +                       FIELD_PREP(PFR_DESC_TYPE,
> > > FAULT_RESPONSE_DESC) |
> > > +                       FIELD_PREP(PFR_ASID, pf.asid);
> > >   
> > > -       reply.dw1 = FIELD_PREP(PFR_VFID, pf.vfid) |
> > > -               FIELD_PREP(PFR_ENG_INSTANCE, pf.engine_instance)
> > > |
> > > -               FIELD_PREP(PFR_ENG_CLASS, pf.engine_class) |
> > > -               FIELD_PREP(PFR_PDATA, pf.pdata);
> > > +               reply.dw1 = FIELD_PREP(PFR_VFID, pf.vfid) |
> > > +                       FIELD_PREP(PFR_ENG_INSTANCE,
> > > pf.engine_instance) |
> > > +                       FIELD_PREP(PFR_ENG_CLASS,
> > > pf.engine_class) |
> > > +                       FIELD_PREP(PFR_PDATA, pf.pdata);
> > >   
> > > -       send_pagefault_reply(&gt->uc.guc, &reply);
> > > +               send_pagefault_reply(&gt->uc.guc, &reply);
> > > +       }
> > >   }
> > >   
> > >   static void acc_queue_work_func(struct work_struct *w);
> > > @@ -544,10 +541,10 @@ static int handle_acc(struct xe_gt *gt,
> > > struct
> > > acc *acc)
> > >   
> > >   #define ACC_MSG_LEN_DW        4
> > >   
> > > -static int get_acc(struct acc_queue *acc_queue, struct acc *acc)
> > > +static bool get_acc(struct acc_queue *acc_queue, struct acc
> > > *acc)
> > >   {
> > >          const struct xe_guc_acc_desc *desc;
> > > -       int ret = 0;
> > > +       bool ret = false;
> > >   
> > >          spin_lock(&acc_queue->lock);
> > >          if (acc_queue->head != acc_queue->tail) {
> > > @@ -567,8 +564,7 @@ static int get_acc(struct acc_queue
> > > *acc_queue,
> > > struct acc *acc)
> > >   
> > >                  acc_queue->head = (acc_queue->head +
> > > ACC_MSG_LEN_DW)
> > > %
> > >                                    ACC_QUEUE_NUM_DW;
> > > -       } else {
> > > -               ret = -1;
> > > +               ret = true;
> > >          }
> > >          spin_unlock(&acc_queue->lock);
> > >   
> > > @@ -583,14 +579,12 @@ static void acc_queue_work_func(struct
> > > work_struct *w)
> > >          struct acc acc = {};
> > >          int ret;
> > >   
> > > -       ret = get_acc(acc_queue, &acc);
> > > -       if (ret)
> > > -               return;
> > > -
> > > -       ret = handle_acc(gt, &acc);
> > > -       if (unlikely(ret)) {
> > > -               print_acc(xe, &acc);
> > > -               drm_warn(&xe->drm, "ACC: Unsuccessful %d\n",
> > > ret);
> > > +       while (get_acc(acc_queue, &acc)) {
> > > +               ret = handle_acc(gt, &acc);
> > > +               if (unlikely(ret)) {
> > > +                       print_acc(xe, &acc);
> > > +                       drm_warn(&xe->drm, "ACC: Unsuccessful
> > > %d\n",
> > > ret);
> > > +               }
> > >          }
> > >   }
> > >   
> >