[Intel-xe] [PATCH v3] drm/xe: Fix pagefault and access counter worker functions

Welty, Brian brian.welty at intel.com
Fri Nov 3 19:21:21 UTC 2023


On 11/3/2023 7:58 AM, Summers, Stuart wrote:
> On Thu, 2023-11-02 at 18:43 -0700, Brian Welty wrote:
>> When processing G2H messages for pagefault or access counters, we
>> queue a
>> work item and call queue_work(). This fails if the worker thread is
>> already
>> queued to run.
>> The expectation is that the worker function will do more than process
>> a
>> single item and return. It needs to either process all pending items
>> or
>> or
>> requeue itself if items are pending. But requeuing adds latency,
>> and a potential context switch can occur.
>>
>> We don't want to add unnecessary latency and so the worker should
>> process
>> as many faults as it can within a reasonable duration of time.
>> We also do not want to hog the cpu core, so here we execute in a loop
>> and requeue if still running after more than 20 ms.
>> This seems a reasonable framework and is easy to tune further if
>> needed.
>>
>> This resolves issues seen with several igt@xe_exec_fault_mode subtests
>> where the GPU will hang when the KMD ignores a pending pagefault.
>>
>> v2: requeue the worker instead of having an internal processing loop.
>> v3: implement hybrid model of v1 and v2
>>      now, run for 20 msec before we will requeue if still running
> 
> Agreed this approach is much better.
> 
>>
>> Signed-off-by: Brian Welty <brian.welty at intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_gt_pagefault.c | 80 ++++++++++++++++----------
>> --
>>   1 file changed, 46 insertions(+), 34 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c
>> b/drivers/gpu/drm/xe/xe_gt_pagefault.c
>> index ab6daebbd77c..c99af751d1fb 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
>> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
>> @@ -276,10 +276,10 @@ static void print_pagefault(struct xe_device
>> *xe, struct pagefault *pf)
>>   
>>   #define PF_MSG_LEN_DW  4
>>   
>> -static int get_pagefault(struct pf_queue *pf_queue, struct pagefault
>> *pf)
>> +static bool get_pagefault(struct pf_queue *pf_queue, struct
>> pagefault *pf)
>>   {
>>          const struct xe_guc_pagefault_desc *desc;
>> -       int ret = 0;
>> +       bool ret = false;
>>   
>>          spin_lock_irq(&pf_queue->lock);
>>          if (pf_queue->head != pf_queue->tail) {
>> @@ -303,8 +303,7 @@ static int get_pagefault(struct pf_queue
>> *pf_queue, struct pagefault *pf)
>>   
>>                  pf_queue->head = (pf_queue->head + PF_MSG_LEN_DW) %
>>                          PF_QUEUE_NUM_DW;
>> -       } else {
>> -               ret = -1;
>> +               ret = true;
>>          }
>>          spin_unlock_irq(&pf_queue->lock);
>>   
>> @@ -348,6 +347,8 @@ int xe_guc_pagefault_handler(struct xe_guc *guc,
>> u32 *msg, u32 len)
>>          return full ? -ENOSPC : 0;
>>   }
>>   
>> +#define USM_QUEUE_MAX_RUNTIME_MS       20
>> +
>>   static void pf_queue_work_func(struct work_struct *w)
>>   {
>>          struct pf_queue *pf_queue = container_of(w, struct pf_queue,
>> worker);
>> @@ -355,31 +356,37 @@ static void pf_queue_work_func(struct
>> work_struct *w)
>>          struct xe_device *xe = gt_to_xe(gt);
>>          struct xe_guc_pagefault_reply reply = {};
>>          struct pagefault pf = {};
>> +       unsigned long threshold;
>>          int ret;
>>   
>> -       ret = get_pagefault(pf_queue, &pf);
>> -       if (ret)
>> -               return;
>> +       threshold = jiffies +
>> msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS);
>>   
>> -       ret = handle_pagefault(gt, &pf);
>> -       if (unlikely(ret)) {
>> -               print_pagefault(xe, &pf);
>> -               pf.fault_unsuccessful = 1;
>> -               drm_dbg(&xe->drm, "Fault response: Unsuccessful
>> %d\n", ret);
>> -       }
>> +       while (get_pagefault(pf_queue, &pf)) {
>> +               ret = handle_pagefault(gt, &pf);
>> +               if (unlikely(ret)) {
>> +                       print_pagefault(xe, &pf);
>> +                       pf.fault_unsuccessful = 1;
>> +                       drm_dbg(&xe->drm, "Fault response:
>> Unsuccessful %d\n", ret);
>> +               }
>> +
>> +               reply.dw0 = FIELD_PREP(PFR_VALID, 1) |
>> +                       FIELD_PREP(PFR_SUCCESS,
>> pf.fault_unsuccessful) |
>> +                       FIELD_PREP(PFR_REPLY, PFR_ACCESS) |
>> +                       FIELD_PREP(PFR_DESC_TYPE,
>> FAULT_RESPONSE_DESC) |
>> +                       FIELD_PREP(PFR_ASID, pf.asid);
>>   
>> -       reply.dw0 = FIELD_PREP(PFR_VALID, 1) |
>> -               FIELD_PREP(PFR_SUCCESS, pf.fault_unsuccessful) |
>> -               FIELD_PREP(PFR_REPLY, PFR_ACCESS) |
>> -               FIELD_PREP(PFR_DESC_TYPE, FAULT_RESPONSE_DESC) |
>> -               FIELD_PREP(PFR_ASID, pf.asid);
>> +               reply.dw1 = FIELD_PREP(PFR_VFID, pf.vfid) |
>> +                       FIELD_PREP(PFR_ENG_INSTANCE,
>> pf.engine_instance) |
>> +                       FIELD_PREP(PFR_ENG_CLASS, pf.engine_class) |
>> +                       FIELD_PREP(PFR_PDATA, pf.pdata);
>>   
>> -       reply.dw1 = FIELD_PREP(PFR_VFID, pf.vfid) |
>> -               FIELD_PREP(PFR_ENG_INSTANCE, pf.engine_instance) |
>> -               FIELD_PREP(PFR_ENG_CLASS, pf.engine_class) |
>> -               FIELD_PREP(PFR_PDATA, pf.pdata);
>> +               send_pagefault_reply(&gt->uc.guc, &reply);
>>   
>> -       send_pagefault_reply(&gt->uc.guc, &reply);
>> +               if (time_after(jiffies, threshold)) {
>> +                       queue_work(gt->usm.pf_wq, w);
> 
> If we queue from here instead of just breaking out and letting the
> parent function requeue the work, isn't there still a chance we could
> get into some kind of infinite CPU loop? I guess in this case we would
> still queue out to a new CPU each time so this would allow us to
> reschedule on the local node, but doesn't that leave us in the same
> place we were in with v1?
> 
> Basically, why don't we just break out at this point. Why add the
> nested queue?

We can't rely on the parent to queue us...  I mean, the parent does the
initial queueing of the worker.  But in the case where queue_work()
returned false, the parent may then go idle for a long time and not run
again. The parent shouldn't have to keep calling queue_work() until it
returns true; that approach adds latency to servicing incoming G2H
messages. The better solution is to have the worker requeue itself, as
it is the worker thread that manages how much work it does in one
iteration... Does this help explain it?

And I will make a small change to the above, as mentioned in the other thread with Matt.

-Brian

> 
> Thanks,
> Stuart
> 
>> +                       break;
>> +               }
>> +       }
>>   }
>>   
>>   static void acc_queue_work_func(struct work_struct *w);
>> @@ -544,10 +551,10 @@ static int handle_acc(struct xe_gt *gt, struct
>> acc *acc)
>>   
>>   #define ACC_MSG_LEN_DW        4
>>   
>> -static int get_acc(struct acc_queue *acc_queue, struct acc *acc)
>> +static bool get_acc(struct acc_queue *acc_queue, struct acc *acc)
>>   {
>>          const struct xe_guc_acc_desc *desc;
>> -       int ret = 0;
>> +       bool ret = false;
>>   
>>          spin_lock(&acc_queue->lock);
>>          if (acc_queue->head != acc_queue->tail) {
>> @@ -567,8 +574,7 @@ static int get_acc(struct acc_queue *acc_queue,
>> struct acc *acc)
>>   
>>                  acc_queue->head = (acc_queue->head + ACC_MSG_LEN_DW)
>> %
>>                                    ACC_QUEUE_NUM_DW;
>> -       } else {
>> -               ret = -1;
>> +               ret = true;
>>          }
>>          spin_unlock(&acc_queue->lock);
>>   
>> @@ -581,16 +587,22 @@ static void acc_queue_work_func(struct
>> work_struct *w)
>>          struct xe_gt *gt = acc_queue->gt;
>>          struct xe_device *xe = gt_to_xe(gt);
>>          struct acc acc = {};
>> +       unsigned long threshold;
>>          int ret;
>>   
>> -       ret = get_acc(acc_queue, &acc);
>> -       if (ret)
>> -               return;
>> +       threshold = jiffies +
>> msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS);
>>   
>> -       ret = handle_acc(gt, &acc);
>> -       if (unlikely(ret)) {
>> -               print_acc(xe, &acc);
>> -               drm_warn(&xe->drm, "ACC: Unsuccessful %d\n", ret);
>> +       while (get_acc(acc_queue, &acc)) {
>> +               ret = handle_acc(gt, &acc);
>> +               if (unlikely(ret)) {
>> +                       print_acc(xe, &acc);
>> +                       drm_warn(&xe->drm, "ACC: Unsuccessful %d\n",
>> ret);
>> +               }
>> +
>> +               if (time_after(jiffies, threshold)) {
>> +                       queue_work(gt->usm.acc_wq, w);
>> +                       break;
>> +               }
>>          }
>>   }
>>   
> 


More information about the Intel-xe mailing list