[PATCH 2/2] drm/xe: Use topology to determine page fault queue size
John Harrison
john.c.harrison at intel.com
Wed Jun 12 20:37:58 UTC 2024
On 6/12/2024 07:48, Summers, Stuart wrote:
> On Tue, 2024-06-11 at 17:58 -0700, John Harrison wrote:
>> On 6/10/2024 13:33, Stuart Summers wrote:
>>> Currently the page fault queue size is hard coded. However
>>> the hardware supports faulting for each EU and each CS.
>>> For some applications running on hardware with a large
>>> number of EUs and CSs, this can result in an overflow of
>>> the page fault queue.
>>>
>>> Add a small calculation to determine the page fault queue
>>> size based on the number of EUs and CSs in the platform as
>>> determined by fuses.
>>>
>>> Signed-off-by: Stuart Summers <stuart.summers at intel.com>
>>> ---
>>> drivers/gpu/drm/xe/xe_gt_pagefault.c | 49 +++++++++++++++++++++-------
>>> drivers/gpu/drm/xe/xe_gt_types.h | 4 +--
>>> 2 files changed, 39 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
>>> index 3858c8e0b707..5565732e79e9 100644
>>> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
>>> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
>>> @@ -287,7 +287,7 @@ static bool get_pagefault(struct pf_queue *pf_queue, struct pagefault *pf)
>>> PFD_VIRTUAL_ADDR_LO_SHIFT;
>>>
>>> pf_queue->tail = (pf_queue->tail + PF_MSG_LEN_DW) %
>>> - PF_QUEUE_NUM_DW;
>>> + pf_queue->pf_queue_num_dw;
>>> ret = true;
>>> }
>>> spin_unlock_irq(&pf_queue->lock);
>>> @@ -299,7 +299,8 @@ static bool pf_queue_full(struct pf_queue *pf_queue)
>>> {
>>> lockdep_assert_held(&pf_queue->lock);
>>>
>>> - return CIRC_SPACE(pf_queue->head, pf_queue->tail, PF_QUEUE_NUM_DW) <=
>>> + return CIRC_SPACE(pf_queue->head, pf_queue->tail,
>>> + pf_queue->pf_queue_num_dw) <=
>>> PF_MSG_LEN_DW;
>>> }
>>>
>>> @@ -312,11 +313,6 @@ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
>>> u32 asid;
>>> bool full;
>>>
>>> - /*
>>> - * The below logic doesn't work unless PF_QUEUE_NUM_DW % PF_MSG_LEN_DW == 0
>>> - */
>>> - BUILD_BUG_ON(PF_QUEUE_NUM_DW % PF_MSG_LEN_DW);
>>> -
>>> if (unlikely(len != PF_MSG_LEN_DW))
>>> return -EPROTO;
>>>
>>> @@ -327,7 +323,8 @@ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
>>> full = pf_queue_full(pf_queue);
>>> if (!full) {
>>> memcpy(pf_queue->data + pf_queue->head, msg, len * sizeof(u32));
>>> - pf_queue->head = (pf_queue->head + len) % PF_QUEUE_NUM_DW;
>>> + pf_queue->head = (pf_queue->head + len) %
>>> + pf_queue->pf_queue_num_dw;
>>> queue_work(gt->usm.pf_wq, &pf_queue->worker);
>>> } else {
>>> drm_warn(&xe->drm, "PF Queue full, shouldn't be possible");
>>> @@ -382,18 +379,42 @@ static void pf_queue_work_func(struct work_struct *w)
>>>
>>> static void acc_queue_work_func(struct work_struct *w);
>>>
>>> +static int xe_alloc_pf_queue(struct xe_gt *gt, struct pf_queue *pf_queue)
>>> +{
>>> + xe_dss_mask_t all_dss;
>>> + int num_dss, num_eus;
>>> +
>>> + bitmap_or(all_dss, gt->fuse_topo.g_dss_mask, gt->fuse_topo.c_dss_mask,
>>> + XE_MAX_DSS_FUSE_BITS);
>>> +
>>> + num_dss = bitmap_weight(all_dss, XE_MAX_DSS_FUSE_BITS);
>>> + num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss,
>>> + XE_MAX_EU_FUSE_BITS) * num_dss;
>>> +
>>> + /* user can issue separate page faults per EU and per CS */
>>> + pf_queue->pf_queue_num_dw =
>>> + ALIGN(num_eus + XE_NUM_HW_ENGINES, PF_MSG_LEN_DW);
>> Do you mean ALIGN here? This is the queue size in dwords according to
>> the name, so isn't it just max_queue_entries * entry_size_in_dwords?
>> I.e. num_dw = (num_eus + num_engines) * msg_len_dw. ALIGN is just going
>> to round up, not multiply up.
> Thanks John. No, actually this variable is the number of dwords in the
> queue, not the size in dwords. The alignment is to adhere to some
> assumptions made elsewhere in the page fault queue handling.
That doesn't make sense. Do you mean the current queue content size
rather than the total size? If not, then how is 'the number of dwords in
the queue' not equal to the 'size in dwords'? And if it is the current
count, then it should be initialised to zero, not to some confused
maximum.
Either way, num_eus + num_engines is the maximum number of page faults,
i.e. the maximum number of *items* in the queue. That number has no
direct connection to dwords. You have to multiply by the dword size of
each item to get a dword count. And given that the name ends in _dw and
you say it must be aligned to a dword size, it does rather seem like it
should be the dword count.
E.g. 10 EUs and 3 engines would give a max queue depth of 13 entries. If
each queue entry is 4 dwords then rounding 13 up to 16 gains you
nothing: that is only 4 actual entries before the queue overflows. You
need a queue size of 52 dwords to fit all 13 entries. And if this value
really is meant to be 13, i.e. the max item count, then the name should
be pf_queue_num, not pf_queue_num_dw, and any code which assumes a
multiple of 4 is broken code that needs to be fixed, because it is
working in the wrong units.
John.
>
> Thanks,
> Stuart
>
>> John.
>>
>>> +
>>> + pf_queue->gt = gt;
>>> + pf_queue->data = kzalloc(pf_queue->pf_queue_num_dw, GFP_KERNEL);
>>> + spin_lock_init(&pf_queue->lock);
>>> + INIT_WORK(&pf_queue->worker, pf_queue_work_func);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> int xe_gt_pagefault_init(struct xe_gt *gt)
>>> {
>>> struct xe_device *xe = gt_to_xe(gt);
>>> - int i;
>>> + int i, ret = 0;
>>>
>>> if (!xe->info.has_usm)
>>> return 0;
>>>
>>> for (i = 0; i < NUM_PF_QUEUE; ++i) {
>>> - gt->usm.pf_queue[i].gt = gt;
>>> - spin_lock_init(&gt->usm.pf_queue[i].lock);
>>> - INIT_WORK(&gt->usm.pf_queue[i].worker, pf_queue_work_func);
>>> + ret = xe_alloc_pf_queue(gt, &gt->usm.pf_queue[i]);
>>> + if (ret)
>>> + return ret;
>>> }
>>> for (i = 0; i < NUM_ACC_QUEUE; ++i) {
>>> gt->usm.acc_queue[i].gt = gt;
>>> @@ -418,12 +439,16 @@ int xe_gt_pagefault_init(struct xe_gt *gt)
>>> void xe_gt_pagefault_fini(struct xe_gt *gt)
>>> {
>>> struct xe_device *xe = gt_to_xe(gt);
>>> + int i;
>>>
>>> if (!xe->info.has_usm)
>>> return;
>>>
>>> destroy_workqueue(gt->usm.acc_wq);
>>> destroy_workqueue(gt->usm.pf_wq);
>>> +
>>> + for (i = 0; i < NUM_PF_QUEUE; ++i)
>>> + kfree(gt->usm.pf_queue[i].data);
>>> }
>>>
>>> void xe_gt_pagefault_reset(struct xe_gt *gt)
>>> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
>>> index 10a9a9529377..c40b9c1b5fec 100644
>>> --- a/drivers/gpu/drm/xe/xe_gt_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
>>> @@ -232,9 +232,9 @@ struct xe_gt {
>>> struct pf_queue {
>>> /** @usm.pf_queue.gt: back pointer to GT */
>>> struct xe_gt *gt;
>>> -#define PF_QUEUE_NUM_DW 128
>>> /** @usm.pf_queue.data: data in the page fault queue */
>>> - u32 data[PF_QUEUE_NUM_DW];
>>> + u32 *data;
>>> + u32 pf_queue_num_dw;
>>> /**
>>> * @usm.pf_queue.tail: tail pointer in DWs for page fault queue,
>>> * moved by worker which processes faults (consumer).