[Intel-xe] [PATCH v2] drm/xe: handle TLB invalidations from CT fast-path
Matthew Auld
matthew.auld at intel.com
Fri Jun 30 18:29:09 UTC 2023
On 30/06/2023 18:56, Souza, Jose wrote:
> On Fri, 2023-06-30 at 18:14 +0100, Matthew Auld wrote:
>> In various test cases that put the system under a heavy load, we can
>> sometimes see errors with missed TLB invalidations. In such cases we see
>> the interrupt arrive for the invalidation from the GuC, however the
>> actual processing of the completion is pushed onto a workqueue and
>> handled with all the other CT stuff, which might take longer than
>> expected. Since we expect TLB invalidations to complete within a
>> reasonable amount of time (at most ~250ms), and they do seem pretty
>> critical, allow handling directly from the CT fast-path.
>>
>> v2 (José):
>> - Actually use the correct spinlock/unlock_irq, since pending_lock is
>> grabbed from IRQ.
>
> Thank you, warnings fixed with this version.
>
> Looks like it improved but I still get 'TLB invalidation time'd out', see at time 460.562659 in https://pastebin.com/raw/dnqXRM7T
> I'm running piglit with 6 threads in a TGL with 4 cores / 8 threads.
Yeah, I'm hoping those are a different issue. From the logs we just did
a GT reset and I think that path ends up disabling the CT communication
channel somewhere (and maybe also resets it?), so it might be that we
"lose" the in-flight TLB completions which are not tracked with an
explicit fence. Before this patch did those also trigger the
drm_WARN_ON(>_to_xe(gt)->drm, expected_seqno != msg[0]) somewhere?
That usually indicates that the message was lost and not so much that
the completion is taking too long.
>
>>
>> References: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/297
>> References: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/320
>> References: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/449
>> Signed-off-by: Matthew Auld <matthew.auld at intel.com>
>> Cc: Matthew Brost <matthew.brost at intel.com>
>> Cc: José Roberto de Souza <jose.souza at intel.com>
>> ---
>> drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 74 +++++++++++++--------
>> drivers/gpu/drm/xe/xe_gt_types.h | 5 ++
>> drivers/gpu/drm/xe/xe_guc_ct.c | 11 +--
>> 3 files changed, 54 insertions(+), 36 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>> index 2fcb477604e2..150c7b856b59 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>> @@ -25,7 +25,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
>> tlb_invalidation.fence_tdr.work);
>> struct xe_gt_tlb_invalidation_fence *fence, *next;
>>
>> - mutex_lock(>->uc.guc.ct.lock);
>> + spin_lock_irq(>->tlb_invalidation.pending_lock);
>> list_for_each_entry_safe(fence, next,
>> >->tlb_invalidation.pending_fences, link) {
>> s64 since_inval_ms = ktime_ms_delta(ktime_get(),
>> @@ -47,7 +47,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
>> queue_delayed_work(system_wq,
>> >->tlb_invalidation.fence_tdr,
>> TLB_TIMEOUT);
>> - mutex_unlock(>->uc.guc.ct.lock);
>> + spin_unlock_irq(>->tlb_invalidation.pending_lock);
>> }
>>
>> /**
>> @@ -63,6 +63,7 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
>> {
>> gt->tlb_invalidation.seqno = 1;
>> INIT_LIST_HEAD(>->tlb_invalidation.pending_fences);
>> + spin_lock_init(>->tlb_invalidation.pending_lock);
>> spin_lock_init(>->tlb_invalidation.lock);
>> gt->tlb_invalidation.fence_context = dma_fence_context_alloc(1);
>> INIT_DELAYED_WORK(>->tlb_invalidation.fence_tdr,
>> @@ -92,11 +93,11 @@ invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence)
>>
>> cancel_delayed_work(>->tlb_invalidation.fence_tdr);
>>
>> - mutex_lock(>->uc.guc.ct.lock);
>> + spin_lock_irq(>->tlb_invalidation.pending_lock);
>> list_for_each_entry_safe(fence, next,
>> >->tlb_invalidation.pending_fences, link)
>> invalidation_fence_signal(fence);
>> - mutex_unlock(>->uc.guc.ct.lock);
>> + spin_unlock_irq(>->tlb_invalidation.pending_lock);
>> }
>>
>> static int send_tlb_invalidation(struct xe_guc *guc,
>> @@ -117,10 +118,12 @@ static int send_tlb_invalidation(struct xe_guc *guc,
>> mutex_lock(&guc->ct.lock);
>> seqno = gt->tlb_invalidation.seqno;
>> if (fence) {
>> + spin_lock_irq(>->tlb_invalidation.pending_lock);
>> queue_work = list_empty(>->tlb_invalidation.pending_fences);
>> fence->seqno = seqno;
>> list_add_tail(&fence->link,
>> >->tlb_invalidation.pending_fences);
>> + spin_unlock_irq(>->tlb_invalidation.pending_lock);
>> trace_xe_gt_tlb_invalidation_fence_send(fence);
>> }
>> action[1] = seqno;
>> @@ -139,8 +142,11 @@ static int send_tlb_invalidation(struct xe_guc *guc,
>> }
>> if (!ret)
>> ret = seqno;
>> - if (ret < 0 && fence)
>> + if (ret < 0 && fence) {
>> + spin_lock_irq(>->tlb_invalidation.pending_lock);
>> invalidation_fence_signal(fence);
>> + spin_unlock_irq(>->tlb_invalidation.pending_lock);
>> + }
>> mutex_unlock(&guc->ct.lock);
>>
>> return ret;
>> @@ -315,41 +321,55 @@ int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno)
>> int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
>> {
>> struct xe_gt *gt = guc_to_gt(guc);
>> - struct xe_gt_tlb_invalidation_fence *fence;
>> - int expected_seqno;
>> -
>> - lockdep_assert_held(&guc->ct.lock);
>> + struct xe_gt_tlb_invalidation_fence *fence, *next;
>> + unsigned long flags;
>>
>> if (unlikely(len != 1))
>> return -EPROTO;
>>
>> - /* Sanity check on seqno */
>> - expected_seqno = (gt->tlb_invalidation.seqno_recv + 1) %
>> - TLB_INVALIDATION_SEQNO_MAX;
>> - if (!expected_seqno)
>> - expected_seqno = 1;
>> - if (drm_WARN_ON(>_to_xe(gt)->drm, expected_seqno != msg[0])) {
>> - drm_err(>_to_xe(gt)->drm, "TLB expected_seqno(%d) != msg(%u)\n",
>> - expected_seqno, msg[0]);
>> + /*
>> + * This can also be run both directly from the IRQ handler and also in
>> + * process_g2h_msg(). Only one may process any individual CT message,
>> + * however the order they are processed here could result in skipping a
>> + * seqno. To handle that we just process all the seqnos from the last
>> + * seqno_recv up to and including the one in msg[0]. The delta should be
>> + * very small so there shouldn't be much of pending_fences we actually
>> + * need to iterate over here.
>> + *
>> + * From GuC POV we expect the seqnos to always appear in-order, so if we
>> + * see something later in the timeline we can be sure that anything
>> + * appearing earlier has already signalled, just that we have yet to
>> + * officially process the CT message like if racing against
>> + * process_g2h_msg().
>> + */
>> + spin_lock_irqsave(>->tlb_invalidation.pending_lock, flags);
>> + if (tlb_invalidation_seqno_past(gt, msg[0])) {
>> + spin_unlock_irqrestore(>->tlb_invalidation.pending_lock, flags);
>> + return 0;
>> }
>>
>> gt->tlb_invalidation.seqno_recv = msg[0];
>> smp_wmb();
>> wake_up_all(&guc->ct.wq);
>>
>> - fence = list_first_entry_or_null(>->tlb_invalidation.pending_fences,
>> - typeof(*fence), link);
>> - if (fence)
>> + list_for_each_entry_safe(fence, next,
>> + >->tlb_invalidation.pending_fences, link) {
>> trace_xe_gt_tlb_invalidation_fence_recv(fence);
>> - if (fence && tlb_invalidation_seqno_past(gt, fence->seqno)) {
>> +
>> + if (!tlb_invalidation_seqno_past(gt, fence->seqno))
>> + break;
>> +
>> invalidation_fence_signal(fence);
>> - if (!list_empty(>->tlb_invalidation.pending_fences))
>> - mod_delayed_work(system_wq,
>> - >->tlb_invalidation.fence_tdr,
>> - TLB_TIMEOUT);
>> - else
>> - cancel_delayed_work(>->tlb_invalidation.fence_tdr);
>> }
>>
>> + if (!list_empty(>->tlb_invalidation.pending_fences))
>> + mod_delayed_work(system_wq,
>> + >->tlb_invalidation.fence_tdr,
>> + TLB_TIMEOUT);
>> + else
>> + cancel_delayed_work(>->tlb_invalidation.fence_tdr);
>> +
>> + spin_unlock_irqrestore(>->tlb_invalidation.pending_lock, flags);
>> +
>> return 0;
>> }
>> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
>> index 7d4de019f9a5..28b8e8a86fc9 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_types.h
>> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
>> @@ -163,6 +163,11 @@ struct xe_gt {
>> * invaliations, protected by CT lock
>> */
>> struct list_head pending_fences;
>> + /**
>> + * @pending_lock: protects @pending_fences and updating
>> + * @seqno_recv.
>> + */
>> + spinlock_t pending_lock;
>> /**
>> * @fence_tdr: schedules a delayed call to
>> * xe_gt_tlb_fence_timeout after the timeut interval is over.
>> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
>> index 22bc9ce846db..fa2e2749279c 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
>> @@ -976,15 +976,8 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
>> return 0;
>>
>> switch (FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[1])) {
>> - /*
>> - * FIXME: We really should process
>> - * XE_GUC_ACTION_TLB_INVALIDATION_DONE here in the fast-path as
>> - * these critical for page fault performance. We currently can't
>> - * due to TLB invalidation done algorithm expecting the seqno
>> - * returned in-order. With some small changes to the algorithm
>> - * and locking we should be able to support out-of-order seqno.
>> - */
>> case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
>> + case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
>> break; /* Process these in fast-path */
>> default:
>> return 0;
>> @@ -1038,7 +1031,7 @@ void xe_guc_ct_fast_path(struct xe_guc_ct *ct)
>> struct xe_device *xe = ct_to_xe(ct);
>> int len;
>>
>> - if (!xe_device_in_fault_mode(xe) || !xe_device_mem_access_ongoing(xe))
>> + if (!xe_device_mem_access_ongoing(xe))
>> return;
>>
>> spin_lock(&ct->fast_lock);
>
More information about the Intel-xe
mailing list