[Intel-xe] [PATCH v4 7/7] drm/xe: handle TLB invalidations from CT fast-path
Matthew Brost
matthew.brost at intel.com
Thu Jul 6 04:14:12 UTC 2023
On Wed, Jul 05, 2023 at 05:06:10PM +0100, Matthew Auld wrote:
> In various test cases that put the system under a heavy load, we can
> sometimes see errors with missed TLB invalidations. In such cases we see
> the interrupt arrive for the invalidation from the GuC, however the
> actual processing of the completion is pushed onto a workqueue and
> handled with all the other CT stuff, which might take longer than
> expected. Since we expect TLB invalidations to complete within a
> reasonable amount of time (at most ~250ms), and they do seem pretty
> critical, allow handling directly from the CT fast-path.
>
> v2 (José):
> - Actually use the correct spinlock/unlock_irq, since pending_lock is
> grabbed from IRQ.
> v3:
> - Don't publish the TLB fence on the list until after we fully
> initialize it and successfully do the CT send. The list is now only
> protected by the spin_lock pending_lock and we can't hold that
> across the entire TLB send operation.
>
> References: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/297
> References: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/320
> References: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/449
> Signed-off-by: Matthew Auld <matthew.auld at intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
This all looks correct to me but I suggest some extensive testing (e.g.
more than just CI) as this look like a risky change.
Assuming this is throughly tested:
Reviewed-by: Matthew Brost <matthew.brost at intel.com>
> Cc: José Roberto de Souza <jose.souza at intel.com>
> ---
> drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 83 +++++++++++++--------
> drivers/gpu/drm/xe/xe_gt_types.h | 5 ++
> drivers/gpu/drm/xe/xe_guc_ct.c | 12 +--
> 3 files changed, 59 insertions(+), 41 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> index 51789ec9ad57..18ce3149446d 100644
> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> @@ -25,7 +25,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
> tlb_invalidation.fence_tdr.work);
> struct xe_gt_tlb_invalidation_fence *fence, *next;
>
> - mutex_lock(>->uc.guc.ct.lock);
> + spin_lock_irq(>->tlb_invalidation.pending_lock);
> list_for_each_entry_safe(fence, next,
> >->tlb_invalidation.pending_fences, link) {
> s64 since_inval_ms = ktime_ms_delta(ktime_get(),
> @@ -47,7 +47,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
> queue_delayed_work(system_wq,
> >->tlb_invalidation.fence_tdr,
> TLB_TIMEOUT);
> - mutex_unlock(>->uc.guc.ct.lock);
> + spin_unlock_irq(>->tlb_invalidation.pending_lock);
> }
>
> /**
> @@ -63,6 +63,7 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
> {
> gt->tlb_invalidation.seqno = 1;
> INIT_LIST_HEAD(>->tlb_invalidation.pending_fences);
> + spin_lock_init(>->tlb_invalidation.pending_lock);
> spin_lock_init(>->tlb_invalidation.lock);
> gt->tlb_invalidation.fence_context = dma_fence_context_alloc(1);
> INIT_DELAYED_WORK(>->tlb_invalidation.fence_tdr,
> @@ -97,6 +98,7 @@ invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence)
> */
>
> mutex_lock(>->uc.guc.ct.lock);
> + spin_lock_irq(>->tlb_invalidation.pending_lock);
> cancel_delayed_work(>->tlb_invalidation.fence_tdr);
> /*
> * We might have various kworkers waiting for TLB flushes to complete
> @@ -112,6 +114,7 @@ invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence)
> list_for_each_entry_safe(fence, next,
> >->tlb_invalidation.pending_fences, link)
> invalidation_fence_signal(fence);
> + spin_unlock_irq(>->tlb_invalidation.pending_lock);
> mutex_unlock(>->uc.guc.ct.lock);
> }
>
> @@ -122,7 +125,6 @@ static int send_tlb_invalidation(struct xe_guc *guc,
> struct xe_gt *gt = guc_to_gt(guc);
> int seqno;
> int ret;
> - bool queue_work;
>
> /*
> * XXX: The seqno algorithm relies on TLB invalidation being processed
> @@ -133,21 +135,28 @@ static int send_tlb_invalidation(struct xe_guc *guc,
> mutex_lock(&guc->ct.lock);
> seqno = gt->tlb_invalidation.seqno;
> if (fence) {
> - queue_work = list_empty(>->tlb_invalidation.pending_fences);
> fence->seqno = seqno;
> - list_add_tail(&fence->link,
> - >->tlb_invalidation.pending_fences);
> trace_xe_gt_tlb_invalidation_fence_send(fence);
> }
> action[1] = seqno;
> ret = xe_guc_ct_send_locked(&guc->ct, action, len,
> G2H_LEN_DW_TLB_INVALIDATE, 1);
> if (!ret && fence) {
> + spin_lock_irq(>->tlb_invalidation.pending_lock);
> fence->invalidation_time = ktime_get();
> - if (queue_work)
> + list_add_tail(&fence->link,
> + >->tlb_invalidation.pending_fences);
> +
> + if (list_is_singular(>->tlb_invalidation.pending_fences))
> queue_delayed_work(system_wq,
> >->tlb_invalidation.fence_tdr,
> TLB_TIMEOUT);
> +
> + spin_unlock_irq(>->tlb_invalidation.pending_lock);
> + } else if (ret < 0 && fence) {
> + trace_xe_gt_tlb_invalidation_fence_signal(fence);
> + dma_fence_signal(&fence->base);
> + dma_fence_put(&fence->base);
> }
> if (!ret) {
> gt->tlb_invalidation.seqno = (gt->tlb_invalidation.seqno + 1) %
> @@ -156,8 +165,6 @@ static int send_tlb_invalidation(struct xe_guc *guc,
> gt->tlb_invalidation.seqno = 1;
> ret = seqno;
> }
> - if (ret < 0 && fence)
> - invalidation_fence_signal(fence);
> mutex_unlock(&guc->ct.lock);
>
> return ret;
> @@ -332,41 +339,55 @@ int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno)
> int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
> {
> struct xe_gt *gt = guc_to_gt(guc);
> - struct xe_gt_tlb_invalidation_fence *fence;
> - int expected_seqno;
> -
> - lockdep_assert_held(&guc->ct.lock);
> + struct xe_gt_tlb_invalidation_fence *fence, *next;
> + unsigned long flags;
>
> if (unlikely(len != 1))
> return -EPROTO;
>
> - /* Sanity check on seqno */
> - expected_seqno = (gt->tlb_invalidation.seqno_recv + 1) %
> - TLB_INVALIDATION_SEQNO_MAX;
> - if (!expected_seqno)
> - expected_seqno = 1;
> - if (drm_WARN_ON(>_to_xe(gt)->drm, expected_seqno != msg[0])) {
> - drm_err(>_to_xe(gt)->drm, "TLB expected_seqno(%d) != msg(%u)\n",
> - expected_seqno, msg[0]);
> + /*
> + * This can also be run both directly from the IRQ handler and also in
> + * process_g2h_msg(). Only one may process any individual CT message,
> + * however the order they are processed here could result in skipping a
> + * seqno. To handle that we just process all the seqnos from the last
> + * seqno_recv up to and including the one in msg[0]. The delta should be
> + * very small so there shouldn't be much of pending_fences we actually
> + * need to iterate over here.
> + *
> + * From GuC POV we expect the seqnos to always appear in-order, so if we
> + * see something later in the timeline we can be sure that anything
> + * appearing earlier has already signalled, just that we have yet to
> + * officially process the CT message like if racing against
> + * process_g2h_msg().
> + */
> + spin_lock_irqsave(>->tlb_invalidation.pending_lock, flags);
> + if (tlb_invalidation_seqno_past(gt, msg[0])) {
> + spin_unlock_irqrestore(>->tlb_invalidation.pending_lock, flags);
> + return 0;
> }
>
> gt->tlb_invalidation.seqno_recv = msg[0];
> smp_wmb();
> wake_up_all(&guc->ct.wq);
>
> - fence = list_first_entry_or_null(>->tlb_invalidation.pending_fences,
> - typeof(*fence), link);
> - if (fence)
> + list_for_each_entry_safe(fence, next,
> + >->tlb_invalidation.pending_fences, link) {
> trace_xe_gt_tlb_invalidation_fence_recv(fence);
> - if (fence && tlb_invalidation_seqno_past(gt, fence->seqno)) {
> +
> + if (!tlb_invalidation_seqno_past(gt, fence->seqno))
> + break;
> +
> invalidation_fence_signal(fence);
> - if (!list_empty(>->tlb_invalidation.pending_fences))
> - mod_delayed_work(system_wq,
> - >->tlb_invalidation.fence_tdr,
> - TLB_TIMEOUT);
> - else
> - cancel_delayed_work(>->tlb_invalidation.fence_tdr);
> }
>
> + if (!list_empty(>->tlb_invalidation.pending_fences))
> + mod_delayed_work(system_wq,
> + >->tlb_invalidation.fence_tdr,
> + TLB_TIMEOUT);
> + else
> + cancel_delayed_work(>->tlb_invalidation.fence_tdr);
> +
> + spin_unlock_irqrestore(>->tlb_invalidation.pending_lock, flags);
> +
> return 0;
> }
> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> index 7d4de019f9a5..28b8e8a86fc9 100644
> --- a/drivers/gpu/drm/xe/xe_gt_types.h
> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> @@ -163,6 +163,11 @@ struct xe_gt {
> * invaliations, protected by CT lock
> */
> struct list_head pending_fences;
> + /**
> + * @pending_lock: protects @pending_fences and updating
> + * @seqno_recv.
> + */
> + spinlock_t pending_lock;
> /**
> * @fence_tdr: schedules a delayed call to
> * xe_gt_tlb_fence_timeout after the timeut interval is over.
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index 811518230262..d5e9ed53c655 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -988,15 +988,8 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
> return 0;
>
> switch (FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[1])) {
> - /*
> - * FIXME: We really should process
> - * XE_GUC_ACTION_TLB_INVALIDATION_DONE here in the fast-path as
> - * these critical for page fault performance. We currently can't
> - * due to TLB invalidation done algorithm expecting the seqno
> - * returned in-order. With some small changes to the algorithm
> - * and locking we should be able to support out-of-order seqno.
> - */
> case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
> + case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
> break; /* Process these in fast-path */
> default:
> return 0;
> @@ -1050,8 +1043,7 @@ void xe_guc_ct_fast_path(struct xe_guc_ct *ct)
> struct xe_device *xe = ct_to_xe(ct);
> int len;
>
> - if (!xe_device_in_fault_mode(xe) ||
> - !xe_device_mem_access_get_if_ongoing(xe))
> + if (!xe_device_mem_access_get_if_ongoing(xe))
> return;
>
> spin_lock(&ct->fast_lock);
> --
> 2.41.0
>
More information about the Intel-xe
mailing list