[PATCH 1/2] drm/xe: Add sent and recv counters for tlb invalidations
Matthew Brost
matthew.brost at intel.com
Tue Jul 23 16:23:45 UTC 2024
On Tue, Jul 23, 2024 at 03:07:05PM +0200, Nirmoy Das wrote:
>
> On 7/23/2024 2:22 PM, Michal Wajdeczko wrote:
> >
> > On 23.07.2024 13:16, Nirmoy Das wrote:
> > > Add counters for TLB invalidation sent and received requests, which
> > > could then be queried as sysfs files from userspace.
> > s/sysfs/debugfs ?
>
>
> I will fix it.
>
If this is debugfs then I think we certainly need to hide this
implementation behind a Kconfig option as atomics have a non-zero
execution cost.
I'm thinking a generic DRM_XE_DEBUG_STATS or something.
Then also with that, build generic stats layer which other code calls
into and stats object which encapsulates all the stats.
Fine with only having TLB invalidations to start but having the
infrastructure to add more stats over time would be good.
Matt
> >
> > > Cc: Matthew Brost <matthew.brost at intel.com>
> > > Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> > > Cc: Sai Gowtham Ch <sai.gowtham.ch at intel.com>
> > > Signed-off-by: Nirmoy Das <nirmoy.das at intel.com>
> > > ---
> > > drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 37 +++++++++++++++------
> > > drivers/gpu/drm/xe/xe_gt_types.h | 4 +++
> > > 2 files changed, 30 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> > > index 481d83d07367..f84717c1aafa 100644
> > > --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> > > +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> > > @@ -37,8 +37,11 @@ static long tlb_timeout_jiffies(struct xe_gt *gt)
> > > }
> > > static void
> > > -__invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence)
> > > +__invalidation_fence_signal(struct xe_gt *gt,
> > > + struct xe_gt_tlb_invalidation_fence *fence,
> > > + bool failed)
> > > {
> > > + struct xe_device *xe = gt_to_xe(gt);
> > > bool stack = test_bit(FENCE_STACK_BIT, &fence->base.flags);
> > > trace_xe_gt_tlb_invalidation_fence_signal(xe, fence);
> > > @@ -46,13 +49,19 @@ __invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_
> > > dma_fence_signal(&fence->base);
> > > if (!stack)
> > > dma_fence_put(&fence->base);
> > > +
> > > + /* Only increment the counter when tlb inval is done successfully */
> > > + if (!failed)
> > > + atomic64_inc(&gt->tlb_invalidation.received_count);
> > > }
> > > static void
> > > -invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence)
> > > +invalidation_fence_signal(struct xe_gt *gt,
> > > + struct xe_gt_tlb_invalidation_fence *fence,
> > > + bool failed)
> > > {
> > > list_del(&fence->link);
> > > - __invalidation_fence_signal(xe, fence);
> > > + __invalidation_fence_signal(gt, fence, failed);
> > > }
> > > static void xe_gt_tlb_fence_timeout(struct work_struct *work)
> > > @@ -76,7 +85,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
> > > fence->seqno, gt->tlb_invalidation.seqno_recv);
> > > fence->base.error = -ETIME;
> > > - invalidation_fence_signal(xe, fence);
> > > + invalidation_fence_signal(gt, fence, true);
> > > }
> > > if (!list_empty(&gt->tlb_invalidation.pending_fences))
> > > queue_delayed_work(system_wq,
> > > @@ -102,6 +111,8 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
> > > spin_lock_init(&gt->tlb_invalidation.lock);
> > > INIT_DELAYED_WORK(&gt->tlb_invalidation.fence_tdr,
> > > xe_gt_tlb_fence_timeout);
> > > + atomic64_set(&gt->tlb_invalidation.sent_count, 0);
> > > + atomic64_set(&gt->tlb_invalidation.received_count, 0);
> > > return 0;
> > > }
> > > @@ -140,7 +151,9 @@ void xe_gt_tlb_invalidation_reset(struct xe_gt *gt)
> > > list_for_each_entry_safe(fence, next,
> > > &gt->tlb_invalidation.pending_fences, link)
> > > - invalidation_fence_signal(gt_to_xe(gt), fence);
> > > + invalidation_fence_signal(gt, fence, false);
> > > + atomic64_set(&gt->tlb_invalidation.sent_count, 0);
> > > + atomic64_set(&gt->tlb_invalidation.received_count, 0);
> > hmm, any TLB invalidation timeouts/errors, which would make
> > received_count != sent_count, should trigger a GT reset, which in turn
> > will reset those counters, so under which condition you expect those two
> > stats being not equal?
>
> We tolerate GGTT tlb inval timeouts without needing to do a GT reset,
> probably we shouldn't? If not then, I agree that we can have
>
> a total sent counter and another for inflight counter.
>
>
> > is it just during the waiting for some ack?
> >
> > maybe better/cleaner option would be to track/display number of TLB
> > invalidation requests in flight ?
>
>
> Request from Sai was about having total tlb inval sent counter and I think
> inflight would be a bonus and should be useful for debugging.
>
>
> Regards,
>
> Nirmoy
>
> >
> > > spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
> > > mutex_unlock(&gt->uc.guc.ct.lock);
> > > }
> > > @@ -182,7 +195,7 @@ static int send_tlb_invalidation(struct xe_guc *guc,
> > > action[1] = seqno;
> > > ret = xe_guc_ct_send_locked(&guc->ct, action, len,
> > > G2H_LEN_DW_TLB_INVALIDATE, 1);
> > > - if (!ret && fence) {
> > > + if (!ret) {
> > > spin_lock_irq(&gt->tlb_invalidation.pending_lock);
> > > /*
> > > * We haven't actually published the TLB fence as per
> > > @@ -191,7 +204,7 @@ static int send_tlb_invalidation(struct xe_guc *guc,
> > > * we can just go ahead and signal the fence here.
> > > */
> > > if (tlb_invalidation_seqno_past(gt, seqno)) {
> > > - __invalidation_fence_signal(xe, fence);
> > > + __invalidation_fence_signal(gt, fence, false);
> > > } else {
> > > fence->invalidation_time = ktime_get();
> > > list_add_tail(&fence->link,
> > > @@ -203,14 +216,16 @@ static int send_tlb_invalidation(struct xe_guc *guc,
> > > tlb_timeout_jiffies(gt));
> > > }
> > > spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
> > > - } else if (ret < 0 && fence) {
> > > - __invalidation_fence_signal(xe, fence);
> > > + } else if (ret < 0) {
> > > + __invalidation_fence_signal(gt, fence, true);
> > > }
> > > if (!ret) {
> > > gt->tlb_invalidation.seqno = (gt->tlb_invalidation.seqno + 1) %
> > > TLB_INVALIDATION_SEQNO_MAX;
> > > if (!gt->tlb_invalidation.seqno)
> > > gt->tlb_invalidation.seqno = 1;
> > > +
> > > + atomic64_inc(&gt->tlb_invalidation.sent_count);
> > > }
> > > mutex_unlock(&guc->ct.lock);
> > > @@ -321,7 +336,7 @@ int xe_gt_tlb_invalidation_range(struct xe_gt *gt,
> > > /* Execlists not supported */
> > > if (gt_to_xe(gt)->info.force_execlist) {
> > > - __invalidation_fence_signal(xe, fence);
> > > + __invalidation_fence_signal(gt, fence, true);
> > > return 0;
> > > }
> > > @@ -455,7 +470,7 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
> > > if (!tlb_invalidation_seqno_past(gt, fence->seqno))
> > > break;
> > > - invalidation_fence_signal(xe, fence);
> > > + invalidation_fence_signal(gt, fence, false);
> > > }
> > > if (!list_empty(&gt->tlb_invalidation.pending_fences))
> > > diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> > > index ef68c4a92972..130d9f5cb5c2 100644
> > > --- a/drivers/gpu/drm/xe/xe_gt_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> > > @@ -199,6 +199,10 @@ struct xe_gt {
> > > struct delayed_work fence_tdr;
> > > /** @tlb_invalidation.lock: protects TLB invalidation fences */
> > > spinlock_t lock;
> > > + /** @tlb_invalidation.sent_count: counter for sent TLB inval requests */
> > > + atomic64_t sent_count;
> > > + /** @tlb_invalidation.received_count: counter for received TLB inval requests */
> > > + atomic64_t received_count;
> > > } tlb_invalidation;
> > > /**
More information about the Intel-xe
mailing list