[PATCH 1/2] drm/xe: Add sent and recv counters for tlb invalidations

Nirmoy Das nirmoy.das at linux.intel.com
Wed Jul 24 09:20:19 UTC 2024


On 7/23/2024 6:23 PM, Matthew Brost wrote:
> On Tue, Jul 23, 2024 at 03:07:05PM +0200, Nirmoy Das wrote:
>> On 7/23/2024 2:22 PM, Michal Wajdeczko wrote:
>>> On 23.07.2024 13:16, Nirmoy Das wrote:
>>>> Add counters for sent and received TLB invalidation requests, which
>>>> can then be queried as sysfs files from userspace.
>>> s/sysfs/debugfs ?
>>
>> I will fix it.
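
For the debugfs side I have something like the below in mind (untested
sketch; the hookup into xe_gt_debugfs.c and the exact file name are
placeholders):

static int tlb_invalidation_stats(struct xe_gt *gt, struct drm_printer *p)
{
	/* Dump both counters added by this patch in one debugfs file */
	drm_printf(p, "TLB invalidations sent:     %lld\n",
		   atomic64_read(&gt->tlb_invalidation.sent_count));
	drm_printf(p, "TLB invalidations received: %lld\n",
		   atomic64_read(&gt->tlb_invalidation.received_count));

	return 0;
}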
>>
> I think if this is debugfs then we certainly need to hide this
> implementation behind a Kconfig option as atomics have a non-zero
> execution cost.
>
> I'm thinking a generic DRM_XE_DEBUG_STATS or something.
>
> Then also with that, build a generic stats layer which other code calls
> into, and a stats object which encapsulates all the stats.
>
> Fine with only having TLB invalidations to start but having the
> infrastructure to add more stats over time would be good.


Ok, I will give it a try and improve it with further suggestions.
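
Roughly what I have in mind for the generic layer (just a sketch, all
names including DRM_XE_DEBUG_STATS are placeholders):

config DRM_XE_DEBUG_STATS
	bool "Enable extra driver statistics for debugging"
	depends on DRM_XE
	default n

/* xe_gt_stats.h (sketch) */
enum xe_gt_stats_id {
	XE_GT_STATS_ID_TLB_INVAL,
	/* more stat ids can be added here over time */
	__XE_GT_STATS_NUM_IDS,
};

#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_STATS)
void xe_gt_stats_incr(struct xe_gt *gt, enum xe_gt_stats_id id, int incr);
#else
static inline void xe_gt_stats_incr(struct xe_gt *gt,
				    enum xe_gt_stats_id id, int incr)
{
}
#endif

That way the atomics compile out completely when the option is disabled,
and the TLB code only calls xe_gt_stats_incr(gt, XE_GT_STATS_ID_TLB_INVAL, 1).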


Thanks,

Nirmoy

>
> Matt
>
>>>> Cc: Matthew Brost <matthew.brost at intel.com>
>>>> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
>>>> Cc: Sai Gowtham Ch <sai.gowtham.ch at intel.com>
>>>> Signed-off-by: Nirmoy Das <nirmoy.das at intel.com>
>>>> ---
>>>>    drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 37 +++++++++++++++------
>>>>    drivers/gpu/drm/xe/xe_gt_types.h            |  4 +++
>>>>    2 files changed, 30 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>>>> index 481d83d07367..f84717c1aafa 100644
>>>> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>>>> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>>>> @@ -37,8 +37,11 @@ static long tlb_timeout_jiffies(struct xe_gt *gt)
>>>>    }
>>>>    static void
>>>> -__invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence)
>>>> +__invalidation_fence_signal(struct xe_gt *gt,
>>>> +			    struct xe_gt_tlb_invalidation_fence *fence,
>>>> +			    bool failed)
>>>>    {
>>>> +	struct xe_device *xe = gt_to_xe(gt);
>>>>    	bool stack = test_bit(FENCE_STACK_BIT, &fence->base.flags);
>>>>    	trace_xe_gt_tlb_invalidation_fence_signal(xe, fence);
>>>> @@ -46,13 +49,19 @@ __invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_
>>>>    	dma_fence_signal(&fence->base);
>>>>    	if (!stack)
>>>>    		dma_fence_put(&fence->base);
>>>> +
>>>> +	/* Only increment the counter when tlb inval is done successfully */
>>>> +	if (!failed)
>>>> +		atomic64_inc(&gt->tlb_invalidation.received_count);
>>>>    }
>>>>    static void
>>>> -invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence)
>>>> +invalidation_fence_signal(struct xe_gt *gt,
>>>> +			  struct xe_gt_tlb_invalidation_fence *fence,
>>>> +			  bool failed)
>>>>    {
>>>>    	list_del(&fence->link);
>>>> -	__invalidation_fence_signal(xe, fence);
>>>> +	__invalidation_fence_signal(gt, fence, failed);
>>>>    }
>>>>    static void xe_gt_tlb_fence_timeout(struct work_struct *work)
>>>> @@ -76,7 +85,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
>>>>    			  fence->seqno, gt->tlb_invalidation.seqno_recv);
>>>>    		fence->base.error = -ETIME;
>>>> -		invalidation_fence_signal(xe, fence);
>>>> +		invalidation_fence_signal(gt, fence, true);
>>>>    	}
>>>>    	if (!list_empty(&gt->tlb_invalidation.pending_fences))
>>>>    		queue_delayed_work(system_wq,
>>>> @@ -102,6 +111,8 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
>>>>    	spin_lock_init(&gt->tlb_invalidation.lock);
>>>>    	INIT_DELAYED_WORK(&gt->tlb_invalidation.fence_tdr,
>>>>    			  xe_gt_tlb_fence_timeout);
>>>> +	atomic64_set(&gt->tlb_invalidation.sent_count, 0);
>>>> +	atomic64_set(&gt->tlb_invalidation.received_count, 0);
>>>>    	return 0;
>>>>    }
>>>> @@ -140,7 +151,9 @@ void xe_gt_tlb_invalidation_reset(struct xe_gt *gt)
>>>>    	list_for_each_entry_safe(fence, next,
>>>>    				 &gt->tlb_invalidation.pending_fences, link)
>>>> -		invalidation_fence_signal(gt_to_xe(gt), fence);
>>>> +		invalidation_fence_signal(gt, fence, false);
>>>> +	atomic64_set(&gt->tlb_invalidation.sent_count, 0);
>>>> +	atomic64_set(&gt->tlb_invalidation.received_count, 0);
>>> hmm, any TLB invalidation timeouts/errors, which would make
>>> received_count != sent_count, should trigger a GT reset, which in turn
>>> will reset those counters, so under which condition do you expect those
>>> two stats to differ?
>> We tolerate GGTT TLB inval timeouts without needing to do a GT reset,
>> though probably we shouldn't? If not, then I agree that we can have
>> a total sent counter and a separate inflight counter.
>>
>>
>>> is it just while waiting for some ack?
>>>
>>> maybe a better/cleaner option would be to track/display the number of
>>> TLB invalidation requests in flight?
>>
>> The request from Sai was about having a total TLB inval sent counter, and
>> I think an inflight counter would be a bonus and useful for debugging.
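
With both counters from this patch in place, an inflight value would just
be the difference, e.g. (rough sketch; reading the two atomics is not a
single snapshot, so this would be debug information only):

static s64 tlb_invalidation_inflight(struct xe_gt *gt)
{
	return atomic64_read(&gt->tlb_invalidation.sent_count) -
	       atomic64_read(&gt->tlb_invalidation.received_count);
}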
>>
>>
>> Regards,
>>
>> Nirmoy
>>
>>>>    	spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
>>>>    	mutex_unlock(&gt->uc.guc.ct.lock);
>>>>    }
>>>> @@ -182,7 +195,7 @@ static int send_tlb_invalidation(struct xe_guc *guc,
>>>>    	action[1] = seqno;
>>>>    	ret = xe_guc_ct_send_locked(&guc->ct, action, len,
>>>>    				    G2H_LEN_DW_TLB_INVALIDATE, 1);
>>>> -	if (!ret && fence) {
>>>> +	if (!ret) {
>>>>    		spin_lock_irq(&gt->tlb_invalidation.pending_lock);
>>>>    		/*
>>>>    		 * We haven't actually published the TLB fence as per
>>>> @@ -191,7 +204,7 @@ static int send_tlb_invalidation(struct xe_guc *guc,
>>>>    		 * we can just go ahead and signal the fence here.
>>>>    		 */
>>>>    		if (tlb_invalidation_seqno_past(gt, seqno)) {
>>>> -			__invalidation_fence_signal(xe, fence);
>>>> +			__invalidation_fence_signal(gt, fence, false);
>>>>    		} else {
>>>>    			fence->invalidation_time = ktime_get();
>>>>    			list_add_tail(&fence->link,
>>>> @@ -203,14 +216,16 @@ static int send_tlb_invalidation(struct xe_guc *guc,
>>>>    						   tlb_timeout_jiffies(gt));
>>>>    		}
>>>>    		spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
>>>> -	} else if (ret < 0 && fence) {
>>>> -		__invalidation_fence_signal(xe, fence);
>>>> +	} else if (ret < 0) {
>>>> +		__invalidation_fence_signal(gt, fence, true);
>>>>    	}
>>>>    	if (!ret) {
>>>>    		gt->tlb_invalidation.seqno = (gt->tlb_invalidation.seqno + 1) %
>>>>    			TLB_INVALIDATION_SEQNO_MAX;
>>>>    		if (!gt->tlb_invalidation.seqno)
>>>>    			gt->tlb_invalidation.seqno = 1;
>>>> +
>>>> +		atomic64_inc(&gt->tlb_invalidation.sent_count);
>>>>    	}
>>>>    	mutex_unlock(&guc->ct.lock);
>>>> @@ -321,7 +336,7 @@ int xe_gt_tlb_invalidation_range(struct xe_gt *gt,
>>>>    	/* Execlists not supported */
>>>>    	if (gt_to_xe(gt)->info.force_execlist) {
>>>> -		__invalidation_fence_signal(xe, fence);
>>>> +		__invalidation_fence_signal(gt, fence, true);
>>>>    		return 0;
>>>>    	}
>>>> @@ -455,7 +470,7 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
>>>>    		if (!tlb_invalidation_seqno_past(gt, fence->seqno))
>>>>    			break;
>>>> -		invalidation_fence_signal(xe, fence);
>>>> +		invalidation_fence_signal(gt, fence, false);
>>>>    	}
>>>>    	if (!list_empty(&gt->tlb_invalidation.pending_fences))
>>>> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
>>>> index ef68c4a92972..130d9f5cb5c2 100644
>>>> --- a/drivers/gpu/drm/xe/xe_gt_types.h
>>>> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
>>>> @@ -199,6 +199,10 @@ struct xe_gt {
>>>>    		struct delayed_work fence_tdr;
>>>>    		/** @tlb_invalidation.lock: protects TLB invalidation fences */
>>>>    		spinlock_t lock;
>>>> +		/** @tlb_invalidation.sent_count: counter for sent TLB inval requests */
>>>> +		atomic64_t sent_count;
>>>> +		/** @tlb_invalidation.received_count: counter for received TLB inval requests */
>>>> +		atomic64_t received_count;
>>>>    	} tlb_invalidation;
>>>>    	/**

