[Intel-gfx] [PATCH 5/7] drm/i915: Request watchdog infrastructure

Fri Mar 26 00:00:10 UTC 2021

On Wed, Mar 24, 2021 at 12:13:33PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> 
> Prepares the plumbing for setting request/fence expiration time. All code
> is put in place but is never activated due yet missing ability to actually
> configure the timer.
> 
> Outline of the basic operation:
> 
> A timer is started when request is ready for execution. If the request
> completes (retires) before the timer fires, timer is cancelled and nothing
> further happens.
> 
> If the timer fires request is added to a lockless list and worker queued.
> Purpose of this is twofold: a) It allows request cancellation from a more
> friendly context and b) coalesces multiple expirations into a single event
> of consuming the list.
> 
> Worker locklessly consumes the list of expired requests and cancels them
> all using previous added i915_request_cancel().
> 
> Associated timeout value is stored in rq->context.watchdog.timeout_us.
> 
> v2:
>  * Log expiration.
> 
> v3:
>  * Include more information about user timeline in the log message.
> 
> v4:
>  * Remove obsolete comment and fix formatting. (Matt)
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
> Reviewed-by: Matthew Auld <matthew.auld at intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_context_types.h |  4 ++
>  .../drm/i915/gt/intel_execlists_submission.h  |  2 +
>  drivers/gpu/drm/i915/gt/intel_gt.c            |  3 ++
>  drivers/gpu/drm/i915/gt/intel_gt.h            |  2 +
>  drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 28 ++++++++++
>  drivers/gpu/drm/i915/gt/intel_gt_types.h      |  7 +++
>  drivers/gpu/drm/i915/i915_request.c           | 52 +++++++++++++++++++
>  drivers/gpu/drm/i915/i915_request.h           |  8 +++
>  8 files changed, 106 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index 0ea18c9e2aca..65a5730a4f5b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -99,6 +99,10 @@ struct intel_context {
>  #define CONTEXT_FORCE_SINGLE_SUBMISSION	7
>  #define CONTEXT_NOPREEMPT		8
>  
> +	struct {
> +		u64 timeout_us;
> +	} watchdog;
> +
>  	u32 *lrc_reg_state;
>  	union {
>  		struct {
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> index f7bd3fccfee8..4ca9b475e252 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> @@ -6,6 +6,7 @@
>  #ifndef __INTEL_EXECLISTS_SUBMISSION_H__
>  #define __INTEL_EXECLISTS_SUBMISSION_H__
>  
> +#include <linux/llist.h>
>  #include <linux/types.h>
>  
>  struct drm_printer;
> @@ -13,6 +14,7 @@ struct drm_printer;
>  struct i915_request;
>  struct intel_context;
>  struct intel_engine_cs;
> +struct intel_gt;
>  
>  enum {
>  	INTEL_CONTEXT_SCHEDULE_IN = 0,
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index ca76f93bc03d..8d77dcbad059 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -31,6 +31,9 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915)
>  	INIT_LIST_HEAD(&gt->closed_vma);
>  	spin_lock_init(&gt->closed_lock);
>  
> +	init_llist_head(&gt->watchdog.list);
> +	INIT_WORK(&gt->watchdog.work, intel_gt_watchdog_work);
> +
>  	intel_gt_init_buffer_pool(gt);
>  	intel_gt_init_reset(gt);
>  	intel_gt_init_requests(gt);
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
> index a17bd8b3195f..7ec395cace69 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h
> @@ -78,4 +78,6 @@ static inline bool intel_gt_is_wedged(const struct intel_gt *gt)
>  void intel_gt_info_print(const struct intel_gt_info *info,
>  			 struct drm_printer *p);
>  
> +void intel_gt_watchdog_work(struct work_struct *work);
> +
>  #endif /* __INTEL_GT_H__ */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> index 36ec97f79174..fbfd19b2e5f2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> @@ -8,6 +8,7 @@
>  #include "i915_drv.h" /* for_each_engine() */
>  #include "i915_request.h"
>  #include "intel_engine_heartbeat.h"
> +#include "intel_execlists_submission.h"
>  #include "intel_gt.h"
>  #include "intel_gt_pm.h"
>  #include "intel_gt_requests.h"
> @@ -242,4 +243,31 @@ void intel_gt_fini_requests(struct intel_gt *gt)
>  {
>  	/* Wait until the work is marked as finished before unloading! */
>  	cancel_delayed_work_sync(&gt->requests.retire_work);
> +
> +	flush_work(&gt->watchdog.work);
> +}
> +
> +void intel_gt_watchdog_work(struct work_struct *work)
> +{
> +	struct intel_gt *gt =
> +		container_of(work, typeof(*gt), watchdog.work);
> +	struct i915_request *rq, *rn;
> +	struct llist_node *first;
> +
> +	first = llist_del_all(&gt->watchdog.list);
> +	if (!first)
> +		return;
> +
> +	llist_for_each_entry_safe(rq, rn, first, watchdog.link) {
> +		if (!i915_request_completed(rq)) {
> +			struct dma_fence *f = &rq->fence;
> +
> +			pr_notice("Fence expiration time out i915-%s:%s:%llx!\n",
> +				  f->ops->get_driver_name(f),
> +				  f->ops->get_timeline_name(f),
> +				  f->seqno);
> +			i915_request_cancel(rq, -EINTR);
> +		}
> +		i915_request_put(rq);
> +	}
>  }
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> index 626af37c7790..d70ebcc6f19f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> @@ -8,10 +8,12 @@
>  
>  #include <linux/ktime.h>
>  #include <linux/list.h>
> +#include <linux/llist.h>
>  #include <linux/mutex.h>
>  #include <linux/notifier.h>
>  #include <linux/spinlock.h>
>  #include <linux/types.h>
> +#include <linux/workqueue.h>
>  
>  #include "uc/intel_uc.h"
>  
> @@ -62,6 +64,11 @@ struct intel_gt {
>  		struct delayed_work retire_work;
>  	} requests;
>  
> +	struct {
> +		struct llist_head list;
> +		struct work_struct work;
> +	} watchdog;
> +
>  	struct intel_wakeref wakeref;
>  	atomic_t user_wakeref;
>  
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index d1a4a3fa7425..d58052f3410c 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -321,6 +321,53 @@ static void remove_from_engine(struct i915_request *rq)
>  	__notify_execute_cb_imm(rq);
>  }
>  
> +static void __rq_init_watchdog(struct i915_request *rq)
> +{
> +	rq->watchdog.timer.function = NULL;
> +}
> +
> +static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer)
> +{
> +	struct i915_request *rq =
> +		container_of(hrtimer, struct i915_request, watchdog.timer);
> +	struct intel_gt *gt = rq->engine->gt;
> +
> +	if (!i915_request_completed(rq)) {
> +		if (llist_add(&rq->watchdog.link, &gt->watchdog.list))
> +			schedule_work(&gt->watchdog.work);
> +	} else {
> +		i915_request_put(rq);
> +	}
> +
> +	return HRTIMER_NORESTART;
> +}
> +
> +static void __rq_arm_watchdog(struct i915_request *rq)
> +{
> +	struct i915_request_watchdog *wdg = &rq->watchdog;
> +	struct intel_context *ce = rq->context;
> +
> +	if (!ce->watchdog.timeout_us)
> +		return;
> +
> +	hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +	wdg->timer.function = __rq_watchdog_expired;
> +	hrtimer_start_range_ns(&wdg->timer,
> +			       ns_to_ktime(ce->watchdog.timeout_us *
> +					   NSEC_PER_USEC),
> +			       NSEC_PER_MSEC,
> +			       HRTIMER_MODE_REL);
> +	i915_request_get(rq);

Shouldn't we grab the new reference before we arm the timer? Either way
since fairly academic I went ahead and applied, but if you agree pls do a
follow up patch.
-Daniel

> +}
> +
> +static void __rq_cancel_watchdog(struct i915_request *rq)
> +{
> +	struct i915_request_watchdog *wdg = &rq->watchdog;
> +
> +	if (wdg->timer.function && hrtimer_try_to_cancel(&wdg->timer) > 0)
> +		i915_request_put(rq);
> +}
> +
>  bool i915_request_retire(struct i915_request *rq)
>  {
>  	if (!__i915_request_is_complete(rq))
> @@ -332,6 +379,8 @@ bool i915_request_retire(struct i915_request *rq)
>  	trace_i915_request_retire(rq);
>  	i915_request_mark_complete(rq);
>  
> +	__rq_cancel_watchdog(rq);
> +
>  	/*
>  	 * We know the GPU must have read the request to have
>  	 * sent us the seqno + interrupt, so use the position
> @@ -692,6 +741,8 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>  
>  		if (unlikely(fence->error))
>  			i915_request_set_error_once(request, fence->error);
> +		else
> +			__rq_arm_watchdog(request);
>  
>  		/*
>  		 * We need to serialize use of the submit_request() callback
> @@ -879,6 +930,7 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp)
>  
>  	/* No zalloc, everything must be cleared after use */
>  	rq->batch = NULL;
> +	__rq_init_watchdog(rq);
>  	GEM_BUG_ON(rq->capture_list);
>  	GEM_BUG_ON(!llist_empty(&rq->execute_cb));
>  
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 2dea55ea69e1..5e0946992f1a 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -26,7 +26,9 @@
>  #define I915_REQUEST_H
>  
>  #include <linux/dma-fence.h>
> +#include <linux/hrtimer.h>
>  #include <linux/irq_work.h>
> +#include <linux/llist.h>
>  #include <linux/lockdep.h>
>  
>  #include "gem/i915_gem_context_types.h"
> @@ -289,6 +291,12 @@ struct i915_request {
>  	/** timeline->request entry for this request */
>  	struct list_head link;
>  
> +	/** Watchdog support fields. */
> +	struct i915_request_watchdog {
> +		struct llist_node link;
> +		struct hrtimer timer;
> +	} watchdog;
> +
>  	I915_SELFTEST_DECLARE(struct {
>  		struct list_head link;
>  		unsigned long delay;
> -- 
> 2.27.0
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch