[PATCH 3/6] watchdog

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Tue Jan 12 17:22:17 UTC 2021


From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
---
 drivers/gpu/drm/i915/gt/intel_context_types.h |  4 ++
 .../drm/i915/gt/intel_execlists_submission.h  |  2 +
 drivers/gpu/drm/i915/gt/intel_gt.c            |  3 ++
 drivers/gpu/drm/i915/gt/intel_gt.h            |  2 +
 drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 20 +++++++++
 drivers/gpu/drm/i915/gt/intel_gt_types.h      |  7 ++++
 drivers/gpu/drm/i915/i915_request.c           | 41 ++++++++++++++++++-
 drivers/gpu/drm/i915/i915_request.h           |  9 ++++
 8 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index e10d78601bbd..b457d6c49325 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -97,6 +97,10 @@ struct intel_context {
 #define CONTEXT_FORCE_SINGLE_SUBMISSION	7
 #define CONTEXT_NOPREEMPT		8
 
+	struct {
+		u64 timeout_us;
+	} watchdog;
+
 	u32 *lrc_reg_state;
 	union {
 		struct {
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
index 0c675bbff351..e53f8f3dc636 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
@@ -6,6 +6,7 @@
 #ifndef __INTEL_EXECLISTS_SUBMISSION_H__
 #define __INTEL_EXECLISTS_SUBMISSION_H__
 
+#include <linux/llist.h>
 #include <linux/types.h>
 
 struct drm_printer;
@@ -13,6 +14,7 @@ struct drm_printer;
 struct i915_request;
 struct intel_context;
 struct intel_engine_cs;
+struct intel_gt;
 
 enum {
 	INTEL_CONTEXT_SCHEDULE_IN = 0,
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
index d8e1ab412634..ff63034cff9c 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -29,6 +29,9 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915)
 	INIT_LIST_HEAD(&gt->closed_vma);
 	spin_lock_init(&gt->closed_lock);
 
+	init_llist_head(&gt->watchdog.list);
+	INIT_WORK(&gt->watchdog.work, intel_gt_watchdog_work);
+
 	intel_gt_init_buffer_pool(gt);
 	intel_gt_init_reset(gt);
 	intel_gt_init_requests(gt);
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
index 9157c7411f60..35d3bb13372f 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt.h
@@ -77,4 +77,6 @@ static inline bool intel_gt_is_wedged(const struct intel_gt *gt)
 void intel_gt_info_print(const struct intel_gt_info *info,
 			 struct drm_printer *p);
 
+void intel_gt_watchdog_work(struct work_struct *work);
+
 #endif /* __INTEL_GT_H__ */
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
index dc06c78c9eeb..b3dcb1a79838 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
@@ -9,6 +9,7 @@
 #include "i915_drv.h" /* for_each_engine() */
 #include "i915_request.h"
 #include "intel_engine_heartbeat.h"
+#include "intel_execlists_submission.h"
 #include "intel_gt.h"
 #include "intel_gt_pm.h"
 #include "intel_gt_requests.h"
@@ -243,4 +244,23 @@ void intel_gt_fini_requests(struct intel_gt *gt)
 {
 	/* Wait until the work is marked as finished before unloading! */
 	cancel_delayed_work_sync(&gt->requests.retire_work);
+
+	flush_work(&gt->watchdog.work);
+}
+
+void intel_gt_watchdog_work(struct work_struct *work)
+{
+	struct intel_gt *gt =
+		container_of(work, typeof(*gt), watchdog.work);
+	struct i915_request *rq, *rn;
+	struct llist_node *first;
+
+	first = llist_del_all(&gt->watchdog.list);
+	if (!first)
+		return;
+
+	llist_for_each_entry_safe(rq, rn, first, watchdog.link) {
+		i915_request_cancel(rq, -ETIMEDOUT);
+		i915_request_put(rq);
+	}
 }
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
index a83d3e18254d..b17fa14603f2 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
@@ -8,10 +8,12 @@
 
 #include <linux/ktime.h>
 #include <linux/list.h>
+#include <linux/llist.h>
 #include <linux/mutex.h>
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <linux/workqueue.h>
 
 #include "uc/intel_uc.h"
 
@@ -56,6 +58,11 @@ struct intel_gt {
 		struct delayed_work retire_work;
 	} requests;
 
+	struct {
+		struct llist_head list;
+		struct work_struct work;
+	} watchdog;
+
 	struct intel_wakeref wakeref;
 	atomic_t user_wakeref;
 
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 7660a0a4602a..85efd5cc9b40 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -477,8 +477,8 @@ static bool fatal_error(int error)
 	switch (error) {
 	case 0: /* not an error! */
 	case -EAGAIN: /* innocent victim of a GT reset (__i915_request_reset) */
-	case -ETIMEDOUT: /* waiting for Godot (timer_i915_sw_fence_wake) */
 		return false;
+	case -ETIMEDOUT: /* waiting for Godot (timer_i915_sw_fence_wake) */
 	default:
 		return true;
 	}
@@ -1655,9 +1655,48 @@ void __i915_request_queue_bh(struct i915_request *rq)
 	i915_sw_fence_commit(&rq->submit);
 }
 
+static enum hrtimer_restart request_watchdog_expired(struct hrtimer *hrtimer)
+{
+	struct i915_request *rq =
+		container_of(hrtimer, struct i915_request, watchdog.timer);
+
+	if (i915_request_get_rcu(rq)) {
+		struct intel_gt *gt = rq->engine->gt;
+
+		if (llist_add(&rq->watchdog.link, &gt->watchdog.list))
+			schedule_work(&gt->watchdog.work);
+	}
+
+	return HRTIMER_NORESTART;
+}
+
+static void
+request_watchdog_cancel(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct i915_request *rq = to_request(fence);
+
+	hrtimer_try_to_cancel(&rq->watchdog.timer);
+}
+
 void __i915_request_queue(struct i915_request *rq,
 			  const struct i915_sched_attr *attr)
 {
+	struct intel_context *ce = rq->context;
+
+	if (ce->watchdog.timeout_us) {
+		hrtimer_init(&rq->watchdog.timer, CLOCK_MONOTONIC,
+			     HRTIMER_MODE_REL);
+		rq->watchdog.timer.function = request_watchdog_expired;
+		dma_fence_add_callback(&rq->fence,
+				       &rq->watchdog.fence_cb,
+				       request_watchdog_cancel);
+		hrtimer_start_range_ns(&rq->watchdog.timer,
+				       ns_to_ktime(ce->watchdog.timeout_us *
+						   NSEC_PER_USEC),
+				       NSEC_PER_MSEC, /* FIXME check if it gives the "not sooner" guarantee or slack is both ways */
+				       HRTIMER_MODE_REL);
+	}
+
 	/*
 	 * Let the backend know a new request has arrived that may need
 	 * to adjust the existing execution schedule due to a high priority
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 92f07c8cf0b8..33a474d66550 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -26,7 +26,9 @@
 #define I915_REQUEST_H
 
 #include <linux/dma-fence.h>
+#include <linux/hrtimer.h>
 #include <linux/irq_work.h>
+#include <linux/llist.h>
 #include <linux/lockdep.h>
 
 #include "gem/i915_gem_context_types.h"
@@ -287,6 +289,13 @@ struct i915_request {
 	/** timeline->request entry for this request */
 	struct list_head link;
 
+	/** Watchdog support fields. */
+	struct {
+		struct llist_node link;
+		struct hrtimer timer;
+		struct dma_fence_cb fence_cb;
+	} watchdog;
+
 	I915_SELFTEST_DECLARE(struct {
 		struct list_head link;
 		unsigned long delay;
-- 
2.27.0



More information about the Intel-gfx-trybot mailing list