[Intel-gfx] [RFC] drm/i915: Don't reset on preemptible workloads

Wed Aug 1 13:56:11 UTC 2018

The current behaviour of hangcheck is that if we detect that a request
is not making any forward progress, the driver will attempt an engine
reset. If that's not successful, we fall back to a full device reset.

This patch would change it so that if hangcheck encounters a low-priority
workload, it will attempt to preempt it before declaring a hang. If the
preemption is successful, we allow the workload to continue "in background"
(until the next hangcheck run, and the next attempt to preempt it). If the
context was closed, we simply skip the workload's execution.

This new behaviour would allow the user to define intentionally large or
passive workloads that would normally be affected by the hangcheck,
without having to divide them into smaller pieces of work.

Suggested-by: Michał Winiarski <michal.winiarski at intel.com>
Signed-off-by: Jakub Bartmiński <jakub.bartminski at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
---
 drivers/gpu/drm/i915/intel_hangcheck.c  | 29 +++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c        | 37 +++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_lrc.h        |  1 +
 drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
 4 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 2fc7a0dd0df9..5ebd3ca74855 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -398,6 +398,28 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
 	return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
 }
 
+/* Returns true if a preemption was injected for a min-priority request. */
+static bool hangcheck_preempt_workload(struct intel_engine_cs *engine)
+{
+	struct i915_request *active_request;
+
+	/* We have already tried preempting, but the hardware did not react */
+	if (engine->hangcheck.try_preempt)
+		return false;
+
+	/* Guard against the lookup finding no active request */
+	active_request = i915_gem_find_active_request(engine);
+	if (!active_request ||
+	    active_request->gem_context->sched.priority !=
+	    I915_CONTEXT_MIN_USER_PRIORITY)
+		return false;
+
+	engine->hangcheck.try_preempt = true;
+	engine->hangcheck.active_request = active_request;
+	intel_lr_inject_preempt_context(engine);
+	return true;
+}
+
 /*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -440,6 +462,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		hangcheck_store_sample(engine, &hc);
 
 		if (engine->hangcheck.stalled) {
+			/*
+			 * Try preempting the current workload before
+			 * declaring the engine hung.
+			 */
+			if (hangcheck_preempt_workload(engine))
+				continue;
+
 			hung |= intel_engine_flag(engine);
 			if (hc.action != ENGINE_DEAD)
 				stuck |= intel_engine_flag(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index fad689efb67a..3ec8dcf64000 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -326,15 +326,39 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 {
 	struct i915_request *rq, *rn;
 	struct i915_priolist *uninitialized_var(p);
+	struct i915_gem_context *active_context = NULL;
+	bool skip_seqno = false;
+	u32 new_seqno = 0;
 	int last_prio = I915_PRIORITY_INVALID;
 
 	lockdep_assert_held(&engine->timeline.lock);
 
+	if (engine->hangcheck.try_preempt) {
+		rq = engine->hangcheck.active_request;
+		GEM_BUG_ON(!rq);
+
+		active_context = rq->gem_context;
+		GEM_BUG_ON(!active_context);
+
+		/*
+		 * If the workload is preemptible but its context was closed
+		 * we force the engine to skip its execution instead.
+		 */
+		if (i915_gem_context_is_closed(active_context))
+			skip_seqno = true;
+	}
+
 	list_for_each_entry_safe_reverse(rq, rn,
 					 &engine->timeline.requests,
 					 link) {
 		if (i915_request_completed(rq))
-			return;
+			break;
+
+		if (skip_seqno && rq->gem_context == active_context) {
+			new_seqno = max(new_seqno,
+					i915_request_global_seqno(rq));
+			continue;
+		}
 
 		__i915_request_unsubmit(rq);
 		unwind_wa_tail(rq);
@@ -348,6 +372,11 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 		GEM_BUG_ON(p->priority != rq_prio(rq));
 		list_add(&rq->sched.link, &p->requests);
 	}
+
+	if (skip_seqno) {
+		intel_write_status_page(engine, I915_GEM_HWS_INDEX, new_seqno);
+		engine->timeline.seqno = new_seqno;
+	}
 }
 
 void
@@ -532,7 +561,7 @@ static void port_assign(struct execlist_port *port, struct i915_request *rq)
 	port_set(port, port_pack(i915_request_get(rq), port_count(port)));
 }
 
-static void inject_preempt_context(struct intel_engine_cs *engine)
+void intel_lr_inject_preempt_context(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists *execlists = &engine->execlists;
 	struct intel_context *ce =
@@ -632,7 +661,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			return;
 
 		if (need_preempt(engine, last, execlists->queue_priority)) {
-			inject_preempt_context(engine);
+			intel_lr_inject_preempt_context(engine);
 			return;
 		}
 
@@ -981,6 +1010,8 @@ static void process_csb(struct intel_engine_cs *engine)
 		    buf[2*head + 1] == execlists->preempt_complete_status) {
 			GEM_TRACE("%s preempt-idle\n", engine->name);
 			complete_preempt_context(execlists);
+			/* We tried and succeeded in preempting the engine */
+			engine->hangcheck.try_preempt = false;
 			continue;
 		}
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index f5a5502ecf70..164cd9e7ce05 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -101,6 +101,7 @@ struct drm_i915_private;
 struct i915_gem_context;
 
 void intel_lr_context_resume(struct drm_i915_private *dev_priv);
+void intel_lr_inject_preempt_context(struct intel_engine_cs *engine);
 
 void intel_execlists_set_default_submission(struct intel_engine_cs *engine);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 57f3787ed6ec..eb38c1bec96b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -124,6 +124,7 @@ struct intel_engine_hangcheck {
 	struct i915_request *active_request;
 	bool stalled:1;
 	bool wedged:1;
+	bool try_preempt:1;
 };
 
 struct intel_ring {
-- 
2.17.1