[PATCH 38/38] drm/i915/gt: Support virtual engine queues

Tue Jan 5 10:25:52 UTC 2021

Allow multiple requests to be queued unto a virtual engine, whereas
before we only allowed a single request to be queued at a time. The
advantage of keeping just one request in the queue was to ensure that we
always decided late which engine to use. However, with the introduction
of the virtual deadline we throttle submission and still only drip one
request into the sibling at a time (unless it is truly empty, but then a
second request will have an earlier deadline than the queued virtual
engine and force itself in front). This also takes advantage that a
virtual engine will remain bound while it is active, i.e. we can not
switch to a second engine until the context is completed -- such that we
cannot be as lazy as lazy can be.

By allowing a full queue, we avoid having to synchronize via the
breadcrumb interrupt everytime, letting the virtual engine reach the
full throughput of the siblings.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_breadcrumbs.c   |   8 +-
 drivers/gpu/drm/i915/gt/intel_breadcrumbs.h   |   9 +-
 .../drm/i915/gt/intel_execlists_submission.c  | 399 +++++++++---------
 drivers/gpu/drm/i915/i915_request.c           |   3 +-
 drivers/gpu/drm/i915/i915_scheduler.c         |  65 ++-
 drivers/gpu/drm/i915/i915_scheduler.h         |   4 +-
 6 files changed, 255 insertions(+), 233 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
index 9d385a4ca7f1..f847692d8151 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
@@ -474,16 +474,13 @@ void i915_request_cancel_breadcrumb(struct i915_request *rq)
 	i915_request_put(rq);
 }
 
-bool intel_context_remove_breadcrumbs(struct intel_context *ce,
+void intel_context_remove_breadcrumbs(struct intel_context *ce,
 				      struct intel_breadcrumbs *b)
 {
 	struct i915_request *rq, *rn;
 	unsigned long flags;
 	bool release;
 
-	if (list_empty(&ce->signals))
-		return false;
-
 	spin_lock_irqsave(&ce->signal_lock, flags);
 	list_for_each_entry_safe(rq, rn, &ce->signals, signal_link) {
 		GEM_BUG_ON(!__i915_request_is_complete(rq));
@@ -501,7 +498,8 @@ bool intel_context_remove_breadcrumbs(struct intel_context *ce,
 	if (release)
 		intel_context_put(ce);
 
-	return true;
+	while (atomic_read(&b->signaler_active))
+		cpu_relax();
 }
 
 static void print_signals(struct intel_breadcrumbs *b, struct drm_printer *p)
diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h
index 5a44a56529bc..3ce5ce270b04 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h
@@ -45,14 +45,7 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine,
 bool i915_request_enable_breadcrumb(struct i915_request *request);
 void i915_request_cancel_breadcrumb(struct i915_request *request);
 
-bool intel_context_remove_breadcrumbs(struct intel_context *ce,
+void intel_context_remove_breadcrumbs(struct intel_context *ce,
 				      struct intel_breadcrumbs *b);
 
-static inline void
-intel_breadcrumbs_flush_signals(struct intel_breadcrumbs *b)
-{
-	while (atomic_read(&b->signaler_active))
-		cpu_relax();
-}
-
 #endif /* __INTEL_BREADCRUMBS__ */
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 41a79b76e035..5895e0055f64 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -160,17 +160,6 @@ struct virtual_engine {
 	struct intel_context context;
 	struct rcu_work rcu;
 
-	/*
-	 * We allow only a single request through the virtual engine at a time
-	 * (each request in the timeline waits for the completion fence of
-	 * the previous before being submitted). By restricting ourselves to
-	 * only submitting a single request, each request is placed on to a
-	 * physical to maximise load spreading (by virtue of the late greedy
-	 * scheduling -- each real engine takes the next available request
-	 * upon idling).
-	 */
-	struct i915_request *request;
-
 	/*
 	 * We keep a rbtree of available virtual engines inside each physical
 	 * engine, sorted by priority. Here we preallocate the nodes we need
@@ -282,17 +271,27 @@ static struct i915_request *first_request(struct i915_sched_engine *se)
 	return NULL;
 }
 
-static struct i915_request *first_virtual(const struct intel_engine_cs *engine)
+static struct virtual_engine *
+first_virtual_engine(struct intel_engine_cs *engine)
 {
-	struct rb_node *rb;
+	return rb_entry_safe(rb_first_cached(&engine->execlists.virtual),
+			     struct virtual_engine,
+			     nodes[engine->id].rb);
+}
 
-	rb = rb_first_cached(&engine->execlists.virtual);
-	if (!rb)
-		return NULL;
+static const struct i915_request *first_virtual(struct intel_engine_cs *engine)
+{
+	struct i915_request *rq = NULL;
+	struct virtual_engine *ve;
+
+	ve = first_virtual_engine(engine);
+	if (ve) {
+		spin_lock(&ve->base.active.lock);
+		rq = first_request(&ve->base.active);
+		spin_unlock(&ve->base.active.lock);
+	}
 
-	return READ_ONCE(rb_entry(rb,
-				  struct virtual_engine,
-				  nodes[engine->id].rb)->request);
+	return rq;
 }
 
 static const struct i915_request *
@@ -389,7 +388,15 @@ assert_priority_queue(const struct i915_request *prev,
 	if (i915_request_is_active(prev))
 		return true;
 
-	return rq_deadline(prev) <= rq_deadline(next);
+	if (rq_deadline(prev) <= rq_deadline(next))
+		return true;
+
+	ENGINE_TRACE(prev->engine,
+		     "next %llx:%lld dl %lld is before prev %llx:%lld dl %lld\n",
+		     next->fence.context, next->fence.seqno, rq_deadline(next),
+		     prev->fence.context, prev->fence.seqno, rq_deadline(prev));
+
+	return false;
 }
 
 static inline void
@@ -499,7 +506,7 @@ static inline void execlists_schedule_in(struct i915_request *rq, int idx)
 	trace_i915_request_in(rq, idx);
 
 	old = ce->inflight;
-	if (!old)
+	if (!__intel_context_inflight_count(old))
 		old = __execlists_schedule_in(rq);
 	WRITE_ONCE(ce->inflight, ptr_inc(old));
 
@@ -510,30 +517,41 @@ static void
 resubmit_virtual_request(struct i915_request *rq, struct virtual_engine *ve)
 {
 	struct intel_engine_cs *engine = rq->engine;
+	struct i915_request *pos = rq;
+	struct intel_timeline *tl;
 
 	spin_lock_irq(&engine->active.lock);
 
-	clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
-	WRITE_ONCE(rq->engine, &ve->base);
-	ve->base.submit_request(rq);
+	if (__i915_request_is_complete(rq))
+		goto unlock;
+
+	tl = i915_request_active_timeline(rq);
+
+	/* Rewind back to the start of this virtual engine queue */
+	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
+		if (__i915_request_is_complete(rq))
+			break;
 
+		pos = rq;
+	}
+
+	/* Resubmit the queue in execution order */
+	spin_lock(&ve->base.active.lock);
+	list_for_each_entry_from(pos, &tl->requests, link) {
+		if (pos->engine != engine)
+			break;
+
+		__i915_request_requeue(rq, &ve->base);
+	}
+	spin_unlock(&ve->base.active.lock);
+
+unlock:
 	spin_unlock_irq(&engine->active.lock);
 }
 
 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
 {
 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
-	struct intel_engine_cs *engine = rq->engine;
-	bool signals;
-
-	/*
-	 * After this point, the rq may be transferred to a new sibling, so
-	 * before we clear ce->inflight make sure that the context has been
-	 * removed from the b->signalers and furthermore we need to make sure
-	 * that the concurrent iterator in signal_irq_work is no longer
-	 * following ce->signal_link.
-	 */
-	signals = intel_context_remove_breadcrumbs(ce, engine->breadcrumbs);
 
 	/*
 	 * This engine is now too busy to run this virtual request, so
@@ -542,15 +560,11 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
 	 * same as other native request.
 	 */
 	if (i915_request_in_priority_queue(rq) &&
-	    rq->execution_mask != engine->mask)
+	    rq->execution_mask != rq->engine->mask)
 		resubmit_virtual_request(rq, ve);
 
-	if (READ_ONCE(ve->request))
+	if (!i915_sched_is_idle(&ve->base.active))
 		i915_sched_kick(&ve->base.active);
-
-	/* Flush concurrent signal_irq_work before we reuse the link */
-	if (signals)
-		intel_breadcrumbs_flush_signals(engine->breadcrumbs);
 }
 
 static inline void __execlists_schedule_out(struct i915_request *rq)
@@ -896,10 +910,16 @@ static bool ctx_single_port_submission(const struct intel_context *ce)
 		intel_context_force_single_submission(ce));
 }
 
+static bool __can_merge_ctx(const struct intel_context *prev,
+			    const struct intel_context *next)
+{
+	return prev == next;
+}
+
 static bool can_merge_ctx(const struct intel_context *prev,
 			  const struct intel_context *next)
 {
-	if (prev != next)
+	if (!__can_merge_ctx(prev, next))
 		return false;
 
 	if (ctx_single_port_submission(prev))
@@ -970,31 +990,6 @@ static bool virtual_matches(const struct virtual_engine *ve,
 	return true;
 }
 
-static struct virtual_engine *
-first_virtual_engine(struct intel_engine_cs *engine)
-{
-	struct intel_engine_execlists *el = &engine->execlists;
-	struct rb_node *rb = rb_first_cached(&el->virtual);
-
-	while (rb) {
-		struct virtual_engine *ve =
-			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
-		struct i915_request *rq = READ_ONCE(ve->request);
-
-		/* lazily cleanup after another engine handled rq */
-		if (!rq || !virtual_matches(ve, rq, engine)) {
-			rb_erase_cached(rb, &el->virtual);
-			RB_CLEAR_NODE(rb);
-			rb = rb_first_cached(&el->virtual);
-			continue;
-		}
-
-		return ve;
-	}
-
-	return NULL;
-}
-
 static void virtual_xfer_context(struct virtual_engine *ve,
 				 struct intel_engine_cs *engine)
 {
@@ -1003,6 +998,10 @@ static void virtual_xfer_context(struct virtual_engine *ve,
 	if (likely(engine == ve->siblings[0]))
 		return;
 
+	if (!list_empty(&ve->context.signals))
+		intel_context_remove_breadcrumbs(&ve->context,
+						 ve->siblings[0]->breadcrumbs);
+
 	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
 	if (!intel_engine_has_relative_mmio(engine))
 		lrc_update_offsets(&ve->context, engine);
@@ -1158,15 +1157,124 @@ static bool completed(const struct i915_request *rq)
 	return __i915_request_is_complete(rq);
 }
 
+static void __virtual_dequeue(struct virtual_engine *ve,
+			      struct intel_engine_cs *sibling)
+{
+	struct ve_node * const node = &ve->nodes[sibling->id];
+	struct rb_node **parent, *rb;
+	struct i915_request *rq;
+	u64 deadline;
+	bool first;
+
+	rb_erase_cached(&node->rb, &sibling->execlists.virtual);
+	RB_CLEAR_NODE(&node->rb);
+
+	rq = first_request(&ve->base.active);
+	if (!virtual_matches(ve, rq, sibling))
+		return;
+
+	rb = NULL;
+	first = true;
+	parent = &sibling->execlists.virtual.rb_root.rb_node;
+	deadline = rq_deadline(rq);
+	while (*parent) {
+		struct ve_node *other;
+
+		rb = *parent;
+		other = rb_entry(rb, typeof(*other), rb);
+		if (deadline <= other->deadline) {
+			parent = &rb->rb_left;
+		} else {
+			parent = &rb->rb_right;
+			first = false;
+		}
+	}
+
+	rb_link_node(&node->rb, rb, parent);
+	rb_insert_color_cached(&node->rb, &sibling->execlists.virtual, first);
+}
+
+static void virtual_requeue(struct intel_engine_cs *engine,
+			    struct i915_request *last)
+{
+	const struct i915_request * const first =
+		first_request(&engine->active);
+	struct virtual_engine *ve;
+
+	while ((ve = first_virtual_engine(engine))) {
+		struct i915_request *rq;
+
+		spin_lock(&ve->base.active.lock);
+
+		rq = first_request(&ve->base.active);
+		if (unlikely(!virtual_matches(ve, rq, engine)))
+			/* lost the race to a sibling */
+			goto unlock;
+
+		GEM_BUG_ON(rq->engine != &ve->base);
+		GEM_BUG_ON(rq->context != &ve->context);
+
+		if (!dl_before(rq, first)) {
+			spin_unlock(&ve->base.active.lock);
+			return;
+		}
+
+		if (last && !__can_merge_ctx(last->context, rq->context)) {
+			spin_unlock(&ve->base.active.lock);
+			return; /* leave this for another sibling? */
+		}
+
+		ENGINE_TRACE(engine,
+			     "virtual rq=%llx:%lld%s, dl %llx, new engine? %s\n",
+			     rq->fence.context,
+			     rq->fence.seqno,
+			     __i915_request_is_complete(rq) ? "!" :
+			     __i915_request_has_started(rq) ? "*" :
+			     "",
+			     rq_deadline(rq),
+			     yesno(engine != ve->siblings[0]));
+
+		GEM_BUG_ON(!(rq->execution_mask & engine->mask));
+		if (__i915_request_requeue(rq, engine)) {
+			/*
+			 * Only after we confirm that we will submit
+			 * this request (i.e. it has not already
+			 * completed), do we want to update the context.
+			 *
+			 * This serves two purposes. It avoids
+			 * unnecessary work if we are resubmitting an
+			 * already completed request after timeslicing.
+			 * But more importantly, it prevents us altering
+			 * ve->siblings[] on an idle context, where
+			 * we may be using ve->siblings[] in
+			 * virtual_context_enter / virtual_context_exit.
+			 */
+			virtual_xfer_context(ve, engine);
+
+			/* Bind this ve before we release the lock */
+			if (!ve->context.inflight)
+				WRITE_ONCE(ve->context.inflight, engine);
+
+			GEM_BUG_ON(rq->engine != engine);
+			GEM_BUG_ON(ve->siblings[0] != engine);
+			GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
+
+			last = rq;
+		}
+
+unlock:
+		__virtual_dequeue(ve, engine);
+		spin_unlock(&ve->base.active.lock);
+	}
+}
+
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct i915_request **port = execlists->pending;
 	struct i915_request ** const last_port = port + execlists->port_mask;
 	struct i915_request *last, * const *active;
-	struct virtual_engine *ve;
 	struct i915_priolist *pl;
-	struct rb_node *rb;
 	bool submit = false;
 
 	/*
@@ -1284,83 +1392,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		}
 	}
 
-	/* XXX virtual is always taking precedence */
-	while ((ve = first_virtual_engine(engine))) {
-		struct i915_request *rq;
-
-		spin_lock(&ve->base.active.lock);
-
-		rq = ve->request;
-		if (unlikely(!virtual_matches(ve, rq, engine)))
-			goto unlock; /* lost the race to a sibling */
-
-		GEM_BUG_ON(rq->engine != &ve->base);
-		GEM_BUG_ON(rq->context != &ve->context);
-
-		if (!dl_before(rq, first_request(&engine->active))) {
-			spin_unlock(&ve->base.active.lock);
-			break;
-		}
-
-		if (last && !can_merge_rq(last, rq)) {
-			spin_unlock(&ve->base.active.lock);
-			spin_unlock(&engine->active.lock);
-			return; /* leave this for another sibling */
-		}
-
-		ENGINE_TRACE(engine,
-			     "virtual rq=%llx:%lld%s, dl %llx, new engine? %s\n",
-			     rq->fence.context,
-			     rq->fence.seqno,
-			     __i915_request_is_complete(rq) ? "!" :
-			     __i915_request_has_started(rq) ? "*" :
-			     "",
-			     rq_deadline(rq),
-			     yesno(engine != ve->siblings[0]));
-		WRITE_ONCE(ve->request, NULL);
-
-		rb = &ve->nodes[engine->id].rb;
-		rb_erase_cached(rb, &execlists->virtual);
-		RB_CLEAR_NODE(rb);
-
-		GEM_BUG_ON(!(rq->execution_mask & engine->mask));
-		WRITE_ONCE(rq->engine, engine);
-
-		if (__i915_request_submit(rq)) {
-			/*
-			 * Only after we confirm that we will submit
-			 * this request (i.e. it has not already
-			 * completed), do we want to update the context.
-			 *
-			 * This serves two purposes. It avoids
-			 * unnecessary work if we are resubmitting an
-			 * already completed request after timeslicing.
-			 * But more importantly, it prevents us altering
-			 * ve->siblings[] on an idle context, where
-			 * we may be using ve->siblings[] in
-			 * virtual_context_enter / virtual_context_exit.
-			 */
-			virtual_xfer_context(ve, engine);
-			GEM_BUG_ON(ve->siblings[0] != engine);
-
-			submit = true;
-			last = rq;
-		}
-
-		i915_request_put(rq);
-unlock:
-		spin_unlock(&ve->base.active.lock);
-
-		/*
-		 * Hmm, we have a bunch of virtual engine requests,
-		 * but the first one was already completed (thanks
-		 * preempt-to-busy!). Keep looking at the veng queue
-		 * until we have no more relevant requests (i.e.
-		 * the normal submit queue has higher priority).
-		 */
-		if (submit)
-			break;
-	}
+	if (!RB_EMPTY_ROOT(&execlists->virtual.rb_root))
+		virtual_requeue(engine, last);
 
 	for_each_priolist(pl, &engine->active.queue) {
 		struct i915_request *rq, *rn;
@@ -1368,6 +1401,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		priolist_for_each_request_safe(rq, rn, pl) {
 			bool merge = true;
 
+			GEM_BUG_ON(rq->engine != engine);
+
 			/*
 			 * Can we combine this request with the current port?
 			 * It has to be the same context/ringbuffer and not
@@ -2662,13 +2697,11 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
 		RB_CLEAR_NODE(rb);
 
 		spin_lock(&ve->base.active.lock);
-		rq = fetch_and_zero(&ve->request);
-		if (rq) {
+		while ((rq = first_request(&ve->base.active))) {
 			mark_eio(rq);
 
 			rq->engine = engine;
 			__i915_request_submit(rq);
-			i915_request_put(rq);
 		}
 		spin_unlock(&ve->base.active.lock);
 	}
@@ -2915,11 +2948,6 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine)
 	return 0;
 }
 
-static struct list_head *virtual_queue(struct virtual_engine *ve)
-{
-	return &ve->base.active.default_priolist.requests;
-}
-
 static void rcu_virtual_context_destroy(struct work_struct *wrk)
 {
 	struct virtual_engine *ve =
@@ -2929,17 +2957,13 @@ static void rcu_virtual_context_destroy(struct work_struct *wrk)
 	GEM_BUG_ON(ve->context.inflight);
 
 	/* Preempt-to-busy may leave a stale request behind. */
-	if (unlikely(ve->request)) {
+	if (unlikely(!i915_sched_is_idle(&ve->base.active))) {
 		struct i915_request *old;
 
 		spin_lock_irq(&ve->base.active.lock);
 
-		old = fetch_and_zero(&ve->request);
-		if (old) {
-			GEM_BUG_ON(!i915_request_completed(old));
+		while ((old = first_request(&ve->base.active)))
 			__i915_request_submit(old);
-			i915_request_put(old);
-		}
 
 		spin_unlock_irq(&ve->base.active.lock);
 	}
@@ -2970,7 +2994,6 @@ static void rcu_virtual_context_destroy(struct work_struct *wrk)
 		spin_unlock_irq(&sibling->active.lock);
 	}
 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.active.tasklet));
-	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
 
 	lrc_fini(&ve->context);
 	intel_context_fini(&ve->context);
@@ -3095,7 +3118,9 @@ virtual_submission_mask(struct virtual_engine *ve, u64 *deadline)
 	struct i915_request *rq;
 	intel_engine_mask_t mask;
 
-	rq = READ_ONCE(ve->request);
+	spin_lock_irq(&ve->base.active.lock);
+	rq = first_request(&ve->base.active);
+	spin_unlock_irq(&ve->base.active.lock);
 	if (!rq)
 		return 0;
 
@@ -3135,9 +3160,6 @@ static void virtual_submission_tasklet(unsigned long data)
 		struct rb_node **parent, *rb;
 		bool first;
 
-		if (!READ_ONCE(ve->request))
-			break; /* already handled by a sibling's tasklet */
-
 		spin_lock_irq(&sibling->active.lock);
 
 		if (unlikely(!(mask & sibling->mask))) {
@@ -3199,45 +3221,6 @@ static void virtual_submission_tasklet(unsigned long data)
 	}
 }
 
-static void virtual_submit_request(struct i915_request *rq)
-{
-	struct virtual_engine *ve = to_virtual_engine(rq->engine);
-	unsigned long flags;
-
-	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
-		     rq->fence.context,
-		     rq->fence.seqno);
-
-	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
-
-	spin_lock_irqsave(&ve->base.active.lock, flags);
-
-	/* By the time we resubmit a request, it may be completed */
-	if (__i915_request_is_complete(rq)) {
-		__i915_request_submit(rq);
-		goto unlock;
-	}
-
-	if (ve->request) { /* background completion from preempt-to-busy */
-		GEM_BUG_ON(!i915_request_completed(ve->request));
-		__i915_request_submit(ve->request);
-		i915_request_put(ve->request);
-	}
-
-	rq->sched.deadline =
-		min(rq->sched.deadline,
-		    i915_scheduler_next_virtual_deadline(rq_prio(rq)));
-	ve->request = i915_request_get(rq);
-
-	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
-	list_move_tail(&rq->sched.link, virtual_queue(ve));
-
-	i915_sched_kick(&ve->base.active);
-
-unlock:
-	spin_unlock_irqrestore(&ve->base.active.lock, flags);
-}
-
 static struct ve_bond *
 virtual_find_bond(struct virtual_engine *ve,
 		  const struct intel_engine_cs *master)
@@ -3325,10 +3308,9 @@ intel_execlists_create_virtual(struct intel_engine_cs **siblings,
 	ve->base.cops = &virtual_context_ops;
 	ve->base.request_alloc = execlists_request_alloc;
 
-	ve->base.submit_request = virtual_submit_request;
+	ve->base.submit_request = i915_request_enqueue;
 	ve->base.bond_execute = virtual_bond_execute;
 
-	INIT_LIST_HEAD(virtual_queue(ve));
 	tasklet_init(&ve->base.active.tasklet,
 		     virtual_submission_tasklet,
 		     (unsigned long)ve);
@@ -3536,14 +3518,17 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
 		struct virtual_engine *ve =
 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
-		struct i915_request *rq = READ_ONCE(ve->request);
+		struct i915_request *rq;
 
+		spin_lock(&ve->base.active.lock);
+		rq = first_request(&ve->base.active);
 		if (rq) {
 			if (count++ < max - 1)
 				show_request(m, rq, "\t\t", 0);
 			else
 				last = rq;
 		}
+		spin_unlock(&ve->base.active.lock);
 	}
 	if (last) {
 		if (count > max) {
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index dd315455874f..3f9bcc427d07 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1343,6 +1343,7 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
 
 	GEM_BUG_ON(to == from);
 	GEM_BUG_ON(to->timeline == from->timeline);
+	GEM_BUG_ON(to->context == from->context);
 
 	if (i915_request_completed(from)) {
 		i915_sw_fence_set_error_once(&to->submit, from->fence.error);
@@ -1528,7 +1529,7 @@ __i915_request_add_to_timeline(struct i915_request *rq)
 			   i915_seqno_passed(prev->fence.seqno,
 					     rq->fence.seqno));
 
-		if (is_power_of_2(READ_ONCE(prev->engine)->mask | rq->engine->mask))
+		if (prev->context == rq->context)
 			i915_sw_fence_await_sw_fence(&rq->submit,
 						     &prev->submit,
 						     &rq->submitq);
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index 266e3468bcee..444de5962d19 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -592,7 +592,7 @@ static u64 virtual_deadline(u64 kt, int priority)
 	return i915_sched_to_ticks(kt + prio_slice(priority));
 }
 
-u64 i915_scheduler_next_virtual_deadline(int priority)
+static u64 next_virtual_deadline(int priority)
 {
 	return virtual_deadline(ktime_get_mono_fast_ns(), priority);
 }
@@ -822,20 +822,17 @@ void i915_request_set_priority(struct i915_request *rq, int prio)
 	spin_unlock_irqrestore(&engine->active.lock, flags);
 }
 
-void __intel_engine_defer_request(struct intel_engine_cs *engine,
-				  struct i915_request *rq)
+static void __defer_request(struct intel_engine_cs *engine,
+			    struct i915_request *rq,
+			    u64 deadline)
 {
 	struct list_head *pos = &rq->sched.waiters_list;
 	struct i915_request *rn;
 	LIST_HEAD(dfs);
-	u64 deadline;
 
 	lockdep_assert_held(&engine->active.lock);
 	GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags));
 
-	deadline = max(rq_deadline(rq),
-		       i915_scheduler_next_virtual_deadline(adj_prio(rq)));
-
 	/*
 	 * When we defer a request, we must maintain its order with respect
 	 * to those that are waiting upon it. So we traverse its chain of
@@ -903,6 +900,14 @@ void __intel_engine_defer_request(struct intel_engine_cs *engine,
 	}
 }
 
+void __intel_engine_defer_request(struct intel_engine_cs *engine,
+				  struct i915_request *rq)
+{
+	__defer_request(engine, rq,
+			max(rq_deadline(rq),
+			    next_virtual_deadline(adj_prio(rq))));
+}
+
 static bool
 queue_request(struct intel_engine_cs *engine, struct i915_request *rq)
 {
@@ -943,6 +948,46 @@ static bool ancestor_on_hold(const struct intel_engine_cs *engine,
 	return unlikely(!list_empty(&engine->active.hold)) && hold_request(rq);
 }
 
+bool __i915_request_requeue(struct i915_request *rq,
+			    struct intel_engine_cs *engine)
+{
+	RQ_TRACE(rq, "transfer from %s to %s\n",
+		 rq->engine->name, engine->name);
+
+	lockdep_assert_held(&engine->active.lock);
+	lockdep_assert_held(&rq->engine->active.lock);
+	GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags));
+	GEM_BUG_ON(rq->engine == engine);
+
+	list_del_init(&rq->sched.link);
+	WRITE_ONCE(rq->engine, engine);
+
+	if (__i915_request_is_complete(rq)) {
+		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
+		set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags);
+		return false;
+	}
+
+	if (unlikely(ancestor_on_hold(engine, rq))) {
+		RQ_TRACE(rq, "ancestor on hold\n");
+		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
+		list_add_tail(&rq->sched.link, &engine->active.hold);
+		i915_request_set_hold(rq);
+	} else {
+		u64 deadline = min(earliest_deadline(rq), rq_deadline(rq));
+
+		/* Maintain request ordering wrt to existing on target */
+		__i915_request_set_deadline(rq, deadline);
+		if (!list_empty(&rq->sched.waiters_list))
+			__defer_request(engine, rq, deadline);
+
+		GEM_BUG_ON(rq_deadline(rq) == I915_DEADLINE_NEVER);
+	}
+
+	GEM_BUG_ON(list_empty(&rq->sched.link));
+	return true;
+}
+
 void i915_request_enqueue(struct i915_request *rq)
 {
 	struct intel_engine_cs *engine = rq->engine;
@@ -993,9 +1038,9 @@ __intel_engine_rewind_requests(struct intel_engine_cs *engine)
 		__i915_request_unsubmit(rq);
 
 		if (__i915_request_has_started(rq)) {
-			u64 deadline =
-				i915_scheduler_next_virtual_deadline(rq_prio(rq));
-			rq->sched.deadline = min(rq_deadline(rq), deadline);
+			rq->sched.deadline =
+				min(rq_deadline(rq),
+				    next_virtual_deadline(rq_prio(rq)));
 		}
 		GEM_BUG_ON(rq_deadline(rq) == I915_DEADLINE_NEVER);
 
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index 1431ae5df5fc..b4725d9eb42e 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -40,9 +40,9 @@ void i915_request_set_deadline(struct i915_request *request, u64 deadline);
 
 void i915_request_update_deadline(struct i915_request *request);
 
-u64 i915_scheduler_next_virtual_deadline(int priority);
-
 void i915_request_enqueue(struct i915_request *request);
+bool __i915_request_requeue(struct i915_request *rq,
+			    struct intel_engine_cs *engine);
 
 struct i915_request *
 __intel_engine_rewind_requests(struct intel_engine_cs *engine);
-- 
2.20.1