[Intel-gfx] [PATCH 40/40] drm/i915: Load balancing across a virtual engine

Wed Sep 19 19:55:44 UTC 2018

Having allowed the user to define a set of engines that they will want
to only use, we go one step further and allow them to bind those engines
into a single virtual instance. Submitting a batch to the virtual engine
will then forward it to any one of the set in a manner as best to
distribute load.  The virtual engine has a single timeline across all
engines (it operates as a single queue), so it is not able to concurrently
run batches across multiple engines by itself; that is left up to the user
to submit multiple concurrent batches to multiple queues. Multiple users
will be load balanced across the system.

The mechanism used for load balancing in this patch is a late greedy
balancer. When a request is ready for execution, it is added to each
engine's queue, and when an engine is ready for its next request it
claims it from the virtual engine. The first engine to do so, wins, i.e.
the request is executed at the earliest opportunity (idle moment) in the
system.

As not all HW is created equal, the user is still able to skip the
virtual engine and execute the batch on a specific engine, all within the
same queue. It will then be executed in order on the correct engine,
with execution on other virtual engines being moved away due to the load
detection.

A couple of areas for potential improvement left!

- The virtual engine always take priority over equal-priority tasks.
Mostly broken up by applying FQ_CODEL rules for prioritising new clients,
and hopefully the virtual and real engines are not then congested (i.e.
all work is via virtual engines, or all work is to the real engine).

- We require the breadcrumb irq around every virtual engine request. For
normal engines, we eliminate the need for the slow round trip via
interrupt by using the submit fence and queueing in order. For virtual
engines, we have to allow any job to transfer to a new ring, and cannot
coalesce the submissions, so require the completion fence instead,
forcing the persistent use of interrupts.

Other areas of improvement are more general, such as reducing lock
contention, reducing dispatch overhead, looking at direct submission
rather than bouncing around tasklets etc.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
---
 drivers/gpu/drm/i915/i915_gem.h            |   5 +
 drivers/gpu/drm/i915/i915_gem_context.c    |  85 +++-
 drivers/gpu/drm/i915/i915_gem_context.h    |   1 +
 drivers/gpu/drm/i915/i915_request.c        |   2 +-
 drivers/gpu/drm/i915/i915_scheduler.c      |   1 +
 drivers/gpu/drm/i915/i915_timeline.h       |   1 +
 drivers/gpu/drm/i915/intel_engine_cs.c     |  48 +-
 drivers/gpu/drm/i915/intel_lrc.c           | 500 ++++++++++++++++++++-
 drivers/gpu/drm/i915/intel_lrc.h           |  16 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   8 +
 drivers/gpu/drm/i915/selftests/intel_lrc.c | 216 +++++++++
 include/uapi/drm/i915_drm.h                |  27 ++
 12 files changed, 851 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
index 599c4f6eb1ea..69d137786700 100644
--- a/drivers/gpu/drm/i915/i915_gem.h
+++ b/drivers/gpu/drm/i915/i915_gem.h
@@ -87,4 +87,9 @@ static inline bool __tasklet_is_enabled(const struct tasklet_struct *t)
 	return !atomic_read(&t->count);
 }
 
+static inline bool __tasklet_is_scheduled(struct tasklet_struct *t)
+{
+	return test_bit(TASKLET_STATE_SCHED, &t->state);
+}
+
 #endif /* __I915_GEM_H__ */
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 313471253f51..0ee39772df03 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -91,6 +91,7 @@
 #include "i915_drv.h"
 #include "i915_trace.h"
 #include "i915_user_extensions.h"
+#include "intel_lrc.h"
 #include "intel_workarounds.h"
 
 #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
@@ -224,7 +225,10 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
 			ce->ops->destroy(ce);
 	}
 
-	kfree(ctx->engines);
+	if (ctx->engines) {
+		intel_virtual_engine_put(ctx->engines[0]);
+		kfree(ctx->engines);
+	}
 
 	if (ctx->timeline)
 		i915_timeline_put(ctx->timeline);
@@ -351,6 +355,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 		struct intel_context *ce = &ctx->__engine[n];
 
 		ce->gem_context = ctx;
+		ce->owner = dev_priv->engine[n];
 	}
 
 	INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL);
@@ -679,7 +684,8 @@ last_request_on_engine(struct i915_timeline *timeline,
 
 	rq = i915_gem_active_raw(&timeline->last_request,
 				 &engine->i915->drm.struct_mutex);
-	if (rq && rq->engine == engine) {
+	if (rq &&
+	    (rq->engine == engine || intel_engine_is_virtual(rq->engine))) {
 		GEM_TRACE("last request for %s on engine %s: %llx:%d\n",
 			  timeline->name, engine->name,
 			  rq->fence.context, rq->fence.seqno);
@@ -951,13 +957,82 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	return ret;
 }
 
+static int check_user_mbz64(u64 __user *user)
+{
+	u64 mbz;
+
+	if (get_user(mbz, user))
+		return -EFAULT;
+
+	return mbz ? -EINVAL : 0;
+}
+
 struct set_engines {
 	struct i915_gem_context *ctx;
 	struct intel_engine_cs **engines;
 	unsigned int nengine;
 };
 
+static int set_engines__load_balance(struct i915_user_extension __user *base,
+				     void *data)
+
+{
+	struct i915_context_engines_load_balance __user *ext =
+		container_of(base, typeof(*ext) __user, base);
+	const struct set_engines *set = data;
+	struct intel_engine_cs *ve;
+	unsigned int n;
+	u64 mask;
+	int err;
+
+	if (set->engines[0])
+		return -EEXIST;
+
+	if (!HAS_EXECLISTS(set->ctx->i915))
+		return -ENODEV;
+
+	if (USES_GUC_SUBMISSION(set->ctx->i915))
+		return -ENODEV;
+
+	if (!set->ctx->timeline)
+		return -EINVAL;
+
+	err = check_user_mbz64(&ext->flags);
+	if (err)
+		return err;
+
+	for (n = 0; n < ARRAY_SIZE(ext->mbz); n++) {
+		err = check_user_mbz64(&ext->mbz[n]);
+		if (err)
+			return err;
+	}
+
+	if (get_user(mask, &ext->engines_mask))
+		return -EFAULT;
+
+	if (mask == ~0ull) {
+		ve = intel_execlists_create_virtual(set->ctx,
+						    set->engines + 1,
+						    set->nengine);
+	} else {
+		struct intel_engine_cs *stack[64];
+		int bit;
+
+		n = 0;
+		for_each_set_bit(bit, (unsigned long *)&mask, set->nengine)
+			stack[n++] = set->engines[bit + 1];
+
+		ve = intel_execlists_create_virtual(set->ctx, stack, n);
+	}
+	if (IS_ERR(ve))
+		return PTR_ERR(ve);
+
+	set->engines[0] = ve;
+	return 0;
+}
+
 static const i915_user_extension_fn set_engines__extensions[] = {
+	[I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance,
 };
 
 static int set_engines(struct i915_gem_context *ctx,
@@ -1020,12 +1095,16 @@ static int set_engines(struct i915_gem_context *ctx,
 					   ARRAY_SIZE(set_engines__extensions),
 					   &set);
 	if (err) {
+		intel_virtual_engine_put(set.engines[0]);
 		kfree(set.engines);
 		return err;
 	}
 
 out:
-	kfree(ctx->engines);
+	if (ctx->engines) {
+		intel_virtual_engine_put(ctx->engines[0]);
+		kfree(ctx->engines);
+	}
 	ctx->engines = set.engines;
 	ctx->nengine = set.nengine + 1;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 1fd71dfdfa62..6bab994cecba 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -171,6 +171,7 @@ struct i915_gem_context {
 	struct intel_context {
 		struct i915_gem_context *gem_context;
 		struct intel_engine_cs *active;
+		struct intel_engine_cs *owner;
 		struct i915_vma *state;
 		struct intel_ring *ring;
 		u32 *lrc_reg_state;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 34d410cfa577..fda5a936bcc0 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1007,7 +1007,7 @@ void i915_request_add(struct i915_request *request)
 	prev = i915_gem_active_raw(&timeline->last_request,
 				   &request->i915->drm.struct_mutex);
 	if (prev && !i915_request_completed(prev)) {
-		if (prev->engine == engine)
+		if (prev->engine == engine && !intel_engine_is_virtual(engine))
 			i915_sw_fence_await_sw_fence(&request->submit,
 						     &prev->submit,
 						     &request->submitq);
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index 1423088dceff..cfb0a0353e15 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -335,6 +335,7 @@ static void __i915_schedule(struct i915_request *rq,
 
 		node->attr.priority = prio;
 		if (!list_empty(&node->link)) {
+			GEM_BUG_ON(intel_engine_is_virtual(engine));
 			if (last != engine) {
 				pl = i915_sched_lookup_priolist(engine, prio);
 				last = engine;
diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
index a2c2c3ab5fb0..b5321dc2d5a5 100644
--- a/drivers/gpu/drm/i915/i915_timeline.h
+++ b/drivers/gpu/drm/i915/i915_timeline.h
@@ -39,6 +39,7 @@ struct i915_timeline {
 	spinlock_t lock;
 #define TIMELINE_CLIENT 0 /* default subclass */
 #define TIMELINE_ENGINE 1
+#define TIMELINE_VIRTUAL 2
 
 	/**
 	 * List of breadcrumbs associated with GPU requests currently
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index b122d82465d0..3659f7e68348 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1444,13 +1444,11 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 {
 	const int MAX_REQUESTS_TO_SHOW = 8;
 	struct intel_breadcrumbs * const b = &engine->breadcrumbs;
-	const struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct i915_gpu_error * const error = &engine->i915->gpu_error;
-	struct i915_request *rq, *last;
+	struct i915_request *rq;
 	intel_wakeref_t wakeref;
 	unsigned long flags;
 	struct rb_node *rb;
-	int count;
 
 	if (header) {
 		va_list ap;
@@ -1515,49 +1513,9 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	}
 
 	local_irq_save(flags);
-	spin_lock(&engine->timeline.lock);
-
-	last = NULL;
-	count = 0;
-	list_for_each_entry(rq, &engine->timeline.requests, link) {
-		if (count++ < MAX_REQUESTS_TO_SHOW - 1)
-			print_request(m, rq, "\t\tE ");
-		else
-			last = rq;
-	}
-	if (last) {
-		if (count > MAX_REQUESTS_TO_SHOW) {
-			drm_printf(m,
-				   "\t\t...skipping %d executing requests...\n",
-				   count - MAX_REQUESTS_TO_SHOW);
-		}
-		print_request(m, last, "\t\tE ");
-	}
-
-	last = NULL;
-	count = 0;
-	drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
-	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
-		int i;
-
-		priolist_for_each_request(rq, p, i) {
-			if (count++ < MAX_REQUESTS_TO_SHOW - 1)
-				print_request(m, rq, "\t\tQ ");
-			else
-				last = rq;
-		}
-	}
-	if (last) {
-		if (count > MAX_REQUESTS_TO_SHOW) {
-			drm_printf(m,
-				   "\t\t...skipping %d queued requests...\n",
-				   count - MAX_REQUESTS_TO_SHOW);
-		}
-		print_request(m, last, "\t\tQ ");
-	}
 
-	spin_unlock(&engine->timeline.lock);
+	intel_execlists_show_requests(engine, m,
+				      print_request, MAX_REQUESTS_TO_SHOW);
 
 	spin_lock(&b->rb_lock);
 	for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 679ce521be16..999fb13fe0e0 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -164,6 +164,29 @@
 #define WA_TAIL_DWORDS 2
 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 
+struct virtual_engine {
+	struct intel_engine_cs base;
+
+	struct intel_context context;
+	struct kref kref;
+
+	struct intel_engine_cs *bound;
+
+	struct i915_request *request;
+	struct ve_node {
+		struct rb_node rb;
+		int prio;
+	} nodes[I915_NUM_ENGINES];
+
+	unsigned int count;
+	struct intel_engine_cs *siblings[0];
+};
+
+static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
+{
+	return container_of(engine, struct virtual_engine, base);
+}
+
 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
 					    struct intel_engine_cs *engine,
 					    struct intel_context *ce);
@@ -276,6 +299,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 	list_for_each_entry_safe_reverse(rq, rn,
 					 &engine->timeline.requests,
 					 link) {
+		struct intel_engine_cs *owner;
+
 		if (i915_request_completed(rq))
 			break;
 
@@ -284,14 +309,20 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 
 		GEM_BUG_ON(rq->hw_context->active);
 
-		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
-		if (rq_prio(rq) != prio) {
-			prio = rq_prio(rq);
-			pl = i915_sched_lookup_priolist(engine, prio);
-		}
-		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
+		owner = rq->hw_context->owner;
+		if (likely(owner == engine)) {
+			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
+			if (rq_prio(rq) != prio) {
+				prio = rq_prio(rq);
+				pl = i915_sched_lookup_priolist(engine, prio);
+			}
+			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
 
-		list_add(&rq->sched.link, pl);
+			list_add(&rq->sched.link, pl);
+		} else {
+			rq->engine = owner;
+			owner->submit_request(rq);
+		}
 
 		active = rq;
 	}
@@ -301,7 +332,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 	 * stream, so give it the equivalent small priority bump to prevent
 	 * it being gazumped a second time by another peer.
 	 */
-	if (!(prio & I915_PRIORITY_NEWCLIENT)) {
+	if (!(prio & I915_PRIORITY_NEWCLIENT) &&
+	    active->hw_context->owner == engine) {
 		prio |= I915_PRIORITY_NEWCLIENT;
 		list_move_tail(&active->sched.link,
 			       i915_sched_lookup_priolist(engine, prio));
@@ -538,6 +570,50 @@ static void complete_preempt_context(struct intel_engine_execlists *execlists)
 						  execlists));
 }
 
+static void virtual_update_register_offsets(u32 *regs,
+					    struct intel_engine_cs *engine)
+{
+	u32 base = engine->mmio_base;
+
+	regs[CTX_CONTEXT_CONTROL] =
+		i915_mmio_reg_offset(RING_CONTEXT_CONTROL(engine));
+	regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base));
+	regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base));
+	regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base));
+	regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base));
+
+	regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base));
+	regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base));
+	regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base));
+	regs[CTX_SECOND_BB_HEAD_U] =
+		i915_mmio_reg_offset(RING_SBBADDR_UDW(base));
+	regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base));
+	regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base));
+
+	regs[CTX_CTX_TIMESTAMP] =
+		i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base));
+	regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 3));
+	regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 3));
+	regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 2));
+	regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 2));
+	regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 1));
+	regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 1));
+	regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 0));
+	regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 0));
+
+	if (engine->class == RENDER_CLASS) {
+		regs[CTX_RCS_INDIRECT_CTX] =
+			i915_mmio_reg_offset(RING_INDIRECT_CTX(base));
+		regs[CTX_RCS_INDIRECT_CTX_OFFSET] =
+			i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base));
+		regs[CTX_BB_PER_CTX_PTR] =
+			i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base));
+
+		regs[CTX_R_PWR_CLK_STATE] =
+			i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
+	}
+}
+
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -547,6 +623,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	struct i915_request *last = port_request(port);
 	struct rb_node *rb;
 	bool submit = false;
+	int prio;
 
 	/*
 	 * Hardware submission is through 2 ports. Conceptually each port
@@ -570,6 +647,31 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 * and context switches) submission.
 	 */
 
+restart_virtual_engine:
+	prio = execlists->queue_priority;
+	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
+		struct virtual_engine *ve =
+			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+		struct i915_request *rq = READ_ONCE(ve->request);
+		struct intel_engine_cs *active;
+
+		if (!rq) {
+			rb_erase_cached(rb, &execlists->virtual);
+			RB_CLEAR_NODE(rb);
+			rb = rb_first_cached(&execlists->virtual);
+			continue;
+		}
+
+		active = READ_ONCE(ve->context.active);
+		if (active && active != engine) {
+			rb = rb_next(rb);
+			continue;
+		}
+
+		prio = max(prio, rq_prio(rq));
+		break;
+	}
+
 	if (last) {
 		/*
 		 * Don't resubmit or switch until all outstanding
@@ -591,7 +693,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
 			return;
 
-		if (need_preempt(engine, last, execlists->queue_priority)) {
+		if (need_preempt(engine, last, prio)) {
 			inject_preempt_context(engine);
 			return;
 		}
@@ -631,6 +733,67 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		last->tail = last->wa_tail;
 	}
 
+	if (rb) { /* XXX virtual is always taking precedence */
+		struct virtual_engine *ve =
+			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+		struct i915_request *rq;
+
+		spin_lock(&ve->base.timeline.lock);
+
+		rq = ve->request;
+		if (unlikely(!rq)) { /* lost the race to a sibling */
+			spin_unlock(&ve->base.timeline.lock);
+			goto restart_virtual_engine;
+		}
+
+		if (rq_prio(rq) >= prio) {
+			if (last && !can_merge_rq(rq, last)) {
+				spin_unlock(&ve->base.timeline.lock);
+				return;
+			}
+
+			GEM_BUG_ON(rq->engine != &ve->base);
+			ve->request = NULL;
+			ve->base.execlists.queue_priority = INT_MIN;
+			rb_erase_cached(rb, &execlists->virtual);
+			RB_CLEAR_NODE(rb);
+
+			GEM_BUG_ON(rq->hw_context != &ve->context);
+			rq->engine = engine;
+
+			if (engine != ve->bound) {
+				u32 *regs = ve->context.lrc_reg_state;
+				unsigned int n;
+
+				GEM_BUG_ON(READ_ONCE(ve->context.active));
+				virtual_update_register_offsets(regs, engine);
+				ve->bound = engine;
+
+				/*
+				 * Move the bound engine to the top of the list
+				 * for future execution. We then kick this
+				 * tasklet first before checking others, so that
+				 * we preferentially reuse this set of bound
+				 * registers.
+				 */
+				for (n = 1; n < ve->count; n++) {
+					if (ve->siblings[n] == engine) {
+						swap(ve->siblings[n],
+						     ve->siblings[0]);
+						break;
+					}
+				}
+			}
+
+			__i915_request_submit(rq);
+			trace_i915_request_in(rq, port_index(port, execlists));
+			submit = true;
+			last = rq;
+		}
+
+		spin_unlock(&ve->base.timeline.lock);
+	}
+
 	while ((rb = rb_first_cached(&execlists->queue))) {
 		struct i915_priolist *p = to_priolist(rb);
 		struct i915_request *rq, *rn;
@@ -2735,6 +2898,325 @@ void intel_lr_context_resume(struct drm_i915_private *i915)
 	}
 }
 
+static void virtual_engine_free(struct kref *kref)
+{
+	struct virtual_engine *ve = container_of(kref, typeof(*ve), kref);
+	unsigned int n;
+
+	GEM_BUG_ON(ve->request);
+	GEM_BUG_ON(ve->context.active);
+
+	for (n = 0; n < ve->count; n++) {
+		struct intel_engine_cs *sibling = ve->siblings[n];
+		struct rb_node *node = &ve->nodes[sibling->id].rb;
+
+		if (RB_EMPTY_NODE(node))
+			continue;
+
+		spin_lock_irq(&sibling->timeline.lock);
+
+		if (!RB_EMPTY_NODE(node))
+			rb_erase_cached(node, &sibling->execlists.virtual);
+
+		spin_unlock_irq(&sibling->timeline.lock);
+	}
+	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
+
+	if (ve->context.state)
+		execlists_context_destroy(&ve->context);
+
+	intel_engine_cleanup_scratch(&ve->base);
+	i915_timeline_fini(&ve->base.timeline);
+	kfree(ve);
+}
+
+static void virtual_context_unpin(struct intel_context *ce)
+{
+	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
+
+	execlists_context_unpin(ce);
+
+	kref_put(&ve->kref, virtual_engine_free);
+}
+
+static const struct intel_context_ops virtual_context_ops = {
+	.unpin = virtual_context_unpin,
+};
+
+static struct intel_context *
+virtual_context_pin(struct intel_engine_cs *engine,
+		    struct i915_gem_context *ctx)
+{
+	struct virtual_engine *ve = to_virtual_engine(engine);
+	struct intel_context *ce = &ve->context;
+
+	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
+
+	if (likely(ce->pin_count++))
+		return ce;
+	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
+
+	kref_get(&ve->kref);
+	ce->ops = &virtual_context_ops;
+
+	if (!ve->bound)
+		ve->bound = ve->siblings[0];
+
+	return __execlists_context_pin(ve->bound, ctx, ce);
+}
+
+static void virtual_submission_tasklet(unsigned long data)
+{
+	struct virtual_engine * const ve = (struct virtual_engine *)data;
+	unsigned int n;
+	int prio;
+
+	prio = READ_ONCE(ve->base.execlists.queue_priority);
+	if (prio == INT_MIN)
+		return;
+
+	local_irq_disable();
+	for (n = 0; READ_ONCE(ve->request) && n < ve->count; n++) {
+		struct intel_engine_cs *sibling = ve->siblings[n];
+		struct ve_node * const node = &ve->nodes[sibling->id];
+		struct rb_node **parent, *rb;
+		bool first;
+
+		spin_lock(&sibling->timeline.lock);
+
+		if (!RB_EMPTY_NODE(&node->rb)) {
+			first = rb_first_cached(&sibling->execlists.virtual) == &node->rb;
+			if (prio == node->prio || (prio > node->prio && first))
+				goto submit_engine;
+
+			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
+		}
+
+		rb = NULL;
+		first = true;
+		parent = &sibling->execlists.virtual.rb_root.rb_node;
+		while (*parent) {
+			struct ve_node *other;
+
+			rb = *parent;
+			other = rb_entry(rb, typeof(*other), rb);
+			if (prio > other->prio) {
+				parent = &rb->rb_left;
+			} else {
+				parent = &rb->rb_right;
+				first = false;
+			}
+		}
+
+		rb_link_node(&node->rb, rb, parent);
+		rb_insert_color_cached(&node->rb,
+				       &sibling->execlists.virtual,
+				       first);
+
+submit_engine:
+		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
+		node->prio = prio;
+		if (first && prio > sibling->execlists.queue_priority)
+			tasklet_hi_schedule(&sibling->execlists.tasklet);
+
+		spin_unlock(&sibling->timeline.lock);
+	}
+	local_irq_enable();
+}
+
+static void virtual_submit_request(struct i915_request *request)
+{
+	struct virtual_engine *ve = to_virtual_engine(request->engine);
+
+	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
+
+	GEM_BUG_ON(ve->request);
+	ve->base.execlists.queue_priority = rq_prio(request);
+	WRITE_ONCE(ve->request, request);
+
+	tasklet_schedule(&ve->base.execlists.tasklet);
+}
+
+struct intel_engine_cs *
+intel_execlists_create_virtual(struct i915_gem_context *ctx,
+			       struct intel_engine_cs **siblings,
+			       unsigned int count)
+{
+	struct virtual_engine *ve;
+	unsigned int n;
+	int err;
+
+	if (!count)
+		return ERR_PTR(-EINVAL);
+
+	ve = kzalloc(sizeof(*ve) + count * sizeof(*ve->siblings), GFP_KERNEL);
+	if (!ve)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&ve->kref);
+	ve->base.i915 = ctx->i915;
+	ve->base.id = -1;
+	ve->base.class = OTHER_CLASS;
+	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
+	ve->base.flags = I915_ENGINE_IS_VIRTUAL;
+
+	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
+	i915_timeline_init(ctx->i915, &ve->base.timeline, ve->base.name);
+	lockdep_set_subclass(&ve->base.timeline.lock, TIMELINE_VIRTUAL);
+
+	err = intel_engine_create_scratch(&ve->base, 4096);
+	if (err)
+		goto err_put;
+
+	ve->context.gem_context = ctx;
+	ve->context.owner = &ve->base;
+
+	ve->base.context_pin = virtual_context_pin;
+	ve->base.request_alloc = execlists_request_alloc;
+
+	ve->base.schedule = i915_schedule;
+	ve->base.submit_request = virtual_submit_request;
+
+	ve->base.execlists.queue_priority = INT_MIN;
+	tasklet_init(&ve->base.execlists.tasklet,
+		     virtual_submission_tasklet,
+		     (unsigned long)ve);
+
+	ve->count = count;
+	for (n = 0; n < count; n++) {
+		struct intel_engine_cs *sibling = siblings[n];
+
+		ve->siblings[n] = sibling;
+
+		if (sibling->execlists.tasklet.func != execlists_submission_tasklet) {
+			err = -ENODEV;
+			ve->count = n;
+			goto err_put;
+		}
+
+		if (RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)) {
+			err = -EINVAL;
+			ve->count = n;
+			goto err_put;
+		}
+
+		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
+
+		if (ve->base.class != OTHER_CLASS) {
+			if (ve->base.class != sibling->class) {
+				err = -EINVAL;
+				ve->count = n;
+				goto err_put;
+			}
+			continue;
+		}
+
+		ve->base.class = sibling->class;
+		snprintf(ve->base.name, sizeof(ve->base.name),
+			 "v%dx%d", ve->base.class, count);
+		ve->base.context_size = sibling->context_size;
+
+		ve->base.emit_bb_start = sibling->emit_bb_start;
+		ve->base.emit_flush = sibling->emit_flush;
+		ve->base.emit_breadcrumb = sibling->emit_breadcrumb;
+		ve->base.emit_breadcrumb_sz = sibling->emit_breadcrumb_sz;
+	}
+
+	return &ve->base;
+
+err_put:
+	virtual_engine_free(&ve->kref);
+	return ERR_PTR(err);
+}
+
+void intel_virtual_engine_put(struct intel_engine_cs *engine)
+{
+	if (!engine)
+		return;
+
+	kref_put(&to_virtual_engine(engine)->kref, virtual_engine_free);
+}
+
+void intel_execlists_show_requests(struct intel_engine_cs *engine,
+				   struct drm_printer *m,
+				   void (*show_request)(struct drm_printer *m,
+							struct i915_request *rq,
+							const char *prefix),
+				   int max)
+{
+	const struct intel_engine_execlists *execlists = &engine->execlists;
+	struct i915_request *rq, *last;
+	struct rb_node *rb;
+	int count;
+
+	spin_lock(&engine->timeline.lock);
+
+	last = NULL;
+	count = 0;
+	list_for_each_entry(rq, &engine->timeline.requests, link) {
+		if (count++ < max - 1)
+			show_request(m, rq, "\t\tE ");
+		else
+			last = rq;
+	}
+	if (last) {
+		if (count > max) {
+			drm_printf(m,
+				   "\t\t...skipping %d executing requests...\n",
+				   count - max);
+		}
+		show_request(m, last, "\t\tE ");
+	}
+
+	last = NULL;
+	count = 0;
+	drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
+	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
+		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+		int i;
+
+		priolist_for_each_request(rq, p, i) {
+			if (count++ < max - 1)
+				show_request(m, rq, "\t\tQ ");
+			else
+				last = rq;
+		}
+	}
+	if (last) {
+		if (count > max) {
+			drm_printf(m,
+				   "\t\t...skipping %d queued requests...\n",
+				   count - max);
+		}
+		show_request(m, last, "\t\tQ ");
+	}
+
+	last = NULL;
+	count = 0;
+	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
+		struct virtual_engine *ve =
+			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+		struct i915_request *rq = READ_ONCE(ve->request);
+
+		if (rq) {
+			if (count++ < max - 1)
+				show_request(m, rq, "\t\tV ");
+			else
+				last = rq;
+		}
+	}
+	if (last) {
+		if (count > max) {
+			drm_printf(m,
+				   "\t\t...skipping %d virtual requests...\n",
+				   count - max);
+		}
+		show_request(m, last, "\t\tV ");
+	}
+
+	spin_unlock(&engine->timeline.lock);
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftests/intel_lrc.c"
 #endif
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index f5a5502ecf70..1d96d3bbca3c 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -97,11 +97,25 @@ int logical_xcs_ring_init(struct intel_engine_cs *engine);
  */
 #define LRC_HEADER_PAGES LRC_PPHWSP_PN
 
+struct drm_printer;
+
 struct drm_i915_private;
 struct i915_gem_context;
 
 void intel_lr_context_resume(struct drm_i915_private *dev_priv);
-
 void intel_execlists_set_default_submission(struct intel_engine_cs *engine);
 
+struct intel_engine_cs *
+intel_execlists_create_virtual(struct i915_gem_context *ctx,
+			       struct intel_engine_cs **siblings,
+			       unsigned int count);
+void intel_virtual_engine_put(struct intel_engine_cs *engine);
+
+void intel_execlists_show_requests(struct intel_engine_cs *engine,
+				   struct drm_printer *m,
+				   void (*show_request)(struct drm_printer *m,
+							struct i915_request *rq,
+							const char *prefix),
+				   int max);
+
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index f6ec48a75a69..ec772d836f49 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -311,6 +311,7 @@ struct intel_engine_execlists {
 	 * @queue: queue of requests, in priority lists
 	 */
 	struct rb_root_cached queue;
+	struct rb_root_cached virtual;
 
 	/**
 	 * @csb_read: control register for Context Switch buffer
@@ -598,6 +599,7 @@ struct intel_engine_cs {
 #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
 #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
+#define I915_ENGINE_IS_VIRTUAL       BIT(3)
 	unsigned int flags;
 
 	/*
@@ -680,6 +682,12 @@ static inline bool __execlists_need_preempt(int prio, int last)
 	return prio > max(0, last);
 }
 
+static inline bool
+intel_engine_is_virtual(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_IS_VIRTUAL;
+}
+
 static inline void
 execlists_set_active(struct intel_engine_execlists *execlists,
 		     unsigned int bit)
diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index b6e11b58b3c9..0fb7a6f9aedb 100644
--- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
@@ -4,6 +4,8 @@
  * Copyright © 2018 Intel Corporation
  */
 
+#include <linux/prime_numbers.h>
+
 #include "../i915_reset.h"
 
 #include "../i915_selftest.h"
@@ -581,6 +583,219 @@ static int live_preempt_hang(void *arg)
 	return err;
 }
 
+struct live_test {
+	struct drm_i915_private *i915;
+	const char *func;
+	const char *name;
+
+	unsigned int reset_count;
+	bool wedge;
+};
+
+static int begin_live_test(struct live_test *t,
+			   struct drm_i915_private *i915,
+			   const char *func,
+			   const char *name)
+{
+	t->i915 = i915;
+	t->func = func;
+	t->name = name;
+
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		return -EIO;
+
+	i915->gpu_error.missed_irq_rings = 0;
+	t->reset_count = i915_reset_count(&i915->gpu_error);
+
+	return 0;
+}
+
+static int end_live_test(struct live_test *t)
+{
+	struct drm_i915_private *i915 = t->i915;
+
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		return -EIO;
+
+	if (t->reset_count != i915_reset_count(&i915->gpu_error)) {
+		pr_err("%s(%s): GPU was reset %d times!\n",
+		       t->func, t->name,
+		       i915_reset_count(&i915->gpu_error) - t->reset_count);
+		return -EIO;
+	}
+
+	if (i915->gpu_error.missed_irq_rings) {
+		pr_err("%s(%s): Missed interrupts on engines %lx\n",
+		       t->func, t->name, i915->gpu_error.missed_irq_rings);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int nop_virtual_engine(struct drm_i915_private *i915,
+			      struct intel_engine_cs **siblings,
+			      unsigned int nsibling,
+			      unsigned int nctx,
+			      unsigned int flags)
+#define CHAIN BIT(0)
+{
+	IGT_TIMEOUT(end_time);
+	struct i915_request *request[16];
+	struct i915_gem_context *ctx[16];
+	struct intel_engine_cs *ve[16];
+	unsigned long n, prime, nc;
+	ktime_t times[2] = {};
+	struct live_test t;
+	int err;
+
+	GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ctx));
+
+	for (n = 0; n < nctx; n++) {
+		ctx[n] = kernel_context(i915);
+		if (!ctx[n])
+			return -ENOMEM;
+
+		ve[n] = intel_execlists_create_virtual(ctx[n],
+						       siblings, nsibling);
+		if (IS_ERR(ve[n]))
+			return PTR_ERR(ve[n]);
+	}
+
+	err = begin_live_test(&t, i915, __func__, ve[0]->name);
+	if (err)
+		goto out;
+
+	for_each_prime_number_from(prime, 1, 8192) {
+		times[1] = ktime_get_raw();
+
+		if (flags & CHAIN) {
+			for (nc = 0; nc < nctx; nc++) {
+				for (n = 0; n < prime; n++) {
+					request[nc] =
+						i915_request_alloc(ve[nc], ctx[nc]);
+					if (IS_ERR(request[nc])) {
+						err = PTR_ERR(request[nc]);
+						goto out;
+					}
+
+					i915_request_add(request[nc]);
+				}
+			}
+		} else {
+			for (n = 0; n < prime; n++) {
+				for (nc = 0; nc < nctx; nc++) {
+					request[nc] =
+						i915_request_alloc(ve[nc], ctx[nc]);
+					if (IS_ERR(request[nc])) {
+						err = PTR_ERR(request[nc]);
+						goto out;
+					}
+
+					i915_request_add(request[nc]);
+				}
+			}
+		}
+
+		for (nc = 0; nc < nctx; nc++) {
+			if (i915_request_wait(request[nc],
+					      I915_WAIT_LOCKED,
+					      HZ / 10) < 0) {
+				pr_err("%s(%s): wait for %llx:%d timed out\n",
+				       __func__, ve[0]->name,
+				       request[nc]->fence.context,
+				       request[nc]->fence.seqno);
+
+				GEM_TRACE("%s(%s) failed at request %llx:%d\n",
+					  __func__, ve[0]->name,
+					  request[nc]->fence.context,
+					  request[nc]->fence.seqno);
+				GEM_TRACE_DUMP();
+				i915_gem_set_wedged(i915);
+				break;
+			}
+		}
+
+		times[1] = ktime_sub(ktime_get_raw(), times[1]);
+		if (prime == 1)
+			times[0] = times[1];
+
+		if (__igt_timeout(end_time, NULL))
+			break;
+	}
+
+	err = end_live_test(&t);
+	if (err)
+		goto out;
+
+	pr_info("Requestx%d latencies on %s: 1 = %lluns, %lu = %lluns\n",
+		nctx, ve[0]->name, ktime_to_ns(times[0]),
+		prime, div64_u64(ktime_to_ns(times[1]), prime));
+
+out:
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		err = -EIO;
+
+	for (nc = 0; nc < nctx; nc++) {
+		intel_virtual_engine_put(ve[nc]);
+		kernel_context_close(ctx[nc]);
+	}
+	return err;
+}
+
+static int live_virtual_engine(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned int class, inst;
+	int err = -ENODEV;
+
+	if (USES_GUC_SUBMISSION(i915))
+		return 0;
+
+	mutex_lock(&i915->drm.struct_mutex);
+
+	for_each_engine(engine, i915, id) {
+		err = nop_virtual_engine(i915, &engine, 1, 1, 0);
+		if (err) {
+			pr_err("Failed to wrap engine %s: err=%d\n",
+			       engine->name, err);
+			goto out_unlock;
+		}
+	}
+
+	for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
+		int nsibling, n;
+
+		nsibling = 0;
+		for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
+			if (!i915->engine_class[class][inst])
+				break;
+
+			siblings[nsibling++] = i915->engine_class[class][inst];
+		}
+		if (nsibling < 2)
+			continue;
+
+		for (n = 1; n <= nsibling + 1; n++) {
+			err = nop_virtual_engine(i915, siblings, nsibling,
+						 n, 0);
+			if (err)
+				goto out_unlock;
+		}
+
+		err = nop_virtual_engine(i915, siblings, nsibling, n, CHAIN);
+		if (err)
+			goto out_unlock;
+	}
+
+out_unlock:
+	mutex_unlock(&i915->drm.struct_mutex);
+	return err;
+}
+
 int intel_execlists_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
@@ -588,6 +803,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_preempt),
 		SUBTEST(live_late_preempt),
 		SUBTEST(live_preempt_hang),
+		SUBTEST(live_virtual_engine),
 	};
 
 	if (!HAS_EXECLISTS(i915))
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index d41b4c673af4..6dd2a89410e8 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1523,14 +1523,41 @@ struct drm_i915_gem_context_param {
  * default settings.
  *
  * See struct i915_context_param_engines.
+ *
+ * Extensions:
+ *   i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE)
  */
 #define I915_CONTEXT_PARAM_ENGINES	0x7
 
 	__u64 value;
 };
 
+/*
+ * i915_context_engines_load_balance:
+ *
+ * Enable load balancing across this set of engines.
+ *
+ * Into the I915_EXEC_DEFAULT slot, a virtual engine is created that when
+ * used will proxy the execbuffer request onto one of the set of engines
+ * in such a way as to distribute the load evenly across the set.
+ *
+ * The set of engines must be compatible (e.g. the same HW class) as they
+ * will share the same logical GPU context and ring.
+ *
+ * The context must be defined to use a single timeline for all engines.
+ */
+struct i915_context_engines_load_balance {
+	struct i915_user_extension base;
+
+	__u64 flags; /* all undefined flags must be zero */
+	__u64 engines_mask;
+
+	__u64 mbz[4]; /* reserved for future use; must be zero */
+};
+
 struct i915_context_param_engines {
 	__u64 extensions;
+#define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0
 
 	struct {
 		__u32 class; /* see enum drm_i915_gem_engine_class */
-- 
2.19.0