[Intel-gfx] [PATCH 40/40] drm/i915: Load balancing across a virtual engine
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Mon Oct 1 11:37:12 UTC 2018
On 19/09/2018 20:55, Chris Wilson wrote:
> Having allowed the user to define a set of engines that they will want
> to only use, we go one step further and allow them to bind those engines
> into a single virtual instance. Submitting a batch to the virtual engine
> will then forward it to any one of the set in a manner as best to
> distribute load. The virtual engine has a single timeline across all
> engines (it operates as a single queue), so it is not able to concurrently
> run batches across multiple engines by itself; that is left up to the user
> to submit multiple concurrent batches to multiple queues. Multiple users
> will be load balanced across the system.
>
> The mechanism used for load balancing in this patch is a late greedy
> balancer. When a request is ready for execution, it is added to each
> engine's queue, and when an engine is ready for its next request it
> claims it from the virtual engine. The first engine to do so, wins, i.e.
> the request is executed at the earliest opportunity (idle moment) in the
> system.
>
> As not all HW is created equal, the user is still able to skip the
> virtual engine and execute the batch on a specific engine, all within the
> same queue. It will then be executed in order on the correct engine,
> with execution on other virtual engines being moved away due to the load
> detection.
I think a few paragraphs describing the implementation are warranted. To
help current and future readers. :)
> A couple of areas for potential improvement left!
>
> - The virtual engine always takes priority over equal-priority tasks.
> Mostly broken up by applying FQ_CODEL rules for prioritising new clients,
> and hopefully the virtual and real engines are not then congested (i.e.
> all work is via virtual engines, or all work is to the real engine).
>
> - We require the breadcrumb irq around every virtual engine request. For
> normal engines, we eliminate the need for the slow round trip via
> interrupt by using the submit fence and queueing in order. For virtual
> engines, we have to allow any job to transfer to a new ring, and cannot
> coalesce the submissions, so require the completion fence instead,
> forcing the persistent use of interrupts.
>
> Other areas of improvement are more general, such as reducing lock
> contention, reducing dispatch overhead, looking at direct submission
> rather than bouncing around tasklets etc.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_gem.h | 5 +
> drivers/gpu/drm/i915/i915_gem_context.c | 85 +++-
> drivers/gpu/drm/i915/i915_gem_context.h | 1 +
> drivers/gpu/drm/i915/i915_request.c | 2 +-
> drivers/gpu/drm/i915/i915_scheduler.c | 1 +
> drivers/gpu/drm/i915/i915_timeline.h | 1 +
> drivers/gpu/drm/i915/intel_engine_cs.c | 48 +-
> drivers/gpu/drm/i915/intel_lrc.c | 500 ++++++++++++++++++++-
> drivers/gpu/drm/i915/intel_lrc.h | 16 +-
> drivers/gpu/drm/i915/intel_ringbuffer.h | 8 +
> drivers/gpu/drm/i915/selftests/intel_lrc.c | 216 +++++++++
> include/uapi/drm/i915_drm.h | 27 ++
> 12 files changed, 851 insertions(+), 59 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
> index 599c4f6eb1ea..69d137786700 100644
> --- a/drivers/gpu/drm/i915/i915_gem.h
> +++ b/drivers/gpu/drm/i915/i915_gem.h
> @@ -87,4 +87,9 @@ static inline bool __tasklet_is_enabled(const struct tasklet_struct *t)
> return !atomic_read(&t->count);
> }
>
> +static inline bool __tasklet_is_scheduled(struct tasklet_struct *t)
> +{
> + return test_bit(TASKLET_STATE_SCHED, &t->state);
> +}
> +
> #endif /* __I915_GEM_H__ */
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 313471253f51..0ee39772df03 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -91,6 +91,7 @@
> #include "i915_drv.h"
> #include "i915_trace.h"
> #include "i915_user_extensions.h"
> +#include "intel_lrc.h"
> #include "intel_workarounds.h"
>
> #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
> @@ -224,7 +225,10 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
> ce->ops->destroy(ce);
> }
>
> - kfree(ctx->engines);
> + if (ctx->engines) {
> + intel_virtual_engine_put(ctx->engines[0]);
> + kfree(ctx->engines);
> + }
>
> if (ctx->timeline)
> i915_timeline_put(ctx->timeline);
> @@ -351,6 +355,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
> struct intel_context *ce = &ctx->__engine[n];
>
> ce->gem_context = ctx;
> + ce->owner = dev_priv->engine[n];
So owner is actually the engine backpointer AFAICT; it doesn't change as
the context moves around? Call it engine in that case?
> }
>
> INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL);
> @@ -679,7 +684,8 @@ last_request_on_engine(struct i915_timeline *timeline,
>
> rq = i915_gem_active_raw(&timeline->last_request,
> &engine->i915->drm.struct_mutex);
> - if (rq && rq->engine == engine) {
> + if (rq &&
> + (rq->engine == engine || intel_engine_is_virtual(rq->engine))) {
> GEM_TRACE("last request for %s on engine %s: %llx:%d\n",
> timeline->name, engine->name,
> rq->fence.context, rq->fence.seqno);
> @@ -951,13 +957,82 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
> return ret;
> }
>
> +static int check_user_mbz64(u64 __user *user)
> +{
> + u64 mbz;
> +
> + if (get_user(mbz, user))
> + return -EFAULT;
> +
> + return mbz ? -EINVAL : 0;
> +}
> +
> struct set_engines {
> struct i915_gem_context *ctx;
> struct intel_engine_cs **engines;
> unsigned int nengine;
> };
>
> +static int set_engines__load_balance(struct i915_user_extension __user *base,
> + void *data)
> +
> +{
> + struct i915_context_engines_load_balance __user *ext =
> + container_of(base, typeof(*ext) __user, base);
> + const struct set_engines *set = data;
> + struct intel_engine_cs *ve;
> + unsigned int n;
> + u64 mask;
> + int err;
> +
> + if (set->engines[0])
> + return -EEXIST;
> +
> + if (!HAS_EXECLISTS(set->ctx->i915))
> + return -ENODEV;
> +
> + if (USES_GUC_SUBMISSION(set->ctx->i915))
> + return -ENODEV;
> +
> + if (!set->ctx->timeline)
> + return -EINVAL;
> +
> + err = check_user_mbz64(&ext->flags);
> + if (err)
> + return err;
> +
> + for (n = 0; n < ARRAY_SIZE(ext->mbz); n++) {
> + err = check_user_mbz64(&ext->mbz[n]);
> + if (err)
> + return err;
> + }
> +
> + if (get_user(mask, &ext->engines_mask))
> + return -EFAULT;
> +
> + if (mask == ~0ull) {
> + ve = intel_execlists_create_virtual(set->ctx,
> + set->engines + 1,
> + set->nengine);
You think it is worth special casing this instead of just letting the
branch below handle it?
> + } else {
> + struct intel_engine_cs *stack[64];
This is a bit chunky stack allocation. We could limit to
I915_NUM_ENGINES, if also limiting the set engines uAPI like that, to
disallow specifying the same engine multiple times?
> + int bit;
> +
> + n = 0;
> + for_each_set_bit(bit, (unsigned long *)&mask, set->nengine)
> + stack[n++] = set->engines[bit + 1];
> +
> + ve = intel_execlists_create_virtual(set->ctx, stack, n);
> + }
> + if (IS_ERR(ve))
> + return PTR_ERR(ve);
> +
> + set->engines[0] = ve;
> + return 0;
> +}
> +
> static const i915_user_extension_fn set_engines__extensions[] = {
> + [I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance,
> };
>
> static int set_engines(struct i915_gem_context *ctx,
> @@ -1020,12 +1095,16 @@ static int set_engines(struct i915_gem_context *ctx,
> ARRAY_SIZE(set_engines__extensions),
> &set);
> if (err) {
> + intel_virtual_engine_put(set.engines[0]);
> kfree(set.engines);
> return err;
> }
>
> out:
> - kfree(ctx->engines);
> + if (ctx->engines) {
> + intel_virtual_engine_put(ctx->engines[0]);
> + kfree(ctx->engines);
> + }
> ctx->engines = set.engines;
> ctx->nengine = set.nengine + 1;
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 1fd71dfdfa62..6bab994cecba 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -171,6 +171,7 @@ struct i915_gem_context {
> struct intel_context {
> struct i915_gem_context *gem_context;
> struct intel_engine_cs *active;
> + struct intel_engine_cs *owner;
> struct i915_vma *state;
> struct intel_ring *ring;
> u32 *lrc_reg_state;
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 34d410cfa577..fda5a936bcc0 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1007,7 +1007,7 @@ void i915_request_add(struct i915_request *request)
> prev = i915_gem_active_raw(&timeline->last_request,
> &request->i915->drm.struct_mutex);
> if (prev && !i915_request_completed(prev)) {
> - if (prev->engine == engine)
> + if (prev->engine == engine && !intel_engine_is_virtual(engine))
> i915_sw_fence_await_sw_fence(&request->submit,
> &prev->submit,
> &request->submitq);
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index 1423088dceff..cfb0a0353e15 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -335,6 +335,7 @@ static void __i915_schedule(struct i915_request *rq,
>
> node->attr.priority = prio;
> if (!list_empty(&node->link)) {
> + GEM_BUG_ON(intel_engine_is_virtual(engine));
> if (last != engine) {
> pl = i915_sched_lookup_priolist(engine, prio);
> last = engine;
> diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
> index a2c2c3ab5fb0..b5321dc2d5a5 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.h
> +++ b/drivers/gpu/drm/i915/i915_timeline.h
> @@ -39,6 +39,7 @@ struct i915_timeline {
> spinlock_t lock;
> #define TIMELINE_CLIENT 0 /* default subclass */
> #define TIMELINE_ENGINE 1
> +#define TIMELINE_VIRTUAL 2
>
> /**
> * List of breadcrumbs associated with GPU requests currently
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index b122d82465d0..3659f7e68348 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1444,13 +1444,11 @@ void intel_engine_dump(struct intel_engine_cs *engine,
> {
> const int MAX_REQUESTS_TO_SHOW = 8;
> struct intel_breadcrumbs * const b = &engine->breadcrumbs;
> - const struct intel_engine_execlists * const execlists = &engine->execlists;
> struct i915_gpu_error * const error = &engine->i915->gpu_error;
> - struct i915_request *rq, *last;
> + struct i915_request *rq;
> intel_wakeref_t wakeref;
> unsigned long flags;
> struct rb_node *rb;
> - int count;
>
> if (header) {
> va_list ap;
> @@ -1515,49 +1513,9 @@ void intel_engine_dump(struct intel_engine_cs *engine,
> }
>
> local_irq_save(flags);
> - spin_lock(&engine->timeline.lock);
> -
> - last = NULL;
> - count = 0;
> - list_for_each_entry(rq, &engine->timeline.requests, link) {
> - if (count++ < MAX_REQUESTS_TO_SHOW - 1)
> - print_request(m, rq, "\t\tE ");
> - else
> - last = rq;
> - }
> - if (last) {
> - if (count > MAX_REQUESTS_TO_SHOW) {
> - drm_printf(m,
> - "\t\t...skipping %d executing requests...\n",
> - count - MAX_REQUESTS_TO_SHOW);
> - }
> - print_request(m, last, "\t\tE ");
> - }
> -
> - last = NULL;
> - count = 0;
> - drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
> - for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
> - struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> - int i;
> -
> - priolist_for_each_request(rq, p, i) {
> - if (count++ < MAX_REQUESTS_TO_SHOW - 1)
> - print_request(m, rq, "\t\tQ ");
> - else
> - last = rq;
> - }
> - }
> - if (last) {
> - if (count > MAX_REQUESTS_TO_SHOW) {
> - drm_printf(m,
> - "\t\t...skipping %d queued requests...\n",
> - count - MAX_REQUESTS_TO_SHOW);
> - }
> - print_request(m, last, "\t\tQ ");
> - }
>
> - spin_unlock(&engine->timeline.lock);
> + intel_execlists_show_requests(engine, m,
> + print_request, MAX_REQUESTS_TO_SHOW);
>
> spin_lock(&b->rb_lock);
> for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 679ce521be16..999fb13fe0e0 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -164,6 +164,29 @@
> #define WA_TAIL_DWORDS 2
> #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
>
> +struct virtual_engine {
> + struct intel_engine_cs base;
> +
> + struct intel_context context;
hw_context?
> + struct kref kref;
> +
> + struct intel_engine_cs *bound;
> +
> + struct i915_request *request;
Not rq? ;)
> + struct ve_node {
> + struct rb_node rb;
> + int prio;
> + } nodes[I915_NUM_ENGINES];
> +
> + unsigned int count;
sibling_count / num_siblings? Hm.. or are they actually children?
> + struct intel_engine_cs *siblings[0];
> +};
For the whole struct - we will need comments on the members to give some
hints on the implementation, lifecycles and similar.
> +
> +static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
> +{
> + return container_of(engine, struct virtual_engine, base);
> +}
> +
> static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
> struct intel_engine_cs *engine,
> struct intel_context *ce);
> @@ -276,6 +299,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
> list_for_each_entry_safe_reverse(rq, rn,
> &engine->timeline.requests,
> link) {
> + struct intel_engine_cs *owner;
> +
> if (i915_request_completed(rq))
> break;
>
> @@ -284,14 +309,20 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>
> GEM_BUG_ON(rq->hw_context->active);
>
> - GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> - if (rq_prio(rq) != prio) {
> - prio = rq_prio(rq);
> - pl = i915_sched_lookup_priolist(engine, prio);
> - }
> - GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
> + owner = rq->hw_context->owner;
> + if (likely(owner == engine)) {
> + GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> + if (rq_prio(rq) != prio) {
> + prio = rq_prio(rq);
> + pl = i915_sched_lookup_priolist(engine, prio);
> + }
> + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
>
> - list_add(&rq->sched.link, pl);
> + list_add(&rq->sched.link, pl);
> + } else {
> + rq->engine = owner;
> + owner->submit_request(rq);
Comment here.
> + }
>
> active = rq;
> }
> @@ -301,7 +332,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
> * stream, so give it the equivalent small priority bump to prevent
> * it being gazumped a second time by another peer.
> */
> - if (!(prio & I915_PRIORITY_NEWCLIENT)) {
> + if (!(prio & I915_PRIORITY_NEWCLIENT) &&
> + active->hw_context->owner == engine) {
And add a comment paragraph above for this special consideration.
> prio |= I915_PRIORITY_NEWCLIENT;
> list_move_tail(&active->sched.link,
> i915_sched_lookup_priolist(engine, prio));
> @@ -538,6 +570,50 @@ static void complete_preempt_context(struct intel_engine_execlists *execlists)
> execlists));
> }
>
> +static void virtual_update_register_offsets(u32 *regs,
> + struct intel_engine_cs *engine)
> +{
> + u32 base = engine->mmio_base;
> +
> + regs[CTX_CONTEXT_CONTROL] =
> + i915_mmio_reg_offset(RING_CONTEXT_CONTROL(engine));
> + regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base));
> + regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base));
> + regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base));
> + regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base));
> +
> + regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base));
> + regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base));
> + regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base));
> + regs[CTX_SECOND_BB_HEAD_U] =
> + i915_mmio_reg_offset(RING_SBBADDR_UDW(base));
> + regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base));
> + regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base));
> +
> + regs[CTX_CTX_TIMESTAMP] =
> + i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base));
> + regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 3));
> + regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 3));
> + regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 2));
> + regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 2));
> + regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 1));
> + regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 1));
> + regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 0));
> + regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 0));
> +
> + if (engine->class == RENDER_CLASS) {
> + regs[CTX_RCS_INDIRECT_CTX] =
> + i915_mmio_reg_offset(RING_INDIRECT_CTX(base));
> + regs[CTX_RCS_INDIRECT_CTX_OFFSET] =
> + i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base));
> + regs[CTX_BB_PER_CTX_PTR] =
> + i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base));
> +
> + regs[CTX_R_PWR_CLK_STATE] =
> + i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
> + }
> +}
> +
> static void execlists_dequeue(struct intel_engine_cs *engine)
> {
> struct intel_engine_execlists * const execlists = &engine->execlists;
> @@ -547,6 +623,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> struct i915_request *last = port_request(port);
> struct rb_node *rb;
> bool submit = false;
> + int prio;
>
> /*
> * Hardware submission is through 2 ports. Conceptually each port
> @@ -570,6 +647,31 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> * and context switches) submission.
> */
>
> +restart_virtual_engine:
> + prio = execlists->queue_priority;
> + for (rb = rb_first_cached(&execlists->virtual); rb; ) {
> + struct virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + struct i915_request *rq = READ_ONCE(ve->request);
> + struct intel_engine_cs *active;
> +
> + if (!rq) {
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> + rb = rb_first_cached(&execlists->virtual);
> + continue;
> + }
> +
> + active = READ_ONCE(ve->context.active);
> + if (active && active != engine) {
> + rb = rb_next(rb);
> + continue;
> + }
> +
> + prio = max(prio, rq_prio(rq));
> + break;
> + }
> +
> if (last) {
> /*
> * Don't resubmit or switch until all outstanding
> @@ -591,7 +693,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
> return;
>
> - if (need_preempt(engine, last, execlists->queue_priority)) {
> + if (need_preempt(engine, last, prio)) {
> inject_preempt_context(engine);
> return;
> }
> @@ -631,6 +733,67 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> last->tail = last->wa_tail;
> }
>
> + if (rb) { /* XXX virtual is always taking precedence */
> + struct virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + struct i915_request *rq;
> +
> + spin_lock(&ve->base.timeline.lock);
> +
> + rq = ve->request;
> + if (unlikely(!rq)) { /* lost the race to a sibling */
> + spin_unlock(&ve->base.timeline.lock);
> + goto restart_virtual_engine;
> + }
> +
> + if (rq_prio(rq) >= prio) {
> + if (last && !can_merge_rq(rq, last)) {
> + spin_unlock(&ve->base.timeline.lock);
> + return;
> + }
> +
> + GEM_BUG_ON(rq->engine != &ve->base);
> + ve->request = NULL;
> + ve->base.execlists.queue_priority = INT_MIN;
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> +
> + GEM_BUG_ON(rq->hw_context != &ve->context);
> + rq->engine = engine;
> +
> + if (engine != ve->bound) {
> + u32 *regs = ve->context.lrc_reg_state;
> + unsigned int n;
> +
> + GEM_BUG_ON(READ_ONCE(ve->context.active));
> + virtual_update_register_offsets(regs, engine);
> + ve->bound = engine;
> +
> + /*
> + * Move the bound engine to the top of the list
> + * for future execution. We then kick this
> + * tasklet first before checking others, so that
> + * we preferentially reuse this set of bound
> + * registers.
> + */
> + for (n = 1; n < ve->count; n++) {
> + if (ve->siblings[n] == engine) {
> + swap(ve->siblings[n],
> + ve->siblings[0]);
> + break;
> + }
> + }
> + }
> +
> + __i915_request_submit(rq);
> + trace_i915_request_in(rq, port_index(port, execlists));
> + submit = true;
> + last = rq;
> + }
> +
> + spin_unlock(&ve->base.timeline.lock);
> + }
> +
> while ((rb = rb_first_cached(&execlists->queue))) {
> struct i915_priolist *p = to_priolist(rb);
> struct i915_request *rq, *rn;
> @@ -2735,6 +2898,325 @@ void intel_lr_context_resume(struct drm_i915_private *i915)
> }
> }
>
> +static void virtual_engine_free(struct kref *kref)
> +{
> + struct virtual_engine *ve = container_of(kref, typeof(*ve), kref);
> + unsigned int n;
> +
> + GEM_BUG_ON(ve->request);
> + GEM_BUG_ON(ve->context.active);
> +
> + for (n = 0; n < ve->count; n++) {
> + struct intel_engine_cs *sibling = ve->siblings[n];
> + struct rb_node *node = &ve->nodes[sibling->id].rb;
> +
> + if (RB_EMPTY_NODE(node))
> + continue;
> +
> + spin_lock_irq(&sibling->timeline.lock);
> +
> + if (!RB_EMPTY_NODE(node))
> + rb_erase_cached(node, &sibling->execlists.virtual);
> +
> + spin_unlock_irq(&sibling->timeline.lock);
> + }
> + GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
> +
> + if (ve->context.state)
> + execlists_context_destroy(&ve->context);
> +
> + intel_engine_cleanup_scratch(&ve->base);
> + i915_timeline_fini(&ve->base.timeline);
> + kfree(ve);
> +}
> +
> +static void virtual_context_unpin(struct intel_context *ce)
> +{
> + struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
> +
> + execlists_context_unpin(ce);
> +
> + kref_put(&ve->kref, virtual_engine_free);
Hm.. freed on context complete? How does this work.. reading on.
> +}
> +
> +static const struct intel_context_ops virtual_context_ops = {
> + .unpin = virtual_context_unpin,
> +};
> +
> +static struct intel_context *
> +virtual_context_pin(struct intel_engine_cs *engine,
> + struct i915_gem_context *ctx)
> +{
> + struct virtual_engine *ve = to_virtual_engine(engine);
> + struct intel_context *ce = &ve->context;
> +
> + lockdep_assert_held(&ctx->i915->drm.struct_mutex);
> +
> + if (likely(ce->pin_count++))
> + return ce;
> + GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
> +
> + kref_get(&ve->kref);
> + ce->ops = &virtual_context_ops;
> +
> + if (!ve->bound)
> + ve->bound = ve->siblings[0];
> +
> + return __execlists_context_pin(ve->bound, ctx, ce);
> +}
> +
> +static void virtual_submission_tasklet(unsigned long data)
> +{
> + struct virtual_engine * const ve = (struct virtual_engine *)data;
> + unsigned int n;
> + int prio;
> +
> + prio = READ_ONCE(ve->base.execlists.queue_priority);
> + if (prio == INT_MIN)
> + return;
What is this checking for?
> +
> + local_irq_disable();
> + for (n = 0; READ_ONCE(ve->request) && n < ve->count; n++) {
> + struct intel_engine_cs *sibling = ve->siblings[n];
> + struct ve_node * const node = &ve->nodes[sibling->id];
> + struct rb_node **parent, *rb;
> + bool first;
> +
> + spin_lock(&sibling->timeline.lock);
> +
> + if (!RB_EMPTY_NODE(&node->rb)) {
> + first = rb_first_cached(&sibling->execlists.virtual) == &node->rb;
> + if (prio == node->prio || (prio > node->prio && first))
> + goto submit_engine;
> +
> + rb_erase_cached(&node->rb, &sibling->execlists.virtual);
> + }
> +
> + rb = NULL;
> + first = true;
> + parent = &sibling->execlists.virtual.rb_root.rb_node;
> + while (*parent) {
> + struct ve_node *other;
> +
> + rb = *parent;
> + other = rb_entry(rb, typeof(*other), rb);
> + if (prio > other->prio) {
> + parent = &rb->rb_left;
> + } else {
> + parent = &rb->rb_right;
> + first = false;
> + }
> + }
> +
> + rb_link_node(&node->rb, rb, parent);
> + rb_insert_color_cached(&node->rb,
> + &sibling->execlists.virtual,
> + first);
> +
> +submit_engine:
> + GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
> + node->prio = prio;
> + if (first && prio > sibling->execlists.queue_priority)
> + tasklet_hi_schedule(&sibling->execlists.tasklet);
> +
> + spin_unlock(&sibling->timeline.lock);
> + }
> + local_irq_enable();
> +}
> +
> +static void virtual_submit_request(struct i915_request *request)
> +{
> + struct virtual_engine *ve = to_virtual_engine(request->engine);
> +
> + GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
> +
> + GEM_BUG_ON(ve->request);
> + ve->base.execlists.queue_priority = rq_prio(request);
> + WRITE_ONCE(ve->request, request);
> +
> + tasklet_schedule(&ve->base.execlists.tasklet);
> +}
> +
> +struct intel_engine_cs *
> +intel_execlists_create_virtual(struct i915_gem_context *ctx,
> + struct intel_engine_cs **siblings,
> + unsigned int count)
> +{
> + struct virtual_engine *ve;
> + unsigned int n;
> + int err;
> +
> + if (!count)
> + return ERR_PTR(-EINVAL);
> +
> + ve = kzalloc(sizeof(*ve) + count * sizeof(*ve->siblings), GFP_KERNEL);
> + if (!ve)
> + return ERR_PTR(-ENOMEM);
> +
> + kref_init(&ve->kref);
> + ve->base.i915 = ctx->i915;
> + ve->base.id = -1;
> + ve->base.class = OTHER_CLASS;
> + ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
> + ve->base.flags = I915_ENGINE_IS_VIRTUAL;
> +
> + snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
Add fence.context to the name?
> + i915_timeline_init(ctx->i915, &ve->base.timeline, ve->base.name);
> + lockdep_set_subclass(&ve->base.timeline.lock, TIMELINE_VIRTUAL);
> +
> + err = intel_engine_create_scratch(&ve->base, 4096);
> + if (err)
> + goto err_put;
> +
> + ve->context.gem_context = ctx;
> + ve->context.owner = &ve->base;
> +
> + ve->base.context_pin = virtual_context_pin;
> + ve->base.request_alloc = execlists_request_alloc;
> +
> + ve->base.schedule = i915_schedule;
> + ve->base.submit_request = virtual_submit_request;
> +
> + ve->base.execlists.queue_priority = INT_MIN;
> + tasklet_init(&ve->base.execlists.tasklet,
> + virtual_submission_tasklet,
> + (unsigned long)ve);
> +
> + ve->count = count;
> + for (n = 0; n < count; n++) {
> + struct intel_engine_cs *sibling = siblings[n];
> +
> + ve->siblings[n] = sibling;
> +
> + if (sibling->execlists.tasklet.func != execlists_submission_tasklet) {
> + err = -ENODEV;
Will this fail context creation during reset? By design or accident? I
think the !guc and has_execlists are checked for at the top level so
should be enough.
> + ve->count = n;
> + goto err_put;
> + }
> +
> + if (RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)) {
> + err = -EINVAL;
> + ve->count = n;
> + goto err_put;
> + }
> +
> + RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
> +
> + if (ve->base.class != OTHER_CLASS) {
> + if (ve->base.class != sibling->class) {
> + err = -EINVAL;
Just a consequence of context image compatibility, or do you have
implementation limitations as well?
> + ve->count = n;
> + goto err_put;
> + }
> + continue;
> + }
> +
> + ve->base.class = sibling->class;
> + snprintf(ve->base.name, sizeof(ve->base.name),
> + "v%dx%d", ve->base.class, count);
Oh, the previous snprintf was just using it as temporary storage?
Fence context might be usable here as well.
> + ve->base.context_size = sibling->context_size;
> +
> + ve->base.emit_bb_start = sibling->emit_bb_start;
> + ve->base.emit_flush = sibling->emit_flush;
> + ve->base.emit_breadcrumb = sibling->emit_breadcrumb;
> + ve->base.emit_breadcrumb_sz = sibling->emit_breadcrumb_sz;
> + }
> +
> + return &ve->base;
> +
> +err_put:
> + virtual_engine_free(&ve->kref);
> + return ERR_PTR(err);
> +}
> +
> +void intel_virtual_engine_put(struct intel_engine_cs *engine)
> +{
> + if (!engine)
> + return;
> +
> + kref_put(&to_virtual_engine(engine)->kref, virtual_engine_free);
> +}
> +
> +void intel_execlists_show_requests(struct intel_engine_cs *engine,
> + struct drm_printer *m,
> + void (*show_request)(struct drm_printer *m,
> + struct i915_request *rq,
> + const char *prefix),
> + int max)
> +{
> + const struct intel_engine_execlists *execlists = &engine->execlists;
> + struct i915_request *rq, *last;
> + struct rb_node *rb;
> + int count;
> +
> + spin_lock(&engine->timeline.lock);
> +
> + last = NULL;
> + count = 0;
> + list_for_each_entry(rq, &engine->timeline.requests, link) {
> + if (count++ < max - 1)
> + show_request(m, rq, "\t\tE ");
> + else
> + last = rq;
> + }
> + if (last) {
> + if (count > max) {
> + drm_printf(m,
> + "\t\t...skipping %d executing requests...\n",
> + count - max);
> + }
> + show_request(m, last, "\t\tE ");
> + }
> +
> + last = NULL;
> + count = 0;
> + drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
> + for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
> + struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> + int i;
> +
> + priolist_for_each_request(rq, p, i) {
> + if (count++ < max - 1)
> + show_request(m, rq, "\t\tQ ");
> + else
> + last = rq;
> + }
> + }
> + if (last) {
> + if (count > max) {
> + drm_printf(m,
> + "\t\t...skipping %d queued requests...\n",
> + count - max);
> + }
> + show_request(m, last, "\t\tQ ");
> + }
> +
> + last = NULL;
> + count = 0;
> + for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
> + struct virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + struct i915_request *rq = READ_ONCE(ve->request);
> +
> + if (rq) {
> + if (count++ < max - 1)
> + show_request(m, rq, "\t\tV ");
> + else
> + last = rq;
> + }
> + }
> + if (last) {
> + if (count > max) {
> + drm_printf(m,
> + "\t\t...skipping %d virtual requests...\n",
> + count - max);
> + }
> + show_request(m, last, "\t\tV ");
> + }
> +
> + spin_unlock(&engine->timeline.lock);
> +}
> +
> #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> #include "selftests/intel_lrc.c"
> #endif
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index f5a5502ecf70..1d96d3bbca3c 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -97,11 +97,25 @@ int logical_xcs_ring_init(struct intel_engine_cs *engine);
> */
> #define LRC_HEADER_PAGES LRC_PPHWSP_PN
>
> +struct drm_printer;
> +
> struct drm_i915_private;
> struct i915_gem_context;
>
> void intel_lr_context_resume(struct drm_i915_private *dev_priv);
> -
> void intel_execlists_set_default_submission(struct intel_engine_cs *engine);
>
> +struct intel_engine_cs *
> +intel_execlists_create_virtual(struct i915_gem_context *ctx,
> + struct intel_engine_cs **siblings,
> + unsigned int count);
> +void intel_virtual_engine_put(struct intel_engine_cs *engine);
> +
> +void intel_execlists_show_requests(struct intel_engine_cs *engine,
> + struct drm_printer *m,
> + void (*show_request)(struct drm_printer *m,
> + struct i915_request *rq,
> + const char *prefix),
> + int max);
> +
> #endif /* _INTEL_LRC_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index f6ec48a75a69..ec772d836f49 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -311,6 +311,7 @@ struct intel_engine_execlists {
> * @queue: queue of requests, in priority lists
> */
> struct rb_root_cached queue;
> + struct rb_root_cached virtual;
>
> /**
> * @csb_read: control register for Context Switch buffer
> @@ -598,6 +599,7 @@ struct intel_engine_cs {
> #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
> #define I915_ENGINE_SUPPORTS_STATS BIT(1)
> #define I915_ENGINE_HAS_PREEMPTION BIT(2)
> +#define I915_ENGINE_IS_VIRTUAL BIT(3)
> unsigned int flags;
>
> /*
> @@ -680,6 +682,12 @@ static inline bool __execlists_need_preempt(int prio, int last)
> return prio > max(0, last);
> }
>
> +static inline bool
> +intel_engine_is_virtual(const struct intel_engine_cs *engine)
> +{
> + return engine->flags & I915_ENGINE_IS_VIRTUAL;
> +}
> +
> static inline void
> execlists_set_active(struct intel_engine_execlists *execlists,
> unsigned int bit)
> diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> index b6e11b58b3c9..0fb7a6f9aedb 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> @@ -4,6 +4,8 @@
> * Copyright © 2018 Intel Corporation
> */
>
> +#include <linux/prime_numbers.h>
> +
> #include "../i915_reset.h"
>
> #include "../i915_selftest.h"
> @@ -581,6 +583,219 @@ static int live_preempt_hang(void *arg)
> return err;
> }
>
> +struct live_test {
> + struct drm_i915_private *i915;
> + const char *func;
> + const char *name;
> +
> + unsigned int reset_count;
> + bool wedge;
> +};
> +
> +static int begin_live_test(struct live_test *t,
> + struct drm_i915_private *i915,
> + const char *func,
> + const char *name)
> +{
> + t->i915 = i915;
> + t->func = func;
> + t->name = name;
> +
> + if (igt_flush_test(i915, I915_WAIT_LOCKED))
> + return -EIO;
> +
> + i915->gpu_error.missed_irq_rings = 0;
> + t->reset_count = i915_reset_count(&i915->gpu_error);
> +
> + return 0;
> +}
> +
> +static int end_live_test(struct live_test *t)
> +{
> + struct drm_i915_private *i915 = t->i915;
> +
> + if (igt_flush_test(i915, I915_WAIT_LOCKED))
> + return -EIO;
> +
> + if (t->reset_count != i915_reset_count(&i915->gpu_error)) {
> + pr_err("%s(%s): GPU was reset %d times!\n",
> + t->func, t->name,
> + i915_reset_count(&i915->gpu_error) - t->reset_count);
> + return -EIO;
> + }
> +
> + if (i915->gpu_error.missed_irq_rings) {
> + pr_err("%s(%s): Missed interrupts on engines %lx\n",
> + t->func, t->name, i915->gpu_error.missed_irq_rings);
> + return -EIO;
> + }
> +
> + return 0;
> +}
> +
> +static int nop_virtual_engine(struct drm_i915_private *i915,
> + struct intel_engine_cs **siblings,
> + unsigned int nsibling,
> + unsigned int nctx,
> + unsigned int flags)
> +#define CHAIN BIT(0)
> +{
> + IGT_TIMEOUT(end_time);
> + struct i915_request *request[16];
> + struct i915_gem_context *ctx[16];
> + struct intel_engine_cs *ve[16];
> + unsigned long n, prime, nc;
> + ktime_t times[2] = {};
> + struct live_test t;
> + int err;
> +
> + GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ctx));
> +
> + for (n = 0; n < nctx; n++) {
> + ctx[n] = kernel_context(i915);
> + if (!ctx[n])
> + return -ENOMEM;
> +
> + ve[n] = intel_execlists_create_virtual(ctx[n],
> + siblings, nsibling);
> + if (IS_ERR(ve[n]))
> + return PTR_ERR(ve[n]);
> + }
> +
> + err = begin_live_test(&t, i915, __func__, ve[0]->name);
> + if (err)
> + goto out;
> +
> + for_each_prime_number_from(prime, 1, 8192) {
> + times[1] = ktime_get_raw();
> +
> + if (flags & CHAIN) {
> + for (nc = 0; nc < nctx; nc++) {
> + for (n = 0; n < prime; n++) {
> + request[nc] =
> + i915_request_alloc(ve[nc], ctx[nc]);
> + if (IS_ERR(request[nc])) {
> + err = PTR_ERR(request[nc]);
> + goto out;
> + }
> +
> + i915_request_add(request[nc]);
> + }
> + }
> + } else {
> + for (n = 0; n < prime; n++) {
> + for (nc = 0; nc < nctx; nc++) {
> + request[nc] =
> + i915_request_alloc(ve[nc], ctx[nc]);
> + if (IS_ERR(request[nc])) {
> + err = PTR_ERR(request[nc]);
> + goto out;
> + }
> +
> + i915_request_add(request[nc]);
> + }
> + }
> + }
> +
> + for (nc = 0; nc < nctx; nc++) {
> + if (i915_request_wait(request[nc],
> + I915_WAIT_LOCKED,
> + HZ / 10) < 0) {
> + pr_err("%s(%s): wait for %llx:%d timed out\n",
> + __func__, ve[0]->name,
> + request[nc]->fence.context,
> + request[nc]->fence.seqno);
> +
> + GEM_TRACE("%s(%s) failed at request %llx:%d\n",
> + __func__, ve[0]->name,
> + request[nc]->fence.context,
> + request[nc]->fence.seqno);
> + GEM_TRACE_DUMP();
> + i915_gem_set_wedged(i915);
> + break;
> + }
> + }
> +
> + times[1] = ktime_sub(ktime_get_raw(), times[1]);
> + if (prime == 1)
> + times[0] = times[1];
> +
> + if (__igt_timeout(end_time, NULL))
> + break;
> + }
> +
> + err = end_live_test(&t);
> + if (err)
> + goto out;
> +
> + pr_info("Requestx%d latencies on %s: 1 = %lluns, %lu = %lluns\n",
> + nctx, ve[0]->name, ktime_to_ns(times[0]),
> + prime, div64_u64(ktime_to_ns(times[1]), prime));
> +
> +out:
> + if (igt_flush_test(i915, I915_WAIT_LOCKED))
> + err = -EIO;
> +
> + for (nc = 0; nc < nctx; nc++) {
> + intel_virtual_engine_put(ve[nc]);
> + kernel_context_close(ctx[nc]);
> + }
> + return err;
> +}
> +
> +static int live_virtual_engine(void *arg)
> +{
> + struct drm_i915_private *i915 = arg;
> + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
> + struct intel_engine_cs *engine;
> + enum intel_engine_id id;
> + unsigned int class, inst;
> + int err = -ENODEV;
> +
> + if (USES_GUC_SUBMISSION(i915))
> + return 0;
> +
> + mutex_lock(&i915->drm.struct_mutex);
> +
> + for_each_engine(engine, i915, id) {
> + err = nop_virtual_engine(i915, &engine, 1, 1, 0);
> + if (err) {
> + pr_err("Failed to wrap engine %s: err=%d\n",
> + engine->name, err);
> + goto out_unlock;
> + }
> + }
> +
> + for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
> + int nsibling, n;
> +
> + nsibling = 0;
> + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
> + if (!i915->engine_class[class][inst])
> + break;
> +
> + siblings[nsibling++] = i915->engine_class[class][inst];
> + }
> + if (nsibling < 2)
> + continue;
> +
> + for (n = 1; n <= nsibling + 1; n++) {
> + err = nop_virtual_engine(i915, siblings, nsibling,
> + n, 0);
> + if (err)
> + goto out_unlock;
> + }
> +
> + err = nop_virtual_engine(i915, siblings, nsibling, n, CHAIN);
> + if (err)
> + goto out_unlock;
> + }
> +
> +out_unlock:
> + mutex_unlock(&i915->drm.struct_mutex);
> + return err;
> +}
> +
> int intel_execlists_live_selftests(struct drm_i915_private *i915)
> {
> static const struct i915_subtest tests[] = {
> @@ -588,6 +803,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
> SUBTEST(live_preempt),
> SUBTEST(live_late_preempt),
> SUBTEST(live_preempt_hang),
> + SUBTEST(live_virtual_engine),
> };
>
> if (!HAS_EXECLISTS(i915))
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index d41b4c673af4..6dd2a89410e8 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1523,14 +1523,41 @@ struct drm_i915_gem_context_param {
> * default settings.
> *
> * See struct i915_context_param_engines.
> + *
> + * Extensions:
> + * i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE)
> */
> #define I915_CONTEXT_PARAM_ENGINES 0x7
>
> __u64 value;
> };
>
> +/*
> + * i915_context_engines_load_balance:
> + *
> + * Enable load balancing across this set of engines.
> + *
> + * Into the I915_EXEC_DEFAULT slot, a virtual engine is created that when
> + * used will proxy the execbuffer request onto one of the set of engines
> + * in such a way as to distribute the load evenly across the set.
> + *
> + * The set of engines must be compatible (e.g. the same HW class) as they
> + * will share the same logical GPU context and ring.
> + *
> + * The context must be defined to use a single timeline for all engines.
> + */
> +struct i915_context_engines_load_balance {
> + struct i915_user_extension base;
> +
> + __u64 flags; /* all undefined flags must be zero */
> + __u64 engines_mask;
> +
> + __u64 mbz[4]; /* reserved for future use; must be zero */
> +};
> +
> struct i915_context_param_engines {
> __u64 extensions;
> +#define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0
>
> struct {
> __u32 class; /* see enum drm_i915_gem_engine_class */
>
This was just a first-pass reading. I will need more passes to figure it
all out.
Regards,
Tvrtko
More information about the Intel-gfx
mailing list