[Intel-gfx] [PATCH 14/40] drm/i915: Load balancing across a virtual engine

Wed May 8 10:29:34 UTC 2019

On 08/05/2019 09:06, Chris Wilson wrote:
> Having allowed the user to define a set of engines that they will want
> to only use, we go one step further and allow them to bind those engines
> into a single virtual instance. Submitting a batch to the virtual engine
> will then forward it to any one of the set in a manner that best
> distributes load.  The virtual engine has a single timeline across all
> engines (it operates as a single queue), so it is not able to concurrently
> run batches across multiple engines by itself; that is left up to the user
> to submit multiple concurrent batches to multiple queues. Multiple users
> will be load balanced across the system.
> 
> The mechanism used for load balancing in this patch is a late greedy
> balancer. When a request is ready for execution, it is added to each
> engine's queue, and when an engine is ready for its next request it
> claims it from the virtual engine. The first engine to do so, wins, i.e.
> the request is executed at the earliest opportunity (idle moment) in the
> system.
> 
> As not all HW is created equal, the user is still able to skip the
> virtual engine and execute the batch on a specific engine, all within the
> same queue. It will then be executed in order on the correct engine,
> with execution on other virtual engines being moved away due to the load
> detection.
> 
> A couple of areas for potential improvement left!
> 
> - The virtual engine always takes priority over equal-priority tasks.
> Mostly broken up by applying FQ_CODEL rules for prioritising new clients,
> and hopefully the virtual and real engines are not then congested (i.e.
> all work is via virtual engines, or all work is to the real engine).
> 
> - We require the breadcrumb irq around every virtual engine request. For
> normal engines, we eliminate the need for the slow round trip via
> interrupt by using the submit fence and queueing in order. For virtual
> engines, we have to allow any job to transfer to a new ring, and cannot
> coalesce the submissions, so require the completion fence instead,
> forcing the persistent use of interrupts.
> 
> - We only drip feed single requests through each virtual engine and onto
> the physical engines, even if there was enough work to fill all ELSP,
> leaving small stalls with an idle CS event at the end of every request.
> Could we be greedy and fill both slots? Being lazy is virtuous for load
> distribution on less-than-full workloads though.
> 
> Other areas of improvement are more general, such as reducing lock
> contention, reducing dispatch overhead, looking at direct submission
> rather than bouncing around tasklets etc.
> 
> sseu: Lift the restriction to allow sseu to be reconfigured on virtual
> engines composed of RENDER_CLASS (rcs).
> 
> v2: macroize check_user_mbz()
> v3: Cancel virtual engines on wedging
> v4: Commence commenting
> v5: Replace 64b sibling_mask with a list of class:instance
> v6: Drop the one-element array in the uabi
> v7: Assert it is a virtual engine in to_virtual_engine()
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_types.h |   8 +
>   drivers/gpu/drm/i915/gt/intel_lrc.c          | 683 ++++++++++++++++++-
>   drivers/gpu/drm/i915/gt/intel_lrc.h          |   9 +
>   drivers/gpu/drm/i915/gt/selftest_lrc.c       | 180 +++++
>   drivers/gpu/drm/i915/i915_gem.h              |   5 +
>   drivers/gpu/drm/i915/i915_gem_context.c      | 116 +++-
>   drivers/gpu/drm/i915/i915_scheduler.c        |  19 +-
>   drivers/gpu/drm/i915/i915_timeline_types.h   |   1 +
>   include/uapi/drm/i915_drm.h                  |  39 ++
>   9 files changed, 1032 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index e381c1c73902..7b47e00fa082 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -227,6 +227,7 @@ struct intel_engine_execlists {
>   	 * @queue: queue of requests, in priority lists
>   	 */
>   	struct rb_root_cached queue;
> +	struct rb_root_cached virtual;
>   
>   	/**
>   	 * @csb_write: control register for Context Switch buffer
> @@ -445,6 +446,7 @@ struct intel_engine_cs {
>   #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
>   #define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
>   #define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4)
> +#define I915_ENGINE_IS_VIRTUAL       BIT(5)
>   	unsigned int flags;
>   
>   	/*
> @@ -534,6 +536,12 @@ intel_engine_needs_breadcrumb_tasklet(const struct intel_engine_cs *engine)
>   	return engine->flags & I915_ENGINE_NEEDS_BREADCRUMB_TASKLET;
>   }
>   
> +static inline bool
> +intel_engine_is_virtual(const struct intel_engine_cs *engine)
> +{
> +	return engine->flags & I915_ENGINE_IS_VIRTUAL;
> +}
> +
>   #define instdone_slice_mask(dev_priv__) \
>   	(IS_GEN(dev_priv__, 7) ? \
>   	 1 : RUNTIME_INFO(dev_priv__)->sseu.slice_mask)
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index f1d62746e066..bc388df39802 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -166,6 +166,42 @@
>   
>   #define ACTIVE_PRIORITY (I915_PRIORITY_NEWCLIENT | I915_PRIORITY_NOSEMAPHORE)
>   
> +struct virtual_engine {
> +	struct intel_engine_cs base;
> +	struct intel_context context;
> +
> +	/*
> +	 * We allow only a single request through the virtual engine at a time
> +	 * (each request in the timeline waits for the completion fence of
> +	 * the previous before being submitted). By restricting ourselves to
> +	 * only submitting a single request, each request is placed on to a
> +	 * physical engine to maximise load spreading (by virtue of the late
> +	 * greedy scheduling -- each real engine takes the next available
> +	 * request upon idling).
> +	 */
> +	struct i915_request *request;
> +
> +	/*
> +	 * We keep a rbtree of available virtual engines inside each physical
> +	 * engine, sorted by priority. Here we preallocate the nodes we need
> +	 * for the virtual engine, indexed by physical_engine->id.
> +	 */
> +	struct ve_node {
> +		struct rb_node rb;
> +		int prio;
> +	} nodes[I915_NUM_ENGINES];
> +
> +	/* And finally, which physical engines this virtual engine maps onto. */
> +	unsigned int num_siblings;
> +	struct intel_engine_cs *siblings[0];
> +};
> +
> +static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
> +{
> +	GEM_BUG_ON(!intel_engine_is_virtual(engine));
> +	return container_of(engine, struct virtual_engine, base);
> +}
> +
>   static int execlists_context_deferred_alloc(struct intel_context *ce,
>   					    struct intel_engine_cs *engine);
>   static void execlists_init_reg_state(u32 *reg_state,
> @@ -229,7 +265,8 @@ static int queue_prio(const struct intel_engine_execlists *execlists)
>   }
>   
>   static inline bool need_preempt(const struct intel_engine_cs *engine,
> -				const struct i915_request *rq)
> +				const struct i915_request *rq,
> +				struct rb_node *rb)
>   {
>   	int last_prio;
>   
> @@ -264,6 +301,25 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
>   	    rq_prio(list_next_entry(rq, link)) > last_prio)
>   		return true;
>   
> +	if (rb) {
> +		struct virtual_engine *ve =
> +			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> +		bool preempt = false;
> +
> +		if (engine == ve->siblings[0]) { /* only preempt one sibling */
> +			struct i915_request *next;
> +
> +			rcu_read_lock();
> +			next = READ_ONCE(ve->request);
> +			if (next)
> +				preempt = rq_prio(next) > last_prio;
> +			rcu_read_unlock();
> +		}
> +
> +		if (preempt)
> +			return preempt;
> +	}
> +
>   	/*
>   	 * If the inflight context did not trigger the preemption, then maybe
>   	 * it was the set of queued requests? Pick the highest priority in
> @@ -382,6 +438,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine, int boost)
>   	list_for_each_entry_safe_reverse(rq, rn,
>   					 &engine->timeline.requests,
>   					 link) {
> +		struct intel_engine_cs *owner;
> +
>   		if (i915_request_completed(rq))
>   			break;
>   
> @@ -390,16 +448,32 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine, int boost)
>   
>   		GEM_BUG_ON(rq->hw_context->active);
>   
> -		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> -		if (rq_prio(rq) != prio) {
> -			prio = rq_prio(rq);
> -			pl = i915_sched_lookup_priolist(engine, prio);
> -		}
> -		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
> +		/*
> +		 * Push the request back into the queue for later resubmission.
> +		 * If this request is not native to this physical engine (i.e.
> +		 * it came from a virtual source), push it back onto the virtual
> +		 * engine so that it can be moved across onto another physical
> +		 * engine as load dictates.
> +		 */
> +		owner = rq->hw_context->engine;
> +		if (likely(owner == engine)) {
> +			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> +			if (rq_prio(rq) != prio) {
> +				prio = rq_prio(rq);
> +				pl = i915_sched_lookup_priolist(engine, prio);
> +			}
> +			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
>   
> -		list_add(&rq->sched.link, pl);
> +			list_add(&rq->sched.link, pl);
> +			active = rq;
> +		} else {
> +			if (__i915_request_has_started(rq))
> +				rq->sched.attr.priority |= boost;
>   
> -		active = rq;
> +			rq->engine = owner;
> +			owner->submit_request(rq);
> +			active = NULL;
> +		}
>   	}
>   
>   	/*
> @@ -419,7 +493,7 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine, int boost)
>   	 * in the priority queue, but they will not gain immediate access to
>   	 * the GPU.
>   	 */
> -	if (~prio & boost && __i915_request_has_started(active)) {
> +	if (~prio & boost && active && __i915_request_has_started(active)) {
>   		prio |= boost;
>   		GEM_BUG_ON(active->sched.attr.priority >= prio);
>   		active->sched.attr.priority = prio;
> @@ -661,6 +735,90 @@ static void complete_preempt_context(struct intel_engine_execlists *execlists)
>   				     ACTIVE_PRIORITY);
>   }
>   
> +static void virtual_update_register_offsets(u32 *regs,
> +					    struct intel_engine_cs *engine)
> +{
> +	u32 base = engine->mmio_base;
> +
> +	/* Must match execlists_init_reg_state()! */
> +
> +	regs[CTX_CONTEXT_CONTROL] =
> +		i915_mmio_reg_offset(RING_CONTEXT_CONTROL(base));
> +	regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base));
> +	regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base));
> +	regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base));
> +	regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base));
> +
> +	regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base));
> +	regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base));
> +	regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base));
> +	regs[CTX_SECOND_BB_HEAD_U] =
> +		i915_mmio_reg_offset(RING_SBBADDR_UDW(base));
> +	regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base));
> +	regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base));
> +
> +	regs[CTX_CTX_TIMESTAMP] =
> +		i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base));
> +	regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 3));
> +	regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 3));
> +	regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 2));
> +	regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 2));
> +	regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 1));
> +	regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 1));
> +	regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0));
> +	regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 0));
> +
> +	if (engine->class == RENDER_CLASS) {
> +		regs[CTX_RCS_INDIRECT_CTX] =
> +			i915_mmio_reg_offset(RING_INDIRECT_CTX(base));
> +		regs[CTX_RCS_INDIRECT_CTX_OFFSET] =
> +			i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base));
> +		regs[CTX_BB_PER_CTX_PTR] =
> +			i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base));
> +
> +		regs[CTX_R_PWR_CLK_STATE] =
> +			i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
> +	}
> +}
> +
> +static bool virtual_matches(const struct virtual_engine *ve,
> +			    const struct i915_request *rq,
> +			    const struct intel_engine_cs *engine)
> +{
> +	const struct intel_engine_cs *active;
> +
> +	/*
> +	 * We track when the HW has completed saving the context image
> +	 * (i.e. when we have seen the final CS event switching out of
> +	 * the context) and must not overwrite the context image before
> +	 * then. This restricts us to only using the active engine
> +	 * while the previous virtualized request is inflight (so
> +	 * we reuse the register offsets). This is a very small
> +	 * hysteresis on the greedy selection algorithm.
> +	 */
> +	active = READ_ONCE(ve->context.active);
> +	if (active && active != engine)
> +		return false;
> +
> +	return true;
> +}
> +
> +static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
> +				     struct intel_engine_cs *engine)
> +{
> +	struct intel_engine_cs *old = ve->siblings[0];
> +
> +	/* All unattached (rq->engine == old) must already be completed */
> +
> +	spin_lock(&old->breadcrumbs.irq_lock);
> +	if (!list_empty(&ve->context.signal_link)) {
> +		list_move_tail(&ve->context.signal_link,
> +			       &engine->breadcrumbs.signalers);
> +		intel_engine_queue_breadcrumbs(engine);
> +	}
> +	spin_unlock(&old->breadcrumbs.irq_lock);
> +}
> +
>   static void execlists_dequeue(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
> @@ -693,6 +851,26 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	 * and context switches) submission.
>   	 */
>   
> +	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
> +		struct virtual_engine *ve =
> +			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> +		struct i915_request *rq = READ_ONCE(ve->request);
> +
> +		if (!rq) { /* lazily cleanup after another engine handled rq */
> +			rb_erase_cached(rb, &execlists->virtual);
> +			RB_CLEAR_NODE(rb);
> +			rb = rb_first_cached(&execlists->virtual);
> +			continue;
> +		}
> +
> +		if (!virtual_matches(ve, rq, engine)) {
> +			rb = rb_next(rb);
> +			continue;
> +		}
> +
> +		break;
> +	}
> +
>   	if (last) {
>   		/*
>   		 * Don't resubmit or switch until all outstanding
> @@ -714,7 +892,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
>   			return;
>   
> -		if (need_preempt(engine, last)) {
> +		if (need_preempt(engine, last, rb)) {
>   			inject_preempt_context(engine);
>   			return;
>   		}
> @@ -754,6 +932,92 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   		last->tail = last->wa_tail;
>   	}
>   
> +	while (rb) { /* XXX virtual is always taking precedence */
> +		struct virtual_engine *ve =
> +			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> +		struct i915_request *rq;
> +
> +		spin_lock(&ve->base.timeline.lock);
> +
> +		rq = ve->request;
> +		if (unlikely(!rq)) { /* lost the race to a sibling */
> +			spin_unlock(&ve->base.timeline.lock);
> +			rb_erase_cached(rb, &execlists->virtual);
> +			RB_CLEAR_NODE(rb);
> +			rb = rb_first_cached(&execlists->virtual);
> +			continue;
> +		}
> +
> +		GEM_BUG_ON(rq != ve->request);
> +		GEM_BUG_ON(rq->engine != &ve->base);
> +		GEM_BUG_ON(rq->hw_context != &ve->context);
> +
> +		if (rq_prio(rq) >= queue_prio(execlists)) {
> +			if (!virtual_matches(ve, rq, engine)) {
> +				spin_unlock(&ve->base.timeline.lock);
> +				rb = rb_next(rb);
> +				continue;
> +			}
> +
> +			if (last && !can_merge_rq(last, rq)) {
> +				spin_unlock(&ve->base.timeline.lock);
> +				return; /* leave this rq for another engine */
> +			}
> +
> +			GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
> +				  engine->name,
> +				  rq->fence.context,
> +				  rq->fence.seqno,
> +				  i915_request_completed(rq) ? "!" :
> +				  i915_request_started(rq) ? "*" :
> +				  "",
> +				  yesno(engine != ve->siblings[0]));
> +
> +			ve->request = NULL;
> +			ve->base.execlists.queue_priority_hint = INT_MIN;
> +			rb_erase_cached(rb, &execlists->virtual);
> +			RB_CLEAR_NODE(rb);
> +
> +			rq->engine = engine;
> +
> +			if (engine != ve->siblings[0]) {
> +				u32 *regs = ve->context.lrc_reg_state;
> +				unsigned int n;
> +
> +				GEM_BUG_ON(READ_ONCE(ve->context.active));
> +				virtual_update_register_offsets(regs, engine);
> +
> +				if (!list_empty(&ve->context.signals))
> +					virtual_xfer_breadcrumbs(ve, engine);
> +
> +				/*
> +				 * Move the bound engine to the top of the list
> +				 * for future execution. We then kick this
> +				 * tasklet first before checking others, so that
> +				 * we preferentially reuse this set of bound
> +				 * registers.
> +				 */
> +				for (n = 1; n < ve->num_siblings; n++) {
> +					if (ve->siblings[n] == engine) {
> +						swap(ve->siblings[n],
> +						     ve->siblings[0]);
> +						break;
> +					}
> +				}
> +
> +				GEM_BUG_ON(ve->siblings[0] != engine);
> +			}
> +
> +			__i915_request_submit(rq);
> +			trace_i915_request_in(rq, port_index(port, execlists));
> +			submit = true;
> +			last = rq;
> +		}
> +
> +		spin_unlock(&ve->base.timeline.lock);
> +		break;
> +	}
> +
>   	while ((rb = rb_first_cached(&execlists->queue))) {
>   		struct i915_priolist *p = to_priolist(rb);
>   		struct i915_request *rq, *rn;
> @@ -1877,6 +2141,25 @@ static void reset_csb_pointers(struct intel_engine_execlists *execlists)
>   			       &execlists->csb_status[reset_value]);
>   }
>   
> +static struct i915_request *active_request(struct i915_request *rq)
> +{
> +	const struct list_head * const list = &rq->engine->timeline.requests;
> +	const struct intel_context * const context = rq->hw_context;
> +	struct i915_request *active = NULL;
> +
> +	list_for_each_entry_from_reverse(rq, list, link) {
> +		if (i915_request_completed(rq))
> +			break;
> +
> +		if (rq->hw_context != context)
> +			break;
> +
> +		active = rq;
> +	}
> +
> +	return active;
> +}
> +
>   static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
> @@ -1897,7 +2180,8 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
>   	if (!port_isset(execlists->port))
>   		goto out_clear;
>   
> -	ce = port_request(execlists->port)->hw_context;
> +	rq = port_request(execlists->port);
> +	ce = rq->hw_context;
>   
>   	/*
>   	 * Catch up with any missed context-switch interrupts.
> @@ -1910,16 +2194,10 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
>   	 */
>   	execlists_cancel_port_requests(execlists);
>   
> -	/* Push back any incomplete requests for replay after the reset. */
> -	rq = __unwind_incomplete_requests(engine, 0);
> +	rq = active_request(rq);
>   	if (!rq)
>   		goto out_replay;
>   
> -	if (rq->hw_context != ce) { /* caught just before a CS event */
> -		rq = NULL;
> -		goto out_replay;
> -	}
> -
>   	/*
>   	 * If this request hasn't started yet, e.g. it is waiting on a
>   	 * semaphore, we need to avoid skipping the request or else we
> @@ -1966,13 +2244,16 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
>   	}
>   	execlists_init_reg_state(regs, ce, engine, ce->ring);
>   
> -	/* Rerun the request; its payload has been neutered (if guilty). */
>   out_replay:
> +	/* Rerun the request; its payload has been neutered (if guilty). */
>   	ce->ring->head =
>   		rq ? intel_ring_wrap(ce->ring, rq->head) : ce->ring->tail;
>   	intel_ring_update_space(ce->ring);
>   	__execlists_update_reg_state(ce, engine);
>   
> +	/* Push back any incomplete requests for replay after the reset. */
> +	__unwind_incomplete_requests(engine, 0);
> +
>   out_clear:
>   	execlists_clear_all_active(execlists);
>   }
> @@ -2046,6 +2327,26 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>   		i915_priolist_free(p);
>   	}
>   
> +	/* Cancel all attached virtual engines */
> +	while ((rb = rb_first_cached(&execlists->virtual))) {
> +		struct virtual_engine *ve =
> +			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> +
> +		rb_erase_cached(rb, &execlists->virtual);
> +		RB_CLEAR_NODE(rb);
> +
> +		spin_lock(&ve->base.timeline.lock);
> +		if (ve->request) {
> +			ve->request->engine = engine;
> +			__i915_request_submit(ve->request);
> +			dma_fence_set_error(&ve->request->fence, -EIO);
> +			i915_request_mark_complete(ve->request);
> +			ve->base.execlists.queue_priority_hint = INT_MIN;
> +			ve->request = NULL;
> +		}
> +		spin_unlock(&ve->base.timeline.lock);
> +	}
> +
>   	/* Remaining _unready_ requests will be nop'ed when submitted */
>   
>   	execlists->queue_priority_hint = INT_MIN;
> @@ -2529,12 +2830,15 @@ static void execlists_init_reg_state(u32 *regs,
>   	bool rcs = engine->class == RENDER_CLASS;
>   	u32 base = engine->mmio_base;
>   
> -	/* A context is actually a big batch buffer with several
> +	/*
> +	 * A context is actually a big batch buffer with several
>   	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
>   	 * values we are setting here are only for the first context restore:
>   	 * on a subsequent save, the GPU will recreate this batchbuffer with new
>   	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
>   	 * we are not initializing here).
> +	 *
> +	 * Must keep consistent with virtual_update_register_offsets().
>   	 */
>   	regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
>   				 MI_LRI_FORCE_POSTED;
> @@ -2753,6 +3057,320 @@ static int execlists_context_deferred_alloc(struct intel_context *ce,
>   	return ret;
>   }
>   
> +static void virtual_context_destroy(struct kref *kref)
> +{
> +	struct virtual_engine *ve =
> +		container_of(kref, typeof(*ve), context.ref);
> +	unsigned int n;
> +
> +	GEM_BUG_ON(ve->request);
> +	GEM_BUG_ON(ve->context.active);
> +
> +	for (n = 0; n < ve->num_siblings; n++) {
> +		struct intel_engine_cs *sibling = ve->siblings[n];
> +		struct rb_node *node = &ve->nodes[sibling->id].rb;
> +
> +		if (RB_EMPTY_NODE(node))
> +			continue;
> +
> +		spin_lock_irq(&sibling->timeline.lock);
> +
> +		/* Detachment is lazily performed in the execlists tasklet */
> +		if (!RB_EMPTY_NODE(node))
> +			rb_erase_cached(node, &sibling->execlists.virtual);
> +
> +		spin_unlock_irq(&sibling->timeline.lock);
> +	}
> +	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
> +
> +	if (ve->context.state)
> +		__execlists_context_fini(&ve->context);
> +
> +	i915_timeline_fini(&ve->base.timeline);
> +	kfree(ve);
> +}
> +
> +static void virtual_engine_initial_hint(struct virtual_engine *ve)
> +{
> +	int swp;
> +
> +	/*
> +	 * Pick a random sibling on starting to help spread the load around.
> +	 *
> +	 * New contexts are typically created with exactly the same order
> +	 * of siblings, and often started in batches. Due to the way we iterate
> +	 * the array of siblings when submitting requests, sibling[0] is
> +	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
> +	 * randomised across the system, we also help spread the load by the
> +	 * first engine we inspect being different each time.
> +	 *
> +	 * NB This does not force us to execute on this engine, it will just
> +	 * typically be the first we inspect for submission.
> +	 */
> +	swp = prandom_u32_max(ve->num_siblings);
> +	if (!swp)
> +		return;
> +
> +	swap(ve->siblings[swp], ve->siblings[0]);
> +	virtual_update_register_offsets(ve->context.lrc_reg_state,
> +					ve->siblings[0]);
> +}
> +
> +static int virtual_context_pin(struct intel_context *ce)
> +{
> +	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
> +	int err;
> +
> +	/* Note: we must use a real engine class for setting up reg state */
> +	err = __execlists_context_pin(ce, ve->siblings[0]);
> +	if (err)
> +		return err;
> +
> +	virtual_engine_initial_hint(ve);
> +	return 0;
> +}
> +
> +static void virtual_context_enter(struct intel_context *ce)
> +{
> +	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
> +	unsigned int n;
> +
> +	for (n = 0; n < ve->num_siblings; n++)
> +		intel_engine_pm_get(ve->siblings[n]);
> +}
> +
> +static void virtual_context_exit(struct intel_context *ce)
> +{
> +	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
> +	unsigned int n;
> +
> +	ce->saturated = 0;
> +	for (n = 0; n < ve->num_siblings; n++)
> +		intel_engine_pm_put(ve->siblings[n]);
> +}
> +
> +static const struct intel_context_ops virtual_context_ops = {
> +	.pin = virtual_context_pin,
> +	.unpin = execlists_context_unpin,
> +
> +	.enter = virtual_context_enter,
> +	.exit = virtual_context_exit,
> +
> +	.destroy = virtual_context_destroy,
> +};
> +
> +static void virtual_submission_tasklet(unsigned long data)
> +{
> +	struct virtual_engine * const ve = (struct virtual_engine *)data;
> +	const int prio = ve->base.execlists.queue_priority_hint;
> +	unsigned int n;
> +
> +	local_irq_disable();
> +	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
> +		struct intel_engine_cs *sibling = ve->siblings[n];
> +		struct ve_node * const node = &ve->nodes[sibling->id];
> +		struct rb_node **parent, *rb;
> +		bool first;
> +
> +		spin_lock(&sibling->timeline.lock);
> +
> +		if (!RB_EMPTY_NODE(&node->rb)) {
> +			/*
> +			 * Cheat and avoid rebalancing the tree if we can
> +			 * reuse this node in situ.
> +			 */
> +			first = rb_first_cached(&sibling->execlists.virtual) ==
> +				&node->rb;
> +			if (prio == node->prio || (prio > node->prio && first))
> +				goto submit_engine;
> +
> +			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
> +		}
> +
> +		rb = NULL;
> +		first = true;
> +		parent = &sibling->execlists.virtual.rb_root.rb_node;
> +		while (*parent) {
> +			struct ve_node *other;
> +
> +			rb = *parent;
> +			other = rb_entry(rb, typeof(*other), rb);
> +			if (prio > other->prio) {
> +				parent = &rb->rb_left;
> +			} else {
> +				parent = &rb->rb_right;
> +				first = false;
> +			}
> +		}
> +
> +		rb_link_node(&node->rb, rb, parent);
> +		rb_insert_color_cached(&node->rb,
> +				       &sibling->execlists.virtual,
> +				       first);
> +
> +submit_engine:
> +		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
> +		node->prio = prio;
> +		if (first && prio > sibling->execlists.queue_priority_hint) {
> +			sibling->execlists.queue_priority_hint = prio;
> +			tasklet_hi_schedule(&sibling->execlists.tasklet);
> +		}
> +
> +		spin_unlock(&sibling->timeline.lock);
> +	}
> +	local_irq_enable();
> +}
> +
> +static void virtual_submit_request(struct i915_request *rq)
> +{
> +	struct virtual_engine *ve = to_virtual_engine(rq->engine);
> +
> +	GEM_TRACE("%s: rq=%llx:%lld\n",
> +		  ve->base.name,
> +		  rq->fence.context,
> +		  rq->fence.seqno);
> +
> +	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
> +
> +	GEM_BUG_ON(ve->request);
> +	ve->base.execlists.queue_priority_hint = rq_prio(rq);
> +	WRITE_ONCE(ve->request, rq);
> +
> +	tasklet_schedule(&ve->base.execlists.tasklet);
> +}
> +
> +struct intel_context *
> +intel_execlists_create_virtual(struct i915_gem_context *ctx,
> +			       struct intel_engine_cs **siblings,
> +			       unsigned int count)
> +{
> +	struct virtual_engine *ve;
> +	unsigned int n;
> +	int err;
> +
> +	if (count == 0)
> +		return ERR_PTR(-EINVAL);
> +
> +	if (count == 1)
> +		return intel_context_create(ctx, siblings[0]);
> +
> +	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
> +	if (!ve)
> +		return ERR_PTR(-ENOMEM);
> +
> +	ve->base.i915 = ctx->i915;
> +	ve->base.id = -1;
> +	ve->base.class = OTHER_CLASS;
> +	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
> +	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
> +	ve->base.flags = I915_ENGINE_IS_VIRTUAL;
> +
> +	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
> +
> +	err = i915_timeline_init(ctx->i915, &ve->base.timeline, NULL);
> +	if (err)
> +		goto err_put;
> +	i915_timeline_set_subclass(&ve->base.timeline, TIMELINE_VIRTUAL);
> +
> +	intel_engine_init_execlists(&ve->base);
> +
> +	ve->base.cops = &virtual_context_ops;
> +	ve->base.request_alloc = execlists_request_alloc;
> +
> +	ve->base.schedule = i915_schedule;
> +	ve->base.submit_request = virtual_submit_request;
> +
> +	ve->base.execlists.queue_priority_hint = INT_MIN;
> +	tasklet_init(&ve->base.execlists.tasklet,
> +		     virtual_submission_tasklet,
> +		     (unsigned long)ve);
> +
> +	intel_context_init(&ve->context, ctx, &ve->base);
> +
> +	for (n = 0; n < count; n++) {
> +		struct intel_engine_cs *sibling = siblings[n];
> +
> +		GEM_BUG_ON(!is_power_of_2(sibling->mask));
> +		if (sibling->mask & ve->base.mask) {
> +			DRM_DEBUG("duplicate %s entry in load balancer\n",
> +				  sibling->name);
> +			err = -EINVAL;
> +			goto err_put;
> +		}
> +
> +		/*
> +		 * The virtual engine implementation is tightly coupled to
> +		 * the execlists backend -- we push out request directly
> +		 * into a tree inside each physical engine. We could support
> +		 * layering if we handle cloning of the requests and
> +		 * submitting a copy into each backend.
> +		 */
> +		if (sibling->execlists.tasklet.func !=
> +		    execlists_submission_tasklet) {
> +			err = -ENODEV;
> +			goto err_put;
> +		}
> +
> +		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
> +		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
> +
> +		ve->siblings[ve->num_siblings++] = sibling;
> +		ve->base.mask |= sibling->mask;
> +
> +		/*
> +		 * All physical engines must be compatible for their emission
> +		 * functions (as we build the instructions during request
> +		 * construction and do not alter them before submission
> +		 * on the physical engine). We use the engine class as a guide
> +		 * here, although that could be refined.
> +		 */
> +		if (ve->base.class != OTHER_CLASS) {
> +			if (ve->base.class != sibling->class) {
> +				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
> +					  sibling->class, ve->base.class);
> +				err = -EINVAL;
> +				goto err_put;
> +			}
> +			continue;
> +		}
> +
> +		ve->base.class = sibling->class;
> +		ve->base.uabi_class = sibling->uabi_class;
> +		snprintf(ve->base.name, sizeof(ve->base.name),
> +			 "v%dx%d", ve->base.class, count);
> +		ve->base.context_size = sibling->context_size;
> +
> +		ve->base.emit_bb_start = sibling->emit_bb_start;
> +		ve->base.emit_flush = sibling->emit_flush;
> +		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
> +		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
> +		ve->base.emit_fini_breadcrumb_dw =
> +			sibling->emit_fini_breadcrumb_dw;
> +	}
> +
> +	return &ve->context;
> +
> +err_put:
> +	intel_context_put(&ve->context);
> +	return ERR_PTR(err);
> +}
> +
> +struct intel_context *
> +intel_execlists_clone_virtual(struct i915_gem_context *ctx,
> +			      struct intel_engine_cs *src)
> +{
> +	struct virtual_engine *se = to_virtual_engine(src);
> +	struct intel_context *dst;
> +
> +	dst = intel_execlists_create_virtual(ctx,
> +					     se->siblings,
> +					     se->num_siblings);
> +	if (IS_ERR(dst))
> +		return dst;
> +
> +	return dst;
> +}
> +
>   void intel_execlists_show_requests(struct intel_engine_cs *engine,
>   				   struct drm_printer *m,
>   				   void (*show_request)(struct drm_printer *m,
> @@ -2810,6 +3428,29 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>   		show_request(m, last, "\t\tQ ");
>   	}
>   
> +	last = NULL;
> +	count = 0;
> +	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
> +		struct virtual_engine *ve =
> +			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> +		struct i915_request *rq = READ_ONCE(ve->request);
> +
> +		if (rq) {
> +			if (count++ < max - 1)
> +				show_request(m, rq, "\t\tV ");
> +			else
> +				last = rq;
> +		}
> +	}
> +	if (last) {
> +		if (count > max) {
> +			drm_printf(m,
> +				   "\t\t...skipping %d virtual requests...\n",
> +				   count - max);
> +		}
> +		show_request(m, last, "\t\tV ");
> +	}
> +
>   	spin_unlock_irqrestore(&engine->timeline.lock, flags);
>   }
>   
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.h b/drivers/gpu/drm/i915/gt/intel_lrc.h
> index a0dc907a7249..5530606052e5 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.h
> @@ -114,4 +114,13 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>   							const char *prefix),
>   				   unsigned int max);
>   
> +struct intel_context *
> +intel_execlists_create_virtual(struct i915_gem_context *ctx,
> +			       struct intel_engine_cs **siblings,
> +			       unsigned int count);
> +
> +struct intel_context *
> +intel_execlists_clone_virtual(struct i915_gem_context *ctx,
> +			      struct intel_engine_cs *src);
> +
>   #endif /* _INTEL_LRC_H_ */
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 5b3d8e33f1cf..ccc0b6350123 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -1310,6 +1310,185 @@ static int live_preempt_smoke(void *arg)
>   	return err;
>   }
>   
> +static int nop_virtual_engine(struct drm_i915_private *i915,
> +			      struct intel_engine_cs **siblings,
> +			      unsigned int nsibling,
> +			      unsigned int nctx,
> +			      unsigned int flags)
> +#define CHAIN BIT(0)
> +{
> +	IGT_TIMEOUT(end_time);
> +	struct i915_request *request[16];
> +	struct i915_gem_context *ctx[16];
> +	struct intel_context *ve[16];
> +	unsigned long n, prime, nc;
> +	struct igt_live_test t;
> +	ktime_t times[2] = {};
> +	int err;
> +
> +	GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ctx));
> +
> +	for (n = 0; n < nctx; n++) {
> +		ctx[n] = kernel_context(i915);
> +		if (!ctx[n]) {
> +			err = -ENOMEM;
> +			nctx = n;
> +			goto out;
> +		}
> +
> +		ve[n] = intel_execlists_create_virtual(ctx[n],
> +						       siblings, nsibling);
> +		if (IS_ERR(ve[n])) {
> +			kernel_context_close(ctx[n]);
> +			err = PTR_ERR(ve[n]);
> +			nctx = n;
> +			goto out;
> +		}
> +
> +		err = intel_context_pin(ve[n]);
> +		if (err) {
> +			intel_context_put(ve[n]);
> +			kernel_context_close(ctx[n]);
> +			nctx = n;
> +			goto out;
> +		}
> +	}
> +
> +	err = igt_live_test_begin(&t, i915, __func__, ve[0]->engine->name);
> +	if (err)
> +		goto out;
> +
> +	for_each_prime_number_from(prime, 1, 8192) {
> +		times[1] = ktime_get_raw();
> +
> +		if (flags & CHAIN) {
> +			for (nc = 0; nc < nctx; nc++) {
> +				for (n = 0; n < prime; n++) {
> +					request[nc] =
> +						i915_request_create(ve[nc]);
> +					if (IS_ERR(request[nc])) {
> +						err = PTR_ERR(request[nc]);
> +						goto out;
> +					}
> +
> +					i915_request_add(request[nc]);
> +				}
> +			}
> +		} else {
> +			for (n = 0; n < prime; n++) {
> +				for (nc = 0; nc < nctx; nc++) {
> +					request[nc] =
> +						i915_request_create(ve[nc]);
> +					if (IS_ERR(request[nc])) {
> +						err = PTR_ERR(request[nc]);
> +						goto out;
> +					}
> +
> +					i915_request_add(request[nc]);
> +				}
> +			}
> +		}
> +
> +		for (nc = 0; nc < nctx; nc++) {
> +			if (i915_request_wait(request[nc],
> +					      I915_WAIT_LOCKED,
> +					      HZ / 10) < 0) {
> +				pr_err("%s(%s): wait for %llx:%lld timed out\n",
> +				       __func__, ve[0]->engine->name,
> +				       request[nc]->fence.context,
> +				       request[nc]->fence.seqno);
> +
> +				GEM_TRACE("%s(%s) failed at request %llx:%lld\n",
> +					  __func__, ve[0]->engine->name,
> +					  request[nc]->fence.context,
> +					  request[nc]->fence.seqno);
> +				GEM_TRACE_DUMP();
> +				i915_gem_set_wedged(i915);
> +				break;
> +			}
> +		}
> +
> +		times[1] = ktime_sub(ktime_get_raw(), times[1]);
> +		if (prime == 1)
> +			times[0] = times[1];
> +
> +		if (__igt_timeout(end_time, NULL))
> +			break;
> +	}
> +
> +	err = igt_live_test_end(&t);
> +	if (err)
> +		goto out;
> +
> +	pr_info("Requestx%d latencies on %s: 1 = %lluns, %lu = %lluns\n",
> +		nctx, ve[0]->engine->name, ktime_to_ns(times[0]),
> +		prime, div64_u64(ktime_to_ns(times[1]), prime));
> +
> +out:
> +	if (igt_flush_test(i915, I915_WAIT_LOCKED))
> +		err = -EIO;
> +
> +	for (nc = 0; nc < nctx; nc++) {
> +		intel_context_unpin(ve[nc]);
> +		intel_context_put(ve[nc]);
> +		kernel_context_close(ctx[nc]);
> +	}
> +	return err;
> +}
> +
> +static int live_virtual_engine(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	unsigned int class, inst;
> +	int err = -ENODEV;
> +
> +	if (USES_GUC_SUBMISSION(i915))
> +		return 0;
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +
> +	for_each_engine(engine, i915, id) {
> +		err = nop_virtual_engine(i915, &engine, 1, 1, 0);
> +		if (err) {
> +			pr_err("Failed to wrap engine %s: err=%d\n",
> +			       engine->name, err);
> +			goto out_unlock;
> +		}
> +	}
> +
> +	for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
> +		int nsibling, n;
> +
> +		nsibling = 0;
> +		for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
> +			if (!i915->engine_class[class][inst])
> +				break;

In a previous review I said I think this should be continue instead of 
break so vcs0 + vcs2 SKUs can also be tested.

> +
> +			siblings[nsibling++] = i915->engine_class[class][inst];
> +		}
> +		if (nsibling < 2)
> +			continue;

And also that a single-engine VE could be tested just as well, unless I 
am missing something.

That's all remaining feedback I have on this patch.

Regards,

Tvrtko

> +
> +		for (n = 1; n <= nsibling + 1; n++) {
> +			err = nop_virtual_engine(i915, siblings, nsibling,
> +						 n, 0);
> +			if (err)
> +				goto out_unlock;
> +		}
> +
> +		err = nop_virtual_engine(i915, siblings, nsibling, n, CHAIN);
> +		if (err)
> +			goto out_unlock;
> +	}
> +
> +out_unlock:
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	return err;
> +}
> +
>   int intel_execlists_live_selftests(struct drm_i915_private *i915)
>   {
>   	static const struct i915_subtest tests[] = {
> @@ -1322,6 +1501,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
>   		SUBTEST(live_chain_preempt),
>   		SUBTEST(live_preempt_hang),
>   		SUBTEST(live_preempt_smoke),
> +		SUBTEST(live_virtual_engine),
>   	};
>   
>   	if (!HAS_EXECLISTS(i915))
> diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
> index 67f8a4a807a0..fe82d3571072 100644
> --- a/drivers/gpu/drm/i915/i915_gem.h
> +++ b/drivers/gpu/drm/i915/i915_gem.h
> @@ -91,4 +91,9 @@ static inline bool __tasklet_enable(struct tasklet_struct *t)
>   	return atomic_dec_and_test(&t->count);
>   }
>   
> +static inline bool __tasklet_is_scheduled(struct tasklet_struct *t)
> +{
> +	return test_bit(TASKLET_STATE_SCHED, &t->state);
> +}
> +
>   #endif /* __I915_GEM_H__ */
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 040c0b770c21..3ae1d27130a8 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -86,6 +86,7 @@
>    */
>   
>   #include <linux/log2.h>
> +#include <linux/nospec.h>
>   
>   #include <drm/i915_drm.h>
>   
> @@ -1218,7 +1219,6 @@ __intel_context_reconfigure_sseu(struct intel_context *ce,
>   	int ret;
>   
>   	GEM_BUG_ON(INTEL_GEN(ce->gem_context->i915) < 8);
> -	GEM_BUG_ON(ce->engine->id != RCS0);
>   
>   	ret = intel_context_lock_pinned(ce);
>   	if (ret)
> @@ -1412,7 +1412,100 @@ struct set_engines {
>   	struct i915_gem_engines *engines;
>   };
>   
> +static int
> +set_engines__load_balance(struct i915_user_extension __user *base, void *data)
> +{
> +	struct i915_context_engines_load_balance __user *ext =
> +		container_of_user(base, typeof(*ext), base);
> +	const struct set_engines *set = data;
> +	struct intel_engine_cs *stack[16];
> +	struct intel_engine_cs **siblings;
> +	struct intel_context *ce;
> +	u16 num_siblings, idx;
> +	unsigned int n;
> +	int err;
> +
> +	if (!HAS_EXECLISTS(set->ctx->i915))
> +		return -ENODEV;
> +
> +	if (USES_GUC_SUBMISSION(set->ctx->i915))
> +		return -ENODEV; /* not implement yet */
> +
> +	if (get_user(idx, &ext->engine_index))
> +		return -EFAULT;
> +
> +	if (idx >= set->engines->num_engines) {
> +		DRM_DEBUG("Invalid placement value, %d >= %d\n",
> +			  idx, set->engines->num_engines);
> +		return -EINVAL;
> +	}
> +
> +	idx = array_index_nospec(idx, set->engines->num_engines);
> +	if (set->engines->engines[idx]) {
> +		DRM_DEBUG("Invalid placement[%d], already occupied\n", idx);
> +		return -EEXIST;
> +	}
> +
> +	if (get_user(num_siblings, &ext->num_siblings))
> +		return -EFAULT;
> +
> +	err = check_user_mbz(&ext->flags);
> +	if (err)
> +		return err;
> +
> +	err = check_user_mbz(&ext->mbz64);
> +	if (err)
> +		return err;
> +
> +	siblings = stack;
> +	if (num_siblings > ARRAY_SIZE(stack)) {
> +		siblings = kmalloc_array(num_siblings,
> +					 sizeof(*siblings),
> +					 GFP_KERNEL);
> +		if (!siblings)
> +			return -ENOMEM;
> +	}
> +
> +	for (n = 0; n < num_siblings; n++) {
> +		struct i915_engine_class_instance ci;
> +
> +		if (copy_from_user(&ci, &ext->engines[n], sizeof(ci))) {
> +			err = -EFAULT;
> +			goto out_siblings;
> +		}
> +
> +		siblings[n] = intel_engine_lookup_user(set->ctx->i915,
> +						       ci.engine_class,
> +						       ci.engine_instance);
> +		if (!siblings[n]) {
> +			DRM_DEBUG("Invalid sibling[%d]: { class:%d, inst:%d }\n",
> +				  n, ci.engine_class, ci.engine_instance);
> +			err = -EINVAL;
> +			goto out_siblings;
> +		}
> +	}
> +
> +	ce = intel_execlists_create_virtual(set->ctx, siblings, n);
> +	if (IS_ERR(ce)) {
> +		err = PTR_ERR(ce);
> +		goto out_siblings;
> +	}
> +
> +	if (cmpxchg(&set->engines->engines[idx], NULL, ce)) {
> +		intel_context_put(ce);
> +		err = -EEXIST;
> +		goto out_siblings;
> +	}
> +
> +out_siblings:
> +	if (siblings != stack)
> +		kfree(siblings);
> +
> +	return err;
> +}
> +
>   static const i915_user_extension_fn set_engines__extensions[] = {
> +	[I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance,
>   };
>   
>   static int
> @@ -1711,14 +1804,29 @@ static int clone_engines(struct i915_gem_context *dst,
>   
>   	clone->i915 = dst->i915;
>   	for (n = 0; n < e->num_engines; n++) {
> +		struct intel_engine_cs *engine;
> +
>   		if (!e->engines[n]) {
>   			clone->engines[n] = NULL;
>   			continue;
>   		}
> +		engine = e->engines[n]->engine;
>   
> -		clone->engines[n] =
> -			intel_context_create(dst, e->engines[n]->engine);
> -		if (!clone->engines[n]) {
> +		/*
> +		 * Virtual engines are singletons; they can only exist
> +		 * inside a single context, because they embed their
> +		 * HW context... As each virtual context implies a single
> +		 * timeline (each engine can only dequeue a single request
> +		 * at any time), it would be surprising for two contexts
> +		 * to use the same engine. So let's create a copy of
> +		 * the virtual engine instead.
> +		 */
> +		if (intel_engine_is_virtual(engine))
> +			clone->engines[n] =
> +				intel_execlists_clone_virtual(dst, engine);
> +		else
> +			clone->engines[n] = intel_context_create(dst, engine);
> +		if (IS_ERR_OR_NULL(clone->engines[n])) {
>   			__free_engines(clone, n);
>   			goto err_unlock;
>   		}
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index d215dcdf0b1e..78ceb56d7801 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -150,17 +150,26 @@ sched_lock_engine(const struct i915_sched_node *node,
>   		  struct intel_engine_cs *locked,
>   		  struct sched_cache *cache)
>   {
> -	struct intel_engine_cs *engine = node_to_request(node)->engine;
> +	const struct i915_request *rq = node_to_request(node);
> +	struct intel_engine_cs *engine;
>   
>   	GEM_BUG_ON(!locked);
>   
> -	if (engine != locked) {
> +	/*
> +	 * Virtual engines complicate acquiring the engine timeline lock,
> +	 * as their rq->engine pointer is not stable until under that
> +	 * engine lock. The simple ploy we use is to take the lock then
> +	 * check that the rq still belongs to the newly locked engine.
> +	 */
> +	while (locked != (engine = READ_ONCE(rq->engine))) {
>   		spin_unlock(&locked->timeline.lock);
>   		memset(cache, 0, sizeof(*cache));
>   		spin_lock(&engine->timeline.lock);
> +		locked = engine;
>   	}
>   
> -	return engine;
> +	GEM_BUG_ON(locked != engine);
> +	return locked;
>   }
>   
>   static inline int rq_prio(const struct i915_request *rq)
> @@ -272,6 +281,7 @@ static void __i915_schedule(struct i915_sched_node *node,
>   	spin_lock(&engine->timeline.lock);
>   
>   	/* Fifo and depth-first replacement ensure our deps execute before us */
> +	engine = sched_lock_engine(node, engine, &cache);
>   	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
>   		INIT_LIST_HEAD(&dep->dfs_link);
>   
> @@ -283,8 +293,11 @@ static void __i915_schedule(struct i915_sched_node *node,
>   		if (prio <= node->attr.priority || node_signaled(node))
>   			continue;
>   
> +		GEM_BUG_ON(node_to_request(node)->engine != engine);
> +
>   		node->attr.priority = prio;
>   		if (!list_empty(&node->link)) {
> +			GEM_BUG_ON(intel_engine_is_virtual(engine));
>   			if (!cache.priolist)
>   				cache.priolist =
>   					i915_sched_lookup_priolist(engine,
> diff --git a/drivers/gpu/drm/i915/i915_timeline_types.h b/drivers/gpu/drm/i915/i915_timeline_types.h
> index 5256a0b5c5f7..1688705f4a2b 100644
> --- a/drivers/gpu/drm/i915/i915_timeline_types.h
> +++ b/drivers/gpu/drm/i915/i915_timeline_types.h
> @@ -26,6 +26,7 @@ struct i915_timeline {
>   	spinlock_t lock;
>   #define TIMELINE_CLIENT 0 /* default subclass */
>   #define TIMELINE_ENGINE 1
> +#define TIMELINE_VIRTUAL 2
>   	struct mutex mutex; /* protects the flow of requests */
>   
>   	unsigned int pin_count;
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 62396d575e28..f9770948161c 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -137,6 +137,7 @@ struct i915_engine_class_instance {
>   	__u16 engine_class; /* see enum drm_i915_gem_engine_class */
>   	__u16 engine_instance;
>   #define I915_ENGINE_CLASS_INVALID_NONE -1
> +#define I915_ENGINE_CLASS_INVALID_VIRTUAL -2
>   };
>   
>   /**
> @@ -1608,8 +1609,46 @@ struct drm_i915_gem_context_param_sseu {
>   	__u32 rsvd;
>   };
>   
> +/*
> + * i915_context_engines_load_balance:
> + *
> + * Enable load balancing across this set of engines.
> + *
> + * Into the I915_EXEC_DEFAULT slot [0], a virtual engine is created that when
> + * used will proxy the execbuffer request onto one of the set of engines
> + * in such a way as to distribute the load evenly across the set.
> + *
> + * The set of engines must be compatible (e.g. the same HW class) as they
> + * will share the same logical GPU context and ring.
> + *
> + * To intermix rendering with the virtual engine and direct rendering onto
> + * the backing engines (bypassing the load balancing proxy), the context must
> + * be defined to use a single timeline for all engines.
> + */
> +struct i915_context_engines_load_balance {
> +	struct i915_user_extension base;
> +
> +	__u16 engine_index;
> +	__u16 num_siblings;
> +	__u32 flags; /* all undefined flags must be zero */
> +
> +	__u64 mbz64; /* reserved for future use; must be zero */
> +
> +	struct i915_engine_class_instance engines[0];
> +} __attribute__((packed));
> +
> +#define I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(name__, N__) struct { \
> +	struct i915_user_extension base; \
> +	__u16 engine_index; \
> +	__u16 num_siblings; \
> +	__u32 flags; \
> +	__u64 mbz64; \
> +	struct i915_engine_class_instance engines[N__]; \
> +} __attribute__((packed)) name__
> +
>   struct i915_context_param_engines {
>   	__u64 extensions; /* linked chain of extension blocks, 0 terminates */
> +#define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */
>   	struct i915_engine_class_instance engines[0];
>   } __attribute__((packed));
>   
>