[Intel-gfx] [PATCH 15/18] drm/i915: Load balancing across a virtual engine
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed Mar 20 15:59:14 UTC 2019
On 19/03/2019 11:57, Chris Wilson wrote:
> Having allowed the user to define a set of engines that they will want
> to only use, we go one step further and allow them to bind those engines
> into a single virtual instance. Submitting a batch to the virtual engine
> will then forward it to any one of the set in a manner that best
> distributes load. The virtual engine has a single timeline across all
> engines (it operates as a single queue), so it is not able to concurrently
> run batches across multiple engines by itself; that is left up to the user
> to submit multiple concurrent batches to multiple queues. Multiple users
> will be load balanced across the system.
>
> The mechanism used for load balancing in this patch is a late greedy
> balancer. When a request is ready for execution, it is added to each
> engine's queue, and when an engine is ready for its next request it
> claims it from the virtual engine. The first engine to do so, wins, i.e.
> the request is executed at the earliest opportunity (idle moment) in the
> system.
>
> As not all HW is created equal, the user is still able to skip the
> virtual engine and execute the batch on a specific engine, all within the
> same queue. It will then be executed in order on the correct engine,
> with execution on other virtual engines being moved away due to the load
> detection.
>
> A couple of areas for potential improvement left!
>
> - The virtual engine always takes priority over equal-priority tasks.
> Mostly broken up by applying FQ_CODEL rules for prioritising new clients,
> and hopefully the virtual and real engines are not then congested (i.e.
> all work is via virtual engines, or all work is to the real engine).
>
> - We require the breadcrumb irq around every virtual engine request. For
> normal engines, we eliminate the need for the slow round trip via
> interrupt by using the submit fence and queueing in order. For virtual
> engines, we have to allow any job to transfer to a new ring, and cannot
> coalesce the submissions, so require the completion fence instead,
> forcing the persistent use of interrupts.
>
> - We only drip feed single requests through each virtual engine and onto
> the physical engines, even if there was enough work to fill all ELSP,
> leaving small stalls with an idle CS event at the end of every request.
> Could we be greedy and fill both slots? Being lazy is virtuous for load
> distribution on less-than-full workloads though.
>
> Other areas of improvement are more general, such as reducing lock
> contention, reducing dispatch overhead, looking at direct submission
> rather than bouncing around tasklets etc.
>
> sseu: Lift the restriction to allow sseu to be reconfigured on virtual
> engines composed of RENDER_CLASS (rcs).
>
> v2: macroize check_user_mbz()
> v3: Cancel virtual engines on wedging
> v4: Commence commenting
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_gem.h | 5 +
> drivers/gpu/drm/i915/i915_gem_context.c | 126 ++++-
> drivers/gpu/drm/i915/i915_scheduler.c | 18 +-
> drivers/gpu/drm/i915/i915_timeline_types.h | 1 +
> drivers/gpu/drm/i915/intel_engine_types.h | 8 +
> drivers/gpu/drm/i915/intel_lrc.c | 567 ++++++++++++++++++++-
> drivers/gpu/drm/i915/intel_lrc.h | 11 +
> drivers/gpu/drm/i915/selftests/intel_lrc.c | 165 ++++++
> include/uapi/drm/i915_drm.h | 30 ++
> 9 files changed, 912 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
> index 5c073fe73664..3ca855505715 100644
> --- a/drivers/gpu/drm/i915/i915_gem.h
> +++ b/drivers/gpu/drm/i915/i915_gem.h
> @@ -96,4 +96,9 @@ static inline bool __tasklet_enable(struct tasklet_struct *t)
> return atomic_dec_and_test(&t->count);
> }
>
> +static inline bool __tasklet_is_scheduled(struct tasklet_struct *t)
> +{
> + return test_bit(TASKLET_STATE_SCHED, &t->state);
> +}
> +
> #endif /* __I915_GEM_H__ */
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index cbd76ef95115..8d8fcc8c7a86 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -86,6 +86,7 @@
> */
>
> #include <linux/log2.h>
> +#include <linux/nospec.h>
>
> #include <drm/i915_drm.h>
>
> @@ -94,6 +95,7 @@
> #include "i915_trace.h"
> #include "i915_user_extensions.h"
> #include "intel_lrc_reg.h"
> +#include "intel_lrc.h"
> #include "intel_workarounds.h"
>
> #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
> @@ -241,6 +243,20 @@ static void release_hw_id(struct i915_gem_context *ctx)
> mutex_unlock(&i915->contexts.mutex);
> }
>
> +static void free_engines(struct intel_engine_cs **engines, int count)
> +{
> + int i;
> +
> + if (ZERO_OR_NULL_PTR(engines))
> + return;
> +
> + /* We own the veng we created; regular engines are ignored */
> + for (i = 0; i < count; i++)
> + intel_virtual_engine_destroy(engines[i]);
> +
> + kfree(engines);
> +}
> +
> static void i915_gem_context_free(struct i915_gem_context *ctx)
> {
> struct intel_context *it, *n;
> @@ -251,8 +267,7 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
>
> release_hw_id(ctx);
> i915_ppgtt_put(ctx->ppgtt);
> -
> - kfree(ctx->engines);
> + free_engines(ctx->engines, ctx->num_engines);
>
> rbtree_postorder_for_each_entry_safe(it, n, &ctx->hw_contexts, node)
> intel_context_put(it);
> @@ -1239,7 +1254,6 @@ __i915_gem_context_reconfigure_sseu(struct i915_gem_context *ctx,
> int ret = 0;
>
> GEM_BUG_ON(INTEL_GEN(ctx->i915) < 8);
> - GEM_BUG_ON(engine->id != RCS0);
>
> ce = intel_context_pin_lock(ctx, engine);
> if (IS_ERR(ce))
> @@ -1433,7 +1447,80 @@ struct set_engines {
> unsigned int num_engines;
> };
>
> +static int
> +set_engines__load_balance(struct i915_user_extension __user *base, void *data)
> +{
> + struct i915_context_engines_load_balance __user *ext =
> + container_of_user(base, typeof(*ext), base);
> + const struct set_engines *set = data;
> + struct intel_engine_cs *ve;
> + unsigned int n;
> + u64 mask;
> + u16 idx;
> + int err;
> +
> + if (!HAS_EXECLISTS(set->ctx->i915))
> + return -ENODEV;
> +
> + if (USES_GUC_SUBMISSION(set->ctx->i915))
> + return -ENODEV; /* not implemented yet */
> +
> + if (get_user(idx, &ext->engine_index))
> + return -EFAULT;
> +
> + if (idx >= set->num_engines)
> + return -EINVAL;
> +
> + idx = array_index_nospec(idx, set->num_engines);
> + if (set->engines[idx])
> + return -EEXIST;
> +
> + err = check_user_mbz(&ext->mbz16);
> + if (err)
> + return err;
> +
> + err = check_user_mbz(&ext->flags);
> + if (err)
> + return err;
> +
> + for (n = 0; n < ARRAY_SIZE(ext->mbz64); n++) {
> + err = check_user_mbz(&ext->mbz64[n]);
> + if (err)
> + return err;
> + }
> +
> + if (get_user(mask, &ext->engines_mask))
> + return -EFAULT;
> +
> + mask &= GENMASK_ULL(set->num_engines - 1, 0) & ~BIT_ULL(idx);
> + if (!mask)
> + return -EINVAL;
> +
> + if (is_power_of_2(mask)) {
> + ve = set->engines[__ffs64(mask)];
> + } else {
> + struct intel_engine_cs *stack[64];
> + int bit;
> +
> + n = 0;
> + for_each_set_bit(bit, (unsigned long *)&mask, set->num_engines)
> + stack[n++] = set->engines[bit];
> +
> + ve = intel_execlists_create_virtual(set->ctx, stack, n);
> + }
> + if (IS_ERR(ve))
> + return PTR_ERR(ve);
> +
> + if (cmpxchg(&set->engines[idx], NULL, ve)) {
> + intel_virtual_engine_destroy(ve);
> + return -EEXIST;
> + }
> +
> + return 0;
> +}
> +
> static const i915_user_extension_fn set_engines__extensions[] = {
> + [I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance,
> };
>
> static int
> @@ -1497,13 +1584,13 @@ set_engines(struct i915_gem_context *ctx,
> ARRAY_SIZE(set_engines__extensions),
> &set);
> if (err) {
> - kfree(set.engines);
> + free_engines(set.engines, set.num_engines);
> return err;
> }
>
> out:
> mutex_lock(&ctx->i915->drm.struct_mutex);
> - kfree(ctx->engines);
> + free_engines(ctx->engines, ctx->num_engines);
> ctx->engines = set.engines;
> ctx->num_engines = set.num_engines;
> mutex_unlock(&ctx->i915->drm.struct_mutex);
> @@ -1692,7 +1779,7 @@ static int clone_engines(struct i915_gem_context *dst,
> struct i915_gem_context *src)
> {
> struct intel_engine_cs **engines;
> - unsigned int num_engines;
> + unsigned int num_engines, i;
>
> mutex_lock(&src->i915->drm.struct_mutex); /* serialise src->engine[] */
>
> @@ -1707,11 +1794,36 @@ static int clone_engines(struct i915_gem_context *dst,
> mutex_unlock(&src->i915->drm.struct_mutex);
> return -ENOMEM;
> }
> +
> + /*
> + * Virtual engines are singletons; they can only exist
> + * inside a single context, because they embed their
> + * HW context... As each virtual context implies a single
> + * timeline (each engine can only dequeue a single request
> + * at any time), it would be surprising for two contexts
> + * to use the same engine. So let's create a copy of
> + * the virtual engine instead.
> + */
> + for (i = 0; i < num_engines; i++) {
> + struct intel_engine_cs *engine = engines[i];
> +
> + if (!engine || !intel_engine_is_virtual(engine))
> + continue;
> +
> + engine = intel_execlists_clone_virtual(dst, engine);
> + if (IS_ERR(engine)) {
> + free_engines(engines, i);
> + mutex_unlock(&src->i915->drm.struct_mutex);
> + return PTR_ERR(engine);
> + }
> +
> + engines[i] = engine;
> + }
> }
>
> mutex_unlock(&src->i915->drm.struct_mutex);
>
> - kfree(dst->engines);
> + free_engines(dst->engines, dst->num_engines);
> dst->engines = engines;
> dst->num_engines = num_engines;
> return 0;
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index e0f609d01564..8cff4f6d6158 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -247,17 +247,26 @@ sched_lock_engine(const struct i915_sched_node *node,
> struct intel_engine_cs *locked,
> struct sched_cache *cache)
> {
> - struct intel_engine_cs *engine = node_to_request(node)->engine;
> + const struct i915_request *rq = node_to_request(node);
> + struct intel_engine_cs *engine;
>
> GEM_BUG_ON(!locked);
>
> - if (engine != locked) {
> + /*
> + * Virtual engines complicate acquiring the engine timeline lock,
> + * as their rq->engine pointer is not stable until under that
> + * engine lock. The simple ploy we use is to take the lock then
> + * check that the rq still belongs to the newly locked engine.
> + */
> + while (locked != (engine = READ_ONCE(rq->engine))) {
> spin_unlock(&locked->timeline.lock);
> memset(cache, 0, sizeof(*cache));
> spin_lock(&engine->timeline.lock);
> + locked = engine;
> }
>
> - return engine;
> + GEM_BUG_ON(locked != engine);
> + return locked;
> }
>
> static bool inflight(const struct i915_request *rq,
> @@ -370,8 +379,11 @@ static void __i915_schedule(struct i915_request *rq,
> if (prio <= node->attr.priority || node_signaled(node))
> continue;
>
> + GEM_BUG_ON(node_to_request(node)->engine != engine);
> +
> node->attr.priority = prio;
> if (!list_empty(&node->link)) {
> + GEM_BUG_ON(intel_engine_is_virtual(engine));
> if (!cache.priolist)
> cache.priolist =
> i915_sched_lookup_priolist(engine,
> diff --git a/drivers/gpu/drm/i915/i915_timeline_types.h b/drivers/gpu/drm/i915/i915_timeline_types.h
> index 1f5b55d9ffb5..57c79830bb8c 100644
> --- a/drivers/gpu/drm/i915/i915_timeline_types.h
> +++ b/drivers/gpu/drm/i915/i915_timeline_types.h
> @@ -26,6 +26,7 @@ struct i915_timeline {
> spinlock_t lock;
> #define TIMELINE_CLIENT 0 /* default subclass */
> #define TIMELINE_ENGINE 1
> +#define TIMELINE_VIRTUAL 2
> struct mutex mutex; /* protects the flow of requests */
>
> unsigned int pin_count;
> diff --git a/drivers/gpu/drm/i915/intel_engine_types.h b/drivers/gpu/drm/i915/intel_engine_types.h
> index cef1e71a7401..4d526767c371 100644
> --- a/drivers/gpu/drm/i915/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/intel_engine_types.h
> @@ -224,6 +224,7 @@ struct intel_engine_execlists {
> * @queue: queue of requests, in priority lists
> */
> struct rb_root_cached queue;
> + struct rb_root_cached virtual;
>
> /**
> * @csb_write: control register for Context Switch buffer
> @@ -429,6 +430,7 @@ struct intel_engine_cs {
> #define I915_ENGINE_SUPPORTS_STATS BIT(1)
> #define I915_ENGINE_HAS_PREEMPTION BIT(2)
> #define I915_ENGINE_HAS_SEMAPHORES BIT(3)
> +#define I915_ENGINE_IS_VIRTUAL BIT(4)
> unsigned int flags;
>
> /*
> @@ -512,6 +514,12 @@ intel_engine_has_semaphores(const struct intel_engine_cs *engine)
> return engine->flags & I915_ENGINE_HAS_SEMAPHORES;
> }
>
> +static inline bool
> +intel_engine_is_virtual(const struct intel_engine_cs *engine)
> +{
> + return engine->flags & I915_ENGINE_IS_VIRTUAL;
> +}
> +
> #define instdone_slice_mask(dev_priv__) \
> (IS_GEN(dev_priv__, 7) ? \
> 1 : RUNTIME_INFO(dev_priv__)->sseu.slice_mask)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 0e64317ddcbf..a7f0235f19c5 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -166,6 +166,41 @@
>
> #define ACTIVE_PRIORITY (I915_PRIORITY_NEWCLIENT | I915_PRIORITY_NOSEMAPHORE)
>
> +struct virtual_engine {
> + struct intel_engine_cs base;
> + struct intel_context context;
> +
> + /*
> + * We allow only a single request through the virtual engine at a time
> + * (each request in the timeline waits for the completion fence of
> + * the previous before being submitted). By restricting ourselves to
> + * only submitting a single request, each request is placed on to a
> + * physical engine to maximise load spreading (by virtue of the late greedy
> + * scheduling -- each real engine takes the next available request
> + * upon idling).
> + */
> + struct i915_request *request;
> +
> + /*
> + * We keep a rbtree of available virtual engines inside each physical
> + * engine, sorted by priority. Here we preallocate the nodes we need
> + * for the virtual engine, indexed by physical_engine->id.
> + */
> + struct ve_node {
> + struct rb_node rb;
> + int prio;
> + } nodes[I915_NUM_ENGINES];
> +
> + /* And finally, which physical engines this virtual engine maps onto. */
> + unsigned int count;
> + struct intel_engine_cs *siblings[0];
> +};
> +
> +static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
> +{
> + return container_of(engine, struct virtual_engine, base);
> +}
> +
> static int execlists_context_deferred_alloc(struct intel_context *ce,
> struct intel_engine_cs *engine);
> static void execlists_init_reg_state(u32 *reg_state,
> @@ -229,7 +264,8 @@ static int queue_prio(const struct intel_engine_execlists *execlists)
> }
>
> static inline bool need_preempt(const struct intel_engine_cs *engine,
> - const struct i915_request *rq)
> + const struct i915_request *rq,
> + struct rb_node *rb)
> {
> int last_prio;
>
> @@ -264,6 +300,22 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
> rq_prio(list_next_entry(rq, link)) > last_prio)
> return true;
>
> + if (rb) { /* XXX virtual precedence */
> + struct virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + bool preempt = false;
> +
> + if (engine == ve->siblings[0]) { /* only preempt one sibling */
> + spin_lock(&ve->base.timeline.lock);
> + if (ve->request)
> + preempt = rq_prio(ve->request) > last_prio;
> + spin_unlock(&ve->base.timeline.lock);
> + }
> +
> + if (preempt)
> + return preempt;
> + }
> +
> /*
> * If the inflight context did not trigger the preemption, then maybe
> * it was the set of queued requests? Pick the highest priority in
> @@ -382,6 +434,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
> list_for_each_entry_safe_reverse(rq, rn,
> &engine->timeline.requests,
> link) {
> + struct intel_engine_cs *owner;
> +
> if (i915_request_completed(rq))
> break;
>
> @@ -390,14 +444,30 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
>
> GEM_BUG_ON(rq->hw_context->active);
>
> - GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> - if (rq_prio(rq) != prio) {
> - prio = rq_prio(rq);
> - pl = i915_sched_lookup_priolist(engine, prio);
> - }
> - GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
> + /*
> + * Push the request back into the queue for later resubmission.
> + * If this request is not native to this physical engine (i.e.
> + * it came from a virtual source), push it back onto the virtual
> + * engine so that it can be moved across onto another physical
> + * engine as load dictates.
> + */
> + owner = rq->hw_context->engine;
> + if (likely(owner == engine)) {
> + GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> + if (rq_prio(rq) != prio) {
> + prio = rq_prio(rq);
> + pl = i915_sched_lookup_priolist(engine, prio);
> + }
> + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
>
> - list_add(&rq->sched.link, pl);
> + list_add(&rq->sched.link, pl);
> + } else {
> + if (__i915_request_has_started(rq))
> + rq->sched.attr.priority |= ACTIVE_PRIORITY;
> +
> + rq->engine = owner;
> + owner->submit_request(rq);
> + }
>
> active = rq;
> }
> @@ -659,6 +729,50 @@ static void complete_preempt_context(struct intel_engine_execlists *execlists)
> execlists));
> }
>
> +static void virtual_update_register_offsets(u32 *regs,
> + struct intel_engine_cs *engine)
> +{
> + u32 base = engine->mmio_base;
> +
> + regs[CTX_CONTEXT_CONTROL] =
> + i915_mmio_reg_offset(RING_CONTEXT_CONTROL(engine));
> + regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base));
> + regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base));
> + regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base));
> + regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base));
> +
> + regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base));
> + regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base));
> + regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base));
> + regs[CTX_SECOND_BB_HEAD_U] =
> + i915_mmio_reg_offset(RING_SBBADDR_UDW(base));
> + regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base));
> + regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base));
> +
> + regs[CTX_CTX_TIMESTAMP] =
> + i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base));
> + regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 3));
> + regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 3));
> + regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 2));
> + regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 2));
> + regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 1));
> + regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 1));
> + regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 0));
> + regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 0));
> +
> + if (engine->class == RENDER_CLASS) {
> + regs[CTX_RCS_INDIRECT_CTX] =
> + i915_mmio_reg_offset(RING_INDIRECT_CTX(base));
> + regs[CTX_RCS_INDIRECT_CTX_OFFSET] =
> + i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base));
> + regs[CTX_BB_PER_CTX_PTR] =
> + i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base));
> +
> + regs[CTX_R_PWR_CLK_STATE] =
> + i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
> + }
> +}
> +
> static void execlists_dequeue(struct intel_engine_cs *engine)
> {
> struct intel_engine_execlists * const execlists = &engine->execlists;
> @@ -691,6 +805,37 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> * and context switches) submission.
> */
>
> + for (rb = rb_first_cached(&execlists->virtual); rb; ) {
> + struct virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + struct i915_request *rq = READ_ONCE(ve->request);
> + struct intel_engine_cs *active;
> +
> + if (!rq) { /* lazily cleanup after another engine handled rq */
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> + rb = rb_first_cached(&execlists->virtual);
> + continue;
> + }
> +
> + /*
> + * We track when the HW has completed saving the context image
> + * (i.e. when we have seen the final CS event switching out of
> + * the context) and must not overwrite the context image before
> + * then. This restricts us to only using the active engine
> + * while the previous virtualized request is inflight (so
> + * we reuse the register offsets). This is a very small
> + * hysteresis on the greedy selection algorithm.
> + */
> + active = READ_ONCE(ve->context.active);
> + if (active && active != engine) {
> + rb = rb_next(rb);
> + continue;
> + }
> +
> + break;
> + }
> +
> if (last) {
> /*
> * Don't resubmit or switch until all outstanding
> @@ -712,7 +857,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
> return;
>
> - if (need_preempt(engine, last)) {
> + if (need_preempt(engine, last, rb)) {
> inject_preempt_context(engine);
> return;
> }
> @@ -752,6 +897,72 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> last->tail = last->wa_tail;
> }
>
> + while (rb) { /* XXX virtual is always taking precedence */
> + struct virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + struct i915_request *rq;
> +
> + spin_lock(&ve->base.timeline.lock);
> +
> + rq = ve->request;
> + if (unlikely(!rq)) { /* lost the race to a sibling */
> + spin_unlock(&ve->base.timeline.lock);
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> + rb = rb_first_cached(&execlists->virtual);
> + continue;
> + }
> +
> + if (rq_prio(rq) >= queue_prio(execlists)) {
> + if (last && !can_merge_rq(last, rq)) {
> + spin_unlock(&ve->base.timeline.lock);
> + return; /* leave this rq for another engine */
Should this engine attempt to dequeue something else instead? Something
from the non-virtual queue, for instance, could perhaps be
submitted/appended.
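For illustration, roughly what I had in mind (untested sketch, and I have
not thought through the priority implications) would be to break out of
the virtual rb loop here instead of returning, so we fall through to the
normal ->queue dequeue below:

	if (last && !can_merge_rq(last, rq)) {
		spin_unlock(&ve->base.timeline.lock);
		break; /* leave the veng rq for a sibling, keep filling from ->queue */
	}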
> + }
> +
> + GEM_BUG_ON(rq->engine != &ve->base);
> + ve->request = NULL;
> + ve->base.execlists.queue_priority_hint = INT_MIN;
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> +
> + GEM_BUG_ON(rq->hw_context != &ve->context);
> + rq->engine = engine;
> +
> + if (engine != ve->siblings[0]) {
> + u32 *regs = ve->context.lrc_reg_state;
> + unsigned int n;
> +
> + GEM_BUG_ON(READ_ONCE(ve->context.active));
> + virtual_update_register_offsets(regs, engine);
> +
> + /*
> + * Move the bound engine to the top of the list
> + * for future execution. We then kick this
> + * tasklet first before checking others, so that
> + * we preferentially reuse this set of bound
> + * registers.
> + */
> + for (n = 1; n < ve->count; n++) {
> + if (ve->siblings[n] == engine) {
> + swap(ve->siblings[n],
> + ve->siblings[0]);
> + break;
> + }
> + }
> +
> + GEM_BUG_ON(ve->siblings[0] != engine);
> + }
> +
> + __i915_request_submit(rq);
> + trace_i915_request_in(rq, port_index(port, execlists));
> + submit = true;
> + last = rq;
> + }
> +
> + spin_unlock(&ve->base.timeline.lock);
> + break;
> + }
> +
> while ((rb = rb_first_cached(&execlists->queue))) {
> struct i915_priolist *p = to_priolist(rb);
> struct i915_request *rq, *rn;
> @@ -971,6 +1182,24 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
> i915_priolist_free(p);
> }
>
> + /* Cancel all attached virtual engines */
> + while ((rb = rb_first_cached(&execlists->virtual))) {
> + struct virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> +
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> +
> + spin_lock(&ve->base.timeline.lock);
> + if (ve->request) {
> + __i915_request_submit(ve->request);
> + dma_fence_set_error(&ve->request->fence, -EIO);
> + i915_request_mark_complete(ve->request);
> + ve->request = NULL;
> + }
> + spin_unlock(&ve->base.timeline.lock);
> + }
> +
> /* Remaining _unready_ requests will be nop'ed when submitted */
>
> execlists->queue_priority_hint = INT_MIN;
> @@ -2897,6 +3126,303 @@ void intel_lr_context_resume(struct drm_i915_private *i915)
> }
> }
>
> +static void virtual_context_destroy(struct kref *kref)
> +{
> + struct virtual_engine *ve =
> + container_of(kref, typeof(*ve), context.ref);
> + unsigned int n;
> +
> + GEM_BUG_ON(ve->request);
> + GEM_BUG_ON(ve->context.active);
> +
> + for (n = 0; n < ve->count; n++) {
> + struct intel_engine_cs *sibling = ve->siblings[n];
> + struct rb_node *node = &ve->nodes[sibling->id].rb;
> +
> + if (RB_EMPTY_NODE(node))
> + continue;
> +
> + spin_lock_irq(&sibling->timeline.lock);
> +
> + if (!RB_EMPTY_NODE(node))
> + rb_erase_cached(node, &sibling->execlists.virtual);
> +
> + spin_unlock_irq(&sibling->timeline.lock);
> + }
> + GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
> +
> + if (ve->context.state)
> + __execlists_context_fini(&ve->context);
> +
> + i915_timeline_fini(&ve->base.timeline);
> + kfree(ve);
> +}
> +
> +static void virtual_engine_initial_hint(struct virtual_engine *ve)
> +{
> + int swp;
> +
> + /*
> + * Pick a random sibling on starting to help spread the load around.
> + *
> + * New contexts are typically created with exactly the same order
> + * of siblings, and often started in batches. Due to the way we iterate
> + * the array of siblings when submitting requests, sibling[0] is
> + * prioritised for dequeuing. If we make sure that sibling[0] is fairly
> + * randomised across the system, we also help spread the load by the
> + * first engine we inspect being different each time.
> + *
> + * NB This does not force us to execute on this engine, it will just
> + * typically be the first we inspect for submission.
> + */
> + swp = prandom_u32_max(ve->count);
> + if (!swp)
> + return;
> +
> + swap(ve->siblings[swp], ve->siblings[0]);
> + virtual_update_register_offsets(ve->context.lrc_reg_state,
> + ve->siblings[0]);
> +}
> +
> +static int virtual_context_pin(struct intel_context *ce)
> +{
> + struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
> + int err;
> +
> + /* Note: we must use a real engine class for setting up reg state */
> + err = __execlists_context_pin(ce, ve->siblings[0]);
> + if (err)
> + return err;
> +
> + virtual_engine_initial_hint(ve);
> + return 0;
> +}
> +
> +static const struct intel_context_ops virtual_context_ops = {
> + .pin = virtual_context_pin,
> + .unpin = execlists_context_unpin,
> +
> + .destroy = virtual_context_destroy,
> +};
> +
> +static void virtual_submission_tasklet(unsigned long data)
> +{
> + struct virtual_engine * const ve = (struct virtual_engine *)data;
> + unsigned int n;
> + int prio;
> +
> + prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
> + if (prio == INT_MIN)
> + return;
When does it hit this return?
> +
> + local_irq_disable();
> + for (n = 0; READ_ONCE(ve->request) && n < ve->count; n++) {
> + struct intel_engine_cs *sibling = ve->siblings[n];
> + struct ve_node * const node = &ve->nodes[sibling->id];
> + struct rb_node **parent, *rb;
> + bool first;
> +
> + spin_lock(&sibling->timeline.lock);
> +
> + if (!RB_EMPTY_NODE(&node->rb)) {
> + /*
> + * Cheat and avoid rebalancing the tree if we can
> + * reuse this node in situ.
> + */
> + first = rb_first_cached(&sibling->execlists.virtual) ==
> + &node->rb;
> + if (prio == node->prio || (prio > node->prio && first))
> + goto submit_engine;
> +
> + rb_erase_cached(&node->rb, &sibling->execlists.virtual);
How does the cheat work exactly? Does it just avoid re-inserting into the
tree in some cases? And how does the real tasklet find this node then?
> + }
> +
> + rb = NULL;
> + first = true;
> + parent = &sibling->execlists.virtual.rb_root.rb_node;
> + while (*parent) {
> + struct ve_node *other;
> +
> + rb = *parent;
> + other = rb_entry(rb, typeof(*other), rb);
> + if (prio > other->prio) {
> + parent = &rb->rb_left;
> + } else {
> + parent = &rb->rb_right;
> + first = false;
> + }
> + }
> +
> + rb_link_node(&node->rb, rb, parent);
> + rb_insert_color_cached(&node->rb,
> + &sibling->execlists.virtual,
> + first);
> +
> +submit_engine:
> + GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
> + node->prio = prio;
> + if (first && prio > sibling->execlists.queue_priority_hint) {
> + sibling->execlists.queue_priority_hint = prio;
> + tasklet_hi_schedule(&sibling->execlists.tasklet);
> + }
> +
> + spin_unlock(&sibling->timeline.lock);
> + }
> + local_irq_enable();
> +}
> +
> +static void virtual_submit_request(struct i915_request *request)
> +{
> + struct virtual_engine *ve = to_virtual_engine(request->engine);
> +
> + GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
> +
> + GEM_BUG_ON(ve->request);
> + ve->base.execlists.queue_priority_hint = rq_prio(request);
> + WRITE_ONCE(ve->request, request);
> +
> + tasklet_schedule(&ve->base.execlists.tasklet);
> +}
> +
> +struct intel_engine_cs *
> +intel_execlists_create_virtual(struct i915_gem_context *ctx,
> + struct intel_engine_cs **siblings,
> + unsigned int count)
> +{
> + struct virtual_engine *ve;
> + unsigned int n;
> + int err;
> +
> + if (!count)
> + return ERR_PTR(-EINVAL);
> +
> + ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
> + if (!ve)
> + return ERR_PTR(-ENOMEM);
> +
> + ve->base.i915 = ctx->i915;
> + ve->base.id = -1;
> + ve->base.class = OTHER_CLASS;
> + ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
> + ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
> + ve->base.flags = I915_ENGINE_IS_VIRTUAL;
> +
> + snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
> +
> + err = i915_timeline_init(ctx->i915, &ve->base.timeline, NULL);
> + if (err)
> + goto err_put;
> + i915_timeline_set_subclass(&ve->base.timeline, TIMELINE_VIRTUAL);
> +
> + ve->base.cops = &virtual_context_ops;
> + ve->base.request_alloc = execlists_request_alloc;
> +
> + ve->base.schedule = i915_schedule;
> + ve->base.submit_request = virtual_submit_request;
> +
> + ve->base.execlists.queue_priority_hint = INT_MIN;
> + tasklet_init(&ve->base.execlists.tasklet,
> + virtual_submission_tasklet,
> + (unsigned long)ve);
> +
> + intel_context_init(&ve->context, ctx, &ve->base);
> +
> + for (n = 0; n < count; n++) {
> + struct intel_engine_cs *sibling = siblings[n];
> +
> + GEM_BUG_ON(!is_power_of_2(sibling->mask));
> + if (sibling->mask & ve->base.mask)
> + continue;
Continuing from the previous round - should we return -EINVAL if the same
engine appears twice in the map? Since we are going to silently drop the
duplicate, perhaps it is better to disallow it.
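In other words, something like this sketch in place of the silent skip
(just to show what I mean, untested):

	if (sibling->mask & ve->base.mask) {
		err = -EINVAL; /* same engine specified more than once */
		goto err_put;
	}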
> +
> + /*
> + * The virtual engine implementation is tightly coupled to
> + * the execlists backend -- we push the request directly
> + * into a tree inside each physical engine. We could support
> + * layering if we handled cloning of the requests and
> + * submitting a copy into each backend.
> + */
> + if (sibling->execlists.tasklet.func !=
> + execlists_submission_tasklet) {
> + err = -ENODEV;
> + goto err_put;
> + }
> +
> + GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
> + RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
> +
> + ve->siblings[ve->count++] = sibling;
> + ve->base.mask |= sibling->mask;
> +
> + /*
> + * All physical engines must be compatible for their emission
> + * functions (as we build the instructions during request
> + * construction and do not alter them before submission
> + * on the physical engine). We use the engine class as a guide
> + * here, although that could be refined.
> + */
> + if (ve->base.class != OTHER_CLASS) {
> + if (ve->base.class != sibling->class) {
> + err = -EINVAL;
> + goto err_put;
> + }
> + continue;
> + }
> +
> + ve->base.class = sibling->class;
> + snprintf(ve->base.name, sizeof(ve->base.name),
> + "v%dx%d", ve->base.class, count);
> + ve->base.context_size = sibling->context_size;
> +
> + ve->base.emit_bb_start = sibling->emit_bb_start;
> + ve->base.emit_flush = sibling->emit_flush;
> + ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
> + ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
> + ve->base.emit_fini_breadcrumb_dw =
> + sibling->emit_fini_breadcrumb_dw;
> + }
> +
> + /* gracefully replace a degenerate virtual engine */
> + if (ve->count == 1) {
> + struct intel_engine_cs *actual = ve->siblings[0];
> + intel_context_put(&ve->context);
> + return actual;
> + }
> +
> + __intel_context_insert(ctx, &ve->base, &ve->context);
> + return &ve->base;
> +
> +err_put:
> + intel_context_put(&ve->context);
> + return ERR_PTR(err);
> +}
> +
> +struct intel_engine_cs *
> +intel_execlists_clone_virtual(struct i915_gem_context *ctx,
> + struct intel_engine_cs *src)
> +{
> + struct virtual_engine *se = to_virtual_engine(src);
> + struct intel_engine_cs *dst;
> +
> + dst = intel_execlists_create_virtual(ctx,
> + se->siblings,
> + se->count);
> + if (IS_ERR(dst))
> + return dst;
> +
> + return dst;
> +}
> +
> +void intel_virtual_engine_destroy(struct intel_engine_cs *engine)
> +{
> + struct virtual_engine *ve = to_virtual_engine(engine);
> +
> + if (!engine || !intel_engine_is_virtual(engine))
> + return;
> +
> + __intel_context_remove(&ve->context);
> + intel_context_put(&ve->context);
> +}
> +
> void intel_execlists_show_requests(struct intel_engine_cs *engine,
> struct drm_printer *m,
> void (*show_request)(struct drm_printer *m,
> @@ -2954,6 +3480,29 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
> show_request(m, last, "\t\tQ ");
> }
>
> + last = NULL;
> + count = 0;
> + for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
> + struct virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + struct i915_request *rq = READ_ONCE(ve->request);
> +
> + if (rq) {
> + if (count++ < max - 1)
> + show_request(m, rq, "\t\tV ");
> + else
> + last = rq;
> + }
> + }
> + if (last) {
> + if (count > max) {
> + drm_printf(m,
> + "\t\t...skipping %d virtual requests...\n",
> + count - max);
> + }
> + show_request(m, last, "\t\tV ");
> + }
> +
> spin_unlock_irqrestore(&engine->timeline.lock, flags);
> }
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index f1aec8a6986f..9d90dc68e02b 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -112,6 +112,17 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
> const char *prefix),
> unsigned int max);
>
> +struct intel_engine_cs *
> +intel_execlists_create_virtual(struct i915_gem_context *ctx,
> + struct intel_engine_cs **siblings,
> + unsigned int count);
> +
> +struct intel_engine_cs *
> +intel_execlists_clone_virtual(struct i915_gem_context *ctx,
> + struct intel_engine_cs *src);
> +
> +void intel_virtual_engine_destroy(struct intel_engine_cs *engine);
> +
> u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu);
>
> #endif /* _INTEL_LRC_H_ */
> diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> index 9e871eb0bfb1..6df033960350 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> @@ -10,6 +10,7 @@
>
> #include "../i915_selftest.h"
> #include "igt_flush_test.h"
> +#include "igt_live_test.h"
> #include "igt_spinner.h"
> #include "i915_random.h"
>
> @@ -1057,6 +1058,169 @@ static int live_preempt_smoke(void *arg)
> return err;
> }
>
> +static int nop_virtual_engine(struct drm_i915_private *i915,
> + struct intel_engine_cs **siblings,
> + unsigned int nsibling,
> + unsigned int nctx,
> + unsigned int flags)
> +#define CHAIN BIT(0)
> +{
> + IGT_TIMEOUT(end_time);
> + struct i915_request *request[16];
> + struct i915_gem_context *ctx[16];
> + struct intel_engine_cs *ve[16];
> + unsigned long n, prime, nc;
> + struct igt_live_test t;
> + ktime_t times[2] = {};
> + int err;
> +
> + GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ctx));
> +
> + for (n = 0; n < nctx; n++) {
> + ctx[n] = kernel_context(i915);
> + if (!ctx[n])
> + return -ENOMEM;
> +
> + ve[n] = intel_execlists_create_virtual(ctx[n],
> + siblings, nsibling);
> + if (IS_ERR(ve[n]))
> + return PTR_ERR(ve[n]);
> + }
> +
> + err = igt_live_test_begin(&t, i915, __func__, ve[0]->name);
> + if (err)
> + goto out;
> +
> + for_each_prime_number_from(prime, 1, 8192) {
> + times[1] = ktime_get_raw();
> +
> + if (flags & CHAIN) {
> + for (nc = 0; nc < nctx; nc++) {
> + for (n = 0; n < prime; n++) {
> + request[nc] =
> + i915_request_alloc(ve[nc], ctx[nc]);
> + if (IS_ERR(request[nc])) {
> + err = PTR_ERR(request[nc]);
> + goto out;
> + }
> +
> + i915_request_add(request[nc]);
> + }
> + }
> + } else {
> + for (n = 0; n < prime; n++) {
> + for (nc = 0; nc < nctx; nc++) {
> + request[nc] =
> + i915_request_alloc(ve[nc], ctx[nc]);
> + if (IS_ERR(request[nc])) {
> + err = PTR_ERR(request[nc]);
> + goto out;
> + }
> +
> + i915_request_add(request[nc]);
> + }
> + }
> + }
> +
> + for (nc = 0; nc < nctx; nc++) {
> + if (i915_request_wait(request[nc],
> + I915_WAIT_LOCKED,
> + HZ / 10) < 0) {
> + pr_err("%s(%s): wait for %llx:%lld timed out\n",
> + __func__, ve[0]->name,
> + request[nc]->fence.context,
> + request[nc]->fence.seqno);
> +
> + GEM_TRACE("%s(%s) failed at request %llx:%lld\n",
> + __func__, ve[0]->name,
> + request[nc]->fence.context,
> + request[nc]->fence.seqno);
> + GEM_TRACE_DUMP();
> + i915_gem_set_wedged(i915);
> + break;
> + }
> + }
> +
> + times[1] = ktime_sub(ktime_get_raw(), times[1]);
> + if (prime == 1)
> + times[0] = times[1];
> +
> + if (__igt_timeout(end_time, NULL))
> + break;
> + }
> +
> + err = igt_live_test_end(&t);
> + if (err)
> + goto out;
> +
> + pr_info("Requestx%d latencies on %s: 1 = %lluns, %lu = %lluns\n",
> + nctx, ve[0]->name, ktime_to_ns(times[0]),
> + prime, div64_u64(ktime_to_ns(times[1]), prime));
> +
> +out:
> + if (igt_flush_test(i915, I915_WAIT_LOCKED))
> + err = -EIO;
> +
> + for (nc = 0; nc < nctx; nc++) {
> + intel_virtual_engine_destroy(ve[nc]);
> + kernel_context_close(ctx[nc]);
> + }
> + return err;
> +}
> +
> +static int live_virtual_engine(void *arg)
> +{
> + struct drm_i915_private *i915 = arg;
> + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
> + struct intel_engine_cs *engine;
> + enum intel_engine_id id;
> + unsigned int class, inst;
> + int err = -ENODEV;
> +
> + if (USES_GUC_SUBMISSION(i915))
> + return 0;
> +
> + mutex_lock(&i915->drm.struct_mutex);
> +
> + for_each_engine(engine, i915, id) {
> + err = nop_virtual_engine(i915, &engine, 1, 1, 0);
> + if (err) {
> + pr_err("Failed to wrap engine %s: err=%d\n",
> + engine->name, err);
> + goto out_unlock;
> + }
> + }
> +
> + for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
> + int nsibling, n;
> +
> + nsibling = 0;
> + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
> + if (!i915->engine_class[class][inst])
> + break;
> +
> + siblings[nsibling++] = i915->engine_class[class][inst];
> + }
> + if (nsibling < 2)
> + continue;
> +
> + for (n = 1; n <= nsibling + 1; n++) {
> + err = nop_virtual_engine(i915, siblings, nsibling,
> + n, 0);
> + if (err)
> + goto out_unlock;
> + }
> +
> + err = nop_virtual_engine(i915, siblings, nsibling, n, CHAIN);
> + if (err)
> + goto out_unlock;
> + }
> +
> +out_unlock:
> + mutex_unlock(&i915->drm.struct_mutex);
> + return err;
> +}
> +
> int intel_execlists_live_selftests(struct drm_i915_private *i915)
> {
> static const struct i915_subtest tests[] = {
> @@ -1068,6 +1232,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
> SUBTEST(live_chain_preempt),
> SUBTEST(live_preempt_hang),
> SUBTEST(live_preempt_smoke),
> + SUBTEST(live_virtual_engine),
> };
>
> if (!HAS_EXECLISTS(i915))
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 8ef6d60929c6..9c94c037d13b 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -127,6 +127,7 @@ enum drm_i915_gem_engine_class {
> };
>
> #define I915_ENGINE_CLASS_INVALID_NONE -1
> +#define I915_ENGINE_CLASS_INVALID_VIRTUAL 0
>
> /**
> * DOC: perf_events exposed by i915 through /sys/bus/event_sources/drivers/i915
> @@ -1598,8 +1599,37 @@ struct drm_i915_gem_context_param_sseu {
> __u32 rsvd;
> };
>
> +/*
> + * i915_context_engines_load_balance:
> + *
> + * Enable load balancing across this set of engines.
> + *
> + * Into the I915_EXEC_DEFAULT slot [0], a virtual engine is created that when
> + * used will proxy the execbuffer request onto one of the set of engines
> + * in such a way as to distribute the load evenly across the set.
> + *
> + * The set of engines must be compatible (e.g. the same HW class) as they
> + * will share the same logical GPU context and ring.
> + *
> + * To intermix rendering with the virtual engine and direct rendering onto
> + * the backing engines (bypassing the load balancing proxy), the context must
> + * be defined to use a single timeline for all engines.
> + */
> +struct i915_context_engines_load_balance {
> + struct i915_user_extension base;
> +
> + __u16 engine_index;
> + __u16 mbz16; /* reserved for future use; must be zero */
> + __u32 flags; /* all undefined flags must be zero */
> +
> + __u64 engines_mask; /* selection mask of engines[] */
> +
> + __u64 mbz64[4]; /* reserved for future use; must be zero */
> +};
> +
> struct i915_context_param_engines {
> __u64 extensions; /* linked chain of extension blocks, 0 terminates */
> +#define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0
>
> struct {
> __u16 engine_class; /* see enum drm_i915_gem_engine_class */
>
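As an aside, my reading of the proposed uapi is that userspace would set
up the extension roughly like below (hypothetical sketch using only the
field names from this revision, untested):

	struct i915_context_engines_load_balance balance = {
		.base = {
			.name = I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE,
			.next_extension = 0, /* end of the extension chain */
		},
		.engine_index = 0, /* map slot to become the virtual engine */
		/* balance across engines[1] and engines[2] of the map */
		.engines_mask = (1ull << 1) | (1ull << 2),
	};

and then link balance.base into the extensions pointer of struct
i915_context_param_engines before the context setparam call.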
Regards,
Tvrtko