[Intel-gfx] [PATCH 15/18] drm/i915: Load balancing across a virtual engine
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Thu Mar 21 15:13:59 UTC 2019
On 21/03/2019 15:00, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-03-20 15:59:14)
>>
>> On 19/03/2019 11:57, Chris Wilson wrote:
>>> static void execlists_dequeue(struct intel_engine_cs *engine)
>>> {
>>> struct intel_engine_execlists * const execlists = &engine->execlists;
>>> @@ -691,6 +805,37 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>>> * and context switches) submission.
>>> */
>>>
>>> + for (rb = rb_first_cached(&execlists->virtual); rb; ) {
>>> + struct virtual_engine *ve =
>>> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
>>> + struct i915_request *rq = READ_ONCE(ve->request);
>>> + struct intel_engine_cs *active;
>>> +
>>> + if (!rq) { /* lazily cleanup after another engine handled rq */
>>> + rb_erase_cached(rb, &execlists->virtual);
>>> + RB_CLEAR_NODE(rb);
>>> + rb = rb_first_cached(&execlists->virtual);
>>> + continue;
>>> + }
>>> +
>>> + /*
>>> + * We track when the HW has completed saving the context image
>>> + * (i.e. when we have seen the final CS event switching out of
>>> + * the context) and must not overwrite the context image before
>>> + * then. This restricts us to only using the active engine
>>> + * while the previous virtualized request is inflight (so
>>> + * we reuse the register offsets). This is a very small
>>> + * hysteresis on the greedy selection algorithm.
>>> + */
>>> + active = READ_ONCE(ve->context.active);
>>> + if (active && active != engine) {
>>> + rb = rb_next(rb);
>>> + continue;
>>> + }
>>> +
>>> + break;
>>> + }
>>> +
>>> if (last) {
>>> /*
>>> * Don't resubmit or switch until all outstanding
>>> @@ -712,7 +857,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>>> if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
>>> return;
>>>
>>> - if (need_preempt(engine, last)) {
>>> + if (need_preempt(engine, last, rb)) {
>>> inject_preempt_context(engine);
>>> return;
>>> }
>>> @@ -752,6 +897,72 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>>> last->tail = last->wa_tail;
>>> }
>>>
>>> + while (rb) { /* XXX virtual is always taking precedence */
>>> + struct virtual_engine *ve =
>>> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
>>> + struct i915_request *rq;
>>> +
>>> + spin_lock(&ve->base.timeline.lock);
>>> +
>>> + rq = ve->request;
>>> + if (unlikely(!rq)) { /* lost the race to a sibling */
>>> + spin_unlock(&ve->base.timeline.lock);
>>> + rb_erase_cached(rb, &execlists->virtual);
>>> + RB_CLEAR_NODE(rb);
>>> + rb = rb_first_cached(&execlists->virtual);
>>> + continue;
>>> + }
>>> +
>>> + if (rq_prio(rq) >= queue_prio(execlists)) {
>>> + if (last && !can_merge_rq(last, rq)) {
>>> + spin_unlock(&ve->base.timeline.lock);
>>> + return; /* leave this rq for another engine */
>>
>> Should this engine attempt to dequeue something else? Maybe something
>> from the non-virtual queue for instance could be submitted/appended.
>
> We can't pick another virtual request for this engine as we run the risk
> of priority inversion (and of actually scheduling something that depends on
> the request we skip over; although that is excluded at the moment by
> virtue of only allowing completion fences between virtual engines, it is
> something we may be able to eliminate. Hmm.). If we give up on
> inserting a virtual request at all and skip straight to the regular dequeue,
> we end up with a bubble, and in the worst case we would never allow a virtual
> request onto this engine (as we keep it busy and 'last' is always active).
>
> Can you say "What do we want? A timeslicing scheduler!".
Makes sense.
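
For the record, my reading of the branch in question, restated as an
annotated sketch (comments mine, identifiers as in the patch):

    if (rq_prio(rq) >= queue_prio(execlists)) {
            if (last && !can_merge_rq(last, rq)) {
                    /*
                     * The virtual request outranks everything queued
                     * locally, but cannot be coalesced with 'last'
                     * (different logical context). Substituting lower
                     * priority work here would risk priority inversion,
                     * so stop dequeuing entirely and leave the request
                     * for a sibling engine (or the next dequeue pass).
                     */
                    spin_unlock(&ve->base.timeline.lock);
                    return;
            }
            /* otherwise the request is taken onto this engine below */
    }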
>>> + }
>>> +
>>> + GEM_BUG_ON(rq->engine != &ve->base);
>>> + ve->request = NULL;
>>> + ve->base.execlists.queue_priority_hint = INT_MIN;
>>> + rb_erase_cached(rb, &execlists->virtual);
>>> + RB_CLEAR_NODE(rb);
>>> +
>>> + GEM_BUG_ON(rq->hw_context != &ve->context);
>>> + rq->engine = engine;
>>> +
>>> + if (engine != ve->siblings[0]) {
>>> + u32 *regs = ve->context.lrc_reg_state;
>>> + unsigned int n;
>>> +
>>> + GEM_BUG_ON(READ_ONCE(ve->context.active));
>>> + virtual_update_register_offsets(regs, engine);
>>> +
>>> + /*
>>> + * Move the bound engine to the top of the list
>>> + * for future execution. We then kick this
>>> + * tasklet first before checking others, so that
>>> + * we preferentially reuse this set of bound
>>> + * registers.
>>> + */
>>> + for (n = 1; n < ve->count; n++) {
>>> + if (ve->siblings[n] == engine) {
>>> + swap(ve->siblings[n],
>>> + ve->siblings[0]);
>>> + break;
>>> + }
>>> + }
>>> +
>>> + GEM_BUG_ON(ve->siblings[0] != engine);
>>> + }
>>> +
>>> + __i915_request_submit(rq);
>>> + trace_i915_request_in(rq, port_index(port, execlists));
>>> + submit = true;
>>> + last = rq;
>>> + }
>>> +
>>> + spin_unlock(&ve->base.timeline.lock);
>>> + break;
>>> + }
>>> +
>>> while ((rb = rb_first_cached(&execlists->queue))) {
>>> struct i915_priolist *p = to_priolist(rb);
>>> struct i915_request *rq, *rn;
>>> @@ -971,6 +1182,24 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>>> i915_priolist_free(p);
>>> }
>>>
>>> + /* Cancel all attached virtual engines */
>>> + while ((rb = rb_first_cached(&execlists->virtual))) {
>>> + struct virtual_engine *ve =
>>> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
>>> +
>>> + rb_erase_cached(rb, &execlists->virtual);
>>> + RB_CLEAR_NODE(rb);
>>> +
>>> + spin_lock(&ve->base.timeline.lock);
>>> + if (ve->request) {
>>> + __i915_request_submit(ve->request);
>>> + dma_fence_set_error(&ve->request->fence, -EIO);
>>> + i915_request_mark_complete(ve->request);
>>> + ve->request = NULL;
>>> + }
>>> + spin_unlock(&ve->base.timeline.lock);
>>> + }
>>> +
>>> /* Remaining _unready_ requests will be nop'ed when submitted */
>>>
>>> execlists->queue_priority_hint = INT_MIN;
>>> @@ -2897,6 +3126,303 @@ void intel_lr_context_resume(struct drm_i915_private *i915)
>>> }
>>> }
>>>
>>> +static void virtual_context_destroy(struct kref *kref)
>>> +{
>>> + struct virtual_engine *ve =
>>> + container_of(kref, typeof(*ve), context.ref);
>>> + unsigned int n;
>>> +
>>> + GEM_BUG_ON(ve->request);
>>> + GEM_BUG_ON(ve->context.active);
>>> +
>>> + for (n = 0; n < ve->count; n++) {
>>> + struct intel_engine_cs *sibling = ve->siblings[n];
>>> + struct rb_node *node = &ve->nodes[sibling->id].rb;
>>> +
>>> + if (RB_EMPTY_NODE(node))
>>> + continue;
>>> +
>>> + spin_lock_irq(&sibling->timeline.lock);
>>> +
>>> + if (!RB_EMPTY_NODE(node))
>>> + rb_erase_cached(node, &sibling->execlists.virtual);
>>> +
>>> + spin_unlock_irq(&sibling->timeline.lock);
>>> + }
>>> + GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
>>> +
>>> + if (ve->context.state)
>>> + __execlists_context_fini(&ve->context);
>>> +
>>> + i915_timeline_fini(&ve->base.timeline);
>>> + kfree(ve);
>>> +}
>>> +
>>> +static void virtual_engine_initial_hint(struct virtual_engine *ve)
>>> +{
>>> + int swp;
>>> +
>>> + /*
>>> + * Pick a random sibling on starting to help spread the load around.
>>> + *
>>> + * New contexts are typically created with exactly the same order
>>> + * of siblings, and often started in batches. Due to the way we iterate
>>> + * the array of siblings when submitting requests, sibling[0] is
>>> + * prioritised for dequeuing. If we make sure that sibling[0] is fairly
>>> + * randomised across the system, we also help spread the load by the
>>> + * first engine we inspect being different each time.
>>> + *
>>> + * NB This does not force us to execute on this engine; it will just
>>> + * typically be the first we inspect for submission.
>>> + */
>>> + swp = prandom_u32_max(ve->count);
>>> + if (!swp)
>>> + return;
>>> +
>>> + swap(ve->siblings[swp], ve->siblings[0]);
>>> + virtual_update_register_offsets(ve->context.lrc_reg_state,
>>> + ve->siblings[0]);
>>> +}
>>> +
>>> +static int virtual_context_pin(struct intel_context *ce)
>>> +{
>>> + struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
>>> + int err;
>>> +
>>> + /* Note: we must use a real engine class for setting up reg state */
>>> + err = __execlists_context_pin(ce, ve->siblings[0]);
>>> + if (err)
>>> + return err;
>>> +
>>> + virtual_engine_initial_hint(ve);
>>> + return 0;
>>> +}
>>> +
>>> +static const struct intel_context_ops virtual_context_ops = {
>>> + .pin = virtual_context_pin,
>>> + .unpin = execlists_context_unpin,
>>> +
>>> + .destroy = virtual_context_destroy,
>>> +};
>>> +
>>> +static void virtual_submission_tasklet(unsigned long data)
>>> +{
>>> + struct virtual_engine * const ve = (struct virtual_engine *)data;
>>> + unsigned int n;
>>> + int prio;
>>> +
>>> + prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
>>> + if (prio == INT_MIN)
>>> + return;
>>
>> When does it hit this return?
>
> If the tasklet runs when I don't expect it to, which should indeed be never.
GEM_BUG_ON then, or a GEM_WARN_ON to start with?
> At least with bonding it becomes something a bit more tangible.
Hmm how so?
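
Concretely, the guard I had in mind would be along these lines (GEM_WARN_ON
here is only a sketch of the idea, not something the patch does):

    prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
    if (GEM_WARN_ON(prio == INT_MIN))
            return; /* tasklet ran without a pending virtual request */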
>>> + local_irq_disable();
>>> + for (n = 0; READ_ONCE(ve->request) && n < ve->count; n++) {
>>> + struct intel_engine_cs *sibling = ve->siblings[n];
>>> + struct ve_node * const node = &ve->nodes[sibling->id];
>>> + struct rb_node **parent, *rb;
>>> + bool first;
>>> +
>>> + spin_lock(&sibling->timeline.lock);
>>> +
>>> + if (!RB_EMPTY_NODE(&node->rb)) {
>>> + /*
>>> + * Cheat and avoid rebalancing the tree if we can
>>> + * reuse this node in situ.
>>> + */
>>> + first = rb_first_cached(&sibling->execlists.virtual) ==
>>> + &node->rb;
>>> + if (prio == node->prio || (prio > node->prio && first))
>>> + goto submit_engine;
>>> +
>>> + rb_erase_cached(&node->rb, &sibling->execlists.virtual);
>>
>> How does the cheat work, exactly? Does it avoid inserting into the tree in
>> some cases? And how does the real tasklet find this node then?
>
> It's already in the sibling->execlists.virtual, so we don't need to
> remove the node and reinsert it. So when we kick the sibling's tasklet
> it is right there.
So the cheat bit is just the prio and first check, and the erase on a
non-empty node is normal operation?
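
i.e. my reading of the hunk, with the "cheat" spelled out (annotations mine):

    if (!RB_EMPTY_NODE(&node->rb)) {
            /* Node is already linked into sibling->execlists.virtual. */
            first = rb_first_cached(&sibling->execlists.virtual) ==
                    &node->rb;

            /*
             * If the priority is unchanged, or it has only risen while
             * the node is already leftmost, leave the tree untouched and
             * just update the prio / kick the sibling (reuse "in situ").
             */
            if (prio == node->prio || (prio > node->prio && first))
                    goto submit_engine;

            /* Otherwise erase here and reinsert at the new priority. */
            rb_erase_cached(&node->rb, &sibling->execlists.virtual);
    }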
>>> + rb = NULL;
>>> + first = true;
>>> + parent = &sibling->execlists.virtual.rb_root.rb_node;
>>> + while (*parent) {
>>> + struct ve_node *other;
>>> +
>>> + rb = *parent;
>>> + other = rb_entry(rb, typeof(*other), rb);
>>> + if (prio > other->prio) {
>>> + parent = &rb->rb_left;
>>> + } else {
>>> + parent = &rb->rb_right;
>>> + first = false;
>>> + }
>>> + }
>>> +
>>> + rb_link_node(&node->rb, rb, parent);
>>> + rb_insert_color_cached(&node->rb,
>>> + &sibling->execlists.virtual,
>>> + first);
>>> +
>>> +submit_engine:
>>> + GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
>>> + node->prio = prio;
>>> + if (first && prio > sibling->execlists.queue_priority_hint) {
>>> + sibling->execlists.queue_priority_hint = prio;
>>> + tasklet_hi_schedule(&sibling->execlists.tasklet);
>>> + }
>>> +
>>> + spin_unlock(&sibling->timeline.lock);
>>> + }
>>> + local_irq_enable();
>>> +}
>>> +
>>> +static void virtual_submit_request(struct i915_request *request)
>>> +{
>>> + struct virtual_engine *ve = to_virtual_engine(request->engine);
>>> +
>>> + GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
>>> +
>>> + GEM_BUG_ON(ve->request);
>>> + ve->base.execlists.queue_priority_hint = rq_prio(request);
>>> + WRITE_ONCE(ve->request, request);
>>> +
>>> + tasklet_schedule(&ve->base.execlists.tasklet);
>>> +}
>>> +
>>> +struct intel_engine_cs *
>>> +intel_execlists_create_virtual(struct i915_gem_context *ctx,
>>> + struct intel_engine_cs **siblings,
>>> + unsigned int count)
>>> +{
>>> + struct virtual_engine *ve;
>>> + unsigned int n;
>>> + int err;
>>> +
>>> + if (!count)
>>> + return ERR_PTR(-EINVAL);
>>> +
>>> + ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
>>> + if (!ve)
>>> + return ERR_PTR(-ENOMEM);
>>> +
>>> + ve->base.i915 = ctx->i915;
>>> + ve->base.id = -1;
>>> + ve->base.class = OTHER_CLASS;
>>> + ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
>>> + ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
>>> + ve->base.flags = I915_ENGINE_IS_VIRTUAL;
>>> +
>>> + snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
>>> +
>>> + err = i915_timeline_init(ctx->i915, &ve->base.timeline, NULL);
>>> + if (err)
>>> + goto err_put;
>>> + i915_timeline_set_subclass(&ve->base.timeline, TIMELINE_VIRTUAL);
>>> +
>>> + ve->base.cops = &virtual_context_ops;
>>> + ve->base.request_alloc = execlists_request_alloc;
>>> +
>>> + ve->base.schedule = i915_schedule;
>>> + ve->base.submit_request = virtual_submit_request;
>>> +
>>> + ve->base.execlists.queue_priority_hint = INT_MIN;
>>> + tasklet_init(&ve->base.execlists.tasklet,
>>> + virtual_submission_tasklet,
>>> + (unsigned long)ve);
>>> +
>>> + intel_context_init(&ve->context, ctx, &ve->base);
>>> +
>>> + for (n = 0; n < count; n++) {
>>> + struct intel_engine_cs *sibling = siblings[n];
>>> +
>>> + GEM_BUG_ON(!is_power_of_2(sibling->mask));
>>> + if (sibling->mask & ve->base.mask)
>>> + continue;
>>
>> Continuing from the previous round - should we -EINVAL if two of the
>> same are found in the map? Since we are going to silently drop it,
>> perhaps it is better to disallow.
>
> Could do. I have no strong use case that expects to be able to handle
> the user passing in (vcs1, vcs2, vcs2).
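
If we did want to reject it, something like this in the sibling loop would be
enough (just a sketch, reusing the err_put unwind that is already in the patch):

    GEM_BUG_ON(!is_power_of_2(sibling->mask));
    if (sibling->mask & ve->base.mask) {
            /* Duplicate sibling: reject rather than silently drop it. */
            err = -EINVAL;
            goto err_put;
    }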
>
> The really important question is: do we always create a virtual engine,
> even when wrapping a single physical engine?
>
> The more I ask myself, the answer is yes. (Primarily so that we can
> create multiple instances of the same engine with different logical
> contexts and sseus. Oh flip, i915_perf needs updating to find virtual
> contexts.). It's just that submitting a stream of nops to a virtual engine
> is 3x as expensive as to a real engine.
Hmmm, I think we do need to. Or mandate a single timeline before a veng can
be created. Otherwise I think we break the contract of multi-timeline
contexts each having their own GPU context.
> It's just that you mentioned that userspace ended up wrapping everything
> inside a virtual engine willy-nilly; that spells trouble.
We can always say -EINVAL to that since it hardly makes sense. If they
want contexts they can create real ones with ppgtt sharing.
Regards,
Tvrtko