[Intel-gfx] [PATCH 16/39] drm/i915/execlists: Virtual engine bonding
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Thu Mar 14 17:26:19 UTC 2019
On 13/03/2019 14:43, Chris Wilson wrote:
> Some users require that when a master batch is executed on one particular
> engine, a companion batch is run simultaneously on a specific slave
> engine. For this purpose, we introduce virtual engine bonding, allowing
> maps of master:slaves to be constructed to constrain which physical
> engines a virtual engine may select given a fence on a master engine.
>
> For the moment, we continue to ignore the issue of preemption deferring
> the master request for later. Ideally, we would like to then also remove
> the slave and run something else rather than have it stall the pipeline.
> With load balancing, we should be able to move the workload around it, but
> there is a similar stall on the master pipeline while it may wait for
> the slave to be executed. At the cost of more latency for the bonded
> request, it may be interesting to launch both on their engines in
> lockstep. (Bubbles abound.)
>
> Opens: Also what about bonding an engine as its own master? It doesn't
> break anything internally, so allow the silliness.
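
For anyone reading along, my understanding of the intended userspace flow is
two execbufs tied together with a submit fence, roughly (an untested sketch;
the engine indices, fd plumbing and names are made up):

	struct drm_i915_gem_execbuffer2 master_eb = {
		.flags = MASTER_ENGINE_IDX | I915_EXEC_FENCE_OUT,
		/* buffers etc. elided */
	};
	struct drm_i915_gem_execbuffer2 bonded_eb = {
		.flags = VIRTUAL_ENGINE_IDX | I915_EXEC_FENCE_SUBMIT,
		.rsvd2 = master_out_fence_fd, /* fires on submission, not completion */
	};

The submit fence then funnels into bond_execute below to restrict which
sibling the bonded batch may run on.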
>
> v2: Emancipate the bonds
> v3: Couple in delayed scheduling for the selftests
> v4: Handle invalid mutually exclusive bonding
> v5: Mention what the uapi does
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gem_context.c | 50 +++++
> drivers/gpu/drm/i915/i915_request.c | 1 +
> drivers/gpu/drm/i915/i915_request.h | 1 +
> drivers/gpu/drm/i915/intel_engine_types.h | 7 +
> drivers/gpu/drm/i915/intel_lrc.c | 143 ++++++++++++++
> drivers/gpu/drm/i915/intel_lrc.h | 4 +
> drivers/gpu/drm/i915/selftests/intel_lrc.c | 185 ++++++++++++++++++
> drivers/gpu/drm/i915/selftests/lib_sw_fence.c | 3 +
> include/uapi/drm/i915_drm.h | 33 ++++
> 9 files changed, 427 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 98763d3f1b12..0ec78c386473 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -1513,8 +1513,58 @@ set_engines__load_balance(struct i915_user_extension __user *base, void *data)
> return 0;
> }
>
> +static int
> +set_engines__bond(struct i915_user_extension __user *base, void *data)
> +{
> + struct i915_context_engines_bond __user *ext =
> + container_of_user(base, typeof(*ext), base);
> + const struct set_engines *set = data;
> + struct intel_engine_cs *master;
> + u32 class, instance, siblings;
These could be u16 class, instance to match the uapi fields, but for no real gain.
> + u16 idx;
> + int err;
> +
> + if (get_user(idx, &ext->virtual_index))
> + return -EFAULT;
> +
> + if (idx >= set->nengine)
> + return -EINVAL;
> +
> + idx = array_index_nospec(idx, set->nengine);
> + if (!set->engines[idx])
> + return -EINVAL;
> +
> + /*
> + * A non-virtual engine has 0 siblings to choose between, and the submit
> + * fence will always be directed to the one engine.
> + */
> + if (!intel_engine_is_virtual(set->engines[idx]))
> + return 0;
> +
> + err = check_user_mbz(&ext->mbz);
> + if (err)
> + return err;
> +
> + if (get_user(class, &ext->master_class))
> + return -EFAULT;
> +
> + if (get_user(instance, &ext->master_instance))
> + return -EFAULT;
> +
> + master = intel_engine_lookup_user(set->ctx->i915, class, instance);
> + if (!master)
> + return -EINVAL;
> +
> + if (get_user(siblings, &ext->sibling_mask))
> + return -EFAULT;
> +
> + return intel_virtual_engine_attach_bond(set->engines[idx],
> + master, siblings);
> +}
> +
> static const i915_user_extension_fn set_engines__extensions[] = {
> [I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance,
> + [I915_CONTEXT_ENGINES_EXT_BOND] = set_engines__bond,
> };
>
> static int
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 0a46f8113f5c..9ce710baa452 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -743,6 +743,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
> rq->batch = NULL;
> rq->capture_list = NULL;
> rq->waitboost = false;
> + rq->execution_mask = ~0u;
>
> /*
> * Reserve space in the ring buffer for all the commands required to
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index d4f6b2940130..862b25930de0 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -145,6 +145,7 @@ struct i915_request {
> */
> struct i915_sched_node sched;
> struct i915_dependency dep;
> + unsigned int execution_mask;
>
> /*
> * A convenience pointer to the current breadcrumb value stored in
> diff --git a/drivers/gpu/drm/i915/intel_engine_types.h b/drivers/gpu/drm/i915/intel_engine_types.h
> index 322fbda65190..1da35509d811 100644
> --- a/drivers/gpu/drm/i915/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/intel_engine_types.h
> @@ -384,6 +384,13 @@ struct intel_engine_cs {
> */
> void (*submit_request)(struct i915_request *rq);
>
> + /*
> + * Called on signaling of a SUBMIT_FENCE, passing along the signaling
> + * request down to the bonded pairs.
> + */
> + void (*bond_execute)(struct i915_request *rq,
> + struct dma_fence *signal);
> +
> /*
> * Call when the priority on a request has changed and it and its
> * dependencies may need rescheduling. Note the request itself may
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index d54b7cc43633..79ab4bc543fd 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -191,6 +191,18 @@ struct virtual_engine {
> int prio;
> } nodes[I915_NUM_ENGINES];
>
> + /*
> + * Keep track of bonded pairs -- restrictions upon our selection
> + * of physical engines any particular request may be submitted to.
> + * If we receive a submit-fence from a master engine, we will only
> + * use one of sibling_mask physical engines.
> + */
> + struct ve_bond {
> + struct intel_engine_cs *master;
> + unsigned int sibling_mask;
> + } *bonds;
> + unsigned int nbond;
I'd be tempted to pluralize this as well.
> +
> /* And finally, which physical engines this virtual engine maps onto. */
> unsigned int count;
> struct intel_engine_cs *siblings[0];
> @@ -3156,6 +3168,8 @@ static void virtual_context_destroy(struct kref *kref)
> if (ve->context.state)
> __execlists_context_fini(&ve->context);
>
> + kfree(ve->bonds);
> +
> i915_timeline_fini(&ve->base.timeline);
> kfree(ve);
> }
> @@ -3207,9 +3221,28 @@ static const struct intel_context_ops virtual_context_ops = {
> .destroy = virtual_context_destroy,
> };
>
> +static void virtual_submit_error(struct virtual_engine *ve)
> +{
> + struct i915_request *rq = fetch_and_zero(&ve->request);
> + struct intel_engine_cs *engine = ve->siblings[0]; /* any victim */
> +
> + dma_fence_set_error(&rq->fence, -ENODEV);
> + i915_request_mark_complete(rq);
> +
> + spin_lock(&engine->timeline.lock);
> + rq->engine = engine;
> +
> + __i915_request_submit(rq);
> + /* move to start of list to avoid unwind complications */
> + list_move(&rq->link, &engine->timeline.requests);
> +
> + spin_unlock(&engine->timeline.lock);
> +}
> +
> static void virtual_submission_tasklet(unsigned long data)
> {
> struct virtual_engine * const ve = (struct virtual_engine *)data;
> + unsigned int mask;
> unsigned int n;
> int prio;
>
> @@ -3218,12 +3251,35 @@ static void virtual_submission_tasklet(unsigned long data)
> return;
>
> local_irq_disable();
> +
> + mask = 0;
> + spin_lock(&ve->base.timeline.lock);
> + if (ve->request) {
> + mask = ve->request->execution_mask;
> + if (unlikely(!mask))
> + virtual_submit_error(ve);
What clears the mask? And when it has been cleared, virtual_submit_error then fails the request?
> + }
> + spin_unlock(&ve->base.timeline.lock);
> + if (!mask)
> + goto out_irq;
> +
> for (n = 0; READ_ONCE(ve->request) && n < ve->count; n++) {
> struct intel_engine_cs *sibling = ve->siblings[n];
> struct ve_node * const node = &ve->nodes[sibling->id];
> struct rb_node **parent, *rb;
> bool first;
>
> + if (unlikely(!(mask & sibling->mask))) {
> + if (!RB_EMPTY_NODE(&node->rb)) {
> + spin_lock(&sibling->timeline.lock);
> + rb_erase_cached(&node->rb,
> + &sibling->execlists.virtual);
> + RB_CLEAR_NODE(&node->rb);
> + spin_unlock(&sibling->timeline.lock);
> + }
> + continue;
> + }
> +
> spin_lock(&sibling->timeline.lock);
>
> if (!RB_EMPTY_NODE(&node->rb)) {
> @@ -3270,6 +3326,7 @@ static void virtual_submission_tasklet(unsigned long data)
>
> spin_unlock(&sibling->timeline.lock);
> }
> +out_irq:
> local_irq_enable();
> }
>
> @@ -3286,6 +3343,30 @@ static void virtual_submit_request(struct i915_request *request)
> tasklet_schedule(&ve->base.execlists.tasklet);
> }
>
> +static struct ve_bond *
> +virtual_find_bond(struct virtual_engine *ve, struct intel_engine_cs *master)
> +{
> + int i;
> +
> + for (i = 0; i < ve->nbond; i++) {
> + if (ve->bonds[i].master == master)
> + return &ve->bonds[i];
> + }
> +
> + return NULL;
> +}
> +
> +static void
> +virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
> +{
> + struct virtual_engine *ve = to_virtual_engine(rq->engine);
> + struct ve_bond *bond;
> +
> + bond = virtual_find_bond(ve, to_request(signal)->engine);
> + if (bond) /* XXX serialise with rq->lock? */
> + rq->execution_mask &= bond->sibling_mask;
> +}
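
To spell out the flow for anyone skimming: this hook is handed to the submit
fence when the bonded request is constructed, as the selftest below does:

	err = i915_request_await_execution(bonded_rq,
					   &master_rq->fence,
					   engine->bond_execute);

(variable names mine), so by the time the virtual tasklet runs,
execution_mask has already been narrowed to the bonded siblings for that
master.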
> +
> struct intel_engine_cs *
> intel_execlists_create_virtual(struct i915_gem_context *ctx,
> struct intel_engine_cs **siblings,
> @@ -3324,6 +3405,7 @@ intel_execlists_create_virtual(struct i915_gem_context *ctx,
>
> ve->base.schedule = i915_schedule;
> ve->base.submit_request = virtual_submit_request;
> + ve->base.bond_execute = virtual_bond_execute;
>
> ve->base.execlists.queue_priority_hint = INT_MIN;
> tasklet_init(&ve->base.execlists.tasklet,
> @@ -3414,9 +3496,70 @@ intel_execlists_clone_virtual(struct i915_gem_context *ctx,
> if (IS_ERR(dst))
> return dst;
>
> + if (se->nbond) {
> + struct virtual_engine *de = to_virtual_engine(dst);
> +
> + de->bonds = kmemdup(se->bonds,
> + sizeof(*se->bonds) * se->nbond,
> + GFP_KERNEL);
> + if (!de->bonds) {
> + intel_virtual_engine_destroy(dst);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> + de->nbond = se->nbond;
> + }
> +
> return dst;
> }
>
> +static unsigned long
> +virtual_execution_mask(struct virtual_engine *ve, unsigned long mask)
> +{
> + unsigned long emask = 0;
> + int bit;
> +
> + for_each_set_bit(bit, &mask, ve->count)
> + emask |= ve->siblings[bit]->mask;
> +
> + return emask;
> +}
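
Worth spelling out that this translates from the uapi's sibling-index space
into the global engine->mask space. A worked example with hypothetical
engines:

	/* ve->siblings[] = { vcs0, vcs2 }, user mask = 0b10 */
	emask = vcs2->mask; /* a physical engine bit, not a sibling index */

which is what allows the tasklet to test mask & sibling->mask directly.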
> +
> +int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
> + struct intel_engine_cs *master,
> + unsigned long mask)
> +{
> + struct virtual_engine *ve = to_virtual_engine(engine);
> + struct ve_bond *bond;
> +
> + if (mask >> ve->count)
> + return -EINVAL;
> +
> + mask = virtual_execution_mask(ve, mask);
> + if (!mask)
> + return -EINVAL;
> +
> + bond = virtual_find_bond(ve, master);
> + if (bond) {
> + bond->sibling_mask |= mask;
> + return 0;
> + }
> +
> + bond = krealloc(ve->bonds,
> + sizeof(*bond) * (ve->nbond + 1),
> + GFP_KERNEL);
> + if (!bond)
> + return -ENOMEM;
> +
> + bond[ve->nbond].master = master;
> + bond[ve->nbond].sibling_mask = mask;
> +
> + ve->bonds = bond;
> + ve->nbond++;
> +
> + return 0;
> +}
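
Also worth noting that repeated attaches for the same master accumulate
rather than replace:

	intel_virtual_engine_attach_bond(engine, master, BIT(0));
	intel_virtual_engine_attach_bond(engine, master, BIT(1));
	/* bond->sibling_mask now covers siblings[0] and siblings[1] */

which seems like sensible uapi behaviour for building up the mask piecemeal.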
> +
> void intel_virtual_engine_destroy(struct intel_engine_cs *engine)
> {
> struct virtual_engine *ve = to_virtual_engine(engine);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 9d90dc68e02b..77b85648045a 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -121,6 +121,10 @@ struct intel_engine_cs *
> intel_execlists_clone_virtual(struct i915_gem_context *ctx,
> struct intel_engine_cs *src);
>
> +int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
> + struct intel_engine_cs *master,
> + unsigned long mask);
> +
> void intel_virtual_engine_destroy(struct intel_engine_cs *engine);
>
> u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu);
> diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> index 4b8a339529d1..d2e00175c417 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> @@ -13,6 +13,7 @@
> #include "igt_live_test.h"
> #include "igt_spinner.h"
> #include "i915_random.h"
> +#include "lib_sw_fence.h"
>
> #include "mock_context.h"
>
> @@ -1224,6 +1225,189 @@ static int live_virtual_engine(void *arg)
> return err;
> }
>
> +static int bond_virtual_engine(struct drm_i915_private *i915,
> + unsigned int class,
> + struct intel_engine_cs **siblings,
> + unsigned int nsibling,
> + unsigned int flags)
> +#define BOND_SCHEDULE BIT(0)
> +{
> + struct intel_engine_cs *master;
> + struct i915_gem_context *ctx;
> + struct i915_request *rq[16];
> + enum intel_engine_id id;
> + unsigned long n;
> + int err;
> +
> + GEM_BUG_ON(nsibling >= ARRAY_SIZE(rq) - 1);
> +
> + ctx = kernel_context(i915);
> + if (!ctx)
> + return -ENOMEM;
> +
> + err = 0;
> + rq[0] = ERR_PTR(-ENOMEM);
> + for_each_engine(master, i915, id) {
> + struct i915_sw_fence fence = {};
> +
> + if (master->class == class)
> + continue;
> +
> + memset_p((void *)rq, ERR_PTR(-EINVAL), ARRAY_SIZE(rq));
> +
> + rq[0] = i915_request_alloc(master, ctx);
> + if (IS_ERR(rq[0])) {
> + err = PTR_ERR(rq[0]);
> + goto out;
> + }
> + i915_request_get(rq[0]);
> +
> + if (flags & BOND_SCHEDULE) {
> + onstack_fence_init(&fence);
> + err = i915_sw_fence_await_sw_fence_gfp(&rq[0]->submit,
> + &fence,
> + GFP_KERNEL);
> + }
> + i915_request_add(rq[0]);
> + if (err < 0)
> + goto out;
> +
> + for (n = 0; n < nsibling; n++) {
> + struct intel_engine_cs *engine;
> +
> + engine = intel_execlists_create_virtual(ctx,
> + siblings,
> + nsibling);
> + if (IS_ERR(engine)) {
> + err = PTR_ERR(engine);
> + onstack_fence_fini(&fence);
> + goto out;
> + }
> +
> + err = intel_virtual_engine_attach_bond(engine,
> + master,
> + BIT(n));
> + if (err) {
> + intel_virtual_engine_destroy(engine);
> + onstack_fence_fini(&fence);
> + goto out;
> + }
> +
> + rq[n + 1] = i915_request_alloc(engine, ctx);
> + if (IS_ERR(rq[n + 1])) {
> + err = PTR_ERR(rq[n + 1]);
> + intel_virtual_engine_destroy(engine);
> + onstack_fence_fini(&fence);
> + goto out;
> + }
> + i915_request_get(rq[n + 1]);
> +
> + err = i915_request_await_execution(rq[n + 1],
> + &rq[0]->fence,
> + engine->bond_execute);
> + i915_request_add(rq[n + 1]);
> + intel_virtual_engine_destroy(engine);
> + if (err < 0) {
> + onstack_fence_fini(&fence);
> + goto out;
> + }
> + }
> + onstack_fence_fini(&fence);
> +
> + if (i915_request_wait(rq[0],
> + I915_WAIT_LOCKED,
> + HZ / 10) < 0) {
> + pr_err("Master request did not execute (on %s)!\n",
> + rq[0]->engine->name);
> + err = -EIO;
> + goto out;
> + }
> +
> + for (n = 0; n < nsibling; n++) {
> + if (i915_request_wait(rq[n + 1],
> + I915_WAIT_LOCKED,
> + MAX_SCHEDULE_TIMEOUT) < 0) {
> + err = -EIO;
> + goto out;
> + }
> +
> + if (rq[n + 1]->engine != siblings[n]) {
> + pr_err("Bonded request did not execute on target engine: expected %s, used %s; master was %s\n",
> + siblings[n]->name,
> + rq[n + 1]->engine->name,
> + rq[0]->engine->name);
> + err = -EINVAL;
> + goto out;
> + }
> + }
> +
> + for (n = 0; !IS_ERR(rq[n]); n++)
> + i915_request_put(rq[n]);
> + rq[0] = ERR_PTR(-ENOMEM);
> + }
> +
> +out:
> + for (n = 0; !IS_ERR(rq[n]); n++)
> + i915_request_put(rq[n]);
> + if (igt_flush_test(i915, I915_WAIT_LOCKED))
> + err = -EIO;
> +
> + kernel_context_close(ctx);
> + return err;
> +}
> +
> +static int live_virtual_bond(void *arg)
> +{
> + static const struct phase {
> + const char *name;
> + unsigned int flags;
> + } phases[] = {
> + { "", 0 },
> + { "schedule", BOND_SCHEDULE },
> + { },
> + };
> + struct drm_i915_private *i915 = arg;
> + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
> + unsigned int class, inst;
> + int err = 0;
> +
> + if (USES_GUC_SUBMISSION(i915))
> + return 0;
> +
> + mutex_lock(&i915->drm.struct_mutex);
> +
> + for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
> + const struct phase *p;
> + int nsibling;
> +
> + nsibling = 0;
> + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
> + if (!i915->engine_class[class][inst])
> + break;
> +
> + GEM_BUG_ON(nsibling == ARRAY_SIZE(siblings));
> + siblings[nsibling++] = i915->engine_class[class][inst];
> + }
> + if (nsibling < 2)
> + continue;
> +
> + for (p = phases; p->name; p++) {
> + err = bond_virtual_engine(i915,
> + class, siblings, nsibling,
> + p->flags);
> + if (err) {
> + pr_err("%s(%s): failed class=%d, nsibling=%d, err=%d\n",
> + __func__, p->name, class, nsibling, err);
> + goto out_unlock;
> + }
> + }
> + }
> +
> +out_unlock:
> + mutex_unlock(&i915->drm.struct_mutex);
> + return err;
> +}
> +
> int intel_execlists_live_selftests(struct drm_i915_private *i915)
> {
> static const struct i915_subtest tests[] = {
> @@ -1236,6 +1420,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
> SUBTEST(live_preempt_hang),
> SUBTEST(live_preempt_smoke),
> SUBTEST(live_virtual_engine),
> + SUBTEST(live_virtual_bond),
> };
>
> if (!HAS_EXECLISTS(i915))
> diff --git a/drivers/gpu/drm/i915/selftests/lib_sw_fence.c b/drivers/gpu/drm/i915/selftests/lib_sw_fence.c
> index 2bfa72c1654b..b976c12817c5 100644
> --- a/drivers/gpu/drm/i915/selftests/lib_sw_fence.c
> +++ b/drivers/gpu/drm/i915/selftests/lib_sw_fence.c
> @@ -45,6 +45,9 @@ void __onstack_fence_init(struct i915_sw_fence *fence,
>
> void onstack_fence_fini(struct i915_sw_fence *fence)
> {
> + if (!fence->flags)
> + return;
> +
> i915_sw_fence_commit(fence);
> i915_sw_fence_fini(fence);
> }
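
I take it the !fence->flags check is so the new selftest can call
onstack_fence_fini() on its zero-initialised on-stack fence in the phase
where BOND_SCHEDULE is not set and onstack_fence_init() was never run? A
comment to that effect would not hurt, e.g.:

	/* No-op if the fence was never initialised (flags == 0). */
	if (!fence->flags)
		return;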
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index c057536df5cd..ed33b8af8692 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1532,6 +1532,10 @@ struct drm_i915_gem_context_param {
> * sized argument, will revert back to default settings.
> *
> * See struct i915_context_param_engines.
> + *
> + * Extensions:
> + * i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE)
> + * i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
> */
> #define I915_CONTEXT_PARAM_ENGINES 0xa
> /* Must be kept compact -- no holes and well documented */
> @@ -1627,9 +1631,38 @@ struct i915_context_engines_load_balance {
> __u64 mbz64[4]; /* reserved for future use; must be zero */
> };
>
> +/*
> + * i915_context_engines_bond:
> + *
> + * Construct bonded pairs for execution within a virtual engine.
> + *
> + * All engines are equal, but some are more equal than others. Given
> + * the distribution of resources in the HW, it may be preferable to run
> + * a request on a given subset of engines in parallel to a request on a
> + * specific engine. We enable this selection of engines within a virtual
> + * engine by specifying bonding pairs; for any given master engine, we will
> + * only execute on one of the corresponding siblings within the virtual engine.
> + *
> + * To execute a request in parallel on the master engine and a sibling requires
> + * coordination with an I915_EXEC_FENCE_SUBMIT.
> + */
> +struct i915_context_engines_bond {
> + struct i915_user_extension base;
> +
> + __u16 virtual_index; /* index of virtual engine in ctx->engines[] */
> + __u16 mbz;
> +
> + __u16 master_class;
> + __u16 master_instance;
> +
> + __u64 sibling_mask; /* bitmask of BIT(sibling_index) wrt the virtual engine */
> + __u64 flags; /* all undefined flags must be zero */
> +};
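
An illustrative chain for setting this up via I915_CONTEXT_PARAM_ENGINES
(untested sketch; the indices and choice of master are made up):

	struct i915_context_engines_bond bond = {
		.base = { .name = I915_CONTEXT_ENGINES_EXT_BOND },
		.virtual_index = 0, /* engines[0] is the virtual engine */
		.master_class = I915_ENGINE_CLASS_RENDER,
		.master_instance = 0,
		.sibling_mask = 1 << 1, /* pair rcs0 only with siblings[1] */
	};

with bond.base chained from i915_context_param_engines.extensions.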
> +
> struct i915_context_param_engines {
> __u64 extensions; /* linked chain of extension blocks, 0 terminates */
> #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0
> +#define I915_CONTEXT_ENGINES_EXT_BOND 1
>
> struct {
> __u16 engine_class; /* see enum drm_i915_gem_engine_class */
>
Regards,
Tvrtko