[Intel-gfx] [PATCH 12/13] drm/i915/execlists: Virtual engine bonding

Mon Mar 11 13:38:52 UTC 2019

On 08/03/2019 14:12, Chris Wilson wrote:
> Some users require that when a master batch is executed on one particular
> engine, a companion batch is run simultaneously on a specific slave
> engine. For this purpose, we introduce virtual engine bonding, allowing
> maps of master:slaves to be constructed to constrain which physical
> engines a virtual engine may select given a fence on a master engine.
> 
> For the moment, we continue to ignore the issue of preemption deferring
> the master request for later. Ideally, we would like to then also remove
> the slave and run something else rather than have it stall the pipeline.
> With load balancing, we should be able to move workload around it, but
> there is a similar stall on the master pipeline while it may wait for
> the slave to be executed. At the cost of more latency for the bonded
> request, it may be interesting to launch both on their engines in
> lockstep. (Bubbles abound.)
> 
> Opens: Also what about bonding an engine as its own master? It doesn't
> break anything internally, so allow the silliness.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem_context.c    |  50 ++++++
>   drivers/gpu/drm/i915/i915_request.c        |   1 +
>   drivers/gpu/drm/i915/i915_request.h        |   1 +
>   drivers/gpu/drm/i915/intel_engine_types.h  |   7 +
>   drivers/gpu/drm/i915/intel_lrc.c           | 111 ++++++++++++++
>   drivers/gpu/drm/i915/intel_lrc.h           |   4 +
>   drivers/gpu/drm/i915/selftests/intel_lrc.c | 167 +++++++++++++++++++++
>   include/uapi/drm/i915_drm.h                |  22 +++
>   8 files changed, 363 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 13b79980f7f3..0d86306497b8 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -1484,8 +1484,58 @@ set_engines__load_balance(struct i915_user_extension __user *base, void *data)
>   	return 0;
>   }
>   
> +static int
> +set_engines__bond(struct i915_user_extension __user *base, void *data)
> +{
> +	struct i915_context_engines_bond __user *ext =
> +		container_of_user(base, typeof(*ext), base);
> +	const struct set_engines *set = data;
> +	struct intel_engine_cs *master;
> +	u32 class, instance, siblings;
> +	u16 idx;
> +	int err;
> +
> +	if (get_user(idx, &ext->engine_index))
> +		return -EFAULT;
> +
> +	if (idx >= set->nengine)
> +		return -EINVAL;
> +
> +	idx = array_index_nospec(idx, set->nengine);
> +	if (!set->engines[idx])
> +		return -EINVAL;
> +
> +	/*
> +	 * A non-virtual engine has 0 siblings to choose between; and submit
> +	 * fence will always be directed to the one engine.
> +	 */
> +	if (!intel_engine_is_virtual(set->engines[idx]))
> +		return 0;
> +
> +	err = check_user_mbz16(&ext->mbz);
> +	if (err)
> +		return err;
> +
> +	if (get_user(class, &ext->master_class))
> +		return -EFAULT;
> +
> +	if (get_user(instance, &ext->master_instance))
> +		return -EFAULT;
> +
> +	master = intel_engine_lookup_user(set->ctx->i915, class, instance);
> +	if (!master)
> +		return -EINVAL;
> +
> +	if (get_user(siblings, &ext->sibling_mask))
> +		return -EFAULT;
> +
> +	return intel_virtual_engine_attach_bond(set->engines[idx],
> +						master, siblings);
> +}
> +
>   static const i915_user_extension_fn set_engines__extensions[] = {
>   	[I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance,
> +	[I915_CONTEXT_ENGINES_EXT_BOND] = set_engines__bond,
>   };
>   
>   static int
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 5527ab22dbf2..0caf31de2b98 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -743,6 +743,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	rq->batch = NULL;
>   	rq->capture_list = NULL;
>   	rq->waitboost = false;
> +	rq->execution_mask = ~0u;
>   
>   	/*
>   	 * Reserve space in the ring buffer for all the commands required to
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index d4f6b2940130..862b25930de0 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -145,6 +145,7 @@ struct i915_request {
>   	 */
>   	struct i915_sched_node sched;
>   	struct i915_dependency dep;
> +	unsigned int execution_mask;
>   
>   	/*
>   	 * A convenience pointer to the current breadcrumb value stored in
> diff --git a/drivers/gpu/drm/i915/intel_engine_types.h b/drivers/gpu/drm/i915/intel_engine_types.h
> index d54d2a1840cc..6dfcf5cc08c1 100644
> --- a/drivers/gpu/drm/i915/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/intel_engine_types.h
> @@ -382,6 +382,13 @@ struct intel_engine_cs {
>   	 */
>   	void		(*submit_request)(struct i915_request *rq);
>   
> +	/*
> +	 * Called on signaling of a SUBMIT_FENCE, passing along the signaling
> +	 * request down to the bonded pairs.
> +	 */
> +	void            (*bond_execute)(struct i915_request *rq,
> +					struct dma_fence *signal);
> +
>   	/*
>   	 * Call when the priority on a request has changed and it and its
>   	 * dependencies may need rescheduling. Note the request itself may
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 0c97e8f30223..f06312d185af 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -179,6 +179,12 @@ struct virtual_engine {
>   		int prio;
>   	} nodes[I915_NUM_ENGINES];
>   
> +	struct ve_bond {
> +		struct intel_engine_cs *master;
> +		unsigned int sibling_mask;
> +	} *bonds;
> +	unsigned int nbond;
> +
>   	unsigned int count;
>   	struct intel_engine_cs *siblings[0];
>   };
> @@ -3183,6 +3189,7 @@ static const struct intel_context_ops virtual_context_ops = {
>   static void virtual_submission_tasklet(unsigned long data)
>   {
>   	struct virtual_engine * const ve = (struct virtual_engine *)data;
> +	unsigned int mask;
>   	unsigned int n;
>   	int prio;
>   
> @@ -3191,12 +3198,30 @@ static void virtual_submission_tasklet(unsigned long data)
>   		return;
>   
>   	local_irq_disable();
> +
> +	mask = 0;
> +	spin_lock(&ve->base.timeline.lock);
> +	if (ve->request)
> +		mask = ve->request->execution_mask;
> +	spin_unlock(&ve->base.timeline.lock);
> +
>   	for (n = 0; READ_ONCE(ve->request) && n < ve->count; n++) {
>   		struct intel_engine_cs *sibling = ve->siblings[n];
>   		struct ve_node * const node = &ve->nodes[sibling->id];
>   		struct rb_node **parent, *rb;
>   		bool first;
> 

/* Check if the request can be executed on this sibling, since that cannot
be known in advance (at the time it is inserted into the queue)..? */

> +		if (unlikely(!(mask & sibling->mask))) {
> +			if (!RB_EMPTY_NODE(&node->rb)) {
> +				spin_lock(&sibling->timeline.lock);
> +				rb_erase_cached(&node->rb,
> +						&sibling->execlists.virtual);
> +				RB_CLEAR_NODE(&node->rb);
> +				spin_unlock(&sibling->timeline.lock);
> +			}
> +			continue;
> +		}
> +
>   		spin_lock(&sibling->timeline.lock);
>   
>   		if (!RB_EMPTY_NODE(&node->rb)) {
> @@ -3254,6 +3279,30 @@ static void virtual_submit_request(struct i915_request *request)
>   	tasklet_schedule(&ve->base.execlists.tasklet);
>   }
>   
> +static struct ve_bond *
> +virtual_find_bond(struct virtual_engine *ve, struct intel_engine_cs *master)
> +{
> +	int i;
> +
> +	for (i = 0; i < ve->nbond; i++) {
> +		if (ve->bonds[i].master == master)
> +			return &ve->bonds[i];
> +	}
> +
> +	return NULL;
> +}
> +
> +static void
> +virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
> +{
> +	struct virtual_engine *ve = to_virtual_engine(rq->engine);
> +	struct ve_bond *bond;
> +
> +	bond = virtual_find_bond(ve, to_request(signal)->engine);
> +	if (bond) /* XXX serialise with rq->lock? */
> +		rq->execution_mask &= bond->sibling_mask;
> +}
> +
>   struct intel_engine_cs *
>   intel_execlists_create_virtual(struct i915_gem_context *ctx,
>   			       struct intel_engine_cs **siblings,
> @@ -3294,6 +3343,7 @@ intel_execlists_create_virtual(struct i915_gem_context *ctx,
>   
>   	ve->base.schedule = i915_schedule;
>   	ve->base.submit_request = virtual_submit_request;
> +	ve->base.bond_execute = virtual_bond_execute;
>   
>   	ve->base.execlists.queue_priority_hint = INT_MIN;
>   	tasklet_init(&ve->base.execlists.tasklet,
> @@ -3369,9 +3419,70 @@ intel_execlists_clone_virtual(struct i915_gem_context *ctx,
>   	if (IS_ERR(dst))
>   		return dst;
>   
> +	if (se->nbond) {
> +		struct virtual_engine *de = to_virtual_engine(dst);
> +
> +		de->bonds = kmemdup(se->bonds,
> +				    sizeof(*se->bonds) * se->nbond,
> +				    GFP_KERNEL);
> +		if (!de->bonds) {
> +			intel_virtual_engine_destroy(dst);
> +			return ERR_PTR(-ENOMEM);
> +		}
> +
> +		de->nbond = se->nbond;
> +	}
> +
>   	return dst;
>   }
>   
> +static unsigned long
> +virtual_execution_mask(struct virtual_engine *ve, unsigned long mask)
> +{
> +	unsigned long emask = 0;
> +	int bit;
> +
> +	for_each_set_bit(bit, &mask, ve->count)
> +		emask |= ve->siblings[bit]->mask;
> +
> +	return emask;
> +}
> +
> +int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
> +				     struct intel_engine_cs *master,
> +				     unsigned long mask)
> +{
> +	struct virtual_engine *ve = to_virtual_engine(engine);
> +	struct ve_bond *bond;
> +
> +	if (mask >> ve->count)
> +		return -EINVAL;
> +
> +	mask = virtual_execution_mask(ve, mask);
> +	if (!mask)
> +		return -EINVAL;
> +
> +	bond = virtual_find_bond(ve, master);
> +	if (bond) {
> +		bond->sibling_mask |= mask;
> +		return 0;
> +	}
> +
> +	bond = krealloc(ve->bonds,
> +			sizeof(*bond) * (ve->nbond + 1),
> +			GFP_KERNEL);
> +	if (!bond)
> +		return -ENOMEM;
> +
> +	bond[ve->nbond].master = master;
> +	bond[ve->nbond].sibling_mask = mask;
> +
> +	ve->bonds = bond;
> +	ve->nbond++;
> +
> +	return 0;
> +}
> +
>   void intel_virtual_engine_destroy(struct intel_engine_cs *engine)
>   {

I think there should be a hunk in here to free ve->bonds.

>   	struct virtual_engine *ve = to_virtual_engine(engine);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 9d90dc68e02b..77b85648045a 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -121,6 +121,10 @@ struct intel_engine_cs *
>   intel_execlists_clone_virtual(struct i915_gem_context *ctx,
>   			      struct intel_engine_cs *src);
>   
> +int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
> +				     struct intel_engine_cs *master,
> +				     unsigned long mask);
> +
>   void intel_virtual_engine_destroy(struct intel_engine_cs *engine);
>   
>   u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu);
> diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> index 4b8a339529d1..a7de7a8fc24a 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> @@ -13,6 +13,7 @@
>   #include "igt_live_test.h"
>   #include "igt_spinner.h"
>   #include "i915_random.h"
> +#include "lib_sw_fence.h"
>   
>   #include "mock_context.h"
>   
> @@ -1224,6 +1225,171 @@ static int live_virtual_engine(void *arg)
>   	return err;
>   }
>   
> +static int bond_virtual_engine(struct drm_i915_private *i915,
> +			       unsigned int class,
> +			       struct intel_engine_cs **siblings,
> +			       unsigned int nsibling,
> +			       unsigned int flags)
> +#define BOND_SCHEDULE BIT(0)
> +{
> +	struct intel_engine_cs *master;
> +	struct i915_gem_context *ctx;
> +	struct i915_request *rq[16];
> +	enum intel_engine_id id;
> +	unsigned long n;
> +	int err;
> +
> +	GEM_BUG_ON(nsibling >= ARRAY_SIZE(rq) - 1);
> +
> +	ctx = kernel_context(i915);
> +	if (!ctx)
> +		return -ENOMEM;
> +
> +	err = 0;
> +	rq[0] = ERR_PTR(-ENOMEM);
> +	for_each_engine(master, i915, id) {
> +		struct i915_sw_fence fence;
> +
> +		if (master->class == class)
> +			continue;
> +
> +		rq[0] = i915_request_alloc(master, ctx);
> +		if (IS_ERR(rq[0])) {
> +			err = PTR_ERR(rq[0]);
> +			goto out;
> +		}
> +
> +		if (flags & BOND_SCHEDULE)
> +			onstack_fence_init(&fence);
> +
> +		i915_request_get(rq[0]);
> +		i915_request_add(rq[0]);
> +
> +		for (n = 0; n < nsibling; n++) {
> +			struct intel_engine_cs *engine;
> +
> +			engine = intel_execlists_create_virtual(ctx,
> +								siblings,
> +								nsibling);
> +			if (IS_ERR(engine)) {
> +				err = PTR_ERR(engine);
> +				goto out;
> +			}
> +
> +			err = intel_virtual_engine_attach_bond(engine,
> +							       master,
> +							       BIT(n));
> +			if (err) {
> +				intel_virtual_engine_destroy(engine);
> +				goto out;
> +			}
> +
> +			rq[n + 1] = i915_request_alloc(engine, ctx);
> +			if (IS_ERR(rq[n + 1])) {
> +				err = PTR_ERR(rq[n + 1]);
> +				intel_virtual_engine_destroy(engine);
> +				goto out;
> +			}
> +			i915_request_get(rq[n + 1]);
> +
> +			err = i915_request_await_execution(rq[n + 1],
> +							   &rq[0]->fence,
> +							   engine->bond_execute);
> +			i915_request_add(rq[n + 1]);
> +			intel_virtual_engine_destroy(engine);
> +			if (err < 0)
> +				goto out;
> +		}
> +		rq[n + 1] = ERR_PTR(-EINVAL);
> +
> +		if (flags & BOND_SCHEDULE)
> +			onstack_fence_fini(&fence);

The idea of this fence is to delay rq[0]? But I don't see it passed in 
anywhere? I wanted to suggest checking, before onstack_fence_fini, that 
neither rq[0] nor any other request has been submitted/completed yet.

> +
> +		for (n = 0; n < nsibling; n++) {
> +			if (i915_request_wait(rq[n + 1],
> +					      I915_WAIT_LOCKED,
> +					      MAX_SCHEDULE_TIMEOUT) < 0) {
> +				err = -EIO;
> +				goto out;
> +			}
> +
> +			if (rq[n + 1]->engine != siblings[n]) {
> +				pr_err("Bonded request did not execute on target engine: expected %s, used %s; master was %s\n",
> +				       siblings[n]->name,
> +				       rq[n + 1]->engine->name,
> +				       rq[0]->engine->name);
> +				err = -EINVAL;
> +				goto out;
> +			}
> +		}
> +
> +		for (n = 0; !IS_ERR(rq[n]); n++)
> +			i915_request_put(rq[n]);
> +		rq[0] = ERR_PTR(-ENOMEM);
> +	}
> +
> +out:
> +	for (n = 0; !IS_ERR(rq[n]); n++)
> +		i915_request_put(rq[n]);
> +	if (igt_flush_test(i915, I915_WAIT_LOCKED))
> +		err = -EIO;
> +
> +	kernel_context_close(ctx);
> +	return err;
> +}
> +
> +static int live_virtual_bond(void *arg)
> +{
> +	static const struct phase {
> +		const char *name;
> +		unsigned int flags;
> +	} phases[] = {
> +		{ "", 0 },
> +		{ "schedule", BOND_SCHEDULE },
> +		{ },
> +	};
> +	struct drm_i915_private *i915 = arg;
> +	struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
> +	unsigned int class, inst;
> +	int err = 0;
> +
> +	if (USES_GUC_SUBMISSION(i915))
> +		return 0;

One day no one will remember to add the missing coverage, but we don't have 
selftest skips... a task for an intern.

> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +
> +	for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
> +		const struct phase *p;
> +		int nsibling;
> +
> +		nsibling = 0;
> +		for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
> +			if (!i915->engine_class[class][inst])
> +				break;
> +
> +			GEM_BUG_ON(nsibling == ARRAY_SIZE(siblings));
> +			siblings[nsibling++] = i915->engine_class[class][inst];
> +		}
> +		if (nsibling < 2)
> +			continue;
> +
> +		for (p = phases; p->name; p++) {
> +			err = bond_virtual_engine(i915,
> +						  class, siblings, nsibling,
> +						  p->flags);
> +			if (err) {
> +				pr_err("%s(%s): failed class=%d, nsibling=%d, err=%d\n",
> +				       __func__, p->name, class, nsibling, err);
> +				goto out_unlock;
> +			}
> +		}
> +	}
> +
> +out_unlock:
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	return err;
> +}
> +
>   int intel_execlists_live_selftests(struct drm_i915_private *i915)
>   {
>   	static const struct i915_subtest tests[] = {
> @@ -1236,6 +1402,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
>   		SUBTEST(live_preempt_hang),
>   		SUBTEST(live_preempt_smoke),
>   		SUBTEST(live_virtual_engine),
> +		SUBTEST(live_virtual_bond),
>   	};
>   
>   	if (!HAS_EXECLISTS(i915))
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 592b02676044..94e72ae954a0 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1530,6 +1530,10 @@ struct drm_i915_gem_context_param {
>    * sized argument, will revert back to default settings.
>    *
>    * See struct i915_context_param_engines.
> + *
> + * Extensions:
> + *   i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE)
> + *   i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
>    */
>   #define I915_CONTEXT_PARAM_ENGINES	0xa
>   /* Must be kept compact -- no holes and well documented */
> @@ -1625,9 +1629,27 @@ struct i915_context_engines_load_balance {
>   	__u64 mbz64[4]; /* reserved for future use; must be zero */
>   };
>   
> +/*
> + * i915_context_engines_bond:
> + *
> + */
> +struct i915_context_engines_bond {
> +	struct i915_user_extension base;
> +
> +	__u16 engine_index;
> +	__u16 mbz;
> +
> +	__u16 master_class;
> +	__u16 master_instance;
> +
> +	__u64 sibling_mask;
> +	__u64 flags; /* all undefined flags must be zero */
> +};

Need some api doc for it all.

We are not in danger of any compiler-added padding, due to not leaving any 
holes, right? So no need to define this as packed?

> +
>   struct i915_context_param_engines {
>   	__u64 extensions; /* linked chain of extension blocks, 0 terminates */
>   #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0
> +#define I915_CONTEXT_ENGINES_EXT_BOND 1
>   
>   	struct {
>   		__u16 engine_class; /* see enum drm_i915_gem_engine_class */
> 

Regards,

Tvrtko