[PATCH 44/47] drm/i915/execlists: Virtual engine bonding

Sun Feb 24 19:06:56 UTC 2019

Some users require that when a master batch is executed on one particular
engine, a companion batch is run simultaneously on a specific slave
engine. For this purpose, we introduce virtual engine bonding, allowing
maps of master:slaves to be constructed to constrain which physical
engines a virtual engine may select given a fence on a master engine.

For the moment, we continue to ignore the issue of preemption deferring
the master request for later. Ideally, we would like to then also remove
the slave and run something else rather than have it stall the pipeline.
With load balancing, we should be able to move workload around it, but
there is a similar stall on the master pipeline while it may wait for
the slave to be executed. At the cost of more latency for the bonded
request, it may be interesting to launch both on their engines in
lockstep. (Bubbles abound.)

Opens: Also what about bonding an engine as its own master? It doesn't
break anything internally, so allow the silliness.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_context.c    |  45 ++++++
 drivers/gpu/drm/i915/i915_request.c        |   1 +
 drivers/gpu/drm/i915/i915_request.h        |   1 +
 drivers/gpu/drm/i915/intel_engine_types.h  |   7 +
 drivers/gpu/drm/i915/intel_lrc.c           |  97 ++++++++++++
 drivers/gpu/drm/i915/intel_lrc.h           |   3 +
 drivers/gpu/drm/i915/selftests/intel_lrc.c | 167 +++++++++++++++++++++
 include/uapi/drm/i915_drm.h                |  18 +++
 8 files changed, 339 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index f26e364cd705..8297d51804a6 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1329,6 +1329,16 @@ static int set_sseu(struct i915_gem_context *ctx,
 	return 0;
 };
 
+static int check_user_mbz32(u32 __user *user)
+{
+	u32 mbz;
+
+	if (get_user(mbz, user))
+		return -EFAULT;
+
+	return mbz ? -EINVAL : 0;
+}
+
 static int check_user_mbz64(u64 __user *user)
 {
 	u64 mbz;
@@ -1412,8 +1422,43 @@ set_engines__load_balance(struct i915_user_extension __user *base, void *data)
 	return 0;
 }
 
+static int
+set_engines__bond(struct i915_user_extension __user *base, void *data)
+{
+	struct i915_context_engines_bond __user *ext =
+		container_of_user(base, typeof(*ext), base);
+	const struct set_engines *set = data;
+	struct intel_engine_cs *master;
+	u32 class, instance, siblings;
+	int err;
+
+	if (!set->engines[0])
+		return -EINVAL;
+
+	err = check_user_mbz32(&ext->flags);
+	if (err)
+		return err;
+
+	if (get_user(class, &ext->master_class))
+		return -EFAULT;
+
+	if (get_user(instance, &ext->master_instance))
+		return -EFAULT;
+
+	master = intel_engine_lookup_user(set->ctx->i915, class, instance);
+	if (!master)
+		return -EINVAL;
+
+	if (get_user(siblings, &ext->sibling_mask))
+		return -EFAULT;
+
+	return intel_virtual_engine_attach_bond(set->engines[0],
+						master, siblings);
+}
+
 static const i915_user_extension_fn set_engines__extensions[] = {
 	[I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance,
+	[I915_CONTEXT_ENGINES_EXT_BOND] = set_engines__bond,
 };
 
 static int
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 9799564faaf3..56fedc2849c9 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -741,6 +741,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	rq->batch = NULL;
 	rq->capture_list = NULL;
 	rq->waitboost = false;
+	rq->execution_mask = ~0u;
 
 	/*
 	 * Reserve space in the ring buffer for all the commands required to
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index fef6aabbb1a4..01de566c8e73 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -145,6 +145,7 @@ struct i915_request {
 	 */
 	struct i915_sched_node sched;
 	struct i915_dependency dep;
+	unsigned int execution_mask;
 
 	/*
 	 * A convenience pointer to the current breadcrumb value stored in
diff --git a/drivers/gpu/drm/i915/intel_engine_types.h b/drivers/gpu/drm/i915/intel_engine_types.h
index fe59fa24013e..e455a09a7100 100644
--- a/drivers/gpu/drm/i915/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/intel_engine_types.h
@@ -382,6 +382,13 @@ struct intel_engine_cs {
 	 */
 	void		(*submit_request)(struct i915_request *rq);
 
+	/*
+	 * Called on signaling of a SUBMIT_FENCE, passing along the signaling
+	 * request down to the bonded pairs.
+	 */
+	void            (*bond_execute)(struct i915_request *rq,
+					struct dma_fence *signal);
+
 	/*
 	 * Call when the priority on a request has changed and it and its
 	 * dependencies may need rescheduling. Note the request itself may
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 8c29f8093a21..45d30d1a580f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -178,6 +178,12 @@ struct virtual_engine {
 		int prio;
 	} nodes[I915_NUM_ENGINES];
 
+	struct ve_bond {
+		struct intel_engine_cs *master;
+		unsigned int sibling_mask;
+	} *bonds;
+	unsigned int nbond;
+
 	unsigned int count;
 	struct intel_engine_cs *siblings[0];
 };
@@ -3184,6 +3190,7 @@ static const struct intel_context_ops virtual_context_ops = {
 static void virtual_submission_tasklet(unsigned long data)
 {
 	struct virtual_engine * const ve = (struct virtual_engine *)data;
+	unsigned int mask;
 	unsigned int n;
 	int prio;
 
@@ -3192,12 +3199,30 @@ static void virtual_submission_tasklet(unsigned long data)
 		return;
 
 	local_irq_disable();
+
+	mask = 0;
+	spin_lock(&ve->base.timeline.lock);
+	if (ve->request)
+		mask = ve->request->execution_mask;
+	spin_unlock(&ve->base.timeline.lock);
+
 	for (n = 0; READ_ONCE(ve->request) && n < ve->count; n++) {
 		struct intel_engine_cs *sibling = ve->siblings[n];
 		struct ve_node * const node = &ve->nodes[sibling->id];
 		struct rb_node **parent, *rb;
 		bool first;
 
+		if (unlikely(!(mask & sibling->mask))) {
+			if (!RB_EMPTY_NODE(&node->rb)) {
+				spin_lock(&sibling->timeline.lock);
+				rb_erase_cached(&node->rb,
+						&sibling->execlists.virtual);
+				RB_CLEAR_NODE(&node->rb);
+				spin_unlock(&sibling->timeline.lock);
+			}
+			continue;
+		}
+
 		spin_lock(&sibling->timeline.lock);
 
 		if (!RB_EMPTY_NODE(&node->rb)) {
@@ -3255,6 +3280,30 @@ static void virtual_submit_request(struct i915_request *request)
 	tasklet_schedule(&ve->base.execlists.tasklet);
 }
 
+static struct ve_bond *
+virtual_find_bond(struct virtual_engine *ve, struct intel_engine_cs *master)
+{
+	int i;
+
+	for (i = 0; i < ve->nbond; i++) {
+		if (ve->bonds[i].master == master)
+			return &ve->bonds[i];
+	}
+
+	return NULL;
+}
+
+static void
+virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
+{
+	struct virtual_engine *ve = to_virtual_engine(rq->engine);
+	struct ve_bond *bond;
+
+	bond = virtual_find_bond(ve, to_request(signal)->engine);
+	if (bond) /* XXX serialise with rq->lock? */
+		rq->execution_mask &= bond->sibling_mask;
+}
+
 struct intel_engine_cs *
 intel_execlists_create_virtual(struct i915_gem_context *ctx,
 			       struct intel_engine_cs **siblings,
@@ -3294,6 +3343,7 @@ intel_execlists_create_virtual(struct i915_gem_context *ctx,
 
 	ve->base.schedule = i915_schedule;
 	ve->base.submit_request = virtual_submit_request;
+	ve->base.bond_execute = virtual_bond_execute;
 
 	ve->base.execlists.queue_priority_hint = INT_MIN;
 	tasklet_init(&ve->base.execlists.tasklet,
@@ -3369,6 +3419,53 @@ intel_virtual_engine_get_siblings(struct intel_engine_cs *engine,
 	return ve->count;
 }
 
+static unsigned long
+virtual_execution_mask(struct virtual_engine *ve, unsigned long mask)
+{
+	unsigned long emask = 0;
+	int bit;
+
+	for_each_set_bit(bit, &mask, ve->count)
+		emask |= ve->siblings[bit]->mask;
+
+	return emask;
+}
+
+int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
+				     struct intel_engine_cs *master,
+				     unsigned long mask)
+{
+	struct virtual_engine *ve = to_virtual_engine(engine);
+	struct ve_bond *bond;
+
+	if (mask >> ve->count)
+		return -EINVAL;
+
+	mask = virtual_execution_mask(ve, mask);
+	if (!mask)
+		return -EINVAL;
+
+	bond = virtual_find_bond(ve, master);
+	if (bond) {
+		bond->sibling_mask |= mask;
+		return 0;
+	}
+
+	bond = krealloc(ve->bonds,
+			sizeof(*bond) * (ve->nbond + 1),
+			GFP_KERNEL);
+	if (!bond)
+		return -ENOMEM;
+
+	bond[ve->nbond].master = master;
+	bond[ve->nbond].sibling_mask = mask;
+
+	ve->bonds = bond;
+	ve->nbond++;
+
+	return 0;
+}
+
 void intel_virtual_engine_destroy(struct intel_engine_cs *engine)
 {
 	struct virtual_engine *ve = to_virtual_engine(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 0e69feba4173..d32f5ed0b769 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -119,6 +119,9 @@ intel_execlists_create_virtual(struct i915_gem_context *ctx,
 unsigned long
 intel_virtual_engine_get_siblings(struct intel_engine_cs *engine,
 				  struct intel_engine_cs ***siblings);
+int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
+				     struct intel_engine_cs *master,
+				     unsigned long siblings);
 void intel_virtual_engine_destroy(struct intel_engine_cs *engine);
 
 u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu);
diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index 9ff9f27168a4..423a6e76d3c0 100644
--- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
@@ -13,6 +13,7 @@
 #include "igt_live_test.h"
 #include "igt_spinner.h"
 #include "i915_random.h"
+#include "lib_sw_fence.h"
 
 #include "mock_context.h"
 
@@ -1206,6 +1207,171 @@ static int live_virtual_engine(void *arg)
 	return err;
 }
 
+static int bond_virtual_engine(struct drm_i915_private *i915,
+			       unsigned int class,
+			       struct intel_engine_cs **siblings,
+			       unsigned int nsibling,
+			       unsigned int flags)
+#define BOND_SCHEDULE BIT(0)
+{
+	struct intel_engine_cs *master;
+	struct i915_gem_context *ctx;
+	struct i915_request *rq[16];
+	enum intel_engine_id id;
+	unsigned long n;
+	int err;
+
+	GEM_BUG_ON(nsibling >= ARRAY_SIZE(rq) - 1);
+
+	ctx = kernel_context(i915);
+	if (!ctx)
+		return -ENOMEM;
+
+	err = 0;
+	rq[0] = ERR_PTR(-ENOMEM);
+	for_each_engine(master, i915, id) {
+		struct i915_sw_fence fence;
+
+		if (master->class == class)
+			continue;
+
+		rq[0] = i915_request_alloc(master, ctx);
+		if (IS_ERR(rq[0])) {
+			err = PTR_ERR(rq[0]);
+			goto out;
+		}
+
+		if (flags & BOND_SCHEDULE)
+			onstack_fence_init(&fence);
+
+		i915_request_get(rq[0]);
+		i915_request_add(rq[0]);
+
+		for (n = 0; n < nsibling; n++) {
+			struct intel_engine_cs *engine;
+
+			engine = intel_execlists_create_virtual(ctx,
+								siblings,
+								nsibling);
+			if (IS_ERR(engine)) {
+				err = PTR_ERR(engine);
+				goto out;
+			}
+
+			err = intel_virtual_engine_attach_bond(engine,
+							       master,
+							       BIT(n));
+			if (err) {
+				intel_virtual_engine_destroy(engine);
+				goto out;
+			}
+
+			rq[n + 1] = i915_request_alloc(engine, ctx);
+			if (IS_ERR(rq[n + 1])) {
+				err = PTR_ERR(rq[n + 1]);
+				intel_virtual_engine_destroy(engine);
+				goto out;
+			}
+			i915_request_get(rq[n + 1]);
+
+			err = i915_request_await_execution(rq[n + 1],
+							   &rq[0]->fence,
+							   engine->bond_execute);
+			i915_request_add(rq[n + 1]);
+			intel_virtual_engine_destroy(engine);
+			if (err < 0)
+				goto out;
+		}
+		rq[n + 1] = ERR_PTR(-EINVAL);
+
+		if (flags & BOND_SCHEDULE)
+			onstack_fence_fini(&fence);
+
+		for (n = 0; n < nsibling; n++) {
+			if (i915_request_wait(rq[n + 1],
+					      I915_WAIT_LOCKED,
+					      MAX_SCHEDULE_TIMEOUT) < 0) {
+				err = -EIO;
+				goto out;
+			}
+
+			if (rq[n + 1]->engine != siblings[n]) {
+				pr_err("Bonded request did not execute on target engine: expected %s, used %s; master was %s\n",
+				       siblings[n]->name,
+				       rq[n + 1]->engine->name,
+				       rq[0]->engine->name);
+				err = -EINVAL;
+				goto out;
+			}
+		}
+
+		for (n = 0; !IS_ERR(rq[n]); n++)
+			i915_request_put(rq[n]);
+		rq[0] = ERR_PTR(-ENOMEM);
+	}
+
+out:
+	for (n = 0; !IS_ERR(rq[n]); n++)
+		i915_request_put(rq[n]);
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		err = -EIO;
+
+	kernel_context_close(ctx);
+	return err;
+}
+
+static int live_virtual_bond(void *arg)
+{
+	static const struct phase {
+		const char *name;
+		unsigned int flags;
+	} phases[] = {
+		{ "", 0 },
+		{ "schedule", BOND_SCHEDULE },
+		{ },
+	};
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
+	unsigned int class, inst;
+	int err = 0;
+
+	if (USES_GUC_SUBMISSION(i915))
+		return 0;
+
+	mutex_lock(&i915->drm.struct_mutex);
+
+	for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
+		const struct phase *p;
+		int nsibling;
+
+		nsibling = 0;
+		for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
+			if (!i915->engine_class[class][inst])
+				break;
+
+			GEM_BUG_ON(nsibling == ARRAY_SIZE(siblings));
+			siblings[nsibling++] = i915->engine_class[class][inst];
+		}
+		if (nsibling < 2)
+			continue;
+
+		for (p = phases; p->name; p++) {
+			err = bond_virtual_engine(i915,
+						  class, siblings, nsibling,
+						  p->flags);
+			if (err) {
+				pr_err("%s(%s): failed class=%d, nsibling=%d, err=%d\n",
+				       __func__, p->name, class, nsibling, err);
+				goto out_unlock;
+			}
+		}
+	}
+
+out_unlock:
+	mutex_unlock(&i915->drm.struct_mutex);
+	return err;
+}
+
 int intel_execlists_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
@@ -1218,6 +1384,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_preempt_hang),
 		SUBTEST(live_preempt_smoke),
 		SUBTEST(live_virtual_engine),
+		SUBTEST(live_virtual_bond),
 	};
 
 	if (!HAS_EXECLISTS(i915))
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 29d630998585..7c24c3a44ed0 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1530,6 +1530,10 @@ struct drm_i915_gem_context_param {
  * sized argument, will revert back to default settings.
  *
  * See struct i915_context_param_engines.
+ *
+ * Extensions:
+ *   i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE)
+ *   i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
  */
 #define I915_CONTEXT_PARAM_ENGINES	0xa
 /* Must be kept compact -- no holes and well documented */
@@ -1622,9 +1626,23 @@ struct i915_context_engines_load_balance {
 	__u64 mbz[4]; /* reserved for future use; must be zero */
 };
 
+/*
+ * i915_context_engines_bond:
+ *
+ */
+struct i915_context_engines_bond {
+	struct i915_user_extension base;
+
+	__u16 master_class;
+	__u16 master_instance;
+	__u32 flags; /* all undefined flags must be zero */
+	__u64 sibling_mask;
+};
+
 struct i915_context_param_engines {
 	__u64 extensions; /* linked chain of extension blocks, 0 terminates */
 #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0
+#define I915_CONTEXT_ENGINES_EXT_BOND 1
 
 	struct {
 		__u16 engine_class; /* see enum drm_i915_gem_engine_class */
-- 
2.20.1