[PATCH] drm/i915: Special handling for bonded requests

Mon Jun 1 09:07:35 UTC 2020

From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Our generic mechanism for specifying aligned request start,
I915_EXEC_FENCE_SUBMIT, has a bit too relaxed rules when it comes to the
actual use case of media scalability on Tigerlake.

While the submit fence can provide the aligned start, the  actual media
pipeline expects that execution remains in lockstep from then onwards.
Otherwise the hardware cannot handle well one of the pair getting
preempted leading to GPU hangs and corrupted media playback.

This puts us in a difficult position where the only visible solution,
which does not involve adding new uapi, is to give more meaning to the
generic submit fence. At least when used between the video engines.

The special semantics this patch applies in that case are two fold. First
is that both of the aligned pairs will be marked as non-preemptable and
second is ensuring both are not sharing ELSP ports with any other context.

Non-preemptable property will ensure that once the start has been aligned
the requests remain executing until completion.

Single port ELSP property will ensure there are no possible inversions
between different independent pairs of aligned requests.

Going forward we can think of introducing new uapi to request this
behaviour as a separate, more explicit flag, and at that point retire this
semi-generic special handling.

v2:
 * Use engine->flags.
 * Apply wa from execbuf. (Chris)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Suggested-by: Xiaogang Li <xiaogang.li at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |  8 +++
 drivers/gpu/drm/i915/gt/intel_context.h       |  2 +-
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  7 ++
 drivers/gpu/drm/i915/gt/intel_lrc.c           | 69 ++++++++++++++++---
 4 files changed, 76 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 219a36995b96..5b7a9988356f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -2587,6 +2587,14 @@ static void eb_request_add(struct i915_execbuffer *eb)
 		__i915_request_skip(rq);
 	}
 
+	/*
+	 * Because it is too late to apply
+	 * intel_engine_requires_media_split_wa() once the bonded pair is
+	 * established we have to do it indiscriminately from the start. :(
+	 */
+	if (!intel_engine_requires_media_split_wa(rq->engine))
+		intel_context_set_single_submission(rq->context);
+
 	__i915_request_queue(rq, &attr);
 
 	/* Try to clean up the client's timeline after submitting the request */
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
index 07be021882cc..576d11c0cdd9 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -212,7 +212,7 @@ intel_context_force_single_submission(const struct intel_context *ce)
 static inline void
 intel_context_set_single_submission(struct intel_context *ce)
 {
-	__set_bit(CONTEXT_FORCE_SINGLE_SUBMISSION, &ce->flags);
+	set_bit(CONTEXT_FORCE_SINGLE_SUBMISSION, &ce->flags);
 }
 
 static inline bool
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 2b6cdf47d428..6e62e1f8c69d 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -503,6 +503,7 @@ struct intel_engine_cs {
 #define I915_ENGINE_IS_VIRTUAL       BIT(6)
 #define I915_ENGINE_HAS_RELATIVE_MMIO BIT(7)
 #define I915_ENGINE_REQUIRES_CMD_PARSER BIT(8)
+#define I915_ENGINE_REQUIRES_MEDIA_SPLIT_WA	BIT(9)
 	unsigned int flags;
 
 	/*
@@ -627,6 +628,12 @@ intel_engine_has_relative_mmio(const struct intel_engine_cs * const engine)
 	return engine->flags & I915_ENGINE_HAS_RELATIVE_MMIO;
 }
 
+static inline bool
+intel_engine_requires_media_split_wa(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_REQUIRES_MEDIA_SPLIT_WA;
+}
+
 #define instdone_has_slice(dev_priv___, sseu___, slice___) \
 	((IS_GEN(dev_priv___, 7) ? 1 : ((sseu___)->slice_mask)) & BIT(slice___))
 
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 6fc0966b75ff..69fc7a3f5f07 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1734,8 +1734,7 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
 
 static bool ctx_single_port_submission(const struct intel_context *ce)
 {
-	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
-		intel_context_force_single_submission(ce));
+	return intel_context_force_single_submission(ce);
 }
 
 static bool can_merge_ctx(const struct intel_context *prev,
@@ -4932,9 +4931,42 @@ static void execlists_park(struct intel_engine_cs *engine)
 	cancel_timer(&engine->execlists.preempt);
 }
 
+static void
+mark_bonded_pair(struct i915_request *rq, struct i915_request *signal)
+{
+	/*
+	 * Give (temporary) special meaning to a pair of requests with requested
+	 * aligned start along the video engines.
+	 *
+	 * They should be non-preemptible and have all ELSP ports to themselves
+	 * to avoid any deadlocks caused by inversions.
+	 *
+	 * Gen11+
+	 */
+	if (!intel_engine_requires_media_split_wa(rq->engine) &&
+	    !intel_engine_requires_media_split_wa(signal->engine))
+		return;
+
+	set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags);
+	intel_context_set_single_submission(rq->context);
+
+	set_bit(I915_FENCE_FLAG_NOPREEMPT, &signal->fence.flags);
+	intel_context_set_single_submission(signal->context);
+}
+
+static void
+execlists_bond_execute(struct i915_request *rq, struct dma_fence *signal)
+{
+	mark_bonded_pair(rq, to_request(signal));
+}
+
 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
 {
+	struct drm_i915_private *i915 = engine->i915;
+
 	engine->submit_request = execlists_submit_request;
+	if (engine->class == VIDEO_DECODE_CLASS)
+		engine->bond_execute = execlists_bond_execute;
 	engine->schedule = i915_schedule;
 	engine->execlists.tasklet.func = execlists_submission_tasklet;
 
@@ -4947,16 +4979,32 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
 	engine->unpark = NULL;
 
 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
-	if (!intel_vgpu_active(engine->i915)) {
+	if (!intel_vgpu_active(i915)) {
 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
-		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
+		if (HAS_LOGICAL_RING_PREEMPTION(i915)) {
 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
 		}
 	}
 
-	if (INTEL_GEN(engine->i915) >= 12)
+	/*
+	 * Special handling needed for media split frame workloads in order to:
+	 *  1. Avoid preempting one of the bonded pair of requests if other is
+	 *     in a non-preemptible section.
+	 *  2. Avoid creating ordering inversions between different bonded pairs
+	 *     in different execlists ports.
+	 *
+	 * Based on this flag elsewhere in the code we do:
+	 *  1. Mark bonded pair as non-preemtible.
+	 *  2. Limit the pair to be alone in their respective elsp.
+	 *
+	 */
+	if (INTEL_GEN(i915) >= 11 &&
+	    engine->class == VIDEO_DECODE_CLASS)
+		engine->flags |= I915_ENGINE_REQUIRES_MEDIA_SPLIT_WA;
+
+	if (INTEL_GEN(i915) >= 12)
 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
 
 	if (intel_engine_has_preemption(engine))
@@ -5605,15 +5653,18 @@ virtual_find_bond(struct virtual_engine *ve,
 }
 
 static void
-virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
+virtual_bond_execute(struct i915_request *rq, struct dma_fence *sigfence)
 {
 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
+	struct i915_request *signal = to_request(sigfence);
 	intel_engine_mask_t allowed, exec;
 	struct ve_bond *bond;
 
-	allowed = ~to_request(signal)->engine->mask;
+	mark_bonded_pair(rq, signal);
+
+	allowed = ~signal->engine->mask;
 
-	bond = virtual_find_bond(ve, to_request(signal)->engine);
+	bond = virtual_find_bond(ve, signal->engine);
 	if (bond)
 		allowed &= bond->sibling_mask;
 
@@ -5623,7 +5674,7 @@ virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
 		;
 
 	/* Prevent the master from being re-run on the bonded engines */
-	to_request(signal)->execution_mask &= ~allowed;
+	signal->execution_mask &= ~allowed;
 }
 
 struct intel_context *
-- 
2.20.1