[PATCH 2/2] drm/i915/icl+: Workaround for media frame split

Mon Jun 1 10:38:54 UTC 2020

From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Our generic mechanism for specifying aligned request start,
I915_EXEC_FENCE_SUBMIT, has a bit too relaxed rules when it comes to the
actual use case of media scalability on Tigerlake.

While the submit fence can provide the aligned start, the actual media
pipeline expects that execution remains in lockstep from then onwards.
Otherwise the hardware can not handle well one of the pair getting
preempted, leading to GPU hangs and corrupted media playback.

The only robust workaround, which does not involve adding new uapi, seems
to be temporarily declaring no preemption in user batch buffers.

On top, we need to treat all submissions on video engines as sentinel
requests, so that the undefined submission order between master and slave
pairs from different contexts does not create a deadlock.

Going forward we plan to add new uapi so this hack can be removed by
allowing the special needs to be more explicitly declared and by also
making preemption reporting more granular.

v2:
 * Use engine->flags.
 * Apply wa from execbuf. (Chris)

v3:
 * Use gen8_emit_bb_start_noarb and mark the engine as no-preempt to allow
   for preemption during dependency awaits. (Chris)
 * Sentinel bit is enough, no need for single port. (Chris)
   (And therefore no need for any bond execute callbacks.)
 * Reworded the commit message and updated the title accordingly.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Suggested-by: Xiaogang Li <xiaogang.li at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |  8 ++++++
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  7 +++++
 drivers/gpu/drm/i915/gt/intel_lrc.c           | 28 +++++++++++++++----
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 219a36995b96..6c26b1d8160e 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -2587,6 +2587,14 @@ static void eb_request_add(struct i915_execbuffer *eb)
 		__i915_request_skip(rq);
 	}
 
+	/*
+	 * Because it is too late to apply
+	 * intel_engine_requires_media_split_wa() once the bonded pair is
+	 * established we have to do it indiscriminately from the start. :(
+	 */
+	if (intel_engine_requires_media_split_wa(rq->engine))
+		__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);
+
 	__i915_request_queue(rq, &attr);
 
 	/* Try to clean up the client's timeline after submitting the request */
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 2b6cdf47d428..6e62e1f8c69d 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -503,6 +503,7 @@ struct intel_engine_cs {
 #define I915_ENGINE_IS_VIRTUAL       BIT(6)
 #define I915_ENGINE_HAS_RELATIVE_MMIO BIT(7)
 #define I915_ENGINE_REQUIRES_CMD_PARSER BIT(8)
+#define I915_ENGINE_REQUIRES_MEDIA_SPLIT_WA	BIT(9)
 	unsigned int flags;
 
 	/*
@@ -627,6 +628,12 @@ intel_engine_has_relative_mmio(const struct intel_engine_cs * const engine)
 	return engine->flags & I915_ENGINE_HAS_RELATIVE_MMIO;
 }
 
+static inline bool
+intel_engine_requires_media_split_wa(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_REQUIRES_MEDIA_SPLIT_WA;
+}
+
 #define instdone_has_slice(dev_priv___, sseu___, slice___) \
 	((IS_GEN(dev_priv___, 7) ? 1 : ((sseu___)->slice_mask)) & BIT(slice___))
 
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 0f55126d53da..aa496266b248 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1641,9 +1641,8 @@ assert_pending_valid(const struct intel_engine_execlists *execlists,
 				      ce->timeline->fence_context,
 				      port - execlists->pending);
 			return false;
-		} else  {
-			sentinel = i915_request_has_sentinel(rq);
 		}
+		sentinel |= i915_request_has_sentinel(rq);
 
 		/* Hold tightly onto the lock to prevent concurrent retires! */
 		if (!spin_trylock_irqsave(&rq->lock, flags))
@@ -4927,6 +4926,8 @@ static void execlists_park(struct intel_engine_cs *engine)
 
 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
 {
+	struct drm_i915_private *i915 = engine->i915;
+
 	engine->submit_request = execlists_submit_request;
 	engine->schedule = i915_schedule;
 	engine->execlists.tasklet.func = execlists_submission_tasklet;
@@ -4939,17 +4940,34 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
 	engine->park = execlists_park;
 	engine->unpark = NULL;
 
+	/*
+	 * Special handling needed for media split frame workloads in order to:
+	 *  1. Avoid preempting one of the bonded pair of requests if other is
+	 *     in a non-preemptible section.
+	 *  2. Avoid creating ordering inversions between different bonded pairs
+	 *     in different execlists ports.
+	 *
+	 * Based on this flag elsewhere in the code we do:
+	 *  1. Mark bonded pair as non-preemtible.
+	 *  2. Limit the pair to be alone in their respective elsp.
+	 *
+	 */
+	if (INTEL_GEN(i915) >= 11 &&
+	    engine->class == VIDEO_DECODE_CLASS)
+		engine->flags |= I915_ENGINE_REQUIRES_MEDIA_SPLIT_WA;
+
 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
-	if (!intel_vgpu_active(engine->i915)) {
+	if (!intel_vgpu_active(i915)) {
 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
-		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
+		if (HAS_LOGICAL_RING_PREEMPTION(i915) &&
+		    !intel_engine_requires_media_split_wa(engine)) {
 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
 		}
 	}
 
-	if (INTEL_GEN(engine->i915) >= 12)
+	if (INTEL_GEN(i915) >= 12)
 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
 
 	if (intel_engine_has_preemption(engine))
-- 
2.20.1