[PATCH 7/7] drm/i915: Move irq_seqno_barrier w/a into the interrupt handler

Wed Dec 19 11:09:40 UTC 2018

We only need to apply the w/a on processing an interrupt and as we have
reduced it to a heavy mmio access on a small subset of engines/machines,
we can reduce the mechanism and apply it inside the irq handler
directly.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h          | 84 ------------------------
 drivers/gpu/drm/i915/i915_gem.c          |  7 --
 drivers/gpu/drm/i915/i915_irq.c          | 35 ++++++++--
 drivers/gpu/drm/i915/i915_request.c      |  8 +--
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 26 --------
 drivers/gpu/drm/i915/intel_engine_cs.c   |  6 --
 drivers/gpu/drm/i915/intel_hangcheck.c   | 10 ---
 drivers/gpu/drm/i915/intel_ringbuffer.c  | 35 +---------
 drivers/gpu/drm/i915/intel_ringbuffer.h  | 17 ++---
 9 files changed, 39 insertions(+), 189 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7da653ece4dd..2ce731d987f1 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3584,90 +3584,6 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
 	}
 }
 
-static inline bool
-__i915_request_irq_complete(const struct i915_request *rq)
-{
-	struct intel_engine_cs *engine = rq->engine;
-	u32 seqno;
-
-	/* Note that the engine may have wrapped around the seqno, and
-	 * so our request->global_seqno will be ahead of the hardware,
-	 * even though it completed the request before wrapping. We catch
-	 * this by kicking all the waiters before resetting the seqno
-	 * in hardware, and also signal the fence.
-	 */
-	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
-		return true;
-
-	/* The request was dequeued before we were awoken. We check after
-	 * inspecting the hw to confirm that this was the same request
-	 * that generated the HWS update. The memory barriers within
-	 * the request execution are sufficient to ensure that a check
-	 * after reading the value from hw matches this request.
-	 */
-	seqno = i915_request_global_seqno(rq);
-	if (!seqno)
-		return false;
-
-	/* Before we do the heavier coherent read of the seqno,
-	 * check the value (hopefully) in the CPU cacheline.
-	 */
-	if (__i915_request_completed(rq, seqno))
-		return true;
-
-	/* Ensure our read of the seqno is coherent so that we
-	 * do not "miss an interrupt" (i.e. if this is the last
-	 * request and the seqno write from the GPU is not visible
-	 * by the time the interrupt fires, we will see that the
-	 * request is incomplete and go back to sleep awaiting
-	 * another interrupt that will never come.)
-	 *
-	 * Strictly, we only need to do this once after an interrupt,
-	 * but it is easier and safer to do it every time the waiter
-	 * is woken.
-	 */
-	if (engine->irq_seqno_barrier &&
-	    test_and_clear_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted)) {
-		struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-		/* The ordering of irq_posted versus applying the barrier
-		 * is crucial. The clearing of the current irq_posted must
-		 * be visible before we perform the barrier operation,
-		 * such that if a subsequent interrupt arrives, irq_posted
-		 * is reasserted and our task rewoken (which causes us to
-		 * do another __i915_request_irq_complete() immediately
-		 * and reapply the barrier). Conversely, if the clear
-		 * occurs after the barrier, then an interrupt that arrived
-		 * whilst we waited on the barrier would not trigger a
-		 * barrier on the next pass, and the read may not see the
-		 * seqno update.
-		 */
-		engine->irq_seqno_barrier(engine);
-
-		/* If we consume the irq, but we are no longer the bottom-half,
-		 * the real bottom-half may not have serialised their own
-		 * seqno check with the irq-barrier (i.e. may have inspected
-		 * the seqno before we believe it coherent since they see
-		 * irq_posted == false but we are still running).
-		 */
-		spin_lock_irq(&b->irq_lock);
-		if (b->irq_wait && b->irq_wait->tsk != current)
-			/* Note that if the bottom-half is changed as we
-			 * are sending the wake-up, the new bottom-half will
-			 * be woken by whomever made the change. We only have
-			 * to worry about when we steal the irq-posted for
-			 * ourself.
-			 */
-			wake_up_process(b->irq_wait->tsk);
-		spin_unlock_irq(&b->irq_lock);
-
-		if (__i915_request_completed(rq, seqno))
-			return true;
-	}
-
-	return false;
-}
-
 void i915_memcpy_init_early(struct drm_i915_private *dev_priv);
 bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f4254e012620..7ea87c7a0e8c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3230,13 +3230,6 @@ void i915_gem_reset_engine(struct intel_engine_cs *engine,
 			   struct i915_request *request,
 			   bool stalled)
 {
-	/*
-	 * Make sure this write is visible before we re-enable the interrupt
-	 * handlers on another CPU, as tasklet_enable() resolves to just
-	 * a compiler barrier which is insufficient for our purpose here.
-	 */
-	smp_store_mb(engine->irq_posted, 0);
-
 	if (request)
 		request = i915_gem_reset_request(engine, request, stalled);
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 0c7fc9890891..45dfb30acc2e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1152,6 +1152,31 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv)
 	return;
 }
 
+static void irq_seqno_barrier(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+
+	/*
+	 * Workaround to force correct ordering between irq and seqno writes on
+	 * ivb (and maybe also on snb) by reading from a CS register (like
+	 * ACTHD) before reading the status page.
+	 *
+	 * Note that this effectively stalls the read by the time it takes to
+	 * do a memory transaction, which more or less ensures that the write
+	 * from the GPU has sufficient time to invalidate the CPU cacheline.
+	 * Alternatively we could delay the interrupt from the CS ring to give
+	 * the write time to land, but that would incur a delay after every
+	 * batch i.e. much more frequent than a delay when waiting for the
+	 * interrupt (with the same net latency).
+	 *
+	 * Also note that to prevent whole machine hangs on gen7, we have to
+	 * take the spinlock to guard against concurrent cacheline access.
+	 */
+	spin_lock(&dev_priv->uncore.lock);
+	POSTING_READ_FW(RING_ACTHD(engine->mmio_base));
+	spin_unlock(&dev_priv->uncore.lock);
+}
+
 static void notify_ring(struct intel_engine_cs *engine)
 {
 	const u32 seqno = intel_engine_get_seqno(engine);
@@ -1162,6 +1187,9 @@ static void notify_ring(struct intel_engine_cs *engine)
 	if (unlikely(!engine->breadcrumbs.irq_armed))
 		return;
 
+	if (intel_engine_quirk_irq_seqno(engine))
+		irq_seqno_barrier(engine);
+
 	rcu_read_lock();
 
 	spin_lock(&engine->breadcrumbs.irq_lock);
@@ -1189,13 +1217,6 @@ static void notify_ring(struct intel_engine_cs *engine)
 				rq = i915_request_get(waiter);
 
 			tsk = wait->tsk;
-		} else {
-			if (engine->irq_seqno_barrier &&
-			    i915_seqno_passed(seqno, wait->seqno - 1)) {
-				set_bit(ENGINE_IRQ_BREADCRUMB,
-					&engine->irq_posted);
-				tsk = wait->tsk;
-			}
 		}
 
 		engine->breadcrumbs.irq_count++;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 8f8e844f2e85..432deadea584 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1178,13 +1178,7 @@ long i915_request_wait(struct i915_request *rq,
 		set_current_state(state);
 
 wakeup:
-		/*
-		 * Carefully check if the request is complete, giving time
-		 * for the seqno to be visible following the interrupt.
-		 * We also have to check in case we are kicked by the GPU
-		 * reset in order to drop the struct_mutex.
-		 */
-		if (__i915_request_irq_complete(rq))
+		if (i915_request_completed(rq))
 			break;
 
 		/*
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 447c5256f63a..4ed7105d7ff5 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -166,12 +166,6 @@ static void irq_enable(struct intel_engine_cs *engine)
 	 */
 	GEM_BUG_ON(!intel_irqs_enabled(engine->i915));
 
-	/* Enabling the IRQ may miss the generation of the interrupt, but
-	 * we still need to force the barrier before reading the seqno,
-	 * just in case.
-	 */
-	set_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted);
-
 	/* Caller disables interrupts */
 	if (engine->irq_enable) {
 		spin_lock(&engine->i915->irq_lock);
@@ -683,16 +677,6 @@ static int intel_breadcrumbs_signaler(void *arg)
 		}
 
 		if (unlikely(do_schedule)) {
-			/* Before we sleep, check for a missed seqno */
-			if (current->state & TASK_NORMAL &&
-			    !list_empty(&b->signals) &&
-			    engine->irq_seqno_barrier &&
-			    test_and_clear_bit(ENGINE_IRQ_BREADCRUMB,
-					       &engine->irq_posted)) {
-				engine->irq_seqno_barrier(engine);
-				intel_engine_wakeup(engine);
-			}
-
 sleep:
 			if (kthread_should_park())
 				kthread_parkme();
@@ -859,16 +843,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
 	else
 		irq_disable(engine);
 
-	/*
-	 * We set the IRQ_BREADCRUMB bit when we enable the irq presuming the
-	 * GPU is active and may have already executed the MI_USER_INTERRUPT
-	 * before the CPU is ready to receive. However, the engine is currently
-	 * idle (we haven't started it yet), there is no possibility for a
-	 * missed interrupt as we enabled the irq and so we can clear the
-	 * immediate wakeup (until a real interrupt arrives for the waiter).
-	 */
-	clear_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted);
-
 	spin_unlock_irqrestore(&b->irq_lock, flags);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 1462bb49f420..189a934a63e9 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -457,7 +457,6 @@ int intel_engines_init(struct drm_i915_private *dev_priv)
 void intel_engine_write_global_seqno(struct intel_engine_cs *engine, u32 seqno)
 {
 	intel_write_status_page(engine, I915_GEM_HWS_INDEX, seqno);
-	clear_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted);
 
 	/* After manually advancing the seqno, fake the interrupt in case
 	 * there are any waiters for that seqno.
@@ -1536,11 +1535,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	spin_unlock(&b->rb_lock);
 	local_irq_restore(flags);
 
-	drm_printf(m, "IRQ? 0x%lx (breadcrumbs? %s)\n",
-		   engine->irq_posted,
-		   yesno(test_bit(ENGINE_IRQ_BREADCRUMB,
-				  &engine->irq_posted)));
-
 	drm_printf(m, "HWSP:\n");
 	hexdump(m, engine->status_page.page_addr, PAGE_SIZE);
 
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index c3f929f59424..51e9efec5116 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -120,16 +120,6 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 static void hangcheck_load_sample(struct intel_engine_cs *engine,
 				  struct intel_engine_hangcheck *hc)
 {
-	/* We don't strictly need an irq-barrier here, as we are not
-	 * serving an interrupt request, be paranoid in case the
-	 * barrier has side-effects (such as preventing a broken
-	 * cacheline snoop) and so be sure that we can see the seqno
-	 * advance. If the seqno should stick, due to a stale
-	 * cacheline, we would erroneously declare the GPU hung.
-	 */
-	if (engine->irq_seqno_barrier)
-		engine->irq_seqno_barrier(engine);
-
 	hc->acthd = intel_engine_get_active_head(engine);
 	hc->seqno = intel_engine_get_seqno(engine);
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index bd73641f1ce6..a7bdc20e8fad 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -683,10 +683,6 @@ static int init_ring_common(struct intel_engine_cs *engine)
 static struct i915_request *reset_prepare(struct intel_engine_cs *engine)
 {
 	intel_engine_stop_cs(engine);
-
-	if (engine->irq_seqno_barrier)
-		engine->irq_seqno_barrier(engine);
-
 	return i915_gem_find_active_request(engine);
 }
 
@@ -874,31 +870,6 @@ static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 }
 static const int gen5_emit_breadcrumb_sz = 3 * 8 + 2;
 
-static void
-gen6_seqno_barrier(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-
-	/* Workaround to force correct ordering between irq and seqno writes on
-	 * ivb (and maybe also on snb) by reading from a CS register (like
-	 * ACTHD) before reading the status page.
-	 *
-	 * Note that this effectively stalls the read by the time it takes to
-	 * do a memory transaction, which more or less ensures that the write
-	 * from the GPU has sufficient time to invalidate the CPU cacheline.
-	 * Alternatively we could delay the interrupt from the CS ring to give
-	 * the write time to land, but that would incur a delay after every
-	 * batch i.e. much more frequent than a delay when waiting for the
-	 * interrupt (with the same net latency).
-	 *
-	 * Also note that to prevent whole machine hangs on gen7, we have to
-	 * take the spinlock to guard against concurrent cacheline access.
-	 */
-	spin_lock_irq(&dev_priv->uncore.lock);
-	POSTING_READ_FW(RING_ACTHD(engine->mmio_base));
-	spin_unlock_irq(&dev_priv->uncore.lock);
-}
-
 static void
 gen5_irq_enable(struct intel_engine_cs *engine)
 {
@@ -2224,7 +2195,7 @@ int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine)
 		engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
 		engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
 
-		engine->irq_seqno_barrier = gen6_seqno_barrier;
+		engine->flags |= I915_ENGINE_QUIRK_IRQ_SEQNO;
 	} else {
 		engine->emit_flush = bsd_ring_flush;
 		if (IS_GEN(dev_priv, 5))
@@ -2249,7 +2220,7 @@ int intel_init_blt_ring_buffer(struct intel_engine_cs *engine)
 
 	engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
 	engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
-	engine->irq_seqno_barrier = gen6_seqno_barrier;
+	engine->flags |= I915_ENGINE_QUIRK_IRQ_SEQNO;
 
 	return intel_init_ring_buffer(engine);
 }
@@ -2269,7 +2240,7 @@ int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
 
 	engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
 	engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
-	engine->irq_seqno_barrier = gen6_seqno_barrier;
+	engine->flags |= I915_ENGINE_QUIRK_IRQ_SEQNO;
 
 	return intel_init_ring_buffer(engine);
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 99e2cb75d29a..9bf7e4eb0cbc 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -365,9 +365,6 @@ struct intel_engine_cs {
 	struct drm_i915_gem_object *default_state;
 	void *pinned_default_state;
 
-	unsigned long irq_posted;
-#define ENGINE_IRQ_BREADCRUMB 0
-
 	/* Rather than have every client wait upon all user interrupts,
 	 * with the herd waking after every interrupt and each doing the
 	 * heavyweight seqno dance, we delegate the task (of being the
@@ -501,13 +498,6 @@ struct intel_engine_cs {
 	 */
 	void		(*cancel_requests)(struct intel_engine_cs *engine);
 
-	/* Some chipsets are not quite as coherent as advertised and need
-	 * an expensive kick to force a true read of the up-to-date seqno.
-	 * However, the up-to-date seqno is not always required and the last
-	 * seen value is good enough. Note that the seqno will always be
-	 * monotonic, even if not coherent.
-	 */
-	void		(*irq_seqno_barrier)(struct intel_engine_cs *engine);
 	void		(*cleanup)(struct intel_engine_cs *engine);
 
 	struct intel_engine_execlists execlists;
@@ -531,6 +521,7 @@ struct intel_engine_cs {
 #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
 #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
+#define I915_ENGINE_QUIRK_IRQ_SEQNO  BIT(3)
 	unsigned int flags;
 
 	/*
@@ -608,6 +599,12 @@ intel_engine_has_preemption(const struct intel_engine_cs *engine)
 	return engine->flags & I915_ENGINE_HAS_PREEMPTION;
 }
 
+static inline bool
+intel_engine_quirk_irq_seqno(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_QUIRK_IRQ_SEQNO;
+}
+
 static inline bool __execlists_need_preempt(int prio, int last)
 {
 	return prio > max(0, last);
-- 
2.20.0