[Intel-gfx] [PATCH 29/34] drm/i915: Drop fake breadcrumb irq

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Thu Jan 24 17:55:20 UTC 2019


On 21/01/2019 22:21, Chris Wilson wrote:
> Missed breadcrumb detection is defunct due to the tight coupling with

How it is defunct.. oh because there is no irq_count any more... could 
you describe the transition from irq_count to irq_fired and then to 
nothing briefly?

> dma_fence signaling and the myriad ways we may signal fences from
> everywhere but from an interrupt, i.e. we frequently signal a fence
> before we even see its interrupt. This means that even if we miss an
> interrupt for a fence, it still is signaled before our breadcrumb
> hangcheck fires, so simplify the breadcrumb hangchecking by moving it
> into the GPU hangcheck and forgo fake interrupts.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_debugfs.c           |  93 -----------
>   drivers/gpu/drm/i915/i915_gpu_error.c         |   2 -
>   drivers/gpu/drm/i915/i915_gpu_error.h         |   5 -
>   drivers/gpu/drm/i915/intel_breadcrumbs.c      | 147 +-----------------
>   drivers/gpu/drm/i915/intel_hangcheck.c        |   2 +
>   drivers/gpu/drm/i915/intel_ringbuffer.h       |   5 -
>   .../gpu/drm/i915/selftests/igt_live_test.c    |   7 -
>   7 files changed, 5 insertions(+), 256 deletions(-)

With this balance of insertions and deletions I cannot decide if this is 
easy or hard to review.

IGT uses intel_detect_and_clear_missed_interrupts a lot. What is the 
plan there?

> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index d7764e62e9b4..c2aaf010c3d1 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1321,9 +1321,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
>   			   intel_engine_last_submit(engine),
>   			   jiffies_to_msecs(jiffies -
>   					    engine->hangcheck.action_timestamp));
> -		seq_printf(m, "\tfake irq active? %s\n",
> -			   yesno(test_bit(engine->id,
> -					  &dev_priv->gpu_error.missed_irq_rings)));
>   
>   		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
>   			   (long long)engine->hangcheck.acthd,
> @@ -3874,94 +3871,6 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
>   			i915_wedged_get, i915_wedged_set,
>   			"%llu\n");
>   
> -static int
> -fault_irq_set(struct drm_i915_private *i915,
> -	      unsigned long *irq,
> -	      unsigned long val)
> -{
> -	int err;
> -
> -	err = mutex_lock_interruptible(&i915->drm.struct_mutex);
> -	if (err)
> -		return err;
> -
> -	err = i915_gem_wait_for_idle(i915,
> -				     I915_WAIT_LOCKED |
> -				     I915_WAIT_INTERRUPTIBLE,
> -				     MAX_SCHEDULE_TIMEOUT);
> -	if (err)
> -		goto err_unlock;
> -
> -	*irq = val;
> -	mutex_unlock(&i915->drm.struct_mutex);
> -
> -	/* Flush idle worker to disarm irq */
> -	drain_delayed_work(&i915->gt.idle_work);
> -
> -	return 0;
> -
> -err_unlock:
> -	mutex_unlock(&i915->drm.struct_mutex);
> -	return err;
> -}
> -
> -static int
> -i915_ring_missed_irq_get(void *data, u64 *val)
> -{
> -	struct drm_i915_private *dev_priv = data;
> -
> -	*val = dev_priv->gpu_error.missed_irq_rings;
> -	return 0;
> -}
> -
> -static int
> -i915_ring_missed_irq_set(void *data, u64 val)
> -{
> -	struct drm_i915_private *i915 = data;
> -
> -	return fault_irq_set(i915, &i915->gpu_error.missed_irq_rings, val);
> -}
> -
> -DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
> -			i915_ring_missed_irq_get, i915_ring_missed_irq_set,
> -			"0x%08llx\n");
> -
> -static int
> -i915_ring_test_irq_get(void *data, u64 *val)
> -{
> -	struct drm_i915_private *dev_priv = data;
> -
> -	*val = dev_priv->gpu_error.test_irq_rings;
> -
> -	return 0;
> -}
> -
> -static int
> -i915_ring_test_irq_set(void *data, u64 val)
> -{
> -	struct drm_i915_private *i915 = data;
> -
> -	/* GuC keeps the user interrupt permanently enabled for submission */
> -	if (USES_GUC_SUBMISSION(i915))
> -		return -ENODEV;
> -
> -	/*
> -	 * From icl, we can no longer individually mask interrupt generation
> -	 * from each engine.
> -	 */
> -	if (INTEL_GEN(i915) >= 11)
> -		return -ENODEV;
> -
> -	val &= INTEL_INFO(i915)->ring_mask;
> -	DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
> -
> -	return fault_irq_set(i915, &i915->gpu_error.test_irq_rings, val);
> -}
> -
> -DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
> -			i915_ring_test_irq_get, i915_ring_test_irq_set,
> -			"0x%08llx\n");
> -
>   #define DROP_UNBOUND	BIT(0)
>   #define DROP_BOUND	BIT(1)
>   #define DROP_RETIRE	BIT(2)
> @@ -4724,8 +4633,6 @@ static const struct i915_debugfs_files {
>   } i915_debugfs_files[] = {
>   	{"i915_wedged", &i915_wedged_fops},
>   	{"i915_cache_sharing", &i915_cache_sharing_fops},
> -	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
> -	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
>   	{"i915_gem_drop_caches", &i915_drop_caches_fops},
>   #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
>   	{"i915_error_state", &i915_error_state_fops},
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 825572127029..0584c8dfa6ae 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -718,8 +718,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
>   	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
>   	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
>   	err_printf(m, "CCID: 0x%08x\n", error->ccid);
> -	err_printf(m, "Missed interrupts: 0x%08lx\n",
> -		   m->i915->gpu_error.missed_irq_rings);
>   
>   	for (i = 0; i < error->nfence; i++)
>   		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index 0e184712cbcc..99a53c0cd6da 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -203,8 +203,6 @@ struct i915_gpu_error {
>   
>   	atomic_t pending_fb_pin;
>   
> -	unsigned long missed_irq_rings;
> -
>   	/**
>   	 * State variable controlling the reset flow and count
>   	 *
> @@ -273,9 +271,6 @@ struct i915_gpu_error {
>   	 */
>   	wait_queue_head_t reset_queue;
>   
> -	/* For missed irq/seqno simulation. */
> -	unsigned long test_irq_rings;
> -
>   	struct i915_gpu_restart *restart;
>   };
>   
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index faeb0083b561..3bdfa63ea4a1 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -91,7 +91,6 @@ bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine)
>   
>   	spin_lock(&b->irq_lock);
>   
> -	b->irq_fired = true;
>   	if (b->irq_armed && list_empty(&b->signalers))
>   		__intel_breadcrumbs_disarm_irq(b);
>   
> @@ -155,86 +154,6 @@ static void signal_irq_work(struct irq_work *work)
>   	intel_engine_breadcrumbs_irq(engine);
>   }
>   
> -static unsigned long wait_timeout(void)
> -{
> -	return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES);
> -}
> -
> -static noinline void missed_breadcrumb(struct intel_engine_cs *engine)
> -{
> -	if (GEM_SHOW_DEBUG()) {
> -		struct drm_printer p = drm_debug_printer(__func__);
> -
> -		intel_engine_dump(engine, &p,
> -				  "%s missed breadcrumb at %pS\n",
> -				  engine->name, __builtin_return_address(0));
> -	}
> -
> -	set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
> -}
> -
> -static void intel_breadcrumbs_hangcheck(struct timer_list *t)
> -{
> -	struct intel_engine_cs *engine =
> -		from_timer(engine, t, breadcrumbs.hangcheck);
> -	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> -
> -	if (!b->irq_armed)
> -		return;
> -
> -	if (b->irq_fired)
> -		goto rearm;
> -
> -	/*
> -	 * We keep the hangcheck timer alive until we disarm the irq, even
> -	 * if there are no waiters at present.
> -	 *
> -	 * If the waiter was currently running, assume it hasn't had a chance
> -	 * to process the pending interrupt (e.g, low priority task on a loaded
> -	 * system) and wait until it sleeps before declaring a missed interrupt.
> -	 *
> -	 * If the waiter was asleep (and not even pending a wakeup), then we
> -	 * must have missed an interrupt as the GPU has stopped advancing
> -	 * but we still have a waiter. Assuming all batches complete within
> -	 * DRM_I915_HANGCHECK_JIFFIES [1.5s]!
> -	 */
> -	synchronize_hardirq(engine->i915->drm.irq);
> -	if (intel_engine_signal_breadcrumbs(engine)) {
> -		missed_breadcrumb(engine);
> -		mod_timer(&b->fake_irq, jiffies + 1);
> -	} else {
> -rearm:
> -		b->irq_fired = false;
> -		mod_timer(&b->hangcheck, wait_timeout());
> -	}
> -}
> -
> -static void intel_breadcrumbs_fake_irq(struct timer_list *t)
> -{
> -	struct intel_engine_cs *engine =
> -		from_timer(engine, t, breadcrumbs.fake_irq);
> -	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> -
> -	/*
> -	 * The timer persists in case we cannot enable interrupts,
> -	 * or if we have previously seen seqno/interrupt incoherency
> -	 * ("missed interrupt" syndrome, better known as a "missed breadcrumb").
> -	 * Here the worker will wake up every jiffie in order to kick the
> -	 * oldest waiter to do the coherent seqno check.
> -	 */
> -
> -	if (!intel_engine_signal_breadcrumbs(engine) && !b->irq_armed)
> -		return;
> -
> -	/* If the user has disabled the fake-irq, restore the hangchecking */
> -	if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
> -		mod_timer(&b->hangcheck, wait_timeout());
> -		return;
> -	}
> -
> -	mod_timer(&b->fake_irq, jiffies + 1);
> -}
> -
>   void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine)
>   {
>   	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> @@ -257,43 +176,14 @@ void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine)
>   	spin_unlock_irq(&b->irq_lock);
>   }
>   
> -static bool use_fake_irq(const struct intel_breadcrumbs *b)
> -{
> -	const struct intel_engine_cs *engine =
> -		container_of(b, struct intel_engine_cs, breadcrumbs);
> -
> -	if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
> -		return false;
> -
> -	/*
> -	 * Only start with the heavy weight fake irq timer if we have not
> -	 * seen any interrupts since enabling it the first time. If the
> -	 * interrupts are still arriving, it means we made a mistake in our
> -	 * engine->seqno_barrier(), a timing error that should be transient
> -	 * and unlikely to reoccur.
> -	 */
> -	return !b->irq_fired;
> -}
> -
> -static void enable_fake_irq(struct intel_breadcrumbs *b)
> -{
> -	/* Ensure we never sleep indefinitely */
> -	if (!b->irq_enabled || use_fake_irq(b))
> -		mod_timer(&b->fake_irq, jiffies + 1);
> -	else
> -		mod_timer(&b->hangcheck, wait_timeout());
> -}
> -
> -static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
> +static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
>   {
>   	struct intel_engine_cs *engine =
>   		container_of(b, struct intel_engine_cs, breadcrumbs);
> -	struct drm_i915_private *i915 = engine->i915;
> -	bool enabled;
>   
>   	lockdep_assert_held(&b->irq_lock);
>   	if (b->irq_armed)
> -		return false;
> +		return;
>   
>   	/*
>   	 * The breadcrumb irq will be disarmed on the interrupt after the
> @@ -311,16 +201,8 @@ static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
>   	 * the driver is idle) we disarm the breadcrumbs.
>   	 */
>   
> -	/* No interrupts? Kick the waiter every jiffie! */
> -	enabled = false;
> -	if (!b->irq_enabled++ &&
> -	    !test_bit(engine->id, &i915->gpu_error.test_irq_rings)) {
> +	if (!b->irq_enabled++)
>   		irq_enable(engine);
> -		enabled = true;
> -	}
> -
> -	enable_fake_irq(b);
> -	return enabled;
>   }
>   
>   void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
> @@ -331,18 +213,6 @@ void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
>   	INIT_LIST_HEAD(&b->signalers);
>   
>   	init_irq_work(&b->irq_work, signal_irq_work);
> -
> -	timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
> -	timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
> -}
> -
> -static void cancel_fake_irq(struct intel_engine_cs *engine)
> -{
> -	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> -
> -	del_timer_sync(&b->fake_irq); /* may queue b->hangcheck */
> -	del_timer_sync(&b->hangcheck);
> -	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
>   }
>   
>   void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
> @@ -352,13 +222,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
>   
>   	spin_lock_irqsave(&b->irq_lock, flags);
>   
> -	/*
> -	 * Leave the fake_irq timer enabled (if it is running), but clear the
> -	 * bit so that it turns itself off on its next wake up and goes back
> -	 * to the long hangcheck interval if still required.
> -	 */
> -	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
> -
>   	if (b->irq_enabled)
>   		irq_enable(engine);
>   	else
> @@ -369,7 +232,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
>   
>   void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
>   {
> -	cancel_fake_irq(engine);
>   }
>   
>   bool intel_engine_enable_signaling(struct i915_request *rq)
> @@ -451,7 +313,4 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine,
>   		}
>   	}
>   	spin_unlock_irq(&b->irq_lock);
> -
> -	if (test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
> -		drm_printf(p, "Fake irq active\n");
>   }
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 5662d6fed523..a219c796e56d 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -275,6 +275,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   	for_each_engine(engine, dev_priv, id) {
>   		struct hangcheck hc;
>   
> +		intel_engine_signal_breadcrumbs(engine);
> +

Sounds fine. So only downside is detecting missed interrupts gets 
slower. But in practice they don't happen often?

>   		hangcheck_load_sample(engine, &hc);
>   		hangcheck_accumulate_sample(engine, &hc);
>   		hangcheck_store_sample(engine, &hc);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index b78cb9bd4bc2..7eec96cf2a0b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -382,14 +382,9 @@ struct intel_engine_cs {
>   
>   		struct irq_work irq_work;
>   
> -		struct timer_list fake_irq; /* used after a missed interrupt */
> -		struct timer_list hangcheck; /* detect missed interrupts */
> -
> -		unsigned int hangcheck_interrupts;
>   		unsigned int irq_enabled;
>   
>   		bool irq_armed;
> -		bool irq_fired;
>   	} breadcrumbs;
>   
>   	struct {
> diff --git a/drivers/gpu/drm/i915/selftests/igt_live_test.c b/drivers/gpu/drm/i915/selftests/igt_live_test.c
> index 5deb485fb942..3e902761cd16 100644
> --- a/drivers/gpu/drm/i915/selftests/igt_live_test.c
> +++ b/drivers/gpu/drm/i915/selftests/igt_live_test.c
> @@ -35,7 +35,6 @@ int igt_live_test_begin(struct igt_live_test *t,
>   		return err;
>   	}
>   
> -	i915->gpu_error.missed_irq_rings = 0;
>   	t->reset_global = i915_reset_count(&i915->gpu_error);
>   
>   	for_each_engine(engine, i915, id)
> @@ -75,11 +74,5 @@ int igt_live_test_end(struct igt_live_test *t)
>   		return -EIO;
>   	}
>   
> -	if (i915->gpu_error.missed_irq_rings) {
> -		pr_err("%s(%s): Missed interrupts on engines %lx\n",
> -		       t->func, t->name, i915->gpu_error.missed_irq_rings);
> -		return -EIO;
> -	}
> -
>   	return 0;
>   }
> 

Regards,

Tvrtko



More information about the Intel-gfx mailing list