[Intel-gfx] [PATCH v3 2/6] drm/i915: Watchdog timeout: IRQ handler for gen8+
Chris Wilson
chris at chris-wilson.co.uk
Thu Feb 14 09:15:33 UTC 2019
Quoting Carlos Santa (2019-02-14 02:57:09)
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 58b6ff8453dc..bc10acb24d9a 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -218,7 +218,8 @@ static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
>
> static void hangcheck_declare_hang(struct drm_i915_private *i915,
> unsigned int hung,
> - unsigned int stuck)
> + unsigned int stuck,
> + unsigned int watchdog)
Absolutely not.
> +/* From GEN9 onwards, all engines use the same RING_CNTR format */
> +static inline u32 get_watchdog_disable(struct intel_engine_cs *engine)
> +{
> + if (engine->id == RCS || INTEL_GEN(engine->i915) >= 9)
> + return GEN8_WATCHDOG_DISABLE;
> + else
> + return GEN8_XCS_WATCHDOG_DISABLE;
> +}
> +
> +#define GEN8_WATCHDOG_1000US(dev_priv) watchdog_to_clock_counts(dev_priv, 1000)
> +static void gen8_watchdog_irq_handler(unsigned long data)
> +{
> + struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> + struct drm_i915_private *dev_priv = engine->i915;
> + enum forcewake_domains fw_domains;
> + u32 current_seqno;
> +
> + switch (engine->class) {
> + default:
> + MISSING_CASE(engine->id);
> + /* fall through */
> + case RENDER_CLASS:
> + fw_domains = FORCEWAKE_RENDER;
> + break;
> + case VIDEO_DECODE_CLASS:
> + case VIDEO_ENHANCEMENT_CLASS:
> + fw_domains = FORCEWAKE_MEDIA;
> + break;
> + }
> +
> + intel_uncore_forcewake_get(dev_priv, fw_domains);
Doesn't scale well down into the 1ms response range.
> + /* Stop the counter to prevent further timeout interrupts */
> + I915_WRITE_FW(RING_CNTR(engine->mmio_base), get_watchdog_disable(engine));
A bit late? About 200ms or more may have passed since the interrupt, so
what's the point?
> + current_seqno = intel_engine_get_hangcheck_seqno(engine);
Doesn't exist. Only may exist as a temporary hack. If we need the seqno
counter, include it with the watchdog LRI.
> + /* did the request complete after the timer expired? */
> + if (engine->hangcheck.next_seqno == current_seqno)
> + goto fw_put;
> +
> + if (engine->hangcheck.watchdog == current_seqno) {
> + /* Make sure the active request will be marked as guilty */
> + engine->hangcheck.acthd = intel_engine_get_active_head(engine);
> + engine->hangcheck.last_seqno = current_seqno;
Incorrect even for the current code. And as I keep repeating the design
is for the engine reset to be processed immediately.
> + /* And try to run the hangcheck_work as soon as possible */
> + set_bit(I915_RESET_WATCHDOG, &dev_priv->gpu_error.flags);
> + queue_delayed_work(system_long_wq,
> + &dev_priv->gpu_error.hangcheck_work,
> + round_jiffies_up_relative(HZ));
> + } else {
> + engine->hangcheck.watchdog = current_seqno;
> + /* Re-start the counter, if really hung, it will expire again */
> + I915_WRITE_FW(RING_THRESH(engine->mmio_base),
> + GEN8_WATCHDOG_1000US(dev_priv));
> + I915_WRITE_FW(RING_CNTR(engine->mmio_base), GEN8_WATCHDOG_ENABLE);
> + }
> +
> +fw_put:
> + intel_uncore_forcewake_put(dev_priv, fw_domains);
> +}
More information about the Intel-gfx
mailing list