[Intel-gfx] [PATCH v3 2/6] drm/i915: Watchdog timeout: IRQ handler for gen8+

Thu Feb 14 09:15:33 UTC 2019

Quoting Carlos Santa (2019-02-14 02:57:09)
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 58b6ff8453dc..bc10acb24d9a 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -218,7 +218,8 @@ static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
>  
>  static void hangcheck_declare_hang(struct drm_i915_private *i915,
>                                    unsigned int hung,
> -                                  unsigned int stuck)
> +                                  unsigned int stuck,
> +                                  unsigned int watchdog)

Absolutely not.

> +/* From GEN9 onwards, all engines use the same RING_CNTR format */
> +static inline u32 get_watchdog_disable(struct intel_engine_cs *engine)
> +{
> +       if (engine->id == RCS || INTEL_GEN(engine->i915) >= 9)
> +               return GEN8_WATCHDOG_DISABLE;
> +       else
> +               return GEN8_XCS_WATCHDOG_DISABLE;
> +}
> +
> +#define GEN8_WATCHDOG_1000US(dev_priv) watchdog_to_clock_counts(dev_priv, 1000)
> +static void gen8_watchdog_irq_handler(unsigned long data)
> +{
> +       struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> +       struct drm_i915_private *dev_priv = engine->i915;
> +       enum forcewake_domains fw_domains;
> +       u32 current_seqno;
> +
> +       switch (engine->class) {
> +       default:
> +               MISSING_CASE(engine->id);
> +               /* fall through */
> +       case RENDER_CLASS:
> +               fw_domains = FORCEWAKE_RENDER;
> +               break;
> +       case VIDEO_DECODE_CLASS:
> +       case VIDEO_ENHANCEMENT_CLASS:
> +               fw_domains = FORCEWAKE_MEDIA;
> +               break;
> +       }
> +
> +       intel_uncore_forcewake_get(dev_priv, fw_domains);

Doesn't scale well down into the 1ms response range.

> +       /* Stop the counter to prevent further timeout interrupts */
> +       I915_WRITE_FW(RING_CNTR(engine->mmio_base), get_watchdog_disable(engine));

A bit late? About 200ms or more may have passed since the interrupt, so
what's the point?

> +       current_seqno = intel_engine_get_hangcheck_seqno(engine);

Doesn't exist. Only may exist as a temporary hack. If we need the seqno
counter, include it with the watchdog LRI.

> +       /* did the request complete after the timer expired? */
> +       if (engine->hangcheck.next_seqno == current_seqno)
> +               goto fw_put;
> +
> +       if (engine->hangcheck.watchdog == current_seqno) {
> +               /* Make sure the active request will be marked as guilty */
> +               engine->hangcheck.acthd = intel_engine_get_active_head(engine);
> +               engine->hangcheck.last_seqno = current_seqno;

Incorrect even for the current code. And as I keep repeating the design
is for the engine reset to be processed immediately.

> +               /* And try to run the hangcheck_work as soon as possible */
> +               set_bit(I915_RESET_WATCHDOG, &dev_priv->gpu_error.flags);
> +               queue_delayed_work(system_long_wq,
> +                                  &dev_priv->gpu_error.hangcheck_work,
> +                                  round_jiffies_up_relative(HZ));
> +       } else {
> +               engine->hangcheck.watchdog = current_seqno;
> +               /* Re-start the counter, if really hung, it will expire again */
> +               I915_WRITE_FW(RING_THRESH(engine->mmio_base),
> +                             GEN8_WATCHDOG_1000US(dev_priv));
> +               I915_WRITE_FW(RING_CNTR(engine->mmio_base), GEN8_WATCHDOG_ENABLE);
> +       }
> +
> +fw_put:
> +       intel_uncore_forcewake_put(dev_priv, fw_domains);
> +}