[Intel-gfx] [PATCH v5 3/5] drm/i915: Watchdog timeout: Ringbuffer command emission for gen8+
Carlos Santa
carlos.santa at intel.com
Tue Apr 2 00:57:10 UTC 2019
On Sat, 2019-03-30 at 09:01 +0000, Chris Wilson wrote:
> Quoting Carlos Santa (2019-03-22 23:41:16)
> > From: Michel Thierry <michel.thierry at intel.com>
> >
> > Emit the required commands into the ring buffer for starting and
> > stopping the watchdog timer before/after batch buffer start during
> > batch buffer submission.
>
> I'm expecting to see some discussion of how this is handled across
> preemption here since you are inside an arbitration enabled section.
>
> > v2: Support watchdog threshold per context engine, merge lri
> > commands,
> > and move watchdog commands emission to emit_bb_start. Request space
> > of
> > combined start_watchdog, bb_start and stop_watchdog to avoid any
> > error
> > after emitting bb_start.
> >
> > v3: There were too many req->engine in emit_bb_start.
> > Use GEM_BUG_ON instead of returning a very late EINVAL in the
> > remote
> > case of watchdog misprogramming; set correct LRI cmd size in
> > emit_stop_watchdog. (Chris)
> >
> > v4: Rebase.
> > v5: use to_intel_context instead of ctx->engine.
> > v6: Rebase.
> > v7: Rebase,
> > Store gpu watchdog capability in engine flag (Tvrtko)
> > Store WATCHDOG_DISABLE magic # in engine (Tvrtko)
> > No need to declare emit_{start|stop}_watchdog as vfuncs
> > (Tvrtko)
> > Replace flag watchdog_running with enable_watchdog (Tvrtko)
> > Emit a single MI_NOOP by conditionally checking whether the #
> > of emitted OPs is odd (Tvrtko)
> > v8: Rebase
> >
> > Cc: Chris Wilson <chris at chris-wilson.co.uk>
> > Cc: Antonio Argenziano <antonio.argenziano at intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
> > Signed-off-by: Michel Thierry <michel.thierry at intel.com>
> > Signed-off-by: Carlos Santa <carlos.santa at intel.com>
> > ---
> > drivers/gpu/drm/i915/intel_context_types.h | 4 +
> > drivers/gpu/drm/i915/intel_engine_cs.c | 2 +
> > drivers/gpu/drm/i915/intel_engine_types.h | 17 ++++-
> > drivers/gpu/drm/i915/intel_lrc.c | 89
> > +++++++++++++++++++++-
> > drivers/gpu/drm/i915/intel_lrc.h | 2 +
> > 5 files changed, 106 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/intel_context_types.h
> > b/drivers/gpu/drm/i915/intel_context_types.h
> > index 6dc9b4b9067b..e56fc263568e 100644
> > --- a/drivers/gpu/drm/i915/intel_context_types.h
> > +++ b/drivers/gpu/drm/i915/intel_context_types.h
> > @@ -51,6 +51,10 @@ struct intel_context {
> > u64 lrc_desc;
> >
> > atomic_t pin_count;
> > + /** watchdog_threshold: hw watchdog threshold value,
> > + * in clock counts
> > + */
>
> Gah. Why would you put it here? Between a tightly coupled count +
> mutex.
>
> > + u32 watchdog_threshold;
> > struct mutex pin_mutex; /* guards pinning and associated
> > on-gpuing */
> >
> > /**
> > diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c
> > b/drivers/gpu/drm/i915/intel_engine_cs.c
> > index 88cf0fc07623..d4ea07b70904 100644
> > --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> > +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> > @@ -324,6 +324,8 @@ intel_engine_setup(struct drm_i915_private
> > *dev_priv,
> > if (engine->context_size)
> > DRIVER_CAPS(dev_priv)->has_logical_contexts = true;
> >
> > + engine->watchdog_disable_id = get_watchdog_disable(engine);
> > +
> > /* Nothing to do here, execute in order of dependencies */
> > engine->schedule = NULL;
> >
> > diff --git a/drivers/gpu/drm/i915/intel_engine_types.h
> > b/drivers/gpu/drm/i915/intel_engine_types.h
> > index c4f66b774e7c..1f99b536471d 100644
> > --- a/drivers/gpu/drm/i915/intel_engine_types.h
> > +++ b/drivers/gpu/drm/i915/intel_engine_types.h
> > @@ -260,6 +260,7 @@ struct intel_engine_cs {
> > unsigned int guc_id;
> > intel_engine_mask_t mask;
> >
> > + u32 watchdog_disable_id;
>
> You've just put this between a pair of u8s.
>
> > u8 uabi_class;
> >
> > u8 class;
> > @@ -422,10 +423,12 @@ struct intel_engine_cs {
> >
> > struct intel_engine_hangcheck hangcheck;
> >
> > -#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
> > -#define I915_ENGINE_SUPPORTS_STATS BIT(1)
> > -#define I915_ENGINE_HAS_PREEMPTION BIT(2)
> > -#define I915_ENGINE_HAS_SEMAPHORES BIT(3)
> > +#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
> > +#define I915_ENGINE_SUPPORTS_STATS BIT(1)
> > +#define I915_ENGINE_HAS_PREEMPTION BIT(2)
> > +#define I915_ENGINE_HAS_SEMAPHORES BIT(3)
> > +#define I915_ENGINE_SUPPORTS_WATCHDOG BIT(4)
> > +
> > unsigned int flags;
> >
> > /*
> > @@ -509,6 +512,12 @@ intel_engine_has_semaphores(const struct
> > intel_engine_cs *engine)
> > return engine->flags & I915_ENGINE_HAS_SEMAPHORES;
> > }
> >
> > +static inline bool
> > +intel_engine_supports_watchdog(const struct intel_engine_cs
> > *engine)
> > +{
> > + return engine->flags & I915_ENGINE_SUPPORTS_WATCHDOG;
> > +}
> > +
> > #define instdone_slice_mask(dev_priv__) \
> > (IS_GEN(dev_priv__, 7) ? \
> > 1 : RUNTIME_INFO(dev_priv__)->sseu.slice_mask)
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c
> > b/drivers/gpu/drm/i915/intel_lrc.c
> > index 85785a94f6ae..78ea54a5dbc3 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -2036,16 +2036,75 @@ static void execlists_reset_finish(struct
> > intel_engine_cs *engine)
> > atomic_read(&execlists->tasklet.count));
> > }
> >
> > +static u32 *gen8_emit_start_watchdog(struct i915_request *rq, u32
> > *cs)
> > +{
> > + struct intel_engine_cs *engine = rq->engine;
> > + struct i915_gem_context *ctx = rq->gem_context;
> > + struct intel_context *ce = intel_context_lookup(ctx,
> > engine);
> > +
> > + GEM_BUG_ON(!intel_engine_supports_watchdog(engine));
> > +
> > + /*
> > + * watchdog register must never be programmed to zero. This
> > would
> > + * cause the watchdog counter to exceed and not allow the
> > engine to
> > + * go into IDLE state
> > + */
> > + GEM_BUG_ON(ce->watchdog_threshold == 0);
> > +
> > + /* Set counter period */
> > + *cs++ = MI_LOAD_REGISTER_IMM(2);
> > + *cs++ = i915_mmio_reg_offset(RING_THRESH(engine-
> > >mmio_base));
> > + *cs++ = ce->watchdog_threshold;
> > + /* Start counter */
> > + *cs++ = i915_mmio_reg_offset(RING_CNTR(engine->mmio_base));
> > + *cs++ = GEN8_WATCHDOG_ENABLE;
>
> Hmm, so no watchdog seqno.
>
> > + return cs;
> > +}
> > +
> > +static u32 *gen8_emit_stop_watchdog(struct i915_request *rq, u32
> > *cs)
> > +{
> > + struct intel_engine_cs *engine = rq->engine;
> > +
> > + GEM_BUG_ON(!intel_engine_supports_watchdog(engine));
> > +
> > + *cs++ = MI_LOAD_REGISTER_IMM(1);
> > + *cs++ = i915_mmio_reg_offset(RING_CNTR(engine->mmio_base));
> > + *cs++ = engine->watchdog_disable_id;
> > +
> > + return cs;
> > +}
> > +
> > static int gen8_emit_bb_start(struct i915_request *rq,
> > u64 offset, u32 len,
> > const unsigned int flags)
> > {
> > + struct intel_engine_cs *engine = rq->engine;
> > + struct i915_gem_context *ctx = rq->gem_context;
> > + struct intel_context *ce = intel_context_lookup(ctx,
> > engine);
>
> Ahem. This keeps on getting worse.
Can you explain a bit more why?
>
> > u32 *cs;
> > + u32 num_dwords;
> > + bool enable_watchdog = false;
> >
> > - cs = intel_ring_begin(rq, 6);
> > + /* bb_start only */
> > + num_dwords = 6;
> > +
> > + /* check if watchdog will be required */
> > + if (ce->watchdog_threshold != 0) {
> > + /* + start_watchdog (6) + stop_watchdog (4) */
> > + num_dwords += 10;
> > + enable_watchdog = true;
> > + }
> > +
> > + cs = intel_ring_begin(rq, num_dwords);
> > if (IS_ERR(cs))
> > return PTR_ERR(cs);
> >
> > + if (enable_watchdog) {
> > + /* Start watchdog timer */
>
> Please don't simply repeat code in comments.
Ack.
>
> > + cs = gen8_emit_start_watchdog(rq, cs);
> > + }
> > +
> > /*
> > * WaDisableCtxRestoreArbitration:bdw,chv
> > *
> > @@ -2072,10 +2131,16 @@ static int gen8_emit_bb_start(struct
> > i915_request *rq,
> > *cs++ = upper_32_bits(offset);
> >
> > *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> > - *cs++ = MI_NOOP;
> >
> > - intel_ring_advance(rq, cs);
> > + if (enable_watchdog) {
> > + /* Cancel watchdog timer */
> > + cs = gen8_emit_stop_watchdog(rq, cs);
> > + }
> > +
> > + if (*cs%2 != 0)
> > + *cs++ = MI_NOOP;
>
> This is wrong. cs points into the unset portion of the ring. The
> watchdog commands are even, so there is no reason to move the
> original
> NOOP, or at least no reason to make it conditional.
Ok, I initially took the suggestion from Tvrtko from way back, where we
were trying to get rid of too many MI_NOOPs by emitting them
conditionally based on whether the cs pointer was odd...
Carlos
> -Chris
More information about the Intel-gfx
mailing list