[Intel-gfx] [PATCH] drm/i915/gt: Hook up CS_MASTER_ERROR_INTERRUPT
Mika Kuoppala
mika.kuoppala at linux.intel.com
Wed Jan 29 15:07:20 UTC 2020
Chris Wilson <chris at chris-wilson.co.uk> writes:
> Now that we have offline error capture and can reset an engine from
> inside an atomic context while also preserving the GPU state for
> post-mortem analysis, it is time to handle error interrupts thrown by
> the command parser.
You might want to advertise here that this greatly improves the
detection/recovery speed for CS errors, compared to waiting for
hangcheck/heartbeats.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 8 +-
> drivers/gpu/drm/i915/gt/intel_engine_types.h | 10 ++
> drivers/gpu/drm/i915/gt/intel_gt.c | 5 +
> drivers/gpu/drm/i915/gt/intel_gt_irq.c | 27 ++-
> drivers/gpu/drm/i915/gt/intel_lrc.c | 57 +++++--
> drivers/gpu/drm/i915/gt/selftest_lrc.c | 166 +++++++++++++++++--
> drivers/gpu/drm/i915/i915_gpu_error.c | 2 +
> drivers/gpu/drm/i915/i915_gpu_error.h | 1 +
> drivers/gpu/drm/i915/i915_reg.h | 5 +-
> 9 files changed, 246 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 9b965d1f811d..39fe9a5b4820 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1293,8 +1293,14 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine,
> }
>
> if (INTEL_GEN(dev_priv) >= 6) {
> - drm_printf(m, "\tRING_IMR: %08x\n",
> + drm_printf(m, "\tRING_IMR: 0x%08x\n",
> ENGINE_READ(engine, RING_IMR));
> + drm_printf(m, "\tRING_ESR: 0x%08x\n",
> + ENGINE_READ(engine, RING_ESR));
> + drm_printf(m, "\tRING_EMR: 0x%08x\n",
> + ENGINE_READ(engine, RING_EMR));
> + drm_printf(m, "\tRING_EIR: 0x%08x\n",
> + ENGINE_READ(engine, RING_EIR));
> }
>
> addr = intel_engine_get_active_head(engine);
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 92be41a6903c..abd1de3b83a8 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -156,6 +156,16 @@ struct intel_engine_execlists {
> */
> struct i915_priolist default_priolist;
>
> + /**
> + * @error_interrupt: CS Master EIR
> + *
> + * The CS generates an interrupt when it detects an error. We capture
> + * the first error interrupt, record the EIR and schedule the tasklet.
> + * In the tasklet, we process the pending CS events to ensure we have
> + * the guilty request, and then reset the engine.
> + */
> + u32 error_interrupt;
> +
> /**
> * @no_priolist: priority lists disabled
> */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index 88b6c904607c..143268083135 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -455,6 +455,11 @@ static int __engines_record_defaults(struct intel_gt *gt)
> if (!rq)
> continue;
>
> + if (rq->fence.error) {
> + err = -EIO;
> + goto out;
> + }
> +
> GEM_BUG_ON(!test_bit(CONTEXT_ALLOC_BIT, &rq->context->flags));
> state = rq->context->state;
> if (!state)
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
> index 7278b10e1a03..864efaf1eb37 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
> @@ -24,6 +24,21 @@ cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> {
> bool tasklet = false;
>
> + if (unlikely(iir & GT_CS_MASTER_ERROR_INTERRUPT)) {
> + u32 eir;
> +
> + eir = ENGINE_READ(engine, RING_EIR);
> + ENGINE_TRACE(engine, "CS error: %x\n", eir);
> +
> + /* Disable the error interrupt until after the reset */
> + if (likely(eir)) {
> + ENGINE_WRITE(engine, RING_EMR, ~0u);
> + ENGINE_WRITE(engine, RING_EIR, eir);
> + engine->execlists.error_interrupt = eir;
WRITE_ONCE here, please; the tasklet reads this concurrently.
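i.e. just a sketch of what I had in mind (untested):

			/* publish the EIR for the tasklet to pick up */
			WRITE_ONCE(engine->execlists.error_interrupt, eir);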
> + tasklet = true;
> + }
> + }
> +
> if (iir & GT_CONTEXT_SWITCH_INTERRUPT)
> tasklet = true;
>
> @@ -210,7 +225,10 @@ void gen11_gt_irq_reset(struct intel_gt *gt)
>
> void gen11_gt_irq_postinstall(struct intel_gt *gt)
> {
> - const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT;
> + const u32 irqs =
> + GT_CS_MASTER_ERROR_INTERRUPT |
> + GT_RENDER_USER_INTERRUPT |
> + GT_CONTEXT_SWITCH_INTERRUPT;
> struct intel_uncore *uncore = gt->uncore;
> const u32 dmask = irqs << 16 | irqs;
> const u32 smask = irqs << 16;
> @@ -279,7 +297,7 @@ void gen6_gt_irq_handler(struct intel_gt *gt, u32 gt_iir)
>
> if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
> GT_BSD_CS_ERROR_INTERRUPT |
> - GT_RENDER_CS_MASTER_ERROR_INTERRUPT))
> + GT_CS_MASTER_ERROR_INTERRUPT))
> DRM_DEBUG("Command parser error, gt_iir 0x%08x\n", gt_iir);
>
> if (gt_iir & GT_PARITY_ERROR(gt->i915))
> @@ -345,7 +363,10 @@ void gen8_gt_irq_reset(struct intel_gt *gt)
> void gen8_gt_irq_postinstall(struct intel_gt *gt)
> {
> /* These are interrupts we'll toggle with the ring mask register */
> - const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT;
> + const u32 irqs =
> + GT_CS_MASTER_ERROR_INTERRUPT |
> + GT_RENDER_USER_INTERRUPT |
> + GT_CONTEXT_SWITCH_INTERRUPT;
> const u32 gt_interrupts[] = {
> irqs << GEN8_RCS_IRQ_SHIFT | irqs << GEN8_BCS_IRQ_SHIFT,
> irqs << GEN8_VCS0_IRQ_SHIFT | irqs << GEN8_VCS1_IRQ_SHIFT,
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index cf6c43bd540a..1b6aab05fde1 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -2613,13 +2613,13 @@ static bool execlists_capture(struct intel_engine_cs *engine)
> if (!cap)
> return true;
>
> + spin_lock_irq(&engine->active.lock);
> cap->rq = execlists_active(&engine->execlists);
> - GEM_BUG_ON(!cap->rq);
> -
> - rcu_read_lock();
> - cap->rq = active_request(cap->rq->context->timeline, cap->rq);
> - cap->rq = i915_request_get_rcu(cap->rq);
> - rcu_read_unlock();
> + if (cap->rq) {
> + cap->rq = active_request(cap->rq->context->timeline, cap->rq);
> + cap->rq = i915_request_get_rcu(cap->rq);
> + }
> + spin_unlock_irq(&engine->active.lock);
> if (!cap->rq)
> goto err_free;
>
> @@ -2658,27 +2658,25 @@ static bool execlists_capture(struct intel_engine_cs *engine)
> return false;
> }
>
> -static noinline void preempt_reset(struct intel_engine_cs *engine)
> +static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
> {
> const unsigned int bit = I915_RESET_ENGINE + engine->id;
> unsigned long *lock = &engine->gt->reset.flags;
>
> - if (i915_modparams.reset < 3)
> + if (!intel_has_reset_engine(engine->gt))
> return;
>
> if (test_and_set_bit(bit, lock))
> return;
>
> + ENGINE_TRACE(engine, "reset for %s\n", msg);
> +
> /* Mark this tasklet as disabled to avoid waiting for it to complete */
> tasklet_disable_nosync(&engine->execlists.tasklet);
>
> - ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
> - READ_ONCE(engine->props.preempt_timeout_ms),
> - jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
> -
> ring_set_paused(engine, 1); /* Freeze the current request in place */
> if (execlists_capture(engine))
> - intel_engine_reset(engine, "preemption time out");
> + intel_engine_reset(engine, msg);
> else
> ring_set_paused(engine, 0);
>
> @@ -2709,6 +2707,13 @@ static void execlists_submission_tasklet(unsigned long data)
> bool timeout = preempt_timeout(engine);
>
> process_csb(engine);
> +
> + if (unlikely(engine->execlists.error_interrupt)) {
And READ_ONCE on this side, correspondingly.
> + engine->execlists.error_interrupt = 0;
The race looks benign, as we are the only ones consuming this.
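Something along these lines is what I have in mind (just a sketch):

	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
		WRITE_ONCE(engine->execlists.error_interrupt, 0);
		if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
			execlists_reset(engine, "CS error");
	}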
> + if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
> + execlists_reset(engine, "CS error");
> + }
> +
> if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
> unsigned long flags;
>
> @@ -2717,8 +2722,8 @@ static void execlists_submission_tasklet(unsigned long data)
> spin_unlock_irqrestore(&engine->active.lock, flags);
>
> /* Recheck after serialising with direct-submission */
> - if (timeout && preempt_timeout(engine))
> - preempt_reset(engine);
> + if (unlikely(timeout && preempt_timeout(engine)))
> + execlists_reset(engine, "preemption time out");
> }
> }
>
> @@ -3343,6 +3348,25 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
> return ret;
> }
>
> +static void enable_error_interrupt(struct intel_engine_cs *engine)
> +{
> + u32 status;
> +
> + engine->execlists.error_interrupt = 0;
> + ENGINE_WRITE(engine, RING_EMR, ~0u);
> + ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
> +
> + status = ENGINE_READ(engine, RING_ESR);
> + if (unlikely(status)) {
> + dev_err(engine->i915->drm.dev,
> + "engine '%s' resumed still in error: %08x\n",
> + engine->name, status);
> + __intel_gt_reset(engine->gt, engine->mask);
> + }
> +
> + ENGINE_WRITE(engine, RING_EMR, ~REG_BIT(0));
Please do define this bit.
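e.g. something like this (just a sketch; i915_reg.h already has
I915_ERROR_INSTRUCTION for bit 0 of the legacy EIR, which might be
the right fit here, otherwise please add a fresh define from bspec):

	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);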
> +}
> +
> static void enable_execlists(struct intel_engine_cs *engine)
> {
> u32 mode;
> @@ -3364,6 +3388,8 @@ static void enable_execlists(struct intel_engine_cs *engine)
> i915_ggtt_offset(engine->status_page.vma));
> ENGINE_POSTING_READ(engine, RING_HWS_PGA);
>
> + enable_error_interrupt(engine);
> +
> engine->context_tag = 0;
> }
>
> @@ -4290,6 +4316,7 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
>
> engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
> engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
> + engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
> }
>
> static void rcs_submission_override(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 65718ca2326e..84bac953d1b7 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -68,6 +68,21 @@ static void engine_heartbeat_enable(struct intel_engine_cs *engine,
> engine->props.heartbeat_interval_ms = saved;
> }
>
> +static int wait_for_submit(struct intel_engine_cs *engine,
> + struct i915_request *rq,
> + unsigned long timeout)
> +{
> + timeout += jiffies;
> + do {
> + cond_resched();
> + intel_engine_flush_submission(engine);
> + if (i915_request_is_active(rq))
If this were not just code movement, I would ask what happens
if the request whizzes past into inactivity (i.e. retires) between
our sample points.
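Something like this would also cover a request that already retired
before we got to look at it (untested sketch):

		if (i915_request_is_active(rq) || i915_request_completed(rq))
			return 0;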
> + return 0;
> + } while (time_before(jiffies, timeout));
> +
> + return -ETIME;
> +}
> +
> static int live_sanitycheck(void *arg)
> {
> struct intel_gt *gt = arg;
> @@ -386,6 +401,141 @@ static int live_hold_reset(void *arg)
> return err;
> }
>
> +static const char *error_repr(int err)
> +{
> + return err ? "bad" : "good";
> +}
> +
> +static int live_error_interrupt(void *arg)
> +{
> + static const struct error_phase {
> + enum { GOOD = 0, BAD = -EIO } error[2];
> + } phases[] = {
> + { { BAD, GOOD } },
> + { { BAD, BAD } },
> + { { BAD, GOOD } },
> + { { GOOD, GOOD } }, /* sentinel */
> + };
> + struct intel_gt *gt = arg;
> + struct intel_engine_cs *engine;
> + enum intel_engine_id id;
> +
> + /*
> + * We hook up the CS_MASTER_ERROR_INTERRUPT to have forewarning
> + * of invalid commands in user batches that will cause a GPU hang.
> + * This is a faster mechanism than using hangcheck/heartbeats, but
> + * only detects problems the HW knows about -- it will not warn when
> + * we kill the HW!
> + *
> + * To verify our detection and reset, we throw some invalid commands
> + * at the HW and wait for the interrupt.
> + */
> +
> + if (!intel_has_reset_engine(gt))
> + return 0;
> +
> + for_each_engine(engine, gt, id) {
> + const struct error_phase *p;
> + unsigned long heartbeat;
> +
> + engine_heartbeat_disable(engine, &heartbeat);
> +
> + for (p = phases; p->error[0] != GOOD; p++) {
> + struct i915_request *client[ARRAY_SIZE(phases->error)];
> + int err = 0, i;
> + u32 *cs;
> +
> + memset(client, 0, sizeof(*client));
> + for (i = 0; i < ARRAY_SIZE(client); i++) {
> + struct intel_context *ce;
> + struct i915_request *rq;
> +
> + ce = intel_context_create(engine);
> + if (IS_ERR(ce)) {
> + err = PTR_ERR(ce);
> + goto out;
> + }
> +
> + rq = intel_context_create_request(ce);
> + intel_context_put(ce);
> + if (IS_ERR(rq)) {
> + err = PTR_ERR(rq);
> + goto out;
> + }
> +
> + if (rq->engine->emit_init_breadcrumb) {
> + err = rq->engine->emit_init_breadcrumb(rq);
> + if (err) {
> + i915_request_add(rq);
> + goto out;
> + }
> + }
> +
> + cs = intel_ring_begin(rq, 2);
> + if (IS_ERR(cs)) {
> + i915_request_add(rq);
> + err = PTR_ERR(cs);
> + goto out;
> + }
> +
> + if (p->error[i]) {
I was going to nag about the enums, but on second read it reads
just fine.

With the REG_BIT(0) given a proper define and some WRITE_ONCE/READ_ONCE
salt added on top,

Reviewed-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> + *cs++ = 0xdeadbeef;
> + *cs++ = 0xdeadbeef;
> + } else {
> + *cs++ = MI_NOOP;
> + *cs++ = MI_NOOP;
> + }
> +
> + client[i] = i915_request_get(rq);
> + i915_request_add(rq);
> + }
> +
> + err = wait_for_submit(engine, client[0], HZ / 2);
> + if (err) {
> + pr_err("%s: first request did not start within time!\n",
> + engine->name);
> + err = -ETIME;
> + goto out;
> + }
> +
> + for (i = 0; i < ARRAY_SIZE(client); i++) {
> + if (i915_request_wait(client[i], 0, HZ / 5) < 0) {
> + pr_err("%s: %s request still executing!\n",
> + engine->name,
> + error_repr(p->error[i]));
> + err = -ETIME;
> + goto out;
> + }
> +
> + if (client[i]->fence.error != p->error[i]) {
> + pr_err("%s: %s request completed with wrong error code: %d\n",
> + engine->name,
> + error_repr(p->error[i]),
> + client[i]->fence.error);
> + err = -EINVAL;
> + goto out;
> + }
> + }
> +
> +out:
> + for (i = 0; i < ARRAY_SIZE(client); i++)
> + if (client[i])
> + i915_request_put(client[i]);
> + if (err) {
> + pr_err("%s: failed at phase[%zd] { %d, %d }\n",
> + engine->name, p - phases,
> + p->error[0], p->error[1]);
> + intel_gt_set_wedged(gt);
> + return err;
> + }
> + }
> +
> + engine_heartbeat_enable(engine, heartbeat);
> + }
> +
> + return 0;
> +}
> +
> static int
> emit_semaphore_chain(struct i915_request *rq, struct i915_vma *vma, int idx)
> {
> @@ -628,21 +778,6 @@ static struct i915_request *nop_request(struct intel_engine_cs *engine)
> return rq;
> }
>
> -static int wait_for_submit(struct intel_engine_cs *engine,
> - struct i915_request *rq,
> - unsigned long timeout)
> -{
> - timeout += jiffies;
> - do {
> - cond_resched();
> - intel_engine_flush_submission(engine);
> - if (i915_request_is_active(rq))
> - return 0;
> - } while (time_before(jiffies, timeout));
> -
> - return -ETIME;
> -}
> -
> static long timeslice_threshold(const struct intel_engine_cs *engine)
> {
> return 2 * msecs_to_jiffies_timeout(timeslice(engine)) + 1;
> @@ -3572,6 +3707,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
> SUBTEST(live_unlite_switch),
> SUBTEST(live_unlite_preempt),
> SUBTEST(live_hold_reset),
> + SUBTEST(live_error_interrupt),
> SUBTEST(live_timeslice_preempt),
> SUBTEST(live_timeslice_queue),
> SUBTEST(live_busywait_preempt),
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 0f67bef83106..dcab4723b17d 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -515,6 +515,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
> (u32)(ee->acthd>>32), (u32)ee->acthd);
> err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir);
> err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr);
> + err_printf(m, " ESR: 0x%08x\n", ee->esr);
>
> error_print_instdone(m, ee);
>
> @@ -1102,6 +1103,7 @@ static void engine_record_registers(struct intel_engine_coredump *ee)
> }
>
> if (INTEL_GEN(i915) >= 4) {
> + ee->esr = ENGINE_READ(engine, RING_ESR);
> ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
> ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
> ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index e4a6afed3bbf..b35bc9edd733 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -75,6 +75,7 @@ struct intel_engine_coredump {
> u32 hws;
> u32 ipeir;
> u32 ipehr;
> + u32 esr;
> u32 bbstate;
> u32 instpm;
> u32 instps;
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index b93c4c18f05c..4c72b8ac0f2e 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -2639,6 +2639,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> #define GEN11_MCR_SUBSLICE_MASK GEN11_MCR_SUBSLICE(0x7)
> #define RING_IPEIR(base) _MMIO((base) + 0x64)
> #define RING_IPEHR(base) _MMIO((base) + 0x68)
> +#define RING_EIR(base) _MMIO((base) + 0xb0)
> +#define RING_EMR(base) _MMIO((base) + 0xb4)
> +#define RING_ESR(base) _MMIO((base) + 0xb8)
> /*
> * On GEN4, only the render ring INSTDONE exists and has a different
> * layout than the GEN7+ version.
> @@ -3088,7 +3091,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> #define GT_CONTEXT_SWITCH_INTERRUPT (1 << 8)
> #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT (1 << 5) /* !snb */
> #define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT (1 << 4)
> -#define GT_RENDER_CS_MASTER_ERROR_INTERRUPT (1 << 3)
> +#define GT_CS_MASTER_ERROR_INTERRUPT REG_BIT(3)
> #define GT_RENDER_SYNC_STATUS_INTERRUPT (1 << 2)
> #define GT_RENDER_DEBUG_INTERRUPT (1 << 1)
> #define GT_RENDER_USER_INTERRUPT (1 << 0)
> --
> 2.25.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx