[Intel-gfx] [PATCH 42/47] drm/i915/guc: Fix for error capture after full GPU reset with GuC
Matthew Brost
matthew.brost at intel.com
Thu Jul 15 00:43:28 UTC 2021
On Thu, Jun 24, 2021 at 12:05:11AM -0700, Matthew Brost wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> In the case of a full GPU reset (e.g. because GuC has died or because
> GuC's hang detection has been disabled), the driver can't rely on GuC
> reporting the guilty context. Instead, the driver needs to scan all
> active contexts and find one that is currently executing, as per the
> execlist mode behaviour. In GuC mode, this scan is different to
> execlist mode as the active request list is handled very differently.
>
> Similarly, the request state dump in debugfs needs to be handled
> differently when in GuC submission mode.
>
> Also refactored some of the request scanning code to avoid duplication
> across the multiple code paths that are now replicating it.
>
> Signed-off-by: John Harrison <john.c.harrison at intel.com>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
Reviewed-by: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/i915/gt/intel_engine.h | 3 +
> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 139 ++++++++++++------
> .../gpu/drm/i915/gt/intel_engine_heartbeat.c | 8 +
> drivers/gpu/drm/i915/gt/intel_reset.c | 2 +-
> drivers/gpu/drm/i915/gt/uc/intel_guc.h | 2 +
> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 67 +++++++++
> .../gpu/drm/i915/gt/uc/intel_guc_submission.h | 3 +
> drivers/gpu/drm/i915/i915_request.c | 41 ++++++
> drivers/gpu/drm/i915/i915_request.h | 11 ++
> 9 files changed, 229 insertions(+), 47 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
> index 6ea5643a3aaa..9ba131175564 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
> @@ -240,6 +240,9 @@ __printf(3, 4)
> void intel_engine_dump(struct intel_engine_cs *engine,
> struct drm_printer *m,
> const char *header, ...);
> +void intel_engine_dump_active_requests(struct list_head *requests,
> + struct i915_request *hung_rq,
> + struct drm_printer *m);
>
> ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine,
> ktime_t *now);
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 1d243b83b023..bbea7c9a367d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1624,6 +1624,97 @@ static void print_properties(struct intel_engine_cs *engine,
> read_ul(&engine->defaults, p->offset));
> }
>
> +static void engine_dump_request(struct i915_request *rq, struct drm_printer *m, const char *msg)
> +{
> + struct intel_timeline *tl = get_timeline(rq);
> +
> + i915_request_show(m, rq, msg, 0);
> +
> + drm_printf(m, "\t\tring->start: 0x%08x\n",
> + i915_ggtt_offset(rq->ring->vma));
> + drm_printf(m, "\t\tring->head: 0x%08x\n",
> + rq->ring->head);
> + drm_printf(m, "\t\tring->tail: 0x%08x\n",
> + rq->ring->tail);
> + drm_printf(m, "\t\tring->emit: 0x%08x\n",
> + rq->ring->emit);
> + drm_printf(m, "\t\tring->space: 0x%08x\n",
> + rq->ring->space);
> +
> + if (tl) {
> + drm_printf(m, "\t\tring->hwsp: 0x%08x\n",
> + tl->hwsp_offset);
> + intel_timeline_put(tl);
> + }
> +
> + print_request_ring(m, rq);
> +
> + if (rq->context->lrc_reg_state) {
> + drm_printf(m, "Logical Ring Context:\n");
> + hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE);
> + }
> +}
> +
> +void intel_engine_dump_active_requests(struct list_head *requests,
> + struct i915_request *hung_rq,
> + struct drm_printer *m)
> +{
> + struct i915_request *rq;
> + const char *msg;
> + enum i915_request_state state;
> +
> + list_for_each_entry(rq, requests, sched.link) {
> + if (rq == hung_rq)
> + continue;
> +
> + state = i915_test_request_state(rq);
> + if (state < I915_REQUEST_QUEUED)
> + continue;
> +
> + if (state == I915_REQUEST_ACTIVE)
> + msg = "\t\tactive on engine";
> + else
> + msg = "\t\tactive in queue";
> +
> + engine_dump_request(rq, m, msg);
> + }
> +}
> +
> +static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m)
> +{
> + struct i915_request *hung_rq = NULL;
> + struct intel_context *ce;
> + bool guc;
> +
> + /*
> + * No need for an engine->irq_seqno_barrier() before the seqno reads.
> + * The GPU is still running so requests are still executing and any
> + * hardware reads will be out of date by the time they are reported.
> + * But the intention here is just to report an instantaneous snapshot
> + * so that's fine.
> + */
> + lockdep_assert_held(&engine->sched_engine->lock);
> +
> + drm_printf(m, "\tRequests:\n");
> +
> + guc = intel_uc_uses_guc_submission(&engine->gt->uc);
> + if (guc) {
> + ce = intel_engine_get_hung_context(engine);
> + if (ce)
> + hung_rq = intel_context_find_active_request(ce);
> + } else
> + hung_rq = intel_engine_execlist_find_hung_request(engine);
> +
> + if (hung_rq)
> + engine_dump_request(hung_rq, m, "\t\thung");
> +
> + if (guc)
> + intel_guc_dump_active_requests(engine, hung_rq, m);
> + else
> + intel_engine_dump_active_requests(&engine->sched_engine->requests,
> + hung_rq, m);
> +}
> +
> void intel_engine_dump(struct intel_engine_cs *engine,
> struct drm_printer *m,
> const char *header, ...)
> @@ -1668,39 +1759,9 @@ void intel_engine_dump(struct intel_engine_cs *engine,
> i915_reset_count(error));
> print_properties(engine, m);
>
> - drm_printf(m, "\tRequests:\n");
> -
> spin_lock_irqsave(&engine->sched_engine->lock, flags);
> - rq = intel_engine_execlist_find_hung_request(engine);
> - if (rq) {
> - struct intel_timeline *tl = get_timeline(rq);
> -
> - i915_request_show(m, rq, "\t\tactive ", 0);
> -
> - drm_printf(m, "\t\tring->start: 0x%08x\n",
> - i915_ggtt_offset(rq->ring->vma));
> - drm_printf(m, "\t\tring->head: 0x%08x\n",
> - rq->ring->head);
> - drm_printf(m, "\t\tring->tail: 0x%08x\n",
> - rq->ring->tail);
> - drm_printf(m, "\t\tring->emit: 0x%08x\n",
> - rq->ring->emit);
> - drm_printf(m, "\t\tring->space: 0x%08x\n",
> - rq->ring->space);
> -
> - if (tl) {
> - drm_printf(m, "\t\tring->hwsp: 0x%08x\n",
> - tl->hwsp_offset);
> - intel_timeline_put(tl);
> - }
> -
> - print_request_ring(m, rq);
> + engine_dump_active_requests(engine, m);
>
> - if (rq->context->lrc_reg_state) {
> - drm_printf(m, "Logical Ring Context:\n");
> - hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE);
> - }
> - }
> drm_printf(m, "\tOn hold?: %lu\n",
> list_count(&engine->sched_engine->hold));
> spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
> @@ -1774,13 +1835,6 @@ intel_engine_create_virtual(struct intel_engine_cs **siblings,
> return siblings[0]->cops->create_virtual(siblings, count);
> }
>
> -static bool match_ring(struct i915_request *rq)
> -{
> - u32 ring = ENGINE_READ(rq->engine, RING_START);
> -
> - return ring == i915_ggtt_offset(rq->ring->vma);
> -}
> -
> struct i915_request *
> intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
> {
> @@ -1824,14 +1878,7 @@ intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
>
> list_for_each_entry(request, &engine->sched_engine->requests,
> sched.link) {
> - if (__i915_request_is_complete(request))
> - continue;
> -
> - if (!__i915_request_has_started(request))
> - continue;
> -
> - /* More than one preemptible request may match! */
> - if (!match_ring(request))
> + if (i915_test_request_state(request) != I915_REQUEST_ACTIVE)
> continue;
>
> active = request;
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> index a8495364d906..f0768824de6f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> @@ -90,6 +90,14 @@ reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
> if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> show_heartbeat(rq, engine);
>
> + if (intel_engine_uses_guc(engine))
> + /*
> + * GuC itself is toast or GuC's hang detection
> + * is disabled. Either way, need to find the
> + * hang culprit manually.
> + */
> + intel_guc_find_hung_context(engine);
> +
> intel_gt_handle_error(engine->gt, engine->mask,
> I915_ERROR_CAPTURE,
> "stopped heartbeat on %s",
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 2987282dff6d..f3cdbf4ba5c8 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -156,7 +156,7 @@ void __i915_request_reset(struct i915_request *rq, bool guilty)
> if (guilty) {
> i915_request_set_error_once(rq, -EIO);
> __i915_request_skip(rq);
> - if (mark_guilty(rq))
> + if (mark_guilty(rq) && !intel_engine_uses_guc(rq->engine))
> skip_context(rq);
> } else {
> i915_request_set_error_once(rq, -EAGAIN);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index ab1a85b508db..c38365cd5fab 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -268,6 +268,8 @@ int intel_guc_context_reset_process_msg(struct intel_guc *guc,
> int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
> const u32 *msg, u32 len);
>
> +void intel_guc_find_hung_context(struct intel_engine_cs *engine);
> +
> void intel_guc_submission_reset_prepare(struct intel_guc *guc);
> void intel_guc_submission_reset(struct intel_guc *guc, bool stalled);
> void intel_guc_submission_reset_finish(struct intel_guc *guc);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 315edeaa186a..6188189314d5 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -2267,6 +2267,73 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
> return 0;
> }
>
> +void intel_guc_find_hung_context(struct intel_engine_cs *engine)
> +{
> + struct intel_guc *guc = &engine->gt->uc.guc;
> + struct intel_context *ce;
> + struct i915_request *rq;
> + unsigned long index;
> +
> + /* Reset called during driver load? GuC not yet initialised! */
> + if (unlikely(!guc_submission_initialized(guc)))
> + return;
> +
> + xa_for_each(&guc->context_lookup, index, ce) {
> + if (!intel_context_is_pinned(ce))
> + continue;
> +
> + if (intel_engine_is_virtual(ce->engine)) {
> + if (!(ce->engine->mask & engine->mask))
> + continue;
> + } else {
> + if (ce->engine != engine)
> + continue;
> + }
> +
> + list_for_each_entry(rq, &ce->guc_active.requests, sched.link) {
> + if (i915_test_request_state(rq) != I915_REQUEST_ACTIVE)
> + continue;
> +
> + intel_engine_set_hung_context(engine, ce);
> +
> + /* Can only cope with one hang at a time... */
> + return;
> + }
> + }
> +}
> +
> +void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
> + struct i915_request *hung_rq,
> + struct drm_printer *m)
> +{
> + struct intel_guc *guc = &engine->gt->uc.guc;
> + struct intel_context *ce;
> + unsigned long index;
> + unsigned long flags;
> +
> + /* Reset called during driver load? GuC not yet initialised! */
> + if (unlikely(!guc_submission_initialized(guc)))
> + return;
> +
> + xa_for_each(&guc->context_lookup, index, ce) {
> + if (!intel_context_is_pinned(ce))
> + continue;
> +
> + if (intel_engine_is_virtual(ce->engine)) {
> + if (!(ce->engine->mask & engine->mask))
> + continue;
> + } else {
> + if (ce->engine != engine)
> + continue;
> + }
> +
> + spin_lock_irqsave(&ce->guc_active.lock, flags);
> + intel_engine_dump_active_requests(&ce->guc_active.requests,
> + hung_rq, m);
> + spin_unlock_irqrestore(&ce->guc_active.lock, flags);
> + }
> +}
> +
> void intel_guc_log_submission_info(struct intel_guc *guc,
> struct drm_printer *p)
> {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> index b9b9f0f60f91..a2a3fad72be1 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> @@ -24,6 +24,9 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine);
> void intel_guc_log_submission_info(struct intel_guc *guc,
> struct drm_printer *p);
> void intel_guc_log_context_info(struct intel_guc *guc, struct drm_printer *p);
> +void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
> + struct i915_request *hung_rq,
> + struct drm_printer *m);
>
> bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
>
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 192784875a1d..2978c8d45021 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -2076,6 +2076,47 @@ void i915_request_show(struct drm_printer *m,
> name);
> }
>
> +static bool engine_match_ring(struct intel_engine_cs *engine, struct i915_request *rq)
> +{
> + u32 ring = ENGINE_READ(engine, RING_START);
> +
> + return ring == i915_ggtt_offset(rq->ring->vma);
> +}
> +
> +static bool match_ring(struct i915_request *rq)
> +{
> + struct intel_engine_cs *engine;
> + bool found;
> + int i;
> +
> + if (!intel_engine_is_virtual(rq->engine))
> + return engine_match_ring(rq->engine, rq);
> +
> + found = false;
> + i = 0;
> + while ((engine = intel_engine_get_sibling(rq->engine, i++))) {
> + found = engine_match_ring(engine, rq);
> + if (found)
> + break;
> + }
> +
> + return found;
> +}
> +
> +enum i915_request_state i915_test_request_state(struct i915_request *rq)
> +{
> + if (i915_request_completed(rq))
> + return I915_REQUEST_COMPLETE;
> +
> + if (!i915_request_started(rq))
> + return I915_REQUEST_PENDING;
> +
> + if (match_ring(rq))
> + return I915_REQUEST_ACTIVE;
> +
> + return I915_REQUEST_QUEUED;
> +}
> +
> #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> #include "selftests/mock_request.c"
> #include "selftests/i915_request.c"
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index bcc6340c505e..f98385f72782 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -651,4 +651,15 @@ i915_request_active_engine(struct i915_request *rq,
>
> void i915_request_notify_execute_cb_imm(struct i915_request *rq);
>
> +enum i915_request_state
> +{
> + I915_REQUEST_UNKNOWN = 0,
> + I915_REQUEST_COMPLETE,
> + I915_REQUEST_PENDING,
> + I915_REQUEST_QUEUED,
> + I915_REQUEST_ACTIVE,
> +};
> +
> +enum i915_request_state i915_test_request_state(struct i915_request *rq);
> +
> #endif /* I915_REQUEST_H */
> --
> 2.28.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
More information about the dri-devel
mailing list