[Intel-gfx] [PATCH 42/47] drm/i915/guc: Fix for error capture after full GPU reset with GuC

Matthew Brost matthew.brost at intel.com
Thu Jul 15 00:43:28 UTC 2021


On Thu, Jun 24, 2021 at 12:05:11AM -0700, Matthew Brost wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
> 
> In the case of a full GPU reset (e.g. because GuC has died or because
> GuC's hang detection has been disabled), the driver can't rely on GuC
> reporting the guilty context. Instead, the driver needs to scan all
> active contexts and find one that is currently executing, as per the
> execlist mode behaviour. In GuC mode, this scan is different to
> execlist mode as the active request list is handled very differently.
> 
> Similarly, the request state dump in debugfs needs to be handled
> differently when in GuC submission mode.
> 
> Also refactored some of the request scanning code to avoid duplication
> across the multiple code paths that are now replicating it.
> 
> Signed-off-by: John Harrison <john.c.harrison at intel.com>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>

Reviewed-by: Matthew Brost <matthew.brost at intel.com>

> ---
>  drivers/gpu/drm/i915/gt/intel_engine.h        |   3 +
>  drivers/gpu/drm/i915/gt/intel_engine_cs.c     | 139 ++++++++++++------
>  .../gpu/drm/i915/gt/intel_engine_heartbeat.c  |   8 +
>  drivers/gpu/drm/i915/gt/intel_reset.c         |   2 +-
>  drivers/gpu/drm/i915/gt/uc/intel_guc.h        |   2 +
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  67 +++++++++
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   3 +
>  drivers/gpu/drm/i915/i915_request.c           |  41 ++++++
>  drivers/gpu/drm/i915/i915_request.h           |  11 ++
>  9 files changed, 229 insertions(+), 47 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
> index 6ea5643a3aaa..9ba131175564 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
> @@ -240,6 +240,9 @@ __printf(3, 4)
>  void intel_engine_dump(struct intel_engine_cs *engine,
>  		       struct drm_printer *m,
>  		       const char *header, ...);
> +void intel_engine_dump_active_requests(struct list_head *requests,
> +				       struct i915_request *hung_rq,
> +				       struct drm_printer *m);
>  
>  ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine,
>  				   ktime_t *now);
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 1d243b83b023..bbea7c9a367d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1624,6 +1624,97 @@ static void print_properties(struct intel_engine_cs *engine,
>  			   read_ul(&engine->defaults, p->offset));
>  }
>  
> +static void engine_dump_request(struct i915_request *rq, struct drm_printer *m, const char *msg)
> +{
> +	struct intel_timeline *tl = get_timeline(rq);
> +
> +	i915_request_show(m, rq, msg, 0);
> +
> +	drm_printf(m, "\t\tring->start:  0x%08x\n",
> +		   i915_ggtt_offset(rq->ring->vma));
> +	drm_printf(m, "\t\tring->head:   0x%08x\n",
> +		   rq->ring->head);
> +	drm_printf(m, "\t\tring->tail:   0x%08x\n",
> +		   rq->ring->tail);
> +	drm_printf(m, "\t\tring->emit:   0x%08x\n",
> +		   rq->ring->emit);
> +	drm_printf(m, "\t\tring->space:  0x%08x\n",
> +		   rq->ring->space);
> +
> +	if (tl) {
> +		drm_printf(m, "\t\tring->hwsp:   0x%08x\n",
> +			   tl->hwsp_offset);
> +		intel_timeline_put(tl);
> +	}
> +
> +	print_request_ring(m, rq);
> +
> +	if (rq->context->lrc_reg_state) {
> +		drm_printf(m, "Logical Ring Context:\n");
> +		hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE);
> +	}
> +}
> +
> +void intel_engine_dump_active_requests(struct list_head *requests,
> +				       struct i915_request *hung_rq,
> +				       struct drm_printer *m)
> +{
> +	struct i915_request *rq;
> +	const char *msg;
> +	enum i915_request_state state;
> +
> +	list_for_each_entry(rq, requests, sched.link) {
> +		if (rq == hung_rq)
> +			continue;
> +
> +		state = i915_test_request_state(rq);
> +		if (state < I915_REQUEST_QUEUED)
> +			continue;
> +
> +		if (state == I915_REQUEST_ACTIVE)
> +			msg = "\t\tactive on engine";
> +		else
> +			msg = "\t\tactive in queue";
> +
> +		engine_dump_request(rq, m, msg);
> +	}
> +}
> +
> +static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m)
> +{
> +	struct i915_request *hung_rq = NULL;
> +	struct intel_context *ce;
> +	bool guc;
> +
> +	/*
> +	 * No need for an engine->irq_seqno_barrier() before the seqno reads.
> +	 * The GPU is still running so requests are still executing and any
> +	 * hardware reads will be out of date by the time they are reported.
> +	 * But the intention here is just to report an instantaneous snapshot
> +	 * so that's fine.
> +	 */
> +	lockdep_assert_held(&engine->sched_engine->lock);
> +
> +	drm_printf(m, "\tRequests:\n");
> +
> +	guc = intel_uc_uses_guc_submission(&engine->gt->uc);
> +	if (guc) {
> +		ce = intel_engine_get_hung_context(engine);
> +		if (ce)
> +			hung_rq = intel_context_find_active_request(ce);
> +	} else
> +		hung_rq = intel_engine_execlist_find_hung_request(engine);
> +
> +	if (hung_rq)
> +		engine_dump_request(hung_rq, m, "\t\thung");
> +
> +	if (guc)
> +		intel_guc_dump_active_requests(engine, hung_rq, m);
> +	else
> +		intel_engine_dump_active_requests(&engine->sched_engine->requests,
> +						  hung_rq, m);
> +}
> +
>  void intel_engine_dump(struct intel_engine_cs *engine,
>  		       struct drm_printer *m,
>  		       const char *header, ...)
> @@ -1668,39 +1759,9 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>  		   i915_reset_count(error));
>  	print_properties(engine, m);
>  
> -	drm_printf(m, "\tRequests:\n");
> -
>  	spin_lock_irqsave(&engine->sched_engine->lock, flags);
> -	rq = intel_engine_execlist_find_hung_request(engine);
> -	if (rq) {
> -		struct intel_timeline *tl = get_timeline(rq);
> -
> -		i915_request_show(m, rq, "\t\tactive ", 0);
> -
> -		drm_printf(m, "\t\tring->start:  0x%08x\n",
> -			   i915_ggtt_offset(rq->ring->vma));
> -		drm_printf(m, "\t\tring->head:   0x%08x\n",
> -			   rq->ring->head);
> -		drm_printf(m, "\t\tring->tail:   0x%08x\n",
> -			   rq->ring->tail);
> -		drm_printf(m, "\t\tring->emit:   0x%08x\n",
> -			   rq->ring->emit);
> -		drm_printf(m, "\t\tring->space:  0x%08x\n",
> -			   rq->ring->space);
> -
> -		if (tl) {
> -			drm_printf(m, "\t\tring->hwsp:   0x%08x\n",
> -				   tl->hwsp_offset);
> -			intel_timeline_put(tl);
> -		}
> -
> -		print_request_ring(m, rq);
> +	engine_dump_active_requests(engine, m);
>  
> -		if (rq->context->lrc_reg_state) {
> -			drm_printf(m, "Logical Ring Context:\n");
> -			hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE);
> -		}
> -	}
>  	drm_printf(m, "\tOn hold?: %lu\n",
>  		   list_count(&engine->sched_engine->hold));
>  	spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
> @@ -1774,13 +1835,6 @@ intel_engine_create_virtual(struct intel_engine_cs **siblings,
>  	return siblings[0]->cops->create_virtual(siblings, count);
>  }
>  
> -static bool match_ring(struct i915_request *rq)
> -{
> -	u32 ring = ENGINE_READ(rq->engine, RING_START);
> -
> -	return ring == i915_ggtt_offset(rq->ring->vma);
> -}
> -
>  struct i915_request *
>  intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
>  {
> @@ -1824,14 +1878,7 @@ intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
>  
>  	list_for_each_entry(request, &engine->sched_engine->requests,
>  			    sched.link) {
> -		if (__i915_request_is_complete(request))
> -			continue;
> -
> -		if (!__i915_request_has_started(request))
> -			continue;
> -
> -		/* More than one preemptible request may match! */
> -		if (!match_ring(request))
> +		if (i915_test_request_state(request) != I915_REQUEST_ACTIVE)
>  			continue;
>  
>  		active = request;
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> index a8495364d906..f0768824de6f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> @@ -90,6 +90,14 @@ reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
>  	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
>  		show_heartbeat(rq, engine);
>  
> +	if (intel_engine_uses_guc(engine))
> +		/*
> +		 * GuC itself is toast or GuC's hang detection
> +		 * is disabled. Either way, need to find the
> +		 * hang culprit manually.
> +		 */
> +		intel_guc_find_hung_context(engine);
> +
>  	intel_gt_handle_error(engine->gt, engine->mask,
>  			      I915_ERROR_CAPTURE,
>  			      "stopped heartbeat on %s",
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 2987282dff6d..f3cdbf4ba5c8 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -156,7 +156,7 @@ void __i915_request_reset(struct i915_request *rq, bool guilty)
>  	if (guilty) {
>  		i915_request_set_error_once(rq, -EIO);
>  		__i915_request_skip(rq);
> -		if (mark_guilty(rq))
> +		if (mark_guilty(rq) && !intel_engine_uses_guc(rq->engine))
>  			skip_context(rq);
>  	} else {
>  		i915_request_set_error_once(rq, -EAGAIN);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index ab1a85b508db..c38365cd5fab 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -268,6 +268,8 @@ int intel_guc_context_reset_process_msg(struct intel_guc *guc,
>  int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
>  					 const u32 *msg, u32 len);
>  
> +void intel_guc_find_hung_context(struct intel_engine_cs *engine);
> +
>  void intel_guc_submission_reset_prepare(struct intel_guc *guc);
>  void intel_guc_submission_reset(struct intel_guc *guc, bool stalled);
>  void intel_guc_submission_reset_finish(struct intel_guc *guc);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 315edeaa186a..6188189314d5 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -2267,6 +2267,73 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
>  	return 0;
>  }
>  
> +void intel_guc_find_hung_context(struct intel_engine_cs *engine)
> +{
> +	struct intel_guc *guc = &engine->gt->uc.guc;
> +	struct intel_context *ce;
> +	struct i915_request *rq;
> +	unsigned long index;
> +
> +	/* Reset called during driver load? GuC not yet initialised! */
> +	if (unlikely(!guc_submission_initialized(guc)))
> +		return;
> +
> +	xa_for_each(&guc->context_lookup, index, ce) {
> +		if (!intel_context_is_pinned(ce))
> +			continue;
> +
> +		if (intel_engine_is_virtual(ce->engine)) {
> +			if (!(ce->engine->mask & engine->mask))
> +				continue;
> +		} else {
> +			if (ce->engine != engine)
> +				continue;
> +		}
> +
> +		list_for_each_entry(rq, &ce->guc_active.requests, sched.link) {
> +			if (i915_test_request_state(rq) != I915_REQUEST_ACTIVE)
> +				continue;
> +
> +			intel_engine_set_hung_context(engine, ce);
> +
> +			/* Can only cope with one hang at a time... */
> +			return;
> +		}
> +	}
> +}
> +
> +void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
> +				    struct i915_request *hung_rq,
> +				    struct drm_printer *m)
> +{
> +	struct intel_guc *guc = &engine->gt->uc.guc;
> +	struct intel_context *ce;
> +	unsigned long index;
> +	unsigned long flags;
> +
> +	/* Reset called during driver load? GuC not yet initialised! */
> +	if (unlikely(!guc_submission_initialized(guc)))
> +		return;
> +
> +	xa_for_each(&guc->context_lookup, index, ce) {
> +		if (!intel_context_is_pinned(ce))
> +			continue;
> +
> +		if (intel_engine_is_virtual(ce->engine)) {
> +			if (!(ce->engine->mask & engine->mask))
> +				continue;
> +		} else {
> +			if (ce->engine != engine)
> +				continue;
> +		}
> +
> +		spin_lock_irqsave(&ce->guc_active.lock, flags);
> +		intel_engine_dump_active_requests(&ce->guc_active.requests,
> +						  hung_rq, m);
> +		spin_unlock_irqrestore(&ce->guc_active.lock, flags);
> +	}
> +}
> +
>  void intel_guc_log_submission_info(struct intel_guc *guc,
>  				   struct drm_printer *p)
>  {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> index b9b9f0f60f91..a2a3fad72be1 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> @@ -24,6 +24,9 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine);
>  void intel_guc_log_submission_info(struct intel_guc *guc,
>  				   struct drm_printer *p);
>  void intel_guc_log_context_info(struct intel_guc *guc, struct drm_printer *p);
> +void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
> +				    struct i915_request *hung_rq,
> +				    struct drm_printer *m);
>  
>  bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
>  
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 192784875a1d..2978c8d45021 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -2076,6 +2076,47 @@ void i915_request_show(struct drm_printer *m,
>  		   name);
>  }
>  
> +static bool engine_match_ring(struct intel_engine_cs *engine, struct i915_request *rq)
> +{
> +	u32 ring = ENGINE_READ(engine, RING_START);
> +
> +	return ring == i915_ggtt_offset(rq->ring->vma);
> +}
> +
> +static bool match_ring(struct i915_request *rq)
> +{
> +	struct intel_engine_cs *engine;
> +	bool found;
> +	int i;
> +
> +	if (!intel_engine_is_virtual(rq->engine))
> +		return engine_match_ring(rq->engine, rq);
> +
> +	found = false;
> +	i = 0;
> +	while ((engine = intel_engine_get_sibling(rq->engine, i++))) {
> +		found = engine_match_ring(engine, rq);
> +		if (found)
> +			break;
> +	}
> +
> +	return found;
> +}
> +
> +enum i915_request_state i915_test_request_state(struct i915_request *rq)
> +{
> +	if (i915_request_completed(rq))
> +		return I915_REQUEST_COMPLETE;
> +
> +	if (!i915_request_started(rq))
> +		return I915_REQUEST_PENDING;
> +
> +	if (match_ring(rq))
> +		return I915_REQUEST_ACTIVE;
> +
> +	return I915_REQUEST_QUEUED;
> +}
> +
>  #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>  #include "selftests/mock_request.c"
>  #include "selftests/i915_request.c"
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index bcc6340c505e..f98385f72782 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -651,4 +651,15 @@ i915_request_active_engine(struct i915_request *rq,
>  
>  void i915_request_notify_execute_cb_imm(struct i915_request *rq);
>  
> +enum i915_request_state
> +{
> +	I915_REQUEST_UNKNOWN = 0,
> +	I915_REQUEST_COMPLETE,
> +	I915_REQUEST_PENDING,
> +	I915_REQUEST_QUEUED,
> +	I915_REQUEST_ACTIVE,
> +};
> +
> +enum i915_request_state i915_test_request_state(struct i915_request *rq);
> +
>  #endif /* I915_REQUEST_H */
> -- 
> 2.28.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx


More information about the Intel-gfx mailing list