[Intel-gfx] [PATCH v3 2/6] drm/i915: Fix up locking around dumping requests lists
Ceraolo Spurio, Daniele
daniele.ceraolospurio at intel.com
Fri Jan 20 01:41:40 UTC 2023
On 1/18/2023 10:49 PM, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> The debugfs dump of requests was confused about what state requires
> the execlist lock versus the GuC lock. There was also a bunch of
> duplicated messy code between it and the error capture code.
>
> So refactor the hung request search into a re-usable function. And
> reduce the span of the execlist state lock to only the execlist
> specific code paths.
>
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
> ---
> drivers/gpu/drm/i915/gt/intel_context.c | 29 +++++++++++++
> drivers/gpu/drm/i915/gt/intel_context.h | 3 ++
> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 51 +++++++++++------------
> drivers/gpu/drm/i915/i915_gpu_error.c | 27 ++----------
> 4 files changed, 60 insertions(+), 50 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> index e7c5509c48ef1..a61f052092ed9 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -559,6 +559,35 @@ struct i915_request *intel_context_find_active_request(struct intel_context *ce)
> return active;
> }
>
> +void intel_get_hung_entity(struct intel_engine_cs *engine,
> + struct intel_context **ce, struct i915_request **rq)
IMO this belongs in the engine_cs.c file, given that the engine is the
input. Might also be worth renaming to intel_engine_*
> +{
> + unsigned long flags;
> +
> + *ce = intel_engine_get_hung_context(engine);
> + if (*ce) {
> + intel_engine_clear_hung_context(engine);
> +
> + /* This will reference count the request (if found) */
> + *rq = intel_context_find_active_request(*ce);
> +
> + return;
> + }
> +
> + /*
> + * Getting here with GuC enabled means it is a forced error capture
> + * with no actual hang. So, no need to attempt the execlist search.
> + */
> + if (intel_uc_uses_guc_submission(&engine->gt->uc))
> + return;
> +
> + spin_lock_irqsave(&engine->sched_engine->lock, flags);
> + *rq = intel_engine_execlist_find_hung_request(engine);
> + if (*rq)
> + *rq = i915_request_get_rcu(*rq);
> + spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
> +}
> +
> void intel_context_bind_parent_child(struct intel_context *parent,
> struct intel_context *child)
> {
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
> index fb62b7b8cbcda..ca50f3312a941 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> @@ -271,6 +271,9 @@ struct i915_request *intel_context_create_request(struct intel_context *ce);
> struct i915_request *
> intel_context_find_active_request(struct intel_context *ce);
>
> +void intel_get_hung_entity(struct intel_engine_cs *engine,
> + struct intel_context **ce, struct i915_request **rq);
> +
> static inline bool intel_context_is_barrier(const struct intel_context *ce)
> {
> return test_bit(CONTEXT_BARRIER_BIT, &ce->flags);
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 6a082658d0082..5e173dfc8849e 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -2216,11 +2216,27 @@ void intel_engine_dump_active_requests(struct list_head *requests,
> }
> }
>
> -static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m)
> +static void execlist_dump_active_requests(struct intel_engine_cs *engine,
Might be worth moving this to the execlists submission file, to be
symmetrical with the GuC submission one. Other execlists functions are
in this file though, so not a blocker.
> + struct i915_request *hung_rq,
> + struct drm_printer *m)
> {
> + unsigned long flags;
> +
> + spin_lock_irqsave(&engine->sched_engine->lock, flags);
> +
> + intel_engine_dump_active_requests(&engine->sched_engine->requests, hung_rq, m);
> +
> + drm_printf(m, "\tOn hold?: %lu\n",
> + list_count(&engine->sched_engine->hold));
> +
> + spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
> +}
> +
> +static void engine_dump_active_requests(struct intel_engine_cs *engine,
> + struct drm_printer *m)
> +{
> + struct intel_context *hung_ce = NULL;
> struct i915_request *hung_rq = NULL;
> - struct intel_context *ce;
> - bool guc;
>
> /*
> * No need for an engine->irq_seqno_barrier() before the seqno reads.
> @@ -2229,31 +2245,20 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d
> * But the intention here is just to report an instantaneous snapshot
> * so that's fine.
> */
> - lockdep_assert_held(&engine->sched_engine->lock);
> + intel_get_hung_entity(engine, &hung_ce, &hung_rq);
>
> drm_printf(m, "\tRequests:\n");
>
> - guc = intel_uc_uses_guc_submission(&engine->gt->uc);
> - if (guc) {
> - ce = intel_engine_get_hung_context(engine);
> - if (ce) {
> - /* This will reference count the request (if found) */
> - hung_rq = intel_context_find_active_request(ce);
> - }
> - } else {
> - hung_rq = intel_engine_execlist_find_hung_request(engine);
> - if (hung_rq)
> - hung_rq = i915_request_get_rcu(hung_rq);
> - }
> -
> if (hung_rq)
> engine_dump_request(hung_rq, m, "\t\thung");
> + else if (hung_ce)
> + drm_printf(m, "\t\tGot hung ce but no hung rq!\n");
>
> - if (guc)
> + if (intel_uc_uses_guc_submission(&engine->gt->uc))
> intel_guc_dump_active_requests(engine, hung_rq, m);
> else
> - intel_engine_dump_active_requests(&engine->sched_engine->requests,
> - hung_rq, m);
> + execlist_dump_active_requests(engine, hung_rq, m);
> +
> if (hung_rq)
> i915_request_put(hung_rq);
> }
> @@ -2265,7 +2270,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
> struct i915_gpu_error * const error = &engine->i915->gpu_error;
> struct i915_request *rq;
> intel_wakeref_t wakeref;
> - unsigned long flags;
> ktime_t dummy;
>
> if (header) {
> @@ -2302,13 +2306,8 @@ void intel_engine_dump(struct intel_engine_cs *engine,
> i915_reset_count(error));
> print_properties(engine, m);
>
> - spin_lock_irqsave(&engine->sched_engine->lock, flags);
> engine_dump_active_requests(engine, m);
>
> - drm_printf(m, "\tOn hold?: %lu\n",
> - list_count(&engine->sched_engine->hold));
This print moves from the global function to the execlists submission
one. AFAICS this is only used in execlists mode so not an issue, but a
note in the commit message would've been nice.
> - spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
> -
> drm_printf(m, "\tMMIO base: 0x%08x\n", engine->mmio_base);
> wakeref = intel_runtime_pm_get_if_in_use(engine->uncore->rpm);
> if (wakeref) {
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 7ea36478ee52d..78cf95e4dd230 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1596,36 +1596,15 @@ capture_engine(struct intel_engine_cs *engine,
> {
> struct intel_engine_capture_vma *capture = NULL;
> struct intel_engine_coredump *ee;
> - struct intel_context *ce;
> + struct intel_context *ce = NULL;
> struct i915_request *rq = NULL;
> - unsigned long flags;
>
> ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags);
> if (!ee)
> return NULL;
>
> - ce = intel_engine_get_hung_context(engine);
> - if (ce) {
> - intel_engine_clear_hung_context(engine);
> - /* This will reference count the request (if found) */
> - rq = intel_context_find_active_request(ce);
> - if (!rq || !i915_request_started(rq))
> - goto no_request_capture;
> - } else {
> - /*
> - * Getting here with GuC enabled means it is a forced error capture
> - * with no actual hang. So, no need to attempt the execlist search.
> - */
> - if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
> - spin_lock_irqsave(&engine->sched_engine->lock, flags);
> - rq = intel_engine_execlist_find_hung_request(engine);
> - if (rq)
> - rq = i915_request_get_rcu(rq);
> - spin_unlock_irqrestore(&engine->sched_engine->lock,
> - flags);
> - }
> - }
> - if (!rq)
> + intel_get_hung_entity(engine, &ce, &rq);
> + if (!rq || !i915_request_started(rq))
> goto no_request_capture;
This has been a pain to review, but AFAICS all the locking is correct,
so with the get_hung function moved to the intel_engine file this is:
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>
> capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
More information about the dri-devel
mailing list