[Intel-gfx] [PATCH 06/24] drm/i915: Lock the engine while dumping the active request
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Tue Jul 16 10:26:43 UTC 2019
On 15/07/2019 09:09, Chris Wilson wrote:
> We cannot let the request be retired and freed while we are trying to
> dump it during error capture. It is not sufficient just to grab a
> reference to the request, as during retirement we may free the ring
> which we are also dumping. So take the engine lock to prevent retiring
> and freeing of the request.
>
> Reported-by: Alex Shumsky <alexthreed at gmail.com>
> Fixes: 83c317832eb1 ("drm/i915: Dump the ringbuffer of the active request for debugging")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Alex Shumsky <alexthreed at gmail.com>
> ---
> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 11 ++++-------
> drivers/gpu/drm/i915/i915_gpu_error.c | 6 ++++--
> 2 files changed, 8 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 75099500d1ed..5f8909546d36 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1472,6 +1472,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
> struct i915_gpu_error * const error = &engine->i915->gpu_error;
> struct i915_request *rq;
> intel_wakeref_t wakeref;
> + unsigned long flags;
>
> if (header) {
> va_list ap;
> @@ -1491,10 +1492,9 @@ void intel_engine_dump(struct intel_engine_cs *engine,
> i915_reset_engine_count(error, engine),
> i915_reset_count(error));
>
> - rcu_read_lock();
> -
> drm_printf(m, "\tRequests:\n");
>
> + spin_lock_irqsave(&engine->active.lock, flags);
> rq = intel_engine_find_active_request(engine);
> if (rq) {
> print_request(m, rq, "\t\tactive ");
> @@ -1514,8 +1514,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>
> print_request_ring(m, rq);
> }
> -
> - rcu_read_unlock();
> + spin_unlock_irqrestore(&engine->active.lock, flags);
>
> wakeref = intel_runtime_pm_get_if_in_use(&engine->i915->runtime_pm);
> if (wakeref) {
> @@ -1654,7 +1653,6 @@ struct i915_request *
> intel_engine_find_active_request(struct intel_engine_cs *engine)
> {
> struct i915_request *request, *active = NULL;
> - unsigned long flags;
>
> /*
> * We are called by the error capture, reset and to dump engine
> @@ -1667,7 +1665,7 @@ intel_engine_find_active_request(struct intel_engine_cs *engine)
> * At all other times, we must assume the GPU is still running, but
> * we only care about the snapshot of this moment.
> */
> - spin_lock_irqsave(&engine->active.lock, flags);
> + lockdep_assert_held(&engine->active.lock);
> list_for_each_entry(request, &engine->active.requests, sched.link) {
> if (i915_request_completed(request))
> continue;
> @@ -1682,7 +1680,6 @@ intel_engine_find_active_request(struct intel_engine_cs *engine)
> active = request;
> break;
> }
> - spin_unlock_irqrestore(&engine->active.lock, flags);
>
> return active;
> }
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 78e388fa059c..c5b89bf4d616 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1411,6 +1411,7 @@ static void gem_record_rings(struct i915_gpu_state *error)
> struct intel_engine_cs *engine = i915->engine[i];
> struct drm_i915_error_engine *ee = &error->engine[i];
> struct i915_request *request;
> + unsigned long flags;
>
> ee->engine_id = -1;
>
> @@ -1422,10 +1423,11 @@ static void gem_record_rings(struct i915_gpu_state *error)
> error_record_engine_registers(error, engine, ee);
> error_record_engine_execlists(engine, ee);
>
> + spin_lock_irqsave(&engine->active.lock, flags);
> request = intel_engine_find_active_request(engine);
> if (request) {
> struct i915_gem_context *ctx = request->gem_context;
> - struct intel_ring *ring;
> + struct intel_ring *ring = request->ring;
>
> ee->vm = ctx->vm ?: &engine->gt->ggtt->vm;
>
> @@ -1455,7 +1457,6 @@ static void gem_record_rings(struct i915_gpu_state *error)
> ee->rq_post = request->postfix;
> ee->rq_tail = request->tail;
>
> - ring = request->ring;
> ee->cpu_ring_head = ring->head;
> ee->cpu_ring_tail = ring->tail;
> ee->ringbuffer =
> @@ -1463,6 +1464,7 @@ static void gem_record_rings(struct i915_gpu_state *error)
>
> engine_record_requests(engine, request, ee);
> }
> + spin_unlock_irqrestore(&engine->active.lock, flags);
>
> ee->hws_page =
> i915_error_object_create(i915,
>
I missed in the initial read the fact underlying allocations are already
atomic.
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko
More information about the Intel-gfx
mailing list