[Intel-gfx] [PATCH 03/18] drm/i915: Lock the engine while dumping the active request

Thu Jul 11 07:20:59 UTC 2019

On 11/07/2019 07:52, Chris Wilson wrote:
> We cannot let the request be retired and freed while we are trying to
> dump it during error capture. It is not sufficient just to grab a
> reference to the request, as during retirement we may free the ring
> which we are also dumping. So take the engine lock to prevent retiring
> and freeing of the request.
> 
> Reported-by: Alex Shumsky <alexthreed at gmail.com>
> Fixes: 83c317832eb1 ("drm/i915: Dump the ringbuffer of the active request for debugging")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Alex Shumsky <alexthreed at gmail.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_cs.c | 11 ++++-------
>   drivers/gpu/drm/i915/i915_gpu_error.c     |  6 ++++--
>   2 files changed, 8 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index bdf279fa3b2e..0a2e7a404c74 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1484,6 +1484,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>   	struct i915_gpu_error * const error = &engine->i915->gpu_error;
>   	struct i915_request *rq;
>   	intel_wakeref_t wakeref;
> +	unsigned long flags;
>   
>   	if (header) {
>   		va_list ap;
> @@ -1503,10 +1504,9 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>   		   i915_reset_engine_count(error, engine),
>   		   i915_reset_count(error));
>   
> -	rcu_read_lock();
> -
>   	drm_printf(m, "\tRequests:\n");
>   
> +	spin_lock_irqsave(&engine->active.lock, flags);
>   	rq = intel_engine_find_active_request(engine);
>   	if (rq) {
>   		print_request(m, rq, "\t\tactive ");
> @@ -1526,8 +1526,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>   
>   		print_request_ring(m, rq);
>   	}
> -
> -	rcu_read_unlock();
> +	spin_unlock_irqrestore(&engine->active.lock, flags);
>   
>   	wakeref = intel_runtime_pm_get_if_in_use(&engine->i915->runtime_pm);
>   	if (wakeref) {
> @@ -1689,7 +1688,6 @@ struct i915_request *
>   intel_engine_find_active_request(struct intel_engine_cs *engine)
>   {
>   	struct i915_request *request, *active = NULL;
> -	unsigned long flags;
>   
>   	/*
>   	 * We are called by the error capture, reset and to dump engine
> @@ -1702,7 +1700,7 @@ intel_engine_find_active_request(struct intel_engine_cs *engine)
>   	 * At all other times, we must assume the GPU is still running, but
>   	 * we only care about the snapshot of this moment.
>   	 */
> -	spin_lock_irqsave(&engine->active.lock, flags);
> +	lockdep_assert_held(&engine->active.lock);
>   	list_for_each_entry(request, &engine->active.requests, sched.link) {
>   		if (i915_request_completed(request))
>   			continue;
> @@ -1717,7 +1715,6 @@ intel_engine_find_active_request(struct intel_engine_cs *engine)
>   		active = request;
>   		break;
>   	}
> -	spin_unlock_irqrestore(&engine->active.lock, flags);
>   
>   	return active;
>   }
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 5489cd879315..f297a43df1e9 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1411,6 +1411,7 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   		struct intel_engine_cs *engine = i915->engine[i];
>   		struct drm_i915_error_engine *ee = &error->engine[i];
>   		struct i915_request *request;
> +		unsigned long flags;
>   
>   		ee->engine_id = -1;
>   
> @@ -1422,10 +1423,11 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   		error_record_engine_registers(error, engine, ee);
>   		error_record_engine_execlists(engine, ee);
>   
> +		spin_lock_irqsave(&engine->active.lock, flags);
>   		request = intel_engine_find_active_request(engine);
>   		if (request) {
>   			struct i915_gem_context *ctx = request->gem_context;
> -			struct intel_ring *ring;
> +			struct intel_ring *ring = request->ring;
>   
>   			ee->vm = ctx->vm ?: &engine->gt->ggtt->vm;
>   
> @@ -1455,7 +1457,6 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   			ee->rq_post = request->postfix;
>   			ee->rq_tail = request->tail;
>   
> -			ring = request->ring;
>   			ee->cpu_ring_head = ring->head;
>   			ee->cpu_ring_tail = ring->tail;
>   			ee->ringbuffer =
> @@ -1463,6 +1464,7 @@ static void gem_record_rings(struct i915_gpu_state *error)
>   

Aren't there allocations in this block?

>   			engine_record_requests(engine, request, ee);
>   		}
> +		spin_unlock_irqrestore(&engine->active.lock, flags);
>   
>   		ee->hws_page =
>   			i915_error_object_create(i915,
> 

Regards,

Tvrtko