[PATCH v2 2/2] drm/amdgpu: Move old fence waiting before reservation lock is acquired.

Christian König christian.koenig at amd.com
Wed Oct 11 07:57:33 UTC 2017


Am 10.10.2017 um 22:50 schrieb Andrey Grodzovsky:
> Helps avoid deadlock during GPU reset.
> Added mutex to amdgpu_ctx to preserve order of fences on a ring.
>
> v2:
> Put waiting logic in a separate function in amdgpu_ctx.c
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  8 ++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 30 ++++++++++++++++++++++++------
>   3 files changed, 34 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index da48f97..235eca5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -741,6 +741,7 @@ struct amdgpu_ctx {
>   	bool 			preamble_presented;
>   	enum amd_sched_priority init_priority;
>   	enum amd_sched_priority override_priority;
> +	struct mutex            lock;
>   };
>   
>   struct amdgpu_ctx_mgr {
> @@ -763,9 +764,12 @@ void amdgpu_ctx_priority_override(struct amdgpu_ctx *ctx,
>   int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
>   		     struct drm_file *filp);
>   
> +int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned ring_id);
> +
>   void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
>   void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
>   
> +
>   /*
>    * file private structure
>    */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 1a54e53..c36297c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -90,6 +90,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>   		goto free_chunk;
>   	}
>   
> +	mutex_lock(&p->ctx->lock);
> +

There is a bug in amdgpu_cs_parser_init(), take a look at the put_ctx 
label. It calls amdgpu_ctx_put() without setting p->ctx to NULL after that.

This way amdgpu_cs_parser_fini() will call amdgpu_ctx_put() again and 
mess up the reference count.

That is not a bug introduced by this patch, so this one is Reviewed-by: 
Christian König <christian.koenig at amd.com>.

But please provide a follow up patch just removing the extra 
amdgpu_ctx_put() from amdgpu_cs_parser_init().

Thanks for the help,
Christian.

>   	/* get chunks */
>   	chunk_array_user = u64_to_user_ptr(cs->in.chunks);
>   	if (copy_from_user(chunk_array, chunk_array_user,
> @@ -737,8 +739,10 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
>   
>   	dma_fence_put(parser->fence);
>   
> -	if (parser->ctx)
> +	if (parser->ctx) {
> +		mutex_unlock(&parser->ctx->lock);
>   		amdgpu_ctx_put(parser->ctx);
> +	}
>   
>   	if (parser->bo_list)
>   		amdgpu_bo_list_put(parser->bo_list);
> @@ -992,7 +996,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>   	    parser->job->ring->funcs->type == AMDGPU_RING_TYPE_VCE))
>   		return -EINVAL;
>   
> -	return 0;
> +	return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->job->ring->idx);
>   }
>   
>   static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index a78b03f6..4309820 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -67,6 +67,8 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
>   	if (!ctx->fences)
>   		return -ENOMEM;
>   
> +	mutex_init(&ctx->lock);
> +
>   	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   		ctx->rings[i].sequence = 1;
>   		ctx->rings[i].fences = &ctx->fences[amdgpu_sched_jobs * i];
> @@ -126,6 +128,8 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
>   				      &ctx->rings[i].entity);
>   
>   	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
> +
> +	mutex_destroy(&ctx->lock);
>   }
>   
>   static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
> @@ -296,12 +300,8 @@ int amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
>   
>   	idx = seq & (amdgpu_sched_jobs - 1);
>   	other = cring->fences[idx];
> -	if (other) {
> -		signed long r;
> -		r = dma_fence_wait_timeout(other, true, MAX_SCHEDULE_TIMEOUT);
> -		if (r < 0)
> -			return r;
> -	}
> +	if (other)
> +		BUG_ON(!dma_fence_is_signaled(other));
>   
>   	dma_fence_get(fence);
>   
> @@ -372,6 +372,24 @@ void amdgpu_ctx_priority_override(struct amdgpu_ctx *ctx,
>   	}
>   }
>   
> +int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned ring_id)
> +{
> +	struct amdgpu_ctx_ring *cring = &ctx->rings[ring_id];
> +	unsigned idx = cring->sequence & (amdgpu_sched_jobs - 1);
> +	struct dma_fence *other = cring->fences[idx];
> +
> +	if (other) {
> +		signed long r;
> +		r = dma_fence_wait_timeout(other, false, MAX_SCHEDULE_TIMEOUT);
> +		if (r < 0) {
> +			DRM_ERROR("Error (%ld) waiting for fence!\n", r);
> +			return r;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>   void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
>   {
>   	mutex_init(&mgr->lock);




More information about the amd-gfx mailing list