[PATCH 1/2] drm/amdgpu:resolv deadlock between reset and cs_ioctl v4.

Mon Oct 9 08:34:29 UTC 2017

Am 06.10.2017 um 20:20 schrieb Andrey Grodzovsky:
> From: Monk Liu <Monk.Liu at amd.com>
>
> need to unreserve ttm bo before "cs_add_fence" and "entity_push_job"
> otherwise there will be deadlock between "recover_vram_from_shadow"
> and previous two routines on the ttm bo's resv lock.
>
> v2:
> Add per ctx mutex.
>
> v3:
> Rellocate mutex aquisition into amdgpu_cs_parser_init and muex release
> into amdgpu_cs_parser_fini to avoid nested locking lockup.
> Add rollback code for amdgpu_ctx_add_fence in case of error or signal
> interruption.
>
> v4:
> Refactor amdgpu_cs_ib_vm_chunk and amdgpu_cs_ib_fill to enable
> old fence waiting before reservation lock is aquired.
>
> Change-Id: Ia209beab5036bfc2c38cbf18324fa3efd4bab1cf
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |   1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 164 ++++++++++++++++++--------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |   4 +
>   3 files changed, 100 insertions(+), 69 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 53d8df3..baa2953 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -724,6 +724,7 @@ struct amdgpu_ctx {
>   	struct dma_fence	**fences;
>   	struct amdgpu_ctx_ring	rings[AMDGPU_MAX_RINGS];
>   	bool preamble_presented;
> +	struct mutex		lock;
>   };
>   
>   struct amdgpu_ctx_mgr {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 9f1202a..0fa1bc7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -89,6 +89,9 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>   		goto free_chunk;
>   	}
>   
> +
> +	mutex_lock(&p->ctx->lock);
> +
>   	/* get chunks */
>   	chunk_array_user = u64_to_user_ptr(cs->in.chunks);
>   	if (copy_from_user(chunk_array, chunk_array_user,
> @@ -715,28 +718,21 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
>   /**
>    * cs_parser_fini() - clean parser states
>    * @parser:	parser structure holding parsing context.
> - * @error:	error number
> - *
> - * If error is set than unvalidate buffer, otherwise just free memory
> - * used by parsing context.
>    **/
> -static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
> -				  bool backoff)
> +static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser)

You can now drop this change and keep the logic as it is since we moved 
the wait before taking the reservation locks.

>   {
>   	unsigned i;
>   
> -	if (error && backoff)
> -		ttm_eu_backoff_reservation(&parser->ticket,
> -					   &parser->validated);
> -
>   	for (i = 0; i < parser->num_post_dep_syncobjs; i++)
>   		drm_syncobj_put(parser->post_dep_syncobjs[i]);
>   	kfree(parser->post_dep_syncobjs);
>   
>   	dma_fence_put(parser->fence);
>   
> -	if (parser->ctx)
> +	if (parser->ctx) {
> +		mutex_unlock(&parser->ctx->lock);
>   		amdgpu_ctx_put(parser->ctx);
> +	}
>   	if (parser->bo_list)
>   		amdgpu_bo_list_put(parser->bo_list);
>   
> @@ -843,7 +839,72 @@ static int amdgpu_cs_ib_vm_chunk(struct amdgpu_device *adev,
>   	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>   	struct amdgpu_vm *vm = &fpriv->vm;
>   	struct amdgpu_ring *ring = p->job->ring;
> -	int i, r;
> +	int i, j, r;
> +
> +	for (i = 0, j = 0; i < p->nchunks && j < p->job->num_ibs; i++) {
> +
> +		struct amdgpu_cs_chunk *chunk;
> +		struct amdgpu_ib *ib;
> +		struct drm_amdgpu_cs_chunk_ib *chunk_ib;
> +
> +		chunk = &p->chunks[i];
> +		ib = &p->job->ibs[j];
> +		chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata;
> +
> +		if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)
> +					continue;
> +
> +		if (p->job->ring->funcs->parse_cs) {
> +			struct amdgpu_bo_va_mapping *m;
> +			struct amdgpu_bo *aobj = NULL;
> +			uint64_t offset;
> +			uint8_t *kptr;
> +
> +			r = amdgpu_cs_find_mapping(p, chunk_ib->va_start,
> +						   &aobj, &m);
> +			if (r) {
> +				DRM_ERROR("IB va_start is invalid\n");
> +				return r;
> +			}
> +
> +			if ((chunk_ib->va_start + chunk_ib->ib_bytes) >
> +				(m->last + 1) * AMDGPU_GPU_PAGE_SIZE) {
> +				DRM_ERROR("IB va_start+ib_bytes is invalid\n");
> +				return -EINVAL;
> +			}
> +
> +			/* the IB should be reserved at this point */
> +			r = amdgpu_bo_kmap(aobj, (void **)&kptr);
> +			if (r) {
> +				return r;
> +			}
> +
> +			offset = m->start * AMDGPU_GPU_PAGE_SIZE;
> +			kptr += chunk_ib->va_start - offset;
> +
> +			r =  amdgpu_ib_get(adev, vm, chunk_ib->ib_bytes, ib);
> +			if (r) {
> +				DRM_ERROR("Failed to get ib !\n");
> +				return r;
> +			}
> +
> +			memcpy(ib->ptr, kptr, chunk_ib->ib_bytes);
> +			amdgpu_bo_kunmap(aobj);
> +		} else {
> +			r =  amdgpu_ib_get(adev, vm, 0, ib);
> +			if (r) {
> +				DRM_ERROR("Failed to get ib !\n");
> +				return r;
> +			}
> +
> +		}
> +
> +		ib->gpu_addr = chunk_ib->va_start;
> +		ib->length_dw = chunk_ib->ib_bytes / 4;
> +		ib->flags = chunk_ib->flags;

Please keep the calls to amdgpu_ib_get() inside amdgpu_cs_ib_fill().

> +		j++;
> +
> +	}
>   
>   	/* Only for UVD/VCE VM emulation */
>   	if (ring->funcs->parse_cs) {
> @@ -868,19 +929,15 @@ static int amdgpu_cs_ib_vm_chunk(struct amdgpu_device *adev,
>   static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>   			     struct amdgpu_cs_parser *parser)
>   {
> -	struct amdgpu_fpriv *fpriv = parser->filp->driver_priv;
> -	struct amdgpu_vm *vm = &fpriv->vm;
>   	int i, j;
>   	int r, ce_preempt = 0, de_preempt = 0;
>   
>   	for (i = 0, j = 0; i < parser->nchunks && j < parser->job->num_ibs; i++) {
>   		struct amdgpu_cs_chunk *chunk;
> -		struct amdgpu_ib *ib;
>   		struct drm_amdgpu_cs_chunk_ib *chunk_ib;
>   		struct amdgpu_ring *ring;
>   
>   		chunk = &parser->chunks[i];
> -		ib = &parser->job->ibs[j];
>   		chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata;
>   
>   		if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)
> @@ -917,54 +974,6 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>   
>   		parser->job->ring = ring;
>   
> -		if (ring->funcs->parse_cs) {
> -			struct amdgpu_bo_va_mapping *m;
> -			struct amdgpu_bo *aobj = NULL;
> -			uint64_t offset;
> -			uint8_t *kptr;
> -
> -			r = amdgpu_cs_find_mapping(parser, chunk_ib->va_start,
> -						   &aobj, &m);
> -			if (r) {
> -				DRM_ERROR("IB va_start is invalid\n");
> -				return r;
> -			}
> -
> -			if ((chunk_ib->va_start + chunk_ib->ib_bytes) >
> -			    (m->last + 1) * AMDGPU_GPU_PAGE_SIZE) {
> -				DRM_ERROR("IB va_start+ib_bytes is invalid\n");
> -				return -EINVAL;
> -			}
> -
> -			/* the IB should be reserved at this point */
> -			r = amdgpu_bo_kmap(aobj, (void **)&kptr);
> -			if (r) {
> -				return r;
> -			}
> -
> -			offset = m->start * AMDGPU_GPU_PAGE_SIZE;
> -			kptr += chunk_ib->va_start - offset;
> -
> -			r =  amdgpu_ib_get(adev, vm, chunk_ib->ib_bytes, ib);
> -			if (r) {
> -				DRM_ERROR("Failed to get ib !\n");
> -				return r;
> -			}
> -
> -			memcpy(ib->ptr, kptr, chunk_ib->ib_bytes);
> -			amdgpu_bo_kunmap(aobj);
> -		} else {
> -			r =  amdgpu_ib_get(adev, vm, 0, ib);
> -			if (r) {
> -				DRM_ERROR("Failed to get ib !\n");
> -				return r;
> -			}
> -
> -		}
> -
> -		ib->gpu_addr = chunk_ib->va_start;
> -		ib->length_dw = chunk_ib->ib_bytes / 4;
> -		ib->flags = chunk_ib->flags;
>   		j++;
>   	}
>   
> @@ -1160,14 +1169,26 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   
>   	amdgpu_cs_post_dependencies(p);
>   
> +
> +	/* hook sched fence to all BOs' reservation in validated list
> +	 * and unreserve them.
> +	 *
> +	 * we unreserve at here is because otherwise
> +	 * there'll be deadlock between ctx_add_fence/sched_entity_push_job
> +	 * and gpu_reset routine's recover_bo_from_shadow on PD/PTEs' ttm bo lock
> +	 */
> +	ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
> +
> +
>   	cs->out.handle = amdgpu_ctx_add_fence(p->ctx, ring, p->fence);
> +
> +

Please drop that change. It isn't necessary any more after you moved the 
waiting outside of the BO lock.

Dito for most of the other changes as well.

Regards,
Christian.

>   	job->uf_sequence = cs->out.handle;
>   	amdgpu_job_free_resources(job);
>   
>   	trace_amdgpu_cs_ioctl(job);
>   	amd_sched_entity_push_job(&job->base);
>   
> -	ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
>   	amdgpu_mn_unlock(p->mn);
>   
>   	return 0;
> @@ -1189,6 +1210,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   
>   	parser.adev = adev;
>   	parser.filp = filp;
> +	fpriv = filp->driver_priv;
>   
>   	r = amdgpu_cs_parser_init(&parser, data);
>   	if (r) {
> @@ -1196,6 +1218,10 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   		goto out;
>   	}
>   
> +	r = amdgpu_cs_ib_fill(adev, &parser);
> +	if (r)
> +		goto out;
> +
>   	r = amdgpu_cs_parser_bos(&parser, data);
>   	if (r) {
>   		if (r == -ENOMEM)
> @@ -1206,9 +1232,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   	}
>   
>   	reserved_buffers = true;
> -	r = amdgpu_cs_ib_fill(adev, &parser);
> -	if (r)
> -		goto out;
>   
>   	r = amdgpu_cs_dependencies(adev, &parser);
>   	if (r) {
> @@ -1226,7 +1249,10 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   	r = amdgpu_cs_submit(&parser, cs);
>   
>   out:
> -	amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
> +	if (r && reserved_buffers)
> +		ttm_eu_backoff_reservation(&parser.ticket, &parser.validated);
> +
> +	amdgpu_cs_parser_fini(&parser);
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index a11e443..c073a68 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -39,6 +39,8 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, struct amdgpu_ctx *ctx)
>   	if (!ctx->fences)
>   		return -ENOMEM;
>   
> +	mutex_init(&ctx->lock);
> +
>   	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   		ctx->rings[i].sequence = 1;
>   		ctx->rings[i].fences = &ctx->fences[amdgpu_sched_jobs * i];
> @@ -96,6 +98,8 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
>   				      &ctx->rings[i].entity);
>   
>   	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
> +
> +	mutex_destroy(&ctx->lock);
>   }
>   
>   static int amdgpu_ctx_alloc(struct amdgpu_device *adev,