[RFC PATCH 1/1] drm/amdgpu: Protect the amdgpu_bo_list list with a mutex

Tue Jul 12 08:07:17 UTC 2022

Am 12.07.22 um 07:39 schrieb Luben Tuikov:
> Protect the struct amdgpu_bo_list with a mutex. This is used during command
> submission in order to avoid buffer object corruption as recorded in
> the link below.
>
> Suggested-by: Christian König <christian.koenig at amd.com>
> Cc: Alex Deucher <Alexander.Deucher at amd.com>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky at amd.com>
> Cc: Vitaly Prosyak <Vitaly.Prosyak at amd.com>
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048
> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h |  4 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      | 31 +++++++++++++++++++--
>   3 files changed, 35 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
> index 714178f1b6c6ed..2168163aad2d38 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
> @@ -40,7 +40,7 @@ static void amdgpu_bo_list_free_rcu(struct rcu_head *rcu)
>   {
>   	struct amdgpu_bo_list *list = container_of(rcu, struct amdgpu_bo_list,
>   						   rhead);
> -
> +	mutex_destroy(&list->bo_list_mutex);
>   	kvfree(list);
>   }
>   
> @@ -136,6 +136,7 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp,
>   
>   	trace_amdgpu_cs_bo_status(list->num_entries, total_size);
>   
> +	mutex_init(&list->bo_list_mutex);
>   	*result = list;
>   	return 0;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
> index 044b41f0bfd9ce..717984d4de6858 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
> @@ -48,6 +48,10 @@ struct amdgpu_bo_list {
>   	struct amdgpu_bo *oa_obj;
>   	unsigned first_userptr;
>   	unsigned num_entries;
> +
> +	/* Protect access during command submission.
> +	 */
> +	struct mutex bo_list_mutex;
>   };
>   
>   int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 36ac1f1d11e6b4..0b2932c20ec777 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -517,6 +517,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   			return r;
>   	}
>   
> +	mutex_lock(&p->bo_list->bo_list_mutex);

That lock/unlock placement is not correct and probably the reason why 
you still run into trouble with this patch.

You need to grab the lock before the call to amdgpu_bo_list_get_list() 
and drop it either after a call to ttm_eu_backoff_reservation() or 
ttm_eu_fence_buffer_objects().

If the lock is dropped anywhere in between we would have list corruption 
again.

Regards,
Christian.

> +
>   	/* One for TTM and one for the CS job */
>   	amdgpu_bo_list_for_each_entry(e, p->bo_list)
>   		e->tv.num_shared = 2;
> @@ -544,6 +546,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   		if (!e->user_pages) {
>   			DRM_ERROR("kvmalloc_array failure\n");
>   			r = -ENOMEM;
> +			mutex_unlock(&p->bo_list->bo_list_mutex);
>   			goto out_free_user_pages;
>   		}
>   
> @@ -551,6 +554,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   		if (r) {
>   			kvfree(e->user_pages);
>   			e->user_pages = NULL;
> +			mutex_unlock(&p->bo_list->bo_list_mutex);
>   			goto out_free_user_pages;
>   		}
>   
> @@ -568,6 +572,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   	if (unlikely(r != 0)) {
>   		if (r != -ERESTARTSYS)
>   			DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
> +		mutex_unlock(&p->bo_list->bo_list_mutex);
>   		goto out_free_user_pages;
>   	}
>   
> @@ -580,11 +585,14 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   			e->chain = dma_fence_chain_alloc();
>   			if (!e->chain) {
>   				r = -ENOMEM;
> +				mutex_unlock(&p->bo_list->bo_list_mutex);
>   				goto error_validate;
>   			}
>   		}
>   	}
>   
> +	mutex_unlock(&p->bo_list->bo_list_mutex);

> +
>   	/* Move fence waiting after getting reservation lock of
>   	 * PD root. Then there is no need on a ctx mutex lock.
>   	 */
> @@ -607,6 +615,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   		goto error_validate;
>   	}
>   
> +	mutex_lock(&p->bo_list->bo_list_mutex);
>   	r = amdgpu_cs_list_validate(p, &duplicates);
>   	if (r)
>   		goto error_validate;
> @@ -614,6 +623,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   	r = amdgpu_cs_list_validate(p, &p->validated);
>   	if (r)
>   		goto error_validate;
> +	mutex_unlock(&p->bo_list->bo_list_mutex);
>   
>   	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
>   				     p->bytes_moved_vis);
> @@ -644,15 +654,18 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   
>   error_validate:
>   	if (r) {
> +		mutex_lock(&p->bo_list->bo_list_mutex);
>   		amdgpu_bo_list_for_each_entry(e, p->bo_list) {
>   			dma_fence_chain_free(e->chain);
>   			e->chain = NULL;
>   		}
>   		ttm_eu_backoff_reservation(&p->ticket, &p->validated);
> +		mutex_unlock(&p->bo_list->bo_list_mutex);
>   	}
>   
>   out_free_user_pages:
>   	if (r) {
> +		mutex_lock(&p->bo_list->bo_list_mutex);
>   		amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
>   			struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
>   
> @@ -662,6 +675,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   			kvfree(e->user_pages);
>   			e->user_pages = NULL;
>   		}
> +		mutex_unlock(&p->bo_list->bo_list_mutex);
>   	}
>   	return r;
>   }
> @@ -704,6 +718,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
>   	if (error && backoff) {
>   		struct amdgpu_bo_list_entry *e;
>   
> +		mutex_lock(&parser->bo_list->bo_list_mutex);
>   		amdgpu_bo_list_for_each_entry(e, parser->bo_list) {
>   			dma_fence_chain_free(e->chain);
>   			e->chain = NULL;
> @@ -711,6 +726,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
>   
>   		ttm_eu_backoff_reservation(&parser->ticket,
>   					   &parser->validated);
> +		mutex_unlock(&parser->bo_list->bo_list_mutex);
>   	}
>   
>   	for (i = 0; i < parser->num_post_deps; i++) {
> @@ -839,6 +855,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>   			return r;
>   	}
>   
> +	mutex_lock(&p->bo_list->bo_list_mutex);
>   	amdgpu_bo_list_for_each_entry(e, p->bo_list) {
>   		/* ignore duplicates */
>   		bo = ttm_to_amdgpu_bo(e->tv.bo);
> @@ -850,13 +867,18 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>   			continue;
>   
>   		r = amdgpu_vm_bo_update(adev, bo_va, false);
> -		if (r)
> +		if (r) {
> +			mutex_unlock(&p->bo_list->bo_list_mutex);
>   			return r;
> +		}
>   
>   		r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
> -		if (r)
> +		if (r) {
> +			mutex_unlock(&p->bo_list->bo_list_mutex);
>   			return r;
> +		}
>   	}
> +	mutex_unlock(&p->bo_list->bo_list_mutex);
>   
>   	r = amdgpu_vm_handle_moved(adev, vm);
>   	if (r)
> @@ -874,6 +896,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>   
>   	if (amdgpu_vm_debug) {
>   		/* Invalidate all BOs to test for userspace bugs */
> +		mutex_lock(&p->bo_list->bo_list_mutex);
>   		amdgpu_bo_list_for_each_entry(e, p->bo_list) {
>   			struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
>   
> @@ -883,6 +906,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>   
>   			amdgpu_vm_bo_invalidate(adev, bo, false);
>   		}
> +		mutex_unlock(&p->bo_list->bo_list_mutex);
>   	}
>   
>   	return amdgpu_cs_sync_rings(p);
> @@ -1249,6 +1273,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   	 * added to BOs.
>   	 */
>   	mutex_lock(&p->adev->notifier_lock);
> +	mutex_lock(&p->bo_list->bo_list_mutex);
>   
>   	/* If userptr are invalidated after amdgpu_cs_parser_bos(), return
>   	 * -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.
> @@ -1308,12 +1333,14 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   	}
>   
>   	ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
> +	mutex_unlock(&p->bo_list->bo_list_mutex);
>   	mutex_unlock(&p->adev->notifier_lock);
>   
>   	return 0;
>   
>   error_abort:
>   	drm_sched_job_cleanup(&job->base);
> +	mutex_unlock(&p->bo_list->bo_list_mutex);
>   	mutex_unlock(&p->adev->notifier_lock);
>   
>   error_unlock: