[PATCH 2/2] drm/amdgpu: fix gang submission error handling

Tvrtko Ursulin tvrtko.ursulin at igalia.com
Mon Mar 24 08:38:07 UTC 2025


On 21/03/2025 15:58, Christian König wrote:
> For the unlikely case that we ran into an ENOMEM while fixing up the gang
> submission dependencies we can't clean up any more since the gang
> members are already armed.
> 
> Fix this by using pre-allocated dependency slots and re-ordering the
> code, also fix a double unref since the fence reference is also dropped
> on error.
> 
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 57 +++++++++++++++-----------
>   1 file changed, 33 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 5cc5f59e3018..25e7f7d356d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1285,36 +1285,21 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   	uint64_t seq;
>   	int r;
>   
> -	for (i = 0; i < p->gang_size; ++i)
> -		drm_sched_job_arm(&p->jobs[i]->base);
> -
> -	for (i = 0; i < p->gang_size; ++i) {
> -		struct dma_fence *fence;
> -
> -		if (p->jobs[i] == leader)
> -			continue;
> -
> -		fence = &p->jobs[i]->base.s_fence->scheduled;
> -		dma_fence_get(fence);
> -		r = drm_sched_job_add_dependency(&leader->base, fence);
> -		if (r) {
> -			dma_fence_put(fence);
> -			return r;
> -		}
> -	}
> -
> -	if (p->gang_size > 1) {
> -		for (i = 0; i < p->gang_size; ++i)
> -			amdgpu_job_set_gang_leader(p->jobs[i], leader);
> -	}
> +	/* Preallocate the memory for the gang dependencies */
> +	r = drm_sched_job_prealloc_dependency_slots(&leader->base,
> +						    p->gang_size - 1);
> +	if (r)
> +		return r;
>   
> -	/* No memory allocation is allowed while holding the notifier lock.
> +	/*
> +	 * No memory allocation is allowed while holding the notifier lock.
>   	 * The lock is held until amdgpu_cs_submit is finished and fence is
>   	 * added to BOs.
>   	 */
>   	mutex_lock(&p->adev->notifier_lock);
>   
> -	/* If userptr are invalidated after amdgpu_cs_parser_bos(), return
> +	/*
> +	 * If userptr are invalidated after amdgpu_cs_parser_bos(), return
>   	 * -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.
>   	 */
>   	r = 0;
> @@ -1329,6 +1314,30 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   		return r;
>   	}
>   
> +	for (i = 0; i < p->gang_size; ++i)
> +		drm_sched_job_arm(&p->jobs[i]->base);
> +
> +	for (i = 0; i < p->gang_size; ++i) {
> +		struct dma_fence *fence;
> +
> +		if (p->jobs[i] == leader)
> +			continue;
> +
> +		fence = dma_fence_get(&p->jobs[i]->base.s_fence->scheduled);
> +		r = drm_sched_job_add_dependency(&leader->base, fence);
> +		/*
> +		 * We can't abort here with an error any more, but we should
> +		 * also never run into an error since the slots for the
> +		 * dependency fences are preallocated.
> +		 */
> +		WARN_ON(r);
> +	}
> +
> +	if (p->gang_size > 1) {
> +		for (i = 0; i < p->gang_size; ++i)
> +			amdgpu_job_set_gang_leader(p->jobs[i], leader);
> +	}
> +
>   	p->fence = dma_fence_get(&leader->base.s_fence->finished);
>   	drm_exec_for_each_locked_object(&p->exec, index, gobj) {
>   

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>

Regards,

Tvrtko



More information about the amd-gfx mailing list