[PATCH 1/4] drm/sched: optimize drm_sched_job_add_dependency

Philipp Stanner phasta at mailbox.org
Mon May 26 07:28:02 UTC 2025


On Fri, 2025-05-23 at 14:56 +0200, Christian König wrote:
> It turned out that we can actually massively optimize here.
> 
> The previous code was horribly inefficient since it constantly
> released and re-acquired the xarray's lock and started each
> iteration from the base of the array to avoid concurrent
> modification, which in our case doesn't exist.
> 
> In addition, the xas_find() and xas_store() functions are explicitly
> designed so that you can efficiently check entries and, if you don't
> find a match, store a new one at the end or replace existing ones.
> 
> So use xas_for_each()/xas_store() instead of xa_for_each()/xa_alloc().
> It's a bit more code, but should be much faster in the end.
> 
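For context: the XArray's simple API (xa_for_each(), xa_store(), xa_alloc())
takes the needed locking internally on every call, while the advanced
xas_*() API lets the caller hold the lock across a whole walk and reuse the
cursor position for the final store. The canonical store-with-retry shape
from Documentation/core-api/xarray.rst looks roughly like the sketch below;
the function name and parameters are made up for illustration, only the
XA_STATE()/xas_*() calls are the real API:

    /* Illustrative sketch: store @item at @index, retrying on -ENOMEM. */
    static int store_with_retry(struct xarray *xa, unsigned long index,
                                void *item)
    {
            XA_STATE(xas, xa, index);

            do {
                    xas_lock(&xas);
                    xas_store(&xas, item);  /* may record -ENOMEM in @xas */
                    xas_unlock(&xas);
            } while (xas_nomem(&xas, GFP_KERNEL)); /* allocate outside lock */

            return xas_error(&xas);
    }
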
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 29 ++++++++++++++++++--------
>  1 file changed, 20 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index f7118497e47a..cf200b1b643e 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -871,10 +871,8 @@ EXPORT_SYMBOL(drm_sched_job_arm);
>  int drm_sched_job_add_dependency(struct drm_sched_job *job,
>  				 struct dma_fence *fence)
>  {
> +	XA_STATE(xas, &job->dependencies, 0);
>  	struct dma_fence *entry;
> -	unsigned long index;
> -	u32 id = 0;
> -	int ret;
>  
>  	if (!fence)
>  		return 0;
> @@ -883,24 +881,37 @@ int drm_sched_job_add_dependency(struct drm_sched_job *job,
>  	 * This lets the size of the array of deps scale with the number of
>  	 */
> -	xa_for_each(&job->dependencies, index, entry) {
> +	xa_lock(&job->dependencies);
> +	xas_for_each(&xas, entry, ULONG_MAX) {
>  		if (entry->context != fence->context)
>  			continue;
>  
>  		if (dma_fence_is_later(fence, entry)) {
>  			dma_fence_put(entry);
> -			xa_store(&job->dependencies, index, fence,
> GFP_KERNEL);
> +			xas_store(&xas, fence);
>  		} else {
>  			dma_fence_put(fence);
>  		}
> -		return 0;
> +		xa_unlock(&job->dependencies);
> +		return xas_error(&xas);
>  	}
>  
> -	ret = xa_alloc(&job->dependencies, &id, fence, xa_limit_32b, GFP_KERNEL);
> -	if (ret != 0)
> +retry:
> +	entry = xas_store(&xas, fence);
> +	xa_unlock(&job->dependencies);
> +
> +	/* There shouldn't be any concurrent add, so no need to loop again */

Should we maybe add a note to the function documentation that this must
not be called concurrently?

Looks to me as if the current version were already broken if someone
does that. So maybe it's also OK to just leave it as is.
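
For illustration, such a note could be a single sentence in the kernel-doc
above the function (wording just a suggestion):

     * Note that this function must not be called concurrently on the same
     * &struct drm_sched_job; callers are responsible for serialization.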


P.


> +	if (xas_nomem(&xas, GFP_KERNEL)) {
> +		xa_lock(&job->dependencies);
> +		goto retry;
> +	}
> +
> +	if (xas_error(&xas))
>  		dma_fence_put(fence);
> +	else
> +		WARN_ON(entry);
>  
> -	return ret;
> +	return xas_error(&xas);
>  }
>  EXPORT_SYMBOL(drm_sched_job_add_dependency);
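
One more point worth noting for callers, independent of this patch: the
function's kernel-doc states that @fence is consumed in both the success
and the error case. So a hypothetical caller that wants to keep using the
fence afterwards has to take its own extra reference before handing one
over, roughly like this:

    /* Sketch of a caller; "job", "fence" and "ret" come from driver code. */
    dma_fence_get(fence);           /* this reference is handed to the job */
    ret = drm_sched_job_add_dependency(job, fence);
    if (ret)
            return ret;             /* our ref was already dropped internally */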
>  


