[PATCH 1/4] drm/sched: optimize drm_sched_job_add_dependency

Tvrtko Ursulin tursulin at ursulin.net
Fri May 23 13:49:59 UTC 2025


On 23/05/2025 13:56, Christian König wrote:
> It turns out that we can massively optimize this function.
> 
> The previous code was horribly inefficient since it constantly released
> and re-acquired the xarray's lock and restarted each iteration from the
> base of the array to guard against concurrent modification, which in our
> case doesn't exist.
> 
> In addition, the xas_find() and xas_store() functions are explicitly
> designed so that you can efficiently check entries and, if no match is
> found, store a new one at the end or replace an existing one.
> 
> So use xas_for_each()/xas_store() instead of xa_for_each()/xa_alloc().
> It's a bit more code, but should be much faster in the end.
> 
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 29 ++++++++++++++++++--------
>   1 file changed, 20 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index f7118497e47a..cf200b1b643e 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -871,10 +871,8 @@ EXPORT_SYMBOL(drm_sched_job_arm);
>   int drm_sched_job_add_dependency(struct drm_sched_job *job,
>   				 struct dma_fence *fence)
>   {
> +	XA_STATE(xas, &job->dependencies, 0);
>   	struct dma_fence *entry;
> -	unsigned long index;
> -	u32 id = 0;
> -	int ret;
>   
>   	if (!fence)
>   		return 0;
> @@ -883,24 +881,37 @@ int drm_sched_job_add_dependency(struct drm_sched_job *job,
>   	 * This lets the size of the array of deps scale with the number of
>   	 * engines involved, rather than the number of BOs.
>   	 */
> -	xa_for_each(&job->dependencies, index, entry) {
> +	xa_lock(&job->dependencies);
> +	xas_for_each(&xas, entry, ULONG_MAX) {
>   		if (entry->context != fence->context)
>   			continue;
>   
>   		if (dma_fence_is_later(fence, entry)) {
>   			dma_fence_put(entry);
> -			xa_store(&job->dependencies, index, fence, GFP_KERNEL);
> +			xas_store(&xas, fence);
>   		} else {
>   			dma_fence_put(fence);
>   		}
> -		return 0;
> +		xa_unlock(&job->dependencies);
> +		return xas_error(&xas);
>   	}
>   
> -	ret = xa_alloc(&job->dependencies, &id, fence, xa_limit_32b, GFP_KERNEL);
> -	if (ret != 0)
> +retry:
> +	entry = xas_store(&xas, fence);
> +	xa_unlock(&job->dependencies);
> +
> +	/* There shouldn't be any concurrent add, so no need to loop again */
> +	if (xas_nomem(&xas, GFP_KERNEL)) {
> +		xa_lock(&job->dependencies);
> +		goto retry;
> +	}
> +
> +	if (xas_error(&xas))
>   		dma_fence_put(fence);
> +	else
> +		WARN_ON(entry);

Looks good, I cannot spot a high level problem with this approach.
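
For anyone following along: if I read the xarray documentation right, the
xas_nomem() dance at the end mirrors the usual advanced-API pattern from
Documentation/core-api/xarray.rst. xas_nomem() has to run with the lock
dropped so the GFP_KERNEL allocation can sleep, and it returns true when
it allocated the missing memory and the store should be retried. Generic
sketch with a placeholder item, not this function's fence:

do {
	xas_lock(&xas);
	xas_store(&xas, item);
	xas_unlock(&xas);
} while (xas_nomem(&xas, GFP_KERNEL));

The patch just open-codes this loop with the goto because the first
xas_store() runs under the xa_lock still held from the search loop.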

Maybe only the tail end of this function could be improved with something
like this:

...
if (xas_nomem(&xas, GFP_KERNEL)) {
	xa_lock(&job->dependencies);
	goto retry;
}

err = xas_error(&xas);
if (WARN_ON(!err && entry))
	dma_fence_put(entry);
else if (err)
	dma_fence_put(fence);

return err;
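
That also avoids leaking the unexpected stale entry if the WARN_ON ever
fires, and leaves a single return path. For completeness, the whole tail
would then read something like this (untested, and assumes an "int err"
local is added back at the top of the function):

retry:
	entry = xas_store(&xas, fence);
	xa_unlock(&job->dependencies);

	/* There shouldn't be any concurrent add, so no need to loop again */
	if (xas_nomem(&xas, GFP_KERNEL)) {
		xa_lock(&job->dependencies);
		goto retry;
	}

	err = xas_error(&xas);
	if (WARN_ON(!err && entry))
		dma_fence_put(entry);
	else if (err)
		dma_fence_put(fence);

	return err;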

Thoughts?


>   
> -	return ret;
> +	return xas_error(&xas);
>   }
>   EXPORT_SYMBOL(drm_sched_job_add_dependency);
>   


