[PATCH] drm/sched: Consolidate drm_sched_job_timedout

Wed Jul 16 20:53:45 UTC 2025

On 16/07/25 11:48, Tvrtko Ursulin wrote:
> Reduce to a single spin_unlock for a hopefully somewhat clearer flow in
> the function. It may appear that there is a behavioural change, because
> drm_sched_start_timeout_unlocked() is now not called if there were
> initially no jobs on the pending list and some appeared after the
> unlock. However, if the code relied on the TDR handler restarting
> itself, it would fail to do so if the job arrived on the pending list
> after the check.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>

Reviewed-by: Maíra Canal <mcanal at igalia.com>

Best Regards,
- Maíra

> Cc: Christian König <christian.koenig at amd.com>
> Cc: Danilo Krummrich <dakr at kernel.org>
> Cc: Maíra Canal <mcanal at igalia.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Cc: Philipp Stanner <phasta at kernel.org>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 36 ++++++++++++--------------
>   1 file changed, 17 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index e2cda28a1af4..60ae600590dc 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -556,17 +556,15 @@ static void drm_sched_job_reinsert_on_false_timeout(struct drm_gpu_scheduler *sc
>   
>   static void drm_sched_job_timedout(struct work_struct *work)
>   {
> -	struct drm_gpu_scheduler *sched;
> +	struct drm_gpu_scheduler *sched =
> +		container_of(work, struct drm_gpu_scheduler, work_tdr.work);
> +	enum drm_gpu_sched_stat status;
>   	struct drm_sched_job *job;
> -	enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_RESET;
> -
> -	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
>   
>   	/* Protects against concurrent deletion in drm_sched_get_finished_job */
>   	spin_lock(&sched->job_list_lock);
>   	job = list_first_entry_or_null(&sched->pending_list,
>   				       struct drm_sched_job, list);
> -
>   	if (job) {
>   		/*
>   		 * Remove the bad job so it cannot be freed by a concurrent
> @@ -575,23 +573,23 @@ static void drm_sched_job_timedout(struct work_struct *work)
>   		 * cancelled, at which point it's safe.
>   		 */
>   		list_del_init(&job->list);
> -		spin_unlock(&sched->job_list_lock);
> +	}
> +	spin_unlock(&sched->job_list_lock);
>   
> -		status = job->sched->ops->timedout_job(job);
> +	if (!job)
> +		return;
>   
> -		/*
> -		 * Guilty job did complete and hence needs to be manually removed
> -		 * See drm_sched_stop doc.
> -		 */
> -		if (sched->free_guilty) {
> -			job->sched->ops->free_job(job);
> -			sched->free_guilty = false;
> -		}
> +	status = job->sched->ops->timedout_job(job);
>   
> -		if (status == DRM_GPU_SCHED_STAT_NO_HANG)
> -			drm_sched_job_reinsert_on_false_timeout(sched, job);
> -	} else {
> -		spin_unlock(&sched->job_list_lock);
> +	/*
> +	 * Guilty job did complete and hence needs to be manually removed. See
> +	 * documentation for drm_sched_stop.
> +	 */
> +	if (sched->free_guilty) {
> +		job->sched->ops->free_job(job);
> +		sched->free_guilty = false;
> +	} else if (status == DRM_GPU_SCHED_STAT_NO_HANG) {
> +		drm_sched_job_reinsert_on_false_timeout(sched, job);
>   	}
>   
>   	if (status != DRM_GPU_SCHED_STAT_ENODEV)