[PATCH v3 2/8] drm/sched: Allow drivers to skip the reset and keep on running

Maíra Canal mcanal at igalia.com
Mon Jun 30 11:25:28 UTC 2025


Hi,

@Matthew, @Philipp, @Danilo, do you folks have some feedback about this
patch and also 1/8 and 7/8? I'd be glad to hear your thoughts and/or
gather some R-b's. Thanks!

Best Regards,
- Maíra

On 18/06/25 11:47, Maíra Canal wrote:
> When the DRM scheduler times out, it's possible that the GPU isn't hung;
> instead, a job may still be running, and there may be no valid reason to
> reset the hardware. This can occur in two situations:
> 
>    1. The GPU exposes some mechanism that ensures the GPU is still making
>       progress. By checking this mechanism, the driver can safely skip the
>       reset, re-arm the timeout, and allow the job to continue running until
>       completion. This is the case for v3d, Etnaviv, and Xe.
>    2. Timeout has fired before the free-job worker. Consequently, the
>       scheduler calls `timedout_job()` for a job that isn't timed out.
> 
> These two scenarios are problematic because the job was removed from the
> `sched->pending_list` before calling `sched->ops->timedout_job()`, which
> means that when the job finishes, it won't be freed by the scheduler
> through `sched->ops->free_job()` - leading to a memory leak.
> 
> To solve those problems, create a new `drm_gpu_sched_stat`, called
> DRM_GPU_SCHED_STAT_NO_HANG, that allows a driver to skip the reset. The
> new status will indicate that the job must be reinserted into the
> pending list, and the hardware / driver will still complete that job.
> 
> Signed-off-by: Maíra Canal <mcanal at igalia.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 43 ++++++++++++++++++++++++++++++++--
>   include/drm/gpu_scheduler.h            |  3 +++
>   2 files changed, 44 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index fb6d9eddf5b378910b66d456f3610ff2ca7c0f41..5e1c07ca867cb14746cec9a7e53896fe17af6e58 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -380,11 +380,16 @@ static void drm_sched_run_free_queue(struct drm_gpu_scheduler *sched)
>   {
>   	struct drm_sched_job *job;
>   
> -	spin_lock(&sched->job_list_lock);
>   	job = list_first_entry_or_null(&sched->pending_list,
>   				       struct drm_sched_job, list);
>   	if (job && dma_fence_is_signaled(&job->s_fence->finished))
>   		__drm_sched_run_free_queue(sched);
> +}
> +
> +static void drm_sched_run_free_queue_unlocked(struct drm_gpu_scheduler *sched)
> +{
> +	spin_lock(&sched->job_list_lock);
> +	drm_sched_run_free_queue(sched);
>   	spin_unlock(&sched->job_list_lock);
>   }
>   
> @@ -537,6 +542,31 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>   	spin_unlock(&sched->job_list_lock);
>   }
>   
> +/**
> + * drm_sched_job_reinsert_on_false_timeout - Reinsert the job on a false timeout
> + * @sched: scheduler instance
> + * @job: job to be reinserted on the pending list
> + *
> + * In the case of a "false timeout" - when a timeout occurs but the GPU isn't
> + * hung and the job is making progress - the scheduler must reinsert the job back
> + * into the pending list. Otherwise, the job and its resources won't be freed
> + * through the &drm_sched_backend_ops.free_job callback.
> + *
> + * Note that after reinserting the job, the scheduler enqueues the free-job
> + * work again if ready. Otherwise, a signaled job could be added to the pending
> + * list, but never freed.
> + *
> + * This function must be used in "false timeout" cases only.
> + */
> +static void drm_sched_job_reinsert_on_false_timeout(struct drm_gpu_scheduler *sched,
> +						    struct drm_sched_job *job)
> +{
> +	spin_lock(&sched->job_list_lock);
> +	list_add(&job->list, &sched->pending_list);
> +	drm_sched_run_free_queue(sched);
> +	spin_unlock(&sched->job_list_lock);
> +}
> +
>   static void drm_sched_job_timedout(struct work_struct *work)
>   {
>   	struct drm_gpu_scheduler *sched;
> @@ -570,6 +600,9 @@ static void drm_sched_job_timedout(struct work_struct *work)
>   			job->sched->ops->free_job(job);
>   			sched->free_guilty = false;
>   		}
> +
> +		if (status == DRM_GPU_SCHED_STAT_NO_HANG)
> +			drm_sched_job_reinsert_on_false_timeout(sched, job);
>   	} else {
>   		spin_unlock(&sched->job_list_lock);
>   	}
> @@ -592,6 +625,9 @@ static void drm_sched_job_timedout(struct work_struct *work)
>    * This function is typically used for reset recovery (see the docu of
>    * drm_sched_backend_ops.timedout_job() for details). Do not call it for
>    * scheduler teardown, i.e., before calling drm_sched_fini().
> + *
> + * As it's used for reset recovery, drm_sched_stop() shouldn't be called
> + * if the driver skipped the reset (DRM_GPU_SCHED_STAT_NO_HANG).
>    */
>   void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>   {
> @@ -677,6 +713,9 @@ EXPORT_SYMBOL(drm_sched_stop);
>    * drm_sched_backend_ops.timedout_job() for details). Do not call it for
>    * scheduler startup. The scheduler itself is fully operational after
>    * drm_sched_init() succeeded.
> + *
> + * As it's used for reset recovery, drm_sched_start() shouldn't be called
> + * if the driver skipped the reset (DRM_GPU_SCHED_STAT_NO_HANG).
>    */
>   void drm_sched_start(struct drm_gpu_scheduler *sched, int errno)
>   {
> @@ -1198,7 +1237,7 @@ static void drm_sched_free_job_work(struct work_struct *w)
>   	if (job)
>   		sched->ops->free_job(job);
>   
> -	drm_sched_run_free_queue(sched);
> +	drm_sched_run_free_queue_unlocked(sched);
>   	drm_sched_run_job_queue(sched);
>   }
>   
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 83e5c00d8dd9a83ab20547a93d6fc572de97616e..423bcc7d7584d3f85cc5a10982f3cf637a781825 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -393,11 +393,14 @@ struct drm_sched_job {
>    * @DRM_GPU_SCHED_STAT_NONE: Reserved. Do not use.
>    * @DRM_GPU_SCHED_STAT_RESET: The GPU hung and successfully reset.
>    * @DRM_GPU_SCHED_STAT_ENODEV: Error: Device is not available anymore.
> + * @DRM_GPU_SCHED_STAT_NO_HANG: Contrary to scheduler's belief, the GPU
> + * did not hang and is operational.
>    */
>   enum drm_gpu_sched_stat {
>   	DRM_GPU_SCHED_STAT_NONE,
>   	DRM_GPU_SCHED_STAT_RESET,
>   	DRM_GPU_SCHED_STAT_ENODEV,
> +	DRM_GPU_SCHED_STAT_NO_HANG,
>   };
>   
>   /**
> 



More information about the dri-devel mailing list