[PATCH] drm/sched: fix the bug of time out calculation(v2)

Wed Aug 25 06:31:41 UTC 2021

Well, NAK to that approach. First of all, your bug analysis is incorrect.

The timeout started by queue_delayed_work() in drm_sched_start_timeout() 
is paired with the cancel_delayed_work() in drm_sched_get_cleanup_job().

So you must have something else going on here.

Also, please don't use mod_delayed_work(); instead, always cancel the
delayed work and restart it.

Regards,
Christian.

Am 25.08.21 um 06:14 schrieb Monk Liu:
> the original logic is wrong in that the timeout will not be retriggered
> after the previous job signaled, and that leads to the scenario that all
> jobs in the same scheduler share the same timeout timer from the very
> beginning job in this scheduler, which is wrong.
>
> we should modify the timer every time a previous job is signaled.
>
> v2:
> further clean up the logic, and cancel the TDR timer if the signaled job
> is the last one in its scheduler.
>
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 29 ++++++++++++++++++++---------
>   1 file changed, 20 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index a2a9536..8c102ac 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>   	struct drm_gpu_scheduler *sched = s_job->sched;
>   
>   	spin_lock(&sched->job_list_lock);
> -	list_add_tail(&s_job->list, &sched->pending_list);
> -	drm_sched_start_timeout(sched);
> +	if (list_empty(&sched->pending_list)) {
> +		list_add_tail(&s_job->list, &sched->pending_list);
> +		drm_sched_start_timeout(sched);
> +	} else {
> +		/* the old jobs in pending list are not finished yet
> +		 * no need to restart TDR timer here, it is already
> +		 * handled by drm_sched_get_cleanup_job
> +		 */
> +		list_add_tail(&s_job->list, &sched->pending_list);
> +	}
> +
>   	spin_unlock(&sched->job_list_lock);
>   }
>   
> @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>   	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>   		/* remove job from pending_list */
>   		list_del_init(&job->list);
> +
>   		/* make the scheduled timestamp more accurate */
>   		next = list_first_entry_or_null(&sched->pending_list,
>   						typeof(*next), list);
> -		if (next)
> +		if (next) {
> +			/* if we still have job in pending list we need modify the TDR timer */
> +			mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
>   			next->s_fence->scheduled.timestamp =
>   				job->s_fence->finished.timestamp;
> +		} else {
> +			/* cancel the TDR timer if no job in pending list */
> +			cancel_delayed_work(&sched->work_tdr);
> +		}
>   
>   	} else {
>   		job = NULL;
> -		/* queue timeout for next job */
> -		drm_sched_start_timeout(sched);
>   	}
>   
>   	spin_unlock(&sched->job_list_lock);
> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>   					  (entity = drm_sched_select_entity(sched))) ||
>   					 kthread_should_stop());
>   
> -		if (cleanup_job) {
> +		if (cleanup_job)
>   			sched->ops->free_job(cleanup_job);
> -			/* queue timeout for next job */
> -			drm_sched_start_timeout(sched);
> -		}
>   
>   		if (!entity)
>   			continue;