[PATCH] drm/sched: fix the bug of time out calculation(v2)

Alex Deucher alexdeucher at gmail.com
Wed Aug 25 12:51:25 UTC 2021


Please cc dri-devel on all scheduler patches.  It's core functionality.

Alex

On Wed, Aug 25, 2021 at 12:14 AM Monk Liu <Monk.Liu at amd.com> wrote:
>
> The original logic is wrong: the timeout is not retriggered after the
> previous job has signaled, which leads to the scenario where all jobs in
> the same scheduler share the timeout timer started by the very first job
> in that scheduler.
>
> We should modify the timer every time a previous job signals.
>
> v2:
> Further clean up the logic, and cancel the TDR timer if the signaled job
> is the last one in its scheduler.
>
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 29 ++++++++++++++++++++---------
>  1 file changed, 20 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index a2a9536..8c102ac 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>         struct drm_gpu_scheduler *sched = s_job->sched;
>
>         spin_lock(&sched->job_list_lock);
> -       list_add_tail(&s_job->list, &sched->pending_list);
> -       drm_sched_start_timeout(sched);
> +       if (list_empty(&sched->pending_list)) {
> +               list_add_tail(&s_job->list, &sched->pending_list);
> +               drm_sched_start_timeout(sched);
> +       } else {
> +               /* the old jobs in pending list are not finished yet
> +                * no need to restart TDR timer here, it is already
> +                * handled by drm_sched_get_cleanup_job
> +                */
> +               list_add_tail(&s_job->list, &sched->pending_list);
> +       }
> +
>         spin_unlock(&sched->job_list_lock);
>  }
>
> @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>         if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>                 /* remove job from pending_list */
>                 list_del_init(&job->list);
> +
>                 /* make the scheduled timestamp more accurate */
>                 next = list_first_entry_or_null(&sched->pending_list,
>                                                 typeof(*next), list);
> -               if (next)
> +               if (next) {
> +                       /* if we still have job in pending list we need modify the TDR timer */
> +                       mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
>                         next->s_fence->scheduled.timestamp =
>                                 job->s_fence->finished.timestamp;
> +               } else {
> +                       /* cancel the TDR timer if no job in pending list */
> +                       cancel_delayed_work(&sched->work_tdr);
> +               }
>
>         } else {
>                 job = NULL;
> -               /* queue timeout for next job */
> -               drm_sched_start_timeout(sched);
>         }
>
>         spin_unlock(&sched->job_list_lock);
> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>                                           (entity = drm_sched_select_entity(sched))) ||
>                                          kthread_should_stop());
>
> -               if (cleanup_job) {
> +               if (cleanup_job)
>                         sched->ops->free_job(cleanup_job);
> -                       /* queue timeout for next job */
> -                       drm_sched_start_timeout(sched);
> -               }
>
>                 if (!entity)
>                         continue;
> --
> 2.7.4
>


More information about the amd-gfx mailing list