[PATCH 2/2] drm/scheduler: remove timeout work_struct from drm_sched_job

Nayan Deshmukh nayan26deshmukh at gmail.com
Thu Sep 20 11:25:25 UTC 2018


On Wed, Sep 19, 2018, 9:31 PM Christian König <christian.koenig at amd.com>
wrote:

> Am 18.09.2018 um 18:17 schrieb Nayan Deshmukh:
> > having a delayed work item per job is redundant as we only need one
> > per scheduler to track the time out the currently executing job.
>
> Well that looks simpler than I thought it would be.
>
> But it shows the next problem that the timeout and the completion could
> race.
>
> As far as I can see that can be fixed by moving the
> dma_fence_remove_callback()/dma_fence_add_callback() dance from
> drm_sched_hw_job_reset() to drm_sched_job_timedout().
>
> Anyway, I would say drop patch #1 and fix the one comment below and we
> can use this.
>
> >
> > Signed-off-by: Nayan Deshmukh <nayan26deshmukh at gmail.com>
> > Suggested-by: Christian König <christian.koenig at amd.com>
> > ---
> >   drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++-------
> >   include/drm/gpu_scheduler.h            |  6 +++---
> >   2 files changed, 12 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c
> b/drivers/gpu/drm/scheduler/sched_main.c
> > index 0e6ccc8243db..f213b5c7f718 100644
> > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > @@ -198,7 +198,7 @@ static void drm_sched_job_finish(struct work_struct
> *work)
> >        * manages to find this job as the next job in the list, the fence
> >        * signaled check below will prevent the timeout to be restarted.
> >        */
> > -     cancel_delayed_work_sync(&s_job->work_tdr);
> > +     cancel_delayed_work_sync(&sched->work_tdr);
> >
> >       spin_lock(&sched->job_list_lock);
> >       /* queue TDR for next job */
> > @@ -207,7 +207,7 @@ static void drm_sched_job_finish(struct work_struct
> *work)
> >       if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
> >           !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
> >               if (!dma_fence_is_signaled(&next->s_fence->finished))
>
> Since we now have only one delayed work item we can just drop the test
> if next is already signaled.
>
Can you elaborate more on this. Which test are you talking about?

Regards,
Nayan

>
>
> Regards,
> Christian.
>
> > -                     schedule_delayed_work(&next->work_tdr,
> sched->timeout);
> > +                     schedule_delayed_work(&sched->work_tdr,
> sched->timeout);
> >       }
> >       /* remove job from ring_mirror_list */
> >       list_del(&s_job->node);
> > @@ -237,7 +237,7 @@ static void drm_sched_job_begin(struct drm_sched_job
> *s_job)
> >       if (list_first_entry_or_null(&sched->ring_mirror_list,
> >                               struct drm_sched_job, node) == s_job) {
> >               if (sched->timeout != MAX_SCHEDULE_TIMEOUT)
> > -                     schedule_delayed_work(&s_job->work_tdr,
> sched->timeout);
> > +                     schedule_delayed_work(&sched->work_tdr,
> sched->timeout);
> >               sched->curr_job = s_job;
> >       }
> >       spin_unlock(&sched->job_list_lock);
> > @@ -245,8 +245,10 @@ static void drm_sched_job_begin(struct
> drm_sched_job *s_job)
> >
> >   static void drm_sched_job_timedout(struct work_struct *work)
> >   {
> > -     struct drm_sched_job *job = container_of(work, struct
> drm_sched_job,
> > -                                              work_tdr.work);
> > +     struct drm_gpu_scheduler *sched = container_of(work,
> > +                                             struct drm_gpu_scheduler,
> > +                                             work_tdr.work);
> > +     struct drm_sched_job *job = sched->curr_job;
> >
> >       job->sched->ops->timedout_job(job);
> >   }
> > @@ -318,7 +320,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler
> *sched)
> >       s_job = list_first_entry_or_null(&sched->ring_mirror_list,
> >                                        struct drm_sched_job, node);
> >       if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
> > -             schedule_delayed_work(&s_job->work_tdr, sched->timeout);
> > +             schedule_delayed_work(&sched->work_tdr, sched->timeout);
> >       if (s_job)
> >               sched->curr_job = s_job;
> >
> > @@ -389,7 +391,6 @@ int drm_sched_job_init(struct drm_sched_job *job,
> >
> >       INIT_WORK(&job->finish_work, drm_sched_job_finish);
> >       INIT_LIST_HEAD(&job->node);
> > -     INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);
> >
> >       return 0;
> >   }
> > @@ -580,6 +581,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> >       INIT_LIST_HEAD(&sched->ring_mirror_list);
> >       spin_lock_init(&sched->job_list_lock);
> >       atomic_set(&sched->hw_rq_count, 0);
> > +     INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> >       atomic_set(&sched->num_jobs, 0);
> >       atomic64_set(&sched->job_id_count, 0);
> >
> > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > index 07e776b1ca42..9d50d7f3eaa4 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -175,8 +175,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct
> dma_fence *f);
> >    *               finished to remove the job from the
> >    *               @drm_gpu_scheduler.ring_mirror_list.
> >    * @node: used to append this struct to the
> @drm_gpu_scheduler.ring_mirror_list.
> > - * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after
> the timeout
> > - *            interval is over.
> >    * @id: a unique id assigned to each job scheduled on the scheduler.
> >    * @karma: increment on every hang caused by this job. If this exceeds
> the hang
> >    *         limit of the scheduler then the job is marked guilty and
> will not
> > @@ -195,7 +193,6 @@ struct drm_sched_job {
> >       struct dma_fence_cb             finish_cb;
> >       struct work_struct              finish_work;
> >       struct list_head                node;
> > -     struct delayed_work             work_tdr;
> >       uint64_t                        id;
> >       atomic_t                        karma;
> >       enum drm_sched_priority         s_priority;
> > @@ -260,6 +257,8 @@ struct drm_sched_backend_ops {
> >    *                 finished.
> >    * @hw_rq_count: the number of jobs currently in the hardware queue.
> >    * @job_id_count: used to assign unique id to the each job.
> > + * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after
> the
> > + *            timeout interval is over.
> >    * @thread: the kthread on which the scheduler which run.
> >    * @ring_mirror_list: the list of jobs which are currently in the job
> queue.
> >    * @job_list_lock: lock to protect the ring_mirror_list.
> > @@ -280,6 +279,7 @@ struct drm_gpu_scheduler {
> >       wait_queue_head_t               job_scheduled;
> >       atomic_t                        hw_rq_count;
> >       atomic64_t                      job_id_count;
> > +     struct delayed_work             work_tdr;
> >       struct task_struct              *thread;
> >       struct list_head                ring_mirror_list;
> >       spinlock_t                      job_list_lock;
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/dri-devel/attachments/20180920/076f64bc/attachment-0001.html>


More information about the dri-devel mailing list