[PATCH 2/2] drm/scheduler: remove timeout work_struct from drm_sched_job

Christian König christian.koenig at amd.com
Thu Sep 20 11:30:09 UTC 2018


On 20.09.2018 13:25, Nayan Deshmukh wrote:
>
>
> On Wed, Sep 19, 2018, 9:31 PM Christian König
> <christian.koenig at amd.com> wrote:
>
>     On 18.09.2018 18:17, Nayan Deshmukh wrote:
>     > having a delayed work item per job is redundant as we only need one
>     > per scheduler to track the timeout of the currently executing job.
>
>     Well that looks simpler than I thought it would be.
>
>     But it shows the next problem: the timeout and the completion could
>     race.
>
>     As far as I can see that can be fixed by moving the
>     dma_fence_remove_callback()/dma_fence_add_callback() dance from
>     drm_sched_hw_job_reset() to drm_sched_job_timedout().
>
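To illustrate what I mean with moving the callback dance (completely
untested, it just reuses the s_fence->parent/s_fence->cb pair that
drm_sched_hw_job_reset() already uses today):

        static void drm_sched_job_timedout(struct work_struct *work)
        {
                struct drm_gpu_scheduler *sched = container_of(work,
                                struct drm_gpu_scheduler, work_tdr.work);
                struct drm_sched_job *job = sched->curr_job;

                /* Drop the done callback from the hw fence first. If that
                 * fails the fence has already signaled and the job completed
                 * while the timeout fired, so there is nothing to handle.
                 */
                if (job->s_fence->parent &&
                    !dma_fence_remove_callback(job->s_fence->parent,
                                               &job->s_fence->cb))
                        return;

                sched->ops->timedout_job(job);
        }

The dma_fence_put()/hw_rq_count accounting from drm_sched_hw_job_reset()
would need to follow the callback removal as well, but that is the general
idea.
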
>     Anyway, I would say drop patch #1 and fix the one comment below
>     and we
>     can use this.
>
>     >
>     > Signed-off-by: Nayan Deshmukh <nayan26deshmukh at gmail.com>
>     > Suggested-by: Christian König <christian.koenig at amd.com>
>     > ---
>     >   drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++-------
>     >   include/drm/gpu_scheduler.h            |  6 +++---
>     >   2 files changed, 12 insertions(+), 10 deletions(-)
>     >
>     > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>     > index 0e6ccc8243db..f213b5c7f718 100644
>     > --- a/drivers/gpu/drm/scheduler/sched_main.c
>     > +++ b/drivers/gpu/drm/scheduler/sched_main.c
>     > @@ -198,7 +198,7 @@ static void drm_sched_job_finish(struct work_struct *work)
>     >        * manages to find this job as the next job in the list, the fence
>     >        * signaled check below will prevent the timeout to be restarted.
>     >        */
>     > -     cancel_delayed_work_sync(&s_job->work_tdr);
>     > +     cancel_delayed_work_sync(&sched->work_tdr);
>     >
>     >       spin_lock(&sched->job_list_lock);
>     >       /* queue TDR for next job */
>     > @@ -207,7 +207,7 @@ static void drm_sched_job_finish(struct work_struct *work)
>     >       if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>     >           !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
>     >               if (!dma_fence_is_signaled(&next->s_fence->finished))
>
>     Since we now have only one delayed work item we can just drop the
>     test if next is already signaled.
>
> Can you elaborate more on this? Which test are you talking about?

I was talking about the "!dma_fence_is_signaled()" test here.

>
> Regards,
> Nayan
>
>
>
>     Regards,
>     Christian.
>
>     > -                     schedule_delayed_work(&next->work_tdr, sched->timeout);
>     > +                     schedule_delayed_work(&sched->work_tdr, sched->timeout);
>     >       }
>     >       /* remove job from ring_mirror_list */
>     >       list_del(&s_job->node);
>

Basically you could do the list_del() first and then you only need to test
if sched->ring_mirror_list is empty.
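
Untested, but roughly this for the locked section of drm_sched_job_finish(),
leaving the curr_job handling aside:

        spin_lock(&sched->job_list_lock);
        /* remove the finished job from the ring_mirror_list first ... */
        list_del(&s_job->node);
        /* ... and only restart the timeout when jobs are still pending */
        if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
            !list_empty(&sched->ring_mirror_list))
                schedule_delayed_work(&sched->work_tdr, sched->timeout);
        spin_unlock(&sched->job_list_lock);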

Regards,
Christian.

>     > @@ -237,7 +237,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>     >       if (list_first_entry_or_null(&sched->ring_mirror_list,
>     >                               struct drm_sched_job, node) == s_job) {
>     >               if (sched->timeout != MAX_SCHEDULE_TIMEOUT)
>     > -                     schedule_delayed_work(&s_job->work_tdr, sched->timeout);
>     > +                     schedule_delayed_work(&sched->work_tdr, sched->timeout);
>     >               sched->curr_job = s_job;
>     >       }
>     >       spin_unlock(&sched->job_list_lock);
>     > @@ -245,8 +245,10 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>     >
>     >   static void drm_sched_job_timedout(struct work_struct *work)
>     >   {
>     > -     struct drm_sched_job *job = container_of(work, struct drm_sched_job,
>     > -                                              work_tdr.work);
>     > +     struct drm_gpu_scheduler *sched = container_of(work,
>     > +                                             struct drm_gpu_scheduler,
>     > +                                             work_tdr.work);
>     > +     struct drm_sched_job *job = sched->curr_job;
>     >
>     >       job->sched->ops->timedout_job(job);
>     >   }
>     > @@ -318,7 +320,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>     >       s_job = list_first_entry_or_null(&sched->ring_mirror_list,
>     >                                        struct drm_sched_job, node);
>     >       if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
>     > -             schedule_delayed_work(&s_job->work_tdr, sched->timeout);
>     > +             schedule_delayed_work(&sched->work_tdr, sched->timeout);
>     >       if (s_job)
>     >               sched->curr_job = s_job;
>     >
>     > @@ -389,7 +391,6 @@ int drm_sched_job_init(struct drm_sched_job *job,
>     >
>     >       INIT_WORK(&job->finish_work, drm_sched_job_finish);
>     >       INIT_LIST_HEAD(&job->node);
>     > -     INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);
>     >
>     >       return 0;
>     >   }
>     > @@ -580,6 +581,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>     >       INIT_LIST_HEAD(&sched->ring_mirror_list);
>     >       spin_lock_init(&sched->job_list_lock);
>     >       atomic_set(&sched->hw_rq_count, 0);
>     > +     INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>     >       atomic_set(&sched->num_jobs, 0);
>     >       atomic64_set(&sched->job_id_count, 0);
>     >
>     > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>     > index 07e776b1ca42..9d50d7f3eaa4 100644
>     > --- a/include/drm/gpu_scheduler.h
>     > +++ b/include/drm/gpu_scheduler.h
>     > @@ -175,8 +175,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>     >    *               finished to remove the job from the
>     >    *               @drm_gpu_scheduler.ring_mirror_list.
>     >    * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.
>     > - * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the timeout
>     > - *            interval is over.
>     >    * @id: a unique id assigned to each job scheduled on the scheduler.
>     >    * @karma: increment on every hang caused by this job. If this exceeds the hang
>     >    *         limit of the scheduler then the job is marked guilty and will not
>     > @@ -195,7 +193,6 @@ struct drm_sched_job {
>     >       struct dma_fence_cb             finish_cb;
>     >       struct work_struct              finish_work;
>     >       struct list_head                node;
>     > -     struct delayed_work             work_tdr;
>     >       uint64_t                        id;
>     >       atomic_t                        karma;
>     >       enum drm_sched_priority         s_priority;
>     > @@ -260,6 +257,8 @@ struct drm_sched_backend_ops {
>     >    *                 finished.
>     >    * @hw_rq_count: the number of jobs currently in the hardware queue.
>     >    * @job_id_count: used to assign unique id to the each job.
>     > + * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>     > + *            timeout interval is over.
>     >    * @thread: the kthread on which the scheduler which run.
>     >    * @ring_mirror_list: the list of jobs which are currently in the job queue.
>     >    * @job_list_lock: lock to protect the ring_mirror_list.
>     > @@ -280,6 +279,7 @@ struct drm_gpu_scheduler {
>     >       wait_queue_head_t               job_scheduled;
>     >       atomic_t                        hw_rq_count;
>     >       atomic64_t                      job_id_count;
>     > +     struct delayed_work             work_tdr;
>     >       struct task_struct              *thread;
>     >       struct list_head                ring_mirror_list;
>     >       spinlock_t                      job_list_lock;
>
