<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body text="#000000" bgcolor="#FFFFFF">
    <div class="moz-cite-prefix">Am 20.09.2018 um 13:25 schrieb Nayan
      Deshmukh:<br>
    </div>
    <blockquote type="cite"
cite="mid:CAFd4ddzZo2M20igKZuRdxrRg6__FCn4LBRxYvC2xD=sYogs7PQ@mail.gmail.com">
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
      <div dir="auto">
        <div><br>
          <br>
          <div class="gmail_quote">
            <div dir="ltr">On Wed, Sep 19, 2018, 9:31 PM Christian König
              &lt;<a href="mailto:christian.koenig@amd.com"
                moz-do-not-send="true">christian.koenig@amd.com</a>&gt;
              wrote:<br>
            </div>
            <blockquote class="gmail_quote" style="margin:0 0 0
              .8ex;border-left:1px #ccc solid;padding-left:1ex">Am
              18.09.2018 um 18:17 schrieb Nayan Deshmukh:<br>
              > having a delayed work item per job is redundant as we
              only need one<br>
              > per scheduler to track the timeout of the currently
              executing job.<br>
              <br>
              Well that looks simpler than I thought it would be.<br>
              <br>
              But it shows the next problem that the timeout and the
              completion could <br>
              race.<br>
              <br>
              As far as I can see that can be fixed by moving the <br>
              dma_fence_remove_callback()/dma_fence_add_callback() dance
              from <br>
              drm_sched_hw_job_reset() to drm_sched_job_timedout().<br>
              <br>
              Anyway, I would say drop patch #1 and fix the one comment
              below and we <br>
              can use this.<br>
              <br>
              ><br>
              > Signed-off-by: Nayan Deshmukh &lt;<a
                href="mailto:nayan26deshmukh@gmail.com" target="_blank"
                rel="noreferrer" moz-do-not-send="true">nayan26deshmukh@gmail.com</a>&gt;<br>
              > Suggested-by: Christian König &lt;<a
                href="mailto:christian.koenig@amd.com" target="_blank"
                rel="noreferrer" moz-do-not-send="true">christian.koenig@amd.com</a>&gt;<br>
              > ---<br>
              >   drivers/gpu/drm/scheduler/sched_main.c | 16
              +++++++++-------<br>
              >   include/drm/gpu_scheduler.h            |  6 +++---<br>
              >   2 files changed, 12 insertions(+), 10 deletions(-)<br>
              ><br>
              > diff --git a/drivers/gpu/drm/scheduler/sched_main.c
              b/drivers/gpu/drm/scheduler/sched_main.c<br>
              > index 0e6ccc8243db..f213b5c7f718 100644<br>
              > --- a/drivers/gpu/drm/scheduler/sched_main.c<br>
              > +++ b/drivers/gpu/drm/scheduler/sched_main.c<br>
              > @@ -198,7 +198,7 @@ static void
              drm_sched_job_finish(struct work_struct *work)<br>
              >        * manages to find this job as the next job in
              the list, the fence<br>
              >        * signaled check below will prevent the
              timeout to be restarted.<br>
              >        */<br>
              > -   
               cancel_delayed_work_sync(&s_job->work_tdr);<br>
              > +   
               cancel_delayed_work_sync(&sched->work_tdr);<br>
              >   <br>
              >       spin_lock(&sched->job_list_lock);<br>
              >       /* queue TDR for next job */<br>
              > @@ -207,7 +207,7 @@ static void
              drm_sched_job_finish(struct work_struct *work)<br>
              >       if (sched->timeout != MAX_SCHEDULE_TIMEOUT
              &&<br>
              >           !list_is_last(&s_job->node,
              &sched->ring_mirror_list)) {<br>
              >               if
              (!dma_fence_is_signaled(&next->s_fence->finished))<br>
              <br>
              Since we now have only one delayed work item we can just
              drop the test <br>
              of whether next is already signaled.<br>
            </blockquote>
          </div>
        </div>
        <div dir="auto"><span
            style="font-family:sans-serif;font-size:12.8px">Can you
            elaborate more on this? Which test are you talking about?</span><br
            style="font-family:sans-serif;font-size:12.8px">
        </div>
      </div>
    </blockquote>
    <br>
    I was talking about the "!dma_fence_is_signaled()" test here.<br>
    <br>
    <blockquote type="cite"
cite="mid:CAFd4ddzZo2M20igKZuRdxrRg6__FCn4LBRxYvC2xD=sYogs7PQ@mail.gmail.com">
      <div dir="auto">
        <div dir="auto"><br
            style="font-family:sans-serif;font-size:12.8px">
          <span style="font-family:sans-serif;font-size:12.8px">Regards,</span><br
            style="font-family:sans-serif;font-size:12.8px">
          <span style="font-family:sans-serif;font-size:12.8px">Nayan</span><br>
        </div>
        <div dir="auto">
          <div class="gmail_quote">
            <blockquote class="gmail_quote" style="margin:0 0 0
              .8ex;border-left:1px #ccc solid;padding-left:1ex">
              <br>
              <br>
              Regards,<br>
              Christian.<br>
              <br>
              > -                   
               schedule_delayed_work(&next->work_tdr,
              sched->timeout);<br>
              > +                   
               schedule_delayed_work(&sched->work_tdr,
              sched->timeout);<br>
              >       }<br>
              >       /* remove job from ring_mirror_list */<br>
              >       list_del(&s_job->node);<br>
            </blockquote>
          </div>
        </div>
      </div>
    </blockquote>
    <br>
    Basically you could do this first, and then you only need to test
    whether sched-&gt;ring_mirror_list is empty.<br>
    <br>
    Regards,<br>
    Christian.<br>
    <br>
    <blockquote type="cite"
cite="mid:CAFd4ddzZo2M20igKZuRdxrRg6__FCn4LBRxYvC2xD=sYogs7PQ@mail.gmail.com">
      <div dir="auto">
        <div dir="auto">
          <div class="gmail_quote">
            <blockquote class="gmail_quote" style="margin:0 0 0
              .8ex;border-left:1px #ccc solid;padding-left:1ex">
              > @@ -237,7 +237,7 @@ static void
              drm_sched_job_begin(struct drm_sched_job *s_job)<br>
              >       if
              (list_first_entry_or_null(&sched->ring_mirror_list,<br>
              >                               struct drm_sched_job,
              node) == s_job) {<br>
              >               if (sched->timeout !=
              MAX_SCHEDULE_TIMEOUT)<br>
              > -                   
               schedule_delayed_work(&s_job->work_tdr,
              sched->timeout);<br>
              > +                   
               schedule_delayed_work(&sched->work_tdr,
              sched->timeout);<br>
              >               sched->curr_job = s_job;<br>
              >       }<br>
              >       spin_unlock(&sched->job_list_lock);<br>
              > @@ -245,8 +245,10 @@ static void
              drm_sched_job_begin(struct drm_sched_job *s_job)<br>
              >   <br>
              >   static void drm_sched_job_timedout(struct
              work_struct *work)<br>
              >   {<br>
              > -     struct drm_sched_job *job = container_of(work,
              struct drm_sched_job,<br>
              > -                                              <a
                href="http://work_tdr.work" rel="noreferrer noreferrer"
                target="_blank" moz-do-not-send="true">work_tdr.work</a>);<br>
              > +     struct drm_gpu_scheduler *sched =
              container_of(work,<br>
              > +                                             struct
              drm_gpu_scheduler,<br>
              > +                                             <a
                href="http://work_tdr.work" rel="noreferrer noreferrer"
                target="_blank" moz-do-not-send="true">work_tdr.work</a>);<br>
              > +     struct drm_sched_job *job = sched->curr_job;<br>
              >   <br>
              >       job->sched->ops->timedout_job(job);<br>
              >   }<br>
              > @@ -318,7 +320,7 @@ void
              drm_sched_job_recovery(struct drm_gpu_scheduler *sched)<br>
              >       s_job =
              list_first_entry_or_null(&sched->ring_mirror_list,<br>
              >                                        struct
              drm_sched_job, node);<br>
              >       if (s_job && sched->timeout !=
              MAX_SCHEDULE_TIMEOUT)<br>
              > -           
               schedule_delayed_work(&s_job->work_tdr,
              sched->timeout);<br>
              > +           
               schedule_delayed_work(&sched->work_tdr,
              sched->timeout);<br>
              >       if (s_job)<br>
              >               sched->curr_job = s_job;<br>
              >   <br>
              > @@ -389,7 +391,6 @@ int drm_sched_job_init(struct
              drm_sched_job *job,<br>
              >   <br>
              >       INIT_WORK(&job->finish_work,
              drm_sched_job_finish);<br>
              >       INIT_LIST_HEAD(&job->node);<br>
              > -     INIT_DELAYED_WORK(&job->work_tdr,
              drm_sched_job_timedout);<br>
              >   <br>
              >       return 0;<br>
              >   }<br>
              > @@ -580,6 +581,7 @@ int drm_sched_init(struct
              drm_gpu_scheduler *sched,<br>
              >     
               INIT_LIST_HEAD(&sched->ring_mirror_list);<br>
              >       spin_lock_init(&sched->job_list_lock);<br>
              >       atomic_set(&sched->hw_rq_count, 0);<br>
              > +     INIT_DELAYED_WORK(&sched->work_tdr,
              drm_sched_job_timedout);<br>
              >       atomic_set(&sched->num_jobs, 0);<br>
              >       atomic64_set(&sched->job_id_count, 0);<br>
              >   <br>
              > diff --git a/include/drm/gpu_scheduler.h
              b/include/drm/gpu_scheduler.h<br>
              > index 07e776b1ca42..9d50d7f3eaa4 100644<br>
              > --- a/include/drm/gpu_scheduler.h<br>
              > +++ b/include/drm/gpu_scheduler.h<br>
              > @@ -175,8 +175,6 @@ struct drm_sched_fence
              *to_drm_sched_fence(struct dma_fence *f);<br>
              >    *               finished to remove the job from
              the<br>
              >    *             
               @drm_gpu_scheduler.ring_mirror_list.<br>
              >    * @node: used to append this struct to the
              @drm_gpu_scheduler.ring_mirror_list.<br>
              > - * @work_tdr: schedules a delayed call to
              @drm_sched_job_timedout after the timeout<br>
              > - *            interval is over.<br>
              >    * @id: a unique id assigned to each job scheduled
              on the scheduler.<br>
              >    * @karma: increment on every hang caused by this
              job. If this exceeds the hang<br>
              >    *         limit of the scheduler then the job is
              marked guilty and will not<br>
              > @@ -195,7 +193,6 @@ struct drm_sched_job {<br>
              >       struct dma_fence_cb             finish_cb;<br>
              >       struct work_struct              finish_work;<br>
              >       struct list_head                node;<br>
              > -     struct delayed_work             work_tdr;<br>
              >       uint64_t                        id;<br>
              >       atomic_t                        karma;<br>
              >       enum drm_sched_priority         s_priority;<br>
              > @@ -260,6 +257,8 @@ struct drm_sched_backend_ops {<br>
              >    *                 finished.<br>
              >    * @hw_rq_count: the number of jobs currently in
              the hardware queue.<br>
              >    * @job_id_count: used to assign unique id to the
              each job.<br>
              > + * @work_tdr: schedules a delayed call to
              @drm_sched_job_timedout after the<br>
              > + *            timeout interval is over.<br>
              >    * @thread: the kthread on which the scheduler
              which run.<br>
              >    * @ring_mirror_list: the list of jobs which are
              currently in the job queue.<br>
              >    * @job_list_lock: lock to protect the
              ring_mirror_list.<br>
              > @@ -280,6 +279,7 @@ struct drm_gpu_scheduler {<br>
              >       wait_queue_head_t               job_scheduled;<br>
              >       atomic_t                        hw_rq_count;<br>
              >       atomic64_t                      job_id_count;<br>
              > +     struct delayed_work             work_tdr;<br>
              >       struct task_struct              *thread;<br>
              >       struct list_head               
              ring_mirror_list;<br>
              >       spinlock_t                      job_list_lock;<br>
              <br>
            </blockquote>
          </div>
        </div>
      </div>
    </blockquote>
    <br>
  </body>
</html>