<div dir="auto"><div>Hi Christian,</div><div dir="auto"><br><br><div class="gmail_quote" dir="auto"><div dir="ltr">On Wed, Sep 26, 2018, 10:13 AM Christian König <<a href="mailto:ckoenig.leichtzumerken@gmail.com" target="_blank" rel="noreferrer">ckoenig.leichtzumerken@gmail.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Am 26.09.2018 um 09:39 schrieb Lucas Stach:<br>
> Hi Nayan,<br>
><br>
> Am Mittwoch, den 26.09.2018, 02:09 +0900 schrieb Nayan Deshmukh:<br>
>> having a delayed work item per job is redundant as we only need one<br>
>> per scheduler to track the time out the currently executing job.<br>
>><br>
>> v2: the first element of the ring mirror list is the currently<br>
>> executing job so we don't need a additional variable for it<br>
>><br>
>> Signed-off-by: Nayan Deshmukh <<a href="mailto:nayan26deshmukh@gmail.com" rel="noreferrer noreferrer" target="_blank">nayan26deshmukh@gmail.com</a>><br>
>> Suggested-by: Christian König <<a href="mailto:christian.koenig@amd.com" rel="noreferrer noreferrer" target="_blank">christian.koenig@amd.com</a>><br>
>> ---<br>
>>   drivers/gpu/drm/scheduler/sched_main.c | 31 ++++++++++++++++---------------<br>
>>   include/drm/gpu_scheduler.h            |  6 +++---<br>
>>   2 files changed, 19 insertions(+), 18 deletions(-)<br>
>><br>
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c<br>
>> index 9ca741f3a0bc..4e8505d51795 100644<br>
>> --- a/drivers/gpu/drm/scheduler/sched_main.c<br>
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c<br>
>> @@ -197,19 +197,15 @@ static void drm_sched_job_finish(struct work_struct *work)<br>
>>       * manages to find this job as the next job in the list, the fence<br>
>>       * signaled check below will prevent the timeout to be restarted.<br>
>>       */<br>
>> -    cancel_delayed_work_sync(&s_job->work_tdr);<br>
>> +    cancel_delayed_work_sync(&sched->work_tdr);<br>
>>   <br>
>>      spin_lock(&sched->job_list_lock);<br>
>> -    /* queue TDR for next job */<br>
>> -    if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&<br>
>> -        !list_is_last(&s_job->node, &sched->ring_mirror_list)) {<br>
>> -            struct drm_sched_job *next = list_next_entry(s_job, node);<br>
>> -<br>
>> -            if (!dma_fence_is_signaled(&next->s_fence->finished))<br>
>> -                    schedule_delayed_work(&next->work_tdr, sched->timeout);<br>
>> -    }<br>
>>      /* remove job from ring_mirror_list */<br>
>>      list_del(&s_job->node);<br>
>> +    /* queue TDR for next job */<br>
>> +    if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&<br>
>> +        !list_empty(&sched->ring_mirror_list))<br>
>> +            schedule_delayed_work(&sched->work_tdr, sched->timeout);<br>
>>      spin_unlock(&sched->job_list_lock);<br>
>>   <br>
>>      dma_fence_put(&s_job->s_fence->finished);<br>
>> @@ -236,16 +232,21 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)<br>
>>      if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&<br>
>>          list_first_entry_or_null(&sched->ring_mirror_list,<br>
>>                                   struct drm_sched_job, node) == s_job)<br>
>> -            schedule_delayed_work(&s_job->work_tdr, sched->timeout);<br>
>> +            schedule_delayed_work(&sched->work_tdr, sched->timeout);<br>
>>      spin_unlock(&sched->job_list_lock);<br>
>>   }<br>
>>   <br>
>>   static void drm_sched_job_timedout(struct work_struct *work)<br>
>>   {<br>
>> -    struct drm_sched_job *job = container_of(work, struct drm_sched_job,<br>
>> -                                             <a href="http://work_tdr.work" rel="noreferrer noreferrer noreferrer" target="_blank">work_tdr.work</a>);<br>
>> +    struct drm_gpu_scheduler *sched;<br>
>> +    struct drm_sched_job *job;<br>
>> +<br>
>> +    sched = container_of(work, struct drm_gpu_scheduler, <a href="http://work_tdr.work" rel="noreferrer noreferrer noreferrer" target="_blank">work_tdr.work</a>);<br>
>> +    job = list_first_entry_or_null(&sched->ring_mirror_list,<br>
>> +                                   struct drm_sched_job, node);<br>
>>   <br>
>> -    job->sched->ops->timedout_job(job);<br>
>> +    if (job)<br>
>> +            job->sched->ops->timedout_job(job);<br>
> I don't think this is fully robust. Jobs are only removed from the<br>
> ring_mirror_list once the job_finish worker has run. If execution of<br>
> this worker is delayed for any reason (though it's really unlikely for<br>
> a delay as long as the job timeout to happen) you are blaming the wrong<br>
> job here.<br>
><br>
> So I think what you need to to is find the first job in the ring mirror<br>
> list with an unsignaled finish fence to robustly find the stuck job.<br>
<br>
Yeah, that is a known problem I've pointed out as well.<br>
<br>
The issue is we have bug reports that this happened before the patch, <br>
but I'm not 100% sure how.<br>
<br>
My suggestion is to move a good part of the logic from <br>
drm_sched_hw_job_reset() and drm_sched_job_recovery() into <br>
drm_sched_job_timedout().<br>
<br>
E.g. we first call dma_fence_remove_callback() for each job and actually <br>
check the return value if the fence was already signaled.<br>
<br>
If we find a signaled fence we abort and add the callback back to the <br>
ones where we removed it.<br>
<br>
Nayan do you want to take care of this or should I take a look?<br></blockquote></div></div><div dir="auto">I can take care of it. </div><div dir="auto"><br></div><div dir="auto">Regards,</div><div dir="auto">Nayan</div><div dir="auto"><div class="gmail_quote" dir="auto"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
Regards,<br>
Christian.<br>
<br>
><br>
> Regards,<br>
> Lucas<br>
><br>
>>   }<br>
>>   <br>
>>   /**<br>
>> @@ -315,7 +316,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)<br>
>>      s_job = list_first_entry_or_null(&sched->ring_mirror_list,<br>
>>                                       struct drm_sched_job, node);<br>
>>      if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)<br>
>> -            schedule_delayed_work(&s_job->work_tdr, sched->timeout);<br>
>> +            schedule_delayed_work(&sched->work_tdr, sched->timeout);<br>
>>   <br>
>>      list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {<br>
>>              struct drm_sched_fence *s_fence = s_job->s_fence;<br>
>> @@ -384,7 +385,6 @@ int drm_sched_job_init(struct drm_sched_job *job,<br>
>>   <br>
>>      INIT_WORK(&job->finish_work, drm_sched_job_finish);<br>
>>      INIT_LIST_HEAD(&job->node);<br>
>> -    INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);<br>
>>   <br>
>>      return 0;<br>
>>   }<br>
>> @@ -575,6 +575,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,<br>
>>      INIT_LIST_HEAD(&sched->ring_mirror_list);<br>
>>      spin_lock_init(&sched->job_list_lock);<br>
>>      atomic_set(&sched->hw_rq_count, 0);<br>
>> +    INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);<br>
>>      atomic_set(&sched->num_jobs, 0);<br>
>>      atomic64_set(&sched->job_id_count, 0);<br>
>>   <br>
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h<br>
>> index daec50f887b3..d87b268f1781 100644<br>
>> --- a/include/drm/gpu_scheduler.h<br>
>> +++ b/include/drm/gpu_scheduler.h<br>
>> @@ -175,8 +175,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);<br>
>>    *               finished to remove the job from the<br>
>>    *               @drm_gpu_scheduler.ring_mirror_list.<br>
>>    * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.<br>
>> - * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the timeout<br>
>> - *            interval is over.<br>
>>    * @id: a unique id assigned to each job scheduled on the scheduler.<br>
>>    * @karma: increment on every hang caused by this job. If this exceeds the hang<br>
>>    *         limit of the scheduler then the job is marked guilty and will not<br>
>> @@ -195,7 +193,6 @@ struct drm_sched_job {<br>
>>      struct dma_fence_cb             finish_cb;<br>
>>      struct work_struct              finish_work;<br>
>>      struct list_head                node;<br>
>> -    struct delayed_work             work_tdr;<br>
>>      uint64_t                        id;<br>
>>      atomic_t                        karma;<br>
>>      enum drm_sched_priority         s_priority;<br>
>> @@ -259,6 +256,8 @@ struct drm_sched_backend_ops {<br>
>>    *                 finished.<br>
>>    * @hw_rq_count: the number of jobs currently in the hardware queue.<br>
>>    * @job_id_count: used to assign unique id to the each job.<br>
>> + * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the<br>
>> + *            timeout interval is over.<br>
>>    * @thread: the kthread on which the scheduler which run.<br>
>>    * @ring_mirror_list: the list of jobs which are currently in the job queue.<br>
>>    * @job_list_lock: lock to protect the ring_mirror_list.<br>
>> @@ -278,6 +277,7 @@ struct drm_gpu_scheduler {<br>
>>      wait_queue_head_t               job_scheduled;<br>
>>      atomic_t                        hw_rq_count;<br>
>>      atomic64_t                      job_id_count;<br>
>> +    struct delayed_work             work_tdr;<br>
>>      struct task_struct              *thread;<br>
>>      struct list_head                ring_mirror_list;<br>
>>      spinlock_t                      job_list_lock;<br>
> _______________________________________________<br>
> dri-devel mailing list<br>
> <a href="mailto:dri-devel@lists.freedesktop.org" rel="noreferrer noreferrer" target="_blank">dri-devel@lists.freedesktop.org</a><br>
> <a href="https://lists.freedesktop.org/mailman/listinfo/dri-devel" rel="noreferrer noreferrer noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/dri-devel</a><br>
<br>
</blockquote></div></div></div>