[PATCH v3] drm/scheduler re-insert Bailing job to avoid memleak

Thu Mar 25 16:32:25 UTC 2021

There are a few issues here like - how u handle non guilty singnaled jobs
in drm_sched_stop, currently looks like you don't call put for them
and just explicitly free them as before. Also sched->free_guilty
seems useless with the new approach. Do we even need the cleanup
mechanism at drm_sched_get_cleanup_job with this approach...

But first - We need Christian to express his opinion on this since
I think he opposed refcounting jobs and that we should concentrate on
fences instead.

Christian - can you chime in here ?

Andrey

On 2021-03-25 5:51 a.m., Zhang, Jack (Jian) wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> 
> Hi, Andrey
> 
> Thank you for your good opinions.
> 
> I literally agree with you that the refcount could solve the 
> get_clean_up_up cocurrent job gracefully, and no need to re-insert the
> 
> job back anymore.
> 
> I quickly made a draft for this idea as follows:
> 
> How do you like it? I will start implement to it after I got your 
> acknowledge.
> 
> Thanks,
> 
> Jack
> 
> +void drm_job_get(struct drm_sched_job *s_job)
> 
> +{
> 
> +       kref_get(&s_job->refcount);
> 
> +}
> 
> +
> 
> +void drm_job_do_release(struct kref *ref)
> 
> +{
> 
> +       struct drm_sched_job *s_job;
> 
> +       struct drm_gpu_scheduler *sched;
> 
> +
> 
> +       s_job = container_of(ref, struct drm_sched_job, refcount);
> 
> +       sched = s_job->sched;
> 
> +       sched->ops->free_job(s_job);
> 
> +}
> 
> +
> 
> +void drm_job_put(struct drm_sched_job *s_job)
> 
> +{
> 
> +       kref_put(&s_job->refcount, drm_job_do_release);
> 
> +}
> 
> +
> 
> static void drm_sched_job_begin(struct drm_sched_job *s_job)
> 
> {
> 
>          struct drm_gpu_scheduler *sched = s_job->sched;
> 
> +       kref_init(&s_job->refcount);
> 
> +       drm_job_get(s_job);
> 
>          spin_lock(&sched->job_list_lock);
> 
>          list_add_tail(&s_job->node, &sched->ring_mirror_list);
> 
>          drm_sched_start_timeout(sched);
> 
> @@ -294,17 +316,16 @@ static void drm_sched_job_timedout(struct 
> work_struct *work)
> 
>                   * drm_sched_cleanup_jobs. It will be reinserted back 
> after sched->thread
> 
>                   * is parked at which point it's safe.
> 
>                   */
> 
> -               list_del_init(&job->node);
> 
> +               drm_job_get(job);
> 
>                  spin_unlock(&sched->job_list_lock);
> 
>                  job->sched->ops->timedout_job(job);
> 
> -
> 
> +               drm_job_put(job);
> 
>                  /*
> 
>                   * Guilty job did complete and hence needs to be 
> manually removed
> 
>                   * See drm_sched_stop doc.
> 
>                   */
> 
>                  if (sched->free_guilty) {
> 
> -                       job->sched->ops->free_job(job);
> 
>                          sched->free_guilty = false;
> 
>                  }
> 
>          } else {
> 
> @@ -355,20 +376,6 @@ void drm_sched_stop(struct drm_gpu_scheduler 
> *sched, struct drm_sched_job *bad)
> 
> -       /*
> 
> -        * Reinsert back the bad job here - now it's safe as
> 
> -        * drm_sched_get_cleanup_job cannot race against us and release the
> 
> -        * bad job at this point - we parked (waited for) any in progress
> 
> -        * (earlier) cleanups and drm_sched_get_cleanup_job will not be 
> called
> 
> -        * now until the scheduler thread is unparked.
> 
> -        */
> 
> -       if (bad && bad->sched == sched)
> 
> -               /*
> 
> -                * Add at the head of the queue to reflect it was the 
> earliest
> 
> -                * job extracted.
> 
> -                */
> 
> -               list_add(&bad->node, &sched->ring_mirror_list);
> 
> -
> 
>          /*
> 
>           * Iterate the job list from later to  earlier one and either 
> deactive
> 
>           * their HW callbacks or remove them from mirror list if they 
> already
> 
> @@ -774,7 +781,7 @@ static int drm_sched_main(void *param)
> 
>                                           kthread_should_stop());
> 
>                  if (cleanup_job) {
> 
> -                       sched->ops->free_job(cleanup_job);
> 
> +                       drm_job_put(cleanup_job);
> 
>                          /* queue timeout for next job */
> 
>                          drm_sched_start_timeout(sched);
> 
>                  }
> 
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> 
> index 5a1f068af1c2..b80513eec90f 100644
> 
> --- a/include/drm/gpu_scheduler.h
> 
> +++ b/include/drm/gpu_scheduler.h
> 
> @@ -188,6 +188,7 @@ struct drm_sched_fence *to_drm_sched_fence(struct 
> dma_fence *f);
> 
>    * to schedule the job.
> 
>    */
> 
> struct drm_sched_job {
> 
> +       struct kref                     refcount;
> 
>          struct spsc_node                queue_node;
> 
>          struct drm_gpu_scheduler        *sched;
> 
>          struct drm_sched_fence          *s_fence;
> 
> @@ -198,6 +199,7 @@ struct drm_sched_job {
> 
>          enum drm_sched_priority         s_priority;
> 
>          struct drm_sched_entity  *entity;
> 
>          struct dma_fence_cb             cb;
> 
> +
> 
> };
> 
> *From:* Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>
> *Sent:* Friday, March 19, 2021 12:17 AM
> *To:* Zhang, Jack (Jian) <Jack.Zhang1 at amd.com>; Christian König 
> <ckoenig.leichtzumerken at gmail.com>; dri-devel at lists.freedesktop.org; 
> amd-gfx at lists.freedesktop.org; Koenig, Christian 
> <Christian.Koenig at amd.com>; Liu, Monk <Monk.Liu at amd.com>; Deng, Emily 
> <Emily.Deng at amd.com>; Rob Herring <robh at kernel.org>; Tomeu Vizoso 
> <tomeu.vizoso at collabora.com>; Steven Price <steven.price at arm.com>
> *Subject:* Re: [PATCH v3] drm/scheduler re-insert Bailing job to avoid 
> memleak
> 
> On 2021-03-18 6:41 a.m., Zhang, Jack (Jian) wrote:
> 
>     [AMD Official Use Only - Internal Distribution Only]
> 
>     Hi, Andrey
> 
>     Let me summarize the background of this patch:
> 
>     In TDR resubmit step “amdgpu_device_recheck_guilty_jobs,
> 
>     It will submit first jobs of each ring and do guilty job re-check.
> 
>     At that point, We had to make sure each job is in the mirror list(or
>     re-inserted back already).
> 
>     But we found the current code never re-insert the job to mirror list
>     in the 2^nd , 3^rd job_timeout thread(Bailing TDR thread).
> 
>     This not only will cause memleak of the bailing jobs. What’s more
>     important, the 1^st tdr thread can never iterate the bailing job and
>     set its guilty status to a correct status.
> 
>     Therefore, we had to re-insert the job(or even not delete node) for
>     bailing job.
> 
>     For the above V3 patch, the racing condition in my mind is:
> 
>     we cannot make sure all bailing jobs are finished before we do
>     amdgpu_device_recheck_guilty_jobs.
> 
> Yes,that race i missed - so you say that for 2nd, baling thread who 
> extracted the job, even if he reinsert it right away back after driver 
> callback return DRM_GPU_SCHED_STAT_BAILING, there is small time slot 
> where the job is not in mirror list and so the 1st TDR might miss it and 
> not find that  2nd job is the actual guilty job, right ? But, still this 
> job will get back into mirror list, and since it's really the bad job, 
> it will never signal completion and so on the next timeout cycle it will 
> be caught (of course there is a starvation scenario here if more TDRs 
> kick in and it bails out again but this is really unlikely).
> 
>     Based on this insight, I think we have two options to solve this issue:
> 
>      1. Skip delete node in tdr thread2, thread3, 4 … (using mutex or
>         atomic variable)
>      2. Re-insert back bailing job, and meanwhile use semaphore in each
>         tdr thread to keep the sequence as expected and ensure each job
>         is in the mirror list when do resubmit step.
> 
>     For Option1, logic is simpler and we need only one global atomic
>     variable:
> 
>     What do you think about this plan?
> 
>     Option1 should look like the following logic:
> 
>     +static atomic_t in_reset;             //a global atomic var for
>     synchronization
> 
>     static void drm_sched_process_job(struct dma_fence *f, struct
>     dma_fence_cb *cb);
> 
>       /**
> 
>     @@ -295,6 +296,12 @@ static void drm_sched_job_timedout(struct
>     work_struct *work)
> 
>                       * drm_sched_cleanup_jobs. It will be reinserted
>     back after sched->thread
> 
>                       * is parked at which point it's safe.
> 
>                       */
> 
>     +               if (atomic_cmpxchg(&in_reset, 0, 1) != 0) {  //skip
>     delete node if it’s thead1,2,3,….
> 
>     +                       spin_unlock(&sched->job_list_lock);
> 
>     +                       drm_sched_start_timeout(sched);
> 
>     +                       return;
> 
>     +               }
> 
>     +
> 
>                      list_del_init(&job->node);
> 
>                      spin_unlock(&sched->job_list_lock);
> 
>     @@ -320,6 +327,7 @@ static void drm_sched_job_timedout(struct
>     work_struct *work)
> 
>              spin_lock(&sched->job_list_lock);
> 
>              drm_sched_start_timeout(sched);
> 
>              spin_unlock(&sched->job_list_lock);
> 
>     +       atomic_set(&in_reset, 0); //reset in_reset when the first
>     thread finished tdr
> 
>     }
> 
> Technically looks like it should work as you don't access the job 
> pointer any longer and so no risk that if signaled it will be freed by 
> drm_sched_get_cleanup_job but,you can't just use one global variable an 
> by this bailing from TDR when different drivers run their TDR threads in 
> parallel, and even for amdgpu, if devices in different XGMI hives or 2 
> independent devices in non XGMI setup. There should be defined some kind 
> of GPU reset group structure on drm_scheduler level for which this 
> variable would be used.
> 
> P.S I wonder why we can't just ref-count the job so that even if 
> drm_sched_get_cleanup_job would delete it before we had a chance to stop 
> the scheduler thread, we wouldn't crash. This would avoid all the dance 
> with deletion and reinsertion.
> 
> Andrey
> 
>     Thanks,
> 
>     Jack
> 
>     *From:* amd-gfx <amd-gfx-bounces at lists.freedesktop.org>
>     <mailto:amd-gfx-bounces at lists.freedesktop.org> *On Behalf Of *Zhang,
>     Jack (Jian)
>     *Sent:* Wednesday, March 17, 2021 11:11 PM
>     *To:* Christian König <ckoenig.leichtzumerken at gmail.com>
>     <mailto:ckoenig.leichtzumerken at gmail.com>;
>     dri-devel at lists.freedesktop.org
>     <mailto:dri-devel at lists.freedesktop.org>;
>     amd-gfx at lists.freedesktop.org
>     <mailto:amd-gfx at lists.freedesktop.org>; Koenig, Christian
>     <Christian.Koenig at amd.com> <mailto:Christian.Koenig at amd.com>; Liu,
>     Monk <Monk.Liu at amd.com> <mailto:Monk.Liu at amd.com>; Deng, Emily
>     <Emily.Deng at amd.com> <mailto:Emily.Deng at amd.com>; Rob Herring
>     <robh at kernel.org> <mailto:robh at kernel.org>; Tomeu Vizoso
>     <tomeu.vizoso at collabora.com> <mailto:tomeu.vizoso at collabora.com>;
>     Steven Price <steven.price at arm.com> <mailto:steven.price at arm.com>;
>     Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>
>     <mailto:Andrey.Grodzovsky at amd.com>
>     *Subject:* Re: [PATCH v3] drm/scheduler re-insert Bailing job to
>     avoid memleak
> 
>     [AMD Official Use Only - Internal Distribution Only]
> 
>     [AMD Official Use Only - Internal Distribution Only]
> 
>     Hi，Andrey，
> 
>     Good catch，I will expore this corner case and give feedback soon～
> 
>     Best，
> 
>     Jack
> 
>     ------------------------------------------------------------------------
> 
>     *From:*Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com
>     <mailto:Andrey.Grodzovsky at amd.com>>
>     *Sent:* Wednesday, March 17, 2021 10:50:59 PM
>     *To:* Christian König <ckoenig.leichtzumerken at gmail.com
>     <mailto:ckoenig.leichtzumerken at gmail.com>>; Zhang, Jack (Jian)
>     <Jack.Zhang1 at amd.com <mailto:Jack.Zhang1 at amd.com>>;
>     dri-devel at lists.freedesktop.org
>     <mailto:dri-devel at lists.freedesktop.org>
>     <dri-devel at lists.freedesktop.org
>     <mailto:dri-devel at lists.freedesktop.org>>;
>     amd-gfx at lists.freedesktop.org <mailto:amd-gfx at lists.freedesktop.org>
>     <amd-gfx at lists.freedesktop.org
>     <mailto:amd-gfx at lists.freedesktop.org>>; Koenig, Christian
>     <Christian.Koenig at amd.com <mailto:Christian.Koenig at amd.com>>; Liu,
>     Monk <Monk.Liu at amd.com <mailto:Monk.Liu at amd.com>>; Deng, Emily
>     <Emily.Deng at amd.com <mailto:Emily.Deng at amd.com>>; Rob Herring
>     <robh at kernel.org <mailto:robh at kernel.org>>; Tomeu Vizoso
>     <tomeu.vizoso at collabora.com <mailto:tomeu.vizoso at collabora.com>>;
>     Steven Price <steven.price at arm.com <mailto:steven.price at arm.com>>
>     *Subject:* Re: [PATCH v3] drm/scheduler re-insert Bailing job to
>     avoid memleak
> 
>     I actually have a race condition concern here - see bellow -
> 
>     On 2021-03-17 3:43 a.m., Christian König wrote:
>      > I was hoping Andrey would take a look since I'm really busy with
>     other
>      > work right now.
>      >
>      > Regards,
>      > Christian.
>      >
>      > Am 17.03.21 um 07:46 schrieb Zhang, Jack (Jian):
>      >> Hi, Andrey/Crhistian and Team,
>      >>
>      >> I didn't receive the reviewer's message from maintainers on
>     panfrost
>      >> driver for several days.
>      >> Due to this patch is urgent for my current working project.
>      >> Would you please help to give some review ideas?
>      >>
>      >> Many Thanks,
>      >> Jack
>      >> -----Original Message-----
>      >> From: Zhang, Jack (Jian)
>      >> Sent: Tuesday, March 16, 2021 3:20 PM
>      >> To: dri-devel at lists.freedesktop.org
>     <mailto:dri-devel at lists.freedesktop.org>;
>     amd-gfx at lists.freedesktop.org <mailto:amd-gfx at lists.freedesktop.org>;
>      >> Koenig, Christian <Christian.Koenig at amd.com
>     <mailto:Christian.Koenig at amd.com>>; Grodzovsky, Andrey
>      >> <Andrey.Grodzovsky at amd.com <mailto:Andrey.Grodzovsky at amd.com>>;
>     Liu, Monk <Monk.Liu at amd.com <mailto:Monk.Liu at amd.com>>; Deng,
>      >> Emily <Emily.Deng at amd.com <mailto:Emily.Deng at amd.com>>; Rob
>     Herring <robh at kernel.org <mailto:robh at kernel.org>>; Tomeu
>      >> Vizoso <tomeu.vizoso at collabora.com
>     <mailto:tomeu.vizoso at collabora.com>>; Steven Price
>     <steven.price at arm.com <mailto:steven.price at arm.com>>
>      >> Subject: RE: [PATCH v3] drm/scheduler re-insert Bailing job to
>     avoid
>      >> memleak
>      >>
>      >> [AMD Public Use]
>      >>
>      >> Ping
>      >>
>      >> -----Original Message-----
>      >> From: Zhang, Jack (Jian)
>      >> Sent: Monday, March 15, 2021 1:24 PM
>      >> To: Jack Zhang <Jack.Zhang1 at amd.com <mailto:Jack.Zhang1 at amd.com>>;
>      >> dri-devel at lists.freedesktop.org
>     <mailto:dri-devel at lists.freedesktop.org>;
>     amd-gfx at lists.freedesktop.org <mailto:amd-gfx at lists.freedesktop.org>;
>      >> Koenig, Christian <Christian.Koenig at amd.com
>     <mailto:Christian.Koenig at amd.com>>; Grodzovsky, Andrey
>      >> <Andrey.Grodzovsky at amd.com <mailto:Andrey.Grodzovsky at amd.com>>;
>     Liu, Monk <Monk.Liu at amd.com <mailto:Monk.Liu at amd.com>>; Deng,
>      >> Emily <Emily.Deng at amd.com <mailto:Emily.Deng at amd.com>>; Rob
>     Herring <robh at kernel.org <mailto:robh at kernel.org>>; Tomeu
>      >> Vizoso <tomeu.vizoso at collabora.com
>     <mailto:tomeu.vizoso at collabora.com>>; Steven Price
>     <steven.price at arm.com <mailto:steven.price at arm.com>>
>      >> Subject: RE: [PATCH v3] drm/scheduler re-insert Bailing job to
>     avoid
>      >> memleak
>      >>
>      >> [AMD Public Use]
>      >>
>      >> Hi, Rob/Tomeu/Steven,
>      >>
>      >> Would you please help to review this patch for panfrost driver?
>      >>
>      >> Thanks,
>      >> Jack Zhang
>      >>
>      >> -----Original Message-----
>      >> From: Jack Zhang <Jack.Zhang1 at amd.com <mailto:Jack.Zhang1 at amd.com>>
>      >> Sent: Monday, March 15, 2021 1:21 PM
>      >> To: dri-devel at lists.freedesktop.org
>     <mailto:dri-devel at lists.freedesktop.org>;
>     amd-gfx at lists.freedesktop.org <mailto:amd-gfx at lists.freedesktop.org>;
>      >> Koenig, Christian <Christian.Koenig at amd.com
>     <mailto:Christian.Koenig at amd.com>>; Grodzovsky, Andrey
>      >> <Andrey.Grodzovsky at amd.com <mailto:Andrey.Grodzovsky at amd.com>>;
>     Liu, Monk <Monk.Liu at amd.com <mailto:Monk.Liu at amd.com>>; Deng,
>      >> Emily <Emily.Deng at amd.com <mailto:Emily.Deng at amd.com>>
>      >> Cc: Zhang, Jack (Jian) <Jack.Zhang1 at amd.com
>     <mailto:Jack.Zhang1 at amd.com>>
>      >> Subject: [PATCH v3] drm/scheduler re-insert Bailing job to avoid
>     memleak
>      >>
>      >> re-insert Bailing jobs to avoid memory leak.
>      >>
>      >> V2: move re-insert step to drm/scheduler logic
>      >> V3: add panfrost's return value for bailing jobs in case it hits
>     the
>      >> memleak issue.
>      >>
>      >> Signed-off-by: Jack Zhang <Jack.Zhang1 at amd.com
>     <mailto:Jack.Zhang1 at amd.com>>
>      >> ---
>      >>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++-
>      >>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 8 ++++++--
>      >>   drivers/gpu/drm/panfrost/panfrost_job.c    | 4 ++--
>      >>   drivers/gpu/drm/scheduler/sched_main.c     | 8 +++++++-
>      >>   include/drm/gpu_scheduler.h                | 1 +
>      >>   5 files changed, 19 insertions(+), 6 deletions(-)
>      >>
>      >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>      >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>      >> index 79b9cc73763f..86463b0f936e 100644
>      >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>      >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>      >> @@ -4815,8 +4815,10 @@ int amdgpu_device_gpu_recover(struct
>      >> amdgpu_device *adev,
>      >>                       job ? job->base.id : -1);
>      >>             /* even we skipped this reset, still need to set the
>     job
>      >> to guilty */
>      >> -        if (job)
>      >> +        if (job) {
>      >>               drm_sched_increase_karma(&job->base);
>      >> +            r = DRM_GPU_SCHED_STAT_BAILING;
>      >> +        }
>      >>           goto skip_recovery;
>      >>       }
>      >>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>      >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>      >> index 759b34799221..41390bdacd9e 100644
>      >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>      >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>      >> @@ -34,6 +34,7 @@ static enum drm_gpu_sched_stat
>      >> amdgpu_job_timedout(struct drm_sched_job *s_job)
>      >>       struct amdgpu_job *job = to_amdgpu_job(s_job);
>      >>       struct amdgpu_task_info ti;
>      >>       struct amdgpu_device *adev = ring->adev;
>      >> +    int ret;
>      >>         memset(&ti, 0, sizeof(struct amdgpu_task_info));
>      >>   @@ -52,8 +53,11 @@ static enum drm_gpu_sched_stat
>      >> amdgpu_job_timedout(struct drm_sched_job *s_job)
>      >>             ti.process_name, ti.tgid, ti.task_name, ti.pid);
>      >>         if (amdgpu_device_should_recover_gpu(ring->adev)) {
>      >> -        amdgpu_device_gpu_recover(ring->adev, job);
>      >> -        return DRM_GPU_SCHED_STAT_NOMINAL;
>      >> +        ret = amdgpu_device_gpu_recover(ring->adev, job);
>      >> +        if (ret == DRM_GPU_SCHED_STAT_BAILING)
>      >> +            return DRM_GPU_SCHED_STAT_BAILING;
>      >> +        else
>      >> +            return DRM_GPU_SCHED_STAT_NOMINAL;
>      >>       } else {
>      >>           drm_sched_suspend_timeout(&ring->sched);
>      >>           if (amdgpu_sriov_vf(adev))
>      >> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c
>      >> b/drivers/gpu/drm/panfrost/panfrost_job.c
>      >> index 6003cfeb1322..e2cb4f32dae1 100644
>      >> --- a/drivers/gpu/drm/panfrost/panfrost_job.c
>      >> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
>      >> @@ -444,7 +444,7 @@ static enum drm_gpu_sched_stat
>      >> panfrost_job_timedout(struct drm_sched_job
>      >>        * spurious. Bail out.
>      >>        */
>      >>       if (dma_fence_is_signaled(job->done_fence))
>      >> -        return DRM_GPU_SCHED_STAT_NOMINAL;
>      >> +        return DRM_GPU_SCHED_STAT_BAILING;
>      >>         dev_err(pfdev->dev, "gpu sched timeout, js=%d, config=0x%x,
>      >> status=0x%x, head=0x%x, tail=0x%x, sched_job=%p",
>      >>           js,
>      >> @@ -456,7 +456,7 @@ static enum drm_gpu_sched_stat
>      >> panfrost_job_timedout(struct drm_sched_job
>      >>         /* Scheduler is already stopped, nothing to do. */
>      >>       if (!panfrost_scheduler_stop(&pfdev->js->queue[js],
>     sched_job))
>      >> -        return DRM_GPU_SCHED_STAT_NOMINAL;
>      >> +        return DRM_GPU_SCHED_STAT_BAILING;
>      >>         /* Schedule a reset if there's no reset in progress. */
>      >>       if (!atomic_xchg(&pfdev->reset.pending, 1)) diff --git
>      >> a/drivers/gpu/drm/scheduler/sched_main.c
>      >> b/drivers/gpu/drm/scheduler/sched_main.c
>      >> index 92d8de24d0a1..a44f621fb5c4 100644
>      >> --- a/drivers/gpu/drm/scheduler/sched_main.c
>      >> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>      >> @@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct
>      >> work_struct *work)  {
>      >>       struct drm_gpu_scheduler *sched;
>      >>       struct drm_sched_job *job;
>      >> +    int ret;
>      >>         sched = container_of(work, struct drm_gpu_scheduler,
>      >> work_tdr.work);
>      >>   @@ -331,8 +332,13 @@ static void drm_sched_job_timedout(struct
>      >> work_struct *work)
>      >>           list_del_init(&job->list);
>      >>           spin_unlock(&sched->job_list_lock);
>      >>   -        job->sched->ops->timedout_job(job);
>      >> +        ret = job->sched->ops->timedout_job(job);
>      >>   +        if (ret == DRM_GPU_SCHED_STAT_BAILING) {
>      >> +            spin_lock(&sched->job_list_lock);
>      >> +            list_add(&job->node, &sched->ring_mirror_list);
>      >> +            spin_unlock(&sched->job_list_lock);
>      >> +        }
> 
> 
>     At this point we don't hold GPU reset locks anymore, and so we could
>     be racing against another TDR thread from another scheduler ring of
>     same
>     device
>     or another XGMI hive member. The other thread might be in the middle of
>     luckless
>     iteration of mirror list (drm_sched_stop, drm_sched_start and
>     drm_sched_resubmit)
>     and so locking job_list_lock will not help. Looks like it's required to
>     take all GPU rest locks
>     here.
> 
>     Andrey
> 
> 
>      >>           /*
>      >>            * Guilty job did complete and hence needs to be manually
>      >> removed
>      >>            * See drm_sched_stop doc.
>      >> diff --git a/include/drm/gpu_scheduler.h
>      >> b/include/drm/gpu_scheduler.h index 4ea8606d91fe..8093ac2427ef
>     100644
>      >> --- a/include/drm/gpu_scheduler.h
>      >> +++ b/include/drm/gpu_scheduler.h
>      >> @@ -210,6 +210,7 @@ enum drm_gpu_sched_stat {
>      >>       DRM_GPU_SCHED_STAT_NONE, /* Reserve 0 */
>      >>       DRM_GPU_SCHED_STAT_NOMINAL,
>      >>       DRM_GPU_SCHED_STAT_ENODEV,
>      >> +    DRM_GPU_SCHED_STAT_BAILING,
>      >>   };
>      >>     /**
>      >> --
>      >> 2.25.1
>      >> _______________________________________________
>      >> amd-gfx mailing list
>      >> amd-gfx at lists.freedesktop.org <mailto:amd-gfx at lists.freedesktop.org>
>      >>
>     https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7CAndrey.Grodzovsky%40amd.com%7Ce90f30af0f43444c6aea08d8e91860c4%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637515638213180413%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=NnLqtz%2BZ8%2BweYwCqRinrfkqmhzibNAF6CYSdVqL6xi0%3D&reserved=0
>     <https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7CJack.Zhang1%40amd.com%7C95b2ff206ee74bbe520a08d8e956f5dd%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637515907000888939%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C2000&sdata=BGoSfOYiDar8SrpMx%2BsOMWpaMr87bxB%2F9ycu0FhhipA%3D&reserved=0>
> 
>      >>
>      >
>