[RFC PATCH 1/1] drm/amdgpu: wait for sched to become ready on job submit

Christian König christian.koenig at amd.com
Mon Feb 24 18:44:05 UTC 2020


I think that the IB tests already set the ready flag to false when 
something goes wrong, but for the ring tests you probably want to 
double check and maybe generalize that.

Christian.

Am 24.02.20 um 19:15 schrieb Das, Nirmoy:
>
> [AMD Official Use Only - Internal Distribution Only]
>
>
> Thanks Christian. I will try to send an updated patch soon.
>
> Get Outlook for Android <https://aka.ms/ghei36>
>
> ------------------------------------------------------------------------
> *From:* Koenig, Christian <Christian.Koenig at amd.com>
> *Sent:* Monday, February 24, 2020, 18:06
> *To:* Nirmoy Das
> *Cc:* amd-gfx at lists.freedesktop.org; Deucher, Alexander; Liu, Monk; 
> Li, Dennis; Das, Nirmoy
> *Subject:* Re: [RFC PATCH 1/1] drm/amdgpu: wait for sched to become 
> ready on job submit
>
> Hi Nirmoy,
>
> Am 24.02.2020 17:48 schrieb Nirmoy Das <nirmoy.aiemd at gmail.com>:
>
>     On reset, amdgpu can temporarily set a drm sched's ready status to
>     false. drm job
>     init will fail if all of the drm scheds are not ready for a HW IP.
>     This patch tries to make
>     the kernel's internal drm job submit handler, amdgpu_job_submit(),
>     a bit more fault tolerant.
>
>
> I don't think that this approach makes sense. Since it is a front end 
> property we should rather stop setting the scheduler ready status to 
> false during reset.
>
> Instead we should only set it to false when the ring/IB test fails and 
> we can't bring the ring back to life again.
>
> Christian.
>
>
>     Signed-off-by: Nirmoy Das <nirmoy.das at amd.com>
>     ---
>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 35 +++++++++++++++++++--
>      drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  5 +--
>      drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  6 ++--
>      drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c |  2 +-
>      drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c |  2 +-
>      drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c |  2 +-
>      drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  2 +-
>      7 files changed, 43 insertions(+), 11 deletions(-)
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>     index d42be880a236..0745df80112f 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>     @@ -139,7 +139,38 @@ void amdgpu_job_free(struct amdgpu_job *job)
>              kfree(job);
>      }
>
>     -int amdgpu_job_submit(struct amdgpu_job *job, struct
>     drm_sched_entity *entity,
>     +static int amdgpu_job_try_init(struct amdgpu_device *adev,
>     +                              struct drm_sched_job *base,
>     +                              struct drm_sched_entity *entity,
>     +                              void *owner)
>     +{
>     +       int r, i;
>     +
>     +       r = drm_sched_job_init(base, entity, owner);
>     +       if (r == -ENOENT) {
>     +               /* retry till we come out of reset phase */
>     +               while (!mutex_trylock(&adev->lock_reset))
>     +                       msleep(10);
>     +               /* retry for a second for the sched to get ready*/
>     +               for (i = 0; i < 100; i++) {
>     +                       msleep(10);
>     +                       r = drm_sched_job_init(base, entity, owner);
>     +                       if (r == -ENOENT)
>     +                               continue;
>     +               }
>     +
>     + mutex_unlock(&adev->lock_reset);
>     +               /* If after all these we failed to initialize a job
>     +                * it means the IP is unrecoverable */
>     +               if (r == -ENOENT)
>     +                       return -ENODEV;
>     +       }
>     +
>     +       return r;
>     +}
>     +
>     +int amdgpu_job_submit(struct amdgpu_device *adev,struct
>     amdgpu_job *job,
>     +                     struct drm_sched_entity *entity,
>                            void *owner, struct dma_fence **f)
>      {
>              enum drm_sched_priority priority;
>     @@ -149,7 +180,7 @@ int amdgpu_job_submit(struct amdgpu_job *job,
>     struct drm_sched_entity *entity,
>              if (!f)
>                      return -EINVAL;
>
>     -       r = drm_sched_job_init(&job->base, entity, owner);
>     +       r = amdgpu_job_try_init(adev, &job->base, entity, owner);
>              if (r)
>                      return r;
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>     index 2e2110dddb76..fed87e96cacc 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>     @@ -70,8 +70,9 @@ int amdgpu_job_alloc_with_ib(struct
>     amdgpu_device *adev, unsigned size,
>
>      void amdgpu_job_free_resources(struct amdgpu_job *job);
>      void amdgpu_job_free(struct amdgpu_job *job);
>     -int amdgpu_job_submit(struct amdgpu_job *job, struct
>     drm_sched_entity *entity,
>     -                     void *owner, struct dma_fence **f);
>     +int amdgpu_job_submit(struct amdgpu_device *adev, struct
>     amdgpu_job *job,
>     +                     struct drm_sched_entity *entity, void *owner,
>     +                     struct dma_fence **f);
>      int amdgpu_job_submit_direct(struct amdgpu_job *job, struct
>     amdgpu_ring *ring,
>                                   struct dma_fence **fence);
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>     index 660867cf2597..adfde07eb75f 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>     @@ -2066,7 +2066,7 @@ static int amdgpu_map_buffer(struct
>     ttm_buffer_object *bo,
>              if (r)
>                      goto error_free;
>
>     -       r = amdgpu_job_submit(job, &adev->mman.entity,
>     +       r = amdgpu_job_submit(adev, job, &adev->mman.entity,
>     AMDGPU_FENCE_OWNER_UNDEFINED, &fence);
>              if (r)
>                      goto error_free;
>     @@ -2137,7 +2137,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring
>     *ring, uint64_t src_offset,
>              if (direct_submit)
>                      r = amdgpu_job_submit_direct(job, ring, fence);
>              else
>     -               r = amdgpu_job_submit(job, &adev->mman.entity,
>     +               r = amdgpu_job_submit(adev, job, &adev->mman.entity,
>     AMDGPU_FENCE_OWNER_UNDEFINED, fence);
>              if (r)
>                      goto error_free;
>     @@ -2231,7 +2231,7 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo,
>
>              amdgpu_ring_pad_ib(ring, &job->ibs[0]);
>              WARN_ON(job->ibs[0].length_dw > num_dw);
>     -       r = amdgpu_job_submit(job, &adev->mman.entity,
>     +       r = amdgpu_job_submit(adev, job, &adev->mman.entity,
>     AMDGPU_FENCE_OWNER_UNDEFINED, fence);
>              if (r)
>                      goto error_free;
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
>     index 5fd32ad1c575..8ff97b24914e 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
>     @@ -1104,7 +1104,7 @@ static int amdgpu_uvd_send_msg(struct
>     amdgpu_ring *ring, struct amdgpu_bo *bo,
>                      if (r)
>                              goto err_free;
>
>     -               r = amdgpu_job_submit(job, &adev->uvd.entity,
>     +               r = amdgpu_job_submit(adev, job, &adev->uvd.entity,
>     AMDGPU_FENCE_OWNER_UNDEFINED, &f);
>                      if (r)
>                              goto err_free;
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
>     index 59ddba137946..e721d3367783 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
>     @@ -554,7 +554,7 @@ static int amdgpu_vce_get_destroy_msg(struct
>     amdgpu_ring *ring, uint32_t handle,
>              if (direct)
>                      r = amdgpu_job_submit_direct(job, ring, &f);
>              else
>     -               r = amdgpu_job_submit(job, &ring->adev->vce.entity,
>     +               r = amdgpu_job_submit(ring->adev, job,
>     &ring->adev->vce.entity,
>     AMDGPU_FENCE_OWNER_UNDEFINED, &f);
>              if (r)
>                      goto err;
>     diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>     index 4cc7881f438c..b536962c22d9 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>     @@ -100,7 +100,7 @@ static int amdgpu_vm_sdma_commit(struct
>     amdgpu_vm_update_params *p,
>              WARN_ON(ib->length_dw == 0);
>              amdgpu_ring_pad_ib(ring, ib);
>              WARN_ON(ib->length_dw > p->num_dw_left);
>     -       r = amdgpu_job_submit(p->job, entity,
>     AMDGPU_FENCE_OWNER_VM, &f);
>     +       r = amdgpu_job_submit(p->adev, p->job, entity,
>     AMDGPU_FENCE_OWNER_VM, &f);
>              if (r)
>                      goto error;
>
>     diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>     b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>     index 9775eca6fe43..a4aaa2a1f878 100644
>     --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>     +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>     @@ -377,7 +377,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct
>     amdgpu_device *adev, uint32_t vmid,
>              job->vm_needs_flush = true;
>     job->ibs->ptr[job->ibs->length_dw++] = ring->funcs->nop;
>              amdgpu_ring_pad_ib(ring, &job->ibs[0]);
>     -       r = amdgpu_job_submit(job, &adev->mman.entity,
>     +       r = amdgpu_job_submit(adev, job, &adev->mman.entity,
>     AMDGPU_FENCE_OWNER_UNDEFINED, &fence);
>              if (r)
>                      goto error_submit;
>     -- 
>     2.25.0
>
>
>

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20200224/4e47c0f3/attachment-0001.htm>


More information about the amd-gfx mailing list