[PATCH 02/33] drm/amdgpu/sdma: consolidate engine reset handling

Alex Deucher alexdeucher at gmail.com
Mon Jun 30 13:47:43 UTC 2025


On Mon, Jun 30, 2025 at 4:48 AM Christian König
<christian.koenig at amd.com> wrote:
>
> On 27.06.25 05:39, Alex Deucher wrote:
> > Move the force completion handling into the common
> > engine reset function.  No need to duplicate it for
> > every IP version.
> >
> > Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c |  5 ++++-
> >  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 17 +----------------
> >  drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   |  6 ++----
> >  drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c   |  6 ++----
> >  4 files changed, 9 insertions(+), 25 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > index 7e26a44dcc1fd..56939bb1d1a95 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > @@ -590,9 +590,12 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
> >        * to be submitted to the queues after the reset is complete.
> >        */
> >       if (!ret) {
> > +             amdgpu_fence_driver_force_completion(gfx_ring);
> >               drm_sched_wqueue_start(&gfx_ring->sched);
> > -             if (adev->sdma.has_page_queue)
> > +             if (adev->sdma.has_page_queue) {
> > +                     amdgpu_fence_driver_force_completion(page_ring);
>
> Calling amdgpu_fence_driver_force_completion() here sounds like a really bad idea in the first place.
>
> That will mark all fences as completed, making it impossible to execute the remaining work.

That is the current behavior of the ring reset code, so this patch just
moves the existing code around.  SDMA 5.x and older are a bit special in
that the hardware only supports engine reset (all queues), so an engine
reset loses every queue on the instance.  SDMA resets can also come in
via KFD, so at this point we just mark everything as lost when we reset
the engine.  This could potentially be optimized in the future; I've
done that for SDMA 5.x later in the series.
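
For reference, a rough sketch of what amdgpu_fence_driver_force_completion()
does (simplified; the exact body may differ across trees): it writes the
ring's last emitted sequence number into the fence memory and then runs
fence processing, so every outstanding fence on that ring signals at once:

void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
{
	/* pretend the ring reached its newest emitted fence ... */
	amdgpu_fence_write(ring, ring->fence_drv.sync_seq);
	/* ... and signal everything waiting up to that point */
	amdgpu_fence_process(ring);
}

Since the engine-wide reset wipes all queues on the instance, the pending
work can never complete on the hardware, and force-completing the fences
is what unblocks anything waiting on them before the schedulers restart.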

Alex

>
> Regards,
> Christian.
>
> >                       drm_sched_wqueue_start(&page_ring->sched);
> > +             }
> >       }
> >       mutex_unlock(&sdma_instance->engine_reset_mutex);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> > index d3072bca43e3f..572d105420ec3 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> > @@ -1714,7 +1714,7 @@ static int sdma_v4_4_2_stop_queue(struct amdgpu_ring *ring)
> >  static int sdma_v4_4_2_restore_queue(struct amdgpu_ring *ring)
> >  {
> >       struct amdgpu_device *adev = ring->adev;
> > -     u32 inst_mask, tmp_mask;
> > +     u32 inst_mask;
> >       int i, r;
> >
> >       inst_mask = 1 << ring->me;
> > @@ -1733,21 +1733,6 @@ static int sdma_v4_4_2_restore_queue(struct amdgpu_ring *ring)
> >       }
> >
> >       r = sdma_v4_4_2_inst_start(adev, inst_mask, true);
> > -     if (r)
> > -             return r;
> > -
> > -     tmp_mask = inst_mask;
> > -     for_each_inst(i, tmp_mask) {
> > -             ring = &adev->sdma.instance[i].ring;
> > -
> > -             amdgpu_fence_driver_force_completion(ring);
> > -
> > -             if (adev->sdma.has_page_queue) {
> > -                     struct amdgpu_ring *page = &adev->sdma.instance[i].page;
> > -
> > -                     amdgpu_fence_driver_force_completion(page);
> > -             }
> > -     }
> >
> >       return r;
> >  }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > index 4d72b085b3dd7..ed1706da7deec 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> > @@ -1618,10 +1618,8 @@ static int sdma_v5_0_restore_queue(struct amdgpu_ring *ring)
> >
> >       r = sdma_v5_0_gfx_resume_instance(adev, inst_id, true);
> >       amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
> > -     if (r)
> > -             return r;
> > -     amdgpu_fence_driver_force_completion(ring);
> > -     return 0;
> > +
> > +     return r;
> >  }
> >
> >  static int sdma_v5_0_ring_preempt_ib(struct amdgpu_ring *ring)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
> > index 42a25150f83ac..b87a4b44fa939 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
> > @@ -1534,10 +1534,8 @@ static int sdma_v5_2_restore_queue(struct amdgpu_ring *ring)
> >       r = sdma_v5_2_gfx_resume_instance(adev, inst_id, true);
> >
> >       amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
> > -     if (r)
> > -             return r;
> > -     amdgpu_fence_driver_force_completion(ring);
> > -     return 0;
> > +
> > +     return r;
> >  }
> >
> >  static int sdma_v5_2_ring_preempt_ib(struct amdgpu_ring *ring)
>

