[PATCH 2/5] drm/amdgpu: rework gfx9 queue reset

Alex Deucher alexdeucher at gmail.com
Tue May 20 13:14:26 UTC 2025


On Mon, May 19, 2025 at 7:59 PM Rodrigo Siqueira <siqueira at igalia.com> wrote:
>
> On 05/02, Christian König wrote:
> > Testing this feature showed that it was a bit unstable. The
> > CP_VMID_RESET register takes the VMID from which all submissions
> > should be canceled.
> >
> > Unlike Windows, Linux uses per-process VMIDs instead of per-engine
> > VMIDs for the simple reason that we don't have enough. So resetting
> > one VMID only killed the submissions of one specific process.
> >
> > Fortunately that turned out to be exactly what we want to have.
> >
> > So clear the CP_VMID_RESET register on every context switch between
> > applications when we do the pipeline sync to avoid trouble if multiple
> > VMIDs are used on the ring right behind each other.
> >
> > Use the same pipeline sync function in the reset handler and issue an IB
> > test instead of a ring test after the queue reset to provide a longer
> > timeout and an additional fence value, should there be additional work
> > on the ring after the aborted one.
> >
> > Also drop the soft recovery since that pretty much does the same thing as
> > CP_VMID_RESET, just on a lower level and with less chance of succeeding.
> >
> > This now survives a stress test running over night sending a broken
> > submission every 45 seconds and recovering fine from each of them.
> >
> > Signed-off-by: Christian König <christian.koenig at amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 47 ++++++++++-----------------
> >  2 files changed, 19 insertions(+), 29 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index cc26cf1bd843..c39fe784419b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -278,6 +278,7 @@ extern int amdgpu_user_queue;
> >  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS               3000
> >  #define AMDGPU_MAX_USEC_TIMEOUT                      100000  /* 100 ms */
> >  #define AMDGPU_FENCE_JIFFIES_TIMEOUT         (HZ / 2)
> > +#define AMDGPU_QUEUE_RESET_TIMEOUT           (HZ / 10)
> >  #define AMDGPU_DEBUGFS_MAX_COMPONENTS                32
> >  #define AMDGPUFB_CONN_LIMIT                  4
> >  #define AMDGPU_BIOS_NUM_SCRATCH                      16
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index d377a7c57d5e..92d9a28c62d3 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -5565,7 +5565,17 @@ static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> >       int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
> >       uint32_t seq = ring->fence_drv.sync_seq;
> >       uint64_t addr = ring->fence_drv.gpu_addr;
> > +     struct amdgpu_device *adev = ring->adev;
> >
> > +     amdgpu_ring_emit_reg_wait(ring,
> > +                               SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> > +                               0, 0xffff);
> > +     amdgpu_ring_emit_wreg(ring,
> > +                           SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> > +                           0);
> > +     amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> > +                            ring->fence_drv.sync_seq,
> > +                            AMDGPU_FENCE_FLAG_EXEC);
> >       gfx_v9_0_wait_reg_mem(ring, usepfp, 1, 0,
> >                             lower_32_bits(addr), upper_32_bits(addr),
> >                             seq, 0xffffffff, 4);
> > @@ -5896,20 +5906,6 @@ static void gfx_v9_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
> >                                                          ref, mask);
> >  }
> >
> > -static void gfx_v9_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
> > -{
> > -     struct amdgpu_device *adev = ring->adev;
> > -     uint32_t value = 0;
> > -
> > -     value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
> > -     value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
> > -     value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
> > -     value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
> > -     amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
> > -     WREG32_SOC15(GC, 0, mmSQ_CMD, value);
> > -     amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
> > -}
> > -
> >  static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
> >                                                enum amdgpu_interrupt_state state)
> >  {
> > @@ -7185,16 +7181,12 @@ static int gfx_v9_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
> >       if (r)
> >               return r;
> >
> > -     if (amdgpu_ring_alloc(ring, 7 + 7 + 5))
> > +     if (amdgpu_ring_alloc(ring, 7 + 7 + 5 + 7))
>
> Hi Christian,
>
> What is the meaning of all of the above additions (7 + 7 + 5 + 7)? I see
> it in many different parts of the code. Is this some indication of
> preambles?

It's the number of dwords needed for the operation.  In this case
gfx_v9_0_ring_emit_pipeline_sync() uses 7 + 7 + 5 + 7.  Actually, it
should be 7 (gfx_v9_0_wait_reg_mem()) + 7
(amdgpu_ring_emit_reg_wait()) + 5 (amdgpu_ring_emit_wreg()) + 8
(amdgpu_ring_emit_fence()).  Fixed up locally.

Alex

>
> Thanks
>
> >               return -ENOMEM;
> > -     gfx_v9_0_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> > -                              ring->fence_drv.sync_seq, AMDGPU_FENCE_FLAG_EXEC);
> > -     gfx_v9_0_ring_emit_reg_wait(ring,
> > -                                 SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0, 0xffff);
> > -     gfx_v9_0_ring_emit_wreg(ring,
> > -                             SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0);
> > +     gfx_v9_0_ring_emit_pipeline_sync(ring);
> > +     amdgpu_ring_commit(ring);
> >
> > -     return amdgpu_ring_test_ring(ring);
> > +     return gfx_v9_0_ring_test_ib(ring, AMDGPU_QUEUE_RESET_TIMEOUT);
> >  }
> >
> >  static int gfx_v9_0_reset_kcq(struct amdgpu_ring *ring,
> > @@ -7437,7 +7429,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> >       .set_wptr = gfx_v9_0_ring_set_wptr_gfx,
> >       .emit_frame_size = /* totally 242 maximum if 16 IBs */
> >               5 +  /* COND_EXEC */
> > -             7 +  /* PIPELINE_SYNC */
> > +             7 + 7 + 5 + 7 +  /* PIPELINE_SYNC */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> >               2 + /* VM_FLUSH */
> > @@ -7475,7 +7467,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> >       .emit_wreg = gfx_v9_0_ring_emit_wreg,
> >       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> >       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> > -     .soft_recovery = gfx_v9_0_ring_soft_recovery,
> >       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
> >       .reset = gfx_v9_0_reset_kgq,
> >       .emit_cleaner_shader = gfx_v9_0_ring_emit_cleaner_shader,
> > @@ -7494,7 +7485,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
> >       .set_wptr = amdgpu_sw_ring_set_wptr_gfx,
> >       .emit_frame_size = /* totally 242 maximum if 16 IBs */
> >               5 +  /* COND_EXEC */
> > -             7 +  /* PIPELINE_SYNC */
> > +             7 + 7 + 5 + 7 +  /* PIPELINE_SYNC */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> >               2 + /* VM_FLUSH */
> > @@ -7533,7 +7524,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
> >       .emit_wreg = gfx_v9_0_ring_emit_wreg,
> >       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> >       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> > -     .soft_recovery = gfx_v9_0_ring_soft_recovery,
> >       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
> >       .patch_cntl = gfx_v9_0_ring_patch_cntl,
> >       .patch_de = gfx_v9_0_ring_patch_de_meta,
> > @@ -7555,7 +7545,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> >               20 + /* gfx_v9_0_ring_emit_gds_switch */
> >               7 + /* gfx_v9_0_ring_emit_hdp_flush */
> >               5 + /* hdp invalidate */
> > -             7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> > +             7 + 7 + 5 + 7 +  /* PIPELINE_SYNC */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> >               8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> > @@ -7577,7 +7567,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> >       .emit_wreg = gfx_v9_0_ring_emit_wreg,
> >       .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> >       .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> > -     .soft_recovery = gfx_v9_0_ring_soft_recovery,
> >       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
> >       .emit_wave_limit = gfx_v9_0_emit_wave_limit,
> >       .reset = gfx_v9_0_reset_kcq,
> > @@ -7598,7 +7587,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
> >               20 + /* gfx_v9_0_ring_emit_gds_switch */
> >               7 + /* gfx_v9_0_ring_emit_hdp_flush */
> >               5 + /* hdp invalidate */
> > -             7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> > +             7 + 7 + 5 + 7 +  /* PIPELINE_SYNC */
> >               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> >               SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> >               8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
> > --
> > 2.34.1
> >
>
> --
> Rodrigo Siqueira


More information about the amd-gfx mailing list