[PATCH 5/5] drm/amdgpu: rework gfx10 queue reset
Alex Deucher
alexdeucher at gmail.com
Tue Feb 4 15:03:02 UTC 2025
On Tue, Feb 4, 2025 at 9:57 AM Christian König
<ckoenig.leichtzumerken at gmail.com> wrote:
>
> Apply the same changes to gfx10 as done to gfx9.
>
> The general idea to reset the whole kernel queue and then asking the kiq
> to map it again didn't work at all. Background is that we don't use per
> application kernel queues for gfx10 on Linux for performance reasons.
>
> So instead use the gfx9 approach here as well and only reset all
> submissions from a specific VMID instead of the whole queue.
>
> This also avoids reserving and kmapping the MQD, which are operations
> generally not allowed in the reset handler.
>
> This approach seems to work for at least some time, but is not as
> reliable as it is on gfx9. It will probably need some more work until it
> survives a whole night of reset stress testing.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 65 +++++++-------------------
> 1 file changed, 16 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 5ba263fe5512..7ffdba974f87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -8684,7 +8684,17 @@ static void gfx_v10_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
> uint32_t seq = ring->fence_drv.sync_seq;
> uint64_t addr = ring->fence_drv.gpu_addr;
> + struct amdgpu_device *adev = ring->adev;
>
> + amdgpu_ring_emit_reg_wait(ring,
> + SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> + 0, 0xffff);
> + amdgpu_ring_emit_wreg(ring,
> + SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> + 0);
> + amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> + ring->fence_drv.sync_seq,
> + AMDGPU_FENCE_FLAG_EXEC);
> gfx_v10_0_wait_reg_mem(ring, usepfp, 1, 0, lower_32_bits(addr),
> upper_32_bits(addr), seq, 0xffffffff, 4);
> }
> @@ -8984,21 +8994,6 @@ static void gfx_v10_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
> ref, mask);
> }
>
> -static void gfx_v10_0_ring_soft_recovery(struct amdgpu_ring *ring,
> - unsigned int vmid)
> -{
> - struct amdgpu_device *adev = ring->adev;
> - uint32_t value = 0;
> -
> - value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
> - value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
> - value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
> - value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
> - amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
> - WREG32_SOC15(GC, 0, mmSQ_CMD, value);
> - amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
> -}
> -
> static void
> gfx_v10_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
> uint32_t me, uint32_t pipe,
> @@ -9467,7 +9462,6 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
> struct amdgpu_ring *kiq_ring = &kiq->ring;
> unsigned long flags;
> u32 tmp;
> - u64 addr;
> int r;
>
> if (amdgpu_sriov_vf(adev))
> @@ -9478,27 +9472,14 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
>
> spin_lock_irqsave(&kiq->ring_lock, flags);
>
> - if (amdgpu_ring_alloc(kiq_ring, 5 + 7 + 7 + kiq->pmf->map_queues_size)) {
> + if (amdgpu_ring_alloc(kiq_ring, 5)) {
> spin_unlock_irqrestore(&kiq->ring_lock, flags);
> return -ENOMEM;
> }
>
> - addr = amdgpu_bo_gpu_offset(ring->mqd_obj) +
> - offsetof(struct v10_gfx_mqd, cp_gfx_hqd_active);
> tmp = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 << vmid);
> - if (ring->pipe == 0)
> - tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE0_QUEUES, 1 << ring->queue);
> - else
> - tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE1_QUEUES, 1 << ring->queue);
> -
> gfx_v10_0_ring_emit_wreg(kiq_ring,
> SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), tmp);
> - gfx_v10_0_wait_reg_mem(kiq_ring, 0, 1, 0,
> - lower_32_bits(addr), upper_32_bits(addr),
> - 0, 1, 0x20);
> - gfx_v10_0_ring_emit_reg_wait(kiq_ring,
> - SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0, 0xffffffff);
> - kiq->pmf->kiq_map_queues(kiq_ring, ring);
> amdgpu_ring_commit(kiq_ring);
>
> spin_unlock_irqrestore(&kiq->ring_lock, flags);
> @@ -9507,24 +9488,12 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
> if (r)
> return r;
>
> - r = amdgpu_bo_reserve(ring->mqd_obj, false);
> - if (unlikely(r != 0)) {
> - DRM_ERROR("fail to resv mqd_obj\n");
> - return r;
> - }
> - r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
> - if (!r) {
> - r = gfx_v10_0_kgq_init_queue(ring, true);
> - amdgpu_bo_kunmap(ring->mqd_obj);
> - ring->mqd_ptr = NULL;
> - }
> - amdgpu_bo_unreserve(ring->mqd_obj);
> - if (r) {
> - DRM_ERROR("fail to unresv mqd_obj\n");
> - return r;
> - }
> + if (amdgpu_ring_alloc(ring, 7 + 7 + 5 + 7))
> + return -ENOMEM;
> + gfx_v10_0_ring_emit_pipeline_sync(ring);
> + amdgpu_ring_commit(ring);
>
> - return amdgpu_ring_test_ring(ring);
> + return gfx_v10_0_ring_test_ib(ring, AMDGPU_QUEUE_RESET_TIMEOUT);
> }
>
> static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
> @@ -9819,7 +9788,6 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
> .emit_wreg = gfx_v10_0_ring_emit_wreg,
> .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
> .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
> - .soft_recovery = gfx_v10_0_ring_soft_recovery,
> .emit_mem_sync = gfx_v10_0_emit_mem_sync,
> .reset = gfx_v10_0_reset_kgq,
> .emit_cleaner_shader = gfx_v10_0_ring_emit_cleaner_shader,
> @@ -9860,7 +9828,6 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> .emit_wreg = gfx_v10_0_ring_emit_wreg,
> .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
> .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
> - .soft_recovery = gfx_v10_0_ring_soft_recovery,
And here.
Alex
> .emit_mem_sync = gfx_v10_0_emit_mem_sync,
> .reset = gfx_v10_0_reset_kcq,
> .emit_cleaner_shader = gfx_v10_0_ring_emit_cleaner_shader,
> --
> 2.34.1
>
More information about the amd-gfx
mailing list