[PATCH 5/5] drm/amdgpu: rework gfx10 queue reset
Alex Deucher
alexdeucher at gmail.com
Tue Feb 4 15:03:02 UTC 2025
On Tue, Feb 4, 2025 at 9:57 AM Christian König
<ckoenig.leichtzumerken at gmail.com> wrote:
>
> Apply the same changes to gfx10 as done to gfx9.
>
> The general idea to reset the whole kernel queue and then asking the kiq
> to map it again didn't work at all. Background is that we don't use per
> application kernel queues for gfx10 on Linux for performance reasons.
>
> So instead use the gfx9 approach here as well and only reset all
> submissions from a specific VMID instead of the whole queue.
>
> This also avoids reserving and kmapping the MQD, which are operations
> generally not allowed in the reset handler.
>
> This approach seems to work for at least some time, but is not as
> reliable as it is on gfx9. It will probably need some more work until it
> survives a whole night of reset stress testing.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 65 +++++++-------------------
> 1 file changed, 16 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 5ba263fe5512..7ffdba974f87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -8684,7 +8684,17 @@ static void gfx_v10_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
> uint32_t seq = ring->fence_drv.sync_seq;
> uint64_t addr = ring->fence_drv.gpu_addr;
> + struct amdgpu_device *adev = ring->adev;
>
> + amdgpu_ring_emit_reg_wait(ring,
> + SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> + 0, 0xffff);
> + amdgpu_ring_emit_wreg(ring,
> + SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> + 0);
> + amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> + ring->fence_drv.sync_seq,
> + AMDGPU_FENCE_FLAG_EXEC);
> gfx_v10_0_wait_reg_mem(ring, usepfp, 1, 0, lower_32_bits(addr),
> upper_32_bits(addr), seq, 0xffffffff, 4);
> }
> @@ -8984,21 +8994,6 @@ static void gfx_v10_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
> ref, mask);
> }
>
> -static void gfx_v10_0_ring_soft_recovery(struct amdgpu_ring *ring,
> - unsigned int vmid)
> -{
> - struct amdgpu_device *adev = ring->adev;
> - uint32_t value = 0;
> -
> - value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
> - value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
> - value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
> - value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
> - amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
> - WREG32_SOC15(GC, 0, mmSQ_CMD, value);
> - amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
> -}
> -
> static void
> gfx_v10_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
> uint32_t me, uint32_t pipe,
> @@ -9467,7 +9462,6 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
> struct amdgpu_ring *kiq_ring = &kiq->ring;
> unsigned long flags;
> u32 tmp;
> - u64 addr;
> int r;
>
> if (amdgpu_sriov_vf(adev))
> @@ -9478,27 +9472,14 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
>
> spin_lock_irqsave(&kiq->ring_lock, flags);
>
> - if (amdgpu_ring_alloc(kiq_ring, 5 + 7 + 7 + kiq->pmf->map_queues_size)) {
> + if (amdgpu_ring_alloc(kiq_ring, 5)) {
> spin_unlock_irqrestore(&kiq->ring_lock, flags);
> return -ENOMEM;
> }
>
> - addr = amdgpu_bo_gpu_offset(ring->mqd_obj) +
> - offsetof(struct v10_gfx_mqd, cp_gfx_hqd_active);
> tmp = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 << vmid);
> - if (ring->pipe == 0)
> - tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE0_QUEUES, 1 << ring->queue);
> - else
> - tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE1_QUEUES, 1 << ring->queue);
> -
> gfx_v10_0_ring_emit_wreg(kiq_ring,
> SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), tmp);
> - gfx_v10_0_wait_reg_mem(kiq_ring, 0, 1, 0,
> - lower_32_bits(addr), upper_32_bits(addr),
> - 0, 1, 0x20);
> - gfx_v10_0_ring_emit_reg_wait(kiq_ring,
> - SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0, 0xffffffff);
> - kiq->pmf->kiq_map_queues(kiq_ring, ring);
> amdgpu_ring_commit(kiq_ring);
>
> spin_unlock_irqrestore(&kiq->ring_lock, flags);
> @@ -9507,24 +9488,12 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
> if (r)
> return r;
>
> - r = amdgpu_bo_reserve(ring->mqd_obj, false);
> - if (unlikely(r != 0)) {
> - DRM_ERROR("fail to resv mqd_obj\n");
> - return r;
> - }
> - r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
> - if (!r) {
> - r = gfx_v10_0_kgq_init_queue(ring, true);
> - amdgpu_bo_kunmap(ring->mqd_obj);
> - ring->mqd_ptr = NULL;
> - }
> - amdgpu_bo_unreserve(ring->mqd_obj);
> - if (r) {
> - DRM_ERROR("fail to unresv mqd_obj\n");
> - return r;
> - }
> + if (amdgpu_ring_alloc(ring, 7 + 7 + 5 + 7))
> + return -ENOMEM;
> + gfx_v10_0_ring_emit_pipeline_sync(ring);
> + amdgpu_ring_commit(ring);
>
> - return amdgpu_ring_test_ring(ring);
> + return gfx_v10_0_ring_test_ib(ring, AMDGPU_QUEUE_RESET_TIMEOUT);
> }
>
> static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
> @@ -9819,7 +9788,6 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
> .emit_wreg = gfx_v10_0_ring_emit_wreg,
> .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
> .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
> - .soft_recovery = gfx_v10_0_ring_soft_recovery,
> .emit_mem_sync = gfx_v10_0_emit_mem_sync,
> .reset = gfx_v10_0_reset_kgq,
> .emit_cleaner_shader = gfx_v10_0_ring_emit_cleaner_shader,
> @@ -9860,7 +9828,6 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> .emit_wreg = gfx_v10_0_ring_emit_wreg,
> .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
> .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
> - .soft_recovery = gfx_v10_0_ring_soft_recovery,
And here.
Alex
> .emit_mem_sync = gfx_v10_0_emit_mem_sync,
> .reset = gfx_v10_0_reset_kcq,
> .emit_cleaner_shader = gfx_v10_0_ring_emit_cleaner_shader,
> --
> 2.34.1
>
More information about the amd-gfx
mailing list