[PATCH 2/5] drm/amdgpu: rework gfx9 queue reset
Alex Deucher
alexdeucher at gmail.com
Tue Feb 4 14:56:06 UTC 2025
On Tue, Feb 4, 2025 at 9:48 AM Christian König
<ckoenig.leichtzumerken at gmail.com> wrote:
>
> Testing this feature showed that it was a bit unstable. The
> CP_VMID_RESET register takes the VMID from which all submissions should
> be canceled.
>
> Unlike Windows, Linux uses per-process VMIDs instead of per-engine VMIDs
> for the simple reason that we don't have enough. So resetting one VMID
> only killed the submissions of one specific process.
>
> Fortunately that turned out to be exactly what we want to have.
>
> So clear the CP_VMID_RESET register between every context switch between
> applications when we do the pipeline sync to avoid trouble if multiple
> VMIDs are used on the ring right behind each other.
>
> Use the same pipeline sync function in the reset handler and issue an IB
> test instead of a ring test after the queue reset to provide a longer
> timeout and an additional fence value should there be additional work on
> the ring after the one aborted.
>
> Also drop the soft recovery since that pretty much does the same thing as
> CP_VMID_RESET, just on a lower level and with less chance of succeeding.
>
> This now survives a stress test running overnight, sending a broken
> submission every 45 seconds and recovering fine from each of them.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 47 ++++++++++-----------------
> 2 files changed, 19 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 8902fafbcf8d..1eee2a1bca5a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -275,6 +275,7 @@ extern int amdgpu_wbrf;
> #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS 3000
> #define AMDGPU_MAX_USEC_TIMEOUT 100000 /* 100 ms */
> #define AMDGPU_FENCE_JIFFIES_TIMEOUT (HZ / 2)
> +#define AMDGPU_QUEUE_RESET_TIMEOUT (HZ / 10)
> #define AMDGPU_DEBUGFS_MAX_COMPONENTS 32
> #define AMDGPUFB_CONN_LIMIT 4
> #define AMDGPU_BIOS_NUM_SCRATCH 16
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index fa572b40989e..705f5a9c11a0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -5607,7 +5607,17 @@ static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
> uint32_t seq = ring->fence_drv.sync_seq;
> uint64_t addr = ring->fence_drv.gpu_addr;
> + struct amdgpu_device *adev = ring->adev;
>
> + amdgpu_ring_emit_reg_wait(ring,
> + SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> + 0, 0xffff);
> + amdgpu_ring_emit_wreg(ring,
> + SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> + 0);
> + amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> + ring->fence_drv.sync_seq,
> + AMDGPU_FENCE_FLAG_EXEC);
> gfx_v9_0_wait_reg_mem(ring, usepfp, 1, 0,
> lower_32_bits(addr), upper_32_bits(addr),
> seq, 0xffffffff, 4);
> @@ -5963,20 +5973,6 @@ static void gfx_v9_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
> ref, mask);
> }
>
> -static void gfx_v9_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
> -{
> - struct amdgpu_device *adev = ring->adev;
> - uint32_t value = 0;
> -
> - value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
> - value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
> - value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
> - value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
> - amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
> - WREG32_SOC15(GC, 0, mmSQ_CMD, value);
> - amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
> -}
> -
> static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
> enum amdgpu_interrupt_state state)
> {
> @@ -7252,16 +7248,12 @@ static int gfx_v9_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
> if (r)
> return r;
>
> - if (amdgpu_ring_alloc(ring, 7 + 7 + 5))
> + if (amdgpu_ring_alloc(ring, 7 + 7 + 5 + 7))
> return -ENOMEM;
> - gfx_v9_0_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> - ring->fence_drv.sync_seq, AMDGPU_FENCE_FLAG_EXEC);
> - gfx_v9_0_ring_emit_reg_wait(ring,
> - SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0, 0xffff);
> - gfx_v9_0_ring_emit_wreg(ring,
> - SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0);
> + gfx_v9_0_ring_emit_pipeline_sync(ring);
> + amdgpu_ring_commit(ring);
>
> - return amdgpu_ring_test_ring(ring);
> + return gfx_v9_0_ring_test_ib(ring, AMDGPU_QUEUE_RESET_TIMEOUT);
> }
>
> static int gfx_v9_0_reset_kcq(struct amdgpu_ring *ring,
> @@ -7468,7 +7460,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> .set_wptr = gfx_v9_0_ring_set_wptr_gfx,
> .emit_frame_size = /* totally 242 maximum if 16 IBs */
> 5 + /* COND_EXEC */
> - 7 + /* PIPELINE_SYNC */
> + 7 + 7 + 5 + 7 + /* PIPELINE_SYNC */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> 2 + /* VM_FLUSH */
> @@ -7506,7 +7498,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
> .emit_wreg = gfx_v9_0_ring_emit_wreg,
> .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> - .soft_recovery = gfx_v9_0_ring_soft_recovery,
> .emit_mem_sync = gfx_v9_0_emit_mem_sync,
> .reset = gfx_v9_0_reset_kgq,
> .emit_cleaner_shader = gfx_v9_0_ring_emit_cleaner_shader,
> @@ -7525,7 +7516,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
> .set_wptr = amdgpu_sw_ring_set_wptr_gfx,
> .emit_frame_size = /* totally 242 maximum if 16 IBs */
> 5 + /* COND_EXEC */
> - 7 + /* PIPELINE_SYNC */
> + 7 + 7 + 5 + 7 + /* PIPELINE_SYNC */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> 2 + /* VM_FLUSH */
> @@ -7564,7 +7555,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
> .emit_wreg = gfx_v9_0_ring_emit_wreg,
> .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> - .soft_recovery = gfx_v9_0_ring_soft_recovery,
> .emit_mem_sync = gfx_v9_0_emit_mem_sync,
> .patch_cntl = gfx_v9_0_ring_patch_cntl,
> .patch_de = gfx_v9_0_ring_patch_de_meta,
> @@ -7586,7 +7576,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> 20 + /* gfx_v9_0_ring_emit_gds_switch */
> 7 + /* gfx_v9_0_ring_emit_hdp_flush */
> 5 + /* hdp invalidate */
> - 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> + 7 + 7 + 5 + 7 + /* PIPELINE_SYNC */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> 8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> @@ -7608,7 +7598,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> .emit_wreg = gfx_v9_0_ring_emit_wreg,
> .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> - .soft_recovery = gfx_v9_0_ring_soft_recovery,
Probably want to keep soft_recovery for compute as compute queues
don't support vmid reset.
Alex
> .emit_mem_sync = gfx_v9_0_emit_mem_sync,
> .emit_wave_limit = gfx_v9_0_emit_wave_limit,
> .reset = gfx_v9_0_reset_kcq,
> @@ -7629,7 +7618,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
> 20 + /* gfx_v9_0_ring_emit_gds_switch */
> 7 + /* gfx_v9_0_ring_emit_hdp_flush */
> 5 + /* hdp invalidate */
> - 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> + 7 + 7 + 5 + 7 + /* PIPELINE_SYNC */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
> --
> 2.34.1
>
More information about the amd-gfx
mailing list