[PATCH 2/5] drm/amdgpu: rework gfx9 queue reset

Rodrigo Siqueira siqueira at igalia.com
Mon May 19 23:50:46 UTC 2025


On 05/02, Christian König wrote:
> Testing this feature showed that it was a bit unstable. The
> CP_VMID_RESET register takes the VMID from which all submissions
> should be canceled.
> 
> Unlike Windows, Linux uses per process VMIDs instead of per engine VMIDs
> for the simple reason that we don't have enough. So resetting one VMID
> only killed the submissions of one specific process.
> 
> Fortunately that turned out to be exactly what we want to have.
> 
> So clear the CP_VMID_RESET register between every context switch between
> applications when we do the pipeline sync to avoid trouble if multiple
> VMIDs are used on the ring right behind each other.
> 
> Use the same pipeline sync function in the reset handler and issue an IB
> test instead of a ring test after the queue reset, to provide a longer
> timeout and an additional fence value in case there is additional work
> on the ring after the aborted one.
> 
> Also drop the soft recovery since that pretty much does the same thing as
> CP_VMID_RESET, just on a lower level and with less chance of succeeding.
> 
> This now survives a stress test running overnight, sending a broken
> submission every 45 seconds and recovering fine from each of them.
> 
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 47 ++++++++++-----------------
>  2 files changed, 19 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index cc26cf1bd843..c39fe784419b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -278,6 +278,7 @@ extern int amdgpu_user_queue;
>  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
>  #define AMDGPU_MAX_USEC_TIMEOUT			100000	/* 100 ms */
>  #define AMDGPU_FENCE_JIFFIES_TIMEOUT		(HZ / 2)
> +#define AMDGPU_QUEUE_RESET_TIMEOUT		(HZ / 10)
>  #define AMDGPU_DEBUGFS_MAX_COMPONENTS		32
>  #define AMDGPUFB_CONN_LIMIT			4
>  #define AMDGPU_BIOS_NUM_SCRATCH			16
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index d377a7c57d5e..92d9a28c62d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -5565,7 +5565,17 @@ static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
>  	int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
>  	uint32_t seq = ring->fence_drv.sync_seq;
>  	uint64_t addr = ring->fence_drv.gpu_addr;
> +	struct amdgpu_device *adev = ring->adev;
>  
> +	amdgpu_ring_emit_reg_wait(ring,
> +				  SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> +				  0, 0xffff);
> +	amdgpu_ring_emit_wreg(ring,
> +			      SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> +			      0);
> +	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> +			       ring->fence_drv.sync_seq,
> +			       AMDGPU_FENCE_FLAG_EXEC);
>  	gfx_v9_0_wait_reg_mem(ring, usepfp, 1, 0,
>  			      lower_32_bits(addr), upper_32_bits(addr),
>  			      seq, 0xffffffff, 4);
> @@ -5896,20 +5906,6 @@ static void gfx_v9_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
>  							   ref, mask);
>  }
>  
> -static void gfx_v9_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
> -{
> -	struct amdgpu_device *adev = ring->adev;
> -	uint32_t value = 0;
> -
> -	value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
> -	value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
> -	value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
> -	value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
> -	amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
> -	WREG32_SOC15(GC, 0, mmSQ_CMD, value);
> -	amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
> -}
> -
>  static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>  						 enum amdgpu_interrupt_state state)
>  {
> @@ -7185,16 +7181,12 @@ static int gfx_v9_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
>  	if (r)
>  		return r;
>  
> -	if (amdgpu_ring_alloc(ring, 7 + 7 + 5))
> +	if (amdgpu_ring_alloc(ring, 7 + 7 + 5 + 7))

Hi Christian,

What is the meaning of all of the above additions (7 + 7 + 5 + 7)? I see
it in many different parts of the code. Is this some indication of
preambles?

Thanks

>  		return -ENOMEM;
> -	gfx_v9_0_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> -				 ring->fence_drv.sync_seq, AMDGPU_FENCE_FLAG_EXEC);
> -	gfx_v9_0_ring_emit_reg_wait(ring,
> -				    SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0, 0xffff);
> -	gfx_v9_0_ring_emit_wreg(ring,
> -				SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0);
> +	gfx_v9_0_ring_emit_pipeline_sync(ring);
> +	amdgpu_ring_commit(ring);
>  
> -	return amdgpu_ring_test_ring(ring);
> +	return gfx_v9_0_ring_test_ib(ring, AMDGPU_QUEUE_RESET_TIMEOUT);
>  }
>  
>  static int gfx_v9_0_reset_kcq(struct amdgpu_ring *ring,
> @@ -7437,7 +7429,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>  	.set_wptr = gfx_v9_0_ring_set_wptr_gfx,
>  	.emit_frame_size = /* totally 242 maximum if 16 IBs */
>  		5 +  /* COND_EXEC */
> -		7 +  /* PIPELINE_SYNC */
> +		7 + 7 + 5 + 7 +  /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>  		2 + /* VM_FLUSH */
> @@ -7475,7 +7467,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>  	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>  	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>  	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> -	.soft_recovery = gfx_v9_0_ring_soft_recovery,
>  	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
>  	.reset = gfx_v9_0_reset_kgq,
>  	.emit_cleaner_shader = gfx_v9_0_ring_emit_cleaner_shader,
> @@ -7494,7 +7485,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
>  	.set_wptr = amdgpu_sw_ring_set_wptr_gfx,
>  	.emit_frame_size = /* totally 242 maximum if 16 IBs */
>  		5 +  /* COND_EXEC */
> -		7 +  /* PIPELINE_SYNC */
> +		7 + 7 + 5 + 7 +  /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>  		2 + /* VM_FLUSH */
> @@ -7533,7 +7524,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
>  	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>  	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>  	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> -	.soft_recovery = gfx_v9_0_ring_soft_recovery,
>  	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
>  	.patch_cntl = gfx_v9_0_ring_patch_cntl,
>  	.patch_de = gfx_v9_0_ring_patch_de_meta,
> @@ -7555,7 +7545,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>  		20 + /* gfx_v9_0_ring_emit_gds_switch */
>  		7 + /* gfx_v9_0_ring_emit_hdp_flush */
>  		5 + /* hdp invalidate */
> -		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> +		7 + 7 + 5 + 7 +  /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>  		8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> @@ -7577,7 +7567,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>  	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>  	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>  	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> -	.soft_recovery = gfx_v9_0_ring_soft_recovery,
>  	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
>  	.emit_wave_limit = gfx_v9_0_emit_wave_limit,
>  	.reset = gfx_v9_0_reset_kcq,
> @@ -7598,7 +7587,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>  		20 + /* gfx_v9_0_ring_emit_gds_switch */
>  		7 + /* gfx_v9_0_ring_emit_hdp_flush */
>  		5 + /* hdp invalidate */
> -		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> +		7 + 7 + 5 + 7 +  /* PIPELINE_SYNC */
>  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>  		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
> -- 
> 2.34.1
> 

-- 
Rodrigo Siqueira


More information about the amd-gfx mailing list