[PATCH 07/28] drm/amdgpu: track ring state associated with a job

Christian König christian.koenig at amd.com
Mon Jun 2 14:27:49 UTC 2025


On 5/29/25 22:07, Alex Deucher wrote:
> We need to know the wptr and sequence number associated
> with a job so that we can re-emit the unprocessed state
> after a ring reset.  Pre-allocate storage space for
> the ring buffer contents and add a helper to save off
> the unprocessed state so that it can be re-emitted
> after the queue is reset.
> 
> Add a helper that ring reset callbacks can use to verify
> that the ring has reset successfully and to reemit any
> unprocessed ring contents from subsequent jobs.
> 
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 12 ++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c    |  6 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  5 ++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h   |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c  | 46 +++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  8 ++++
>  6 files changed, 78 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 2f24a6aa13bf6..319548ac58820 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -764,6 +764,18 @@ void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
>  	amdgpu_fence_process(ring);
>  }
>  
> +/**
> + * amdgpu_fence_driver_seq_force_completion - force signal of specified sequence
> + *
> + * @ring: fence of the ring to signal
> + *
> + */
> +void amdgpu_fence_driver_seq_force_completion(struct amdgpu_ring *ring, u32 seq)
> +{
> +	amdgpu_fence_write(ring, seq);
> +	amdgpu_fence_process(ring);
> +}
> +
>  /*
>   * Common fence implementation
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 802743efa3b39..67df82d50a74a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -306,6 +306,12 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned int num_ibs,
>  
>  	amdgpu_ring_ib_end(ring);
>  	amdgpu_ring_commit(ring);
> +	/* This must be last for resets to work properly
> +	 * as we need to save the wptr associated with this
> +	 * job.
> +	 */
> +	if (job)
> +		job->ring_wptr = ring->wptr;

First of all such state should *absolutely* not be made part of the job. That belongs into the HW fence.

Then we need to handle the case that one application submitted multiple jobs which potentially depend on each other.

I think we should rather put this logic into amdgpu_device_enforce_isolation().

Regards,
Christian.


>  	return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index a0fab947143b5..f0f752284b925 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -91,6 +91,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>  	struct amdgpu_job *job = to_amdgpu_job(s_job);
>  	struct amdgpu_task_info *ti;
>  	struct amdgpu_device *adev = ring->adev;
> +	struct dma_fence *fence = &job->hw_fence;
>  	int idx;
>  	int r;
>  
> @@ -154,8 +155,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>  		else
>  			is_guilty = true;
>  
> -		if (is_guilty)
> +		if (is_guilty) {
> +			amdgpu_ring_backup_unprocessed_jobs(ring, job->ring_wptr, fence->seqno);
>  			dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
> +		}
>  
>  		r = amdgpu_ring_reset(ring, job->vmid);
>  		if (!r) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index f2c049129661f..c2ed0edb5179d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -79,6 +79,8 @@ struct amdgpu_job {
>  	/* enforce isolation */
>  	bool			enforce_isolation;
>  	bool			run_cleaner_shader;
> +	/* wptr for the job for resets */
> +	uint32_t		ring_wptr;
>  
>  	uint32_t		num_ibs;
>  	struct amdgpu_ib	ibs[];
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 426834806fbf2..909b121d432cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -333,6 +333,12 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
>  	/*  Initialize cached_rptr to 0 */
>  	ring->cached_rptr = 0;
>  
> +	if (!ring->ring_backup) {
> +		ring->ring_backup = kvzalloc(ring->ring_size, GFP_KERNEL);
> +		if (!ring->ring_backup)
> +			return -ENOMEM;
> +	}
> +
>  	/* Allocate ring buffer */
>  	if (ring->ring_obj == NULL) {
>  		r = amdgpu_bo_create_kernel(adev, ring->ring_size + ring->funcs->extra_dw, PAGE_SIZE,
> @@ -342,6 +348,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
>  					    (void **)&ring->ring);
>  		if (r) {
>  			dev_err(adev->dev, "(%d) ring create failed\n", r);
> +			kvfree(ring->ring_backup);
>  			return r;
>  		}
>  		amdgpu_ring_clear_ring(ring);
> @@ -385,6 +392,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring)
>  	amdgpu_bo_free_kernel(&ring->ring_obj,
>  			      &ring->gpu_addr,
>  			      (void **)&ring->ring);
> +	kvfree(ring->ring_backup);
> +	ring->ring_backup = NULL;
>  
>  	dma_fence_put(ring->vmid_wait);
>  	ring->vmid_wait = NULL;
> @@ -753,3 +762,40 @@ bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring)
>  
>  	return true;
>  }
> +
> +void amdgpu_ring_backup_unprocessed_jobs(struct amdgpu_ring *ring,
> +					 u64 bad_wptr, u32 bad_seq)
> +{
> +	unsigned int entries_to_copy = ring->wptr - bad_wptr;
> +	unsigned int idx, i;
> +
> +	for (i = 0; i < entries_to_copy; i++) {
> +		idx = (bad_wptr + i) & ring->buf_mask;
> +		ring->ring_backup[i] = ring->ring[idx];
> +	}
> +	ring->ring_backup_entries_to_copy = entries_to_copy;
> +	ring->ring_backup_seq = bad_seq;
> +}
> +
> +int amdgpu_ring_reemit_unprocessed_jobs(struct amdgpu_ring *ring)
> +{
> +	unsigned int i;
> +	int r;
> +
> +	/* signal the fence of the bad job */
> +	amdgpu_fence_driver_seq_force_completion(ring, ring->ring_backup_seq);
> +	/* verify that the ring is functional */
> +	r = amdgpu_ring_test_ring(ring);
> +	if (r)
> +		return r;
> +	/* re-emit the unprocessed ring contents */
> +	if (ring->ring_backup_entries_to_copy) {
> +		if (amdgpu_ring_alloc(ring, ring->ring_backup_entries_to_copy))
> +			return -ENOMEM;
> +		for (i = 0; i < ring->ring_backup_entries_to_copy; i++)
> +			amdgpu_ring_write(ring, ring->ring_backup[i]);
> +		amdgpu_ring_commit(ring);
> +	}
> +
> +	return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index b95b471107692..fd08449eee33f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -132,6 +132,8 @@ extern const struct drm_sched_backend_ops amdgpu_sched_ops;
>  void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);
>  void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error);
>  void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
> +void amdgpu_fence_driver_seq_force_completion(struct amdgpu_ring *ring,
> +					      u32 seq);
>  
>  int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring);
>  int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
> @@ -268,6 +270,9 @@ struct amdgpu_ring {
>  
>  	struct amdgpu_bo	*ring_obj;
>  	uint32_t		*ring;
> +	uint32_t		*ring_backup;
> +	uint32_t		ring_backup_seq;
> +	unsigned int		ring_backup_entries_to_copy;
>  	unsigned		rptr_offs;
>  	u64			rptr_gpu_addr;
>  	volatile u32		*rptr_cpu_addr;
> @@ -534,4 +539,7 @@ int amdgpu_ib_pool_init(struct amdgpu_device *adev);
>  void amdgpu_ib_pool_fini(struct amdgpu_device *adev);
>  int amdgpu_ib_ring_tests(struct amdgpu_device *adev);
>  bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring);
> +void amdgpu_ring_backup_unprocessed_jobs(struct amdgpu_ring *ring,
> +					 u64 bad_wptr, u32 bad_seq);
> +int amdgpu_ring_reemit_unprocessed_jobs(struct amdgpu_ring *ring);
>  #endif



More information about the amd-gfx mailing list