[PATCH 2/2] drm/amdgpu: properly release the eviction fences

Christian König christian.koenig at amd.com
Wed Apr 23 13:58:24 UTC 2025


On 4/23/25 15:26, Prike Liang wrote:
> The following cases require releasing and deferring the
> eviction fences properly
> 
> 1) Detach the old eviction fences before attaching a new one.
> 2) Drop the eviction fence init reference.
> 3) Correct the attached eviction fence reference.
> 4) Free the eviction fence when the attached BOs are released.
> 
> Signed-off-by: Prike Liang <Prike.Liang at amd.com>
> ---
>  .../drm/amd/amdgpu/amdgpu_eviction_fence.c    | 53 +++++++++++++++----
>  .../drm/amd/amdgpu/amdgpu_eviction_fence.h    |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c    |  1 +
>  3 files changed, 45 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> index d86e611a9ff4..a47db865c530 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> @@ -70,11 +70,6 @@ amdgpu_eviction_fence_replace_fence(struct amdgpu_eviction_fence_mgr *evf_mgr,
>  		return -ENOMEM;
>  	}
>  
> -	/* Update the eviction fence now */
> -	spin_lock(&evf_mgr->ev_fence_lock);
> -	old_ef = evf_mgr->ev_fence;
> -	evf_mgr->ev_fence = new_ef;
> -	spin_unlock(&evf_mgr->ev_fence_lock);


That is certainly incorrect. The new fence *must* be made public before attaching it to the existing BOs.

>  
>  	/* Attach the new fence */
>  	drm_exec_for_each_locked_object(exec, index, obj) {
> @@ -82,6 +77,16 @@ amdgpu_eviction_fence_replace_fence(struct amdgpu_eviction_fence_mgr *evf_mgr,
>  
>  		if (!bo)
>  			continue;
> +
> +		/*detach the old eviction fence first*/
> +		amdgpu_eviction_fence_detach(evf_mgr, bo);

That's completely nonsense. The old eviction fence is signaled at this state and doesn't need to be detached.

> +
> +		/* Update the eviction fence now */
> +		spin_lock(&evf_mgr->ev_fence_lock);
> +		old_ef = evf_mgr->ev_fence;
> +		evf_mgr->ev_fence = new_ef;
> +		spin_unlock(&evf_mgr->ev_fence_lock);
> +
>  		ret = amdgpu_eviction_fence_attach(evf_mgr, bo);
>  		if (ret) {
>  			DRM_ERROR("Failed to attch new eviction fence\n");
> @@ -89,9 +94,10 @@ amdgpu_eviction_fence_replace_fence(struct amdgpu_eviction_fence_mgr *evf_mgr,
>  		}
>  	}
>  
> -	/* Free old fence */
> +	/* Free the init eviction fence which was referred by the dma_init*/
>  	if (old_ef)
>  		dma_fence_put(&old_ef->base);

The comment isn't really helpful. Additional to that it's valid to call dma_fence_put() with a NULL fence to we can drop the extra if check.

> +
>  	return 0;
>  
>  free_err:
> @@ -189,7 +195,6 @@ void amdgpu_eviction_fence_destroy(struct amdgpu_eviction_fence_mgr *evf_mgr)
>  int amdgpu_eviction_fence_attach(struct amdgpu_eviction_fence_mgr *evf_mgr,
>  				 struct amdgpu_bo *bo)
>  {
> -	struct dma_fence *ef;
>  	struct amdgpu_eviction_fence *ev_fence;
>  	struct dma_resv *resv = bo->tbo.base.resv;
>  	int ret;
> @@ -205,10 +210,12 @@ int amdgpu_eviction_fence_attach(struct amdgpu_eviction_fence_mgr *evf_mgr,
>  
>  	spin_lock(&evf_mgr->ev_fence_lock);
>  	ev_fence = evf_mgr->ev_fence;
> -	if (ev_fence) {
> -		ef = dma_fence_get(&ev_fence->base);
> -		dma_resv_add_fence(resv, ef, DMA_RESV_USAGE_BOOKKEEP);
> -	}
> +	/*
> +	 * The dma_resv_add_fence() already refer to the added fence, hence in this
> +	 * placement needn't refer to the armed fence anymore.
> +	 */

Please drop that comment.

> +	if (ev_fence)
> +		dma_resv_add_fence(resv, &ev_fence->base, DMA_RESV_USAGE_BOOKKEEP);
>  	spin_unlock(&evf_mgr->ev_fence_lock);
>  
>  	return 0;
> @@ -224,6 +231,30 @@ void amdgpu_eviction_fence_detach(struct amdgpu_eviction_fence_mgr *evf_mgr,
>  	dma_fence_put(stub);
>  }
>  
> +void amdgpu_remove_all_eviction_fences(struct amdgpu_bo *bo)
> +{
> +	struct dma_resv *resv = &bo->tbo.base._resv;
> +	struct dma_fence *fence, *stub;
> +	struct dma_resv_iter cursor;
> +
> +	dma_resv_assert_held(resv);
> +
> +	stub = dma_fence_get_stub();
> +	dma_resv_for_each_fence(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP, fence) {
> +		struct amdgpu_eviction_fence *ev_fence;
> +
> +		ev_fence = fence_to_evf(fence);
> +		if (!ev_fence || !dma_fence_is_signaled(&ev_fence->base))
> +			continue;
> +
> +		dma_resv_replace_fences(resv, fence->context, stub,
> +				DMA_RESV_USAGE_BOOKKEEP);
> +
> +	}
> +
> +	dma_fence_put(stub);
> +}
> +
>  int amdgpu_eviction_fence_init(struct amdgpu_eviction_fence_mgr *evf_mgr)
>  {
>  	/* This needs to be done one time per open */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.h
> index fcd867b7147d..7e6c55a334e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.h
> @@ -42,6 +42,7 @@ struct amdgpu_eviction_fence_mgr {
>  };
>  
>  /* Eviction fence helper functions */
> +#define fence_to_evf(f)  container_of(f, struct amdgpu_eviction_fence, base)
>  struct amdgpu_eviction_fence *
>  amdgpu_eviction_fence_create(struct amdgpu_eviction_fence_mgr *evf_mgr);
>  
> @@ -66,4 +67,5 @@ amdgpu_eviction_fence_signal(struct amdgpu_eviction_fence_mgr *evf_mgr,
>  int
>  amdgpu_eviction_fence_replace_fence(struct amdgpu_eviction_fence_mgr *evf_mgr,
>  				    struct drm_exec *exec);
> +void amdgpu_remove_all_eviction_fences(struct amdgpu_bo *bo);
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index dbe57996a481..6d3eacffbcb9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -1322,6 +1322,7 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
>  	amdgpu_vram_mgr_set_cleared(bo->resource);
>  	dma_resv_add_fence(&bo->base._resv, fence, DMA_RESV_USAGE_KERNEL);
>  	dma_fence_put(fence);
> +	amdgpu_remove_all_eviction_fences(abo);

That's a really bad idea. We only did that for the KFD eviction fence because we didn't had any other choice.

For the user queue eviction fence we should rather do that in amdgpu_gem_object_close() e.g. when amdgpu_vm_bo_del() is called.

Regards,
Christian.

>  
>  out:
>  	dma_resv_unlock(&bo->base._resv);



More information about the amd-gfx mailing list