[PATCH] drm/amdgpu: once more fix the call oder in amdgpu_ttm_move()

Tvrtko Ursulin tursulin at ursulin.net
Thu Mar 21 14:12:56 UTC 2024


On 21/03/2024 12:43, Christian König wrote:
> This reverts drm/amdgpu: fix ftrace event amdgpu_bo_move always move
> on same heap. The basic problem here is that after the move the old
> location is simply not available any more.
> 
> Some fixes where suggested, but essentially we should call the move
> notification before actually moving things because only this way we have
> the correct order for DMA-buf and VM move notifications as well.
> 
> Also rework the statistic handling so that we don't update the eviction
> counter before the move.
> 
> Signed-off-by: Christian König <christian.koenig at amd.com>

Don't forget:

Fixes: 94aeb4117343 ("drm/amdgpu: fix ftrace event amdgpu_bo_move always 
move on same heap")
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3171

;)

Regards,

Tvrtko

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 15 +++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  4 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 48 ++++++++++++----------
>   3 files changed, 37 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 425cebcc5cbf..eb7d824763b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -1245,19 +1245,20 @@ int amdgpu_bo_get_metadata(struct amdgpu_bo *bo, void *buffer,
>    * amdgpu_bo_move_notify - notification about a memory move
>    * @bo: pointer to a buffer object
>    * @evict: if this move is evicting the buffer from the graphics address space
> + * @new_mem: new resource for backing the BO
>    *
>    * Marks the corresponding &amdgpu_bo buffer object as invalid, also performs
>    * bookkeeping.
>    * TTM driver callback which is called when ttm moves a buffer.
>    */
> -void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict)
> +void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
> +			   bool evict,
> +			   struct ttm_resource *new_mem)
>   {
>   	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
> +	struct ttm_resource *old_mem = bo->resource;
>   	struct amdgpu_bo *abo;
>   
> -	if (!amdgpu_bo_is_amdgpu_bo(bo))
> -		return;
> -
>   	abo = ttm_to_amdgpu_bo(bo);
>   	amdgpu_vm_bo_invalidate(adev, abo, evict);
>   
> @@ -1267,9 +1268,9 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict)
>   	    bo->resource->mem_type != TTM_PL_SYSTEM)
>   		dma_buf_move_notify(abo->tbo.base.dma_buf);
>   
> -	/* remember the eviction */
> -	if (evict)
> -		atomic64_inc(&adev->num_evictions);
> +	/* move_notify is called before move happens */
> +	trace_amdgpu_bo_move(abo, new_mem ? new_mem->mem_type : -1,
> +			     old_mem ? old_mem->mem_type : -1);
>   }
>   
>   void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index a3ea8a82db23..d28e21baef16 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -344,7 +344,9 @@ int amdgpu_bo_set_metadata (struct amdgpu_bo *bo, void *metadata,
>   int amdgpu_bo_get_metadata(struct amdgpu_bo *bo, void *buffer,
>   			   size_t buffer_size, uint32_t *metadata_size,
>   			   uint64_t *flags);
> -void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict);
> +void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
> +			   bool evict,
> +			   struct ttm_resource *new_mem);
>   void amdgpu_bo_release_notify(struct ttm_buffer_object *bo);
>   vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo);
>   void amdgpu_bo_fence(struct amdgpu_bo *bo, struct dma_fence *fence,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index a5ceec7820cf..460b23918bfc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -471,14 +471,16 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>   
>   	if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM &&
>   			 bo->ttm == NULL)) {
> +		amdgpu_bo_move_notify(bo, evict, new_mem);
>   		ttm_bo_move_null(bo, new_mem);
> -		goto out;
> +		return 0;
>   	}
>   	if (old_mem->mem_type == TTM_PL_SYSTEM &&
>   	    (new_mem->mem_type == TTM_PL_TT ||
>   	     new_mem->mem_type == AMDGPU_PL_PREEMPT)) {
> +		amdgpu_bo_move_notify(bo, evict, new_mem);
>   		ttm_bo_move_null(bo, new_mem);
> -		goto out;
> +		return 0;
>   	}
>   	if ((old_mem->mem_type == TTM_PL_TT ||
>   	     old_mem->mem_type == AMDGPU_PL_PREEMPT) &&
> @@ -488,9 +490,10 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>   			return r;
>   
>   		amdgpu_ttm_backend_unbind(bo->bdev, bo->ttm);
> +		amdgpu_bo_move_notify(bo, evict, new_mem);
>   		ttm_resource_free(bo, &bo->resource);
>   		ttm_bo_assign_mem(bo, new_mem);
> -		goto out;
> +		return 0;
>   	}
>   
>   	if (old_mem->mem_type == AMDGPU_PL_GDS ||
> @@ -502,8 +505,9 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>   	    new_mem->mem_type == AMDGPU_PL_OA ||
>   	    new_mem->mem_type == AMDGPU_PL_DOORBELL) {
>   		/* Nothing to save here */
> +		amdgpu_bo_move_notify(bo, evict, new_mem);
>   		ttm_bo_move_null(bo, new_mem);
> -		goto out;
> +		return 0;
>   	}
>   
>   	if (bo->type == ttm_bo_type_device &&
> @@ -515,22 +519,23 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>   		abo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
>   	}
>   
> -	if (adev->mman.buffer_funcs_enabled) {
> -		if (((old_mem->mem_type == TTM_PL_SYSTEM &&
> -		      new_mem->mem_type == TTM_PL_VRAM) ||
> -		     (old_mem->mem_type == TTM_PL_VRAM &&
> -		      new_mem->mem_type == TTM_PL_SYSTEM))) {
> -			hop->fpfn = 0;
> -			hop->lpfn = 0;
> -			hop->mem_type = TTM_PL_TT;
> -			hop->flags = TTM_PL_FLAG_TEMPORARY;
> -			return -EMULTIHOP;
> -		}
> +	if (adev->mman.buffer_funcs_enabled &&
> +	    ((old_mem->mem_type == TTM_PL_SYSTEM &&
> +	      new_mem->mem_type == TTM_PL_VRAM) ||
> +	     (old_mem->mem_type == TTM_PL_VRAM &&
> +	      new_mem->mem_type == TTM_PL_SYSTEM))) {
> +		hop->fpfn = 0;
> +		hop->lpfn = 0;
> +		hop->mem_type = TTM_PL_TT;
> +		hop->flags = TTM_PL_FLAG_TEMPORARY;
> +		return -EMULTIHOP;
> +	}
>   
> +	amdgpu_bo_move_notify(bo, evict, new_mem);
> +	if (adev->mman.buffer_funcs_enabled)
>   		r = amdgpu_move_blit(bo, evict, new_mem, old_mem);
> -	} else {
> +	else
>   		r = -ENODEV;
> -	}
>   
>   	if (r) {
>   		/* Check that all memory is CPU accessible */
> @@ -545,11 +550,10 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>   			return r;
>   	}
>   
> -	trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);
> -out:
> -	/* update statistics */
> +	/* update statistics after the move */
> +	if (evict)
> +		atomic64_inc(&adev->num_evictions);
>   	atomic64_add(bo->base.size, &adev->num_bytes_moved);
> -	amdgpu_bo_move_notify(bo, evict);
>   	return 0;
>   }
>   
> @@ -1551,7 +1555,7 @@ static int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo,
>   static void
>   amdgpu_bo_delete_mem_notify(struct ttm_buffer_object *bo)
>   {
> -	amdgpu_bo_move_notify(bo, false);
> +	amdgpu_bo_move_notify(bo, false, NULL);
>   }
>   
>   static struct ttm_device_funcs amdgpu_bo_driver = {


More information about the amd-gfx mailing list