[PATCH v2] drm/amdgpu: Fix use after free in trace_amdgpu_bo_move

Christian König christian.koenig at amd.com
Thu Mar 21 10:25:50 UTC 2024


Am 20.03.24 um 18:12 schrieb Tvrtko Ursulin:
> From: Tvrtko Ursulin <tursulin at ursulin.net>
>
> Pipelined object migration will free up the old bo->resource, meaning
> the tracepoint added in 94aeb4117343 ("drm/amdgpu: fix ftrace event
> amdgpu_bo_move always move on same heap") will trigger an use after free
> when it dereferences the cached old_mem.
>
> Fix it by caching the memory type locally, which is the only thing
> tracepoint wants to know about.
>
> While at it convert the whole function to use the cached memory types for
> consistency.
>
> v2:
>   * Fix compilation.

I do remember that we have fixed this before. Question is really why it 
shows up again.

Going to investigate.

Thanks,
Christian.

>
> Signed-off-by: Tvrtko Ursulin <tursulin at ursulin.net>
> Fixes: 94aeb4117343 ("drm/amdgpu: fix ftrace event amdgpu_bo_move always move on same heap")
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3171
> Cc: Beyond Wang <Wang.Beyond at amd.com>
> Cc: Christian König <christian.koenig at amd.com>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> ---
> Beware this is a speculative fix for now based only on source code
> analysis and backtraces from 3171. It is also a bit on the churny side so
> I am happy to minimize it. But most importantly, given how I don't have
> any experience in amdgpu, I am looking for domain experts to either
> confirm or disprove my analysis.
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 47 ++++++++++++-------------
>   1 file changed, 22 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 8722beba494e..81189aab5a04 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -452,10 +452,11 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>   	struct amdgpu_device *adev;
>   	struct amdgpu_bo *abo;
>   	struct ttm_resource *old_mem = bo->resource;
> +	uint32_t new_mem_type = new_mem->mem_type;
> +	uint32_t old_mem_type;
>   	int r;
>   
> -	if (new_mem->mem_type == TTM_PL_TT ||
> -	    new_mem->mem_type == AMDGPU_PL_PREEMPT) {
> +	if (new_mem_type == TTM_PL_TT || new_mem_type == AMDGPU_PL_PREEMPT) {
>   		r = amdgpu_ttm_backend_bind(bo->bdev, bo->ttm, new_mem);
>   		if (r)
>   			return r;
> @@ -464,20 +465,18 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>   	abo = ttm_to_amdgpu_bo(bo);
>   	adev = amdgpu_ttm_adev(bo->bdev);
>   
> -	if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM &&
> -			 bo->ttm == NULL)) {
> +	if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM && bo->ttm == NULL)) {
>   		ttm_bo_move_null(bo, new_mem);
>   		goto out;
>   	}
> -	if (old_mem->mem_type == TTM_PL_SYSTEM &&
> -	    (new_mem->mem_type == TTM_PL_TT ||
> -	     new_mem->mem_type == AMDGPU_PL_PREEMPT)) {
> +	old_mem_type = old_mem->mem_type;
> +	if (old_mem_type == TTM_PL_SYSTEM &&
> +	    (new_mem_type == TTM_PL_TT || new_mem_type == AMDGPU_PL_PREEMPT)) {
>   		ttm_bo_move_null(bo, new_mem);
>   		goto out;
>   	}
> -	if ((old_mem->mem_type == TTM_PL_TT ||
> -	     old_mem->mem_type == AMDGPU_PL_PREEMPT) &&
> -	    new_mem->mem_type == TTM_PL_SYSTEM) {
> +	if ((old_mem_type == TTM_PL_TT || old_mem_type == AMDGPU_PL_PREEMPT) &&
> +	    new_mem_type == TTM_PL_SYSTEM) {
>   		r = ttm_bo_wait_ctx(bo, ctx);
>   		if (r)
>   			return r;
> @@ -488,22 +487,22 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>   		goto out;
>   	}
>   
> -	if (old_mem->mem_type == AMDGPU_PL_GDS ||
> -	    old_mem->mem_type == AMDGPU_PL_GWS ||
> -	    old_mem->mem_type == AMDGPU_PL_OA ||
> -	    old_mem->mem_type == AMDGPU_PL_DOORBELL ||
> -	    new_mem->mem_type == AMDGPU_PL_GDS ||
> -	    new_mem->mem_type == AMDGPU_PL_GWS ||
> -	    new_mem->mem_type == AMDGPU_PL_OA ||
> -	    new_mem->mem_type == AMDGPU_PL_DOORBELL) {
> +	if (old_mem_type == AMDGPU_PL_GDS ||
> +	    old_mem_type == AMDGPU_PL_GWS ||
> +	    old_mem_type == AMDGPU_PL_OA ||
> +	    old_mem_type == AMDGPU_PL_DOORBELL ||
> +	    new_mem_type == AMDGPU_PL_GDS ||
> +	    new_mem_type == AMDGPU_PL_GWS ||
> +	    new_mem_type == AMDGPU_PL_OA ||
> +	    new_mem_type == AMDGPU_PL_DOORBELL) {
>   		/* Nothing to save here */
>   		ttm_bo_move_null(bo, new_mem);
>   		goto out;
>   	}
>   
>   	if (bo->type == ttm_bo_type_device &&
> -	    new_mem->mem_type == TTM_PL_VRAM &&
> -	    old_mem->mem_type != TTM_PL_VRAM) {
> +	    new_mem_type == TTM_PL_VRAM &&
> +	    old_mem_type != TTM_PL_VRAM) {
>   		/* amdgpu_bo_fault_reserve_notify will re-set this if the CPU
>   		 * accesses the BO after it's moved.
>   		 */
> @@ -511,10 +510,8 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>   	}
>   
>   	if (adev->mman.buffer_funcs_enabled) {
> -		if (((old_mem->mem_type == TTM_PL_SYSTEM &&
> -		      new_mem->mem_type == TTM_PL_VRAM) ||
> -		     (old_mem->mem_type == TTM_PL_VRAM &&
> -		      new_mem->mem_type == TTM_PL_SYSTEM))) {
> +		if (((old_mem_type == TTM_PL_SYSTEM && new_mem_type == TTM_PL_VRAM) ||
> +		     (old_mem_type == TTM_PL_VRAM && new_mem_type == TTM_PL_SYSTEM))) {
>   			hop->fpfn = 0;
>   			hop->lpfn = 0;
>   			hop->mem_type = TTM_PL_TT;
> @@ -540,7 +537,7 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>   			return r;
>   	}
>   
> -	trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);
> +	trace_amdgpu_bo_move(abo, new_mem_type, old_mem_type);
>   out:
>   	/* update statistics */
>   	atomic64_add(bo->base.size, &adev->num_bytes_moved);



More information about the amd-gfx mailing list