[PATCH v2] drm/amdgpu: Fix use after free in trace_amdgpu_bo_move
Christian König
christian.koenig at amd.com
Thu Mar 21 10:25:50 UTC 2024
Am 20.03.24 um 18:12 schrieb Tvrtko Ursulin:
> From: Tvrtko Ursulin <tursulin at ursulin.net>
>
> Pipelined object migration will free up the old bo->resource, meaning
> the tracepoint added in 94aeb4117343 ("drm/amdgpu: fix ftrace event
> amdgpu_bo_move always move on same heap") will trigger an use after free
> when it dereferences the cached old_mem.
>
> Fix it by caching the memory type locally, which is the only thing
> tracepoint wants to know about.
>
> While at it convert the whole function to use the cached memory types for
> consistency.
>
> v2:
> * Fix compilation.
I do remember that we have fixed this before. Question is really why it
shows up again.
Going to investigate.
Thanks,
Christian.
>
> Signed-off-by: Tvrtko Ursulin <tursulin at ursulin.net>
> Fixes: 94aeb4117343 ("drm/amdgpu: fix ftrace event amdgpu_bo_move always move on same heap")
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3171
> Cc: Beyond Wang <Wang.Beyond at amd.com>
> Cc: Christian König <christian.koenig at amd.com>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> ---
> Beware this is a speculative fix for now based only on source code
> analysis and backtraces from 3171. It is also a bit on the churny side so
> I am happy to minimize it. But most importantly, given how I don't have
> any experience in amdgpu, I am looking for domain experts to either
> confirm or disprove my analysis.
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 47 ++++++++++++-------------
> 1 file changed, 22 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 8722beba494e..81189aab5a04 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -452,10 +452,11 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
> struct amdgpu_device *adev;
> struct amdgpu_bo *abo;
> struct ttm_resource *old_mem = bo->resource;
> + uint32_t new_mem_type = new_mem->mem_type;
> + uint32_t old_mem_type;
> int r;
>
> - if (new_mem->mem_type == TTM_PL_TT ||
> - new_mem->mem_type == AMDGPU_PL_PREEMPT) {
> + if (new_mem_type == TTM_PL_TT || new_mem_type == AMDGPU_PL_PREEMPT) {
> r = amdgpu_ttm_backend_bind(bo->bdev, bo->ttm, new_mem);
> if (r)
> return r;
> @@ -464,20 +465,18 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
> abo = ttm_to_amdgpu_bo(bo);
> adev = amdgpu_ttm_adev(bo->bdev);
>
> - if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM &&
> - bo->ttm == NULL)) {
> + if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM && bo->ttm == NULL)) {
> ttm_bo_move_null(bo, new_mem);
> goto out;
> }
> - if (old_mem->mem_type == TTM_PL_SYSTEM &&
> - (new_mem->mem_type == TTM_PL_TT ||
> - new_mem->mem_type == AMDGPU_PL_PREEMPT)) {
> + old_mem_type = old_mem->mem_type;
> + if (old_mem_type == TTM_PL_SYSTEM &&
> + (new_mem_type == TTM_PL_TT || new_mem_type == AMDGPU_PL_PREEMPT)) {
> ttm_bo_move_null(bo, new_mem);
> goto out;
> }
> - if ((old_mem->mem_type == TTM_PL_TT ||
> - old_mem->mem_type == AMDGPU_PL_PREEMPT) &&
> - new_mem->mem_type == TTM_PL_SYSTEM) {
> + if ((old_mem_type == TTM_PL_TT || old_mem_type == AMDGPU_PL_PREEMPT) &&
> + new_mem_type == TTM_PL_SYSTEM) {
> r = ttm_bo_wait_ctx(bo, ctx);
> if (r)
> return r;
> @@ -488,22 +487,22 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
> goto out;
> }
>
> - if (old_mem->mem_type == AMDGPU_PL_GDS ||
> - old_mem->mem_type == AMDGPU_PL_GWS ||
> - old_mem->mem_type == AMDGPU_PL_OA ||
> - old_mem->mem_type == AMDGPU_PL_DOORBELL ||
> - new_mem->mem_type == AMDGPU_PL_GDS ||
> - new_mem->mem_type == AMDGPU_PL_GWS ||
> - new_mem->mem_type == AMDGPU_PL_OA ||
> - new_mem->mem_type == AMDGPU_PL_DOORBELL) {
> + if (old_mem_type == AMDGPU_PL_GDS ||
> + old_mem_type == AMDGPU_PL_GWS ||
> + old_mem_type == AMDGPU_PL_OA ||
> + old_mem_type == AMDGPU_PL_DOORBELL ||
> + new_mem_type == AMDGPU_PL_GDS ||
> + new_mem_type == AMDGPU_PL_GWS ||
> + new_mem_type == AMDGPU_PL_OA ||
> + new_mem_type == AMDGPU_PL_DOORBELL) {
> /* Nothing to save here */
> ttm_bo_move_null(bo, new_mem);
> goto out;
> }
>
> if (bo->type == ttm_bo_type_device &&
> - new_mem->mem_type == TTM_PL_VRAM &&
> - old_mem->mem_type != TTM_PL_VRAM) {
> + new_mem_type == TTM_PL_VRAM &&
> + old_mem_type != TTM_PL_VRAM) {
> /* amdgpu_bo_fault_reserve_notify will re-set this if the CPU
> * accesses the BO after it's moved.
> */
> @@ -511,10 +510,8 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
> }
>
> if (adev->mman.buffer_funcs_enabled) {
> - if (((old_mem->mem_type == TTM_PL_SYSTEM &&
> - new_mem->mem_type == TTM_PL_VRAM) ||
> - (old_mem->mem_type == TTM_PL_VRAM &&
> - new_mem->mem_type == TTM_PL_SYSTEM))) {
> + if (((old_mem_type == TTM_PL_SYSTEM && new_mem_type == TTM_PL_VRAM) ||
> + (old_mem_type == TTM_PL_VRAM && new_mem_type == TTM_PL_SYSTEM))) {
> hop->fpfn = 0;
> hop->lpfn = 0;
> hop->mem_type = TTM_PL_TT;
> @@ -540,7 +537,7 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
> return r;
> }
>
> - trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);
> + trace_amdgpu_bo_move(abo, new_mem_type, old_mem_type);
> out:
> /* update statistics */
> atomic64_add(bo->base.size, &adev->num_bytes_moved);
More information about the amd-gfx
mailing list