[PATCH v2] drm/amdgpu: Fix use after free in trace_amdgpu_bo_move
Tvrtko Ursulin
tursulin at ursulin.net
Thu Mar 21 11:18:02 UTC 2024
Hi Christian,
On 21/03/2024 10:25, Christian König wrote:
> Am 20.03.24 um 18:12 schrieb Tvrtko Ursulin:
>> From: Tvrtko Ursulin <tursulin at ursulin.net>
>>
>> Pipelined object migration will free up the old bo->resource, meaning
>> the tracepoint added in 94aeb4117343 ("drm/amdgpu: fix ftrace event
>> amdgpu_bo_move always move on same heap") will trigger an use after free
>> when it dereferences the cached old_mem.
>>
>> Fix it by caching the memory type locally, which is the only thing
>> tracepoint wants to know about.
>>
>> While at it convert the whole function to use the cached memory types for
>> consistency.
>>
>> v2:
>> * Fix compilation.
>
> I do remember that we have fixed this before. Question is really why it
> shows up again.
>
> Going to investigate.
If it helps, and if I did not mistrace something, this is the call trace
I saw with use after free:
ttm_mem_evict_first
ttm_bo_handle_move_mem
amdgpu_move (via bdev->funcs->move)
amdgpu_move_blit
ttm_bo_move_accel_cleanup
ttm_bo_move_pipeline_evict
ttm_resource_free(bo, &bo->resource);
... unwind back to amdgpu_bo_move:
trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);
And this old_mem is now uaf since old_mem is &bo->resource from the top
of amdgpu_move, before if was freed in ttm_bo_move_pipeline_evict.
Regards,
Tvrtko
>> Signed-off-by: Tvrtko Ursulin <tursulin at ursulin.net>
>> Fixes: 94aeb4117343 ("drm/amdgpu: fix ftrace event amdgpu_bo_move
>> always move on same heap")
>> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3171
>> Cc: Beyond Wang <Wang.Beyond at amd.com>
>> Cc: Christian König <christian.koenig at amd.com>
>> Cc: Alex Deucher <alexander.deucher at amd.com>
>> ---
>> Beware this is a speculative fix for now based only on source code
>> analysis and backtraces from 3171. It is also a bit on the churny side so
>> I am happy to minimize it. But most importantly, given how I don't have
>> any experience in amdgpu, I am looking for domain experts to either
>> confirm or disprove my analysis.
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 47 ++++++++++++-------------
>> 1 file changed, 22 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index 8722beba494e..81189aab5a04 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -452,10 +452,11 @@ static int amdgpu_bo_move(struct
>> ttm_buffer_object *bo, bool evict,
>> struct amdgpu_device *adev;
>> struct amdgpu_bo *abo;
>> struct ttm_resource *old_mem = bo->resource;
>> + uint32_t new_mem_type = new_mem->mem_type;
>> + uint32_t old_mem_type;
>> int r;
>> - if (new_mem->mem_type == TTM_PL_TT ||
>> - new_mem->mem_type == AMDGPU_PL_PREEMPT) {
>> + if (new_mem_type == TTM_PL_TT || new_mem_type ==
>> AMDGPU_PL_PREEMPT) {
>> r = amdgpu_ttm_backend_bind(bo->bdev, bo->ttm, new_mem);
>> if (r)
>> return r;
>> @@ -464,20 +465,18 @@ static int amdgpu_bo_move(struct
>> ttm_buffer_object *bo, bool evict,
>> abo = ttm_to_amdgpu_bo(bo);
>> adev = amdgpu_ttm_adev(bo->bdev);
>> - if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM &&
>> - bo->ttm == NULL)) {
>> + if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM && bo->ttm ==
>> NULL)) {
>> ttm_bo_move_null(bo, new_mem);
>> goto out;
>> }
>> - if (old_mem->mem_type == TTM_PL_SYSTEM &&
>> - (new_mem->mem_type == TTM_PL_TT ||
>> - new_mem->mem_type == AMDGPU_PL_PREEMPT)) {
>> + old_mem_type = old_mem->mem_type;
>> + if (old_mem_type == TTM_PL_SYSTEM &&
>> + (new_mem_type == TTM_PL_TT || new_mem_type ==
>> AMDGPU_PL_PREEMPT)) {
>> ttm_bo_move_null(bo, new_mem);
>> goto out;
>> }
>> - if ((old_mem->mem_type == TTM_PL_TT ||
>> - old_mem->mem_type == AMDGPU_PL_PREEMPT) &&
>> - new_mem->mem_type == TTM_PL_SYSTEM) {
>> + if ((old_mem_type == TTM_PL_TT || old_mem_type ==
>> AMDGPU_PL_PREEMPT) &&
>> + new_mem_type == TTM_PL_SYSTEM) {
>> r = ttm_bo_wait_ctx(bo, ctx);
>> if (r)
>> return r;
>> @@ -488,22 +487,22 @@ static int amdgpu_bo_move(struct
>> ttm_buffer_object *bo, bool evict,
>> goto out;
>> }
>> - if (old_mem->mem_type == AMDGPU_PL_GDS ||
>> - old_mem->mem_type == AMDGPU_PL_GWS ||
>> - old_mem->mem_type == AMDGPU_PL_OA ||
>> - old_mem->mem_type == AMDGPU_PL_DOORBELL ||
>> - new_mem->mem_type == AMDGPU_PL_GDS ||
>> - new_mem->mem_type == AMDGPU_PL_GWS ||
>> - new_mem->mem_type == AMDGPU_PL_OA ||
>> - new_mem->mem_type == AMDGPU_PL_DOORBELL) {
>> + if (old_mem_type == AMDGPU_PL_GDS ||
>> + old_mem_type == AMDGPU_PL_GWS ||
>> + old_mem_type == AMDGPU_PL_OA ||
>> + old_mem_type == AMDGPU_PL_DOORBELL ||
>> + new_mem_type == AMDGPU_PL_GDS ||
>> + new_mem_type == AMDGPU_PL_GWS ||
>> + new_mem_type == AMDGPU_PL_OA ||
>> + new_mem_type == AMDGPU_PL_DOORBELL) {
>> /* Nothing to save here */
>> ttm_bo_move_null(bo, new_mem);
>> goto out;
>> }
>> if (bo->type == ttm_bo_type_device &&
>> - new_mem->mem_type == TTM_PL_VRAM &&
>> - old_mem->mem_type != TTM_PL_VRAM) {
>> + new_mem_type == TTM_PL_VRAM &&
>> + old_mem_type != TTM_PL_VRAM) {
>> /* amdgpu_bo_fault_reserve_notify will re-set this if the CPU
>> * accesses the BO after it's moved.
>> */
>> @@ -511,10 +510,8 @@ static int amdgpu_bo_move(struct
>> ttm_buffer_object *bo, bool evict,
>> }
>> if (adev->mman.buffer_funcs_enabled) {
>> - if (((old_mem->mem_type == TTM_PL_SYSTEM &&
>> - new_mem->mem_type == TTM_PL_VRAM) ||
>> - (old_mem->mem_type == TTM_PL_VRAM &&
>> - new_mem->mem_type == TTM_PL_SYSTEM))) {
>> + if (((old_mem_type == TTM_PL_SYSTEM && new_mem_type ==
>> TTM_PL_VRAM) ||
>> + (old_mem_type == TTM_PL_VRAM && new_mem_type ==
>> TTM_PL_SYSTEM))) {
>> hop->fpfn = 0;
>> hop->lpfn = 0;
>> hop->mem_type = TTM_PL_TT;
>> @@ -540,7 +537,7 @@ static int amdgpu_bo_move(struct ttm_buffer_object
>> *bo, bool evict,
>> return r;
>> }
>> - trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);
>> + trace_amdgpu_bo_move(abo, new_mem_type, old_mem_type);
>> out:
>> /* update statistics */
>> atomic64_add(bo->base.size, &adev->num_bytes_moved);
>
More information about the amd-gfx
mailing list