[PATCH v2] drm/amdgpu: Fix use after free in trace_amdgpu_bo_move

Tvrtko Ursulin tursulin at ursulin.net
Thu Mar 21 11:18:02 UTC 2024


Hi Christian,

On 21/03/2024 10:25, Christian König wrote:
> Am 20.03.24 um 18:12 schrieb Tvrtko Ursulin:
>> From: Tvrtko Ursulin <tursulin at ursulin.net>
>>
>> Pipelined object migration will free up the old bo->resource, meaning
>> the tracepoint added in 94aeb4117343 ("drm/amdgpu: fix ftrace event
>> amdgpu_bo_move always move on same heap") will trigger an use after free
>> when it dereferences the cached old_mem.
>>
>> Fix it by caching the memory type locally, which is the only thing
>> tracepoint wants to know about.
>>
>> While at it convert the whole function to use the cached memory types for
>> consistency.
>>
>> v2:
>>   * Fix compilation.
> 
> I do remember that we have fixed this before. Question is really why it 
> shows up again.
> 
> Going to investigate.

If it helps, and if I did not mistrace something, this is the call trace 
I saw with use after free:

ttm_mem_evict_first
   ttm_bo_handle_move_mem
     amdgpu_move (via bdev->funcs->move)
       amdgpu_move_blit
   	ttm_bo_move_accel_cleanup
	  ttm_bo_move_pipeline_evict
  	    ttm_resource_free(bo, &bo->resource);
... unwind back to amdgpu_bo_move:
       trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);

And this old_mem is now uaf since old_mem is &bo->resource from the top 
of amdgpu_move, before if was freed in ttm_bo_move_pipeline_evict.

Regards,

Tvrtko

>> Signed-off-by: Tvrtko Ursulin <tursulin at ursulin.net>
>> Fixes: 94aeb4117343 ("drm/amdgpu: fix ftrace event amdgpu_bo_move 
>> always move on same heap")
>> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3171
>> Cc: Beyond Wang <Wang.Beyond at amd.com>
>> Cc: Christian König <christian.koenig at amd.com>
>> Cc: Alex Deucher <alexander.deucher at amd.com>
>> ---
>> Beware this is a speculative fix for now based only on source code
>> analysis and backtraces from 3171. It is also a bit on the churny side so
>> I am happy to minimize it. But most importantly, given how I don't have
>> any experience in amdgpu, I am looking for domain experts to either
>> confirm or disprove my analysis.
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 47 ++++++++++++-------------
>>   1 file changed, 22 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index 8722beba494e..81189aab5a04 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -452,10 +452,11 @@ static int amdgpu_bo_move(struct 
>> ttm_buffer_object *bo, bool evict,
>>       struct amdgpu_device *adev;
>>       struct amdgpu_bo *abo;
>>       struct ttm_resource *old_mem = bo->resource;
>> +    uint32_t new_mem_type = new_mem->mem_type;
>> +    uint32_t old_mem_type;
>>       int r;
>> -    if (new_mem->mem_type == TTM_PL_TT ||
>> -        new_mem->mem_type == AMDGPU_PL_PREEMPT) {
>> +    if (new_mem_type == TTM_PL_TT || new_mem_type == 
>> AMDGPU_PL_PREEMPT) {
>>           r = amdgpu_ttm_backend_bind(bo->bdev, bo->ttm, new_mem);
>>           if (r)
>>               return r;
>> @@ -464,20 +465,18 @@ static int amdgpu_bo_move(struct 
>> ttm_buffer_object *bo, bool evict,
>>       abo = ttm_to_amdgpu_bo(bo);
>>       adev = amdgpu_ttm_adev(bo->bdev);
>> -    if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM &&
>> -             bo->ttm == NULL)) {
>> +    if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM && bo->ttm == 
>> NULL)) {
>>           ttm_bo_move_null(bo, new_mem);
>>           goto out;
>>       }
>> -    if (old_mem->mem_type == TTM_PL_SYSTEM &&
>> -        (new_mem->mem_type == TTM_PL_TT ||
>> -         new_mem->mem_type == AMDGPU_PL_PREEMPT)) {
>> +    old_mem_type = old_mem->mem_type;
>> +    if (old_mem_type == TTM_PL_SYSTEM &&
>> +        (new_mem_type == TTM_PL_TT || new_mem_type == 
>> AMDGPU_PL_PREEMPT)) {
>>           ttm_bo_move_null(bo, new_mem);
>>           goto out;
>>       }
>> -    if ((old_mem->mem_type == TTM_PL_TT ||
>> -         old_mem->mem_type == AMDGPU_PL_PREEMPT) &&
>> -        new_mem->mem_type == TTM_PL_SYSTEM) {
>> +    if ((old_mem_type == TTM_PL_TT || old_mem_type == 
>> AMDGPU_PL_PREEMPT) &&
>> +        new_mem_type == TTM_PL_SYSTEM) {
>>           r = ttm_bo_wait_ctx(bo, ctx);
>>           if (r)
>>               return r;
>> @@ -488,22 +487,22 @@ static int amdgpu_bo_move(struct 
>> ttm_buffer_object *bo, bool evict,
>>           goto out;
>>       }
>> -    if (old_mem->mem_type == AMDGPU_PL_GDS ||
>> -        old_mem->mem_type == AMDGPU_PL_GWS ||
>> -        old_mem->mem_type == AMDGPU_PL_OA ||
>> -        old_mem->mem_type == AMDGPU_PL_DOORBELL ||
>> -        new_mem->mem_type == AMDGPU_PL_GDS ||
>> -        new_mem->mem_type == AMDGPU_PL_GWS ||
>> -        new_mem->mem_type == AMDGPU_PL_OA ||
>> -        new_mem->mem_type == AMDGPU_PL_DOORBELL) {
>> +    if (old_mem_type == AMDGPU_PL_GDS ||
>> +        old_mem_type == AMDGPU_PL_GWS ||
>> +        old_mem_type == AMDGPU_PL_OA ||
>> +        old_mem_type == AMDGPU_PL_DOORBELL ||
>> +        new_mem_type == AMDGPU_PL_GDS ||
>> +        new_mem_type == AMDGPU_PL_GWS ||
>> +        new_mem_type == AMDGPU_PL_OA ||
>> +        new_mem_type == AMDGPU_PL_DOORBELL) {
>>           /* Nothing to save here */
>>           ttm_bo_move_null(bo, new_mem);
>>           goto out;
>>       }
>>       if (bo->type == ttm_bo_type_device &&
>> -        new_mem->mem_type == TTM_PL_VRAM &&
>> -        old_mem->mem_type != TTM_PL_VRAM) {
>> +        new_mem_type == TTM_PL_VRAM &&
>> +        old_mem_type != TTM_PL_VRAM) {
>>           /* amdgpu_bo_fault_reserve_notify will re-set this if the CPU
>>            * accesses the BO after it's moved.
>>            */
>> @@ -511,10 +510,8 @@ static int amdgpu_bo_move(struct 
>> ttm_buffer_object *bo, bool evict,
>>       }
>>       if (adev->mman.buffer_funcs_enabled) {
>> -        if (((old_mem->mem_type == TTM_PL_SYSTEM &&
>> -              new_mem->mem_type == TTM_PL_VRAM) ||
>> -             (old_mem->mem_type == TTM_PL_VRAM &&
>> -              new_mem->mem_type == TTM_PL_SYSTEM))) {
>> +        if (((old_mem_type == TTM_PL_SYSTEM && new_mem_type == 
>> TTM_PL_VRAM) ||
>> +             (old_mem_type == TTM_PL_VRAM && new_mem_type == 
>> TTM_PL_SYSTEM))) {
>>               hop->fpfn = 0;
>>               hop->lpfn = 0;
>>               hop->mem_type = TTM_PL_TT;
>> @@ -540,7 +537,7 @@ static int amdgpu_bo_move(struct ttm_buffer_object 
>> *bo, bool evict,
>>               return r;
>>       }
>> -    trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);
>> +    trace_amdgpu_bo_move(abo, new_mem_type, old_mem_type);
>>   out:
>>       /* update statistics */
>>       atomic64_add(bo->base.size, &adev->num_bytes_moved);
> 


More information about the amd-gfx mailing list