[RFC PATCH] drm/amdgpu: Fix one use-after-free of VM

Christian König ckoenig.leichtzumerken at gmail.com
Tue Apr 12 09:19:52 UTC 2022


On 12.04.22 at 09:16, xinhui pan wrote:
> The VM might already be freed when amdgpu_vm_tlb_seq_cb() is called.
> We see the call trace below.
>
> Fix it by adding vm.delayed_tlb_flush and checking this value in vm_fini().
>
> BUG kmalloc-4k (Not tainted): Poison overwritten

Shit, I feared that this could happen.

Please don't use a counter + schedule(), but rather keep the last flush 
fence around and wait for it to signal during VM destruction.

Since fences signal in order, you can then wait on that one and make sure 
that the last callback has run.
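
Something like this completely untested sketch; vm->last_tlb_flush is 
just a placeholder name here, assuming it gets initialized with 
dma_fence_get_stub() in amdgpu_vm_init():

	/* amdgpu_vm_update_range(): remember the last fence that carries
	 * the TLB sequence callback instead of counting callbacks.
	 */
	if (flush_tlb || params.table_freed) {
		tlb_cb->vm = vm;
		if (fence && *fence &&
		    !dma_fence_add_callback(*fence, &tlb_cb->cb,
					    amdgpu_vm_tlb_seq_cb)) {
			dma_fence_put(vm->last_tlb_flush);
			vm->last_tlb_flush = dma_fence_get(*fence);
		} else {
			/* No fence to wait for, bump the sequence now */
			amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
		}
	}

	/* amdgpu_vm_fini(): fences signal in order, so once the last
	 * flush fence has signaled its callback and all earlier ones
	 * have run and nothing dereferences the VM anymore.
	 */
	dma_fence_wait(vm->last_tlb_flush, false);
	dma_fence_put(vm->last_tlb_flush);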

Thanks,
Christian.

>
> 0xffff9c88630414e8-0xffff9c88630414e8 @offset=5352. First byte 0x6c instead of 0x6b
> Allocated in amdgpu_driver_open_kms+0x9d/0x360 [amdgpu] age=44 cpu=0 pid=2343
>   __slab_alloc.isra.0+0x4f/0x90
>   kmem_cache_alloc_trace+0x6b8/0x7a0
>   amdgpu_driver_open_kms+0x9d/0x360 [amdgpu]
>   drm_file_alloc+0x222/0x3e0 [drm]
>   drm_open+0x11d/0x410 [drm]
>   drm_stub_open+0xdc/0x230 [drm]
>   chrdev_open+0xa5/0x1e0
>   do_dentry_open+0x16c/0x3c0
>   vfs_open+0x2d/0x30
>   path_openat+0x70a/0xa90
>   do_filp_open+0xb2/0x120
>   do_sys_openat2+0x245/0x330
>   do_sys_open+0x46/0x80
>   __x64_sys_openat+0x20/0x30
>   do_syscall_64+0x38/0xc0
>   entry_SYSCALL_64_after_hwframe+0x44/0xae
> Freed in amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu] age=22 cpu=1 pid=2485
>   kfree+0x4a2/0x580
>   amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu]
>   drm_file_free+0x24e/0x3c0 [drm]
>   drm_close_helper.isra.0+0x90/0xb0 [drm]
>   drm_release+0x97/0x1a0 [drm]
>   __fput+0xb6/0x280
>   ____fput+0xe/0x10
>   task_work_run+0x64/0xb0
>   do_exit+0x406/0xcf0
>   do_group_exit+0x50/0xc0
>   __x64_sys_exit_group+0x18/0x20
>   do_syscall_64+0x38/0xc0
>   entry_SYSCALL_64_after_hwframe+0x44/0xae
>
> Signed-off-by: xinhui pan <xinhui.pan at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 1 +
>   2 files changed, 5 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 645ce28277c2..0e89f42283c6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -789,6 +789,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
>   
>   	tlb_cb = container_of(cb, typeof(*tlb_cb), cb);
>   	atomic64_inc(&tlb_cb->vm->tlb_seq);
> +	atomic64_dec(&tlb_cb->vm->delayed_tlb_flush);
>   	kfree(tlb_cb);
>   }
>   
> @@ -932,6 +933,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   
>   	if (flush_tlb || params.table_freed) {
>   		tlb_cb->vm = vm;
> +		atomic64_inc(&vm->delayed_tlb_flush);
>   		if (!fence || !*fence ||
>   		    dma_fence_add_callback(*fence, &tlb_cb->cb,
>   					   amdgpu_vm_tlb_seq_cb))
> @@ -2257,6 +2259,8 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   	amdgpu_vm_set_pasid(adev, vm, 0);
>   	dma_fence_wait(vm->last_unlocked, false);
>   	dma_fence_put(vm->last_unlocked);
> +	while (atomic64_read(&vm->delayed_tlb_flush))
> +		schedule();
>   
>   	list_for_each_entry_safe(mapping, tmp, &vm->freed, list) {
>   		if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 1a814fbffff8..4dc9f493d355 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -286,6 +286,7 @@ struct amdgpu_vm {
>   
>   	/* Last finished delayed update */
>   	atomic64_t		tlb_seq;
> +	atomic64_t		delayed_tlb_flush;
>   
>   	/* Last unlocked submission to the scheduler entities */
>   	struct dma_fence	*last_unlocked;


