[PATCH] drm/amdgpu: clear freed mappings immediately when BO may be freed
Nicolai Hähnle
nicolai.haehnle at amd.com
Thu Mar 23 18:33:02 UTC 2017
On 23.03.2017 19:18, Christian König wrote:
> Am 23.03.2017 um 18:22 schrieb Nicolai Hähnle:
>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>
>> Also, add the fence of the clear operations to the BO to ensure that
>> the underlying memory can only be re-used after all PTEs pointing to
>> it have been cleared.
>>
>> This avoids the following sequence of events that could be triggered
>> by user space:
>>
>> 1. Submit a CS that accesses some BO _without_ adding that BO to the
>> buffer list.
>> 2. Free that BO.
>> 3. Some other task re-uses the memory underlying the BO.
>> 4. The CS is submitted to the hardware and accesses memory that is
>> now already in use by somebody else.
>>
>> By clearing the page tables immediately in step 2, a GPU VM fault will
>> be triggered in step 4 instead of wild memory accesses.
>
> First of all please split adding the fence parameter to
> amdgpu_vm_clear_freed() into a separate patch.
Sure, I'll do that.
> Not a must have, but that should make it easier to review.
>
>>
>> Signed-off-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 13 +++++++++++--
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 20 ++++++++++++++------
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 3 ++-
>> 4 files changed, 28 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 55d553a..85e6070 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -778,21 +778,21 @@ static int amdgpu_bo_vm_update_pte(struct
>> amdgpu_cs_parser *p)
>> int i, r;
>> r = amdgpu_vm_update_page_directory(adev, vm);
>> if (r)
>> return r;
>> r = amdgpu_sync_fence(adev, &p->job->sync,
>> vm->page_directory_fence);
>> if (r)
>> return r;
>> - r = amdgpu_vm_clear_freed(adev, vm);
>> + r = amdgpu_vm_clear_freed(adev, vm, NULL);
>> if (r)
>> return r;
>> r = amdgpu_vm_bo_update(adev, fpriv->prt_va, false);
>> if (r)
>> return r;
>> r = amdgpu_sync_fence(adev, &p->job->sync,
>> fpriv->prt_va->last_pt_update);
>> if (r)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index be9fb2c..bd2daef 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -145,20 +145,21 @@ void amdgpu_gem_object_close(struct
>> drm_gem_object *obj,
>> struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
>> struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>> struct amdgpu_fpriv *fpriv = file_priv->driver_priv;
>> struct amdgpu_vm *vm = &fpriv->vm;
>> struct amdgpu_bo_list_entry vm_pd;
>> struct list_head list, duplicates;
>> struct ttm_validate_buffer tv;
>> struct ww_acquire_ctx ticket;
>> struct amdgpu_bo_va *bo_va;
>> + struct fence *fence = NULL;
>> int r;
>> INIT_LIST_HEAD(&list);
>> INIT_LIST_HEAD(&duplicates);
>> tv.bo = &bo->tbo;
>> tv.shared = true;
>> list_add(&tv.head, &list);
>> amdgpu_vm_get_pd_bo(vm, &list, &vm_pd);
>> @@ -166,23 +167,31 @@ void amdgpu_gem_object_close(struct
>> drm_gem_object *obj,
>> r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates);
>> if (r) {
>> dev_err(adev->dev, "leaking bo va because "
>> "we fail to reserve bo (%d)\n", r);
>> return;
>> }
>> bo_va = amdgpu_vm_bo_find(vm, bo);
>> if (bo_va) {
>> if (--bo_va->ref_count == 0) {
>> amdgpu_vm_bo_rmv(adev, bo_va);
>> +
>> + amdgpu_vm_clear_freed(adev, vm, &fence);
>> }
>> }
>> - ttm_eu_backoff_reservation(&ticket, &list);
>> +
>> + if (fence) {
>> + ttm_eu_fence_buffer_objects(&ticket, &list, fence);
>> + fence_put(fence);
>> + } else {
>> + ttm_eu_backoff_reservation(&ticket, &list);
>> + }
>
> Ah, now I remember why we didn't do that before. We could run into
> problems because amdgpu_gem_object_close() can't fail, and adding the
> fence here requires a memory allocation that might not succeed.
>
> Anyway, "tv.shared = true;" was there before. So your patch doesn't make
> it any worse.
>
> But I suggest to use amdgpu_bo_fence() instead of
> ttm_eu_fence_buffer_objects(). This way we don't try to add the fence to
> the page directory.
Just checked: the fence is added to the page directory anyway, in
amdgpu_vm_bo_update_mapping. I think that's necessary to make sure
subsequent CS submissions see the cleared page tables.
Anyway, it still makes sense to remove the call to
ttm_eu_fence_buffer_objects here. That's more explicit about who is
responsible for adding fences to what.
Thanks,
Nicolai
>
> Apart from that the patch looks good to me,
> Christian.
>
>> }
>> static int amdgpu_gem_handle_lockup(struct amdgpu_device *adev,
>> int r)
>> {
>> if (r == -EDEADLK) {
>> r = amdgpu_gpu_reset(adev);
>> if (!r)
>> r = -EAGAIN;
>> }
>> return r;
>> @@ -535,21 +544,21 @@ static void amdgpu_gem_va_update_vm(struct
>> amdgpu_device *adev,
>> r = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_gem_va_check,
>> NULL);
>> if (r)
>> goto error;
>> r = amdgpu_vm_update_page_directory(adev, vm);
>> if (r)
>> goto error;
>> - r = amdgpu_vm_clear_freed(adev, vm);
>> + r = amdgpu_vm_clear_freed(adev, vm, NULL);
>> if (r)
>> goto error;
>> if (operation == AMDGPU_VA_OP_MAP ||
>> operation == AMDGPU_VA_OP_REPLACE)
>> r = amdgpu_vm_bo_update(adev, bo_va, false);
>> error:
>> if (r && r != -ERESTARTSYS)
>> DRM_ERROR("Couldn't update BO_VA (%d)\n", r);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index dd7df45..b6029ed 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1397,48 +1397,56 @@ static void amdgpu_vm_prt_fini(struct
>> amdgpu_device *adev, struct amdgpu_vm *vm)
>> }
>> kfree(shared);
>> }
>> /**
>> * amdgpu_vm_clear_freed - clear freed BOs in the PT
>> *
>> * @adev: amdgpu_device pointer
>> * @vm: requested vm
>> + * @fence: optional resulting fence
>> *
>> * Make sure all freed BOs are cleared in the PT.
>> * Returns 0 for success.
>> *
>> * PTs have to be reserved and mutex must be locked!
>> */
>> int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>> - struct amdgpu_vm *vm)
>> + struct amdgpu_vm *vm,
>> + struct fence **fence)
>> {
>> struct amdgpu_bo_va_mapping *mapping;
>> - struct fence *fence = NULL;
>> + struct fence *f = NULL;
>> int r;
>> while (!list_empty(&vm->freed)) {
>> mapping = list_first_entry(&vm->freed,
>> struct amdgpu_bo_va_mapping, list);
>> list_del(&mapping->list);
>> r = amdgpu_vm_bo_split_mapping(adev, NULL, 0, NULL, vm,
>> mapping,
>> - 0, 0, &fence);
>> - amdgpu_vm_free_mapping(adev, vm, mapping, fence);
>> + 0, 0, &f);
>> + amdgpu_vm_free_mapping(adev, vm, mapping, f);
>> if (r) {
>> - fence_put(fence);
>> + fence_put(f);
>> return r;
>> }
>> + }
>> + if (fence && f) {
>> + fence_put(*fence);
>> + *fence = f;
>> + } else {
>> + fence_put(f);
>> }
>> - fence_put(fence);
>> +
>> return 0;
>> }
>> /**
>> * amdgpu_vm_clear_invalids - clear invalidated BOs in the PT
>> *
>> * @adev: amdgpu_device pointer
>> * @vm: requested vm
>> *
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index ff10fa5..9d5a572 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -187,21 +187,22 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
>> struct amdgpu_vm *vm,
>> uint64_t saddr, uint64_t size);
>> int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>> struct amdgpu_sync *sync, struct fence *fence,
>> struct amdgpu_job *job);
>> int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job);
>> void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vm_id);
>> int amdgpu_vm_update_page_directory(struct amdgpu_device *adev,
>> struct amdgpu_vm *vm);
>> int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>> - struct amdgpu_vm *vm);
>> + struct amdgpu_vm *vm,
>> + struct fence **fence);
>> int amdgpu_vm_clear_invalids(struct amdgpu_device *adev, struct
>> amdgpu_vm *vm,
>> struct amdgpu_sync *sync);
>> int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>> struct amdgpu_bo_va *bo_va,
>> bool clear);
>> void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
>> struct amdgpu_bo *bo);
>> struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm,
>> struct amdgpu_bo *bo);
>> struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
>
>
More information about the amd-gfx
mailing list