[PATCH v2 1/2] drm/amdgpu: Auto-validate DMABuf imports in compute VMs
Felix Kuehling
felix.kuehling at amd.com
Thu Dec 14 21:40:01 UTC 2023
On 2023-12-13 09:30, Christian König wrote:
> Am 06.12.23 um 22:44 schrieb Felix Kuehling:
>> DMABuf imports in compute VMs are not wrapped in a kgd_mem object on the
>> process_info->kfd_bo_list. There is no explicit KFD API call to validate
>> them or add eviction fences to them.
>>
>> This patch automatically validates and fences dynamic DMABuf imports
>> when they are added to a compute VM. Revalidation after evictions is
>> handled in the VM code.
>>
>> v2:
>> * Renamed amdgpu_vm_validate_evicted_bos to amdgpu_vm_validate
>> * Eliminated evicted_user state, use evicted state for VM BOs and
>> user BOs
>> * Fixed and simplified amdgpu_vm_fence_imports, depends on reserved BOs
>> * Moved dma_resv_reserve_fences for amdgpu_vm_fence_imports into
>> amdgpu_vm_validate, outside the vm->status_lock
>> * Added dummy version of amdgpu_amdkfd_bo_validate_and_fence for builds
>> without KFD
>>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 ++
>> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 44 +++++---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 4 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 28 ++++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 101 ++++++++++++++++--
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 10 +-
>> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 10 +-
>> 8 files changed, 177 insertions(+), 36 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index f262b9d89541..584a0cea5572 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -191,6 +191,9 @@ struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
>> int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo);
>> int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
>> unsigned long cur_seq, struct kgd_mem *mem);
>> +int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
>> + uint32_t domain,
>> + struct dma_fence *fence);
>> #else
>> static inline
>> bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm)
>> @@ -216,6 +219,13 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
>> {
>> return 0;
>> }
>> +static inline
>> +int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
>> + uint32_t domain,
>> + struct dma_fence *fence)
>> +{
>> + return 0;
>> +}
>> #endif
>> /* Shared API */
>> int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 5f445d856769..fbabb1e63a67 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -426,9 +426,9 @@ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
>> return ret;
>> }
>> -static int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
>> - uint32_t domain,
>> - struct dma_fence *fence)
>> +int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
>> + uint32_t domain,
>> + struct dma_fence *fence)
>> {
>> int ret = amdgpu_bo_reserve(bo, false);
>> @@ -464,13 +464,15 @@ static int amdgpu_amdkfd_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
>> * again. Page directories are only updated after updating page
>> * tables.
>> */
>> -static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
>> +static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm,
>> + struct ww_acquire_ctx *ticket)
>> {
>> struct amdgpu_bo *pd = vm->root.bo;
>> struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
>> int ret;
>> - ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate_vm_bo, NULL);
>> + ret = amdgpu_vm_validate(adev, vm, ticket,
>> + amdgpu_amdkfd_validate_vm_bo, NULL);
>> if (ret) {
>> pr_err("failed to validate PT BOs\n");
>> return ret;
>> @@ -1310,14 +1312,15 @@ static int map_bo_to_gpuvm(struct kgd_mem *mem,
>> return ret;
>> }
>> -static int process_validate_vms(struct amdkfd_process_info *process_info)
>> +static int process_validate_vms(struct amdkfd_process_info *process_info,
>> + struct ww_acquire_ctx *ticket)
>> {
>> struct amdgpu_vm *peer_vm;
>> int ret;
>> list_for_each_entry(peer_vm, &process_info->vm_list_head,
>> vm_list_node) {
>> - ret = vm_validate_pt_pd_bos(peer_vm);
>> + ret = vm_validate_pt_pd_bos(peer_vm, ticket);
>> if (ret)
>> return ret;
>> }
>> @@ -1402,7 +1405,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
>> ret = amdgpu_bo_reserve(vm->root.bo, true);
>> if (ret)
>> goto reserve_pd_fail;
>> - ret = vm_validate_pt_pd_bos(vm);
>> + ret = vm_validate_pt_pd_bos(vm, NULL);
>> if (ret) {
>> pr_err("validate_pt_pd_bos() failed\n");
>> goto validate_pd_fail;
>> @@ -2043,7 +2046,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
>> bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
>> is_invalid_userptr = true;
>> - ret = vm_validate_pt_pd_bos(avm);
>> + ret = vm_validate_pt_pd_bos(avm, NULL);
>> if (unlikely(ret))
>> goto out_unreserve;
>> @@ -2122,7 +2125,7 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
>> goto unreserve_out;
>> }
>> - ret = vm_validate_pt_pd_bos(avm);
>> + ret = vm_validate_pt_pd_bos(avm, NULL);
>> if (unlikely(ret))
>> goto unreserve_out;
>> @@ -2620,7 +2623,7 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
>> }
>> }
>> - ret = process_validate_vms(process_info);
>> + ret = process_validate_vms(process_info, NULL);
>> if (ret)
>> goto unreserve_out;
>> @@ -2880,11 +2883,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *
>> amdgpu_sync_create(&sync_obj);
>> - /* Validate PDs and PTs */
>> - ret = process_validate_vms(process_info);
>> - if (ret)
>> - goto validate_map_fail;
>> -
>> /* Validate BOs and map them to GPUVM (update VM page tables). */
>> list_for_each_entry(mem, &process_info->kfd_bo_list,
>> validate_list) {
>> @@ -2935,6 +2933,13 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *
>> if (failed_size)
>> pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size);
>> + /* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO
>> + * validations above would invalidate DMABuf imports again.
>> + */
>> + ret = process_validate_vms(process_info, &exec.ticket);
>> + if (ret)
>> + goto validate_map_fail;
>> +
>> /* Update mappings not managed by KFD */
>> list_for_each_entry(peer_vm, &process_info->vm_list_head,
>> vm_list_node) {
>> @@ -3006,7 +3011,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *
>> &process_info->eviction_fence->base,
>> DMA_RESV_USAGE_BOOKKEEP);
>> }
>> - /* Attach eviction fence to PD / PT BOs */
>> + /* Attach eviction fence to PD / PT BOs and DMABuf imports */
>> list_for_each_entry(peer_vm, &process_info->vm_list_head,
>> vm_list_node) {
>> struct amdgpu_bo *bo = peer_vm->root.bo;
>> @@ -3014,6 +3019,11 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *
>> dma_resv_add_fence(bo->tbo.base.resv,
>> &process_info->eviction_fence->base,
>> DMA_RESV_USAGE_BOOKKEEP);
>> +
>> + ret = amdgpu_vm_fence_imports(peer_vm, &exec.ticket,
>> + &process_info->eviction_fence->base);
>> + if (ret)
>> + break;
>> }
>> validate_map_fail:
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index e50be6500030..dc7fac778417 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -952,10 +952,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>> p->bytes_moved = 0;
>> p->bytes_moved_vis = 0;
>> - r = amdgpu_vm_validate_pt_bos(p->adev, &fpriv->vm,
>> - amdgpu_cs_bo_validate, p);
>> + r = amdgpu_vm_validate(p->adev, &fpriv->vm, NULL,
>> + amdgpu_cs_bo_validate, p);
>> if (r) {
>> - DRM_ERROR("amdgpu_vm_validate_pt_bos() failed.\n");
>> + DRM_ERROR("amdgpu_vm_validate() failed.\n");
>> goto out_free_user_pages;
>> }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
>> index decbbe3d4f06..055ba2ea4c12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
>> @@ -377,6 +377,10 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach)
>> struct amdgpu_vm_bo_base *bo_base;
>> int r;
>> + /* FIXME: This should be after the "if", but needs a fix to
>> make sure
>> + * DMABuf imports are initialized in the right VM list.
>> + */
>> + amdgpu_vm_bo_invalidate(adev, bo, false);
>> if (!bo->tbo.resource || bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
>> return;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index b1ce22a9b186..68bab2879696 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -187,7 +187,33 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj,
>> else
>> ++bo_va->ref_count;
>> amdgpu_bo_unreserve(abo);
>> - return 0;
>> +
>> + /* Validate and add eviction fence to DMABuf imports with dynamic
>> + * attachment in compute VMs. Re-validation will be done by
>> + * amdgpu_vm_validate and the fence will be updated by
>> + * amdgpu_vm_fence_imports in amdgpu_amdkfd_gpuvm_restore_process_bos.
>> + *
>> + * Nested locking below for the case that a GEM object is opened in
>> + * kfd_mem_export_dmabuf. Since the lock below is only taken for imports,
>> + * but not for export, this is a different lock class that cannot lead to
>> + * circular lock dependencies.
>> + */
>> + if (!vm->is_compute_context || !vm->process_info)
>> + return 0;
>> + if (!obj->import_attach ||
>> + !dma_buf_is_dynamic(obj->import_attach->dmabuf))
>> + return 0;
>> + mutex_lock_nested(&vm->process_info->lock, 1);
>> + if (!WARN_ON(!vm->process_info->eviction_fence)) {
>> + r = amdgpu_amdkfd_bo_validate_and_fence(abo, AMDGPU_GEM_DOMAIN_GTT,
>> + &vm->process_info->eviction_fence->base);
>> + if (r)
>> + dev_warn(adev->dev, "%d: validate_and_fence failed: %d\n",
>> + vm->task_info.pid, r);
>> + }
>> + mutex_unlock(&vm->process_info->lock);
>> +
>> + return r;
>> }
>> static void amdgpu_gem_object_close(struct drm_gem_object *obj,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 7da71b6a9dc6..ab2662ab4ab8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -426,24 +426,29 @@ uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>> }
>> /**
>> - * amdgpu_vm_validate_pt_bos - validate the page table BOs
>> + * amdgpu_vm_validate - validate evicted BOs tracked in the VM
>> *
>> * @adev: amdgpu device pointer
>> * @vm: vm providing the BOs
>> + * @ticket: optional reservation ticket used to reserve the VM
>> * @validate: callback to do the validation
>> * @param: parameter for the validation callback
>> *
>> - * Validate the page table BOs on command submission if neccessary.
>> + * Validate the page table BOs and per-VM BOs on command submission if
>> + * necessary. If a ticket is given, also try to validate evicted user queue
>> + * BOs. They must already be reserved with the given ticket.
>> *
>> * Returns:
>> * Validation result.
>> */
>> -int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>> - int (*validate)(void *p, struct amdgpu_bo *bo),
>> - void *param)
>> +int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>> + struct ww_acquire_ctx *ticket,
>> + int (*validate)(void *p, struct amdgpu_bo *bo),
>> + void *param)
>> {
>> struct amdgpu_vm_bo_base *bo_base;
>> struct amdgpu_bo *shadow;
>> + struct dma_resv *resv;
>> struct amdgpu_bo *bo;
>> int r;
>> @@ -464,8 +469,27 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>> spin_unlock(&vm->status_lock);
>> bo = bo_base->bo;
>> + resv = bo->tbo.base.resv;
>> shadow = amdgpu_bo_shadowed(bo);
>> + if (resv != vm->root.bo->tbo.base.resv) {
>> + if (!ticket) {
>> + /* We need to move the BO out of the evicted
>> + * list to avoid an infinite loop. It will be
>> + * moved back to evicted in the next
>> + * amdgpu_vm_handle_moved.
>> + */
>> + amdgpu_vm_bo_invalidated(bo_base);
>> + spin_lock(&vm->status_lock);
>> + continue;
>> + }
>> + if (dma_resv_locking_ctx(resv) != ticket) {
>> + pr_warn_ratelimited("Evicted user BO is not reserved
>> in pid %d\n",
>> + vm->task_info.pid);
>
> I'm not sure if that's the most defensive option. Might be better to
> handle this in the "if (!ticket..." case above.
Do you mean printing the warning, or aborting the validation with an error?
The "if (!ticket)" case above is for callers that don't need to validate
user BOs for user mode queue usage. This error here is in the case where
the caller wants to validate user BOs for user mode queues, and the
caller did not reserve all those BOs. In this case I think I need to
terminate with an error to avoid enabling queues with invalid BOs.
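To illustrate the intended contract from the caller's side (a rough
sketch only; error handling and the drm_exec locking loop omitted):

    /* CS path: no ticket, evicted user queue BOs are skipped and
     * deferred to the next amdgpu_vm_handle_moved.
     */
    r = amdgpu_vm_validate(adev, vm, NULL, amdgpu_cs_bo_validate, p);

    /* KFD restore path: everything was locked with exec.ticket up
     * front, so evicted user BOs get validated too; one that is not
     * reserved with that ticket is a caller bug and fails with -EINVAL.
     */
    r = amdgpu_vm_validate(adev, vm, &exec.ticket,
                           amdgpu_amdkfd_validate_vm_bo, NULL);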
>
>> + return -EINVAL;
>> + }
>> + }
>> +
>> r = validate(param, bo);
>> if (r)
>> return r;
>> @@ -475,7 +499,13 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>> return r;
>> }
>> - if (bo->tbo.type != ttm_bo_type_kernel) {
>> + if (resv != vm->root.bo->tbo.base.resv) {
>> + /* Fence reservation for amdgpu_vm_fence_imports */
>> + r = dma_resv_reserve_fences(resv, 1);
>> + if (r)
>> + return r;
>
> Fence slot reservation should be done by the caller and not here.
The caller doesn't necessarily have the BO list needed to add all those
fences. The whole point of doing this in the VM code was to use the "BO
lists" maintained by the VM state machine. Having to find all the BOs in
the caller to add these fences kind of defeats the purpose. Then I could
do the validation there too, and wouldn't need to do the validation in
the VM code at all.
The idea here is that I reserve the fence slots in amdgpu_vm_validate
and use them in amdgpu_vm_fence_imports. Conceptually, amdgpu_vm_validate
is where we enumerate and validate BOs before command submission. This is
a convenient place because I already require that the BOs be reserved
there. amdgpu_vm_fence_imports is where we add the fences after command
submission. At that point the BOs are not reserved any more, and I cannot
reserve them without risking race conditions, because I'm holding the VM
state spinlock.
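Roughly, the pairing looks like this (simplified sketch, not the literal
patch code):

    /* In amdgpu_vm_validate: the import's reservation object is locked
     * by the caller's ticket, so we may still allocate and can reserve
     * a fence slot that is guaranteed to be available later.
     */
    r = dma_resv_reserve_fences(bo->tbo.base.resv, 1);
    if (r)
            return r;

    /* In amdgpu_vm_fence_imports, after command submission: consume
     * the slot reserved above. dma_resv_add_fence() does not allocate,
     * so this is safe while holding the vm->status_lock spinlock.
     */
    dma_resv_add_fence(bo->tbo.base.resv, fence, DMA_RESV_USAGE_BOOKKEEP);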
>
>
>> + amdgpu_vm_bo_invalidated(bo_base);
>> + } if (bo->tbo.type != ttm_bo_type_kernel) {
>> amdgpu_vm_bo_moved(bo_base);
>> } else {
>> vm->update_funcs->map_table(to_amdgpu_bo_vm(bo));
>> @@ -1425,11 +1455,21 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
>> }
>> r = amdgpu_vm_bo_update(adev, bo_va, clear);
>> - if (r)
>> - return r;
>> if (unlock)
>> dma_resv_unlock(resv);
>> + if (r)
>> + return r;
>> +
>> + /* Remember evicted DMABuf imports in compute VMs for later
>> + * validation
>> + */
>> + if (vm->is_compute_context && bo_va->base.bo &&
>> + bo_va->base.bo->tbo.base.import_attach &&
>> + (!bo_va->base.bo->tbo.resource ||
>> + bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM))
>> + amdgpu_vm_bo_evicted(&bo_va->base);
>> +
>> spin_lock(&vm->status_lock);
>> }
>> spin_unlock(&vm->status_lock);
>> @@ -1437,6 +1477,51 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
>> return 0;
>> }
>> +/**
>> + * amdgpu_vm_fence_imports - add fence to valid DMABuf imports
>> + *
>> + * @vm: requested vm
>> + * @ticket: reservation ticket used to reserve the VM
>> + * @fence: fence to add
>> + *
>> + * Add the specified fence to all dynamic DMABuf imports that are valid.
>> + *
>> + * Returns:
>> + * 0 for success.
>> + */
>> +int amdgpu_vm_fence_imports(struct amdgpu_vm *vm,
>> + struct ww_acquire_ctx *ticket,
>> + struct dma_fence *fence)
>> +{
>> + struct amdgpu_bo_va *bo_va, *tmp;
>> + struct dma_resv *resv;
>> +
>> + if (!vm->is_compute_context)
>> + return 0;
>> + if (!ticket)
>> + return -EINVAL;
>> +
>> + spin_lock(&vm->status_lock);
>> + list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) {
>
> Why use list_for_each_entry_safe() here?
Hmm, not sure any more. I must have thought the VM state could change
here, so things would get removed from the vm->idle list. Must be a
left-over from a previous revision. I think you're right, I don't need
the safe version here.
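For the record, the _safe variant would only matter if the loop removed
the current entry while iterating, e.g. (hypothetical, not what this
code does):

    /* Deleting while iterating requires the _safe variant */
    list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status)
            list_del_init(&bo_va->base.vm_status);

Since nothing is removed from vm->idle while I hold the status_lock,
plain list_for_each_entry() is enough.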
Regards,
Felix
>
> Regards,
> Christian.
>
>> + if (!bo_va->base.bo || !bo_va->base.bo->tbo.base.import_attach ||
>> + !dma_buf_is_dynamic(bo_va->base.bo->tbo.base.import_attach->dmabuf))
>> + continue;
>> +
>> + resv = bo_va->base.bo->tbo.base.resv;
>> + if (dma_resv_locking_ctx(resv) != ticket) {
>> + pr_warn_ratelimited("Imported BO is not reserved in pid
>> %d\n",
>> + vm->task_info.pid);
>> + continue;
>> + }
>> + dma_resv_add_fence(resv, fence,
>> + DMA_RESV_USAGE_BOOKKEEP);
>> +
>> + }
>> + spin_unlock(&vm->status_lock);
>> +
>> + return 0;
>> +}
>> +
>> /**
>> * amdgpu_vm_flush_compute_tlb - Flush TLB on compute VM
>> *
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index b6cd565562ad..eff301976eee 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -434,9 +434,10 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec,
>> unsigned int num_fences);
>> bool amdgpu_vm_ready(struct amdgpu_vm *vm);
>> uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm);
>> -int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>> - int (*callback)(void *p, struct amdgpu_bo *bo),
>> - void *param);
>> +int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>> + struct ww_acquire_ctx *ticket,
>> + int (*callback)(void *p, struct amdgpu_bo *bo),
>> + void *param);
>> int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync);
>> int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
>> struct amdgpu_vm *vm, bool immediate);
>> @@ -446,6 +447,9 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>> int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
>> struct amdgpu_vm *vm,
>> struct ww_acquire_ctx *ticket);
>> +int amdgpu_vm_fence_imports(struct amdgpu_vm *vm,
>> + struct ww_acquire_ctx *ticket,
>> + struct dma_fence *fence);
>> int amdgpu_vm_flush_compute_tlb(struct amdgpu_device *adev,
>> struct amdgpu_vm *vm,
>> uint32_t flush_type,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> index 2834fb351818..9f29bbdb9050 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> @@ -1509,9 +1509,9 @@ static int svm_range_reserve_bos(struct svm_validate_context *ctx, bool intr)
>> goto unreserve_out;
>> }
>> - r = amdgpu_vm_validate_pt_bos(pdd->dev->adev,
>> - drm_priv_to_vm(pdd->drm_priv),
>> - svm_range_bo_validate, NULL);
>> + r = amdgpu_vm_validate(pdd->dev->adev,
>> + drm_priv_to_vm(pdd->drm_priv), NULL,
>> + svm_range_bo_validate, NULL);
>> if (r) {
>> pr_debug("failed %d validate pt bos\n", r);
>> goto unreserve_out;
>> @@ -1630,7 +1630,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
>> goto free_ctx;
>> }
>> - svm_range_reserve_bos(ctx, intr);
>> + r = svm_range_reserve_bos(ctx, intr);
>> + if (r)
>> + goto free_ctx;
>> p = container_of(prange->svms, struct kfd_process, svms);
>> owner = kfd_svm_page_owner(p, find_first_bit(ctx->bitmap,
>