[PATCH 3/6] drm/amdgpu: Auto-validate DMABuf imports in compute VMs

Joshi, Mukul Mukul.Joshi at amd.com
Tue Nov 7 20:24:46 UTC 2023


[AMD Official Use Only - General]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Felix
> Kuehling
> Sent: Tuesday, November 7, 2023 11:58 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Chen, Xiaogang <Xiaogang.Chen at amd.com>; Errabolu, Ramesh
> <Ramesh.Errabolu at amd.com>; Koenig, Christian
> <Christian.Koenig at amd.com>
> Subject: [PATCH 3/6] drm/amdgpu: Auto-validate DMABuf imports in
> compute VMs
>
> Caution: This message originated from an External Source. Use proper caution
> when opening attachments, clicking links, or responding.
>
>
> DMABuf imports in compute VMs are not wrapped in a kgd_mem object on
> the process_info->kfd_bo_list. There is no explicit KFD API call to validate them
> or add eviction fences to them.
>
> This patch automatically validates and fences dymanic DMABuf imports when
> they are added to a compute VM. Revalidation after evictions is handled in the
> VM code.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |   3 +
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  15 ++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c   |   6 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c       |  26 ++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        | 117
> +++++++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        |   6 +-
>  7 files changed, 164 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index fcf8a98ad15e..68d534a89942 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -178,6 +178,9 @@ int
> amdgpu_queue_mask_bit_to_set_resource_bit(struct amdgpu_device *adev,
> struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
>                                 struct mm_struct *mm,
>                                 struct svm_range_bo *svm_bo);
> +int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
> +                                       uint32_t domain,
> +                                       struct dma_fence *fence);
>  #if defined(CONFIG_DEBUG_FS)
>  int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data);  #endif diff --
> git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 2e302956a279..0c1cb6048259 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -423,9 +423,9 @@ static int amdgpu_amdkfd_bo_validate(struct
> amdgpu_bo *bo, uint32_t domain,
>         return ret;
>  }
>
> -static int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
> -                                              uint32_t domain,
> -                                              struct dma_fence *fence)
> +int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
> +                                       uint32_t domain,
> +                                       struct dma_fence *fence)
>  {
>         int ret = amdgpu_bo_reserve(bo, false);
>
> @@ -2948,7 +2948,7 @@ int
> amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
> **ef)
>                 struct amdgpu_device *adev = amdgpu_ttm_adev(
>                         peer_vm->root.bo->tbo.bdev);
>
> -               ret = amdgpu_vm_handle_moved(adev, peer_vm, &ctx.ticket);
> +               ret = amdgpu_vm_handle_moved(adev, peer_vm, &ctx.ticket,
> + true);
>                 if (ret) {
>                         pr_debug("Memory eviction: handle moved failed. Try again\n");
>                         goto validate_map_fail; @@ -3001,7 +3001,7 @@ int
> amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
> **ef)
>                                    &process_info->eviction_fence->base,
>                                    DMA_RESV_USAGE_BOOKKEEP);
>         }
> -       /* Attach eviction fence to PD / PT BOs */
> +       /* Attach eviction fence to PD / PT BOs and DMABuf imports */
>         list_for_each_entry(peer_vm, &process_info->vm_list_head,
>                             vm_list_node) {
>                 struct amdgpu_bo *bo = peer_vm->root.bo; @@ -3009,6 +3009,11
> @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct
> dma_fence **ef)
>                 dma_resv_add_fence(bo->tbo.base.resv,
>                                    &process_info->eviction_fence->base,
>                                    DMA_RESV_USAGE_BOOKKEEP);
> +
> +               ret = amdgpu_vm_fence_imports(peer_vm, &ctx.ticket,
> +                                             &process_info->eviction_fence->base);
> +               if (ret)
> +                       break;
>         }
>
>  validate_map_fail:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index aafedb344c1b..20f4be8cd635 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1165,7 +1165,7 @@ static int amdgpu_cs_vm_handling(struct
> amdgpu_cs_parser *p)
>                         return r;
>         }
>
> -       r = amdgpu_vm_handle_moved(adev, vm, &p->ticket);
> +       r = amdgpu_vm_handle_moved(adev, vm, &p->ticket, false);
>         if (r)
>                 return r;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> index e7e87a3b2601..234244704f27 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> @@ -373,6 +373,10 @@ amdgpu_dma_buf_move_notify(struct
> dma_buf_attachment *attach)
>         struct amdgpu_vm_bo_base *bo_base;
>         int r;
>
> +       /* FIXME: This should be after the "if", but needs a fix to make sure
> +        * DMABuf imports are initialized in the right VM list.
> +        */
> +       amdgpu_vm_bo_invalidate(adev, bo, false);
>         if (!bo->tbo.resource || bo->tbo.resource->mem_type ==
> TTM_PL_SYSTEM)
>                 return;
>
> @@ -409,7 +413,7 @@ amdgpu_dma_buf_move_notify(struct
> dma_buf_attachment *attach)
>                 if (!r)
>                         r = amdgpu_vm_clear_freed(adev, vm, NULL);
>                 if (!r)
> -                       r = amdgpu_vm_handle_moved(adev, vm, ticket);
> +                       r = amdgpu_vm_handle_moved(adev, vm, ticket,
> + false);
>
>                 if (r && r != -EBUSY)
>                         DRM_ERROR("Failed to invalidate VM page tables (%d))\n", diff --
> git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index 849fffbb367d..755cc3c559f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -186,6 +186,32 @@ static int amdgpu_gem_object_open(struct
> drm_gem_object *obj,
>         else
>                 ++bo_va->ref_count;
>         amdgpu_bo_unreserve(abo);
> +
> +       /* Validate and add eviction fence to DMABuf imports with dymanic

dymanic -> dynamic

> +        * attachment in compute VMs. Re-validation will be done by
> +        * amdgpu_vm_handle_moved and the fence will be updated by
> +        * amdgpu_vm_fence_imports in
> amdgpu_amdkfd_gpuvm_restore_process_bos.
> +        *
> +        * Nested locking below for the case that a GEM object is opened in
> +        * kfd_mem_export_dmabuf. Since the lock below is only taken for
> imports,
> +        * but not for export, this is a different lock class that cannot lead to
> +        * circular lock dependencies.
> +        */
> +       if (!vm->is_compute_context || !vm->process_info)
> +               return 0;
> +       if (!obj->import_attach ||
> +           !dma_buf_is_dynamic(obj->import_attach->dmabuf))
> +               return 0;
> +       mutex_lock_nested(&vm->process_info->lock, 1);
> +       if (!WARN_ON(!vm->process_info->eviction_fence)) {
> +               r = amdgpu_amdkfd_bo_validate_and_fence(abo,
> AMDGPU_GEM_DOMAIN_GTT,
> +                                                       &vm->process_info->eviction_fence->base);
> +               if (r)
> +                       dev_warn(adev->dev, "%d: validate_and_fence failed: %d\n",
> +                                vm->task_info.pid, r);
> +       }
> +       mutex_unlock(&vm->process_info->lock);
> +
>         return 0;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 0d685577243c..b2c7449ab561 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1307,6 +1307,7 @@ int amdgpu_vm_clear_freed(struct
> amdgpu_device *adev,
>   * @adev: amdgpu_device pointer
>   * @vm: requested vm
>   * @ticket: optional reservation ticket used to reserve the VM
> + * @valitate: whether to auto-validate invalid DMABuf imports

valitate -> validate

>   *
>   * Make sure all BOs which are moved are updated in the PTs.
>   *
> @@ -1317,7 +1318,8 @@ int amdgpu_vm_clear_freed(struct
> amdgpu_device *adev,
>   */
>  int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
>                            struct amdgpu_vm *vm,
> -                          struct ww_acquire_ctx *ticket)
> +                          struct ww_acquire_ctx *ticket,
> +                          bool validate)
>  {
>         struct amdgpu_bo_va *bo_va;
>         struct dma_resv *resv;
> @@ -1337,6 +1339,12 @@ int amdgpu_vm_handle_moved(struct
> amdgpu_device *adev,
>                 spin_lock(&vm->status_lock);
>         }
>
> +       /* If we're validating user BOs, splice all evicted user BOs into
> +        * the list of invalid BOs for revalidation
> +        */
> +       if (validate)
> +               list_splice_init(&vm->evicted_user, &vm->invalidated);
> +
>         while (!list_empty(&vm->invalidated)) {
>                 bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
>                                          base.vm_status); @@ -1357,17 +1365,120 @@ int
> amdgpu_vm_handle_moved(struct amdgpu_device *adev,
>                         unlock = false;
>                 }
>
> +               /* Automatically validate DMABuf imports in compute VMs, if we
> +                * have a reservation, or remember them for later validation.
> +                */
> +               if (vm->is_compute_context && bo_va->base.bo &&
> +                   bo_va->base.bo->tbo.base.import_attach &&
> +                   (!bo_va->base.bo->tbo.resource ||
> +                    bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM))
> {
> +                       struct ttm_operation_ctx ctx = { true, false };
> +                       struct amdgpu_bo *bo = bo_va->base.bo;
> +
> +                       if (!validate) {
> +                               r = amdgpu_vm_bo_update(adev, bo_va, clear);
> +                               if (!r)
> +                                       amdgpu_vm_bo_evicted_user(&bo_va->base);
> +                               goto unlock;
> +                       }
> +
> +                       if (clear) {
> +                               pr_warn_ratelimited("Invalid DMABuf import is busy in pid
> %d\n", vm->task_info.pid);
> +                               r = -EBUSY;
> +                               goto unlock;
> +                       }
> +
> +                       amdgpu_bo_placement_from_domain(bo,
> +                                                       bo->preferred_domains);
> +                       r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> +                       if (r)
> +                               goto unlock;
> +                       r = amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD,
> +                                               true);
> +                       if (r)
> +                               goto unlock;
> +               }
> +
>                 r = amdgpu_vm_bo_update(adev, bo_va, clear);
> +unlock:
> +               if (unlock)
> +                       dma_resv_unlock(resv);
>                 if (r)
>                         return r;
> +               spin_lock(&vm->status_lock);
> +       }
> +       spin_unlock(&vm->status_lock);
> +
> +       return 0;
> +}
> +
> +/**
> + * amdgpu_vm_fence_imports - add fence to valid DMABuf imports
> + *
> + * @vm: requested vm
> + * @ticket: optional reservation ticket used to reserve the VM
> + * @fence: fence to add
> + *
> + * Add the specified fence to all dymanic DMABuf imports that are valid.
> + *
> + * Returns:
> + * 0 for success.
> + */
> +int amdgpu_vm_fence_imports(struct amdgpu_vm *vm,
> +                           struct ww_acquire_ctx *ticket,
> +                           struct dma_fence *fence) {
> +       struct amdgpu_bo_va *bo_va, *tmp;
> +       struct dma_resv *resv;
> +       LIST_HEAD(imports);
> +       bool unlock;
> +       int ret = 0;
> +
> +       if (!vm->is_compute_context)
> +               return 0;
> +
> +       /* Move all the DMABuf imports to a private list so I can reserve
> +        * them while not holding te status_lock.
> +        */
> +       spin_lock(&vm->status_lock);
> +       list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) {
> +               if (bo_va->base.bo && bo_va->base.bo->tbo.base.import_attach &&
> +                   dma_buf_is_dynamic(bo_va->base.bo->tbo.base.import_attach-
> >dmabuf))
> +                       list_move(&bo_va->base.vm_status, &imports);
> +       }
> +       spin_unlock(&vm->status_lock);
> +
> +       list_for_each_entry(bo_va, &imports, base.vm_status) {
> +               resv = bo_va->base.bo->tbo.base.resv;
> +
> +               /* Try to reserve the BO */
> +               if (dma_resv_trylock(resv)) {
> +                       unlock = true;
> +               /* The caller is already holding the reservation lock */
> +               } else if (ticket && dma_resv_locking_ctx(resv) == ticket) {
> +                       unlock = false;
> +               } else {
> +                       WARN_ONCE(1, "Failed to reserve DMABuf import");
> +                       ret = -EBUSY;
> +                       break;
> +               }
> +
> +               ret = dma_resv_reserve_fences(resv, 1);
> +               if (!ret)
> +                       dma_resv_add_fence(resv, fence,
> +                                          DMA_RESV_USAGE_BOOKKEEP);
>
>                 if (unlock)
>                         dma_resv_unlock(resv);
> -               spin_lock(&vm->status_lock);
> +               if (ret)
> +                       break;
>         }
> +
> +       spin_lock(&vm->status_lock);
> +       list_splice(&imports, &vm->idle);
>         spin_unlock(&vm->status_lock);
>
> -       return 0;
> +       return ret;
>  }
>
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 939d0c2219c0..2db04b8fef97 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -443,7 +443,11 @@ int amdgpu_vm_clear_freed(struct amdgpu_device
> *adev,
>                           struct dma_fence **fence);  int
> amdgpu_vm_handle_moved(struct amdgpu_device *adev,
>                            struct amdgpu_vm *vm,
> -                          struct ww_acquire_ctx *ticket);
> +                          struct ww_acquire_ctx *ticket,
> +                          bool validate); int
> +amdgpu_vm_fence_imports(struct amdgpu_vm *vm,
> +                           struct ww_acquire_ctx *ticket,
> +                           struct dma_fence *fence);
>  int amdgpu_vm_flush_compute_tlb(struct amdgpu_device *adev,
>                                 struct amdgpu_vm *vm,
>                                 uint32_t flush_type,
> --
> 2.34.1



More information about the amd-gfx mailing list