回复: [RFC PATCH v3] drm/amdgpu: Remove kfd eviction fence before release bo

Pan, Xinhui Xinhui.Pan at amd.com
Sat Feb 8 14:58:20 UTC 2020


Sorry, there is a coding error; I will send out V4.

________________________________________
发件人: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> 代表 Pan, Xinhui <Xinhui.Pan at amd.com>
发送时间: 2020年2月8日 22:48
收件人: amd-gfx at lists.freedesktop.org
抄送: Deucher, Alexander; Kuehling, Felix; Koenig, Christian
主题: [RFC PATCH v3] drm/amdgpu: Remove kfd eviction fence before release bo

No need to trigger eviction as the memory mapping will not be used anymore.

All pt/pd bos share the same resv, hence the same shared eviction fence. Every time a page table is freed, the fence will be signaled, and that causes unexpected kfd evictions.

Signed-off-by: xinhui pan <xinhui.pan at example.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 35 +++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c    |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  1 +
 drivers/gpu/drm/ttm/ttm_bo.c                  | 16 +++++----
 5 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 47b0f2957d1f..265b1ed7264c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -96,6 +96,7 @@ struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
                                                       struct mm_struct *mm);
 bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm);
 struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
+int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo);

 struct amdkfd_process_info {
        /* List head of all VMs that belong to a KFD process */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index ef721cb65868..8a06ba3c9d41 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -276,6 +276,40 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
        return 0;
 }

+int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo)
+{
+       struct amdgpu_vm_bo_base *vm_bo;
+       struct amdgpu_vm *vm;
+       struct amdkfd_process_info *info;
+       struct amdgpu_amdkfd_fence *ef;
+       int ret;
+
+       while (bo->parent)
+               bo = bo->parent;
+
+       vm_bo = bo->vm_bo;
+       if (!vm_bo)
+               return 0;
+
+       vm = vm_bo->vm;
+       if (!vm)
+               return 0;
+
+       info = vm->process_info;
+       if (!info || !info->eviction_fence)
+               return 0;
+
+       ef = container_of(dma_fence_get(&info->eviction_fence->base),
+                       struct amdgpu_amdkfd_fence, base);
+
+       BUG_ON(!dma_resv_trylock(&bo->tbo.base._resv));
+       ret = amdgpu_amdkfd_remove_eviction_fence(bo, ef);
+       dma_resv_unlock(&bo->tbo.base._resv);
+
+       dma_fence_put(&ef->base);
+       return ret;
+}
+
 static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
                                     bool wait)
 {
@@ -1051,6 +1085,7 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
                WARN_ON(!list_empty(&process_info->userptr_valid_list));
                WARN_ON(!list_empty(&process_info->userptr_inval_list));

+               vm->process_info = NULL;
                dma_fence_put(&process_info->eviction_fence->base);
                cancel_delayed_work_sync(&process_info->restore_userptr_work);
                put_pid(process_info->pid);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 6f60a581e3ba..3784d178c965 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1307,6 +1307,8 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
        if (abo->kfd_bo)
                amdgpu_amdkfd_unreserve_memory_limit(abo);

+       amdgpu_amdkfd_remove_fence_on_pt_pd_bos(abo);
+
        if (bo->mem.mem_type != TTM_PL_VRAM || !bo->mem.mm_node ||
            !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE))
                return;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 247f328b7223..eca4ec66c1ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -3109,6 +3109,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
        }

        amdgpu_vm_free_pts(adev, vm, NULL);
+       root->vm_bo = NULL;
        amdgpu_bo_unreserve(root);
        amdgpu_bo_unref(&root);
        WARN_ON(vm->root.base.bo);
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 6c3cea509e25..855d3566381e 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -399,8 +399,7 @@ static int ttm_bo_individualize_resv(struct ttm_buffer_object *bo)
        BUG_ON(!dma_resv_trylock(&bo->base._resv));

        r = dma_resv_copy_fences(&bo->base._resv, bo->base.resv);
-       if (r)
-               dma_resv_unlock(&bo->base._resv);
+       dma_resv_unlock(&bo->base._resv);

        return r;
 }
@@ -565,9 +564,6 @@ static void ttm_bo_release(struct kref *kref)
        int ret;

        if (!bo->deleted) {
-               if (bo->bdev->driver->release_notify)
-                       bo->bdev->driver->release_notify(bo);
-
                drm_vma_offset_remove(bdev->vma_manager, &bo->base.vma_node);
                ttm_mem_io_lock(man, false);
                ttm_mem_io_free_vm(bo);
@@ -581,6 +577,14 @@ static void ttm_bo_release(struct kref *kref)
                        dma_resv_wait_timeout_rcu(bo->base.resv, true, false,
                                                  30 * HZ);
                }
+
+               spin_lock(&ttm_bo_glob.lru_lock);
+               if (bo->type != ttm_bo_type_sg)
+                       bo->base.resv = &bo->base._resv;
+               spin_unlock(&ttm_bo_glob.lru_lock);
+
+               if (bo->bdev->driver->release_notify)
+                       bo->bdev->driver->release_notify(bo);
        }

        if (!dma_resv_test_signaled_rcu(bo->base.resv, true)) {
@@ -599,8 +603,6 @@ static void ttm_bo_release(struct kref *kref)
                }

                spin_lock(&ttm_bo_glob.lru_lock);
-               if (bo->type != ttm_bo_type_sg)
-                       bo->base.resv = &bo->base._resv;
                kref_init(&bo->kref);
                list_add_tail(&bo->ddestroy, &bdev->ddestroy);
                spin_unlock(&ttm_bo_glob.lru_lock);
--
2.17.1
_______________________________________________
amd-gfx mailing list
amd-gfx at lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cxinhui.pan%40amd.com%7Ce3459b706ea244e5db9008d7aca60a76%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637167701449333169&sdata=WAk%2FWgwsOwgdrdOKGZ8kw0g6mj5IbEX9COuL1eN4pwg%3D&reserved=0


More information about the amd-gfx mailing list