[PATCH 4/5] drm/ttm: move the LRU into resource handling
Thomas Hellström
thomas.hellstrom at linux.intel.com
Mon Aug 23 08:10:42 UTC 2021
On Mon, 2021-07-19 at 13:51 +0200, Christian König wrote:
> This way we finally fix the problem that new resources are
> not immediately evictable after allocation.
>
> That has caused numerous problems, including OOM on GDS handling
> and not being able to use TTM as a general resource manager.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 8 +-
> drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 2 +-
> drivers/gpu/drm/ttm/ttm_bo.c | 101 ++-----------------
> drivers/gpu/drm/ttm/ttm_bo_util.c | 1 -
> drivers/gpu/drm/ttm/ttm_device.c | 4 +-
> drivers/gpu/drm/ttm/ttm_resource.c | 127 ++++++++++++++++++++++++
> include/drm/ttm/ttm_bo_api.h | 16 ---
> include/drm/ttm/ttm_bo_driver.h | 29 +-----
> include/drm/ttm/ttm_resource.h | 35 +++++++
> 9 files changed, 177 insertions(+), 146 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 18246b5b6ee3..4b178a74b4e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -643,12 +643,12 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>
> if (vm->bulk_moveable) {
> spin_lock(&adev->mman.bdev.lru_lock);
> - ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move);
> + ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
> spin_unlock(&adev->mman.bdev.lru_lock);
> return;
> }
>
> - memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move));
> + ttm_lru_bulk_move_init(&vm->lru_bulk_move);
>
> spin_lock(&adev->mman.bdev.lru_lock);
> list_for_each_entry(bo_base, &vm->idle, vm_status) {
> @@ -658,11 +658,9 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
> if (!bo->parent)
> continue;
>
> - ttm_bo_move_to_lru_tail(&bo->tbo, bo->tbo.resource,
> - &vm->lru_bulk_move);
> + ttm_bo_move_to_lru_tail(&bo->tbo, &vm->lru_bulk_move);
> if (shadow)
> ttm_bo_move_to_lru_tail(&shadow->tbo,
> - shadow->tbo.resource,
> &vm->lru_bulk_move);
> }
> spin_unlock(&adev->mman.bdev.lru_lock);
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index bf33724bed5c..b38eef37f1c8 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -472,7 +472,7 @@ static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj)
> bo->priority = I915_TTM_PRIO_NO_PAGES;
> }
>
> - ttm_bo_move_to_lru_tail(bo, bo->resource, NULL);
> + ttm_bo_move_to_lru_tail(bo, NULL);
> spin_unlock(&bo->bdev->lru_lock);
> }
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 5a2dc712c632..09a62ad06b9d 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -69,95 +69,15 @@ static void ttm_bo_mem_space_debug(struct ttm_buffer_object *bo,
> }
> }
>
> -static void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
> -{
> - struct ttm_device *bdev = bo->bdev;
> -
> - list_del_init(&bo->lru);
> -
> - if (bdev->funcs->del_from_lru_notify)
> - bdev->funcs->del_from_lru_notify(bo);
> -}
> -
> -static void ttm_bo_bulk_move_set_pos(struct ttm_lru_bulk_move_pos *pos,
> - struct ttm_buffer_object *bo)
> -{
> - if (!pos->first)
> - pos->first = bo;
> - pos->last = bo;
> -}
> -
> void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo,
> - struct ttm_resource *mem,
> struct ttm_lru_bulk_move *bulk)
> {
> - struct ttm_device *bdev = bo->bdev;
> - struct ttm_resource_manager *man;
> -
> - if (!bo->deleted)
> - dma_resv_assert_held(bo->base.resv);
> -
> - if (bo->pin_count) {
> - ttm_bo_del_from_lru(bo);
> - return;
> - }
> -
> - man = ttm_manager_type(bdev, mem->mem_type);
> - list_move_tail(&bo->lru, &man->lru[bo->priority]);
> -
> - if (bdev->funcs->del_from_lru_notify)
> - bdev->funcs->del_from_lru_notify(bo);
> -
> - if (bulk && !bo->pin_count) {
> - switch (bo->resource->mem_type) {
> - case TTM_PL_TT:
> - ttm_bo_bulk_move_set_pos(&bulk->tt[bo->priority], bo);
> - break;
> + dma_resv_assert_held(bo->base.resv);
>
> - case TTM_PL_VRAM:
> - ttm_bo_bulk_move_set_pos(&bulk->vram[bo->priority], bo);
> - break;
> - }
> - }
> + ttm_resource_move_to_lru_tail(bo->resource, bulk);
> }
> EXPORT_SYMBOL(ttm_bo_move_to_lru_tail);
>
> -void ttm_bo_bulk_move_lru_tail(struct ttm_lru_bulk_move *bulk)
> -{
> - unsigned i;
> -
> - for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> - struct ttm_lru_bulk_move_pos *pos = &bulk->tt[i];
> - struct ttm_resource_manager *man;
> -
> - if (!pos->first)
> - continue;
> -
> - dma_resv_assert_held(pos->first->base.resv);
> - dma_resv_assert_held(pos->last->base.resv);
> -
> - man = ttm_manager_type(pos->first->bdev, TTM_PL_TT);
> - list_bulk_move_tail(&man->lru[i], &pos->first->lru,
> - &pos->last->lru);
> - }
> -
> - for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> - struct ttm_lru_bulk_move_pos *pos = &bulk->vram[i];
> - struct ttm_resource_manager *man;
> -
> - if (!pos->first)
> - continue;
> -
> - dma_resv_assert_held(pos->first->base.resv);
> - dma_resv_assert_held(pos->last->base.resv);
> -
> - man = ttm_manager_type(pos->first->bdev, TTM_PL_VRAM);
> - list_bulk_move_tail(&man->lru[i], &pos->first->lru,
> - &pos->last->lru);
> - }
> -}
> -EXPORT_SYMBOL(ttm_bo_bulk_move_lru_tail);
> -
> static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
> struct ttm_resource *mem, bool evict,
> struct ttm_operation_ctx *ctx,
> @@ -339,7 +259,6 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
> return ret;
> }
>
> - ttm_bo_del_from_lru(bo);
> list_del_init(&bo->ddestroy);
> spin_unlock(&bo->bdev->lru_lock);
> ttm_bo_cleanup_memtype_use(bo);
> @@ -440,7 +359,7 @@ static void ttm_bo_release(struct kref *kref)
> */
> if (bo->pin_count) {
> bo->pin_count = 0;
> - ttm_bo_move_to_lru_tail(bo, bo->resource, NULL);
> + ttm_resource_move_to_lru_tail(bo->resource, NULL);
> }
>
> kref_init(&bo->kref);
> @@ -453,7 +372,6 @@ static void ttm_bo_release(struct kref *kref)
> }
>
> spin_lock(&bo->bdev->lru_lock);
> - ttm_bo_del_from_lru(bo);
> list_del(&bo->ddestroy);
> spin_unlock(&bo->bdev->lru_lock);
>
> @@ -667,15 +585,17 @@ int ttm_mem_evict_first(struct ttm_device *bdev,
> struct ww_acquire_ctx *ticket)
> {
> struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
> + struct ttm_resource *res;
> bool locked = false;
> unsigned i;
> int ret;
>
> spin_lock(&bdev->lru_lock);
> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> - list_for_each_entry(bo, &man->lru[i], lru) {
> + list_for_each_entry(res, &man->lru[i], lru) {
> bool busy;
>
> + bo = res->bo;
Follow-up to the previous review: what happens here if someone now
reassigns @res->bo and then kills @bo? At least it's not immediately
clear what's protecting against that. Isn't a kref_get_unless_zero() on
the bo needed here, with res->bo being assigned (and properly cleared on
bo destruction) under the lru_lock where needed? See the rough sketch
below.
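
To make that concrete, here's a rough, untested sketch of what I have in
mind for the eviction loop in the hunk above (assuming res->bo really is
assigned and cleared under the lru_lock; the trylock/eviction path is
left out):

	spin_lock(&bdev->lru_lock);
	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
		list_for_each_entry(res, &man->lru[i], lru) {
			bo = res->bo;

			/* res->bo may be cleared or reassigned at any
			 * time; only look further if we can still grab
			 * a reference. */
			if (!bo || !kref_get_unless_zero(&bo->kref))
				continue;

			/* ... trylock / evict as before, dropping the
			 * temporary reference only after the lru_lock
			 * has been released ... */
		}
	}
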
Admittedly, as you pointed out earlier, we can't kref_put() the bo under
the lru_lock, but (if all else fails) one could perhaps defer the put to
a worker, or move the bo to the LRU tail and drop the lru_lock iff the
kref_put() may hit a zero refcount.
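
Or, simpler than checking whether the put may hit zero: always drop the
lru_lock around the put of the temporary reference. Also untested, and
it of course means the list walk has to be restarted afterwards:

	/* Never drop what may be the last reference while holding
	 * the lru_lock. */
	spin_unlock(&bdev->lru_lock);
	ttm_bo_put(bo);
	spin_lock(&bdev->lru_lock);
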
/Thomas