[PATCH] drm/ttm: fix busy memory to fail other user v4
zhoucm1
zhoucm1 at amd.com
Fri Apr 26 10:19:10 UTC 2019
please ignore v4. Will send v5 instead.
On 2019年04月26日 17:53, Chunming Zhou wrote:
> heavy gpu job could occupy memory long time, which lead other user fail to get memory.
>
> basically pick up Christian idea:
>
> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
> 2. If we then run into this EBUSY condition in TTM check if the BO we need memory for (or rather the ww_mutex of its reservation object) has a ticket assigned.
> 3. If we have a ticket we grab a reference to the first BO on the LRU, drop the LRU lock and try to grab the reservation lock with the ticket.
> 4. If getting the reservation lock with the ticket succeeded we check if the BO is still the first one on the LRU in question (the BO could have moved).
> 5. If the BO is still the first one on the LRU in question we try to evict it as we would evict any other BO.
> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>
> v2: fix some minor check
> v3: address Christian v2 comments.
> v4: fix some missing
>
> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
> Signed-off-by: Chunming Zhou <david1.zhou at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 6 +-
> .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 21 +++++-
> drivers/gpu/drm/ttm/ttm_bo.c | 74 +++++++++++++++++--
> 3 files changed, 92 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index affde72b44db..032edf477827 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -811,7 +811,11 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
> u64 min_offset, u64 max_offset)
> {
> struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> - struct ttm_operation_ctx ctx = { false, false };
> + struct ttm_operation_ctx ctx = {
> + .interruptible = false,
> + .no_wait_gpu = false,
> + .resv = bo->tbo.resv,
> + };
> int r, i;
>
> if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index a5cacf846e1b..2957ac38dcb0 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -4101,6 +4101,9 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
> struct amdgpu_device *adev;
> struct amdgpu_bo *rbo;
> struct dm_plane_state *dm_plane_state_new, *dm_plane_state_old;
> + struct list_head list, duplicates;
> + struct ttm_validate_buffer tv;
> + struct ww_acquire_ctx ticket;
> uint64_t tiling_flags;
> uint32_t domain;
> int r;
> @@ -4120,6 +4123,18 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
> r = amdgpu_bo_reserve(rbo, false);
> if (unlikely(r != 0))
> return r;
> + INIT_LIST_HEAD(&list);
> + INIT_LIST_HEAD(&duplicates);
> +
> + tv.bo = &rbo->tbo;
> + tv.num_shared = 1;
> + list_add(&tv.head, &list);
> +
> + r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates);
> + if (r) {
> + dev_err(adev->dev, "fail to reserve bo (%d)\n", r);
> + return r;
> + }
>
> if (plane->type != DRM_PLANE_TYPE_CURSOR)
> domain = amdgpu_display_supported_domains(adev);
> @@ -4130,21 +4145,21 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
> if (unlikely(r != 0)) {
> if (r != -ERESTARTSYS)
> DRM_ERROR("Failed to pin framebuffer with error %d\n", r);
> - amdgpu_bo_unreserve(rbo);
> + ttm_eu_backoff_reservation(&ticket, &list);
> return r;
> }
>
> r = amdgpu_ttm_alloc_gart(&rbo->tbo);
> if (unlikely(r != 0)) {
> amdgpu_bo_unpin(rbo);
> - amdgpu_bo_unreserve(rbo);
> + ttm_eu_backoff_reservation(&ticket, &list);
> DRM_ERROR("%p bind failed\n", rbo);
> return r;
> }
>
> amdgpu_bo_get_tiling_flags(rbo, &tiling_flags);
>
> - amdgpu_bo_unreserve(rbo);
> + ttm_eu_backoff_reservation(&ticket, &list);
>
> afb->address = amdgpu_bo_gpu_offset(rbo);
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 8502b3ed2d88..f4f506dad2b0 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -766,11 +766,12 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
> * b. Otherwise, trylock it.
> */
> static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> - struct ttm_operation_ctx *ctx, bool *locked)
> + struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
> {
> bool ret = false;
>
> *locked = false;
> + *busy = false;
> if (bo->resv == ctx->resv) {
> reservation_object_assert_held(bo->resv);
> if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
> @@ -779,6 +780,8 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> } else {
> *locked = reservation_object_trylock(bo->resv);
> ret = *locked;
> + if (!ret && busy)
> + *busy = true;
> }
>
> return ret;
> @@ -791,7 +794,7 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> {
> struct ttm_bo_global *glob = bdev->glob;
> struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> - struct ttm_buffer_object *bo = NULL;
> + struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
> bool locked = false;
> unsigned i;
> int ret;
> @@ -799,8 +802,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> spin_lock(&glob->lru_lock);
> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> list_for_each_entry(bo, &man->lru[i], lru) {
> - if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
> + bool busy = false;
> + if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> + &busy)) {
> + if (!first_bo && busy) {
> + ttm_bo_get(bo);
> + first_bo = bo;
> + }
> continue;
> + }
>
> if (place && !bdev->driver->eviction_valuable(bo,
> place)) {
> @@ -808,6 +818,7 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> reservation_object_unlock(bo->resv);
> continue;
> }
> +
> break;
> }
>
> @@ -820,7 +831,59 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>
> if (!bo) {
> spin_unlock(&glob->lru_lock);
> - return -EBUSY;
> + /* check if other user occupy memory too long time */
> + if (!first_bo || !ctx || !ctx->resv || !ctx->resv->lock.ctx) {
> + if (first_bo)
> + ttm_bo_put(first_bo);
> + return -EBUSY;
> + }
> + if (ctx->interruptible)
> + ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
> + ctx->resv->lock.ctx);
> + else
> + ret = ww_mutex_lock(&first_bo->resv->lock, ctx->resv->lock.ctx);
> + if (ret)
> + return ret;
> + spin_lock(&glob->lru_lock);
> + for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> + /* previous busy resv lock is held by above, idle now,
> + * so let them evictable.
> + */
> + struct ttm_operation_ctx busy_ctx = {
> + .interruptible = ctx->interruptible,
> + .no_wait_gpu = ctx->no_wait_gpu,
> + .resv = first_bo->resv,
> + .flags = TTM_OPT_FLAG_ALLOW_RES_EVICT
> + };
> + list_for_each_entry(bo, &man->lru[i], lru) {
> + if (!ttm_bo_evict_swapout_allowable(bo,
> + &busy_ctx,
> + &locked,
> + NULL))
> + continue;
> +
> + if (place && !bdev->driver->eviction_valuable(bo,
> + place)) {
> + if (locked)
> + reservation_object_unlock(bo->resv);
> + continue;
> + }
> + break;
> + }
> + /* If the inner loop terminated early, we have our candidate */
> + if (&bo->lru != &man->lru[i])
> + break;
> + bo = NULL;
> + }
> + if (bo == first_bo)
> + locked = true;
> + else if (bo)
> + ttm_bo_unreserve(first_bo);
> + ttm_bo_put(first_bo);
> + if (!bo) {
> + spin_unlock(&glob->lru_lock);
> + return -EBUSY;
> + }
> }
>
> kref_get(&bo->list_kref);
> @@ -1784,7 +1847,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
> spin_lock(&glob->lru_lock);
> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> - if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
> + if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> + NULL)) {
> ret = 0;
> break;
> }
More information about the dri-devel
mailing list