[PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
Kuehling, Felix
Felix.Kuehling at amd.com
Wed Jun 26 06:36:27 UTC 2019
I believe I found a live-lock due to this patch when running our KFD
eviction test in a loop. I pretty reliably hangs on the second loop
iteration. If I revert this patch, the problem disappears.
With some added instrumentation, I see that amdgpu_cs_list_validate in
amdgpu_cs_parser_bos returns -EAGAIN repeatedly. User mode dutifully
retries the command submission, but it never succeeds.
I think the problem is the fact that ttm_mem_evict_wait_busy returns
-EAGAIN when it fails to lock the busy BO. That will cause
ttm_bo_mem_space to give up instead of trying an alternate placement. A
straight-forward fix is to modify ttm_mem_evict_wait_busy to return
-EBUSY, which will maintain the old behaviour of trying alternate
placements when eviction is not possible. I'll send out a patch for that.
See also inline.
On 2019-05-22 8:59 a.m., Christian König wrote:
> [CAUTION: External Email]
>
> BOs on the LRU might be blocked during command submission
> and cause OOM situations.
>
> Avoid this by blocking for the first busy BO not locked by
> the same ticket as the BO we are searching space for.
>
> v10: completely start over with the patch since we didn't
> handled a whole bunch of corner cases.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
> 1 file changed, 66 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 4c6389d849ed..861facac33d4 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
> * b. Otherwise, trylock it.
> */
> static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> - struct ttm_operation_ctx *ctx, bool *locked)
> + struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
> {
> bool ret = false;
>
> - *locked = false;
> if (bo->resv == ctx->resv) {
> reservation_object_assert_held(bo->resv);
> if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
> || !list_empty(&bo->ddestroy))
> ret = true;
> + *locked = false;
> + if (busy)
> + *busy = false;
> } else {
> - *locked = reservation_object_trylock(bo->resv);
> - ret = *locked;
> + ret = reservation_object_trylock(bo->resv);
> + *locked = ret;
> + if (busy)
> + *busy = !ret;
> }
>
> return ret;
> }
>
> +/**
> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
> + *
> + * @busy_bo: BO which couldn't be locked with trylock
> + * @ctx: operation context
> + * @ticket: acquire ticket
> + *
> + * Try to lock a busy buffer object to avoid failing eviction.
> + */
> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
> + struct ttm_operation_ctx *ctx,
> + struct ww_acquire_ctx *ticket)
> +{
> + int r;
> +
> + if (!busy_bo || !ticket)
> + return -EBUSY;
> +
> + if (ctx->interruptible)
> + r = reservation_object_lock_interruptible(busy_bo->resv,
> + ticket);
> + else
> + r = reservation_object_lock(busy_bo->resv, ticket);
> +
> + /*
> + * TODO: It would be better to keep the BO locked until allocation is at
> + * least tried one more time, but that would mean a much larger rework
> + * of TTM.
> + */
> + if (!r)
> + reservation_object_unlock(busy_bo->resv);
> +
> + return r == -EDEADLK ? -EAGAIN : r;
If locking fails, this returns -EAGAIN.
> +}
> +
> static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> uint32_t mem_type,
> const struct ttm_place *place,
> - struct ttm_operation_ctx *ctx)
> + struct ttm_operation_ctx *ctx,
> + struct ww_acquire_ctx *ticket)
> {
> + struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
> struct ttm_bo_global *glob = bdev->glob;
> struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> - struct ttm_buffer_object *bo = NULL;
> bool locked = false;
> unsigned i;
> int ret;
> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> spin_lock(&glob->lru_lock);
> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> list_for_each_entry(bo, &man->lru[i], lru) {
> - if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
> + bool busy;
> +
> + if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> + &busy)) {
> + if (busy && !busy_bo &&
> + bo->resv->lock.ctx != ticket)
> + busy_bo = bo;
> continue;
> + }
>
> if (place && !bdev->driver->eviction_valuable(bo,
> place)) {
> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> }
>
> if (!bo) {
> + if (busy_bo)
> + ttm_bo_get(busy_bo);
> spin_unlock(&glob->lru_lock);
> - return -EBUSY;
The old behaviour was to always return -EBUSY if eviction was not possible.
> + ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
> + if (busy_bo)
> + ttm_bo_put(busy_bo);
> + return ret;
Now this can return -EAGAIN. This prevents ttm_bo_mem_space from trying
alternate placements.
Regards,
Felix
> }
>
> kref_get(&bo->list_kref);
> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
> return ret;
> if (mem->mm_node)
> break;
> - ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx);
> + ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx,
> + bo->resv->lock.ctx);
> if (unlikely(ret != 0))
> return ret;
> } while (1);
> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> while (!list_empty(&man->lru[i])) {
> spin_unlock(&glob->lru_lock);
> - ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
> + ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
> + NULL);
> if (ret)
> return ret;
> spin_lock(&glob->lru_lock);
> @@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
> spin_lock(&glob->lru_lock);
> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> - if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
> + if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> + NULL)) {
> ret = 0;
> break;
> }
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the dri-devel
mailing list