[PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
Christian König
ckoenig.leichtzumerken at gmail.com
Fri May 24 08:49:23 UTC 2019
Yeah, that shouldn't be a problem. We just need to make sure we don't
busy wait for the BOs to become available.
Christian.
Am 24.05.19 um 07:35 schrieb Liang, Prike:
> Using Abaqus to stress the amdgpu driver repeatedly runs into a deadlock when locking the first busy BO. The caller then
> returns EAGAIN, and eventually dm_plane_helper_prepare_fb pops up a "pin failed" message. For this case, in patch #7
> could we treat EAGAIN like ERESTARTSYS, which would filter out the annoying error message?
>
> Thanks,
> Prike
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken at gmail.com>
> Sent: Thursday, May 23, 2019 7:04 PM
> To: Zhou, David(ChunMing) <David1.Zhou at amd.com>; Olsak, Marek <Marek.Olsak at amd.com>; Zhou, David(ChunMing) <David1.Zhou at amd.com>; Liang, Prike <Prike.Liang at amd.com>; dri-devel at lists.freedesktop.org; amd-gfx at lists.freedesktop.org
> Subject: Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
>
> [CAUTION: External Email]
>
> Am 23.05.19 um 12:24 schrieb zhoucm1:
>>
>> On 2019年05月22日 20:59, Christian König wrote:
>>> [CAUTION: External Email]
>>>
>>> BOs on the LRU might be blocked during command submission and cause
>>> OOM situations.
>>>
>>> Avoid this by blocking for the first busy BO not locked by the same
>>> ticket as the BO we are searching space for.
>>>
>>> v10: completely start over with the patch since we didn't
>>> handle a whole bunch of corner cases.
>>>
>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>> ---
>>> drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
>>> 1 file changed, 66 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>>> b/drivers/gpu/drm/ttm/ttm_bo.c index 4c6389d849ed..861facac33d4
>>> 100644
>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>> * b. Otherwise, trylock it.
>>> */
>>> static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object
>>> *bo,
>>> - struct ttm_operation_ctx *ctx, bool *locked)
>>> + struct ttm_operation_ctx *ctx, bool *locked,
>>> bool *busy)
>>> {
>>> bool ret = false;
>>>
>>> - *locked = false;
>>> if (bo->resv == ctx->resv) {
>>> reservation_object_assert_held(bo->resv);
>>> if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>> || !list_empty(&bo->ddestroy))
>>> ret = true;
>>> + *locked = false;
>>> + if (busy)
>>> + *busy = false;
>>> } else {
>>> - *locked = reservation_object_trylock(bo->resv);
>>> - ret = *locked;
>>> + ret = reservation_object_trylock(bo->resv);
>>> + *locked = ret;
>>> + if (busy)
>>> + *busy = !ret;
>>> }
>>>
>>> return ret;
>>> }
>>>
>>> +/**
>>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
>>> + *
>>> + * @busy_bo: BO which couldn't be locked with trylock
>>> + * @ctx: operation context
>>> + * @ticket: acquire ticket
>>> + *
>>> + * Try to lock a busy buffer object to avoid failing eviction.
>>> + */
>>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
>>> + struct ttm_operation_ctx *ctx,
>>> + struct ww_acquire_ctx *ticket) {
>>> + int r;
>>> +
>>> + if (!busy_bo || !ticket)
>>> + return -EBUSY;
>>> +
>>> + if (ctx->interruptible)
>>> + r =
>>> + reservation_object_lock_interruptible(busy_bo->resv,
>>> + ticket);
>>> + else
>>> + r = reservation_object_lock(busy_bo->resv, ticket);
>>> +
>>> + /*
>>> + * TODO: It would be better to keep the BO locked until
>>> allocation is at
>>> + * least tried one more time, but that would mean a much
>>> larger rework
>>> + * of TTM.
>>> + */
>>> + if (!r)
>>> + reservation_object_unlock(busy_bo->resv);
>>> +
>>> + return r == -EDEADLK ? -EAGAIN : r; }
>>> +
>>> static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>> uint32_t mem_type,
>>> const struct ttm_place *place,
>>> - struct ttm_operation_ctx *ctx)
>>> + struct ttm_operation_ctx *ctx,
>>> + struct ww_acquire_ctx *ticket)
>>> {
>>> + struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>>> struct ttm_bo_global *glob = bdev->glob;
>>> struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>> - struct ttm_buffer_object *bo = NULL;
>>> bool locked = false;
>>> unsigned i;
>>> int ret;
>>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct
>>> ttm_bo_device *bdev,
>>> spin_lock(&glob->lru_lock);
>>> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>> list_for_each_entry(bo, &man->lru[i], lru) {
>>> - if (!ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked))
>>> + bool busy;
>>> +
>>> + if (!ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked,
>>> + &busy)) {
>>> + if (busy && !busy_bo &&
>>> + bo->resv->lock.ctx != ticket)
>>> + busy_bo = bo;
>>> continue;
>>> + }
>>>
>>> if (place &&
>>> !bdev->driver->eviction_valuable(bo,
>>> place)) {
>>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct
>>> ttm_bo_device *bdev,
>>> }
>>>
>>> if (!bo) {
>>> + if (busy_bo)
>>> + ttm_bo_get(busy_bo);
>>> spin_unlock(&glob->lru_lock);
>>> - return -EBUSY;
>>> + ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
>> If you rely on EAGAIN, why do you still try to lock busy_bo? Is
>> there any negative effect if we directly return EAGAIN without trying the lock?
> Yeah, that would burn a lot of CPU cycles because we would essentially busy wait for the BO to become unlocked.
>
> When we only return in case of a deadlock, the other thread can continue with its eviction while we reacquire all locks during EAGAIN handling.
>
> Even directly unlocking the BO as I do here is a bit questionable. But I couldn't get the original logic with finding a new BO to evict to work correctly, that's why I have the TODO comment in the function itself as well.
>
> Christian.
>
>> -David
>>> + if (busy_bo)
>>> + ttm_bo_put(busy_bo);
>>> + return ret;
>>> }
>>>
>>> kref_get(&bo->list_kref);
>>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct
>>> ttm_buffer_object *bo,
>>> return ret;
>>> if (mem->mm_node)
>>> break;
>>> - ret = ttm_mem_evict_first(bdev, mem->mem_type, place,
>>> ctx);
>>> + ret = ttm_mem_evict_first(bdev, mem->mem_type, place,
>>> ctx,
>>> + bo->resv->lock.ctx);
>>> if (unlikely(ret != 0))
>>> return ret;
>>> } while (1);
>>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct
>>> ttm_bo_device *bdev,
>>> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>> while (!list_empty(&man->lru[i])) {
>>> spin_unlock(&glob->lru_lock);
>>> - ret = ttm_mem_evict_first(bdev, mem_type,
>>> NULL, &ctx);
>>> + ret = ttm_mem_evict_first(bdev, mem_type,
>>> NULL, &ctx,
>>> + NULL);
>>> if (ret)
>>> return ret;
>>> spin_lock(&glob->lru_lock); @@ -1797,7
>>> +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct
>>> ttm_operation_ctx *ctx)
>>> spin_lock(&glob->lru_lock);
>>> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>> list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>> - if (ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked)) {
>>> + if (ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked,
>>> + NULL)) {
>>> ret = 0;
>>> break;
>>> }
>>> --
>>> 2.17.1
>>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list