[PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
Liang, Prike
Prike.Liang at amd.com
Fri May 24 05:35:57 UTC 2019
Running Abaqus repeatedly to stress the amdgpu driver ends up deadlocking when locking the first busy BO. The caller then
returns -EAGAIN, and eventually dm_plane_helper_prepare_fb pops up a pin-failed message. For this case, could patch #7
treat -EAGAIN like -ERESTARTSYS, which would filter out the annoying error message?
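
Something like this on the DM side is what I have in mind (just a rough sketch from memory around the amdgpu_bo_pin() call in dm_plane_helper_prepare_fb; the exact error message and surrounding context here are illustrative, not a verbatim patch):

	r = amdgpu_bo_pin(rbo, domain);
	if (unlikely(r != 0)) {
		/* Treat -EAGAIN like -ERESTARTSYS: both only mean the
		 * locking dance must be restarted, so don't warn.
		 */
		if (r != -ERESTARTSYS && r != -EAGAIN)
			DRM_ERROR("Failed to pin framebuffer with error %d\n", r);
		return r;
	}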
Thanks,
Prike
-----Original Message-----
From: Christian König <ckoenig.leichtzumerken at gmail.com>
Sent: Thursday, May 23, 2019 7:04 PM
To: Zhou, David(ChunMing) <David1.Zhou at amd.com>; Olsak, Marek <Marek.Olsak at amd.com>; Liang, Prike <Prike.Liang at amd.com>; dri-devel at lists.freedesktop.org; amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
On 23.05.19 at 12:24, zhoucm1 wrote:
>
>
> On 2019-05-22 at 20:59, Christian König wrote:
>>
>> BOs on the LRU might be blocked during command submission and cause
>> OOM situations.
>>
>> Avoid this by blocking for the first busy BO not locked by the same
>> ticket as the BO we are searching space for.
>>
>> v10: completely start over with the patch since we didn't
>> handle a whole bunch of corner cases.
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> ---
>> drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
>> 1 file changed, 66 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
>> index 4c6389d849ed..861facac33d4 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>   * b. Otherwise, trylock it.
>>   */
>>  static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>> -			struct ttm_operation_ctx *ctx, bool *locked)
>> +			struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>>  {
>>  	bool ret = false;
>>
>> -	*locked = false;
>>  	if (bo->resv == ctx->resv) {
>>  		reservation_object_assert_held(bo->resv);
>>  		if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>  		    || !list_empty(&bo->ddestroy))
>>  			ret = true;
>> +		*locked = false;
>> +		if (busy)
>> +			*busy = false;
>>  	} else {
>> -		*locked = reservation_object_trylock(bo->resv);
>> -		ret = *locked;
>> +		ret = reservation_object_trylock(bo->resv);
>> +		*locked = ret;
>> +		if (busy)
>> +			*busy = !ret;
>>  	}
>>
>>  	return ret;
>>  }
>>
>> +/**
>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
>> + *
>> + * @busy_bo: BO which couldn't be locked with trylock
>> + * @ctx: operation context
>> + * @ticket: acquire ticket
>> + *
>> + * Try to lock a busy buffer object to avoid failing eviction.
>> + */
>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
>> +				   struct ttm_operation_ctx *ctx,
>> +				   struct ww_acquire_ctx *ticket)
>> +{
>> +	int r;
>> +
>> +	if (!busy_bo || !ticket)
>> +		return -EBUSY;
>> +
>> +	if (ctx->interruptible)
>> +		r = reservation_object_lock_interruptible(busy_bo->resv,
>> +							  ticket);
>> +	else
>> +		r = reservation_object_lock(busy_bo->resv, ticket);
>> +
>> +	/*
>> +	 * TODO: It would be better to keep the BO locked until allocation is at
>> +	 * least tried one more time, but that would mean a much larger rework
>> +	 * of TTM.
>> +	 */
>> +	if (!r)
>> +		reservation_object_unlock(busy_bo->resv);
>> +
>> +	return r == -EDEADLK ? -EAGAIN : r;
>> +}
>> +
>>  static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>  			       uint32_t mem_type,
>>  			       const struct ttm_place *place,
>> -			       struct ttm_operation_ctx *ctx)
>> +			       struct ttm_operation_ctx *ctx,
>> +			       struct ww_acquire_ctx *ticket)
>>  {
>> +	struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>>  	struct ttm_bo_global *glob = bdev->glob;
>>  	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>> -	struct ttm_buffer_object *bo = NULL;
>>  	bool locked = false;
>>  	unsigned i;
>>  	int ret;
>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>  	spin_lock(&glob->lru_lock);
>>  	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>  		list_for_each_entry(bo, &man->lru[i], lru) {
>> -			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>> +			bool busy;
>> +
>> +			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>> +							    &busy)) {
>> +				if (busy && !busy_bo &&
>> +				    bo->resv->lock.ctx != ticket)
>> +					busy_bo = bo;
>>  				continue;
>> +			}
>>
>>  			if (place && !bdev->driver->eviction_valuable(bo,
>>  								      place)) {
>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>  	}
>>
>>  	if (!bo) {
>> +		if (busy_bo)
>> +			ttm_bo_get(busy_bo);
>>  		spin_unlock(&glob->lru_lock);
>> -		return -EBUSY;
>> +		ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
> If you rely on EAGAIN, why do you still try to lock busy_bo? Is there
> any negative effect if we directly return EAGAIN without trying the lock?
Yeah, that would burn a lot of CPU cycles because we would essentially busy-wait for the BO to become unlocked.
When we only return in case of a deadlock, the other thread can continue with its eviction while we reacquire all locks during the EAGAIN handling.
Even directly unlocking the BO as I do here is a bit questionable, but I couldn't get the original logic of finding a new BO to evict to work correctly. That's why I have the TODO comment in the function itself as well.
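
Roughly this pattern on the caller's side is what I mean (a minimal sketch; reserve_all()/unreserve_all() are hypothetical stand-ins for the real CS reservation helpers, not actual amdgpu functions):

	/* Back off on -EAGAIN by dropping every reservation and retrying,
	 * instead of busy-waiting on the one contended BO.
	 */
	static int cs_reserve_with_backoff(struct ww_acquire_ctx *ticket)
	{
		int r;

		do {
			r = reserve_all(ticket);	/* may call ttm_mem_evict_first() */
			if (r == -EAGAIN)
				unreserve_all(ticket);	/* let the other thread finish its eviction */
		} while (r == -EAGAIN);

		return r;
	}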
Christian.
>
> -David
>> +		if (busy_bo)
>> +			ttm_bo_put(busy_bo);
>> +		return ret;
>>  	}
>>
>>  	kref_get(&bo->list_kref);
>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>>  			return ret;
>>  		if (mem->mm_node)
>>  			break;
>> -		ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx);
>> +		ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx,
>> +					  bo->resv->lock.ctx);
>>  		if (unlikely(ret != 0))
>>  			return ret;
>>  	} while (1);
>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
>>  	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>  		while (!list_empty(&man->lru[i])) {
>>  			spin_unlock(&glob->lru_lock);
>> -			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
>> +			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
>> +						  NULL);
>>  			if (ret)
>>  				return ret;
>>  			spin_lock(&glob->lru_lock);
>> @@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>>  	spin_lock(&glob->lru_lock);
>>  	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>  		list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> -			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
>> +			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>> +							   NULL)) {
>>  				ret = 0;
>>  				break;
>>  			}
>> --
>> 2.17.1
>>
>