[PATCH 02/11] drm/ttm: fix busy memory to fail other user v8

Christian König ckoenig.leichtzumerken at gmail.com
Wed May 15 09:28:23 UTC 2019


Am 15.05.19 um 11:27 schrieb Christian König:
> Am 15.05.19 um 10:45 schrieb Daniel Vetter:
>> On Wed, May 15, 2019 at 10:38:28AM +0200, Daniel Vetter wrote:
>>> On Tue, May 14, 2019 at 02:31:18PM +0200, Christian König wrote:
>>>> From: Chunming Zhou <david1.zhou at amd.com>
>>>>
>>>> heavy gpu job could occupy memory long time, which lead other user 
>>>> fail to get memory.
>>>>
>>>> basically pick up Christian idea:
>>>>
>>>> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
>>>> 2. If we then run into this EBUSY condition in TTM check if the BO 
>>>> we need memory for (or rather the ww_mutex of its reservation 
>>>> object) has a ticket assigned.
>>>> 3. If we have a ticket we grab a reference to the first BO on the 
>>>> LRU, drop the LRU lock and try to grab the reservation lock with 
>>>> the ticket.
>>>> 4. If getting the reservation lock with the ticket succeeded we 
>>>> check if the BO is still the first one on the LRU in question (the 
>>>> BO could have moved).
>>>> 5. If the BO is still the first one on the LRU in question we try 
>>>> to evict it as we would evict any other BO.
>>>> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>>>>
>>>> v2: fix some minor check
>>>> v3: address Christian v2 comments.
>>>> v4: fix some missing
>>>> v5: handle first_bo unlock and bo_get/put
>>>> v6: abstract unified iterate function, and handle all possible 
>>>> usecase not only pinned bo.
>>>> v7: pass request bo->resv to ttm_bo_evict_first
>>>> v8 (chk): minimal coding style fix
>>>>
>>>> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
>>>> Signed-off-by: Chunming Zhou <david1.zhou at amd.com>
>>>> Reviewed-by: Christian König <christian.koenig at amd.com>
>>> I think this closes a big gap between ttm and the bkl/struct_mutex
>>> drivers - it's much easier to guarantee you can evict everything if
>>> there's only a single lock :-)
>>>
>>> Would be absolutely awesome if we could extract this as some kind of
>>> building block, like we've done with lots of other ttm concepts already
>>> (reservation_obj, fences, ...).
>>>
>>> Just an aside really.
>> Ofc this is meant as a comment on the entire patch series, without 
>> all the
>> other patches to make sure BO always stay on a relevant LRU there's 
>> still
>> gaps in the guaranteed forward progress eviction algorithm.
>
> Yeah, the problem surfaced because of patch #4. Previously TTM would 
> have just ignored all errors and continued to try different placements 
> and only return -ENOMEM when we ran out of a possible placements.
>
> I probably need to either fix patch #4 or reorder the patches.

Ups, please ignore. I accidentally replied to the wrong mail.

Christian.

>
> Thanks for the note,
> Christian.
>
>> -Daniel
>>
>>> -Daniel
>>>
>>>> ---
>>>>   drivers/gpu/drm/ttm/ttm_bo.c | 113 
>>>> +++++++++++++++++++++++++++++------
>>>>   1 file changed, 96 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 
>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>> index 2845fceb2fbd..e634d3a36923 100644
>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>>    * b. Otherwise, trylock it.
>>>>    */
>>>>   static bool ttm_bo_evict_swapout_allowable(struct 
>>>> ttm_buffer_object *bo,
>>>> -            struct ttm_operation_ctx *ctx, bool *locked)
>>>> +            struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>>>>   {
>>>>       bool ret = false;
>>>>         *locked = false;
>>>> +    if (busy)
>>>> +        *busy = false;
>>>>       if (bo->resv == ctx->resv) {
>>>>           reservation_object_assert_held(bo->resv);
>>>>           if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>>> @@ -779,35 +781,46 @@ static bool 
>>>> ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>>>>       } else {
>>>>           *locked = reservation_object_trylock(bo->resv);
>>>>           ret = *locked;
>>>> +        if (!ret && busy)
>>>> +            *busy = true;
>>>>       }
>>>>         return ret;
>>>>   }
>>>>   -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>> -                   uint32_t mem_type,
>>>> -                   const struct ttm_place *place,
>>>> -                   struct ttm_operation_ctx *ctx)
>>>> +static struct ttm_buffer_object*
>>>> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
>>>> +             struct ttm_mem_type_manager *man,
>>>> +             const struct ttm_place *place,
>>>> +             struct ttm_operation_ctx *ctx,
>>>> +             struct ttm_buffer_object **first_bo,
>>>> +             bool *locked)
>>>>   {
>>>> -    struct ttm_bo_global *glob = bdev->glob;
>>>> -    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>       struct ttm_buffer_object *bo = NULL;
>>>> -    bool locked = false;
>>>> -    unsigned i;
>>>> -    int ret;
>>>> +    int i;
>>>>   -    spin_lock(&glob->lru_lock);
>>>> +    if (first_bo)
>>>> +        *first_bo = NULL;
>>>>       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>           list_for_each_entry(bo, &man->lru[i], lru) {
>>>> -            if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>>>> +            bool busy = false;
>>>> +
>>>> +            if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
>>>> +                                &busy)) {
>>>> +                if (first_bo && !(*first_bo) && busy) {
>>>> +                    ttm_bo_get(bo);
>>>> +                    *first_bo = bo;
>>>> +                }
>>>>                   continue;
>>>> +            }
>>>>                 if (place && !bdev->driver->eviction_valuable(bo,
>>>>                                         place)) {
>>>> -                if (locked)
>>>> +                if (*locked)
>>>> reservation_object_unlock(bo->resv);
>>>>                   continue;
>>>>               }
>>>> +
>>>>               break;
>>>>           }
>>>>   @@ -818,9 +831,69 @@ static int ttm_mem_evict_first(struct 
>>>> ttm_bo_device *bdev,
>>>>           bo = NULL;
>>>>       }
>>>>   +    return bo;
>>>> +}
>>>> +
>>>> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>> +                   uint32_t mem_type,
>>>> +                   const struct ttm_place *place,
>>>> +                   struct ttm_operation_ctx *ctx,
>>>> +                   struct reservation_object *request_resv)
>>>> +{
>>>> +    struct ttm_bo_global *glob = bdev->glob;
>>>> +    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>> +    struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
>>>> +    bool locked = false;
>>>> +    int ret;
>>>> +
>>>> +    spin_lock(&glob->lru_lock);
>>>> +    bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
>>>> +                      &locked);
>>>>       if (!bo) {
>>>> +        struct ww_acquire_ctx *acquire_ctx = request_resv->lock.ctx;
>>>> +        struct ttm_operation_ctx busy_ctx;
>>>> +
>>>>           spin_unlock(&glob->lru_lock);
>>>> -        return -EBUSY;
>>>> +        /* check if other user occupy memory too long time */
>>>> +        if (!first_bo || !request_resv || !request_resv->lock.ctx) {
>>>> +            if (first_bo)
>>>> +                ttm_bo_put(first_bo);
>>>> +            return -EBUSY;
>>>> +        }
>>>> +        if (first_bo->resv == request_resv) {
>>>> +            ttm_bo_put(first_bo);
>>>> +            return -EBUSY;
>>>> +        }
>>>> +        if (ctx->interruptible)
>>>> +            ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
>>>> +                              acquire_ctx);
>>>> +        else
>>>> +            ret = ww_mutex_lock(&first_bo->resv->lock,
>>>> +                        acquire_ctx);
>>>> +        if (ret) {
>>>> +            ttm_bo_put(first_bo);
>>>> +            return ret;
>>>> +        }
>>>> +        spin_lock(&glob->lru_lock);
>>>> +        /* previous busy resv lock is held by above, idle now,
>>>> +         * so let them evictable.
>>>> +         */
>>>> +        busy_ctx.interruptible = ctx->interruptible;
>>>> +        busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
>>>> +        busy_ctx.resv           = first_bo->resv;
>>>> +        busy_ctx.flags           = TTM_OPT_FLAG_ALLOW_RES_EVICT;
>>>> +
>>>> +        bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, 
>>>> NULL,
>>>> +                          &locked);
>>>> +        if (bo && (bo->resv == first_bo->resv))
>>>> +            locked = true;
>>>> +        else if (bo)
>>>> + ww_mutex_unlock(&first_bo->resv->lock);
>>>> +        if (!bo) {
>>>> +            spin_unlock(&glob->lru_lock);
>>>> +            ttm_bo_put(first_bo);
>>>> +            return -EBUSY;
>>>> +        }
>>>>       }
>>>>         kref_get(&bo->list_kref);
>>>> @@ -829,11 +902,15 @@ static int ttm_mem_evict_first(struct 
>>>> ttm_bo_device *bdev,
>>>>           ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>>>>                         ctx->no_wait_gpu, locked);
>>>>           kref_put(&bo->list_kref, ttm_bo_release_list);
>>>> +        if (first_bo)
>>>> +            ttm_bo_put(first_bo);
>>>>           return ret;
>>>>       }
>>>>         ttm_bo_del_from_lru(bo);
>>>>       spin_unlock(&glob->lru_lock);
>>>> +    if (first_bo)
>>>> +        ttm_bo_put(first_bo);
>>>>         ret = ttm_bo_evict(bo, ctx);
>>>>       if (locked) {
>>>> @@ -907,7 +984,7 @@ static int ttm_bo_mem_force_space(struct 
>>>> ttm_buffer_object *bo,
>>>>               return ret;
>>>>           if (mem->mm_node)
>>>>               break;
>>>> -        ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
>>>> +        ret = ttm_mem_evict_first(bdev, mem_type, place, ctx, 
>>>> bo->resv);
>>>>           if (unlikely(ret != 0))
>>>>               return ret;
>>>>       } while (1);
>>>> @@ -1401,7 +1478,8 @@ static int ttm_bo_force_list_clean(struct 
>>>> ttm_bo_device *bdev,
>>>>       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>           while (!list_empty(&man->lru[i])) {
>>>>               spin_unlock(&glob->lru_lock);
>>>> -            ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
>>>> +            ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
>>>> +                          NULL);
>>>>               if (ret)
>>>>                   return ret;
>>>>               spin_lock(&glob->lru_lock);
>>>> @@ -1772,7 +1850,8 @@ int ttm_bo_swapout(struct ttm_bo_global 
>>>> *glob, struct ttm_operation_ctx *ctx)
>>>>       spin_lock(&glob->lru_lock);
>>>>       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>           list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>>> -            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
>>>> +            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>>>> +                               NULL)) {
>>>>                   ret = 0;
>>>>                   break;
>>>>               }
>>>> -- 
>>>> 2.17.1
>>>>
>>>> _______________________________________________
>>>> dri-devel mailing list
>>>> dri-devel at lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>>> -- 
>>> Daniel Vetter
>>> Software Engineer, Intel Corporation
>>> http://blog.ffwll.ch
>



More information about the amd-gfx mailing list