[PATCH 1/2] drm/ttm: fix busy memory to fail other user v6

Koenig, Christian Christian.Koenig at amd.com
Tue May 7 11:13:48 UTC 2019


Am 07.05.19 um 13:08 schrieb zhoucm1:
>
>
> On 2019年05月07日 18:53, Koenig, Christian wrote:
>> Am 07.05.19 um 11:36 schrieb Chunming Zhou:
>>> heavy gpu job could occupy memory long time, which lead other user 
>>> fail to get memory.
>>>
>>> basically pick up Christian idea:
>>>
>>> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
>>> 2. If we then run into this EBUSY condition in TTM check if the BO 
>>> we need memory for (or rather the ww_mutex of its reservation 
>>> object) has a ticket assigned.
>>> 3. If we have a ticket we grab a reference to the first BO on the 
>>> LRU, drop the LRU lock and try to grab the reservation lock with the 
>>> ticket.
>>> 4. If getting the reservation lock with the ticket succeeded we 
>>> check if the BO is still the first one on the LRU in question (the 
>>> BO could have moved).
>>> 5. If the BO is still the first one on the LRU in question we try to 
>>> evict it as we would evict any other BO.
>>> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>>>
>>> v2: fix some minor check
>>> v3: address Christian v2 comments.
>>> v4: fix some missing
>>> v5: handle first_bo unlock and bo_get/put
>>> v6: abstract unified iterate function, and handle all possible 
>>> usecase not only pinned bo.
>>>
>>> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
>>> Signed-off-by: Chunming Zhou <david1.zhou at amd.com>
>>> ---
>>>    drivers/gpu/drm/ttm/ttm_bo.c | 113 
>>> ++++++++++++++++++++++++++++++-----
>>>    1 file changed, 97 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 
>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>> index 8502b3ed2d88..bbf1d14d00a7 100644
>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>     * b. Otherwise, trylock it.
>>>     */
>>>    static bool ttm_bo_evict_swapout_allowable(struct 
>>> ttm_buffer_object *bo,
>>> -            struct ttm_operation_ctx *ctx, bool *locked)
>>> +            struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>>>    {
>>>        bool ret = false;
>>>           *locked = false;
>>> +    if (busy)
>>> +        *busy = false;
>>>        if (bo->resv == ctx->resv) {
>>>            reservation_object_assert_held(bo->resv);
>>>            if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>> @@ -779,35 +781,45 @@ static bool 
>>> ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>>>        } else {
>>>            *locked = reservation_object_trylock(bo->resv);
>>>            ret = *locked;
>>> +        if (!ret && busy)
>>> +            *busy = true;
>>>        }
>>>           return ret;
>>>    }
>>>    -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>> -                   uint32_t mem_type,
>>> -                   const struct ttm_place *place,
>>> -                   struct ttm_operation_ctx *ctx)
>>> +static struct ttm_buffer_object*
>>> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
>>> +             struct ttm_mem_type_manager *man,
>>> +             const struct ttm_place *place,
>>> +             struct ttm_operation_ctx *ctx,
>>> +             struct ttm_buffer_object **first_bo,
>>> +             bool *locked)
>>>    {
>>> -    struct ttm_bo_global *glob = bdev->glob;
>>> -    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>        struct ttm_buffer_object *bo = NULL;
>>> -    bool locked = false;
>>> -    unsigned i;
>>> -    int ret;
>>> +    int i;
>>>    -    spin_lock(&glob->lru_lock);
>>> +    if (first_bo)
>>> +        *first_bo = NULL;
>>>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>            list_for_each_entry(bo, &man->lru[i], lru) {
>>> -            if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>>> +            bool busy = false;
>>> +            if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
>>> +                                &busy)) {
>> A newline between declaration and code please.
>>
>>> +                if (first_bo && !(*first_bo) && busy) {
>>> +                    ttm_bo_get(bo);
>>> +                    *first_bo = bo;
>>> +                }
>>>                    continue;
>>> +            }
>>>                   if (place && !bdev->driver->eviction_valuable(bo,
>>>                                          place)) {
>>> -                if (locked)
>>> +                if (*locked)
>>>                        reservation_object_unlock(bo->resv);
>>>                    continue;
>>>                }
>>> +
>>>                break;
>>>            }
>>>    @@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct 
>>> ttm_bo_device *bdev,
>>>            bo = NULL;
>>>        }
>>>    +    return bo;
>>> +}
>>> +
>>> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>> +                   uint32_t mem_type,
>>> +                   const struct ttm_place *place,
>>> +                   struct ttm_operation_ctx *ctx)
>>> +{
>>> +    struct ttm_bo_global *glob = bdev->glob;
>>> +    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>> +    struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
>>> +    bool locked = false;
>>> +    int ret;
>>> +
>>> +    spin_lock(&glob->lru_lock);
>>> +    bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
>>> +                      &locked);
>>>        if (!bo) {
>>> +        struct ttm_operation_ctx busy_ctx;
>>> +
>>>            spin_unlock(&glob->lru_lock);
>>> -        return -EBUSY;
>>> +        /* check if other user occupy memory too long time */
>>> +        if (!first_bo || !ctx || !ctx->resv || !ctx->resv->lock.ctx) {
>>> +            if (first_bo)
>>> +                ttm_bo_put(first_bo);
>>> +            return -EBUSY;
>>> +        }
>>> +        if (first_bo->resv == ctx->resv) {
>>> +            ttm_bo_put(first_bo);
>>> +            return -EBUSY;
>>> +        }
>>> +        if (ctx->interruptible)
>>> +            ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
>>> +                              ctx->resv->lock.ctx);
>>> +        else
>>> +            ret = ww_mutex_lock(&first_bo->resv->lock, 
>>> ctx->resv->lock.ctx);
>>> +        if (ret) {
>>> +            ttm_bo_put(first_bo);
>>> +            return ret;
>>> +        }
>>> +        spin_lock(&glob->lru_lock);
>>> +        /* previous busy resv lock is held by above, idle now,
>>> +         * so let them evictable.
>>> +         */
>>> +        busy_ctx.interruptible = ctx->interruptible;
>>> +        busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
>>> +        busy_ctx.resv           = first_bo->resv;
>>> +        busy_ctx.flags           = TTM_OPT_FLAG_ALLOW_RES_EVICT;
>>> +
>>> +        bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, 
>>> NULL,
>>> +                          &locked);
>>> +        if (bo && (bo->resv == first_bo->resv))
>>> +            locked = true;
>>> +        else if (bo)
>>> +            ww_mutex_unlock(&first_bo->resv->lock);
>>> +        if (!bo) {
>>> +            spin_unlock(&glob->lru_lock);
>>> +            ttm_bo_put(first_bo);
>>> +            return -EBUSY;
>>> +        }
>>>        }
>>>           kref_get(&bo->list_kref);
>>> @@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct 
>>> ttm_bo_device *bdev,
>>>            ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>>>                          ctx->no_wait_gpu, locked);
>>>            kref_put(&bo->list_kref, ttm_bo_release_list);
>>> +        if (first_bo)
>>> +            ttm_bo_put(first_bo);
>>>            return ret;
>>>        }
>>>           ttm_bo_del_from_lru(bo);
>>>        spin_unlock(&glob->lru_lock);
>>> +    if (first_bo)
>>> +        ttm_bo_put(first_bo);
>>>           ret = ttm_bo_evict(bo, ctx);
>>>        if (locked) {
>>> @@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct 
>>> ttm_buffer_object *bo,
>>>    {
>>>        struct ttm_bo_device *bdev = bo->bdev;
>>>        struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>> +    struct ttm_operation_ctx native_ctx = {
>>> +        .interruptible = false,
>>> +        .no_wait_gpu = false,
>>> +        .resv = bo->resv,
>>> +        .flags = 0
>>> +    };
>>> +    struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;
>> I thought we made the ctx parameter mandatory, didn't we? Could be that
>> I remember that incorrectly.
> Prike said he see ctx->resv is null, in that case, code doesn't run 
> into busy path.
> Oh, as you mentioned here, we need add .resv=bo->resv for every 
> ttm_operation_ctx. That's a huge change which will cross all vendor 
> drivers.
>
> Can we just force to evaluate evict_ctx->resv = bo->resv? That means 
> we just add one extra line: evict_ctx->resv = bo->resv. How about that?

Well only if ctx->resv is NULL, otherwise we would overwrite some 
reservation context given by the driver.

Probably better to give the acquir_ctx as separate parameter to 
ttm_mem_evict_first().

Christian.

>
> -David
>>
>> Christian.
>>
>>>        int ret;
>>>           do {
>>> @@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct 
>>> ttm_buffer_object *bo,
>>>                return ret;
>>>            if (mem->mm_node)
>>>                break;
>>> -        ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
>>> +        ret = ttm_mem_evict_first(bdev, mem_type, place, evict_ctx);
>>>            if (unlikely(ret != 0))
>>>                return ret;
>>>        } while (1);
>>> @@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, 
>>> struct ttm_operation_ctx *ctx)
>>>        spin_lock(&glob->lru_lock);
>>>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>            list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>> -            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
>>> +            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>>> +                               NULL)) {
>>>                    ret = 0;
>>>                    break;
>>>                }
>



More information about the amd-gfx mailing list