[PATCH 1/2] drm/ttm: fix busy memory to fail other user v6

Koenig, Christian Christian.Koenig at amd.com
Tue May 7 11:42:38 UTC 2019


Am 07.05.19 um 13:37 schrieb Thomas Hellstrom:
> [CAUTION: External Email]
>
> On 5/7/19 1:24 PM, Christian König wrote:
>> Am 07.05.19 um 13:22 schrieb zhoucm1:
>>>
>>>
>>> On 2019年05月07日 19:13, Koenig, Christian wrote:
>>>> Am 07.05.19 um 13:08 schrieb zhoucm1:
>>>>>
>>>>> On 2019年05月07日 18:53, Koenig, Christian wrote:
>>>>>> Am 07.05.19 um 11:36 schrieb Chunming Zhou:
>>>>>>> heavy gpu job could occupy memory long time, which lead other user
>>>>>>> fail to get memory.
>>>>>>>
>>>>>>> basically pick up Christian idea:
>>>>>>>
>>>>>>> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
>>>>>>> 2. If we then run into this EBUSY condition in TTM check if the BO
>>>>>>> we need memory for (or rather the ww_mutex of its reservation
>>>>>>> object) has a ticket assigned.
>>>>>>> 3. If we have a ticket we grab a reference to the first BO on the
>>>>>>> LRU, drop the LRU lock and try to grab the reservation lock with 
>>>>>>> the
>>>>>>> ticket.
>>>>>>> 4. If getting the reservation lock with the ticket succeeded we
>>>>>>> check if the BO is still the first one on the LRU in question (the
>>>>>>> BO could have moved).
>>>>>>> 5. If the BO is still the first one on the LRU in question we 
>>>>>>> try to
>>>>>>> evict it as we would evict any other BO.
>>>>>>> 6. If any of the "If's" above fail we just back off and return
>>>>>>> -EBUSY.
>>>>>>>
>>>>>>> v2: fix some minor check
>>>>>>> v3: address Christian v2 comments.
>>>>>>> v4: fix some missing
>>>>>>> v5: handle first_bo unlock and bo_get/put
>>>>>>> v6: abstract unified iterate function, and handle all possible
>>>>>>> usecase not only pinned bo.
>>>>>>>
>>>>>>> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
>>>>>>> Signed-off-by: Chunming Zhou <david1.zhou at amd.com>
>>>>>>> ---
>>>>>>>     drivers/gpu/drm/ttm/ttm_bo.c | 113
>>>>>>> ++++++++++++++++++++++++++++++-----
>>>>>>>     1 file changed, 97 insertions(+), 16 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>> index 8502b3ed2d88..bbf1d14d00a7 100644
>>>>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>>>>>      * b. Otherwise, trylock it.
>>>>>>>      */
>>>>>>>     static bool ttm_bo_evict_swapout_allowable(struct
>>>>>>> ttm_buffer_object *bo,
>>>>>>> -            struct ttm_operation_ctx *ctx, bool *locked)
>>>>>>> +            struct ttm_operation_ctx *ctx, bool *locked, bool
>>>>>>> *busy)
>>>>>>>     {
>>>>>>>         bool ret = false;
>>>>>>>            *locked = false;
>>>>>>> +    if (busy)
>>>>>>> +        *busy = false;
>>>>>>>         if (bo->resv == ctx->resv) {
>>>>>>> reservation_object_assert_held(bo->resv);
>>>>>>>             if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>>>>>> @@ -779,35 +781,45 @@ static bool
>>>>>>> ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>>>>>>>         } else {
>>>>>>>             *locked = reservation_object_trylock(bo->resv);
>>>>>>>             ret = *locked;
>>>>>>> +        if (!ret && busy)
>>>>>>> +            *busy = true;
>>>>>>>         }
>>>>>>>            return ret;
>>>>>>>     }
>>>>>>>     -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>>>>> -                   uint32_t mem_type,
>>>>>>> -                   const struct ttm_place *place,
>>>>>>> -                   struct ttm_operation_ctx *ctx)
>>>>>>> +static struct ttm_buffer_object*
>>>>>>> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
>>>>>>> +             struct ttm_mem_type_manager *man,
>>>>>>> +             const struct ttm_place *place,
>>>>>>> +             struct ttm_operation_ctx *ctx,
>>>>>>> +             struct ttm_buffer_object **first_bo,
>>>>>>> +             bool *locked)
>>>>>>>     {
>>>>>>> -    struct ttm_bo_global *glob = bdev->glob;
>>>>>>> -    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>>>         struct ttm_buffer_object *bo = NULL;
>>>>>>> -    bool locked = false;
>>>>>>> -    unsigned i;
>>>>>>> -    int ret;
>>>>>>> +    int i;
>>>>>>>     -    spin_lock(&glob->lru_lock);
>>>>>>> +    if (first_bo)
>>>>>>> +        *first_bo = NULL;
>>>>>>>         for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>>>>             list_for_each_entry(bo, &man->lru[i], lru) {
>>>>>>> -            if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>>>>>>> +            bool busy = false;
>>>>>>> +            if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
>>>>>>> +                                &busy)) {
>>>>>> A newline between declaration and code please.
>>>>>>
>>>>>>> +                if (first_bo && !(*first_bo) && busy) {
>>>>>>> +                    ttm_bo_get(bo);
>>>>>>> +                    *first_bo = bo;
>>>>>>> +                }
>>>>>>>                     continue;
>>>>>>> +            }
>>>>>>>                    if (place && 
>>>>>>> !bdev->driver->eviction_valuable(bo,
>>>>>>>                                           place)) {
>>>>>>> -                if (locked)
>>>>>>> +                if (*locked)
>>>>>>> reservation_object_unlock(bo->resv);
>>>>>>>                     continue;
>>>>>>>                 }
>>>>>>> +
>>>>>>>                 break;
>>>>>>>             }
>>>>>>>     @@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct
>>>>>>> ttm_bo_device *bdev,
>>>>>>>             bo = NULL;
>>>>>>>         }
>>>>>>>     +    return bo;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>>>>> +                   uint32_t mem_type,
>>>>>>> +                   const struct ttm_place *place,
>>>>>>> +                   struct ttm_operation_ctx *ctx)
>>>>>>> +{
>>>>>>> +    struct ttm_bo_global *glob = bdev->glob;
>>>>>>> +    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>>> +    struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
>>>>>>> +    bool locked = false;
>>>>>>> +    int ret;
>>>>>>> +
>>>>>>> +    spin_lock(&glob->lru_lock);
>>>>>>> +    bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, 
>>>>>>> &first_bo,
>>>>>>> +                      &locked);
>>>>>>>         if (!bo) {
>>>>>>> +        struct ttm_operation_ctx busy_ctx;
>>>>>>> +
>>>>>>>             spin_unlock(&glob->lru_lock);
>>>>>>> -        return -EBUSY;
>>>>>>> +        /* check if other user occupy memory too long time */
>>>>>>> +        if (!first_bo || !ctx || !ctx->resv ||
>>>>>>> !ctx->resv->lock.ctx) {
>>>>>>> +            if (first_bo)
>>>>>>> +                ttm_bo_put(first_bo);
>>>>>>> +            return -EBUSY;
>>>>>>> +        }
>>>>>>> +        if (first_bo->resv == ctx->resv) {
>>>>>>> +            ttm_bo_put(first_bo);
>>>>>>> +            return -EBUSY;
>>>>>>> +        }
>>>>>>> +        if (ctx->interruptible)
>>>>>>> +            ret =
>>>>>>> ww_mutex_lock_interruptible(&first_bo->resv->lock,
>>>>>>> + ctx->resv->lock.ctx);
>>>>>>> +        else
>>>>>>> +            ret = ww_mutex_lock(&first_bo->resv->lock,
>>>>>>> ctx->resv->lock.ctx);
>>>>>>> +        if (ret) {
>>>>>>> +            ttm_bo_put(first_bo);
>>>>>>> +            return ret;
>>>>>>> +        }
>>>>>>> +        spin_lock(&glob->lru_lock);
>>>>>>> +        /* previous busy resv lock is held by above, idle now,
>>>>>>> +         * so let them evictable.
>>>>>>> +         */
>>>>>>> +        busy_ctx.interruptible = ctx->interruptible;
>>>>>>> +        busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
>>>>>>> +        busy_ctx.resv           = first_bo->resv;
>>>>>>> +        busy_ctx.flags           = TTM_OPT_FLAG_ALLOW_RES_EVICT;
>>>>>>> +
>>>>>>> +        bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx,
>>>>>>> NULL,
>>>>>>> +                          &locked);
>>>>>>> +        if (bo && (bo->resv == first_bo->resv))
>>>>>>> +            locked = true;
>>>>>>> +        else if (bo)
>>>>>>> + ww_mutex_unlock(&first_bo->resv->lock);
>>>>>>> +        if (!bo) {
>>>>>>> +            spin_unlock(&glob->lru_lock);
>>>>>>> +            ttm_bo_put(first_bo);
>>>>>>> +            return -EBUSY;
>>>>>>> +        }
>>>>>>>         }
>>>>>>>            kref_get(&bo->list_kref);
>>>>>>> @@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct
>>>>>>> ttm_bo_device *bdev,
>>>>>>>             ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>>>>>>>                           ctx->no_wait_gpu, locked);
>>>>>>>             kref_put(&bo->list_kref, ttm_bo_release_list);
>>>>>>> +        if (first_bo)
>>>>>>> +            ttm_bo_put(first_bo);
>>>>>>>             return ret;
>>>>>>>         }
>>>>>>>            ttm_bo_del_from_lru(bo);
>>>>>>>         spin_unlock(&glob->lru_lock);
>>>>>>> +    if (first_bo)
>>>>>>> +        ttm_bo_put(first_bo);
>>>>>>>            ret = ttm_bo_evict(bo, ctx);
>>>>>>>         if (locked) {
>>>>>>> @@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct
>>>>>>> ttm_buffer_object *bo,
>>>>>>>     {
>>>>>>>         struct ttm_bo_device *bdev = bo->bdev;
>>>>>>>         struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>>> +    struct ttm_operation_ctx native_ctx = {
>>>>>>> +        .interruptible = false,
>>>>>>> +        .no_wait_gpu = false,
>>>>>>> +        .resv = bo->resv,
>>>>>>> +        .flags = 0
>>>>>>> +    };
>>>>>>> +    struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;
>>>>>> I thought we made the ctx parameter mandatory, didn't we? Could be
>>>>>> that
>>>>>> I remember that incorrectly.
>>>>> Prike said he see ctx->resv is null, in that case, code doesn't run
>>>>> into busy path.
>>>>> Oh, as you mentioned here, we need add .resv=bo->resv for every
>>>>> ttm_operation_ctx. That's a huge change which will cross all vendor
>>>>> drivers.
>>>>>
>>>>> Can we just force to evaluate evict_ctx->resv = bo->resv? That means
>>>>> we just add one extra line: evict_ctx->resv = bo->resv. How about
>>>>> that?
>>>> Well only if ctx->resv is NULL, otherwise we would overwrite some
>>>> reservation context given by the driver.
>>>>
>>>> Probably better to give the acquir_ctx as separate parameter to
>>>> ttm_mem_evict_first().
>>> still put acquire_ctx into ttm_operation_ctx? Then that's same
>>> ctx->resv.
>>> Current problem is we don't pass resv anywhere except ALLOW_EVICT case.
>>> If you have concern for overwritten, we have to do ".resv = bo->resv"
>>> in every ttm_operation_ctx definitions.
>>
>> No, what I mean is to add the acquire_ctx as separate parameter to
>> ttm_mem_evict_first().
>>
>> E.g. we only need it in this function and it is actually not related
>> to the ttm operation context filled in by the driver.
>
>
> FWIW, I think it would be nice at some point to have a reservation
> context being part of the ttm operation context, so that validate and
> evict could do sleeping reservations, and have bos remain on the lru
> even when reserved...

Yeah, well that's exactly what the ctx->resv parameter is good for :)

And yes, we do keep the BOs on the LRU even when they are reserved.

Christian.

>
> /Thomas
>
>
>>
>> Christian.
>>
>>>
>>> -David
>>>>
>>>> Christian.
>>>>
>>>>> -David
>>>>>> Christian.
>>>>>>
>>>>>>>         int ret;
>>>>>>>            do {
>>>>>>> @@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct
>>>>>>> ttm_buffer_object *bo,
>>>>>>>                 return ret;
>>>>>>>             if (mem->mm_node)
>>>>>>>                 break;
>>>>>>> -        ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
>>>>>>> +        ret = ttm_mem_evict_first(bdev, mem_type, place,
>>>>>>> evict_ctx);
>>>>>>>             if (unlikely(ret != 0))
>>>>>>>                 return ret;
>>>>>>>         } while (1);
>>>>>>> @@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global 
>>>>>>> *glob,
>>>>>>> struct ttm_operation_ctx *ctx)
>>>>>>>         spin_lock(&glob->lru_lock);
>>>>>>>         for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>>>>             list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>>>>>> -            if (ttm_bo_evict_swapout_allowable(bo, ctx, 
>>>>>>> &locked)) {
>>>>>>> +            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>>>>>>> +                               NULL)) {
>>>>>>>                     ret = 0;
>>>>>>>                     break;
>>>>>>>                 }
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx at lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>
>



More information about the amd-gfx mailing list