[PATCH 1/2] drm/ttm: fix busy memory to fail other user v6
zhoucm1
zhoucm1 at amd.com
Tue May 7 11:08:12 UTC 2019
On 2019年05月07日 18:53, Koenig, Christian wrote:
> Am 07.05.19 um 11:36 schrieb Chunming Zhou:
>> heavy gpu job could occupy memory long time, which lead other user fail to get memory.
>>
>> basically pick up Christian idea:
>>
>> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
>> 2. If we then run into this EBUSY condition in TTM check if the BO we need memory for (or rather the ww_mutex of its reservation object) has a ticket assigned.
>> 3. If we have a ticket we grab a reference to the first BO on the LRU, drop the LRU lock and try to grab the reservation lock with the ticket.
>> 4. If getting the reservation lock with the ticket succeeded we check if the BO is still the first one on the LRU in question (the BO could have moved).
>> 5. If the BO is still the first one on the LRU in question we try to evict it as we would evict any other BO.
>> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>>
>> v2: fix some minor check
>> v3: address Christian v2 comments.
>> v4: fix some missing
>> v5: handle first_bo unlock and bo_get/put
>> v6: abstract unified iterate function, and handle all possible usecase not only pinned bo.
>>
>> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
>> Signed-off-by: Chunming Zhou <david1.zhou at amd.com>
>> ---
>> drivers/gpu/drm/ttm/ttm_bo.c | 113 ++++++++++++++++++++++++++++++-----
>> 1 file changed, 97 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
>> index 8502b3ed2d88..bbf1d14d00a7 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>> * b. Otherwise, trylock it.
>> */
>> static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>> - struct ttm_operation_ctx *ctx, bool *locked)
>> + struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>> {
>> bool ret = false;
>>
>> *locked = false;
>> + if (busy)
>> + *busy = false;
>> if (bo->resv == ctx->resv) {
>> reservation_object_assert_held(bo->resv);
>> if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>> @@ -779,35 +781,45 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>> } else {
>> *locked = reservation_object_trylock(bo->resv);
>> ret = *locked;
>> + if (!ret && busy)
>> + *busy = true;
>> }
>>
>> return ret;
>> }
>>
>> -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>> - uint32_t mem_type,
>> - const struct ttm_place *place,
>> - struct ttm_operation_ctx *ctx)
>> +static struct ttm_buffer_object*
>> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
>> + struct ttm_mem_type_manager *man,
>> + const struct ttm_place *place,
>> + struct ttm_operation_ctx *ctx,
>> + struct ttm_buffer_object **first_bo,
>> + bool *locked)
>> {
>> - struct ttm_bo_global *glob = bdev->glob;
>> - struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>> struct ttm_buffer_object *bo = NULL;
>> - bool locked = false;
>> - unsigned i;
>> - int ret;
>> + int i;
>>
>> - spin_lock(&glob->lru_lock);
>> + if (first_bo)
>> + *first_bo = NULL;
>> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> list_for_each_entry(bo, &man->lru[i], lru) {
>> - if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>> + bool busy = false;
>> + if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
>> + &busy)) {
> A newline between declaration and code please.
>
>> + if (first_bo && !(*first_bo) && busy) {
>> + ttm_bo_get(bo);
>> + *first_bo = bo;
>> + }
>> continue;
>> + }
>>
>> if (place && !bdev->driver->eviction_valuable(bo,
>> place)) {
>> - if (locked)
>> + if (*locked)
>> reservation_object_unlock(bo->resv);
>> continue;
>> }
>> +
>> break;
>> }
>>
>> @@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>> bo = NULL;
>> }
>>
>> + return bo;
>> +}
>> +
>> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>> + uint32_t mem_type,
>> + const struct ttm_place *place,
>> + struct ttm_operation_ctx *ctx)
>> +{
>> + struct ttm_bo_global *glob = bdev->glob;
>> + struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>> + struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
>> + bool locked = false;
>> + int ret;
>> +
>> + spin_lock(&glob->lru_lock);
>> + bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
>> + &locked);
>> if (!bo) {
>> + struct ttm_operation_ctx busy_ctx;
>> +
>> spin_unlock(&glob->lru_lock);
>> - return -EBUSY;
>> + /* check if other user occupy memory too long time */
>> + if (!first_bo || !ctx || !ctx->resv || !ctx->resv->lock.ctx) {
>> + if (first_bo)
>> + ttm_bo_put(first_bo);
>> + return -EBUSY;
>> + }
>> + if (first_bo->resv == ctx->resv) {
>> + ttm_bo_put(first_bo);
>> + return -EBUSY;
>> + }
>> + if (ctx->interruptible)
>> + ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
>> + ctx->resv->lock.ctx);
>> + else
>> + ret = ww_mutex_lock(&first_bo->resv->lock, ctx->resv->lock.ctx);
>> + if (ret) {
>> + ttm_bo_put(first_bo);
>> + return ret;
>> + }
>> + spin_lock(&glob->lru_lock);
>> + /* previous busy resv lock is held by above, idle now,
>> + * so let them evictable.
>> + */
>> + busy_ctx.interruptible = ctx->interruptible;
>> + busy_ctx.no_wait_gpu = ctx->no_wait_gpu;
>> + busy_ctx.resv = first_bo->resv;
>> + busy_ctx.flags = TTM_OPT_FLAG_ALLOW_RES_EVICT;
>> +
>> + bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, NULL,
>> + &locked);
>> + if (bo && (bo->resv == first_bo->resv))
>> + locked = true;
>> + else if (bo)
>> + ww_mutex_unlock(&first_bo->resv->lock);
>> + if (!bo) {
>> + spin_unlock(&glob->lru_lock);
>> + ttm_bo_put(first_bo);
>> + return -EBUSY;
>> + }
>> }
>>
>> kref_get(&bo->list_kref);
>> @@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>> ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>> ctx->no_wait_gpu, locked);
>> kref_put(&bo->list_kref, ttm_bo_release_list);
>> + if (first_bo)
>> + ttm_bo_put(first_bo);
>> return ret;
>> }
>>
>> ttm_bo_del_from_lru(bo);
>> spin_unlock(&glob->lru_lock);
>> + if (first_bo)
>> + ttm_bo_put(first_bo);
>>
>> ret = ttm_bo_evict(bo, ctx);
>> if (locked) {
>> @@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>> {
>> struct ttm_bo_device *bdev = bo->bdev;
>> struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>> + struct ttm_operation_ctx native_ctx = {
>> + .interruptible = false,
>> + .no_wait_gpu = false,
>> + .resv = bo->resv,
>> + .flags = 0
>> + };
>> + struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;
> I thought we made the ctx parameter mandatory, didn't we? Could be that
> I remember that incorrectly.
Prike said he see ctx->resv is null, in that case, code doesn't run into
busy path.
Oh, as you mentioned here, we need add .resv=bo->resv for every
ttm_operation_ctx. That's a huge change which will cross all vendor drivers.
Can we just force to evaluate evict_ctx->resv = bo->resv? That means we
just add one extra line: evict_ctx->resv = bo->resv. How about that?
-David
>
> Christian.
>
>> int ret;
>>
>> do {
>> @@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>> return ret;
>> if (mem->mm_node)
>> break;
>> - ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
>> + ret = ttm_mem_evict_first(bdev, mem_type, place, evict_ctx);
>> if (unlikely(ret != 0))
>> return ret;
>> } while (1);
>> @@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>> spin_lock(&glob->lru_lock);
>> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> - if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
>> + if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>> + NULL)) {
>> ret = 0;
>> break;
>> }
More information about the amd-gfx
mailing list