[PATCH] drm/ttm: fix busy memory to fail other user v3
zhoucm1
zhoucm1 at amd.com
Fri Apr 26 09:07:18 UTC 2019
On 2019年04月26日 16:31, Christian König wrote:
> Am 25.04.19 um 09:39 schrieb Chunming Zhou:
>> heavy gpu job could occupy memory long time, which lead other user
>> fail to get memory.
>>
>> basically pick up Christian idea:
>>
>> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
>
> Any reason you don't want to split this into a separate patch?
Will do that after locking down solution, just make the solution in one
patch to review conveniently.
>
>> 2. If we then run into this EBUSY condition in TTM check if the BO we
>> need memory for (or rather the ww_mutex of its reservation object)
>> has a ticket assigned.
>> 3. If we have a ticket we grab a reference to the first BO on the
>> LRU, drop the LRU lock and try to grab the reservation lock with the
>> ticket.
>> 4. If getting the reservation lock with the ticket succeeded we check
>> if the BO is still the first one on the LRU in question (the BO could
>> have moved).
>> 5. If the BO is still the first one on the LRU in question we try to
>> evict it as we would evict any other BO.
>> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>>
>> v2: fix some minor check
>> v3: address Christian v2 comments.
>>
>> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
>> Signed-off-by: Chunming Zhou <david1.zhou at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 7 ++-
>> .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 21 ++++++--
>> drivers/gpu/drm/ttm/ttm_bo.c | 48 +++++++++++++++++--
>> 3 files changed, 67 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index affde72b44db..015bf62277d0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -811,7 +811,12 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo
>> *bo, u32 domain,
>> u64 min_offset, u64 max_offset)
>> {
>> struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>> - struct ttm_operation_ctx ctx = { false, false };
>> + struct ttm_operation_ctx ctx = {
>> + .interruptible = false,
>> + .no_wait_gpu = false,
>> + .resv = bo->tbo.resv,
>> + .flags = TTM_OPT_FLAG_ALLOW_RES_EVICT
>> + };
>
> Please completely drop this chunk.
>
> If we allow evicting BOs from the same process during pinning it could
> lead to some unforeseen side effects.
Will remove that flags.
>
>> int r, i;
>> if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> index a5cacf846e1b..2957ac38dcb0 100644
>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> @@ -4101,6 +4101,9 @@ static int dm_plane_helper_prepare_fb(struct
>> drm_plane *plane,
>> struct amdgpu_device *adev;
>> struct amdgpu_bo *rbo;
>> struct dm_plane_state *dm_plane_state_new, *dm_plane_state_old;
>> + struct list_head list, duplicates;
>> + struct ttm_validate_buffer tv;
>> + struct ww_acquire_ctx ticket;
>> uint64_t tiling_flags;
>> uint32_t domain;
>> int r;
>> @@ -4120,6 +4123,18 @@ static int dm_plane_helper_prepare_fb(struct
>> drm_plane *plane,
>> r = amdgpu_bo_reserve(rbo, false);
>> if (unlikely(r != 0))
>> return r;
>> + INIT_LIST_HEAD(&list);
>> + INIT_LIST_HEAD(&duplicates);
>> +
>> + tv.bo = &rbo->tbo;
>> + tv.num_shared = 1;
>> + list_add(&tv.head, &list);
>> +
>> + r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates);
>> + if (r) {
>> + dev_err(adev->dev, "fail to reserve bo (%d)\n", r);
>> + return r;
>> + }
>> if (plane->type != DRM_PLANE_TYPE_CURSOR)
>> domain = amdgpu_display_supported_domains(adev);
>> @@ -4130,21 +4145,21 @@ static int dm_plane_helper_prepare_fb(struct
>> drm_plane *plane,
>> if (unlikely(r != 0)) {
>> if (r != -ERESTARTSYS)
>> DRM_ERROR("Failed to pin framebuffer with error %d\n", r);
>> - amdgpu_bo_unreserve(rbo);
>> + ttm_eu_backoff_reservation(&ticket, &list);
>> return r;
>> }
>> r = amdgpu_ttm_alloc_gart(&rbo->tbo);
>> if (unlikely(r != 0)) {
>> amdgpu_bo_unpin(rbo);
>> - amdgpu_bo_unreserve(rbo);
>> + ttm_eu_backoff_reservation(&ticket, &list);
>> DRM_ERROR("%p bind failed\n", rbo);
>> return r;
>> }
>> amdgpu_bo_get_tiling_flags(rbo, &tiling_flags);
>> - amdgpu_bo_unreserve(rbo);
>> + ttm_eu_backoff_reservation(&ticket, &list);
>
> This part looks good to me, but as noted above should probably be in a
> separate patch.
>
>> afb->address = amdgpu_bo_gpu_offset(rbo);
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>> b/drivers/gpu/drm/ttm/ttm_bo.c
>> index 8502b3ed2d88..dbcf958d1b43 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> @@ -766,11 +766,12 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>> * b. Otherwise, trylock it.
>> */
>> static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object
>> *bo,
>> - struct ttm_operation_ctx *ctx, bool *locked)
>> + struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>> {
>> bool ret = false;
>> *locked = false;
>> + *busy = false;
>> if (bo->resv == ctx->resv) {
>> reservation_object_assert_held(bo->resv);
>> if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>> @@ -779,6 +780,8 @@ static bool ttm_bo_evict_swapout_allowable(struct
>> ttm_buffer_object *bo,
>> } else {
>> *locked = reservation_object_trylock(bo->resv);
>> ret = *locked;
>> + if (!ret)
>> + *busy = true;
>> }
>> return ret;
>> @@ -791,7 +794,7 @@ static int ttm_mem_evict_first(struct
>> ttm_bo_device *bdev,
>> {
>> struct ttm_bo_global *glob = bdev->glob;
>> struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>> - struct ttm_buffer_object *bo = NULL;
>> + struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
>> bool locked = false;
>> unsigned i;
>> int ret;
>> @@ -799,8 +802,13 @@ static int ttm_mem_evict_first(struct
>> ttm_bo_device *bdev,
>> spin_lock(&glob->lru_lock);
>> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> list_for_each_entry(bo, &man->lru[i], lru) {
>> - if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>> + bool busy = false;
>> + if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>> + &busy)) {
>> + if (!first_bo && busy)
>> + first_bo = bo;
>> continue;
>> + }
>> if (place && !bdev->driver->eviction_valuable(bo,
>> place)) {
>> @@ -808,6 +816,7 @@ static int ttm_mem_evict_first(struct
>> ttm_bo_device *bdev,
>> reservation_object_unlock(bo->resv);
>> continue;
>> }
>> +
>> break;
>> }
>> @@ -820,7 +829,33 @@ static int ttm_mem_evict_first(struct
>> ttm_bo_device *bdev,
>> if (!bo) {
>> spin_unlock(&glob->lru_lock);
>> - return -EBUSY;
>> + if (!first_bo || !ctx || !ctx->resv || !ctx->resv->lock.ctx)
>
> Ah, now I see why you need the reservation object here.
>
>> + return -EBUSY;
>> + if (ctx->interruptible)
>> + ret = ww_mutex_lock_interruptible(&bo->resv->lock,
>> + ctx->resv->lock.ctx);
>> + else
>> + ret = ww_mutex_lock(&bo->resv->lock, ctx->resv->lock.ctx);
>> + if (ret)
>> + return ret;
>> + spin_lock(&glob->lru_lock);
>> + for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> + if (list_empty(&man->lru[i]))
>> + continue;
>> + bo = list_first_entry(&man->lru[i],
>> + struct ttm_buffer_object,
>> + lru);
>
> You now need to check all BOs on the LRU, cause the first one where a
> trylock failed is not necessarily the first one on the list.
Sorry, I don't get your opinion on this, Could you detail a bit how to
do that?
I though after ww_mutex_lock, cs is done, then DC can pick up any of them.
>
>> +
>> + break;
>> + }
>> + /* verify if BO have been moved */
>> + if (first_bo != bo) {
>> + spin_unlock(&glob->lru_lock);
>> + ww_mutex_unlock(&bo->resv->lock);
>> + return -EBUSY;
>> + }
>> + spin_unlock(&glob->lru_lock);
>> + /* ok, pick up first busy BO to wait to evict */
>> }
>> kref_get(&bo->list_kref);
>> @@ -1784,7 +1819,10 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,
>> struct ttm_operation_ctx *ctx)
>> spin_lock(&glob->lru_lock);
>> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> - if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
>> + bool busy = false;
>
> Better make the parameter optional.
Will do.
-David
>
> Christian.
>
>> +
>> + if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>> + &busy)) {
>> ret = 0;
>> break;
>> }
>
More information about the dri-devel
mailing list