[PATCH 6/6] drm/ttm: Fix multihop assert on eviction.

Christian König ckoenig.leichtzumerken at gmail.com
Wed Jun 23 08:28:57 UTC 2021


Am 22.06.21 um 18:23 schrieb Andrey Grodzovsky:
> Problem:
> Under memory pressure, when the GTT domain is almost full, a multihop assert
> will come up when trying to evict an LRU BO from VRAM to SYSTEM.
>
> Fix:
> Don't assert on a multihop error in the evict code, but rather do a retry
> as we do in ttm_bo_move_buffer.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>

Reviewed-by: Christian König <christian.koenig at amd.com>

But I think you need to move this patch earlier in the series;
otherwise you break amdgpu eviction in between.

Christian.

> ---
>   drivers/gpu/drm/ttm/ttm_bo.c | 63 +++++++++++++++++++-----------------
>   1 file changed, 34 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 45145d02aed2..5a2dc712c632 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -485,6 +485,31 @@ void ttm_bo_unlock_delayed_workqueue(struct ttm_device *bdev, int resched)
>   }
>   EXPORT_SYMBOL(ttm_bo_unlock_delayed_workqueue);
>   
> +static int ttm_bo_bounce_temp_buffer(struct ttm_buffer_object *bo,
> +				     struct ttm_resource **mem,
> +				     struct ttm_operation_ctx *ctx,
> +				     struct ttm_place *hop)
> +{
> +	struct ttm_placement hop_placement;
> +	struct ttm_resource *hop_mem;
> +	int ret;
> +
> +	hop_placement.num_placement = hop_placement.num_busy_placement = 1;
> +	hop_placement.placement = hop_placement.busy_placement = hop;
> +
> +	/* find space in the bounce domain */
> +	ret = ttm_bo_mem_space(bo, &hop_placement, &hop_mem, ctx);
> +	if (ret)
> +		return ret;
> +	/* move to the bounce domain */
> +	ret = ttm_bo_handle_move_mem(bo, hop_mem, false, ctx, NULL);
> +	if (ret) {
> +		ttm_resource_free(bo, &hop_mem);
> +		return ret;
> +	}
> +	return 0;
> +}
> +
>   static int ttm_bo_evict(struct ttm_buffer_object *bo,
>   			struct ttm_operation_ctx *ctx)
>   {
> @@ -524,12 +549,17 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo,
>   		goto out;
>   	}
>   
> +bounce:
>   	ret = ttm_bo_handle_move_mem(bo, evict_mem, true, ctx, &hop);
> -	if (unlikely(ret)) {
> -		WARN(ret == -EMULTIHOP, "Unexpected multihop in eviction - likely driver bug\n");
> -		if (ret != -ERESTARTSYS)
> +	if (ret == -EMULTIHOP) {
> +		ret = ttm_bo_bounce_temp_buffer(bo, &evict_mem, ctx, &hop);
> +		if (ret) {
>   			pr_err("Buffer eviction failed\n");
> -		ttm_resource_free(bo, &evict_mem);
> +			ttm_resource_free(bo, &evict_mem);
> +			goto out;
> +		}
> +		/* try and move to final place now. */
> +		goto bounce;
>   	}
>   out:
>   	return ret;
> @@ -844,31 +874,6 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
>   }
>   EXPORT_SYMBOL(ttm_bo_mem_space);
>   
> -static int ttm_bo_bounce_temp_buffer(struct ttm_buffer_object *bo,
> -				     struct ttm_resource **mem,
> -				     struct ttm_operation_ctx *ctx,
> -				     struct ttm_place *hop)
> -{
> -	struct ttm_placement hop_placement;
> -	struct ttm_resource *hop_mem;
> -	int ret;
> -
> -	hop_placement.num_placement = hop_placement.num_busy_placement = 1;
> -	hop_placement.placement = hop_placement.busy_placement = hop;
> -
> -	/* find space in the bounce domain */
> -	ret = ttm_bo_mem_space(bo, &hop_placement, &hop_mem, ctx);
> -	if (ret)
> -		return ret;
> -	/* move to the bounce domain */
> -	ret = ttm_bo_handle_move_mem(bo, hop_mem, false, ctx, NULL);
> -	if (ret) {
> -		ttm_resource_free(bo, &hop_mem);
> -		return ret;
> -	}
> -	return 0;
> -}
> -
>   static int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
>   			      struct ttm_placement *placement,
>   			      struct ttm_operation_ctx *ctx)



More information about the amd-gfx mailing list