[PATCH 1/2] drm/ttm: rework no_retry handling

Daniel Vetter daniel at ffwll.ch
Mon Nov 2 13:20:27 UTC 2020


On Mon, Nov 02, 2020 at 01:58:07PM +0100, Christian König wrote:
> During eviction we do want to trigger the OOM killer.
> 
> Only while doing new allocations we should try to avoid that and
> return -ENOMEM to the application.
> 
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 3 ---
>  drivers/gpu/drm/ttm/ttm_pool.c             | 2 +-
>  drivers/gpu/drm/ttm/ttm_tt.c               | 7 -------
>  include/drm/ttm/ttm_bo_api.h               | 2 ++
>  include/drm/ttm/ttm_bo_driver.h            | 3 ---
>  6 files changed, 5 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 1aa516429c80..52041f48e1c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -516,6 +516,8 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
>  	struct ttm_operation_ctx ctx = {
>  		.interruptible = (bp->type != ttm_bo_type_kernel),
>  		.no_wait_gpu = bp->no_wait_gpu,
> +		/* We opt to avoid OOM on system pages allocations */
> +		.retry_mayfail = true,

Hm this is a bit confusingly named imo, but it comes from the gfp flags.
So maybe call it gfp_retry_mayfail, to make it clear where it's from and
that it's only for system allocations. Personally I'd have called this
limited_retry, but I think better to stay consistent.

Also I think we should __GFP_NOWARN for this, to avoid splats in dmesg
since we expect to handle this.

With the bikesheds addressed somehow.

Reviewed-by: Daniel Vetter <daniel.vetter at ffwll.ch>

>  		.resv = bp->resv,
>  		.flags = bp->type != ttm_bo_type_kernel ?
>  			TTM_OPT_FLAG_ALLOW_RES_EVICT : 0
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index bd6e6641c3fc..c01c060e4ac5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -1914,9 +1914,6 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>  	}
>  	adev->mman.initialized = true;
>  
> -	/* We opt to avoid OOM on system pages allocations */
> -	adev->mman.bdev.no_retry = true;
> -
>  	/* Initialize VRAM pool with all of VRAM divided into pages */
>  	r = amdgpu_vram_mgr_init(adev);
>  	if (r) {
> diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
> index 1e50deefb5d5..7100fd0e9e88 100644
> --- a/drivers/gpu/drm/ttm/ttm_pool.c
> +++ b/drivers/gpu/drm/ttm/ttm_pool.c
> @@ -367,7 +367,7 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
>  	if (tt->page_flags & TTM_PAGE_FLAG_ZERO_ALLOC)
>  		gfp_flags |= __GFP_ZERO;
>  
> -	if (tt->page_flags & TTM_PAGE_FLAG_NO_RETRY)
> +	if (ctx->retry_mayfail)
>  		gfp_flags |= __GFP_RETRY_MAYFAIL;
>  
>  	if (pool->use_dma32)
> diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
> index 8861a74ac335..cfd633c7e764 100644
> --- a/drivers/gpu/drm/ttm/ttm_tt.c
> +++ b/drivers/gpu/drm/ttm/ttm_tt.c
> @@ -51,9 +51,6 @@ int ttm_tt_create(struct ttm_buffer_object *bo, bool zero_alloc)
>  	if (bo->ttm)
>  		return 0;
>  
> -	if (bdev->no_retry)
> -		page_flags |= TTM_PAGE_FLAG_NO_RETRY;
> -
>  	switch (bo->type) {
>  	case ttm_bo_type_device:
>  		if (zero_alloc)
> @@ -211,8 +208,6 @@ int ttm_tt_swapin(struct ttm_tt *ttm)
>  
>  	swap_space = swap_storage->f_mapping;
>  	gfp_mask = mapping_gfp_mask(swap_space);
> -	if (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY)
> -		gfp_mask |= __GFP_RETRY_MAYFAIL;
>  
>  	for (i = 0; i < ttm->num_pages; ++i) {
>  		from_page = shmem_read_mapping_page_gfp(swap_space, i,
> @@ -260,8 +255,6 @@ int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
>  
>  	swap_space = swap_storage->f_mapping;
>  	gfp_mask = mapping_gfp_mask(swap_space);
> -	if (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY)
> -		gfp_mask |= __GFP_RETRY_MAYFAIL;
>  
>  	for (i = 0; i < ttm->num_pages; ++i) {
>  		from_page = ttm->pages[i];
> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
> index 37102e45e496..4c7c2d574db6 100644
> --- a/include/drm/ttm/ttm_bo_api.h
> +++ b/include/drm/ttm/ttm_bo_api.h
> @@ -195,6 +195,7 @@ struct ttm_bo_kmap_obj {
>   *
>   * @interruptible: Sleep interruptible if sleeping.
>   * @no_wait_gpu: Return immediately if the GPU is busy.
> + * @retry_mayfail: Set the __GFP_RETRY_MAYFAIL when allocation pages.
>   * @resv: Reservation object to allow reserved evictions with.
>   * @flags: Including the following flags
>   *
> @@ -204,6 +205,7 @@ struct ttm_bo_kmap_obj {
>  struct ttm_operation_ctx {
>  	bool interruptible;
>  	bool no_wait_gpu;
> +	bool retry_mayfail;
>  	struct dma_resv *resv;
>  	uint64_t bytes_moved;
>  	uint32_t flags;
> diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
> index e9f683fa72dc..da8208f43378 100644
> --- a/include/drm/ttm/ttm_bo_driver.h
> +++ b/include/drm/ttm/ttm_bo_driver.h
> @@ -276,7 +276,6 @@ extern struct ttm_bo_global {
>   * @dev_mapping: A pointer to the struct address_space representing the
>   * device address space.
>   * @wq: Work queue structure for the delayed delete workqueue.
> - * @no_retry: Don't retry allocation if it fails
>   *
>   */
>  
> @@ -314,8 +313,6 @@ struct ttm_bo_device {
>  	 */
>  
>  	struct delayed_work wq;
> -
> -	bool no_retry;
>  };
>  
>  static inline struct ttm_resource_manager *ttm_manager_type(struct ttm_bo_device *bdev,
> -- 
> 2.25.1
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


More information about the dri-devel mailing list