[PATCH 3/5] ttm: add initial memcg integration. (v2)

Fri May 2 12:01:13 UTC 2025

On 5/2/25 05:36, Dave Airlie wrote:
> From: Dave Airlie <airlied at redhat.com>
> 
> Doing proper integration of TTM system memory allocations with
> memcg is a difficult ask, primarily due to difficulties around
> accounting for evictions properly.
> 
> However there are systems where userspace will be allocating
> objects in system memory and they won't be prone to migrating
> or evicting and we should start with at least accounting those.
> 
> This adds a memcg group to ttm bo and resource objects.
> 
> This memcg is used when:
> a) resource is allocated in system/tt memory
> b) the account_op is set in the operation context
> 
> This patch disables the flag around object evictions,
> but any operation that could populate a TTM tt object in process context
> should set the account_op flag.
> 
> This v2 moves the charging up a level and also no longer uses
> __GFP_ACCOUNT, or attaches the memcg to object pages, it instead
> uses the same approach as socket memory and just charges/uncharges
> at the object level. This was suggested by Christian.
> 
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
>  drivers/gpu/drm/ttm/ttm_bo.c       |  9 +++++++--
>  drivers/gpu/drm/ttm/ttm_bo_vm.c    |  3 ++-
>  drivers/gpu/drm/ttm/ttm_resource.c | 20 ++++++++++++++++++++
>  include/drm/ttm/ttm_bo.h           |  8 ++++++++
>  include/drm/ttm/ttm_resource.h     |  6 +++++-
>  5 files changed, 42 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 95b86003c50d..89d2df246ed2 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -537,6 +537,7 @@ static s64 ttm_bo_evict_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object *
>  	evict_walk->evicted++;
>  	if (evict_walk->res)
>  		lret = ttm_resource_alloc(evict_walk->evictor, evict_walk->place,
> +					  walk->ctx,
>  					  evict_walk->res, NULL);
>  	if (lret == 0)
>  		return 1;
> @@ -733,7 +734,7 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo,
>  			continue;
>  
>  		may_evict = (force_space && place->mem_type != TTM_PL_SYSTEM);
> -		ret = ttm_resource_alloc(bo, place, res, force_space ? &limit_pool : NULL);
> +		ret = ttm_resource_alloc(bo, place, ctx, res, force_space ? &limit_pool : NULL);
>  		if (ret) {
>  			if (ret != -ENOSPC && ret != -EAGAIN) {
>  				dmem_cgroup_pool_state_put(limit_pool);
> @@ -744,8 +745,12 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo,
>  				continue;
>  			}
>  
> +			/* we don't want to account evictions at this point */
> +			bool old_ctx_account = ctx->account_op;
> +			ctx->account_op = false;
>  			ret = ttm_bo_evict_alloc(bdev, man, place, bo, ctx,
>  						 ticket, res, limit_pool);
> +			ctx->account_op = old_ctx_account;
>  			dmem_cgroup_pool_state_put(limit_pool);
>  			if (ret == -EBUSY)
>  				continue;
> @@ -1145,7 +1150,7 @@ ttm_bo_swapout_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object *bo)
>  
>  		memset(&hop, 0, sizeof(hop));
>  		place.mem_type = TTM_PL_SYSTEM;
> -		ret = ttm_resource_alloc(bo, &place, &evict_mem, NULL);
> +		ret = ttm_resource_alloc(bo, &place, ctx, &evict_mem, NULL);
>  		if (ret)
>  			goto out;
>  
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> index a194db83421d..163039cf40a5 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> @@ -220,7 +220,8 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
>  		struct ttm_operation_ctx ctx = {
>  			.interruptible = true,
>  			.no_wait_gpu = false,
> -			.force_alloc = true
> +			.force_alloc = true,
> +			.account_op = true,
>  		};
>  
>  		ttm = bo->ttm;
> diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c
> index 7e5a60c55813..da257678a5ba 100644
> --- a/drivers/gpu/drm/ttm/ttm_resource.c
> +++ b/drivers/gpu/drm/ttm/ttm_resource.c
> @@ -27,6 +27,7 @@
>  #include <linux/iosys-map.h>
>  #include <linux/scatterlist.h>
>  #include <linux/cgroup_dmem.h>
> +#include <linux/memcontrol.h>
>  
>  #include <drm/ttm/ttm_bo.h>
>  #include <drm/ttm/ttm_placement.h>
> @@ -373,12 +374,14 @@ EXPORT_SYMBOL(ttm_resource_fini);
>  
>  int ttm_resource_alloc(struct ttm_buffer_object *bo,
>  		       const struct ttm_place *place,
> +		       struct ttm_operation_ctx *ctx,
>  		       struct ttm_resource **res_ptr,
>  		       struct dmem_cgroup_pool_state **ret_limit_pool)
>  {
>  	struct ttm_resource_manager *man =
>  		ttm_manager_type(bo->bdev, place->mem_type);
>  	struct dmem_cgroup_pool_state *pool = NULL;
> +	struct mem_cgroup *memcg = NULL;
>  	int ret;
>  
>  	if (man->cg) {
> @@ -387,13 +390,26 @@ int ttm_resource_alloc(struct ttm_buffer_object *bo,
>  			return ret;
>  	}
>  
> +	if ((place->mem_type == TTM_PL_SYSTEM || place->mem_type == TTM_PL_TT) &&
> +	    ctx->account_op && bo->memcg) {

I suggest to make that a placement flag instead of putting it into the ctx.

E.g. something like "if (place->flags TTM_PL_FLAG_MEMCG)" here and then just set the flag in amdgpu_bo_placement_from_domain() and clear it in amdgpu_evict_flags().

This wopuld not only simplify the handling here but also gives a potential solution to the eviction handling.

In other words allocations don't get charged to memcg on eviction, but when the next CS says during validation that the BO should stay in GTT we then charge the memory to the application.

Background is that then the right application get's the potential ENOMEM from their CS IOCTL.

Apart from that the solution looks totally sane to me as well.

> +		memcg = bo->memcg;
> +		gfp_t gfp_flags = GFP_USER;
> +		if (ctx->gfp_retry_mayfail)
> +			gfp_flags |= __GFP_RETRY_MAYFAIL;

BTW: gfp_retry_mayfail is kind of deprecated. Sima is strictly against that.

I was about to remove it but then XE picked it up as well.

I never kicked of the discussion on what to do with that.

Regards,
Christian.

> +
> +		if (!mem_cgroup_charge_gpu(memcg, bo->base.size >> PAGE_SHIFT, gfp_flags))
> +			return -ENOMEM;
> +	}
>  	ret = man->func->alloc(man, bo, place, res_ptr);
>  	if (ret) {
>  		if (pool)
>  			dmem_cgroup_uncharge(pool, bo->base.size);
> +		if (memcg)
> +			mem_cgroup_uncharge_gpu(memcg, bo->base.size >> PAGE_SHIFT);
>  		return ret;
>  	}
>  
> +	(*res_ptr)->memcg = memcg;
>  	(*res_ptr)->css = pool;
>  
>  	spin_lock(&bo->bdev->lru_lock);
> @@ -407,6 +423,7 @@ void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res)
>  {
>  	struct ttm_resource_manager *man;
>  	struct dmem_cgroup_pool_state *pool;
> +	struct mem_cgroup *memcg;
>  
>  	if (!*res)
>  		return;
> @@ -416,11 +433,14 @@ void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res)
>  	spin_unlock(&bo->bdev->lru_lock);
>  
>  	pool = (*res)->css;
> +	memcg = (*res)->memcg;
>  	man = ttm_manager_type(bo->bdev, (*res)->mem_type);
>  	man->func->free(man, *res);
>  	*res = NULL;
>  	if (man->cg)
>  		dmem_cgroup_uncharge(pool, bo->base.size);
> +	if (memcg)
> +		mem_cgroup_uncharge_gpu(memcg, bo->base.size >> PAGE_SHIFT);
>  }
>  EXPORT_SYMBOL(ttm_resource_free);
>  
> diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
> index 903cd1030110..56a33b5f5c41 100644
> --- a/include/drm/ttm/ttm_bo.h
> +++ b/include/drm/ttm/ttm_bo.h
> @@ -135,6 +135,12 @@ struct ttm_buffer_object {
>  	 * reservation lock.
>  	 */
>  	struct sg_table *sg;
> +
> +	/**
> +	 * @memcg: memory cgroup to charge this to if it ends up using system memory.
> +	 * NULL means don't charge.
> +	 */
> +	struct mem_cgroup *memcg;
>  };
>  
>  #define TTM_BO_MAP_IOMEM_MASK 0x80
> @@ -174,6 +180,7 @@ struct ttm_bo_kmap_obj {
>   * BOs share the same reservation object.
>   * @force_alloc: Don't check the memory account during suspend or CPU page
>   * faults. Should only be used by TTM internally.
> + * @account_op: account for any memory allocations by a bo with an memcg.
>   * @resv: Reservation object to allow reserved evictions with.
>   * @bytes_moved: Statistics on how many bytes have been moved.
>   *
> @@ -186,6 +193,7 @@ struct ttm_operation_ctx {
>  	bool gfp_retry_mayfail;
>  	bool allow_res_evict;
>  	bool force_alloc;
> +	bool account_op;
>  	struct dma_resv *resv;
>  	uint64_t bytes_moved;
>  };
> diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h
> index e52bba15012f..1ab515c6ec00 100644
> --- a/include/drm/ttm/ttm_resource.h
> +++ b/include/drm/ttm/ttm_resource.h
> @@ -45,6 +45,7 @@ struct ttm_resource;
>  struct ttm_place;
>  struct ttm_buffer_object;
>  struct ttm_placement;
> +struct ttm_operation_ctx;
>  struct iosys_map;
>  struct io_mapping;
>  struct sg_table;
> @@ -245,7 +246,8 @@ struct ttm_bus_placement {
>   * @placement: Placement flags.
>   * @bus: Placement on io bus accessible to the CPU
>   * @bo: weak reference to the BO, protected by ttm_device::lru_lock
> - * @css: cgroup state this resource is charged to
> + * @css: cgroup state this resource is charged to for dmem
> + * @memcg: memory cgroup this resource is charged to for sysmem
>   *
>   * Structure indicating the placement and space resources used by a
>   * buffer object.
> @@ -264,6 +266,7 @@ struct ttm_resource {
>  	 * @lru: Least recently used list, see &ttm_resource_manager.lru
>  	 */
>  	struct ttm_lru_item lru;
> +	struct mem_cgroup *memcg;
>  };
>  
>  /**
> @@ -444,6 +447,7 @@ void ttm_resource_fini(struct ttm_resource_manager *man,
>  
>  int ttm_resource_alloc(struct ttm_buffer_object *bo,
>  		       const struct ttm_place *place,
> +		       struct ttm_operation_ctx *ctx,
>  		       struct ttm_resource **res,
>  		       struct dmem_cgroup_pool_state **ret_limit_pool);
>  void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res);