[PATCH v2 2/2] drm/amdgpu: track bo memory stats at runtime

Christian König christian.koenig at amd.com
Tue Sep 10 15:47:16 UTC 2024


Am 24.06.24 um 16:08 schrieb Yunxiang Li:
> Before, every time fdinfo is queried we try to lock all the BOs in the
> VM and calculate memory usage from scratch. This works okay if the
> fdinfo is rarely read and the VMs don't have a ton of BOs. If either of
> these conditions is not true, we get a massive performance hit.
>
> In this new revision, we track the BOs as they change state. This way
> when the fdinfo is queried we only need to take the status lock and copy
> out the usage stats with minimal impact to the runtime performance.
>
> Signed-off-by: Yunxiang Li <Yunxiang.Li at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  15 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c     |   3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  |   4 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |  21 ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      | 183 +++++++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h      |  32 +++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c   |   1 +
>   8 files changed, 185 insertions(+), 76 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index ec888fc6ead8..397cb0f83811 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1166,7 +1166,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>   			if (!bo)
>   				continue;
>   
> -			amdgpu_vm_bo_invalidate(adev, bo, false);
> +			amdgpu_vm_bo_invalidate(bo, NULL, false);
>   		}
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> index 8e81a83d37d8..87fc3f37a1eb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> @@ -190,6 +190,13 @@ static void amdgpu_dma_buf_unmap(struct dma_buf_attachment *attach,
>   	}
>   }
>   
> +static void amdgpu_dma_buf_release(struct dma_buf *buf)
> +{
> +	struct amdgpu_bo *bo = gem_to_amdgpu_bo(buf->priv);
> +	amdgpu_vm_bo_handle_shared(bo, true);
> +	drm_gem_dmabuf_release(buf);
> +}
> +
>   /**
>    * amdgpu_dma_buf_begin_cpu_access - &dma_buf_ops.begin_cpu_access implementation
>    * @dma_buf: Shared DMA buffer
> @@ -237,7 +244,7 @@ const struct dma_buf_ops amdgpu_dmabuf_ops = {
>   	.unpin = amdgpu_dma_buf_unpin,
>   	.map_dma_buf = amdgpu_dma_buf_map,
>   	.unmap_dma_buf = amdgpu_dma_buf_unmap,
> -	.release = drm_gem_dmabuf_release,
> +	.release = amdgpu_dma_buf_release,
>   	.begin_cpu_access = amdgpu_dma_buf_begin_cpu_access,
>   	.mmap = drm_gem_dmabuf_mmap,
>   	.vmap = drm_gem_dmabuf_vmap,
> @@ -265,8 +272,10 @@ struct dma_buf *amdgpu_gem_prime_export(struct drm_gem_object *gobj,
>   		return ERR_PTR(-EPERM);
>   
>   	buf = drm_gem_prime_export(gobj, flags);
> -	if (!IS_ERR(buf))
> +	if (!IS_ERR(buf)) {
>   		buf->ops = &amdgpu_dmabuf_ops;
> +		amdgpu_vm_bo_handle_shared(bo, false);
> +	}
>   
>   	return buf;
>   }
> @@ -345,7 +354,7 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach)
>   	/* FIXME: This should be after the "if", but needs a fix to make sure
>   	 * DMABuf imports are initialized in the right VM list.
>   	 */
> -	amdgpu_vm_bo_invalidate(adev, bo, false);
> +	amdgpu_vm_bo_invalidate(bo, NULL, false);
>   	if (!bo->tbo.resource || bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
>   		return;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index 1f22b4208729..1cd28fc1dcf5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -843,7 +843,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
>   int amdgpu_gem_op_ioctl(struct drm_device *dev, void *data,
>   			struct drm_file *filp)
>   {
> -	struct amdgpu_device *adev = drm_to_adev(dev);
>   	struct drm_amdgpu_gem_op *args = data;
>   	struct drm_gem_object *gobj;
>   	struct amdgpu_vm_bo_base *base;
> @@ -903,7 +902,7 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void *data,
>   			robj->allowed_domains |= AMDGPU_GEM_DOMAIN_GTT;
>   
>   		if (robj->flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
> -			amdgpu_vm_bo_invalidate(adev, robj, true);
> +			amdgpu_vm_bo_invalidate(robj, NULL, true);
>   
>   		amdgpu_bo_unreserve(robj);
>   		break;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index bcf25c7e85e0..469bb600bcaa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -37,6 +37,7 @@
>   #include <drm/amdgpu_drm.h>
>   #include <drm/drm_cache.h>
>   #include "amdgpu.h"
> +#include "amdgpu_vm.h"
>   #include "amdgpu_trace.h"
>   #include "amdgpu_amdkfd.h"
>   
> @@ -1258,7 +1259,6 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
>   			   bool evict,
>   			   struct ttm_resource *new_mem)
>   {
> -	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
>   	struct ttm_resource *old_mem = bo->resource;
>   	struct amdgpu_bo *abo;
>   
> @@ -1266,7 +1266,7 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
>   		return;
>   
>   	abo = ttm_to_amdgpu_bo(bo);
> -	amdgpu_vm_bo_invalidate(adev, abo, evict);
> +	amdgpu_vm_bo_invalidate(abo, new_mem, evict);
>   
>   	amdgpu_bo_kunmap(abo);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 218919fc13ec..c65463f66cb2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -135,27 +135,6 @@ struct amdgpu_bo_vm {
>   	struct amdgpu_vm_bo_base        entries[];
>   };
>   
> -struct amdgpu_mem_stats {
> -	/* current VRAM usage */
> -	uint64_t vram;
> -	/* current shared VRAM usage */
> -	uint64_t vram_shared;
> -	/* current GTT usage */
> -	uint64_t gtt;
> -	/* current shared GTT usage */
> -	uint64_t gtt_shared;
> -	/* current system memory usage */
> -	uint64_t cpu;
> -	/* current shared system memory usage */
> -	uint64_t cpu_shared;
> -	/* sum of evicted buffers */
> -	uint64_t evicted_vram;
> -	/* how much userspace asked for */
> -	uint64_t requested_vram;
> -	/* how much userspace asked for */
> -	uint64_t requested_gtt;
> -};
> -
>   static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
>   {
>   	return container_of(tbo, struct amdgpu_bo, tbo);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3abfa66d72a2..8dd239912bb9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -36,6 +36,7 @@
>   #include <drm/ttm/ttm_tt.h>
>   #include <drm/drm_exec.h>
>   #include "amdgpu.h"
> +#include "amdgpu_vm.h"
>   #include "amdgpu_trace.h"
>   #include "amdgpu_amdkfd.h"
>   #include "amdgpu_gmc.h"
> @@ -310,6 +311,111 @@ static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm)
>   	spin_unlock(&vm->status_lock);
>   }
>   
> +static void amdgpu_vm_stats_update_shared(struct amdgpu_vm_bo_base *base,
> +				      bool unshare)
> +{
> +	struct amdgpu_vm *vm = base->vm;
> +	struct amdgpu_bo *bo = base->bo;
> +	struct amdgpu_mem_stats *stats;
> +	uint64_t size;
> +
> +	if (!vm || !bo)
> +		return;
> +
> +	spin_lock(&vm->status_lock);
> +	stats = &vm->stats;
> +	size = amdgpu_bo_size(bo);
> +	switch (bo->tbo.resource->mem_type) {
> +	case TTM_PL_VRAM:
> +		stats->vram_shared += unshare ? -size : size;
> +		break;
> +	case TTM_PL_TT:
> +		stats->gtt_shared += unshare ? -size : size;
> +		break;
> +	case TTM_PL_SYSTEM:
> +	default:
> +		stats->cpu_shared += unshare ? -size : size;
> +		break;
> +	}
> +	spin_unlock(&vm->status_lock);
> +}
> +
> +void amdgpu_vm_stats_update(struct amdgpu_vm_bo_base *base,
> +			    struct ttm_resource *new_mem,
> +			    struct ttm_resource *old_mem)
> +{
> +	struct amdgpu_vm *vm = base->vm;
> +	struct amdgpu_bo *bo = base->bo;
> +	struct amdgpu_mem_stats *stats;
> +	uint64_t size;
> +	bool shared;
> +
> +	if (!vm || !bo || (!new_mem && !old_mem))
> +		return;
> +	spin_lock(&vm->status_lock);
> +
> +	stats = &vm->stats;
> +	size = amdgpu_bo_size(bo);
> +	shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base);
> +
> +	if (old_mem) {
> +		switch (old_mem->mem_type) {
> +		case TTM_PL_VRAM:
> +			stats->vram -= size;
> +			if (shared)
> +				stats->vram_shared -= size;
> +			break;
> +		case TTM_PL_TT:
> +			stats->gtt -= size;
> +			if (shared)
> +				stats->gtt_shared -= size;
> +			break;
> +		case TTM_PL_SYSTEM:
> +		default:
> +			stats->cpu -= size;
> +			if (shared)
> +				stats->cpu_shared -= size;
> +			break;
> +		}
> +	}
> +	if (new_mem) {
> +		switch (new_mem->mem_type) {
> +		case TTM_PL_VRAM:
> +			stats->vram += size;
> +			if (shared)
> +				stats->vram_shared += size;
> +			break;
> +		case TTM_PL_TT:
> +			stats->gtt += size;
> +			if (shared)
> +				stats->gtt_shared += size;
> +			break;
> +		case TTM_PL_SYSTEM:
> +		default:
> +			stats->cpu += size;
> +			if (shared)
> +				stats->cpu_shared += size;
> +			break;
> +		}
> +	}
> +	if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) {
> +		if (!old_mem)
> +			stats->requested_vram += size;
> +		else if (old_mem->mem_type != TTM_PL_VRAM)
> +			stats->evicted_vram -= size;
> +		if (!new_mem)
> +			stats->requested_vram -= size;
> +		else if (new_mem->mem_type != TTM_PL_VRAM)
> +			stats->evicted_vram += size;
> +	} else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT) {
> +		if (!old_mem)
> +			stats->requested_gtt += size;
> +		if (!new_mem)
> +			stats->requested_gtt -= size;
> +	}
> +	spin_unlock(&vm->status_lock);
> +}
> +
>   /**
>    * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
>    *
> @@ -332,6 +438,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
>   		return;
>   	base->next = bo->vm_bo;
>   	bo->vm_bo = base;
> +	amdgpu_vm_stats_update(base, bo->tbo.resource, NULL);
>   
>   	if (!amdgpu_vm_is_bo_always_valid(vm, bo))
>   		return;
> @@ -1088,51 +1195,11 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	return r;
>   }
>   
> -static void amdgpu_vm_bo_get_memory(struct amdgpu_bo_va *bo_va,
> -				    struct amdgpu_mem_stats *stats)
> -{
> -	struct amdgpu_vm *vm = bo_va->base.vm;
> -	struct amdgpu_bo *bo = bo_va->base.bo;
> -
> -	if (!bo)
> -		return;
> -
> -	/*
> -	 * For now ignore BOs which are currently locked and potentially
> -	 * changing their location.
> -	 */
> -	if (!amdgpu_vm_is_bo_always_valid(vm, bo) &&
> -	    !dma_resv_trylock(bo->tbo.base.resv))
> -		return;
> -
> -	amdgpu_bo_get_memory(bo, stats);
> -	if (!amdgpu_vm_is_bo_always_valid(vm, bo))
> -		dma_resv_unlock(bo->tbo.base.resv);
> -}
> -
>   void amdgpu_vm_get_memory(struct amdgpu_vm *vm,
>   			  struct amdgpu_mem_stats *stats)
>   {
> -	struct amdgpu_bo_va *bo_va, *tmp;
> -
>   	spin_lock(&vm->status_lock);
> -	list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status)
> -		amdgpu_vm_bo_get_memory(bo_va, stats);
> -
> -	list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status)
> -		amdgpu_vm_bo_get_memory(bo_va, stats);
> -
> -	list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status)
> -		amdgpu_vm_bo_get_memory(bo_va, stats);
> -
> -	list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status)
> -		amdgpu_vm_bo_get_memory(bo_va, stats);
> -
> -	list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status)
> -		amdgpu_vm_bo_get_memory(bo_va, stats);
> -
> -	list_for_each_entry_safe(bo_va, tmp, &vm->done, base.vm_status)
> -		amdgpu_vm_bo_get_memory(bo_va, stats);
> +	*stats = vm->stats;
>   	spin_unlock(&vm->status_lock);
>   }
>   
> @@ -2043,6 +2110,7 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev,
>   			if (*base != &bo_va->base)
>   				continue;
>   
> +			amdgpu_vm_stats_update(*base, NULL, bo->tbo.resource);
>   			*base = bo_va->base.next;
>   			break;
>   		}
> @@ -2108,17 +2176,37 @@ bool amdgpu_vm_evictable(struct amdgpu_bo *bo)
>   	return true;
>   }
>   
> +/**
> + * amdgpu_vm_bo_handle_shared - called when bo gets shared/unshared
> + *
> + * @bo: amdgpu buffer object
> + * @unshare: bo got unshared
> + *
> + * For now the only thing this does is to update the per VM stats
> + */
> +void amdgpu_vm_bo_handle_shared(struct amdgpu_bo *bo, bool unshare)
> +{
> +
> +	struct amdgpu_vm_bo_base *bo_base;
> +
> +	if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo))
> +		bo = bo->parent;
> +
> +	for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next)
> +		amdgpu_vm_stats_update_shared(bo_base, unshare);
> +}
> +
>   /**
>    * amdgpu_vm_bo_invalidate - mark the bo as invalid
>    *
> - * @adev: amdgpu_device pointer
>    * @bo: amdgpu buffer object
> + * @new_mem: the new placement of the BO move, NULL if just invalidate
>    * @evicted: is the BO evicted
>    *
>    * Mark @bo as invalid.
>    */
> -void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
> -			     struct amdgpu_bo *bo, bool evicted)
> +void amdgpu_vm_bo_invalidate(struct amdgpu_bo *bo, struct ttm_resource *new_mem,
> +			     bool evicted)
>   {
>   	struct amdgpu_vm_bo_base *bo_base;
>   
> @@ -2129,6 +2217,9 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
>   	for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) {
>   		struct amdgpu_vm *vm = bo_base->vm;
>   
> +		if (new_mem)
> +			amdgpu_vm_stats_update(bo_base, new_mem, bo->tbo.resource);
> +

Ok that looks extremely ugly. Please just add a separate function and 
call that from the TTM move function.

>   		if (evicted && amdgpu_vm_is_bo_always_valid(vm, bo)) {
>   			amdgpu_vm_bo_evicted(bo_base);
>   			continue;
> @@ -2640,6 +2731,10 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   		}
>   	}
>   
> +	if (memchr_inv(&vm->stats, 0, sizeof(vm->stats)))

Please either drop that or compare each memory stat variable separately. 
Byte by byte compares are really frowned upon.

Apart from that looks rather good to me,
Christian.

> +		dev_err(adev->dev, "VM memory stats is non-zero when fini\n");
> +	else
> +		dev_dbg(adev->dev, "VM memory stats is zero when fini\n");
>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 046949c4b695..13e1ec4deb05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -42,7 +42,27 @@ struct amdgpu_bo_va;
>   struct amdgpu_job;
>   struct amdgpu_bo_list_entry;
>   struct amdgpu_bo_vm;
> -struct amdgpu_mem_stats;
> +
> +struct amdgpu_mem_stats {
> +	/* current VRAM usage */
> +	uint64_t vram;
> +	/* current shared VRAM usage */
> +	uint64_t vram_shared;
> +	/* current GTT usage */
> +	uint64_t gtt;
> +	/* current shared GTT usage */
> +	uint64_t gtt_shared;
> +	/* current system memory usage */
> +	uint64_t cpu;
> +	/* current shared system memory usage */
> +	uint64_t cpu_shared;
> +	/* sum of evicted buffers */
> +	uint64_t evicted_vram;
> +	/* how much userspace asked for */
> +	uint64_t requested_vram;
> +	/* how much userspace asked for */
> +	uint64_t requested_gtt;
> +};
>   
>   /*
>    * GPUVM handling
> @@ -336,6 +356,8 @@ struct amdgpu_vm {
>   	/* Lock to protect vm_bo add/del/move on all lists of vm */
>   	spinlock_t		status_lock;
>   
> +	struct amdgpu_mem_stats stats;
> +
>   	/* Per-VM and PT BOs who needs a validation */
>   	struct list_head	evicted;
>   
> @@ -514,8 +536,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>   			struct amdgpu_bo_va *bo_va,
>   			bool clear);
>   bool amdgpu_vm_evictable(struct amdgpu_bo *bo);
> -void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
> -			     struct amdgpu_bo *bo, bool evicted);
> +void amdgpu_vm_stats_update(struct amdgpu_vm_bo_base *base,
> +			    struct ttm_resource *new_mem,
> +			    struct ttm_resource *old_mem);
> +void amdgpu_vm_bo_handle_shared(struct amdgpu_bo *bo, bool unshare);
> +void amdgpu_vm_bo_invalidate(struct amdgpu_bo *bo, struct ttm_resource *new_mem,
> +			     bool evicted);
>   uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr);
>   struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm,
>   				       struct amdgpu_bo *bo);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index e39d6e7643bf..841a575cb18d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -586,6 +586,7 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
>   	if (!entry->bo)
>   		return;
>   
> +	amdgpu_vm_stats_update(entry, NULL, entry->bo->tbo.resource);
>   	entry->bo->vm_bo = NULL;
>   	shadow = amdgpu_bo_shadowed(entry->bo);
>   	if (shadow) {



More information about the amd-gfx mailing list