[PATCH v2 2/2] drm/amdgpu: track bo memory stats at runtime
Christian König
christian.koenig at amd.com
Tue Sep 10 15:47:16 UTC 2024
Am 24.06.24 um 16:08 schrieb Yunxiang Li:
> Before, every time fdinfo is queried we try to lock all the BOs in the
> VM and calculate memory usage from scratch. This works okay if the
> fdinfo is rarely read and the VMs don't have a ton of BOs. If either of
> these conditions is not true, we get a massive performance hit.
>
> In this new revision, we track the BOs as they change state. This way
> when the fdinfo is queried we only need to take the status lock and copy
> out the usage stats with minimal impact to the runtime performance.
>
> Signed-off-by: Yunxiang Li <Yunxiang.Li at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 15 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 3 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 21 ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 183 +++++++++++++++-----
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 32 +++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 1 +
> 8 files changed, 185 insertions(+), 76 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index ec888fc6ead8..397cb0f83811 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1166,7 +1166,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
> if (!bo)
> continue;
>
> - amdgpu_vm_bo_invalidate(adev, bo, false);
> + amdgpu_vm_bo_invalidate(bo, NULL, false);
> }
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> index 8e81a83d37d8..87fc3f37a1eb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> @@ -190,6 +190,13 @@ static void amdgpu_dma_buf_unmap(struct dma_buf_attachment *attach,
> }
> }
>
> +static void amdgpu_dma_buf_release(struct dma_buf *buf)
> +{
> + struct amdgpu_bo *bo = gem_to_amdgpu_bo(buf->priv);
> + amdgpu_vm_bo_handle_shared(bo, true);
> + drm_gem_dmabuf_release(buf);
> +}
> +
> /**
> * amdgpu_dma_buf_begin_cpu_access - &dma_buf_ops.begin_cpu_access implementation
> * @dma_buf: Shared DMA buffer
> @@ -237,7 +244,7 @@ const struct dma_buf_ops amdgpu_dmabuf_ops = {
> .unpin = amdgpu_dma_buf_unpin,
> .map_dma_buf = amdgpu_dma_buf_map,
> .unmap_dma_buf = amdgpu_dma_buf_unmap,
> - .release = drm_gem_dmabuf_release,
> + .release = amdgpu_dma_buf_release,
> .begin_cpu_access = amdgpu_dma_buf_begin_cpu_access,
> .mmap = drm_gem_dmabuf_mmap,
> .vmap = drm_gem_dmabuf_vmap,
> @@ -265,8 +272,10 @@ struct dma_buf *amdgpu_gem_prime_export(struct drm_gem_object *gobj,
> return ERR_PTR(-EPERM);
>
> buf = drm_gem_prime_export(gobj, flags);
> - if (!IS_ERR(buf))
> + if (!IS_ERR(buf)) {
> buf->ops = &amdgpu_dmabuf_ops;
> + amdgpu_vm_bo_handle_shared(bo, false);
> + }
>
> return buf;
> }
> @@ -345,7 +354,7 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach)
> /* FIXME: This should be after the "if", but needs a fix to make sure
> * DMABuf imports are initialized in the right VM list.
> */
> - amdgpu_vm_bo_invalidate(adev, bo, false);
> + amdgpu_vm_bo_invalidate(bo, NULL, false);
> if (!bo->tbo.resource || bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
> return;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index 1f22b4208729..1cd28fc1dcf5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -843,7 +843,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
> int amdgpu_gem_op_ioctl(struct drm_device *dev, void *data,
> struct drm_file *filp)
> {
> - struct amdgpu_device *adev = drm_to_adev(dev);
> struct drm_amdgpu_gem_op *args = data;
> struct drm_gem_object *gobj;
> struct amdgpu_vm_bo_base *base;
> @@ -903,7 +902,7 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void *data,
> robj->allowed_domains |= AMDGPU_GEM_DOMAIN_GTT;
>
> if (robj->flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
> - amdgpu_vm_bo_invalidate(adev, robj, true);
> + amdgpu_vm_bo_invalidate(robj, NULL, true);
>
> amdgpu_bo_unreserve(robj);
> break;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index bcf25c7e85e0..469bb600bcaa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -37,6 +37,7 @@
> #include <drm/amdgpu_drm.h>
> #include <drm/drm_cache.h>
> #include "amdgpu.h"
> +#include "amdgpu_vm.h"
> #include "amdgpu_trace.h"
> #include "amdgpu_amdkfd.h"
>
> @@ -1258,7 +1259,6 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
> bool evict,
> struct ttm_resource *new_mem)
> {
> - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
> struct ttm_resource *old_mem = bo->resource;
> struct amdgpu_bo *abo;
>
> @@ -1266,7 +1266,7 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
> return;
>
> abo = ttm_to_amdgpu_bo(bo);
> - amdgpu_vm_bo_invalidate(adev, abo, evict);
> + amdgpu_vm_bo_invalidate(abo, new_mem, evict);
>
> amdgpu_bo_kunmap(abo);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 218919fc13ec..c65463f66cb2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -135,27 +135,6 @@ struct amdgpu_bo_vm {
> struct amdgpu_vm_bo_base entries[];
> };
>
> -struct amdgpu_mem_stats {
> - /* current VRAM usage */
> - uint64_t vram;
> - /* current shared VRAM usage */
> - uint64_t vram_shared;
> - /* current GTT usage */
> - uint64_t gtt;
> - /* current shared GTT usage */
> - uint64_t gtt_shared;
> - /* current system memory usage */
> - uint64_t cpu;
> - /* current shared system memory usage */
> - uint64_t cpu_shared;
> - /* sum of evicted buffers */
> - uint64_t evicted_vram;
> - /* how much userspace asked for */
> - uint64_t requested_vram;
> - /* how much userspace asked for */
> - uint64_t requested_gtt;
> -};
> -
> static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
> {
> return container_of(tbo, struct amdgpu_bo, tbo);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3abfa66d72a2..8dd239912bb9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -36,6 +36,7 @@
> #include <drm/ttm/ttm_tt.h>
> #include <drm/drm_exec.h>
> #include "amdgpu.h"
> +#include "amdgpu_vm.h"
> #include "amdgpu_trace.h"
> #include "amdgpu_amdkfd.h"
> #include "amdgpu_gmc.h"
> @@ -310,6 +311,111 @@ static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm)
> spin_unlock(&vm->status_lock);
> }
>
> +static void amdgpu_vm_stats_update_shared(struct amdgpu_vm_bo_base *base,
> + bool unshare)
> +{
> + struct amdgpu_vm *vm = base->vm;
> + struct amdgpu_bo *bo = base->bo;
> + struct amdgpu_mem_stats *stats;
> + uint64_t size;
> +
> + if (!vm || !bo)
> + return;
> +
> + spin_lock(&vm->status_lock);
> + stats = &vm->stats;
> + size = amdgpu_bo_size(bo);
> + switch (bo->tbo.resource->mem_type) {
> + case TTM_PL_VRAM:
> + stats->vram_shared += unshare ? -size : size;
> + break;
> + case TTM_PL_TT:
> + stats->gtt_shared += unshare ? -size : size;
> + break;
> + case TTM_PL_SYSTEM:
> + default:
> + stats->cpu_shared += unshare ? -size : size;
> + break;
> + }
> + spin_unlock(&vm->status_lock);
> +}
> +
> +void amdgpu_vm_stats_update(struct amdgpu_vm_bo_base *base,
> + struct ttm_resource *new_mem,
> + struct ttm_resource *old_mem)
> +{
> + struct amdgpu_vm *vm = base->vm;
> + struct amdgpu_bo *bo = base->bo;
> + struct amdgpu_mem_stats *stats;
> + uint64_t size;
> + bool shared;
> +
> + if (!vm || !bo || (!new_mem && !old_mem))
> + return;
> + spin_lock(&vm->status_lock);
> +
> + stats = &vm->stats;
> + size = amdgpu_bo_size(bo);
> + shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base);
> +
> + if (old_mem) {
> + switch (old_mem->mem_type) {
> + case TTM_PL_VRAM:
> + stats->vram -= size;
> + if (shared)
> + stats->vram_shared -= size;
> + break;
> + case TTM_PL_TT:
> + stats->gtt -= size;
> + if (shared)
> + stats->gtt_shared -= size;
> + break;
> + case TTM_PL_SYSTEM:
> + default:
> + stats->cpu -= size;
> + if (shared)
> + stats->cpu_shared -= size;
> + break;
> + }
> + }
> + if (new_mem) {
> + switch (new_mem->mem_type) {
> + case TTM_PL_VRAM:
> + stats->vram += size;
> + if (shared)
> + stats->vram_shared += size;
> + break;
> + case TTM_PL_TT:
> + stats->gtt += size;
> + if (shared)
> + stats->gtt_shared += size;
> + break;
> + case TTM_PL_SYSTEM:
> + default:
> + stats->cpu += size;
> + if (shared)
> + stats->cpu_shared += size;
> + break;
> + }
> + }
> + if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) {
> + if (!old_mem)
> + stats->requested_vram += size;
> + else if (old_mem->mem_type != TTM_PL_VRAM)
> + stats->evicted_vram -= size;
> + if (!new_mem)
> + stats->requested_vram -= size;
> + else if (new_mem->mem_type != TTM_PL_VRAM)
> + stats->evicted_vram += size;
> + } else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT) {
> + if (!old_mem)
> + stats->requested_gtt += size;
> + if (!new_mem)
> + stats->requested_gtt -= size;
> + }
> + spin_unlock(&vm->status_lock);
> +}
> +
> /**
> * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
> *
> @@ -332,6 +438,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
> return;
> base->next = bo->vm_bo;
> bo->vm_bo = base;
> + amdgpu_vm_stats_update(base, bo->tbo.resource, NULL);
>
> if (!amdgpu_vm_is_bo_always_valid(vm, bo))
> return;
> @@ -1088,51 +1195,11 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> return r;
> }
>
> -static void amdgpu_vm_bo_get_memory(struct amdgpu_bo_va *bo_va,
> - struct amdgpu_mem_stats *stats)
> -{
> - struct amdgpu_vm *vm = bo_va->base.vm;
> - struct amdgpu_bo *bo = bo_va->base.bo;
> -
> - if (!bo)
> - return;
> -
> - /*
> - * For now ignore BOs which are currently locked and potentially
> - * changing their location.
> - */
> - if (!amdgpu_vm_is_bo_always_valid(vm, bo) &&
> - !dma_resv_trylock(bo->tbo.base.resv))
> - return;
> -
> - amdgpu_bo_get_memory(bo, stats);
> - if (!amdgpu_vm_is_bo_always_valid(vm, bo))
> - dma_resv_unlock(bo->tbo.base.resv);
> -}
> -
> void amdgpu_vm_get_memory(struct amdgpu_vm *vm,
> struct amdgpu_mem_stats *stats)
> {
> - struct amdgpu_bo_va *bo_va, *tmp;
> -
> spin_lock(&vm->status_lock);
> - list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status)
> - amdgpu_vm_bo_get_memory(bo_va, stats);
> -
> - list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status)
> - amdgpu_vm_bo_get_memory(bo_va, stats);
> -
> - list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status)
> - amdgpu_vm_bo_get_memory(bo_va, stats);
> -
> - list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status)
> - amdgpu_vm_bo_get_memory(bo_va, stats);
> -
> - list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status)
> - amdgpu_vm_bo_get_memory(bo_va, stats);
> -
> - list_for_each_entry_safe(bo_va, tmp, &vm->done, base.vm_status)
> - amdgpu_vm_bo_get_memory(bo_va, stats);
> + *stats = vm->stats;
> spin_unlock(&vm->status_lock);
> }
>
> @@ -2043,6 +2110,7 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev,
> if (*base != &bo_va->base)
> continue;
>
> + amdgpu_vm_stats_update(*base, NULL, bo->tbo.resource);
> *base = bo_va->base.next;
> break;
> }
> @@ -2108,17 +2176,37 @@ bool amdgpu_vm_evictable(struct amdgpu_bo *bo)
> return true;
> }
>
> +/**
> + * amdgpu_vm_bo_handle_shared - called when bo gets shared/unshared
> + *
> + * @bo: amdgpu buffer object
> + * @unshare: bo got unshared
> + *
> + * For now the only thing this does is to update the per VM stats
> + */
> +void amdgpu_vm_bo_handle_shared(struct amdgpu_bo *bo, bool unshare)
> +{
> +
> + struct amdgpu_vm_bo_base *bo_base;
> +
> + if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo))
> + bo = bo->parent;
> +
> + for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next)
> + amdgpu_vm_stats_update_shared(bo_base, unshare);
> +}
> +
> /**
> * amdgpu_vm_bo_invalidate - mark the bo as invalid
> *
> - * @adev: amdgpu_device pointer
> * @bo: amdgpu buffer object
> + * @new_mem: the new placement of the BO move, NULL if just invalidate
> * @evicted: is the BO evicted
> *
> * Mark @bo as invalid.
> */
> -void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
> - struct amdgpu_bo *bo, bool evicted)
> +void amdgpu_vm_bo_invalidate(struct amdgpu_bo *bo, struct ttm_resource *new_mem,
> + bool evicted)
> {
> struct amdgpu_vm_bo_base *bo_base;
>
> @@ -2129,6 +2217,9 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
> for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) {
> struct amdgpu_vm *vm = bo_base->vm;
>
> + if (new_mem)
> + amdgpu_vm_stats_update(bo_base, new_mem, bo->tbo.resource);
> +
Ok that looks extremely ugly. Please just add a separate function and
call that from the TTM move function.
> if (evicted && amdgpu_vm_is_bo_always_valid(vm, bo)) {
> amdgpu_vm_bo_evicted(bo_base);
> continue;
> @@ -2640,6 +2731,10 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
> }
> }
>
> + if (memchr_inv(&vm->stats, 0, sizeof(vm->stats)))
Please either drop that or compare each memory stat variable separately.
Byte by byte compares are really frowned upon.
Apart from that looks rather good to me,
Christian.
> + dev_err(adev->dev, "VM memory stats is non-zero when fini\n");
> + else
> + dev_dbg(adev->dev, "VM memory stats is zero when fini\n");
> }
>
> /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 046949c4b695..13e1ec4deb05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -42,7 +42,27 @@ struct amdgpu_bo_va;
> struct amdgpu_job;
> struct amdgpu_bo_list_entry;
> struct amdgpu_bo_vm;
> -struct amdgpu_mem_stats;
> +
> +struct amdgpu_mem_stats {
> + /* current VRAM usage */
> + uint64_t vram;
> + /* current shared VRAM usage */
> + uint64_t vram_shared;
> + /* current GTT usage */
> + uint64_t gtt;
> + /* current shared GTT usage */
> + uint64_t gtt_shared;
> + /* current system memory usage */
> + uint64_t cpu;
> + /* current shared system memory usage */
> + uint64_t cpu_shared;
> + /* sum of evicted buffers */
> + uint64_t evicted_vram;
> + /* how much userspace asked for */
> + uint64_t requested_vram;
> + /* how much userspace asked for */
> + uint64_t requested_gtt;
> +};
>
> /*
> * GPUVM handling
> @@ -336,6 +356,8 @@ struct amdgpu_vm {
> /* Lock to protect vm_bo add/del/move on all lists of vm */
> spinlock_t status_lock;
>
> + struct amdgpu_mem_stats stats;
> +
> /* Per-VM and PT BOs who needs a validation */
> struct list_head evicted;
>
> @@ -514,8 +536,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
> struct amdgpu_bo_va *bo_va,
> bool clear);
> bool amdgpu_vm_evictable(struct amdgpu_bo *bo);
> -void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
> - struct amdgpu_bo *bo, bool evicted);
> +void amdgpu_vm_stats_update(struct amdgpu_vm_bo_base *base,
> + struct ttm_resource *new_mem,
> + struct ttm_resource *old_mem);
> +void amdgpu_vm_bo_handle_shared(struct amdgpu_bo *bo, bool unshare);
> +void amdgpu_vm_bo_invalidate(struct amdgpu_bo *bo, struct ttm_resource *new_mem,
> + bool evicted);
> uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr);
> struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm,
> struct amdgpu_bo *bo);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index e39d6e7643bf..841a575cb18d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -586,6 +586,7 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
> if (!entry->bo)
> return;
>
> + amdgpu_vm_stats_update(entry, NULL, entry->bo->tbo.resource);
> entry->bo->vm_bo = NULL;
> shadow = amdgpu_bo_shadowed(entry->bo);
> if (shadow) {
More information about the amd-gfx
mailing list