[PATCH] drm/amdgpu: Add debugfs entry for printing VM info

Christian König christian.koenig at amd.com
Thu Oct 8 11:35:20 UTC 2020


On 08.10.20 at 13:18, Mihir Patel wrote:
> From: Mihir Bhogilal Patel <Mihir.Patel at amd.com>
>
> Create new debugfs entry to print memory info using VM buffer
> objects.
>
> Note: Early upload for discussion and review.
> Requires further work on:
> - Identifying the correct list for swap information
> - Review of proper locking
> - Consolidated memory utilization info (totals etc.)

Well, that looks like a good start to me; a few nitpicks and a bigger 
locking issue below.

> Signed-off-by: Mihir Bhogilal Patel <Mihir.Patel at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  34 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      | 123 ++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h      |   4 +
>   3 files changed, 161 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 2d125b8b15ee..74028b7d673c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1335,11 +1335,45 @@ static int amdgpu_debugfs_evict_gtt(struct seq_file *m, void *data)
>   	return 0;
>   }
>   
> +static int amdgpu_debugfs_vm_info(struct seq_file *m, void *data)
> +{
> +	struct drm_info_node *node = (struct drm_info_node *)m->private;
> +	struct drm_device *dev = node->minor->dev;
> +	struct drm_file *file;
> +	int r;
> +
> +#ifndef HAVE_DRM_DEVICE_FILELIST_MUTEX
> +	r = mutex_lock_interruptible(&dev->struct_mutex);
> +#else
> +	r = mutex_lock_interruptible(&dev->filelist_mutex);
> +#endif

Where is this coming from? On what branch have you based this patch?

In general we base new work on amd-staging-drm-next or something 
branched off from there, but this looks like some DKMS branch.
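
Upstream struct drm_device only has the filelist_mutex, so once this is 
rebased on amd-staging-drm-next the #ifndef should simply go away and 
this should boil down to something like (untested sketch):

	r = mutex_lock_interruptible(&dev->filelist_mutex);
	if (r)
		return r;

	/* ... walk dev->filelist ... */

	mutex_unlock(&dev->filelist_mutex);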

> +	if (r)
> +		return r;
> +
> +	list_for_each_entry(file, &dev->filelist, lhead) {
> +		struct amdgpu_fpriv *fpriv = file->driver_priv;
> +		struct amdgpu_vm *vm = &fpriv->vm;
> +
> +		seq_printf(m, "pid:%d\tProcess:%s ----------\n",
> +				vm->task_info.pid, vm->task_info.process_name);

We should probably add uid here as well, but this can come later.
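
If you want to sketch it already, something along the lines of what 
drm_clients_info() does should work, e.g. (untested, just to illustrate):

	struct task_struct *task;
	kuid_t uid;

	/* needs linux/sched.h and linux/cred.h */
	rcu_read_lock();
	task = pid_task(file->pid, PIDTYPE_PID);
	uid = task ? __task_cred(task)->euid : GLOBAL_ROOT_UID;
	seq_printf(m, " uid:%u", from_kuid_munged(seq_user_ns(m), uid));
	rcu_read_unlock();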

> +		amdgpu_debugfs_vm_bo_info(vm, m);
> +	}
> +
> +#ifndef HAVE_DRM_DEVICE_FILELIST_MUTEX
> +	mutex_unlock(&dev->struct_mutex);
> +#else
> +	mutex_unlock(&dev->filelist_mutex);
> +#endif
> +
> +	return 0;
> +}
> +
>   static const struct drm_info_list amdgpu_debugfs_list[] = {
>   	{"amdgpu_vbios", amdgpu_debugfs_get_vbios_dump},
>   	{"amdgpu_test_ib", &amdgpu_debugfs_test_ib},
>   	{"amdgpu_evict_vram", &amdgpu_debugfs_evict_vram},
>   	{"amdgpu_evict_gtt", &amdgpu_debugfs_evict_gtt},
> +	{"amdgpu_vm_info", &amdgpu_debugfs_vm_info},
>   };
>   
>   static void amdgpu_ib_preempt_fences_swap(struct amdgpu_ring *ring,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3cd949aad500..f2f77fd3caa0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -3392,3 +3392,126 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, unsigned int pasid,
>   
>   	return false;
>   }
> +
> +#if defined(CONFIG_DEBUG_FS)
> +#define amdgpu_debugfs_vm_bo_print_flag(m, bo, flag)		\
> +	do {							\
> +		if (bo->flags & (AMDGPU_GEM_CREATE_ ## flag)) {	\
> +			seq_printf((m), " " #flag);		\
> +		}						\
> +	} while (0)

We already have something very similar in amdgpu_gem.c.

I suggest adding a function amdgpu_bo_print_flags() to amdgpu_bo.c and 
then calling it from both here and amdgpu_gem.c.
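
Roughly something like this (just a sketch, name and exact placement 
are up for debate):

void amdgpu_bo_print_flags(struct seq_file *m, struct amdgpu_bo *bo)
{
#define amdgpu_bo_print_flag(m, bo, flag)			\
	do {							\
		if (bo->flags & (AMDGPU_GEM_CREATE_ ## flag))	\
			seq_printf((m), " " #flag);		\
	} while (0)

	amdgpu_bo_print_flag(m, bo, CPU_ACCESS_REQUIRED);
	amdgpu_bo_print_flag(m, bo, NO_CPU_ACCESS);
	amdgpu_bo_print_flag(m, bo, CPU_GTT_USWC);
	amdgpu_bo_print_flag(m, bo, VRAM_CLEARED);
	amdgpu_bo_print_flag(m, bo, SHADOW);
	amdgpu_bo_print_flag(m, bo, VRAM_CONTIGUOUS);
	amdgpu_bo_print_flag(m, bo, VM_ALWAYS_VALID);
	amdgpu_bo_print_flag(m, bo, EXPLICIT_SYNC);

#undef amdgpu_bo_print_flag
}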

> +
> +/**
> + * amdgpu_debugfs_print_bo_info - print BO info in debugfs file
> + *
> + * @id: Index or Id of the BO
> + * @bo: Requested BO for printing info
> + * @data: debugfs file
> + *
> + * Print BO information in debugfs file
> + *
> + * Returns:
> + * Size of the BO in bytes.
> + */
> +static unsigned long amdgpu_debugfs_print_bo_info(int id, struct amdgpu_bo *bo, void *data)
> +{
> +	struct seq_file *m = data;
> +	struct dma_buf_attachment *attachment;
> +	struct dma_buf *dma_buf;
> +	unsigned int domain;
> +	const char *placement;
> +	unsigned int pin_count;
> +	unsigned long size;
> +
> +	domain = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type);
> +	switch (domain) {
> +	case AMDGPU_GEM_DOMAIN_VRAM:
> +		placement = "VRAM";
> +		break;
> +	case AMDGPU_GEM_DOMAIN_GTT:
> +		placement = " GTT";
> +		break;
> +	case AMDGPU_GEM_DOMAIN_CPU:
> +	default:
> +		placement = " CPU";
> +		break;
> +	}

Same thing as with the flags, we should have only one place for this.
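
Same pattern, e.g. a small shared helper (sketch, the name is made up):

static const char *amdgpu_bo_placement_name(struct amdgpu_bo *bo)
{
	switch (amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type)) {
	case AMDGPU_GEM_DOMAIN_VRAM:
		return "VRAM";
	case AMDGPU_GEM_DOMAIN_GTT:
		return " GTT";
	case AMDGPU_GEM_DOMAIN_CPU:
	default:
		return " CPU";
	}
}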

> +
> +	size = amdgpu_bo_size(bo);
> +	seq_printf(m, "\t\t0x%08x: %12lu byte %s",
> +		   id, size, placement);
> +
> +	pin_count = READ_ONCE(bo->pin_count);
> +	if (pin_count)
> +		seq_printf(m, " pin count %d", pin_count);
> +
> +	dma_buf = READ_ONCE(bo->tbo.base.dma_buf);
> +	attachment = READ_ONCE(bo->tbo.base.import_attach);
> +
> +	if (attachment)
> +		seq_printf(m, " imported from %p", dma_buf);
> +	else if (dma_buf)
> +		seq_printf(m, " exported as %p", dma_buf);
> +
> +	amdgpu_debugfs_vm_bo_print_flag(m, bo, CPU_ACCESS_REQUIRED);
> +	amdgpu_debugfs_vm_bo_print_flag(m, bo, NO_CPU_ACCESS);
> +	amdgpu_debugfs_vm_bo_print_flag(m, bo, CPU_GTT_USWC);
> +	amdgpu_debugfs_vm_bo_print_flag(m, bo, VRAM_CLEARED);
> +	amdgpu_debugfs_vm_bo_print_flag(m, bo, SHADOW);
> +	amdgpu_debugfs_vm_bo_print_flag(m, bo, VRAM_CONTIGUOUS);
> +	amdgpu_debugfs_vm_bo_print_flag(m, bo, VM_ALWAYS_VALID);
> +	amdgpu_debugfs_vm_bo_print_flag(m, bo, EXPLICIT_SYNC);
> +
> +	seq_printf(m, "\n");
> +
> +	return size;
> +}
> +
> +/**
> + * amdgpu_debugfs_vm_bo_info  - print BO info for the VM
> + *
> + * @vm: Requested VM for printing BO info
> + * @data: debugfs file
> + *
> + * Print BO information in debugfs file for the VM
> + */
> +void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, void *data)
> +{
> +	struct seq_file *m = data;
> +
> +	struct amdgpu_bo_va_mapping *mapping;
> +	struct amdgpu_bo_va *bo_va, *tmp;
> +	unsigned long long total_mapped_size = 0;
> +	unsigned long long total_idle_size = 0;
> +	unsigned int total_mapped_objs = 0;
> +	unsigned int total_idle_objs = 0;
> +	unsigned int id = 0;
> +
> +	spin_lock(&vm->invalidated_lock);

That isn't sufficient. You need to reserve the root page directory to 
make sure that nobody is modifying the list of BOs while we dump it here.

We usually do this outside of the VM code itself.
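
In other words grab the lock in amdgpu_debugfs_vm_info() before calling 
into the VM code, roughly (untested):

	list_for_each_entry(file, &dev->filelist, lhead) {
		struct amdgpu_fpriv *fpriv = file->driver_priv;
		struct amdgpu_vm *vm = &fpriv->vm;

		/* reserve the root PD so the BO lists can't change under us */
		r = amdgpu_bo_reserve(vm->root.base.bo, true);
		if (r)
			break;
		amdgpu_debugfs_vm_bo_info(vm, m);
		amdgpu_bo_unreserve(vm->root.base.bo);
	}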

> +	/* Print BO info for all VA mappings */
> +	seq_printf(m, "\tMapped BOs:\n");
> +	for (mapping = amdgpu_vm_it_iter_first(&vm->va, 0, U64_MAX); mapping;
> +	     mapping = amdgpu_vm_it_iter_next(mapping, 0, U64_MAX)) {
> +		if (mapping->bo_va && mapping->bo_va->base.bo)
> +			total_mapped_size += amdgpu_debugfs_print_bo_info(id++,
> +							mapping->bo_va->base.bo, m);
> +	}
> +
> +	total_mapped_objs = id;
> +	id = 0;
> +	/* Print info for Idle BOs */
> +	seq_printf(m, "\tIdle BOs:\n");
> +	list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) {
> +		if (bo_va && bo_va->base.bo)
> +			total_idle_size += amdgpu_debugfs_print_bo_info(id++,
> +								bo_va->base.bo, m);
> +	}

This is quite misleading. You should either print the mappings or the BO 
state, but not both.

I suggest using only the BO state, but for that you need to handle the 
evicted, relocated, moved, idle and invalidated lists. See amdgpu_vm.h 
for all the states.

Regards,
Christian.

> +	spin_unlock(&vm->invalidated_lock);
> +
> +	total_idle_objs = id;
> +	seq_printf(m, "\tTotal mapped size:  %12llu\n", total_mapped_size);
> +	seq_printf(m, "\tTotal mapped objs:  %12d\n", total_mapped_objs);
> +	seq_printf(m, "\tTotal idle size:    %12llu\n", total_idle_size);
> +	seq_printf(m, "\tTotal idle objs:    %12d\n", total_idle_objs);
> +}
> +#endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 7c46937c1c0e..a64465c72c9c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -441,4 +441,8 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>   				struct amdgpu_vm *vm);
>   void amdgpu_vm_del_from_lru_notify(struct ttm_buffer_object *bo);
>   
> +#if defined(CONFIG_DEBUG_FS)
> +void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, void *data);
> +#endif
> +
>   #endif


