[PATCH v3] drm/vc4: Add the DRM_IOCTL_VC4_GEM_MADVISE ioctl

Wed Oct 18 00:16:29 UTC 2017

Boris Brezillon <boris.brezillon at free-electrons.com> writes:

> This ioctl will allow us to purge inactive userspace buffers when the
> system is running out of contiguous memory.
>
> For now, the purge logic is rather dumb in that it does not try to
> release only the amount of BO needed to meet the last CMA alloc request
> but instead purges all objects placed in the purgeable pool as soon as
> we experience a CMA allocation failure.
>
> Note that the in-kernel BO cache is always purged before the purgeable
> cache because those objects are known to be unused while objects marked
> as purgeable by a userspace application/library might have to be
> restored when they are marked back as unpurgeable, which can be
> expensive.
>
> Signed-off-by: Boris Brezillon <boris.brezillon at free-electrons.com>

Looking good!  Just a couple things I'd like to sort out before we
merge.

> ---
> Hello,
>
> Updates to libdrm, mesa and igt making use of this kernel feature can
> be found on my github repos [1][2][3].
>
> There's currently no debugfs hook to manually force a purge, but this
> is being discussed and might be added later on.
>
> Regards,
>
> Boris
>
> [1]https://github.com/bbrezillon/drm/tree/vc4/purgeable
> [2]https://github.com/bbrezillon/mesa/tree/vc4/purgeable
> [3]https://github.com/bbrezillon/igt/tree/vc4/purgeable
>
> Changes in v3:
> - Use %zd formatters when printing size_t values
> - rework detection of BOs that do not support the MADV ioctl
> - replace DRM_ERROR by DRM_DEBUG in the ioctl path
> - do not explicitly memzero purged BO mem
> - check that pad is set to zero in the madv ioctl function
>
> Changes in v2:
> - Do not re-allocate BO's memory after when WILLNEED is asked after a purge
> - Add purgeable/purged stats to debugfs and dumpstats
> - Pad the drm_vc4_gem_madvise to make it 64-bit aligned
> - Forbid madvise calls on internal BO objects (everything that is not
>   meant to be exposed to userspace)
> - Do not increment/decrement usecnt on internal BOs (they cannot be marked
>   purgeable, so doing that is useless and error prone)
> - Fix a few bugs related to usecnt refcounting
> ---
>  drivers/gpu/drm/vc4/vc4_bo.c    | 292 ++++++++++++++++++++++++++++++++++++++--
>  drivers/gpu/drm/vc4/vc4_drv.c   |  10 +-
>  drivers/gpu/drm/vc4/vc4_drv.h   |  30 +++++
>  drivers/gpu/drm/vc4/vc4_gem.c   | 156 ++++++++++++++++++++-
>  drivers/gpu/drm/vc4/vc4_plane.c |  20 +++
>  include/uapi/drm/vc4_drm.h      |  19 +++
>  6 files changed, 512 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
> index 3afdbf4bc10b..e4d13bbef54f 100644
> --- a/drivers/gpu/drm/vc4/vc4_bo.c
> +++ b/drivers/gpu/drm/vc4/vc4_bo.c
> @@ -42,6 +42,8 @@ static bool is_user_label(int label)
>  
>  static void vc4_bo_stats_dump(struct vc4_dev *vc4)
>  {
> +	size_t purgeable_size, purged_size;
> +	unsigned int npurgeable, npurged;
>  	int i;
>  
>  	for (i = 0; i < vc4->num_labels; i++) {
> @@ -53,6 +55,21 @@ static void vc4_bo_stats_dump(struct vc4_dev *vc4)
>  			 vc4->bo_labels[i].size_allocated / 1024,
>  			 vc4->bo_labels[i].num_allocated);
>  	}
> +
> +	mutex_lock(&vc4->purgeable.lock);
> +	npurgeable = vc4->purgeable.num;
> +	purgeable_size = vc4->purgeable.size;
> +	purged_size = vc4->purgeable.purged_size;
> +	npurged = vc4->purgeable.purged_num;
> +	mutex_unlock(&vc4->purgeable.lock);
> +
> +	if (npurgeable)
> +		DRM_INFO("%30s: %6zdkb BOs (%d)\n", "userspace BO cache",
> +			 purgeable_size / 1024, npurgeable);
> +
> +	if (npurged)
> +		DRM_INFO("%30s: %6zdkb BOs (%d)\n", "total purged BO",
> +			 purged_size / 1024, npurged);
>  }
>  
>  #ifdef CONFIG_DEBUG_FS
> @@ -61,6 +78,8 @@ int vc4_bo_stats_debugfs(struct seq_file *m, void *unused)
>  	struct drm_info_node *node = (struct drm_info_node *)m->private;
>  	struct drm_device *dev = node->minor->dev;
>  	struct vc4_dev *vc4 = to_vc4_dev(dev);
> +	size_t purgeable_size, purged_size;
> +	unsigned int npurgeable, npurged;
>  	int i;
>  
>  	mutex_lock(&vc4->bo_lock);
> @@ -75,6 +94,21 @@ int vc4_bo_stats_debugfs(struct seq_file *m, void *unused)
>  	}
>  	mutex_unlock(&vc4->bo_lock);
>  
> +	mutex_lock(&vc4->purgeable.lock);
> +	npurgeable = vc4->purgeable.num;
> +	purgeable_size = vc4->purgeable.size;
> +	purged_size = vc4->purgeable.purged_size;
> +	npurged = vc4->purgeable.purged_num;
> +	mutex_unlock(&vc4->purgeable.lock);
> +
> +	if (npurgeable)
> +		seq_printf(m, "%30s: %6dkb BOs (%d)\n", "userspace BO cache",
> +			   purgeable_size / 1024, npurgeable);
> +
> +	if (npurged)
> +		seq_printf(m, "%30s: %6dkb BOs (%d)\n", "total purged BO",
> +			   purged_size / 1024, npurged);

I think you could just do these seq_printfs under the lock, instead of
taking a snapshot ahead of time.  Same for the dump.

> +static void vc4_bo_remove_from_purgeable_pool_locked(struct vc4_bo *bo)
> +{
> +	struct vc4_dev *vc4 = to_vc4_dev(bo->base.base.dev);
> +
> +	/* list_del_init() is used here because the caller might release
> +	 * the purgeable lock in order to acquire the madv one and update the
> +	 * madv status.
> +	 * During this short period of time a user might decide to mark
> +	 * the BO as unpurgeable, and if bo->madv is set to
> +	 * VC4_MADV_DONTNEED it will try to remove the BO from the
> +	 * purgeable list which will fail if the ->next/prev fields
> +	 * are set to LIST_POISON1/LIST_POISON2 (which is what
> +	 * list_del() does).
> +	 * Re-initializing the list element guarantees that list_del()
> +	 * will work correctly even if it's a NOP.
> +	 */
> +	list_del_init(&bo->size_head);
> +	vc4->purgeable.num--;
> +	vc4->purgeable.size -= bo->base.base.size;
> +}
> +
> +void vc4_bo_remove_from_purgeable_pool(struct vc4_bo *bo)
> +{
> +	struct vc4_dev *vc4 = to_vc4_dev(bo->base.base.dev);
> +
> +	mutex_lock(&vc4->purgeable.lock);
> +	vc4_bo_remove_from_purgeable_pool_locked(bo);
> +	mutex_unlock(&vc4->purgeable.lock);
> +}
> +
> +static void vc4_bo_purge(struct drm_gem_object *obj)
> +{
> +	struct vc4_bo *bo = to_vc4_bo(obj);
> +	struct drm_device *dev = obj->dev;
> +
> +	WARN_ON(!mutex_is_locked(&bo->madv_lock));
> +	WARN_ON(bo->madv != VC4_MADV_DONTNEED);
> +
> +	drm_vma_node_unmap(&obj->vma_node, dev->anon_inode->i_mapping);
> +
> +	dma_free_wc(dev->dev, obj->size, bo->base.vaddr, bo->base.paddr);
> +	bo->base.vaddr = NULL;
> +	bo->madv = __VC4_MADV_PURGED;
> +}
> +
> +static void vc4_bo_userspace_cache_purge(struct drm_device *dev)
> +{
> +	struct vc4_dev *vc4 = to_vc4_dev(dev);
> +
> +	mutex_lock(&vc4->purgeable.lock);
> +	while (!list_empty(&vc4->purgeable.list)) {
> +		struct vc4_bo *bo = list_first_entry(&vc4->purgeable.list,
> +						     struct vc4_bo, size_head);
> +		struct drm_gem_object *obj = &bo->base.base;
> +		size_t purged_size = 0;
> +
> +		vc4_bo_remove_from_purgeable_pool_locked(bo);
> +
> +		/* Release the purgeable lock while we're purging the BO so
> +		 * that other people can continue inserting things in the
> +		 * purgeable pool without having to wait for all BOs to be
> +		 * purged.
> +		 */
> +		mutex_unlock(&vc4->purgeable.lock);
> +		mutex_lock(&bo->madv_lock);
> +
> +		/* Since we released the purgeable pool lock before acquiring
> +		 * the BO madv one, vc4_gem_madvise_ioctl() may have changed
> +		 * the bo->madv status. In this case, just skip this entry.
> +		 */
> +		if (bo->madv == VC4_MADV_DONTNEED) {
> +			purged_size = bo->base.base.size;
> +			vc4_bo_purge(obj);
> +		}

Possible ugly race here?

thread 1               thread 2
----------------------------------------------------
Idle my BO
Mark DONTNEED
                       Start purge
                       pull BO off of purgeable list
Mark NEED
Do a submit CL
Mark DONTNEED
                       purge the BO

Can we plug that by checking the refcount along with checking that we're
DONTNEED?

> +		mutex_unlock(&bo->madv_lock);
> +		mutex_lock(&vc4->purgeable.lock);
> +
> +		if (purged_size) {
> +			vc4->purgeable.purged_size += purged_size;
> +			vc4->purgeable.purged_num++;
> +		}
> +	}
> +	mutex_unlock(&vc4->purgeable.lock);
> +}

>  int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
>  {
>  	struct drm_gem_object *gem_obj;
> +	unsigned long vm_pgoff;
>  	struct vc4_bo *bo;
>  	int ret;
>  
> @@ -507,16 +759,36 @@ int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
>  		return -EINVAL;
>  	}
>  
> +	if (bo->madv != VC4_MADV_WILLNEED) {
> +		DRM_DEBUG("mmaping of %s BO not allowed\n",
> +			  bo->madv == VC4_MADV_DONTNEED ?
> +			  "purgeable" : "purged");
> +		return -EINVAL;
> +	}
> +
>  	/*
>  	 * Clear the VM_PFNMAP flag that was set by drm_gem_mmap(), and set the
>  	 * vm_pgoff (used as a fake buffer offset by DRM) to 0 as we want to map
>  	 * the whole buffer.
>  	 */
>  	vma->vm_flags &= ~VM_PFNMAP;
> -	vma->vm_pgoff = 0;
>  
> +	/* This ->vm_pgoff dance is needed to make all parties happy:
> +	 * - dma_mmap_wc() uses ->vm_pgoff as an offset within the allocated
> +	 *   mem-region, hence the need to set it to zero (the value set by
> +	 *   the DRM core is a virtual offset encoding the GEM object-id)
> +	 * - the mmap() core logic needs ->vm_pgoff to be restored to its
> +	 *   initial before returning from this function because it encodes the
> +	 *   offset of this GEM in the dev->anon_inode pseudo-file and this
> +	 *   information will be used when we invalidate userspace mappings
> +	 *   with drm_vma_node_unmap() (called from vc4_gem_purge()).
> +	 */
> +	vm_pgoff = vma->vm_pgoff;
> +	vma->vm_pgoff = 0;
>  	ret = dma_mmap_wc(bo->base.base.dev->dev, vma, bo->base.vaddr,
>  			  bo->base.paddr, vma->vm_end - vma->vm_start);
> +	vma->vm_pgoff = vm_pgoff;

Thanks for the explanation here!

> diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
> index 87f2d8e5c134..f54c1f5ce47f 100644
> --- a/drivers/gpu/drm/vc4/vc4_drv.h
> +++ b/drivers/gpu/drm/vc4/vc4_drv.h
> @@ -74,6 +74,19 @@ struct vc4_dev {
>  	/* Protects bo_cache and bo_labels. */
>  	struct mutex bo_lock;
>  
> +	/* Purgeable BO pool. All BOs in this pool can have their memory
> +	 * reclaimed if the driver is unable to allocate new BOs. We also
> +	 * keep stats related to the purge mechanism here.
> +	 */
> +	struct {
> +		struct list_head list;
> +		unsigned int num;
> +		size_t size;
> +		unsigned int purged_num;
> +		size_t purged_size;
> +		struct mutex lock;
> +	} purgeable;
> +
>  	uint64_t dma_fence_context;
>  
>  	/* Sequence number for the last job queued in bin_job_list.
> @@ -192,6 +205,16 @@ struct vc4_bo {
>  	 * for user-allocated labels.
>  	 */
>  	int label;
> +
> +	/* Count the number of active users. This is needed to determine
> +	 * whether we can the BO to the purgeable or not (when the BO is used

Something went weird here.  Maybe you wanted "whether we can move the BO
to the purgeable list or not"?

> +	 * by the GPU or the display engine we can't purge it).
> +	 */
> +	refcount_t usecnt;
> +
> +	/* Store purgeable/purged state here */
> +	u32 madv;
> +	struct mutex madv_lock;
>  };

> diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
> index d0c6bfb68c4e..50a0b2638a2f 100644
> --- a/drivers/gpu/drm/vc4/vc4_gem.c
> +++ b/drivers/gpu/drm/vc4/vc4_gem.c
> @@ -188,11 +188,22 @@ vc4_save_hang_state(struct drm_device *dev)
>  			continue;
>  
>  		for (j = 0; j < exec[i]->bo_count; j++) {
> +			bo = to_vc4_bo(&exec[i]->bo[j]->base);
> +
> +			/* Retain BOs just in case they were marked purgeable.
> +			 * This prevents the BO from being purged before
> +			 * someone had a chance to dump the hang state.
> +			 */
> +			WARN_ON(!refcount_read(&bo->usecnt));
> +			refcount_inc(&bo->usecnt);
>  			drm_gem_object_get(&exec[i]->bo[j]->base);
>  			kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base;
>  		}
>  
>  		list_for_each_entry(bo, &exec[i]->unref_list, unref_head) {
> +			/* No need to retain BOs coming from the ->unref_list
> +			 * because they are naturally unpurgeable.
> +			 */
>  			drm_gem_object_get(&bo->base.base);
>  			kernel_state->bo[j + prev_idx] = &bo->base.base;
>  			j++;
> @@ -233,6 +244,26 @@ vc4_save_hang_state(struct drm_device *dev)
>  	state->fdbgs = V3D_READ(V3D_FDBGS);
>  	state->errstat = V3D_READ(V3D_ERRSTAT);
>  
> +	/* We need to turn purgeable BOs into unpurgeable ones so that
> +	 * userspace has a chance to dump the hang state before the kernel
> +	 * decides to purge those BOs.
> +	 * Note that BO consistency at dump time cannot be guaranteed. For
> +	 * example, if the owner of these BOs decides to re-use them or mark
> +	 * them purgeable again there's nothing we can do to prevent it.
> +	 */
> +	for (i = 0; i < kernel_state->user_state.bo_count; i++) {
> +		struct vc4_bo *bo = to_vc4_bo(kernel_state->bo[i]);
> +
> +		if (bo->madv == __VC4_MADV_NOTSUPP)
> +			continue;
> +
> +		mutex_lock(&bo->madv_lock);
> +		if (!WARN_ON(bo->madv == __VC4_MADV_PURGED))
> +			bo->madv = VC4_MADV_WILLNEED;
> +		refcount_dec(&bo->usecnt);
> +		mutex_unlock(&bo->madv_lock);
> +	}

I like your solution for getting these buffers transitioned to WILLNEED.
In my previous comments, I had been thinking you were incrementing the
ref here and decrementing at hang state ioctl time, but you're actually
doing it all within save_hang_state.

> @@ -639,9 +670,6 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
>   * The command validator needs to reference BOs by their index within
>   * the submitted job's BO list.  This does the validation of the job's
>   * BO list and reference counting for the lifetime of the job.
> - *
> - * Note that this function doesn't need to unreference the BOs on
> - * failure, because that will happen at vc4_complete_exec() time.
>   */
>  static int
>  vc4_cl_lookup_bos(struct drm_device *dev,
> @@ -693,16 +721,47 @@ vc4_cl_lookup_bos(struct drm_device *dev,
>  			DRM_DEBUG("Failed to look up GEM BO %d: %d\n",
>  				  i, handles[i]);
>  			ret = -EINVAL;
> -			spin_unlock(&file_priv->table_lock);
> -			goto fail;
> +			break;
>  		}
> +
>  		drm_gem_object_get(bo);
>  		exec->bo[i] = (struct drm_gem_cma_object *)bo;
>  	}
>  	spin_unlock(&file_priv->table_lock);
>  
> +	if (ret)
> +		goto fail_put_bo;
> +
> +	for (i = 0; i < exec->bo_count; i++) {
> +		ret = vc4_bo_inc_usecnt(to_vc4_bo(&exec->bo[i]->base));
> +		if (ret)
> +			goto fail_dec_usecnt;
> +	}
> +
> +	kvfree(handles);
> +	return 0;
> +
> +fail_dec_usecnt:
> +	/* Decrease usecnt on acquired objects.
> +	 * We cannot rely on  vc4_complete_exec() to release resources here,
> +	 * because vc4_complete_exec() has no information about which BO has
> +	 * had its ->usecnt incremented.
> +	 * To make things easier we just free everything explicitly and set
> +	 * exec->bo to NULL so that vc4_complete_exec() skips the 'BO release'
> +	 * step.
> +	 */
> +	for (i-- ; i >= 0; i--)
> +		vc4_bo_dec_usecnt(to_vc4_bo(&exec->bo[i]->base));
> +
> +fail_put_bo:
> +	/* Release any reference to acquired objects. */
> +	for (i = 0; i < exec->bo_count && exec->bo[i]; i++)
> +		drm_gem_object_put(&exec->bo[i]->base);

I think this should be drm_gem_object_put_unlocked() -- we're not
holding struct_mutex, and don't use it in this driver.  You can safely
call it with NULL, but I'm fine either way.

>  static void vc4_plane_destroy(struct drm_plane *plane)
> diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
> index afae87004963..52263b575bdc 100644
> --- a/include/uapi/drm/vc4_drm.h
> +++ b/include/uapi/drm/vc4_drm.h
> @@ -41,6 +41,7 @@ extern "C" {
>  #define DRM_VC4_SET_TILING                        0x08
>  #define DRM_VC4_GET_TILING                        0x09
>  #define DRM_VC4_LABEL_BO                          0x0a
> +#define DRM_VC4_GEM_MADVISE                       0x0b
>  
>  #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
>  #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
> @@ -53,6 +54,7 @@ extern "C" {
>  #define DRM_IOCTL_VC4_SET_TILING          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SET_TILING, struct drm_vc4_set_tiling)
>  #define DRM_IOCTL_VC4_GET_TILING          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_TILING, struct drm_vc4_get_tiling)
>  #define DRM_IOCTL_VC4_LABEL_BO            DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_LABEL_BO, struct drm_vc4_label_bo)
> +#define DRM_IOCTL_VC4_GEM_MADVISE         DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GEM_MADVISE, struct drm_vc4_gem_madvise)
>  
>  struct drm_vc4_submit_rcl_surface {
>  	__u32 hindex; /* Handle index, or ~0 if not present. */
> @@ -305,6 +307,7 @@ struct drm_vc4_get_hang_state {
>  #define DRM_VC4_PARAM_SUPPORTS_ETC1		4
>  #define DRM_VC4_PARAM_SUPPORTS_THREADED_FS	5
>  #define DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER	6
> +#define DRM_VC4_PARAM_SUPPORTS_MADVISE		7
>  
>  struct drm_vc4_get_param {
>  	__u32 param;
> @@ -333,6 +336,22 @@ struct drm_vc4_label_bo {
>  	__u64 name;
>  };
>  
> +/*
> + * States prefixed with '__' are internal states and cannot be passed to the
> + * DRM_IOCTL_VC4_GEM_MADVISE ioctl.
> + */
> +#define VC4_MADV_WILLNEED			0
> +#define VC4_MADV_DONTNEED			1
> +#define __VC4_MADV_PURGED			2
> +#define __VC4_MADV_NOTSUPP			3
> +
> +struct drm_vc4_gem_madvise {
> +	__u32 handle;
> +	__u32 madv;
> +	__u32 retained;
> +	__u32 pad;
> +};

danvet, you had said that the pad we added here is not necessary,
despite your blog post saying to pad to 64 bits.  Should we have it or
not?  I'm fine either way.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 832 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/dri-devel/attachments/20171017/e86c3855/attachment-0001.sig>