[PATCH v3] drm/vc4: Add the DRM_IOCTL_VC4_GEM_MADVISE ioctl
Boris Brezillon
boris.brezillon at free-electrons.com
Wed Oct 18 07:54:48 UTC 2017
On Tue, 17 Oct 2017 17:16:29 -0700
Eric Anholt <eric at anholt.net> wrote:
> Boris Brezillon <boris.brezillon at free-electrons.com> writes:
>
> > This ioctl will allow us to purge inactive userspace buffers when the
> > system is running out of contiguous memory.
> >
> > For now, the purge logic is rather dumb in that it does not try to
> > release only the amount of BO memory needed to meet the last CMA alloc request
> > but instead purges all objects placed in the purgeable pool as soon as
> > we experience a CMA allocation failure.
> >
> > Note that the in-kernel BO cache is always purged before the purgeable
> > cache because those objects are known to be unused while objects marked
> > as purgeable by a userspace application/library might have to be
> > restored when they are marked back as unpurgeable, which can be
> > expensive.
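(To make that ordering concrete: with this patch the allocation
fallback in vc4_bo_create() ends up looking roughly like the sketch
below. That hunk is not quoted in this mail, so treat it as a
condensed illustration rather than the exact diff.)

	cma_obj = drm_gem_cma_create(dev, size);
	if (IS_ERR(cma_obj)) {
		/* CMA allocation failed: drop the in-kernel BO cache
		 * first, since its entries are known to be unused.
		 */
		vc4_bo_cache_purge(dev);
		cma_obj = drm_gem_cma_create(dev, size);
	}
	if (IS_ERR(cma_obj)) {
		/* Still no luck: purge everything userspace marked
		 * VC4_MADV_DONTNEED, even though restoring those BOs
		 * later may be expensive.
		 */
		vc4_bo_userspace_cache_purge(dev);
		cma_obj = drm_gem_cma_create(dev, size);
	}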
> >
> > Signed-off-by: Boris Brezillon <boris.brezillon at free-electrons.com>
>
> Looking good! Just a couple things I'd like to sort out before we
> merge.
>
> > ---
> > Hello,
> >
> > Updates to libdrm, mesa and igt making use of this kernel feature can
> > be found on my github repos [1][2][3].
> >
> > There's currently no debugfs hook to manually force a purge, but this
> > is being discussed and might be added later on.
> >
> > Regards,
> >
> > Boris
> >
> > [1]https://github.com/bbrezillon/drm/tree/vc4/purgeable
> > [2]https://github.com/bbrezillon/mesa/tree/vc4/purgeable
> > [3]https://github.com/bbrezillon/igt/tree/vc4/purgeable
> >
> > Changes in v3:
> > - Use %zd formatters when printing size_t values
> > - Rework detection of BOs that do not support the MADV ioctl
> > - Replace DRM_ERROR with DRM_DEBUG in the ioctl path
> > - Do not explicitly memzero purged BO mem
> > - Check that pad is set to zero in the madv ioctl function
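(For the record, that last check boils down to something like this at
the top of vc4_gem_madvise_ioctl() -- condensed sketch, the full
handler is not quoted below:)

	if (args->madv != VC4_MADV_WILLNEED &&
	    args->madv != VC4_MADV_DONTNEED)
		return -EINVAL;

	if (args->pad != 0)
		return -EINVAL;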
> >
> > Changes in v2:
> > - Do not re-allocate a BO's memory when WILLNEED is requested after a purge
> > - Add purgeable/purged stats to debugfs and dumpstats
> > - Pad the drm_vc4_gem_madvise to make it 64-bit aligned
> > - Forbid madvise calls on internal BO objects (everything that is not
> > meant to be exposed to userspace)
> > - Do not increment/decrement usecnt on internal BOs (they cannot be marked
> > purgeable, so doing that is useless and error prone)
> > - Fix a few bugs related to usecnt refcounting
> > ---
> > drivers/gpu/drm/vc4/vc4_bo.c | 292 ++++++++++++++++++++++++++++++++++++++--
> > drivers/gpu/drm/vc4/vc4_drv.c | 10 +-
> > drivers/gpu/drm/vc4/vc4_drv.h | 30 +++++
> > drivers/gpu/drm/vc4/vc4_gem.c | 156 ++++++++++++++++++++-
> > drivers/gpu/drm/vc4/vc4_plane.c | 20 +++
> > include/uapi/drm/vc4_drm.h | 19 +++
> > 6 files changed, 512 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
> > index 3afdbf4bc10b..e4d13bbef54f 100644
> > --- a/drivers/gpu/drm/vc4/vc4_bo.c
> > +++ b/drivers/gpu/drm/vc4/vc4_bo.c
> > @@ -42,6 +42,8 @@ static bool is_user_label(int label)
> >
> > static void vc4_bo_stats_dump(struct vc4_dev *vc4)
> > {
> > + size_t purgeable_size, purged_size;
> > + unsigned int npurgeable, npurged;
> > int i;
> >
> > for (i = 0; i < vc4->num_labels; i++) {
> > @@ -53,6 +55,21 @@ static void vc4_bo_stats_dump(struct vc4_dev *vc4)
> > vc4->bo_labels[i].size_allocated / 1024,
> > vc4->bo_labels[i].num_allocated);
> > }
> > +
> > + mutex_lock(&vc4->purgeable.lock);
> > + npurgeable = vc4->purgeable.num;
> > + purgeable_size = vc4->purgeable.size;
> > + purged_size = vc4->purgeable.purged_size;
> > + npurged = vc4->purgeable.purged_num;
> > + mutex_unlock(&vc4->purgeable.lock);
> > +
> > + if (npurgeable)
> > + DRM_INFO("%30s: %6zdkb BOs (%d)\n", "userspace BO cache",
> > + purgeable_size / 1024, npurgeable);
> > +
> > + if (npurged)
> > + DRM_INFO("%30s: %6zdkb BOs (%d)\n", "total purged BO",
> > + purged_size / 1024, npurged);
> > }
> >
> > #ifdef CONFIG_DEBUG_FS
> > @@ -61,6 +78,8 @@ int vc4_bo_stats_debugfs(struct seq_file *m, void *unused)
> > struct drm_info_node *node = (struct drm_info_node *)m->private;
> > struct drm_device *dev = node->minor->dev;
> > struct vc4_dev *vc4 = to_vc4_dev(dev);
> > + size_t purgeable_size, purged_size;
> > + unsigned int npurgeable, npurged;
> > int i;
> >
> > mutex_lock(&vc4->bo_lock);
> > @@ -75,6 +94,21 @@ int vc4_bo_stats_debugfs(struct seq_file *m, void *unused)
> > }
> > mutex_unlock(&vc4->bo_lock);
> >
> > + mutex_lock(&vc4->purgeable.lock);
> > + npurgeable = vc4->purgeable.num;
> > + purgeable_size = vc4->purgeable.size;
> > + purged_size = vc4->purgeable.purged_size;
> > + npurged = vc4->purgeable.purged_num;
> > + mutex_unlock(&vc4->purgeable.lock);
> > +
> > + if (npurgeable)
> > + seq_printf(m, "%30s: %6zdkb BOs (%d)\n", "userspace BO cache",
> > + purgeable_size / 1024, npurgeable);
> > +
> > + if (npurged)
> > + seq_printf(m, "%30s: %6zdkb BOs (%d)\n", "total purged BO",
> > + purged_size / 1024, npurged);
>
> I think you could just do these seq_printfs under the lock, instead of
> taking a snapshot ahead of time. Same for the dump.
The reason I initially did it like that is that printing a message on
the console can take a non-negligible amount of time when it's a
serial console. That's less of a problem for the seq_printf() calls.
Anyway, I can go with your solution if you prefer.
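Something like this, I guess (untested sketch of the debugfs path done
directly under the lock):

	mutex_lock(&vc4->purgeable.lock);
	if (vc4->purgeable.num)
		seq_printf(m, "%30s: %6zdkb BOs (%d)\n",
			   "userspace BO cache",
			   vc4->purgeable.size / 1024, vc4->purgeable.num);
	if (vc4->purgeable.purged_num)
		seq_printf(m, "%30s: %6zdkb BOs (%d)\n",
			   "total purged BO",
			   vc4->purgeable.purged_size / 1024,
			   vc4->purgeable.purged_num);
	mutex_unlock(&vc4->purgeable.lock);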
>
> > +static void vc4_bo_remove_from_purgeable_pool_locked(struct vc4_bo *bo)
> > +{
> > + struct vc4_dev *vc4 = to_vc4_dev(bo->base.base.dev);
> > +
> > + /* list_del_init() is used here because the caller might release
> > + * the purgeable lock in order to acquire the madv one and update the
> > + * madv status.
> > + * During this short period of time a user might decide to mark
> > + * the BO as unpurgeable, and if bo->madv is set to
> > + * VC4_MADV_DONTNEED it will try to remove the BO from the
> > + * purgeable list which will fail if the ->next/prev fields
> > + * are set to LIST_POISON1/LIST_POISON2 (which is what
> > + * list_del() does).
> > + * Re-initializing the list element guarantees that list_del()
> > + * will work correctly even if it's a NOP.
> > + */
> > + list_del_init(&bo->size_head);
> > + vc4->purgeable.num--;
> > + vc4->purgeable.size -= bo->base.base.size;
> > +}
> > +
> > +void vc4_bo_remove_from_purgeable_pool(struct vc4_bo *bo)
> > +{
> > + struct vc4_dev *vc4 = to_vc4_dev(bo->base.base.dev);
> > +
> > + mutex_lock(&vc4->purgeable.lock);
> > + vc4_bo_remove_from_purgeable_pool_locked(bo);
> > + mutex_unlock(&vc4->purgeable.lock);
> > +}
> > +
> > +static void vc4_bo_purge(struct drm_gem_object *obj)
> > +{
> > + struct vc4_bo *bo = to_vc4_bo(obj);
> > + struct drm_device *dev = obj->dev;
> > +
> > + WARN_ON(!mutex_is_locked(&bo->madv_lock));
> > + WARN_ON(bo->madv != VC4_MADV_DONTNEED);
> > +
> > + drm_vma_node_unmap(&obj->vma_node, dev->anon_inode->i_mapping);
> > +
> > + dma_free_wc(dev->dev, obj->size, bo->base.vaddr, bo->base.paddr);
> > + bo->base.vaddr = NULL;
> > + bo->madv = __VC4_MADV_PURGED;
> > +}
> > +
> > +static void vc4_bo_userspace_cache_purge(struct drm_device *dev)
> > +{
> > + struct vc4_dev *vc4 = to_vc4_dev(dev);
> > +
> > + mutex_lock(&vc4->purgeable.lock);
> > + while (!list_empty(&vc4->purgeable.list)) {
> > + struct vc4_bo *bo = list_first_entry(&vc4->purgeable.list,
> > + struct vc4_bo, size_head);
> > + struct drm_gem_object *obj = &bo->base.base;
> > + size_t purged_size = 0;
> > +
> > + vc4_bo_remove_from_purgeable_pool_locked(bo);
> > +
> > + /* Release the purgeable lock while we're purging the BO so
> > + * that other people can continue inserting things in the
> > + * purgeable pool without having to wait for all BOs to be
> > + * purged.
> > + */
> > + mutex_unlock(&vc4->purgeable.lock);
> > + mutex_lock(&bo->madv_lock);
> > +
> > + /* Since we released the purgeable pool lock before acquiring
> > + * the BO madv one, vc4_gem_madvise_ioctl() may have changed
> > + * the bo->madv status. In this case, just skip this entry.
> > + */
> > + if (bo->madv == VC4_MADV_DONTNEED) {
> > + purged_size = bo->base.base.size;
> > + vc4_bo_purge(obj);
> > + }
>
> Possible ugly race here?
>
> thread 1 thread 2
> ----------------------------------------------------
> Idle my BO
> Mark DONTNEED
> Start purge
> pull BO off of purgeable list
> Mark NEED
> Do a submit CL
> Mark DONTNEED
> purge the BO
>
Crap! You're right. I didn't go that far in my analysis.
> Can we plug that by checking the refcount along with checking that we're
> DONTNEED?
That should solve the problem you're describing above, but I see another
problem:
thread 1 thread 2
----------------------------------------------------
Idle my BO
Mark DONTNEED
Start purge
pull BO off of purgeable list
Mark NEED
Mark DONTNEED
(BO gets back in the list)
purge the BO
(ouch! we purged a BO that is still in
the purgeable list)
We'd need something like:
	/* Since we released the purgeable pool lock before
	 * acquiring the BO madv one, vc4_gem_madvise_ioctl()
	 * may have changed the bo->madv status in the meantime
	 * and may have re-used the BO. Before purging the BO we
	 * need to make sure
	 * - it is still marked as DONTNEED
	 * - it has not been re-inserted in the purgeable list
	 * - it is not used by HW blocks
	 * If one of these conditions is not true, just skip the
	 * entry.
	 */
	if (bo->madv == VC4_MADV_DONTNEED &&
	    list_empty(&bo->size_head) &&
	    !refcount_read(&bo->usecnt)) {
		purged_size = bo->base.base.size;
		vc4_bo_purge(obj);
	}
> > + mutex_unlock(&bo->madv_lock);
> > + mutex_lock(&vc4->purgeable.lock);
> > +
> > + if (purged_size) {
> > + vc4->purgeable.purged_size += purged_size;
> > + vc4->purgeable.purged_num++;
> > + }
> > + }
> > + mutex_unlock(&vc4->purgeable.lock);
> > +}
>
>
> > int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
> > {
> > struct drm_gem_object *gem_obj;
> > + unsigned long vm_pgoff;
> > struct vc4_bo *bo;
> > int ret;
> >
> > @@ -507,16 +759,36 @@ int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
> > return -EINVAL;
> > }
> >
> > + if (bo->madv != VC4_MADV_WILLNEED) {
> > + DRM_DEBUG("mmaping of %s BO not allowed\n",
> > + bo->madv == VC4_MADV_DONTNEED ?
> > + "purgeable" : "purged");
> > + return -EINVAL;
> > + }
> > +
> > /*
> > * Clear the VM_PFNMAP flag that was set by drm_gem_mmap(), and set the
> > * vm_pgoff (used as a fake buffer offset by DRM) to 0 as we want to map
> > * the whole buffer.
> > */
> > vma->vm_flags &= ~VM_PFNMAP;
> > - vma->vm_pgoff = 0;
> >
> > + /* This ->vm_pgoff dance is needed to make all parties happy:
> > + * - dma_mmap_wc() uses ->vm_pgoff as an offset within the allocated
> > + * mem-region, hence the need to set it to zero (the value set by
> > + * the DRM core is a virtual offset encoding the GEM object-id)
> > + * - the mmap() core logic needs ->vm_pgoff to be restored to its
> > + * initial value before returning from this function because it encodes the
> > + * offset of this GEM in the dev->anon_inode pseudo-file and this
> > + * information will be used when we invalidate userspace mappings
> > + * with drm_vma_node_unmap() (called from vc4_bo_purge()).
> > + */
> > + vm_pgoff = vma->vm_pgoff;
> > + vma->vm_pgoff = 0;
> > ret = dma_mmap_wc(bo->base.base.dev->dev, vma, bo->base.vaddr,
> > bo->base.paddr, vma->vm_end - vma->vm_start);
> > + vma->vm_pgoff = vm_pgoff;
>
> Thanks for the explanation here!
>
> > diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
> > index 87f2d8e5c134..f54c1f5ce47f 100644
> > --- a/drivers/gpu/drm/vc4/vc4_drv.h
> > +++ b/drivers/gpu/drm/vc4/vc4_drv.h
> > @@ -74,6 +74,19 @@ struct vc4_dev {
> > /* Protects bo_cache and bo_labels. */
> > struct mutex bo_lock;
> >
> > + /* Purgeable BO pool. All BOs in this pool can have their memory
> > + * reclaimed if the driver is unable to allocate new BOs. We also
> > + * keep stats related to the purge mechanism here.
> > + */
> > + struct {
> > + struct list_head list;
> > + unsigned int num;
> > + size_t size;
> > + unsigned int purged_num;
> > + size_t purged_size;
> > + struct mutex lock;
> > + } purgeable;
> > +
> > uint64_t dma_fence_context;
> >
> > /* Sequence number for the last job queued in bin_job_list.
> > @@ -192,6 +205,16 @@ struct vc4_bo {
> > * for user-allocated labels.
> > */
> > int label;
> > +
> > + /* Count the number of active users. This is needed to determine
> > + * whether we can the BO to the purgeable or not (when the BO is used
>
> Something went weird here. Maybe you wanted "whether we can move the BO
> to the purgeable list or not"?
Yep.
>
> > + * by the GPU or the display engine we can't purge it).
> > + */
> > + refcount_t usecnt;
> > +
> > + /* Store purgeable/purged state here */
> > + u32 madv;
> > + struct mutex madv_lock;
> > };
>
> > diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
> > index d0c6bfb68c4e..50a0b2638a2f 100644
> > --- a/drivers/gpu/drm/vc4/vc4_gem.c
> > +++ b/drivers/gpu/drm/vc4/vc4_gem.c
> > @@ -188,11 +188,22 @@ vc4_save_hang_state(struct drm_device *dev)
> > continue;
> >
> > for (j = 0; j < exec[i]->bo_count; j++) {
> > + bo = to_vc4_bo(&exec[i]->bo[j]->base);
> > +
> > + /* Retain BOs just in case they were marked purgeable.
> > + * This prevents the BO from being purged before
> > + * someone had a chance to dump the hang state.
> > + */
> > + WARN_ON(!refcount_read(&bo->usecnt));
> > + refcount_inc(&bo->usecnt);
> > drm_gem_object_get(&exec[i]->bo[j]->base);
> > kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base;
> > }
> >
> > list_for_each_entry(bo, &exec[i]->unref_list, unref_head) {
> > + /* No need to retain BOs coming from the ->unref_list
> > + * because they are naturally unpurgeable.
> > + */
> > drm_gem_object_get(&bo->base.base);
> > kernel_state->bo[j + prev_idx] = &bo->base.base;
> > j++;
> > @@ -233,6 +244,26 @@ vc4_save_hang_state(struct drm_device *dev)
> > state->fdbgs = V3D_READ(V3D_FDBGS);
> > state->errstat = V3D_READ(V3D_ERRSTAT);
> >
> > + /* We need to turn purgeable BOs into unpurgeable ones so that
> > + * userspace has a chance to dump the hang state before the kernel
> > + * decides to purge those BOs.
> > + * Note that BO consistency at dump time cannot be guaranteed. For
> > + * example, if the owner of these BOs decides to re-use them or mark
> > + * them purgeable again there's nothing we can do to prevent it.
> > + */
> > + for (i = 0; i < kernel_state->user_state.bo_count; i++) {
> > + struct vc4_bo *bo = to_vc4_bo(kernel_state->bo[i]);
> > +
> > + if (bo->madv == __VC4_MADV_NOTSUPP)
> > + continue;
> > +
> > + mutex_lock(&bo->madv_lock);
> > + if (!WARN_ON(bo->madv == __VC4_MADV_PURGED))
> > + bo->madv = VC4_MADV_WILLNEED;
> > + refcount_dec(&bo->usecnt);
> > + mutex_unlock(&bo->madv_lock);
> > + }
>
> I like your solution for getting these buffers transitioned to WILLNEED.
> In my previous comments, I had been thinking you were incrementing the
> ref here and decrementing at hang state ioctl time, but you're actually
> doing it all within save_hang_state.
>
> > @@ -639,9 +670,6 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
> > * The command validator needs to reference BOs by their index within
> > * the submitted job's BO list. This does the validation of the job's
> > * BO list and reference counting for the lifetime of the job.
> > - *
> > - * Note that this function doesn't need to unreference the BOs on
> > - * failure, because that will happen at vc4_complete_exec() time.
> > */
> > static int
> > vc4_cl_lookup_bos(struct drm_device *dev,
> > @@ -693,16 +721,47 @@ vc4_cl_lookup_bos(struct drm_device *dev,
> > DRM_DEBUG("Failed to look up GEM BO %d: %d\n",
> > i, handles[i]);
> > ret = -EINVAL;
> > - spin_unlock(&file_priv->table_lock);
> > - goto fail;
> > + break;
> > }
> > +
> > drm_gem_object_get(bo);
> > exec->bo[i] = (struct drm_gem_cma_object *)bo;
> > }
> > spin_unlock(&file_priv->table_lock);
> >
> > + if (ret)
> > + goto fail_put_bo;
> > +
> > + for (i = 0; i < exec->bo_count; i++) {
> > + ret = vc4_bo_inc_usecnt(to_vc4_bo(&exec->bo[i]->base));
> > + if (ret)
> > + goto fail_dec_usecnt;
> > + }
> > +
> > + kvfree(handles);
> > + return 0;
> > +
> > +fail_dec_usecnt:
> > + /* Decrease usecnt on acquired objects.
> > + * We cannot rely on vc4_complete_exec() to release resources here,
> > + * because vc4_complete_exec() has no information about which BO has
> > + * had its ->usecnt incremented.
> > + * To make things easier we just free everything explicitly and set
> > + * exec->bo to NULL so that vc4_complete_exec() skips the 'BO release'
> > + * step.
> > + */
> > + for (i--; i >= 0; i--)
> > + vc4_bo_dec_usecnt(to_vc4_bo(&exec->bo[i]->base));
> > +
> > +fail_put_bo:
> > + /* Release any reference to acquired objects. */
> > + for (i = 0; i < exec->bo_count && exec->bo[i]; i++)
> > + drm_gem_object_put(&exec->bo[i]->base);
>
> I think this should be drm_gem_object_put_unlocked() -- we're not
> holding struct_mutex, and don't use it in this driver.
Right, I'll change that.
> You can safely
> call it with NULL, but I'm fine either way.
But I can't dereference a NULL pointer: I'm passing &exec->bo[i]->base
to drm_gem_object_put(), and if exec->bo[i] is NULL => PANIC!
>
> > static void vc4_plane_destroy(struct drm_plane *plane)
> > diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
> > index afae87004963..52263b575bdc 100644
> > --- a/include/uapi/drm/vc4_drm.h
> > +++ b/include/uapi/drm/vc4_drm.h
> > @@ -41,6 +41,7 @@ extern "C" {
> > #define DRM_VC4_SET_TILING 0x08
> > #define DRM_VC4_GET_TILING 0x09
> > #define DRM_VC4_LABEL_BO 0x0a
> > +#define DRM_VC4_GEM_MADVISE 0x0b
> >
> > #define DRM_IOCTL_VC4_SUBMIT_CL DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
> > #define DRM_IOCTL_VC4_WAIT_SEQNO DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
> > @@ -53,6 +54,7 @@ extern "C" {
> > #define DRM_IOCTL_VC4_SET_TILING DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SET_TILING, struct drm_vc4_set_tiling)
> > #define DRM_IOCTL_VC4_GET_TILING DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_TILING, struct drm_vc4_get_tiling)
> > #define DRM_IOCTL_VC4_LABEL_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_LABEL_BO, struct drm_vc4_label_bo)
> > +#define DRM_IOCTL_VC4_GEM_MADVISE DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GEM_MADVISE, struct drm_vc4_gem_madvise)
> >
> > struct drm_vc4_submit_rcl_surface {
> > __u32 hindex; /* Handle index, or ~0 if not present. */
> > @@ -305,6 +307,7 @@ struct drm_vc4_get_hang_state {
> > #define DRM_VC4_PARAM_SUPPORTS_ETC1 4
> > #define DRM_VC4_PARAM_SUPPORTS_THREADED_FS 5
> > #define DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER 6
> > +#define DRM_VC4_PARAM_SUPPORTS_MADVISE 7
> >
> > struct drm_vc4_get_param {
> > __u32 param;
> > @@ -333,6 +336,22 @@ struct drm_vc4_label_bo {
> > __u64 name;
> > };
> >
> > +/*
> > + * States prefixed with '__' are internal states and cannot be passed to the
> > + * DRM_IOCTL_VC4_GEM_MADVISE ioctl.
> > + */
> > +#define VC4_MADV_WILLNEED 0
> > +#define VC4_MADV_DONTNEED 1
> > +#define __VC4_MADV_PURGED 2
> > +#define __VC4_MADV_NOTSUPP 3
> > +
> > +struct drm_vc4_gem_madvise {
> > + __u32 handle;
> > + __u32 madv;
> > + __u32 retained;
> > + __u32 pad;
> > +};
>
> danvet, you had said that the pad we added here is not necessary,
> despite your blog post saying to pad to 64 bits. Should we have it or
> not? I'm fine either way.
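Side note for anyone who wants to play with the interface from
userspace: usage is expected to look roughly like the helper below
(untested sketch in the spirit of the mesa/libdrm branches linked
above; the helper name is made up and error handling is trimmed):

	#include <errno.h>
	#include <stdint.h>
	#include <xf86drm.h>
	#include "vc4_drm.h"

	/* Returns the kernel's "retained" answer (1 = backing storage
	 * kept, 0 = it was purged) or a negative errno. retained only
	 * really matters when switching back to VC4_MADV_WILLNEED: if
	 * it is 0 the BO content is lost and the caller must treat the
	 * BO as freshly allocated.
	 */
	static int vc4_bo_madvise(int fd, uint32_t handle, uint32_t madv)
	{
		struct drm_vc4_gem_madvise args = {
			.handle = handle,
			.madv = madv,	/* WILLNEED or DONTNEED only */
			/* .pad stays zero-initialized; the kernel now
			 * rejects a non-zero pad (see v3 changelog).
			 */
		};

		if (drmIoctl(fd, DRM_IOCTL_VC4_GEM_MADVISE, &args))
			return -errno;

		return args.retained;
	}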