[PATCH v1 2/2] vfio/pci: Allow MMIO regions to be exported through dma-buf

Jason Gunthorpe jgg at nvidia.com
Wed May 1 12:53:09 UTC 2024


On Tue, Apr 30, 2024 at 04:24:50PM -0600, Alex Williamson wrote:
> > +static vm_fault_t vfio_pci_dma_buf_fault(struct vm_fault *vmf)
> > +{
> > +	struct vm_area_struct *vma = vmf->vma;
> > +	struct vfio_pci_dma_buf *priv = vma->vm_private_data;
> > +	pgoff_t pgoff = vmf->pgoff;
> > +
> > +	if (pgoff >= priv->nr_pages)
> > +		return VM_FAULT_SIGBUS;
> > +
> > +	return vmf_insert_pfn(vma, vmf->address,
> > +			      page_to_pfn(priv->pages[pgoff]));
> > +}
> 
> How does this prevent the MMIO space from being mmap'd when disabled at
> the device?  How is the mmap revoked when the MMIO becomes disabled?
> Is it part of the move protocol?

Yes, we should not have a mmap handler for dmabuf. vfio memory must be
mmapped in the normal way.

> > +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> > +{
> > +	struct vfio_pci_dma_buf *priv = dmabuf->priv;
> > +
> > +	/*
> > +	 * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
> > +	 * The refcount prevents both.
> > +	 */
> > +	if (priv->vdev) {
> > +		release_p2p_pages(priv, priv->nr_pages);
> > +		kfree(priv->pages);
> > +		down_write(&priv->vdev->memory_lock);
> > +		list_del_init(&priv->dmabufs_elm);
> > +		up_write(&priv->vdev->memory_lock);
> 
> Why are we acquiring and releasing the memory_lock write lock
> throughout when we're not modifying the device memory enable state?
> Ugh, we're using it to implicitly lock dmabufs_elm/dmabufs aren't we...

Not really implicitly, but yes the dmabufs list is locked by the
memory_lock.

> > +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> > +				  struct vfio_device_feature_dma_buf __user *arg,
> > +				  size_t argsz)
> > +{
> > +	struct vfio_device_feature_dma_buf get_dma_buf;
> > +	struct vfio_region_p2p_area *p2p_areas;
> > +	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> > +	struct vfio_pci_dma_buf *priv;
> > +	int i, ret;
> > +
> > +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> > +				 sizeof(get_dma_buf));
> > +	if (ret != 1)
> > +		return ret;
> > +
> > +	if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
> > +		return -EFAULT;
> > +
> > +	p2p_areas = memdup_array_user(&arg->p2p_areas,
> > +				      get_dma_buf.nr_areas,
> > +				      sizeof(*p2p_areas));
> > +	if (IS_ERR(p2p_areas))
> > +		return PTR_ERR(p2p_areas);
> > +
> > +	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> > +	if (!priv)
> > +		return -ENOMEM;
> 
> p2p_areas is leaked.

What is this new p2p_areas thing? It wasn't in my patch..

> > +	exp_info.ops = &vfio_pci_dmabuf_ops;
> > +	exp_info.size = priv->nr_pages << PAGE_SHIFT;
> > +	exp_info.flags = get_dma_buf.open_flags;
> 
> open_flags from userspace are unchecked.

Huh. That seems to be a dmabuf pattern. :\

> > +	exp_info.priv = priv;
> > +
> > +	priv->dmabuf = dma_buf_export(&exp_info);
> > +	if (IS_ERR(priv->dmabuf)) {
> > +		ret = PTR_ERR(priv->dmabuf);
> > +		goto err_free_pages;
> > +	}
> > +
> > +	/* dma_buf_put() now frees priv */
> > +	INIT_LIST_HEAD(&priv->dmabufs_elm);
> > +	down_write(&vdev->memory_lock);
> > +	dma_resv_lock(priv->dmabuf->resv, NULL);
> > +	priv->revoked = !__vfio_pci_memory_enabled(vdev);
> > +	vfio_device_try_get_registration(&vdev->vdev);
> 
> I guess we're assuming this can't fail in the ioctl path of an open
> device?

Seems like a bug added here.. My version had this as
vfio_device_get(). This stuff has probably changed since I wrote it.

> > +	list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
> > +	dma_resv_unlock(priv->dmabuf->resv);
> 
> What was the purpose of locking this?

?

> > +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> > +{
> > +	struct vfio_pci_dma_buf *priv;
> > +	struct vfio_pci_dma_buf *tmp;
> > +
> > +	lockdep_assert_held_write(&vdev->memory_lock);
> > +
> > +	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> > +		if (!get_file_rcu(&priv->dmabuf->file))
> > +			continue;
> 
> Does this indicate the file was closed?

Yes.. The original patch was clearer, Christian asked to open
code it:

+ * Returns true if a reference was successfully obtained. The caller must
+ * interlock with the dmabuf's release function in some way, such as RCU, to
+ * ensure that this is not called on freed memory.

A description of how the locking is working should be put in a comment
above that code.

> > @@ -623,6 +625,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
> >  		*virt_cmd &= cpu_to_le16(~mask);
> >  		*virt_cmd |= cpu_to_le16(new_cmd & mask);
> >  
> > +		if (__vfio_pci_memory_enabled(vdev))
> > +			vfio_pci_dma_buf_move(vdev, false);
> >  		up_write(&vdev->memory_lock);
> >  	}
> 
> FLR is also accessible through config space.

That needs fixing up

> > @@ -1246,7 +1248,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
> >  	 */
> >  	vfio_pci_set_power_state(vdev, PCI_D0);
> >  
> > +	vfio_pci_dma_buf_move(vdev, true);
> >  	ret = pci_try_reset_function(vdev->pdev);
> > +	if (__vfio_pci_memory_enabled(vdev))
> > +		vfio_pci_dma_buf_move(vdev, false);
> >  	up_write(&vdev->memory_lock);
> >  
> 
> What about runtime power management?

Yes

Yes, I somehow thing it was added

> > -static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
> > -				       uuid_t __user *arg, size_t argsz)
> > +static int vfio_pci_core_feature_token(struct vfio_pci_core_device *vdev,
> > +				       u32 flags, uuid_t __user *arg,
> > +				       size_t argsz)
> >  {
> > -	struct vfio_pci_core_device *vdev =
> > -		container_of(device, struct vfio_pci_core_device, vdev);
> 
> Why is only this existing function updated?  If the core device deref
> is common then apply it to all and do so in a separate patch.  Thanks,

Hm, I think that was som rebasing issue.

> > +/**
> > + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> > + * region selected.
> > + *
> > + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
> > + * etc. offset/length specify a slice of the region to create the dmabuf from.
> > + * If both are 0 then the whole region is used.
> > + *
> > + * Return: The fd number on success, -1 and errno is set on failure.
> > + */
> > +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> > +
> > +struct vfio_region_p2p_area {
> > +	__u32	region_index;
> > +	__u32	__pad;
> > +	__u64	offset;
> > +	__u64	length;
> > +};
> > +
> > +struct vfio_device_feature_dma_buf {
> > +	__u32	open_flags;
> > +	__u32	nr_areas;
> > +	struct vfio_region_p2p_area p2p_areas[];
> > +};

Still have no clue what this p2p areas is. You want to create a dmabuf
out of a scatterlist? Why??

I'm also not sure of the use of the pci_p2pdma family of functions, it
is a bold step to make struct pages, that isn't going to work in quite
alot of cases. It is really hacky/wrong to immediately call
pci_alloc_p2pmem() to defeat the internal genalloc.

I'd rather we stick with the original design. Leon is working on DMA
API changes that should address half the issue.

Jason


More information about the dri-devel mailing list