[PATCH v1 10/10] vfio/pci: Add dma-buf export support for MMIO regions
Alex Williamson
alex.williamson at redhat.com
Wed Aug 6 22:24:55 UTC 2025
On Mon, 4 Aug 2025 16:00:45 +0300
Leon Romanovsky <leon at kernel.org> wrote:
> From: Leon Romanovsky <leonro at nvidia.com>
>
> Add support for exporting PCI device MMIO regions through dma-buf,
> enabling safe sharing of non-struct page memory with controlled
> lifetime management. This allows RDMA and other subsystems to import
> dma-buf FDs and build them into memory regions for PCI P2P operations.
>
> The implementation provides a revocable attachment mechanism using
> dma-buf move operations. MMIO regions are normally pinned as BARs
> don't change physical addresses, but access is revoked when the VFIO
> device is closed or a PCI reset is issued. This ensures kernel
> self-defense against potentially hostile userspace.
>
> Signed-off-by: Jason Gunthorpe <jgg at nvidia.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasireddy at intel.com>
> Signed-off-by: Leon Romanovsky <leonro at nvidia.com>
> ---
> drivers/vfio/pci/Kconfig | 20 ++
> drivers/vfio/pci/Makefile | 2 +
> drivers/vfio/pci/vfio_pci_config.c | 22 +-
> drivers/vfio/pci/vfio_pci_core.c | 25 +-
> drivers/vfio/pci/vfio_pci_dmabuf.c | 390 +++++++++++++++++++++++++++++
> drivers/vfio/pci/vfio_pci_priv.h | 23 ++
> include/linux/dma-buf.h | 1 +
> include/linux/vfio_pci_core.h | 3 +
> include/uapi/linux/vfio.h | 25 ++
> 9 files changed, 506 insertions(+), 5 deletions(-)
> create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c
>
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 2b0172f546652..55ae888bf26ae 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -55,6 +55,26 @@ config VFIO_PCI_ZDEV_KVM
>
> To enable s390x KVM vfio-pci extensions, say Y.
>
> +config VFIO_PCI_DMABUF
> + bool "VFIO PCI extensions for DMA-BUF"
> + depends on VFIO_PCI_CORE
> + depends on PCI_P2PDMA && DMA_SHARED_BUFFER
> + default y
> + help
> + Enable support for VFIO PCI extensions that allow exporting
> + device MMIO regions as DMA-BUFs for peer devices to access via
> + peer-to-peer (P2P) DMA.
> +
> + This feature enables a VFIO-managed PCI device to export a portion
> + of its MMIO BAR as a DMA-BUF file descriptor, which can be passed
> + to other userspace drivers or kernel subsystems capable of
> + initiating DMA to that region.
> +
> + Say Y here if you want to enable VFIO DMABUF-based MMIO export
> + support for peer-to-peer DMA use cases.
> +
> + If unsure, say N.
> +
> source "drivers/vfio/pci/mlx5/Kconfig"
>
> source "drivers/vfio/pci/hisilicon/Kconfig"
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index cf00c0a7e55c8..f9155e9c5f630 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -2,7 +2,9 @@
>
> vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
> vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
> +
> obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
> +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
>
> vfio-pci-y := vfio_pci.o
> vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
> diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
> index 8f02f236b5b4b..7e23387a43b4d 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
> virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
> new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
>
> - if (!new_mem)
> + if (!new_mem) {
> vfio_pci_zap_and_down_write_memory_lock(vdev);
> - else
> + vfio_pci_dma_buf_move(vdev, true);
> + } else {
> down_write(&vdev->memory_lock);
> + }
>
> /*
> * If the user is writing mem/io enable (new_mem/io) and we
> @@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
> *virt_cmd &= cpu_to_le16(~mask);
> *virt_cmd |= cpu_to_le16(new_cmd & mask);
>
> + if (__vfio_pci_memory_enabled(vdev))
> + vfio_pci_dma_buf_move(vdev, false);
> up_write(&vdev->memory_lock);
> }
>
> @@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
> static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev,
> pci_power_t state)
> {
> - if (state >= PCI_D3hot)
> + if (state >= PCI_D3hot) {
> vfio_pci_zap_and_down_write_memory_lock(vdev);
> - else
> + vfio_pci_dma_buf_move(vdev, true);
> + } else {
> down_write(&vdev->memory_lock);
> + }
>
> vfio_pci_set_power_state(vdev, state);
> + if (__vfio_pci_memory_enabled(vdev))
> + vfio_pci_dma_buf_move(vdev, false);
> up_write(&vdev->memory_lock);
> }
>
> @@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos,
>
> if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) {
> vfio_pci_zap_and_down_write_memory_lock(vdev);
> + vfio_pci_dma_buf_move(vdev, true);
> pci_try_reset_function(vdev->pdev);
> + if (__vfio_pci_memory_enabled(vdev))
> + vfio_pci_dma_buf_move(vdev, true);
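@revoked goes true -> true here, which seems wrong. The second call, made
after the reset once __vfio_pci_memory_enabled() is true, passes true again,
whereas the other hunks in this patch pass false at that point.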
> up_write(&vdev->memory_lock);
> }
> }
> @@ -982,7 +993,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos,
>
> if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) {
> vfio_pci_zap_and_down_write_memory_lock(vdev);
> + vfio_pci_dma_buf_move(vdev, true);
> pci_try_reset_function(vdev->pdev);
> + if (__vfio_pci_memory_enabled(vdev))
> + vfio_pci_dma_buf_move(vdev, true);
Same issue here: @revoked goes true -> true after the reset, which seems wrong.
> up_write(&vdev->memory_lock);
> }
> }
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index b1863d84b11aa..8e840ac413e9b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -28,7 +28,9 @@
> #include <linux/nospec.h>
> #include <linux/sched/mm.h>
> #include <linux/iommufd.h>
> +#ifdef CONFIG_VFIO_PCI_DMABUF
> #include <linux/pci-p2pdma.h>
> +#endif
> #if IS_ENABLED(CONFIG_EEH)
> #include <asm/eeh.h>
> #endif
> @@ -287,6 +289,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
> * semaphore.
> */
> vfio_pci_zap_and_down_write_memory_lock(vdev);
> + vfio_pci_dma_buf_move(vdev, true);
> +
> if (vdev->pm_runtime_engaged) {
> up_write(&vdev->memory_lock);
> return -EINVAL;
> @@ -370,6 +374,8 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
> */
> down_write(&vdev->memory_lock);
> __vfio_pci_runtime_pm_exit(vdev);
> + if (__vfio_pci_memory_enabled(vdev))
> + vfio_pci_dma_buf_move(vdev, false);
> up_write(&vdev->memory_lock);
> }
>
> @@ -690,6 +696,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
> #endif
> vfio_pci_core_disable(vdev);
>
> + vfio_pci_dma_buf_cleanup(vdev);
> +
> mutex_lock(&vdev->igate);
> if (vdev->err_trigger) {
> eventfd_ctx_put(vdev->err_trigger);
> @@ -1222,7 +1230,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
> */
> vfio_pci_set_power_state(vdev, PCI_D0);
>
> + vfio_pci_dma_buf_move(vdev, true);
> ret = pci_try_reset_function(vdev->pdev);
> + if (__vfio_pci_memory_enabled(vdev))
> + vfio_pci_dma_buf_move(vdev, false);
> up_write(&vdev->memory_lock);
>
> return ret;
> @@ -1511,6 +1522,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
> case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
> return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> + case VFIO_DEVICE_FEATURE_DMA_BUF:
> + return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
> default:
> return -ENOTTY;
> }
> @@ -2085,9 +2098,13 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
> INIT_LIST_HEAD(&vdev->dummy_resources_list);
> INIT_LIST_HEAD(&vdev->ioeventfds_list);
> INIT_LIST_HEAD(&vdev->sriov_pfs_item);
> +#ifdef CONFIG_VFIO_PCI_DMABUF
> vdev->provider = pci_p2pdma_enable(vdev->pdev);
> if (IS_ERR(vdev->provider))
> return PTR_ERR(vdev->provider);
> +
> + INIT_LIST_HEAD(&vdev->dmabufs);
> +#endif
> init_rwsem(&vdev->memory_lock);
> xa_init(&vdev->ctx);
>
> @@ -2470,11 +2487,17 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> * cause the PCI config space reset without restoring the original
> * state (saved locally in 'vdev->pm_save').
> */
> - list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
> + vfio_pci_dma_buf_move(vdev, true);
> vfio_pci_set_power_state(vdev, PCI_D0);
> + }
The revoke should have happened at the time the BARs were zapped.
Thanks,
Alex
>
> ret = pci_reset_bus(pdev);
>
> + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> + if (__vfio_pci_memory_enabled(vdev))
> + vfio_pci_dma_buf_move(vdev, false);
> +
> vdev = list_last_entry(&dev_set->device_list,
> struct vfio_pci_core_device, vdev.dev_set_list);
>
More information about the dri-devel
mailing list