[Intel-gfx] [PATCH v6 12/24] vfio/pci: Allow passing zero-length fd array in VFIO_DEVICE_PCI_HOT_RESET

Liu, Yi L yi.l.liu at intel.com
Fri Mar 24 09:25:12 UTC 2023


> From: Jason Gunthorpe <jgg at nvidia.com>
> Sent: Thursday, March 23, 2023 8:02 PM
> 
> On Thu, Mar 23, 2023 at 03:15:20AM +0000, Liu, Yi L wrote:
> > > From: Jason Gunthorpe <jgg at nvidia.com>
> > > Sent: Wednesday, March 22, 2023 9:43 PM
> > >
> > > On Wed, Mar 22, 2023 at 01:33:09PM +0000, Liu, Yi L wrote:
> > >
> > > > Thanks. So this new _INFO only reports a limited scope instead of
> > > > the full list of affected devices. Also, it is not static scope since device
> > > > may be opened just after the _INFO returns.
> > >
> > > Yes, it would be simplest for qemu to do the query after it gains a
> > > new dev_id and then it can add the new dev_id with the correct reset
> > > group.
> >
> > I see. QEMU can decide. For now, it seems like QEMU doesn't store
> > such the info return by the existing _INFO ioctl. It is used in the hot
> > reset helper and freed before it returns. Though, I'm not sure whether
> > QEMU will store the dev_id info returned by the new _INFO. Perhaps
> > Alex can give some guidance.
> 
> That seems a bit confusing, qemu should take the reset group
> information and encode it so that the guest can know it as well.
> 
> If all it does is blindly invoke the hot_reset with the right
> parameters then what was the point of all this discussion?
> 
> > btw.  Another question about this new _INFO ioctl. If there are affected
> > devices that have not been bound to vfio driver yet, should this new _INFO
> > ioctl fail all the same with EPERM?
> 
> Yeah, it should EPERM the same as the normal hot reset if it can't
> marshal the device list.

Hi Jason, Alex,

I got a draft patch to add the new _INFO? It checks if all the affected devices
are in the dev_set, and then gets the dev_id of all the opened devices within
the dev_set. There is still one thing not quite clear. It is the noiommu mode.
In this mode, there is no iommufd bind, so no dev_id. For now, I just fail this
new _INFO ioctl if there is no iommufd_device. Hence, this new _INFO is not
available for users that operating in noiommu mode. Is this acceptable?

>From e763474e255ff9805b1fb76d6b6b9ccedb61058f Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu at intel.com>
Date: Fri, 24 Mar 2023 00:54:08 -0700
Subject: [PATCH 06/10] vfio/pci: Add VFIO_DEVICE_GET_PCI_HOT_RESET_GROUP_INFO

to report the affected devices for a given device's hot reset. It is a
list of iommufd dev_id that is opened by the user. If there is device
that is not bound to vfio driver or opened by another user, this shall
fail with -EPERM. For the noiommu mode in the vfio device cdev path,
this shall fail as no dev_id would be generated.

Signed-off-by: Yi Liu <yi.l.liu at intel.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 98 ++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h        | 28 +++++++++
 2 files changed, 126 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index b68fcba67a4b..5789933a64ae 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1181,6 +1181,102 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
 	return ret;
 }
 
+static struct pci_dev *
+vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set);
+
+static int vfio_pci_ioctl_get_pci_hot_reset_group_info(
+	struct vfio_pci_core_device *vdev,
+	struct vfio_pci_hot_reset_group_info __user *arg)
+{
+	unsigned long minsz =
+		offsetofend(struct vfio_pci_hot_reset_group_info, count);
+	struct vfio_pci_hot_reset_group_info hdr;
+	struct iommufd_ctx *iommufd, *cur_iommufd;
+	u32 count = 0, index = 0, *devices = NULL;
+	struct vfio_pci_core_device *cur;
+	bool slot = false;
+	int ret = 0;
+
+	if (copy_from_user(&hdr, arg, minsz))
+		return -EFAULT;
+
+	if (hdr.argsz < minsz)
+		return -EINVAL;
+
+	hdr.flags = 0;
+
+	/* Can we do a slot or bus reset or neither? */
+	if (!pci_probe_reset_slot(vdev->pdev->slot))
+		slot = true;
+	else if (pci_probe_reset_bus(vdev->pdev->bus))
+		return -ENODEV;
+
+	mutex_lock(&vdev->vdev.dev_set->lock);
+	if (!vfio_pci_dev_set_resettable(vdev->vdev.dev_set)) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
+	if (!iommufd) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	/* How many devices are affected? */
+	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
+					    &count, slot);
+	if (ret)
+		goto out_unlock;
+
+	WARN_ON(!count); /* Should always be at least one */
+
+	/*
+	 * If there's enough space, fill it now, otherwise return -ENOSPC and
+	 * the number of devices affected.
+	 */
+	if (hdr.argsz < sizeof(hdr) + (count * sizeof(*devices))) {
+		ret = -ENOSPC;
+		hdr.count = count;
+		goto reset_info_exit;
+	}
+
+	devices = kcalloc(count, sizeof(*devices), GFP_KERNEL);
+	if (!devices) {
+		ret = -ENOMEM;
+		goto reset_info_exit;
+	}
+
+	list_for_each_entry(cur, &vdev->vdev.dev_set->device_list, vdev.dev_set_list) {
+		cur_iommufd = vfio_iommufd_physical_ictx(&cur->vdev);
+		if (cur->vdev.open_count) {
+			if (cur_iommufd != iommufd) {
+				ret = -EPERM;
+				break;
+			}
+			ret = vfio_iommufd_physical_devid(&cur->vdev, &devices[index]);
+			if (ret)
+				break;
+			index++;
+		}
+	}
+
+reset_info_exit:
+	if (copy_to_user(arg, &hdr, minsz))
+		ret = -EFAULT;
+
+	if (!ret) {
+		if (copy_to_user(&arg->devices, devices,
+				 hdr.count * sizeof(*devices)))
+			ret = -EFAULT;
+	}
+
+	kfree(devices);
+out_unlock:
+	mutex_unlock(&vdev->vdev.dev_set->lock);
+	return ret;
+}
+
 static int vfio_pci_ioctl_get_pci_hot_reset_info(
 	struct vfio_pci_core_device *vdev,
 	struct vfio_pci_hot_reset_info __user *arg)
@@ -1404,6 +1500,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 		return vfio_pci_ioctl_get_irq_info(vdev, uarg);
 	case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO:
 		return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg);
+	case VFIO_DEVICE_GET_PCI_HOT_RESET_GROUP_INFO:
+		return vfio_pci_ioctl_get_pci_hot_reset_group_info(vdev, uarg);
 	case VFIO_DEVICE_GET_REGION_INFO:
 		return vfio_pci_ioctl_get_region_info(vdev, uarg);
 	case VFIO_DEVICE_IOEVENTFD:
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 17aa5d09db41..572497cda3ca 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -669,6 +669,43 @@ struct vfio_pci_hot_reset_info {
 
 #define VFIO_DEVICE_GET_PCI_HOT_RESET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
 
+/**
+ * VFIO_DEVICE_GET_PCI_HOT_RESET_GROUP_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12,
+ *						    struct vfio_pci_hot_reset_group_info)
+ *
+ * This is used in the vfio device cdev mode.  It returns the list of
+ * affected devices (represented by iommufd dev_id) when hot reset is
+ * issued on the current device with which this ioctl is invoked.  It
+ * only includes the devices that are opened by the current user in the
+ * time of this command is invoked.  So user should re-invoke it when
+ * it has opened new devices.  This information gives user a scope of
+ * affected devices that is opened by it.  This is helpful for user to
+ * decide whether VFIO_DEVICE_PCI_HOT_RESET can be issued.  However,
+ * even user can do hot reset per the info got via this ioctl, hot reset
+ * can fail since the not-opened affected device can be opened by other
+ * users in the window between the two ioctls.  Detail can check the
+ * description of VFIO_DEVICE_PCI_HOT_RESET.
+ *
+ * Return: 0 on success, -errno on failure:
+ *	-enospc = insufficient buffer;
+ *	-enodev = unsupported for device;
+ *	-eperm = no permission for device, this error comes:
+ *		 - when there are affected devices that are opened but
+ *		   not bound to the same iommufd with the current device
+ *		   with which this ioctl is invoked,
+ *		 - there are affected devices that are not bound to vfio
+ *		   driver yet.
+ *		 - no valid iommufd is bound (e.g. noiommu mode)
+ */
+struct vfio_pci_hot_reset_group_info {
+	__u32	argsz;
+	__u32	flags;
+	__u32	count;
+	__u32	devices[];
+};
+
+#define VFIO_DEVICE_GET_PCI_HOT_RESET_GROUP_INFO	_IO(VFIO_TYPE, VFIO_BASE + 18)
+
 /**
  * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
  *				    struct vfio_pci_hot_reset)
-- 
2.34.1



More information about the Intel-gfx mailing list