[Intel-gfx] [IGDVFIO] [PATCH 2/8] RFC and help completing: Intel IGD Direct Assignment with VFIO
Alex Williamson
alex.williamson at redhat.com
Wed Sep 24 21:25:28 CEST 2014
On Wed, 2014-09-24 at 14:20 +0100, Andrew Barnes wrote:
> hw/misc/vfio.c
>
> this patch adds:
> * memory map intel opregion
> * mirroring of bdsm to guest's device 0 not hosts.
>
> patch
> ---------------------
>
> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
> index e88b610..54e549b 100644
> --- a/hw/misc/vfio.c
> +++ b/hw/misc/vfio.c
> @@ -5,6 +5,7 @@
> *
> * Authors:
> * Alex Williamson <alex.williamson at redhat.com>
> + * Andrew Barnes <andy at outsideglobe.com> IGD Support
> *
> * This work is licensed under the terms of the GNU GPL, version 2. See
> * the COPYING file in the top-level directory.
> @@ -56,6 +57,45 @@
> #define VFIO_ALLOW_KVM_MSI 1
> #define VFIO_ALLOW_KVM_MSIX 1
>
> +/* A handy list of IGD device ID's */
> +#define IS_IGD_HASWELL(id) (id == 0x0402 \
> + || id == 0x0406 \
> + || id == 0x040a \
> + || id == 0x0412 \
> + || id == 0x0416 \
> + || id == 0x041a \
> + || id == 0x0a04 \
> + || id == 0x0a16 \
> + || id == 0x0a22 \
> + || id == 0x0a26 \
> + || id == 0x0a2a )
> +#define IS_IGD_IVYBRIDGE(id) (id == 0x0162 \
> + || id == 0x0166 \
> + || id == 0x016a \
> + || id == 0x0152 \
> + || id == 0x0156 \
> + || id == 0x015a )
> +#define IS_IGD_SANDYBRIDGE(id) (id == 0x0102 \
> + || id == 0x0106 \
> + || id == 0x0112 \
> + || id == 0x0116 \
> + || id == 0x0122 \
> + || id == 0x0126 \
> + || id ==0x010a )
> +#define IS_IGD_IRONLAKE_CLARKDALE(id) (id == 0x0042 )
> +#define IS_IGD_IRONLAKE_ARRANDALE(id) (id == 0x0046 )
> +#define IS_IGD(id) (IS_IGD_IRONLAKE_CLARKDALE(id) \
> + || IS_IGD_IRONLAKE_ARRANDALE(id) \
> + || IS_IGD_SANDYBRIDGE(id) \
> + || IS_IGD_IVYBRIDGE(id) \
> + || IS_IGD_HASWELL(id) )
> +#define IGD_BAR_MASK 0xFFFFFFFFFFFF0000
> +#define DMAR_OPERATION_TIMEOUT ((s_time_t)((_ms) * 1000000ULL))
> +
> +#define PCI_CONFIG_INTEL_OPREGION 0xfc
> +#define INTEL_OPREGION_PAGES 3
> +#define INTEL_OPREGION_SIZE INTEL_OPREGION_PAGES *
> TARGET_PAGE_SIZE
> +
*GAG*
> struct VFIODevice;
>
> typedef struct VFIOQuirk {
> @@ -227,6 +267,8 @@ typedef struct VFIODevice {
> bool has_pm_reset;
> bool needs_reset;
> bool rom_read_failed;
> + MemoryRegion opregion; /* Intel opregion */
> + uint32_t host_opregion; /* Host address of opregion */
Let's at least create a new struct to host these with a pointer off of
VFIODevice, then we only need to test IS_IGD() once and everywhere else
can just test the pointer.
> } VFIODevice;
>
> typedef struct VFIOGroup {
> @@ -283,6 +325,18 @@ static void vfio_pci_write_config(PCIDevice *pdev,
> uint32_t addr,
> uint32_t val, int len);
> static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
>
> +static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask);
> +static void vfio_add_emulated_word(VFIODevice *vdev, int pos,
> + uint16_t val, uint16_t mask);
> +static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask);
> +static void vfio_add_emulated_long(VFIODevice *vdev, int pos,
> + uint32_t val, uint32_t mask);
> +static void vfio_add_emulated_rw_long(VFIODevice *vdev, int pos,
> + uint32_t val, uint32_t mask);
> +static void vfio_map_igdopregion(VFIODevice *vdev, uint32_t
> guest_opregion);
> +
> +static VFIODevice *igdvfio;
Unused?
> +
> /*
> * Common VFIO interrupt disable
> */
> @@ -2324,27 +2378,46 @@ static uint32_t vfio_pci_read_config(PCIDevice
> *pdev, uint32_t addr, int len)
> VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
>
> - memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
> - emu_bits = le32_to_cpu(emu_bits);
> -
> - if (emu_bits) {
> - emu_val = pci_default_read_config(pdev, addr, len);
> + /* BDSM mirror - BDSM can be read at either 0xb0 device 0, or 0x5c
> device 2.
> + * Redirect this mirror from host 0xb0 device 0 to guest 0xb0 device
> 0.*/
Defining BDSM would be helpful.
> + if (IS_IGD(pci_get_word(pdev->config + PCI_DEVICE_ID)) &&
> ranges_overlap(addr,len,0x5c,4))
> + {
> + DPRINTF("%s Read Trapped (%04x:%02x:%02x.%x, @0x%x, 0x%x,
> len=0x%x)\n", __func__,
> + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> + vdev->host.function, addr, val, len);
> + PCIBus *root = pci_find_primary_bus();
> + PCIDevice *q35 = pci_find_device(root,0,PCI_DEVFN(0, 0));
> + val = pci_default_read_config(q35, 0xb0, len);
My system shows:
$ sudo setpci -s 0:0.0 b0.l
dba00001
$ sudo setpci -s 0:2.0 5c.l
dba00001
So why are we doing this? Is one of these other patches emulating b0 on
the host bridge and we're trying to access that emulation here? A patch
series that's simply a full diff of each file is really hard to review.
> }
>
> - if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
> - ssize_t ret;
> + else
> + {
> + memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
> + emu_bits = le32_to_cpu(emu_bits);
> +
> + if (emu_bits) {
> + emu_val = pci_default_read_config(pdev, addr, len);
> + DPRINTF("%s emulated read: %x \n",__func__,emu_val);
>
> - ret = pread(vdev->fd, &phys_val, len, vdev->config_offset + addr);
> - if (ret != len) {
> - error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
> - __func__, vdev->host.domain, vdev->host.bus,
> - vdev->host.slot, vdev->host.function, addr, len);
> - return -errno;
> }
> - phys_val = le32_to_cpu(phys_val);
> - }
>
> - val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
> + if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
> + ssize_t ret;
> +
> + ret = pread(vdev->fd, &phys_val, len, vdev->config_offset +
> addr);
> + if (ret != len) {
> + error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed:
> %m",
> + __func__, vdev->host.domain, vdev->host.bus,
> + vdev->host.slot, vdev->host.function, addr,
> len);
> + return -errno;
> + }
> + phys_val = le32_to_cpu(phys_val);
> + DPRINTF("%s direct read: %x \n",__func__,phys_val);
> +
> + }
> +
> + val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
> + }
>
> DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
> vdev->host.domain, vdev->host.bus, vdev->host.slot,
> @@ -2363,12 +2436,30 @@ static void vfio_pci_write_config(PCIDevice *pdev,
> uint32_t addr,
> vdev->host.domain, vdev->host.bus, vdev->host.slot,
> vdev->host.function, addr, val, len);
>
> + /* A write to OPREGION base address means that seabios has allocated a
> new memory region for OPREGION
> + * in the guest. */
> + if (IS_IGD(pci_get_word(pdev->config + PCI_DEVICE_ID)) &&
> ranges_overlap(addr,len,PCI_CONFIG_INTEL_OPREGION,4))
> + {
> + DPRINTF("%s Write Trapped (%04x:%02x:%02x.%x, @0x%x, 0x%x,
> len=0x%x)\n", __func__,
> + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> + vdev->host.function, addr, val, len);
> + //val = (val & 0xfffff000) | (vdev->host_opregion & 0xfff);
> + vfio_map_igdopregion(vdev,val);
> + goto defaultwrite;
> + }
> +
> /* Write everything to VFIO, let it filter out what we can't write */
> - if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len)
> {
> + else if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) !=
> len) {
> error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
> __func__, vdev->host.domain, vdev->host.bus,
> vdev->host.slot, vdev->host.function, addr, val, len);
> }
> + else
> + {
> + DPRINTF("%s Written to VFIO (%04x:%02x:%02x.%x, @0x%x, 0x%x,
> len=0x%x)\n", __func__,
> + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> + vdev->host.function, addr, val, len);
> + }
>
> /* MSI/MSI-X Enabling/Disabling */
> if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
> @@ -2405,7 +2496,11 @@ static void vfio_pci_write_config(PCIDevice *pdev,
> uint32_t addr,
> }
> } else {
> /* Write everything to QEMU to keep emulated bits correct */
> - pci_default_write_config(pdev, addr, val, len);
> +defaultwrite:
> + pci_default_write_config(pdev, addr, val, len);
> + DPRINTF("%s Default Write (%04x:%02x:%02x.%x, @0x%x, 0x%x,
> len=0x%x)\n", __func__,
> + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> + vdev->host.function, addr, val, len);
> }
> }
>
> @@ -3065,7 +3160,7 @@ static void vfio_set_word_bits(uint8_t *buf, uint16_t
> val, uint16_t mask)
> {
> pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
> }
> -
> +/* helper functions make read-only emulated registers! */
> static void vfio_add_emulated_word(VFIODevice *vdev, int pos,
> uint16_t val, uint16_t mask)
> {
> @@ -3087,6 +3182,62 @@ static void vfio_add_emulated_long(VFIODevice *vdev,
> int pos,
> vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
> }
>
> +static void vfio_add_emulated_rw_long(VFIODevice *vdev, int pos,
> + uint32_t val, uint32_t mask)
> +{
> + vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
> + vfio_set_long_bits(vdev->pdev.wmask + pos, mask, mask);
> + vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
> +}
> +
> +/* Setup the mapping of the opregion ready for Seabios to allocate the
> guest location */
> +static void vfio_setup_igdopregion(VFIODevice *vdev)
> +{
> + PCIDevice *pdev = &vdev->pdev;
> + int fd;
> + char name[64];
> + void *map;
> +
> + vdev->host_opregion =
> vfio_pci_read_config(pdev,PCI_CONFIG_INTEL_OPREGION,4);
> +
> + DPRINTF("%s Setup IGD OpRegion: %x\n",__func__,vdev->host_opregion);
> +
> + snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x IGDOPREGION mmap",
> + vdev->host.domain, vdev->host.bus, vdev->host.slot,
> + vdev->host.function);
> +
> + fd = open("/dev/mem", O_RDWR);
> + map =
> mmap(NULL,INTEL_OPREGION_SIZE,PROT_READ|PROT_WRITE,MAP_SHARED,fd,(vdev->host_opregion
> & ~0xfff));
> + if (map == MAP_FAILED)
> + {
> + map = NULL;
> + DPRINTF("%s Map IGD OpRegion: MAP_FAILED\n",__func__);
> + }
> + memory_region_init_ram_ptr(&vdev->opregion, OBJECT(vdev), name,
> INTEL_OPREGION_SIZE, map);
This would of course need to be a new region exposed by vfio rather than
using /dev/mem. We'd probably want to reserve a device-specific region
and indicate via region flags which IGD passthrough features are provided.
> +}
> +
> +/* Seabios allocates the guest location for the opregion. This function
> then memmory maps
> + * host memory at that guest location */
> +static void vfio_map_igdopregion(VFIODevice *vdev, uint32_t new_opregion)
> +{
> + PCIDevice *pdev = &vdev->pdev;
> + MemoryRegion *guest_memory = get_system_memory();
> + uint32_t current_opregion =
> vfio_pci_read_config(pdev,PCI_CONFIG_INTEL_OPREGION,4);
> +
> + if ( current_opregion != vdev->host_opregion )
> + {
> + // remap
> + DPRINTF("%s Delete IGD OpRegion: %x\n",__func__,(current_opregion
> & ~0xfff));
> + memory_region_del_subregion(guest_memory, &vdev->opregion);
> + }
> +
> + DPRINTF("%s Map IGD OpRegion: %x ->
> %x\n",__func__,vdev->host_opregion,new_opregion);
> + memory_region_add_subregion(guest_memory, (new_opregion & ~0xfff),
> &vdev->opregion);
> +
> + DPRINTF("%s Adding 0xfc to emulated bits\n", __func__);
> + vfio_add_emulated_rw_long(vdev, PCI_CONFIG_INTEL_OPREGION,
> new_opregion, 0xffffffff);
I know you've had issues with getting it to work, but we should really
only need to call vfio_add_emulated_rw_long() once when we init the
device, and then simply store the new value here.
> +}
> +
> static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
> {
> uint16_t flags;
> @@ -4028,7 +4179,8 @@ static int vfio_get_device(VFIOGroup *group, const
> char *name, VFIODevice *vdev)
> ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> if (ret) {
> /* This can fail for an old kernel or legacy PCI dev */
> - DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure: %m\n");
> + //DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure: %m\n");
> + DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure ret=%d\n", ret);
> ret = 0;
> } else if (irq_info.count == 1) {
> vdev->pci_aer = true;
> @@ -4253,6 +4405,11 @@ static int vfio_initfn(PCIDevice *pdev)
> vdev->emulated_config_bits[PCI_HEADER_TYPE] =
>
> PCI_HEADER_TYPE_MULTI_FUNCTION;
>
> + if (IS_IGD(pci_get_word(pdev->config + PCI_DEVICE_ID)))
> + {
> + vfio_setup_igdopregion(vdev);
> + }
> +
> /* Restore or clear multifunction, this is always controlled by QEMU */
> if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
> vdev->pdev.config[PCI_HEADER_TYPE] |=
> PCI_HEADER_TYPE_MULTI_FUNCTION;
More information about the Intel-gfx
mailing list