[Intel-gfx] [IGDVFIO] [PATCH 2/8] RFC and help completing: Intel IGD Direct Assignment with VFIO

Alex Williamson alex.williamson at redhat.com
Wed Sep 24 21:25:28 CEST 2014


On Wed, 2014-09-24 at 14:20 +0100, Andrew Barnes wrote:
> hw/misc/vfio.c
> 
> this patch adds:
> * memory map intel opregion
> * mirroring of bdsm to guest's device 0 not hosts.
> 
> patch
> ---------------------
> 
> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
> index e88b610..54e549b 100644
> --- a/hw/misc/vfio.c
> +++ b/hw/misc/vfio.c
> @@ -5,6 +5,7 @@
>   *
>   * Authors:
>   *  Alex Williamson <alex.williamson at redhat.com>
> + *  Andrew Barnes <andy at outsideglobe.com> IGD Support
>   *
>   * This work is licensed under the terms of the GNU GPL, version 2.  See
>   * the COPYING file in the top-level directory.
> @@ -56,6 +57,45 @@
>  #define VFIO_ALLOW_KVM_MSI 1
>  #define VFIO_ALLOW_KVM_MSIX 1
> 
> +/* A handy list of IGD device ID's */
> +#define IS_IGD_HASWELL(id)            (id == 0x0402 \
> +                                        || id == 0x0406 \
> +                                        || id == 0x040a \
> +                                        || id == 0x0412 \
> +                                        || id == 0x0416 \
> +                                        || id == 0x041a \
> +                                        || id == 0x0a04 \
> +                                        || id == 0x0a16 \
> +                                        || id == 0x0a22 \
> +                                        || id == 0x0a26 \
> +                                        || id == 0x0a2a )
> +#define IS_IGD_IVYBRIDGE(id)          (id == 0x0162 \
> +                                        || id == 0x0166 \
> +                                        || id == 0x016a \
> +                                        || id == 0x0152 \
> +                                        || id == 0x0156 \
> +                                        || id == 0x015a )
> +#define IS_IGD_SANDYBRIDGE(id)        (id == 0x0102 \
> +                                        || id == 0x0106 \
> +                                        || id == 0x0112 \
> +                                        || id == 0x0116 \
> +                                        || id == 0x0122 \
> +                                        || id == 0x0126 \
> +                                        || id ==0x010a )
> +#define IS_IGD_IRONLAKE_CLARKDALE(id) (id == 0x0042 )
> +#define IS_IGD_IRONLAKE_ARRANDALE(id) (id == 0x0046 )
> +#define IS_IGD(id)                    (IS_IGD_IRONLAKE_CLARKDALE(id) \
> +                                        || IS_IGD_IRONLAKE_ARRANDALE(id) \
> +                                        || IS_IGD_SANDYBRIDGE(id) \
> +                                        || IS_IGD_IVYBRIDGE(id) \
> +                                        || IS_IGD_HASWELL(id) )
> +#define IGD_BAR_MASK                  0xFFFFFFFFFFFF0000
> +#define DMAR_OPERATION_TIMEOUT        ((s_time_t)((_ms) * 1000000ULL))
> +
> +#define PCI_CONFIG_INTEL_OPREGION       0xfc
> +#define INTEL_OPREGION_PAGES            3
> +#define INTEL_OPREGION_SIZE             INTEL_OPREGION_PAGES *
> TARGET_PAGE_SIZE
> +

*GAG*

>  struct VFIODevice;
> 
>  typedef struct VFIOQuirk {
> @@ -227,6 +267,8 @@ typedef struct VFIODevice {
>      bool has_pm_reset;
>      bool needs_reset;
>      bool rom_read_failed;
> +    MemoryRegion opregion; /* Intel opregion */
> +    uint32_t host_opregion; /* Host address of opregion */

Let's at least create a new struct to host these with a pointer off of
VFIODevice, then we only need to test IS_IGD() once and everywhere else
can just test the pointer.

>  } VFIODevice;
> 
>  typedef struct VFIOGroup {
> @@ -283,6 +325,18 @@ static void vfio_pci_write_config(PCIDevice *pdev,
> uint32_t addr,
>                                    uint32_t val, int len);
>  static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
> 
> +static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask);
> +static void vfio_add_emulated_word(VFIODevice *vdev, int pos,
> +                                   uint16_t val, uint16_t mask);
> +static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask);
> +static void vfio_add_emulated_long(VFIODevice *vdev, int pos,
> +                                   uint32_t val, uint32_t mask);
> +static void vfio_add_emulated_rw_long(VFIODevice *vdev, int pos,
> +                                   uint32_t val, uint32_t mask);
> +static void vfio_map_igdopregion(VFIODevice *vdev, uint32_t
> guest_opregion);
> +
> +static VFIODevice *igdvfio;

Unused?

> +
>  /*
>   * Common VFIO interrupt disable
>   */
> @@ -2324,27 +2378,46 @@ static uint32_t vfio_pci_read_config(PCIDevice
> *pdev, uint32_t addr, int len)
>      VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
>      uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
> 
> -    memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
> -    emu_bits = le32_to_cpu(emu_bits);
> -
> -    if (emu_bits) {
> -        emu_val = pci_default_read_config(pdev, addr, len);
> +    /* BDSM mirror - BDSM can be read at either 0xb0 device 0, or 0x5c
> device 2.
> +     * Redirect this mirror from host 0xb0 device 0 to guest 0xb0 device
> 0.*/

Defining BDSM would be helpful.

> +    if (IS_IGD(pci_get_word(pdev->config + PCI_DEVICE_ID)) &&
> ranges_overlap(addr,len,0x5c,4))
> +    {
> +        DPRINTF("%s Read Trapped (%04x:%02x:%02x.%x, @0x%x, 0x%x,
> len=0x%x)\n", __func__,
> +                    vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +                                vdev->host.function, addr, val, len);
> +        PCIBus *root = pci_find_primary_bus();
> +        PCIDevice *q35 = pci_find_device(root,0,PCI_DEVFN(0, 0));
> +        val = pci_default_read_config(q35, 0xb0, len);

My system shows:

$ sudo setpci -s 0:0.0 b0.l
dba00001
$ sudo setpci -s 0:2.0 5c.l
dba00001

So why are we doing this?  Is one of these other patches emulating b0 on
the host bridge and we're trying to access that emulation here?  A patch
series that's simply a full diff of each file is really hard to review.

>      }
> 
> -    if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
> -        ssize_t ret;
> +    else
> +    {
> +        memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
> +        emu_bits = le32_to_cpu(emu_bits);
> +
> +        if (emu_bits) {
> +            emu_val = pci_default_read_config(pdev, addr, len);
> +            DPRINTF("%s emulated read: %x \n",__func__,emu_val);
> 
> -        ret = pread(vdev->fd, &phys_val, len, vdev->config_offset + addr);
> -        if (ret != len) {
> -            error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
> -                         __func__, vdev->host.domain, vdev->host.bus,
> -                         vdev->host.slot, vdev->host.function, addr, len);
> -            return -errno;
>          }
> -        phys_val = le32_to_cpu(phys_val);
> -    }
> 
> -    val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
> +        if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
> +            ssize_t ret;
> +
> +            ret = pread(vdev->fd, &phys_val, len, vdev->config_offset +
> addr);
> +            if (ret != len) {
> +                error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed:
> %m",
> +                             __func__, vdev->host.domain, vdev->host.bus,
> +                             vdev->host.slot, vdev->host.function, addr,
> len);
> +                return -errno;
> +            }
> +            phys_val = le32_to_cpu(phys_val);
> +            DPRINTF("%s direct read: %x \n",__func__,phys_val);
> +
> +        }
> +
> +        val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
> +    }
> 
>      DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
>              vdev->host.domain, vdev->host.bus, vdev->host.slot,
> @@ -2363,12 +2436,30 @@ static void vfio_pci_write_config(PCIDevice *pdev,
> uint32_t addr,
>              vdev->host.domain, vdev->host.bus, vdev->host.slot,
>              vdev->host.function, addr, val, len);
> 
> +    /* A write to OPREGION base address means that seabios has allocated a
> new memory region for OPREGION
> +     * in the guest. */
> +    if (IS_IGD(pci_get_word(pdev->config + PCI_DEVICE_ID)) &&
> ranges_overlap(addr,len,PCI_CONFIG_INTEL_OPREGION,4))
> +    {
> +        DPRINTF("%s Write Trapped (%04x:%02x:%02x.%x, @0x%x, 0x%x,
> len=0x%x)\n", __func__,
> +                    vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +                                vdev->host.function, addr, val, len);
> +        //val = (val & 0xfffff000) | (vdev->host_opregion & 0xfff);
> +        vfio_map_igdopregion(vdev,val);
> +        goto defaultwrite;
> +    }
> +
>      /* Write everything to VFIO, let it filter out what we can't write */
> -    if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len)
> {
> +    else if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) !=
> len) {
>          error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
>                       __func__, vdev->host.domain, vdev->host.bus,
>                       vdev->host.slot, vdev->host.function, addr, val, len);
>      }
> +    else
> +    {
> +        DPRINTF("%s Written to VFIO (%04x:%02x:%02x.%x, @0x%x, 0x%x,
> len=0x%x)\n", __func__,
> +                  vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +                              vdev->host.function, addr, val, len);
> +    }
> 
>      /* MSI/MSI-X Enabling/Disabling */
>      if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
> @@ -2405,7 +2496,11 @@ static void vfio_pci_write_config(PCIDevice *pdev,
> uint32_t addr,
>          }
>      } else {
>          /* Write everything to QEMU to keep emulated bits correct */
> -        pci_default_write_config(pdev, addr, val, len);
> +defaultwrite:
> +       pci_default_write_config(pdev, addr, val, len);
> +       DPRINTF("%s Default Write (%04x:%02x:%02x.%x, @0x%x, 0x%x,
> len=0x%x)\n", __func__,
> +                   vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +                               vdev->host.function, addr, val, len);
>      }
>  }
> 
> @@ -3065,7 +3160,7 @@ static void vfio_set_word_bits(uint8_t *buf, uint16_t
> val, uint16_t mask)
>  {
>      pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
>  }
> -
> +/* helper functions make read-only emulated registers! */
>  static void vfio_add_emulated_word(VFIODevice *vdev, int pos,
>                                     uint16_t val, uint16_t mask)
>  {
> @@ -3087,6 +3182,62 @@ static void vfio_add_emulated_long(VFIODevice *vdev,
> int pos,
>      vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
>  }
> 
> +static void vfio_add_emulated_rw_long(VFIODevice *vdev, int pos,
> +                                   uint32_t val, uint32_t mask)
> +{
> +    vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
> +    vfio_set_long_bits(vdev->pdev.wmask + pos, mask, mask);
> +    vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
> +}
> +
> +/* Setup the mapping of the opregion ready for Seabios to allocate the
> guest location */
> +static void vfio_setup_igdopregion(VFIODevice *vdev)
> +{
> +    PCIDevice *pdev = &vdev->pdev;
> +    int fd;
> +    char name[64];
> +    void *map;
> +
> +    vdev->host_opregion =
> vfio_pci_read_config(pdev,PCI_CONFIG_INTEL_OPREGION,4);
> +
> +    DPRINTF("%s Setup IGD OpRegion: %x\n",__func__,vdev->host_opregion);
> +
> +    snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x IGDOPREGION mmap",
> +             vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +             vdev->host.function);
> +
> +    fd = open("/dev/mem", O_RDWR);
> +    map =
> mmap(NULL,INTEL_OPREGION_SIZE,PROT_READ|PROT_WRITE,MAP_SHARED,fd,(vdev->host_opregion
> & ~0xfff));
> +    if (map == MAP_FAILED)
> +    {
> +        map = NULL;
> +        DPRINTF("%s Map IGD OpRegion: MAP_FAILED\n",__func__);
> +    }
> +    memory_region_init_ram_ptr(&vdev->opregion, OBJECT(vdev), name,
> INTEL_OPREGION_SIZE, map);

This would of course need to be a new region exposed by vfio rather than
using /dev/mem.  We'd probably want to reserve a device specific region
and indicate via region flags which IGD passthrough features are provided.

> +}
> +
> +/* Seabios allocates the guest location for the opregion. This function
> then memmory maps
> + * host memory at that guest location */
> +static void vfio_map_igdopregion(VFIODevice *vdev, uint32_t new_opregion)
> +{
> +    PCIDevice *pdev = &vdev->pdev;
> +    MemoryRegion *guest_memory = get_system_memory();
> +    uint32_t current_opregion =
> vfio_pci_read_config(pdev,PCI_CONFIG_INTEL_OPREGION,4);
> +
> +    if ( current_opregion != vdev->host_opregion )
> +    {
> +        // remap
> +        DPRINTF("%s Delete IGD OpRegion: %x\n",__func__,(current_opregion
> & ~0xfff));
> +        memory_region_del_subregion(guest_memory, &vdev->opregion);
> +    }
> +
> +    DPRINTF("%s Map IGD OpRegion: %x ->
> %x\n",__func__,vdev->host_opregion,new_opregion);
> +    memory_region_add_subregion(guest_memory, (new_opregion & ~0xfff),
> &vdev->opregion);
> +
> +    DPRINTF("%s Adding 0xfc to emulated bits\n", __func__);
> +    vfio_add_emulated_rw_long(vdev, PCI_CONFIG_INTEL_OPREGION,
> new_opregion, 0xffffffff);

I know you've had issues with getting it to work, but we should really
only need to call vfio_add_emulated_rw_long() once when we init the
device, and then simply store the new value here.

> +}
> +
>  static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
>  {
>      uint16_t flags;
> @@ -4028,7 +4179,8 @@ static int vfio_get_device(VFIOGroup *group, const
> char *name, VFIODevice *vdev)
>      ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
>      if (ret) {
>          /* This can fail for an old kernel or legacy PCI dev */
> -        DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure: %m\n");
> +        //DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure: %m\n");
> +        DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure ret=%d\n", ret);
>          ret = 0;
>      } else if (irq_info.count == 1) {
>          vdev->pci_aer = true;
> @@ -4253,6 +4405,11 @@ static int vfio_initfn(PCIDevice *pdev)
>      vdev->emulated_config_bits[PCI_HEADER_TYPE] =
> 
>  PCI_HEADER_TYPE_MULTI_FUNCTION;
> 
> +    if (IS_IGD(pci_get_word(pdev->config + PCI_DEVICE_ID)))
> +    {
> +        vfio_setup_igdopregion(vdev);
> +    }
> +
>      /* Restore or clear multifunction, this is always controlled by QEMU */
>      if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
>          vdev->pdev.config[PCI_HEADER_TYPE] |=
> PCI_HEADER_TYPE_MULTI_FUNCTION;






More information about the Intel-gfx mailing list