[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH V5 07/10] Introduce Xen PCI Passthrough, qdevice (1/3)



On Thu, Nov 24, 2011 at 05:44:36PM +0000, Anthony PERARD wrote:
> From: Allen Kay <allen.m.kay@xxxxxxxxx>
> 
> A more complete history can be found here:
> git://xenbits.xensource.com/qemu-xen-unstable.git
> 
> Signed-off-by: Allen Kay <allen.m.kay@xxxxxxxxx>
> Signed-off-by: Guy Zana <guy@xxxxxxxxxxxx>
> Signed-off-by: Anthony PERARD <anthony.perard@xxxxxxxxxx>
> ---
>  Makefile.target                      |    2 +
>  hw/xen_common.h                      |    3 +
>  hw/xen_pci_passthrough.c             |  831 
> ++++++++++++++++++++++++++++++++++
>  hw/xen_pci_passthrough.h             |  282 ++++++++++++
>  hw/xen_pci_passthrough_config_init.c |   11 +
>  xen-all.c                            |   12 +
>  6 files changed, 1141 insertions(+), 0 deletions(-)
>  create mode 100644 hw/xen_pci_passthrough.c
>  create mode 100644 hw/xen_pci_passthrough.h
>  create mode 100644 hw/xen_pci_passthrough_config_init.c
> 
> diff --git a/Makefile.target b/Makefile.target
> index e527c1b..33435a3 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -221,6 +221,8 @@ obj-i386-$(CONFIG_XEN) += xen_platform.o
>  
>  # Xen PCI Passthrough
>  obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += host-pci-device.o
> +obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough.o
> +obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_config_init.o
>  
>  # Inter-VM PCI shared memory
>  CONFIG_IVSHMEM =
> diff --git a/hw/xen_common.h b/hw/xen_common.h
> index 0409ac7..48916fd 100644
> --- a/hw/xen_common.h
> +++ b/hw/xen_common.h
> @@ -135,4 +135,7 @@ static inline int xc_fd(xc_interface *xen_xc)
>  
>  void destroy_hvm_domain(void);
>  
> +/* shutdown/destroy current domain because of an error */
> +void xen_shutdown_fatal_error(const char *fmt, ...) GCC_FMT_ATTR(1, 2);
> +
>  #endif /* QEMU_HW_XEN_COMMON_H */
> diff --git a/hw/xen_pci_passthrough.c b/hw/xen_pci_passthrough.c
> new file mode 100644
> index 0000000..998470b
> --- /dev/null
> +++ b/hw/xen_pci_passthrough.c
> @@ -0,0 +1,831 @@
> +/*
> + * Copyright (c) 2007, Neocleus Corporation.
> + * Copyright (c) 2007, Intel Corporation.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Alex Novik <alex@xxxxxxxxxxxx>
> + * Allen Kay <allen.m.kay@xxxxxxxxx>
> + * Guy Zana <guy@xxxxxxxxxxxx>
> + *
> + * This file implements direct PCI assignment to a HVM guest
> + */
> +
> +/*
> + * Interrupt Disable policy:
> + *
> + * INTx interrupt:
> + *   Initialize(register_real_device)
> + *     Map INTx(xc_physdev_map_pirq):
> + *       <fail>
> + *         - Set real Interrupt Disable bit to '1'.
> + *         - Set machine_irq and assigned_device->machine_irq to '0'.
> + *         * Don't bind INTx.
> + *
> + *     Bind INTx(xc_domain_bind_pt_pci_irq):
> + *       <fail>
> + *         - Set real Interrupt Disable bit to '1'.
> + *         - Unmap INTx.
> + *         - Decrement mapped_machine_irq[machine_irq]
> + *         - Set assigned_device->machine_irq to '0'.
> + *
> + *   Write to Interrupt Disable bit by guest software(pt_cmd_reg_write)
> + *     Write '0'
> + *       <ptdev->msi_trans_en is false>
> + *         - Set real bit to '0' if assigned_device->machine_irq isn't '0'.
> + *
> + *     Write '1'
> + *       <ptdev->msi_trans_en is false>
> + *         - Set real bit to '1'.
> + */
> +
> +#include <sys/ioctl.h>
> +
> +#include "pci.h"
> +#include "xen.h"
> +#include "xen_backend.h"
> +#include "xen_pci_passthrough.h"
> +
> +#define PCI_BAR_ENTRIES (6)
> +
> +#define PT_NR_IRQS          (256)
> +char mapped_machine_irq[PT_NR_IRQS] = {0};


char? Can it be made uint8_t instead?
> +
> +void pt_log(const PCIDevice *d, const char *f, ...)
> +{
> +    va_list ap;
> +
> +    va_start(ap, f);
> +    if (d) {
> +        fprintf(stderr, "[%02x:%02x.%x] ", pci_bus_num(d->bus),
> +                PCI_SLOT(d->devfn), PCI_FUNC(d->devfn));
> +    }
> +    vfprintf(stderr, f, ap);
> +    va_end(ap);
> +}
> +
> +
> +/* Config Space */
> +static int pt_pci_config_access_check(PCIDevice *d, uint32_t address, int 
> len)
> +{
> +    /* check offset range */
> +    if (address >= 0xFF) {
> +        PT_ERR(d, "Failed to access register with offset exceeding 0xFF. "
> +               "(addr: 0x%02x, len: %d)\n", address, len);
> +        return -1;
> +    }
> +
> +    /* check read size */
> +    if ((len != 1) && (len != 2) && (len != 4)) {
> +        PT_ERR(d, "Failed to access register with invalid access length. "
> +               "(addr: 0x%02x, len: %d)\n", address, len);
> +        return -1;
> +    }
> +
> +    /* check offset alignment */
> +    if (address & (len - 1)) {
> +        PT_ERR(d, "Failed to access register with invalid access size "
> +               "alignment. (addr: 0x%02x, len: %d)\n", address, len);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +int pt_bar_offset_to_index(uint32_t offset)
> +{
> +    int index = 0;
> +
> +    /* check Exp ROM BAR */
> +    if (offset == PCI_ROM_ADDRESS) {
> +        return PCI_ROM_SLOT;
> +    }
> +
> +    /* calculate BAR index */
> +    index = (offset - PCI_BASE_ADDRESS_0) >> 2;
> +    if (index >= PCI_NUM_REGIONS) {
> +        return -1;
> +    }
> +
> +    return index;
> +}
> +
> +static uint32_t pt_pci_read_config(PCIDevice *d, uint32_t address, int len)
> +{
> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
> +    uint32_t val = 0;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    int rc = 0;
> +    int emul_len = 0;
> +    uint32_t find_addr = address;
> +
> +    if (pt_pci_config_access_check(d, address, len)) {
> +        goto exit;
> +    }
> +
> +    /* check power state transition flags */
> +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
> +        /* can't accept until previous power state transition is completed.
> +         * so finish previous request here.

Uh, how do we finish the previous request here?

> +         */
> +        PT_WARN(d, "Guest want to write during power state transition\n");
> +        goto exit;
> +    }
> +
> +    /* find register group entry */
> +    reg_grp_entry = pt_find_reg_grp(s, address);
> +    if (reg_grp_entry) {
> +        /* check 0 Hardwired register group */

So what does that mean? A "0 hardwired" register group? Does it mean
that the register is always zero?

> +        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
> +            /* no need to emulate, just return 0 */
> +            val = 0;
> +            goto exit;
> +        }
> +    }
> +
> +    /* read I/O device register value */
> +    rc = host_pci_get_block(s->real_device, address, (uint8_t *)&val, len);
> +    if (rc < 0) {
> +        PT_ERR(d, "pci_read_block failed. return value: %d.\n", rc);
> +        memset(&val, 0xff, len);
> +    }
> +
> +    /* just return the I/O device register value for
> +     * passthrough type register group */

Is another way to say that: "There is no filter for the
value, so just return the raw value." ?

> +    if (reg_grp_entry == NULL) {
> +        goto exit;
> +    }
> +
> +    /* adjust the read value to appropriate CFC-CFF window */
> +    val <<= (address & 3) << 3;
> +    emul_len = len;
> +
> +    /* loop around the guest requested size */
> +    while (emul_len > 0) {
> +        /* find register entry to be emulated */
> +        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
> +        if (reg_entry) {
> +            XenPTRegInfo *reg = reg_entry->reg;
> +            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
> +            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
> +            uint8_t *ptr_val = NULL;
> +
> +            valid_mask <<= (find_addr - real_offset) << 3;
> +            ptr_val = (uint8_t *)&val + (real_offset & 3);
> +
> +            /* do emulation based on register size */
> +            switch (reg->size) {
> +            case 1:
> +                if (reg->u.b.read) {
> +                    rc = reg->u.b.read(s, reg_entry, ptr_val, valid_mask);
> +                }
> +                break;
> +            case 2:
> +                if (reg->u.w.read) {
> +                    rc = reg->u.w.read(s, reg_entry,
> +                                       (uint16_t *)ptr_val, valid_mask);
> +                }
> +                break;
> +            case 4:
> +                if (reg->u.dw.read) {
> +                    rc = reg->u.dw.read(s, reg_entry,
> +                                        (uint32_t *)ptr_val, valid_mask);
> +                }
> +                break;
> +            }
> +
> +            if (rc < 0) {
> +                xen_shutdown_fatal_error("Internal error: Invalid read "
> +                                         "emulation. (%s, rc: %d)\n",
> +                                         __func__, rc);
> +                return 0;
> +            }
> +
> +            /* calculate next address to find */
> +            emul_len -= reg->size;
> +            if (emul_len > 0) {
> +                find_addr = real_offset + reg->size;
> +            }
> +        } else {
> +            /* nothing to do with passthrough type register,
> +             * continue to find next byte */
> +            emul_len--;
> +            find_addr++;
> +        }
> +    }
> +
> +    /* need to shift back before returning them to pci bus emulator */
> +    val >>= ((address & 3) << 3);
> +
> +exit:
> +    PT_LOG_CONFIG(d, address, val, len);
> +    return val;
> +}
> +
> +static void pt_pci_write_config(PCIDevice *d, uint32_t address,
> +                                uint32_t val, int len)
> +{
> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
> +    int index = 0;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    int rc = 0;
> +    uint32_t read_val = 0;
> +    int emul_len = 0;
> +    XenPTReg *reg_entry = NULL;
> +    uint32_t find_addr = address;
> +    XenPTRegInfo *reg = NULL;
> +
> +    if (pt_pci_config_access_check(d, address, len)) {
> +        return;
> +    }
> +
> +    PT_LOG_CONFIG(d, address, val, len);
> +
> +    /* check unused BAR register */
> +    index = pt_bar_offset_to_index(address);
> +    if ((index >= 0) && (val > 0 && val < PT_BAR_ALLF) &&
> +        (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED)) {
> +        PT_WARN(d, "Guest attempt to set address to unused Base Address "
> +                "Register. (addr: 0x%02x, len: %d)\n", address, len);
> +    }
> +
> +    /* check power state transition flags */
> +    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
> +        /* can't accept until previous power state transition is completed.
> +         * so finish previous request here.

Ditto. How do we finish the previous request here?
> +         */
> +        PT_WARN(d, "Guest want to write during power state transition\n");
> +        return;
> +    }
> +
> +    /* find register group entry */
> +    reg_grp_entry = pt_find_reg_grp(s, address);
> +    if (reg_grp_entry) {
> +        /* check 0 Hardwired register group */
> +        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
> +            /* ignore silently */
> +            PT_WARN(d, "Access to 0 Hardwired register. "
> +                    "(addr: 0x%02x, len: %d)\n", address, len);
> +            return;
> +        }
> +    }
> +
> +    /* read I/O device register value */
> +    rc = host_pci_get_block(s->real_device, address,
> +                             (uint8_t *)&read_val, len);
> +    if (rc < 0) {
> +        PT_ERR(d, "pci_read_block failed. return value: %d.\n", rc);
> +        memset(&read_val, 0xff, len);
> +    }
> +
> +    /* pass directly to the real device for passthrough type register group 
> */
> +    if (reg_grp_entry == NULL) {
> +        goto out;
> +    }
> +
> +    /* adjust the read and write value to appropriate CFC-CFF window */
> +    read_val <<= (address & 3) << 3;
> +    val <<= (address & 3) << 3;
> +    emul_len = len;
> +
> +    /* loop around the guest requested size */
> +    while (emul_len > 0) {
> +        /* find register entry to be emulated */
> +        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
> +        if (reg_entry) {
> +            reg = reg_entry->reg;
> +            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
> +            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
> +            uint8_t *ptr_val = NULL;
> +
> +            valid_mask <<= (find_addr - real_offset) << 3;
> +            ptr_val = (uint8_t *)&val + (real_offset & 3);
> +
> +            /* do emulation based on register size */
> +            switch (reg->size) {
> +            case 1:
> +                if (reg->u.b.write) {
> +                    rc = reg->u.b.write(s, reg_entry, ptr_val,
> +                                        read_val >> ((real_offset & 3) << 3),
> +                                        valid_mask);
> +                }
> +                break;
> +            case 2:
> +                if (reg->u.w.write) {
> +                    rc = reg->u.w.write(s, reg_entry, (uint16_t *)ptr_val,
> +                                        (read_val >> ((real_offset & 3) << 
> 3)),
> +                                        valid_mask);
> +                }
> +                break;
> +            case 4:
> +                if (reg->u.dw.write) {
> +                    rc = reg->u.dw.write(s, reg_entry, (uint32_t *)ptr_val,
> +                                         (read_val >> ((real_offset & 3) << 
> 3)),
> +                                         valid_mask);
> +                }
> +                break;
> +            }
> +
> +            if (rc < 0) {
> +                xen_shutdown_fatal_error("Internal error: Invalid write"
> +                                         " emulation. (%s, rc: %d)\n",
> +                                         __func__, rc);
> +                return;
> +            }
> +
> +            /* calculate next address to find */
> +            emul_len -= reg->size;
> +            if (emul_len > 0) {
> +                find_addr = real_offset + reg->size;
> +            }
> +        } else {
> +            /* nothing to do with passthrough type register,
> +             * continue to find next byte */
> +            emul_len--;
> +            find_addr++;
> +        }
> +    }
> +
> +    /* need to shift back before passing them to host_pci_device */
> +    val >>= (address & 3) << 3;
> +
> +out:
> +    if (!(reg && reg->no_wb)) {
> +        /* unknown regs are passed through */
> +        rc = host_pci_set_block(s->real_device, address, (uint8_t *)&val, 
> len);
> +
> +        if (rc < 0) {
> +            PT_ERR(d, "pci_write_block failed. return value: %d.\n", rc);
> +        }
> +    }
> +}
> +
> +/* ioport/iomem space*/
> +static void pt_iomem_map(XenPCIPassthroughState *s, int i,
> +                         pcibus_t e_phys, pcibus_t e_size, int type)
> +{
> +    PCIIORegion *r = &s->dev.io_regions[i];
> +    uint32_t old_ebase = s->bases[i].e_physbase;
> +    bool first_map = s->bases[i].e_size == 0;
> +    int ret = 0;
> +
> +    s->bases[i].e_physbase = e_phys;
> +    s->bases[i].e_size = e_size;
> +
> +    PT_LOG(&s->dev, "e_phys=%#"PRIx64" maddr=%#"PRIx64" type=%d"
> +           " len=%#"PRIx64" index=%d first_map=%d\n",
> +           e_phys, s->bases[i].access.maddr, type,
> +           e_size, i, first_map);
> +
> +    if (e_size == 0) {
> +        return;
> +    }
> +
> +    if (!first_map && old_ebase != PT_PCI_BAR_UNMAPPED) {
> +        /* Remove old mapping */
> +        memory_region_del_subregion(r->address_space,
> +                                    r->memory);
> +        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
> +                               old_ebase >> XC_PAGE_SHIFT,
> +                               s->bases[i].access.maddr >> XC_PAGE_SHIFT,
> +                               (e_size + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
> +                               DPCI_REMOVE_MAPPING);
> +        if (ret != 0) {
> +            PT_ERR(&s->dev, "remove old mapping failed!\n");
> +            return;
> +        }
> +    }
> +
> +    /* map only valid guest address */
> +    if (e_phys != PCI_BAR_UNMAPPED) {
> +        /* Create new mapping */
> +        memory_region_add_subregion_overlap(r->address_space,
> +                                            e_phys, r->memory, 1);
> +        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
> +                                   s->bases[i].e_physbase >> XC_PAGE_SHIFT,
> +                                   s->bases[i].access.maddr >> XC_PAGE_SHIFT,
> +                                   (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
> +                                   DPCI_ADD_MAPPING);
> +
> +        if (ret != 0) {
> +            PT_ERR(&s->dev, "create new mapping failed!\n");
> +        }
> +    }
> +}
> +
> +static void pt_ioport_map(XenPCIPassthroughState *s, int i,
> +                          pcibus_t e_phys, pcibus_t e_size, int type)
> +{
> +    PCIIORegion *r = &s->dev.io_regions[i];
> +    uint32_t old_ebase = s->bases[i].e_physbase;
> +    bool first_map = s->bases[i].e_size == 0;
> +    int ret = 0;
> +
> +    s->bases[i].e_physbase = e_phys;
> +    s->bases[i].e_size = e_size;
> +
> +    PT_LOG(&s->dev, "e_phys=%#04"PRIx64" pio_base=%#04"PRIx64" len=%"PRId64
> +           " index=%d first_map=%d\n",
> +           e_phys, s->bases[i].access.pio_base, e_size, i, first_map);
> +
> +    if (e_size == 0) {
> +        return;
> +    }
> +
> +    if (!first_map && old_ebase != PT_PCI_BAR_UNMAPPED) {
> +        /* Remove old mapping */
> +        memory_region_del_subregion(r->address_space,
> +                                    r->memory);
> +        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, old_ebase,
> +                                       s->bases[i].access.pio_base, e_size,
> +                                       DPCI_REMOVE_MAPPING);
> +        if (ret != 0) {
> +            PT_ERR(&s->dev, "remove old mapping failed!\n");
> +            return;
> +        }
> +    }
> +
> +    /* map only valid guest address (include 0) */
> +    if (e_phys != PCI_BAR_UNMAPPED) {
> +        /* Create new mapping */
> +        memory_region_add_subregion_overlap(r->address_space,
> +                                            e_phys, r->memory, 1);
> +        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, e_phys,
> +                                       s->bases[i].access.pio_base, e_size,
> +                                       DPCI_ADD_MAPPING);
> +        if (ret != 0) {
> +            PT_ERR(&s->dev, "create new mapping failed!\n");
> +        }
> +    }
> +
> +}
> +
> +
> +/* mapping BAR */
> +
> +void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
> +                        int io_enable, int mem_enable)
> +{
> +    PCIDevice *dev = &s->dev;
> +    PCIIORegion *r;
> +    XenPTRegGroup *reg_grp_entry = NULL;
> +    XenPTReg *reg_entry = NULL;
> +    XenPTRegion *base = NULL;
> +    pcibus_t r_size = 0, r_addr = PCI_BAR_UNMAPPED;
> +    int rc = 0;
> +
> +    r = &dev->io_regions[bar];
> +
> +    /* check valid region */
> +    if (!r->size) {
> +        return;
> +    }
> +
> +    base = &s->bases[bar];
> +    /* skip unused BAR or upper 64bit BAR */
> +    if ((base->bar_flag == PT_BAR_FLAG_UNUSED)
> +        || (base->bar_flag == PT_BAR_FLAG_UPPER)) {
> +           return;
> +    }
> +
> +    /* copy region address to temporary */
> +    r_addr = r->addr;
> +
> +    /* need unmapping in case I/O Space or Memory Space disable */

.. is disabled.

> +    if (((base->bar_flag == PT_BAR_FLAG_IO) && !io_enable) ||
> +        ((base->bar_flag == PT_BAR_FLAG_MEM) && !mem_enable)) {
> +        r_addr = PCI_BAR_UNMAPPED;
> +    }

Add a comment saying:
        /* or ROM address is disabled. */
> +    if ((bar == PCI_ROM_SLOT) && (r_addr != PCI_BAR_UNMAPPED)) {
> +        reg_grp_entry = pt_find_reg_grp(s, PCI_ROM_ADDRESS);
> +        if (reg_grp_entry) {
> +            reg_entry = pt_find_reg(reg_grp_entry, PCI_ROM_ADDRESS);
> +            if (reg_entry && !(reg_entry->data & PCI_ROM_ADDRESS_ENABLE)) {
> +                r_addr = PCI_BAR_UNMAPPED;
> +            }
> +        }
> +    }
> +
> +    /* prevent guest software mapping memory resource to 00000000h */
> +    if ((base->bar_flag == PT_BAR_FLAG_MEM) && (r_addr == 0)) {
> +        r_addr = PCI_BAR_UNMAPPED;
> +    }
> +
> +    r_size = pt_get_emul_size(base->bar_flag, r->size);
> +
> +    rc = pci_check_bar_overlap(dev, r_addr, r_size, r->type);
> +    if (rc) {
> +        PT_WARN(dev, "Region: %d (addr: %#"FMT_PCIBUS
> +                ", len: %#"FMT_PCIBUS") is overlapped.\n",

Is the FMT_PCIBUS for len correct?

> +                bar, r_addr, r_size);
> +    }
> +
> +    /* check whether we need to update the mapping or not */
> +    if (r_addr != s->bases[bar].e_physbase) {
> +        /* mapping BAR */
> +        if (base->bar_flag == PT_BAR_FLAG_IO) {
> +            pt_ioport_map(s, bar, r_addr, r_size, r->type);
> +        } else {
> +            pt_iomem_map(s, bar, r_addr, r_size, r->type);
> +        }
> +    }
> +}
> +
> +void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable)
> +{
> +    int i;
> +
> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> +        pt_bar_mapping_one(s, i, io_enable, mem_enable);
> +    }
> +}
> +
> +static uint64_t bar_read(void *o, target_phys_addr_t addr, unsigned size)
> +{
> +    PCIDevice *d = o;
> +    PT_ERR(d, "Should not read BAR through QEMU. @0x"TARGET_FMT_plx"\n", 
> addr);


Perhaps you can add a comment explaining how the BAR should be read
instead? Is the read trapped by the hypervisor? If so, you might want to
include a comment stating that, maybe even the name of the file to look in.

> +    return 0;
> +}
> +static void bar_write(void *o, target_phys_addr_t addr,
> +                      uint64_t data, unsigned size)
> +{
> +    PCIDevice* d = o;
> +    PT_ERR(d, "Should not write BAR through QEMU. @0x"TARGET_FMT_plx"\n", 
> addr);
> +}
> +
> +static const MemoryRegionOps ops = {
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .read = bar_read,
> +    .write = bar_write,
> +};
> +
> +/* register regions */
> +static int pt_register_regions(XenPCIPassthroughState *s)
> +{
> +    int i = 0;
> +    uint32_t bar_data = 0;
> +    HostPCIDevice *d = s->real_device;
> +
> +    /* Register PIO/MMIO BARs */
> +    for (i = 0; i < PCI_BAR_ENTRIES; i++) {
> +        HostPCIIORegion *r = &d->io_regions[i];
> +
> +        if (r->base_addr && r->size) {
> +            s->bases[i].e_physbase = r->base_addr;
> +            s->bases[i].access.u = r->base_addr;
> +
> +            /* Register current region */
> +            if (r->flags & IORESOURCE_IO) {
> +                memory_region_init_io(&s->bar[i], &ops, &s->dev,
> +                                      "xen-pci-pt-bar-io", r->size);
> +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_IO,
> +                                 &s->bar[i]);
> +            } else if (r->flags & IORESOURCE_PREFETCH) {
> +                memory_region_init_io(&s->bar[i], &ops, &s->dev,
> +                                      "xen-pci-pt-bar-mem", r->size);
> +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_MEM_PREFETCH,
> +                                 &s->bar[i]);
> +            } else {
> +                memory_region_init_io(&s->bar[i], &ops, &s->dev,
> +                                      "xen-pci-pt-bar-mem", r->size);
> +                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_MEMORY,
> +                                 &s->bar[i]);
> +            }
> +
> +            PT_LOG(&s->dev, "IO region registered (size=0x%08"PRIx64
> +                   " base_addr=0x%08"PRIx64")\n",
> +                   r->size, r->base_addr);
> +        }
> +    }
> +
> +    /* Register expansion ROM address */
> +    if (d->rom.base_addr && d->rom.size) {
> +        /* Re-set BAR reported by OS, otherwise ROM can't be read. */
> +        if (host_pci_get_long(d, PCI_ROM_ADDRESS, &bar_data)) {
> +            return 0;
> +        }
> +        if ((bar_data & PCI_ROM_ADDRESS_MASK) == 0) {
> +            bar_data |= d->rom.base_addr & PCI_ROM_ADDRESS_MASK;
> +            host_pci_set_long(d, PCI_ROM_ADDRESS, bar_data);
> +        }
> +
> +        s->bases[PCI_ROM_SLOT].e_physbase = d->rom.base_addr;
> +        s->bases[PCI_ROM_SLOT].access.maddr = d->rom.base_addr;
> +
> +        memory_region_init_rom_device(&s->rom, NULL, NULL, &s->dev.qdev,
> +                                      "xen-pci-pt-rom", d->rom.size);
> +        pci_register_bar(&s->dev, PCI_ROM_SLOT, 
> PCI_BASE_ADDRESS_MEM_PREFETCH,
> +                         &s->rom);
> +
> +        PT_LOG(&s->dev, "Expansion ROM registered (size=0x%08"PRIx64
> +               " base_addr=0x%08"PRIx64")\n",
> +               d->rom.size, d->rom.base_addr);
> +    }
> +
> +    return 0;
> +}
> +
> +static void pt_unregister_regions(XenPCIPassthroughState *s)
> +{
> +    int i, type, rc;
> +    uint32_t e_size;
> +    PCIDevice *d = &s->dev;
> +
> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> +        e_size = s->bases[i].e_size;
> +        if ((e_size == 0) || (s->bases[i].e_physbase == 
> PT_PCI_BAR_UNMAPPED)) {
> +            continue;
> +        }
> +
> +        type = d->io_regions[i].type;
> +
> +        if (type == PCI_BASE_ADDRESS_SPACE_MEMORY
> +            || type == PCI_BASE_ADDRESS_MEM_PREFETCH) {
> +            rc = xc_domain_memory_mapping(xen_xc, xen_domid,
> +                    s->bases[i].e_physbase >> XC_PAGE_SHIFT,
> +                    s->bases[i].access.maddr >> XC_PAGE_SHIFT,
> +                    (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
> +                    DPCI_REMOVE_MAPPING);
> +            if (rc != 0) {
> +                PT_ERR(d, "remove old mem mapping failed!\n");
> +                continue;
> +            }
> +
> +        } else if (type == PCI_BASE_ADDRESS_SPACE_IO) {
> +            rc = xc_domain_ioport_mapping(xen_xc, xen_domid,
> +                        s->bases[i].e_physbase,
> +                        s->bases[i].access.pio_base,
> +                        e_size,
> +                        DPCI_REMOVE_MAPPING);
> +            if (rc != 0) {
> +                PT_ERR(d, "remove old io mapping failed!\n");
> +                continue;
> +            }
> +        }
> +    }
> +}
> +
> +static int pt_initfn(PCIDevice *pcidev)
> +{
> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, 
> pcidev);
> +    int dom, bus;
> +    unsigned slot, func;

So how come bus is 'int' but slot and func are unsigned?

> +    int rc = 0;
> +    uint8_t machine_irq = 0;
> +    int pirq = PT_UNASSIGNED_PIRQ;
> +
> +    if (pci_parse_devaddr(s->hostaddr, &dom, &bus, &slot, &func) < 0) {
> +        PT_ERR(pcidev, "Failed to parse BDF: %s\n", s->hostaddr);
> +        return -1;
> +    }
> +
> +    /* register real device */
> +    PT_LOG(pcidev, "Assigning real physical device %02x:%02x.%x"
> +           " to devfn %#x\n", bus, slot, func, s->dev.devfn);
> +
> +    s->real_device = host_pci_device_get(bus, slot, func);
> +    if (!s->real_device) {
> +        return -1;
> +    }
> +
> +    s->is_virtfn = s->real_device->is_virtfn;
> +    if (s->is_virtfn) {
> +        PT_LOG(pcidev, "%04x:%02x:%02x.%x is a SR-IOV Virtual Function\n",
> +               s->real_device->domain, bus, slot, func);
> +    }
> +
> +    /* Initialize virtualized PCI configuration (Extended 256 Bytes) */
> +    if (host_pci_get_block(s->real_device, 0, pcidev->config,
> +                           PCI_CONFIG_SPACE_SIZE) == -1) {
> +        host_pci_device_put(s->real_device);
> +        return -1;
> +    }
> +
> +    /* Handle real device's MMIO/PIO BARs */
> +    pt_register_regions(s);
> +
> +    /* Bind interrupt */
> +    if (!s->dev.config[PCI_INTERRUPT_PIN]) {

Is a 'zero' value correct? I think that is the value that
SR-IOV devices export since they don't have legacy IRQs?
Or maybe they export the MSI vector values instead.

Perhaps you should do
    if (!s->dev.config[PCI_INTERRUPT_PIN] && !s->is_virtfn) {

> +        PT_LOG(pcidev, "no pin interrupt\n");
> +        goto out;
> +    }
> +
> +    host_pci_get_byte(s->real_device, PCI_INTERRUPT_LINE, &machine_irq);

Just to double check:

> lspci -s 01:10.0 -v -xx
01:10.0 Ethernet controller: Intel Corporation 82576 Virtual Function
(rev 01)
        Subsystem: Intel Corporation Device a01c
        Flags: bus master, fast devsel, latency 0
        [virtual] Memory at b8840000 (64-bit, non-prefetchable) [size=16K]
        [virtual] Memory at b8860000 (64-bit, non-prefetchable) [size=16K]
        Capabilities: [70] MSI-X: Enable+ Count=3 Masked-
        Capabilities: [a0] Express Endpoint, MSI 00
        Capabilities: [100] Advanced Error Reporting
        Capabilities: [150] Alternative Routing-ID Interpretation (ARI)
        Kernel driver in use: igbvf
        Kernel modules: igbvf
00: ff ff ff ff 04 00 10 00 01 00 00 02 00 00 00 00
10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
20: 00 00 00 00 00 00 00 00 00 00 00 00 86 80 1c a0
30: 00 00 00 00 70 00 00 00 00 00 00 00 00 00 00 00

And a normal USB device:

>  lspci -s 00:1a.0 -v -xx 
00:1a.0 USB Controller: Intel Corporation 82801JI (ICH10 Family) USB
UHCI Controller #4 (prog-if 00 [UHCI])
        Subsystem: Intel Corporation Device 4f53
        Flags: bus master, medium devsel, latency 0, IRQ 16
        I/O ports at 40e0 [size=32]
        Capabilities: [50] PCI Advanced Features
        Kernel driver in use: uhci_hcd
00: 86 80 37 3a 05 00 90 02 00 00 03 0c 00 00 80 00
10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
20: e1 40 00 00 00 00 00 00 00 00 00 00 86 80 53 4f
30: 00 00 00 00 50 00 00 00 00 00 00 00 0b 01 00 00

The USB device has interrupt line 0xB, and pin 1.
The SR-IOV one is zero and zero.

> +    rc = xc_physdev_map_pirq(xen_xc, xen_domid, machine_irq, &pirq);

so perhaps this should be guarded by a check against devices that
have no legacy IRQs, like SR-IOV? Perhaps check against is_virtfn?

> +
> +    if (rc < 0) {
> +        PT_ERR(pcidev, "Mapping machine irq %u to pirq %i failed, (rc: 
> %d)\n",
> +               machine_irq, pirq, rc);
> +
> +        /* Disable PCI intx assertion (turn on bit10 of devctl) */
> +        host_pci_set_word(s->real_device,
> +                          PCI_COMMAND,
> +                          pci_get_word(s->dev.config + PCI_COMMAND)
> +                          | PCI_COMMAND_INTX_DISABLE);
> +        machine_irq = 0;
> +        s->machine_irq = 0;
> +    } else {
> +        machine_irq = pirq;
> +        s->machine_irq = pirq;
> +        mapped_machine_irq[machine_irq]++;
> +    }
> +
> +    /* bind machine_irq to device */
> +    if (rc < 0 && machine_irq != 0) {
> +        uint8_t e_device = PCI_SLOT(s->dev.devfn);
> +        uint8_t e_intx = pci_intx(s);
> +
> +        rc = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, machine_irq, 0,

What is the zero at the end for? Should it have a comment?

> +                                       e_device, e_intx);
> +        if (rc < 0) {
> +            PT_ERR(pcidev, "Binding of interrupt %i failed! (rc: %d)\n",
> +                   e_intx, rc);
> +
> +            /* Disable PCI intx assertion (turn on bit10 of devctl) */
> +            host_pci_set_word(s->real_device, PCI_COMMAND,
> +                              *(uint16_t *)(&s->dev.config[PCI_COMMAND])
> +                              | PCI_COMMAND_INTX_DISABLE);
> +            mapped_machine_irq[machine_irq]--;

Is this code reentrant? What if the user wants to pass in two USB
devices where both of them share the same IRQ, and it so happens that we
fail (perhaps the device is owned by another guest)? We get here, and
both of them decrement mapped_machine_irq, which means that
> +
> +            if (mapped_machine_irq[machine_irq] == 0) {

this might not be executed (as both of them read it as 1)?
> +                if (xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq)) {
> +                    PT_ERR(pcidev, "Unmapping of machine interrupt %i 
> failed!"
> +                           " (rc: %d)\n", machine_irq, rc);

Or is the code actually single-threaded so there is no danger of this?

> +                }
> +            }
> +            s->machine_irq = 0;
> +        }
> +    }
> +
> +out:
> +    PT_LOG(pcidev, "Real physical device %02x:%02x.%x registered 
> successfuly!"
> +           "\nIRQ type = %s\n", bus, slot, func, "INTx");
> +
> +    return 0;
> +}
> +
> +static int pt_unregister_device(PCIDevice *pcidev)
> +{
> +    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, 
> pcidev);
> +    uint8_t e_device, e_intx;
> +    uint8_t machine_irq;
> +    int rc;
> +
> +    /* Unbind interrupt */
> +    e_device = PCI_SLOT(s->dev.devfn);
> +    e_intx = pci_intx(s);
> +    machine_irq = s->machine_irq;
> +
> +    if (machine_irq) {
> +        rc = xc_domain_unbind_pt_irq(xen_xc, xen_domid, machine_irq,
> +                                     PT_IRQ_TYPE_PCI, 0, e_device, e_intx, 
> 0);

What are the two '0' for?
Can you provide a comment within the arguments?

> +        if (rc < 0) {
> +            PT_ERR(pcidev, "Unbinding of interrupt failed! rc=%d\n", rc);

Might want to mention which interrupt (or pirq?) failed.

> +        }
> +    }
> +
> +    if (machine_irq) {

Why not continue the logic within the previous block? As in, why
a second 'if (machine_irq)' here? You would still continue going
even if xc_domain_unbind_pt_irq failed.

> +        mapped_machine_irq[machine_irq]--;
> +
> +        if (mapped_machine_irq[machine_irq] == 0) {
> +            rc = xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq);
> +
> +            if (rc < 0) {
> +                PT_ERR(pcidev, "Unmaping of interrupt failed! rc=%d\n", rc);

And provide the interrupt (both the pirq and the Linux one).

You might want to mention in the error logs: "But bravely continuing on.."
> +            }
> +        }
> +    }
> +
> +    /* unregister real device's MMIO/PIO BARs */
> +    pt_unregister_regions(s);
> +
> +    host_pci_device_put(s->real_device);
> +
> +    return 0;
> +}
> +
> +static PCIDeviceInfo xen_pci_passthrough = {
> +    .init = pt_initfn,
> +    .exit = pt_unregister_device,
> +    .qdev.name = "xen-pci-passthrough",
> +    .qdev.desc = "Assign an host pci device with Xen",
> +    .qdev.size = sizeof(XenPCIPassthroughState),
> +    .config_read = pt_pci_read_config,
> +    .config_write = pt_pci_write_config,
> +    .is_express = 0,
> +    .qdev.props = (Property[]) {
> +        DEFINE_PROP_STRING("hostaddr", XenPCIPassthroughState, hostaddr),
> +        DEFINE_PROP_BIT("power-mgmt", XenPCIPassthroughState, power_mgmt,
> +                        0, false),
> +        DEFINE_PROP_END_OF_LIST(),
> +    }
> +};
> +
> +static void xen_passthrough_register(void)
> +{
> +    pci_qdev_register(&xen_pci_passthrough);
> +}
> +
> +device_init(xen_passthrough_register);
> diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
> new file mode 100644
> index 0000000..110325c
> --- /dev/null
> +++ b/hw/xen_pci_passthrough.h
> @@ -0,0 +1,282 @@
> +#ifndef QEMU_HW_XEN_PCI_PASSTHROUGH_H
> +#  define QEMU_HW_XEN_PCI_PASSTHROUGH_H
> +
> +#include "qemu-common.h"
> +#include "xen_common.h"
> +#include "pci.h"
> +#include "host-pci-device.h"
> +
> +/* #define PT_LOGGING_ENABLED */
> +/* #define PT_DEBUG_PCI_CONFIG_ACCESS */
> +
> +void pt_log(const PCIDevice *d, const char *f, ...) GCC_FMT_ATTR(2, 3);
> +
> +#define PT_ERR(d, _f, _a...)  pt_log(d, "%s: Error: " _f, __func__, ##_a)
> +
> +#ifdef PT_LOGGING_ENABLED
> +#  define PT_LOG(d, _f, _a...)  pt_log(d, "%s: " _f, __func__, ##_a)
> +#  define PT_WARN(d, _f, _a...) pt_log(d, "%s: Warning: " _f, __func__, ##_a)
> +#else
> +#  define PT_LOG(d, _f, _a...)
> +#  define PT_WARN(d, _f, _a...)
> +#endif
> +
> +#ifdef PT_DEBUG_PCI_CONFIG_ACCESS
> +#  define PT_LOG_CONFIG(d, addr, val, len) \
> +    pt_log(d, "%s: address=0x%04x val=0x%08x len=%d\n", \
> +           __func__, addr, val, len)
> +#else
> +#  define PT_LOG_CONFIG(d, addr, val, len)
> +#endif


Nice. Thanks!

> +
> +
> +typedef struct XenPTRegInfo XenPTRegInfo;
> +typedef struct XenPTReg XenPTReg;
> +
> +typedef struct XenPCIPassthroughState XenPCIPassthroughState;
> +
> +/* function type for config reg */
> +typedef int (*conf_reg_init)
> +    (XenPCIPassthroughState *, XenPTRegInfo *, uint32_t real_offset,
> +     uint32_t *data);
> +typedef int (*conf_dword_write)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
> +     uint32_t *val, uint32_t dev_value, uint32_t valid_mask);
> +typedef int (*conf_word_write)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
> +     uint16_t *val, uint16_t dev_value, uint16_t valid_mask);
> +typedef int (*conf_byte_write)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
> +     uint8_t *val, uint8_t dev_value, uint8_t valid_mask);
> +typedef int (*conf_dword_read)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
> +     uint32_t *val, uint32_t valid_mask);
> +typedef int (*conf_word_read)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
> +     uint16_t *val, uint16_t valid_mask);
> +typedef int (*conf_byte_read)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
> +     uint8_t *val, uint8_t valid_mask);
> +typedef int (*conf_dword_restore)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
> +     uint32_t dev_value, uint32_t *val);
> +typedef int (*conf_word_restore)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
> +     uint16_t dev_value, uint16_t *val);
> +typedef int (*conf_byte_restore)
> +    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
> +     uint8_t dev_value, uint8_t *val);
> +
> +/* power state transition */
> +#define PT_FLAG_TRANSITING  0x0001
> +
> +#define PT_BAR_ALLF         0xFFFFFFFF  /* BAR ALLF value */
> +#define PT_PCI_BAR_UNMAPPED (-1)
> +#define PT_UNASSIGNED_PIRQ (-1)
> +
> +
> +typedef enum {
> +    GRP_TYPE_HARDWIRED = 0,                     /* 0 Hardwired reg group */
> +    GRP_TYPE_EMU,                               /* emul reg group */
> +} RegisterGroupType;
> +
> +typedef enum {
> +    PT_BAR_FLAG_MEM = 0,                        /* Memory type BAR */
> +    PT_BAR_FLAG_IO,                             /* I/O type BAR */
> +    PT_BAR_FLAG_UPPER,                          /* upper 64bit BAR */
> +    PT_BAR_FLAG_UNUSED,                         /* unused BAR */
> +} PTBarFlag;
> +
> +
> +typedef struct XenPTRegion {
> +    /* Virtual phys base & size */
> +    uint32_t e_physbase;
> +    uint32_t e_size;
> +    /* Index of region in qemu */
> +    uint32_t memory_index;
> +    /* BAR flag */
> +    PTBarFlag bar_flag;
> +    /* Translation of the emulated address */
> +    union {
> +        uint64_t maddr;
> +        uint64_t pio_base;
> +        uint64_t u;
> +    } access;
> +} XenPTRegion;
> +
> +/* XenPTRegInfo declaration
> + * - only for emulated register (either a part or whole bit).
> + * - for passthrough register that need special behavior (like interacting 
> with
> + *   other component), set emu_mask to all 0 and specify r/w func properly.
> + * - do NOT use ALL F for init_val, otherwise the tbl will not be registered.
> + */
> +
> +/* emulated register infomation */
> +struct XenPTRegInfo {
> +    uint32_t offset;
> +    uint32_t size;
> +    uint32_t init_val;
> +    /* reg read only field mask (ON:RO/ROS, OFF:other) */
> +    uint32_t ro_mask;
> +    /* reg emulate field mask (ON:emu, OFF:passthrough) */
> +    uint32_t emu_mask;
> +    /* no write back allowed */
> +    uint32_t no_wb;
> +    conf_reg_init init;
> +    /* read/write/restore function pointer
> +     * for double_word/word/byte size */
> +    union {
> +        struct {
> +            conf_dword_write write;
> +            conf_dword_read read;
> +            conf_dword_restore restore;
> +        } dw;
> +        struct {
> +            conf_word_write write;
> +            conf_word_read read;
> +            conf_word_restore restore;
> +        } w;
> +        struct {
> +            conf_byte_write write;
> +            conf_byte_read read;
> +            conf_byte_restore restore;
> +        } b;
> +    } u;
> +};
> +
> +/* emulated register management */
> +struct XenPTReg {
> +    QLIST_ENTRY(XenPTReg) entries;
> +    XenPTRegInfo *reg;
> +    uint32_t data;
> +};
> +
> +typedef struct XenPTRegGroupInfo XenPTRegGroupInfo;
> +
> +/* emul reg group size initialize method */
> +typedef int (*pt_reg_size_init_fn)
> +    (XenPCIPassthroughState *, const XenPTRegGroupInfo *,
> +     uint32_t base_offset, uint8_t *size);
> +
> +/* emulated register group infomation */
> +struct XenPTRegGroupInfo {
> +    uint8_t grp_id;
> +    RegisterGroupType grp_type;
> +    uint8_t grp_size;
> +    pt_reg_size_init_fn size_init;
> +    XenPTRegInfo *emu_reg_tbl;
> +};
> +
> +/* emul register group management table */
> +typedef struct XenPTRegGroup {
> +    QLIST_ENTRY(XenPTRegGroup) entries;
> +    const XenPTRegGroupInfo *reg_grp;
> +    uint32_t base_offset;
> +    uint8_t size;
> +    QLIST_HEAD(, XenPTReg) reg_tbl_list;
> +} XenPTRegGroup;
> +
> +
> +typedef struct XenPTPM {
> +    QEMUTimer *pm_timer;  /* QEMUTimer struct */
> +    int no_soft_reset;    /* No Soft Reset flags */
> +    uint16_t flags;       /* power state transition flags */
> +    uint16_t pmc_field;   /* Power Management Capabilities field */
> +    int pm_delay;         /* power state transition delay */
> +    uint16_t cur_state;   /* current power state */
> +    uint16_t req_state;   /* requested power state */
> +    uint32_t pm_base;     /* Power Management Capability reg base offset */
> +    uint32_t aer_base;    /* AER Capability reg base offset */
> +} XenPTPM;
> +
> +struct XenPCIPassthroughState {
> +    PCIDevice dev;
> +
> +    char *hostaddr;
> +    bool is_virtfn;
> +    HostPCIDevice *real_device;
> +    XenPTRegion bases[PCI_NUM_REGIONS]; /* Access regions */
> +    QLIST_HEAD(, XenPTRegGroup) reg_grp_tbl;
> +
> +    uint32_t machine_irq;
> +
> +    uint32_t power_mgmt;
> +    XenPTPM *pm_state;
> +
> +    MemoryRegion bar[PCI_NUM_REGIONS - 1];
> +    MemoryRegion rom;
> +};
> +
> +int pt_config_init(XenPCIPassthroughState *s);
> +void pt_config_delete(XenPCIPassthroughState *s);
> +void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int 
> mem_enable);
> +void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
> +                        int io_enable, int mem_enable);
> +XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address);
> +XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address);
> +int pt_bar_offset_to_index(uint32_t offset);
> +
> +static inline pcibus_t pt_get_emul_size(PTBarFlag flag, pcibus_t r_size)
> +{
> +    /* align resource size (memory type only) */
> +    if (flag == PT_BAR_FLAG_MEM) {
> +        return (r_size + XC_PAGE_SIZE - 1) & XC_PAGE_MASK;
> +    } else {
> +        return r_size;
> +    }
> +}
> +
> +/* INTx */
> +/* The PCI Local Bus Specification, Rev. 3.0,
> + * Section 6.2.4 Miscellaneous Registers, pp 223
> + * outlines 5 valid values for the intertupt pin (intx).
> + *  0: For devices (or device functions) that don't use an interrupt in
> + *  1: INTA#
> + *  2: INTB#
> + *  3: INTC#
> + *  4: INTD#
> + *
> + * Xen uses the following 4 values for intx
> + *  0: INTA#
> + *  1: INTB#
> + *  2: INTC#
> + *  3: INTD#
> + *
> + * Observing that these list of values are not the same, pci_read_intx()
> + * uses the following mapping from hw to xen values.

Might want to add: "(from /sys/../config contents)."
> + * This seems to reflect the current usage within Xen.
> + *
> + * PCI hardware    | Xen | Notes
> + * 
> ----------------+-----+----------------------------------------------------
> + * 0               | 0   | No interrupt
> + * 1               | 0   | INTA#
> + * 2               | 1   | INTB#
> + * 3               | 2   | INTC#
> + * 4               | 3   | INTD#
> + * any other value | 0   | This should never happen, log error message
> + */
> +
> +static inline uint8_t pci_read_intx(XenPCIPassthroughState *s)
> +{
> +    uint8_t v = 0;
> +    host_pci_get_byte(s->real_device, PCI_INTERRUPT_PIN, &v);
> +    return v;
> +}
> +
> +static inline uint8_t pci_intx(XenPCIPassthroughState *s)
> +{
> +    uint8_t r_val = pci_read_intx(s);
> +
> +    PT_LOG(&s->dev, "intx=%i\n", r_val);
> +    if (r_val < 1 || r_val > 4) {
> +        PT_LOG(&s->dev, "Interrupt pin read from hardware is out of range:"
> +               " value=%i, acceptable range is 1 - 4\n", r_val);
> +        r_val = 0;
> +    } else {
> +        r_val -= 1;
> +    }
> +
> +    return r_val;
> +}
> +
> +#endif /* !QEMU_HW_XEN_PCI_PASSTHROUGH_H */
> diff --git a/hw/xen_pci_passthrough_config_init.c 
> b/hw/xen_pci_passthrough_config_init.c
> new file mode 100644
> index 0000000..1e9de64
> --- /dev/null
> +++ b/hw/xen_pci_passthrough_config_init.c
> @@ -0,0 +1,11 @@
> +#include "xen_pci_passthrough.h"
> +
> +XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address)
> +{
> +    return NULL;
> +}
> +
> +XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address)
> +{
> +    return NULL;
> +}
> diff --git a/xen-all.c b/xen-all.c
> index b5e28ab..0e3bbcf 100644
> --- a/xen-all.c
> +++ b/xen-all.c
> @@ -979,3 +979,15 @@ void destroy_hvm_domain(void)
>          xc_interface_close(xc_handle);
>      }
>  }
> +
> +void xen_shutdown_fatal_error(const char *fmt, ...)
> +{
> +    va_list ap;
> +
> +    va_start(ap, fmt);
> +    vfprintf(stderr, fmt, ap);
> +    va_end(ap);
> +    fprintf(stderr, "Will destroy the domain.\n");
> +    /* destroy the domain */
> +    qemu_system_shutdown_request();
> +}
> -- 


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.