
[xen staging] xen/pci: move x86 specific code to x86 directory



commit f7e77e55d33b7e52477d65f4c5e67281384b650f
Author:     Rahul Singh <rahul.singh@xxxxxxx>
AuthorDate: Fri Nov 27 18:06:04 2020 +0100
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Fri Nov 27 18:06:04 2020 +0100

    xen/pci: move x86 specific code to x86 directory
    
    The passthrough/pci.c file is common to all architectures, but it
    contains x86-specific code.

    Move the x86-specific code to drivers/passthrough/io.c to avoid
    compilation errors on other architectures.

    As drivers/passthrough/io.c is compiled only for x86, move it to the
    x86 directory and rename it to hvm.c.
    
    No functional change intended.
    
    Signed-off-by: Rahul Singh <rahul.singh@xxxxxxx>
    Reviewed-by: Stefano Stabellini <sstabellini@xxxxxxxxxx>
    Reviewed-by: Bertrand Marquis <bertrand.marquis@xxxxxxx>
    Acked-by: Jan Beulich <jbeulich@xxxxxxxx>
---
 xen/drivers/passthrough/Makefile     |    3 -
 xen/drivers/passthrough/io.c         | 1127 --------------------------------
 xen/drivers/passthrough/pci.c        |   68 +-
 xen/drivers/passthrough/x86/Makefile |    1 +
 xen/drivers/passthrough/x86/hvm.c    | 1193 ++++++++++++++++++++++++++++++++++
 xen/include/xen/pci.h                |    9 +
 6 files changed, 1204 insertions(+), 1197 deletions(-)
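
For context: the common pci_release_devices() now calls an
arch_pci_clean_pirqs() hook in place of the former pci_clean_dpci_irqs()
(see the pci.c hunk below).  The xen/include/xen/pci.h hunk is not shown
in this excerpt, so the following is only a sketch of what the hook's
declaration presumably looks like, inferred from that call site (takes a
struct domain pointer, returns int); the CONFIG_HVM guard and the inline
no-op fallback are assumptions, not the literal hunk:

    struct domain;

    #ifdef CONFIG_HVM
    /* Implemented in drivers/passthrough/x86/hvm.c (formerly io.c). */
    int arch_pci_clean_pirqs(struct domain *d);
    #else
    /* Assumed no-op fallback so the common call site in pci.c still builds. */
    static inline int arch_pci_clean_pirqs(struct domain *d)
    {
        return 0;
    }
    #endif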

diff --git a/xen/drivers/passthrough/Makefile b/xen/drivers/passthrough/Makefile
index e973e16c74..cc646612c7 100644
--- a/xen/drivers/passthrough/Makefile
+++ b/xen/drivers/passthrough/Makefile
@@ -6,6 +6,3 @@ obj-$(CONFIG_ARM) += arm/
 obj-y += iommu.o
 obj-$(CONFIG_HAS_PCI) += pci.o
 obj-$(CONFIG_HAS_DEVICE_TREE) += device_tree.o
-
-x86-$(CONFIG_HVM) := io.o
-obj-$(CONFIG_X86) += $(x86-y)
diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
deleted file mode 100644
index 6b1305a3e5..0000000000
--- a/xen/drivers/passthrough/io.c
+++ /dev/null
@@ -1,1127 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; If not, see <http://www.gnu.org/licenses/>.
- *
- * Copyright (C) Allen Kay <allen.m.kay@xxxxxxxxx>
- * Copyright (C) Xiaohui Xin <xiaohui.xin@xxxxxxxxx>
- */
-
-#include <xen/event.h>
-#include <xen/iommu.h>
-#include <xen/cpu.h>
-#include <xen/irq.h>
-#include <asm/hvm/irq.h>
-#include <asm/hvm/support.h>
-#include <asm/io_apic.h>
-
-static DEFINE_PER_CPU(struct list_head, dpci_list);
-
-/*
- * These two bit states help to safely schedule, deschedule, and wait until
- * the softirq has finished.
- *
- * The semantics behind these two bits is as follow:
- *  - STATE_SCHED - whoever modifies it has to ref-count the domain (->dom).
- *  - STATE_RUN - only softirq is allowed to set and clear it. If it has
- *      been set hvm_dirq_assist will RUN with a saved value of the
- *      'struct domain' copied from 'pirq_dpci->dom' before STATE_RUN was set.
- *
- * The usual states are: STATE_SCHED(set) -> STATE_RUN(set) ->
- * STATE_SCHED(unset) -> STATE_RUN(unset).
- *
- * However the states can also diverge such as: STATE_SCHED(set) ->
- * STATE_SCHED(unset) -> STATE_RUN(set) -> STATE_RUN(unset). That means
- * the 'hvm_dirq_assist' never run and that the softirq did not do any
- * ref-counting.
- */
-
-enum {
-    STATE_SCHED,
-    STATE_RUN
-};
-
-/*
- * This can be called multiple times, but the softirq is only raised once.
- * That is until the STATE_SCHED state has been cleared. The state can be
- * cleared by: the 'dpci_softirq' (when it has executed 'hvm_dirq_assist'),
- * or by 'pt_pirq_softirq_reset' (which will try to clear the state before
- * the softirq had a chance to run).
- */
-static void raise_softirq_for(struct hvm_pirq_dpci *pirq_dpci)
-{
-    unsigned long flags;
-
-    if ( test_and_set_bit(STATE_SCHED, &pirq_dpci->state) )
-        return;
-
-    get_knownalive_domain(pirq_dpci->dom);
-
-    local_irq_save(flags);
-    list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
-    local_irq_restore(flags);
-
-    raise_softirq(HVM_DPCI_SOFTIRQ);
-}
-
-/*
- * If we are racing with softirq_dpci (STATE_SCHED) we return
- * true. Otherwise we return false.
- *
- * If it is false, it is the callers responsibility to make sure
- * that the softirq (with the event_lock dropped) has ran.
- */
-bool pt_pirq_softirq_active(struct hvm_pirq_dpci *pirq_dpci)
-{
-    if ( pirq_dpci->state & ((1 << STATE_RUN) | (1 << STATE_SCHED)) )
-        return true;
-
-    /*
-     * If in the future we would call 'raise_softirq_for' right away
-     * after 'pt_pirq_softirq_active' we MUST reset the list (otherwise it
-     * might have stale data).
-     */
-    return false;
-}
-
-/*
- * Reset the pirq_dpci->dom parameter to NULL.
- *
- * This function checks the different states to make sure it can do it
- * at the right time. If it unschedules the 'hvm_dirq_assist' from running
- * it also refcounts (which is what the softirq would have done) properly.
- */
-static void pt_pirq_softirq_reset(struct hvm_pirq_dpci *pirq_dpci)
-{
-    struct domain *d = pirq_dpci->dom;
-
-    ASSERT(spin_is_locked(&d->event_lock));
-
-    switch ( cmpxchg(&pirq_dpci->state, 1 << STATE_SCHED, 0) )
-    {
-    case (1 << STATE_SCHED):
-        /*
-         * We are going to try to de-schedule the softirq before it goes in
-         * STATE_RUN. Whoever clears STATE_SCHED MUST refcount the 'dom'.
-         */
-        put_domain(d);
-        /* fallthrough. */
-    case (1 << STATE_RUN):
-    case (1 << STATE_RUN) | (1 << STATE_SCHED):
-        /*
-         * The reason it is OK to reset 'dom' when STATE_RUN bit is set is due
-         * to a shortcut the 'dpci_softirq' implements. It stashes the 'dom'
-         * in local variable before it sets STATE_RUN - and therefore will not
-         * dereference '->dom' which would crash.
-         */
-        pirq_dpci->dom = NULL;
-        break;
-    }
-    /*
-     * Inhibit 'hvm_dirq_assist' from doing anything useful and at worst
-     * calling 'set_timer' which will blow up (as we have called kill_timer
-     * or never initialized it). Note that we hold the lock that
-     * 'hvm_dirq_assist' could be spinning on.
-     */
-    pirq_dpci->masked = 0;
-}
-
-bool pt_irq_need_timer(uint32_t flags)
-{
-    return !(flags & (HVM_IRQ_DPCI_GUEST_MSI | HVM_IRQ_DPCI_TRANSLATE |
-                      HVM_IRQ_DPCI_NO_EOI));
-}
-
-static int pt_irq_guest_eoi(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
-                            void *arg)
-{
-    if ( __test_and_clear_bit(_HVM_IRQ_DPCI_EOI_LATCH_SHIFT,
-                              &pirq_dpci->flags) )
-    {
-        pirq_dpci->masked = 0;
-        pirq_dpci->pending = 0;
-        pirq_guest_eoi(dpci_pirq(pirq_dpci));
-    }
-
-    return 0;
-}
-
-static void pt_irq_time_out(void *data)
-{
-    struct hvm_pirq_dpci *irq_map = data;
-    const struct hvm_irq_dpci *dpci;
-    const struct dev_intx_gsi_link *digl;
-
-    spin_lock(&irq_map->dom->event_lock);
-
-    if ( irq_map->flags & HVM_IRQ_DPCI_IDENTITY_GSI )
-    {
-        ASSERT(is_hardware_domain(irq_map->dom));
-        /*
-         * Identity mapped, no need to iterate over the guest GSI list to find
-         * other pirqs sharing the same guest GSI.
-         *
-         * In the identity mapped case the EOI can also be done now, this way
-         * the iteration over the list of domain pirqs is avoided.
-         */
-        hvm_gsi_deassert(irq_map->dom, dpci_pirq(irq_map)->pirq);
-        irq_map->flags |= HVM_IRQ_DPCI_EOI_LATCH;
-        pt_irq_guest_eoi(irq_map->dom, irq_map, NULL);
-        spin_unlock(&irq_map->dom->event_lock);
-        return;
-    }
-
-    dpci = domain_get_irq_dpci(irq_map->dom);
-    if ( unlikely(!dpci) )
-    {
-        ASSERT_UNREACHABLE();
-        spin_unlock(&irq_map->dom->event_lock);
-        return;
-    }
-    list_for_each_entry ( digl, &irq_map->digl_list, list )
-    {
-        unsigned int guest_gsi = hvm_pci_intx_gsi(digl->device, digl->intx);
-        const struct hvm_girq_dpci_mapping *girq;
-
-        list_for_each_entry ( girq, &dpci->girq[guest_gsi], list )
-        {
-            struct pirq *pirq = pirq_info(irq_map->dom, girq->machine_gsi);
-
-            pirq_dpci(pirq)->flags |= HVM_IRQ_DPCI_EOI_LATCH;
-        }
-        hvm_pci_intx_deassert(irq_map->dom, digl->device, digl->intx);
-    }
-
-    pt_pirq_iterate(irq_map->dom, pt_irq_guest_eoi, NULL);
-
-    spin_unlock(&irq_map->dom->event_lock);
-}
-
-struct hvm_irq_dpci *domain_get_irq_dpci(const struct domain *d)
-{
-    if ( !d || !is_hvm_domain(d) )
-        return NULL;
-
-    return hvm_domain_irq(d)->dpci;
-}
-
-void free_hvm_irq_dpci(struct hvm_irq_dpci *dpci)
-{
-    xfree(dpci);
-}
-
-/*
- * This routine handles lowest-priority interrupts using vector-hashing
- * mechanism. As an example, modern Intel CPUs use this method to handle
- * lowest-priority interrupts.
- *
- * Here is the details about the vector-hashing mechanism:
- * 1. For lowest-priority interrupts, store all the possible destination
- *    vCPUs in an array.
- * 2. Use "gvec % max number of destination vCPUs" to find the right
- *    destination vCPU in the array for the lowest-priority interrupt.
- */
-static struct vcpu *vector_hashing_dest(const struct domain *d,
-                                        uint32_t dest_id,
-                                        bool dest_mode,
-                                        uint8_t gvec)
-
-{
-    unsigned long *dest_vcpu_bitmap;
-    unsigned int dest_vcpus = 0;
-    struct vcpu *v, *dest = NULL;
-    unsigned int i;
-
-    dest_vcpu_bitmap = xzalloc_array(unsigned long,
-                                     BITS_TO_LONGS(d->max_vcpus));
-    if ( !dest_vcpu_bitmap )
-        return NULL;
-
-    for_each_vcpu ( d, v )
-    {
-        if ( !vlapic_match_dest(vcpu_vlapic(v), NULL, APIC_DEST_NOSHORT,
-                                dest_id, dest_mode) )
-            continue;
-
-        __set_bit(v->vcpu_id, dest_vcpu_bitmap);
-        dest_vcpus++;
-    }
-
-    if ( dest_vcpus != 0 )
-    {
-        unsigned int mod = gvec % dest_vcpus;
-        unsigned int idx = 0;
-
-        for ( i = 0; i <= mod; i++ )
-        {
-            idx = find_next_bit(dest_vcpu_bitmap, d->max_vcpus, idx) + 1;
-            BUG_ON(idx > d->max_vcpus);
-        }
-
-        dest = d->vcpu[idx - 1];
-    }
-
-    xfree(dest_vcpu_bitmap);
-
-    return dest;
-}
-
-int pt_irq_create_bind(
-    struct domain *d, const struct xen_domctl_bind_pt_irq *pt_irq_bind)
-{
-    struct hvm_irq_dpci *hvm_irq_dpci;
-    struct hvm_pirq_dpci *pirq_dpci;
-    struct pirq *info;
-    int rc, pirq = pt_irq_bind->machine_irq;
-
-    if ( pirq < 0 || pirq >= d->nr_pirqs )
-        return -EINVAL;
-
- restart:
-    spin_lock(&d->event_lock);
-
-    hvm_irq_dpci = domain_get_irq_dpci(d);
-    if ( !hvm_irq_dpci && !is_hardware_domain(d) )
-    {
-        unsigned int i;
-
-        /*
-         * NB: the hardware domain doesn't use a hvm_irq_dpci struct because
-         * it's only allowed to identity map GSIs, and so the data contained in
-         * that struct (used to map guest GSIs into machine GSIs and perform
-         * interrupt routing) is completely useless to it.
-         */
-        hvm_irq_dpci = xzalloc(struct hvm_irq_dpci);
-        if ( hvm_irq_dpci == NULL )
-        {
-            spin_unlock(&d->event_lock);
-            return -ENOMEM;
-        }
-        for ( i = 0; i < NR_HVM_DOMU_IRQS; i++ )
-            INIT_LIST_HEAD(&hvm_irq_dpci->girq[i]);
-
-        hvm_domain_irq(d)->dpci = hvm_irq_dpci;
-    }
-
-    info = pirq_get_info(d, pirq);
-    if ( !info )
-    {
-        spin_unlock(&d->event_lock);
-        return -ENOMEM;
-    }
-    pirq_dpci = pirq_dpci(info);
-
-    /*
-     * A crude 'while' loop with us dropping the spinlock and giving
-     * the softirq_dpci a chance to run.
-     * We MUST check for this condition as the softirq could be scheduled
-     * and hasn't run yet. Note that this code replaced tasklet_kill which
-     * would have spun forever and would do the same thing (wait to flush out
-     * outstanding hvm_dirq_assist calls.
-     */
-    if ( pt_pirq_softirq_active(pirq_dpci) )
-    {
-        spin_unlock(&d->event_lock);
-        cpu_relax();
-        goto restart;
-    }
-
-    switch ( pt_irq_bind->irq_type )
-    {
-    case PT_IRQ_TYPE_MSI:
-    {
-        uint8_t dest, delivery_mode;
-        bool dest_mode;
-        int dest_vcpu_id;
-        const struct vcpu *vcpu;
-        uint32_t gflags = pt_irq_bind->u.msi.gflags &
-                          ~XEN_DOMCTL_VMSI_X86_UNMASKED;
-
-        if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
-        {
-            pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED | HVM_IRQ_DPCI_MACH_MSI |
-                               HVM_IRQ_DPCI_GUEST_MSI;
-            pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec;
-            pirq_dpci->gmsi.gflags = gflags;
-            /*
-             * 'pt_irq_create_bind' can be called after 'pt_irq_destroy_bind'.
-             * The 'pirq_cleanup_check' which would free the structure is only
-             * called if the event channel for the PIRQ is active. However
-             * OS-es that use event channels usually bind PIRQs to eventds
-             * and unbind them before calling 'pt_irq_destroy_bind' - with the
-             * result that we re-use the 'dpci' structure. This can be
-             * reproduced with unloading and loading the driver for a device.
-             *
-             * As such on every 'pt_irq_create_bind' call we MUST set it.
-             */
-            pirq_dpci->dom = d;
-            /* bind after hvm_irq_dpci is setup to avoid race with irq handler*/
-            rc = pirq_guest_bind(d->vcpu[0], info, 0);
-            if ( rc == 0 && pt_irq_bind->u.msi.gtable )
-            {
-                rc = msixtbl_pt_register(d, info, pt_irq_bind->u.msi.gtable);
-                if ( unlikely(rc) )
-                {
-                    pirq_guest_unbind(d, info);
-                    /*
-                     * Between 'pirq_guest_bind' and before 'pirq_guest_unbind'
-                     * an interrupt can be scheduled. No more of them are going
-                     * to be scheduled but we must deal with the one that may be
-                     * in the queue.
-                     */
-                    pt_pirq_softirq_reset(pirq_dpci);
-                }
-            }
-            if ( unlikely(rc) )
-            {
-                pirq_dpci->gmsi.gflags = 0;
-                pirq_dpci->gmsi.gvec = 0;
-                pirq_dpci->dom = NULL;
-                pirq_dpci->flags = 0;
-                pirq_cleanup_check(info, d);
-                spin_unlock(&d->event_lock);
-                return rc;
-            }
-        }
-        else
-        {
-            uint32_t mask = HVM_IRQ_DPCI_MACH_MSI | HVM_IRQ_DPCI_GUEST_MSI;
-
-            if ( (pirq_dpci->flags & mask) != mask )
-            {
-                spin_unlock(&d->event_lock);
-                return -EBUSY;
-            }
-
-            /* If pirq is already mapped as vmsi, update guest data/addr. */
-            if ( pirq_dpci->gmsi.gvec != pt_irq_bind->u.msi.gvec ||
-                 pirq_dpci->gmsi.gflags != gflags )
-            {
-                /* Directly clear pending EOIs before enabling new MSI info. */
-                pirq_guest_eoi(info);
-
-                pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec;
-                pirq_dpci->gmsi.gflags = gflags;
-            }
-        }
-        /* Calculate dest_vcpu_id for MSI-type pirq migration. */
-        dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
-                         XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
-        dest_mode = pirq_dpci->gmsi.gflags & XEN_DOMCTL_VMSI_X86_DM_MASK;
-        delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
-                                  XEN_DOMCTL_VMSI_X86_DELIV_MASK);
-
-        dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode);
-        pirq_dpci->gmsi.dest_vcpu_id = dest_vcpu_id;
-        spin_unlock(&d->event_lock);
-
-        pirq_dpci->gmsi.posted = false;
-        vcpu = (dest_vcpu_id >= 0) ? d->vcpu[dest_vcpu_id] : NULL;
-        if ( iommu_intpost )
-        {
-            if ( delivery_mode == dest_LowestPrio )
-                vcpu = vector_hashing_dest(d, dest, dest_mode,
-                                           pirq_dpci->gmsi.gvec);
-            if ( vcpu )
-                pirq_dpci->gmsi.posted = true;
-        }
-        if ( vcpu && is_iommu_enabled(d) )
-            hvm_migrate_pirq(pirq_dpci, vcpu);
-
-        /* Use interrupt posting if it is supported. */
-        if ( iommu_intpost )
-            pi_update_irte(vcpu ? &vcpu->arch.hvm.vmx.pi_desc : NULL,
-                           info, pirq_dpci->gmsi.gvec);
-
-        if ( pt_irq_bind->u.msi.gflags & XEN_DOMCTL_VMSI_X86_UNMASKED )
-        {
-            unsigned long flags;
-            struct irq_desc *desc = pirq_spin_lock_irq_desc(info, &flags);
-
-            if ( !desc )
-            {
-                pt_irq_destroy_bind(d, pt_irq_bind);
-                return -EINVAL;
-            }
-
-            guest_mask_msi_irq(desc, false);
-            spin_unlock_irqrestore(&desc->lock, flags);
-        }
-
-        break;
-    }
-
-    case PT_IRQ_TYPE_PCI:
-    case PT_IRQ_TYPE_MSI_TRANSLATE:
-    {
-        struct dev_intx_gsi_link *digl = NULL;
-        struct hvm_girq_dpci_mapping *girq = NULL;
-        unsigned int guest_gsi;
-
-        /*
-         * Mapping GSIs for the hardware domain is different than doing it for
-         * an unpriviledged guest, the hardware domain is only allowed to
-         * identity map GSIs, and as such all the data in the u.pci union is
-         * discarded.
-         */
-        if ( hvm_irq_dpci )
-        {
-            unsigned int link;
-
-            digl = xmalloc(struct dev_intx_gsi_link);
-            girq = xmalloc(struct hvm_girq_dpci_mapping);
-
-            if ( !digl || !girq )
-            {
-                spin_unlock(&d->event_lock);
-                xfree(girq);
-                xfree(digl);
-                return -ENOMEM;
-            }
-
-            girq->bus = digl->bus = pt_irq_bind->u.pci.bus;
-            girq->device = digl->device = pt_irq_bind->u.pci.device;
-            girq->intx = digl->intx = pt_irq_bind->u.pci.intx;
-            list_add_tail(&digl->list, &pirq_dpci->digl_list);
-
-            guest_gsi = hvm_pci_intx_gsi(digl->device, digl->intx);
-            link = hvm_pci_intx_link(digl->device, digl->intx);
-
-            hvm_irq_dpci->link_cnt[link]++;
-
-            girq->machine_gsi = pirq;
-            list_add_tail(&girq->list, &hvm_irq_dpci->girq[guest_gsi]);
-        }
-        else
-        {
-            ASSERT(is_hardware_domain(d));
-
-            /* MSI_TRANSLATE is not supported for the hardware domain. */
-            if ( pt_irq_bind->irq_type != PT_IRQ_TYPE_PCI ||
-                 pirq >= hvm_domain_irq(d)->nr_gsis )
-            {
-                spin_unlock(&d->event_lock);
-
-                return -EINVAL;
-            }
-            guest_gsi = pirq;
-        }
-
-        /* Bind the same mirq once in the same domain */
-        if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
-        {
-            unsigned int share;
-
-            /* MUST be set, as the pirq_dpci can be re-used. */
-            pirq_dpci->dom = d;
-            if ( pt_irq_bind->irq_type == PT_IRQ_TYPE_MSI_TRANSLATE )
-            {
-                pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED |
-                                   HVM_IRQ_DPCI_MACH_MSI |
-                                   HVM_IRQ_DPCI_GUEST_PCI |
-                                   HVM_IRQ_DPCI_TRANSLATE;
-                share = 0;
-            }
-            else    /* PT_IRQ_TYPE_PCI */
-            {
-                pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED |
-                                   HVM_IRQ_DPCI_MACH_PCI |
-                                   HVM_IRQ_DPCI_GUEST_PCI;
-                if ( !is_hardware_domain(d) )
-                    share = BIND_PIRQ__WILL_SHARE;
-                else
-                {
-                    int mask = vioapic_get_mask(d, guest_gsi);
-                    int trigger_mode = vioapic_get_trigger_mode(d, guest_gsi);
-
-                    if ( mask < 0 || trigger_mode < 0 )
-                    {
-                        spin_unlock(&d->event_lock);
-
-                        ASSERT_UNREACHABLE();
-                        return -EINVAL;
-                    }
-                    pirq_dpci->flags |= HVM_IRQ_DPCI_IDENTITY_GSI;
-                    /*
-                     * Check if the corresponding vIO APIC pin is configured
-                     * level or edge trigger, level triggered interrupts will
-                     * be marked as shareable.
-                     */
-                    ASSERT(!mask);
-                    share = trigger_mode;
-                    if ( trigger_mode == VIOAPIC_EDGE_TRIG )
-                        /*
-                         * Edge IO-APIC interrupt, no EOI or unmask to perform
-                         * and hence no timer needed.
-                         */
-                        pirq_dpci->flags |= HVM_IRQ_DPCI_NO_EOI;
-                }
-            }
-
-            /* Init timer before binding */
-            if ( pt_irq_need_timer(pirq_dpci->flags) )
-                init_timer(&pirq_dpci->timer, pt_irq_time_out, pirq_dpci, 0);
-            /* Deal with gsi for legacy devices */
-            rc = pirq_guest_bind(d->vcpu[0], info, share);
-            if ( unlikely(rc) )
-            {
-                if ( pt_irq_need_timer(pirq_dpci->flags) )
-                    kill_timer(&pirq_dpci->timer);
-                /*
-                 * There is no path for __do_IRQ to schedule softirq as
-                 * IRQ_GUEST is not set. As such we can reset 'dom' directly.
-                 */
-                pirq_dpci->dom = NULL;
-                if ( hvm_irq_dpci )
-                {
-                    unsigned int link;
-
-                    ASSERT(girq && digl);
-                    list_del(&girq->list);
-                    list_del(&digl->list);
-                    link = hvm_pci_intx_link(digl->device, digl->intx);
-                    hvm_irq_dpci->link_cnt[link]--;
-                }
-                pirq_dpci->flags = 0;
-                pirq_cleanup_check(info, d);
-                spin_unlock(&d->event_lock);
-                xfree(girq);
-                xfree(digl);
-                return rc;
-            }
-        }
-
-        spin_unlock(&d->event_lock);
-
-        if ( iommu_verbose )
-        {
-            char buf[24] = "";
-
-            if ( digl )
-                snprintf(buf, ARRAY_SIZE(buf), " dev=%02x.%02x.%u intx=%u",
-                         digl->bus, PCI_SLOT(digl->device),
-                         PCI_FUNC(digl->device), digl->intx);
-
-            printk(XENLOG_G_INFO "d%d: bind: m_gsi=%u g_gsi=%u%s\n",
-                   d->domain_id, pirq, guest_gsi, buf);
-        }
-        break;
-    }
-
-    default:
-        spin_unlock(&d->event_lock);
-        return -EOPNOTSUPP;
-    }
-
-    return 0;
-}
-
-int pt_irq_destroy_bind(
-    struct domain *d, const struct xen_domctl_bind_pt_irq *pt_irq_bind)
-{
-    struct hvm_irq_dpci *hvm_irq_dpci;
-    struct hvm_pirq_dpci *pirq_dpci;
-    unsigned int machine_gsi = pt_irq_bind->machine_irq;
-    struct pirq *pirq;
-    const char *what = NULL;
-
-    switch ( pt_irq_bind->irq_type )
-    {
-    case PT_IRQ_TYPE_PCI:
-    case PT_IRQ_TYPE_MSI_TRANSLATE:
-        if ( iommu_verbose )
-        {
-            unsigned int device = pt_irq_bind->u.pci.device;
-            unsigned int intx = pt_irq_bind->u.pci.intx;
-
-            printk(XENLOG_G_INFO
-                   "d%d: unbind: m_gsi=%u g_gsi=%u dev=%02x:%02x.%u intx=%u\n",
-                   d->domain_id, machine_gsi, hvm_pci_intx_gsi(device, intx),
-                   pt_irq_bind->u.pci.bus,
-                   PCI_SLOT(device), PCI_FUNC(device), intx);
-        }
-        break;
-    case PT_IRQ_TYPE_MSI:
-    {
-        unsigned long flags;
-        struct irq_desc *desc = domain_spin_lock_irq_desc(d, machine_gsi,
-                                                          &flags);
-
-        if ( !desc )
-            return -EINVAL;
-        /*
-         * Leave the MSI masked, so that the state when calling
-         * pt_irq_create_bind is consistent across bind/unbinds.
-         */
-        guest_mask_msi_irq(desc, true);
-        spin_unlock_irqrestore(&desc->lock, flags);
-        break;
-    }
-
-    default:
-        return -EOPNOTSUPP;
-    }
-
-    spin_lock(&d->event_lock);
-
-    hvm_irq_dpci = domain_get_irq_dpci(d);
-
-    if ( !hvm_irq_dpci && !is_hardware_domain(d) )
-    {
-        spin_unlock(&d->event_lock);
-        return -EINVAL;
-    }
-
-    pirq = pirq_info(d, machine_gsi);
-    pirq_dpci = pirq_dpci(pirq);
-
-    if ( hvm_irq_dpci && pt_irq_bind->irq_type != PT_IRQ_TYPE_MSI )
-    {
-        unsigned int bus = pt_irq_bind->u.pci.bus;
-        unsigned int device = pt_irq_bind->u.pci.device;
-        unsigned int intx = pt_irq_bind->u.pci.intx;
-        unsigned int guest_gsi = hvm_pci_intx_gsi(device, intx);
-        unsigned int link = hvm_pci_intx_link(device, intx);
-        struct hvm_girq_dpci_mapping *girq;
-        struct dev_intx_gsi_link *digl, *tmp;
-
-        list_for_each_entry ( girq, &hvm_irq_dpci->girq[guest_gsi], list )
-        {
-            if ( girq->bus         == bus &&
-                 girq->device      == device &&
-                 girq->intx        == intx &&
-                 girq->machine_gsi == machine_gsi )
-            {
-                list_del(&girq->list);
-                xfree(girq);
-                girq = NULL;
-                break;
-            }
-        }
-
-        if ( girq )
-        {
-            spin_unlock(&d->event_lock);
-            return -EINVAL;
-        }
-
-        hvm_irq_dpci->link_cnt[link]--;
-
-        /* clear the mirq info */
-        if ( pirq_dpci && (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
-        {
-            list_for_each_entry_safe ( digl, tmp, &pirq_dpci->digl_list, list )
-            {
-                if ( digl->bus    == bus &&
-                     digl->device == device &&
-                     digl->intx   == intx )
-                {
-                    list_del(&digl->list);
-                    xfree(digl);
-                }
-            }
-            what = list_empty(&pirq_dpci->digl_list) ? "final" : "partial";
-        }
-        else
-            what = "bogus";
-    }
-    else if ( pirq_dpci && pirq_dpci->gmsi.posted )
-        pi_update_irte(NULL, pirq, 0);
-
-    if ( pirq_dpci && (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) &&
-         list_empty(&pirq_dpci->digl_list) )
-    {
-        pirq_guest_unbind(d, pirq);
-        msixtbl_pt_unregister(d, pirq);
-        if ( pt_irq_need_timer(pirq_dpci->flags) )
-            kill_timer(&pirq_dpci->timer);
-        pirq_dpci->flags = 0;
-        /*
-         * See comment in pt_irq_create_bind's PT_IRQ_TYPE_MSI before the
-         * call to pt_pirq_softirq_reset.
-         */
-        pt_pirq_softirq_reset(pirq_dpci);
-
-        pirq_cleanup_check(pirq, d);
-    }
-
-    spin_unlock(&d->event_lock);
-
-    if ( what && iommu_verbose )
-    {
-        unsigned int device = pt_irq_bind->u.pci.device;
-        char buf[24] = "";
-
-        if ( hvm_irq_dpci )
-            snprintf(buf, ARRAY_SIZE(buf), " dev=%02x.%02x.%u intx=%u",
-                     pt_irq_bind->u.pci.bus, PCI_SLOT(device),
-                     PCI_FUNC(device), pt_irq_bind->u.pci.intx);
-
-        printk(XENLOG_G_INFO "d%d %s unmap: m_irq=%u%s\n",
-               d->domain_id, what, machine_gsi, buf);
-    }
-
-    return 0;
-}
-
-void pt_pirq_init(struct domain *d, struct hvm_pirq_dpci *dpci)
-{
-    INIT_LIST_HEAD(&dpci->digl_list);
-    dpci->gmsi.dest_vcpu_id = -1;
-}
-
-bool pt_pirq_cleanup_check(struct hvm_pirq_dpci *dpci)
-{
-    if ( !dpci->flags && !pt_pirq_softirq_active(dpci) )
-    {
-        dpci->dom = NULL;
-        return true;
-    }
-    return false;
-}
-
-int pt_pirq_iterate(struct domain *d,
-                    int (*cb)(struct domain *,
-                              struct hvm_pirq_dpci *, void *),
-                    void *arg)
-{
-    int rc = 0;
-    unsigned int pirq = 0, n, i;
-    struct pirq *pirqs[8];
-
-    ASSERT(spin_is_locked(&d->event_lock));
-
-    do {
-        n = radix_tree_gang_lookup(&d->pirq_tree, (void **)pirqs, pirq,
-                                   ARRAY_SIZE(pirqs));
-        for ( i = 0; i < n; ++i )
-        {
-            struct hvm_pirq_dpci *pirq_dpci = pirq_dpci(pirqs[i]);
-
-            pirq = pirqs[i]->pirq;
-            if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
-                rc = cb(d, pirq_dpci, arg);
-        }
-    } while ( !rc && ++pirq < d->nr_pirqs && n == ARRAY_SIZE(pirqs) );
-
-    return rc;
-}
-
-int hvm_do_IRQ_dpci(struct domain *d, struct pirq *pirq)
-{
-    struct hvm_irq_dpci *dpci = domain_get_irq_dpci(d);
-    struct hvm_pirq_dpci *pirq_dpci = pirq_dpci(pirq);
-
-    ASSERT(is_hvm_domain(d));
-
-    if ( !is_iommu_enabled(d) || (!is_hardware_domain(d) && !dpci) ||
-         !pirq_dpci || !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
-        return 0;
-
-    pirq_dpci->masked = 1;
-    raise_softirq_for(pirq_dpci);
-    return 1;
-}
-
-/* called with d->event_lock held */
-static void __msi_pirq_eoi(struct hvm_pirq_dpci *pirq_dpci)
-{
-    irq_desc_t *desc;
-
-    if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) &&
-         (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) )
-    {
-        struct pirq *pirq = dpci_pirq(pirq_dpci);
-
-        BUG_ON(!local_irq_is_enabled());
-        desc = pirq_spin_lock_irq_desc(pirq, NULL);
-        if ( !desc )
-            return;
-        desc_guest_eoi(desc, pirq);
-    }
-}
-
-static int _hvm_dpci_msi_eoi(struct domain *d,
-                             struct hvm_pirq_dpci *pirq_dpci, void *arg)
-{
-    int vector = (long)arg;
-
-    if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) &&
-         (pirq_dpci->gmsi.gvec == vector) )
-    {
-        unsigned int dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
-                                      XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
-        bool dest_mode = pirq_dpci->gmsi.gflags & XEN_DOMCTL_VMSI_X86_DM_MASK;
-
-        if ( vlapic_match_dest(vcpu_vlapic(current), NULL, 0, dest,
-                               dest_mode) )
-        {
-            __msi_pirq_eoi(pirq_dpci);
-            return 1;
-        }
-    }
-
-    return 0;
-}
-
-void hvm_dpci_msi_eoi(struct domain *d, int vector)
-{
-    if ( !is_iommu_enabled(d) ||
-         (!hvm_domain_irq(d)->dpci && !is_hardware_domain(d)) )
-       return;
-
-    spin_lock(&d->event_lock);
-    pt_pirq_iterate(d, _hvm_dpci_msi_eoi, (void *)(long)vector);
-    spin_unlock(&d->event_lock);
-}
-
-static void hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci)
-{
-    if ( unlikely(!hvm_domain_irq(d)->dpci) && !is_hardware_domain(d) )
-    {
-        ASSERT_UNREACHABLE();
-        return;
-    }
-
-    spin_lock(&d->event_lock);
-    if ( test_and_clear_bool(pirq_dpci->masked) )
-    {
-        struct pirq *pirq = dpci_pirq(pirq_dpci);
-        const struct dev_intx_gsi_link *digl;
-
-        if ( hvm_domain_use_pirq(d, pirq) )
-        {
-            send_guest_pirq(d, pirq);
-
-            if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
-                goto out;
-        }
-
-        if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
-        {
-            vmsi_deliver_pirq(d, pirq_dpci);
-            goto out;
-        }
-
-        list_for_each_entry ( digl, &pirq_dpci->digl_list, list )
-        {
-            ASSERT(!(pirq_dpci->flags & HVM_IRQ_DPCI_IDENTITY_GSI));
-            hvm_pci_intx_assert(d, digl->device, digl->intx);
-            pirq_dpci->pending++;
-        }
-
-        if ( pirq_dpci->flags & HVM_IRQ_DPCI_IDENTITY_GSI )
-        {
-            hvm_gsi_assert(d, pirq->pirq);
-            if ( pirq_dpci->flags & HVM_IRQ_DPCI_NO_EOI )
-                goto out;
-            pirq_dpci->pending++;
-        }
-
-        if ( pirq_dpci->flags & HVM_IRQ_DPCI_TRANSLATE )
-        {
-            /* for translated MSI to INTx interrupt, eoi as early as possible */
-            __msi_pirq_eoi(pirq_dpci);
-            goto out;
-        }
-
-        /*
-         * Set a timer to see if the guest can finish the interrupt or not. For
-         * example, the guest OS may unmask the PIC during boot, before the
-         * guest driver is loaded. hvm_pci_intx_assert() may succeed, but the
-         * guest will never deal with the irq, then the physical interrupt line
-         * will never be deasserted.
-         */
-        ASSERT(pt_irq_need_timer(pirq_dpci->flags));
-        set_timer(&pirq_dpci->timer, NOW() + PT_IRQ_TIME_OUT);
-    }
-
- out:
-    spin_unlock(&d->event_lock);
-}
-
-static void hvm_pirq_eoi(struct pirq *pirq,
-                         const union vioapic_redir_entry *ent)
-{
-    struct hvm_pirq_dpci *pirq_dpci;
-
-    if ( !pirq )
-    {
-        ASSERT_UNREACHABLE();
-        return;
-    }
-
-    pirq_dpci = pirq_dpci(pirq);
-
-    /*
-     * No need to get vector lock for timer
-     * since interrupt is still not EOIed
-     */
-    if ( --pirq_dpci->pending ||
-         (ent && ent->fields.mask) ||
-         !pt_irq_need_timer(pirq_dpci->flags) )
-        return;
-
-    stop_timer(&pirq_dpci->timer);
-    pirq_guest_eoi(pirq);
-}
-
-static void __hvm_dpci_eoi(struct domain *d,
-                           const struct hvm_girq_dpci_mapping *girq,
-                           const union vioapic_redir_entry *ent)
-{
-    struct pirq *pirq = pirq_info(d, girq->machine_gsi);
-
-    if ( !hvm_domain_use_pirq(d, pirq) )
-        hvm_pci_intx_deassert(d, girq->device, girq->intx);
-
-    hvm_pirq_eoi(pirq, ent);
-}
-
-static void hvm_gsi_eoi(struct domain *d, unsigned int gsi,
-                        const union vioapic_redir_entry *ent)
-{
-    struct pirq *pirq = pirq_info(d, gsi);
-
-    /* Check if GSI is actually mapped. */
-    if ( !pirq_dpci(pirq) )
-        return;
-
-    hvm_gsi_deassert(d, gsi);
-    hvm_pirq_eoi(pirq, ent);
-}
-
-void hvm_dpci_eoi(struct domain *d, unsigned int guest_gsi,
-                  const union vioapic_redir_entry *ent)
-{
-    const struct hvm_irq_dpci *hvm_irq_dpci;
-    const struct hvm_girq_dpci_mapping *girq;
-
-    if ( !is_iommu_enabled(d) )
-        return;
-
-    if ( is_hardware_domain(d) )
-    {
-        spin_lock(&d->event_lock);
-        hvm_gsi_eoi(d, guest_gsi, ent);
-        goto unlock;
-    }
-
-    if ( guest_gsi < NR_ISAIRQS )
-    {
-        hvm_dpci_isairq_eoi(d, guest_gsi);
-        return;
-    }
-
-    spin_lock(&d->event_lock);
-    hvm_irq_dpci = domain_get_irq_dpci(d);
-
-    if ( !hvm_irq_dpci )
-        goto unlock;
-
-    list_for_each_entry ( girq, &hvm_irq_dpci->girq[guest_gsi], list )
-        __hvm_dpci_eoi(d, girq, ent);
-
-unlock:
-    spin_unlock(&d->event_lock);
-}
-
-/*
- * Note: 'pt_pirq_softirq_reset' can clear the STATE_SCHED before we get to
- * doing it. If that is the case we let 'pt_pirq_softirq_reset' do ref-counting.
- */
-static void dpci_softirq(void)
-{
-    unsigned int cpu = smp_processor_id();
-    LIST_HEAD(our_list);
-
-    local_irq_disable();
-    list_splice_init(&per_cpu(dpci_list, cpu), &our_list);
-    local_irq_enable();
-
-    while ( !list_empty(&our_list) )
-    {
-        struct hvm_pirq_dpci *pirq_dpci;
-        struct domain *d;
-
-        pirq_dpci = list_entry(our_list.next, struct hvm_pirq_dpci, softirq_list);
-        list_del(&pirq_dpci->softirq_list);
-
-        d = pirq_dpci->dom;
-        smp_mb(); /* 'd' MUST be saved before we set/clear the bits. */
-        if ( test_and_set_bit(STATE_RUN, &pirq_dpci->state) )
-        {
-            unsigned long flags;
-
-            /* Put back on the list and retry. */
-            local_irq_save(flags);
-            list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
-            local_irq_restore(flags);
-
-            raise_softirq(HVM_DPCI_SOFTIRQ);
-            continue;
-        }
-        /*
-         * The one who clears STATE_SCHED MUST refcount the domain.
-         */
-        if ( test_and_clear_bit(STATE_SCHED, &pirq_dpci->state) )
-        {
-            hvm_dirq_assist(d, pirq_dpci);
-            put_domain(d);
-        }
-        clear_bit(STATE_RUN, &pirq_dpci->state);
-    }
-}
-
-static int cpu_callback(
-    struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-    unsigned int cpu = (unsigned long)hcpu;
-
-    switch ( action )
-    {
-    case CPU_UP_PREPARE:
-        INIT_LIST_HEAD(&per_cpu(dpci_list, cpu));
-        break;
-    case CPU_UP_CANCELED:
-    case CPU_DEAD:
-        /*
-         * On CPU_DYING this callback is called (on the CPU that is dying)
-         * with an possible HVM_DPIC_SOFTIRQ pending - at which point we can
-         * clear out any outstanding domains (by the virtue of the idle loop
-         * calling the softirq later). In CPU_DEAD case the CPU is deaf and
-         * there are no pending softirqs for us to handle so we can chill.
-         */
-        ASSERT(list_empty(&per_cpu(dpci_list, cpu)));
-        break;
-    }
-
-    return NOTIFY_DONE;
-}
-
-static struct notifier_block cpu_nfb = {
-    .notifier_call = cpu_callback,
-};
-
-static int __init setup_dpci_softirq(void)
-{
-    unsigned int cpu;
-
-    for_each_online_cpu(cpu)
-        INIT_LIST_HEAD(&per_cpu(dpci_list, cpu));
-
-    open_softirq(HVM_DPCI_SOFTIRQ, dpci_softirq);
-    register_cpu_notifier(&cpu_nfb);
-    return 0;
-}
-__initcall(setup_dpci_softirq);
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 51e584127e..ab590ca398 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -24,7 +24,6 @@
 #include <xen/irq.h>
 #include <xen/param.h>
 #include <xen/vm_event.h>
-#include <asm/hvm/irq.h>
 #include <xen/delay.h>
 #include <xen/keyhandler.h>
 #include <xen/event.h>
@@ -842,71 +841,6 @@ int pci_remove_device(u16 seg, u8 bus, u8 devfn)
     return ret;
 }
 
-static int pci_clean_dpci_irq(struct domain *d,
-                              struct hvm_pirq_dpci *pirq_dpci, void *arg)
-{
-    struct dev_intx_gsi_link *digl, *tmp;
-
-    pirq_guest_unbind(d, dpci_pirq(pirq_dpci));
-
-    if ( pt_irq_need_timer(pirq_dpci->flags) )
-        kill_timer(&pirq_dpci->timer);
-
-    list_for_each_entry_safe ( digl, tmp, &pirq_dpci->digl_list, list )
-    {
-        list_del(&digl->list);
-        xfree(digl);
-    }
-
-    radix_tree_delete(&d->pirq_tree, dpci_pirq(pirq_dpci)->pirq);
-
-    if ( !pt_pirq_softirq_active(pirq_dpci) )
-        return 0;
-
-    domain_get_irq_dpci(d)->pending_pirq_dpci = pirq_dpci;
-
-    return -ERESTART;
-}
-
-static int pci_clean_dpci_irqs(struct domain *d)
-{
-    struct hvm_irq_dpci *hvm_irq_dpci = NULL;
-
-    if ( !is_iommu_enabled(d) )
-        return 0;
-
-    if ( !is_hvm_domain(d) )
-        return 0;
-
-    spin_lock(&d->event_lock);
-    hvm_irq_dpci = domain_get_irq_dpci(d);
-    if ( hvm_irq_dpci != NULL )
-    {
-        int ret = 0;
-
-        if ( hvm_irq_dpci->pending_pirq_dpci )
-        {
-            if ( pt_pirq_softirq_active(hvm_irq_dpci->pending_pirq_dpci) )
-                 ret = -ERESTART;
-            else
-                 hvm_irq_dpci->pending_pirq_dpci = NULL;
-        }
-
-        if ( !ret )
-            ret = pt_pirq_iterate(d, pci_clean_dpci_irq, NULL);
-        if ( ret )
-        {
-            spin_unlock(&d->event_lock);
-            return ret;
-        }
-
-        hvm_domain_irq(d)->dpci = NULL;
-        free_hvm_irq_dpci(hvm_irq_dpci);
-    }
-    spin_unlock(&d->event_lock);
-    return 0;
-}
-
 /* Caller should hold the pcidevs_lock */
 static int deassign_device(struct domain *d, uint16_t seg, uint8_t bus,
                            uint8_t devfn)
@@ -966,7 +900,7 @@ int pci_release_devices(struct domain *d)
     int ret;
 
     pcidevs_lock();
-    ret = pci_clean_dpci_irqs(d);
+    ret = arch_pci_clean_pirqs(d);
     if ( ret )
     {
         pcidevs_unlock();
diff --git a/xen/drivers/passthrough/x86/Makefile b/xen/drivers/passthrough/x86/Makefile
index a70cf9460d..69284a5d19 100644
--- a/xen/drivers/passthrough/x86/Makefile
+++ b/xen/drivers/passthrough/x86/Makefile
@@ -1,2 +1,3 @@
 obj-y += ats.o
 obj-y += iommu.o
+obj-$(CONFIG_HVM) += hvm.o
diff --git a/xen/drivers/passthrough/x86/hvm.c b/xen/drivers/passthrough/x86/hvm.c
new file mode 100644
index 0000000000..41cfa2e200
--- /dev/null
+++ b/xen/drivers/passthrough/x86/hvm.c
@@ -0,0 +1,1193 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@xxxxxxxxx>
+ * Copyright (C) Xiaohui Xin <xiaohui.xin@xxxxxxxxx>
+ */
+
+#include <xen/event.h>
+#include <xen/iommu.h>
+#include <xen/cpu.h>
+#include <xen/irq.h>
+#include <asm/hvm/irq.h>
+#include <asm/hvm/support.h>
+#include <asm/io_apic.h>
+
+static DEFINE_PER_CPU(struct list_head, dpci_list);
+
+/*
+ * These two bit states help to safely schedule, deschedule, and wait until
+ * the softirq has finished.
+ *
+ * The semantics behind these two bits is as follow:
+ *  - STATE_SCHED - whoever modifies it has to ref-count the domain (->dom).
+ *  - STATE_RUN - only softirq is allowed to set and clear it. If it has
+ *      been set hvm_dirq_assist will RUN with a saved value of the
+ *      'struct domain' copied from 'pirq_dpci->dom' before STATE_RUN was set.
+ *
+ * The usual states are: STATE_SCHED(set) -> STATE_RUN(set) ->
+ * STATE_SCHED(unset) -> STATE_RUN(unset).
+ *
+ * However the states can also diverge such as: STATE_SCHED(set) ->
+ * STATE_SCHED(unset) -> STATE_RUN(set) -> STATE_RUN(unset). That means
+ * the 'hvm_dirq_assist' never run and that the softirq did not do any
+ * ref-counting.
+ */
+
+enum {
+    STATE_SCHED,
+    STATE_RUN
+};
+
+/*
+ * This can be called multiple times, but the softirq is only raised once.
+ * That is until the STATE_SCHED state has been cleared. The state can be
+ * cleared by: the 'dpci_softirq' (when it has executed 'hvm_dirq_assist'),
+ * or by 'pt_pirq_softirq_reset' (which will try to clear the state before
+ * the softirq had a chance to run).
+ */
+static void raise_softirq_for(struct hvm_pirq_dpci *pirq_dpci)
+{
+    unsigned long flags;
+
+    if ( test_and_set_bit(STATE_SCHED, &pirq_dpci->state) )
+        return;
+
+    get_knownalive_domain(pirq_dpci->dom);
+
+    local_irq_save(flags);
+    list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
+    local_irq_restore(flags);
+
+    raise_softirq(HVM_DPCI_SOFTIRQ);
+}
+
+/*
+ * If we are racing with softirq_dpci (STATE_SCHED) we return
+ * true. Otherwise we return false.
+ *
+ * If it is false, it is the callers responsibility to make sure
+ * that the softirq (with the event_lock dropped) has ran.
+ */
+bool pt_pirq_softirq_active(struct hvm_pirq_dpci *pirq_dpci)
+{
+    if ( pirq_dpci->state & ((1 << STATE_RUN) | (1 << STATE_SCHED)) )
+        return true;
+
+    /*
+     * If in the future we would call 'raise_softirq_for' right away
+     * after 'pt_pirq_softirq_active' we MUST reset the list (otherwise it
+     * might have stale data).
+     */
+    return false;
+}
+
+/*
+ * Reset the pirq_dpci->dom parameter to NULL.
+ *
+ * This function checks the different states to make sure it can do it
+ * at the right time. If it unschedules the 'hvm_dirq_assist' from running
+ * it also refcounts (which is what the softirq would have done) properly.
+ */
+static void pt_pirq_softirq_reset(struct hvm_pirq_dpci *pirq_dpci)
+{
+    struct domain *d = pirq_dpci->dom;
+
+    ASSERT(spin_is_locked(&d->event_lock));
+
+    switch ( cmpxchg(&pirq_dpci->state, 1 << STATE_SCHED, 0) )
+    {
+    case (1 << STATE_SCHED):
+        /*
+         * We are going to try to de-schedule the softirq before it goes in
+         * STATE_RUN. Whoever clears STATE_SCHED MUST refcount the 'dom'.
+         */
+        put_domain(d);
+        /* fallthrough. */
+    case (1 << STATE_RUN):
+    case (1 << STATE_RUN) | (1 << STATE_SCHED):
+        /*
+         * The reason it is OK to reset 'dom' when STATE_RUN bit is set is due
+         * to a shortcut the 'dpci_softirq' implements. It stashes the 'dom'
+         * in local variable before it sets STATE_RUN - and therefore will not
+         * dereference '->dom' which would crash.
+         */
+        pirq_dpci->dom = NULL;
+        break;
+    }
+    /*
+     * Inhibit 'hvm_dirq_assist' from doing anything useful and at worst
+     * calling 'set_timer' which will blow up (as we have called kill_timer
+     * or never initialized it). Note that we hold the lock that
+     * 'hvm_dirq_assist' could be spinning on.
+     */
+    pirq_dpci->masked = 0;
+}
+
+bool pt_irq_need_timer(uint32_t flags)
+{
+    return !(flags & (HVM_IRQ_DPCI_GUEST_MSI | HVM_IRQ_DPCI_TRANSLATE |
+                      HVM_IRQ_DPCI_NO_EOI));
+}
+
+static int pt_irq_guest_eoi(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
+                            void *arg)
+{
+    if ( __test_and_clear_bit(_HVM_IRQ_DPCI_EOI_LATCH_SHIFT,
+                              &pirq_dpci->flags) )
+    {
+        pirq_dpci->masked = 0;
+        pirq_dpci->pending = 0;
+        pirq_guest_eoi(dpci_pirq(pirq_dpci));
+    }
+
+    return 0;
+}
+
+static void pt_irq_time_out(void *data)
+{
+    struct hvm_pirq_dpci *irq_map = data;
+    const struct hvm_irq_dpci *dpci;
+    const struct dev_intx_gsi_link *digl;
+
+    spin_lock(&irq_map->dom->event_lock);
+
+    if ( irq_map->flags & HVM_IRQ_DPCI_IDENTITY_GSI )
+    {
+        ASSERT(is_hardware_domain(irq_map->dom));
+        /*
+         * Identity mapped, no need to iterate over the guest GSI list to find
+         * other pirqs sharing the same guest GSI.
+         *
+         * In the identity mapped case the EOI can also be done now, this way
+         * the iteration over the list of domain pirqs is avoided.
+         */
+        hvm_gsi_deassert(irq_map->dom, dpci_pirq(irq_map)->pirq);
+        irq_map->flags |= HVM_IRQ_DPCI_EOI_LATCH;
+        pt_irq_guest_eoi(irq_map->dom, irq_map, NULL);
+        spin_unlock(&irq_map->dom->event_lock);
+        return;
+    }
+
+    dpci = domain_get_irq_dpci(irq_map->dom);
+    if ( unlikely(!dpci) )
+    {
+        ASSERT_UNREACHABLE();
+        spin_unlock(&irq_map->dom->event_lock);
+        return;
+    }
+    list_for_each_entry ( digl, &irq_map->digl_list, list )
+    {
+        unsigned int guest_gsi = hvm_pci_intx_gsi(digl->device, digl->intx);
+        const struct hvm_girq_dpci_mapping *girq;
+
+        list_for_each_entry ( girq, &dpci->girq[guest_gsi], list )
+        {
+            struct pirq *pirq = pirq_info(irq_map->dom, girq->machine_gsi);
+
+            pirq_dpci(pirq)->flags |= HVM_IRQ_DPCI_EOI_LATCH;
+        }
+        hvm_pci_intx_deassert(irq_map->dom, digl->device, digl->intx);
+    }
+
+    pt_pirq_iterate(irq_map->dom, pt_irq_guest_eoi, NULL);
+
+    spin_unlock(&irq_map->dom->event_lock);
+}
+
+struct hvm_irq_dpci *domain_get_irq_dpci(const struct domain *d)
+{
+    if ( !d || !is_hvm_domain(d) )
+        return NULL;
+
+    return hvm_domain_irq(d)->dpci;
+}
+
+void free_hvm_irq_dpci(struct hvm_irq_dpci *dpci)
+{
+    xfree(dpci);
+}
+
+/*
+ * This routine handles lowest-priority interrupts using vector-hashing
+ * mechanism. As an example, modern Intel CPUs use this method to handle
+ * lowest-priority interrupts.
+ *
+ * Here is the details about the vector-hashing mechanism:
+ * 1. For lowest-priority interrupts, store all the possible destination
+ *    vCPUs in an array.
+ * 2. Use "gvec % max number of destination vCPUs" to find the right
+ *    destination vCPU in the array for the lowest-priority interrupt.
+ */
+static struct vcpu *vector_hashing_dest(const struct domain *d,
+                                        uint32_t dest_id,
+                                        bool dest_mode,
+                                        uint8_t gvec)
+
+{
+    unsigned long *dest_vcpu_bitmap;
+    unsigned int dest_vcpus = 0;
+    struct vcpu *v, *dest = NULL;
+    unsigned int i;
+
+    dest_vcpu_bitmap = xzalloc_array(unsigned long,
+                                     BITS_TO_LONGS(d->max_vcpus));
+    if ( !dest_vcpu_bitmap )
+        return NULL;
+
+    for_each_vcpu ( d, v )
+    {
+        if ( !vlapic_match_dest(vcpu_vlapic(v), NULL, APIC_DEST_NOSHORT,
+                                dest_id, dest_mode) )
+            continue;
+
+        __set_bit(v->vcpu_id, dest_vcpu_bitmap);
+        dest_vcpus++;
+    }
+
+    if ( dest_vcpus != 0 )
+    {
+        unsigned int mod = gvec % dest_vcpus;
+        unsigned int idx = 0;
+
+        for ( i = 0; i <= mod; i++ )
+        {
+            idx = find_next_bit(dest_vcpu_bitmap, d->max_vcpus, idx) + 1;
+            BUG_ON(idx > d->max_vcpus);
+        }
+
+        dest = d->vcpu[idx - 1];
+    }
+
+    xfree(dest_vcpu_bitmap);
+
+    return dest;
+}
+
+int pt_irq_create_bind(
+    struct domain *d, const struct xen_domctl_bind_pt_irq *pt_irq_bind)
+{
+    struct hvm_irq_dpci *hvm_irq_dpci;
+    struct hvm_pirq_dpci *pirq_dpci;
+    struct pirq *info;
+    int rc, pirq = pt_irq_bind->machine_irq;
+
+    if ( pirq < 0 || pirq >= d->nr_pirqs )
+        return -EINVAL;
+
+ restart:
+    spin_lock(&d->event_lock);
+
+    hvm_irq_dpci = domain_get_irq_dpci(d);
+    if ( !hvm_irq_dpci && !is_hardware_domain(d) )
+    {
+        unsigned int i;
+
+        /*
+         * NB: the hardware domain doesn't use a hvm_irq_dpci struct because
+         * it's only allowed to identity map GSIs, and so the data contained in
+         * that struct (used to map guest GSIs into machine GSIs and perform
+         * interrupt routing) is completely useless to it.
+         */
+        hvm_irq_dpci = xzalloc(struct hvm_irq_dpci);
+        if ( hvm_irq_dpci == NULL )
+        {
+            spin_unlock(&d->event_lock);
+            return -ENOMEM;
+        }
+        for ( i = 0; i < NR_HVM_DOMU_IRQS; i++ )
+            INIT_LIST_HEAD(&hvm_irq_dpci->girq[i]);
+
+        hvm_domain_irq(d)->dpci = hvm_irq_dpci;
+    }
+
+    info = pirq_get_info(d, pirq);
+    if ( !info )
+    {
+        spin_unlock(&d->event_lock);
+        return -ENOMEM;
+    }
+    pirq_dpci = pirq_dpci(info);
+
+    /*
+     * A crude 'while' loop with us dropping the spinlock and giving
+     * the softirq_dpci a chance to run.
+     * We MUST check for this condition as the softirq could be scheduled
+     * and hasn't run yet. Note that this code replaced tasklet_kill which
+     * would have spun forever and would do the same thing (wait to flush out
+     * outstanding hvm_dirq_assist calls.
+     */
+    if ( pt_pirq_softirq_active(pirq_dpci) )
+    {
+        spin_unlock(&d->event_lock);
+        cpu_relax();
+        goto restart;
+    }
+
+    switch ( pt_irq_bind->irq_type )
+    {
+    case PT_IRQ_TYPE_MSI:
+    {
+        uint8_t dest, delivery_mode;
+        bool dest_mode;
+        int dest_vcpu_id;
+        const struct vcpu *vcpu;
+        uint32_t gflags = pt_irq_bind->u.msi.gflags &
+                          ~XEN_DOMCTL_VMSI_X86_UNMASKED;
+
+        if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
+        {
+            pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED | HVM_IRQ_DPCI_MACH_MSI |
+                               HVM_IRQ_DPCI_GUEST_MSI;
+            pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec;
+            pirq_dpci->gmsi.gflags = gflags;
+            /*
+             * 'pt_irq_create_bind' can be called after 'pt_irq_destroy_bind'.
+             * The 'pirq_cleanup_check' which would free the structure is only
+             * called if the event channel for the PIRQ is active. However
+             * OS-es that use event channels usually bind PIRQs to eventds
+             * and unbind them before calling 'pt_irq_destroy_bind' - with the
+             * result that we re-use the 'dpci' structure. This can be
+             * reproduced with unloading and loading the driver for a device.
+             *
+             * As such on every 'pt_irq_create_bind' call we MUST set it.
+             */
+            pirq_dpci->dom = d;
+            /* bind after hvm_irq_dpci is setup to avoid race with irq handler*/
+            rc = pirq_guest_bind(d->vcpu[0], info, 0);
+            if ( rc == 0 && pt_irq_bind->u.msi.gtable )
+            {
+                rc = msixtbl_pt_register(d, info, pt_irq_bind->u.msi.gtable);
+                if ( unlikely(rc) )
+                {
+                    pirq_guest_unbind(d, info);
+                    /*
+                     * Between 'pirq_guest_bind' and before 'pirq_guest_unbind'
+                     * an interrupt can be scheduled. No more of them are going
+                     * to be scheduled but we must deal with the one that may be
+                     * in the queue.
+                     */
+                    pt_pirq_softirq_reset(pirq_dpci);
+                }
+            }
+            if ( unlikely(rc) )
+            {
+                pirq_dpci->gmsi.gflags = 0;
+                pirq_dpci->gmsi.gvec = 0;
+                pirq_dpci->dom = NULL;
+                pirq_dpci->flags = 0;
+                pirq_cleanup_check(info, d);
+                spin_unlock(&d->event_lock);
+                return rc;
+            }
+        }
+        else
+        {
+            uint32_t mask = HVM_IRQ_DPCI_MACH_MSI | HVM_IRQ_DPCI_GUEST_MSI;
+
+            if ( (pirq_dpci->flags & mask) != mask )
+            {
+                spin_unlock(&d->event_lock);
+                return -EBUSY;
+            }
+
+            /* If pirq is already mapped as vmsi, update guest data/addr. */
+            if ( pirq_dpci->gmsi.gvec != pt_irq_bind->u.msi.gvec ||
+                 pirq_dpci->gmsi.gflags != gflags )
+            {
+                /* Directly clear pending EOIs before enabling new MSI info. */
+                pirq_guest_eoi(info);
+
+                pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec;
+                pirq_dpci->gmsi.gflags = gflags;
+            }
+        }
+        /* Calculate dest_vcpu_id for MSI-type pirq migration. */
+        dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
+                         XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
+        dest_mode = pirq_dpci->gmsi.gflags & XEN_DOMCTL_VMSI_X86_DM_MASK;
+        delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
+                                  XEN_DOMCTL_VMSI_X86_DELIV_MASK);
+
+        dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode);
+        pirq_dpci->gmsi.dest_vcpu_id = dest_vcpu_id;
+        spin_unlock(&d->event_lock);
+
+        pirq_dpci->gmsi.posted = false;
+        vcpu = (dest_vcpu_id >= 0) ? d->vcpu[dest_vcpu_id] : NULL;
+        if ( iommu_intpost )
+        {
+            if ( delivery_mode == dest_LowestPrio )
+                vcpu = vector_hashing_dest(d, dest, dest_mode,
+                                           pirq_dpci->gmsi.gvec);
+            if ( vcpu )
+                pirq_dpci->gmsi.posted = true;
+        }
+        if ( vcpu && is_iommu_enabled(d) )
+            hvm_migrate_pirq(pirq_dpci, vcpu);
+
+        /* Use interrupt posting if it is supported. */
+        if ( iommu_intpost )
+            pi_update_irte(vcpu ? &vcpu->arch.hvm.vmx.pi_desc : NULL,
+                           info, pirq_dpci->gmsi.gvec);
+
+        if ( pt_irq_bind->u.msi.gflags & XEN_DOMCTL_VMSI_X86_UNMASKED )
+        {
+            unsigned long flags;
+            struct irq_desc *desc = pirq_spin_lock_irq_desc(info, &flags);
+
+            if ( !desc )
+            {
+                pt_irq_destroy_bind(d, pt_irq_bind);
+                return -EINVAL;
+            }
+
+            guest_mask_msi_irq(desc, false);
+            spin_unlock_irqrestore(&desc->lock, flags);
+        }
+
+        break;
+    }
+
+    case PT_IRQ_TYPE_PCI:
+    case PT_IRQ_TYPE_MSI_TRANSLATE:
+    {
+        struct dev_intx_gsi_link *digl = NULL;
+        struct hvm_girq_dpci_mapping *girq = NULL;
+        unsigned int guest_gsi;
+
+        /*
+         * Mapping GSIs for the hardware domain is different from doing it
+         * for an unprivileged guest: the hardware domain is only allowed to
+         * identity map GSIs, and as such all the data in the u.pci union is
+         * discarded.
+         */
+        if ( hvm_irq_dpci )
+        {
+            unsigned int link;
+
+            digl = xmalloc(struct dev_intx_gsi_link);
+            girq = xmalloc(struct hvm_girq_dpci_mapping);
+
+            if ( !digl || !girq )
+            {
+                spin_unlock(&d->event_lock);
+                xfree(girq);
+                xfree(digl);
+                return -ENOMEM;
+            }
+
+            girq->bus = digl->bus = pt_irq_bind->u.pci.bus;
+            girq->device = digl->device = pt_irq_bind->u.pci.device;
+            girq->intx = digl->intx = pt_irq_bind->u.pci.intx;
+            list_add_tail(&digl->list, &pirq_dpci->digl_list);
+
+            guest_gsi = hvm_pci_intx_gsi(digl->device, digl->intx);
+            link = hvm_pci_intx_link(digl->device, digl->intx);
+
+            hvm_irq_dpci->link_cnt[link]++;
+
+            girq->machine_gsi = pirq;
+            list_add_tail(&girq->list, &hvm_irq_dpci->girq[guest_gsi]);
+        }
+        else
+        {
+            ASSERT(is_hardware_domain(d));
+
+            /* MSI_TRANSLATE is not supported for the hardware domain. */
+            if ( pt_irq_bind->irq_type != PT_IRQ_TYPE_PCI ||
+                 pirq >= hvm_domain_irq(d)->nr_gsis )
+            {
+                spin_unlock(&d->event_lock);
+
+                return -EINVAL;
+            }
+            guest_gsi = pirq;
+        }
+
+        /* Bind the same machine irq only once in a given domain. */
+        if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
+        {
+            unsigned int share;
+
+            /* MUST be set, as the pirq_dpci can be re-used. */
+            pirq_dpci->dom = d;
+            if ( pt_irq_bind->irq_type == PT_IRQ_TYPE_MSI_TRANSLATE )
+            {
+                pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED |
+                                   HVM_IRQ_DPCI_MACH_MSI |
+                                   HVM_IRQ_DPCI_GUEST_PCI |
+                                   HVM_IRQ_DPCI_TRANSLATE;
+                share = 0;
+            }
+            else    /* PT_IRQ_TYPE_PCI */
+            {
+                pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED |
+                                   HVM_IRQ_DPCI_MACH_PCI |
+                                   HVM_IRQ_DPCI_GUEST_PCI;
+                if ( !is_hardware_domain(d) )
+                    share = BIND_PIRQ__WILL_SHARE;
+                else
+                {
+                    int mask = vioapic_get_mask(d, guest_gsi);
+                    int trigger_mode = vioapic_get_trigger_mode(d, guest_gsi);
+
+                    if ( mask < 0 || trigger_mode < 0 )
+                    {
+                        spin_unlock(&d->event_lock);
+
+                        ASSERT_UNREACHABLE();
+                        return -EINVAL;
+                    }
+                    pirq_dpci->flags |= HVM_IRQ_DPCI_IDENTITY_GSI;
+                    /*
+                     * Check if the corresponding vIO APIC pin is configured
+                     * level or edge trigger, level triggered interrupts will
+                     * be marked as shareable.
+                     */
+                    ASSERT(!mask);
+                    share = trigger_mode;
+                    if ( trigger_mode == VIOAPIC_EDGE_TRIG )
+                        /*
+                         * Edge IO-APIC interrupt, no EOI or unmask to perform
+                         * and hence no timer needed.
+                         */
+                        pirq_dpci->flags |= HVM_IRQ_DPCI_NO_EOI;
+                }
+            }
+
+            /* Init timer before binding */
+            if ( pt_irq_need_timer(pirq_dpci->flags) )
+                init_timer(&pirq_dpci->timer, pt_irq_time_out, pirq_dpci, 0);
+            /* Deal with gsi for legacy devices */
+            rc = pirq_guest_bind(d->vcpu[0], info, share);
+            if ( unlikely(rc) )
+            {
+                if ( pt_irq_need_timer(pirq_dpci->flags) )
+                    kill_timer(&pirq_dpci->timer);
+                /*
+                 * There is no path for __do_IRQ to schedule softirq as
+                 * IRQ_GUEST is not set. As such we can reset 'dom' directly.
+                 */
+                pirq_dpci->dom = NULL;
+                if ( hvm_irq_dpci )
+                {
+                    unsigned int link;
+
+                    ASSERT(girq && digl);
+                    list_del(&girq->list);
+                    list_del(&digl->list);
+                    link = hvm_pci_intx_link(digl->device, digl->intx);
+                    hvm_irq_dpci->link_cnt[link]--;
+                }
+                pirq_dpci->flags = 0;
+                pirq_cleanup_check(info, d);
+                spin_unlock(&d->event_lock);
+                xfree(girq);
+                xfree(digl);
+                return rc;
+            }
+        }
+
+        spin_unlock(&d->event_lock);
+
+        if ( iommu_verbose )
+        {
+            char buf[24] = "";
+
+            if ( digl )
+                snprintf(buf, ARRAY_SIZE(buf), " dev=%02x.%02x.%u intx=%u",
+                         digl->bus, PCI_SLOT(digl->device),
+                         PCI_FUNC(digl->device), digl->intx);
+
+            printk(XENLOG_G_INFO "d%d: bind: m_gsi=%u g_gsi=%u%s\n",
+                   d->domain_id, pirq, guest_gsi, buf);
+        }
+        break;
+    }
+
+    default:
+        spin_unlock(&d->event_lock);
+        return -EOPNOTSUPP;
+    }
+
+    return 0;
+}
+
+int pt_irq_destroy_bind(
+    struct domain *d, const struct xen_domctl_bind_pt_irq *pt_irq_bind)
+{
+    struct hvm_irq_dpci *hvm_irq_dpci;
+    struct hvm_pirq_dpci *pirq_dpci;
+    unsigned int machine_gsi = pt_irq_bind->machine_irq;
+    struct pirq *pirq;
+    const char *what = NULL;
+
+    switch ( pt_irq_bind->irq_type )
+    {
+    case PT_IRQ_TYPE_PCI:
+    case PT_IRQ_TYPE_MSI_TRANSLATE:
+        if ( iommu_verbose )
+        {
+            unsigned int device = pt_irq_bind->u.pci.device;
+            unsigned int intx = pt_irq_bind->u.pci.intx;
+
+            printk(XENLOG_G_INFO
+                   "d%d: unbind: m_gsi=%u g_gsi=%u dev=%02x:%02x.%u intx=%u\n",
+                   d->domain_id, machine_gsi, hvm_pci_intx_gsi(device, intx),
+                   pt_irq_bind->u.pci.bus,
+                   PCI_SLOT(device), PCI_FUNC(device), intx);
+        }
+        break;
+    case PT_IRQ_TYPE_MSI:
+    {
+        unsigned long flags;
+        struct irq_desc *desc = domain_spin_lock_irq_desc(d, machine_gsi,
+                                                          &flags);
+
+        if ( !desc )
+            return -EINVAL;
+        /*
+         * Leave the MSI masked, so that the state when calling
+         * pt_irq_create_bind is consistent across bind/unbinds.
+         */
+        guest_mask_msi_irq(desc, true);
+        spin_unlock_irqrestore(&desc->lock, flags);
+        break;
+    }
+
+    default:
+        return -EOPNOTSUPP;
+    }
+
+    spin_lock(&d->event_lock);
+
+    hvm_irq_dpci = domain_get_irq_dpci(d);
+
+    if ( !hvm_irq_dpci && !is_hardware_domain(d) )
+    {
+        spin_unlock(&d->event_lock);
+        return -EINVAL;
+    }
+
+    pirq = pirq_info(d, machine_gsi);
+    pirq_dpci = pirq_dpci(pirq);
+
+    if ( hvm_irq_dpci && pt_irq_bind->irq_type != PT_IRQ_TYPE_MSI )
+    {
+        unsigned int bus = pt_irq_bind->u.pci.bus;
+        unsigned int device = pt_irq_bind->u.pci.device;
+        unsigned int intx = pt_irq_bind->u.pci.intx;
+        unsigned int guest_gsi = hvm_pci_intx_gsi(device, intx);
+        unsigned int link = hvm_pci_intx_link(device, intx);
+        struct hvm_girq_dpci_mapping *girq;
+        struct dev_intx_gsi_link *digl, *tmp;
+
+        list_for_each_entry ( girq, &hvm_irq_dpci->girq[guest_gsi], list )
+        {
+            if ( girq->bus         == bus &&
+                 girq->device      == device &&
+                 girq->intx        == intx &&
+                 girq->machine_gsi == machine_gsi )
+            {
+                list_del(&girq->list);
+                xfree(girq);
+                girq = NULL;
+                break;
+            }
+        }
+
+        if ( girq )
+        {
+            spin_unlock(&d->event_lock);
+            return -EINVAL;
+        }
+
+        hvm_irq_dpci->link_cnt[link]--;
+
+        /* clear the mirq info */
+        if ( pirq_dpci && (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
+        {
+            list_for_each_entry_safe ( digl, tmp, &pirq_dpci->digl_list, list )
+            {
+                if ( digl->bus    == bus &&
+                     digl->device == device &&
+                     digl->intx   == intx )
+                {
+                    list_del(&digl->list);
+                    xfree(digl);
+                }
+            }
+            what = list_empty(&pirq_dpci->digl_list) ? "final" : "partial";
+        }
+        else
+            what = "bogus";
+    }
+    else if ( pirq_dpci && pirq_dpci->gmsi.posted )
+        pi_update_irte(NULL, pirq, 0);
+
+    if ( pirq_dpci && (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) &&
+         list_empty(&pirq_dpci->digl_list) )
+    {
+        pirq_guest_unbind(d, pirq);
+        msixtbl_pt_unregister(d, pirq);
+        if ( pt_irq_need_timer(pirq_dpci->flags) )
+            kill_timer(&pirq_dpci->timer);
+        pirq_dpci->flags = 0;
+        /*
+         * See comment in pt_irq_create_bind's PT_IRQ_TYPE_MSI before the
+         * call to pt_pirq_softirq_reset.
+         */
+        pt_pirq_softirq_reset(pirq_dpci);
+
+        pirq_cleanup_check(pirq, d);
+    }
+
+    spin_unlock(&d->event_lock);
+
+    if ( what && iommu_verbose )
+    {
+        unsigned int device = pt_irq_bind->u.pci.device;
+        char buf[24] = "";
+
+        if ( hvm_irq_dpci )
+            snprintf(buf, ARRAY_SIZE(buf), " dev=%02x.%02x.%u intx=%u",
+                     pt_irq_bind->u.pci.bus, PCI_SLOT(device),
+                     PCI_FUNC(device), pt_irq_bind->u.pci.intx);
+
+        printk(XENLOG_G_INFO "d%d %s unmap: m_irq=%u%s\n",
+               d->domain_id, what, machine_gsi, buf);
+    }
+
+    return 0;
+}
+
+void pt_pirq_init(struct domain *d, struct hvm_pirq_dpci *dpci)
+{
+    INIT_LIST_HEAD(&dpci->digl_list);
+    dpci->gmsi.dest_vcpu_id = -1;
+}
+
+bool pt_pirq_cleanup_check(struct hvm_pirq_dpci *dpci)
+{
+    if ( !dpci->flags && !pt_pirq_softirq_active(dpci) )
+    {
+        dpci->dom = NULL;
+        return true;
+    }
+    return false;
+}
+
+int pt_pirq_iterate(struct domain *d,
+                    int (*cb)(struct domain *,
+                              struct hvm_pirq_dpci *, void *),
+                    void *arg)
+{
+    int rc = 0;
+    unsigned int pirq = 0, n, i;
+    struct pirq *pirqs[8];
+
+    ASSERT(spin_is_locked(&d->event_lock));
+
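+    /*
+     * Walk the pirq radix tree in batches of ARRAY_SIZE(pirqs); the loop
+     * stops early once the callback returns non-zero or a lookup yields
+     * fewer entries than the batch size.
+     */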
+    do {
+        n = radix_tree_gang_lookup(&d->pirq_tree, (void **)pirqs, pirq,
+                                   ARRAY_SIZE(pirqs));
+        for ( i = 0; i < n; ++i )
+        {
+            struct hvm_pirq_dpci *pirq_dpci = pirq_dpci(pirqs[i]);
+
+            pirq = pirqs[i]->pirq;
+            if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
+                rc = cb(d, pirq_dpci, arg);
+        }
+    } while ( !rc && ++pirq < d->nr_pirqs && n == ARRAY_SIZE(pirqs) );
+
+    return rc;
+}
+
+int hvm_do_IRQ_dpci(struct domain *d, struct pirq *pirq)
+{
+    struct hvm_irq_dpci *dpci = domain_get_irq_dpci(d);
+    struct hvm_pirq_dpci *pirq_dpci = pirq_dpci(pirq);
+
+    ASSERT(is_hvm_domain(d));
+
+    if ( !is_iommu_enabled(d) || (!is_hardware_domain(d) && !dpci) ||
+         !pirq_dpci || !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
+        return 0;
+
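+    /*
+     * Mark the pirq as pending and defer the actual injection to the
+     * dpci softirq (hvm_dirq_assist runs from dpci_softirq()).
+     */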
+    pirq_dpci->masked = 1;
+    raise_softirq_for(pirq_dpci);
+    return 1;
+}
+
+/* called with d->event_lock held */
+static void __msi_pirq_eoi(struct hvm_pirq_dpci *pirq_dpci)
+{
+    irq_desc_t *desc;
+
+    if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) &&
+         (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) )
+    {
+        struct pirq *pirq = dpci_pirq(pirq_dpci);
+
+        BUG_ON(!local_irq_is_enabled());
+        desc = pirq_spin_lock_irq_desc(pirq, NULL);
+        if ( !desc )
+            return;
+        desc_guest_eoi(desc, pirq);
+    }
+}
+
+static int _hvm_dpci_msi_eoi(struct domain *d,
+                             struct hvm_pirq_dpci *pirq_dpci, void *arg)
+{
+    int vector = (long)arg;
+
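+    /*
+     * Only EOI machine MSIs bound to the guest vector being EOI'd, and only
+     * when the current vCPU's LAPIC is a destination of that MSI.
+     */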
+    if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) &&
+         (pirq_dpci->gmsi.gvec == vector) )
+    {
+        unsigned int dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
+                                      XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
+        bool dest_mode = pirq_dpci->gmsi.gflags & XEN_DOMCTL_VMSI_X86_DM_MASK;
+
+        if ( vlapic_match_dest(vcpu_vlapic(current), NULL, 0, dest,
+                               dest_mode) )
+        {
+            __msi_pirq_eoi(pirq_dpci);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+void hvm_dpci_msi_eoi(struct domain *d, int vector)
+{
+    if ( !is_iommu_enabled(d) ||
+         (!hvm_domain_irq(d)->dpci && !is_hardware_domain(d)) )
+       return;
+
+    spin_lock(&d->event_lock);
+    pt_pirq_iterate(d, _hvm_dpci_msi_eoi, (void *)(long)vector);
+    spin_unlock(&d->event_lock);
+}
+
+static void hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci)
+{
+    if ( unlikely(!hvm_domain_irq(d)->dpci) && !is_hardware_domain(d) )
+    {
+        ASSERT_UNREACHABLE();
+        return;
+    }
+
+    spin_lock(&d->event_lock);
+    if ( test_and_clear_bool(pirq_dpci->masked) )
+    {
+        struct pirq *pirq = dpci_pirq(pirq_dpci);
+        const struct dev_intx_gsi_link *digl;
+
+        if ( hvm_domain_use_pirq(d, pirq) )
+        {
+            send_guest_pirq(d, pirq);
+
+            if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
+                goto out;
+        }
+
+        if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
+        {
+            vmsi_deliver_pirq(d, pirq_dpci);
+            goto out;
+        }
+
+        list_for_each_entry ( digl, &pirq_dpci->digl_list, list )
+        {
+            ASSERT(!(pirq_dpci->flags & HVM_IRQ_DPCI_IDENTITY_GSI));
+            hvm_pci_intx_assert(d, digl->device, digl->intx);
+            pirq_dpci->pending++;
+        }
+
+        if ( pirq_dpci->flags & HVM_IRQ_DPCI_IDENTITY_GSI )
+        {
+            hvm_gsi_assert(d, pirq->pirq);
+            if ( pirq_dpci->flags & HVM_IRQ_DPCI_NO_EOI )
+                goto out;
+            pirq_dpci->pending++;
+        }
+
+        if ( pirq_dpci->flags & HVM_IRQ_DPCI_TRANSLATE )
+        {
+            /*
+             * For an MSI translated to an INTx interrupt, EOI as early as
+             * possible.
+             */
+            __msi_pirq_eoi(pirq_dpci);
+            goto out;
+        }
+
+        /*
+         * Set a timer to see if the guest can finish the interrupt or not. For
+         * example, the guest OS may unmask the PIC during boot, before the
+         * guest driver is loaded. hvm_pci_intx_assert() may succeed, but the
+         * guest will never deal with the irq, then the physical interrupt line
+         * will never be deasserted.
+         */
+        ASSERT(pt_irq_need_timer(pirq_dpci->flags));
+        set_timer(&pirq_dpci->timer, NOW() + PT_IRQ_TIME_OUT);
+    }
+
+ out:
+    spin_unlock(&d->event_lock);
+}
+
+static void hvm_pirq_eoi(struct pirq *pirq,
+                         const union vioapic_redir_entry *ent)
+{
+    struct hvm_pirq_dpci *pirq_dpci;
+
+    if ( !pirq )
+    {
+        ASSERT_UNREACHABLE();
+        return;
+    }
+
+    pirq_dpci = pirq_dpci(pirq);
+
+    /*
+     * No need to get vector lock for timer
+     * since interrupt is still not EOIed
+     */
+    if ( --pirq_dpci->pending ||
+         (ent && ent->fields.mask) ||
+         !pt_irq_need_timer(pirq_dpci->flags) )
+        return;
+
+    stop_timer(&pirq_dpci->timer);
+    pirq_guest_eoi(pirq);
+}
+
+static void __hvm_dpci_eoi(struct domain *d,
+                           const struct hvm_girq_dpci_mapping *girq,
+                           const union vioapic_redir_entry *ent)
+{
+    struct pirq *pirq = pirq_info(d, girq->machine_gsi);
+
+    if ( !hvm_domain_use_pirq(d, pirq) )
+        hvm_pci_intx_deassert(d, girq->device, girq->intx);
+
+    hvm_pirq_eoi(pirq, ent);
+}
+
+static void hvm_gsi_eoi(struct domain *d, unsigned int gsi,
+                        const union vioapic_redir_entry *ent)
+{
+    struct pirq *pirq = pirq_info(d, gsi);
+
+    /* Check if GSI is actually mapped. */
+    if ( !pirq_dpci(pirq) )
+        return;
+
+    hvm_gsi_deassert(d, gsi);
+    hvm_pirq_eoi(pirq, ent);
+}
+
+void hvm_dpci_eoi(struct domain *d, unsigned int guest_gsi,
+                  const union vioapic_redir_entry *ent)
+{
+    const struct hvm_irq_dpci *hvm_irq_dpci;
+    const struct hvm_girq_dpci_mapping *girq;
+
+    if ( !is_iommu_enabled(d) )
+        return;
+
+    if ( is_hardware_domain(d) )
+    {
+        spin_lock(&d->event_lock);
+        hvm_gsi_eoi(d, guest_gsi, ent);
+        goto unlock;
+    }
+
+    if ( guest_gsi < NR_ISAIRQS )
+    {
+        hvm_dpci_isairq_eoi(d, guest_gsi);
+        return;
+    }
+
+    spin_lock(&d->event_lock);
+    hvm_irq_dpci = domain_get_irq_dpci(d);
+
+    if ( !hvm_irq_dpci )
+        goto unlock;
+
+    list_for_each_entry ( girq, &hvm_irq_dpci->girq[guest_gsi], list )
+        __hvm_dpci_eoi(d, girq, ent);
+
+unlock:
+    spin_unlock(&d->event_lock);
+}
+
+static int pci_clean_dpci_irq(struct domain *d,
+                              struct hvm_pirq_dpci *pirq_dpci, void *arg)
+{
+    struct dev_intx_gsi_link *digl, *tmp;
+
+    pirq_guest_unbind(d, dpci_pirq(pirq_dpci));
+
+    if ( pt_irq_need_timer(pirq_dpci->flags) )
+        kill_timer(&pirq_dpci->timer);
+
+    list_for_each_entry_safe ( digl, tmp, &pirq_dpci->digl_list, list )
+    {
+        list_del(&digl->list);
+        xfree(digl);
+    }
+
+    radix_tree_delete(&d->pirq_tree, dpci_pirq(pirq_dpci)->pirq);
+
+    if ( !pt_pirq_softirq_active(pirq_dpci) )
+        return 0;
+
+    domain_get_irq_dpci(d)->pending_pirq_dpci = pirq_dpci;
+
+    return -ERESTART;
+}
+
+int arch_pci_clean_pirqs(struct domain *d)
+{
+    struct hvm_irq_dpci *hvm_irq_dpci = NULL;
+
+    if ( !is_iommu_enabled(d) )
+        return 0;
+
+    if ( !is_hvm_domain(d) )
+        return 0;
+
+    spin_lock(&d->event_lock);
+    hvm_irq_dpci = domain_get_irq_dpci(d);
+    if ( hvm_irq_dpci != NULL )
+    {
+        int ret = 0;
+
+        if ( hvm_irq_dpci->pending_pirq_dpci )
+        {
+            if ( pt_pirq_softirq_active(hvm_irq_dpci->pending_pirq_dpci) )
+                 ret = -ERESTART;
+            else
+                 hvm_irq_dpci->pending_pirq_dpci = NULL;
+        }
+
+        if ( !ret )
+            ret = pt_pirq_iterate(d, pci_clean_dpci_irq, NULL);
+        if ( ret )
+        {
+            spin_unlock(&d->event_lock);
+            return ret;
+        }
+
+        hvm_domain_irq(d)->dpci = NULL;
+        free_hvm_irq_dpci(hvm_irq_dpci);
+    }
+    spin_unlock(&d->event_lock);
+
+    return 0;
+}
+
+/*
+ * Note: 'pt_pirq_softirq_reset' can clear STATE_SCHED before we get to
+ * doing it. If that is the case, we let 'pt_pirq_softirq_reset' do the
+ * ref-counting.
+ */
+static void dpci_softirq(void)
+{
+    unsigned int cpu = smp_processor_id();
+    LIST_HEAD(our_list);
+
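+    /*
+     * Atomically take ownership of this CPU's dpci list; entries are added
+     * to it from interrupt context, hence IRQs are disabled for the splice.
+     */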
+    local_irq_disable();
+    list_splice_init(&per_cpu(dpci_list, cpu), &our_list);
+    local_irq_enable();
+
+    while ( !list_empty(&our_list) )
+    {
+        struct hvm_pirq_dpci *pirq_dpci;
+        struct domain *d;
+
+        pirq_dpci = list_entry(our_list.next, struct hvm_pirq_dpci,
+                               softirq_list);
+        list_del(&pirq_dpci->softirq_list);
+
+        d = pirq_dpci->dom;
+        smp_mb(); /* 'd' MUST be saved before we set/clear the bits. */
+        if ( test_and_set_bit(STATE_RUN, &pirq_dpci->state) )
+        {
+            unsigned long flags;
+
+            /* Put back on the list and retry. */
+            local_irq_save(flags);
+            list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
+            local_irq_restore(flags);
+
+            raise_softirq(HVM_DPCI_SOFTIRQ);
+            continue;
+        }
+        /*
+         * The one who clears STATE_SCHED MUST refcount the domain.
+         */
+        if ( test_and_clear_bit(STATE_SCHED, &pirq_dpci->state) )
+        {
+            hvm_dirq_assist(d, pirq_dpci);
+            put_domain(d);
+        }
+        clear_bit(STATE_RUN, &pirq_dpci->state);
+    }
+}
+
+static int cpu_callback(
+    struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+    unsigned int cpu = (unsigned long)hcpu;
+
+    switch ( action )
+    {
+    case CPU_UP_PREPARE:
+        INIT_LIST_HEAD(&per_cpu(dpci_list, cpu));
+        break;
+    case CPU_UP_CANCELED:
+    case CPU_DEAD:
+        /*
+         * On CPU_DYING this callback is called (on the CPU that is dying)
+         * with a possible HVM_DPCI_SOFTIRQ pending - at which point we can
+         * clear out any outstanding domains (by virtue of the idle loop
+         * calling the softirq later). In the CPU_DEAD case the CPU is deaf
+         * and there are no pending softirqs for us to handle, so we can
+         * chill.
+         */
+        ASSERT(list_empty(&per_cpu(dpci_list, cpu)));
+        break;
+    }
+
+    return NOTIFY_DONE;
+}
+
+static struct notifier_block cpu_nfb = {
+    .notifier_call = cpu_callback,
+};
+
+static int __init setup_dpci_softirq(void)
+{
+    unsigned int cpu;
+
+    for_each_online_cpu(cpu)
+        INIT_LIST_HEAD(&per_cpu(dpci_list, cpu));
+
+    open_softirq(HVM_DPCI_SOFTIRQ, dpci_softirq);
+    register_cpu_notifier(&cpu_nfb);
+    return 0;
+}
+__initcall(setup_dpci_softirq);
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 20a54a5bb4..8e3d4d9454 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -208,4 +208,13 @@ int msixtbl_pt_register(struct domain *, struct pirq *, uint64_t gtable);
 void msixtbl_pt_unregister(struct domain *, struct pirq *);
 void msixtbl_pt_cleanup(struct domain *d);
 
+#ifdef CONFIG_HVM
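+/* May return -ERESTART if a dpci softirq is still pending for the domain. */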
+int arch_pci_clean_pirqs(struct domain *d);
+#else
+static inline int arch_pci_clean_pirqs(struct domain *d)
+{
+    return 0;
+}
+#endif /* CONFIG_HVM */
+
 #endif /* __XEN_PCI_H__ */
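
Editor's note (illustrative only, not part of the patch): a minimal sketch of
how an architecture-neutral caller might drive the new arch_pci_clean_pirqs()
hook, assuming a cleanup path that can simply be retried when -ERESTART is
returned because a dpci softirq is still pending. The function name below is
a made-up example, not taken from the tree.

    /* Illustrative sketch only; not part of this commit. */
    static int example_domain_pci_cleanup(struct domain *d)
    {
        /* Tear down any dpci/pirq state first (x86 HVM; a stub elsewhere). */
        int ret = arch_pci_clean_pirqs(d);

        if ( ret )
            return ret; /* e.g. -ERESTART: caller re-invokes later */

        /* ... continue with deassigning the domain's PCI devices ... */
        return 0;
    }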
--
generated by git-patchbot for /home/xen/git/xen.git#staging