[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen master] vmx: VT-d posted-interrupt core logic handling



commit 4b759ce163f4a6f5c31922f89110eb0c7d54d5e2
Author:     Feng Wu <feng.wu@xxxxxxxxx>
AuthorDate: Tue Mar 1 14:42:13 2016 +0100
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Tue Mar 1 14:42:13 2016 +0100

    vmx: VT-d posted-interrupt core logic handling
    
    This is the core logic handling for VT-d posted-interrupts. Basically it
    deals with how and when to update posted-interrupts during the following
    scenarios:
    - vCPU is preempted
    - vCPU is slept
    - vCPU is blocked
    
    When vCPU is preempted/slept, we update the posted-interrupts during
    scheduling by introducing two new architectural scheduler hooks:
    vmx_pi_switch_from() and vmx_pi_switch_to(). When vCPU is blocked, we
    introduce a new architectural hook: arch_vcpu_block() to update
    posted-interrupts descriptor.
    
    Besides that, before VM-entry, we will make sure the 'NV' field is set
    to 'posted_intr_vector' and the vCPU is not in any blocking lists, which
    is needed when vCPU is running in non-root mode. The reason we do this check
    is because we change the posted-interrupts descriptor in vcpu_block(),
    however, we don't change it back in vcpu_unblock() or when vcpu_block()
    directly returns due to event delivery (in fact, we don't need to do it
    in the two places, that is why we do it before VM-Entry).
    
    When we handle the lazy context switch for the following two scenarios:
    - Preempted by a tasklet, which runs in an idle context.
    - The prev vcpu is offline and no new vcpus are available in run queue.
    We don't change the 'SN' bit in posted-interrupt descriptor. This
    may incur spurious PI notification events, but a PI notification
    event is only sent when 'ON' is clear, and once the PI notification
    is sent, ON is set by hardware, hence no more notification events
    until 'ON' is cleared. Besides that, spurious PI notification events are
    going to happen from time to time in Xen hypervisor anyway, e.g. when
    guests trap to Xen and a PI notification event happens, there is
    nothing Xen actually needs to do about it; the interrupts will be
    delivered to the guest the next time we do a VMENTRY.
    
    Suggested-by: Yang Zhang <yang.z.zhang@xxxxxxxxx>
    Suggested-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>
    Suggested-by: George Dunlap <george.dunlap@xxxxxxxxxx>
    Suggested-by: Jan Beulich <jbeulich@xxxxxxxx>
    Signed-off-by: Feng Wu <feng.wu@xxxxxxxxx>
    Reviewed-by: Jan Beulich <jbeulich@xxxxxxxx>
    Reviewed-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>
    Acked-by: Kevin Tian <kevin.tian@xxxxxxxxx>
    Reviewed-by: George Dunlap <george.dunlap@xxxxxxxxxx>
---
 xen/arch/x86/hvm/vmx/vmcs.c         |   2 +
 xen/arch/x86/hvm/vmx/vmx.c          | 187 ++++++++++++++++++++++++++++++++++++
 xen/common/schedule.c               |   4 +
 xen/drivers/passthrough/vtd/iommu.c |  11 +++
 xen/include/asm-arm/domain.h        |   2 +
 xen/include/asm-x86/hvm/hvm.h       |  13 +++
 xen/include/asm-x86/hvm/vmx/vmcs.h  |  71 ++++++++++++++
 xen/include/asm-x86/hvm/vmx/vmx.h   |   5 +
 8 files changed, 295 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index ed80350..fd4d876 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -679,6 +679,8 @@ int vmx_cpu_up(void)
     if ( cpu_has_vmx_vpid )
         vpid_sync_all();
 
+    vmx_pi_per_cpu_init(cpu);
+
     return 0;
 }
 
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 6dc45b8..9c5a388 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -84,7 +84,148 @@ static int vmx_msr_write_intercept(unsigned int msr, 
uint64_t msr_content);
 static void vmx_invlpg_intercept(unsigned long vaddr);
 static int vmx_vmfunc_intercept(struct cpu_user_regs *regs);
 
+struct vmx_pi_blocking_vcpu {
+    struct list_head     list;
+    spinlock_t           lock;
+};
+
+/*
+ * We maintain a per-CPU linked-list of vCPUs, so in PI wakeup
+ * handler we can find which vCPU should be woken up.
+ */
+static DEFINE_PER_CPU(struct vmx_pi_blocking_vcpu, vmx_pi_blocking);
+
 uint8_t __read_mostly posted_intr_vector;
+static uint8_t __read_mostly pi_wakeup_vector;
+
+void vmx_pi_per_cpu_init(unsigned int cpu)
+{
+    INIT_LIST_HEAD(&per_cpu(vmx_pi_blocking, cpu).list);
+    spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
+}
+
+static void vmx_vcpu_block(struct vcpu *v)
+{
+    unsigned long flags;
+    unsigned int dest;
+    spinlock_t *old_lock;
+    spinlock_t *pi_blocking_list_lock =
+               &per_cpu(vmx_pi_blocking, v->processor).lock;
+    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+
+    spin_lock_irqsave(pi_blocking_list_lock, flags);
+    old_lock = cmpxchg(&v->arch.hvm_vmx.pi_blocking.lock, NULL,
+                       pi_blocking_list_lock);
+
+    /*
+     * 'v->arch.hvm_vmx.pi_blocking.lock' should be NULL before
+     * being assigned to a new value, since the vCPU is currently
+     * running and it cannot be on any blocking list.
+     */
+    ASSERT(old_lock == NULL);
+
+    list_add_tail(&v->arch.hvm_vmx.pi_blocking.list,
+                  &per_cpu(vmx_pi_blocking, v->processor).list);
+    spin_unlock_irqrestore(pi_blocking_list_lock, flags);
+
+    ASSERT(!pi_test_sn(pi_desc));
+
+    dest = cpu_physical_id(v->processor);
+
+    ASSERT(pi_desc->ndst ==
+           (x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK)));
+
+    write_atomic(&pi_desc->nv, pi_wakeup_vector);
+}
+
+static void vmx_pi_switch_from(struct vcpu *v)
+{
+    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+
+    if ( test_bit(_VPF_blocked, &v->pause_flags) )
+        return;
+
+    pi_set_sn(pi_desc);
+}
+
+static void vmx_pi_switch_to(struct vcpu *v)
+{
+    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+    unsigned int dest = cpu_physical_id(v->processor);
+
+    write_atomic(&pi_desc->ndst,
+                 x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
+
+    pi_clear_sn(pi_desc);
+}
+
+static void vmx_pi_do_resume(struct vcpu *v)
+{
+    unsigned long flags;
+    spinlock_t *pi_blocking_list_lock;
+    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+
+    ASSERT(!test_bit(_VPF_blocked, &v->pause_flags));
+
+    /*
+     * Set 'NV' field back to posted_intr_vector, so the
+     * Posted-Interrupts can be delivered to the vCPU when
+     * it is running in non-root mode.
+     */
+    write_atomic(&pi_desc->nv, posted_intr_vector);
+
+    /* The vCPU is not on any blocking list. */
+    pi_blocking_list_lock = v->arch.hvm_vmx.pi_blocking.lock;
+
+    /* Prevent the compiler from eliminating the local variable.*/
+    smp_rmb();
+
+    if ( pi_blocking_list_lock == NULL )
+        return;
+
+    spin_lock_irqsave(pi_blocking_list_lock, flags);
+
+    /*
+     * v->arch.hvm_vmx.pi_blocking.lock == NULL here means the vCPU
+     * was removed from the blocking list while we are acquiring the lock.
+     */
+    if ( v->arch.hvm_vmx.pi_blocking.lock != NULL )
+    {
+        ASSERT(v->arch.hvm_vmx.pi_blocking.lock == pi_blocking_list_lock);
+        list_del(&v->arch.hvm_vmx.pi_blocking.list);
+        v->arch.hvm_vmx.pi_blocking.lock = NULL;
+    }
+
+    spin_unlock_irqrestore(pi_blocking_list_lock, flags);
+}
+
+/* This function is called when pcidevs_lock is held */
+void vmx_pi_hooks_assign(struct domain *d)
+{
+    if ( !iommu_intpost || !has_hvm_container_domain(d) )
+        return;
+
+    ASSERT(!d->arch.hvm_domain.vmx.vcpu_block);
+
+    d->arch.hvm_domain.vmx.vcpu_block = vmx_vcpu_block;
+    d->arch.hvm_domain.vmx.pi_switch_from = vmx_pi_switch_from;
+    d->arch.hvm_domain.vmx.pi_switch_to = vmx_pi_switch_to;
+    d->arch.hvm_domain.vmx.pi_do_resume = vmx_pi_do_resume;
+}
+
+/* This function is called when pcidevs_lock is held */
+void vmx_pi_hooks_deassign(struct domain *d)
+{
+    if ( !iommu_intpost || !has_hvm_container_domain(d) )
+        return;
+
+    ASSERT(d->arch.hvm_domain.vmx.vcpu_block);
+
+    d->arch.hvm_domain.vmx.vcpu_block = NULL;
+    d->arch.hvm_domain.vmx.pi_switch_from = NULL;
+    d->arch.hvm_domain.vmx.pi_switch_to = NULL;
+    d->arch.hvm_domain.vmx.pi_do_resume = NULL;
+}
 
 static int vmx_domain_initialise(struct domain *d)
 {
@@ -113,6 +254,8 @@ static int vmx_vcpu_initialise(struct vcpu *v)
 
     spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
 
+    INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_blocking.list);
+
     v->arch.schedule_tail    = vmx_do_resume;
     v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
     v->arch.ctxt_switch_to   = vmx_ctxt_switch_to;
@@ -752,6 +895,9 @@ static void vmx_ctxt_switch_from(struct vcpu *v)
     vmx_save_guest_msrs(v);
     vmx_restore_host_msrs();
     vmx_save_dr(v);
+
+    if ( v->domain->arch.hvm_domain.vmx.pi_switch_from )
+        v->domain->arch.hvm_domain.vmx.pi_switch_from(v);
 }
 
 static void vmx_ctxt_switch_to(struct vcpu *v)
@@ -764,6 +910,9 @@ static void vmx_ctxt_switch_to(struct vcpu *v)
 
     vmx_restore_guest_msrs(v);
     vmx_restore_dr(v);
+
+    if ( v->domain->arch.hvm_domain.vmx.pi_switch_to )
+        v->domain->arch.hvm_domain.vmx.pi_switch_to(v);
 }
 
 
@@ -2030,6 +2179,38 @@ static struct hvm_function_table __initdata 
vmx_function_table = {
     },
 };
 
+/* Handle VT-d posted-interrupt when VCPU is blocked. */
+static void pi_wakeup_interrupt(struct cpu_user_regs *regs)
+{
+    struct arch_vmx_struct *vmx, *tmp;
+    spinlock_t *lock = &per_cpu(vmx_pi_blocking, smp_processor_id()).lock;
+    struct list_head *blocked_vcpus =
+               &per_cpu(vmx_pi_blocking, smp_processor_id()).list;
+
+    ack_APIC_irq();
+    this_cpu(irq_count)++;
+
+    spin_lock(lock);
+
+    /*
+     * XXX: The length of the list depends on how many vCPUs are currently
+     * blocked on this specific pCPU. This may hurt the interrupt latency
+     * if the list grows to too many entries.
+     */
+    list_for_each_entry_safe(vmx, tmp, blocked_vcpus, pi_blocking.list)
+    {
+        if ( pi_test_on(&vmx->pi_desc) )
+        {
+            list_del(&vmx->pi_blocking.list);
+            ASSERT(vmx->pi_blocking.lock == lock);
+            vmx->pi_blocking.lock = NULL;
+            vcpu_unblock(container_of(vmx, struct vcpu, arch.hvm_vmx));
+        }
+    }
+
+    spin_unlock(lock);
+}
+
 /* Handle VT-d posted-interrupt when VCPU is running. */
 static void pi_notification_interrupt(struct cpu_user_regs *regs)
 {
@@ -2116,7 +2297,10 @@ const struct hvm_function_table * __init start_vmx(void)
     if ( cpu_has_vmx_posted_intr_processing )
     {
         if ( iommu_intpost )
+        {
             alloc_direct_apic_vector(&posted_intr_vector, 
pi_notification_interrupt);
+            alloc_direct_apic_vector(&pi_wakeup_vector, pi_wakeup_interrupt);
+        }
         else
             alloc_direct_apic_vector(&posted_intr_vector, 
event_check_interrupt);
     }
@@ -3631,6 +3815,9 @@ void vmx_vmenter_helper(const struct cpu_user_regs *regs)
     struct hvm_vcpu_asid *p_asid;
     bool_t need_flush;
 
+    if ( curr->domain->arch.hvm_domain.vmx.pi_do_resume )
+        curr->domain->arch.hvm_domain.vmx.pi_do_resume(curr);
+
     if ( !cpu_has_vmx_vpid )
         goto out;
     if ( nestedhvm_vcpu_in_guestmode(curr) )
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 434dcfc..7523968 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -803,6 +803,8 @@ void vcpu_block(void)
 
     set_bit(_VPF_blocked, &v->pause_flags);
 
+    arch_vcpu_block(v);
+
     /* Check for events /after/ blocking: avoids wakeup waiting race. */
     if ( local_events_need_delivery() )
     {
@@ -840,6 +842,8 @@ static long do_poll(struct sched_poll *sched_poll)
     v->poll_evtchn = -1;
     set_bit(v->vcpu_id, d->poll_mask);
 
+    arch_vcpu_block(v);
+
 #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
     /* Check for events /after/ setting flags: avoids wakeup waiting race. */
     smp_mb();
diff --git a/xen/drivers/passthrough/vtd/iommu.c 
b/xen/drivers/passthrough/vtd/iommu.c
index ec31c6b..8022702 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2283,9 +2283,17 @@ static int reassign_device_ownership(
     if ( ret )
         return ret;
 
+    if ( !has_arch_pdevs(target) )
+        vmx_pi_hooks_assign(target);
+
     ret = domain_context_mapping(target, devfn, pdev);
     if ( ret )
+    {
+        if ( !has_arch_pdevs(target) )
+            vmx_pi_hooks_deassign(target);
+
         return ret;
+    }
 
     if ( devfn == pdev->devfn )
     {
@@ -2293,6 +2301,9 @@ static int reassign_device_ownership(
         pdev->domain = target;
     }
 
+    if ( !has_arch_pdevs(source) )
+        vmx_pi_hooks_deassign(source);
+
     return ret;
 }
 
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 8e1161f..c35ed40 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -315,6 +315,8 @@ static inline void free_vcpu_guest_context(struct 
vcpu_guest_context *vgc)
     xfree(vgc);
 }
 
+static inline void arch_vcpu_block(struct vcpu *v) {}
+
 #endif /* __ASM_DOMAIN_H__ */
 
 /*
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 12209d5..b8ab5b1 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -606,6 +606,19 @@ const char *hvm_efer_valid(const struct vcpu *v, uint64_t 
value,
                            signed int cr0_pg);
 unsigned long hvm_cr4_guest_reserved_bits(const struct vcpu *v, bool_t 
restore);
 
+/*
+ * This must be defined as a macro instead of an inline function,
+ * because it uses 'struct vcpu' and 'struct domain' which have
+ * not been defined yet.
+ */
+#define arch_vcpu_block(v) ({                                   \
+    struct vcpu *v_ = (v);                                      \
+    struct domain *d_ = v_->domain;                             \
+    if ( has_hvm_container_domain(d_) &&                        \
+         d_->arch.hvm_domain.vmx.vcpu_block )                   \
+        d_->arch.hvm_domain.vmx.vcpu_block(v_);                 \
+})
+
 #endif /* __ASM_X86_HVM_HVM_H__ */
 
 /*
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h 
b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 86a9f1e..b54f52f 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -77,6 +77,65 @@ struct vmx_domain {
     unsigned long apic_access_mfn;
     /* VMX_DOMAIN_* */
     unsigned int status;
+
+    /*
+     * To handle posted interrupts correctly, we need to set the following
+     * state:
+     *
+     * * The PI notification vector (NV)
+     * * The PI notification destination processor (NDST)
+     * * The PI "suppress notification" bit (SN)
+     * * The vcpu pi "blocked" list
+     *
+     * If a VM is currently running, we want the PI delivered to the guest vcpu
+     * on the proper pcpu (NDST = v->processor, SN clear).
+     *
+     * If the vm is blocked, we want the PI delivered to Xen so that it can
+     * wake it up  (SN clear, NV = pi_wakeup_vector, vcpu on block list).
+     *
+     * If the VM is currently either preempted or offline (i.e., not running
+     * because of some reason other than blocking waiting for an interrupt),
+     * there's nothing Xen can do -- we want the interrupt pending bit set in
+     * the guest, but we don't want to bother Xen with an interrupt (SN set).
+     *
+     * There's a brief window of time between vmx_intr_assist() and checking
+     * softirqs where if an interrupt comes in it may be lost; so we need Xen
+     * to get an interrupt and raise a softirq so that it will go through the
+     * vmx_intr_assist() path again (SN clear, NV = posted_interrupt).
+     *
+     * The way we implement this now is by looking at what needs to happen on
+     * the following runstate transitions:
+     *
+     * A: runnable -> running
+     *  - SN = 0
+     *  - NDST = v->processor
+     * B: running -> runnable
+     *  - SN = 1
+     * C: running -> blocked
+     *  - NV = pi_wakeup_vector
+     *  - Add vcpu to blocked list
+     * D: blocked -> runnable
+     *  - NV = posted_intr_vector
+     *  - Take vcpu off blocked list
+     *
+     * For transitions A and B, we add hooks into vmx_ctxt_switch_{from,to}
+     * paths.
+     *
+     * For transition C, we add a new arch hook, arch_vcpu_block(), which is
+     * called from vcpu_block() and do_poll().
+     *
+     * For transition D, rather than add an extra arch hook on vcpu_wake, we
+     * add a hook on the vmentry path which checks to see if either of the two
+     * actions need to be taken.
+     *
+     * These hooks only need to be called when the domain in question actually
+     * has a physical device assigned to it, so we set and clear the callbacks
+     * as appropriate when device assignment changes.
+     */
+    void (*vcpu_block) (struct vcpu *);
+    void (*pi_switch_from) (struct vcpu *v);
+    void (*pi_switch_to) (struct vcpu *v);
+    void (*pi_do_resume) (struct vcpu *v);
 };
 
 struct pi_desc {
@@ -101,6 +160,11 @@ struct pi_desc {
 
 #define NR_PML_ENTRIES   512
 
+struct pi_blocking_vcpu {
+    struct list_head     list;
+    spinlock_t           *lock;
+};
+
 struct arch_vmx_struct {
     /* Physical address of VMCS. */
     paddr_t              vmcs_pa;
@@ -160,6 +224,13 @@ struct arch_vmx_struct {
     struct page_info     *vmwrite_bitmap;
 
     struct page_info     *pml_pg;
+
+    /*
+     * Before it is blocked, vCPU is added to the per-cpu list.
+     * VT-d engine can send wakeup notification event to the
+     * pCPU and wakeup the related vCPU.
+     */
+    struct pi_blocking_vcpu pi_blocking;
 };
 
 int vmx_create_vmcs(struct vcpu *v);
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h 
b/xen/include/asm-x86/hvm/vmx/vmx.h
index 14f3d32..a85d488 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -564,6 +564,11 @@ int alloc_p2m_hap_data(struct p2m_domain *p2m);
 void free_p2m_hap_data(struct p2m_domain *p2m);
 void p2m_init_hap_data(struct p2m_domain *p2m);
 
+void vmx_pi_per_cpu_init(unsigned int cpu);
+
+void vmx_pi_hooks_assign(struct domain *d);
+void vmx_pi_hooks_deassign(struct domain *d);
+
 /* EPT violation qualifications definitions */
 #define _EPT_READ_VIOLATION         0
 #define EPT_READ_VIOLATION          (1UL<<_EPT_READ_VIOLATION)
--
generated by git-patchbot for /home/xen/git/xen.git#master

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.