VMX: fix VMCS race on context-switch paths

When __context_switch() is being bypassed during original context
switch handling, the vCPU "owning" the VMCS partially loses control of
it: It will appear non-running to remote CPUs, and hence their attempt
to pause the owning vCPU will have no effect on it (as it already
looks to be paused). At the same time the "owning" CPU will re-enable
interrupts eventually (the lastest when entering the idle loop) and
hence becomes subject to IPIs from other CPUs requesting access to the
VMCS. As a result, when __context_switch() finally gets run, the CPU
may no longer have the VMCS loaded, and hence any accesses to it would
fail. Hence we may need to re-load the VMCS in vmx_ctxt_switch_from().

Similarly, when __context_switch() is being bypassed also on the second
(switch-in) path, VMCS ownership may have been lost and hence needs
re-establishing. Since there's no existing hook to put this in, add a
new one.

Reported-by: Kevin Mayer <Kevin.Mayer@xxxxxxxx>
Reported-by: Anshul Makkar <anshul.makkar@xxxxxxxxxx>
Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2098,11 +2098,14 @@ void context_switch(struct vcpu *prev, s
 
     set_current(next);
 
-    if ( (per_cpu(curr_vcpu, cpu) == next) ||
-         (is_idle_domain(nextd) && cpu_online(cpu)) )
+    if ( (per_cpu(curr_vcpu, cpu) == next) )
     {
+        if ( next->arch.ctxt_switch_same )
+            next->arch.ctxt_switch_same(next);
         local_irq_enable();
     }
+    else if ( is_idle_domain(nextd) && cpu_online(cpu) )
+        local_irq_enable();
     else
     {
         __context_switch();
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -552,6 +552,27 @@ static void vmx_load_vmcs(struct vcpu *v
     local_irq_restore(flags);
 }
 
+void vmx_vmcs_reload(struct vcpu *v)
+{
+    /*
+     * As we're running with interrupts disabled, we can't acquire
+     * v->arch.hvm_vmx.vmcs_lock here. However, with interrupts disabled
+     * the VMCS can't be taken away from us anymore if we still own it.
+     */
+    ASSERT(!local_irq_is_enabled());
+    if ( v->arch.hvm_vmx.vmcs_pa == this_cpu(current_vmcs) )
+        return;
+    ASSERT(!this_cpu(current_vmcs));
+
+    /*
+     * Wait for the remote side to be done with the VMCS before loading
+     * it here.
+     */
+    while ( v->arch.hvm_vmx.active_cpu != -1 )
+        cpu_relax();
+    vmx_load_vmcs(v);
+}
+
 int vmx_cpu_up_prepare(unsigned int cpu)
 {
     /*
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -298,6 +298,7 @@ static int vmx_vcpu_initialise(struct vc
     v->arch.schedule_tail    = vmx_do_resume;
     v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
     v->arch.ctxt_switch_to   = vmx_ctxt_switch_to;
+    v->arch.ctxt_switch_same = vmx_vmcs_reload;
 
     if ( (rc = vmx_create_vmcs(v)) != 0 )
     {
@@ -936,6 +937,18 @@ static void vmx_ctxt_switch_from(struct
     if ( unlikely(!this_cpu(vmxon)) )
         return;
 
+    if ( !v->is_running )
+    {
+        /*
+         * When this vCPU isn't marked as running anymore, a remote pCPU's
+         * attempt to pause us (from vmx_vmcs_enter()) won't have a reason
+         * to spin in vcpu_sleep_sync(), and hence that pCPU might have taken
+         * away the VMCS from us. As we're running with interrupts disabled,
+         * we also can't call vmx_vmcs_enter().
+         */
+        vmx_vmcs_reload(v);
+    }
+
     vmx_fpu_leave(v);
     vmx_save_guest_msrs(v);
     vmx_restore_host_msrs();
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -514,6 +514,7 @@ struct arch_vcpu
 
     void (*ctxt_switch_from) (struct vcpu *);
     void (*ctxt_switch_to) (struct vcpu *);
+    void (*ctxt_switch_same) (struct vcpu *);
 
     struct vpmu_struct vpmu;
 
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -174,6 +174,7 @@ void vmx_destroy_vmcs(struct vcpu *v);
 void vmx_vmcs_enter(struct vcpu *v);
 bool_t __must_check vmx_vmcs_try_enter(struct vcpu *v);
 void vmx_vmcs_exit(struct vcpu *v);
+void vmx_vmcs_reload(struct vcpu *v);
 
 #define CPU_BASED_VIRTUAL_INTR_PENDING        0x00000004
 #define CPU_BASED_USE_TSC_OFFSETING           0x00000008