[Xen-devel] [PATCH V4] x86/hvm: fix domain crash when CR3 has the noflush bit set



The emulation layers of Xen lack PCID support, and as we only offer
PCID to HAP guests, all writes to CR3 are handled by hardware,
except when introspection is involved. Consequently, trying to set
CR3 while the noflush bit (bit 63) is set makes hvm_set_cr3() crash
the domain. The workaround is to clear the noflush bit in
hvm_set_cr3(); CR3 values in hvm_monitor_cr() are sanitized in the
same way. Additionally, the bit's state now propagates to
{svm,vmx}_update_guest_cr() as the HVM_UPDATE_GUEST_CR3_NO_FLUSH
flag, so that no TLB flush occurs when the bit was set.

Signed-off-by: Razvan Cojocaru <rcojocaru@xxxxxxxxxxxxxxx>
Reported-by: Bitweasil <bitweasil@xxxxxxxxxxxxxx>
Suggested-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>

---
Changes since V3:
 - Removed unnecessary !! from "noflush = !!(value & X86_CR3_NOFLUSH);"
 - Moved the X86_CR3_NOFLUSH #define to x86-defns.h.
 - Removed X86_CR3_NOFLUSH_DISABLE_MASK (using ~X86_CR3_NOFLUSH).
 - Reverted changes to hvm_update_guest_cr() and callers, and added
   hvm_update_guest_cr3() instead.
 - Added HVM_UPDATE_GUEST_CR3_NO_FLUSH as a potential flag to pass
   to update_guest_cr(), instead of a simple bool which did not
   apply to all CRs.
---
 xen/arch/x86/hvm/hvm.c            | 13 ++++++++++---
 xen/arch/x86/hvm/monitor.c        |  3 +++
 xen/arch/x86/hvm/svm/svm.c        | 22 ++++++++++++++--------
 xen/arch/x86/hvm/vmx/vmx.c        | 18 +++++++++++-------
 xen/arch/x86/mm.c                 |  2 +-
 xen/arch/x86/mm/hap/hap.c         |  6 +++---
 xen/arch/x86/mm/shadow/common.c   |  2 +-
 xen/arch/x86/mm/shadow/multi.c    |  6 +++---
 xen/arch/x86/mm/shadow/none.c     |  2 +-
 xen/include/asm-x86/hvm/hvm.h     | 15 +++++++++++++--
 xen/include/asm-x86/hvm/svm/svm.h |  2 +-
 xen/include/asm-x86/paging.h      |  7 ++++---
 xen/include/asm-x86/x86-defns.h   |  5 +++++
 13 files changed, 70 insertions(+), 33 deletions(-)
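
For context, the crash is triggered by any PCID-aware guest kernel
(e.g. Linux >= 4.14) doing a no-flush CR3 reload while its CR3 writes
are intercepted for introspection. A hypothetical ring-0, in-guest
reproducer sketch, assuming CR4.PCIDE is already set:

  /* Reload CR3 with bit 63 (no-flush) set; before this patch, an
   * intercepted write like this made hvm_set_cr3() fail and crash
   * the domain. */
  static inline void cr3_noflush_reload(void)
  {
      unsigned long cr3;

      asm volatile ( "mov %%cr3, %0" : "=r" (cr3) );
      cr3 |= 1UL << 63;                  /* X86_CR3_NOFLUSH */
      asm volatile ( "mov %0, %%cr3" :: "r" (cr3) : "memory" );
  }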

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 18d721d..f17163e 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2297,6 +2297,7 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer)
     struct vcpu *v = current;
     struct page_info *page;
     unsigned long old = v->arch.hvm_vcpu.guest_cr[3];
+    bool noflush = false;
 
     if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
@@ -2313,6 +2314,12 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer)
         }
     }
 
+    if ( hvm_pcid_enabled(v) ) /* Clear the noflush bit. */
+    {
+        noflush = value & X86_CR3_NOFLUSH;
+        value &= ~X86_CR3_NOFLUSH;
+    }
+
     if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
          (value != v->arch.hvm_vcpu.guest_cr[3]) )
     {
@@ -2330,7 +2337,7 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer)
     }
 
     v->arch.hvm_vcpu.guest_cr[3] = value;
-    paging_update_cr3(v);
+    paging_update_cr3(v, noflush);
     return X86EMUL_OKAY;
 
  bad_cr3:
@@ -4031,7 +4038,7 @@ static int hvmop_flush_tlb_all(void)
 
     /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
     for_each_vcpu ( d, v )
-        paging_update_cr3(v);
+        paging_update_cr3(v, false);
 
     /* Flush all dirty TLBs. */
     flush_tlb_mask(d->dirty_cpumask);
@@ -4193,7 +4200,7 @@ static int hvmop_set_param(
         domain_pause(d);
         d->arch.hvm_domain.params[a.index] = a.value;
         for_each_vcpu ( d, v )
-            paging_update_cr3(v);
+            paging_update_cr3(v, false);
         domain_unpause(d);
 
         domctl_lock_release();
diff --git a/xen/arch/x86/hvm/monitor.c b/xen/arch/x86/hvm/monitor.c
index 131b852..87e7e31 100644
--- a/xen/arch/x86/hvm/monitor.c
+++ b/xen/arch/x86/hvm/monitor.c
@@ -36,6 +36,9 @@ bool hvm_monitor_cr(unsigned int index, unsigned long value, unsigned long old)
     struct arch_domain *ad = &curr->domain->arch;
     unsigned int ctrlreg_bitmask = monitor_ctrlreg_bitmask(index);
 
+    if ( index == VM_EVENT_X86_CR3 && hvm_pcid_enabled(curr) )
+        value &= ~X86_CR3_NOFLUSH; /* Clear the noflush bit. */
+
     if ( (ad->monitor.write_ctrlreg_enabled & ctrlreg_bitmask) &&
          (!(ad->monitor.write_ctrlreg_onchangeonly & ctrlreg_bitmask) ||
           value != old) &&
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index 81cf5b8..4e74f62 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -315,9 +315,9 @@ static int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c)
     v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
     v->arch.hvm_vcpu.guest_cr[3] = c->cr3;
     v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
-    svm_update_guest_cr(v, 0);
-    svm_update_guest_cr(v, 2);
-    svm_update_guest_cr(v, 4);
+    svm_update_guest_cr(v, 0, 0);
+    svm_update_guest_cr(v, 2, 0);
+    svm_update_guest_cr(v, 4, 0);
 
     /* Load sysenter MSRs into both VMCB save area and VCPU fields. */
     vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = c->sysenter_cs;
@@ -533,7 +533,7 @@ static int svm_guest_x86_mode(struct vcpu *v)
     return likely(vmcb->cs.db) ? 4 : 2;
 }
 
-void svm_update_guest_cr(struct vcpu *v, unsigned int cr)
+void svm_update_guest_cr(struct vcpu *v, unsigned int cr, unsigned int flags)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     uint64_t value;
@@ -563,13 +563,19 @@ void svm_update_guest_cr(struct vcpu *v, unsigned int cr)
     case 3:
         vmcb_set_cr3(vmcb, v->arch.hvm_vcpu.hw_cr[3]);
         if ( !nestedhvm_enabled(v->domain) )
-            hvm_asid_flush_vcpu(v);
+        {
+            if ( !(flags & HVM_UPDATE_GUEST_CR3_NO_FLUSH) )
+                hvm_asid_flush_vcpu(v);
+        }
         else if ( nestedhvm_vmswitch_in_progress(v) )
             ; /* CR3 switches during VMRUN/VMEXIT do not flush the TLB. */
         else
-            hvm_asid_flush_vcpu_asid(
-                nestedhvm_vcpu_in_guestmode(v)
-                ? &vcpu_nestedhvm(v).nv_n2asid : &v->arch.hvm_vcpu.n1asid);
+        {
+            if ( !(flags & HVM_UPDATE_GUEST_CR3_NO_FLUSH) )
+                hvm_asid_flush_vcpu_asid(
+                    nestedhvm_vcpu_in_guestmode(v)
+                    ? &vcpu_nestedhvm(v).nv_n2asid : &v->arch.hvm_vcpu.n1asid);
+        }
         break;
     case 4:
         value = HVM_CR4_HOST_MASK;
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 3dc6a6d..4106d4c 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -70,7 +70,8 @@ static void vmx_ctxt_switch_to(struct vcpu *v);
 static int  vmx_alloc_vlapic_mapping(struct domain *d);
 static void vmx_free_vlapic_mapping(struct domain *d);
 static void vmx_install_vlapic_mapping(struct vcpu *v);
-static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
+static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr,
+                                unsigned int flags);
 static void vmx_update_guest_efer(struct vcpu *v);
 static void vmx_wbinvd_intercept(void);
 static void vmx_fpu_dirty_intercept(void);
@@ -840,9 +841,9 @@ static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
 
     v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
     v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
-    vmx_update_guest_cr(v, 0);
-    vmx_update_guest_cr(v, 2);
-    vmx_update_guest_cr(v, 4);
+    vmx_update_guest_cr(v, 0, 0);
+    vmx_update_guest_cr(v, 2, 0);
+    vmx_update_guest_cr(v, 4, 0);
 
     v->arch.hvm_vcpu.guest_efer = c->msr_efer;
     vmx_update_guest_efer(v);
@@ -1552,7 +1553,8 @@ void vmx_update_debug_state(struct vcpu *v)
     vmx_vmcs_exit(v);
 }
 
-static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
+static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr,
+                                unsigned int flags)
 {
     vmx_vmcs_enter(v);
 
@@ -1704,7 +1706,9 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
         }
 
         __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
-        hvm_asid_flush_vcpu(v);
+
+        if ( !(flags & HVM_UPDATE_GUEST_CR3_NO_FLUSH) )
+            hvm_asid_flush_vcpu(v);
         break;
 
     default:
@@ -2656,7 +2660,7 @@ static int vmx_cr_access(unsigned long exit_qualification)
          */
         hvm_monitor_crX(CR0, value, old);
         curr->arch.hvm_vcpu.guest_cr[0] = value;
-        vmx_update_guest_cr(curr, 0);
+        vmx_update_guest_cr(curr, 0, 0);
         HVMTRACE_0D(CLTS);
         break;
     }
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 66ea822..1ef3635 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -526,7 +526,7 @@ void update_cr3(struct vcpu *v)
 
     if ( paging_mode_enabled(v->domain) )
     {
-        paging_update_cr3(v);
+        paging_update_cr3(v, false);
         return;
     }
 
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 003c2d8..b76e6b8 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -669,10 +669,10 @@ static bool_t hap_invlpg(struct vcpu *v, unsigned long va)
     return 1;
 }
 
-static void hap_update_cr3(struct vcpu *v, int do_locking)
+static void hap_update_cr3(struct vcpu *v, int do_locking, bool noflush)
 {
     v->arch.hvm_vcpu.hw_cr[3] = v->arch.hvm_vcpu.guest_cr[3];
-    hvm_update_guest_cr(v, 3);
+    hvm_update_guest_cr3(v, noflush);
 }
 
 const struct paging_mode *
@@ -708,7 +708,7 @@ static void hap_update_paging_modes(struct vcpu *v)
     }
 
     /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */
-    hap_update_cr3(v, 0);
+    hap_update_cr3(v, 0, false);
 
     paging_unlock(d);
     put_gfn(d, cr3_gfn);
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index c240953..20ded3e 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -3030,7 +3030,7 @@ static void sh_update_paging_modes(struct vcpu *v)
     }
 #endif /* OOS */
 
-    v->arch.paging.mode->update_cr3(v, 0);
+    v->arch.paging.mode->update_cr3(v, 0, false);
 }
 
 void shadow_update_paging_modes(struct vcpu *v)
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index a6372e3..fcc4fa3 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3173,7 +3173,7 @@ static int sh_page_fault(struct vcpu *v,
          * In any case, in the PAE case, the ASSERT is not true; it can
          * happen because of actions the guest is taking. */
 #if GUEST_PAGING_LEVELS == 3
-        v->arch.paging.mode->update_cr3(v, 0);
+        v->arch.paging.mode->update_cr3(v, 0, false);
 #else
         ASSERT(d->is_shutting_down);
 #endif
@@ -3992,7 +3992,7 @@ sh_set_toplevel_shadow(struct vcpu *v,
 
 
 static void
-sh_update_cr3(struct vcpu *v, int do_locking)
+sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
  * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
  * if appropriate).
@@ -4234,7 +4234,7 @@ sh_update_cr3(struct vcpu *v, int do_locking)
         v->arch.hvm_vcpu.hw_cr[3] =
             pagetable_get_paddr(v->arch.shadow_table[0]);
 #endif
-        hvm_update_guest_cr(v, 3);
+        hvm_update_guest_cr3(v, noflush);
     }
 
     /* Fix up the linear pagetable mappings */
diff --git a/xen/arch/x86/mm/shadow/none.c b/xen/arch/x86/mm/shadow/none.c
index 9e6ad23..a8c9604 100644
--- a/xen/arch/x86/mm/shadow/none.c
+++ b/xen/arch/x86/mm/shadow/none.c
@@ -50,7 +50,7 @@ static unsigned long _gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
     return gfn_x(INVALID_GFN);
 }
 
-static void _update_cr3(struct vcpu *v, int do_locking)
+static void _update_cr3(struct vcpu *v, int do_locking, bool noflush)
 {
     ASSERT_UNREACHABLE();
 }
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index dd3dd5f..1b4080b 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -80,6 +80,9 @@ enum hvm_intblk {
 #define HVM_EVENT_VECTOR_UNSET    (-1)
 #define HVM_EVENT_VECTOR_UPDATING (-2)
 
+/* update_guest_cr() flags. */
+#define HVM_UPDATE_GUEST_CR3_NO_FLUSH 0x00000001
+
 /*
  * The hardware virtual machine (HVM) interface abstracts away from the
  * x86/x86_64 CPU virtualization assist specifics. Currently this interface
@@ -132,7 +135,8 @@ struct hvm_function_table {
     /*
      * Called to inform HVM layer that a guest CRn or EFER has changed.
      */
-    void (*update_guest_cr)(struct vcpu *v, unsigned int cr);
+    void (*update_guest_cr)(struct vcpu *v, unsigned int cr,
+                            unsigned int flags);
     void (*update_guest_efer)(struct vcpu *v);
 
     void (*cpuid_policy_changed)(struct vcpu *v);
@@ -324,7 +328,14 @@ hvm_update_host_cr3(struct vcpu *v)
 
 static inline void hvm_update_guest_cr(struct vcpu *v, unsigned int cr)
 {
-    hvm_funcs.update_guest_cr(v, cr);
+    hvm_funcs.update_guest_cr(v, cr, 0);
+}
+
+static inline void hvm_update_guest_cr3(struct vcpu *v, bool noflush)
+{
+    unsigned int flags = noflush ? HVM_UPDATE_GUEST_CR3_NO_FLUSH : 0;
+
+    hvm_funcs.update_guest_cr(v, 3, flags);
 }
 
 static inline void hvm_update_guest_efer(struct vcpu *v)
diff --git a/xen/include/asm-x86/hvm/svm/svm.h b/xen/include/asm-x86/hvm/svm/svm.h
index 462cb89..6c050f5 100644
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -51,7 +51,7 @@ static inline void svm_invlpga(unsigned long vaddr, uint32_t asid)
 
 unsigned long *svm_msrbit(unsigned long *msr_bitmap, uint32_t msr);
 void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len);
-void svm_update_guest_cr(struct vcpu *, unsigned int cr);
+void svm_update_guest_cr(struct vcpu *, unsigned int cr, unsigned int flags);
 
 extern u32 svm_feature_flags;
 
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index 5607ab4..dd3e31f 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -122,7 +122,8 @@ struct paging_mode {
                                             unsigned long cr3,
                                             paddr_t ga, uint32_t *pfec,
                                             unsigned int *page_order);
-    void          (*update_cr3            )(struct vcpu *v, int do_locking);
+    void          (*update_cr3            )(struct vcpu *v, int do_locking,
+                                            bool noflush);
     void          (*update_paging_modes   )(struct vcpu *v);
 void          (*write_p2m_entry       )(struct domain *d, unsigned long gfn,
                                             l1_pgentry_t *p, l1_pgentry_t new,
@@ -276,9 +277,9 @@ static inline unsigned long paging_ga_to_gfn_cr3(struct vcpu *v,
 /* Update all the things that are derived from the guest's CR3.
  * Called when the guest changes CR3; the caller can then use v->arch.cr3
  * as the value to load into the host CR3 to schedule this vcpu */
-static inline void paging_update_cr3(struct vcpu *v)
+static inline void paging_update_cr3(struct vcpu *v, bool noflush)
 {
-    paging_get_hostmode(v)->update_cr3(v, 1);
+    paging_get_hostmode(v)->update_cr3(v, 1, noflush);
 }
 
 /* Update all the things that are derived from the guest's CR0/CR3/CR4.
diff --git a/xen/include/asm-x86/x86-defns.h b/xen/include/asm-x86/x86-defns.h
index 70453e8..7bb6918 100644
--- a/xen/include/asm-x86/x86-defns.h
+++ b/xen/include/asm-x86/x86-defns.h
@@ -43,6 +43,11 @@
 #define X86_CR0_PG              0x80000000 /* Paging                   (RW) */
 
 /*
+ * Intel CPU flags in CR3
+ */
+#define X86_CR3_NOFLUSH    0x8000000000000000
+
+/*
  * Intel CPU features in CR4
  */
 #define X86_CR4_VME        0x00000001 /* enable vm86 extensions */
-- 
2.7.4

