[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH V6] x86/hvm: fix domain crash when CR3 has the noflush bit set



In hardware, when PCID support is enabled and the NOFLUSH bit is set
when writing a CR3 value, the hardware will clear that that bit and
change the CR3 without flushing the TLB. hvm_set_cr3(), however, was
ignoring this bit; the result was that post-vm_event checks detected
an invalid CR3 value and crashed the domain.

Handle NOFLUSH in hvm_set_cr3() by:
1. Clearing the bit
2. Passing a "noflush" flag to lower-level cr3 setting functions to
indicate that a flush should not be performed.

Also clear X86_CR3_NOFLUSH when reporting CR3 monitored CR3 writes.

This allows introspection to be used on VMs whose operating system uses
the NOFLUSH bit.

Signed-off-by: Razvan Cojocaru <rcojocaru@xxxxxxxxxxxxxxx>
Reported-by: Bitweasil <bitweasil@xxxxxxxxxxxxxx>
Suggested-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
Acked-by: Tamas K Lengyel <tamas@xxxxxxxxxxxxx>
Reviewed-by: Jan Beulich <jbeulich@xxxxxxxx>
Reviewed-by: Kevin Tian <kevin.tian@xxxxxxxxx>
Acked-by: George Dunlap <george.dunlap@xxxxxxxxxx>

---
Changes since V5:
 - Updated the commit message.
---
 xen/arch/x86/hvm/hvm.c            | 13 ++++++++++---
 xen/arch/x86/hvm/monitor.c        |  3 +++
 xen/arch/x86/hvm/svm/svm.c        | 15 +++++++++------
 xen/arch/x86/hvm/vmx/vmx.c        | 18 +++++++++++-------
 xen/arch/x86/mm.c                 |  2 +-
 xen/arch/x86/mm/hap/hap.c         |  6 +++---
 xen/arch/x86/mm/shadow/common.c   |  2 +-
 xen/arch/x86/mm/shadow/multi.c    |  6 +++---
 xen/arch/x86/mm/shadow/none.c     |  2 +-
 xen/include/asm-x86/hvm/hvm.h     | 15 +++++++++++++--
 xen/include/asm-x86/hvm/svm/svm.h |  2 +-
 xen/include/asm-x86/paging.h      |  7 ++++---
 xen/include/asm-x86/x86-defns.h   |  5 +++++
 13 files changed, 65 insertions(+), 31 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 0539551..10ae922 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2297,6 +2297,7 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer)
     struct vcpu *v = current;
     struct page_info *page;
     unsigned long old = v->arch.hvm_vcpu.guest_cr[3];
+    bool noflush = false;
 
     if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
@@ -2313,6 +2314,12 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer)
         }
     }
 
+    if ( hvm_pcid_enabled(v) ) /* Clear the noflush bit. */
+    {
+        noflush = value & X86_CR3_NOFLUSH;
+        value &= ~X86_CR3_NOFLUSH;
+    }
+
     if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
          (value != v->arch.hvm_vcpu.guest_cr[3]) )
     {
@@ -2330,7 +2337,7 @@ int hvm_set_cr3(unsigned long value, bool_t may_defer)
     }
 
     v->arch.hvm_vcpu.guest_cr[3] = value;
-    paging_update_cr3(v);
+    paging_update_cr3(v, noflush);
     return X86EMUL_OKAY;
 
  bad_cr3:
@@ -4031,7 +4038,7 @@ static int hvmop_flush_tlb_all(void)
 
     /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
     for_each_vcpu ( d, v )
-        paging_update_cr3(v);
+        paging_update_cr3(v, false);
 
     /* Flush all dirty TLBs. */
     flush_tlb_mask(d->dirty_cpumask);
@@ -4193,7 +4200,7 @@ static int hvmop_set_param(
         domain_pause(d);
         d->arch.hvm_domain.params[a.index] = a.value;
         for_each_vcpu ( d, v )
-            paging_update_cr3(v);
+            paging_update_cr3(v, false);
         domain_unpause(d);
 
         domctl_lock_release();
diff --git a/xen/arch/x86/hvm/monitor.c b/xen/arch/x86/hvm/monitor.c
index 160c032..2a41ccc 100644
--- a/xen/arch/x86/hvm/monitor.c
+++ b/xen/arch/x86/hvm/monitor.c
@@ -36,6 +36,9 @@ bool hvm_monitor_cr(unsigned int index, unsigned long value, 
unsigned long old)
     struct arch_domain *ad = &curr->domain->arch;
     unsigned int ctrlreg_bitmask = monitor_ctrlreg_bitmask(index);
 
+    if ( index == VM_EVENT_X86_CR3 && hvm_pcid_enabled(curr) )
+        value &= ~X86_CR3_NOFLUSH; /* Clear the noflush bit. */
+
     if ( (ad->monitor.write_ctrlreg_enabled & ctrlreg_bitmask) &&
          (!(ad->monitor.write_ctrlreg_onchangeonly & ctrlreg_bitmask) ||
           value != old) &&
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index f2fbe07..c34f5b5 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -325,9 +325,9 @@ static int svm_vmcb_restore(struct vcpu *v, struct 
hvm_hw_cpu *c)
     v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
     v->arch.hvm_vcpu.guest_cr[3] = c->cr3;
     v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
-    svm_update_guest_cr(v, 0);
-    svm_update_guest_cr(v, 2);
-    svm_update_guest_cr(v, 4);
+    svm_update_guest_cr(v, 0, 0);
+    svm_update_guest_cr(v, 2, 0);
+    svm_update_guest_cr(v, 4, 0);
 
     /* Load sysenter MSRs into both VMCB save area and VCPU fields. */
     vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = c->sysenter_cs;
@@ -543,7 +543,7 @@ static int svm_guest_x86_mode(struct vcpu *v)
     return likely(vmcb->cs.db) ? 4 : 2;
 }
 
-void svm_update_guest_cr(struct vcpu *v, unsigned int cr)
+void svm_update_guest_cr(struct vcpu *v, unsigned int cr, unsigned int flags)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     uint64_t value;
@@ -583,10 +583,13 @@ void svm_update_guest_cr(struct vcpu *v, unsigned int cr)
     case 3:
         vmcb_set_cr3(vmcb, v->arch.hvm_vcpu.hw_cr[3]);
         if ( !nestedhvm_enabled(v->domain) )
-            hvm_asid_flush_vcpu(v);
+        {
+            if ( !(flags & HVM_UPDATE_GUEST_CR3_NOFLUSH) )
+                hvm_asid_flush_vcpu(v);
+        }
         else if ( nestedhvm_vmswitch_in_progress(v) )
             ; /* CR3 switches during VMRUN/VMEXIT do not flush the TLB. */
-        else
+        else if ( !(flags & HVM_UPDATE_GUEST_CR3_NOFLUSH) )
             hvm_asid_flush_vcpu_asid(
                 nestedhvm_vcpu_in_guestmode(v)
                 ? &vcpu_nestedhvm(v).nv_n2asid : &v->arch.hvm_vcpu.n1asid);
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index aa05050..ff9e997 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -68,7 +68,8 @@ static void vmx_ctxt_switch_to(struct vcpu *v);
 static int  vmx_alloc_vlapic_mapping(struct domain *d);
 static void vmx_free_vlapic_mapping(struct domain *d);
 static void vmx_install_vlapic_mapping(struct vcpu *v);
-static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
+static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr,
+                                unsigned int flags);
 static void vmx_update_guest_efer(struct vcpu *v);
 static void vmx_wbinvd_intercept(void);
 static void vmx_fpu_dirty_intercept(void);
@@ -836,9 +837,9 @@ static int vmx_vmcs_restore(struct vcpu *v, struct 
hvm_hw_cpu *c)
 
     v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
     v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
-    vmx_update_guest_cr(v, 0);
-    vmx_update_guest_cr(v, 2);
-    vmx_update_guest_cr(v, 4);
+    vmx_update_guest_cr(v, 0, 0);
+    vmx_update_guest_cr(v, 2, 0);
+    vmx_update_guest_cr(v, 4, 0);
 
     v->arch.hvm_vcpu.guest_efer = c->msr_efer;
     vmx_update_guest_efer(v);
@@ -1548,7 +1549,8 @@ void vmx_update_debug_state(struct vcpu *v)
     vmx_vmcs_exit(v);
 }
 
-static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
+static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr,
+                                unsigned int flags)
 {
     vmx_vmcs_enter(v);
 
@@ -1730,7 +1732,9 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned 
int cr)
         }
 
         __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
-        hvm_asid_flush_vcpu(v);
+
+        if ( !(flags & HVM_UPDATE_GUEST_CR3_NOFLUSH) )
+            hvm_asid_flush_vcpu(v);
         break;
 
     default:
@@ -2682,7 +2686,7 @@ static int vmx_cr_access(unsigned long exit_qualification)
          */
         hvm_monitor_crX(CR0, value, old);
         curr->arch.hvm_vcpu.guest_cr[0] = value;
-        vmx_update_guest_cr(curr, 0);
+        vmx_update_guest_cr(curr, 0, 0);
         HVMTRACE_0D(CLTS);
         break;
     }
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index e1f089b..9d26a9d 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -526,7 +526,7 @@ void update_cr3(struct vcpu *v)
 
     if ( paging_mode_enabled(v->domain) )
     {
-        paging_update_cr3(v);
+        paging_update_cr3(v, false);
         return;
     }
 
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 003c2d8..b76e6b8 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -669,10 +669,10 @@ static bool_t hap_invlpg(struct vcpu *v, unsigned long va)
     return 1;
 }
 
-static void hap_update_cr3(struct vcpu *v, int do_locking)
+static void hap_update_cr3(struct vcpu *v, int do_locking, bool noflush)
 {
     v->arch.hvm_vcpu.hw_cr[3] = v->arch.hvm_vcpu.guest_cr[3];
-    hvm_update_guest_cr(v, 3);
+    hvm_update_guest_cr3(v, noflush);
 }
 
 const struct paging_mode *
@@ -708,7 +708,7 @@ static void hap_update_paging_modes(struct vcpu *v)
     }
 
     /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */
-    hap_update_cr3(v, 0);
+    hap_update_cr3(v, 0, false);
 
     paging_unlock(d);
     put_gfn(d, cr3_gfn);
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index c240953..20ded3e 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -3030,7 +3030,7 @@ static void sh_update_paging_modes(struct vcpu *v)
     }
 #endif /* OOS */
 
-    v->arch.paging.mode->update_cr3(v, 0);
+    v->arch.paging.mode->update_cr3(v, 0, false);
 }
 
 void shadow_update_paging_modes(struct vcpu *v)
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index a6372e3..fcc4fa3 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3173,7 +3173,7 @@ static int sh_page_fault(struct vcpu *v,
          * In any case, in the PAE case, the ASSERT is not true; it can
          * happen because of actions the guest is taking. */
 #if GUEST_PAGING_LEVELS == 3
-        v->arch.paging.mode->update_cr3(v, 0);
+        v->arch.paging.mode->update_cr3(v, 0, false);
 #else
         ASSERT(d->is_shutting_down);
 #endif
@@ -3992,7 +3992,7 @@ sh_set_toplevel_shadow(struct vcpu *v,
 
 
 static void
-sh_update_cr3(struct vcpu *v, int do_locking)
+sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
  * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
  * if appropriate).
@@ -4234,7 +4234,7 @@ sh_update_cr3(struct vcpu *v, int do_locking)
         v->arch.hvm_vcpu.hw_cr[3] =
             pagetable_get_paddr(v->arch.shadow_table[0]);
 #endif
-        hvm_update_guest_cr(v, 3);
+        hvm_update_guest_cr3(v, noflush);
     }
 
     /* Fix up the linear pagetable mappings */
diff --git a/xen/arch/x86/mm/shadow/none.c b/xen/arch/x86/mm/shadow/none.c
index 9e6ad23..a8c9604 100644
--- a/xen/arch/x86/mm/shadow/none.c
+++ b/xen/arch/x86/mm/shadow/none.c
@@ -50,7 +50,7 @@ static unsigned long _gva_to_gfn(struct vcpu *v, struct 
p2m_domain *p2m,
     return gfn_x(INVALID_GFN);
 }
 
-static void _update_cr3(struct vcpu *v, int do_locking)
+static void _update_cr3(struct vcpu *v, int do_locking, bool noflush)
 {
     ASSERT_UNREACHABLE();
 }
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index dd3dd5f..ca3a334 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -80,6 +80,9 @@ enum hvm_intblk {
 #define HVM_EVENT_VECTOR_UNSET    (-1)
 #define HVM_EVENT_VECTOR_UPDATING (-2)
 
+/* update_guest_cr() flags. */
+#define HVM_UPDATE_GUEST_CR3_NOFLUSH 0x00000001
+
 /*
  * The hardware virtual machine (HVM) interface abstracts away from the
  * x86/x86_64 CPU virtualization assist specifics. Currently this interface
@@ -132,7 +135,8 @@ struct hvm_function_table {
     /*
      * Called to inform HVM layer that a guest CRn or EFER has changed.
      */
-    void (*update_guest_cr)(struct vcpu *v, unsigned int cr);
+    void (*update_guest_cr)(struct vcpu *v, unsigned int cr,
+                            unsigned int flags);
     void (*update_guest_efer)(struct vcpu *v);
 
     void (*cpuid_policy_changed)(struct vcpu *v);
@@ -324,7 +328,14 @@ hvm_update_host_cr3(struct vcpu *v)
 
 static inline void hvm_update_guest_cr(struct vcpu *v, unsigned int cr)
 {
-    hvm_funcs.update_guest_cr(v, cr);
+    hvm_funcs.update_guest_cr(v, cr, 0);
+}
+
+static inline void hvm_update_guest_cr3(struct vcpu *v, bool noflush)
+{
+    unsigned int flags = noflush ? HVM_UPDATE_GUEST_CR3_NOFLUSH : 0;
+
+    hvm_funcs.update_guest_cr(v, 3, flags);
 }
 
 static inline void hvm_update_guest_efer(struct vcpu *v)
diff --git a/xen/include/asm-x86/hvm/svm/svm.h 
b/xen/include/asm-x86/hvm/svm/svm.h
index 593546f..4e5e142 100644
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -51,7 +51,7 @@ static inline void svm_invlpga(unsigned long vaddr, uint32_t 
asid)
 
 unsigned long *svm_msrbit(unsigned long *msr_bitmap, uint32_t msr);
 void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len);
-void svm_update_guest_cr(struct vcpu *, unsigned int cr);
+void svm_update_guest_cr(struct vcpu *, unsigned int cr, unsigned int flags);
 
 extern u32 svm_feature_flags;
 
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index 5607ab4..dd3e31f 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -122,7 +122,8 @@ struct paging_mode {
                                             unsigned long cr3,
                                             paddr_t ga, uint32_t *pfec,
                                             unsigned int *page_order);
-    void          (*update_cr3            )(struct vcpu *v, int do_locking);
+    void          (*update_cr3            )(struct vcpu *v, int do_locking,
+                                            bool noflush);
     void          (*update_paging_modes   )(struct vcpu *v);
     void          (*write_p2m_entry       )(struct domain *d, unsigned long 
gfn,
                                             l1_pgentry_t *p, l1_pgentry_t new,
@@ -276,9 +277,9 @@ static inline unsigned long paging_ga_to_gfn_cr3(struct 
vcpu *v,
 /* Update all the things that are derived from the guest's CR3.
  * Called when the guest changes CR3; the caller can then use v->arch.cr3
  * as the value to load into the host CR3 to schedule this vcpu */
-static inline void paging_update_cr3(struct vcpu *v)
+static inline void paging_update_cr3(struct vcpu *v, bool noflush)
 {
-    paging_get_hostmode(v)->update_cr3(v, 1);
+    paging_get_hostmode(v)->update_cr3(v, 1, noflush);
 }
 
 /* Update all the things that are derived from the guest's CR0/CR3/CR4.
diff --git a/xen/include/asm-x86/x86-defns.h b/xen/include/asm-x86/x86-defns.h
index 70453e8..8598ade 100644
--- a/xen/include/asm-x86/x86-defns.h
+++ b/xen/include/asm-x86/x86-defns.h
@@ -43,6 +43,11 @@
 #define X86_CR0_PG              0x80000000 /* Paging                   (RW) */
 
 /*
+ * Intel CPU flags in CR3
+ */
+#define X86_CR3_NOFLUSH (_AC(1, ULL) << 63)
+
+/*
  * Intel CPU features in CR4
  */
 #define X86_CR4_VME        0x00000001 /* enable vm86 extensions */
-- 
2.7.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.