[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen staging] x86/vmx: Don't leak EFER.NXE into guest context



commit fd32dcfe4c9a539f8e5d26ff4c5ca50ee54556b2
Author:     Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
AuthorDate: Tue May 23 17:32:30 2017 +0100
Commit:     Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
CommitDate: Wed Jul 4 12:12:15 2018 +0100

    x86/vmx: Don't leak EFER.NXE into guest context
    
    Intel hardware only uses 4 bits in MSR_EFER.  Changes to LME and LMA are
    handled automatically via the VMENTRY_CTLS.IA32E_MODE bit.
    
    SCE is handled by ad-hoc logic in context_switch(), vmx_restore_guest_msrs()
    and vmx_update_guest_efer(), and works by altering the host SCE value to 
match
    the setting the guest wants.  This works because, in HVM vcpu context, Xen
    never needs to execute a SYSCALL or SYSRET instruction.
    
    However, NXE has never been context switched.  Unlike SCE, NXE cannot be
    context switched at vcpu boundaries because disabling NXE makes PTE.NX bits
    reserved and cause a pagefault when encountered.  This means that the guest
    always has Xen's setting in effect, irrespective of the bit it can see and
    modify in its virtualised view of MSR_EFER.
    
    This isn't a major problem for production operating systems because they, 
like
    Xen, always turn the NXE on when it is available.  However, it does have an
    observable effect on which guest PTE bits are valid, and whether
    PFEC_insn_fetch is visible in a #PF error code.
    
    Second generation VT-x hardware has host and guest EFER fields in the VMCS,
    and support for loading and saving them automatically.  First generation 
VT-x
    hardware needs to use MSR load/save lists to cause an atomic switch of
    MSR_EFER on vmentry/exit.
    
    Therefore we update vmx_init_vmcs_config() to find and use guest/host EFER
    support when available (and MSR load/save lists on older hardware) and drop
    all ad-hoc alteration of SCE.
    
    There are two minor complications when selecting the EFER setting:
     * For shadow guests, NXE is a paging setting and must remain under host
       control, but this is fine as Xen also handles the pagefaults.
     * When the Unrestricted Guest control is clear, hardware doesn't tolerate 
LME
       and LMA being different.  This doesn't matter in practice as we intercept
       all writes to CR0 and reads from MSR_EFER, so can provide architecturally
       consistent behaviour from the guests point of view.
    
    With changing how EFER is loaded, vmcs_dump_vcpu() needs adjusting.  Read 
EFER
    from the appropriate information source, and identify when dumping the guest
    EFER value which source was used.
    
    As a result of fixing EFER context switching, we can remove the 
Intel-special
    case from hvm_nx_enabled() and let guest_walk_tables() work with the real
    guest paging settings.
    
    Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
    Reviewed-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
    Reviewed-by: Tim Deegan <tim@xxxxxxx>
    Reviewed-by: Kevin Tian <kevin.tian@xxxxxxxxx>
---
 xen/arch/x86/domain.c              | 10 -----
 xen/arch/x86/hvm/vmx/vmcs.c        | 26 ++++++++----
 xen/arch/x86/hvm/vmx/vmx.c         | 84 ++++++++++++++++++++++++++++++--------
 xen/include/asm-x86/hvm/hvm.h      |  4 +-
 xen/include/asm-x86/hvm/vmx/vmcs.h | 16 ++++++++
 5 files changed, 102 insertions(+), 38 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 9850a782ec..6201dffb9a 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1723,16 +1723,6 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
     {
         __context_switch();
 
-        if ( is_pv_domain(nextd) &&
-             (is_idle_domain(prevd) ||
-              is_hvm_domain(prevd) ||
-              is_pv_32bit_domain(prevd) != is_pv_32bit_domain(nextd)) )
-        {
-            uint64_t efer = read_efer();
-            if ( !(efer & EFER_SCE) )
-                write_efer(efer | EFER_SCE);
-        }
-
         /* Re-enable interrupts before restoring state which may fault. */
         local_irq_enable();
 
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index cce47f4282..6059d614ff 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -342,8 +342,8 @@ static int vmx_init_vmcs_config(void)
     }
 
     min = VM_EXIT_ACK_INTR_ON_EXIT;
-    opt = VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT |
-          VM_EXIT_CLEAR_BNDCFGS;
+    opt = (VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT |
+           VM_EXIT_LOAD_HOST_EFER | VM_EXIT_CLEAR_BNDCFGS);
     min |= VM_EXIT_IA32E_MODE;
     _vmx_vmexit_control = adjust_vmx_controls(
         "VMExit Control", min, opt, MSR_IA32_VMX_EXIT_CTLS, &mismatch);
@@ -383,7 +383,8 @@ static int vmx_init_vmcs_config(void)
         _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS;
 
     min = 0;
-    opt = VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_BNDCFGS;
+    opt = (VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_GUEST_EFER |
+           VM_ENTRY_LOAD_BNDCFGS);
     _vmx_vmentry_control = adjust_vmx_controls(
         "VMEntry Control", min, opt, MSR_IA32_VMX_ENTRY_CTLS, &mismatch);
 
@@ -1150,6 +1151,8 @@ static int construct_vmcs(struct vcpu *v)
         v->arch.hvm_vmx.host_cr0 |= X86_CR0_TS;
     __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
     __vmwrite(HOST_CR4, mmu_cr4_features);
+    if ( cpu_has_vmx_efer )
+        __vmwrite(HOST_EFER, read_efer());
 
     /* Host CS:RIP. */
     __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
@@ -1907,7 +1910,17 @@ void vmcs_dump_vcpu(struct vcpu *v)
     vmentry_ctl = vmr32(VM_ENTRY_CONTROLS),
     vmexit_ctl = vmr32(VM_EXIT_CONTROLS);
     cr4 = vmr(GUEST_CR4);
-    efer = vmr(GUEST_EFER);
+
+    /*
+     * The guests EFER setting comes from the GUEST_EFER VMCS field whenever
+     * available, or the guest load-only MSR list on Gen1 hardware, the entry
+     * for which may be elided for performance reasons if identical to Xen's
+     * setting.
+     */
+    if ( cpu_has_vmx_efer )
+        efer = vmr(GUEST_EFER);
+    else if ( vmx_read_guest_loadonly_msr(v, MSR_EFER, &efer) )
+        efer = read_efer();
 
     printk("*** Guest State ***\n");
     printk("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
@@ -1944,9 +1957,8 @@ void vmcs_dump_vcpu(struct vcpu *v)
     vmx_dump_sel("LDTR", GUEST_LDTR_SELECTOR);
     vmx_dump_sel2("IDTR", GUEST_IDTR_LIMIT);
     vmx_dump_sel("  TR", GUEST_TR_SELECTOR);
-    if ( (vmexit_ctl & (VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_SAVE_GUEST_EFER)) ||
-         (vmentry_ctl & (VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_GUEST_EFER)) )
-        printk("EFER = 0x%016lx  PAT = 0x%016lx\n", efer, vmr(GUEST_PAT));
+    printk("EFER(%s) = 0x%016lx  PAT = 0x%016lx\n",
+           cpu_has_vmx_efer ? "VMCS" : "MSR LL", efer, vmr(GUEST_PAT));
     printk("PreemptionTimer = 0x%08x  SM Base = 0x%08x\n",
            vmr32(GUEST_PREEMPTION_TIMER), vmr32(GUEST_SMBASE));
     printk("DebugCtl = 0x%016lx  DebugExceptions = 0x%016lx\n",
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 2ff9d684a3..12e0ee5c4e 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -509,15 +509,6 @@ static void vmx_restore_guest_msrs(struct vcpu *v)
     wrmsrl(MSR_LSTAR,          v->arch.hvm_vmx.lstar);
     wrmsrl(MSR_SYSCALL_MASK,   v->arch.hvm_vmx.sfmask);
 
-    if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_SCE )
-    {
-        HVM_DBG_LOG(DBG_LEVEL_2,
-                    "restore guest's EFER with value %lx",
-                    v->arch.hvm_vcpu.guest_efer);
-        write_efer((read_efer() & ~EFER_SCE) |
-                   (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
-    }
-
     if ( cpu_has_rdtscp )
         wrmsr_tsc_aux(hvm_msr_tsc_aux(v));
 }
@@ -1649,22 +1640,79 @@ static void vmx_update_guest_cr(struct vcpu *v, 
unsigned int cr,
 
 static void vmx_update_guest_efer(struct vcpu *v)
 {
-    unsigned long vm_entry_value;
+    unsigned long entry_ctls, guest_efer = v->arch.hvm_vcpu.guest_efer,
+        xen_efer = read_efer();
+
+    if ( paging_mode_shadow(v->domain) )
+    {
+        /*
+         * When using shadow pagetables, EFER.NX is a Xen-owned bit and is not
+         * under guest control.
+         */
+        guest_efer &= ~EFER_NX;
+        guest_efer |= xen_efer & EFER_NX;
+    }
+
+    if ( !vmx_unrestricted_guest(v) )
+    {
+        /*
+         * When Unrestricted Guest is not enabled in the VMCS, hardware does
+         * not tolerate the LME and LMA settings being different.  As writes
+         * to CR0 are intercepted, it is safe to leave LME clear at this
+         * point, and fix up both LME and LMA when CR0.PG is set.
+         */
+        if ( !(guest_efer & EFER_LMA) )
+            guest_efer &= ~EFER_LME;
+    }
 
     vmx_vmcs_enter(v);
 
-    __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
-    if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
-        vm_entry_value |= VM_ENTRY_IA32E_MODE;
+    /*
+     * The intended guest running mode is derived from VM_ENTRY_IA32E_MODE,
+     * which (architecturally) is the guest's LMA setting.
+     */
+    __vmread(VM_ENTRY_CONTROLS, &entry_ctls);
+
+    entry_ctls &= ~VM_ENTRY_IA32E_MODE;
+    if ( guest_efer & EFER_LMA )
+        entry_ctls |= VM_ENTRY_IA32E_MODE;
+
+    __vmwrite(VM_ENTRY_CONTROLS, entry_ctls);
+
+    /* We expect to use EFER loading in the common case, but... */
+    if ( likely(cpu_has_vmx_efer) )
+        __vmwrite(GUEST_EFER, guest_efer);
+
+    /* ... on Gen1 VT-x hardware, we have to use MSR load/save lists instead. 
*/
     else
-        vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
-    __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
+    {
+        /*
+         * When the guests choice of EFER matches Xen's, remove the load/save
+         * list entries.  It is unnecessary overhead, especially as this is
+         * expected to be the common case for 64bit guests.
+         */
+        if ( guest_efer == xen_efer )
+        {
+            vmx_del_msr(v, MSR_EFER, VMX_MSR_HOST);
+            vmx_del_msr(v, MSR_EFER, VMX_MSR_GUEST_LOADONLY);
+        }
+        else
+        {
+            vmx_add_msr(v, MSR_EFER, xen_efer, VMX_MSR_HOST);
+            vmx_add_msr(v, MSR_EFER, guest_efer, VMX_MSR_GUEST_LOADONLY);
+        }
+    }
 
     vmx_vmcs_exit(v);
 
-    if ( v == current )
-        write_efer((read_efer() & ~EFER_SCE) |
-                   (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
+    /*
+     * If the guests virtualised view of MSR_EFER matches the value loaded
+     * into hardware, clear the read intercept to avoid unnecessary VMExits.
+     */
+    if ( guest_efer == v->arch.hvm_vcpu.guest_efer )
+        vmx_clear_msr_intercept(v, MSR_EFER, VMX_MSR_R);
+    else
+        vmx_set_msr_intercept(v, MSR_EFER, VMX_MSR_R);
 }
 
 void nvmx_enqueue_n2_exceptions(struct vcpu *v, 
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index ef5e198ebd..fcfc5cfcb2 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -296,10 +296,8 @@ int hvm_girq_dest_2_vcpu_id(struct domain *d, uint8_t 
dest, uint8_t dest_mode);
     (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_SMEP))
 #define hvm_smap_enabled(v) \
     (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_SMAP))
-/* HVM guests on Intel hardware leak Xen's NX settings into guest context. */
 #define hvm_nx_enabled(v) \
-    ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && cpu_has_nx) ||    \
-     ((v)->arch.hvm_vcpu.guest_efer & EFER_NX))
+    ((v)->arch.hvm_vcpu.guest_efer & EFER_NX)
 #define hvm_pku_enabled(v) \
     (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PKE))
 
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h 
b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 66e86f8786..c4d4f15d29 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -306,6 +306,8 @@ extern u64 vmx_ept_vpid_cap;
     (vmx_cpu_based_exec_control & CPU_BASED_MONITOR_TRAP_FLAG)
 #define cpu_has_vmx_pat \
     (vmx_vmentry_control & VM_ENTRY_LOAD_GUEST_PAT)
+#define cpu_has_vmx_efer \
+    (vmx_vmentry_control & VM_ENTRY_LOAD_GUEST_EFER)
 #define cpu_has_vmx_unrestricted_guest \
     (vmx_secondary_exec_control & SECONDARY_EXEC_UNRESTRICTED_GUEST)
 #define vmx_unrestricted_guest(v)               \
@@ -591,6 +593,20 @@ static inline int vmx_read_guest_msr(const struct vcpu *v, 
uint32_t msr,
     return 0;
 }
 
+static inline int vmx_read_guest_loadonly_msr(
+    const struct vcpu *v, uint32_t msr, uint64_t *val)
+{
+    const struct vmx_msr_entry *ent =
+        vmx_find_msr(v, msr, VMX_MSR_GUEST_LOADONLY);
+
+    if ( !ent )
+        return -ESRCH;
+
+    *val = ent->data;
+
+    return 0;
+}
+
 static inline int vmx_write_guest_msr(struct vcpu *v, uint32_t msr,
                                       uint64_t val)
 {
--
generated by git-patchbot for /home/xen/git/xen.git#staging

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.