[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] VMX world switch



The attached code implements a VMX world switch to vmxassist (a small assist
module residing in a VMX enabled partition where it is responsible for
emulating real mode) whenever CR0.PE is disabled.

The patch temporarily disables the PGE feature flag in cpuid as it is
currently broken (try running an unmodified 2.6 kernel that sets PGE in
mm/init.c/paging_init()).

The patch adds consistency checks before setting the ARCH_VMX_IO_WAIT state
to detect race conditions on SMP systems.

        Signed-Off-By: Leendert van Doorn <leendert@xxxxxxxxxxxxxx>

BTW: vmxassist is undergoing cleanups and will be released soon. I'm still
exploring the other two problems, but the current workarounds allow me to make
progress on vmxassist.

        Leendert


diff --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet --exclude=PENDING 
-ru -Bb -N xeno-unstable.bk/xen/arch/x86/vmx.c 
xeno-unstable.real-mode/xen/arch/x86/vmx.c
--- xeno-unstable.bk/xen/arch/x86/vmx.c 2005-04-17 11:47:01.000000000 -0400
+++ xeno-unstable.real-mode/xen/arch/x86/vmx.c  2005-04-17 05:24:37.000000000 
-0400
@@ -194,6 +194,7 @@
     cpuid(input, &eax, &ebx, &ecx, &edx);
 
     if (input == 1) {
+        clear_bit(X86_FEATURE_PGE, &edx); /* temporarily disabled */
         clear_bit(X86_FEATURE_PSE, &edx);
         clear_bit(X86_FEATURE_PAE, &edx);
         clear_bit(X86_FEATURE_PSE36, &edx);
@@ -381,10 +382,261 @@
     do_block();
 }
 
-static int
-vm86assist(struct exec_domain *d)
+enum { COPY_IN = 0, COPY_OUT }; /* vmx_copy dir: IN fills buf, OUT writes buf out */
+
+static inline int /* 1 on success, 0 if the range would cross a page boundary */
+vmx_copy(void *buf, unsigned long laddr, int size, int dir)
 {
-    /* stay tuned ... */
+    unsigned char *addr; /* mapping of the page behind laddr, at laddr's offset */
+    unsigned long mfn;
+
+    if ((size + (laddr & (PAGE_SIZE - 1))) > PAGE_SIZE) { /* ending on the last byte is OK */
+       printf("vmx_copy exceeds page boundary\n");
+       return 0;
+    }
+
+    mfn = phys_to_machine_mapping(gva_to_gpte(laddr) >> PAGE_SHIFT);
+    addr = map_domain_mem((mfn << PAGE_SHIFT) | (laddr & ~PAGE_MASK));
+
+    if (dir == COPY_IN)
+           memcpy(buf, addr, size); /* mapped page -> buf */
+    else
+           memcpy(addr, buf, size); /* buf -> mapped page */
+
+    unmap_domain_mem(addr);
+    return 1;
+}
+
+int /* 1 if every __vmread returned 0, otherwise 0 */
+vmx_world_save(struct exec_domain *d, struct vmx_assist_context *c)
+{ /* Fill *c from the VMCS guest fields; c->cr3 is taken from d's cpu_cr3 instead. */
+    unsigned long inst_len;
+    int error = 0; /* OR of all __vmread results */
+
+    error |= __vmread(INSTRUCTION_LEN, &inst_len);
+    error |= __vmread(GUEST_EIP, &c->eip);
+    c->eip += inst_len; /* skip transition instruction */
+    error |= __vmread(GUEST_ESP, &c->esp);
+    error |= __vmread(GUEST_EFLAGS, &c->eflags);
+
+    error |= __vmread(CR0_READ_SHADOW, &c->cr0); /* CR0 comes from the read shadow */
+    c->cr3 = d->arch.arch_vmx.cpu_cr3; /* not read from the VMCS */
+    error |= __vmread(CR4_READ_SHADOW, &c->cr4); /* CR4 comes from the read shadow */
+
+    error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
+    error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
+
+    error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
+    error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
+
+    error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel); /* per segment: sel, limit, base, AR */
+    error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
+    error |= __vmread(GUEST_CS_BASE, &c->cs_base);
+    error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
+
+    error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
+    error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
+    error |= __vmread(GUEST_DS_BASE, &c->ds_base);
+    error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
+
+    error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
+    error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
+    error |= __vmread(GUEST_ES_BASE, &c->es_base);
+    error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
+
+    error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
+    error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
+    error |= __vmread(GUEST_SS_BASE, &c->ss_base);
+    error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
+
+    error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
+    error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
+    error |= __vmread(GUEST_FS_BASE, &c->fs_base);
+    error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
+
+    error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
+    error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
+    error |= __vmread(GUEST_GS_BASE, &c->gs_base);
+    error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
+
+    error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
+    error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
+    error |= __vmread(GUEST_TR_BASE, &c->tr_base);
+    error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
+
+    error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
+    error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
+    error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
+    error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
+
+    return !error;
+}
+
+int /* 1 if every VMCS access returned 0, else 0; a bad c->cr3 also crashes the domain */
+vmx_world_restore(struct exec_domain *d, struct vmx_assist_context *c)
+{ /* Write the register image in *c into the VMCS; switch guest_table if c->cr3 differs. */
+    unsigned long mfn, old_cr4;
+    int error = 0; /* OR of all __vmread/__vmwrite results */
+
+    error |= __vmwrite(GUEST_EIP, c->eip);
+    error |= __vmwrite(GUEST_ESP, c->esp);
+    error |= __vmwrite(GUEST_EFLAGS, c->eflags);
+
+    error |= __vmwrite(CR0_READ_SHADOW, c->cr0); /* only the read shadow is written for CR0 */
+
+    if (c->cr3 == d->arch.arch_vmx.cpu_cr3) {
+       /*
+        * This is a simple TLB flush, implying the guest has
+        * removed some translation or changed page attributes.
+        * We simply invalidate the shadow.
+        */
+       mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT);
+       if ((mfn << PAGE_SHIFT) != pagetable_val(d->arch.guest_table)) {
+           VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value=%lx", c->cr3);
+           domain_crash_synchronous();
+           return 0;
+       }
+       shadow_sync_all(d->domain);
+    } else {
+       /*
+        * If different, make a shadow. Check if the PDBR is valid
+        * first.
+        */
+       VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %lx", c->cr3);
+       if ((c->cr3 >> PAGE_SHIFT) > d->domain->max_pages) {
+           VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value=%lx", c->cr3);
+           domain_crash_synchronous(); 
+           return 0;
+       }
+       mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT);
+       d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+       update_pagetables(d);
+       /*
+        * arch.shadow_table should now hold the next CR3 for shadow
+        */
+       d->arch.arch_vmx.cpu_cr3 = c->cr3;
+       VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", c->cr3);
+       error |= __vmwrite(GUEST_CR3, pagetable_val(d->arch.shadow_table));
+    }
+
+    error |= __vmread(CR4_READ_SHADOW, &old_cr4); /* NOTE(review): old_cr4 is not used below */
+    error |= __vmwrite(GUEST_CR4, (c->cr4 | X86_CR4_VMXE)); /* VMXE forced on in GUEST_CR4 */
+    error |= __vmwrite(CR4_READ_SHADOW, c->cr4); /* shadow keeps the value from *c */
+
+    error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
+    error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
+
+    error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
+    error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
+
+    error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel); /* per segment: sel, limit, base, AR */
+    error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
+    error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
+    error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
+    error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
+    error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
+    error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
+    error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
+    error |= __vmwrite(GUEST_ES_BASE, c->es_base);
+    error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
+    error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
+    error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
+    error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
+    error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
+    error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
+    error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
+    error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
+    error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
+    error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
+    error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
+    error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
+    error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
+    error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
+    error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
+    error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
+
+    return !error;
+}
+
+enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE }; /* values for vmx_assist's mode */
+
+int /* 1 if a context was loaded; 0 if vmxassist is absent or after the error path */
+vmx_assist(struct exec_domain *d, int mode)
+{
+    struct vmx_assist_context c;
+    unsigned long magic, cp; /* cp: context pointer read from a VMXASSIST_*_CONTEXT slot */
+
+    /* make sure vmxassist exists (this is not an error) */
+    if (!vmx_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), COPY_IN))
+       return 0;
+    if (magic != VMXASSIST_MAGIC)
+       return 0;
+
+    switch (mode) {
+    /*
+     * Transfer control to vmxassist.
+     * Store the current context in VMXASSIST_OLD_CONTEXT and load
+     * the new VMXASSIST_NEW_CONTEXT context. This context was created
+     * by vmxassist and will transfer control to it.
+     */
+    case VMX_ASSIST_INVOKE:
+       /* save the old context (skipped when the OLD_CONTEXT pointer is 0) */
+       if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), COPY_IN))
+           goto error;
+       if (cp != 0) {
+           if (!vmx_world_save(d, &c))
+               goto error;
+           if (!vmx_copy(&c, cp, sizeof(c), COPY_OUT))
+               goto error;
+       }
+
+       /* restore the new context, this should activate vmxassist */
+       if (!vmx_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), COPY_IN))
+           goto error;
+       if (cp != 0) {
+            if (!vmx_copy(&c, cp, sizeof(c), COPY_IN))
+               goto error;
+           if (!vmx_world_restore(d, &c))
+               goto error;
+           return 1;
+       }
+       break; /* NEW_CONTEXT pointer is 0: continues into the error path below */
+
+    /*
+     * Restore the VMXASSIST_OLD_CONTEXT that was saved by VMX_ASSIST_INVOKE
+     * above.
+     */
+    case VMX_ASSIST_RESTORE:
+       /* reload the old context through the VMXASSIST_OLD_CONTEXT pointer */
+       if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), COPY_IN))
+           goto error;
+       if (cp != 0) {
+            if (!vmx_copy(&c, cp, sizeof(c), COPY_IN))
+               goto error;
+           if (!vmx_world_restore(d, &c))
+               goto error;
+           return 1;
+       }
+       break; /* OLD_CONTEXT pointer is 0: continues into the error path below */
+    }
+
+error:
+    printf("Failed to transfer to vmxassist\n");
+    domain_crash_synchronous(); 
     return 0;
 }
 
@@ -398,6 +650,7 @@
 {
     unsigned long value;
     unsigned long old_cr;
+    unsigned long eip;
     struct exec_domain *d = current;
 
     switch (gp) {
@@ -468,15 +721,28 @@
             put_page_and_type(&frame_table[old_base_mfn]);
         } else {
             if ((value & X86_CR0_PE) == 0) {
-               unsigned long eip;
-
                __vmread(GUEST_EIP, &eip);
                 VMX_DBG_LOG(DBG_LEVEL_1,
                        "Disabling CR0.PE at %%eip 0x%lx", eip);
-               if (vm86assist(d)) {
+               if (vmx_assist(d, VMX_ASSIST_INVOKE)) {
+                   set_bit(VMX_CPU_STATE_ASSIST_ENABLED,
+                                               &d->arch.arch_vmx.cpu_state);
                    __vmread(GUEST_EIP, &eip);
                    VMX_DBG_LOG(DBG_LEVEL_1,
-                       "Transfering control to vm86assist %%eip 0x%lx", eip);
+                       "Transfering control to vmxassist %%eip 0x%lx", eip);
+                   return 0; /* do not update eip! */
+               }
+           } else if (test_bit(VMX_CPU_STATE_ASSIST_ENABLED,
+                                       &d->arch.arch_vmx.cpu_state)) {
+               __vmread(GUEST_EIP, &eip);
+               VMX_DBG_LOG(DBG_LEVEL_1,
+                       "Enabling CR0.PE at %%eip 0x%lx", eip);
+               if (vmx_assist(d, VMX_ASSIST_RESTORE)) {
+                   clear_bit(VMX_CPU_STATE_ASSIST_ENABLED,
+                                               &d->arch.arch_vmx.cpu_state);
+                   __vmread(GUEST_EIP, &eip);
+                   VMX_DBG_LOG(DBG_LEVEL_1,
+                       "Restoring to %%eip 0x%lx", eip);
                    return 0; /* do not update eip! */
                }
            }
@@ -548,6 +814,7 @@
          */
         if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) {
             vmx_shadow_clear_state(d->domain);
+            shadow_sync_all(d->domain);
         }
         break;
     default:
diff --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet --exclude=PENDING 
-ru -Bb -N xeno-unstable.bk/xen/arch/x86/vmx_platform.c 
xeno-unstable.real-mode/xen/arch/x86/vmx_platform.c
--- xeno-unstable.bk/xen/arch/x86/vmx_platform.c        2005-04-17 
11:47:01.000000000 -0400
+++ xeno-unstable.real-mode/xen/arch/x86/vmx_platform.c 2005-04-17 
11:18:32.000000000 -0400
@@ -484,6 +484,11 @@
 
     vm86 = inst_decoder_regs->eflags & X86_EFLAGS_VM;
 
+    if (test_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags)) {
+        printf("VMX I/O has not yet completed\n");
+        domain_crash_synchronous();
+    }
+
     set_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags);
     p->dir = dir;
     p->pdata_valid = pvalid;
diff --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet --exclude=PENDING 
-ru -Bb -N xeno-unstable.bk/xen/include/asm-x86/vmx_vmcs.h 
xeno-unstable.real-mode/xen/include/asm-x86/vmx_vmcs.h
--- xeno-unstable.bk/xen/include/asm-x86/vmx_vmcs.h     2005-04-17 
11:47:01.000000000 -0400
+++ xeno-unstable.real-mode/xen/include/asm-x86/vmx_vmcs.h      2005-04-15 
13:15:45.000000000 -0400
@@ -22,24 +22,15 @@
 #include <asm/config.h>
 #include <asm/vmx_cpu.h>
 #include <asm/vmx_platform.h>
+#include <public/vmx_assist.h>
 
 extern int start_vmx(void);
 extern void stop_vmx(void);
 
 void vmx_enter_scheduler(void);
 
-union vmcs_arbytes {
-    struct arbyte_fields {
-        unsigned int 
-        seg_type: 4, s: 1, dpl: 2, p: 1, 
-        reserved0: 4, avl: 1, reserved1: 1,     
-        default_ops_size: 1, g: 1, null_bit: 1, 
-        reserved2: 15;
-    }  __attribute__((packed)) fields;
-    unsigned int bytes;
-};
-
 #define VMX_CPU_STATE_PG_ENABLED        0       
+#define        VMX_CPU_STATE_ASSIST_ENABLED    1
 #define VMCS_SIZE                       0x1000
 
 struct vmcs_struct {
diff --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet --exclude=PENDING 
-ru -Bb -N xeno-unstable.bk/xen/include/public/vmx_assist.h 
xeno-unstable.real-mode/xen/include/public/vmx_assist.h
--- xeno-unstable.bk/xen/include/public/vmx_assist.h    1969-12-31 
19:00:00.000000000 -0500
+++ xeno-unstable.real-mode/xen/include/public/vmx_assist.h     2005-04-17 
11:47:58.000000000 -0400
@@ -0,0 +1,101 @@
+/*
+ * vmx_assist.h: Context definitions for the VMXASSIST world switch.
+ *
+ * Leendert van Doorn, leendert@xxxxxxxxxxxxxx
+ * Copyright (c) 2005, International Business Machines Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef _VMX_ASSIST_H_
+#define _VMX_ASSIST_H_
+
+#define        VMXASSIST_BASE          0xE0000 /* base address for the offsets below */
+#define        VMXASSIST_MAGIC         0x17101966 /* value expected at VMXASSIST_MAGIC_OFFSET */
+#define        VMXASSIST_MAGIC_OFFSET  (VMXASSIST_BASE+8) /* read and compared by vmx_assist() */
+
+#define        VMXASSIST_NEW_CONTEXT   (VMXASSIST_BASE + 12) /* slot: pointer to context to load */
+#define        VMXASSIST_OLD_CONTEXT   (VMXASSIST_NEW_CONTEXT + 4) /* slot: pointer to saved context; NOTE(review): slots are 4 bytes apart but vmx_assist() reads sizeof(unsigned long) */
+
+#ifndef __ASSEMBLY__
+
+union vmcs_arbytes { /* segment access rights: bitfield view (fields) and raw view (bytes) */
+       struct arbyte_fields { /* widths below sum to 32 bits */
+               unsigned int    seg_type        : 4,
+                               s               : 1,
+                               dpl             : 2,
+                               p               : 1, 
+                               reserved0       : 4,
+                               avl             : 1,
+                               reserved1       : 1,     
+                               default_ops_size: 1,
+                               g               : 1,
+                               null_bit        : 1, 
+                               reserved2       : 15;
+       }  __attribute__((packed)) fields;
+       unsigned int bytes; /* the view used with the GUEST_*_AR_BYTES VMCS fields */
+};
+
+/*
+ * World switch state: filled by vmx_world_save(), consumed by vmx_world_restore().
+ */
+typedef struct vmx_assist_context { /* NOTE(review): unsigned long fields, so size depends on word size */
+       unsigned long           eip;            /* instruction pointer */
+       unsigned long           esp;            /* stack pointer */
+       unsigned long           eflags;         /* flags register */
+       unsigned long           cr0;
+       unsigned long           cr3;            /* page table directory */
+       unsigned long           cr4;
+       unsigned long           idtr_limit;     /* idt */
+       unsigned long           idtr_base;
+       unsigned long           gdtr_limit;     /* gdt */
+       unsigned long           gdtr_base;
+       unsigned long           cs_sel;         /* cs selector */
+       unsigned long           cs_limit;
+       unsigned long           cs_base;
+       union vmcs_arbytes      cs_arbytes;
+       unsigned long           ds_sel;         /* ds selector */
+       unsigned long           ds_limit;
+       unsigned long           ds_base;
+       union vmcs_arbytes      ds_arbytes;
+       unsigned long           es_sel;         /* es selector */
+       unsigned long           es_limit;
+       unsigned long           es_base;
+       union vmcs_arbytes      es_arbytes;
+       unsigned long           ss_sel;         /* ss selector */
+       unsigned long           ss_limit;
+       unsigned long           ss_base;
+       union vmcs_arbytes      ss_arbytes;
+       unsigned long           fs_sel;         /* fs selector */
+       unsigned long           fs_limit;
+       unsigned long           fs_base;
+       union vmcs_arbytes      fs_arbytes;
+       unsigned long           gs_sel;         /* gs selector */
+       unsigned long           gs_limit;
+       unsigned long           gs_base;
+       union vmcs_arbytes      gs_arbytes;
+       unsigned long           tr_sel;         /* task selector */
+       unsigned long           tr_limit;
+       unsigned long           tr_base;
+       union vmcs_arbytes      tr_arbytes;
+       unsigned long           ldtr_sel;       /* ldtr selector */
+       unsigned long           ldtr_limit;
+       unsigned long           ldtr_base;
+       union vmcs_arbytes      ldtr_arbytes;
+} vmx_assist_context_t;
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _VMX_ASSIST_H_ */
+

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.