[Xen-changelog] [PATCH] VMX world switch

ChangeSet 1.1345, 2005/04/20 21:35:17+01:00, leendert@xxxxxxxxxxxxxx

        [PATCH] VMX world switch
        
        The attached code implements a VMX world switch to vmxassist (a
        small assist module residing in a VMX-enabled partition, where it
        is responsible for emulating real mode) whenever CR0.PE is
        disabled.
        
        The patch temporarily disables the PGE feature flag in cpuid as it is
        currently broken (try running an unmodified 2.6 kernel that sets PGE in
        mm/init.c/paging_init()).
        
        The patch adds consistency checks before setting the
        ARCH_VMX_IO_WAIT state to detect race conditions on SMP systems.
        
        Signed-off-by: Leendert van Doorn <leendert@xxxxxxxxxxxxxx>
        Signed-off-by: ian@xxxxxxxxxxxxx

 arch/x86/vmx.c              |  281 ++++++++++++++++++++++++++++++++++++++++++--
 arch/x86/vmx_platform.c     |    5 
 include/asm-x86/vmx_vmcs.h  |   13 --
 include/public/vmx_assist.h |  101 +++++++++++++++
 4 files changed, 382 insertions(+), 18 deletions(-)
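
Before the diff itself, a sketch of where the new world switch would be
invoked. This is a hypothetical illustration, not part of the patch: the
handler name vmx_set_cr0() and its surrounding logic are assumptions,
while vmx_assist(), VMX_ASSIST_INVOKE, and struct exec_domain come from
the patch below.

    /*
     * Hypothetical sketch: how a CR0 intercept in vmx.c might trigger
     * the world switch described in the changelog. The function name
     * vmx_set_cr0() is an assumption; vmx_assist() and
     * VMX_ASSIST_INVOKE are defined by the patch below.
     */
    static int vmx_set_cr0(struct exec_domain *d, unsigned long value)
    {
        if (!(value & X86_CR0_PE)) {
            /*
             * The guest cleared CR0.PE (left protected mode): switch
             * worlds into vmxassist, which emulates real mode inside
             * the partition.
             */
            return vmx_assist(d, VMX_ASSIST_INVOKE);
        }
        /* ... normal protected-mode CR0 handling ... */
        return 1;
    }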


diff -Nru a/xen/arch/x86/vmx.c b/xen/arch/x86/vmx.c
--- a/xen/arch/x86/vmx.c        2005-04-20 17:02:40 -04:00
+++ b/xen/arch/x86/vmx.c        2005-04-20 17:02:40 -04:00
@@ -195,6 +195,7 @@
     cpuid(input, &eax, &ebx, &ecx, &edx);
 
     if (input == 1) {
+        clear_bit(X86_FEATURE_PGE, &edx); /* temporarily disabled */
         clear_bit(X86_FEATURE_PSE, &edx);
         clear_bit(X86_FEATURE_PAE, &edx);
         clear_bit(X86_FEATURE_PSE36, &edx);
@@ -382,10 +383,261 @@
     do_block();
 }
 
-static int
-vm86assist(struct exec_domain *d)
+enum { COPY_IN = 0, COPY_OUT };
+
+static inline int
+vmx_copy(void *buf, unsigned long laddr, int size, int dir)
+{
+    unsigned char *addr;
+    unsigned long mfn;
+
+    if ((size + (laddr & (PAGE_SIZE - 1))) >= PAGE_SIZE) {
+        printf("vmx_copy exceeds page boundary\n");
+        return 0;
+    }
+
+    mfn = phys_to_machine_mapping(gva_to_gpte(laddr) >> PAGE_SHIFT);
+    addr = map_domain_mem((mfn << PAGE_SHIFT) | (laddr & ~PAGE_MASK));
+
+    if (dir == COPY_IN)
+        memcpy(buf, addr, size);
+    else
+        memcpy(addr, buf, size);
+
+    unmap_domain_mem(addr);
+    return 1;
+}
+
+int
+vmx_world_save(struct exec_domain *d, struct vmx_assist_context *c)
+{
+    unsigned long inst_len;
+    int error = 0;
+
+    error |= __vmread(INSTRUCTION_LEN, &inst_len);
+    error |= __vmread(GUEST_EIP, &c->eip);
+    c->eip += inst_len; /* skip transition instruction */
+    error |= __vmread(GUEST_ESP, &c->esp);
+    error |= __vmread(GUEST_EFLAGS, &c->eflags);
+
+    error |= __vmread(CR0_READ_SHADOW, &c->cr0);
+    c->cr3 = d->arch.arch_vmx.cpu_cr3;
+    error |= __vmread(CR4_READ_SHADOW, &c->cr4);
+
+    error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
+    error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
+
+    error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
+    error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
+
+    error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
+    error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
+    error |= __vmread(GUEST_CS_BASE, &c->cs_base);
+    error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
+
+    error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
+    error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
+    error |= __vmread(GUEST_DS_BASE, &c->ds_base);
+    error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
+
+    error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
+    error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
+    error |= __vmread(GUEST_ES_BASE, &c->es_base);
+    error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
+
+    error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
+    error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
+    error |= __vmread(GUEST_SS_BASE, &c->ss_base);
+    error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
+
+    error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
+    error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
+    error |= __vmread(GUEST_FS_BASE, &c->fs_base);
+    error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
+
+    error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
+    error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
+    error |= __vmread(GUEST_GS_BASE, &c->gs_base);
+    error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
+
+    error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
+    error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
+    error |= __vmread(GUEST_TR_BASE, &c->tr_base);
+    error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
+
+    error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
+    error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
+    error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
+    error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
+
+    return !error;
+}
+
+int
+vmx_world_restore(struct exec_domain *d, struct vmx_assist_context *c)
+{
+    unsigned long mfn, old_cr4;
+    int error = 0;
+
+    error |= __vmwrite(GUEST_EIP, c->eip);
+    error |= __vmwrite(GUEST_ESP, c->esp);
+    error |= __vmwrite(GUEST_EFLAGS, c->eflags);
+
+    error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
+
+    if (c->cr3 == d->arch.arch_vmx.cpu_cr3) {
+        /*
+         * This is a simple TLB flush, implying the guest has
+         * removed some translation or changed page attributes.
+         * We simply invalidate the shadow.
+         */
+        mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT);
+        if ((mfn << PAGE_SHIFT) != pagetable_val(d->arch.guest_table)) {
+            VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value=%lx", c->cr3);
+            domain_crash_synchronous();
+            return 0;
+        }
+        shadow_sync_all(d->domain);
+    } else {
+        /*
+         * If different, make a shadow. Check if the PDBR is valid
+         * first.
+         */
+        VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %lx", c->cr3);
+        if ((c->cr3 >> PAGE_SHIFT) > d->domain->max_pages) {
+            VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value=%lx", c->cr3);
+            domain_crash_synchronous();
+            return 0;
+        }
+        mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT);
+        d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+        update_pagetables(d);
+        /*
+         * arch.shadow_table should now hold the next CR3 for the shadow.
+         */
+        d->arch.arch_vmx.cpu_cr3 = c->cr3;
+        VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", c->cr3);
+        __vmwrite(GUEST_CR3, pagetable_val(d->arch.shadow_table));
+    }
+
+    error |= __vmread(CR4_READ_SHADOW, &old_cr4);
+    error |= __vmwrite(GUEST_CR4, (c->cr4 | X86_CR4_VMXE));
+    error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
+
+    error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
+    error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
+
+    error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
+    error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
+
+    error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
+    error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
+    error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
+    error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
+    error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
+    error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
+    error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
+    error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
+    error |= __vmwrite(GUEST_ES_BASE, c->es_base);
+    error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
+    error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
+    error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
+    error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
+    error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
+    error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
+    error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
+    error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
+    error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
+    error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
+    error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
+    error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
+    error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
+    error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
+    error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
+    error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
+
+    return !error;
+}
+
+enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
+
+int
+vmx_assist(struct exec_domain *d, int mode)
 {
-    /* stay tuned ... */
+    struct vmx_assist_context c;
+    unsigned long magic, cp;
+
+    /* make sure vmxassist exists (its absence is not an error) */
+    if (!vmx_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), COPY_IN))
+        return 0;
+    if (magic != VMXASSIST_MAGIC)
+        return 0;
+
+    switch (mode) {
+    /*
+     * Transfer control to vmxassist.
+     * Store the current context at the address found in
+     * VMXASSIST_OLD_CONTEXT and load the context found in
+     * VMXASSIST_NEW_CONTEXT. The latter was created by vmxassist
+     * and transfers control to it.
+     */
+    case VMX_ASSIST_INVOKE:
+        /* save the old context */
+        if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), COPY_IN))
+            goto error;
+        if (cp != 0) {
+            if (!vmx_world_save(d, &c))
+                goto error;
+            if (!vmx_copy(&c, cp, sizeof(c), COPY_OUT))
+                goto error;
+        }
+
+        /* restore the new context; this should activate vmxassist */
+        if (!vmx_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), COPY_IN))
+            goto error;
+        if (cp != 0) {
+            if (!vmx_copy(&c, cp, sizeof(c), COPY_IN))
+                goto error;
+            if (!vmx_world_restore(d, &c))
+                goto error;
+            return 1;
+        }

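A usage note on the vmx_copy() helper added above: it maps the guest page
that backs a guest linear address and copies a chunk that must not cross
a page boundary, returning 1 on success and 0 on failure. A minimal,
hypothetical caller follows; the guest linear address 0x1000 and the
read-modify-write are invented for illustration.

    /*
     * Hypothetical sketch only: exercises the vmx_copy() helper from
     * the patch above. The guest linear address 0x1000 is arbitrary;
     * vmx_copy() rejects any copy that would cross a page boundary
     * and returns 0 in that case.
     */
    unsigned long word;

    if (vmx_copy(&word, 0x1000, sizeof(word), COPY_IN)) {   /* guest -> buf */
        word |= 1;                                          /* modify */
        vmx_copy(&word, 0x1000, sizeof(word), COPY_OUT);    /* buf -> guest */
    }

Keeping each copy within a single page lets the helper get away with one
map_domain_mem()/unmap_domain_mem() pair per call.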