
[Xen-changelog] [xen-unstable] Use virtual 8086 mode for VMX guests with CR0.PE == 0
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1228840082 0
# Node ID 6595393a3d28a7bf95f02b198f52d754bcfa7a80
# Parent  5535efd8e01141f840f9a8cbc31a9b3a4c9d49e9
Use virtual 8086 mode for VMX guests with CR0.PE == 0

When a VMX guest tries to enter real mode, put it in virtual 8086 mode
instead, if that's possible.  Handle all errors and corner cases by
falling back to the real-mode emulator.

This is similar to the old VMXASSIST system except it uses Xen's
x86_emulate emulator instead of having a partial emulator in the guest
firmware.  It more than doubles the speed of real-mode operation on
VMX.

Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---
 tools/firmware/hvmloader/hvmloader.c   |   19 ++
 tools/libxc/xc_domain_restore.c        |   16 ++
 tools/libxc/xc_domain_save.c           |   26 ++-
 xen/arch/x86/hvm/vmx/entry.S           |   14 +
 xen/arch/x86/hvm/vmx/realmode.c        |   45 ++---
 xen/arch/x86/hvm/vmx/vmcs.c            |   51 ++++--
 xen/arch/x86/hvm/vmx/vmx.c             |  250 ++++++++++++++++++++++++++++-----
 xen/arch/x86/x86_32/asm-offsets.c      |    4 
 xen/arch/x86/x86_64/asm-offsets.c      |    4 
 xen/arch/x86/x86_emulate/x86_emulate.h |    1 
 xen/include/asm-x86/hvm/vmx/vmcs.h     |   13 +
 xen/include/asm-x86/perfc_defn.h       |    3 
 xen/include/public/hvm/params.h        |    5 
 13 files changed, 356 insertions(+), 95 deletions(-)
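
In outline, the fast path this patch adds makes the following decision on every vmenter (a minimal C paraphrase of the entry.S and realmode.c hunks below; must_emulate_before_vmenter is an illustrative helper, not a function in the patch):

    /* Sketch: the checks made before VMENTER, consolidated. */
    static int must_emulate_before_vmenter(const struct arch_vmx_struct *vmx)
    {
        if ( vmx->vmx_emulate )
            return 1;                       /* committed to x86_emulate */
        if ( vmx->vmx_realmode && vmx->vm86_segment_mask != 0 )
            return 1;                       /* a segment is vm86-unsafe */
        return 0;   /* VMENTER; if vmx_realmode, go via virtual 8086 mode */
    }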

diff -r 5535efd8e011 -r 6595393a3d28 tools/firmware/hvmloader/hvmloader.c
--- a/tools/firmware/hvmloader/hvmloader.c      Tue Dec 09 13:23:15 2008 +0000
+++ b/tools/firmware/hvmloader/hvmloader.c      Tue Dec 09 16:28:02 2008 +0000
@@ -536,6 +536,23 @@ static uint16_t init_xen_platform_io_bas
     return bios_info->xen_pfiob;
 }
 
+/* Set up an empty TSS area for virtual 8086 mode to use.
+ * The only important thing is that it mustn't have any bits set
+ * in the interrupt redirection bitmap, so all zeros will do.  */
+static void init_vm86_tss(void)
+{
+    uint32_t tss;
+    struct xen_hvm_param p;
+
+    tss = e820_malloc(128, 128);
+    memset((char *)tss, 0, 128);
+    p.domid = DOMID_SELF;
+    p.index = HVM_PARAM_VM86_TSS;
+    p.value = tss;
+    hypercall_hvm_op(HVMOP_set_param, &p);
+    printf("vm86 TSS at %08x\n", tss);
+}
+
 int main(void)
 {
     int option_rom_sz = 0, vgabios_sz = 0, etherboot_sz = 0;
@@ -605,6 +622,8 @@ int main(void)
         printf("Loading ACPI ...\n");
         acpi_build_tables();
     }
+
+    init_vm86_tss();
 
     cmos_write_memory_size();
 
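As background on why an all-zero TSS suffices: with CR4.VME set, the CPU
consults a 256-bit software-interrupt redirection bitmap held in this TSS,
and a clear bit lets INT n run through the real-mode IVT without faulting
out to Xen.  A sketch of the per-vector test (the bitmap's exact position
within the TSS is elided here):

    #include <stdint.h>

    /* One bit per vector; with the all-zero TSS above, every software
     * INT is handled inside virtual 8086 mode with no vmexit. */
    static int int_n_faults_to_monitor(const uint8_t redir_bitmap[32],
                                       uint8_t vector)
    {
        return (redir_bitmap[vector / 8] >> (vector % 8)) & 1;
    }
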
diff -r 5535efd8e011 -r 6595393a3d28 tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c   Tue Dec 09 13:23:15 2008 +0000
+++ b/tools/libxc/xc_domain_restore.c   Tue Dec 09 16:28:02 2008 +0000
@@ -490,6 +490,22 @@ int xc_domain_restore(int xc_handle, int
             continue;
         }
 
+        if ( j == -4 )
+        {
+            uint64_t vm86_tss;
+
+            /* Skip the 4 padding bytes, then read the vm86 TSS location. */
+            if ( read_exact(io_fd, &vm86_tss, sizeof(uint32_t)) ||
+                 read_exact(io_fd, &vm86_tss, sizeof(uint64_t)) )
+            {
+                ERROR("error reading the address of the vm86 TSS");
+                goto out;
+            }
+
+            xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
+            continue;
+        }
+
         if ( j == 0 )
             break;  /* our work here is done */
 
diff -r 5535efd8e011 -r 6595393a3d28 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c      Tue Dec 09 13:23:15 2008 +0000
+++ b/tools/libxc/xc_domain_save.c      Tue Dec 09 16:28:02 2008 +0000
@@ -1388,18 +1388,30 @@ int xc_domain_save(int xc_handle, int io
     if ( hvm )
     {
         struct {
-            int minusthree;
+            int id;
             uint32_t pad;
-            uint64_t ident_pt;
-        } chunk = { -3, 0 };
-
+            uint64_t data;
+        } chunk = { 0, };
+
+        chunk.id = -3;
         xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
-                         (unsigned long *)&chunk.ident_pt);
-
-        if ( (chunk.ident_pt != 0) &&
+                         (unsigned long *)&chunk.data);
+
+        if ( (chunk.data != 0) &&
              write_exact(io_fd, &chunk, sizeof(chunk)) )
         {
             PERROR("Error when writing the ident_pt for EPT guest");
+            goto out;
+        }
+
+        chunk.id = -4;
+        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
+                         (unsigned long *)&chunk.data);
+
+        if ( (chunk.data != 0) &&
+             write_exact(io_fd, &chunk, sizeof(chunk)) )
+        {
+            PERROR("Error when writing the vm86 TSS for the guest");
             goto out;
         }
     }
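
Spelled out, the save-stream framing that both of these hunks rely on looks
like this (the struct mirrors the anonymous one above; the name is
illustrative):

    #include <stdint.h>

    /* An optional HVM record in the save stream.  A negative id tags the
     * type (-3 == EPT ident_pt, -4 == vm86 TSS address); the pad field
     * keeps 'data' 8-byte aligned on the wire, which is why the restore
     * side reads and discards 4 bytes before the value. */
    struct hvm_param_chunk {
        int32_t  id;    /* read back as 'j' by xc_domain_restore() */
        uint32_t pad;
        uint64_t data;  /* the HVM_PARAM_* value */
    };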
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/hvm/vmx/entry.S
--- a/xen/arch/x86/hvm/vmx/entry.S      Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/entry.S      Tue Dec 09 16:28:02 2008 +0000
@@ -133,9 +133,15 @@ vmx_asm_do_vmentry:
         cmpl $0,(r(dx),r(ax),1)
         jnz  .Lvmx_process_softirqs
 
-        testb $0xff,VCPU_vmx_emul(r(bx))
-        jnz  .Lvmx_goto_realmode
-
+        testb $0xff,VCPU_vmx_emulate(r(bx))
+        jnz .Lvmx_goto_emulator
+        testb $0xff,VCPU_vmx_realmode(r(bx))
+        jz .Lvmx_not_realmode
+        cmpw $0,VCPU_vm86_seg_mask(r(bx))
+        jnz .Lvmx_goto_emulator
+        call_with_regs(vmx_enter_realmode) 
+
+.Lvmx_not_realmode:
         mov  VCPU_hvm_guest_cr2(r(bx)),r(ax)
         mov  r(ax),%cr2
         call vmx_trace_vmentry
@@ -189,7 +195,7 @@ vmx_asm_do_vmentry:
         call vm_launch_fail
         ud2
 
-.Lvmx_goto_realmode:
+.Lvmx_goto_emulator:
         sti
         call_with_regs(vmx_realmode)
         jmp  vmx_asm_do_vmentry
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/hvm/vmx/realmode.c
--- a/xen/arch/x86/hvm/vmx/realmode.c   Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/realmode.c   Tue Dec 09 16:28:02 2008 +0000
@@ -103,30 +103,12 @@ static void realmode_emulate_one(struct 
 static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
 {
     struct vcpu *curr = current;
-    unsigned long seg_reg_dirty;
     uint32_t intr_info;
     int rc;
 
-    seg_reg_dirty = hvmemul_ctxt->seg_reg_dirty;
-    hvmemul_ctxt->seg_reg_dirty = 0;
+    perfc_incr(realmode_emulations);
 
     rc = hvm_emulate_one(hvmemul_ctxt);
-
-    if ( test_bit(x86_seg_cs, &hvmemul_ctxt->seg_reg_dirty) )
-    {
-        curr->arch.hvm_vmx.vmxemul &= ~VMXEMUL_BAD_CS;
-        if ( hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt)->sel & 3 )
-            curr->arch.hvm_vmx.vmxemul |= VMXEMUL_BAD_CS;
-    }
-
-    if ( test_bit(x86_seg_ss, &hvmemul_ctxt->seg_reg_dirty) )
-    {
-        curr->arch.hvm_vmx.vmxemul &= ~VMXEMUL_BAD_SS;
-        if ( hvmemul_get_seg_reg(x86_seg_ss, hvmemul_ctxt)->sel & 3 )
-            curr->arch.hvm_vmx.vmxemul |= VMXEMUL_BAD_SS;
-    }
-
-    hvmemul_ctxt->seg_reg_dirty |= seg_reg_dirty;
 
     if ( rc == X86EMUL_UNHANDLEABLE )
     {
@@ -210,7 +192,8 @@ void vmx_realmode(struct cpu_user_regs *
         intr_info = 0;
     }
 
-    while ( curr->arch.hvm_vmx.vmxemul &&
+    curr->arch.hvm_vmx.vmx_emulate = 1;
+    while ( curr->arch.hvm_vmx.vmx_emulate &&
             !softirq_pending(smp_processor_id()) &&
             (curr->arch.hvm_vcpu.io_state == HVMIO_none) )
     {
@@ -220,13 +203,27 @@ void vmx_realmode(struct cpu_user_regs *
          * in real mode, because we don't emulate protected-mode IDT vectoring.
          */
         if ( unlikely(!(++emulations & 15)) &&
-             !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) &&
+             curr->arch.hvm_vmx.vmx_realmode && 
              hvm_local_events_need_delivery(curr) )
             break;
+
         realmode_emulate_one(&hvmemul_ctxt);
-    }
-
-    if ( !curr->arch.hvm_vmx.vmxemul )
+
+        /* Stop emulating unless our segment state is unsafe */
+        if ( curr->arch.hvm_vmx.vmx_realmode )
+            curr->arch.hvm_vmx.vmx_emulate = 
+                (curr->arch.hvm_vmx.vm86_segment_mask != 0);
+        else
+            curr->arch.hvm_vmx.vmx_emulate = 
+                 ((hvmemul_ctxt.seg_reg[x86_seg_cs].sel & 3)
+                  || (hvmemul_ctxt.seg_reg[x86_seg_ss].sel & 3));
+    }
+
+    /* Need to emulate next time if we've started an IO operation */
+    if ( curr->arch.hvm_vcpu.io_state != HVMIO_none )
+        curr->arch.hvm_vmx.vmx_emulate = 1;
+
+    if ( !curr->arch.hvm_vmx.vmx_emulate && !curr->arch.hvm_vmx.vmx_realmode )
     {
         /*
          * Cannot enter protected mode with bogus selector RPLs and DPLs.
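
Taken together, the loop above keeps emulating until the following predicate
holds (an illustrative consolidation of the two branches, not a function in
the patch):

    /* Safe to hand back to hardware?  In real mode, only once every
     * segment is usable under virtual 8086 rules; in protected mode,
     * once CS and SS have ring-0 RPLs again. */
    static int can_stop_emulating(const struct vcpu *curr,
                                  const struct hvm_emulate_ctxt *ctxt)
    {
        if ( curr->arch.hvm_vmx.vmx_realmode )
            return curr->arch.hvm_vmx.vm86_segment_mask == 0;
        return !(ctxt->seg_reg[x86_seg_cs].sel & 3) &&
               !(ctxt->seg_reg[x86_seg_ss].sel & 3);
    }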
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c       Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/vmcs.c       Tue Dec 09 16:28:02 2008 +0000
@@ -880,21 +880,34 @@ void vmx_do_resume(struct vcpu *v)
     reset_stack_and_jump(vmx_asm_do_vmentry);
 }
 
-static void vmx_dump_sel(char *name, enum x86_segment seg)
-{
-    struct segment_register sreg;
-    hvm_get_segment_register(current, seg, &sreg);
-    printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016llx\n", 
-           name, sreg.sel, sreg.attr.bytes, sreg.limit,
-           (unsigned long long)sreg.base);
-}
-
 static unsigned long vmr(unsigned long field)
 {
     int rc;
     unsigned long val;
     val = __vmread_safe(field, &rc);
     return rc ? 0 : val;
+}
+
+static void vmx_dump_sel(char *name, uint32_t selector)
+{
+    uint32_t sel, attr, limit;
+    uint64_t base;
+    sel = vmr(selector);
+    attr = vmr(selector + (GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR));
+    limit = vmr(selector + (GUEST_ES_LIMIT - GUEST_ES_SELECTOR));
+    base = vmr(selector + (GUEST_ES_BASE - GUEST_ES_SELECTOR));
+    printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016"PRIx64"\n",
+           name, sel, attr, limit, base);
+}
+
+static void vmx_dump_sel2(char *name, uint32_t lim)
+{
+    uint32_t limit;
+    uint64_t base;
+    limit = vmr(lim);
+    base = vmr(lim + (GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+    printk("%s:                           limit=0x%08x, base=0x%016"PRIx64"\n",
+           name, limit, base);
 }
 
 void vmcs_dump_vcpu(struct vcpu *v)
@@ -938,16 +951,16 @@ void vmcs_dump_vcpu(struct vcpu *v)
            (unsigned long long)vmr(GUEST_SYSENTER_ESP),
            (int)vmr(GUEST_SYSENTER_CS),
            (unsigned long long)vmr(GUEST_SYSENTER_EIP));
-    vmx_dump_sel("CS", x86_seg_cs);
-    vmx_dump_sel("DS", x86_seg_ds);
-    vmx_dump_sel("SS", x86_seg_ss);
-    vmx_dump_sel("ES", x86_seg_es);
-    vmx_dump_sel("FS", x86_seg_fs);
-    vmx_dump_sel("GS", x86_seg_gs);
-    vmx_dump_sel("GDTR", x86_seg_gdtr);
-    vmx_dump_sel("LDTR", x86_seg_ldtr);
-    vmx_dump_sel("IDTR", x86_seg_idtr);
-    vmx_dump_sel("TR", x86_seg_tr);
+    vmx_dump_sel("CS", GUEST_CS_SELECTOR);
+    vmx_dump_sel("DS", GUEST_DS_SELECTOR);
+    vmx_dump_sel("SS", GUEST_SS_SELECTOR);
+    vmx_dump_sel("ES", GUEST_ES_SELECTOR);
+    vmx_dump_sel("FS", GUEST_FS_SELECTOR);
+    vmx_dump_sel("GS", GUEST_GS_SELECTOR);
+    vmx_dump_sel2("GDTR", GUEST_GDTR_LIMIT);
+    vmx_dump_sel("LDTR", GUEST_LDTR_SELECTOR);
+    vmx_dump_sel2("IDTR", GUEST_IDTR_LIMIT);
+    vmx_dump_sel("TR", GUEST_TR_SELECTOR);
     x  = (unsigned long long)vmr(TSC_OFFSET_HIGH) << 32;
     x |= (uint32_t)vmr(TSC_OFFSET);
     printk("TSC Offset = %016llx\n", x);
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Tue Dec 09 16:28:02 2008 +0000
@@ -704,6 +704,26 @@ static void vmx_ctxt_switch_to(struct vc
     vpmu_load(v);
 }
 
+
+/* SDM volume 3b section 22.3.1.2: we can only enter virtual 8086 mode
+ * if all of CS, SS, DS, ES, FS and GS are 16bit ring-3 data segments.
+ * The guest thinks it's got ring-0 segments, so we need to fudge
+ * things.  We store the ring-3 version in the VMCS to avoid lots of
+ * shuffling on vmenter and vmexit, and translate in these accessors. */
+
+#define rm_cs_attr (((union segment_attributes) {                       \
+        .fields = { .type = 0xb, .s = 1, .dpl = 0, .p = 1, .avl = 0,    \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define rm_ds_attr (((union segment_attributes) {                       \
+        .fields = { .type = 0x3, .s = 1, .dpl = 0, .p = 1, .avl = 0,    \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define vm86_ds_attr (((union segment_attributes) {                     \
+        .fields = { .type = 0x3, .s = 1, .dpl = 3, .p = 1, .avl = 0,    \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define vm86_tr_attr (((union segment_attributes) {                     \
+        .fields = { .type = 0xb, .s = 0, .dpl = 0, .p = 1, .avl = 0,    \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+
 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
                                      struct segment_register *reg)
 {
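
For reference, with the bit layout from x86_emulate.h (type in bits 0-3, s
at bit 4, dpl in bits 5-6, p at bit 7), the macros above evaluate to
familiar descriptor attribute bytes (values computed by hand; the enum is
illustrative and not part of the patch):

    enum {
        RM_CS_ATTR   = 0x9b, /* 0xb | 1<<4 | 0<<5 | 1<<7: ring-0 16-bit code */
        RM_DS_ATTR   = 0x93, /* 0x3 | 1<<4 | 0<<5 | 1<<7: ring-0 16-bit data */
        VM86_DS_ATTR = 0xf3, /* 0x3 | 1<<4 | 3<<5 | 1<<7: the ring-3 data
                                segments virtual 8086 mode insists on */
        VM86_TR_ATTR = 0x8b, /* 0xb | 0<<4 | 0<<5 | 1<<7: busy 32-bit TSS */
    };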
@@ -779,14 +799,85 @@ static void vmx_get_segment_register(str
     /* Unusable flag is folded into Present flag. */
     if ( attr & (1u<<16) )
         reg->attr.fields.p = 0;
+
+    /* Adjust for virtual 8086 mode */
+    if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr 
+         && !(v->arch.hvm_vmx.vm86_segment_mask & (1u << seg)) )
+    {
+        struct segment_register *sreg = &v->arch.hvm_vmx.vm86_saved_seg[seg];
+        if ( seg == x86_seg_tr ) 
+            *reg = *sreg;
+        else if ( reg->base != sreg->base || seg == x86_seg_ss )
+        {
+            /* If the guest has reloaded the segment, remember the new version.
+             * We can't tell if the guest reloaded the segment with another 
+             * one that has the same base.  By default we assume it hasn't,
+             * since we don't want to lose big-real-mode segment attributes,
+             * but for SS we assume it has: the Ubuntu graphical bootloader
+             * does this and gets badly confused if we leave the old SS in 
+             * place. */
+            reg->attr.bytes = (seg == x86_seg_cs ? rm_cs_attr : rm_ds_attr);
+            *sreg = *reg;
+        }
+        else 
+        {
+            /* Always give realmode guests a selector that matches the base
+             * but keep the attr and limit from before */
+            *reg = *sreg;
+            reg->sel = reg->base >> 4;
+        }
+    }
 }
 
 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
                                      struct segment_register *reg)
 {
-    uint32_t attr;
-
+    uint32_t attr, sel, limit;
+    uint64_t base;
+
+    sel = reg->sel;
     attr = reg->attr.bytes;
+    limit = reg->limit;
+    base = reg->base;
+
+    /* Adjust CS/SS/DS/ES/FS/GS/TR for virtual 8086 mode */
+    if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr )
+    {
+        /* Remember the proper contents */
+        v->arch.hvm_vmx.vm86_saved_seg[seg] = *reg;
+        
+        if ( seg == x86_seg_tr ) 
+        {
+            if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] )
+            {
+                sel = 0;
+                attr = vm86_tr_attr;
+                limit = 0xff;
+                base = v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS];
+                v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
+            }
+            else
+                v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
+        }
+        else
+        {
+            /* Try to fake it out as a 16bit data segment.  This could
+             * cause confusion for the guest if it reads the selector,
+             * but otherwise we have to emulate if *any* segment hasn't
+             * been reloaded. */
+            if ( base < 0x100000 && !(base & 0xf) && limit >= 0xffff
+                 && reg->attr.fields.p )
+            {
+                sel = base >> 4;
+                attr = vm86_ds_attr;
+                limit = 0xffff;
+                v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
+            }
+            else 
+                v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
+        }
+    }
+
     attr = ((attr & 0xf00) << 4) | (attr & 0xff);
 
     /* Not-present must mean unusable. */
@@ -794,67 +885,67 @@ static void vmx_set_segment_register(str
         attr |= (1u << 16);
 
     /* VMX has strict consistency requirement for flag G. */
-    attr |= !!(reg->limit >> 20) << 15;
+    attr |= !!(limit >> 20) << 15;
 
     vmx_vmcs_enter(v);
 
     switch ( seg )
     {
     case x86_seg_cs:
-        __vmwrite(GUEST_CS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_CS_LIMIT, reg->limit);
-        __vmwrite(GUEST_CS_BASE, reg->base);
+        __vmwrite(GUEST_CS_SELECTOR, sel);
+        __vmwrite(GUEST_CS_LIMIT, limit);
+        __vmwrite(GUEST_CS_BASE, base);
         __vmwrite(GUEST_CS_AR_BYTES, attr);
         break;
     case x86_seg_ds:
-        __vmwrite(GUEST_DS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_DS_LIMIT, reg->limit);
-        __vmwrite(GUEST_DS_BASE, reg->base);
+        __vmwrite(GUEST_DS_SELECTOR, sel);
+        __vmwrite(GUEST_DS_LIMIT, limit);
+        __vmwrite(GUEST_DS_BASE, base);
         __vmwrite(GUEST_DS_AR_BYTES, attr);
         break;
     case x86_seg_es:
-        __vmwrite(GUEST_ES_SELECTOR, reg->sel);
-        __vmwrite(GUEST_ES_LIMIT, reg->limit);
-        __vmwrite(GUEST_ES_BASE, reg->base);
+        __vmwrite(GUEST_ES_SELECTOR, sel);
+        __vmwrite(GUEST_ES_LIMIT, limit);
+        __vmwrite(GUEST_ES_BASE, base);
         __vmwrite(GUEST_ES_AR_BYTES, attr);
         break;
     case x86_seg_fs:
-        __vmwrite(GUEST_FS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_FS_LIMIT, reg->limit);
-        __vmwrite(GUEST_FS_BASE, reg->base);
+        __vmwrite(GUEST_FS_SELECTOR, sel);
+        __vmwrite(GUEST_FS_LIMIT, limit);
+        __vmwrite(GUEST_FS_BASE, base);
         __vmwrite(GUEST_FS_AR_BYTES, attr);
         break;
     case x86_seg_gs:
-        __vmwrite(GUEST_GS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_GS_LIMIT, reg->limit);
-        __vmwrite(GUEST_GS_BASE, reg->base);
+        __vmwrite(GUEST_GS_SELECTOR, sel);
+        __vmwrite(GUEST_GS_LIMIT, limit);
+        __vmwrite(GUEST_GS_BASE, base);
         __vmwrite(GUEST_GS_AR_BYTES, attr);
         break;
     case x86_seg_ss:
-        __vmwrite(GUEST_SS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_SS_LIMIT, reg->limit);
-        __vmwrite(GUEST_SS_BASE, reg->base);
+        __vmwrite(GUEST_SS_SELECTOR, sel);
+        __vmwrite(GUEST_SS_LIMIT, limit);
+        __vmwrite(GUEST_SS_BASE, base);
         __vmwrite(GUEST_SS_AR_BYTES, attr);
         break;
     case x86_seg_tr:
-        __vmwrite(GUEST_TR_SELECTOR, reg->sel);
-        __vmwrite(GUEST_TR_LIMIT, reg->limit);
-        __vmwrite(GUEST_TR_BASE, reg->base);
+        __vmwrite(GUEST_TR_SELECTOR, sel);
+        __vmwrite(GUEST_TR_LIMIT, limit);
+        __vmwrite(GUEST_TR_BASE, base);
         /* VMX checks that the busy flag (bit 1) is set. */
         __vmwrite(GUEST_TR_AR_BYTES, attr | 2);
         break;
     case x86_seg_gdtr:
-        __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
-        __vmwrite(GUEST_GDTR_BASE, reg->base);
+        __vmwrite(GUEST_GDTR_LIMIT, limit);
+        __vmwrite(GUEST_GDTR_BASE, base);
         break;
     case x86_seg_idtr:
-        __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
-        __vmwrite(GUEST_IDTR_BASE, reg->base);
+        __vmwrite(GUEST_IDTR_LIMIT, limit);
+        __vmwrite(GUEST_IDTR_BASE, base);
         break;
     case x86_seg_ldtr:
-        __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
-        __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
-        __vmwrite(GUEST_LDTR_BASE, reg->base);
+        __vmwrite(GUEST_LDTR_SELECTOR, sel);
+        __vmwrite(GUEST_LDTR_LIMIT, limit);
+        __vmwrite(GUEST_LDTR_BASE, base);
         __vmwrite(GUEST_LDTR_AR_BYTES, attr);
         break;
     default:
@@ -970,6 +1061,7 @@ static void vmx_update_guest_cr(struct v
     switch ( cr )
     {
     case 0: {
+        int realmode;
         unsigned long hw_cr0_mask =
             X86_CR0_NE | X86_CR0_PG | X86_CR0_PE;
 
@@ -998,9 +1090,44 @@ static void vmx_update_guest_cr(struct v
                 vmx_fpu_enter(v);
         }
 
-        v->arch.hvm_vmx.vmxemul &= ~VMXEMUL_REALMODE;
-        if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
-            v->arch.hvm_vmx.vmxemul |= VMXEMUL_REALMODE;
+        realmode = !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE); 
+        if ( realmode != v->arch.hvm_vmx.vmx_realmode )
+        {
+            enum x86_segment s; 
+            struct segment_register reg[x86_seg_tr + 1];
+
+            /* Entering or leaving real mode: adjust the segment registers.
+             * Need to read them all either way, as realmode reads can update
+             * the saved values we'll use when returning to prot mode. */
+            for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
+                vmx_get_segment_register(v, s, &reg[s]);
+            v->arch.hvm_vmx.vmx_realmode = realmode;
+            
+            if ( realmode )
+            {
+                for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
+                    vmx_set_segment_register(v, s, &reg[s]);
+                v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
+                __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
+                __vmwrite(EXCEPTION_BITMAP, 0xffffffff);
+            }
+            else 
+            {
+                for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ ) 
+                    if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<<s)) )
+                        vmx_set_segment_register(
+                            v, s, &v->arch.hvm_vmx.vm86_saved_seg[s]);
+                v->arch.hvm_vcpu.hw_cr[4] =
+                    ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME)
+                     |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME));
+                __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
+                __vmwrite(EXCEPTION_BITMAP, 
+                          HVM_TRAP_MASK
+                          | (paging_mode_hap(v->domain) ?
+                             0 : (1U << TRAP_page_fault))
+                          | (1U << TRAP_no_device));
+            }
+        }
 
         v->arch.hvm_vcpu.hw_cr[0] =
             v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
@@ -1028,6 +1155,8 @@ static void vmx_update_guest_cr(struct v
         if ( paging_mode_hap(v->domain) )
             v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
         v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
+        if ( v->arch.hvm_vmx.vmx_realmode ) 
+            v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
         if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
         {
             v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
@@ -1097,6 +1226,7 @@ static void __vmx_inject_exception(int t
 static void __vmx_inject_exception(int trap, int type, int error_code)
 {
     unsigned long intr_fields;
+    struct vcpu *curr = current;
 
     /*
      * NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
@@ -1113,6 +1243,11 @@ static void __vmx_inject_exception(int t
     }
 
     __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
+
+    /* Can't inject exceptions in virtual 8086 mode because they would 
+     * use the protected-mode IDT.  Emulate at the next vmenter instead. */
+    if ( curr->arch.hvm_vmx.vmx_realmode ) 
+        curr->arch.hvm_vmx.vmx_emulate = 1;
 }
 
 void vmx_inject_hw_exception(int trap, int error_code)
@@ -2072,6 +2207,17 @@ static void vmx_failed_vmentry(unsigned 
     domain_crash(curr->domain);
 }
 
+asmlinkage void vmx_enter_realmode(struct cpu_user_regs *regs)
+{
+    struct vcpu *v = current;
+
+    /* Adjust RFLAGS to enter virtual 8086 mode with IOPL == 3.  Since
+     * we have CR4.VME == 1 and our own TSS with an empty interrupt
+     * redirection bitmap, all software INTs will be handled by vm86 */
+    v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags;
+    regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL);
+}
+
 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
 {
     unsigned int exit_reason, idtv_info;
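
The RFLAGS round trip between vmx_enter_realmode() above and the realmode
block added to the vmexit handler below, restated as a sketch (the constants
are the architectural EFLAGS bit positions; the helpers are illustrative):

    #include <stdint.h>

    #define EFLAGS_IOPL 0x00003000u  /* bits 12-13: I/O privilege level */
    #define EFLAGS_VM   0x00020000u  /* bit 17: virtual-8086 mode */

    /* On vmenter, run the guest with VM == 1 and IOPL == 3 so software
     * INTs and I/O proceed without faulting. */
    static uint32_t eflags_for_vm86_entry(uint32_t guest_eflags)
    {
        return guest_eflags | EFLAGS_VM | EFLAGS_IOPL;
    }

    /* On vmexit, hide VM again and restore the guest's saved IOPL. */
    static uint32_t eflags_after_vm86_exit(uint32_t hw, uint32_t saved)
    {
        return (hw & ~(EFLAGS_VM | EFLAGS_IOPL)) | (saved & EFLAGS_IOPL);
    }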
@@ -2099,6 +2245,42 @@ asmlinkage void vmx_vmexit_handler(struc
 
     if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
         return vmx_failed_vmentry(exit_reason, regs);
+
+    if ( v->arch.hvm_vmx.vmx_realmode )
+    {
+        unsigned int vector;
+
+        /* Put RFLAGS back the way the guest wants it */
+        regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
+        regs->eflags |= (v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL);
+
+        /* Unless this exit was for an interrupt, we've hit something
+         * vm86 can't handle.  Try again, using the emulator. */
+        switch ( exit_reason )
+        {
+        case EXIT_REASON_EXCEPTION_NMI:
+            vector = __vmread(VM_EXIT_INTR_INFO) & INTR_INFO_VECTOR_MASK;
+            if ( vector != TRAP_page_fault
+                 && vector != TRAP_nmi 
+                 && vector != TRAP_machine_check ) 
+            {
+                perfc_incr(realmode_exits);
+                v->arch.hvm_vmx.vmx_emulate = 1;
+                return;
+            }
+        case EXIT_REASON_EXTERNAL_INTERRUPT:
+        case EXIT_REASON_INIT:
+        case EXIT_REASON_SIPI:
+        case EXIT_REASON_PENDING_VIRT_INTR:
+        case EXIT_REASON_PENDING_VIRT_NMI:
+        case EXIT_REASON_MACHINE_CHECK:
+            break;
+        default:
+            v->arch.hvm_vmx.vmx_emulate = 1;
+            perfc_incr(realmode_exits);
+            return;
+        }
+    }
 
     hvm_maybe_deassert_evtchn_irq();
 
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/x86_32/asm-offsets.c Tue Dec 09 16:28:02 2008 +0000
@@ -88,7 +88,9 @@ void __dummy__(void)
     BLANK();
 
     OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched);
-    OFFSET(VCPU_vmx_emul, struct vcpu, arch.hvm_vmx.vmxemul);
+    OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode);
+    OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate);
+    OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask);
     OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]);
     BLANK();
 
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/x86_64/asm-offsets.c
--- a/xen/arch/x86/x86_64/asm-offsets.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/x86_64/asm-offsets.c Tue Dec 09 16:28:02 2008 +0000
@@ -107,7 +107,9 @@ void __dummy__(void)
     BLANK();
 
     OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched);
-    OFFSET(VCPU_vmx_emul, struct vcpu, arch.hvm_vmx.vmxemul);
+    OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode);
+    OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate);
+    OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask);
     OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]);
     BLANK();
 
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/x86_emulate/x86_emulate.h
--- a/xen/arch/x86/x86_emulate/x86_emulate.h    Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h    Tue Dec 09 16:28:02 2008 +0000
@@ -67,6 +67,7 @@ typedef union segment_attributes {
         uint16_t l:   1;    /* 9;  Bit 53 */
         uint16_t db:  1;    /* 10; Bit 54 */
         uint16_t g:   1;    /* 11; Bit 55 */
+        uint16_t pad: 4;
     } fields;
 } __attribute__ ((packed)) segment_attributes_t;
 
diff -r 5535efd8e011 -r 6595393a3d28 xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h        Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h        Tue Dec 09 16:28:02 2008 +0000
@@ -109,11 +109,16 @@ struct arch_vmx_struct {
 
     unsigned long        host_cr0;
 
+    /* Is the guest in real mode? */
+    uint8_t              vmx_realmode;
     /* Are we emulating rather than VMENTERing? */
-#define VMXEMUL_REALMODE 1  /* Yes, because CR0.PE == 0   */
-#define VMXEMUL_BAD_CS   2  /* Yes, because CS.RPL != CPL */
-#define VMXEMUL_BAD_SS   4  /* Yes, because SS.RPL != CPL */
-    uint8_t              vmxemul;
+    uint8_t              vmx_emulate;
+    /* Bitmask of segments that we can't safely use in virtual 8086 mode */
+    uint16_t             vm86_segment_mask;
+    /* Shadow CS, SS, DS, ES, FS, GS, TR while in virtual 8086 mode */
+    struct segment_register vm86_saved_seg[x86_seg_tr + 1];
+    /* Remember EFLAGS while in virtual 8086 mode */
+    uint32_t             vm86_saved_eflags;
 };
 
 int vmx_create_vmcs(struct vcpu *v);
diff -r 5535efd8e011 -r 6595393a3d28 xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/include/asm-x86/perfc_defn.h  Tue Dec 09 16:28:02 2008 +0000
@@ -127,4 +127,7 @@ PERFCOUNTER(mshv_wrmsr_tpr,             
 PERFCOUNTER(mshv_wrmsr_tpr,             "MS Hv wrmsr tpr")
 PERFCOUNTER(mshv_wrmsr_eoi,             "MS Hv wrmsr eoi")
 
+PERFCOUNTER(realmode_emulations, "realmode instructions emulated")
+PERFCOUNTER(realmode_exits,      "vmexits from realmode")
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
diff -r 5535efd8e011 -r 6595393a3d28 xen/include/public/hvm/params.h
--- a/xen/include/public/hvm/params.h   Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/include/public/hvm/params.h   Tue Dec 09 16:28:02 2008 +0000
@@ -100,6 +100,9 @@
 /* ACPI S state: currently support S0 and S3 on x86. */
 #define HVM_PARAM_ACPI_S_STATE 14
 
-#define HVM_NR_PARAMS          15
+/* TSS used on Intel when CR0.PE=0. */
+#define HVM_PARAM_VM86_TSS     15
+
+#define HVM_NR_PARAMS          16
 
 #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
