diff -r 5535efd8e011 tools/firmware/hvmloader/hvmloader.c
--- a/tools/firmware/hvmloader/hvmloader.c Tue Dec 09 13:23:15 2008 +0000
+++ b/tools/firmware/hvmloader/hvmloader.c Tue Dec 09 16:02:45 2008 +0000
@@ -536,6 +536,23 @@ static uint16_t init_xen_platform_io_bas
     return bios_info->xen_pfiob;
 }
 
+/* Set up an empty TSS area for virtual 8086 mode to use.
+ * The only important thing is that it mustn't have any bits set
+ * in the interrupt redirection bitmap, so all zeros will do. */
+static void init_vm86_tss(void)
+{
+    uint32_t tss;
+    struct xen_hvm_param p;
+
+    tss = e820_malloc(128, 128);
+    memset((char *)tss, 0, 128);
+    p.domid = DOMID_SELF;
+    p.index = HVM_PARAM_VM86_TSS;
+    p.value = tss;
+    hypercall_hvm_op(HVMOP_set_param, &p);
+    printf("vm86 TSS at %08x\n", tss);
+}
+
 int main(void)
 {
     int option_rom_sz = 0, vgabios_sz = 0, etherboot_sz = 0;
@@ -605,6 +622,8 @@ int main(void)
         printf("Loading ACPI ...\n");
         acpi_build_tables();
     }
+
+    init_vm86_tss();
 
     cmos_write_memory_size();
 
diff -r 5535efd8e011 tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c Tue Dec 09 13:23:15 2008 +0000
+++ b/tools/libxc/xc_domain_restore.c Tue Dec 09 16:02:45 2008 +0000
@@ -490,6 +490,22 @@ int xc_domain_restore(int xc_handle, int
             continue;
         }
 
+        if ( j == -4 )
+        {
+            uint64_t vm86_tss;
+
+            /* Skip padding 4 bytes then read the vm86 TSS location. */
+            if ( read_exact(io_fd, &vm86_tss, sizeof(uint32_t)) ||
+                 read_exact(io_fd, &vm86_tss, sizeof(uint64_t)) )
+            {
+                ERROR("error reading the address of the vm86 TSS");
+                goto out;
+            }
+
+            xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
+            continue;
+        }
+
         if ( j == 0 )
             break; /* our work here is done */
 
diff -r 5535efd8e011 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c Tue Dec 09 13:23:15 2008 +0000
+++ b/tools/libxc/xc_domain_save.c Tue Dec 09 16:02:45 2008 +0000
@@ -1388,18 +1388,30 @@ int xc_domain_save(int xc_handle, int io
     if ( hvm )
     {
         struct {
-            int minusthree;
+            int id;
             uint32_t pad;
-            uint64_t ident_pt;
-        } chunk = { -3, 0 };
-
+            uint64_t data;
+        } chunk = { 0, };
+
+        chunk.id = -3;
         xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
-                         (unsigned long *)&chunk.ident_pt);
-
-        if ( (chunk.ident_pt != 0) &&
+                         (unsigned long *)&chunk.data);
+
+        if ( (chunk.data != 0) &&
              write_exact(io_fd, &chunk, sizeof(chunk)) )
         {
             PERROR("Error when writing the ident_pt for EPT guest");
+            goto out;
+        }
+
+        chunk.id = -4;
+        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
+                         (unsigned long *)&chunk.data);
+
+        if ( (chunk.data != 0) &&
+             write_exact(io_fd, &chunk, sizeof(chunk)) )
+        {
+            PERROR("Error when writing the vm86 TSS for guest");
             goto out;
         }
     }
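For reference, both marker records in the migration stream above share one layout; a minimal sketch (the struct name is hypothetical, the fields mirror the anonymous struct in xc_domain_save.c above):

    /* Sketch of the negative-id marker chunks written by xc_domain_save().
     * By the time xc_domain_restore() dispatches on j == -3 / j == -4 it has
     * already consumed the 4-byte id, which is why it reads and discards
     * sizeof(uint32_t) of padding before the 8-byte payload. */
    struct hvm_marker_chunk {         /* hypothetical name, for illustration */
        int32_t  id;    /* -3: HVM_PARAM_IDENT_PT, -4: HVM_PARAM_VM86_TSS */
        uint32_t pad;   /* keeps the 64-bit payload 8-byte aligned */
        uint64_t data;  /* value of the corresponding HVM param */
    };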
diff -r 5535efd8e011 xen/arch/x86/hvm/vmx/entry.S
--- a/xen/arch/x86/hvm/vmx/entry.S Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/entry.S Tue Dec 09 16:02:45 2008 +0000
@@ -133,9 +133,15 @@ vmx_asm_do_vmentry:
         cmpl  $0,(r(dx),r(ax),1)
         jnz   .Lvmx_process_softirqs
 
-        testb $0xff,VCPU_vmx_emul(r(bx))
-        jnz   .Lvmx_goto_realmode
-
+        testb $0xff,VCPU_vmx_emulate(r(bx))
+        jnz   .Lvmx_goto_emulator
+        testb $0xff,VCPU_vmx_realmode(r(bx))
+        jz    .Lvmx_not_realmode
+        cmpw  $0,VCPU_vm86_seg_mask(r(bx))
+        jnz   .Lvmx_goto_emulator
+        call_with_regs(vmx_enter_realmode)
+
+.Lvmx_not_realmode:
         mov   VCPU_hvm_guest_cr2(r(bx)),r(ax)
         mov   r(ax),%cr2
         call  vmx_trace_vmentry
@@ -189,7 +195,7 @@ vmx_asm_do_vmentry:
         call  vm_launch_fail
         ud2
 
-.Lvmx_goto_realmode:
+.Lvmx_goto_emulator:
         sti
         call_with_regs(vmx_realmode)
         jmp   vmx_asm_do_vmentry
diff -r 5535efd8e011 xen/arch/x86/hvm/vmx/realmode.c
--- a/xen/arch/x86/hvm/vmx/realmode.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/realmode.c Tue Dec 09 16:02:45 2008 +0000
@@ -103,30 +103,12 @@ static void realmode_emulate_one(struct
 static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
 {
     struct vcpu *curr = current;
-    unsigned long seg_reg_dirty;
     uint32_t intr_info;
     int rc;
 
-    seg_reg_dirty = hvmemul_ctxt->seg_reg_dirty;
-    hvmemul_ctxt->seg_reg_dirty = 0;
+    perfc_incr(realmode_emulations);
 
     rc = hvm_emulate_one(hvmemul_ctxt);
-
-    if ( test_bit(x86_seg_cs, &hvmemul_ctxt->seg_reg_dirty) )
-    {
-        curr->arch.hvm_vmx.vmxemul &= ~VMXEMUL_BAD_CS;
-        if ( hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt)->sel & 3 )
-            curr->arch.hvm_vmx.vmxemul |= VMXEMUL_BAD_CS;
-    }
-
-    if ( test_bit(x86_seg_ss, &hvmemul_ctxt->seg_reg_dirty) )
-    {
-        curr->arch.hvm_vmx.vmxemul &= ~VMXEMUL_BAD_SS;
-        if ( hvmemul_get_seg_reg(x86_seg_ss, hvmemul_ctxt)->sel & 3 )
-            curr->arch.hvm_vmx.vmxemul |= VMXEMUL_BAD_SS;
-    }
-
-    hvmemul_ctxt->seg_reg_dirty |= seg_reg_dirty;
 
     if ( rc == X86EMUL_UNHANDLEABLE )
     {
@@ -210,7 +192,8 @@ void vmx_realmode(struct cpu_user_regs *
         intr_info = 0;
     }
 
-    while ( curr->arch.hvm_vmx.vmxemul &&
+    curr->arch.hvm_vmx.vmx_emulate = 1;
+    while ( curr->arch.hvm_vmx.vmx_emulate &&
             !softirq_pending(smp_processor_id()) &&
             (curr->arch.hvm_vcpu.io_state == HVMIO_none) )
     {
@@ -220,13 +203,27 @@ void vmx_realmode(struct cpu_user_regs *
          * in real mode, because we don't emulate protected-mode IDT vectoring.
          */
         if ( unlikely(!(++emulations & 15)) &&
-             !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) &&
+             curr->arch.hvm_vmx.vmx_realmode &&
              hvm_local_events_need_delivery(curr) )
             break;
+
         realmode_emulate_one(&hvmemul_ctxt);
-    }
-
-    if ( !curr->arch.hvm_vmx.vmxemul )
+
+        /* Stop emulating unless our segment state is not safe */
+        if ( curr->arch.hvm_vmx.vmx_realmode )
+            curr->arch.hvm_vmx.vmx_emulate =
+                (curr->arch.hvm_vmx.vm86_segment_mask != 0);
+        else
+            curr->arch.hvm_vmx.vmx_emulate =
+                ((hvmemul_ctxt.seg_reg[x86_seg_cs].sel & 3)
+                 || (hvmemul_ctxt.seg_reg[x86_seg_ss].sel & 3));
+    }
+
+    /* Need to emulate next time if we've started an IO operation */
+    if ( curr->arch.hvm_vcpu.io_state != HVMIO_none )
+        curr->arch.hvm_vmx.vmx_emulate = 1;
+
+    if ( !curr->arch.hvm_vmx.vmx_emulate && !curr->arch.hvm_vmx.vmx_realmode )
     {
         /*
          * Cannot enter protected mode with bogus selector RPLs and DPLs.
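The loop above now runs the emulator until the guest's segment state is safe to execute natively again; a minimal sketch of the equivalent predicate, using only the fields introduced by this patch (the helper name is hypothetical):

    static int must_keep_emulating(const struct vcpu *curr,
                                   const struct hvm_emulate_ctxt *ctxt)
    {
        /* An in-flight emulated IO always forces another round. */
        if ( curr->arch.hvm_vcpu.io_state != HVMIO_none )
            return 1;
        /* In real mode: emulate while any segment can't be faked in vm86. */
        if ( curr->arch.hvm_vmx.vmx_realmode )
            return curr->arch.hvm_vmx.vm86_segment_mask != 0;
        /* In protected mode: emulate while CS or SS still has RPL != 0. */
        return (ctxt->seg_reg[x86_seg_cs].sel & 3) ||
               (ctxt->seg_reg[x86_seg_ss].sel & 3);
    }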
diff -r 5535efd8e011 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/vmcs.c Tue Dec 09 16:02:45 2008 +0000
@@ -880,21 +880,34 @@ void vmx_do_resume(struct vcpu *v)
     reset_stack_and_jump(vmx_asm_do_vmentry);
 }
 
-static void vmx_dump_sel(char *name, enum x86_segment seg)
-{
-    struct segment_register sreg;
-    hvm_get_segment_register(current, seg, &sreg);
-    printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016llx\n",
-           name, sreg.sel, sreg.attr.bytes, sreg.limit,
-           (unsigned long long)sreg.base);
-}
-
 static unsigned long vmr(unsigned long field)
 {
     int rc;
     unsigned long val;
     val = __vmread_safe(field, &rc);
     return rc ? 0 : val;
+}
+
+static void vmx_dump_sel(char *name, uint32_t selector)
+{
+    uint32_t sel, attr, limit;
+    uint64_t base;
+    sel = vmr(selector);
+    attr = vmr(selector + (GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR));
+    limit = vmr(selector + (GUEST_ES_LIMIT - GUEST_ES_SELECTOR));
+    base = vmr(selector + (GUEST_ES_BASE - GUEST_ES_SELECTOR));
+    printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016"PRIx64"\n",
+           name, sel, attr, limit, base);
+}
+
+static void vmx_dump_sel2(char *name, uint32_t lim)
+{
+    uint32_t limit;
+    uint64_t base;
+    limit = vmr(lim);
+    base = vmr(lim + (GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+    printk("%s: limit=0x%08x, base=0x%016"PRIx64"\n",
+           name, limit, base);
 }
 
 void vmcs_dump_vcpu(struct vcpu *v)
@@ -938,16 +951,16 @@ void vmcs_dump_vcpu(struct vcpu *v)
            (unsigned long long)vmr(GUEST_SYSENTER_ESP),
            (int)vmr(GUEST_SYSENTER_CS),
            (unsigned long long)vmr(GUEST_SYSENTER_EIP));
-    vmx_dump_sel("CS", x86_seg_cs);
-    vmx_dump_sel("DS", x86_seg_ds);
-    vmx_dump_sel("SS", x86_seg_ss);
-    vmx_dump_sel("ES", x86_seg_es);
-    vmx_dump_sel("FS", x86_seg_fs);
-    vmx_dump_sel("GS", x86_seg_gs);
-    vmx_dump_sel("GDTR", x86_seg_gdtr);
-    vmx_dump_sel("LDTR", x86_seg_ldtr);
-    vmx_dump_sel("IDTR", x86_seg_idtr);
-    vmx_dump_sel("TR", x86_seg_tr);
+    vmx_dump_sel("CS", GUEST_CS_SELECTOR);
+    vmx_dump_sel("DS", GUEST_DS_SELECTOR);
+    vmx_dump_sel("SS", GUEST_SS_SELECTOR);
+    vmx_dump_sel("ES", GUEST_ES_SELECTOR);
+    vmx_dump_sel("FS", GUEST_FS_SELECTOR);
+    vmx_dump_sel("GS", GUEST_GS_SELECTOR);
+    vmx_dump_sel2("GDTR", GUEST_GDTR_LIMIT);
+    vmx_dump_sel("LDTR", GUEST_LDTR_SELECTOR);
+    vmx_dump_sel2("IDTR", GUEST_IDTR_LIMIT);
+    vmx_dump_sel("TR", GUEST_TR_SELECTOR);
     x  = (unsigned long long)vmr(TSC_OFFSET_HIGH) << 32;
     x |= (uint32_t)vmr(TSC_OFFSET);
     printk("TSC Offset = %016llx\n", x);
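The rewritten vmx_dump_sel()/vmx_dump_sel2() read the VMCS fields directly and rely on the guest segment fields having a uniform per-segment stride within each encoding group (per the Intel SDM's VMCS field encodings); roughly:

    /* Layout assumption behind the offset arithmetic above: the selector,
     * limit, access-rights and base encodings each form a per-segment array
     * in the same order (ES, CS, SS, DS, FS, GS, LDTR, TR) with a stride of
     * 2, e.g. GUEST_CS_SELECTOR == GUEST_ES_SELECTOR + 2.  Hence, for any
     * segment X:
     *   GUEST_X_LIMIT    == GUEST_X_SELECTOR + (GUEST_ES_LIMIT    - GUEST_ES_SELECTOR)
     *   GUEST_X_AR_BYTES == GUEST_X_SELECTOR + (GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR)
     *   GUEST_X_BASE     == GUEST_X_SELECTOR + (GUEST_ES_BASE     - GUEST_ES_SELECTOR)
     * and likewise GUEST_IDTR_BASE - GUEST_IDTR_LIMIT equals
     * GUEST_GDTR_BASE - GUEST_GDTR_LIMIT for vmx_dump_sel2(). */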
diff -r 5535efd8e011 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/vmx.c Tue Dec 09 16:10:22 2008 +0000
@@ -704,6 +704,26 @@ static void vmx_ctxt_switch_to(struct vc
     vpmu_load(v);
 }
 
+
+/* SDM volume 3b section 22.3.1.2: we can only enter virtual 8086 mode
+ * if all of CS, SS, DS, ES, FS and GS are 16bit ring-3 data segments.
+ * The guest thinks it's got ring-0 segments, so we need to fudge
+ * things.  We store the ring-3 version in the VMCS to avoid lots of
+ * shuffling on vmenter and vmexit, and translate in these accessors. */
+
+#define rm_cs_attr (((union segment_attributes) {                      \
+        .fields = { .type = 0xb, .s = 1, .dpl = 0, .p = 1, .avl = 0,   \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define rm_ds_attr (((union segment_attributes) {                      \
+        .fields = { .type = 0x3, .s = 1, .dpl = 0, .p = 1, .avl = 0,   \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define vm86_ds_attr (((union segment_attributes) {                    \
+        .fields = { .type = 0x3, .s = 1, .dpl = 3, .p = 1, .avl = 0,   \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define vm86_tr_attr (((union segment_attributes) {                    \
+        .fields = { .type = 0xb, .s = 0, .dpl = 0, .p = 1, .avl = 0,   \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+
 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
                                      struct segment_register *reg)
 {
@@ -779,14 +799,85 @@ static void vmx_get_segment_register(str
     /* Unusable flag is folded into Present flag. */
     if ( attr & (1u<<16) )
         reg->attr.fields.p = 0;
+
+    /* Adjust for virtual 8086 mode */
+    if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr
+         && !(v->arch.hvm_vmx.vm86_segment_mask & (1u << seg)) )
+    {
+        struct segment_register *sreg = &v->arch.hvm_vmx.vm86_saved_seg[seg];
+        if ( seg == x86_seg_tr )
+            *reg = *sreg;
+        else if ( reg->base != sreg->base || seg == x86_seg_ss )
+        {
+            /* If the guest's reloaded the segment, remember the new version.
+             * We can't tell if the guest reloaded the segment with another
+             * one that has the same base.  By default we assume it hasn't,
+             * since we don't want to lose big-real-mode segment attributes,
+             * but for SS we assume it has: the Ubuntu graphical bootloader
+             * does this and gets badly confused if we leave the old SS in
+             * place. */
+            reg->attr.bytes = (seg == x86_seg_cs ? rm_cs_attr : rm_ds_attr);
+            *sreg = *reg;
+        }
+        else
+        {
+            /* Always give realmode guests a selector that matches the base
+             * but keep the attr and limit from before */
+            *reg = *sreg;
+            reg->sel = reg->base >> 4;
+        }
+    }
 }
 
 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
                                      struct segment_register *reg)
 {
-    uint32_t attr;
-
+    uint32_t attr, sel, limit;
+    uint64_t base;
+
+    sel = reg->sel;
     attr = reg->attr.bytes;
+    limit = reg->limit;
+    base = reg->base;
+
+    /* Adjust CS/SS/DS/ES/FS/GS/TR for virtual 8086 mode */
+    if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr )
+    {
+        /* Remember the proper contents */
+        v->arch.hvm_vmx.vm86_saved_seg[seg] = *reg;
+
+        if ( seg == x86_seg_tr )
+        {
+            if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] )
+            {
+                sel = 0;
+                attr = vm86_tr_attr;
+                limit = 0xff;
+                base = v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS];
+                v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
+            }
+            else
+                v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
+        }
+        else
+        {
+            /* Try to fake it out as a 16bit data segment.  This could
+             * cause confusion for the guest if it reads the selector,
+             * but otherwise we have to emulate if *any* segment hasn't
+             * been reloaded. */
+            if ( base < 0x100000 && !(base & 0xf) && limit >= 0xffff
+                 && reg->attr.fields.p )
+            {
+                sel = base >> 4;
+                attr = vm86_ds_attr;
+                limit = 0xffff;
+                v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
+            }
+            else
+                v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
+        }
+    }
+
     attr = ((attr & 0xf00) << 4) | (attr & 0xff);
 
     /* Not-present must mean unusable. */
@@ -794,67 +885,67 @@ static void vmx_set_segment_register(str
         attr |= (1u << 16);
 
     /* VMX has strict consistency requirement for flag G. */
-    attr |= !!(reg->limit >> 20) << 15;
+    attr |= !!(limit >> 20) << 15;
 
     vmx_vmcs_enter(v);
 
     switch ( seg )
     {
     case x86_seg_cs:
-        __vmwrite(GUEST_CS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_CS_LIMIT, reg->limit);
-        __vmwrite(GUEST_CS_BASE, reg->base);
+        __vmwrite(GUEST_CS_SELECTOR, sel);
+        __vmwrite(GUEST_CS_LIMIT, limit);
+        __vmwrite(GUEST_CS_BASE, base);
         __vmwrite(GUEST_CS_AR_BYTES, attr);
         break;
     case x86_seg_ds:
-        __vmwrite(GUEST_DS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_DS_LIMIT, reg->limit);
-        __vmwrite(GUEST_DS_BASE, reg->base);
+        __vmwrite(GUEST_DS_SELECTOR, sel);
+        __vmwrite(GUEST_DS_LIMIT, limit);
+        __vmwrite(GUEST_DS_BASE, base);
        __vmwrite(GUEST_DS_AR_BYTES, attr);
         break;
     case x86_seg_es:
-        __vmwrite(GUEST_ES_SELECTOR, reg->sel);
-        __vmwrite(GUEST_ES_LIMIT, reg->limit);
-        __vmwrite(GUEST_ES_BASE, reg->base);
+        __vmwrite(GUEST_ES_SELECTOR, sel);
+        __vmwrite(GUEST_ES_LIMIT, limit);
+        __vmwrite(GUEST_ES_BASE, base);
         __vmwrite(GUEST_ES_AR_BYTES, attr);
         break;
     case x86_seg_fs:
-        __vmwrite(GUEST_FS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_FS_LIMIT, reg->limit);
-        __vmwrite(GUEST_FS_BASE, reg->base);
+        __vmwrite(GUEST_FS_SELECTOR, sel);
+        __vmwrite(GUEST_FS_LIMIT, limit);
+        __vmwrite(GUEST_FS_BASE, base);
         __vmwrite(GUEST_FS_AR_BYTES, attr);
         break;
     case x86_seg_gs:
-        __vmwrite(GUEST_GS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_GS_LIMIT, reg->limit);
-        __vmwrite(GUEST_GS_BASE, reg->base);
+        __vmwrite(GUEST_GS_SELECTOR, sel);
+        __vmwrite(GUEST_GS_LIMIT, limit);
+        __vmwrite(GUEST_GS_BASE, base);
         __vmwrite(GUEST_GS_AR_BYTES, attr);
         break;
     case x86_seg_ss:
-        __vmwrite(GUEST_SS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_SS_LIMIT, reg->limit);
-        __vmwrite(GUEST_SS_BASE, reg->base);
+        __vmwrite(GUEST_SS_SELECTOR, sel);
+        __vmwrite(GUEST_SS_LIMIT, limit);
+        __vmwrite(GUEST_SS_BASE, base);
         __vmwrite(GUEST_SS_AR_BYTES, attr);
         break;
     case x86_seg_tr:
-        __vmwrite(GUEST_TR_SELECTOR, reg->sel);
-        __vmwrite(GUEST_TR_LIMIT, reg->limit);
-        __vmwrite(GUEST_TR_BASE, reg->base);
+        __vmwrite(GUEST_TR_SELECTOR, sel);
+        __vmwrite(GUEST_TR_LIMIT, limit);
+        __vmwrite(GUEST_TR_BASE, base);
         /* VMX checks that the busy flag (bit 1) is set. */
         __vmwrite(GUEST_TR_AR_BYTES, attr | 2);
         break;
     case x86_seg_gdtr:
-        __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
-        __vmwrite(GUEST_GDTR_BASE, reg->base);
+        __vmwrite(GUEST_GDTR_LIMIT, limit);
+        __vmwrite(GUEST_GDTR_BASE, base);
         break;
     case x86_seg_idtr:
-        __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
-        __vmwrite(GUEST_IDTR_BASE, reg->base);
+        __vmwrite(GUEST_IDTR_LIMIT, limit);
+        __vmwrite(GUEST_IDTR_BASE, base);
         break;
     case x86_seg_ldtr:
-        __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
-        __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
-        __vmwrite(GUEST_LDTR_BASE, reg->base);
+        __vmwrite(GUEST_LDTR_SELECTOR, sel);
+        __vmwrite(GUEST_LDTR_LIMIT, limit);
+        __vmwrite(GUEST_LDTR_BASE, base);
         __vmwrite(GUEST_LDTR_AR_BYTES, attr);
         break;
     default:
@@ -970,6 +1061,7 @@ static void vmx_update_guest_cr(struct v
     switch ( cr )
     {
     case 0: {
+        int realmode;
         unsigned long hw_cr0_mask =
             X86_CR0_NE | X86_CR0_PG | X86_CR0_PE;
 
@@ -998,9 +1090,44 @@ static void vmx_update_guest_cr(struct v
                 vmx_fpu_enter(v);
         }
 
-        v->arch.hvm_vmx.vmxemul &= ~VMXEMUL_REALMODE;
-        if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
-            v->arch.hvm_vmx.vmxemul |= VMXEMUL_REALMODE;
+        realmode = !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE);
+        if ( realmode != v->arch.hvm_vmx.vmx_realmode )
+        {
+            enum x86_segment s;
+            struct segment_register reg[x86_seg_tr + 1];
+
+            /* Entering or leaving real mode: adjust the segment registers.
+             * Need to read them all either way, as realmode reads can update
+             * the saved values we'll use when returning to prot mode. */
+            for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
+                vmx_get_segment_register(v, s, &reg[s]);
+            v->arch.hvm_vmx.vmx_realmode = realmode;
+
+            if ( realmode )
+            {
+                for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
+                    vmx_set_segment_register(v, s, &reg[s]);
+                v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
+                __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
+                __vmwrite(EXCEPTION_BITMAP, 0xffffffff);
+            }
+            else
+            {
+                for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
+                    if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<<s)) )
+                        vmx_set_segment_register(
+                            v, s, &v->arch.hvm_vmx.vm86_saved_seg[s]);
+                v->arch.hvm_vcpu.hw_cr[4] =
+                    ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME)
+                     |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME));
+                __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
+                __vmwrite(EXCEPTION_BITMAP,
+                          HVM_TRAP_MASK
+                          | (paging_mode_hap(v->domain) ?
+                             0 : (1U << TRAP_page_fault))
+                          | (1U << TRAP_no_device));
+            }
+        }
 
         v->arch.hvm_vcpu.hw_cr[0] =
             v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
@@ -1028,6 +1155,8 @@ static void vmx_update_guest_cr(struct v
         if ( paging_mode_hap(v->domain) )
             v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
         v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
+        if ( v->arch.hvm_vmx.vmx_realmode )
+            v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
         if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
         {
             v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
@@ -1097,6 +1226,7 @@ static void __vmx_inject_exception(int t
 static void __vmx_inject_exception(int trap, int type, int error_code)
 {
     unsigned long intr_fields;
+    struct vcpu *curr = current;
 
     /*
      * NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
@@ -1113,6 +1243,11 @@ static void __vmx_inject_exception(int t
     }
 
     __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
+
+    /* Can't inject exceptions in virtual 8086 mode because they would
+     * use the protected-mode IDT.  Emulate at the next vmenter instead. */
+    if ( curr->arch.hvm_vmx.vmx_realmode )
+        curr->arch.hvm_vmx.vmx_emulate = 1;
 }
 
 void vmx_inject_hw_exception(int trap, int error_code)
@@ -2072,6 +2207,17 @@ static void vmx_failed_vmentry(unsigned
     domain_crash(curr->domain);
 }
 
+asmlinkage void vmx_enter_realmode(struct cpu_user_regs *regs)
+{
+    struct vcpu *v = current;
+
+    /* Adjust RFLAGS to enter virtual 8086 mode with IOPL == 3.  Since
+     * we have CR4.VME == 1 and our own TSS with an empty interrupt
+     * redirection bitmap, all software INTs will be handled by vm86 */
+    v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags;
+    regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL);
+}
+
 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
 {
     unsigned int exit_reason, idtv_info;
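The RFLAGS fixup in vmx_enter_realmode() above is undone unconditionally at the top of the vmexit handler in the next hunk; a hedged sketch of the round trip (helper names are hypothetical, constants as used above):

    /* On entry to vm86 we force VM=1 and IOPL=3 so that IOPL-sensitive
     * instructions (CLI/STI/PUSHF/POPF/INTn/IRET) don't fault; on every
     * exit we strip those bits and put back the IOPL the guest had set. */
    static uint32_t eflags_for_vm86(uint32_t guest_eflags)
    {
        return guest_eflags | X86_EFLAGS_VM | X86_EFLAGS_IOPL;
    }
    static uint32_t eflags_for_guest(uint32_t hw_eflags, uint32_t saved_eflags)
    {
        hw_eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
        return hw_eflags | (saved_eflags & X86_EFLAGS_IOPL);
    }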
@@ -2099,6 +2245,42 @@ asmlinkage void vmx_vmexit_handler(struc
     if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
         return vmx_failed_vmentry(exit_reason, regs);
 
+    if ( v->arch.hvm_vmx.vmx_realmode )
+    {
+        unsigned int vector;
+
+        /* Put RFLAGS back the way the guest wants it */
+        regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
+        regs->eflags |= (v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL);
+
+        /* Unless this exit was for an interrupt, we've hit something
+         * vm86 can't handle.  Try again, using the emulator. */
+        switch ( exit_reason )
+        {
+        case EXIT_REASON_EXCEPTION_NMI:
+            vector = __vmread(VM_EXIT_INTR_INFO) & INTR_INFO_VECTOR_MASK;
+            if ( vector != TRAP_page_fault
+                 && vector != TRAP_nmi
+                 && vector != TRAP_machine_check )
+            {
+                perfc_incr(realmode_exits);
+                v->arch.hvm_vmx.vmx_emulate = 1;
+                return;
+            }
+        case EXIT_REASON_EXTERNAL_INTERRUPT:
+        case EXIT_REASON_INIT:
+        case EXIT_REASON_SIPI:
+        case EXIT_REASON_PENDING_VIRT_INTR:
+        case EXIT_REASON_PENDING_VIRT_NMI:
+        case EXIT_REASON_MACHINE_CHECK:
+            break;
+        default:
+            v->arch.hvm_vmx.vmx_emulate = 1;
+            perfc_incr(realmode_exits);
+            return;
+        }
+    }
+
     hvm_maybe_deassert_evtchn_irq();
 
diff -r 5535efd8e011 xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/x86_32/asm-offsets.c Tue Dec 09 16:02:45 2008 +0000
@@ -88,7 +88,9 @@ void __dummy__(void)
     BLANK();
 
     OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched);
-    OFFSET(VCPU_vmx_emul, struct vcpu, arch.hvm_vmx.vmxemul);
+    OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode);
+    OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate);
+    OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask);
     OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]);
     BLANK();
 
diff -r 5535efd8e011 xen/arch/x86/x86_64/asm-offsets.c
--- a/xen/arch/x86/x86_64/asm-offsets.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/x86_64/asm-offsets.c Tue Dec 09 16:02:45 2008 +0000
@@ -107,7 +107,9 @@ void __dummy__(void)
     BLANK();
 
     OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched);
-    OFFSET(VCPU_vmx_emul, struct vcpu, arch.hvm_vmx.vmxemul);
+    OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode);
+    OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate);
+    OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask);
     OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]);
     BLANK();
 
diff -r 5535efd8e011 xen/arch/x86/x86_emulate/x86_emulate.h
--- a/xen/arch/x86/x86_emulate/x86_emulate.h Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h Tue Dec 09 16:02:45 2008 +0000
@@ -67,6 +67,7 @@ typedef union segment_attributes {
         uint16_t l:   1;    /* 9;  Bit 53 */
         uint16_t db:  1;    /* 10; Bit 54 */
         uint16_t g:   1;    /* 11; Bit 55 */
+        uint16_t pad: 4;
     } fields;
 } __attribute__ ((packed)) segment_attributes_t;
 
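With the new pad field the bitfield view covers all 16 bits of the attribute word. For reference, the values the rm_*/vm86_* macros in vmx.c above evaluate to, and the conversion vmx_set_segment_register() applies before writing the VMCS access-rights field, work out as in this sketch (the helper name is hypothetical; the expression is the one used in place above):

    /* Low byte is type[3:0], S, DPL[1:0], P, so:
     *   rm_cs_attr   == 0x9b  (type 0xb, S=1, DPL=0, P=1: accessed code, r/x)
     *   rm_ds_attr   == 0x93  (type 0x3, S=1, DPL=0, P=1: accessed data, r/w)
     *   vm86_ds_attr == 0xf3  (as rm_ds_attr but DPL=3, i.e. a ring-3 segment)
     *   vm86_tr_attr == 0x8b  (type 0xb, S=0, P=1: busy 32-bit TSS)
     * The VMCS access-rights format keeps bits 0-7 in place and moves the
     * avl/l/db/g bits from 8-11 up to 12-15: */
    static uint32_t to_vmx_access_rights(uint16_t attr)
    {
        return ((attr & 0xf00) << 4) | (attr & 0xff);
    }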
diff -r 5535efd8e011 xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h Tue Dec 09 16:02:45 2008 +0000
@@ -109,11 +109,16 @@ struct arch_vmx_struct {
 
     unsigned long        host_cr0;
 
+    /* Is the guest in real mode? */
+    uint8_t              vmx_realmode;
     /* Are we emulating rather than VMENTERing? */
-#define VMXEMUL_REALMODE 1  /* Yes, because CR0.PE == 0   */
-#define VMXEMUL_BAD_CS   2  /* Yes, because CS.RPL != CPL */
-#define VMXEMUL_BAD_SS   4  /* Yes, because SS.RPL != CPL */
-    uint8_t              vmxemul;
+    uint8_t              vmx_emulate;
+    /* Bitmask of segments that we can't safely use in virtual 8086 mode */
+    uint16_t             vm86_segment_mask;
+    /* Shadow CS, SS, DS, ES, FS, GS, TR while in virtual 8086 mode */
+    struct segment_register vm86_saved_seg[x86_seg_tr + 1];
+    /* Remember EFLAGS while in virtual 8086 mode */
+    uint32_t             vm86_saved_eflags;
 };
 
 int vmx_create_vmcs(struct vcpu *v);
diff -r 5535efd8e011 xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/include/asm-x86/perfc_defn.h Tue Dec 09 16:02:45 2008 +0000
@@ -127,4 +127,7 @@ PERFCOUNTER(mshv_wrmsr_tpr,
 PERFCOUNTER(mshv_wrmsr_tpr,             "MS Hv wrmsr tpr")
 PERFCOUNTER(mshv_wrmsr_eoi,             "MS Hv wrmsr eoi")
 
+PERFCOUNTER(realmode_emulations, "realmode instructions emulated")
+PERFCOUNTER(realmode_exits,      "vmexits from realmode")
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
diff -r 5535efd8e011 xen/include/public/hvm/params.h
--- a/xen/include/public/hvm/params.h Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/include/public/hvm/params.h Tue Dec 09 16:02:45 2008 +0000
@@ -100,6 +100,9 @@
 /* ACPI S state: currently support S0 and S3 on x86. */
 #define HVM_PARAM_ACPI_S_STATE 14
 
-#define HVM_NR_PARAMS          15
+/* TSS used on Intel when CR0.PE=0. */
+#define HVM_PARAM_VM86_TSS     15
+
+#define HVM_NR_PARAMS          16
 
 #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
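As a usage note, HVM_PARAM_VM86_TSS behaves like any other HVM param, so outside hvmloader it can be reached with the libxc accessors already used in this patch; an illustrative fragment (assuming an open xc_handle and a domid dom):

    /* Illustrative only: query where hvmloader placed the vm86 TSS ... */
    unsigned long vm86_tss = 0;
    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, &vm86_tss);
    /* ... and propagate it to a new domain, as the restore path above does. */
    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);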