diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 2d92957..f6244af 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1029,6 +1030,86 @@ static int hvmemul_wbinvd_discard(
     return X86EMUL_OKAY;
 }
 
+static int hvmemul_vaddr_to_mfn(
+    unsigned long addr,
+    mfn_t *mfn,
+    uint32_t pfec,
+    struct x86_emulate_ctxt *ctxt)
+{
+    paddr_t gpa = addr & ~PAGE_MASK;
+    struct page_info *page;
+    p2m_type_t p2mt;
+    unsigned long gfn;
+    struct vcpu *curr = current;
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+
+    gfn = paging_gva_to_gfn(curr, addr, &pfec);
+
+    printk("gfn: 0x%lx\n", gfn);
+
+    if ( gfn == gfn_x(INVALID_GFN) )
+    {
+        pagefault_info_t pfinfo = {};
+
+        if ( ( pfec & PFEC_page_paged ) || ( pfec & PFEC_page_shared ) )
+            return X86EMUL_RETRY;
+
+        pfinfo.linear = addr;
+        pfinfo.ec = pfec;
+
+        x86_emul_pagefault(pfinfo.ec, pfinfo.linear, &hvmemul_ctxt->ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    gpa |= (paddr_t)gfn << PAGE_SHIFT;
+
+    /*
+     * No need to do the P2M lookup for internally handled MMIO, benefiting
+     * - 32-bit WinXP (& older Windows) on AMD CPUs for LAPIC accesses,
+     * - newer Windows (like Server 2012) for HPET accesses.
+     */
+    if ( !nestedhvm_vcpu_in_guestmode(curr) && hvm_mmio_internal(gpa) )
+    {
+        printk("!nestedhvm_vcpu_in_guestmode(curr) && hvm_mmio_internal(gpa)\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    page = get_page_from_gfn(curr->domain, gfn, &p2mt, P2M_UNSHARE);
+
+    if ( !page )
+    {
+        printk("!page\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    if ( p2m_is_paging(p2mt) )
+    {
+        put_page(page);
+        p2m_mem_paging_populate(curr->domain, gfn);
+        return X86EMUL_RETRY;
+    }
+
+    if ( p2m_is_shared(p2mt) )
+    {
+        put_page(page);
+        return X86EMUL_RETRY;
+    }
+
+    if ( p2m_is_grant(p2mt) )
+    {
+        put_page(page);
+        printk("p2m_is_grant(p2mt)\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    *mfn = _mfn(page_to_mfn(page));
+
+    put_page(page);
+
+    return X86EMUL_OKAY;
+}
+
 static int hvmemul_cmpxchg(
     enum x86_segment seg,
     unsigned long offset,
@@ -1037,8 +1118,98 @@ static int hvmemul_cmpxchg(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    /* Fix this in case the guest is really relying on r-m-w atomicity. */
-    return hvmemul_write(seg, offset, p_new, bytes, ctxt);
+    unsigned long addr = 0, reps = 1;
+    int rc = X86EMUL_OKAY;
+    unsigned long old = 0, new = 0;
+    uint32_t pfec = PFEC_page_present | PFEC_write_access;
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+    mfn_t mfn[2];
+    void *map = NULL;
+    struct domain *currd = current->domain;
+    unsigned long ret;
+
+    if ( is_x86_system_segment(seg) )
+        pfec |= PFEC_implicit;
+    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
+        pfec |= PFEC_user_mode;
+
+    rc = hvmemul_virtual_to_linear(
+        seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
+
+    if ( rc != X86EMUL_OKAY || !bytes )
+    {
+        printk("rc != X86EMUL_OKAY || !bytes\n");
+        goto out;
+    }
+
+    rc = hvmemul_vaddr_to_mfn(addr, &mfn[0], pfec, ctxt);
+
+    if ( rc != X86EMUL_OKAY )
+    {
+        printk("hvmemul_vaddr_to_mfn() fail\n");
+        goto out;
+    }
+
+    if ( likely(((addr + bytes - 1) & PAGE_MASK) == (addr & PAGE_MASK)) )
+    {
+        /* Whole write fits on a single page. */
+        mfn[1] = INVALID_MFN;
+        map = map_domain_page(mfn[0]);
+    }
+    else
+    {
+        rc = hvmemul_vaddr_to_mfn((addr + bytes - 1) & PAGE_MASK, &mfn[1],
+                                  pfec, ctxt);
+        if ( rc != X86EMUL_OKAY )
+        {
+            printk("hvmemul_vaddr_to_mfn(mfn[1]) fail\n");
+            goto out;
+        }
+
+        map = vmap(mfn, 2);
+    }
+
+    if ( !map )
+    {
+        printk("!map\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    map += (addr & ~PAGE_MASK);
+
+    memcpy(&old, p_old, bytes);
+    memcpy(&new, p_new, bytes);
+
+    ret = __cmpxchg(map, old, new, bytes);
+
+    if ( ret != old )
+    {
+        memcpy(p_old, &ret, bytes);
+        rc = X86EMUL_CMPXCHG_FAILED;
+    }
+
+    paging_mark_dirty(currd, mfn[0]);
+
+    if ( unlikely(mfn_valid(mfn[1])) )
+    {
+        paging_mark_dirty(currd, mfn[1]);
+        vunmap((void *)((unsigned long)map & PAGE_MASK));
+    }
+    else
+        unmap_domain_page(map);
+
+out:
+    if ( rc != X86EMUL_OKAY )
+    {
+        if ( rc != X86EMUL_CMPXCHG_FAILED )
+            rc = X86EMUL_UNHANDLEABLE;
+    }
+
+    printk("[%d] virtual address: 0x%lx, rc: %d\n",
+           current->vcpu_id, addr, rc);
+
+    return rc;
 }
 
 static int hvmemul_validate(
@@ -1961,59 +2132,64 @@ int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla)
 void hvm_emulate_one_vm_event(enum emul_kind kind, unsigned int trapnr,
     unsigned int errcode)
 {
-    struct hvm_emulate_ctxt ctx = {{ 0 }};
-    int rc;
+    int rc = X86EMUL_OKAY;
 
-    hvm_emulate_init_once(&ctx, NULL, guest_cpu_user_regs());
+    /* do { */
+    struct hvm_emulate_ctxt ctx = {{ 0 }};
 
-    switch ( kind )
-    {
-    case EMUL_KIND_NOWRITE:
-        rc = _hvm_emulate_one(&ctx, &hvm_emulate_ops_no_write);
-        break;
-    case EMUL_KIND_SET_CONTEXT_INSN: {
-        struct vcpu *curr = current;
-        struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+    hvm_emulate_init_once(&ctx, NULL, guest_cpu_user_regs());
 
-        BUILD_BUG_ON(sizeof(vio->mmio_insn) !=
-                     sizeof(curr->arch.vm_event->emul.insn.data));
-        ASSERT(!vio->mmio_insn_bytes);
+    switch ( kind )
+    {
+    case EMUL_KIND_NOWRITE:
+        rc = _hvm_emulate_one(&ctx, &hvm_emulate_ops_no_write);
+        break;
+    case EMUL_KIND_SET_CONTEXT_INSN: {
+        struct vcpu *curr = current;
+        struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+
+        BUILD_BUG_ON(sizeof(vio->mmio_insn) !=
+                     sizeof(curr->arch.vm_event->emul.insn.data));
+        ASSERT(!vio->mmio_insn_bytes);
+
+        /*
+         * Stash insn buffer into mmio buffer here instead of ctx
+         * to avoid having to add more logic to hvm_emulate_one.
+         */
+        vio->mmio_insn_bytes = sizeof(vio->mmio_insn);
+        memcpy(vio->mmio_insn, curr->arch.vm_event->emul.insn.data,
+               vio->mmio_insn_bytes);
+    }
+    /* Fall-through */
+    default:
+        ctx.set_context = (kind == EMUL_KIND_SET_CONTEXT_DATA);
+        rc = hvm_emulate_one(&ctx);
+    }
 
-        /*
-         * Stash insn buffer into mmio buffer here instead of ctx
-         * to avoid having to add more logic to hvm_emulate_one.
-         */
-        vio->mmio_insn_bytes = sizeof(vio->mmio_insn);
-        memcpy(vio->mmio_insn, curr->arch.vm_event->emul.insn.data,
-               vio->mmio_insn_bytes);
-    }
-    /* Fall-through */
-    default:
-        ctx.set_context = (kind == EMUL_KIND_SET_CONTEXT_DATA);
-        rc = hvm_emulate_one(&ctx);
-    }
+    if ( rc != X86EMUL_OKAY )
+    {
+        printk("Dump follows for VCPU %d\n", current->vcpu_id);
+    }
 
-    switch ( rc )
-    {
-    case X86EMUL_RETRY:
-        /*
-         * This function is called when handling an EPT-related vm_event
-         * reply. As such, nothing else needs to be done here, since simply
-         * returning makes the current instruction cause a page fault again,
-         * consistent with X86EMUL_RETRY.
-         */
-        return;
-    case X86EMUL_UNHANDLEABLE:
-        hvm_dump_emulation_state(XENLOG_G_DEBUG "Mem event", &ctx);
-        hvm_inject_hw_exception(trapnr, errcode);
-        break;
-    case X86EMUL_EXCEPTION:
-        if ( ctx.ctxt.event_pending )
-            hvm_inject_event(&ctx.ctxt.event);
-        break;
-    }
+    switch ( rc )
+    {
+    case X86EMUL_RETRY:
+        /* break; */
+        hvm_dump_emulation_state(XENLOG_G_DEBUG "Mem event (RETRY)", &ctx);
+        return;
+    case X86EMUL_UNHANDLEABLE:
+        hvm_dump_emulation_state(XENLOG_G_DEBUG "Mem event", &ctx);
+        hvm_inject_hw_exception(trapnr, errcode);
+        break;
+    case X86EMUL_EXCEPTION:
+        if ( ctx.ctxt.event_pending )
+            hvm_inject_event(&ctx.ctxt.event);
+        break;
+    }
+
+    hvm_emulate_writeback(&ctx);
 
-    hvm_emulate_writeback(&ctx);
+    /* } while( rc == X86EMUL_RETRY ); */
 }
 
 void hvm_emulate_init_once(
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 4dbd24f..a67cd55 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5176,16 +5176,17 @@ static int ptwr_emulated_read(
 
 static int ptwr_emulated_update(
     unsigned long addr,
-    paddr_t old,
-    paddr_t val,
+    intpte_t *p_old,
+    intpte_t val,
     unsigned int bytes,
-    unsigned int do_cmpxchg,
     struct ptwr_emulate_ctxt *ptwr_ctxt)
 {
     unsigned long mfn;
     unsigned long unaligned_addr = addr;
     struct page_info *page;
     l1_pgentry_t pte, ol1e, nl1e, *pl1e;
+    intpte_t old = p_old ? *p_old : 0;
+    unsigned int offset = 0;
     struct vcpu *v = current;
     struct domain *d = v->domain;
     int ret;
@@ -5199,28 +5200,30 @@ static int ptwr_emulated_update(
     }
 
     /* Turn a sub-word access into a full-word access. */
-    if ( bytes != sizeof(paddr_t) )
+    if ( bytes != sizeof(val) )
     {
-        paddr_t full;
-        unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
+        intpte_t full;
+        unsigned int rc;
+
+        offset = addr & (sizeof(full) - 1);
 
         /* Align address; read full word. */
-        addr &= ~(sizeof(paddr_t)-1);
-        if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
+        addr &= ~(sizeof(full) - 1);
+        if ( (rc = copy_from_user(&full, (void *)addr, sizeof(full))) != 0 )
         {
             x86_emul_pagefault(0, /* Read fault. */
-                               addr + sizeof(paddr_t) - rc,
+                               addr + sizeof(full) - rc,
                                &ptwr_ctxt->ctxt);
             return X86EMUL_EXCEPTION;
         }
         /* Mask out bits provided by caller. */
-        full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
+        full &= ~((((intpte_t)1 << (bytes * 8)) - 1) << (offset * 8));
         /* Shift the caller value and OR in the missing bits. */
-        val  &= (((paddr_t)1 << (bytes*8)) - 1);
+        val  &= (((intpte_t)1 << (bytes * 8)) - 1);
         val <<= (offset)*8;
         val  |= full;
         /* Also fill in missing parts of the cmpxchg old value. */
-        old  &= (((paddr_t)1 << (bytes*8)) - 1);
+        old  &= (((intpte_t)1 << (bytes * 8)) - 1);
         old <<= (offset)*8;
         old  |= full;
     }
@@ -5242,7 +5245,7 @@ static int ptwr_emulated_update(
     {
     default:
         if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
-             !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
+             !p_old && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
         {
             /*
              * If this is an upper-half write to a PAE PTE then we assume that
@@ -5273,21 +5276,26 @@
     /* Checked successfully: do the update (write or cmpxchg). */
     pl1e = map_domain_page(_mfn(mfn));
     pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
-    if ( do_cmpxchg )
+    if ( p_old )
     {
-        int okay;
-        intpte_t t = old;
         ol1e = l1e_from_intpte(old);
 
-        okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
-                                          &t, l1e_get_intpte(nl1e), _mfn(mfn));
-        okay = (okay && t == old);
+        if ( !paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
+                                         &old, l1e_get_intpte(nl1e), _mfn(mfn)) )
+            ret = X86EMUL_UNHANDLEABLE;
+        else if ( l1e_get_intpte(ol1e) == old )
+            ret = X86EMUL_OKAY;
+        else
+        {
+            *p_old = old >> (offset * 8);
+            ret = X86EMUL_CMPXCHG_FAILED;
+        }
 
-        if ( !okay )
+        if ( ret != X86EMUL_OKAY )
         {
             unmap_domain_page(pl1e);
             put_page_from_l1e(nl1e, d);
-            return X86EMUL_RETRY;
+            return ret;
         }
     }
     else
@@ -5314,9 +5322,9 @@ static int ptwr_emulated_write(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    paddr_t   val = 0;
+    intpte_t val = 0;
 
-    if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) || !bytes )
+    if ( (bytes > sizeof(val)) || (bytes & (bytes - 1)) || !bytes )
     {
         MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
                 offset, bytes);
@@ -5325,9 +5333,9 @@ static int ptwr_emulated_write(
 
     memcpy(&val, p_data, bytes);
 
-    return ptwr_emulated_update(
-        offset, 0, val, bytes, 0,
-        container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
+    return ptwr_emulated_update(offset, NULL, val, bytes,
+                                container_of(ctxt, struct ptwr_emulate_ctxt,
+                                             ctxt));
 }
 
 static int ptwr_emulated_cmpxchg(
@@ -5338,21 +5346,20 @@ static int ptwr_emulated_cmpxchg(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    paddr_t old = 0, new = 0;
+    intpte_t new = 0;
 
-    if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
+    if ( (bytes > sizeof(new)) || (bytes & (bytes -1)) )
    {
         MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
                 offset, bytes);
         return X86EMUL_UNHANDLEABLE;
     }
 
-    memcpy(&old, p_old, bytes);
     memcpy(&new, p_new, bytes);
 
-    return ptwr_emulated_update(
-        offset, old, new, bytes, 1,
-        container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
+    return ptwr_emulated_update(offset, p_old, new, bytes,
+                                container_of(ctxt, struct ptwr_emulate_ctxt,
+                                             ctxt));
 }
 
 static int pv_emul_is_mem_write(const struct x86_emulate_state *state,
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index d93f2ab..06dc9f6 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -285,7 +285,7 @@ hvm_emulate_cmpxchg(enum x86_segment seg,
     struct sh_emulate_ctxt *sh_ctxt =
         container_of(ctxt, struct sh_emulate_ctxt, ctxt);
     struct vcpu *v = current;
-    unsigned long addr, old, new;
+    unsigned long addr, new = 0;
     int rc;
 
     if ( bytes > sizeof(long) )
@@ -296,12 +296,10 @@ hvm_emulate_cmpxchg(enum x86_segment seg,
     if ( rc )
         return rc;
 
-    old = new = 0;
-    memcpy(&old, p_old, bytes);
     memcpy(&new, p_new, bytes);
 
     return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
-        v, addr, old, new, bytes, sh_ctxt);
+        v, addr, p_old, new, bytes, sh_ctxt);
 }
 
 static const struct x86_emulate_ops hvm_shadow_emulator_ops = {
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 4798c93..d8270db 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -4783,11 +4783,11 @@ sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
 
 static int
 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
-                       unsigned long old, unsigned long new,
-                       unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
+                       unsigned long *p_old, unsigned long new,
+                       unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
 {
     void *addr;
-    unsigned long prev;
+    unsigned long prev, old = *p_old;
     int rv = X86EMUL_OKAY;
 
     /* Unaligned writes are only acceptable on HVM */
@@ -4811,7 +4811,10 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
     }
 
     if ( prev != old )
-        rv = X86EMUL_RETRY;
+    {
+        *p_old = prev;
+        rv = X86EMUL_CMPXCHG_FAILED;
+    }
 
     SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
                  " wanted %#lx now %#lx bytes %u\n",
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
index bb67be6..444c84c 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1862,6 +1862,9 @@ protmode_load_seg(
 
     default:
         return rc;
+
+    case X86EMUL_CMPXCHG_FAILED:
+        return X86EMUL_RETRY;
     }
 
     /* Force the Accessed flag in our local copy. */
@@ -6680,6 +6683,7 @@ x86_emulate(
         break;
     case X86EMUL_OPC(0x0f, 0xb0): case X86EMUL_OPC(0x0f, 0xb1): /* cmpxchg */
+        fail_if(!ops->cmpxchg);
         /* Save real source value, then compare EAX against destination. */
         src.orig_val = src.val;
         src.val = _regs.r(ax);
@@ -6688,8 +6692,17 @@
         if ( _regs.eflags & X86_EFLAGS_ZF )
         {
             /* Success: write back to memory. */
-            dst.val = src.orig_val;
+            dst.val = src.val;
+            rc = ops->cmpxchg(dst.mem.seg, dst.mem.off, &dst.val,
+                              &src.orig_val, dst.bytes, ctxt);
+            if ( rc == X86EMUL_CMPXCHG_FAILED )
+            {
+                _regs.eflags &= ~X86_EFLAGS_ZF;
+                rc = X86EMUL_OKAY;
+            }
         }
+        if ( _regs.eflags & X86_EFLAGS_ZF )
+            dst.type = OP_NONE;
         else
         {
             /* Failure: write the value we saw to EAX. */
@@ -6994,6 +7007,7 @@ x86_emulate(
 
         if ( memcmp(old, aux, op_bytes) )
        {
+        cmpxchgNb_failed:
            /* Expected != actual: store actual to rDX:rAX and clear ZF. */
            _regs.r(ax) = !(rex_prefix & REX_W) ? old->u32[0] : old->u64[0];
            _regs.r(dx) = !(rex_prefix & REX_W) ? old->u32[1] : old->u64[1];
@@ -7003,7 +7017,7 @@ x86_emulate(
         {
             /*
              * Expected == actual: Get proposed value, attempt atomic cmpxchg
-             * and set ZF.
+             * and set ZF if successful.
              */
             if ( !(rex_prefix & REX_W) )
             {
@@ -7016,10 +7030,20 @@
                 aux->u64[1] = _regs.r(cx);
             }
 
-            if ( (rc = ops->cmpxchg(ea.mem.seg, ea.mem.off, old, aux,
-                                    op_bytes, ctxt)) != X86EMUL_OKAY )
+            switch ( rc = ops->cmpxchg(ea.mem.seg, ea.mem.off, old, aux,
+                                       op_bytes, ctxt) )
+            {
+            case X86EMUL_OKAY:
+                _regs.eflags |= X86_EFLAGS_ZF;
+                break;
+
+            case X86EMUL_CMPXCHG_FAILED:
+                rc = X86EMUL_OKAY;
+                goto cmpxchgNb_failed;
+
+            default:
                 goto done;
-            _regs.eflags |= X86_EFLAGS_ZF;
+            }
         }
         break;
     }
@@ -7942,6 +7966,8 @@ x86_emulate(
             rc = ops->cmpxchg(
                 dst.mem.seg, dst.mem.off, &dst.orig_val, &dst.val, dst.bytes,
                 ctxt);
+            if ( rc == X86EMUL_CMPXCHG_FAILED )
+                rc = X86EMUL_RETRY;
         }
         else
         {
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h
index 9c5fcde..6a8d6a0 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -153,6 +153,8 @@ struct x86_emul_fpu_aux {
 #define X86EMUL_EXCEPTION      2
 /* Retry the emulation for some reason. No state modified. */
 #define X86EMUL_RETRY          3
+/* (cmpxchg accessor): CMPXCHG failed. */
+#define X86EMUL_CMPXCHG_FAILED 4
 /*
  * Operation fully done by one of the hooks:
  * - validate(): operation completed (except common insn retire logic)
@@ -160,7 +162,7 @@ struct x86_emul_fpu_aux {
  * - read_io() / write_io(): bypass GPR update (non-string insns only)
  * Undefined behavior when used anywhere else.
  */
-#define X86EMUL_DONE           4
+#define X86EMUL_DONE           5
 
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
@@ -250,6 +252,8 @@ struct x86_emulate_ops
     /*
      * cmpxchg: Emulate an atomic (LOCKed) CMPXCHG operation.
      *  @p_old: [IN ] Pointer to value expected to be current at @addr.
+     *          [OUT] Pointer to value found at @addr (may always be
+     *                updated, meaningful for X86EMUL_CMPXCHG_FAILED only).
     *  @p_new: [IN ] Pointer to value to write to @addr.
      *  @bytes: [IN ] Operation size (up to 8 (x86/32) or 16 (x86/64) bytes).
      */
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index f262c9e..3efcf6d 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -89,7 +89,7 @@ struct shadow_paging_mode {
                                             void *src, u32 bytes,
                                             struct sh_emulate_ctxt *sh_ctxt);
     int           (*x86_emulate_cmpxchg   )(struct vcpu *v, unsigned long va,
-                                            unsigned long old,
+                                            unsigned long *old,
                                             unsigned long new,
                                             unsigned int bytes,
                                             struct sh_emulate_ctxt *sh_ctxt);
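For reference, the essential contract this patch gives the ->cmpxchg() hook
(report the value actually observed back through @p_old, and return
X86EMUL_CMPXCHG_FAILED rather than silently degrading to a plain write) can be
sketched in isolation. The following is a minimal user-space analogue, not
part of the patch: GCC's __atomic_compare_exchange_n() builtin stands in for
Xen's __cmpxchg(), and the emul_cmpxchg() / EMUL_* names are hypothetical.

#include <stdint.h>
#include <stdio.h>

#define EMUL_OKAY            0
#define EMUL_CMPXCHG_FAILED  4

/*
 * Analogue of the reworked hook: on failure, the value actually found in
 * memory is written back through *p_old, mirroring the @p_old [OUT]
 * semantics documented in x86_emulate.h above.
 */
static int emul_cmpxchg(uint64_t *addr, uint64_t *p_old, uint64_t new_val)
{
    uint64_t expected = *p_old;

    /* On failure, the builtin updates 'expected' with the observed value. */
    if ( __atomic_compare_exchange_n(addr, &expected, new_val, 0,
                                     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) )
        return EMUL_OKAY;

    *p_old = expected;
    return EMUL_CMPXCHG_FAILED;
}

int main(void)
{
    uint64_t mem = 42, old = 41;

    /* Mismatch: the emulated CMPXCHG fails and reports what it saw. */
    if ( emul_cmpxchg(&mem, &old, 7) == EMUL_CMPXCHG_FAILED )
        printf("failed, memory held %llu\n", (unsigned long long)old);

    /* 'old' now holds 42, so an immediate retry succeeds atomically. */
    if ( emul_cmpxchg(&mem, &old, 7) == EMUL_OKAY )
        printf("succeeded, memory now %llu\n", (unsigned long long)mem);

    return 0;
}

Note how the failing call leaves 'old' holding the value found in memory, so
the retry succeeds. This is the property the emulator relies on when it clears
ZF on X86EMUL_CMPXCHG_FAILED (CMPXCHG) or branches to the failure path
(CMPXCHG8B/16B): the guest's own retry loop sees accurate state and can make
forward progress.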