[Xen-changelog] [xen-unstable] HVM: support unaligned and page-crossing writes in the shadow emulator

# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1197743367 0
# Node ID d9ab9eb2bfee9de14a89ada78c45f365e489b232
# Parent  44a98411d230b3214be49b42e66d7c42e01ab59f
HVM: support unaligned and page-crossing writes in the shadow emulator
so that we can use it to support guests that clear CR0.WP.

Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---
 xen/arch/x86/mm/shadow/multi.c   |  330 +++++++++++++++++++++++----------------
 xen/arch/x86/mm/shadow/private.h |   20 +-
 xen/include/asm-x86/hvm/hvm.h    |    2 
 3 files changed, 216 insertions(+), 136 deletions(-)

diff -r 44a98411d230 -r d9ab9eb2bfee xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Sat Dec 15 18:26:52 2007 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c    Sat Dec 15 18:29:27 2007 +0000
@@ -61,12 +61,6 @@
  * and if we do flush, re-do the walk.  If anything has changed, then
  * pause all the other vcpus and do the walk *again*.
  *
- * WP DISABLED
- * Consider how to implement having the WP bit of CR0 set to 0.
- * Since we need to be able to cause write faults to pagetables, this might
- * end up looking like not having the (guest) pagetables present at all in
- * HVM guests...
- *
  * PSE disabled / PSE36
  * We don't support any modes other than PSE enabled, PSE36 disabled.
  * Neither of those would be hard to change, but we'd need to be able to
@@ -219,11 +213,17 @@ static uint32_t mandatory_flags(struct v
         /* 1   1   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
         /* 1   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
     };
-    uint32_t f = flags[(pfec & 0x1f) >> 1];
+
     /* Don't demand not-NX if the CPU wouldn't enforce it. */
     if ( !guest_supports_nx(v) )
-        f &= ~_PAGE_NX_BIT;
-    return f;
+        pfec &= ~PFEC_insn_fetch;
+
+    /* Don't demand R/W if the CPU wouldn't enforce it. */
+    if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v))
+         && !(pfec & PFEC_user_mode) )
+        pfec &= ~PFEC_write_access;
+
+    return flags[(pfec & 0x1f) >> 1];
 }
 
 /* Modify a guest pagetable entry to set the Accessed and Dirty bits.
@@ -262,7 +262,8 @@ static uint32_t set_ad_bits(void *guest_
  * from any guest PT pages we see, as we will be shadowing them soon
  * and will rely on the contents' not having changed.
  *
- * Returns 0 for success or non-zero if the walk did not complete.
+ * Returns 0 for success, or the set of permission bits that we failed on
+ * if the walk did not complete.
  * N.B. This is different from the old return code but almost no callers
  * checked the old return code anyway.
  */
@@ -2717,8 +2718,9 @@ static int sh_page_fault(struct vcpu *v,
     fetch_type_t ft = 0;
     p2m_type_t p2mt;
 
-    SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
-                  v->domain->domain_id, v->vcpu_id, va, regs->error_code);
+    SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
+                  v->domain->domain_id, v->vcpu_id, va, regs->error_code,
+                  regs->rip);
 
     perfc_incr(shadow_fault);
     //
@@ -2790,7 +2792,7 @@ static int sh_page_fault(struct vcpu *v,
     shadow_lock(d);
 
     shadow_audit_tables(v);
-    
+
     if ( guest_walk_tables(v, va, &gw, regs->error_code, 1) != 0 )
     {
         perfc_incr(shadow_fault_bail_real_fault);
@@ -2882,6 +2884,16 @@ static int sh_page_fault(struct vcpu *v,
         gpa = guest_walk_to_gpa(&gw);
         goto mmio;
     }
+
+    /* In HVM guests, we force CR0.WP always to be set, so that the
+     * pagetables are always write-protected.  If the guest thinks
+     * CR0.WP is clear, we must emulate faulting supervisor writes to
+     * allow the guest to write through read-only PTEs.  Emulate if the
+     * fault was a non-user write to a present page.  */
+    if ( is_hvm_domain(d)
+         && unlikely(!hvm_wp_enabled(v))
+         && regs->error_code == (PFEC_write_access|PFEC_page_present) )
+        goto emulate;
 
     perfc_incr(shadow_fault_fixed);
     d->arch.paging.log_dirty.fault_count++;
@@ -3968,25 +3980,17 @@ int sh_remove_l3_shadow(struct vcpu *v,
 /**************************************************************************/
 /* Handling HVM guest writes to pagetables */
 
-/* Check that the user is allowed to perform this write.
- * Returns a mapped pointer to write to, and the mfn it's on,
- * or NULL for error. */
-static inline void * emulate_map_dest(struct vcpu *v,
-                                      unsigned long vaddr,
-                                      struct sh_emulate_ctxt *sh_ctxt,
-                                      mfn_t *mfnp)
-{
-    uint32_t pfec;
+/* Translate a VA to an MFN, injecting a page-fault if we fail */
+static mfn_t emulate_gva_to_mfn(struct vcpu *v,
+                                unsigned long vaddr,
+                                struct sh_emulate_ctxt *sh_ctxt)
+{
     unsigned long gfn;
    mfn_t mfn;
     p2m_type_t p2mt;
-
-    /* We don't emulate user-mode writes to page tables */
-    if ( ring_3(sh_ctxt->ctxt.regs) )
-        return NULL;
-
-    /* Translate the VA, and exit with a page-fault if we fail */
-    pfec = PFEC_page_present | PFEC_write_access;
+    uint32_t pfec = PFEC_page_present | PFEC_write_access;
+
+    /* Translate the VA to a GFN */
     gfn = sh_gva_to_gfn(v, vaddr, &pfec);
     if ( gfn == INVALID_GFN )
     {
@@ -3994,84 +3998,184 @@ static inline void * emulate_map_dest(st
             hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
         else
             propagate_page_fault(vaddr, pfec);
-        return NULL;
-    }
-
-    /* Translate the GFN */
+        return _mfn(INVALID_MFN);
+    }
+
+    /* Translate the GFN to an MFN */
     mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
     if ( p2m_is_ram(p2mt) )
     {
         ASSERT(mfn_valid(mfn));
-        *mfnp = mfn;
         v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
-        return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
+        return mfn;
+    }
+
+    return _mfn(INVALID_MFN);
+}
+
+/* Check that the user is allowed to perform this write.
+ * Returns a mapped pointer to write to, or NULL for error. */
+static void * emulate_map_dest(struct vcpu *v,
+                               unsigned long vaddr,
+                               u32 bytes,
+                               struct sh_emulate_ctxt *sh_ctxt)
+{
+    unsigned long offset;
+    void *map = NULL;
+
+    /* We don't emulate user-mode writes to page tables */
+    if ( ring_3(sh_ctxt->ctxt.regs) )
+        return NULL;
+
+    sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
+    if ( !mfn_valid(sh_ctxt->mfn1) )
+        return NULL;
+
+    /* Unaligned writes mean probably this isn't a pagetable */
+    if ( vaddr & (bytes - 1) )
+        sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
+
+    if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
+    {
+        /* Whole write fits on a single page */
+        sh_ctxt->mfn2 = _mfn(INVALID_MFN);
+        map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
     }
     else
-        return NULL;
-}
-
-static int safe_not_to_verify_write(mfn_t gmfn, void *dst, void *src,
-                                    int bytes)
-{
+    {
+        /* Cross-page emulated writes are only supported for HVM guests;
+         * PV guests ought to know better */
+        if ( !is_hvm_vcpu(v) )
+            return NULL;
+
+        /* This write crosses a page boundary.  Translate the second page */
+        sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
+                                           sh_ctxt);
+        if ( !mfn_valid(sh_ctxt->mfn2) )
+            return NULL;
+
+        /* Cross-page writes mean probably not a pagetable */
+        sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
+
+        /* Hack: we map the pages into the vcpu's LDT space, since we
+         * know that we're not going to need the LDT for HVM guests,
+         * and only HVM guests are allowed unaligned writes. */
+        ASSERT(is_hvm_vcpu(v));
+        map = (void *)LDT_VIRT_START(v);
+        offset = l1_linear_offset((unsigned long) map);
+        l1e_write(&__linear_l1_table[offset],
+                  l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
+        l1e_write(&__linear_l1_table[offset + 1],
+                  l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
+        flush_tlb_local();
+        map += (vaddr & ~PAGE_MASK);
+    }
+
 #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
-    struct page_info *pg = mfn_to_page(gmfn);
-    if ( !(pg->shadow_flags & SHF_32)
-         && ((unsigned long)dst & 7) == 0 )
-    {
-        /* Not shadowed 32-bit: aligned 64-bit writes that leave the
-         * present bit unset are safe to ignore. */
-        if ( (*(u64*)src & _PAGE_PRESENT) == 0
-             && (*(u64*)dst & _PAGE_PRESENT) == 0 )
-            return 1;
-    }
-    else if ( !(pg->shadow_flags & (SHF_PAE|SHF_64))
-              && ((unsigned long)dst & 3) == 0 )
-    {
-        /* Not shadowed PAE/64-bit: aligned 32-bit writes that leave the
-         * present bit unset are safe to ignore. */
-        if ( (*(u32*)src & _PAGE_PRESENT) == 0
-             && (*(u32*)dst & _PAGE_PRESENT) == 0 )
-            return 1;
-    }
-#endif
-    return 0;
-}
-
+    /* Remember if the bottom bit was clear, so we can choose not to run
+     * the change through the verify code if it's still clear afterwards */
+    sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
+#endif
+
+    return map;
+}
+
+/* Tidy up after the emulated write: mark pages dirty, verify the new
+ * contents, and undo the mapping */
+static void emulate_unmap_dest(struct vcpu *v,
+                               void *addr,
+                               u32 bytes,
+                               struct sh_emulate_ctxt *sh_ctxt)
+{
+    u32 b1 = bytes, b2 = 0, shflags;
+
+    ASSERT(mfn_valid(sh_ctxt->mfn1));
+
+    /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
+    if ( likely(bytes >= 4)
+         && (*(u32 *)addr == 0)
+         && ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
+        check_for_early_unshadow(v, sh_ctxt->mfn1);
+    else
+        reset_early_unshadow(v);
+
+    /* We can avoid re-verifying the page contents after the write if:
+     *  - it was no larger than the PTE type of this pagetable;
+     *  - it was aligned to the PTE boundaries; and
+     *  - _PAGE_PRESENT was clear before and after the write. */
+    shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
+#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
+    if ( sh_ctxt->low_bit_was_clear
+         && !(*(u8 *)addr & _PAGE_PRESENT)
+         && ((!(shflags & SHF_32)
+              /* Not shadowed 32-bit: aligned 64-bit writes that leave
+               * the present bit unset are safe to ignore. */
+              && ((unsigned long)addr & 7) == 0
+              && bytes <= 8)
+             ||
+             (!(shflags & (SHF_PAE|SHF_64))
+              /* Not shadowed PAE/64-bit: aligned 32-bit writes that
+               * leave the present bit unset are safe to ignore. */
+              && ((unsigned long)addr & 3) == 0
+              && bytes <= 4)) )
+    {
+        /* Writes with this alignment constraint can't possibly cross pages */
+        ASSERT(!mfn_valid(sh_ctxt->mfn2));
+    }
+    else
+#endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
+    {
+        if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
+        {
+            /* Validate as two writes, one to each page */
+            b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
+            b2 = bytes - b1;
+            ASSERT(b2 < bytes);
+        }
+        if ( likely(b1 > 0) )
+            sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
+        if ( unlikely(b2 > 0) )
+            sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
+    }
+
+    paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
+
+    if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
+    {
+        unsigned long offset;
+        paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
+        /* Undo the hacky two-frame contiguous map. */
+        ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
+        offset = l1_linear_offset((unsigned long) addr);
+        l1e_write(&__linear_l1_table[offset], l1e_empty());
+        l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
+        flush_tlb_all();
+    }
+    else
+        sh_unmap_domain_page(addr);
+}
 
 int
 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
                      u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
 {
-    mfn_t mfn;
     void *addr;
-    int skip;
-
-    if ( vaddr & (bytes-1) )
+
+    /* Unaligned writes are only acceptable on HVM */
+    if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
         return X86EMUL_UNHANDLEABLE;
 
-    ASSERT(((vaddr & ~PAGE_MASK) + bytes) <= PAGE_SIZE);
     shadow_lock(v->domain);
-
-    addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
+    addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
     if ( addr == NULL )
     {
         shadow_unlock(v->domain);
         return X86EMUL_EXCEPTION;
     }
 
-    skip = safe_not_to_verify_write(mfn, addr, src, bytes);
     memcpy(addr, src, bytes);
-    if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
-
-    /* If we are writing zeros to this page, might want to unshadow */
-    if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
-        check_for_early_unshadow(v, mfn);
-    else
-        reset_early_unshadow(v);
-
-    paging_mark_dirty(v->domain, mfn_x(mfn));
-
-    sh_unmap_domain_page(addr);
+
+    emulate_unmap_dest(v, addr, bytes, sh_ctxt);
     shadow_audit_tables(v);
     shadow_unlock(v->domain);
     return X86EMUL_OKAY;
@@ -4082,25 +4186,22 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, u
                        unsigned long old, unsigned long new,
                        unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
 {
-    mfn_t mfn;
     void *addr;
     unsigned long prev;
-    int rv = X86EMUL_OKAY, skip;
-
-    ASSERT(bytes <= sizeof(unsigned long));
+    int rv = X86EMUL_OKAY;
+
+    /* Unaligned writes are only acceptable on HVM */
+    if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
+        return X86EMUL_UNHANDLEABLE;
+
     shadow_lock(v->domain);
-    if ( vaddr & (bytes-1) )
-        return X86EMUL_UNHANDLEABLE;
-
-    addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
+    addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
     if ( addr == NULL )
     {
         shadow_unlock(v->domain);
         return X86EMUL_EXCEPTION;
     }
-
-    skip = safe_not_to_verify_write(mfn, &new, &old, bytes);
 
     switch ( bytes )
     {
@@ -4113,26 +4214,14 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, u
         prev = ~old;
     }
 
-    if ( prev == old )
-    {
-        if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, bytes);
-    }
-    else
+    if ( prev != old )
         rv = X86EMUL_CMPXCHG_FAILED;
 
     SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
                   " wanted %#lx now %#lx bytes %u\n",
                   vaddr, prev, old, new, *(unsigned long *)addr, bytes);
 
-    /* If we are writing zeros to this page, might want to unshadow */
-    if ( likely(bytes >= 4) && (*(u32 *)addr == 0) && is_lo_pte(vaddr) )
-        check_for_early_unshadow(v, mfn);
-    else
-        reset_early_unshadow(v);
-
-    paging_mark_dirty(v->domain, mfn_x(mfn));
-
-    sh_unmap_domain_page(addr);
+    emulate_unmap_dest(v, addr, bytes, sh_ctxt);
     shadow_audit_tables(v);
     shadow_unlock(v->domain);
     return rv;
@@ -4144,17 +4233,17 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v,
                          unsigned long new_lo, unsigned long new_hi,
                          struct sh_emulate_ctxt *sh_ctxt)
 {
-    mfn_t mfn;
     void *addr;
     u64 old, new, prev;
-    int rv = X86EMUL_OKAY, skip;
-
-    if ( vaddr & 7 )
+    int rv = X86EMUL_OKAY;
+
+    /* Unaligned writes are only acceptable on HVM */
+    if ( (vaddr & 7) && !is_hvm_vcpu(v) )
         return X86EMUL_UNHANDLEABLE;
 
     shadow_lock(v->domain);
 
-    addr = emulate_map_dest(v, vaddr, sh_ctxt, &mfn);
+    addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
     if ( addr == NULL )
     {
         shadow_unlock(v->domain);
@@ -4163,25 +4252,12 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v,
     old = (((u64) old_hi) << 32) | (u64) old_lo;
     new = (((u64) new_hi) << 32) | (u64) new_lo;
-    skip = safe_not_to_verify_write(mfn, &new, &old, 8);
     prev = cmpxchg(((u64 *)addr), old, new);
 
-    if ( prev == old )
-    {
-        if ( !skip ) sh_validate_guest_pt_write(v, mfn, addr, 8);
-    }
-    else
+    if ( prev != old )
         rv = X86EMUL_CMPXCHG_FAILED;
 
-    /* If we are writing zeros to this page, might want to unshadow */
-    if ( *(u32 *)addr == 0 )
-        check_for_early_unshadow(v, mfn);
-    else
-        reset_early_unshadow(v);
-
-    paging_mark_dirty(v->domain, mfn_x(mfn));
-
-    sh_unmap_domain_page(addr);
+    emulate_unmap_dest(v, addr, 8, sh_ctxt);
     shadow_audit_tables(v);
     shadow_unlock(v->domain);
     return rv;
diff -r 44a98411d230 -r d9ab9eb2bfee xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h  Sat Dec 15 18:26:52 2007 +0000
+++ b/xen/arch/x86/mm/shadow/private.h  Sat Dec 15 18:29:27 2007 +0000
@@ -429,13 +429,6 @@ int shadow_cmpxchg_guest_entry(struct vc
 #undef pagetable_from_page
 #define pagetable_from_page(pg) pagetable_from_mfn(page_to_mfn(pg))
-
-#if GUEST_PAGING_LEVELS >= 3
-# define is_lo_pte(_vaddr) (((_vaddr)&0x4)==0)
-#else
-# define is_lo_pte(_vaddr) (1)
-#endif
-
 
 static inline int
 sh_mfn_is_a_page_table(mfn_t gmfn)
 {
@@ -664,14 +657,23 @@ struct sh_emulate_ctxt {
 struct sh_emulate_ctxt {
     struct x86_emulate_ctxt ctxt;
 
-    /* [HVM] Cache of up to 31 bytes of instruction. */
+    /* Cache of up to 31 bytes of instruction. */
     uint8_t insn_buf[31];
     uint8_t insn_buf_bytes;
     unsigned long insn_buf_eip;
 
-    /* [HVM] Cache of segment registers already gathered for this emulation. */
+    /* Cache of segment registers already gathered for this emulation. */
     unsigned int valid_seg_regs;
     struct segment_register seg_reg[6];
+
+    /* MFNs being written to in write/cmpxchg callbacks */
+    mfn_t mfn1, mfn2;
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
+    /* Special case for avoiding having to verify writes: remember
+     * whether the old value had its low bit (_PAGE_PRESENT) clear. */
+    int low_bit_was_clear:1;
+#endif
 };
 
 struct x86_emulate_ops *shadow_init_emulation(
diff -r 44a98411d230 -r d9ab9eb2bfee xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h     Sat Dec 15 18:26:52 2007 +0000
+++ b/xen/include/asm-x86/hvm/hvm.h     Sat Dec 15 18:29:27 2007 +0000
@@ -144,6 +144,8 @@ u64 hvm_get_guest_tsc(struct vcpu *v);
 
 #define hvm_paging_enabled(v) \
     (!!((v)->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG))
+#define hvm_wp_enabled(v) \
+    (!!((v)->arch.hvm_vcpu.guest_cr[0] & X86_CR0_WP))
 #define hvm_pae_enabled(v) \
     (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE))
 #define hvm_nx_enabled(v) \

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
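For illustration only (this is not part of the changeset): the new emulate_map_dest()/emulate_unmap_dest() pair validates a page-crossing write as two chunks, b1 bytes against the first frame (mfn1) and b2 bytes against the second (mfn2). The following minimal, self-contained C sketch shows just that split arithmetic, assuming the usual 4KiB pages; split_write() and its printf output are invented for this example and do not exist in the Xen tree.

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    /* Hypothetical helper: mirrors the b1/b2 split that
     * emulate_unmap_dest() performs when a write crosses a page. */
    static void split_write(unsigned long vaddr, unsigned long bytes)
    {
        if ( ((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK) )
        {
            /* Whole write fits on a single page: one validation pass. */
            printf("va %#lx len %lu: single page, b1=%lu\n",
                   vaddr, bytes, bytes);
        }
        else
        {
            /* Crosses a boundary: b1 bytes belong to the first frame,
             * the remaining b2 bytes to the second. */
            unsigned long b1 = PAGE_SIZE - (vaddr & ~PAGE_MASK);
            unsigned long b2 = bytes - b1;
            printf("va %#lx len %lu: crosses page, b1=%lu b2=%lu\n",
                   vaddr, bytes, b1, b2);
        }
    }

    int main(void)
    {
        split_write(0x1ff8, 4);   /* stays within one page         */
        split_write(0x1ffc, 8);   /* 4 bytes on each of two pages  */
        return 0;
    }

The same page-boundary test appears in emulate_map_dest() above, which is why only the cross-page branch needs the temporary two-frame mapping and the double mark-dirty/validate in emulate_unmap_dest().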