[Xen-devel] [PATCH 26/34] x86/mm/shadow: split out HVM only code
Move the code previously enclosed in CONFIG_HVM into its own file. Note
that although some code explicitly checks is_hvm_*, which hints that it
could be used for PV too, I can't find a code path where that would be
the case.

Signed-off-by: Wei Liu <wei.liu2@xxxxxxxxxx>
---
Can be squashed into the previous patch if that's preferable.
---
 xen/arch/x86/mm/shadow/Makefile |   1 +-
 xen/arch/x86/mm/shadow/common.c | 542 +------------------------------
 xen/arch/x86/mm/shadow/hvm.c    | 590 +++++++++++++++++++++++++++++++++-
 3 files changed, 596 insertions(+), 537 deletions(-)
 create mode 100644 xen/arch/x86/mm/shadow/hvm.c

diff --git a/xen/arch/x86/mm/shadow/Makefile b/xen/arch/x86/mm/shadow/Makefile
index bcb23d2..72658f3 100644
--- a/xen/arch/x86/mm/shadow/Makefile
+++ b/xen/arch/x86/mm/shadow/Makefile
@@ -1,5 +1,6 @@
 ifeq ($(CONFIG_SHADOW_PAGING),y)
 obj-y += common.o guest_2.o guest_3.o guest_4.o
+obj-$(CONFIG_HVM) += hvm.o
 else
 obj-y += none.o
 endif
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index 4381538..aa94416 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -114,273 +114,16 @@ __initcall(shadow_audit_key_init);
 
 #if CONFIG_HVM
 
-/**************************************************************************/
-/* x86 emulator support for the shadow code
- */
-
-/*
- * Returns a mapped pointer to write to, or one of the following error
- * indicators.
- */
-#define MAPPING_UNHANDLEABLE ERR_PTR(~(long)X86EMUL_UNHANDLEABLE)
-#define MAPPING_EXCEPTION    ERR_PTR(~(long)X86EMUL_EXCEPTION)
-#define MAPPING_SILENT_FAIL  ERR_PTR(~(long)X86EMUL_OKAY)
-static void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr,
-                                 unsigned int bytes,
-                                 struct sh_emulate_ctxt *sh_ctxt);
-static void sh_emulate_unmap_dest(struct vcpu *v, void *addr,
-                                  unsigned int bytes,
-                                  struct sh_emulate_ctxt *sh_ctxt);
-
-/*
- * Callers which pass a known in-range x86_segment can rely on the return
- * pointer being valid.  Other callers must explicitly check for errors.
- */
-static struct segment_register *hvm_get_seg_reg(
-    enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
-{
-    unsigned int idx = seg;
-    struct segment_register *seg_reg;
-
-    if ( idx >= ARRAY_SIZE(sh_ctxt->seg_reg) )
-        return ERR_PTR(-X86EMUL_UNHANDLEABLE);
-
-    seg_reg = &sh_ctxt->seg_reg[idx];
-    if ( !__test_and_set_bit(idx, &sh_ctxt->valid_seg_regs) )
-        hvm_get_segment_register(current, idx, seg_reg);
-    return seg_reg;
-}
-
-static int hvm_translate_virtual_addr(
+extern const struct x86_emulate_ops hvm_shadow_emulator_ops;
+extern struct segment_register *hvm_get_seg_reg(
+    enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt);
+extern int hvm_translate_virtual_addr(
     enum x86_segment seg,
     unsigned long offset,
     unsigned int bytes,
     enum hvm_access_type access_type,
     struct sh_emulate_ctxt *sh_ctxt,
-    unsigned long *linear)
-{
-    const struct segment_register *reg;
-    int okay;
-
-    reg = hvm_get_seg_reg(seg, sh_ctxt);
-    if ( IS_ERR(reg) )
-        return -PTR_ERR(reg);
-
-    okay = hvm_virtual_to_linear_addr(
-        seg, reg, offset, bytes, access_type,
-        hvm_get_seg_reg(x86_seg_cs, sh_ctxt), linear);
-
-    if ( !okay )
-    {
-        /*
-         * Leave exception injection to the caller for non-user segments: We
-         * neither know the exact error code to be used, nor can we easily
-         * determine the kind of exception (#GP or #TS) in that case.
-         */
-        if ( is_x86_user_segment(seg) )
-            x86_emul_hw_exception(
-                (seg == x86_seg_ss) ?
TRAP_stack_error : TRAP_gp_fault, - 0, &sh_ctxt->ctxt); - return X86EMUL_EXCEPTION; - } - - return 0; -} - -static int -hvm_read(enum x86_segment seg, - unsigned long offset, - void *p_data, - unsigned int bytes, - enum hvm_access_type access_type, - struct sh_emulate_ctxt *sh_ctxt) -{ - pagefault_info_t pfinfo; - unsigned long addr; - int rc; - - rc = hvm_translate_virtual_addr( - seg, offset, bytes, access_type, sh_ctxt, &addr); - if ( rc || !bytes ) - return rc; - - if ( access_type == hvm_access_insn_fetch ) - rc = hvm_fetch_from_guest_linear(p_data, addr, bytes, 0, &pfinfo); - else - rc = hvm_copy_from_guest_linear(p_data, addr, bytes, 0, &pfinfo); - - switch ( rc ) - { - case HVMTRANS_okay: - return X86EMUL_OKAY; - case HVMTRANS_bad_linear_to_gfn: - x86_emul_pagefault(pfinfo.ec, pfinfo.linear, &sh_ctxt->ctxt); - return X86EMUL_EXCEPTION; - case HVMTRANS_bad_gfn_to_mfn: - case HVMTRANS_unhandleable: - return X86EMUL_UNHANDLEABLE; - case HVMTRANS_gfn_paged_out: - case HVMTRANS_gfn_shared: - return X86EMUL_RETRY; - } - - BUG(); - return X86EMUL_UNHANDLEABLE; -} - -static int -hvm_emulate_read(enum x86_segment seg, - unsigned long offset, - void *p_data, - unsigned int bytes, - struct x86_emulate_ctxt *ctxt) -{ - if ( !is_x86_user_segment(seg) ) - return X86EMUL_UNHANDLEABLE; - return hvm_read(seg, offset, p_data, bytes, hvm_access_read, - container_of(ctxt, struct sh_emulate_ctxt, ctxt)); -} - -static int -hvm_emulate_insn_fetch(enum x86_segment seg, - unsigned long offset, - void *p_data, - unsigned int bytes, - struct x86_emulate_ctxt *ctxt) -{ - struct sh_emulate_ctxt *sh_ctxt = - container_of(ctxt, struct sh_emulate_ctxt, ctxt); - unsigned int insn_off = offset - sh_ctxt->insn_buf_eip; - - ASSERT(seg == x86_seg_cs); - - /* Fall back if requested bytes are not in the prefetch cache. */ - if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) ) - return hvm_read(seg, offset, p_data, bytes, - hvm_access_insn_fetch, sh_ctxt); - - /* Hit the cache. Simple memcpy. */ - memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes); - return X86EMUL_OKAY; -} - -static int -hvm_emulate_write(enum x86_segment seg, - unsigned long offset, - void *p_data, - unsigned int bytes, - struct x86_emulate_ctxt *ctxt) -{ - struct sh_emulate_ctxt *sh_ctxt = - container_of(ctxt, struct sh_emulate_ctxt, ctxt); - struct vcpu *v = current; - unsigned long addr; - void *ptr; - int rc; - - /* How many emulations could we save if we unshadowed on stack writes? 
*/ - if ( seg == x86_seg_ss ) - perfc_incr(shadow_fault_emulate_stack); - - rc = hvm_translate_virtual_addr( - seg, offset, bytes, hvm_access_write, sh_ctxt, &addr); - if ( rc || !bytes ) - return rc; - - /* Unaligned writes are only acceptable on HVM */ - if ( (addr & (bytes - 1)) && !is_hvm_vcpu(v) ) - return X86EMUL_UNHANDLEABLE; - - ptr = sh_emulate_map_dest(v, addr, bytes, sh_ctxt); - if ( IS_ERR(ptr) ) - return ~PTR_ERR(ptr); - - paging_lock(v->domain); - memcpy(ptr, p_data, bytes); - - if ( tb_init_done ) - v->arch.paging.mode->shadow.trace_emul_write_val(ptr, addr, - p_data, bytes); - - sh_emulate_unmap_dest(v, ptr, bytes, sh_ctxt); - shadow_audit_tables(v); - paging_unlock(v->domain); - - return X86EMUL_OKAY; -} - -static int -hvm_emulate_cmpxchg(enum x86_segment seg, - unsigned long offset, - void *p_old, - void *p_new, - unsigned int bytes, - bool lock, - struct x86_emulate_ctxt *ctxt) -{ - struct sh_emulate_ctxt *sh_ctxt = - container_of(ctxt, struct sh_emulate_ctxt, ctxt); - struct vcpu *v = current; - unsigned long addr, old, new, prev; - void *ptr; - int rc; - - if ( bytes > sizeof(long) ) - return X86EMUL_UNHANDLEABLE; - - rc = hvm_translate_virtual_addr( - seg, offset, bytes, hvm_access_write, sh_ctxt, &addr); - if ( rc ) - return rc; - - /* Unaligned writes are only acceptable on HVM */ - if ( (addr & (bytes - 1)) && !is_hvm_vcpu(v) ) - return X86EMUL_UNHANDLEABLE; - - ptr = sh_emulate_map_dest(v, addr, bytes, sh_ctxt); - if ( IS_ERR(ptr) ) - return ~PTR_ERR(ptr); - - old = new = 0; - memcpy(&old, p_old, bytes); - memcpy(&new, p_new, bytes); - - paging_lock(v->domain); - switch ( bytes ) - { - case 1: prev = cmpxchg((uint8_t *)ptr, old, new); break; - case 2: prev = cmpxchg((uint16_t *)ptr, old, new); break; - case 4: prev = cmpxchg((uint32_t *)ptr, old, new); break; - case 8: prev = cmpxchg((uint64_t *)ptr, old, new); break; - default: - SHADOW_PRINTK("cmpxchg size %u is not supported\n", bytes); - prev = ~old; - } - - if ( prev != old ) - { - memcpy(p_old, &prev, bytes); - rc = X86EMUL_CMPXCHG_FAILED; - } - - SHADOW_DEBUG(EMULATE, - "va %#lx was %#lx expected %#lx wanted %#lx now %#lx bytes %u\n", - addr, prev, old, new, *(unsigned long *)ptr, bytes); - - sh_emulate_unmap_dest(v, ptr, bytes, sh_ctxt); - shadow_audit_tables(v); - paging_unlock(v->domain); - - return rc; -} - -static const struct x86_emulate_ops hvm_shadow_emulator_ops = { - .read = hvm_emulate_read, - .insn_fetch = hvm_emulate_insn_fetch, - .write = hvm_emulate_write, - .cmpxchg = hvm_emulate_cmpxchg, - .cpuid = hvmemul_cpuid, -}; + unsigned long *linear); #endif const struct x86_emulate_ops *shadow_init_emulation( @@ -1696,281 +1439,6 @@ static unsigned int shadow_get_allocation(struct domain *d) + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0)); } -#if CONFIG_HVM -/**************************************************************************/ -/* Handling guest writes to pagetables. */ - -/* - * Translate a VA to an MFN, injecting a page-fault if we fail. If the - * mapping succeeds, a reference will be held on the underlying page. - */ -#define BAD_GVA_TO_GFN (~0UL) -#define BAD_GFN_TO_MFN (~1UL) -#define READONLY_GFN (~2UL) -static mfn_t emulate_gva_to_mfn(struct vcpu *v, unsigned long vaddr, - struct sh_emulate_ctxt *sh_ctxt) -{ - unsigned long gfn; - struct page_info *page; - mfn_t mfn; - p2m_type_t p2mt; - uint32_t pfec = PFEC_page_present | PFEC_write_access; - - /* Translate the VA to a GFN. 
*/ - gfn = paging_get_hostmode(v)->gva_to_gfn(v, NULL, vaddr, &pfec); - if ( gfn == gfn_x(INVALID_GFN) ) - { - x86_emul_pagefault(pfec, vaddr, &sh_ctxt->ctxt); - - return _mfn(BAD_GVA_TO_GFN); - } - - /* Translate the GFN to an MFN. */ - ASSERT(!paging_locked_by_me(v->domain)); - - page = get_page_from_gfn(v->domain, gfn, &p2mt, P2M_ALLOC); - - /* Sanity checking. */ - if ( page == NULL ) - { - return _mfn(BAD_GFN_TO_MFN); - } - if ( p2m_is_discard_write(p2mt) ) - { - put_page(page); - return _mfn(READONLY_GFN); - } - if ( !p2m_is_ram(p2mt) ) - { - put_page(page); - return _mfn(BAD_GFN_TO_MFN); - } - mfn = page_to_mfn(page); - ASSERT(mfn_valid(mfn)); - - v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn); - - return mfn; -} - -/* - * Check that the user is allowed to perform this write. If a mapping is - * returned, page references will be held on sh_ctxt->mfn[0] and - * sh_ctxt->mfn[1] iff !INVALID_MFN. - */ -static void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr, - unsigned int bytes, - struct sh_emulate_ctxt *sh_ctxt) -{ - struct domain *d = v->domain; - void *map; - -#ifndef NDEBUG - /* We don't emulate user-mode writes to page tables. */ - if ( is_hvm_domain(d) ? hvm_get_cpl(v) == 3 - : !guest_kernel_mode(v, guest_cpu_user_regs()) ) - { - gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached " - "emulate_map_dest(). This should never happen!\n"); - return MAPPING_UNHANDLEABLE; - } -#endif - - sh_ctxt->mfn[0] = emulate_gva_to_mfn(v, vaddr, sh_ctxt); - if ( !mfn_valid(sh_ctxt->mfn[0]) ) - { - switch ( mfn_x(sh_ctxt->mfn[0]) ) - { - case BAD_GVA_TO_GFN: return MAPPING_EXCEPTION; - case READONLY_GFN: return MAPPING_SILENT_FAIL; - default: return MAPPING_UNHANDLEABLE; - } - } - - /* Unaligned writes mean probably this isn't a pagetable. */ - if ( vaddr & (bytes - 1) ) - sh_remove_shadows(d, sh_ctxt->mfn[0], 0, 0 /* Slow, can fail. */ ); - - if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) ) - { - /* Whole write fits on a single page. */ - sh_ctxt->mfn[1] = INVALID_MFN; - map = map_domain_page(sh_ctxt->mfn[0]) + (vaddr & ~PAGE_MASK); - } - else if ( !is_hvm_domain(d) ) - { - /* - * Cross-page emulated writes are only supported for HVM guests; - * PV guests ought to know better. - */ - put_page(mfn_to_page(sh_ctxt->mfn[0])); - return MAPPING_UNHANDLEABLE; - } - else - { - /* This write crosses a page boundary. Translate the second page. */ - sh_ctxt->mfn[1] = emulate_gva_to_mfn( - v, (vaddr + bytes - 1) & PAGE_MASK, sh_ctxt); - if ( !mfn_valid(sh_ctxt->mfn[1]) ) - { - put_page(mfn_to_page(sh_ctxt->mfn[0])); - switch ( mfn_x(sh_ctxt->mfn[1]) ) - { - case BAD_GVA_TO_GFN: return MAPPING_EXCEPTION; - case READONLY_GFN: return MAPPING_SILENT_FAIL; - default: return MAPPING_UNHANDLEABLE; - } - } - - /* Cross-page writes mean probably not a pagetable. */ - sh_remove_shadows(d, sh_ctxt->mfn[1], 0, 0 /* Slow, can fail. */ ); - - map = vmap(sh_ctxt->mfn, 2); - if ( !map ) - { - put_page(mfn_to_page(sh_ctxt->mfn[0])); - put_page(mfn_to_page(sh_ctxt->mfn[1])); - return MAPPING_UNHANDLEABLE; - } - map += (vaddr & ~PAGE_MASK); - } - -#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY) - /* - * Remember if the bottom bit was clear, so we can choose not to run - * the change through the verify code if it's still clear afterwards. 
- */ - sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT); -#endif - - return map; -} - -/* - * Optimization: If we see two emulated writes of zeros to the same - * page-table without another kind of page fault in between, we guess - * that this is a batch of changes (for process destruction) and - * unshadow the page so we don't take a pagefault on every entry. This - * should also make finding writeable mappings of pagetables much - * easier. - * - * Look to see if this is the second emulated write in a row to this - * page, and unshadow if it is. - */ -static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn) -{ -#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW - struct domain *d = v->domain; - - /* - * If the domain has never made a "dying" op, use the two-writes - * heuristic; otherwise, unshadow as soon as we write a zero for a dying - * process. - * - * Don't bother trying to unshadow if it's not a PT, or if it's > l1. - */ - if ( ( v->arch.paging.shadow.pagetable_dying - || ( !d->arch.paging.shadow.pagetable_dying_op - && v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn) ) ) - && sh_mfn_is_a_page_table(gmfn) - && (!d->arch.paging.shadow.pagetable_dying_op || - !(mfn_to_page(gmfn)->shadow_flags - & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64))) ) - { - perfc_incr(shadow_early_unshadow); - sh_remove_shadows(d, gmfn, 1, 0 /* Fast, can fail to unshadow */ ); - TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW); - } - v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn); -#endif -} - -/* - * Tidy up after the emulated write: mark pages dirty, verify the new - * contents, and undo the mapping. - */ -static void sh_emulate_unmap_dest(struct vcpu *v, void *addr, - unsigned int bytes, - struct sh_emulate_ctxt *sh_ctxt) -{ - u32 b1 = bytes, b2 = 0, shflags; - - ASSERT(mfn_valid(sh_ctxt->mfn[0])); - - /* If we are writing lots of PTE-aligned zeros, might want to unshadow */ - if ( likely(bytes >= 4) && (*(u32 *)addr == 0) ) - { - if ( !((unsigned long)addr & (sh_ctxt->pte_size - 1)) ) - check_for_early_unshadow(v, sh_ctxt->mfn[0]); - /* - * Don't reset the heuristic if we're writing zeros at non-aligned - * addresses, otherwise it doesn't catch REP MOVSD on PAE guests. - */ - } - else - sh_reset_early_unshadow(v); - - /* - * We can avoid re-verifying the page contents after the write if: - * - it was no larger than the PTE type of this pagetable; - * - it was aligned to the PTE boundaries; and - * - _PAGE_PRESENT was clear before and after the write. - */ - shflags = mfn_to_page(sh_ctxt->mfn[0])->shadow_flags; -#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY) - if ( sh_ctxt->low_bit_was_clear - && !(*(u8 *)addr & _PAGE_PRESENT) - && ((!(shflags & SHF_32) - /* - * Not shadowed 32-bit: aligned 64-bit writes that leave - * the present bit unset are safe to ignore. - */ - && ((unsigned long)addr & 7) == 0 - && bytes <= 8) - || - (!(shflags & (SHF_PAE|SHF_64)) - /* - * Not shadowed PAE/64-bit: aligned 32-bit writes that - * leave the present bit unset are safe to ignore. - */ - && ((unsigned long)addr & 3) == 0 - && bytes <= 4)) ) - { - /* Writes with this alignment constraint can't possibly cross pages. */ - ASSERT(!mfn_valid(sh_ctxt->mfn[1])); - } - else -#endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */ - { - if ( unlikely(mfn_valid(sh_ctxt->mfn[1])) ) - { - /* Validate as two writes, one to each page. 
*/ - b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK); - b2 = bytes - b1; - ASSERT(b2 < bytes); - } - if ( likely(b1 > 0) ) - sh_validate_guest_pt_write(v, sh_ctxt->mfn[0], addr, b1); - if ( unlikely(b2 > 0) ) - sh_validate_guest_pt_write(v, sh_ctxt->mfn[1], addr + b1, b2); - } - - paging_mark_dirty(v->domain, sh_ctxt->mfn[0]); - put_page(mfn_to_page(sh_ctxt->mfn[0])); - - if ( unlikely(mfn_valid(sh_ctxt->mfn[1])) ) - { - paging_mark_dirty(v->domain, sh_ctxt->mfn[1]); - put_page(mfn_to_page(sh_ctxt->mfn[1])); - vunmap((void *)((unsigned long)addr & PAGE_MASK)); - } - else - unmap_domain_page(addr); - - atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version); -} -#endif - /**************************************************************************/ /* Hash table for storing the guest->shadow mappings. * The table itself is an array of pointers to shadows; the shadows are then diff --git a/xen/arch/x86/mm/shadow/hvm.c b/xen/arch/x86/mm/shadow/hvm.c new file mode 100644 index 0000000..863f644 --- /dev/null +++ b/xen/arch/x86/mm/shadow/hvm.c @@ -0,0 +1,590 @@ + +/****************************************************************************** + * arch/x86/mm/shadow/hvm.c + * + * Shadow code that does not need to be multiply compiled and is HVM only. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; If not, see <http://www.gnu.org/licenses/>. + */ + +#include <xen/types.h> +#include <xen/mm.h> +#include <xen/trace.h> +#include <xen/sched.h> +#include <xen/perfc.h> +#include <xen/irq.h> +#include <xen/domain_page.h> +#include <xen/guest_access.h> +#include <xen/keyhandler.h> +#include <asm/event.h> +#include <asm/page.h> +#include <asm/current.h> +#include <asm/flushtlb.h> +#include <asm/shadow.h> +#include <asm/hvm/ioreq.h> +#include <xen/numa.h> +#include "private.h" + +/**************************************************************************/ +/* x86 emulator support for the shadow code + */ + +/* + * Returns a mapped pointer to write to, or one of the following error + * indicators. + */ +#define MAPPING_UNHANDLEABLE ERR_PTR(~(long)X86EMUL_UNHANDLEABLE) +#define MAPPING_EXCEPTION ERR_PTR(~(long)X86EMUL_EXCEPTION) +#define MAPPING_SILENT_FAIL ERR_PTR(~(long)X86EMUL_OKAY) +static void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr, + unsigned int bytes, + struct sh_emulate_ctxt *sh_ctxt); +static void sh_emulate_unmap_dest(struct vcpu *v, void *addr, + unsigned int bytes, + struct sh_emulate_ctxt *sh_ctxt); + +/* + * Callers which pass a known in-range x86_segment can rely on the return + * pointer being valid. Other callers must explicitly check for errors. 
+ */ +struct segment_register *hvm_get_seg_reg( + enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt) +{ + unsigned int idx = seg; + struct segment_register *seg_reg; + + if ( idx >= ARRAY_SIZE(sh_ctxt->seg_reg) ) + return ERR_PTR(-X86EMUL_UNHANDLEABLE); + + seg_reg = &sh_ctxt->seg_reg[idx]; + if ( !__test_and_set_bit(idx, &sh_ctxt->valid_seg_regs) ) + hvm_get_segment_register(current, idx, seg_reg); + return seg_reg; +} + +int hvm_translate_virtual_addr( + enum x86_segment seg, + unsigned long offset, + unsigned int bytes, + enum hvm_access_type access_type, + struct sh_emulate_ctxt *sh_ctxt, + unsigned long *linear) +{ + const struct segment_register *reg; + int okay; + + reg = hvm_get_seg_reg(seg, sh_ctxt); + if ( IS_ERR(reg) ) + return -PTR_ERR(reg); + + okay = hvm_virtual_to_linear_addr( + seg, reg, offset, bytes, access_type, + hvm_get_seg_reg(x86_seg_cs, sh_ctxt), linear); + + if ( !okay ) + { + /* + * Leave exception injection to the caller for non-user segments: We + * neither know the exact error code to be used, nor can we easily + * determine the kind of exception (#GP or #TS) in that case. + */ + if ( is_x86_user_segment(seg) ) + x86_emul_hw_exception( + (seg == x86_seg_ss) ? TRAP_stack_error : TRAP_gp_fault, + 0, &sh_ctxt->ctxt); + return X86EMUL_EXCEPTION; + } + + return 0; +} + +static int +hvm_read(enum x86_segment seg, + unsigned long offset, + void *p_data, + unsigned int bytes, + enum hvm_access_type access_type, + struct sh_emulate_ctxt *sh_ctxt) +{ + pagefault_info_t pfinfo; + unsigned long addr; + int rc; + + rc = hvm_translate_virtual_addr( + seg, offset, bytes, access_type, sh_ctxt, &addr); + if ( rc || !bytes ) + return rc; + + if ( access_type == hvm_access_insn_fetch ) + rc = hvm_fetch_from_guest_linear(p_data, addr, bytes, 0, &pfinfo); + else + rc = hvm_copy_from_guest_linear(p_data, addr, bytes, 0, &pfinfo); + + switch ( rc ) + { + case HVMTRANS_okay: + return X86EMUL_OKAY; + case HVMTRANS_bad_linear_to_gfn: + x86_emul_pagefault(pfinfo.ec, pfinfo.linear, &sh_ctxt->ctxt); + return X86EMUL_EXCEPTION; + case HVMTRANS_bad_gfn_to_mfn: + case HVMTRANS_unhandleable: + return X86EMUL_UNHANDLEABLE; + case HVMTRANS_gfn_paged_out: + case HVMTRANS_gfn_shared: + return X86EMUL_RETRY; + } + + BUG(); + return X86EMUL_UNHANDLEABLE; +} + +static int +hvm_emulate_read(enum x86_segment seg, + unsigned long offset, + void *p_data, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + if ( !is_x86_user_segment(seg) ) + return X86EMUL_UNHANDLEABLE; + return hvm_read(seg, offset, p_data, bytes, hvm_access_read, + container_of(ctxt, struct sh_emulate_ctxt, ctxt)); +} + +static int +hvm_emulate_insn_fetch(enum x86_segment seg, + unsigned long offset, + void *p_data, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct sh_emulate_ctxt *sh_ctxt = + container_of(ctxt, struct sh_emulate_ctxt, ctxt); + unsigned int insn_off = offset - sh_ctxt->insn_buf_eip; + + ASSERT(seg == x86_seg_cs); + + /* Fall back if requested bytes are not in the prefetch cache. */ + if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) ) + return hvm_read(seg, offset, p_data, bytes, + hvm_access_insn_fetch, sh_ctxt); + + /* Hit the cache. Simple memcpy. 
*/ + memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes); + return X86EMUL_OKAY; +} + +static int +hvm_emulate_write(enum x86_segment seg, + unsigned long offset, + void *p_data, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct sh_emulate_ctxt *sh_ctxt = + container_of(ctxt, struct sh_emulate_ctxt, ctxt); + struct vcpu *v = current; + unsigned long addr; + void *ptr; + int rc; + + /* How many emulations could we save if we unshadowed on stack writes? */ + if ( seg == x86_seg_ss ) + perfc_incr(shadow_fault_emulate_stack); + + rc = hvm_translate_virtual_addr( + seg, offset, bytes, hvm_access_write, sh_ctxt, &addr); + if ( rc || !bytes ) + return rc; + + /* Unaligned writes are only acceptable on HVM */ + if ( (addr & (bytes - 1)) && !is_hvm_vcpu(v) ) + return X86EMUL_UNHANDLEABLE; + + ptr = sh_emulate_map_dest(v, addr, bytes, sh_ctxt); + if ( IS_ERR(ptr) ) + return ~PTR_ERR(ptr); + + paging_lock(v->domain); + memcpy(ptr, p_data, bytes); + + if ( tb_init_done ) + v->arch.paging.mode->shadow.trace_emul_write_val(ptr, addr, + p_data, bytes); + + sh_emulate_unmap_dest(v, ptr, bytes, sh_ctxt); + shadow_audit_tables(v); + paging_unlock(v->domain); + + return X86EMUL_OKAY; +} + +static int +hvm_emulate_cmpxchg(enum x86_segment seg, + unsigned long offset, + void *p_old, + void *p_new, + unsigned int bytes, + bool lock, + struct x86_emulate_ctxt *ctxt) +{ + struct sh_emulate_ctxt *sh_ctxt = + container_of(ctxt, struct sh_emulate_ctxt, ctxt); + struct vcpu *v = current; + unsigned long addr, old, new, prev; + void *ptr; + int rc; + + if ( bytes > sizeof(long) ) + return X86EMUL_UNHANDLEABLE; + + rc = hvm_translate_virtual_addr( + seg, offset, bytes, hvm_access_write, sh_ctxt, &addr); + if ( rc ) + return rc; + + /* Unaligned writes are only acceptable on HVM */ + if ( (addr & (bytes - 1)) && !is_hvm_vcpu(v) ) + return X86EMUL_UNHANDLEABLE; + + ptr = sh_emulate_map_dest(v, addr, bytes, sh_ctxt); + if ( IS_ERR(ptr) ) + return ~PTR_ERR(ptr); + + old = new = 0; + memcpy(&old, p_old, bytes); + memcpy(&new, p_new, bytes); + + paging_lock(v->domain); + switch ( bytes ) + { + case 1: prev = cmpxchg((uint8_t *)ptr, old, new); break; + case 2: prev = cmpxchg((uint16_t *)ptr, old, new); break; + case 4: prev = cmpxchg((uint32_t *)ptr, old, new); break; + case 8: prev = cmpxchg((uint64_t *)ptr, old, new); break; + default: + SHADOW_PRINTK("cmpxchg size %u is not supported\n", bytes); + prev = ~old; + } + + if ( prev != old ) + { + memcpy(p_old, &prev, bytes); + rc = X86EMUL_CMPXCHG_FAILED; + } + + SHADOW_DEBUG(EMULATE, + "va %#lx was %#lx expected %#lx wanted %#lx now %#lx bytes %u\n", + addr, prev, old, new, *(unsigned long *)ptr, bytes); + + sh_emulate_unmap_dest(v, ptr, bytes, sh_ctxt); + shadow_audit_tables(v); + paging_unlock(v->domain); + + return rc; +} + +const struct x86_emulate_ops hvm_shadow_emulator_ops = { + .read = hvm_emulate_read, + .insn_fetch = hvm_emulate_insn_fetch, + .write = hvm_emulate_write, + .cmpxchg = hvm_emulate_cmpxchg, + .cpuid = hvmemul_cpuid, +}; + +/**************************************************************************/ +/* Handling guest writes to pagetables. */ + +/* + * Translate a VA to an MFN, injecting a page-fault if we fail. If the + * mapping succeeds, a reference will be held on the underlying page. 
+ */ +#define BAD_GVA_TO_GFN (~0UL) +#define BAD_GFN_TO_MFN (~1UL) +#define READONLY_GFN (~2UL) +static mfn_t emulate_gva_to_mfn(struct vcpu *v, unsigned long vaddr, + struct sh_emulate_ctxt *sh_ctxt) +{ + unsigned long gfn; + struct page_info *page; + mfn_t mfn; + p2m_type_t p2mt; + uint32_t pfec = PFEC_page_present | PFEC_write_access; + + /* Translate the VA to a GFN. */ + gfn = paging_get_hostmode(v)->gva_to_gfn(v, NULL, vaddr, &pfec); + if ( gfn == gfn_x(INVALID_GFN) ) + { + x86_emul_pagefault(pfec, vaddr, &sh_ctxt->ctxt); + + return _mfn(BAD_GVA_TO_GFN); + } + + /* Translate the GFN to an MFN. */ + ASSERT(!paging_locked_by_me(v->domain)); + + page = get_page_from_gfn(v->domain, gfn, &p2mt, P2M_ALLOC); + + /* Sanity checking. */ + if ( page == NULL ) + { + return _mfn(BAD_GFN_TO_MFN); + } + if ( p2m_is_discard_write(p2mt) ) + { + put_page(page); + return _mfn(READONLY_GFN); + } + if ( !p2m_is_ram(p2mt) ) + { + put_page(page); + return _mfn(BAD_GFN_TO_MFN); + } + mfn = page_to_mfn(page); + ASSERT(mfn_valid(mfn)); + + v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn); + + return mfn; +} + +/* + * Check that the user is allowed to perform this write. If a mapping is + * returned, page references will be held on sh_ctxt->mfn[0] and + * sh_ctxt->mfn[1] iff !INVALID_MFN. + */ +static void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr, + unsigned int bytes, + struct sh_emulate_ctxt *sh_ctxt) +{ + struct domain *d = v->domain; + void *map; + +#ifndef NDEBUG + /* We don't emulate user-mode writes to page tables. */ + if ( is_hvm_domain(d) ? hvm_get_cpl(v) == 3 + : !guest_kernel_mode(v, guest_cpu_user_regs()) ) + { + gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached " + "emulate_map_dest(). This should never happen!\n"); + return MAPPING_UNHANDLEABLE; + } +#endif + + sh_ctxt->mfn[0] = emulate_gva_to_mfn(v, vaddr, sh_ctxt); + if ( !mfn_valid(sh_ctxt->mfn[0]) ) + { + switch ( mfn_x(sh_ctxt->mfn[0]) ) + { + case BAD_GVA_TO_GFN: return MAPPING_EXCEPTION; + case READONLY_GFN: return MAPPING_SILENT_FAIL; + default: return MAPPING_UNHANDLEABLE; + } + } + + /* Unaligned writes mean probably this isn't a pagetable. */ + if ( vaddr & (bytes - 1) ) + sh_remove_shadows(d, sh_ctxt->mfn[0], 0, 0 /* Slow, can fail. */ ); + + if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) ) + { + /* Whole write fits on a single page. */ + sh_ctxt->mfn[1] = INVALID_MFN; + map = map_domain_page(sh_ctxt->mfn[0]) + (vaddr & ~PAGE_MASK); + } + else if ( !is_hvm_domain(d) ) + { + /* + * Cross-page emulated writes are only supported for HVM guests; + * PV guests ought to know better. + */ + put_page(mfn_to_page(sh_ctxt->mfn[0])); + return MAPPING_UNHANDLEABLE; + } + else + { + /* This write crosses a page boundary. Translate the second page. */ + sh_ctxt->mfn[1] = emulate_gva_to_mfn( + v, (vaddr + bytes - 1) & PAGE_MASK, sh_ctxt); + if ( !mfn_valid(sh_ctxt->mfn[1]) ) + { + put_page(mfn_to_page(sh_ctxt->mfn[0])); + switch ( mfn_x(sh_ctxt->mfn[1]) ) + { + case BAD_GVA_TO_GFN: return MAPPING_EXCEPTION; + case READONLY_GFN: return MAPPING_SILENT_FAIL; + default: return MAPPING_UNHANDLEABLE; + } + } + + /* Cross-page writes mean probably not a pagetable. */ + sh_remove_shadows(d, sh_ctxt->mfn[1], 0, 0 /* Slow, can fail. 
*/ ); + + map = vmap(sh_ctxt->mfn, 2); + if ( !map ) + { + put_page(mfn_to_page(sh_ctxt->mfn[0])); + put_page(mfn_to_page(sh_ctxt->mfn[1])); + return MAPPING_UNHANDLEABLE; + } + map += (vaddr & ~PAGE_MASK); + } + +#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY) + /* + * Remember if the bottom bit was clear, so we can choose not to run + * the change through the verify code if it's still clear afterwards. + */ + sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT); +#endif + + return map; +} + +/* + * Optimization: If we see two emulated writes of zeros to the same + * page-table without another kind of page fault in between, we guess + * that this is a batch of changes (for process destruction) and + * unshadow the page so we don't take a pagefault on every entry. This + * should also make finding writeable mappings of pagetables much + * easier. + * + * Look to see if this is the second emulated write in a row to this + * page, and unshadow if it is. + */ +static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn) +{ +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW + struct domain *d = v->domain; + + /* + * If the domain has never made a "dying" op, use the two-writes + * heuristic; otherwise, unshadow as soon as we write a zero for a dying + * process. + * + * Don't bother trying to unshadow if it's not a PT, or if it's > l1. + */ + if ( ( v->arch.paging.shadow.pagetable_dying + || ( !d->arch.paging.shadow.pagetable_dying_op + && v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn) ) ) + && sh_mfn_is_a_page_table(gmfn) + && (!d->arch.paging.shadow.pagetable_dying_op || + !(mfn_to_page(gmfn)->shadow_flags + & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64))) ) + { + perfc_incr(shadow_early_unshadow); + sh_remove_shadows(d, gmfn, 1, 0 /* Fast, can fail to unshadow */ ); + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW); + } + v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn); +#endif +} + +/* + * Tidy up after the emulated write: mark pages dirty, verify the new + * contents, and undo the mapping. + */ +static void sh_emulate_unmap_dest(struct vcpu *v, void *addr, + unsigned int bytes, + struct sh_emulate_ctxt *sh_ctxt) +{ + u32 b1 = bytes, b2 = 0, shflags; + + ASSERT(mfn_valid(sh_ctxt->mfn[0])); + + /* If we are writing lots of PTE-aligned zeros, might want to unshadow */ + if ( likely(bytes >= 4) && (*(u32 *)addr == 0) ) + { + if ( !((unsigned long)addr & (sh_ctxt->pte_size - 1)) ) + check_for_early_unshadow(v, sh_ctxt->mfn[0]); + /* + * Don't reset the heuristic if we're writing zeros at non-aligned + * addresses, otherwise it doesn't catch REP MOVSD on PAE guests. + */ + } + else + sh_reset_early_unshadow(v); + + /* + * We can avoid re-verifying the page contents after the write if: + * - it was no larger than the PTE type of this pagetable; + * - it was aligned to the PTE boundaries; and + * - _PAGE_PRESENT was clear before and after the write. + */ + shflags = mfn_to_page(sh_ctxt->mfn[0])->shadow_flags; +#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY) + if ( sh_ctxt->low_bit_was_clear + && !(*(u8 *)addr & _PAGE_PRESENT) + && ((!(shflags & SHF_32) + /* + * Not shadowed 32-bit: aligned 64-bit writes that leave + * the present bit unset are safe to ignore. + */ + && ((unsigned long)addr & 7) == 0 + && bytes <= 8) + || + (!(shflags & (SHF_PAE|SHF_64)) + /* + * Not shadowed PAE/64-bit: aligned 32-bit writes that + * leave the present bit unset are safe to ignore. 
+ */ + && ((unsigned long)addr & 3) == 0 + && bytes <= 4)) ) + { + /* Writes with this alignment constraint can't possibly cross pages. */ + ASSERT(!mfn_valid(sh_ctxt->mfn[1])); + } + else +#endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */ + { + if ( unlikely(mfn_valid(sh_ctxt->mfn[1])) ) + { + /* Validate as two writes, one to each page. */ + b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK); + b2 = bytes - b1; + ASSERT(b2 < bytes); + } + if ( likely(b1 > 0) ) + sh_validate_guest_pt_write(v, sh_ctxt->mfn[0], addr, b1); + if ( unlikely(b2 > 0) ) + sh_validate_guest_pt_write(v, sh_ctxt->mfn[1], addr + b1, b2); + } + + paging_mark_dirty(v->domain, sh_ctxt->mfn[0]); + put_page(mfn_to_page(sh_ctxt->mfn[0])); + + if ( unlikely(mfn_valid(sh_ctxt->mfn[1])) ) + { + paging_mark_dirty(v->domain, sh_ctxt->mfn[1]); + put_page(mfn_to_page(sh_ctxt->mfn[1])); + vunmap((void *)((unsigned long)addr & PAGE_MASK)); + } + else + unmap_domain_page(addr); + + atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ -- git-series 0.9.1 _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel