
[Xen-devel] [PATCH 26/34] x86/mm/shadow: split out HVM only code



Move the code previously enclosed in CONFIG_HVM guards into its own file.

Note that although some of the code explicitly checks is_hvm_*, which
hints that it could also be used for PV, I can't find a code path where
that would be the case.

Signed-off-by: Wei Liu <wei.liu2@xxxxxxxxxx>
---
Can be squashed into the previous patch if that's preferable.
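
For reviewers' convenience, below is a small standalone sketch (not part of
the patch) of the MAPPING_* error-pointer encoding that travels with the
moved code.  ERR_PTR()/PTR_ERR()/IS_ERR() here are simplified stand-ins for
Xen's err.h helpers and the X86EMUL_* values are illustrative; the real
definitions are only moved by this patch, not changed.

    #include <stdio.h>

    /* Simplified stand-ins for Xen's error-pointer helpers. */
    #define ERR_PTR(x)  ((void *)(long)(x))
    #define PTR_ERR(p)  ((long)(p))
    #define IS_ERR(p)   ((unsigned long)(p) >= (unsigned long)-4095)

    /* Illustrative values; the real ones come from the emulator headers. */
    #define X86EMUL_OKAY          0
    #define X86EMUL_UNHANDLEABLE  1
    #define X86EMUL_EXCEPTION     2

    #define MAPPING_UNHANDLEABLE ERR_PTR(~(long)X86EMUL_UNHANDLEABLE)
    #define MAPPING_EXCEPTION    ERR_PTR(~(long)X86EMUL_EXCEPTION)
    #define MAPPING_SILENT_FAIL  ERR_PTR(~(long)X86EMUL_OKAY)

    int main(void)
    {
        /* What a failing sh_emulate_map_dest() would hand back. */
        void *ptr = MAPPING_EXCEPTION;

        if ( IS_ERR(ptr) )
            /* ~PTR_ERR() undoes the ~ in the MAPPING_* definitions: prints 2. */
            printf("emulator rc = %ld\n", ~PTR_ERR(ptr));

        return 0;
    }

This is only meant to make the "return ~PTR_ERR(ptr);" paths in
hvm_emulate_write() and hvm_emulate_cmpxchg() easier to review.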
---
 xen/arch/x86/mm/shadow/Makefile |   1 +-
 xen/arch/x86/mm/shadow/common.c | 542 +------------------------------
 xen/arch/x86/mm/shadow/hvm.c    | 590 +++++++++++++++++++++++++++++++++-
 3 files changed, 596 insertions(+), 537 deletions(-)
 create mode 100644 xen/arch/x86/mm/shadow/hvm.c

diff --git a/xen/arch/x86/mm/shadow/Makefile b/xen/arch/x86/mm/shadow/Makefile
index bcb23d2..72658f3 100644
--- a/xen/arch/x86/mm/shadow/Makefile
+++ b/xen/arch/x86/mm/shadow/Makefile
@@ -1,5 +1,6 @@
 ifeq ($(CONFIG_SHADOW_PAGING),y)
 obj-y += common.o guest_2.o guest_3.o guest_4.o
+obj-$(CONFIG_HVM) += hvm.o
 else
 obj-y += none.o
 endif
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index 4381538..aa94416 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -114,273 +114,16 @@ __initcall(shadow_audit_key_init);
 
 
 #if CONFIG_HVM
-/**************************************************************************/
-/* x86 emulator support for the shadow code
- */
-
-/*
- * Returns a mapped pointer to write to, or one of the following error
- * indicators.
- */
-#define MAPPING_UNHANDLEABLE ERR_PTR(~(long)X86EMUL_UNHANDLEABLE)
-#define MAPPING_EXCEPTION    ERR_PTR(~(long)X86EMUL_EXCEPTION)
-#define MAPPING_SILENT_FAIL  ERR_PTR(~(long)X86EMUL_OKAY)
-static void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr,
-                                 unsigned int bytes,
-                                 struct sh_emulate_ctxt *sh_ctxt);
-static void sh_emulate_unmap_dest(struct vcpu *v, void *addr,
-                                  unsigned int bytes,
-                                  struct sh_emulate_ctxt *sh_ctxt);
-
-/*
- * Callers which pass a known in-range x86_segment can rely on the return
- * pointer being valid.  Other callers must explicitly check for errors.
- */
-static struct segment_register *hvm_get_seg_reg(
-    enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
-{
-    unsigned int idx = seg;
-    struct segment_register *seg_reg;
-
-    if ( idx >= ARRAY_SIZE(sh_ctxt->seg_reg) )
-        return ERR_PTR(-X86EMUL_UNHANDLEABLE);
-
-    seg_reg = &sh_ctxt->seg_reg[idx];
-    if ( !__test_and_set_bit(idx, &sh_ctxt->valid_seg_regs) )
-        hvm_get_segment_register(current, idx, seg_reg);
-    return seg_reg;
-}
-
-static int hvm_translate_virtual_addr(
+extern const struct x86_emulate_ops hvm_shadow_emulator_ops;
+extern struct segment_register *hvm_get_seg_reg(
+    enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt);
+extern int hvm_translate_virtual_addr(
     enum x86_segment seg,
     unsigned long offset,
     unsigned int bytes,
     enum hvm_access_type access_type,
     struct sh_emulate_ctxt *sh_ctxt,
-    unsigned long *linear)
-{
-    const struct segment_register *reg;
-    int okay;
-
-    reg = hvm_get_seg_reg(seg, sh_ctxt);
-    if ( IS_ERR(reg) )
-        return -PTR_ERR(reg);
-
-    okay = hvm_virtual_to_linear_addr(
-        seg, reg, offset, bytes, access_type,
-        hvm_get_seg_reg(x86_seg_cs, sh_ctxt), linear);
-
-    if ( !okay )
-    {
-        /*
-         * Leave exception injection to the caller for non-user segments: We
-         * neither know the exact error code to be used, nor can we easily
-         * determine the kind of exception (#GP or #TS) in that case.
-         */
-        if ( is_x86_user_segment(seg) )
-            x86_emul_hw_exception(
-                (seg == x86_seg_ss) ? TRAP_stack_error : TRAP_gp_fault,
-                0, &sh_ctxt->ctxt);
-        return X86EMUL_EXCEPTION;
-    }
-
-    return 0;
-}
-
-static int
-hvm_read(enum x86_segment seg,
-         unsigned long offset,
-         void *p_data,
-         unsigned int bytes,
-         enum hvm_access_type access_type,
-         struct sh_emulate_ctxt *sh_ctxt)
-{
-    pagefault_info_t pfinfo;
-    unsigned long addr;
-    int rc;
-
-    rc = hvm_translate_virtual_addr(
-        seg, offset, bytes, access_type, sh_ctxt, &addr);
-    if ( rc || !bytes )
-        return rc;
-
-    if ( access_type == hvm_access_insn_fetch )
-        rc = hvm_fetch_from_guest_linear(p_data, addr, bytes, 0, &pfinfo);
-    else
-        rc = hvm_copy_from_guest_linear(p_data, addr, bytes, 0, &pfinfo);
-
-    switch ( rc )
-    {
-    case HVMTRANS_okay:
-        return X86EMUL_OKAY;
-    case HVMTRANS_bad_linear_to_gfn:
-        x86_emul_pagefault(pfinfo.ec, pfinfo.linear, &sh_ctxt->ctxt);
-        return X86EMUL_EXCEPTION;
-    case HVMTRANS_bad_gfn_to_mfn:
-    case HVMTRANS_unhandleable:
-        return X86EMUL_UNHANDLEABLE;
-    case HVMTRANS_gfn_paged_out:
-    case HVMTRANS_gfn_shared:
-        return X86EMUL_RETRY;
-    }
-
-    BUG();
-    return X86EMUL_UNHANDLEABLE;
-}
-
-static int
-hvm_emulate_read(enum x86_segment seg,
-                 unsigned long offset,
-                 void *p_data,
-                 unsigned int bytes,
-                 struct x86_emulate_ctxt *ctxt)
-{
-    if ( !is_x86_user_segment(seg) )
-        return X86EMUL_UNHANDLEABLE;
-    return hvm_read(seg, offset, p_data, bytes, hvm_access_read,
-                    container_of(ctxt, struct sh_emulate_ctxt, ctxt));
-}
-
-static int
-hvm_emulate_insn_fetch(enum x86_segment seg,
-                       unsigned long offset,
-                       void *p_data,
-                       unsigned int bytes,
-                       struct x86_emulate_ctxt *ctxt)
-{
-    struct sh_emulate_ctxt *sh_ctxt =
-        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
-    unsigned int insn_off = offset - sh_ctxt->insn_buf_eip;
-
-    ASSERT(seg == x86_seg_cs);
-
-    /* Fall back if requested bytes are not in the prefetch cache. */
-    if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
-        return hvm_read(seg, offset, p_data, bytes,
-                        hvm_access_insn_fetch, sh_ctxt);
-
-    /* Hit the cache. Simple memcpy. */
-    memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes);
-    return X86EMUL_OKAY;
-}
-
-static int
-hvm_emulate_write(enum x86_segment seg,
-                  unsigned long offset,
-                  void *p_data,
-                  unsigned int bytes,
-                  struct x86_emulate_ctxt *ctxt)
-{
-    struct sh_emulate_ctxt *sh_ctxt =
-        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
-    struct vcpu *v = current;
-    unsigned long addr;
-    void *ptr;
-    int rc;
-
-    /* How many emulations could we save if we unshadowed on stack writes? */
-    if ( seg == x86_seg_ss )
-        perfc_incr(shadow_fault_emulate_stack);
-
-    rc = hvm_translate_virtual_addr(
-        seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
-    if ( rc || !bytes )
-        return rc;
-
-    /* Unaligned writes are only acceptable on HVM */
-    if ( (addr & (bytes - 1)) && !is_hvm_vcpu(v)  )
-        return X86EMUL_UNHANDLEABLE;
-
-    ptr = sh_emulate_map_dest(v, addr, bytes, sh_ctxt);
-    if ( IS_ERR(ptr) )
-        return ~PTR_ERR(ptr);
-
-    paging_lock(v->domain);
-    memcpy(ptr, p_data, bytes);
-
-    if ( tb_init_done )
-        v->arch.paging.mode->shadow.trace_emul_write_val(ptr, addr,
-                                                         p_data, bytes);
-
-    sh_emulate_unmap_dest(v, ptr, bytes, sh_ctxt);
-    shadow_audit_tables(v);
-    paging_unlock(v->domain);
-
-    return X86EMUL_OKAY;
-}
-
-static int
-hvm_emulate_cmpxchg(enum x86_segment seg,
-                    unsigned long offset,
-                    void *p_old,
-                    void *p_new,
-                    unsigned int bytes,
-                    bool lock,
-                    struct x86_emulate_ctxt *ctxt)
-{
-    struct sh_emulate_ctxt *sh_ctxt =
-        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
-    struct vcpu *v = current;
-    unsigned long addr, old, new, prev;
-    void *ptr;
-    int rc;
-
-    if ( bytes > sizeof(long) )
-        return X86EMUL_UNHANDLEABLE;
-
-    rc = hvm_translate_virtual_addr(
-        seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
-    if ( rc )
-        return rc;
-
-    /* Unaligned writes are only acceptable on HVM */
-    if ( (addr & (bytes - 1)) && !is_hvm_vcpu(v)  )
-        return X86EMUL_UNHANDLEABLE;
-
-    ptr = sh_emulate_map_dest(v, addr, bytes, sh_ctxt);
-    if ( IS_ERR(ptr) )
-        return ~PTR_ERR(ptr);
-
-    old = new = 0;
-    memcpy(&old, p_old, bytes);
-    memcpy(&new, p_new, bytes);
-
-    paging_lock(v->domain);
-    switch ( bytes )
-    {
-    case 1: prev = cmpxchg((uint8_t  *)ptr, old, new); break;
-    case 2: prev = cmpxchg((uint16_t *)ptr, old, new); break;
-    case 4: prev = cmpxchg((uint32_t *)ptr, old, new); break;
-    case 8: prev = cmpxchg((uint64_t *)ptr, old, new); break;
-    default:
-        SHADOW_PRINTK("cmpxchg size %u is not supported\n", bytes);
-        prev = ~old;
-    }
-
-    if ( prev != old )
-    {
-        memcpy(p_old, &prev, bytes);
-        rc = X86EMUL_CMPXCHG_FAILED;
-    }
-
-    SHADOW_DEBUG(EMULATE,
-                 "va %#lx was %#lx expected %#lx wanted %#lx now %#lx bytes %u\n",
-                 addr, prev, old, new, *(unsigned long *)ptr, bytes);
-
-    sh_emulate_unmap_dest(v, ptr, bytes, sh_ctxt);
-    shadow_audit_tables(v);
-    paging_unlock(v->domain);
-
-    return rc;
-}
-
-static const struct x86_emulate_ops hvm_shadow_emulator_ops = {
-    .read       = hvm_emulate_read,
-    .insn_fetch = hvm_emulate_insn_fetch,
-    .write      = hvm_emulate_write,
-    .cmpxchg    = hvm_emulate_cmpxchg,
-    .cpuid      = hvmemul_cpuid,
-};
+    unsigned long *linear);
 #endif
 
 const struct x86_emulate_ops *shadow_init_emulation(
@@ -1696,281 +1439,6 @@ static unsigned int shadow_get_allocation(struct domain *d)
             + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
 }
 
-#if CONFIG_HVM
-/**************************************************************************/
-/* Handling guest writes to pagetables. */
-
-/*
- * Translate a VA to an MFN, injecting a page-fault if we fail.  If the
- * mapping succeeds, a reference will be held on the underlying page.
- */
-#define BAD_GVA_TO_GFN (~0UL)
-#define BAD_GFN_TO_MFN (~1UL)
-#define READONLY_GFN   (~2UL)
-static mfn_t emulate_gva_to_mfn(struct vcpu *v, unsigned long vaddr,
-                                struct sh_emulate_ctxt *sh_ctxt)
-{
-    unsigned long gfn;
-    struct page_info *page;
-    mfn_t mfn;
-    p2m_type_t p2mt;
-    uint32_t pfec = PFEC_page_present | PFEC_write_access;
-
-    /* Translate the VA to a GFN. */
-    gfn = paging_get_hostmode(v)->gva_to_gfn(v, NULL, vaddr, &pfec);
-    if ( gfn == gfn_x(INVALID_GFN) )
-    {
-        x86_emul_pagefault(pfec, vaddr, &sh_ctxt->ctxt);
-
-        return _mfn(BAD_GVA_TO_GFN);
-    }
-
-    /* Translate the GFN to an MFN. */
-    ASSERT(!paging_locked_by_me(v->domain));
-
-    page = get_page_from_gfn(v->domain, gfn, &p2mt, P2M_ALLOC);
-
-    /* Sanity checking. */
-    if ( page == NULL )
-    {
-        return _mfn(BAD_GFN_TO_MFN);
-    }
-    if ( p2m_is_discard_write(p2mt) )
-    {
-        put_page(page);
-        return _mfn(READONLY_GFN);
-    }
-    if ( !p2m_is_ram(p2mt) )
-    {
-        put_page(page);
-        return _mfn(BAD_GFN_TO_MFN);
-    }
-    mfn = page_to_mfn(page);
-    ASSERT(mfn_valid(mfn));
-
-    v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
-
-    return mfn;
-}
-
-/*
- * Check that the user is allowed to perform this write.  If a mapping is
- * returned, page references will be held on sh_ctxt->mfn[0] and
- * sh_ctxt->mfn[1] iff !INVALID_MFN.
- */
-static void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr,
-                                 unsigned int bytes,
-                                 struct sh_emulate_ctxt *sh_ctxt)
-{
-    struct domain *d = v->domain;
-    void *map;
-
-#ifndef NDEBUG
-    /* We don't emulate user-mode writes to page tables. */
-    if ( is_hvm_domain(d) ? hvm_get_cpl(v) == 3
-                          : !guest_kernel_mode(v, guest_cpu_user_regs()) )
-    {
-        gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached "
-                 "emulate_map_dest(). This should never happen!\n");
-        return MAPPING_UNHANDLEABLE;
-    }
-#endif
-
-    sh_ctxt->mfn[0] = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
-    if ( !mfn_valid(sh_ctxt->mfn[0]) )
-    {
-        switch ( mfn_x(sh_ctxt->mfn[0]) )
-        {
-        case BAD_GVA_TO_GFN: return MAPPING_EXCEPTION;
-        case READONLY_GFN:   return MAPPING_SILENT_FAIL;
-        default:             return MAPPING_UNHANDLEABLE;
-        }
-    }
-
-    /* Unaligned writes mean probably this isn't a pagetable. */
-    if ( vaddr & (bytes - 1) )
-        sh_remove_shadows(d, sh_ctxt->mfn[0], 0, 0 /* Slow, can fail. */ );
-
-    if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
-    {
-        /* Whole write fits on a single page. */
-        sh_ctxt->mfn[1] = INVALID_MFN;
-        map = map_domain_page(sh_ctxt->mfn[0]) + (vaddr & ~PAGE_MASK);
-    }
-    else if ( !is_hvm_domain(d) )
-    {
-        /*
-         * Cross-page emulated writes are only supported for HVM guests;
-         * PV guests ought to know better.
-         */
-        put_page(mfn_to_page(sh_ctxt->mfn[0]));
-        return MAPPING_UNHANDLEABLE;
-    }
-    else
-    {
-        /* This write crosses a page boundary. Translate the second page. */
-        sh_ctxt->mfn[1] = emulate_gva_to_mfn(
-            v, (vaddr + bytes - 1) & PAGE_MASK, sh_ctxt);
-        if ( !mfn_valid(sh_ctxt->mfn[1]) )
-        {
-            put_page(mfn_to_page(sh_ctxt->mfn[0]));
-            switch ( mfn_x(sh_ctxt->mfn[1]) )
-            {
-            case BAD_GVA_TO_GFN: return MAPPING_EXCEPTION;
-            case READONLY_GFN:   return MAPPING_SILENT_FAIL;
-            default:             return MAPPING_UNHANDLEABLE;
-            }
-        }
-
-        /* Cross-page writes mean probably not a pagetable. */
-        sh_remove_shadows(d, sh_ctxt->mfn[1], 0, 0 /* Slow, can fail. */ );
-
-        map = vmap(sh_ctxt->mfn, 2);
-        if ( !map )
-        {
-            put_page(mfn_to_page(sh_ctxt->mfn[0]));
-            put_page(mfn_to_page(sh_ctxt->mfn[1]));
-            return MAPPING_UNHANDLEABLE;
-        }
-        map += (vaddr & ~PAGE_MASK);
-    }
-
-#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
-    /*
-     * Remember if the bottom bit was clear, so we can choose not to run
-     * the change through the verify code if it's still clear afterwards.
-     */
-    sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
-#endif
-
-    return map;
-}
-
-/*
- * Optimization: If we see two emulated writes of zeros to the same
- * page-table without another kind of page fault in between, we guess
- * that this is a batch of changes (for process destruction) and
- * unshadow the page so we don't take a pagefault on every entry.  This
- * should also make finding writeable mappings of pagetables much
- * easier.
- *
- * Look to see if this is the second emulated write in a row to this
- * page, and unshadow if it is.
- */
-static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
-{
-#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
-    struct domain *d = v->domain;
-
-    /*
-     * If the domain has never made a "dying" op, use the two-writes
-     * heuristic; otherwise, unshadow as soon as we write a zero for a dying
-     * process.
-     *
-     * Don't bother trying to unshadow if it's not a PT, or if it's > l1.
-     */
-    if ( ( v->arch.paging.shadow.pagetable_dying
-           || ( !d->arch.paging.shadow.pagetable_dying_op
-                && v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn) ) )
-         && sh_mfn_is_a_page_table(gmfn)
-         && (!d->arch.paging.shadow.pagetable_dying_op ||
-             !(mfn_to_page(gmfn)->shadow_flags
-               & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64))) )
-    {
-        perfc_incr(shadow_early_unshadow);
-        sh_remove_shadows(d, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
-        TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
-    }
-    v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
-#endif
-}
-
-/*
- * Tidy up after the emulated write: mark pages dirty, verify the new
- * contents, and undo the mapping.
- */
-static void sh_emulate_unmap_dest(struct vcpu *v, void *addr,
-                                  unsigned int bytes,
-                                  struct sh_emulate_ctxt *sh_ctxt)
-{
-    u32 b1 = bytes, b2 = 0, shflags;
-
-    ASSERT(mfn_valid(sh_ctxt->mfn[0]));
-
-    /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
-    if ( likely(bytes >= 4) && (*(u32 *)addr == 0) )
-    {
-        if ( !((unsigned long)addr & (sh_ctxt->pte_size - 1)) )
-            check_for_early_unshadow(v, sh_ctxt->mfn[0]);
-        /*
-         * Don't reset the heuristic if we're writing zeros at non-aligned
-         * addresses, otherwise it doesn't catch REP MOVSD on PAE guests.
-         */
-    }
-    else
-        sh_reset_early_unshadow(v);
-
-    /*
-     * We can avoid re-verifying the page contents after the write if:
-     *  - it was no larger than the PTE type of this pagetable;
-     *  - it was aligned to the PTE boundaries; and
-     *  - _PAGE_PRESENT was clear before and after the write.
-     */
-    shflags = mfn_to_page(sh_ctxt->mfn[0])->shadow_flags;
-#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
-    if ( sh_ctxt->low_bit_was_clear
-         && !(*(u8 *)addr & _PAGE_PRESENT)
-         && ((!(shflags & SHF_32)
-              /*
-               * Not shadowed 32-bit: aligned 64-bit writes that leave
-               * the present bit unset are safe to ignore.
-               */
-              && ((unsigned long)addr & 7) == 0
-              && bytes <= 8)
-             ||
-             (!(shflags & (SHF_PAE|SHF_64))
-              /*
-               * Not shadowed PAE/64-bit: aligned 32-bit writes that
-               * leave the present bit unset are safe to ignore.
-               */
-              && ((unsigned long)addr & 3) == 0
-              && bytes <= 4)) )
-    {
-        /* Writes with this alignment constraint can't possibly cross pages. */
-        ASSERT(!mfn_valid(sh_ctxt->mfn[1]));
-    }
-    else
-#endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
-    {
-        if ( unlikely(mfn_valid(sh_ctxt->mfn[1])) )
-        {
-            /* Validate as two writes, one to each page. */
-            b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
-            b2 = bytes - b1;
-            ASSERT(b2 < bytes);
-        }
-        if ( likely(b1 > 0) )
-            sh_validate_guest_pt_write(v, sh_ctxt->mfn[0], addr, b1);
-        if ( unlikely(b2 > 0) )
-            sh_validate_guest_pt_write(v, sh_ctxt->mfn[1], addr + b1, b2);
-    }
-
-    paging_mark_dirty(v->domain, sh_ctxt->mfn[0]);
-    put_page(mfn_to_page(sh_ctxt->mfn[0]));
-
-    if ( unlikely(mfn_valid(sh_ctxt->mfn[1])) )
-    {
-        paging_mark_dirty(v->domain, sh_ctxt->mfn[1]);
-        put_page(mfn_to_page(sh_ctxt->mfn[1]));
-        vunmap((void *)((unsigned long)addr & PAGE_MASK));
-    }
-    else
-        unmap_domain_page(addr);
-
-    atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
-}
-#endif
-
 /**************************************************************************/
 /* Hash table for storing the guest->shadow mappings.
  * The table itself is an array of pointers to shadows; the shadows are then
diff --git a/xen/arch/x86/mm/shadow/hvm.c b/xen/arch/x86/mm/shadow/hvm.c
new file mode 100644
index 0000000..863f644
--- /dev/null
+++ b/xen/arch/x86/mm/shadow/hvm.c
@@ -0,0 +1,590 @@
+
+/******************************************************************************
+ * arch/x86/mm/shadow/hvm.c
+ *
+ * Shadow code that does not need to be multiply compiled and is HVM only.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/irq.h>
+#include <xen/domain_page.h>
+#include <xen/guest_access.h>
+#include <xen/keyhandler.h>
+#include <asm/event.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/flushtlb.h>
+#include <asm/shadow.h>
+#include <asm/hvm/ioreq.h>
+#include <xen/numa.h>
+#include "private.h"
+
+/**************************************************************************/
+/* x86 emulator support for the shadow code
+ */
+
+/*
+ * Returns a mapped pointer to write to, or one of the following error
+ * indicators.
+ */
+#define MAPPING_UNHANDLEABLE ERR_PTR(~(long)X86EMUL_UNHANDLEABLE)
+#define MAPPING_EXCEPTION    ERR_PTR(~(long)X86EMUL_EXCEPTION)
+#define MAPPING_SILENT_FAIL  ERR_PTR(~(long)X86EMUL_OKAY)
+static void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr,
+                                 unsigned int bytes,
+                                 struct sh_emulate_ctxt *sh_ctxt);
+static void sh_emulate_unmap_dest(struct vcpu *v, void *addr,
+                                  unsigned int bytes,
+                                  struct sh_emulate_ctxt *sh_ctxt);
+
+/*
+ * Callers which pass a known in-range x86_segment can rely on the return
+ * pointer being valid.  Other callers must explicitly check for errors.
+ */
+struct segment_register *hvm_get_seg_reg(
+    enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
+{
+    unsigned int idx = seg;
+    struct segment_register *seg_reg;
+
+    if ( idx >= ARRAY_SIZE(sh_ctxt->seg_reg) )
+        return ERR_PTR(-X86EMUL_UNHANDLEABLE);
+
+    seg_reg = &sh_ctxt->seg_reg[idx];
+    if ( !__test_and_set_bit(idx, &sh_ctxt->valid_seg_regs) )
+        hvm_get_segment_register(current, idx, seg_reg);
+    return seg_reg;
+}
+
+int hvm_translate_virtual_addr(
+    enum x86_segment seg,
+    unsigned long offset,
+    unsigned int bytes,
+    enum hvm_access_type access_type,
+    struct sh_emulate_ctxt *sh_ctxt,
+    unsigned long *linear)
+{
+    const struct segment_register *reg;
+    int okay;
+
+    reg = hvm_get_seg_reg(seg, sh_ctxt);
+    if ( IS_ERR(reg) )
+        return -PTR_ERR(reg);
+
+    okay = hvm_virtual_to_linear_addr(
+        seg, reg, offset, bytes, access_type,
+        hvm_get_seg_reg(x86_seg_cs, sh_ctxt), linear);
+
+    if ( !okay )
+    {
+        /*
+         * Leave exception injection to the caller for non-user segments: We
+         * neither know the exact error code to be used, nor can we easily
+         * determine the kind of exception (#GP or #TS) in that case.
+         */
+        if ( is_x86_user_segment(seg) )
+            x86_emul_hw_exception(
+                (seg == x86_seg_ss) ? TRAP_stack_error : TRAP_gp_fault,
+                0, &sh_ctxt->ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    return 0;
+}
+
+static int
+hvm_read(enum x86_segment seg,
+         unsigned long offset,
+         void *p_data,
+         unsigned int bytes,
+         enum hvm_access_type access_type,
+         struct sh_emulate_ctxt *sh_ctxt)
+{
+    pagefault_info_t pfinfo;
+    unsigned long addr;
+    int rc;
+
+    rc = hvm_translate_virtual_addr(
+        seg, offset, bytes, access_type, sh_ctxt, &addr);
+    if ( rc || !bytes )
+        return rc;
+
+    if ( access_type == hvm_access_insn_fetch )
+        rc = hvm_fetch_from_guest_linear(p_data, addr, bytes, 0, &pfinfo);
+    else
+        rc = hvm_copy_from_guest_linear(p_data, addr, bytes, 0, &pfinfo);
+
+    switch ( rc )
+    {
+    case HVMTRANS_okay:
+        return X86EMUL_OKAY;
+    case HVMTRANS_bad_linear_to_gfn:
+        x86_emul_pagefault(pfinfo.ec, pfinfo.linear, &sh_ctxt->ctxt);
+        return X86EMUL_EXCEPTION;
+    case HVMTRANS_bad_gfn_to_mfn:
+    case HVMTRANS_unhandleable:
+        return X86EMUL_UNHANDLEABLE;
+    case HVMTRANS_gfn_paged_out:
+    case HVMTRANS_gfn_shared:
+        return X86EMUL_RETRY;
+    }
+
+    BUG();
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static int
+hvm_emulate_read(enum x86_segment seg,
+                 unsigned long offset,
+                 void *p_data,
+                 unsigned int bytes,
+                 struct x86_emulate_ctxt *ctxt)
+{
+    if ( !is_x86_user_segment(seg) )
+        return X86EMUL_UNHANDLEABLE;
+    return hvm_read(seg, offset, p_data, bytes, hvm_access_read,
+                    container_of(ctxt, struct sh_emulate_ctxt, ctxt));
+}
+
+static int
+hvm_emulate_insn_fetch(enum x86_segment seg,
+                       unsigned long offset,
+                       void *p_data,
+                       unsigned int bytes,
+                       struct x86_emulate_ctxt *ctxt)
+{
+    struct sh_emulate_ctxt *sh_ctxt =
+        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
+    unsigned int insn_off = offset - sh_ctxt->insn_buf_eip;
+
+    ASSERT(seg == x86_seg_cs);
+
+    /* Fall back if requested bytes are not in the prefetch cache. */
+    if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
+        return hvm_read(seg, offset, p_data, bytes,
+                        hvm_access_insn_fetch, sh_ctxt);
+
+    /* Hit the cache. Simple memcpy. */
+    memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes);
+    return X86EMUL_OKAY;
+}
+
+static int
+hvm_emulate_write(enum x86_segment seg,
+                  unsigned long offset,
+                  void *p_data,
+                  unsigned int bytes,
+                  struct x86_emulate_ctxt *ctxt)
+{
+    struct sh_emulate_ctxt *sh_ctxt =
+        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
+    struct vcpu *v = current;
+    unsigned long addr;
+    void *ptr;
+    int rc;
+
+    /* How many emulations could we save if we unshadowed on stack writes? */
+    if ( seg == x86_seg_ss )
+        perfc_incr(shadow_fault_emulate_stack);
+
+    rc = hvm_translate_virtual_addr(
+        seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
+    if ( rc || !bytes )
+        return rc;
+
+    /* Unaligned writes are only acceptable on HVM */
+    if ( (addr & (bytes - 1)) && !is_hvm_vcpu(v)  )
+        return X86EMUL_UNHANDLEABLE;
+
+    ptr = sh_emulate_map_dest(v, addr, bytes, sh_ctxt);
+    if ( IS_ERR(ptr) )
+        return ~PTR_ERR(ptr);
+
+    paging_lock(v->domain);
+    memcpy(ptr, p_data, bytes);
+
+    if ( tb_init_done )
+        v->arch.paging.mode->shadow.trace_emul_write_val(ptr, addr,
+                                                         p_data, bytes);
+
+    sh_emulate_unmap_dest(v, ptr, bytes, sh_ctxt);
+    shadow_audit_tables(v);
+    paging_unlock(v->domain);
+
+    return X86EMUL_OKAY;
+}
+
+static int
+hvm_emulate_cmpxchg(enum x86_segment seg,
+                    unsigned long offset,
+                    void *p_old,
+                    void *p_new,
+                    unsigned int bytes,
+                    bool lock,
+                    struct x86_emulate_ctxt *ctxt)
+{
+    struct sh_emulate_ctxt *sh_ctxt =
+        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
+    struct vcpu *v = current;
+    unsigned long addr, old, new, prev;
+    void *ptr;
+    int rc;
+
+    if ( bytes > sizeof(long) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = hvm_translate_virtual_addr(
+        seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
+    if ( rc )
+        return rc;
+
+    /* Unaligned writes are only acceptable on HVM */
+    if ( (addr & (bytes - 1)) && !is_hvm_vcpu(v)  )
+        return X86EMUL_UNHANDLEABLE;
+
+    ptr = sh_emulate_map_dest(v, addr, bytes, sh_ctxt);
+    if ( IS_ERR(ptr) )
+        return ~PTR_ERR(ptr);
+
+    old = new = 0;
+    memcpy(&old, p_old, bytes);
+    memcpy(&new, p_new, bytes);
+
+    paging_lock(v->domain);
+    switch ( bytes )
+    {
+    case 1: prev = cmpxchg((uint8_t  *)ptr, old, new); break;
+    case 2: prev = cmpxchg((uint16_t *)ptr, old, new); break;
+    case 4: prev = cmpxchg((uint32_t *)ptr, old, new); break;
+    case 8: prev = cmpxchg((uint64_t *)ptr, old, new); break;
+    default:
+        SHADOW_PRINTK("cmpxchg size %u is not supported\n", bytes);
+        prev = ~old;
+    }
+
+    if ( prev != old )
+    {
+        memcpy(p_old, &prev, bytes);
+        rc = X86EMUL_CMPXCHG_FAILED;
+    }
+
+    SHADOW_DEBUG(EMULATE,
+                 "va %#lx was %#lx expected %#lx wanted %#lx now %#lx bytes %u\n",
+                 addr, prev, old, new, *(unsigned long *)ptr, bytes);
+
+    sh_emulate_unmap_dest(v, ptr, bytes, sh_ctxt);
+    shadow_audit_tables(v);
+    paging_unlock(v->domain);
+
+    return rc;
+}
+
+const struct x86_emulate_ops hvm_shadow_emulator_ops = {
+    .read       = hvm_emulate_read,
+    .insn_fetch = hvm_emulate_insn_fetch,
+    .write      = hvm_emulate_write,
+    .cmpxchg    = hvm_emulate_cmpxchg,
+    .cpuid      = hvmemul_cpuid,
+};
+
+/**************************************************************************/
+/* Handling guest writes to pagetables. */
+
+/*
+ * Translate a VA to an MFN, injecting a page-fault if we fail.  If the
+ * mapping succeeds, a reference will be held on the underlying page.
+ */
+#define BAD_GVA_TO_GFN (~0UL)
+#define BAD_GFN_TO_MFN (~1UL)
+#define READONLY_GFN   (~2UL)
+static mfn_t emulate_gva_to_mfn(struct vcpu *v, unsigned long vaddr,
+                                struct sh_emulate_ctxt *sh_ctxt)
+{
+    unsigned long gfn;
+    struct page_info *page;
+    mfn_t mfn;
+    p2m_type_t p2mt;
+    uint32_t pfec = PFEC_page_present | PFEC_write_access;
+
+    /* Translate the VA to a GFN. */
+    gfn = paging_get_hostmode(v)->gva_to_gfn(v, NULL, vaddr, &pfec);
+    if ( gfn == gfn_x(INVALID_GFN) )
+    {
+        x86_emul_pagefault(pfec, vaddr, &sh_ctxt->ctxt);
+
+        return _mfn(BAD_GVA_TO_GFN);
+    }
+
+    /* Translate the GFN to an MFN. */
+    ASSERT(!paging_locked_by_me(v->domain));
+
+    page = get_page_from_gfn(v->domain, gfn, &p2mt, P2M_ALLOC);
+
+    /* Sanity checking. */
+    if ( page == NULL )
+    {
+        return _mfn(BAD_GFN_TO_MFN);
+    }
+    if ( p2m_is_discard_write(p2mt) )
+    {
+        put_page(page);
+        return _mfn(READONLY_GFN);
+    }
+    if ( !p2m_is_ram(p2mt) )
+    {
+        put_page(page);
+        return _mfn(BAD_GFN_TO_MFN);
+    }
+    mfn = page_to_mfn(page);
+    ASSERT(mfn_valid(mfn));
+
+    v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
+
+    return mfn;
+}
+
+/*
+ * Check that the user is allowed to perform this write.  If a mapping is
+ * returned, page references will be held on sh_ctxt->mfn[0] and
+ * sh_ctxt->mfn[1] iff !INVALID_MFN.
+ */
+static void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr,
+                                 unsigned int bytes,
+                                 struct sh_emulate_ctxt *sh_ctxt)
+{
+    struct domain *d = v->domain;
+    void *map;
+
+#ifndef NDEBUG
+    /* We don't emulate user-mode writes to page tables. */
+    if ( is_hvm_domain(d) ? hvm_get_cpl(v) == 3
+                          : !guest_kernel_mode(v, guest_cpu_user_regs()) )
+    {
+        gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached "
+                 "emulate_map_dest(). This should never happen!\n");
+        return MAPPING_UNHANDLEABLE;
+    }
+#endif
+
+    sh_ctxt->mfn[0] = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
+    if ( !mfn_valid(sh_ctxt->mfn[0]) )
+    {
+        switch ( mfn_x(sh_ctxt->mfn[0]) )
+        {
+        case BAD_GVA_TO_GFN: return MAPPING_EXCEPTION;
+        case READONLY_GFN:   return MAPPING_SILENT_FAIL;
+        default:             return MAPPING_UNHANDLEABLE;
+        }
+    }
+
+    /* Unaligned writes mean probably this isn't a pagetable. */
+    if ( vaddr & (bytes - 1) )
+        sh_remove_shadows(d, sh_ctxt->mfn[0], 0, 0 /* Slow, can fail. */ );
+
+    if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
+    {
+        /* Whole write fits on a single page. */
+        sh_ctxt->mfn[1] = INVALID_MFN;
+        map = map_domain_page(sh_ctxt->mfn[0]) + (vaddr & ~PAGE_MASK);
+    }
+    else if ( !is_hvm_domain(d) )
+    {
+        /*
+         * Cross-page emulated writes are only supported for HVM guests;
+         * PV guests ought to know better.
+         */
+        put_page(mfn_to_page(sh_ctxt->mfn[0]));
+        return MAPPING_UNHANDLEABLE;
+    }
+    else
+    {
+        /* This write crosses a page boundary. Translate the second page. */
+        sh_ctxt->mfn[1] = emulate_gva_to_mfn(
+            v, (vaddr + bytes - 1) & PAGE_MASK, sh_ctxt);
+        if ( !mfn_valid(sh_ctxt->mfn[1]) )
+        {
+            put_page(mfn_to_page(sh_ctxt->mfn[0]));
+            switch ( mfn_x(sh_ctxt->mfn[1]) )
+            {
+            case BAD_GVA_TO_GFN: return MAPPING_EXCEPTION;
+            case READONLY_GFN:   return MAPPING_SILENT_FAIL;
+            default:             return MAPPING_UNHANDLEABLE;
+            }
+        }
+
+        /* Cross-page writes mean probably not a pagetable. */
+        sh_remove_shadows(d, sh_ctxt->mfn[1], 0, 0 /* Slow, can fail. */ );
+
+        map = vmap(sh_ctxt->mfn, 2);
+        if ( !map )
+        {
+            put_page(mfn_to_page(sh_ctxt->mfn[0]));
+            put_page(mfn_to_page(sh_ctxt->mfn[1]));
+            return MAPPING_UNHANDLEABLE;
+        }
+        map += (vaddr & ~PAGE_MASK);
+    }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
+    /*
+     * Remember if the bottom bit was clear, so we can choose not to run
+     * the change through the verify code if it's still clear afterwards.
+     */
+    sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
+#endif
+
+    return map;
+}
+
+/*
+ * Optimization: If we see two emulated writes of zeros to the same
+ * page-table without another kind of page fault in between, we guess
+ * that this is a batch of changes (for process destruction) and
+ * unshadow the page so we don't take a pagefault on every entry.  This
+ * should also make finding writeable mappings of pagetables much
+ * easier.
+ *
+ * Look to see if this is the second emulated write in a row to this
+ * page, and unshadow if it is.
+ */
+static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
+{
+#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
+    struct domain *d = v->domain;
+
+    /*
+     * If the domain has never made a "dying" op, use the two-writes
+     * heuristic; otherwise, unshadow as soon as we write a zero for a dying
+     * process.
+     *
+     * Don't bother trying to unshadow if it's not a PT, or if it's > l1.
+     */
+    if ( ( v->arch.paging.shadow.pagetable_dying
+           || ( !d->arch.paging.shadow.pagetable_dying_op
+                && v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn) ) )
+         && sh_mfn_is_a_page_table(gmfn)
+         && (!d->arch.paging.shadow.pagetable_dying_op ||
+             !(mfn_to_page(gmfn)->shadow_flags
+               & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64))) )
+    {
+        perfc_incr(shadow_early_unshadow);
+        sh_remove_shadows(d, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
+        TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
+    }
+    v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
+#endif
+}
+
+/*
+ * Tidy up after the emulated write: mark pages dirty, verify the new
+ * contents, and undo the mapping.
+ */
+static void sh_emulate_unmap_dest(struct vcpu *v, void *addr,
+                                  unsigned int bytes,
+                                  struct sh_emulate_ctxt *sh_ctxt)
+{
+    u32 b1 = bytes, b2 = 0, shflags;
+
+    ASSERT(mfn_valid(sh_ctxt->mfn[0]));
+
+    /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
+    if ( likely(bytes >= 4) && (*(u32 *)addr == 0) )
+    {
+        if ( !((unsigned long)addr & (sh_ctxt->pte_size - 1)) )
+            check_for_early_unshadow(v, sh_ctxt->mfn[0]);
+        /*
+         * Don't reset the heuristic if we're writing zeros at non-aligned
+         * addresses, otherwise it doesn't catch REP MOVSD on PAE guests.
+         */
+    }
+    else
+        sh_reset_early_unshadow(v);
+
+    /*
+     * We can avoid re-verifying the page contents after the write if:
+     *  - it was no larger than the PTE type of this pagetable;
+     *  - it was aligned to the PTE boundaries; and
+     *  - _PAGE_PRESENT was clear before and after the write.
+     */
+    shflags = mfn_to_page(sh_ctxt->mfn[0])->shadow_flags;
+#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
+    if ( sh_ctxt->low_bit_was_clear
+         && !(*(u8 *)addr & _PAGE_PRESENT)
+         && ((!(shflags & SHF_32)
+              /*
+               * Not shadowed 32-bit: aligned 64-bit writes that leave
+               * the present bit unset are safe to ignore.
+               */
+              && ((unsigned long)addr & 7) == 0
+              && bytes <= 8)
+             ||
+             (!(shflags & (SHF_PAE|SHF_64))
+              /*
+               * Not shadowed PAE/64-bit: aligned 32-bit writes that
+               * leave the present bit unset are safe to ignore.
+               */
+              && ((unsigned long)addr & 3) == 0
+              && bytes <= 4)) )
+    {
+        /* Writes with this alignment constraint can't possibly cross pages. */
+        ASSERT(!mfn_valid(sh_ctxt->mfn[1]));
+    }
+    else
+#endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
+    {
+        if ( unlikely(mfn_valid(sh_ctxt->mfn[1])) )
+        {
+            /* Validate as two writes, one to each page. */
+            b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
+            b2 = bytes - b1;
+            ASSERT(b2 < bytes);
+        }
+        if ( likely(b1 > 0) )
+            sh_validate_guest_pt_write(v, sh_ctxt->mfn[0], addr, b1);
+        if ( unlikely(b2 > 0) )
+            sh_validate_guest_pt_write(v, sh_ctxt->mfn[1], addr + b1, b2);
+    }
+
+    paging_mark_dirty(v->domain, sh_ctxt->mfn[0]);
+    put_page(mfn_to_page(sh_ctxt->mfn[0]));
+
+    if ( unlikely(mfn_valid(sh_ctxt->mfn[1])) )
+    {
+        paging_mark_dirty(v->domain, sh_ctxt->mfn[1]);
+        put_page(mfn_to_page(sh_ctxt->mfn[1]));
+        vunmap((void *)((unsigned long)addr & PAGE_MASK));
+    }
+    else
+        unmap_domain_page(addr);
+
+    atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
-- 
git-series 0.9.1

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel
