
[Xen-changelog] [xen-unstable] x86: Move the guest pagetable walker out of shadow/multi.c



# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1226581328 0
# Node ID 7fb33d15dc9bc5892e4708011beded66dd756be3
# Parent  b87cc4de3ca676e895f6374daed1d33a79849b9d
x86: Move the guest pagetable walker out of shadow/multi.c

Move the guest PT walker into its own file, and purge it of references
to the rest of the shadow code.

Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---
 xen/arch/x86/mm/Makefile         |    6 
 xen/arch/x86/mm/guest_walk.c     |  260 +++++++++++++++++++++++++++++
 xen/arch/x86/mm/shadow/multi.c   |  341 ++-------------------------------------
 xen/include/asm-x86/guest_pt.h   |   89 ++++++++++
 xen/include/asm-x86/perfc_defn.h |    2 
 5 files changed, 378 insertions(+), 320 deletions(-)
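
The walker is now built once per guest paging width: the Makefile rule below compiles guest_walk.c three times with -DGUEST_PAGING_LEVELS=2, 3 and 4, and guest_pt.h token-pastes the level into the symbol name so the three objects can be linked into one binary. As a rough stand-alone sketch of that renaming trick (walk_demo and its printf body are made up for illustration; only the GPT_RENAME macros and the -DGUEST_PAGING_LEVELS switch come from the patch itself):

/* Minimal sketch: one translation unit, compiled once per guest paging
 * width, with token pasting giving each build a distinct symbol. */
#include <stdio.h>

#ifndef GUEST_PAGING_LEVELS
#define GUEST_PAGING_LEVELS 3        /* e.g. passed as -DGUEST_PAGING_LEVELS=3 */
#endif

#define GPT_RENAME2(_n, _l) _n ## _ ## _l ## _levels
#define GPT_RENAME(_n, _l)  GPT_RENAME2(_n, _l)
#define walk_demo GPT_RENAME(walk_demo, GUEST_PAGING_LEVELS)

/* With GUEST_PAGING_LEVELS == 3 this actually defines walk_demo_3_levels(). */
static void walk_demo(void)
{
    printf("compiled for %d-level guest pagetables\n", GUEST_PAGING_LEVELS);
}

int main(void)
{
    walk_demo();   /* expands to walk_demo_3_levels() in this build */
    return 0;
}

Compiling the same file with -DGUEST_PAGING_LEVELS=4 instead yields a distinct symbol, walk_demo_4_levels; this is how guest_walk_tables_2_levels, _3_levels and _4_levels coexist in the hypervisor.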

diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/arch/x86/mm/Makefile
--- a/xen/arch/x86/mm/Makefile  Thu Nov 13 13:01:22 2008 +0000
+++ b/xen/arch/x86/mm/Makefile  Thu Nov 13 13:02:08 2008 +0000
@@ -3,3 +3,9 @@ subdir-y += hap
 
 obj-y += paging.o
 obj-y += p2m.o
+obj-y += guest_walk_2.o
+obj-y += guest_walk_3.o
+obj-$(x86_64) += guest_walk_4.o
+
+guest_walk_%.o: guest_walk.c $(HDRS) Makefile
+       $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/arch/x86/mm/guest_walk.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/guest_walk.c      Thu Nov 13 13:02:08 2008 +0000
@@ -0,0 +1,260 @@
+/******************************************************************************
+ * arch/x86/mm/guest_walk.c
+ *
+ * Pagetable walker for guest memory accesses.
+ *
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/paging.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/page.h>
+#include <asm/guest_pt.h>
+
+
+/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
+static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) 
+{
+    static uint32_t flags[] = {
+        /* I/F -  Usr Wr */
+        /* 0   0   0   0 */ _PAGE_PRESENT, 
+        /* 0   0   0   1 */ _PAGE_PRESENT|_PAGE_RW,
+        /* 0   0   1   0 */ _PAGE_PRESENT|_PAGE_USER,
+        /* 0   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
+        /* 0   1   0   0 */ _PAGE_PRESENT, 
+        /* 0   1   0   1 */ _PAGE_PRESENT|_PAGE_RW,
+        /* 0   1   1   0 */ _PAGE_PRESENT|_PAGE_USER,
+        /* 0   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
+        /* 1   0   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
+        /* 1   0   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
+        /* 1   0   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
+        /* 1   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
+        /* 1   1   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
+        /* 1   1   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
+        /* 1   1   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
+        /* 1   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
+    };
+
+    /* Don't demand not-NX if the CPU wouldn't enforce it. */
+    if ( !guest_supports_nx(v) )
+        pfec &= ~PFEC_insn_fetch;
+
+    /* Don't demand R/W if the CPU wouldn't enforce it. */
+    if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) 
+         && !(pfec & PFEC_user_mode) )
+        pfec &= ~PFEC_write_access;
+
+    return flags[(pfec & 0x1f) >> 1];
+}
+
+/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
+ * Returns non-zero if it actually writes to guest memory. */
+static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
+{
+    guest_intpte_t old, new;
+
+    old = *(guest_intpte_t *)walk_p;
+    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
+    if ( old != new ) 
+    {
+        /* Write the new entry into the walk, and try to write it back
+         * into the guest table as well.  If the guest table has changed
+         * under our feet then leave it alone. */
+        *(guest_intpte_t *)walk_p = new;
+        if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) 
+            return 1;
+    }
+    return 0;
+}
+
+
+/* Walk the guest pagetables, after the manner of a hardware walker. */
+uint32_t
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
+                  uint32_t pfec, mfn_t top_mfn, void *top_map)
+{
+    struct domain *d = v->domain;
+    p2m_type_t p2mt;
+    guest_l1e_t *l1p = NULL;
+    guest_l2e_t *l2p = NULL;
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    guest_l3e_t *l3p = NULL;
+    guest_l4e_t *l4p;
+#endif
+    uint32_t gflags, mflags, rc = 0;
+    int pse;
+
+    perfc_incr(guest_walk);
+    memset(gw, 0, sizeof(*gw));
+    gw->va = va;
+
+    /* Mandatory bits that must be set in every entry.  We invert NX, to
+     * calculate as if there were an "X" bit that allowed access. 
+     * We will accumulate, in rc, the set of flags that are missing. */
+    mflags = mandatory_flags(v, pfec);
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+
+    /* Get the l4e from the top level table and check its flags */
+    gw->l4mfn = top_mfn;
+    l4p = (guest_l4e_t *) top_map;
+    gw->l4e = l4p[guest_l4_table_offset(va)];
+    gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
+    rc |= ((gflags & mflags) ^ mflags);
+    if ( rc & _PAGE_PRESENT ) goto out;
+
+    /* Map the l3 table */
+    gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
+    if ( !p2m_is_ram(p2mt) ) 
+    {
+        rc |= _PAGE_PRESENT;
+        goto out;
+    }
+    ASSERT(mfn_valid(mfn_x(gw->l3mfn)));
+
+    /* Get the l3e and check its flags */
+    l3p = map_domain_page(mfn_x(gw->l3mfn));
+    gw->l3e = l3p[guest_l3_table_offset(va)];
+    gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
+    rc |= ((gflags & mflags) ^ mflags);
+    if ( rc & _PAGE_PRESENT )
+        goto out;
+
+#else /* PAE only... */
+
+    /* Get the l3e and check its flag */
+    gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)];
+    if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) 
+    {
+        rc |= _PAGE_PRESENT;
+        goto out;
+    }
+
+#endif /* PAE or 64... */
+
+    /* Map the l2 table */
+    gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
+    if ( !p2m_is_ram(p2mt) )
+    {
+        rc |= _PAGE_PRESENT;
+        goto out;
+    }
+    ASSERT(mfn_valid(mfn_x(gw->l2mfn)));
+
+    /* Get the l2e */
+    l2p = map_domain_page(mfn_x(gw->l2mfn));
+    gw->l2e = l2p[guest_l2_table_offset(va)];
+
+#else /* 32-bit only... */
+
+    /* Get l2e from the top level table */
+    gw->l2mfn = top_mfn;
+    l2p = (guest_l2e_t *) top_map;
+    gw->l2e = l2p[guest_l2_table_offset(va)];
+
+#endif /* All levels... */
+
+    gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
+    rc |= ((gflags & mflags) ^ mflags);
+    if ( rc & _PAGE_PRESENT )
+        goto out;
+
+    pse = (guest_supports_superpages(v) && 
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); 
+
+    if ( pse )
+    {
+        /* Special case: this guest VA is in a PSE superpage, so there's
+         * no guest l1e.  We make one up so that the propagation code
+         * can generate a shadow l1 table.  Start with the gfn of the 
+         * first 4k-page of the superpage. */
+        gfn_t start = guest_l2e_get_gfn(gw->l2e);
+        /* Grant full access in the l1e, since all the guest entry's 
+         * access controls are enforced in the shadow l2e. */
+        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+                     _PAGE_ACCESSED|_PAGE_DIRTY);
+        /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
+         * of the level 1. */
+        if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) ) 
+            flags |= _PAGE_PAT;
+        /* Copy the cache-control bits to the l1 as well, because we
+         * can't represent PAT in the (non-PSE) shadow l2e. :(
+         * This could cause problems if a guest ever maps an area of
+         * memory with superpages using more than one caching mode. */
+        flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
+        /* Increment the pfn by the right number of 4k pages.  
+         * The ~0x1 is to mask out the PAT bit mentioned above. */
+        start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
+        gw->l1e = guest_l1e_from_gfn(start, flags);
+        gw->l1mfn = _mfn(INVALID_MFN);
+    } 
+    else 
+    {
+        /* Not a superpage: carry on and find the l1e. */
+        gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
+        if ( !p2m_is_ram(p2mt) )
+        {
+            rc |= _PAGE_PRESENT;
+            goto out;
+        }
+        ASSERT(mfn_valid(mfn_x(gw->l1mfn)));
+        l1p = map_domain_page(mfn_x(gw->l1mfn));
+        gw->l1e = l1p[guest_l1_table_offset(va)];
+        gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
+        rc |= ((gflags & mflags) ^ mflags);
+    }
+
+    /* Go back and set accessed and dirty bits only if the walk was a
+     * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
+     * get set whenever a lower-level PT is used, at least some hardware
+     * walkers behave this way. */
+    if ( rc == 0 ) 
+    {
+#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
+        if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
+            paging_mark_dirty(d, mfn_x(gw->l4mfn));
+        if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
+            paging_mark_dirty(d, mfn_x(gw->l3mfn));
+#endif
+        if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
+                         (pse && (pfec & PFEC_write_access))) )
+            paging_mark_dirty(d, mfn_x(gw->l2mfn));            
+        if ( !pse ) 
+        {
+            if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, 
+                             (pfec & PFEC_write_access)) )
+                paging_mark_dirty(d, mfn_x(gw->l1mfn));
+        }
+    }
+
+ out:
+#if GUEST_PAGING_LEVELS == 4
+    if ( l3p ) unmap_domain_page(l3p);
+#endif
+#if GUEST_PAGING_LEVELS >= 3
+    if ( l2p ) unmap_domain_page(l2p);
+#endif
+    if ( l1p ) unmap_domain_page(l1p);
+
+    return rc;
+}
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Thu Nov 13 13:01:22 2008 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c    Thu Nov 13 13:02:08 2008 +0000
@@ -157,95 +157,23 @@ delete_shadow_status(struct vcpu *v, mfn
         put_page(mfn_to_page(gmfn));
 }
 
-/**************************************************************************/
-/* CPU feature support querying */
-
-static inline int
-guest_supports_superpages(struct vcpu *v)
-{
-    /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
-     * CR4.PSE is set or the guest is in PAE or long mode. 
-     * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
-    return (is_hvm_vcpu(v) && 
-            (GUEST_PAGING_LEVELS != 2 
-             || !hvm_paging_enabled(v)
-             || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
-}
-
-static inline int
-guest_supports_nx(struct vcpu *v)
-{
-    if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
-        return 0;
-    if ( !is_hvm_vcpu(v) )
-        return cpu_has_nx;
-    return hvm_nx_enabled(v);
-}
-
 
 /**************************************************************************/
 /* Functions for walking the guest page tables */
 
-/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
-static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) 
-{
-    static uint32_t flags[] = {
-        /* I/F -  Usr Wr */
-        /* 0   0   0   0 */ _PAGE_PRESENT, 
-        /* 0   0   0   1 */ _PAGE_PRESENT|_PAGE_RW,
-        /* 0   0   1   0 */ _PAGE_PRESENT|_PAGE_USER,
-        /* 0   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
-        /* 0   1   0   0 */ _PAGE_PRESENT, 
-        /* 0   1   0   1 */ _PAGE_PRESENT|_PAGE_RW,
-        /* 0   1   1   0 */ _PAGE_PRESENT|_PAGE_USER,
-        /* 0   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
-        /* 1   0   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
-        /* 1   0   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
-        /* 1   0   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
-        /* 1   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
-        /* 1   1   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
-        /* 1   1   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
-        /* 1   1   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
-        /* 1   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
-    };
-
-    /* Don't demand not-NX if the CPU wouldn't enforce it. */
-    if ( !guest_supports_nx(v) )
-        pfec &= ~PFEC_insn_fetch;
-
-    /* Don't demand R/W if the CPU wouldn't enforce it. */
-    if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) 
-         && !(pfec & PFEC_user_mode) )
-        pfec &= ~PFEC_write_access;
-
-    return flags[(pfec & 0x1f) >> 1];
-}
-
-/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
- * Returns non-zero if it actually writes to guest memory. */
-static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
-{
-    guest_intpte_t old, new;
-    int ret = 0;
-
-    old = *(guest_intpte_t *)walk_p;
-    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
-    if ( old != new ) 
-    {
-        /* Write the new entry into the walk, and try to write it back
-         * into the guest table as well.  If the guest table has changed
-         * under out feet then leave it alone. */
-        *(guest_intpte_t *)walk_p = new;
-        if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) 
-            ret = 1;
-
-        /* FIXME -- this code is longer than necessary */
-        if(set_dirty)
-            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD);
-        else
-            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A);
-    }
-    return ret;
+static inline uint32_t
+sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
+                     uint32_t pfec)
+{
+    return guest_walk_tables(v, va, gw, pfec, 
+#if GUEST_PAGING_LEVELS == 3 /* PAE */
+                             _mfn(INVALID_MFN),
+                             v->arch.paging.shadow.gl3e
+#else /* 32 or 64 */
+                             pagetable_get_mfn(v->arch.guest_table),
+                             v->arch.paging.shadow.guest_vtable
+#endif
+                             );
 }
 
 /* This validation is called with lock held, and after write permission
@@ -364,236 +292,6 @@ gw_remove_write_accesses(struct vcpu *v,
     return rc;
 }
 
-/* Walk the guest pagetables, after the manner of a hardware walker. 
- *
- * Inputs: a vcpu, a virtual address, a walk_t to fill, a 
- *         pointer to a pagefault code
- * 
- * We walk the vcpu's guest pagetables, filling the walk_t with what we
- * see and adding any Accessed and Dirty bits that are needed in the
- * guest entries.  Using the pagefault code, we check the permissions as
- * we go.  For the purposes of reading pagetables we treat all non-RAM
- * memory as contining zeroes.
- * 
- * The walk is done in a lock-free style, with some sanity check postponed
- * after grabbing shadow lock later. Those delayed checks will make sure
- * no inconsistent mapping being translated into shadow page table.
- * 
- * Returns 0 for success, or the set of permission bits that we failed on 
- * if the walk did not complete.
- * N.B. This is different from the old return code but almost no callers
- * checked the old return code anyway.
- */
-static uint32_t
-guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
-{
-    struct domain *d = v->domain;
-    p2m_type_t p2mt;
-    guest_l1e_t *l1p = NULL;
-    guest_l2e_t *l2p = NULL;
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-    guest_l3e_t *l3p = NULL;
-    guest_l4e_t *l4p;
-#endif
-    uint32_t gflags, mflags, rc = 0;
-    int pse;
-
-    perfc_incr(shadow_guest_walk);
-    memset(gw, 0, sizeof(*gw));
-    gw->va = va;
-
-    /* Mandatory bits that must be set in every entry.  We invert NX, to
-     * calculate as if there were an "X" bit that allowed access. 
-     * We will accumulate, in rc, the set of flags that are missing. */
-    mflags = mandatory_flags(v, pfec);
-
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-
-    /* Get the l4e from the top level table and check its flags*/
-    gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
-    l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
-    gw->l4e = l4p[guest_l4_table_offset(va)];
-    gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
-    rc |= ((gflags & mflags) ^ mflags);
-    if ( rc & _PAGE_PRESENT ) goto out;
-
-    /* Map the l3 table */
-    gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
-    if ( !p2m_is_ram(p2mt) ) 
-    {
-        rc |= _PAGE_PRESENT;
-        goto out;
-    }
-    ASSERT(mfn_valid(gw->l3mfn));
-
-    /* Get the l3e and check its flags*/
-    l3p = sh_map_domain_page(gw->l3mfn);
-    gw->l3e = l3p[guest_l3_table_offset(va)];
-    gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
-    rc |= ((gflags & mflags) ^ mflags);
-    if ( rc & _PAGE_PRESENT )
-        goto out;
-
-#else /* PAE only... */
-
-    /* Get l3e from the cache of the top level table and check its flag */
-    gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
-    if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) 
-    {
-        rc |= _PAGE_PRESENT;
-        goto out;
-    }
-
-#endif /* PAE or 64... */
-
-    /* Map the l2 table */
-    gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
-    if ( !p2m_is_ram(p2mt) )
-    {
-        rc |= _PAGE_PRESENT;
-        goto out;
-    }
-    ASSERT(mfn_valid(gw->l2mfn));
-
-    /* Get the l2e */
-    l2p = sh_map_domain_page(gw->l2mfn);
-    gw->l2e = l2p[guest_l2_table_offset(va)];
-
-#else /* 32-bit only... */
-
-    /* Get l2e from the top level table */
-    gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
-    l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
-    gw->l2e = l2p[guest_l2_table_offset(va)];
-
-#endif /* All levels... */
-
-    gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
-    rc |= ((gflags & mflags) ^ mflags);
-    if ( rc & _PAGE_PRESENT )
-        goto out;
-
-    pse = (guest_supports_superpages(v) && 
-           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); 
-
-    if ( pse )
-    {
-        /* Special case: this guest VA is in a PSE superpage, so there's
-         * no guest l1e.  We make one up so that the propagation code
-         * can generate a shadow l1 table.  Start with the gfn of the 
-         * first 4k-page of the superpage. */
-        gfn_t start = guest_l2e_get_gfn(gw->l2e);
-        /* Grant full access in the l1e, since all the guest entry's 
-         * access controls are enforced in the shadow l2e. */
-        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
-                     _PAGE_ACCESSED|_PAGE_DIRTY);
-        /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
-         * of the level 1. */
-        if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) ) 
-            flags |= _PAGE_PAT;
-        /* Copy the cache-control bits to the l1 as well, because we
-         * can't represent PAT in the (non-PSE) shadow l2e. :(
-         * This could cause problems if a guest ever maps an area of
-         * memory with superpages using more than one caching mode. */
-        flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
-        /* Increment the pfn by the right number of 4k pages.  
-         * The ~0x1 is to mask out the PAT bit mentioned above. */
-        start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
-        gw->l1e = guest_l1e_from_gfn(start, flags);
-        gw->l1mfn = _mfn(INVALID_MFN);
-    } 
-    else 
-    {
-        /* Not a superpage: carry on and find the l1e. */
-        gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
-        if ( !p2m_is_ram(p2mt) )
-        {
-            rc |= _PAGE_PRESENT;
-            goto out;
-        }
-        ASSERT(mfn_valid(gw->l1mfn));
-        l1p = sh_map_domain_page(gw->l1mfn);
-        gw->l1e = l1p[guest_l1_table_offset(va)];
-        gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
-        rc |= ((gflags & mflags) ^ mflags);
-    }
-
-    /* Go back and set accessed and dirty bits only if the walk was a
-     * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
-     * get set whenever a lower-level PT is used, at least some hardware
-     * walkers behave this way. */
-    if ( rc == 0 ) 
-    {
-#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
-        if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
-            paging_mark_dirty(d, mfn_x(gw->l4mfn));
-        if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
-            paging_mark_dirty(d, mfn_x(gw->l3mfn));
-#endif
-        if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
-                         (pse && (pfec & PFEC_write_access))) )
-            paging_mark_dirty(d, mfn_x(gw->l2mfn));            
-        if ( !pse ) 
-        {
-            if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, 
-                             (pfec & PFEC_write_access)) )
-                paging_mark_dirty(d, mfn_x(gw->l1mfn));
-        }
-    }
-
- out:
-#if GUEST_PAGING_LEVELS == 4
-    if ( l3p ) sh_unmap_domain_page(l3p);
-#endif
-#if GUEST_PAGING_LEVELS >= 3
-    if ( l2p ) sh_unmap_domain_page(l2p);
-#endif
-    if ( l1p ) sh_unmap_domain_page(l1p);
-
-    return rc;
-}
-
-/* Given a walk_t, translate the gw->va into the guest's notion of the
- * corresponding frame number. */
-static inline gfn_t
-guest_walk_to_gfn(walk_t *gw)
-{
-    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
-        return _gfn(INVALID_GFN);
-    return guest_l1e_get_gfn(gw->l1e);
-}
-
-/* Given a walk_t, translate the gw->va into the guest's notion of the
- * corresponding physical address. */
-static inline paddr_t
-guest_walk_to_gpa(walk_t *gw)
-{
-    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
-        return 0;
-    return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
-}
-
-#if 0 /* Keep for debugging */
-/* Pretty-print the contents of a guest-walk */
-static inline void print_gw(walk_t *gw)
-{
-    SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-    SHADOW_PRINTK("   l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
-    SHADOW_PRINTK("   l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
-    SHADOW_PRINTK("   l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
-#endif /* PAE or 64... */
-    SHADOW_PRINTK("   l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
-#endif /* All levels... */
-    SHADOW_PRINTK("   l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
-    SHADOW_PRINTK("   l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
-    SHADOW_PRINTK("   l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
-    SHADOW_PRINTK("   l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
-}
-#endif /* 0 */
-
 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
 /* Lightweight audit: pass all the shadows associated with this guest walk
  * through the audit mechanisms */
@@ -654,7 +352,7 @@ sh_guest_map_l1e(struct vcpu *v, unsigne
     // XXX -- this is expensive, but it's easy to cobble together...
     // FIXME!
 
-    if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0 
+    if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0 
          && mfn_valid(gw.l1mfn) )
     {
         if ( gl1mfn )
@@ -676,7 +374,7 @@ sh_guest_get_eff_l1e(struct vcpu *v, uns
     // XXX -- this is expensive, but it's easy to cobble together...
     // FIXME!
 
-    (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
+    (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present);
     *(guest_l1e_t *)eff_l1e = gw.l1e;
 }
 #endif /* CONFIG == GUEST (== SHADOW) */
@@ -3314,9 +3012,14 @@ static int sh_page_fault(struct vcpu *v,
     }
 
  rewalk:
+
+    /* The walk is done in a lock-free style, with some sanity checks
+     * postponed until after the shadow lock is taken later.  Those
+     * delayed checks will make sure that no inconsistent mapping is
+     * translated into the shadow page tables. */
     version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
     rmb();
-    rc = guest_walk_tables(v, va, &gw, regs->error_code);
+    rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
     regs->error_code &= ~PFEC_page_present;
@@ -3869,7 +3572,7 @@ sh_gva_to_gfn(struct vcpu *v, unsigned l
         return vtlb_gfn;
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
 
-    if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
+    if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 )
     {
         if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
             pfec[0] &= ~PFEC_page_present;
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/include/asm-x86/guest_pt.h
--- a/xen/include/asm-x86/guest_pt.h    Thu Nov 13 13:01:22 2008 +0000
+++ b/xen/include/asm-x86/guest_pt.h    Thu Nov 13 13:02:08 2008 +0000
@@ -174,6 +174,32 @@ static inline guest_l4e_t guest_l4e_from
 #endif /* GUEST_PAGING_LEVELS != 2 */
 
 
+/* Which pagetable features are supported on this vcpu? */
+
+static inline int
+guest_supports_superpages(struct vcpu *v)
+{
+    /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
+     * CR4.PSE is set or the guest is in PAE or long mode. 
+     * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
+    return (is_hvm_vcpu(v) && 
+            (GUEST_PAGING_LEVELS != 2 
+             || !hvm_paging_enabled(v)
+             || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
+}
+
+static inline int
+guest_supports_nx(struct vcpu *v)
+{
+    if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
+        return 0;
+    if ( !is_hvm_vcpu(v) )
+        return cpu_has_nx;
+    return hvm_nx_enabled(v);
+}
+
+
+
 /* Type used for recording a walk through guest pagetables.  It is
  * filled in by the pagetable walk function, and also used as a cache
  * for later walks.  When we encounter a superpage l2e, we fabricate an
@@ -199,4 +225,67 @@ struct guest_pagetable_walk
     mfn_t l1mfn;                /* MFN that the level 1 entry was in */
 };
 
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding frame number. */
+static inline gfn_t
+guest_walk_to_gfn(walk_t *gw)
+{
+    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
+        return _gfn(INVALID_GFN);
+    return guest_l1e_get_gfn(gw->l1e);
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding physical address. */
+static inline paddr_t
+guest_walk_to_gpa(walk_t *gw)
+{
+    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
+        return 0;
+    return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
+}
+
+/* Walk the guest pagetables, after the manner of a hardware walker. 
+ *
+ * Inputs: a vcpu, a virtual address, a walk_t to fill, a 
+ *         pointer to a pagefault code, the MFN of the guest's 
+ *         top-level pagetable, and a mapping of the 
+ *         guest's top-level pagetable.
+ * 
+ * We walk the vcpu's guest pagetables, filling the walk_t with what we
+ * see and adding any Accessed and Dirty bits that are needed in the
+ * guest entries.  Using the pagefault code, we check the permissions as
+ * we go.  For the purposes of reading pagetables we treat all non-RAM
+ * memory as containing zeroes.
+ * 
+ * Returns 0 for success, or the set of permission bits that we failed on 
+ * if the walk did not complete. */
+
+/* Macro-fu so you can call guest_walk_tables() and get the right one. */
+#define GPT_RENAME2(_n, _l) _n ## _ ## _l ## _levels
+#define GPT_RENAME(_n, _l) GPT_RENAME2(_n, _l)
+#define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS)
+
+extern uint32_t 
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
+                  uint32_t pfec, mfn_t top_mfn, void *top_map);
+
+/* Pretty-print the contents of a guest-walk */
+static inline void print_gw(walk_t *gw)
+{
+    gdprintk(XENLOG_INFO, "GUEST WALK TO %#lx:\n", gw->va);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    gdprintk(XENLOG_INFO, "   l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
+    gdprintk(XENLOG_INFO, "   l4e=%" PRI_gpte "\n", gw->l4e.l4);
+    gdprintk(XENLOG_INFO, "   l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
+#endif /* PAE or 64... */
+    gdprintk(XENLOG_INFO, "   l3e=%" PRI_gpte "\n", gw->l3e.l3);
+#endif /* All levels... */
+    gdprintk(XENLOG_INFO, "   l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
+    gdprintk(XENLOG_INFO, "   l2e=%" PRI_gpte "\n", gw->l2e.l2);
+    gdprintk(XENLOG_INFO, "   l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
+    gdprintk(XENLOG_INFO, "   l1e=%" PRI_gpte "\n", gw->l1e.l1);
+}
+
 #endif /* _XEN_ASM_GUEST_PT_H */
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Thu Nov 13 13:01:22 2008 +0000
+++ b/xen/include/asm-x86/perfc_defn.h  Thu Nov 13 13:02:08 2008 +0000
@@ -33,6 +33,7 @@ PERFCOUNTER(ptwr_emulations,        "wri
 
 PERFCOUNTER(exception_fixed,        "pre-exception fixed")
 
+PERFCOUNTER(guest_walk,            "guest pagetable walks")
 
 /* Shadow counters */
 PERFCOUNTER(shadow_alloc,          "calls to shadow_alloc")
@@ -92,7 +93,6 @@ PERFCOUNTER(shadow_up_pointer,     "shad
 PERFCOUNTER(shadow_up_pointer,     "shadow unshadow by up-pointer")
 PERFCOUNTER(shadow_unshadow_bf,    "shadow unshadow brute-force")
 PERFCOUNTER(shadow_get_page_fail,  "shadow_get_page_from_l1e failed")
-PERFCOUNTER(shadow_guest_walk,     "shadow walks guest tables")
 PERFCOUNTER(shadow_check_gwalk,    "shadow checks gwalk")
 PERFCOUNTER(shadow_inconsistent_gwalk, "shadow check inconsistent gwalk")
 PERFCOUNTER(shadow_rm_write_flush_tlb,

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 

