
[Xen-changelog] [xen-unstable] x86 shadow: Remove lock on first guest table walk.



# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1204204709 0
# Node ID 8612d3d9578adb8d1304aa67b0eafd65f38e6d09
# Parent  6b875abd0a9ecccb5c26ed6eeef676ef29281ca6
x86 shadow: Remove lock on first guest table walk.

The existing shadow fault path grabs the big shadow lock before
walking the guest tables, to keep the walk consistent with the
shadow content in case a badly-behaved OS changes the tables
concurrently from another vcpu.  But the lock causes heavy
contention as vcpu counts scale up, even for a well-behaved guest
that already prevents such concurrent changes.  So this patch
removes the lock around the first guest table walk and instead
performs delayed consistency checks at a few specific points.

The key is to detect whether any guest table update happened
between the first walk and the point where the shadow lock is
taken.  Two events are used as hints of a guest table update:
    * write permission removal
    * write emulation
If either operation is observed within this race window, the
earlier walk result may be stale and must be re-checked.  On a
mismatch, simply return and let another fault be triggered, as
sketched below.
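
For illustration, here is a minimal sketch of the ordering this patch
introduces on the fault path, condensed from the sh_page_fault()
changes below (simplified; audits, perf counters and most error
handling omitted):

    /* Illustrative sketch only, not the literal patched code. */
    static int fault_path_sketch(struct vcpu *v, unsigned long va,
                                 walk_t *gw, uint32_t error_code)
    {
        struct domain *d = v->domain;

        /* 1. Lock-free walk.  guest_walk_tables() snapshots
         *    gtable_dirty_version into gw->version before reading. */
        if ( guest_walk_tables(v, va, gw, error_code) != 0 )
            return 0;            /* genuine guest fault, no lock taken */

        shadow_lock(d);

        /* 2. Removing write access hints that overlapping walks may be
         *    stale: bump the version and flush remote TLBs in one go. */
        if ( gw_remove_write_accesses(v, va, gw) )
        {
            atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
            flush_tlb_mask(d->domain_dirty_cpumask);
        }

        /* 3. Re-validate the walk under the lock.  If the version moved
         *    and any re-read entry mismatches, just return and let the
         *    guest fault again.  (Write emulation bumps the version in
         *    emulate_unmap_dest() for the same reason.) */
        if ( !shadow_check_gwalk(v, va, gw) )
        {
            shadow_unlock(d);
            return EXCRET_fault_fixed;
        }

        /* ... build or update the shadow entries as before ... */
        shadow_unlock(d);
        return EXCRET_fault_fixed;
    }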

I ran some experiments sampling the perf counters:
<64bit guest>
3.7% of gwalks are re-checked;
of those re-checks, 68% come from write permission removal
<32bit pae guest>
7.2% of gwalks are re-checked;
of those re-checks, 54.9% come from write permission removal

The earlier fast-emulation optimisation already skips many guest
table walks, so the ratios above would be smaller if measured
against the total shadow fault count.

Shadow promotion with write permission removal does incur higher
overhead, but the benefit of reduced lock contention outweighs it.

Kernel-compile improvement with this patch:
(64bit Xen)
32bit guest: 1.1%
pae guest:   0.4%
64bit guest: 0.5%

Signed-off-by: Kevin Tian <kevin.tian@xxxxxxxxx>
---
 xen/arch/x86/mm/shadow/multi.c   |  172 +++++++++++++++++++++++++++++----------
 xen/arch/x86/mm/shadow/types.h   |    1 
 xen/include/asm-x86/domain.h     |    5 +
 xen/include/asm-x86/perfc_defn.h |    5 +
 4 files changed, 140 insertions(+), 43 deletions(-)

diff -r 6b875abd0a9e -r 8612d3d9578a xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Thu Feb 28 13:10:28 2008 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c    Thu Feb 28 13:18:29 2008 +0000
@@ -55,12 +55,6 @@
  * l3-and-l2h-only shadow mode for PAE PV guests that would allow them 
  * to share l2h pages again. 
  *
- * GUEST_WALK_TABLES TLB FLUSH COALESCE
- * guest_walk_tables can do up to three remote TLB flushes as it walks to
- * the first l1 of a new pagetable.  Should coalesce the flushes to the end, 
- * and if we do flush, re-do the walk.  If anything has changed, then 
- * pause all the other vcpus and do the walk *again*.
- *
  * PSE disabled / PSE36
  * We don't support any modes other than PSE enabled, PSE36 disabled.
  * Neither of those would be hard to change, but we'd need to be able to 
@@ -246,10 +240,97 @@ static uint32_t set_ad_bits(void *guest_
     return 0;
 }
 
+/* This validation is called with lock held, and after write permission
+ * removal. Then check is atomic and no more inconsistent content can
+ * be observed before lock is released
+ *
+ * Return 1 to indicate success and 0 for inconsistency
+ */
+static inline uint32_t
+shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
+{
+    struct domain *d = v->domain;
+    guest_l1e_t *l1p;
+    guest_l2e_t *l2p;
+#if GUEST_PAGING_LEVELS >= 4
+    guest_l3e_t *l3p;
+    guest_l4e_t *l4p;
+#endif
+
+    ASSERT(shadow_locked_by_me(d));
+
+    if ( gw->version ==
+         atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
+        return 1;
+
+    /* We may consider caching guest page mapping from last
+     * guest table walk. However considering this check happens
+     * relatively less-frequent, and a bit burden here to
+     * remap guest page is better than caching mapping in each
+     * guest table walk.
+     *
+     * Also when inconsistency occurs, simply return to trigger
+     * another fault instead of re-validate new path to make
+     * logic simple.
+     */
+    perfc_incr(shadow_check_gwalk);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
+    if ( gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4 )
+        return 0;
+    l3p = sh_map_domain_page(gw->l3mfn);
+    if ( gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3 )
+        return 0; 
+#else
+    if ( gw->l3e.l3 !=
+         v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3 )
+        return 0;
+#endif
+    l2p = sh_map_domain_page(gw->l2mfn);
+    if ( gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2 )
+        return 0;
+#else
+    l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
+    if ( gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2 )
+        return 0;
+#endif
+    if ( !(guest_supports_superpages(v) &&
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
+    {
+        l1p = sh_map_domain_page(gw->l1mfn);
+        if ( gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1 )
+            return 0;
+    }
+
+    return 1;
+}
+
+/* Remove write access permissions from a gwalk_t in a batch, and
+ * return OR-ed result for TLB flush hint
+ */
+static inline uint32_t
+gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
+{
+    int rc = 0;
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    rc = sh_remove_write_access(v, gw->l3mfn, 3, va);
+#endif
+    rc |= sh_remove_write_access(v, gw->l2mfn, 2, va);
+#endif
+    if ( !(guest_supports_superpages(v) &&
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
+        rc |= sh_remove_write_access(v, gw->l1mfn, 1, va);
+
+    return rc;
+}
+
 /* Walk the guest pagetables, after the manner of a hardware walker. 
  *
  * Inputs: a vcpu, a virtual address, a walk_t to fill, a 
- *         pointer to a pagefault code, and a flag "shadow_op".
+ *         pointer to a pagefault code
  * 
  * We walk the vcpu's guest pagetables, filling the walk_t with what we
  * see and adding any Accessed and Dirty bits that are needed in the
@@ -257,10 +338,9 @@ static uint32_t set_ad_bits(void *guest_
  * we go.  For the purposes of reading pagetables we treat all non-RAM
  * memory as contining zeroes.
  * 
- * If "shadow_op" is non-zero, we are serving a genuine guest memory access, 
- * and must (a) be under the shadow lock, and (b) remove write access
- * from any guest PT pages we see, as we will be shadowing them soon
- * and will rely on the contents' not having changed.
+ * The walk is done in a lock-free style, with some sanity check postponed
+ * after grabbing shadow lock later. Those delayed checks will make sure
+ * no inconsistent mapping being translated into shadow page table.
  * 
  * Returns 0 for success, or the set of permission bits that we failed on 
  * if the walk did not complete.
@@ -268,8 +348,7 @@ static uint32_t set_ad_bits(void *guest_
  * checked the old return code anyway.
  */
 static uint32_t
-guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
-                  uint32_t pfec, int shadow_op)
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
 {
     struct domain *d = v->domain;
     p2m_type_t p2mt;
@@ -282,11 +361,12 @@ guest_walk_tables(struct vcpu *v, unsign
     uint32_t gflags, mflags, rc = 0;
     int pse;
 
-    ASSERT(!shadow_op || shadow_locked_by_me(d));
-    
     perfc_incr(shadow_guest_walk);
     memset(gw, 0, sizeof(*gw));
     gw->va = va;
+
+    gw->version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
+    rmb();
 
     /* Mandatory bits that must be set in every entry.  We invert NX, to
      * calculate as if there were an "X" bit that allowed access. 
@@ -312,9 +392,7 @@ guest_walk_tables(struct vcpu *v, unsign
         goto out;
     }
     ASSERT(mfn_valid(gw->l3mfn));
-    /* This mfn is a pagetable: make sure the guest can't write to it. */
-    if ( shadow_op && sh_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
-        flush_tlb_mask(d->domain_dirty_cpumask); 
+
     /* Get the l3e and check its flags*/
     l3p = sh_map_domain_page(gw->l3mfn);
     gw->l3e = l3p[guest_l3_table_offset(va)];
@@ -343,9 +421,7 @@ guest_walk_tables(struct vcpu *v, unsign
         goto out;
     }
     ASSERT(mfn_valid(gw->l2mfn));
-    /* This mfn is a pagetable: make sure the guest can't write to it. */
-    if ( shadow_op && sh_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
-        flush_tlb_mask(d->domain_dirty_cpumask); 
+
     /* Get the l2e */
     l2p = sh_map_domain_page(gw->l2mfn);
     gw->l2e = l2p[guest_l2_table_offset(va)];
@@ -403,10 +479,6 @@ guest_walk_tables(struct vcpu *v, unsign
             goto out;
         }
         ASSERT(mfn_valid(gw->l1mfn));
-        /* This mfn is a pagetable: make sure the guest can't write to it. */
-        if ( shadow_op 
-             && sh_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
-            flush_tlb_mask(d->domain_dirty_cpumask); 
         l1p = sh_map_domain_page(gw->l1mfn);
         gw->l1e = l1p[guest_l1_table_offset(va)];
         gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
@@ -548,8 +620,7 @@ sh_guest_map_l1e(struct vcpu *v, unsigne
     // XXX -- this is expensive, but it's easy to cobble together...
     // FIXME!
 
-    shadow_lock(v->domain);
-    if ( guest_walk_tables(v, addr, &gw, PFEC_page_present, 1) == 0 
+    if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0 
          && mfn_valid(gw.l1mfn) )
     {
         if ( gl1mfn )
@@ -558,8 +629,6 @@ sh_guest_map_l1e(struct vcpu *v, unsigne
             (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
     }
 
-    shadow_unlock(v->domain);
-
     return pl1e;
 }
 
@@ -573,10 +642,8 @@ sh_guest_get_eff_l1e(struct vcpu *v, uns
     // XXX -- this is expensive, but it's easy to cobble together...
     // FIXME!
 
-    shadow_lock(v->domain);
-    (void) guest_walk_tables(v, addr, &gw, PFEC_page_present, 1);
+    (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
     *(guest_l1e_t *)eff_l1e = gw.l1e;
-    shadow_unlock(v->domain);
 }
 #endif /* CONFIG==SHADOW==GUEST */
 
@@ -2842,14 +2909,12 @@ static int sh_page_fault(struct vcpu *v,
         return 0;
     }
 
-    shadow_lock(d);
-    
-    shadow_audit_tables(v);
-    
-    if ( guest_walk_tables(v, va, &gw, regs->error_code, 1) != 0 )
+    if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
     {
         perfc_incr(shadow_fault_bail_real_fault);
-        goto not_a_shadow_fault;
+        SHADOW_PRINTK("not a shadow fault\n");
+        reset_early_unshadow(v);
+        return 0;
     }
 
     /* It's possible that the guest has put pagetables in memory that it has 
@@ -2859,11 +2924,8 @@ static int sh_page_fault(struct vcpu *v,
     if ( unlikely(d->is_shutting_down) )
     {
         SHADOW_PRINTK("guest is shutting down\n");
-        shadow_unlock(d);
         return 0;
     }
-
-    sh_audit_gw(v, &gw);
 
     /* What kind of access are we dealing with? */
     ft = ((regs->error_code & PFEC_write_access)
@@ -2879,7 +2941,8 @@ static int sh_page_fault(struct vcpu *v,
         perfc_incr(shadow_fault_bail_bad_gfn);
         SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n", 
                       gfn_x(gfn), mfn_x(gmfn));
-        goto not_a_shadow_fault;
+        reset_early_unshadow(v);
+        return 0;
     }
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
@@ -2887,6 +2950,27 @@ static int sh_page_fault(struct vcpu *v,
     vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), 
                 regs->error_code | PFEC_page_present);
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
+
+    shadow_lock(d);
+    shadow_audit_tables(v);
+    sh_audit_gw(v, &gw);
+
+    if ( gw_remove_write_accesses(v, va, &gw) )
+    {
+        /* Write permission removal is also a hint that other gwalks
+         * overlapping with this one may be inconsistent
+         */
+        perfc_incr(shadow_rm_write_flush_tlb);
+        atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
+        flush_tlb_mask(d->domain_dirty_cpumask);
+    }
+
+    if ( !shadow_check_gwalk(v, va, &gw) )
+    {
+        perfc_incr(shadow_inconsistent_gwalk);
+        shadow_unlock(d);
+        return EXCRET_fault_fixed;
+    }
 
     /* Make sure there is enough free shadow memory to build a chain of
      * shadow tables. (We never allocate a top-level shadow on this path,
@@ -3223,7 +3307,7 @@ sh_gva_to_gfn(struct vcpu *v, unsigned l
         return vtlb_gfn;
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
 
-    if ( guest_walk_tables(v, va, &gw, pfec[0], 0) != 0 )
+    if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
     {
         if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
             pfec[0] &= ~PFEC_page_present;
@@ -4276,6 +4360,8 @@ static void emulate_unmap_dest(struct vc
     }
     else 
         sh_unmap_domain_page(addr);
+
+    atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
 }
 
 int
diff -r 6b875abd0a9e -r 8612d3d9578a xen/arch/x86/mm/shadow/types.h
--- a/xen/arch/x86/mm/shadow/types.h    Thu Feb 28 13:10:28 2008 +0000
+++ b/xen/arch/x86/mm/shadow/types.h    Thu Feb 28 13:18:29 2008 +0000
@@ -435,6 +435,7 @@ struct shadow_walk_t
 #endif
     mfn_t l2mfn;                /* MFN that the level 2 entry was in */
     mfn_t l1mfn;                /* MFN that the level 1 entry was in */
+    int version;                /* Saved guest dirty version */
 };
 
 /* macros for dealing with the naming of the internal function names of the
diff -r 6b875abd0a9e -r 8612d3d9578a xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Thu Feb 28 13:10:28 2008 +0000
+++ b/xen/include/asm-x86/domain.h      Thu Feb 28 13:18:29 2008 +0000
@@ -97,6 +97,11 @@ struct shadow_domain {
 
     /* Fast MMIO path heuristic */
     int has_fast_mmio_entries;
+
+    /* reflect guest table dirty status, incremented by write
+     * emulation and remove write permission
+     */
+    atomic_t          gtable_dirty_version;
 };
 
 struct shadow_vcpu {
diff -r 6b875abd0a9e -r 8612d3d9578a xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Thu Feb 28 13:10:28 2008 +0000
+++ b/xen/include/asm-x86/perfc_defn.h  Thu Feb 28 13:18:29 2008 +0000
@@ -88,6 +88,11 @@ PERFCOUNTER(shadow_unshadow_bf,    "shad
 PERFCOUNTER(shadow_unshadow_bf,    "shadow unshadow brute-force")
 PERFCOUNTER(shadow_get_page_fail,  "shadow_get_page_from_l1e failed")
 PERFCOUNTER(shadow_guest_walk,     "shadow walks guest tables")
+PERFCOUNTER(shadow_check_gwalk,    "shadow checks gwalk")
+PERFCOUNTER(shadow_inconsistent_gwalk, "shadow check inconsistent gwalk")
+PERFCOUNTER(shadow_rm_write_flush_tlb,
+                                   "shadow flush tlb by removing write perm")
+
 PERFCOUNTER(shadow_invlpg,         "shadow emulates invlpg")
 PERFCOUNTER(shadow_invlpg_fault,   "shadow invlpg faults")
 

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
