[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 1/4] Out-of-sync L1 shadows: OOS base



This patch implements the basic mechanisms to get pagetables out of sync and back in sync again.

Signed-off-by: Gianluca Guida <gianluca.guida@xxxxxxxxxxxxx>
Signed-off-by: Tim Deegan <tim.deegan@xxxxxxxxxxxxx>
Signed-off-by: George Dunlap <george.dunlap@xxxxxxxxxxxxx>
diff -r 26ecd1f9e128 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm.c Fri Jun 20 15:10:08 2008 +0100
@@ -1933,9 +1933,15 @@ int get_page_type(struct page_info *page
         {
             struct domain *d = page_get_owner(page);
 
-            /* Never allow a shadowed frame to go from type count 0 to 1 */
-            if ( d && shadow_mode_enabled(d) )
-                shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
+            /* Normally we should never let a page go from type count 0
+             * to type count 1 when it is shadowed. One exception:
+             * out-of-sync shadowed pages are allowed to become
+             * writeable. */
+            if ( d && shadow_mode_enabled(d)
+                 && (page->count_info & PGC_page_table)
+                 && !((page->shadow_flags & (1u<<29))
+                      && type == PGT_writable_page) )
+               shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
 
             ASSERT(!(x & PGT_pae_xen_l2));
             if ( (x & PGT_type_mask) != type )
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c   Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/common.c   Fri Jun 20 15:10:08 2008 +0100
@@ -54,6 +54,10 @@ void shadow_domain_init(struct domain *d
     /* Use shadow pagetables for log-dirty support */
     paging_log_dirty_init(d, shadow_enable_log_dirty, 
                           shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    d->arch.paging.shadow.oos_active = 0;
+#endif
 }
 
 /* Setup the shadow-specfic parts of a vcpu struct. Note: The most important
@@ -64,6 +68,13 @@ void shadow_domain_init(struct domain *d
  */
 void shadow_vcpu_init(struct vcpu *v)
 {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    int i;
+
+    for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
+        v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
+#endif
+
     v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
 }
 
@@ -427,6 +438,404 @@ void shadow_continue_emulation(struct sh
         }
     }
 }
+ 
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Out-of-sync shadows. */ 
+
+/* From time to time, we let a shadowed pagetable page go out of sync 
+ * with its shadow: the guest is allowed to write directly to the page, 
+ * and those writes are not synchronously reflected in the shadow.
+ * This lets us avoid many emulations if the guest is writing a lot to a 
+ * pagetable, but it relaxes a pretty important invariant in the shadow 
+ * pagetable design.  Therefore, some rules:
+ *
+ * 1. Only L1 pagetables may go out of sync: any page that is shadowed
+ *    at a higher level must be synchronously updated.  This makes
+ *    using linear shadow pagetables much less dangerous.
+ *    That means that: (a) unsyncing code needs to check for higher-level
+ *    shadows, and (b) promotion code needs to resync.
+ * 
+ * 2. All shadow operations on a guest page require the page to be brought
+ *    back into sync before proceeding.  This must be done under the
+ *    shadow lock so that the page is guaranteed to remain synced until
+ *    the operation completes.
+ *
+ *    Exceptions to this rule: the pagefault and invlpg handlers may 
+ *    update only one entry on an out-of-sync page without resyncing it. 
+ *
+ * 3. Operations on shadows that do not start from a guest page need to
+ *    be aware that they may be handling an out-of-sync shadow.
+ *
+ * 4. Operations that do not normally take the shadow lock (fast-path 
+ *    #PF handler, INVLPG) must fall back to a locking, syncing version 
+ *    if they see an out-of-sync table. 
+ *
+ * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
+ *    must explicitly resync all relevant pages or update their
+ *    shadows.
+ *
+ * Currently out-of-sync pages are listed in a simple open-addressed
+ * hash table with a second chance (must resist temptation to radically
+ * over-engineer hash tables...)  The virtual address of the access
+ * which caused us to unsync the page is also kept in the hash table, as
+ * a hint for finding the writable mappings later.
+ *
+ * We keep a hash per vcpu, because we want as much as possible to do
+ * the re-sync on the same vcpu we did the unsync on, so the VA hint
+ * will be valid.
+ */
+
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
+static void sh_oos_audit(struct domain *d) 
+{
+    int idx, expected_idx, expected_idx_alt;
+    struct page_info *pg;
+    struct vcpu *v;
+    
+    for_each_vcpu(d, v) 
+    {
+        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+        {
+            mfn_t *oos = v->arch.paging.shadow.oos;
+            if ( !mfn_valid(oos[idx]) )
+                continue;
+            
+            expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
+            expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
+            if ( idx != expected_idx && idx != expected_idx_alt )
+            {
+                printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
+                       __func__, idx, mfn_x(oos[idx]), 
+                       expected_idx, expected_idx_alt);
+                BUG();
+            }
+            pg = mfn_to_page(oos[idx]);
+            if ( !(pg->count_info & PGC_page_table) )
+            {
+                printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->count_info);
+                BUG();
+            }
+            if ( !(pg->shadow_flags & SHF_out_of_sync) )
+            {
+                printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+                BUG();
+            }
+            if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
+            {
+                printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+                BUG();
+            }
+        }
+    }
+}
+#endif
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn) 
+{
+    int idx;
+    struct vcpu *v;
+    mfn_t *oos;
+
+    ASSERT(mfn_is_out_of_sync(gmfn));
+    
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+            return;
+    }
+
+    SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+#endif
+
+/* Update the shadow, but keep the page out of sync. */
+static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn)
+{
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(mfn_valid(gmfn));
+    ASSERT(page_is_out_of_sync(pg));
+
+    /* Call out to the appropriate per-mode resyncing function */
+    if ( pg->shadow_flags & SHF_L1_32 )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn);
+    else if ( pg->shadow_flags & SHF_L1_PAE )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn);
+#if CONFIG_PAGING_LEVELS >= 4
+    else if ( pg->shadow_flags & SHF_L1_64 )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn);
+#endif
+}
+
+/* Pull all the entries on an out-of-sync page back into sync. */
+static void _sh_resync(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(shadow_locked_by_me(v->domain));
+    ASSERT(mfn_is_out_of_sync(gmfn));
+    /* Guest page must be shadowed *only* as L1 when out of sync. */
+    ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask 
+             & ~SHF_L1_ANY));
+    ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+    /* Need to pull write access so the page *stays* in sync. 
+     * This might be rather slow but we hope that in the common case 
+     * we're handling this pagetable after a guest walk has pulled 
+     * write access the fast way. */
+    switch ( sh_remove_write_access(v, gmfn, 0, va) )
+    {
+    default:
+    case 0:
+        break;
+
+    case 1:
+        flush_tlb_mask(v->domain->domain_dirty_cpumask);
+        break;
+
+    case -1:
+        /* An unfindable writeable typecount has appeared, probably via a
+         * grant table entry: can't shoot the mapping, so try to unshadow 
+         * the page.  If that doesn't work either, the guest is granting
+         * his pagetables and must be killed after all. */
+        sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
+        return;
+    }
+
+    /* No more writable mappings of this page, please */
+    pg->shadow_flags &= ~SHF_oos_may_write;
+
+    /* Update the shadows with current guest entries. */
+    _sh_resync_l1(v, gmfn);
+
+    /* Now we know all the entries are synced, and will stay that way */
+    pg->shadow_flags &= ~SHF_out_of_sync;
+    perfc_incr(shadow_resync);
+}
+
+
+/* Add an MFN to the list of out-of-sync guest pagetables */
+static void oos_hash_add(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    int idx;
+    mfn_t *oos = v->arch.paging.shadow.oos;
+    unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+
+    idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+    if ( mfn_valid(oos[idx]) 
+         && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
+    {
+        /* Punt the current occupant into the next slot */
+        SWAP(oos[idx], gmfn);
+        SWAP(oos_va[idx], va);
+        idx = (idx + 1) % SHADOW_OOS_PAGES;
+    }
+    if ( mfn_valid(oos[idx]) )
+   {
+        /* Crush the current occupant. */
+        _sh_resync(v, oos[idx], oos_va[idx]);
+        perfc_incr(shadow_unsync_evict);
+    }
+    oos[idx] = gmfn;
+    oos_va[idx] = va;
+}
+
+/* Remove an MFN from the list of out-of-sync guest pagetables */
+static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
+{
+    int idx;
+    mfn_t *oos;
+    struct domain *d = v->domain;
+
+    SHADOW_PRINTK("D%dV%d gmfn %lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); 
+
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+        {
+            oos[idx] = _mfn(INVALID_MFN);
+            return;
+        }
+    }
+
+    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+
+/* Pull a single guest page back into sync */
+void sh_resync(struct vcpu *v, mfn_t gmfn)
+{
+    int idx;
+    mfn_t *oos;
+    unsigned long *oos_va;
+    struct domain *d = v->domain;
+
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        oos_va = v->arch.paging.shadow.oos_va;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+        {
+            _sh_resync(v, gmfn, oos_va[idx]);
+            oos[idx] = _mfn(INVALID_MFN);
+            return;
+        }
+    }
+
+    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table,
+ * by making a call out to the mode in which that shadow was made. */
+static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+    struct page_info *pg = mfn_to_page(gl1mfn);
+    if ( pg->shadow_flags & SHF_L1_32 )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
+    else if ( pg->shadow_flags & SHF_L1_PAE )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
+#if CONFIG_PAGING_LEVELS >= 4
+    else if ( pg->shadow_flags & SHF_L1_64 )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
+#endif
+    SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n", 
+                 mfn_x(gl1mfn));
+    BUG();
+    return 0; /* BUG() is no longer __attribute__((noreturn)). */
+}
+
+
+/* Pull all out-of-sync pages back into sync.  Pages brought out of sync
+ * on other vcpus are allowed to remain out of sync, but their contents
+ * will be made safe (TLB flush semantics); pages unsynced by this vcpu
+ * are brought back into sync and write-protected.  If skip != 0, we try
+ * to avoid resyncing at all if we think we can get away with it. */
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
+{
+    int idx;
+    struct vcpu *other;
+    mfn_t *oos = v->arch.paging.shadow.oos;
+    unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+
+    SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
+
+    ASSERT(do_locking || shadow_locked_by_me(v->domain));
+
+    if ( !this )
+        goto resync_others;
+
+    if ( do_locking )
+        shadow_lock(v->domain);
+
+    /* First: resync all of this vcpu's oos pages */
+    for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 
+        if ( mfn_valid(oos[idx]) )
+        {
+            /* Write-protect and sync contents */
+            _sh_resync(v, oos[idx], oos_va[idx]);
+            oos[idx] = _mfn(INVALID_MFN);
+        }
+
+    if ( do_locking )
+        shadow_unlock(v->domain);
+
+ resync_others:
+    if ( !others )
+        return;
+
+    /* Second: make all *other* vcpus' oos pages safe. */
+    for_each_vcpu(v->domain, other)
+    {
+        if ( v == other ) 
+            continue;
+
+        if ( do_locking )
+            shadow_lock(v->domain);
+
+        oos = other->arch.paging.shadow.oos;
+        oos_va = other->arch.paging.shadow.oos_va;
+
+        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 
+        {
+            if ( !mfn_valid(oos[idx]) )
+                continue;
+
+            if ( skip )
+            {
+                /* Update the shadows and leave the page OOS. */
+                if ( sh_skip_sync(v, oos[idx]) )
+                    continue;
+                _sh_resync_l1(other, oos[idx]);
+            }
+            else
+            {
+                /* Write-protect and sync contents */
+                _sh_resync(other, oos[idx], oos_va[idx]);
+                oos[idx] = _mfn(INVALID_MFN);
+            }
+        }
+        
+        if ( do_locking )
+            shadow_unlock(v->domain);
+    }
+}
+
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    struct page_info *pg;
+    
+    ASSERT(shadow_locked_by_me(v->domain));
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+    pg = mfn_to_page(gmfn);
+ 
+    /* Guest page must be shadowed *only* as L1 and *only* once when out
+     * of sync.  Also, get out now if it's already out of sync. 
+     * Also, can't safely unsync if some vcpus have paging disabled.*/
+    if ( pg->shadow_flags & 
+         ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync) 
+         || sh_page_has_multiple_shadows(pg)
+         || !is_hvm_domain(v->domain)
+         || !v->domain->arch.paging.shadow.oos_active )
+        return 0;
+
+    pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
+    oos_hash_add(v, gmfn, va);
+    perfc_incr(shadow_unsync);
+    return 1;
+}
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
 
 /**************************************************************************/
 /* Code for "promoting" a guest page to the point where the shadow code is
@@ -439,6 +848,12 @@ void shadow_promote(struct vcpu *v, mfn_
     struct page_info *page = mfn_to_page(gmfn);
 
     ASSERT(mfn_valid(gmfn));
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Is the page already shadowed and out of sync? */
+    if ( page_is_out_of_sync(page) ) 
+        sh_resync(v, gmfn);
+#endif
 
     /* We should never try to promote a gmfn that has writeable mappings */
     ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
@@ -463,7 +878,14 @@ void shadow_demote(struct vcpu *v, mfn_t
     clear_bit(type, &page->shadow_flags);
 
     if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
+    {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+        /* Was the page out of sync? */
+        if ( page_is_out_of_sync(page) ) 
+            oos_hash_remove(v, gmfn);
+#endif 
         clear_bit(_PGC_page_table, &page->count_info);
+    }
 }
 
 /**************************************************************************/
@@ -1297,6 +1719,27 @@ static void sh_hash_audit_bucket(struct 
             /* Bad shadow flags on guest page? */
             BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
             /* Bad type count on guest page? */
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+            if ( sp->type == SH_type_l1_32_shadow
+                 || sp->type == SH_type_l1_pae_shadow
+                 || sp->type == SH_type_l1_64_shadow )
+            {
+                if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
+                     && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
+                {
+                    if ( !page_is_out_of_sync(gpg) )
+                    {
+                        SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
+                                     " and not OOS but has typecount %#lx\n",
+                                     sp->backpointer, 
+                                     mfn_x(shadow_page_to_mfn(sp)), 
+                                     gpg->u.inuse.type_info);
+                        BUG();
+                    }
+                }
+            }
+            else /* Not an l1 */
+#endif
             if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page 
                  && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
             {
@@ -1608,7 +2051,8 @@ void sh_destroy_shadow(struct vcpu *v, m
 /* Remove all writeable mappings of a guest frame from the shadow tables 
  * Returns non-zero if we need to flush TLBs. 
  * level and fault_addr desribe how we found this to be a pagetable;
- * level==0 means we have some other reason for revoking write access.*/
+ * level==0 means we have some other reason for revoking write access.
+ * If level==0 we are allowed to fail, returning -1. */
 
 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn, 
                            unsigned int level,
@@ -1659,7 +2103,12 @@ int sh_remove_write_access(struct vcpu *
         return 0;
 
     /* Early exit if it's already a pagetable, or otherwise not writeable */
-    if ( sh_mfn_is_a_page_table(gmfn) 
+    if ( (sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+         /* Unless they've been allowed to go out of sync with their shadows */
+           && !mfn_oos_may_write(gmfn)
+#endif
+         )
          || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
         return 0;
 
@@ -1676,7 +2125,7 @@ int sh_remove_write_access(struct vcpu *
     }
 
 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
-    if ( v == current && level != 0 )
+    if ( v == current )
     {
         unsigned long gfn;
         /* Heuristic: there is likely to be only one writeable mapping,
@@ -1690,6 +2139,8 @@ int sh_remove_write_access(struct vcpu *
                 return 1;                                                 \
         } while (0)
 
+        if ( level == 0 && fault_addr )
+            GUESS(fault_addr, 6);
         
         if ( v->arch.paging.mode->guest_levels == 2 )
         {
@@ -1780,6 +2231,9 @@ int sh_remove_write_access(struct vcpu *
      * mapping -- ioreq page, grant mapping, &c. */
     if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
     {
+        if ( level == 0 )
+            return -1;
+
         SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
                       "%lu special-use mappings of it\n", mfn_x(gmfn),
                       (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
@@ -2159,6 +2613,13 @@ static void sh_update_paging_modes(struc
         ASSERT(shadow_mode_translate(d));
         ASSERT(shadow_mode_external(d));
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+        /* Need to resync all our pages now, because if a page goes out
+         * of sync with paging enabled and is resynced with paging
+         * disabled, the resync will go wrong. */
+        shadow_resync_all(v, 0);
+#endif /* OOS */
+
         if ( !hvm_paging_enabled(v) )
         {
             /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
@@ -2253,6 +2714,27 @@ static void sh_update_paging_modes(struc
         //        different values for CR4.PSE and CR4.PGE at the same time.
         //        This *does* happen, at least for CR4.PGE...
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* We need to check that all the vcpus have paging enabled to
+     * unsync PTs. */
+    if ( is_hvm_domain(d) )
+    {
+        int pe = 1;
+        struct vcpu *vptr;
+
+        for_each_vcpu(d, vptr)
+        {
+            if ( !hvm_paging_enabled(vptr) )
+            {
+                pe = 0;
+                break;
+            }
+        }
+
+        d->arch.paging.shadow.oos_active = pe;
+    }
+#endif /* OOS */
 
     v->arch.paging.mode->update_cr3(v, 0);
 }
@@ -3044,7 +3526,11 @@ void shadow_audit_tables(struct vcpu *v)
 
     if ( !(SHADOW_AUDIT_ENABLE) )
         return;
-    
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    sh_oos_audit(v->domain);
+#endif
+
     if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
         mask = ~1; /* Audit every table in the system */
     else 
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/multi.c    Fri Jun 20 15:10:08 2008 +0100
@@ -305,22 +305,54 @@ shadow_check_gwalk(struct vcpu *v, unsig
 }
 
 /* Remove write access permissions from a gwalk_t in a batch, and
- * return OR-ed result for TLB flush hint
- */
+ * return OR-ed flags saying whether a TLB flush and/or a re-walk of
+ * the guest pages is needed.
+ *
+ * Syncing pages will remove write access to that page; but it may
+ * also give write access to other pages in the path. If we resync any
+ * pages, re-walk from the beginning.
+ */
+#define GW_RMWR_FLUSHTLB 1
+#define GW_RMWR_REWALK   2
+
 static inline uint32_t
 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
 {
-    int rc = 0;
+    uint32_t rc = 0;
 
 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-    rc = sh_remove_write_access(v, gw->l3mfn, 3, va);
-#endif
-    rc |= sh_remove_write_access(v, gw->l2mfn, 2, va);
-#endif
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( mfn_is_out_of_sync(gw->l3mfn) )
+    {
+        sh_resync(v, gw->l3mfn);
+        rc = GW_RMWR_REWALK;
+    }
+    else
+#endif /* OOS */
+     if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
+         rc = GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( mfn_is_out_of_sync(gw->l2mfn) )
+    {
+        sh_resync(v, gw->l2mfn);
+        rc |= GW_RMWR_REWALK;
+    }
+    else
+#endif /* OOS */
+    if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
+        rc |= GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
     if ( !(guest_supports_superpages(v) &&
-           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
-        rc |= sh_remove_write_access(v, gw->l1mfn, 1, va);
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+         && !mfn_is_out_of_sync(gw->l1mfn)
+#endif /* OOS */
+         && sh_remove_write_access(v, gw->l1mfn, 1, va) )
+        rc |= GW_RMWR_FLUSHTLB;
 
     return rc;
 }
@@ -882,7 +914,12 @@ _sh_propagate(struct vcpu *v,
     
     // protect guest page tables
     //
-    if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
+    if ( unlikely((level == 1) 
+                  && sh_mfn_is_a_page_table(target_mfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+                  && !mfn_oos_may_write(target_mfn)
+#endif /* OOS */
+                  ) )
     {
         if ( shadow_mode_trap_reads(d) )
         {
@@ -1125,6 +1162,9 @@ static int shadow_set_l4e(struct vcpu *v
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
         }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+        shadow_resync_all(v, 0);
+#endif
     }
 
     /* Write the new entry */
@@ -1163,12 +1203,17 @@ static int shadow_set_l3e(struct vcpu *v
              | (((unsigned long)sl3e) & ~PAGE_MASK));
     
     if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
+    {
         /* About to install a new reference */        
         if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
         {
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
-        } 
+        }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+        shadow_resync_all(v, 0);
+#endif
+    }
 
     /* Write the new entry */
     shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
@@ -1219,12 +1264,29 @@ static int shadow_set_l2e(struct vcpu *v
              | (((unsigned long)sl2e) & ~PAGE_MASK));
 
     if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) 
+    {
+        mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
+
         /* About to install a new reference */
-        if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
+        if ( !sh_get_ref(v, sl1mfn, paddr) )
         {
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
-        } 
+        }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        {
+            struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
+            mfn_t gl1mfn = _mfn(sp->backpointer);
+
+            /* If the shadow is a fl1 then the backpointer contains
+               the GFN instead of the GMFN, and it's definitely not
+               OOS. */
+            if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
+                 && mfn_is_out_of_sync(gl1mfn) )
+                sh_resync(v, gl1mfn);
+        }
+#endif
+    }
 
     /* Write the new entry */
 #if GUEST_PAGING_LEVELS == 2
@@ -2544,6 +2606,97 @@ static int validate_gl1e(struct vcpu *v,
     return result;
 }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Special validation function for re-syncing out-of-sync shadows. 
+ * Walks the *shadow* page, and for every entry that it finds,
+ * revalidates the guest entry that corresponds to it.
+ * N.B. This function is called with the vcpu that unsynced the page,
+ *      *not* the one that is causing it to be resynced. */
+void sh_resync_l1(struct vcpu *v, mfn_t gmfn)
+{
+    mfn_t sl1mfn;
+    shadow_l1e_t *sl1p;
+    guest_l1e_t *gl1p, *gp;
+    int rc = 0;
+
+    sl1mfn = get_shadow_status(v, gmfn, SH_type_l1_shadow);
+    ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
+
+    gp = sh_map_domain_page(gmfn);
+    gl1p = gp;
+
+    SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
+        rc |= validate_gl1e(v, gl1p, sl1mfn, sl1p);
+    });
+
+    sh_unmap_domain_page(gp);
+
+    /* Setting shadow L1 entries should never need us to flush the TLB */
+    ASSERT(!(rc & SHADOW_SET_FLUSH));
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table. 
+ * That is: if we can tell that it's only used once, and that the 
+ * toplevel shadow responsible is not one of ours. 
+ * N.B. This function is called with the vcpu that required the resync, 
+ *      *not* the one that originally unsynced the page, but it is
+ *      called in the *mode* of the vcpu that unsynced it.  Clear?  Good. */
+int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+    struct shadow_page_info *sp;
+    mfn_t smfn;
+
+    smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+    ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
+    
+    /* Up to l2 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+#if (SHADOW_PAGING_LEVELS == 4) 
+    /* up to l3 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+    /* up to l4 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 
+         || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+#if (GUEST_PAGING_LEVELS == 2)
+    /* In 2-on-3 shadow mode the up pointer contains the link to the
+     * shadow page, but the shadow_table contains only the first of the
+     * four pages that make up the PAE top shadow tables. */
+    smfn = _mfn(mfn_x(smfn) & ~0x3UL);
+#endif
+
+#endif
+
+    if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
+#if (SHADOW_PAGING_LEVELS == 3) 
+         || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
+         || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
+         || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn) 
+#endif
+        )
+        return 0;
+    
+    /* Only in use in one toplevel shadow, and it's not the one we're 
+     * running on */
+    return 1;
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
 
 /**************************************************************************/
 /* Functions which translate and install the shadows of arbitrary guest 
@@ -2805,6 +2958,7 @@ static int sh_page_fault(struct vcpu *v,
     int r;
     fetch_type_t ft = 0;
     p2m_type_t p2mt;
+    uint32_t rc;
 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
     int fast_emul = 0;
 #endif
@@ -2830,6 +2984,17 @@ static int sh_page_fault(struct vcpu *v,
         {
             fast_emul = 1;
             gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+            /* Fall back to the slow path if we're trying to emulate
+               writes to an out of sync page. */
+            if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
+            {
+                v->arch.paging.last_write_emul_ok = 0;
+                goto page_fault_slow_path;
+            }
+#endif /* OOS */
+
             perfc_incr(shadow_fault_fast_emulate);
             goto early_emulation;
         }
@@ -2855,6 +3020,31 @@ static int sh_page_fault(struct vcpu *v,
                                       sizeof(sl1e)) == 0)
                     && sh_l1e_is_magic(sl1e)) )
         {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+             /* First, need to check that this isn't an out-of-sync
+              * shadow l1e.  If it is, we fall back to the slow path, which
+              * will sync it up again. */
+            {
+                shadow_l2e_t sl2e;
+                mfn_t gl1mfn;
+               if ( (__copy_from_user(&sl2e,
+                                       (sh_linear_l2_table(v)
+                                        + shadow_l2_linear_offset(va)),
+                                       sizeof(sl2e)) != 0)
+                     || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
+                     || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
+                                      shadow_l2e_get_mfn(sl2e))->backpointer))
+                     || unlikely(mfn_is_out_of_sync(gl1mfn)) )
+               {
+                   /* Hit the slow path as if there had been no 
+                    * shadow entry at all, and let it tidy up */
+                   ASSERT(regs->error_code & PFEC_page_present);
+                   regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
+                   goto page_fault_slow_path;
+               }
+            }
+#endif /* SHOPT_OUT_OF_SYNC */
+
             if ( sh_l1e_is_gnp(sl1e) )
             {
                 /* Not-present in a guest PT: pass to the guest as
@@ -2890,6 +3080,10 @@ static int sh_page_fault(struct vcpu *v,
             return EXCRET_fault_fixed;
         }
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+ page_fault_slow_path:
+#endif
 #endif /* SHOPT_FAST_FAULT_PATH */
 
     /* Detect if this page fault happened while we were already in Xen
@@ -2904,7 +3098,21 @@ static int sh_page_fault(struct vcpu *v,
         return 0;
     }
 
-    if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
+ rewalk:
+    rc = guest_walk_tables(v, va, &gw, regs->error_code);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( !(rc & _PAGE_PRESENT) )
+        regs->error_code |= PFEC_page_present;
+    else if ( regs->error_code & PFEC_page_present )
+    {
+            SHADOW_ERROR("OOS paranoia: Something is wrong in guest TLB"
+                         " flushing. Have fun debugging it.\n");
+            regs->error_code &= ~PFEC_page_present;
+    }
+#endif
+
+    if ( rc != 0 )
     {
         perfc_incr(shadow_fault_bail_real_fault);
         SHADOW_PRINTK("not a shadow fault\n");
@@ -2948,7 +3156,10 @@ static int sh_page_fault(struct vcpu *v,
 
     shadow_lock(d);
 
-    if ( gw_remove_write_accesses(v, va, &gw) )
+    rc = gw_remove_write_accesses(v, va, &gw);
+
+    /* First bit set: Removed write access to a page. */
+    if ( rc & GW_RMWR_FLUSHTLB )
     {
         /* Write permission removal is also a hint that other gwalks
          * overlapping with this one may be inconsistent
@@ -2958,11 +3169,20 @@ static int sh_page_fault(struct vcpu *v,
         flush_tlb_mask(d->domain_dirty_cpumask);
     }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Second bit set: Resynced a page. Re-walk needed. */
+    if ( rc & GW_RMWR_REWALK )
+    {
+        shadow_unlock(d);
+        goto rewalk;
+    }
+#endif /* OOS */
+
     if ( !shadow_check_gwalk(v, va, &gw) )
     {
         perfc_incr(shadow_inconsistent_gwalk);
         shadow_unlock(d);
-        return EXCRET_fault_fixed;
+        goto rewalk;
     }
 
     shadow_audit_tables(v);
@@ -3001,7 +3221,12 @@ static int sh_page_fault(struct vcpu *v,
 #endif
 
     /* Need to emulate accesses to page tables */
-    if ( sh_mfn_is_a_page_table(gmfn) )
+    if ( sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+         /* Unless they've been allowed to go out of sync with their shadows */
+         && !mfn_is_out_of_sync(gmfn)
+#endif
+         )
     {
         if ( ft == ft_demand_write )
         {
@@ -3215,6 +3440,7 @@ sh_invlpg(struct vcpu *v, unsigned long 
  * instruction should be issued on the hardware, or 0 if it's safe not
  * to do so. */
 {
+    mfn_t sl1mfn;
     shadow_l2e_t sl2e;
     
     perfc_incr(shadow_invlpg);
@@ -3278,12 +3504,64 @@ sh_invlpg(struct vcpu *v, unsigned long 
     // If so, then we'll need to flush the entire TLB (because that's
     // easier than invalidating all of the individual 4K pages).
     //
-    if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
+    sl1mfn = shadow_l2e_get_mfn(sl2e);
+    if ( mfn_to_shadow_page(sl1mfn)->type
          == SH_type_fl1_shadow )
     {
         flush_tlb_local();
         return 0;
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Check to see if the SL1 is out of sync. */
+    {
+        mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+        struct page_info *pg = mfn_to_page(gl1mfn);
+        if ( mfn_valid(gl1mfn) 
+             && page_is_out_of_sync(pg) )
+        {
+            /* The test above may give false positives, since we don't
+             * hold the shadow lock yet.  Check again with the lock held. */
+            shadow_lock(v->domain);
+
+            /* This must still be a copy-from-user because we didn't
+             * have the shadow lock last time we checked, and the
+             * higher-level shadows might have disappeared under our
+             * feet. */
+            if ( __copy_from_user(&sl2e, 
+                                  sh_linear_l2_table(v)
+                                  + shadow_l2_linear_offset(va),
+                                  sizeof (sl2e)) != 0 )
+            {
+                perfc_incr(shadow_invlpg_fault);
+                shadow_unlock(v->domain);
+                return 0;
+            }
+
+            if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
+            {
+                shadow_unlock(v->domain);
+                return 0;
+            }
+
+            sl1mfn = shadow_l2e_get_mfn(sl2e);
+            gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+            pg = mfn_to_page(gl1mfn);
+            
+            if ( likely(sh_mfn_is_a_page_table(gl1mfn)
+                        && page_is_out_of_sync(pg) ) )
+            {
+                shadow_l1e_t *sl1;
+                sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
+                /* Remove the shadow entry that maps this VA */
+                (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
+            }
+            shadow_unlock(v->domain);
+            /* Need the invlpg, to pick up the disappearance of the sl1e */
+            return 1;
+        }
+    }
+#endif
 
     return 1;
 }
@@ -3709,6 +3987,13 @@ sh_update_cr3(struct vcpu *v, int do_loc
         ASSERT(v->arch.cr3 == 0);
         return;
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Need to resync all the shadow entries on a TLB flush.  Resync the
+     * current vcpu's OOS pages before switching to the new shadow
+     * tables so that the VA hint is still valid.  */
+    shadow_resync_current_vcpu(v, do_locking);
+#endif
 
     if ( do_locking ) shadow_lock(v->domain);
 
@@ -3938,6 +4223,15 @@ sh_update_cr3(struct vcpu *v, int do_loc
 
     /* Release the lock, if we took it (otherwise it's the caller's problem) */
     if ( do_locking ) shadow_unlock(v->domain);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Need to resync all the shadow entries on a TLB flush. We only
+     * update the shadows, leaving the pages out of sync. Also, we try
+     * to skip synchronization of shadows not mapped in the new
+     * tables. */
+    shadow_sync_other_vcpus(v, do_locking);
+#endif
+
 }
 
 
@@ -4437,23 +4731,35 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v,
 
 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
 
-#define AUDIT_FAIL(_level, _fmt, _a...) do {                               \
-    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"         \
-           "gl" #_level "mfn = %" PRI_mfn                              \
-           " sl" #_level "mfn = %" PRI_mfn                             \
-           " &gl" #_level "e = %p &sl" #_level "e = %p"                    \
-           " gl" #_level "e = %" SH_PRI_gpte                              \
-           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",        \
-           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                      \
-           _level, guest_index(gl ## _level ## e),                         \
-           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),         \
-           gl ## _level ## e, sl ## _level ## e,                           \
-           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
-           ##_a);                                                          \
-    BUG();                                                                 \
-    done = 1;                                                              \
-} while (0)
-
+/* Print both entries of a mismatching guest/shadow pair, then BUG(). */
+#define AUDIT_FAIL(_level, _fmt, _a...) do {                            \
+    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"       \
+           "gl" #_level "mfn = %" PRI_mfn " sl" #_level "mfn = %" PRI_mfn \
+           " &gl" #_level "e = %p &sl" #_level "e = %p"                 \
+           " gl" #_level "e = %" SH_PRI_gpte                            \
+           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",      \
+           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                   \
+           _level, guest_index(gl ## _level ## e),                      \
+           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),      \
+           gl ## _level ## e, sl ## _level ## e,                        \
+           gl ## _level ## e->l ## _level,                              \
+           sl ## _level ## e->l ## _level, ##_a);                       \
+    BUG();                                                              \
+    done = 1;                                                           \
+} while (0)
+
+/* Like AUDIT_FAIL, but reports only the two table mfns, so it can be used
+ * where no gl<N>e/sl<N>e entry pointers are available to print. */
+#define AUDIT_FAIL_MIN(_level, _fmt, _a...) do {                        \
+    printk("Shadow %u-on-%u audit failed at level %i\n"                 \
+           "gl" #_level "mfn = %" PRI_mfn " sl" #_level "mfn = %" PRI_mfn \
+           " Error: " _fmt "\n",                                        \
+           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, _level,           \
+           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),      \
+           ##_a);                                                       \
+    BUG();                                                              \
+    done = 1;                                                           \
+} while (0)
 
 static char * sh_audit_flags(struct vcpu *v, int level,
                               int gflags, int sflags) 
@@ -4494,6 +4800,16 @@ int sh_audit_l1_table(struct vcpu *v, mf
     
     /* Follow the backpointer */
     gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
+    if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
+    {
+        oos_audit_hash_is_present(v->domain, gl1mfn);
+        return 0;
+    }
+#endif
+
     gl1e = gp = sh_map_domain_page(gl1mfn);
     SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
 
@@ -4574,6 +4890,13 @@ int sh_audit_l2_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
+        AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
+#endif
+
     gl2e = gp = sh_map_domain_page(gl2mfn);
     SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
 
@@ -4616,6 +4939,13 @@ int sh_audit_l3_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
+        AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
+#endif
+
     gl3e = gp = sh_map_domain_page(gl3mfn);
     SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
 
@@ -4656,6 +4986,13 @@ int sh_audit_l4_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
+        AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
+#endif
+
     gl4e = gp = sh_map_domain_page(gl4mfn);
     SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
     {
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/multi.h
--- a/xen/arch/x86/mm/shadow/multi.h    Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/multi.h    Fri Jun 20 15:10:08 2008 +0100
@@ -115,3 +115,13 @@ SHADOW_INTERNAL_NAME(sh_destroy_monitor_
 
 extern struct paging_mode 
 SHADOW_INTERNAL_NAME(sh_paging_mode, GUEST_LEVELS);
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+extern void 
+SHADOW_INTERNAL_NAME(sh_resync_l1, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t gmfn);
+
+extern int
+SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, GUEST_LEVELS)
+     (struct vcpu*v, mfn_t gmfn);
+#endif
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h  Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/private.h  Fri Jun 20 15:10:08 2008 +0100
@@ -63,8 +63,9 @@ extern int shadow_audit_enable;
 #define SHOPT_SKIP_VERIFY         0x20  /* Skip PTE v'fy when safe to do so */
 #define SHOPT_VIRTUAL_TLB         0x40  /* Cache guest v->p translations */
 #define SHOPT_FAST_EMULATION      0x80  /* Fast write emulation */
+#define SHOPT_OUT_OF_SYNC        0x100  /* Allow guest writes to L1 PTs */
 
-#define SHADOW_OPTIMIZATIONS      0xff
+#define SHADOW_OPTIMIZATIONS     0x1ff
 
 
 /******************************************************************************
@@ -301,6 +302,62 @@ static inline int sh_type_is_pinnable(st
 #define SHF_PAE (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE|SHF_L2H_PAE)
 #define SHF_64  (SHF_L1_64|SHF_FL1_64|SHF_L2_64|SHF_L2H_64|SHF_L3_64|SHF_L4_64)
 
+#define SHF_L1_ANY  (SHF_L1_32|SHF_L1_PAE|SHF_L1_64)
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+/* Marks a guest L1 page table which is shadowed but not write-protected.
+ * If set, then *only* L1 shadows (SHF_L1_*) are allowed. 
+ *
+ * out_of_sync indicates that the shadow tables may not reflect the
+ * guest tables.  If it is clear, then the shadow tables *must* reflect
+ * the guest tables.
+ *
+ * oos_may_write indicates that a page may have writable mappings.
+ *
+ * Most of the time the flags are synonymous.  There is a short period of time 
+ * during resync that oos_may_write is clear but out_of_sync is not.  If a 
+ * codepath is called during that time and is sensitive to oos issues, it may 
+ * need to use the second flag.
+ */
+#define SHF_out_of_sync (1u<<30)
+#define SHF_oos_may_write (1u<<29)
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
+/* Nonzero iff pg has PGC_page_table set and more than one shadow type bit. */
+static inline int sh_page_has_multiple_shadows(struct page_info *pg)
+{
+    u32 shadows;
+    if ( !(pg->count_info & PGC_page_table) )
+        return 0;
+    shadows = pg->shadow_flags & SHF_page_type_mask;
+    return (shadows & (shadows - 1)) != 0; /* clears lowest bit; ok if 0 */
+}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+/* Nonzero iff p has PGC_page_table and SHF_out_of_sync both set.  Callers
+ * must first make sure p is sane to ask about: valid mfn, translated, &c. */
+static inline int page_is_out_of_sync(struct page_info *p)
+{
+    return ( (p->count_info & PGC_page_table) ?
+             !!(p->shadow_flags & SHF_out_of_sync) : 0 );
+}
+
+static inline int mfn_is_out_of_sync(mfn_t gmfn) /* same test, by mfn */
+{
+    return page_is_out_of_sync(mfn_to_page(mfn_x(gmfn)));
+}
+
+static inline int page_oos_may_write(struct page_info *p)
+{
+    return ( (p->count_info & PGC_page_table) ? /* non-PT pages: never */
+             !!(p->shadow_flags & SHF_oos_may_write) : 0 );
+}
+
+static inline int mfn_oos_may_write(mfn_t gmfn) /* same test, by mfn */
+{
+    return page_oos_may_write(mfn_to_page(mfn_x(gmfn)));
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
 /******************************************************************************
  * Various function declarations 
@@ -351,7 +408,50 @@ int shadow_cmpxchg_guest_entry(struct vc
 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
                                intpte_t *old, intpte_t new, mfn_t gmfn);
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va);
 
+/* Pull an out-of-sync page back into sync. */
+void sh_resync(struct vcpu *v, mfn_t gmfn);
+
+/* Pull OOS shadows back into sync: v's own if `this', other vcpus' if
+ * `others'.  If skip != 0, try to avoid resyncs we can get away without. */
+void sh_resync_all(struct vcpu *v, int skip, int this, int others,
+                   int do_locking);
+
+/* Full resync: sh_resync_all() with both `this' and `others' selected
+ * and no skipping. */
+static inline void
+shadow_resync_all(struct vcpu *v, int do_locking)
+{
+    const int skip = 0, this = 1, others = 1;
+
+    sh_resync_all(v, skip, this, others, do_locking);
+}
+
+/* Resync with only `this' selected (v itself, going by the name) and
+ * no skipping. */
+static inline void
+shadow_resync_current_vcpu(struct vcpu *v, int do_locking)
+{
+    const int skip = 0, this = 1, others = 0;
+
+    sh_resync_all(v, skip, this, others, do_locking);
+}
+
+/* Resync with only `others' selected, and with skip set so that
+ * sh_resync_all() may avoid resyncs it can get away without. */
+static inline void
+shadow_sync_other_vcpus(struct vcpu *v, int do_locking)
+{
+    const int skip = 1, this = 0, others = 1;
+
+    sh_resync_all(v, skip, this, others, do_locking);
+}
+
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn);
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
 /******************************************************************************
  * Flags used in the return value of the shadow_set_lXe() functions...
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/types.h
--- a/xen/arch/x86/mm/shadow/types.h    Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/types.h    Fri Jun 20 15:10:08 2008 +0100
@@ -438,6 +438,10 @@ struct shadow_walk_t
 #define sh_guess_wrmap             INTERNAL_NAME(sh_guess_wrmap)
 #define sh_clear_shadow_entry      INTERNAL_NAME(sh_clear_shadow_entry)
 
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+#define sh_resync_l1               INTERNAL_NAME(sh_resync_l1)
+#define sh_safe_not_to_sync        INTERNAL_NAME(sh_safe_not_to_sync)
+#endif
 
 /* The sh_guest_(map|get)_* functions depends on Xen's paging levels */
 #define sh_guest_map_l1e \
diff -r 26ecd1f9e128 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/include/asm-x86/domain.h      Fri Jun 20 15:10:08 2008 +0100
@@ -103,6 +103,9 @@ struct shadow_domain {
      * emulation and remove write permission
      */
     atomic_t          gtable_dirty_version;
+
+    /* OOS */
+    int oos_active;
 };
 
 struct shadow_vcpu {
@@ -122,6 +125,10 @@ struct shadow_vcpu {
     unsigned long last_emulated_frame;
     /* Last MFN that we emulated a write successfully */
     unsigned long last_emulated_mfn;
+
+    /* Shadow out-of-sync: pages that this vcpu has let go out of sync */
+    mfn_t oos[SHADOW_OOS_PAGES];
+    unsigned long oos_va[SHADOW_OOS_PAGES];
 };
 
 /************************************************/
diff -r 26ecd1f9e128 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/include/asm-x86/mm.h  Fri Jun 20 15:10:08 2008 +0100
@@ -130,6 +130,9 @@ static inline u32 pickle_domptr(struct d
 /* The order of the largest allocation unit we use for shadow pages */
 #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
 
+/* The number of out-of-sync shadows we allow per vcpu (prime, please) */
+#define SHADOW_OOS_PAGES 7
+
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
 
diff -r 26ecd1f9e128 xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/include/asm-x86/perfc_defn.h  Fri Jun 20 15:10:08 2008 +0100
@@ -80,6 +80,7 @@ PERFCOUNTER(shadow_writeable_h_3,  "shad
 PERFCOUNTER(shadow_writeable_h_3,  "shadow writeable: 64b w2k3")
 PERFCOUNTER(shadow_writeable_h_4,  "shadow writeable: linux low/solaris")
 PERFCOUNTER(shadow_writeable_h_5,  "shadow writeable: linux high")
+PERFCOUNTER(shadow_writeable_h_6,  "shadow writeable: unsync va")
 PERFCOUNTER(shadow_writeable_bf,   "shadow writeable brute-force")
 PERFCOUNTER(shadow_mappings,       "shadow removes all mappings")
 PERFCOUNTER(shadow_mappings_bf,    "shadow rm-mappings brute-force")
@@ -101,4 +102,8 @@ PERFCOUNTER(shadow_em_ex_non_pt,   "shad
 PERFCOUNTER(shadow_em_ex_non_pt,   "shadow extra non-pt-write op")
 PERFCOUNTER(shadow_em_ex_fail,     "shadow extra emulation failed")
 
+PERFCOUNTER(shadow_unsync,         "shadow OOS unsyncs")
+PERFCOUNTER(shadow_unsync_evict,   "shadow OOS evictions")
+PERFCOUNTER(shadow_resync,         "shadow OOS resyncs")
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.