
Re: [Xen-devel] [PATCH] implement HVMOP_pagetable_dying



On Mon, 21 Jun 2010, Keir Fraser wrote:
> On 21/06/2010 17:15, "Stefano Stabellini" <Stefano.Stabellini@xxxxxxxxxxxxx>
> wrote:
> 
> > Hi all,
> > this patch implements HVMOP_pagetable_dying: a hypercall for
> > guests to notify Xen that a pagetable is about to be destroyed so that
> > Xen can use it as a hint to unshadow the pagetable soon and unhook the
> > top-level user-mode shadow entries right away.
> 
> This patch doesn't apply to xen-unstable tip.
> 
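
For reference, the guest side of the new interface is tiny: fill in the
struct and issue the hypercall. A minimal sketch, assuming the usual
HYPERVISOR_hvm_op guest wrapper (the exact wrapper name varies by guest
OS):

    #include <xen/xen.h>            /* DOMID_SELF */
    #include <xen/hvm/hvm_op.h>     /* HVMOP_pagetable_dying */

    /* Sketch: tell Xen that the top-level pagetable at guest-physical
     * address 'gpa' belongs to a process that is being torn down. */
    static void notify_pagetable_dying(uint64_t gpa)
    {
        struct xen_hvm_pagetable_dying a = {
            .domid = DOMID_SELF,    /* act on the calling domain */
            .gpa   = gpa,
        };

        /* Best-effort hint: a failure (e.g. -EINVAL on a non-shadow
         * domain) is safe to ignore. */
        (void) HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
    }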
 
here we go:


diff -r 31708477f0a9 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Mon Jun 21 09:59:10 2010 +0100
+++ b/xen/arch/x86/hvm/hvm.c    Mon Jun 21 18:49:28 2010 +0100
@@ -3153,6 +3153,30 @@
         break;
     }
 
+    case HVMOP_pagetable_dying:
+    {
+        struct xen_hvm_pagetable_dying a;
+        struct domain *d;
+
+        if ( copy_from_guest(&a, arg, 1) )
+            return -EFAULT;
+
+        rc = rcu_lock_target_domain_by_id(a.domid, &d);
+        if ( rc != 0 )
+            return rc;
+
+        rc = -EINVAL;
+        if ( !is_hvm_domain(d) || !paging_mode_shadow(d) )
+            goto param_fail5;
+
+        rc = 0;
+        pagetable_dying(d, a.gpa);
+
+    param_fail5:
+        rcu_unlock_domain(d);
+        break;
+    }
+
     default:
     {
         gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
diff -r 31708477f0a9 xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c  Mon Jun 21 09:59:10 2010 +0100
+++ b/xen/arch/x86/mm/paging.c  Mon Jun 21 18:49:28 2010 +0100
@@ -766,6 +766,18 @@
         return shadow_enable(d, mode | PG_SH_enable);
 }
 
+/* Called from the guest to indicate that a process is being torn down
+ * and therefore its pagetables will soon be discarded */
+void pagetable_dying(struct domain *d, paddr_t gpa)
+{
+    struct vcpu *v;
+
+    ASSERT(paging_mode_shadow(d));
+
+    v = d->vcpu[0];
+    v->arch.paging.mode->shadow.pagetable_dying(v, gpa);
+}
+
 /* Print paging-assistance info to the console */
 void paging_dump_domain_info(struct domain *d)
 {
diff -r 31708477f0a9 xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c   Mon Jun 21 09:59:10 2010 +0100
+++ b/xen/arch/x86/mm/shadow/common.c   Mon Jun 21 18:49:28 2010 +0100
@@ -60,6 +60,7 @@
     d->arch.paging.shadow.oos_active = 0;
     d->arch.paging.shadow.oos_off = (domcr_flags & DOMCRF_oos_off) ?  1 : 0;
 #endif
+    d->arch.paging.shadow.pagetable_dying_op = 0;
 }
 
 /* Setup the shadow-specfic parts of a vcpu struct. Note: The most important
@@ -1314,22 +1315,23 @@
 }
 
 /* Dispatcher function: call the per-mode function that will unhook the
- * non-Xen mappings in this top-level shadow mfn */
-static void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
+ * non-Xen mappings in this top-level shadow mfn.  With user_only == 1,
+ * unhooks only the user-mode mappings. */
+void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn, int user_only)
 {
     struct page_info *sp = mfn_to_page(smfn);
     switch ( sp->u.sh.type )
     {
     case SH_type_l2_32_shadow:
-        SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(v,smfn);
+        SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(v, smfn, user_only);
         break;
     case SH_type_l2_pae_shadow:
     case SH_type_l2h_pae_shadow:
-        SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(v,smfn);
+        SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(v, smfn, user_only);
         break;
 #if CONFIG_PAGING_LEVELS >= 4
     case SH_type_l4_64_shadow:
-        SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(v,smfn);
+        SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(v, smfn, user_only);
         break;
 #endif
     default:
@@ -1399,7 +1401,7 @@
             {
                 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK);
                 shadow_unhook_mappings(v, 
-                               pagetable_get_mfn(v2->arch.shadow_table[i]));
+                               pagetable_get_mfn(v2->arch.shadow_table[i]), 0);
 
                 /* See if that freed up enough space */
                 if ( space_is_available(d, order, count) )
@@ -1454,7 +1456,7 @@
         for ( i = 0 ; i < 4 ; i++ )
             if ( !pagetable_is_null(v->arch.shadow_table[i]) )
                 shadow_unhook_mappings(v, 
-                               pagetable_get_mfn(v->arch.shadow_table[i]));
+                               pagetable_get_mfn(v->arch.shadow_table[i]), 0);
 
     /* Make sure everyone sees the unshadowings */
     flush_tlb_mask(&d->domain_dirty_cpumask);
diff -r 31708477f0a9 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Mon Jun 21 09:59:10 2010 +0100
+++ b/xen/arch/x86/mm/shadow/multi.c    Mon Jun 21 18:49:28 2010 +0100
@@ -2179,37 +2179,43 @@
  * These are called from common code when we are running out of shadow
  * memory, and unpinning all the top-level shadows hasn't worked. 
  *
+ * With user_only == 1, we leave guest kernel-mode mappings in place too,
+ * unhooking only the user-mode mappings.
+ *
  * This implementation is pretty crude and slow, but we hope that it won't 
  * be called very often. */
 
 #if GUEST_PAGING_LEVELS == 2
 
-void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
+void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn, int user_only)
 {    
     shadow_l2e_t *sl2e;
     SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
-        (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+        if ( !user_only || (sl2e->l2 & _PAGE_USER) )
+            (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
     });
 }
 
 #elif GUEST_PAGING_LEVELS == 3
 
-void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
+void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn, int user_only)
 /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
 {
     shadow_l2e_t *sl2e;
     SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
-        (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+        if ( !user_only || (sl2e->l2 & _PAGE_USER) )
+            (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
     });
 }
 
 #elif GUEST_PAGING_LEVELS == 4
 
-void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
+void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn, int user_only)
 {
     shadow_l4e_t *sl4e;
     SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
-        (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
+        if ( !user_only || (sl4e->l4 & _PAGE_USER) )
+            (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
     });
 }
 
@@ -2693,8 +2699,18 @@
 static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
 {
 #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
-    if ( v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn)
-         && sh_mfn_is_a_page_table(gmfn) )
+    /* If the domain has never made a "dying" op, use the two-writes
+     * heuristic; otherwise, unshadow as soon as we write a zero for a dying
+     * process.
+     *
+     * Don't bother trying to unshadow if it's not a PT, or if it's > l1.
+     */
+    if ( ( v->arch.paging.shadow.pagetable_dying
+           || ( !v->domain->arch.paging.shadow.pagetable_dying_op
+                && v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn) ) )
+         && sh_mfn_is_a_page_table(gmfn)
+         && !(mfn_to_page(gmfn)->shadow_flags
+              & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64)) )
     {
         perfc_incr(shadow_early_unshadow);
         sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
@@ -3384,6 +3400,40 @@
      * caught by user-mode page-table check above.
      */
  emulate_readonly:
+
+    /* Unshadow if we are writing to a toplevel pagetable flagged as
+     * belonging to a dying process, and not currently in use. */
+    if ( sh_mfn_is_a_page_table(gmfn)
+         && (mfn_to_page(gmfn)->shadow_flags & SHF_pagetable_dying) )
+    {
+        int used = 0;
+        struct vcpu *tmp;
+        for_each_vcpu(d, tmp)
+        {
+#if GUEST_PAGING_LEVELS == 3
+            int i;
+            for ( i = 0; i < 4; i++ )
+            {
+                mfn_t smfn = _mfn(pagetable_get_pfn(tmp->arch.shadow_table[i]));
+                if ( mfn_valid(smfn) && (mfn_x(smfn) != 0) )
+                {
+                    used |= (mfn_to_page(smfn)->v.sh.back == mfn_x(gmfn));
+
+                    if ( used )
+                        break;
+                }
+            }
+#else /* 32 or 64 */
+            used = (mfn_x(pagetable_get_mfn(tmp->arch.guest_table)) == mfn_x(gmfn));
+#endif
+            if ( used )
+                break;
+        }
+
+        if ( !used )
+            sh_remove_shadows(v, gmfn, 1 /* fast */, 0 /* can fail */);
+    }
+
     /*
      * We don't need to hold the lock for the whole emulation; we will
      * take it again when we write to the pagetables.
@@ -4033,6 +4083,11 @@
         smfn = sh_make_shadow(v, gmfn, root_type);
     }
     ASSERT(mfn_valid(smfn));
+
+    /* Remember if we've been told that this process is being torn down */
+    v->arch.paging.shadow.pagetable_dying
+        = !!(mfn_to_page(gmfn)->shadow_flags & SHF_pagetable_dying);
+
     
     /* Pin the shadow and put it (back) on the list of pinned shadows */
     if ( sh_pin(v, smfn) == 0 )
@@ -4603,6 +4658,110 @@
 #endif /* 64bit guest */ 
 
 /**************************************************************************/
+/* Function for the guest to inform us that a process is being torn
+ * down.  We remember that as a hint to unshadow its pagetables soon,
+ * and in the meantime we unhook its top-level user-mode entries. */
+
+#if GUEST_PAGING_LEVELS == 3
+static void sh_pagetable_dying(struct vcpu *v, paddr_t gpa)
+{
+    int i = 0;
+    int flush = 0;
+    int fast_path = 0;
+    paddr_t gcr3 = 0;
+    mfn_t smfn, gmfn;
+    p2m_type_t p2mt;
+    unsigned long gl3pa;
+    guest_l3e_t *gl3e = NULL;
+    paddr_t gl2a = 0;
+
+    shadow_lock(v->domain);
+
+    gcr3 = (v->arch.hvm_vcpu.guest_cr[3]);
+    /* fast path: the pagetable belongs to the current context */
+    if ( gcr3 == gpa )
+        fast_path = 1;
+
+    gmfn = gfn_to_mfn_query(v->domain, _gfn(gpa >> PAGE_SHIFT), &p2mt);
+    if ( !mfn_valid(gmfn) || !p2m_is_ram(p2mt) )
+    {
+        printk(XENLOG_DEBUG "sh_pagetable_dying: gpa not valid %"PRIpaddr"\n", gpa);
+        goto out;
+    }
+    if ( !fast_path )
+    {
+        gl3pa = (unsigned long) sh_map_domain_page(gmfn);
+        gl3e = (guest_l3e_t *) (gl3pa + (gpa & ~PAGE_MASK));
+    }
+    for ( i = 0; i < 4; i++ )
+    {
+        if ( fast_path )
+            smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[i]));
+        else
+        {
+            /* retrieving the l2s */
+            gl2a = guest_l3e_get_paddr(gl3e[i]);
+            gmfn = gfn_to_mfn_query(v->domain, _gfn(gl2a >> PAGE_SHIFT), &p2mt);
+            smfn = shadow_hash_lookup(v, mfn_x(gmfn), SH_type_l2_pae_shadow);
+        }
+
+        if ( mfn_valid(smfn) )
+        {
+            gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
+            mfn_to_page(gmfn)->shadow_flags |= SHF_pagetable_dying;
+            shadow_unhook_mappings(v, smfn, 1/* user pages only */);
+            flush = 1;
+        }
+    }
+    if ( flush )
+        flush_tlb_mask(&v->domain->domain_dirty_cpumask);
+
+    /* Remember that we've seen the guest use this interface, so we
+     * can rely on it in future instead of guessing when processes
+     * are being torn down. */
+    v->domain->arch.paging.shadow.pagetable_dying_op = 1;
+
+    v->arch.paging.shadow.pagetable_dying = 1;
+
+ out:
+    if ( gl3e != NULL )
+        sh_unmap_domain_page(gl3e);
+    shadow_unlock(v->domain);
+}
+#else
+static void sh_pagetable_dying(struct vcpu *v, paddr_t gpa)
+{
+    mfn_t smfn, gmfn;
+    p2m_type_t p2mt;
+
+    shadow_lock(v->domain);
+
+    gmfn = gfn_to_mfn_query(v->domain, _gfn(gpa >> PAGE_SHIFT), &p2mt);
+#if GUEST_PAGING_LEVELS == 2
+    smfn = shadow_hash_lookup(v, mfn_x(gmfn), SH_type_l2_32_shadow);
+#else
+    smfn = shadow_hash_lookup(v, mfn_x(gmfn), SH_type_l4_64_shadow);
+#endif
+    if ( mfn_valid(smfn) )
+    {
+        mfn_to_page(gmfn)->shadow_flags |= SHF_pagetable_dying;
+        shadow_unhook_mappings(v, smfn, 1/* user pages only */);
+        /* Now flush the TLB: we removed toplevel mappings. */
+        flush_tlb_mask(&v->domain->domain_dirty_cpumask);
+    }
+
+    /* Remember that we've seen the guest use this interface, so we
+     * can rely on it in future instead of guessing when processes
+     * are being torn down. */
+    v->domain->arch.paging.shadow.pagetable_dying_op = 1;
+
+    v->arch.paging.shadow.pagetable_dying = 1;
+
+    shadow_unlock(v->domain);
+}
+#endif
+
+/**************************************************************************/
 /* Handling HVM guest writes to pagetables  */
 
 /* Translate a VA to an MFN, injecting a page-fault if we fail */
@@ -5247,6 +5406,7 @@
 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
     .shadow.guess_wrmap            = sh_guess_wrmap,
 #endif
+    .shadow.pagetable_dying        = sh_pagetable_dying,
     .shadow.shadow_levels          = SHADOW_PAGING_LEVELS,
 };
 
diff -r 31708477f0a9 xen/arch/x86/mm/shadow/multi.h
--- a/xen/arch/x86/mm/shadow/multi.h    Mon Jun 21 09:59:10 2010 +0100
+++ b/xen/arch/x86/mm/shadow/multi.h    Mon Jun 21 18:49:28 2010 +0100
@@ -52,13 +52,13 @@
 
 extern void 
 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, GUEST_LEVELS)
-    (struct vcpu *v, mfn_t sl2mfn);
+    (struct vcpu *v, mfn_t sl2mfn, int user_only);
 extern void 
 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, GUEST_LEVELS)
-    (struct vcpu *v, mfn_t sl3mfn);
+    (struct vcpu *v, mfn_t sl3mfn, int user_only);
 extern void 
 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, GUEST_LEVELS)
-    (struct vcpu *v, mfn_t sl4mfn);
+    (struct vcpu *v, mfn_t sl4mfn, int user_only);
 
 extern int
 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, GUEST_LEVELS)
diff -r 31708477f0a9 xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h  Mon Jun 21 09:59:10 2010 +0100
+++ b/xen/arch/x86/mm/shadow/private.h  Mon Jun 21 18:49:28 2010 +0100
@@ -321,6 +321,8 @@
 
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
+#define SHF_pagetable_dying (1u<<31)
+
 static inline int sh_page_has_multiple_shadows(struct page_info *pg)
 {
     u32 shadows;
@@ -406,6 +408,10 @@
 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
                                intpte_t *old, intpte_t new, mfn_t gmfn);
 
+/* Unhook the non-Xen mappings in this top-level shadow mfn.
+ * With user_only == 1, unhooks only the user-mode mappings. */
+void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn, int user_only);
+
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
 /* Allow a shadowed page to go out of sync */
 int sh_unsync(struct vcpu *v, mfn_t gmfn);
diff -r 31708477f0a9 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Mon Jun 21 09:59:10 2010 +0100
+++ b/xen/include/asm-x86/domain.h      Mon Jun 21 18:49:28 2010 +0100
@@ -121,6 +121,8 @@
     /* OOS */
     int oos_active;
     int oos_off;
+
+    int pagetable_dying_op;
 };
 
 struct shadow_vcpu {
@@ -149,6 +151,8 @@
         mfn_t smfn[SHADOW_OOS_FIXUPS];
         unsigned long off[SHADOW_OOS_FIXUPS];
     } oos_fixup[SHADOW_OOS_PAGES];
+
+    int pagetable_dying;
 };
 
 /************************************************/
diff -r 31708477f0a9 xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h      Mon Jun 21 09:59:10 2010 +0100
+++ b/xen/include/asm-x86/paging.h      Mon Jun 21 18:49:28 2010 +0100
@@ -95,6 +95,7 @@
     void          (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn);
     int           (*guess_wrmap           )(struct vcpu *v, 
                                             unsigned long vaddr, mfn_t gmfn);
+    void          (*pagetable_dying       )(struct vcpu *v, paddr_t gpa);
     /* For outsiders to tell what mode we're in */
     unsigned int shadow_levels;
 };
@@ -342,6 +343,10 @@
         safe_write_pte(p, new);
 }
 
+/* Called from the guest to indicate that a process is being
+ * torn down and its pagetables will soon be discarded */
+void pagetable_dying(struct domain *d, paddr_t gpa);
+
 /* Print paging-assistance info to the console */
 void paging_dump_domain_info(struct domain *d);
 void paging_dump_vcpu_info(struct vcpu *v);
diff -r 31708477f0a9 xen/include/public/hvm/hvm_op.h
--- a/xen/include/public/hvm/hvm_op.h   Mon Jun 21 09:59:10 2010 +0100
+++ b/xen/include/public/hvm/hvm_op.h   Mon Jun 21 18:49:28 2010 +0100
@@ -127,6 +127,16 @@
 typedef struct xen_hvm_set_mem_type xen_hvm_set_mem_type_t;
 DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_mem_type_t);
 
+/* Hint from PV drivers for pagetable destruction. */
+#define HVMOP_pagetable_dying        9
+struct xen_hvm_pagetable_dying {
+    /* Domain with a pagetable about to be destroyed. */
+    domid_t  domid;
+    /* Guest physical address of the top-level pagetable that is dying. */
+    uint64_aligned_t gpa;
+};
+typedef struct xen_hvm_pagetable_dying xen_hvm_pagetable_dying_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_pagetable_dying_t);
 
 #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
 

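To summarise the behavioural change in check_for_early_unshadow(): once a
domain has used HVMOP_pagetable_dying at least once, the old
two-emulated-writes guess is disabled, and early unshadowing keys off the
per-vcpu pagetable_dying flag instead. A simplified standalone model of
that decision, for illustration only (not the Xen code itself):

    /* Model of the early-unshadow trigger after this patch.
     * domain_used_dying_op: guest has ever called HVMOP_pagetable_dying
     * vcpu_saw_dying_flag:  last cr3 load hit a SHF_pagetable_dying page
     * repeated_write:       two consecutive emulated writes to the same
     *                       mfn (the old heuristic)
     * is_low_level_pt:      mfn is a pagetable but not a top-level shadow */
    static int should_early_unshadow(int domain_used_dying_op,
                                     int vcpu_saw_dying_flag,
                                     int repeated_write,
                                     int is_low_level_pt)
    {
        if ( !is_low_level_pt )
            return 0;   /* top-level pagetables are skipped here */
        if ( vcpu_saw_dying_flag )
            return 1;   /* guest told us: unshadow on the first write */
        return !domain_used_dying_op && repeated_write;
    }
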
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel