
[Xen-changelog] [xen-unstable] x86/mm: Rework locking in the PoD layer



# HG changeset patch
# User Andres Lagar-Cavilla <andres@xxxxxxxxxxxxxxxx>
# Date 1328890027 0
# Node ID 28edc2b31a9bc50bae915129dfbd315a73260a09
# Parent  a55ea8dc9c2566eabd19211c7debe575b63be0e3
x86/mm: Rework locking in the PoD layer

The PoD layer has a complex locking discipline. It relies on the
p2m being globally locked, and it also relies on the page alloc
lock to protect some of its data structures. Replace all of this
with an explicit pod lock: one per p2m, with its ordering enforced.

Three consequences:
    - Critical sections in the pod code protected by the page alloc
      lock are now reduced to modifications of the domain page list.
    - When the p2m lock becomes fine-grained, no assumptions in the
      PoD layer are broken.
    - The locking is easier to understand.

Signed-off-by: Andres Lagar-Cavilla <andres@xxxxxxxxxxxxxxxx>
Acked-by: George Dunlap <george.dunlap@xxxxxxxxxxxxx>
Committed-by: Tim Deegan <tim@xxxxxxx>
---


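As a rough, standalone sketch of the discipline this patch establishes (an
illustration only, not code from the tree): a path that returns pages to the
PoD cache holds the per-p2m pod lock for all cache bookkeeping, and takes the
domain's page_alloc lock only around the page-list update, nested inside it.
The helper name and simplified body below are assumptions; pod_lock(),
pod_locked_by_me(), lock_page_alloc(), the pod.super/pod.single lists and
pod.count are as used or introduced by the patch.

/* Sketch only: condensed from the reworked p2m_pod_cache_add(). */
static int pod_cache_add_sketch(struct p2m_domain *p2m,
                                struct page_info *page, unsigned long order)
{
    struct domain *d = p2m->domain;
    unsigned long i;

    /* Caller holds the per-p2m pod lock; it covers the PoD cache
     * lists and the entry/cache counts. */
    ASSERT(pod_locked_by_me(p2m));

    /* The page_alloc lock is now held only around the domain
     * page-list manipulation, nested inside the pod lock. */
    lock_page_alloc(p2m);
    for ( i = 0; i < (1UL << order); i++ )
        page_list_del(page + i, &d->page_list);
    unlock_page_alloc(p2m);

    /* PoD cache bookkeeping needs only the pod lock.  (The real code
     * handles only the 4k and 2M orders and BUG()s on anything else.) */
    page_list_add_tail(page, (order == PAGE_ORDER_2M) ? &p2m->pod.super
                                                      : &p2m->pod.single);
    p2m->pod.count += 1UL << order;

    return 0;
}

Within that discipline the page_alloc lock is always innermost; as the NOTE
added in p2m_pod_zero_check_superpage() explains, no fixed order is enforced
between the p2m/gfn locks and the pod lock, since sweeps take gfn locks while
the pod lock is held.
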
diff -r a55ea8dc9c25 -r 28edc2b31a9b xen/arch/x86/mm/mm-locks.h
--- a/xen/arch/x86/mm/mm-locks.h        Fri Feb 10 16:07:07 2012 +0000
+++ b/xen/arch/x86/mm/mm-locks.h        Fri Feb 10 16:07:07 2012 +0000
@@ -194,6 +194,16 @@
  * - setting the "cr3" field of any p2m table to a non-CR3_EADDR value. 
  *   (i.e. assigning a p2m table to be the shadow of that cr3 */
 
+/* PoD lock (per-p2m-table)
+ * 
+ * Protects private PoD data structs: entry and cache
+ * counts, page lists, sweep parameters. */
+
+declare_mm_lock(pod)
+#define pod_lock(p)           mm_lock(pod, &(p)->pod.lock)
+#define pod_unlock(p)         mm_unlock(&(p)->pod.lock)
+#define pod_locked_by_me(p)   mm_locked_by_me(&(p)->pod.lock)
+
 /* Page alloc lock (per-domain)
  *
  * This is an external lock, not represented by an mm_lock_t. However, 
diff -r a55ea8dc9c25 -r 28edc2b31a9b xen/arch/x86/mm/p2m-pod.c
--- a/xen/arch/x86/mm/p2m-pod.c Fri Feb 10 16:07:07 2012 +0000
+++ b/xen/arch/x86/mm/p2m-pod.c Fri Feb 10 16:07:07 2012 +0000
@@ -100,7 +100,7 @@
     }
 #endif
 
-    ASSERT(p2m_locked_by_me(p2m));
+    ASSERT(pod_locked_by_me(p2m));
 
     /*
      * Pages from domain_alloc and returned by the balloon driver aren't
@@ -114,15 +114,16 @@
         unmap_domain_page(b);
     }
 
+    /* First, take all pages off the domain list */
     lock_page_alloc(p2m);
-
-    /* First, take all pages off the domain list */
     for(i=0; i < 1 << order ; i++)
     {
         p = page + i;
         page_list_del(p, &d->page_list);
     }
 
+    unlock_page_alloc(p2m);
+
     /* Then add the first one to the appropriate populate-on-demand list */
     switch(order)
     {
@@ -138,25 +139,20 @@
         BUG();
     }
 
-    /* Ensure that the PoD cache has never been emptied.  
-     * This may cause "zombie domains" since the page will never be freed. */
-    BUG_ON( d->arch.relmem != RELMEM_not_started );
-
-    unlock_page_alloc(p2m);
-
     return 0;
 }
 
 /* Get a page of size order from the populate-on-demand cache.  Will break
  * down 2-meg pages into singleton pages automatically.  Returns null if
- * a superpage is requested and no superpages are available.  Must be called
- * with the d->page_lock held. */
+ * a superpage is requested and no superpages are available. */
 static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
                                             unsigned long order)
 {
     struct page_info *p = NULL;
     int i;
 
+    ASSERT(pod_locked_by_me(p2m));
+
     if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) )
     {
         return NULL;
@@ -185,7 +181,7 @@
     case PAGE_ORDER_2M:
         BUG_ON( page_list_empty(&p2m->pod.super) );
         p = page_list_remove_head(&p2m->pod.super);
-        p2m->pod.count -= 1 << order; /* Lock: page_alloc */
+        p2m->pod.count -= 1 << order;
         break;
     case PAGE_ORDER_4K:
         BUG_ON( page_list_empty(&p2m->pod.single) );
@@ -197,11 +193,13 @@
     }
 
     /* Put the pages back on the domain page_list */
+    lock_page_alloc(p2m);
     for ( i = 0 ; i < (1 << order); i++ )
     {
         BUG_ON(page_get_owner(p + i) != p2m->domain);
         page_list_add_tail(p + i, &p2m->domain->page_list);
     }
+    unlock_page_alloc(p2m);
 
     return p;
 }
@@ -213,6 +211,8 @@
     struct domain *d = p2m->domain;
     int ret = 0;
 
+    ASSERT(pod_locked_by_me(p2m));
+
     /* Increasing the target */
     while ( pod_target > p2m->pod.count )
     {
@@ -250,17 +250,13 @@
     }
 
     /* Decreasing the target */
-    /* We hold the p2m lock here, so we don't need to worry about
+    /* We hold the pod lock here, so we don't need to worry about
      * cache disappearing under our feet. */
     while ( pod_target < p2m->pod.count )
     {
         struct page_info * page;
         int order, i;
 
-        /* Grab the lock before checking that pod.super is empty, or the last
-         * entries may disappear before we grab the lock. */
-        lock_page_alloc(p2m);
-
         if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
              && !page_list_empty(&p2m->pod.super) )
             order = PAGE_ORDER_2M;
@@ -271,8 +267,6 @@
 
         ASSERT(page != NULL);
 
-        unlock_page_alloc(p2m);
-
         /* Then free them */
         for ( i = 0 ; i < (1 << order) ; i++ )
         {
@@ -348,7 +342,7 @@
     int ret = 0;
     unsigned long populated;
 
-    p2m_lock(p2m);
+    pod_lock(p2m);
 
     /* P == B: Nothing to do. */
     if ( p2m->pod.entry_count == 0 )
@@ -377,7 +371,7 @@
     ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
 
 out:
-    p2m_unlock(p2m);
+    pod_unlock(p2m);
 
     return ret;
 }
@@ -390,7 +384,7 @@
 
     /* After this barrier no new PoD activities can happen. */
     BUG_ON(!d->is_dying);
-    spin_barrier(&p2m->lock.lock);
+    spin_barrier(&p2m->pod.lock.lock);
 
     lock_page_alloc(p2m);
 
@@ -431,7 +425,7 @@
     if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
         return 0;
 
-    lock_page_alloc(p2m);
+    pod_lock(p2m);
     bmfn = mfn_x(page_to_mfn(p));
     page_list_for_each_safe(q, tmp, &p2m->pod.super)
     {
@@ -462,12 +456,14 @@
         }
     }
 
-    unlock_page_alloc(p2m);
+    pod_unlock(p2m);
     return 0;
 
 pod_hit:
+    lock_page_alloc(p2m);
     page_list_add_tail(p, &d->arch.relmem_list);
     unlock_page_alloc(p2m);
+    pod_unlock(p2m);
     return 1;
 }
 
@@ -486,9 +482,9 @@
     if ( unlikely(!p) )
         return;
 
-    p2m_lock(p2m);
+    pod_lock(p2m);
     p2m_pod_cache_add(p2m, p, PAGE_ORDER_4K);
-    p2m_unlock(p2m);
+    pod_unlock(p2m);
     return;
 }
 
@@ -511,18 +507,18 @@
 
     int steal_for_cache = 0;
     int pod = 0, nonpod = 0, ram = 0;
-    
+
+    gfn_lock(p2m, gpfn, order);
+    pod_lock(p2m);    
 
     /* If we don't have any outstanding PoD entries, let things take their
      * course */
     if ( p2m->pod.entry_count == 0 )
-        goto out;
+        goto out_unlock;
 
     /* Figure out if we need to steal some freed memory for our cache */
     steal_for_cache =  ( p2m->pod.entry_count > p2m->pod.count );
 
-    gfn_lock(p2m, gpfn, order);
-
     if ( unlikely(d->is_dying) )
         goto out_unlock;
 
@@ -554,7 +550,7 @@
         /* All PoD: Mark the whole region invalid and tell caller
          * we're done. */
         set_p2m_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, p2m->default_access);
-        p2m->pod.entry_count-=(1<<order); /* Lock: p2m */
+        p2m->pod.entry_count-=(1<<order);
         BUG_ON(p2m->pod.entry_count < 0);
         ret = 1;
         goto out_entry_check;
@@ -578,7 +574,7 @@
         if ( t == p2m_populate_on_demand )
         {
             set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
-            p2m->pod.entry_count--; /* Lock: p2m */
+            p2m->pod.entry_count--;
             BUG_ON(p2m->pod.entry_count < 0);
             pod--;
         }
@@ -615,9 +611,8 @@
     }
 
 out_unlock:
+    pod_unlock(p2m);
     gfn_unlock(p2m, gpfn, order);
-
-out:
     return ret;
 }
 
@@ -630,7 +625,8 @@
 
 
 /* Search for all-zero superpages to be reclaimed as superpages for the
- * PoD cache. Must be called w/ p2m lock held, page_alloc lock not held. */
+ * PoD cache. Must be called w/ pod lock held, must lock the superpage
+ * in the p2m */
 static int
 p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
 {
@@ -642,6 +638,8 @@
     int max_ref = 1;
     struct domain *d = p2m->domain;
 
+    ASSERT(pod_locked_by_me(p2m));
+
     if ( !superpage_aligned(gfn) )
         goto out;
 
@@ -649,6 +647,10 @@
     if ( paging_mode_shadow(d) )
         max_ref++;
 
+    /* NOTE: this is why we don't enforce deadlock constraints between p2m 
+     * and pod locks */
+    gfn_lock(p2m, gfn, SUPERPAGE_ORDER);
+
     /* Look up the mfns, checking to make sure they're the same mfn
      * and aligned, and mapping them. */
     for ( i=0; i<SUPERPAGE_PAGES; i++ )
@@ -761,6 +763,7 @@
         set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access);
     
 out:
+    gfn_unlock(p2m, gfn, SUPERPAGE_ORDER);
     return ret;
 }
 
@@ -922,6 +925,10 @@
     limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
 
     /* FIXME: Figure out how to avoid superpages */
+    /* NOTE: Promote to globally locking the p2m. This will get complicated
+     * in a fine-grained scenario. If we lock each gfn individually we must be
+     * careful about spinlock recursion limits and POD_SWEEP_STRIDE. */
+    p2m_lock(p2m);
     for ( i=p2m->pod.reclaim_single; i > 0 ; i-- )
     {
         p2m_access_t a;
@@ -940,7 +947,7 @@
         /* Stop if we're past our limit and we have found *something*.
          *
          * NB that this is a zero-sum game; we're increasing our cache size
-         * by re-increasing our 'debt'.  Since we hold the p2m lock,
+         * by re-increasing our 'debt'.  Since we hold the pod lock,
          * (entry_count - count) must remain the same. */
         if ( p2m->pod.count > 0 && i < limit )
             break;
@@ -949,6 +956,7 @@
     if ( j )
         p2m_pod_zero_check(p2m, gfns, j);
 
+    p2m_unlock(p2m);
     p2m->pod.reclaim_single = i ? i - 1 : i;
 
 }
@@ -965,8 +973,9 @@
     int i;
 
     ASSERT(gfn_locked_by_me(p2m, gfn));
+    pod_lock(p2m);
 
-    /* This check is done with the p2m lock held.  This will make sure that
+    /* This check is done with the pod lock held.  This will make sure that
      * even if d->is_dying changes under our feet, p2m_pod_empty_cache() 
      * won't start until we're done. */
     if ( unlikely(d->is_dying) )
@@ -977,6 +986,7 @@
      * 1GB region to 2MB chunks for a retry. */
     if ( order == PAGE_ORDER_1G )
     {
+        pod_unlock(p2m);
         gfn_aligned = (gfn >> order) << order;
         /* Note that we are supposed to call set_p2m_entry() 512 times to 
          * split 1GB into 512 2MB pages here. But We only do once here because
@@ -1000,11 +1010,15 @@
 
         /* If we're low, start a sweep */
         if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) )
+            /* Note that sweeps scan other ranges in the p2m. In a scenario
+             * in which p2m locks are fine-grained, this may result in deadlock.
+             * Using trylock on the gfn's as we sweep would avoid it. */
             p2m_pod_emergency_sweep_super(p2m);
 
         if ( page_list_empty(&p2m->pod.single) &&
              ( ( order == PAGE_ORDER_4K )
                || (order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) ) ) )
+            /* Same comment regarding deadlock applies */
             p2m_pod_emergency_sweep(p2m);
     }
 
@@ -1012,8 +1026,6 @@
     if ( q == p2m_guest && gfn > p2m->pod.max_guest )
         p2m->pod.max_guest = gfn;
 
-    lock_page_alloc(p2m);
-
     if ( p2m->pod.count == 0 )
         goto out_of_memory;
 
@@ -1026,8 +1038,6 @@
 
     BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
 
-    unlock_page_alloc(p2m);
-
     gfn_aligned = (gfn >> order) << order;
 
     set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, p2m->default_access);
@@ -1038,7 +1048,7 @@
         paging_mark_dirty(d, mfn_x(mfn) + i);
     }
     
-    p2m->pod.entry_count -= (1 << order); /* Lock: p2m */
+    p2m->pod.entry_count -= (1 << order);
     BUG_ON(p2m->pod.entry_count < 0);
 
     if ( tb_init_done )
@@ -1056,20 +1066,24 @@
         __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
     }
 
+    pod_unlock(p2m);
     return 0;
 out_of_memory:
-    unlock_page_alloc(p2m);
+    pod_unlock(p2m);
 
     printk("%s: Out of populate-on-demand memory! tot_pages %" PRIu32 " pod_entries %" PRIi32 "\n",
            __func__, d->tot_pages, p2m->pod.entry_count);
     domain_crash(d);
 out_fail:
+    pod_unlock(p2m);
     return -1;
 remap_and_retry:
     BUG_ON(order != PAGE_ORDER_2M);
-    unlock_page_alloc(p2m);
+    pod_unlock(p2m);
 
     /* Remap this 2-meg region in singleton chunks */
+    /* NOTE: In a p2m fine-grained lock scenario this might
+     * need promoting the gfn lock from gfn->2M superpage */
     gfn_aligned = (gfn>>order)<<order;
     for(i=0; i<(1<<order); i++)
         set_p2m_entry(p2m, gfn_aligned+i, _mfn(0), PAGE_ORDER_4K,
@@ -1137,9 +1151,11 @@
         rc = -EINVAL;
     else
     {
-        p2m->pod.entry_count += 1 << order; /* Lock: p2m */
+        pod_lock(p2m);
+        p2m->pod.entry_count += 1 << order;
         p2m->pod.entry_count -= pod_count;
         BUG_ON(p2m->pod.entry_count < 0);
+        pod_unlock(p2m);
     }
 
     gfn_unlock(p2m, gfn, order);
diff -r a55ea8dc9c25 -r 28edc2b31a9b xen/arch/x86/mm/p2m-pt.c
--- a/xen/arch/x86/mm/p2m-pt.c  Fri Feb 10 16:07:07 2012 +0000
+++ b/xen/arch/x86/mm/p2m-pt.c  Fri Feb 10 16:07:07 2012 +0000
@@ -954,6 +954,7 @@
     struct domain *d = p2m->domain;
 
     ASSERT(p2m_locked_by_me(p2m));
+    ASSERT(pod_locked_by_me(p2m));
 
     test_linear = ( (d == current->domain)
                     && !pagetable_is_null(current->arch.monitor_table) );
diff -r a55ea8dc9c25 -r 28edc2b31a9b xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Fri Feb 10 16:07:07 2012 +0000
+++ b/xen/arch/x86/mm/p2m.c     Fri Feb 10 16:07:07 2012 +0000
@@ -72,6 +72,7 @@
 static void p2m_initialise(struct domain *d, struct p2m_domain *p2m)
 {
     mm_lock_init(&p2m->lock);
+    mm_lock_init(&p2m->pod.lock);
     INIT_LIST_HEAD(&p2m->np2m_list);
     INIT_PAGE_LIST_HEAD(&p2m->pages);
     INIT_PAGE_LIST_HEAD(&p2m->pod.super);
@@ -568,8 +569,10 @@
             rc = -EINVAL;
         else
         {
-            p2m->pod.entry_count -= pod_count; /* Lock: p2m */
+            pod_lock(p2m);
+            p2m->pod.entry_count -= pod_count;
             BUG_ON(p2m->pod.entry_count < 0);
+            pod_unlock(p2m);
         }
     }
 
@@ -1350,6 +1353,7 @@
     /* "Host" p2m tables can have shared entries &c that need a bit more 
      * care when discarding them */
     ASSERT(p2m_is_nestedp2m(p2m));
+    /* Nested p2m's do not do pod, hence the asserts (and no pod lock)*/
     ASSERT(page_list_empty(&p2m->pod.super));
     ASSERT(page_list_empty(&p2m->pod.single));
 
@@ -1507,6 +1511,7 @@
     P2M_PRINTK("p2m audit starts\n");
 
     p2m_lock(p2m);
+    pod_lock(p2m);
 
     if (p2m->audit_p2m)
         pmbad = p2m->audit_p2m(p2m);
@@ -1567,6 +1572,7 @@
     }
     spin_unlock(&d->page_alloc_lock);
 
+    pod_unlock(p2m);
     p2m_unlock(p2m);
  
     P2M_PRINTK("p2m audit complete\n");
diff -r a55ea8dc9c25 -r 28edc2b31a9b xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h Fri Feb 10 16:07:07 2012 +0000
+++ b/xen/include/asm-x86/p2m.h Fri Feb 10 16:07:07 2012 +0000
@@ -261,25 +261,12 @@
     unsigned long max_mapped_pfn;
 
     /* Populate-on-demand variables
-     * NB on locking.  {super,single,count} are
-     * covered by d->page_alloc_lock, since they're almost always used in
-     * conjunction with that functionality.  {entry_count} is covered by
-     * the domain p2m lock, since it's almost always used in conjunction
-     * with changing the p2m tables.
-     *
-     * At this point, both locks are held in two places.  In both,
-     * the order is [p2m,page_alloc]:
-     * + p2m_pod_decrease_reservation() calls p2m_pod_cache_add(),
-     *   which grabs page_alloc
-     * + p2m_pod_demand_populate() grabs both; the p2m lock to avoid
-     *   double-demand-populating of pages, the page_alloc lock to
-     *   protect moving stuff from the PoD cache to the domain page list.
-     *
-     * We enforce this lock ordering through a construct in mm-locks.h.
-     * This demands, however, that we store the previous lock-ordering
-     * level in effect before grabbing the page_alloc lock. The unlock
-     * level is stored in the arch section of the domain struct.
-     */
+     * All variables are protected with the pod lock. We cannot rely on
+     * the p2m lock if it's turned into a fine-grained lock.
+     * We only use the domain page_alloc lock for additions and 
+     * deletions to the domain's page list. Because we use it nested
+     * within the PoD lock, we enforce its ordering (by remembering
+     * the unlock level in the arch_domain sub struct). */
     struct {
         struct page_list_head super,   /* List of superpages                */
                          single;       /* Non-super lists                   */
@@ -288,6 +275,8 @@
         unsigned         reclaim_super; /* Last gpfn of a scan */
         unsigned         reclaim_single; /* Last gpfn of a scan */
         unsigned         max_guest;    /* gpfn of max guest demand-populate */
+        mm_lock_t        lock;         /* Locking of private pod structs,   *
+                                        * not relying on the p2m lock.      */
     } pod;
 };
 

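The "order enforced" part of the change relies on the existing mm-locks.h
machinery that the new declare_mm_lock(pod) line hooks into. As a minimal,
hypothetical sketch of that general level-based idea (illustrative names and
pthreads here, not Xen's actual mm-locks.h internals): each lock gets a fixed
level, acquiring a lock at or below the current level is flagged as a bug, and
the previous level is remembered so it can be restored on unlock.

#include <assert.h>
#include <pthread.h>

/* Hypothetical illustration of level-based lock ordering (not Xen code). */

static __thread int cur_lock_level = -1;     /* per-CPU in the hypervisor */

struct ordered_lock {
    pthread_mutex_t mtx;     /* stands in for the underlying spinlock */
    int level;               /* fixed position in the global lock order */
    int unlock_level;        /* level to restore when this lock is dropped */
};

static void ordered_lock(struct ordered_lock *l)
{
    /* Taking a lock at or below the current level is an ordering bug. */
    assert(l->level > cur_lock_level);
    pthread_mutex_lock(&l->mtx);
    l->unlock_level = cur_lock_level;        /* remember the previous level */
    cur_lock_level = l->level;
}

static void ordered_unlock(struct ordered_lock *l)
{
    cur_lock_level = l->unlock_level;
    pthread_mutex_unlock(&l->mtx);
}

/* Example ordering consistent with this patch: pod lock before page_alloc. */
static struct ordered_lock pod_lock_demo   = { .mtx = PTHREAD_MUTEX_INITIALIZER,
                                               .level = 16 };
static struct ordered_lock alloc_lock_demo = { .mtx = PTHREAD_MUTEX_INITIALIZER,
                                               .level = 32 };

An external lock such as the domain's page_alloc lock is treated the same way,
except that the saved unlock level cannot live in an mm_lock_t and is kept
elsewhere, which is what the comment about the arch_domain sub-struct in the
p2m.h hunk above refers to.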