
[Xen-changelog] [xen-unstable] PoD memory 7/9: Xen interface



# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1231152348 0
# Node ID 2090917489c52af6d8604ffeb7db00cbbb0a51da
# Parent  ebe11a4523931adb616e07f7ac0c0c89e526ca95
PoD memory 7/9: Xen interface

Implement Xen interface to PoD functionality.
* Increase the number of MEMOP bits from 4 to 6 (increasing the number
of available memory operations from 16 to 64).
* Introduce XENMEMF_populate_on_demand, which will cause
populate_physmap() to fill a range with PoD entries rather than
backing it with RAM.
* Introduce the XENMEM_[sg]et_pod_target operations to the memory
hypercall, to get and set the PoD cache size.  set_pod_target() should be
called during domain creation, as well as after modifying the memory
target of any domain which may have outstanding PoD entries (see the
usage sketch below).
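
For illustration, a minimal toolstack-side sketch of the new operation.
None of this is part of the patch: memory_op() is a hypothetical
stand-in for whatever HYPERVISOR_memory_op/privcmd wrapper the caller
has available.

    #include <stdio.h>
    #include <inttypes.h>
    #include <xen/memory.h>   /* xen_pod_target_t, XENMEM_set_pod_target */

    extern int memory_op(int cmd, void *arg);   /* hypothetical wrapper */

    /* Size dom's PoD cache for a total allocation of target_pages.
     * Must be issued by a privileged domain; call it at domain creation
     * and again whenever a PoD domain's memory target changes. */
    static int set_pod_target(domid_t dom, uint64_t target_pages)
    {
        xen_pod_target_t pod = {
            .target_pages = target_pages,
            .domid        = dom,
        };
        int rc = memory_op(XENMEM_set_pod_target, &pod);

        if ( rc == 0 )  /* on success Xen fills in the OUT fields */
            printf("tot=%"PRIu64" cache=%"PRIu64" entries=%"PRIu64"\n",
                   pod.tot_pages, pod.pod_cache_pages, pod.pod_entries);
        return rc;
    }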

Signed-off-by: George Dunlap <george.dunlap@xxxxxxxxxxxxx>
---
 xen/arch/x86/mm.c               |   43 +++++++++++
 xen/arch/x86/mm/p2m.c           |  153 +++++++++++++++++++++++++++++++++++++++-
 xen/arch/x86/x86_64/compat/mm.c |   23 ++++++
 xen/common/memory.c             |   59 +++++++++------
 xen/include/asm-x86/p2m.h       |    4 +
 xen/include/public/memory.h     |   15 +++
 xen/include/xen/hypercall.h     |    2 
 xen/include/xen/mm.h            |    2 
 xen/include/xlat.lst            |    1 
 9 files changed, 277 insertions(+), 25 deletions(-)

diff -r ebe11a452393 -r 2090917489c5 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Mon Jan 05 10:45:09 2009 +0000
+++ b/xen/arch/x86/mm.c Mon Jan 05 10:45:48 2009 +0000
@@ -3976,6 +3976,49 @@ long arch_memory_op(int op, XEN_GUEST_HA
         return 0;
     }
 
+    case XENMEM_set_pod_target:
+    case XENMEM_get_pod_target:
+    {
+        xen_pod_target_t target;
+        struct domain *d;
+
+        /* Support DOMID_SELF? */
+        if ( !IS_PRIV(current->domain) )
+            return -EINVAL;
+
+        if ( copy_from_guest(&target, arg, 1) )
+            return -EFAULT;
+
+        rc = rcu_lock_target_domain_by_id(target.domid, &d);
+        if ( rc != 0 )
+            return rc;
+
+        if ( op == XENMEM_set_pod_target )
+        {
+            if ( target.target_pages > d->max_pages )
+            {
+                rc = -EINVAL;
+                goto pod_target_out_unlock;
+            }
+            
+            rc = p2m_pod_set_mem_target(d, target.target_pages);
+        }
+
+        target.tot_pages       = d->tot_pages;
+        target.pod_cache_pages = d->arch.p2m->pod.count;
+        target.pod_entries     = d->arch.p2m->pod.entry_count;
+
+        if ( copy_to_guest(arg, &target, 1) )
+        {
+            rc = -EFAULT;
+            goto pod_target_out_unlock;
+        }
+        
+    pod_target_out_unlock:
+        rcu_unlock_domain(d);
+        return rc;
+    }
+
     default:
         return subarch_memory_op(op, arg);
     }
diff -r ebe11a452393 -r 2090917489c5 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Mon Jan 05 10:45:09 2009 +0000
+++ b/xen/arch/x86/mm/p2m.c     Mon Jan 05 10:45:48 2009 +0000
@@ -387,6 +387,150 @@ static struct page_info * p2m_pod_cache_
     return p;
 }
 
+/* Set the size of the cache, allocating or freeing as necessary. */
+static int
+p2m_pod_set_cache_target(struct domain *d, unsigned long pod_target)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    int ret = 0;
+
+    /* Increasing the target */
+    while ( pod_target > p2md->pod.count )
+    {
+        struct page_info * page;
+        int order;
+
+        if ( (pod_target - p2md->pod.count) >= (1 << 9) )
+            order = 9;
+        else
+            order = 0;
+
+        page = alloc_domheap_pages(d, order, 0);
+        if ( unlikely(page == NULL) )
+            goto out;
+
+        p2m_pod_cache_add(d, page, order);
+    }
+
+    /* Decreasing the target */
+    /* We hold the p2m lock here, so we don't need to worry about
+     * cache disappearing under our feet. */
+    while ( pod_target < p2md->pod.count )
+    {
+        struct page_info * page;
+        int order, i;
+
+        /* Grab the lock before checking that pod.super is empty, or the last
+         * entries may disappear before we grab the lock. */
+        spin_lock(&d->page_alloc_lock);
+
+        if ( (p2md->pod.count - pod_target) > (1 << 9)
+             && !list_empty(&p2md->pod.super) )
+            order = 9;
+        else
+            order = 0;
+
+        page = p2m_pod_cache_get(d, order);
+
+        ASSERT(page != NULL);
+
+        spin_unlock(&d->page_alloc_lock);
+
+        /* Then free them */
+        for ( i = 0 ; i < (1 << order) ; i++ )
+        {
+            /* Copied from common/memory.c:guest_remove_page() */
+            if ( unlikely(!get_page(page+i, d)) )
+            {
+                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
+                ret = -EINVAL;
+                goto out;
+            }
+
+            if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
+                put_page_and_type(page+i);
+            
+            if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
+                put_page(page+i);
+
+            put_page(page+i);
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * The "right behavior" here requires some careful thought.  First, some
+ * definitions:
+ * + M: static_max
+ * + B: number of pages the balloon driver has ballooned down to.
+ * + P: Number of populated pages. 
+ * + T: Old target
+ * + T': New target
+ *
+ * The following equations should hold:
+ *  0 <= P <= T <= B <= M
+ *  d->arch.p2m->pod.entry_count == B - P
+ *  d->tot_pages == P + d->arch.p2m->pod.count
+ *
+ * Now we have the following potential cases to cover:
+ *  B <T'   : Set the PoD cache size equal to the number of outstanding PoD
+ *   entries.  The balloon driver will deflate the balloon to give the
+ *   remainder of the RAM back to the guest OS.
+ *  T <T'<B : Increase the PoD cache size.
+ *  T'<T<=B : Here we have a choice.  We could decrease the size of the cache,
+ *   getting the memory back right away.  However, that means that every time
+ *   we reduce the memory target we risk the guest attempting to populate the
+ *   memory before the balloon driver has reached its new target.  It is safer
+ *   never to reduce the cache size here, but to do so only when the balloon
+ *   driver frees PoD ranges.
+ *
+ * If there are many zero pages, we could also reach the target by doing
+ * zero sweeps and marking the ranges PoD; but the balloon driver will have
+ * to free this memory eventually anyway, so we don't actually gain much
+ * by doing so.
+ *
+ * NB that the case (B < T') may require adjusting the cache size as PoD
+ * pages are freed as well; i.e., freeing a PoD-backed entry when
+ * pod.entry_count == pod.count requires us to reduce both pod.entry_count
+ * and pod.count.
+ */
+int
+p2m_pod_set_mem_target(struct domain *d, unsigned long target)
+{
+    unsigned long pod_target;
+    struct p2m_domain *p2md = d->arch.p2m;
+    int ret = 0;
+    unsigned long populated;
+
+    /* P == B: Nothing to do. */
+    if ( p2md->pod.entry_count == 0 )
+        goto out;
+
+    /* T' < B: Don't reduce the cache size; let the balloon driver
+     * take care of it. */
+    if ( target < d->tot_pages )
+        goto out;
+
+    populated  = d->tot_pages - p2md->pod.count;
+
+    pod_target = target - populated;
+
+    /* B < T': Set the cache size equal to # of outstanding entries,
+     * let the balloon driver fill in the rest. */
+    if ( pod_target > p2md->pod.entry_count )
+        pod_target = p2md->pod.entry_count;
+
+    ASSERT( pod_target >= p2md->pod.count );
+
+    ret = p2m_pod_set_cache_target(d, pod_target);
+
+out:
+    return ret;
+}
+
 void
 p2m_pod_empty_cache(struct domain *d)
 {
@@ -537,6 +681,13 @@ p2m_pod_decrease_reservation(struct doma
             ram--;
         }
     }    
+
+    /* If we've reduced our "liabilities" beyond our "assets", free some */
+    if ( p2md->pod.entry_count < p2md->pod.count )
+    {
+        printk("b %d\n", p2md->pod.entry_count);
+        p2m_pod_set_cache_target(d, p2md->pod.entry_count);
+    }
 
     /* If there are no more non-PoD entries, tell decrease_reservation() that
      * there's nothing left to do. */
@@ -786,7 +937,7 @@ p2m_pod_emergency_sweep_super(struct dom
         /* Stop if we're past our limit and we have found *something*.
          *
          * NB that this is a zero-sum game; we're increasing our cache size
-         * by re-increasing our 'debt'.  Since we hold the p2m lock,
+         * by increasing our 'debt'.  Since we hold the p2m lock,
          * (entry_count - count) must remain the same. */
         if ( !list_empty(&p2md->pod.super) &&  i < limit )
             break;
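
To make the case analysis above concrete, a short worked example (numbers
invented for illustration): let M = 1024, B = 768, P = 512 and
pod.count = 192, so pod.entry_count = B - P = 256 and
d->tot_pages = P + pod.count = 704.

* New target T' = 896 (B < T'): populated = 704 - 192 = 512, so
  pod_target = 896 - 512 = 384, clamped to entry_count = 256; the cache
  grows from 192 to 256 pages and the balloon driver deflates to supply
  the remaining 896 - 768 = 128 pages.
* New target T' = 640 (T' < tot_pages): p2m_pod_set_mem_target() returns
  without touching the cache; the balloon driver frees PoD ranges instead,
  and p2m_pod_decrease_reservation() shrinks the cache as entries go away.
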
diff -r ebe11a452393 -r 2090917489c5 xen/arch/x86/x86_64/compat/mm.c
--- a/xen/arch/x86/x86_64/compat/mm.c   Mon Jan 05 10:45:09 2009 +0000
+++ b/xen/arch/x86/x86_64/compat/mm.c   Mon Jan 05 10:45:48 2009 +0000
@@ -122,6 +122,29 @@ int compat_arch_memory_op(int op, XEN_GU
 #define XLAT_memory_map_HNDL_buffer(_d_, _s_) ((void)0)
         XLAT_memory_map(&cmp, nat);
 #undef XLAT_memory_map_HNDL_buffer
+        if ( copy_to_guest(arg, &cmp, 1) )
+            rc = -EFAULT;
+
+        break;
+    }
+
+    case XENMEM_set_pod_target:
+    case XENMEM_get_pod_target:
+    {
+        struct compat_pod_target cmp;
+        struct xen_pod_target *nat = (void *)COMPAT_ARG_XLAT_VIRT_BASE;
+
+        if ( copy_from_guest(&cmp, arg, 1) )
+            return -EFAULT;
+
+        XLAT_pod_target(nat, &cmp);
+
+        rc = arch_memory_op(op, guest_handle_from_ptr(nat, void));
+        if ( rc < 0 )
+            break;
+
+        XLAT_pod_target(&cmp, nat);
+
         if ( copy_to_guest(arg, &cmp, 1) )
             rc = -EFAULT;
 
diff -r ebe11a452393 -r 2090917489c5 xen/common/memory.c
--- a/xen/common/memory.c       Mon Jan 05 10:45:09 2009 +0000
+++ b/xen/common/memory.c       Mon Jan 05 10:45:48 2009 +0000
@@ -111,31 +111,40 @@ static void populate_physmap(struct memo
         if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, i, 1)) )
             goto out;
 
-        page = alloc_domheap_pages(d, a->extent_order, a->memflags);
-        if ( unlikely(page == NULL) ) 
-        {
-            gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
-                     "id=%d memflags=%x (%ld of %d)\n",
-                     a->extent_order, d->domain_id, a->memflags,
-                     i, a->nr_extents);
-            goto out;
-        }
-
-        mfn = page_to_mfn(page);
-        guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
-
-        if ( !paging_mode_translate(d) )
-        {
-            for ( j = 0; j < (1 << a->extent_order); j++ )
-                set_gpfn_from_mfn(mfn + j, gpfn + j);
-
-            /* Inform the domain of the new page's machine address. */ 
-            if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
+        if ( a->memflags & MEMF_populate_on_demand )
+        {
+            if ( guest_physmap_mark_populate_on_demand(d, gpfn,
+                                                       a->extent_order) < 0 )
                 goto out;
         }
-    }
-
- out:
+        else
+        {
+            page = alloc_domheap_pages(d, a->extent_order, a->memflags);
+            if ( unlikely(page == NULL) ) 
+            {
+                gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
+                         "id=%d memflags=%x (%ld of %d)\n",
+                         a->extent_order, d->domain_id, a->memflags,
+                         i, a->nr_extents);
+                goto out;
+            }
+
+            mfn = page_to_mfn(page);
+            guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
+
+            if ( !paging_mode_translate(d) )
+            {
+                for ( j = 0; j < (1 << a->extent_order); j++ )
+                    set_gpfn_from_mfn(mfn + j, gpfn + j);
+
+                /* Inform the domain of the new page's machine address. */ 
+                if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
+                    goto out;
+            }
+        }
+    }
+
+out:
     a->nr_done = i;
 }
 
@@ -527,6 +536,10 @@ long do_memory_op(unsigned long cmd, XEN
 
         args.memflags |= MEMF_node(XENMEMF_get_node(reservation.mem_flags));
 
+        if ( op == XENMEM_populate_physmap
+             && (reservation.mem_flags & XENMEMF_populate_on_demand) )
+            args.memflags |= MEMF_populate_on_demand;
+
         if ( likely(reservation.domid == DOMID_SELF) )
         {
             d = rcu_lock_current_domain();
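
A caller-side sketch of the new flag, for illustration (again not part of
the patch; memory_op() is a hypothetical HYPERVISOR_memory_op stand-in):

    #include <xen/memory.h>   /* xen_memory_reservation, XENMEMF_* */

    extern int memory_op(int cmd, void *arg);   /* hypothetical wrapper */

    /* Ask Xen to back nr extents of order `order', whose base gpfns are
     * listed in extents[], with PoD entries instead of real RAM. */
    static int populate_pod(domid_t dom, xen_pfn_t *extents,
                            unsigned int nr, unsigned int order)
    {
        struct xen_memory_reservation res = {
            .nr_extents   = nr,
            .extent_order = order,
            .mem_flags    = XENMEMF_populate_on_demand,
            .domid        = dom,
        };

        set_xen_guest_handle(res.extent_start, extents);
        return memory_op(XENMEM_populate_physmap, &res);
    }
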
diff -r ebe11a452393 -r 2090917489c5 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h Mon Jan 05 10:45:09 2009 +0000
+++ b/xen/include/asm-x86/p2m.h Mon Jan 05 10:45:48 2009 +0000
@@ -261,6 +261,10 @@ void p2m_pod_dump_data(struct domain *d)
  * (usually in preparation for domain destruction) */
 void p2m_pod_empty_cache(struct domain *d);
 
+/* Set populate-on-demand cache size so that the total memory allocated to a
+ * domain matches target */
+int p2m_pod_set_mem_target(struct domain *d, unsigned long target);
+
 /* Call when decreasing memory reservation to handle PoD entries properly.
  * Will return '1' if all entries were handled and nothing more need be done.*/
 int
diff -r ebe11a452393 -r 2090917489c5 xen/include/public/memory.h
--- a/xen/include/public/memory.h       Mon Jan 05 10:45:09 2009 +0000
+++ b/xen/include/public/memory.h       Mon Jan 05 10:45:48 2009 +0000
@@ -48,6 +48,8 @@
 /* NUMA node to allocate from. */
 #define XENMEMF_node(x)     (((x) + 1) << 8)
 #define XENMEMF_get_node(x) ((((x) >> 8) - 1) & 0xffu)
+/* Flag to populate physmap with populate-on-demand entries */
+#define XENMEMF_populate_on_demand (1<<16)
 #endif
 
 struct xen_memory_reservation {
@@ -299,6 +301,19 @@ typedef struct xen_foreign_memory_map xe
 typedef struct xen_foreign_memory_map xen_foreign_memory_map_t;
 DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t);
 
+#define XENMEM_set_pod_target       16
+#define XENMEM_get_pod_target       17
+struct xen_pod_target {
+    /* IN */
+    uint64_t target_pages;
+    /* OUT */
+    uint64_t tot_pages;
+    uint64_t pod_cache_pages;
+    uint64_t pod_entries;
+    /* IN */
+    domid_t domid;
+};
+typedef struct xen_pod_target xen_pod_target_t;
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
 
 /*
diff -r ebe11a452393 -r 2090917489c5 xen/include/xen/hypercall.h
--- a/xen/include/xen/hypercall.h       Mon Jan 05 10:45:09 2009 +0000
+++ b/xen/include/xen/hypercall.h       Mon Jan 05 10:45:48 2009 +0000
@@ -48,7 +48,7 @@ do_platform_op(
  * at what point in the page list to resume. For this purpose I steal the
  * high-order bits of the @cmd parameter, which are otherwise unused and zero.
  */
-#define MEMOP_EXTENT_SHIFT 4 /* cmd[:4] == start_extent */
+#define MEMOP_EXTENT_SHIFT 6 /* cmd[:6] == start_extent */
 #define MEMOP_CMD_MASK     ((1 << MEMOP_EXTENT_SHIFT) - 1)
 
 extern long
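
For illustration, the continuation encoding this shift governs, as
do_memory_op() decodes it:

    /* Encode: operation in the low bits, resume point in the high bits. */
    unsigned long start_extent = 128;   /* example resume point */
    unsigned long cmd = XENMEM_populate_physmap
                        | (start_extent << MEMOP_EXTENT_SHIFT);

    /* Decode, as do_memory_op() does on entry: */
    int           op     = cmd & MEMOP_CMD_MASK;       /* XENMEM_populate_physmap */
    unsigned long resume = cmd >> MEMOP_EXTENT_SHIFT;  /* == 128 */

Widening the shift from 4 to 6 is what makes opcodes 16 and 17
(XENMEM_{set,get}_pod_target) representable below the mask.
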
diff -r ebe11a452393 -r 2090917489c5 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h      Mon Jan 05 10:45:09 2009 +0000
+++ b/xen/include/xen/mm.h      Mon Jan 05 10:45:48 2009 +0000
@@ -72,6 +72,8 @@ int assign_pages(
 /* memflags: */
 #define _MEMF_no_refcount 0
 #define  MEMF_no_refcount (1U<<_MEMF_no_refcount)
+#define _MEMF_populate_on_demand 1
+#define  MEMF_populate_on_demand (1U<<_MEMF_populate_on_demand)
 #define _MEMF_node        8
 #define  MEMF_node(n)     ((((n)+1)&0xff)<<_MEMF_node)
 #define _MEMF_bits        24
diff -r ebe11a452393 -r 2090917489c5 xen/include/xlat.lst
--- a/xen/include/xlat.lst      Mon Jan 05 10:45:09 2009 +0000
+++ b/xen/include/xlat.lst      Mon Jan 05 10:45:48 2009 +0000
@@ -38,6 +38,7 @@
 !      memory_exchange                 memory.h
 !      memory_map                      memory.h
 !      memory_reservation              memory.h
+!      pod_target                      memory.h
 !      translate_gpfn_list             memory.h
 !      sched_poll                      sched.h
 ?      sched_remote_shutdown           sched.h

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog