[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] Page offline support in Xen side



# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1236367119 0
# Node ID dd489125a2e7718efa0e7afe89e7875d7909353f
# Parent  f1080b20cd15e06d5fc72062c35b627b2f947339
Page offline support in Xen side

This patch adds support for offlining a page. The basic idea is: when a
page is assigned, it is marked offline-pending and moved out of the
buddy allocator when freed; when a page is already free, it is moved out of the buddy allocator directly.

One thing to note after this change: page->count_info is no longer
always 0, especially for shadow pages, since the PGC_offlining bit may be set.

Signed-off-by: Wang, Shane <shane.wang@xxxxxxxxx>
Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>
---
 xen/common/page_alloc.c     |  341 +++++++++++++++++++++++++++++++++++++++++++-
 xen/common/sysctl.c         |   55 +++++++
 xen/include/asm-x86/mm.h    |   23 ++
 xen/include/public/sysctl.h |   49 ++++++
 xen/include/public/xen.h    |    3 
 xen/include/xen/mm.h        |    3 
 6 files changed, 470 insertions(+), 4 deletions(-)

diff -r f1080b20cd15 -r dd489125a2e7 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/common/page_alloc.c   Fri Mar 06 19:18:39 2009 +0000
@@ -35,6 +35,7 @@
 #include <xen/perfc.h>
 #include <xen/numa.h>
 #include <xen/nodemask.h>
+#include <public/sysctl.h>
 #include <asm/page.h>
 #include <asm/numa.h>
 #include <asm/flushtlb.h>
@@ -74,6 +75,11 @@ PAGE_LIST_HEAD(page_scrub_list);
 PAGE_LIST_HEAD(page_scrub_list);
 static unsigned long scrub_pages;
 
+/* Offlined page list, protected by heap_lock */
+PAGE_LIST_HEAD(page_offlined_list);
+
+/* Broken page list, protected by heap_lock */
+PAGE_LIST_HEAD(page_broken_list);
 /*********************
  * ALLOCATION BITMAP
  *  One bit per page of memory. Bit set => page is allocated.
@@ -421,12 +427,92 @@ static struct page_info *alloc_heap_page
     return pg;
 }
 
+/*
+ * Remove any offlined page in the buddy pointed to by head
+ */
+static int reserve_offlined_page(struct page_info *head)
+{
+    unsigned int node = phys_to_nid(page_to_maddr(head));
+    int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
+    struct page_info *cur_head;
+    int cur_order;
+
+    ASSERT(spin_is_locked(&heap_lock));
+
+    cur_head = head;
+
+    page_list_del(head, &heap(node, zone, head_order));
+
+    while ( cur_head < (head + (1 << head_order)) )
+    {
+        struct page_info *pg;
+        int next_order;
+
+        if (test_bit(_PGC_offlined, &cur_head->count_info))
+        {
+            cur_head++;
+            continue;
+        }
+
+        next_order = cur_order = 0;
+
+        while (cur_order < head_order)
+        {
+            next_order = cur_order + 1;
+
+            if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)))
+                goto merge;
+
+            for (i = (1 << cur_order), pg = cur_head + (1 << cur_order);
+              i < (1 << next_order);
+              i++, pg ++)
+                if (test_bit(_PGC_offlined, &pg->count_info))
+                    break;
+            if (i == ( 1 << next_order))
+            {
+                cur_order = next_order;
+                continue;
+            }
+            else
+            {
+                /*
+                 * We do not need to consider merging beyond head_order
+                 */
+merge:
+                page_list_add_tail(cur_head, &heap(node, zone, cur_order));
+                PFN_ORDER(cur_head) = cur_order;
+                cur_head += (1 << cur_order);
+                break;
+            }
+        }
+    }
+
+    for (cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++)
+    {
+        if (!test_bit(_PGC_offlined, &cur_head->count_info))
+            continue;
+
+        avail[node][zone] --;
+
+        map_alloc(page_to_mfn(cur_head), 1);
+
+        if (test_bit(_PGC_broken, &cur_head->count_info))
+            page_list_add_tail(cur_head, &page_broken_list);
+        else
+            page_list_add_tail(cur_head, &page_offlined_list);
+
+        count ++;
+    }
+
+    return count;
+}
+
 /* Free 2^@order set of pages. */
 static void free_heap_pages(
     struct page_info *pg, unsigned int order)
 {
     unsigned long mask;
-    unsigned int i, node = phys_to_nid(page_to_maddr(pg));
+    unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
     unsigned int zone = page_to_zone(pg);
 
     ASSERT(order <= MAX_ORDER);
@@ -446,7 +532,14 @@ static void free_heap_pages(
          *     in its pseudophysical address space).
          * In all the above cases there can be no guest mappings of this page.
          */
-        pg[i].count_info = 0;
+        ASSERT(!(pg[i].count_info & PGC_offlined));
+        pg[i].count_info &= PGC_offlining | PGC_broken;
+        if (pg[i].count_info & PGC_offlining)
+        {
+            pg[i].count_info &= ~PGC_offlining;
+            pg[i].count_info |= PGC_offlined;
+            tainted = 1;
+        }
 
         /* If a page has no owner it will need no safety TLB flush. */
         pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
@@ -481,7 +574,7 @@ static void free_heap_pages(
                 break;
             page_list_del(pg + mask, &heap(node, zone, order));
         }
-        
+
         order++;
 
         /* After merging, pg should remain in the same node. */
@@ -491,7 +584,249 @@ static void free_heap_pages(
     PFN_ORDER(pg) = order;
     page_list_add_tail(pg, &heap(node, zone, order));
 
+    if (tainted)
+        reserve_offlined_page(pg);
+
     spin_unlock(&heap_lock);
+}
+
+
+/*
+ * A page can be in one of the following states:
+ * free and online; free and offlined; free and offlined and broken;
+ * assigned and online; assigned and offlining; assigned and offlining and broken
+ *
+ * The following rules apply to page offlining:
+ * Once a page is broken, it can never be assigned again.
+ * A page will be marked offlined only if it is free.
+ * Returns the original count_info.
+ *
+ */
+static unsigned long mark_page_offline(struct page_info *pg, int broken)
+{
+    unsigned long nx, x, y = pg->count_info;
+
+    ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
+    /*
+     * The caller guarantees the page will not be reassigned during this process
+     */
+    ASSERT(spin_is_locked(&heap_lock));
+
+    do {
+        nx = x = y;
+
+        if ( ((x & PGC_offlined_broken) == PGC_offlined_broken) )
+            return y;
+        /* PGC_offlined means the page is free */
+        if (x & PGC_offlined)
+        {
+            if (broken && !(nx & PGC_broken))
+                nx |= PGC_broken;
+            else
+                return y;
+        }
+        /* The page is neither offlined nor a reserved page */
+        else if ( allocated_in_map(page_to_mfn(pg)) )
+            nx |= PGC_offlining;
+        else
+            nx |= PGC_offlined;
+
+        if (broken)
+            nx |= PGC_broken;
+    } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
+
+    return y;
+}
+
+static int reserve_heap_page(struct page_info *pg)
+{
+    struct page_info *head = NULL;
+    unsigned int i, node = phys_to_nid(page_to_maddr(pg));
+    unsigned int zone = page_to_zone(pg);
+
+    /* get the header */
+    for ( i = 0; i <= MAX_ORDER; i++ )
+    {
+        struct page_info *tmp;
+
+        if ( page_list_empty(&heap(node, zone, i)) )
+            continue;
+
+        page_list_for_each_safe(head, tmp, &heap(node, zone, i))
+        {
+            if ( (head <= pg) &&
+                 (head + (1UL << i) > pg) )
+                return reserve_offlined_page(head);
+        }
+    }
+
+    return -EINVAL;
+
+}
+
+/*
+ * offline one page
+ */
+int offline_page(unsigned long mfn, int broken, uint32_t *status)
+{
+    unsigned long old_info = 0;
+    struct domain *owner;
+    int ret = 0;
+    struct page_info *pg;
+
+    if (mfn > max_page)
+    {
+        dprintk(XENLOG_WARNING,
+                "try to offline page out of range %lx\n", mfn);
+        return -EINVAL;
+    }
+
+    *status = 0;
+    pg = mfn_to_page(mfn);
+
+
+#if defined(__x86_64__)
+     /* Xen's TXT mfn on x86_64 is reserved in the e820 map */
+    if ( is_xen_fixed_mfn(mfn) )
+#elif defined(__i386__)
+    if ( is_xen_heap_mfn(mfn) )
+#endif
+    {
+        *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
+          (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
+        return -EPERM;
+    }
+
+    /*
+     * N.B. Xen's TXT range on x86_64 is marked reserved and already handled.
+     *  The kexec range is also reserved.
+     */
+     if (!page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL))
+     {
+        *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
+        return -EINVAL;
+     }
+
+    spin_lock(&heap_lock);
+
+    old_info = mark_page_offline(pg, broken);
+
+    if ( !allocated_in_map(mfn) )
+    {
+        /* Free pages are reserved directly */
+        reserve_heap_page(pg);
+        *status = PG_OFFLINE_OFFLINED;
+    }
+    else if (test_bit(_PGC_offlined, &pg->count_info))
+    {
+        *status = PG_OFFLINE_OFFLINED;
+    }
+    else if ((owner = page_get_owner_and_reference(pg)))
+    {
+            *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
+              (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
+            /* Release the reference since it will not be allocated anymore */
+            put_page(pg);
+    }
+    else if ( old_info & PGC_xen_heap)
+    {
+        *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
+          (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
+    }
+    else
+    {
+        /*
+         * assign_pages() does not hold heap_lock, so there is a small window
+         * in which the owner may be set later; note that the owner can only
+         * change from NULL to non-NULL, not vice versa, since the page is
+         * now offlining. There is no window if called from the #MC handler,
+         * since all CPUs are in softirq context. If called from user space
+         * (e.g. CE handling), tools can wait some time before calling again.
+         */
+        *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
+                  (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
+    }
+
+    if (broken)
+        *status |= PG_OFFLINE_BROKEN;
+
+    spin_unlock(&heap_lock);
+
+    return ret;
+}
+
+/*
+ * Online the memory.
+ *   The caller should make sure end_pfn <= max_page,
+ *   if not, expand_pages() should be called prior to online_page().
+ */
+unsigned int online_page(unsigned long mfn, uint32_t *status)
+{
+    struct page_info *pg;
+    int ret = 0, free = 0;
+
+    if ( mfn > max_page )
+    {
+        dprintk(XENLOG_WARNING, "call expand_pages() first\n");
+        return -EINVAL;
+    }
+
+    pg = mfn_to_page(mfn);
+
+    *status = 0;
+
+    spin_lock(&heap_lock);
+
+    if ( unlikely(is_page_broken(pg)) )
+    {
+        ret = -EINVAL;
+        *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN;
+    }
+    else if (pg->count_info & PGC_offlined)
+    {
+        clear_bit(_PGC_offlined, &pg->count_info);
+        page_list_del(pg, &page_offlined_list);
+        *status = PG_ONLINE_ONLINED;
+        free = 1;
+    }
+    else if (pg->count_info & PGC_offlining)
+    {
+        clear_bit(_PGC_offlining, &pg->count_info);
+        *status = PG_ONLINE_ONLINED;
+    }
+    spin_unlock(&heap_lock);
+
+    if (free)
+        free_heap_pages(pg, 0);
+
+    return ret;
+}
+
+int query_page_offline(unsigned long mfn, uint32_t *status)
+{
+    struct page_info *pg;
+
+    if ( (mfn > max_page) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
+    {
+        dprintk(XENLOG_WARNING, "call expand_pages() first\n");
+        return -EINVAL;
+    }
+
+    *status = 0;
+    spin_lock(&heap_lock);
+
+    pg = mfn_to_page(mfn);
+
+    if (pg->count_info & PGC_offlining)
+        *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
+    if (pg->count_info & PGC_broken)
+        *status |= PG_OFFLINE_STATUS_BROKEN;
+    if (pg->count_info & PGC_offlined)
+        *status |= PG_OFFLINE_STATUS_OFFLINED;
+
+    spin_unlock(&heap_lock);
+
+    return 0;
 }
 
 /*
diff -r f1080b20cd15 -r dd489125a2e7 xen/common/sysctl.c
--- a/xen/common/sysctl.c       Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/common/sysctl.c       Fri Mar 06 19:18:39 2009 +0000
@@ -233,6 +233,61 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
     }
     break;
 
+    case XEN_SYSCTL_page_offline_op:
+    {
+        uint32_t *status, *ptr;
+        unsigned long pfn;
+
+        ptr = status = xmalloc_bytes( sizeof(uint32_t) *
+                                (op->u.page_offline.end -
+                                  op->u.page_offline.start + 1));
+        if (!status)
+        {
+            dprintk(XENLOG_WARNING, "Out of memory for page offline op\n");
+            ret = -ENOMEM;
+            break;
+        }
+
+        memset(status, PG_OFFLINE_INVALID, sizeof(uint32_t) *
+                      (op->u.page_offline.end - op->u.page_offline.start + 1));
+
+        for ( pfn = op->u.page_offline.start;
+              pfn <= op->u.page_offline.end;
+              pfn ++ )
+        {
+            switch (op->u.page_offline.cmd)
+            {
+                /* Should we revert here on failure, or leave that to the caller? */
+                case sysctl_page_offline:
+                    ret = offline_page(pfn, 0, ptr++);
+                    break;
+                case sysctl_page_online:
+                    ret = online_page(pfn, ptr++);
+                    break;
+                case sysctl_query_page_offline:
+                    ret = query_page_offline(pfn, ptr++);
+                    break;
+                default:
+                    gdprintk(XENLOG_WARNING, "invalid page offline op %x\n",
+                            op->u.page_offline.cmd);
+                    ret = -EINVAL;
+                    break;
+            }
+
+            if (ret)
+                break;
+        }
+
+        if (copy_to_guest(op->u.page_offline.status, status,
+                          op->u.page_offline.end - op->u.page_offline.start + 1))
+        {
+            ret = -EFAULT;
+            break;
+        }
+        xfree(status);
+    }
+    break;
+
     default:
         ret = arch_do_sysctl(op, u_sysctl);
         break;
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/include/asm-x86/mm.h  Fri Mar 06 19:18:39 2009 +0000
@@ -198,8 +198,25 @@ struct page_info
  /* 3-bit PAT/PCD/PWT cache-attribute hint. */
 #define PGC_cacheattr_base PG_shift(6)
 #define PGC_cacheattr_mask PG_mask(7, 6)
+
+  /* Page is broken? */
+ #define _PGC_broken         PG_shift(7)
+ #define PGC_broken          PG_mask(1, 7)
+  /* Page is offline-pending? */
+ #define _PGC_offlining      PG_shift(8)
+ #define PGC_offlining       PG_mask(1, 8)
+  /* Page is offlined */
+ #define _PGC_offlined       PG_shift(9)
+ #define PGC_offlined        PG_mask(1, 9)
+ #define PGC_offlined_broken (PGC_offlined | PGC_broken)
+
+ #define is_page_offlining(page)          ((page)->count_info & PGC_offlining)
+ #define is_page_offlined(page)          ((page)->count_info & PGC_offlined)
+ #define is_page_broken(page)           ((page)->count_info & PGC_broken)
+ #define is_page_online(page)           (!is_page_offlined(page))
+
  /* Count of references to this frame. */
-#define PGC_count_width   PG_shift(6)
+#define PGC_count_width   PG_shift(9)
 #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
 
 #if defined(__i386__)
@@ -209,9 +226,13 @@ struct page_info
     (_mfn < paddr_to_pfn(xenheap_phys_end));            \
 })
 #else
+extern unsigned long allocator_bitmap_end;
 #define is_xen_heap_page(page) ((page)->count_info & PGC_xen_heap)
 #define is_xen_heap_mfn(mfn) \
     (__mfn_valid(mfn) && is_xen_heap_page(__mfn_to_page(mfn)))
+#define is_xen_fixed_mfn(mfn) \
+    ( (mfn << PAGE_SHIFT) >= __pa(&_start) &&    \
+          (mfn << PAGE_SHIFT) <= allocator_bitmap_end )
 #endif
 
 #if defined(__i386__)
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/public/sysctl.h
--- a/xen/include/public/sysctl.h       Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/include/public/sysctl.h       Fri Mar 06 19:18:39 2009 +0000
@@ -359,6 +359,54 @@ struct xen_sysctl_pm_op {
     };
 };
 
+#define XEN_SYSCTL_page_offline_op        14
+struct xen_sysctl_page_offline_op {
+    /* IN: range of pages to be offlined */
+#define sysctl_page_offline     1
+#define sysctl_page_online      2
+#define sysctl_query_page_offline  3
+    uint32_t cmd;
+    uint32_t start;
+    uint32_t end;
+    /* OUT: result of page offline request */
+    /*
+     * bit 0~15: result flags
+     * bit 16~31: owner
+     */
+    XEN_GUEST_HANDLE(uint32) status;
+};
+
+#define PG_OFFLINE_STATUS_MASK    (0xFFUL)
+
+/* The result is invalid, i.e. HV does not handle it */
+#define PG_OFFLINE_INVALID   (0x1UL << 0)
+
+#define PG_OFFLINE_OFFLINED  (0x1UL << 1)
+#define PG_OFFLINE_PENDING   (0x1UL << 2)
+#define PG_OFFLINE_FAILED    (0x1UL << 3)
+
+#define PG_ONLINE_FAILED     PG_OFFLINE_FAILED
+#define PG_ONLINE_ONLINED    PG_OFFLINE_OFFLINED
+
+#define PG_OFFLINE_STATUS_OFFLINED              (0x1UL << 1)
+#define PG_OFFLINE_STATUS_ONLINE                (0x1UL << 2)
+#define PG_OFFLINE_STATUS_OFFLINE_PENDING       (0x1UL << 3)
+#define PG_OFFLINE_STATUS_BROKEN                (0x1UL << 4)
+
+#define PG_OFFLINE_MISC_MASK    (0xFFUL << 4)
+
+/* only valid when PG_OFFLINE_FAILED */
+#define PG_OFFLINE_XENPAGE   (0x1UL << 8)
+#define PG_OFFLINE_DOM0PAGE  (0x1UL << 9)
+#define PG_OFFLINE_ANONYMOUS (0x1UL << 10)
+#define PG_OFFLINE_NOT_CONV_RAM   (0x1UL << 11)
+#define PG_OFFLINE_OWNED     (0x1UL << 12)
+
+#define PG_OFFLINE_BROKEN    (0x1UL << 13)
+#define PG_ONLINE_BROKEN     PG_OFFLINE_BROKEN
+
+#define PG_OFFLINE_OWNER_SHIFT 16
+
 struct xen_sysctl {
     uint32_t cmd;
     uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
@@ -375,6 +423,7 @@ struct xen_sysctl {
         struct xen_sysctl_get_pmstat        get_pmstat;
         struct xen_sysctl_cpu_hotplug       cpu_hotplug;
         struct xen_sysctl_pm_op             pm_op;
+        struct xen_sysctl_page_offline_op   page_offline;
         uint8_t                             pad[128];
     } u;
 };
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/public/xen.h
--- a/xen/include/public/xen.h  Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/include/public/xen.h  Fri Mar 06 19:18:39 2009 +0000
@@ -354,6 +354,9 @@ typedef uint16_t domid_t;
  */
 #define DOMID_XEN  (0x7FF2U)
 
+/* DOMID_INVALID is used to identify an invalid domid */
+#define DOMID_INVALID (0x7FFFU)
+
 /*
  * Send an array of these to HYPERVISOR_mmu_update().
  * NB. The fields are natural pointer/address size for this architecture.
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h      Fri Mar 06 19:14:50 2009 +0000
+++ b/xen/include/xen/mm.h      Fri Mar 06 19:18:39 2009 +0000
@@ -60,6 +60,9 @@ unsigned long avail_domheap_pages(void);
 unsigned long avail_domheap_pages(void);
 #define alloc_domheap_page(d,f) (alloc_domheap_pages(d,0,f))
 #define free_domheap_page(p)  (free_domheap_pages(p,0))
+unsigned int online_page(unsigned long mfn, uint32_t *status);
+int offline_page(unsigned long mfn, int broken, uint32_t *status);
+int query_page_offline(unsigned long mfn, uint32_t *status);
 
 void scrub_heap_pages(void);
 

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.