[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] [xen-unstable] Page offline support in Xen side
# HG changeset patch # User Keir Fraser <keir.fraser@xxxxxxxxxx> # Date 1236367119 0 # Node ID dd489125a2e7718efa0e7afe89e7875d7909353f # Parent f1080b20cd15e06d5fc72062c35b627b2f947339 Page offline support in Xen side This patch adds support to offline a page. The basic idea is, when a page is assigned, it will be marked offline pending and be moved out of buddy when freed, when a page is free, it will be moved out of buddy directly. One notice after this change is, now the page->count_info is not always 0, especially for shadow page, since the PGC_offlining bit may be set. Signed-off-by: Wang, Shane <shane.wang@xxxxxxxxx> Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx> --- xen/common/page_alloc.c | 341 +++++++++++++++++++++++++++++++++++++++++++- xen/common/sysctl.c | 55 +++++++ xen/include/asm-x86/mm.h | 23 ++ xen/include/public/sysctl.h | 49 ++++++ xen/include/public/xen.h | 3 xen/include/xen/mm.h | 3 6 files changed, 470 insertions(+), 4 deletions(-) diff -r f1080b20cd15 -r dd489125a2e7 xen/common/page_alloc.c --- a/xen/common/page_alloc.c Fri Mar 06 19:14:50 2009 +0000 +++ b/xen/common/page_alloc.c Fri Mar 06 19:18:39 2009 +0000 @@ -35,6 +35,7 @@ #include <xen/perfc.h> #include <xen/numa.h> #include <xen/nodemask.h> +#include <public/sysctl.h> #include <asm/page.h> #include <asm/numa.h> #include <asm/flushtlb.h> @@ -74,6 +75,11 @@ PAGE_LIST_HEAD(page_scrub_list); PAGE_LIST_HEAD(page_scrub_list); static unsigned long scrub_pages; +/* Offlined page list, protected by heap_lock */ +PAGE_LIST_HEAD(page_offlined_list); + +/* Broken page list, protected by heap_lock */ +PAGE_LIST_HEAD(page_broken_list); /********************* * ALLOCATION BITMAP * One bit per page of memory. Bit set => page is allocated. 
@@ -421,12 +427,92 @@ static struct page_info *alloc_heap_page return pg; } +/* + * Remove any offlined page in the buddy poined by head + */ +static int reserve_offlined_page(struct page_info *head) +{ + unsigned int node = phys_to_nid(page_to_maddr(head)); + int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0; + struct page_info *cur_head; + int cur_order; + + ASSERT(spin_is_locked(&heap_lock)); + + cur_head = head; + + page_list_del(head, &heap(node, zone, head_order)); + + while ( cur_head < (head + (1 << head_order)) ) + { + struct page_info *pg; + int next_order; + + if (test_bit(_PGC_offlined, &cur_head->count_info)) + { + cur_head++; + continue; + } + + next_order = cur_order = 0; + + while (cur_order < head_order) + { + next_order = cur_order + 1; + + if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order))) + goto merge; + + for (i = (1 << cur_order), pg = cur_head + (1 << cur_order); + i < (1 << next_order); + i++, pg ++) + if (test_bit(_PGC_offlined, &pg->count_info)) + break; + if (i == ( 1 << next_order)) + { + cur_order = next_order; + continue; + } + else + { + /* + * We don't need considering merge outside the head_order + */ +merge: + page_list_add_tail(cur_head, &heap(node, zone, cur_order)); + PFN_ORDER(cur_head) = cur_order; + cur_head += (1 << cur_order); + break; + } + } + } + + for (cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++) + { + if (!test_bit(_PGC_offlined, &cur_head->count_info)) + continue; + + avail[node][zone] --; + + map_alloc(page_to_mfn(cur_head), 1); + + if (test_bit(_PGC_broken, &cur_head->count_info)) + page_list_add_tail(cur_head, &page_broken_list); + else + page_list_add_tail(cur_head, &page_offlined_list); + + count ++; + } + + return count; +} + /* Free 2^@order set of pages. 
*/ static void free_heap_pages( struct page_info *pg, unsigned int order) { unsigned long mask; - unsigned int i, node = phys_to_nid(page_to_maddr(pg)); + unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0; unsigned int zone = page_to_zone(pg); ASSERT(order <= MAX_ORDER); @@ -446,7 +532,14 @@ static void free_heap_pages( * in its pseudophysical address space). * In all the above cases there can be no guest mappings of this page. */ - pg[i].count_info = 0; + ASSERT(!(pg[i].count_info & PGC_offlined)); + pg[i].count_info &= PGC_offlining | PGC_broken; + if (pg[i].count_info & PGC_offlining) + { + pg[i].count_info &= ~PGC_offlining; + pg[i].count_info |= PGC_offlined; + tainted = 1; + } /* If a page has no owner it will need no safety TLB flush. */ pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL); @@ -481,7 +574,7 @@ static void free_heap_pages( break; page_list_del(pg + mask, &heap(node, zone, order)); } - + order++; /* After merging, pg should remain in the same node. 
*/ @@ -491,7 +584,249 @@ static void free_heap_pages( PFN_ORDER(pg) = order; page_list_add_tail(pg, &heap(node, zone, order)); + if (tainted) + reserve_offlined_page(pg); + spin_unlock(&heap_lock); +} + + +/* + * Following possible status for a page: + * free and Online; free and offlined; free and offlined and broken; + * assigned and online; assigned and offlining; assigned and offling and broken + * + * Following rules applied for page offline: + * Once a page is broken, it can't be assigned anymore + * A page will be offlined only if it is free + * return original count_info + * + */ +static unsigned long mark_page_offline(struct page_info *pg, int broken) +{ + unsigned long nx, x, y = pg->count_info; + + ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL)); + /* + * Caller gurantee the page will not be reassigned during this process + */ + ASSERT(spin_is_locked(&heap_lock)); + + do { + nx = x = y; + + if ( ((x & PGC_offlined_broken) == PGC_offlined_broken) ) + return y; + /* PGC_offlined means it is free pages */ + if (x & PGC_offlined) + { + if (broken && !(nx & PGC_broken)) + nx |= PGC_broken; + else + return y; + } + /* It is not offlined, not reserved page */ + else if ( allocated_in_map(page_to_mfn(pg)) ) + nx |= PGC_offlining; + else + nx |= PGC_offlined; + + if (broken) + nx |= PGC_broken; + } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x ); + + return y; +} + +static int reserve_heap_page(struct page_info *pg) +{ + struct page_info *head = NULL; + unsigned int i, node = phys_to_nid(page_to_maddr(pg)); + unsigned int zone = page_to_zone(pg); + + /* get the header */ + for ( i = 0; i <= MAX_ORDER; i++ ) + { + struct page_info *tmp; + + if ( page_list_empty(&heap(node, zone, i)) ) + continue; + + page_list_for_each_safe(head, tmp, &heap(node, zone, i)) + { + if ( (head <= pg) && + (head + (1UL << i) > pg) ) + return reserve_offlined_page(head); + } + } + + return -EINVAL; + +} + +/* + * offline one page + */ +int offline_page(unsigned 
long mfn, int broken, uint32_t *status) +{ + unsigned long old_info = 0; + struct domain *owner; + int ret = 0; + struct page_info *pg; + + if (mfn > max_page) + { + dprintk(XENLOG_WARNING, + "try to offline page out of range %lx\n", mfn); + return -EINVAL; + } + + *status = 0; + pg = mfn_to_page(mfn); + + +#if defined(__x86_64__) + /* Xen's txt mfn in x86_64 is reserved in e820 */ + if ( is_xen_fixed_mfn(mfn) ) +#elif defined(__i386__) + if ( is_xen_heap_mfn(mfn) ) +#endif + { + *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED | + (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT); + return -EPERM; + } + + /* + * N.B. xen's txt in x86_64 is marked reserved and handled already + * Also kexec range is reserved + */ + if (!page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL)) + { + *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM; + return -EINVAL; + } + + spin_lock(&heap_lock); + + old_info = mark_page_offline(pg, broken); + + if ( !allocated_in_map(mfn) ) + { + /* Free pages are reserve directly */ + reserve_heap_page(pg); + *status = PG_OFFLINE_OFFLINED; + } + else if (test_bit(_PGC_offlined, &pg->count_info)) + { + *status = PG_OFFLINE_OFFLINED; + } + else if ((owner = page_get_owner_and_reference(pg))) + { + *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING | + (owner->domain_id << PG_OFFLINE_OWNER_SHIFT); + /* Release the reference since it will not be allocated anymore */ + put_page(pg); + } + else if ( old_info & PGC_xen_heap) + { + *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING | + (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT); + } + else + { + /* + * assign_pages does not hold heap_lock, so small window that the owner + * may be set later, but please notice owner will only change from + * NULL to be set, not verse, since page is offlining now. + * No windows If called from #MC handler, since all CPU are in softirq + * If called from user space like CE handling, tools can wait some time + * before call again. 
+ */ + *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED | + (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT ); + } + + if (broken) + *status |= PG_OFFLINE_BROKEN; + + spin_unlock(&heap_lock); + + return ret; +} + +/* + * Online the memory. + * The caller should make sure end_pfn <= max_page, + * if not, expand_pages() should be called prior to online_page(). + */ +unsigned int online_page(unsigned long mfn, uint32_t *status) +{ + struct page_info *pg; + int ret = 0, free = 0; + + if ( mfn > max_page ) + { + dprintk(XENLOG_WARNING, "call expand_pages() first\n"); + return -EINVAL; + } + + pg = mfn_to_page(mfn); + + *status = 0; + + spin_lock(&heap_lock); + + if ( unlikely(is_page_broken(pg)) ) + { + ret = -EINVAL; + *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN; + } + else if (pg->count_info & PGC_offlined) + { + clear_bit(_PGC_offlined, &pg->count_info); + page_list_del(pg, &page_offlined_list); + *status = PG_ONLINE_ONLINED; + free = 1; + } + else if (pg->count_info & PGC_offlining) + { + clear_bit(_PGC_offlining, &pg->count_info); + *status = PG_ONLINE_ONLINED; + } + spin_unlock(&heap_lock); + + if (free) + free_heap_pages(pg, 0); + + return ret; +} + +int query_page_offline(unsigned long mfn, uint32_t *status) +{ + struct page_info *pg; + + if ( (mfn > max_page) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) ) + { + dprintk(XENLOG_WARNING, "call expand_pages() first\n"); + return -EINVAL; + } + + *status = 0; + spin_lock(&heap_lock); + + pg = mfn_to_page(mfn); + + if (pg->count_info & PGC_offlining) + *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING; + if (pg->count_info & PGC_broken) + *status |= PG_OFFLINE_STATUS_BROKEN; + if (pg->count_info & PGC_offlined) + *status |= PG_OFFLINE_STATUS_OFFLINED; + + spin_unlock(&heap_lock); + + return 0; } /* diff -r f1080b20cd15 -r dd489125a2e7 xen/common/sysctl.c --- a/xen/common/sysctl.c Fri Mar 06 19:14:50 2009 +0000 +++ b/xen/common/sysctl.c Fri Mar 06 19:18:39 2009 +0000 @@ -233,6 +233,61 @@ long 
do_sysctl(XEN_GUEST_HANDLE(xen_sysc } break; + case XEN_SYSCTL_page_offline_op: + { + uint32_t *status, *ptr; + unsigned long pfn; + + ptr = status = xmalloc_bytes( sizeof(uint32_t) * + (op->u.page_offline.end - + op->u.page_offline.start + 1)); + if (!status) + { + dprintk(XENLOG_WARNING, "Out of memory for page offline op\n"); + ret = -ENOMEM; + break; + } + + memset(status, PG_OFFLINE_INVALID, sizeof(uint32_t) * + (op->u.page_offline.end - op->u.page_offline.start + 1)); + + for ( pfn = op->u.page_offline.start; + pfn <= op->u.page_offline.end; + pfn ++ ) + { + switch (op->u.page_offline.cmd) + { + /* Shall revert her if failed, or leave caller do it? */ + case sysctl_page_offline: + ret = offline_page(pfn, 0, ptr++); + break; + case sysctl_page_online: + ret = online_page(pfn, ptr++); + break; + case sysctl_query_page_offline: + ret = query_page_offline(pfn, ptr++); + break; + default: + gdprintk(XENLOG_WARNING, "invalid page offline op %x\n", + op->u.page_offline.cmd); + ret = -EINVAL; + break; + } + + if (ret) + break; + } + + if (copy_to_guest(op->u.page_offline.status, status, + op->u.page_offline.end - op->u.page_offline.start + 1)) + { + ret = -EFAULT; + break; + } + xfree(status); + } + break; + default: ret = arch_do_sysctl(op, u_sysctl); break; diff -r f1080b20cd15 -r dd489125a2e7 xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Fri Mar 06 19:14:50 2009 +0000 +++ b/xen/include/asm-x86/mm.h Fri Mar 06 19:18:39 2009 +0000 @@ -198,8 +198,25 @@ struct page_info /* 3-bit PAT/PCD/PWT cache-attribute hint. */ #define PGC_cacheattr_base PG_shift(6) #define PGC_cacheattr_mask PG_mask(7, 6) + + /* Page is broken? */ + #define _PGC_broken PG_shift(7) + #define PGC_broken PG_mask(1, 7) + /* Page is offline pending ? 
*/ + #define _PGC_offlining PG_shift(8) + #define PGC_offlining PG_mask(1, 8) + /* Page is offlined */ + #define _PGC_offlined PG_shift(9) + #define PGC_offlined PG_mask(1, 9) + #define PGC_offlined_broken (PGC_offlined | PGC_broken) + + #define is_page_offlining(page) ((page)->count_info & PGC_offlining) + #define is_page_offlined(page) ((page)->count_info & PGC_offlined) + #define is_page_broken(page) ((page)->count_info & PGC_broken) + #define is_page_online(page) (!is_page_offlined(page)) + /* Count of references to this frame. */ -#define PGC_count_width PG_shift(6) +#define PGC_count_width PG_shift(9) #define PGC_count_mask ((1UL<<PGC_count_width)-1) #if defined(__i386__) @@ -209,9 +226,13 @@ struct page_info (_mfn < paddr_to_pfn(xenheap_phys_end)); \ }) #else +extern unsigned long allocator_bitmap_end; #define is_xen_heap_page(page) ((page)->count_info & PGC_xen_heap) #define is_xen_heap_mfn(mfn) \ (__mfn_valid(mfn) && is_xen_heap_page(__mfn_to_page(mfn))) +#define is_xen_fixed_mfn(mfn) \ + ( (mfn << PAGE_SHIFT) >= __pa(&_start) && \ + (mfn << PAGE_SHIFT) <= allocator_bitmap_end ) #endif #if defined(__i386__) diff -r f1080b20cd15 -r dd489125a2e7 xen/include/public/sysctl.h --- a/xen/include/public/sysctl.h Fri Mar 06 19:14:50 2009 +0000 +++ b/xen/include/public/sysctl.h Fri Mar 06 19:18:39 2009 +0000 @@ -359,6 +359,54 @@ struct xen_sysctl_pm_op { }; }; +#define XEN_SYSCTL_page_offline_op 14 +struct xen_sysctl_page_offline_op { + /* IN: range of page to be offlined */ +#define sysctl_page_offline 1 +#define sysctl_page_online 2 +#define sysctl_query_page_offline 3 + uint32_t cmd; + uint32_t start; + uint32_t end; + /* OUT: result of page offline request */ + /* + * bit 0~15: result flags + * bit 16~31: owner + */ + XEN_GUEST_HANDLE(uint32) status; +}; + +#define PG_OFFLINE_STATUS_MASK (0xFFUL) + +/* The result is invalid, i.e. 
HV does not handle it */ +#define PG_OFFLINE_INVALID (0x1UL << 0) + +#define PG_OFFLINE_OFFLINED (0x1UL << 1) +#define PG_OFFLINE_PENDING (0x1UL << 2) +#define PG_OFFLINE_FAILED (0x1UL << 3) + +#define PG_ONLINE_FAILED PG_OFFLINE_FAILED +#define PG_ONLINE_ONLINED PG_OFFLINE_OFFLINED + +#define PG_OFFLINE_STATUS_OFFLINED (0x1UL << 1) +#define PG_OFFLINE_STATUS_ONLINE (0x1UL << 2) +#define PG_OFFLINE_STATUS_OFFLINE_PENDING (0x1UL << 3) +#define PG_OFFLINE_STATUS_BROKEN (0x1UL << 4) + +#define PG_OFFLINE_MISC_MASK (0xFFUL << 4) + +/* only valid when PG_OFFLINE_FAILED */ +#define PG_OFFLINE_XENPAGE (0x1UL << 8) +#define PG_OFFLINE_DOM0PAGE (0x1UL << 9) +#define PG_OFFLINE_ANONYMOUS (0x1UL << 10) +#define PG_OFFLINE_NOT_CONV_RAM (0x1UL << 11) +#define PG_OFFLINE_OWNED (0x1UL << 12) + +#define PG_OFFLINE_BROKEN (0x1UL << 13) +#define PG_ONLINE_BROKEN PG_OFFLINE_BROKEN + +#define PG_OFFLINE_OWNER_SHIFT 16 + struct xen_sysctl { uint32_t cmd; uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */ @@ -375,6 +423,7 @@ struct xen_sysctl { struct xen_sysctl_get_pmstat get_pmstat; struct xen_sysctl_cpu_hotplug cpu_hotplug; struct xen_sysctl_pm_op pm_op; + struct xen_sysctl_page_offline_op page_offline; uint8_t pad[128]; } u; }; diff -r f1080b20cd15 -r dd489125a2e7 xen/include/public/xen.h --- a/xen/include/public/xen.h Fri Mar 06 19:14:50 2009 +0000 +++ b/xen/include/public/xen.h Fri Mar 06 19:18:39 2009 +0000 @@ -354,6 +354,9 @@ typedef uint16_t domid_t; */ #define DOMID_XEN (0x7FF2U) +/* DOMID_INVALID is used to identity invalid domid */ +#define DOMID_INVALID (0x7FFFU) + /* * Send an array of these to HYPERVISOR_mmu_update(). * NB. The fields are natural pointer/address size for this architecture. 
diff -r f1080b20cd15 -r dd489125a2e7 xen/include/xen/mm.h --- a/xen/include/xen/mm.h Fri Mar 06 19:14:50 2009 +0000 +++ b/xen/include/xen/mm.h Fri Mar 06 19:18:39 2009 +0000 @@ -60,6 +60,9 @@ unsigned long avail_domheap_pages(void); unsigned long avail_domheap_pages(void); #define alloc_domheap_page(d,f) (alloc_domheap_pages(d,0,f)) #define free_domheap_page(p) (free_domheap_pages(p,0)) +unsigned int online_page(unsigned long mfn, uint32_t *status); +int offline_page(unsigned long mfn, int broken, uint32_t *status); +int query_page_offline(unsigned long mfn, uint32_t *status); void scrub_heap_pages(void); _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |