Xen project Mailing List

[Xen-devel] [RFC][PATCH] 7/9 Populate-on-demand memory: Xen interface

To: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>

From: "George Dunlap" <dunlapg@xxxxxxxxx>

Date: Tue, 23 Dec 2008 13:55:29 +0000

Delivery-date: Tue, 23 Dec 2008 05:55:57 -0800

Domainkey-signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=message-id:date:from:sender:to:subject:mime-version:content-type :content-transfer-encoding:content-disposition:x-google-sender-auth; b=qOAhsrjLYlVU/msyj2b/ltXnhDnwFXJqZ5F0Liv1r7DK4ElIcoqk0eEc5yw0X/yeha sYA/U0uimtJBVjZnknFxYYR20UrorxrIM0Y1K93uUTefiB/ggSBa9x13BCFPEaSBrfCB JxhRpsQqB1WHJOW0Xvg/T4rzzVETRoA8m0nJo=

List-id: Xen developer discussion <xen-devel.lists.xensource.com>

Implement Xen interface to PoD functionality.

* Increase the number of MEMOP bits from 4 to 6 (increasing the number
of available memory operations from 16 to 64).

* Introduce XENMEMF_populate_on_demand, which will cause
populate_physmap() to fill a range with PoD entries rather than
backing it with ram

* Introduce XENMEM_[sg]et_pod_target operation to the memory
hypercall, to get and set PoD cache size.  set_pod_target() should be
called during domain creation, as well as after modifying the memory
target of any domain which may have outstanding PoD entries.

Signed-off-by: George Dunlap <george.dunlap@xxxxxxxxxxxxx>

diff -r 90feb993b0b8 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c	Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/arch/x86/mm.c	Tue Dec 23 11:35:30 2008 +0000
@@ -3976,6 +3976,49 @@
         return 0;
     }
 
+    case XENMEM_set_pod_target:
+    case XENMEM_get_pod_target:
+    {
+        xen_pod_target_t target;
+        struct domain *d;
+
+        /* Support DOMID_SELF? */
+        if ( !IS_PRIV(current->domain) )
+            return -EINVAL;
+
+        if ( copy_from_guest(&target, arg, 1) )
+            return -EFAULT;
+
+        rc = rcu_lock_target_domain_by_id(target.domid, &d);
+        if ( rc != 0 )
+            return rc;
+
+        if ( op == XENMEM_set_pod_target )
+        {
+            if ( target.target_pages > d->max_pages )
+            {
+                rc = -EINVAL;
+                goto pod_target_out_unlock;
+            }
+
+            rc = p2m_pod_set_mem_target(d, target.target_pages);
+        }
+
+        target.tot_pages       = d->tot_pages;
+        target.pod_cache_pages = d->arch.p2m->pod.count;
+        target.pod_entries     = d->arch.p2m->pod.entry_count;
+
+        if ( copy_to_guest(arg, &target, 1) )
+        {
+            rc = -EFAULT;
+            goto pod_target_out_unlock;
+        }
+
+    pod_target_out_unlock:
+        rcu_unlock_domain(d);
+        return rc;
+    }
+
     default:
         return subarch_memory_op(op, arg);
     }
diff -r 90feb993b0b8 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c	Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/arch/x86/mm/p2m.c	Tue Dec 23 11:35:30 2008 +0000
@@ -387,6 +387,150 @@
     return p;
 }
 
+/* Set the size of the cache, allocating or freeing as necessary. */
+static int
+p2m_pod_set_cache_target(struct domain *d, unsigned long pod_target)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    int ret = 0;
+
+    /* Increasing the target: order-9 (512-page) chunks while >= 512 short */
+    while ( pod_target > p2md->pod.count )
+    {
+        struct page_info * page;
+        int order;
+
+        if ( (pod_target - p2md->pod.count) >= (1<<9) )
+            order = 9;
+        else
+            order = 0;
+
+        page = alloc_domheap_pages(d, order, 0);
+        if ( unlikely(page == NULL) )
+            goto out;
+
+        p2m_pod_cache_add(d, page, order);
+    }
+
+    /* Decreasing the target: only free a whole superpage if > 512 over */
+    /* We hold the p2m lock here, so we don't need to worry about
+     * cache disappearing under our feet. */
+    while ( pod_target < p2md->pod.count )
+    {
+        struct page_info * page;
+        int order, i;
+
+        /* Grab the lock before checking that pod.super is empty, or the last
+         * entries may disappear before we grab the lock. */
+        spin_lock(&d->page_alloc_lock);
+
+        if ( (p2md->pod.count - pod_target) > (1<<9)
+             && !list_empty(&p2md->pod.super) )
+            order = 9;
+        else
+            order = 0;
+
+        page = p2m_pod_cache_get(d, order);
+
+        ASSERT(page != NULL);
+
+        spin_unlock(&d->page_alloc_lock);
+
+        /* Then free them */
+        for ( i = 0 ; i < (1 << order) ; i++ )
+        {
+            /* Copied from common/memory.c:guest_remove_page() */
+            if ( unlikely(!get_page(page+i, d)) )
+            {
+                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
+                ret = -EINVAL;
+                goto out;
+            }
+
+            if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
+                put_page_and_type(page+i);
+
+            if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
+                put_page(page+i);
+
+            put_page(page+i);
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * The "right behavior" here requires some careful thought.  First, some
+ * definitions:
+ * + M: static_max
+ * + B: number of pages the balloon driver has ballooned down to.
+ * + P: Number of populated pages.
+ * + T: Old target
+ * + T': New target
+ *
+ * The following equations should hold:
+ *  0 <= P <= T <= B <= M
+ *  d->arch.p2m->pod.entry_count == B - P
+ *  d->tot_pages == P + d->arch.p2m->pod.count
+ *
+ * Now we have the following potential cases to cover:
+ *     B <T': Set the PoD cache size equal to the number of outstanding PoD
+ *   entries.  The balloon driver will deflate the balloon to give back
+ *   the remainder of the ram to the guest OS.
+ *  T <T'<B : Increase PoD cache size.
+ *  T'<T<=B : Here we have a choice.  We can decrease the size of the cache,
+ *   get the memory right away.  However, that means every time we
+ *   reduce the memory target we risk the guest attempting to populate the
+ *   memory before the balloon driver has reached its new target.  Safer to
+ *   never reduce the cache size here, but only when the balloon driver frees
+ *   PoD ranges.
+ *
+ * If there are many zero pages, we could reach the target also by doing
+ * zero sweeps and marking the ranges PoD; but the balloon driver will have
+ * to free this memory eventually anyway, so we don't actually gain that much
+ * by doing so.
+ *
+ * NB that the equation (B<T') may require adjustment to the cache
+ * size as PoD pages are freed as well; i.e., freeing a PoD-backed
+ * entry when pod.entry_count == pod.count requires us to reduce both
+ * pod.entry_count and pod.count.
+ */
+int
+p2m_pod_set_mem_target(struct domain *d, unsigned long target)
+{
+    unsigned long pod_target;
+    struct p2m_domain *p2md = d->arch.p2m;
+    int ret = 0;
+    unsigned long populated;
+
+    /* P == B: Nothing to do. */
+    if ( p2md->pod.entry_count == 0 )
+        goto out;
+
+    /* T' < B: Don't reduce the cache size; let the balloon driver
+     * take care of it. */
+    if ( target < d->tot_pages )
+        goto out;
+
+    populated  = d->tot_pages - p2md->pod.count;
+
+    pod_target = target - populated;
+
+    /* B < T': Set the cache size equal to # of outstanding entries,
+     * let the balloon driver fill in the rest. */
+    if ( pod_target > p2md->pod.entry_count )
+        pod_target = p2md->pod.entry_count;
+
+    ASSERT( pod_target >= p2md->pod.count ); /* == if target == tot_pages */
+
+    ret = p2m_pod_set_cache_target(d, pod_target);
+
+out:
+    return ret;
+}
+
 void
 p2m_pod_empty_cache(struct domain *d)
 {
@@ -537,6 +681,10 @@
             ram--;
         }
     }
+
+    /* If we've reduced our "liabilities" beyond our "assets", free some */
+    if ( p2md->pod.entry_count < p2md->pod.count )
+        p2m_pod_set_cache_target(d, p2md->pod.entry_count);
 
     /* If there are no more non-PoD entries, tell decrease_reservation() that
      * there's nothing left to do. */
@@ -786,7 +934,7 @@
         /* Stop if we're past our limit and we have found *something*.
          *
          * NB that this is a zero-sum game; we're increasing our cache size
-         * by re-increasing our 'debt'.  Since we hold the p2m lock,
+         * by increasing our 'debt'.  Since we hold the p2m lock,
          * (entry_count - count) must remain the same. */
         if ( !list_empty(&p2md->pod.super) &&  i < limit )
             break;
diff -r 90feb993b0b8 xen/arch/x86/x86_64/compat/mm.c
--- a/xen/arch/x86/x86_64/compat/mm.c	Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/arch/x86/x86_64/compat/mm.c	Tue Dec 23 11:35:30 2008 +0000
@@ -122,6 +122,29 @@
 #define XLAT_memory_map_HNDL_buffer(_d_, _s_) ((void)0)
         XLAT_memory_map(&cmp, nat);
 #undef XLAT_memory_map_HNDL_buffer
+        if ( copy_to_guest(arg, &cmp, 1) )
+            rc = -EFAULT;
+
+        break;
+    }
+
+    case XENMEM_set_pod_target:
+    case XENMEM_get_pod_target:
+    {
+        struct compat_pod_target cmp;
+        struct xen_pod_target *nat = (void *)COMPAT_ARG_XLAT_VIRT_BASE;
+
+        if ( copy_from_guest(&cmp, arg, 1) )
+            return -EFAULT;
+
+        XLAT_pod_target(nat, &cmp);
+
+        rc = arch_memory_op(op, guest_handle_from_ptr(nat, void));
+        if ( rc < 0 )
+            break;
+
+        XLAT_pod_target(&cmp, nat);
+
         if ( copy_to_guest(arg, &cmp, 1) )
             rc = -EFAULT;
 
diff -r 90feb993b0b8 xen/common/memory.c
--- a/xen/common/memory.c	Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/common/memory.c	Tue Dec 23 11:35:30 2008 +0000
@@ -111,31 +111,40 @@
         if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, i, 1)) )
             goto out;
 
-        page = alloc_domheap_pages(d, a->extent_order, a->memflags);
-        if ( unlikely(page == NULL) )
+        if ( a->memflags & MEMF_populate_on_demand )
         {
-            gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
-                     "id=%d memflags=%x (%ld of %d)\n",
-                     a->extent_order, d->domain_id, a->memflags,
-                     i, a->nr_extents);
-            goto out;
+            if ( guest_physmap_mark_populate_on_demand(d, gpfn,
+                                                       a->extent_order) < 0 )
+                goto out;
         }
+        else
+        {
+            page = alloc_domheap_pages(d, a->extent_order, a->memflags);
+            if ( unlikely(page == NULL) )
+            {
+                gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
+                         "id=%d memflags=%x (%ld of %d)\n",
+                         a->extent_order, d->domain_id, a->memflags,
+                         i, a->nr_extents);
+                goto out;
+            }
 
-        mfn = page_to_mfn(page);
-        guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
+            mfn = page_to_mfn(page);
+            guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
 
-        if ( !paging_mode_translate(d) )
-        {
-            for ( j = 0; j < (1 << a->extent_order); j++ )
-                set_gpfn_from_mfn(mfn + j, gpfn + j);
+            if ( !paging_mode_translate(d) )
+            {
+                for ( j = 0; j < (1 << a->extent_order); j++ )
+                    set_gpfn_from_mfn(mfn + j, gpfn + j);
 
-            /* Inform the domain of the new page's machine address. */
-            if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
-                goto out;
+                /* Inform the domain of the new page's machine address. */
+                if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
+                    goto out;
+            }
         }
     }
 
- out:
+out:
     a->nr_done = i;
 }
 
@@ -527,6 +536,10 @@
 
         args.memflags |= MEMF_node(XENMEMF_get_node(reservation.mem_flags));
 
+        if ( op == XENMEM_populate_physmap
+             && (reservation.mem_flags & XENMEMF_populate_on_demand) )
+            args.memflags |= MEMF_populate_on_demand;
+
         if ( likely(reservation.domid == DOMID_SELF) )
         {
             d = rcu_lock_current_domain();
diff -r 90feb993b0b8 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h	Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/include/asm-x86/p2m.h	Tue Dec 23 11:35:30 2008 +0000
@@ -261,6 +261,10 @@
  * (usually in preparation for domain destruction) */
 void p2m_pod_empty_cache(struct domain *d);
 
+/* Set populate-on-demand cache size so that the total memory allocated to a
+ * domain matches target */
+int p2m_pod_set_mem_target(struct domain *d, unsigned long target);
+
 /* Call when decreasing memory reservation to handle PoD entries properly.
  * Will return '1' if all entries were handled and nothing more need be done.*/
 int
diff -r 90feb993b0b8 xen/include/public/memory.h
--- a/xen/include/public/memory.h	Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/include/public/memory.h	Tue Dec 23 11:35:30 2008 +0000
@@ -48,6 +48,8 @@
 /* NUMA node to allocate from. */
 #define XENMEMF_node(x)     (((x) + 1) << 8)
 #define XENMEMF_get_node(x) ((((x) >> 8) - 1) & 0xffu)
+/* Flag to populate physmap with populate-on-demand entries */
+#define XENMEMF_populate_on_demand (1<<16)
 #endif
 
 struct xen_memory_reservation {
@@ -299,6 +301,19 @@
 typedef struct xen_foreign_memory_map xen_foreign_memory_map_t;
 DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t);
 
+#define XENMEM_set_pod_target       16
+#define XENMEM_get_pod_target       17
+struct xen_pod_target {
+    /* IN */
+    uint64_t target_pages;
+    /* OUT */
+    uint64_t tot_pages;
+    uint64_t pod_cache_pages;
+    uint64_t pod_entries;
+    /* IN */
+    domid_t domid;
+};
+typedef struct xen_pod_target xen_pod_target_t;
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
 
 /*
diff -r 90feb993b0b8 xen/include/xen/hypercall.h
--- a/xen/include/xen/hypercall.h	Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/include/xen/hypercall.h	Tue Dec 23 11:35:30 2008 +0000
@@ -48,7 +48,7 @@
  * at what point in the page list to resume. For this purpose I steal the
  * high-order bits of the @cmd parameter, which are otherwise unused and zero.
  */
-#define MEMOP_EXTENT_SHIFT 4 /* cmd[:4] == start_extent */
+#define MEMOP_EXTENT_SHIFT 6 /* cmd[:6] == start_extent */
 #define MEMOP_CMD_MASK     ((1 << MEMOP_EXTENT_SHIFT) - 1)
 
 extern long
diff -r 90feb993b0b8 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h	Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/include/xen/mm.h	Tue Dec 23 11:35:30 2008 +0000
@@ -72,6 +72,8 @@
 /* memflags: */
 #define _MEMF_no_refcount 0
 #define  MEMF_no_refcount (1U<<_MEMF_no_refcount)
+#define _MEMF_populate_on_demand 1
+#define  MEMF_populate_on_demand (1U<<_MEMF_populate_on_demand)
 #define _MEMF_node        8
 #define  MEMF_node(n)     ((((n)+1)&0xff)<<_MEMF_node)
 #define _MEMF_bits        24
diff -r 90feb993b0b8 xen/include/xlat.lst
--- a/xen/include/xlat.lst	Fri Dec 19 17:54:23 2008 +0000
+++ b/xen/include/xlat.lst	Tue Dec 23 11:35:30 2008 +0000
@@ -38,6 +38,7 @@
 !	memory_exchange			memory.h
 !	memory_map			memory.h
 !	memory_reservation		memory.h
+!	pod_target			memory.h
 !	translate_gpfn_list		memory.h
 !	sched_poll			sched.h
 ?	sched_remote_shutdown		sched.h

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.