[Xen-changelog] [xen-unstable] numa: Attempt more efficient NUMA allocation in hypervisor by default.
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1280932528 -3600
# Node ID 581ebaa7e2da17c23a2dd890943572837a02b29f
# Parent  39448a99227b61abb463c91e7e7c93763ddb3dce
numa: Attempt more efficient NUMA allocation in hypervisor by default.

 1. Try to allocate from nodes containing CPUs which a guest can be
    scheduled on.
 2. Remember which node we allocated from last, and round-robin
    allocations among above-mentioned nodes.

Signed-off-by: Keir Fraser <keir.fraser@xxxxxxxxxx>
---
 xen/common/domain.c     |   29 +++++++++++++++++++
 xen/common/memory.c     |    9 ++----
 xen/common/page_alloc.c |   72 ++++++++++++++++++++++++++++++++----------------
 xen/common/schedule.c   |    3 ++
 xen/include/xen/sched.h |    9 ++++++
 5 files changed, 93 insertions(+), 29 deletions(-)

diff -r 39448a99227b -r 581ebaa7e2da xen/common/domain.c
--- a/xen/common/domain.c	Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/common/domain.c	Wed Aug 04 15:35:28 2010 +0100
@@ -191,6 +191,8 @@ struct vcpu *alloc_vcpu(
     /* Must be called after making new vcpu visible to for_each_vcpu(). */
     vcpu_check_shutdown(v);
 
+    domain_update_node_affinity(d);
+
     return v;
 }
 
@@ -234,6 +236,8 @@ struct domain *domain_create(
     spin_lock_init(&d->hypercall_deadlock_mutex);
     INIT_PAGE_LIST_HEAD(&d->page_list);
     INIT_PAGE_LIST_HEAD(&d->xenpage_list);
+
+    spin_lock_init(&d->node_affinity_lock);
 
     spin_lock_init(&d->shutdown_lock);
     d->shutdown_code = -1;
@@ -338,6 +342,31 @@ struct domain *domain_create(
     xfree(d->pirq_to_evtchn);
     free_domain_struct(d);
     return NULL;
+}
+
+
+void domain_update_node_affinity(struct domain *d)
+{
+    cpumask_t cpumask = CPU_MASK_NONE;
+    nodemask_t nodemask = NODE_MASK_NONE;
+    struct vcpu *v;
+    unsigned int node;
+
+    spin_lock(&d->node_affinity_lock);
+
+    for_each_vcpu ( d, v )
+        cpus_or(cpumask, cpumask, v->cpu_affinity);
+
+    for_each_online_node ( node )
+    {
+        if ( cpus_intersects(node_to_cpumask(node), cpumask) )
+            node_set(node, nodemask);
+        else
+            node_clear(node, nodemask);
+    }
+
+    d->node_affinity = nodemask;
+    spin_unlock(&d->node_affinity_lock);
 }

diff -r 39448a99227b -r 581ebaa7e2da xen/common/memory.c
--- a/xen/common/memory.c	Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/common/memory.c	Wed Aug 04 15:35:28 2010 +0100
@@ -259,7 +259,7 @@ static long memory_exchange(XEN_GUEST_HA
     unsigned long in_chunk_order, out_chunk_order;
     xen_pfn_t     gpfn, gmfn, mfn;
     unsigned long i, j, k;
-    unsigned int  node, memflags = 0;
+    unsigned int  memflags = 0;
     long          rc = 0;
     struct domain *d;
     struct page_info *page;
@@ -324,10 +324,7 @@ static long memory_exchange(XEN_GUEST_HA
                   d,
                   XENMEMF_get_address_bits(exch.out.mem_flags) ? :
                   (BITS_PER_LONG+PAGE_SHIFT)));
-    node = XENMEMF_get_node(exch.out.mem_flags);
-    if ( node == NUMA_NO_NODE )
-        node = domain_to_node(d);
-    memflags |= MEMF_node(node);
+    memflags |= MEMF_node(XENMEMF_get_node(exch.out.mem_flags));
 
     for ( i = (exch.nr_exchanged >> in_chunk_order);
           i < (exch.in.nr_extents >> in_chunk_order);
@@ -545,7 +542,7 @@ long do_memory_op(unsigned long cmd, XEN
         }
 
         args.memflags |= MEMF_node(XENMEMF_get_node(reservation.mem_flags));
-        if (reservation.mem_flags & XENMEMF_exact_node_request)
+        if ( reservation.mem_flags & XENMEMF_exact_node_request )
             args.memflags |= MEMF_exact_node;
 
         if ( op == XENMEM_populate_physmap

diff -r 39448a99227b -r 581ebaa7e2da xen/common/page_alloc.c
--- a/xen/common/page_alloc.c	Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/common/page_alloc.c	Wed Aug 04 15:35:28 2010 +0100
@@ -295,20 +295,29 @@ static unsigned long init_node_heap(int 
 /* Allocate 2^@order contiguous pages. */
 static struct page_info *alloc_heap_pages(
     unsigned int zone_lo, unsigned int zone_hi,
-    unsigned int node, unsigned int order, unsigned int memflags)
-{
-    unsigned int i, j, zone = 0;
-    unsigned int num_nodes = num_online_nodes();
+    unsigned int order, unsigned int memflags,
+    struct domain *d)
+{
+    unsigned int first_node, i, j, zone = 0, nodemask_retry = 0;
+    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1);
     unsigned long request = 1UL << order;
-    bool_t exact_node_request = !!(memflags & MEMF_exact_node);
     cpumask_t extra_cpus_mask, mask;
     struct page_info *pg;
+    nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map;
 
     if ( node == NUMA_NO_NODE )
     {
-        node = cpu_to_node(smp_processor_id());
-        exact_node_request = 0;
-    }
+        memflags &= ~MEMF_exact_node;
+        if ( d != NULL )
+        {
+            node = next_node(d->last_alloc_node, nodemask);
+            if ( node >= MAX_NUMNODES )
+                node = first_node(nodemask);
+        }
+        if ( node >= MAX_NUMNODES )
+            node = cpu_to_node(smp_processor_id());
+    }
+    first_node = node;
 
     ASSERT(node >= 0);
     ASSERT(zone_lo <= zone_hi);
@@ -335,7 +344,7 @@ static struct page_info *alloc_heap_page
      * zone before failing, only calc new node value if we fail to find memory
      * in target node, this avoids needless computation on fast-path.
      */
-    for ( i = 0; i < num_nodes; i++ )
+    for ( ; ; )
     {
         zone = zone_hi;
         do {
@@ -349,18 +358,35 @@ static struct page_info *alloc_heap_page
                     goto found;
         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
 
-        if ( exact_node_request )
+        if ( memflags & MEMF_exact_node )
             goto not_found;
 
-        /* Pick next node, wrapping around if needed. */
-        node = next_node(node, node_online_map);
-        if (node == MAX_NUMNODES)
-            node = first_node(node_online_map);
+        /* Pick next node. */
+        if ( !node_isset(node, nodemask) )
+        {
+            /* Very first node may be caller-specified and outside nodemask. */
+            ASSERT(!nodemask_retry);
+            first_node = node = first_node(nodemask);
+            if ( node < MAX_NUMNODES )
+                continue;
+        }
+        else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
+            node = first_node(nodemask);
+        if ( node == first_node )
+        {
+            /* When we have tried all in nodemask, we fall back to others. */
+            if ( nodemask_retry++ )
+                goto not_found;
+            nodes_andnot(nodemask, node_online_map, nodemask);
+            first_node = node = first_node(nodemask);
+            if ( node >= MAX_NUMNODES )
+                goto not_found;
+        }
     }
 
  try_tmem:
     /* Try to free memory from tmem */
-    if ( (pg = tmem_relinquish_pages(order,memflags)) != NULL )
+    if ( (pg = tmem_relinquish_pages(order, memflags)) != NULL )
     {
         /* reassigning an already allocated anonymous heap page */
         spin_unlock(&heap_lock);
@@ -385,6 +411,9 @@ static struct page_info *alloc_heap_page
     avail[node][zone] -= request;
     total_avail_pages -= request;
     ASSERT(total_avail_pages >= 0);
+
+    if ( d != NULL )
+        d->last_alloc_node = node;
 
     spin_unlock(&heap_lock);
 
@@ -1010,7 +1039,7 @@ void *alloc_xenheap_pages(unsigned int o
     ASSERT(!in_irq());
 
     pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
-                          cpu_to_node(smp_processor_id()), order, memflags);
+                          order, memflags, NULL);
     if ( unlikely(pg == NULL) )
         return NULL;
 
@@ -1153,24 +1182,21 @@ struct page_info *alloc_domheap_pages(
 {
     struct page_info *pg = NULL;
     unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
-    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1), dma_zone;
+    unsigned int dma_zone;
 
     ASSERT(!in_irq());
-
-    if ( (node == NUMA_NO_NODE) && (d != NULL) )
-        node = domain_to_node(d);
 
     bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
     if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
         return NULL;
 
     if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
-        pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags);
+        pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
 
     if ( (pg == NULL) &&
          ((memflags & MEMF_no_dma) ||
-          ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
-                                  node, order, memflags)) == NULL)) )
+          ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
+                                  memflags, d)) == NULL)) )
          return NULL;
 
     if ( (d != NULL) && assign_pages(d, pg, order, memflags) )

diff -r 39448a99227b -r 581ebaa7e2da xen/common/schedule.c
--- a/xen/common/schedule.c	Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/common/schedule.c	Wed Aug 04 15:35:28 2010 +0100
@@ -270,6 +270,7 @@ int sched_move_domain(struct domain *d, 
         SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
 
         cpus_setall(v->cpu_affinity);
+        domain_update_node_affinity(d);
         v->processor = new_p;
         v->sched_priv = vcpu_priv[v->vcpu_id];
         evtchn_move_pirqs(v);
@@ -477,6 +478,7 @@ int cpu_disable_scheduler(unsigned int c
                 printk("Breaking vcpu affinity for domain %d vcpu %d\n",
                        v->domain->domain_id, v->vcpu_id);
                 cpus_setall(v->cpu_affinity);
+                domain_update_node_affinity(d);
             }
 
             if ( v->processor == cpu )
@@ -519,6 +521,7 @@ int vcpu_set_affinity(struct vcpu *v, cp
 
     old_affinity = v->cpu_affinity;
     v->cpu_affinity = *affinity;
+    domain_update_node_affinity(v->domain);
     *affinity = old_affinity;
     if ( !cpu_isset(v->processor, v->cpu_affinity) )
         set_bit(_VPF_migrating, &v->pause_flags);

diff -r 39448a99227b -r 581ebaa7e2da xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/include/xen/sched.h	Wed Aug 04 15:35:28 2010 +0100
@@ -23,6 +23,8 @@
 #include <xen/mm.h>
 #include <xen/tasklet.h>
 #include <public/mem_event.h>
+#include <xen/cpumask.h>
+#include <xen/nodemask.h>
 
 #ifdef CONFIG_COMPAT
 #include <compat/vcpu.h>
@@ -326,6 +328,11 @@ struct domain
 
     /* Memory paging support */
     struct mem_event_domain mem_event;
+
+    /* Currently computed from union of all vcpu cpu-affinity masks. */
+    nodemask_t node_affinity;
+    unsigned int last_alloc_node;
+    spinlock_t node_affinity_lock;
 };
 
 struct domain_setup_info
@@ -393,6 +400,8 @@ static inline void get_knownalive_domain
     ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED));
 }
 
+void domain_update_node_affinity(struct domain *d);
+
 struct domain *domain_create(
     domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
  /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
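Editor's note: for readers skimming the patch, the node-selection policy it introduces boils down to: start from the node after the domain's last_alloc_node within its node-affinity mask, walk that mask round-robin, and only fall back to the remaining online nodes once the mask is exhausted. The standalone C sketch below (not Xen code) illustrates that idea with plain bitmasks standing in for nodemask_t; MAX_NODES, struct dom_numa, node_has_memory() and pick_node() are illustrative names only, and the real allocator scans per-node heaps under the heap lock rather than calling a toy predicate.

/*
 * Minimal sketch of round-robin node selection with fallback.
 * Assumes at most MAX_NODES nodes, each represented by one bit.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 8

/* Hypothetical stand-in for the per-domain NUMA state added by the patch. */
struct dom_numa {
    unsigned int node_affinity;   /* bit i set => node i has a usable CPU */
    unsigned int last_alloc_node; /* node we allocated from last time */
};

/* Toy availability check; in Xen this is the per-node, per-zone heap scan. */
static bool node_has_memory(unsigned int node)
{
    return node == 5; /* pretend only node 5 can satisfy the request */
}

static int first_node(unsigned int mask)
{
    for (unsigned int n = 0; n < MAX_NODES; n++)
        if (mask & (1u << n))
            return n;
    return -1;
}

static int next_node(unsigned int after, unsigned int mask)
{
    for (unsigned int n = after + 1; n < MAX_NODES; n++)
        if (mask & (1u << n))
            return n;
    return -1;
}

/* Returns the node to allocate from, or -1 if no node can satisfy us. */
static int pick_node(struct dom_numa *d, unsigned int online_mask)
{
    unsigned int mask = d->node_affinity ? d->node_affinity : online_mask;
    bool retried = false;

    /* Round-robin start point: the node after the last successful one. */
    int node = next_node(d->last_alloc_node, mask);
    if (node < 0)
        node = first_node(mask);
    int first = node;

    while (node >= 0)
    {
        if (node_has_memory(node))
        {
            d->last_alloc_node = node; /* remember for next time */
            return node;
        }

        node = next_node(node, mask);
        if (node < 0)
            node = first_node(mask);

        if (node == first)
        {
            /* Affinity nodes exhausted: fall back to the other online nodes. */
            if (retried)
                return -1;
            retried = true;
            mask = online_mask & ~mask;
            node = first = first_node(mask);
        }
    }
    return -1;
}

int main(void)
{
    struct dom_numa d = { .node_affinity = 0x06 /* nodes 1,2 */,
                          .last_alloc_node = 1 };
    int node = pick_node(&d, 0xff /* nodes 0-7 online */);
    printf("allocated from node %d\n", node); /* falls back to node 5 */
    return 0;
}

Starting from the node after last_alloc_node is what spreads successive allocations across the affinity nodes instead of always filling the first one; the fallback pass mirrors the patch's nodes_andnot() step once the affinity mask has been tried in full.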