
[Xen-changelog] [xen-unstable] numa: Attempt more efficient NUMA allocation in hypervisor by default.



# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1280932528 -3600
# Node ID 581ebaa7e2da17c23a2dd890943572837a02b29f
# Parent  39448a99227b61abb463c91e7e7c93763ddb3dce
numa: Attempt more efficient NUMA allocation in hypervisor by default.

 1. Try to allocate from nodes containing CPUs which a guest can be
 scheduled on.
 2. Remember which node we allocated from last, and round-robin
 allocations among the above-mentioned nodes.

Signed-off-by: Keir Fraser <keir.fraser@xxxxxxxxxx>
---
 xen/common/domain.c     |   29 +++++++++++++++++++
 xen/common/memory.c     |    9 ++----
 xen/common/page_alloc.c |   72 ++++++++++++++++++++++++++++++++----------------
 xen/common/schedule.c   |    3 ++
 xen/include/xen/sched.h |    9 ++++++
 5 files changed, 93 insertions(+), 29 deletions(-)
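
Taken together, the two points above mean the allocator's choice of a starting
node now follows a fixed priority: an explicitly requested node (MEMF_node) is
used as-is; otherwise, for allocations on behalf of a domain, the walk starts
just after d->last_alloc_node within the domain's node affinity; failing that
(or for anonymous xenheap allocations) the node of the current CPU is used.
The standalone sketch below models only that starting-node choice, using plain
unsigned long bitmasks instead of Xen's nodemask_t helpers; pick_start_node(),
MAX_NODES and NO_NODE are illustrative names, not part of the patch.  The
fallback walk over further nodes is sketched after the page_alloc.c hunks
below.

#include <stdio.h>

#define MAX_NODES 8
#define NO_NODE   0xff   /* stands in for NUMA_NO_NODE */

/* Simplified model of how the rewritten alloc_heap_pages() picks the node it
 * tries first.  'affinity' is a bitmask of the nodes the domain's vcpus can
 * run on, 'last' is the node the domain last allocated from. */
static unsigned int pick_start_node(unsigned int requested,
                                    unsigned long affinity,
                                    unsigned int last,
                                    unsigned int local_node)
{
    unsigned int n;

    if ( requested != NO_NODE )
        return requested;                /* explicit MEMF_node wins */

    /* Round-robin: first node in 'affinity' strictly after 'last'... */
    for ( n = last + 1; n < MAX_NODES; n++ )
        if ( affinity & (1UL << n) )
            return n;

    /* ...wrapping back to the lowest node in 'affinity'. */
    for ( n = 0; n < MAX_NODES; n++ )
        if ( affinity & (1UL << n) )
            return n;

    return local_node;                   /* empty affinity: use the local node */
}

int main(void)
{
    /* A domain whose vcpus may run on nodes 1 and 3. */
    unsigned long affinity = (1UL << 1) | (1UL << 3);

    printf("%u\n", pick_start_node(NO_NODE, affinity, 1, 0));  /* prints 3 */
    printf("%u\n", pick_start_node(NO_NODE, affinity, 3, 0));  /* prints 1 */
    printf("%u\n", pick_start_node(2, affinity, 3, 0));        /* prints 2 */
    return 0;
}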

diff -r 39448a99227b -r 581ebaa7e2da xen/common/domain.c
--- a/xen/common/domain.c       Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/common/domain.c       Wed Aug 04 15:35:28 2010 +0100
@@ -191,6 +191,8 @@ struct vcpu *alloc_vcpu(
     /* Must be called after making new vcpu visible to for_each_vcpu(). */
     vcpu_check_shutdown(v);
 
+    domain_update_node_affinity(d);
+
     return v;
 }
 
@@ -234,6 +236,8 @@ struct domain *domain_create(
     spin_lock_init(&d->hypercall_deadlock_mutex);
     INIT_PAGE_LIST_HEAD(&d->page_list);
     INIT_PAGE_LIST_HEAD(&d->xenpage_list);
+
+    spin_lock_init(&d->node_affinity_lock);
 
     spin_lock_init(&d->shutdown_lock);
     d->shutdown_code = -1;
@@ -338,6 +342,31 @@ struct domain *domain_create(
     xfree(d->pirq_to_evtchn);
     free_domain_struct(d);
     return NULL;
+}
+
+
+void domain_update_node_affinity(struct domain *d)
+{
+    cpumask_t cpumask = CPU_MASK_NONE;
+    nodemask_t nodemask = NODE_MASK_NONE;
+    struct vcpu *v;
+    unsigned int node;
+
+    spin_lock(&d->node_affinity_lock);
+
+    for_each_vcpu ( d, v )
+        cpus_or(cpumask, cpumask, v->cpu_affinity);
+
+    for_each_online_node ( node )
+    {
+        if ( cpus_intersects(node_to_cpumask(node), cpumask) )
+            node_set(node, nodemask);
+        else
+            node_clear(node, nodemask);
+    }
+
+    d->node_affinity = nodemask;
+    spin_unlock(&d->node_affinity_lock);
 }
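
The new domain_update_node_affinity() above ORs together every vcpu's
cpu_affinity and then records, in d->node_affinity, each online node whose
CPUs intersect that union; the new node_affinity_lock serialises concurrent
updates.  Below is a minimal standalone model of the same computation, with
plain bitmasks in place of cpumask_t/nodemask_t; the two-node topology in
node_to_cpus and the pinning chosen in main() are made up for the example.

#include <stdio.h>

#define NR_NODES 2
#define NR_VCPUS 4

/* Illustrative topology: CPUs 0-3 on node 0, CPUs 4-7 on node 1. */
static const unsigned long node_to_cpus[NR_NODES] = { 0x0fUL, 0xf0UL };

/* Mirror of the patch's logic: union the per-vcpu CPU masks, then include a
 * node in the result iff its CPUs intersect that union. */
static unsigned long compute_node_affinity(const unsigned long *vcpu_affinity,
                                           unsigned int nr_vcpus)
{
    unsigned long cpumask = 0, nodemask = 0;
    unsigned int v, node;

    for ( v = 0; v < nr_vcpus; v++ )
        cpumask |= vcpu_affinity[v];

    for ( node = 0; node < NR_NODES; node++ )
        if ( node_to_cpus[node] & cpumask )
            nodemask |= 1UL << node;

    return nodemask;
}

int main(void)
{
    /* Every vcpu pinned to CPU 4 or CPU 5: only node 1 ends up in the mask,
     * so the allocator will prefer node 1 for this domain's memory. */
    unsigned long pinned[NR_VCPUS] = { 0x10, 0x20, 0x10, 0x20 };

    printf("node affinity = %#lx\n", compute_node_affinity(pinned, NR_VCPUS));
    return 0;
}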
 
 
diff -r 39448a99227b -r 581ebaa7e2da xen/common/memory.c
--- a/xen/common/memory.c       Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/common/memory.c       Wed Aug 04 15:35:28 2010 +0100
@@ -259,7 +259,7 @@ static long memory_exchange(XEN_GUEST_HA
     unsigned long in_chunk_order, out_chunk_order;
     xen_pfn_t     gpfn, gmfn, mfn;
     unsigned long i, j, k;
-    unsigned int  node, memflags = 0;
+    unsigned int  memflags = 0;
     long          rc = 0;
     struct domain *d;
     struct page_info *page;
@@ -324,10 +324,7 @@ static long memory_exchange(XEN_GUEST_HA
         d,
         XENMEMF_get_address_bits(exch.out.mem_flags) ? :
         (BITS_PER_LONG+PAGE_SHIFT)));
-    node = XENMEMF_get_node(exch.out.mem_flags);
-    if ( node == NUMA_NO_NODE )
-        node = domain_to_node(d);
-    memflags |= MEMF_node(node);
+    memflags |= MEMF_node(XENMEMF_get_node(exch.out.mem_flags));
 
     for ( i = (exch.nr_exchanged >> in_chunk_order);
           i < (exch.in.nr_extents >> in_chunk_order);
@@ -545,7 +542,7 @@ long do_memory_op(unsigned long cmd, XEN
         }
 
         args.memflags |= MEMF_node(XENMEMF_get_node(reservation.mem_flags));
-        if (reservation.mem_flags & XENMEMF_exact_node_request)
+        if ( reservation.mem_flags & XENMEMF_exact_node_request )
             args.memflags |= MEMF_exact_node;
 
         if ( op == XENMEM_populate_physmap
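
The memory_exchange() hunk above no longer falls back to domain_to_node():
the node named by the guest, or NUMA_NO_NODE when none was named, is passed
straight through MEMF_node(), and the rewritten alloc_heap_pages() applies
the domain-affinity policy itself.  That works because of how the node is
packed into memflags: the decode used later in this patch,
(uint8_t)((memflags >> _MEMF_node) - 1), implies the encode stores node+1 in
an 8-bit field, so a zero field decodes back to 0xff, i.e. NUMA_NO_NODE.  The
sketch below illustrates that round trip; the _MEMF_node shift value and the
exact shape of the MEMF_node() macro are assumptions for illustration, not
copied from Xen's headers.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NUMA_NO_NODE  0xff
#define _MEMF_node    8                            /* assumed bit position */
#define MEMF_node(n)  ((((n) + 1) & 0xff) << _MEMF_node)

/* Same decode as the rewritten alloc_heap_pages(): field value minus one,
 * truncated to 8 bits. */
static unsigned int memflags_to_node(unsigned int memflags)
{
    return (uint8_t)((memflags >> _MEMF_node) - 1);
}

int main(void)
{
    /* No node requested: the field stays zero and decodes to NUMA_NO_NODE,
     * so alloc_heap_pages() uses the domain's node affinity instead. */
    assert(memflags_to_node(0) == NUMA_NO_NODE);

    /* Encoding NUMA_NO_NODE itself also yields a zero field: the round trip
     * is stable, which is what lets memory_exchange() drop its fallback. */
    assert(memflags_to_node(MEMF_node(NUMA_NO_NODE)) == NUMA_NO_NODE);

    /* An explicitly requested node survives unchanged. */
    assert(memflags_to_node(MEMF_node(3)) == 3);

    printf("MEMF_node encoding round-trips as expected\n");
    return 0;
}
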
diff -r 39448a99227b -r 581ebaa7e2da xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/common/page_alloc.c   Wed Aug 04 15:35:28 2010 +0100
@@ -295,20 +295,29 @@ static unsigned long init_node_heap(int 
 /* Allocate 2^@order contiguous pages. */
 static struct page_info *alloc_heap_pages(
     unsigned int zone_lo, unsigned int zone_hi,
-    unsigned int node, unsigned int order, unsigned int memflags)
-{
-    unsigned int i, j, zone = 0;
-    unsigned int num_nodes = num_online_nodes();
+    unsigned int order, unsigned int memflags,
+    struct domain *d)
+{
+    unsigned int first_node, i, j, zone = 0, nodemask_retry = 0;
+    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1);
     unsigned long request = 1UL << order;
-    bool_t exact_node_request = !!(memflags & MEMF_exact_node);
     cpumask_t extra_cpus_mask, mask;
     struct page_info *pg;
+    nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map;
 
     if ( node == NUMA_NO_NODE )
     {
-        node = cpu_to_node(smp_processor_id());
-        exact_node_request = 0;
-    }
+        memflags &= ~MEMF_exact_node;
+        if ( d != NULL )
+        {
+            node = next_node(d->last_alloc_node, nodemask);
+            if ( node >= MAX_NUMNODES )
+                node = first_node(nodemask);
+        }
+        if ( node >= MAX_NUMNODES )
+            node = cpu_to_node(smp_processor_id());
+    }
+    first_node = node;
 
     ASSERT(node >= 0);
     ASSERT(zone_lo <= zone_hi);
@@ -335,7 +344,7 @@ static struct page_info *alloc_heap_page
      * zone before failing, only calc new node value if we fail to find memory 
      * in target node, this avoids needless computation on fast-path.
      */
-    for ( i = 0; i < num_nodes; i++ )
+    for ( ; ; )
     {
         zone = zone_hi;
         do {
@@ -349,18 +358,35 @@ static struct page_info *alloc_heap_page
                     goto found;
         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
 
-        if ( exact_node_request )
+        if ( memflags & MEMF_exact_node )
             goto not_found;
 
-        /* Pick next node, wrapping around if needed. */
-        node = next_node(node, node_online_map);
-        if (node == MAX_NUMNODES)
-            node = first_node(node_online_map);
+        /* Pick next node. */
+        if ( !node_isset(node, nodemask) )
+        {
+            /* Very first node may be caller-specified and outside nodemask. */
+            ASSERT(!nodemask_retry);
+            first_node = node = first_node(nodemask);
+            if ( node < MAX_NUMNODES )
+                continue;
+        }
+        else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
+            node = first_node(nodemask);
+        if ( node == first_node )
+        {
+            /* When we have tried all in nodemask, we fall back to others. */
+            if ( nodemask_retry++ )
+                goto not_found;
+            nodes_andnot(nodemask, node_online_map, nodemask);
+            first_node = node = first_node(nodemask);
+            if ( node >= MAX_NUMNODES )
+                goto not_found;
+        }
     }
 
  try_tmem:
     /* Try to free memory from tmem */
-    if ( (pg = tmem_relinquish_pages(order,memflags)) != NULL )
+    if ( (pg = tmem_relinquish_pages(order, memflags)) != NULL )
     {
         /* reassigning an already allocated anonymous heap page */
         spin_unlock(&heap_lock);
@@ -385,6 +411,9 @@ static struct page_info *alloc_heap_page
     avail[node][zone] -= request;
     total_avail_pages -= request;
     ASSERT(total_avail_pages >= 0);
+
+    if ( d != NULL )
+        d->last_alloc_node = node;
 
     spin_unlock(&heap_lock);
 
@@ -1010,7 +1039,7 @@ void *alloc_xenheap_pages(unsigned int o
     ASSERT(!in_irq());
 
     pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
-        cpu_to_node(smp_processor_id()), order, memflags);
+                          order, memflags, NULL);
     if ( unlikely(pg == NULL) )
         return NULL;
 
@@ -1153,24 +1182,21 @@ struct page_info *alloc_domheap_pages(
 {
     struct page_info *pg = NULL;
     unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
-    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1), dma_zone;
+    unsigned int dma_zone;
 
     ASSERT(!in_irq());
-
-    if ( (node == NUMA_NO_NODE) && (d != NULL) )
-        node = domain_to_node(d);
 
     bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
     if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
         return NULL;
 
     if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
-        pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags);
+        pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
 
     if ( (pg == NULL) &&
          ((memflags & MEMF_no_dma) ||
-          ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
-                                  node, order, memflags)) == NULL)) )
+          ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
+                                  memflags, d)) == NULL)) )
          return NULL;
 
     if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
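
The subtlest part of the page_alloc.c changes above is the search order once
the preferred node cannot satisfy a request: the loop walks the remaining
nodes of the domain's node_affinity round-robin, then makes one further pass
over the online nodes that were not in that mask, while a request carrying
MEMF_exact_node gives up as soon as its node comes up empty.  The standalone
sketch below models just that ordering; node_has_memory() stands in for the
real per-zone free-list scan, find_node() and its helpers are illustrative
names, and plain unsigned long masks replace nodemask_t.

#include <stdio.h>

#define MAX_NODES 8
#define NO_NODE   0xff

/* Stand-in for "some zone on this node can satisfy the request". */
static int node_has_memory(unsigned int node)
{
    return node == 5;            /* pretend only node 5 has free pages left */
}

/* First node in 'mask' strictly after 'node', or MAX_NODES if none. */
static unsigned int next_node_in(unsigned int node, unsigned long mask)
{
    unsigned int n;
    for ( n = node + 1; n < MAX_NODES; n++ )
        if ( mask & (1UL << n) )
            return n;
    return MAX_NODES;
}

/* Lowest node in 'mask', or MAX_NODES if the mask is empty. */
static unsigned int first_node_in(unsigned long mask)
{
    unsigned int n;
    for ( n = 0; n < MAX_NODES; n++ )
        if ( mask & (1UL << n) )
            return n;
    return MAX_NODES;
}

/* Model of the node walk in the rewritten alloc_heap_pages().  'start' is
 * the node chosen up front, 'affinity' is the domain's node_affinity and
 * 'online' is node_online_map. */
static unsigned int find_node(unsigned int start, unsigned long affinity,
                              unsigned long online, int exact)
{
    unsigned int node = start, first_node = start, nodemask_retry = 0;
    unsigned long nodemask = affinity;

    for ( ; ; )
    {
        if ( node_has_memory(node) )
            return node;

        if ( exact )
            return NO_NODE;      /* MEMF_exact_node: no fallback at all */

        if ( !(nodemask & (1UL << node)) )
        {
            /* The very first node may be caller-specified and sit outside
             * the mask: restart from the mask's lowest node. */
            first_node = node = first_node_in(nodemask);
            if ( node < MAX_NODES )
                continue;
        }
        else if ( (node = next_node_in(node, nodemask)) >= MAX_NODES )
            node = first_node_in(nodemask);

        if ( node == first_node )
        {
            /* Every node in the mask has been tried: fall back, once, to
             * the online nodes that were not in it. */
            if ( nodemask_retry++ )
                return NO_NODE;
            nodemask = online & ~nodemask;
            first_node = node = first_node_in(nodemask);
            if ( node >= MAX_NODES )
                return NO_NODE;
        }
    }
}

int main(void)
{
    unsigned long affinity = (1UL << 1) | (1UL << 3);  /* domain's nodes   */
    unsigned long online   = 0x3fUL;                   /* nodes 0-5 online */

    /* Visits nodes 1, 3, then 0, 2, 4, 5 and succeeds on node 5. */
    printf("allocated from node %u\n", find_node(1, affinity, online, 0));

    /* The same request with MEMF_exact_node fails immediately. */
    printf("exact request gives %#x\n", find_node(1, affinity, online, 1));
    return 0;
}

With the example masks in main(), the sketch visits the nodes in the order
1, 3, 0, 2, 4, 5, which is the order the rewritten loop would try them under
the same conditions; the patch additionally records the node that satisfied
the request in d->last_alloc_node, so the next allocation that does not name
a node starts one node further along.
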
diff -r 39448a99227b -r 581ebaa7e2da xen/common/schedule.c
--- a/xen/common/schedule.c     Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/common/schedule.c     Wed Aug 04 15:35:28 2010 +0100
@@ -270,6 +270,7 @@ int sched_move_domain(struct domain *d, 
         SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
 
         cpus_setall(v->cpu_affinity);
+        domain_update_node_affinity(d);
         v->processor = new_p;
         v->sched_priv = vcpu_priv[v->vcpu_id];
         evtchn_move_pirqs(v);
@@ -477,6 +478,7 @@ int cpu_disable_scheduler(unsigned int c
                 printk("Breaking vcpu affinity for domain %d vcpu %d\n",
                         v->domain->domain_id, v->vcpu_id);
                 cpus_setall(v->cpu_affinity);
+                domain_update_node_affinity(d);
             }
 
             if ( v->processor == cpu )
@@ -519,6 +521,7 @@ int vcpu_set_affinity(struct vcpu *v, cp
 
     old_affinity = v->cpu_affinity;
     v->cpu_affinity = *affinity;
+    domain_update_node_affinity(v->domain);
     *affinity = old_affinity;
     if ( !cpu_isset(v->processor, v->cpu_affinity) )
         set_bit(_VPF_migrating, &v->pause_flags);
diff -r 39448a99227b -r 581ebaa7e2da xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Wed Aug 04 11:21:40 2010 +0100
+++ b/xen/include/xen/sched.h   Wed Aug 04 15:35:28 2010 +0100
@@ -23,6 +23,8 @@
 #include <xen/mm.h>
 #include <xen/tasklet.h>
 #include <public/mem_event.h>
+#include <xen/cpumask.h>
+#include <xen/nodemask.h>
 
 #ifdef CONFIG_COMPAT
 #include <compat/vcpu.h>
@@ -326,6 +328,11 @@ struct domain
 
     /* Memory paging support */
     struct mem_event_domain mem_event;
+
+    /* Currently computed from union of all vcpu cpu-affinity masks. */
+    nodemask_t node_affinity;
+    unsigned int last_alloc_node;
+    spinlock_t node_affinity_lock;
 };
 
 struct domain_setup_info
@@ -393,6 +400,8 @@ static inline void get_knownalive_domain
     ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED));
 }
 
+void domain_update_node_affinity(struct domain *d);
+
 struct domain *domain_create(
     domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
  /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
