[Xen-devel] [PATCH 2/6] xen: Add NUMA support to Xen



This patch introduces a per-node layer to the buddy allocator.  Xen currently
defines the heap as a two-dimensional array, [zone][order].  This patch adds a
node layer between zone and order, making the heap [zone][node][order].  This
allows Xen to hand out memory in the requested zone while preferring node-local
allocations, falling back to non-local memory when necessary to satisfy a zone
request.

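For reference, the change to the free-list bookkeeping boils down to the
following (just a sketch of the declarations; the real ones are in the
page_alloc.c hunk below):

    /* before: one free list per (zone, order) and one counter per zone */
    static struct list_head heap[NR_ZONES][MAX_ORDER+1];
    static unsigned long avail[NR_ZONES];

    /* after, with CONFIG_NUMA: indexed per (zone, node, order) / (zone, node) */
    static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1];
    static unsigned long avail[NR_ZONES][MAX_NUMNODES];
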
When the heap is initialized, each page that is added is assigned to the node
it belongs to and inserted into the free list for the proper zone, node and
order.

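In other words, whenever a region is returned to the heap, its node is looked
up in the node memory chunk table and it lands on the matching per-node free
list; roughly (a sketch only -- the real code is in page_to_node() and
merge_pages() below):

    int node = page_to_node(pg);                /* chunk table lookup */
    avail[zone][node] += 1 << order;
    list_add_tail(&pg->list, &heap[zone][node][order]);
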
When allocating memory via alloc_heap_pages(), we try to satisfy the zone
request from the target node, which is determined by the requesting cpu.  If no
memory is found in the target node for a given zone, we examine the other nodes
before increasing the order of the memory request.

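The search order is therefore: the requesting cpu's node first, then the
remaining online nodes, and only if every node is exhausted does the request
fail for that zone.  As a sketch of the loop (the full version is in the
alloc_heap_pages() hunk below):

    target_node = cpu_to_node[cpu];
    for ( i = 0; i < num_online_nodes(); i++ )
    {
        node = (target_node + i) % num_online_nodes();
        /* smallest order in this node that can satisfy the request */
        for ( j = order; j <= MAX_ORDER; j++ )
            if ( !list_empty(&heap[zone][node][j]) )
                goto found;
    }
    /* no node had suitable memory in this zone: fail the request */
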
The existing heap API has been preserved; it uses smp_processor_id() to supply
the new cpu parameter to alloc_heap_pages() and __alloc_domheap_pages().  Xen
code can also call alloc_heap_pages()/__alloc_domheap_pages() directly,
supplying a cpu parameter to request pages local to that processor.

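For example (hypothetical callers; v->processor just stands for whichever cpu
the caller wants the memory close to):

    /* placement-agnostic callers keep the old interface, which picks the
     * current processor's node via smp_processor_id(): */
    pg = alloc_domheap_pages(d, order, flags);

    /* NUMA-aware callers can name the cpu explicitly: */
    pg = __alloc_domheap_pages(d, v->processor, order, flags);
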
avail_heap_pages() has been added and avail_domheap_pages() altered to provide
an easier way of querying the total available memory for a given zone and/or
node.

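Passing -1 for either argument acts as a wildcard, e.g. (free and node are
just placeholder variables):

    free = avail_heap_pages(MEMZONE_DOM, -1);   /* one zone, all nodes   */
    free = avail_heap_pages(-1, node);          /* all zones, one node   */
    free = avail_nodeheap_pages(node);          /* same as the line above */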

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@xxxxxxxxxx


diffstat output:
 common/page_alloc.c |  254 ++++++++++++++++++++++++++++++++++++++++++----------
 include/xen/mm.h    |   10 +-
 include/xen/numa.h  |    2 
 3 files changed, 217 insertions(+), 49 deletions(-)

Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
Signed-off-by: Ryan Grimm <grimm@xxxxxxxxxx>
---
# HG changeset patch
# User Ryan Harper <ryanh@xxxxxxxxxx>
# Node ID 44ee2cfd164d091249dd133fa65dfd1a4d3f1e66
# Parent  2a81ffed9e53be432c95c3bc99fa2fadd8f93bb9
This patch introduces a per-node layer to the buddy allocator.  Xen currently
defines the heap as a two-dimensional array, [zone][order].  This patch adds a
node layer between zone and order, making the heap [zone][node][order].  This
allows Xen to hand out memory in the requested zone while preferring node-local
allocations, falling back to non-local memory when necessary to satisfy a zone
request.

When the heap is initialized, each page that is added is assigned to the node
it belongs to and inserted into the free list for the proper zone, node and
order.

When allocating memory via alloc_heap_pages(), we try to satisfy the zone
request from the target node, which is determined by the requesting cpu.  If no
memory is found in the target node for a given zone, we examine the other nodes
before increasing the order of the memory request.

The existing heap API has been preserved; it uses smp_processor_id() to supply
the new cpu parameter to alloc_heap_pages() and __alloc_domheap_pages().  Xen
code can also call alloc_heap_pages()/__alloc_domheap_pages() directly,
supplying a cpu parameter to request pages local to that processor.

avail_heap_pages() has been added and avail_domheap_pages() altered to provide
an easier way of querying the total available memory for a given zone and/or
node.

diff -r 2a81ffed9e53 -r 44ee2cfd164d xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Fri Apr 28 17:56:52 2006
+++ b/xen/common/page_alloc.c   Fri Apr 28 18:00:23 2006
@@ -4,6 +4,7 @@
  * Simple buddy heap allocator for Xen.
  * 
  * Copyright (c) 2002-2004 K A Fraser
+ * Copyright (c) 2006 IBM
  * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -34,6 +35,54 @@
 #include <xen/domain_page.h>
 #include <xen/keyhandler.h>
 #include <asm/page.h>
+#include <xen/nodemask.h>
+#ifdef CONFIG_NUMA
+#include <xen/numa.h>
+
+/* min and max paddr per node */
+extern int num_memory_chunks;
+extern node_memory_chunk_t node_memory_chunk[];
+extern int cpu_to_node[];
+
+/* 
+ * NB: assumes caller used page_spans_chunk to check for
+ * splitting across chunk boundaries 
+ */
+int page_to_node(struct page_info *pg)
+{
+    node_memory_chunk_t *c;
+    u64 pg_paddr = page_to_maddr(pg);
+
+    for (c = node_memory_chunk; c < (node_memory_chunk+num_memory_chunks); c++) {
+        if ( pg_paddr >= c->start_paddr && pg_paddr <= c->end_paddr ) {
+            ASSERT(c->nid < num_online_nodes());
+            return (int)c->nid;
+        }
+    }
+    return -1;
+}
+
+/* check whether a page region of the given order spans a chunk boundary */
+int page_spans_chunk(struct page_info *pg, unsigned int order)
+{
+    node_memory_chunk_t *c;
+    u64 pg_start = page_to_maddr(pg);
+    u64 pg_end   = pg_start + ((PAGE_SIZE << order)-1);
+
+    if (order == 0)
+        return 0;  /* single page cannot span a chunk */
+
+    for (c = node_memory_chunk; c < (node_memory_chunk+num_memory_chunks); c++) {
+        if ( pg_start >= c->start_paddr && pg_start <= c->end_paddr &&
+             pg_end   >= c->start_paddr && pg_end   <= c->end_paddr ) {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+#endif
 
 /*
  * Comma-separated list of hexadecimal page numbers containing bad bytes.
@@ -246,9 +295,16 @@
 #define pfn_dom_zone_type(_pfn)                                 \
     (((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM)
 
+/* Up to 2^20 pages can be allocated at once. */
+#ifdef CONFIG_NUMA
+static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1];
+
+static unsigned long avail[NR_ZONES][MAX_NUMNODES];
+#else
 static struct list_head heap[NR_ZONES][MAX_ORDER+1];
 
 static unsigned long avail[NR_ZONES];
+#endif
 
 static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED;
 
@@ -260,8 +316,16 @@
     memset(avail, 0, sizeof(avail));
 
     for ( i = 0; i < NR_ZONES; i++ )
+#ifdef CONFIG_NUMA
+        for ( j = 0; j < MAX_NUMNODES; j++ ) {
+            unsigned int k; 
+            for ( k = 0; k <= MAX_ORDER; k++ )
+                INIT_LIST_HEAD(&heap[i][j][k]);
+        }
+#else
         for ( j = 0; j <= MAX_ORDER; j++ )
             INIT_LIST_HEAD(&heap[i][j]);
+#endif
 
     /* Pages that are free now go to the domain sub-allocator. */
     for ( i = 0; i < max_page; i++ )
@@ -289,11 +353,22 @@
 
 
 /* Allocate 2^@order contiguous pages. */
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order)
-{
-    int i;
+struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu,
+                                   unsigned int order)
+{
+    int i,j, node, target_node, nodes_online;
     struct page_info *pg;
-
+    struct list_head *list;
+
+#ifdef CONFIG_NUMA
+    target_node = cpu_to_node[cpu];
+    nodes_online = num_online_nodes();
+    ASSERT(target_node >= 0);
+    ASSERT(target_node < num_online_nodes());
+#else
+    target_node = 0;
+    nodes_online = 1;
+#endif
     ASSERT(zone < NR_ZONES);
 
     if ( unlikely(order > MAX_ORDER) )
@@ -301,50 +376,70 @@
 
     spin_lock(&heap_lock);
 
-    /* Find smallest order which can satisfy the request. */
-    for ( i = order; i <= MAX_ORDER; i++ )
-        if ( !list_empty(&heap[zone][i]) )
-            goto found;
+    /* start with requested node, but exhaust all node memory
+     * in requested zone before failing */
+    for ( i = 0; i < nodes_online; i++ ) {
+        node = (target_node+i) % nodes_online;
+        /* Find smallest order which can satisfy the request. */
+        for ( j = order; j <= MAX_ORDER; j++ ) {
+#ifdef CONFIG_NUMA 
+            list = heap[zone][node];
+#else
+            list = heap[zone];
+#endif
+            if ( !list_empty(&list[j]) )
+                goto found;
+        }
+    }
 
     /* No suitable memory blocks. Fail the request. */
     spin_unlock(&heap_lock);
     return NULL;
 
  found: 
-    pg = list_entry(heap[zone][i].next, struct page_info, list);
+    pg = list_entry(list[j].next, struct page_info, list);
     list_del(&pg->list);
 
     /* We may have to halve the chunk a number of times. */
-    while ( i != order )
-    {
-        PFN_ORDER(pg) = --i;
-        list_add_tail(&pg->list, &heap[zone][i]);
-        pg += 1 << i;
+    while ( j != order )
+    {
+        PFN_ORDER(pg) = --j;
+        list_add_tail(&pg->list, &list[j]);
+        pg += 1 << j;
     }
     
     map_alloc(page_to_mfn(pg), 1 << order);
+#ifdef CONFIG_NUMA
+    avail[zone][node] -= 1 << order;
+#else    
     avail[zone] -= 1 << order;
+#endif
 
     spin_unlock(&heap_lock);
 
     return pg;
 }
 
-
-/* Free 2^@order set of pages. */
-void free_heap_pages(
-    unsigned int zone, struct page_info *pg, unsigned int order)
+/*
+ * helper function for free_heap_pages
+ * NB: assumes caller holds heap_lock
+ */
+void merge_pages(
+    struct page_info *pg, unsigned int zone, unsigned int order)
 {
     unsigned long mask;
-
-    ASSERT(zone < NR_ZONES);
-    ASSERT(order <= MAX_ORDER);
-
-    spin_lock(&heap_lock);
-
-    map_free(page_to_mfn(pg), 1 << order);
+    struct list_head *list;
+#ifdef CONFIG_NUMA
+    unsigned int node = page_to_node(pg);
+
+    ASSERT((node >= 0) && (node < num_online_nodes()));
+    avail[zone][node] += 1 << order;
+    list = heap[zone][node];
+#else
     avail[zone] += 1 << order;
-    
+    list = heap[zone];
+#endif
+
     /* Merge chunks as far as possible. */
     while ( order < MAX_ORDER )
     {
@@ -372,8 +467,42 @@
     }
 
     PFN_ORDER(pg) = order;
-    list_add_tail(&pg->list, &heap[zone][order]);
-
+
+    list_add_tail(&pg->list, &list[order]);
+}
+
+/* Free 2^@order set of pages. */
+void free_heap_pages(
+    unsigned int zone, struct page_info *pg, unsigned int order)
+{
+    ASSERT(zone < NR_ZONES);
+    ASSERT(order <= MAX_ORDER);
+
+    spin_lock(&heap_lock);
+
+    map_free(page_to_mfn(pg), 1 << order);
+
+#ifdef CONFIG_NUMA
+    /*
+     * If the page list order spans a chunk, halve the region
+     * until it fits and merge the remaining pages one at a time.
+     */
+    while ( page_spans_chunk(pg, order) ) {
+        int i;
+        struct page_info *p;
+
+        ASSERT(order > 0);
+
+        PFN_ORDER(pg) = --order;
+        for ( i=0; i<(1<<order); i++ ) {
+            p = pg+(1<<order)+i;
+            PFN_ORDER(p) = 0;
+            merge_pages(p, zone, 0);
+        }
+    }
+#endif
+    merge_pages(pg, zone, order);
+    
     spin_unlock(&heap_lock);
 }
 
@@ -467,7 +596,7 @@
     int i;
 
     local_irq_save(flags);
-    pg = alloc_heap_pages(MEMZONE_XEN, order);
+    pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order);
     local_irq_restore(flags);
 
     if ( unlikely(pg == NULL) )
@@ -531,8 +660,8 @@
 }
 
 
-struct page_info *alloc_domheap_pages(
-    struct domain *d, unsigned int order, unsigned int flags)
+struct page_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags)
 {
     struct page_info *pg = NULL;
     cpumask_t mask;
@@ -542,17 +671,17 @@
 
     if ( !(flags & ALLOC_DOM_DMA) )
     {
-        pg = alloc_heap_pages(MEMZONE_DOM, order);
+        pg = alloc_heap_pages(MEMZONE_DOM, cpu, order);
         /* Failure? Then check if we can fall back to the DMA pool. */
-        if ( unlikely(pg == NULL) &&
-             ((order > MAX_ORDER) ||
-              (avail[MEMZONE_DMADOM] <
+        if ( unlikely(pg == NULL) 
+             && ((order > MAX_ORDER) ||
+              (avail_heap_pages(MEMZONE_DMADOM,-1) <
                (lowmem_emergency_pool_pages + (1UL << order)))) )
             return NULL;
     }
 
     if ( pg == NULL )
-        if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL )
+        if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL )
             return NULL;
 
     mask = pg->u.free.cpumask;
@@ -615,6 +744,13 @@
     spin_unlock(&d->page_alloc_lock);
     
     return pg;
+}
+
+inline struct page_info *alloc_domheap_pages(
+    struct domain *d, unsigned int order, unsigned int flags)
+{
+    return __alloc_domheap_pages(d, smp_processor_id(), order, flags);
+
 }
 
 
@@ -690,13 +826,32 @@
 }
 
 
-unsigned long avail_domheap_pages(void)
+u64 avail_heap_pages(int zone, int node)
+{
+    int i,j;
+    u64 free_pages = 0;
+   
+    for (i=0; i<NR_ZONES; i++)
+        if ( (zone == -1) || (zone == i) )
+            for (j=0; j<num_online_nodes(); j++)
+                if ( (node == -1) || (node == j) )
+#ifdef CONFIG_NUMA
+                    free_pages += avail[i][j];            
+#else
+                    free_pages += avail[i];
+#endif
+
+    return free_pages;
+}
+
+u64 avail_domheap_pages(void)
 {
     unsigned long avail_nrm, avail_dma;
-
-    avail_nrm = avail[MEMZONE_DOM];
-
-    avail_dma = avail[MEMZONE_DMADOM];
+    
+    /* return avail[MEMZONE_DOM] + avail[MEMZONE_DMADOM] */
+    avail_nrm = avail_heap_pages(MEMZONE_DOM,-1);
+    avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1);
+
     if ( avail_dma > lowmem_emergency_pool_pages )
         avail_dma -= lowmem_emergency_pool_pages;
     else
@@ -705,16 +860,21 @@
     return avail_nrm + avail_dma;
 }
 
+u64 avail_nodeheap_pages(int node)
+{
+    return avail_heap_pages(-1, node);
+}
 
 static void pagealloc_keyhandler(unsigned char key)
 {
     printk("Physical memory information:\n");
-    printk("    Xen heap: %lukB free\n"
-           "    DMA heap: %lukB free\n"
-           "    Dom heap: %lukB free\n",
-           avail[MEMZONE_XEN]<<(PAGE_SHIFT-10),
-           avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10),
-           avail[MEMZONE_DOM]<<(PAGE_SHIFT-10));
+
+    printk("    Xen heap: %"PRIu64"kB free\n"
+           "    DMA heap: %"PRIu64"kB free\n"
+           "    Dom heap: %"PRIu64"kB free\n",
+           avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10), 
+           avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10), 
+           avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10));
 }
 
 
diff -r 2a81ffed9e53 -r 44ee2cfd164d xen/include/xen/mm.h
--- a/xen/include/xen/mm.h      Fri Apr 28 17:56:52 2006
+++ b/xen/include/xen/mm.h      Fri Apr 28 18:00:23 2006
@@ -45,7 +45,8 @@
 /* Generic allocator. These functions are *not* interrupt-safe. */
 void init_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned long nr_pages);
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order);
+struct page_info *alloc_heap_pages(
+    unsigned int zone, unsigned int cpu, unsigned int order);
 void free_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned int order);
 void scrub_heap_pages(void);
@@ -61,8 +62,13 @@
 void init_domheap_pages(paddr_t ps, paddr_t pe);
 struct page_info *alloc_domheap_pages(
     struct domain *d, unsigned int order, unsigned int flags);
+#ifdef CONFIG_NUMA
+struct page_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags);
+#endif
 void free_domheap_pages(struct page_info *pg, unsigned int order);
-unsigned long avail_domheap_pages(void);
+u64 avail_domheap_pages(void);
+u64 avail_heap_pages(int zone, int node);
 #define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
 #define free_domheap_page(p)  (free_domheap_pages(p,0))
 
diff -r 2a81ffed9e53 -r 44ee2cfd164d xen/include/xen/numa.h
--- a/xen/include/xen/numa.h    Fri Apr 28 17:56:52 2006
+++ b/xen/include/xen/numa.h    Fri Apr 28 18:00:23 2006
@@ -35,6 +35,8 @@
 extern int cpu_to_node[];
 extern cpumask_t node_to_cpumask[];
 
+int page_to_node(struct page_info *pg);
+
 int numa_init(void);
 
 #endif /* _XEN_NUMA_H */
