[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [RFC] [PATCH] Memory allocation from the nearest node.



On current Xen, memory is allocated from the NUMA node on which vcpu#0 is
running. If there is no free memory on that node, the next NUMA node is
selected in numeric order. But the second selected node is not always the
nearest node to the first selected node.

For example, if NUMA node #n is selected as the first node on a machine
which has m nodes, free memory is searched in the following order.

NUMA node #
  (n) -> (n+1) -> ... -> (m-2) -> (m-1) -> (0) -> (1) -> ... -> (n-1)

On memory allocation, if there is no free memory on the first selected
node, the nearest node should be used for the next memory allocation,
because we can get better memory latency by using memory of a nearby node.

This patch fixes to select the nearest node for next memory allocation.

The BIOS sets the distance between NUMA nodes in the SLIT table of ACPI.
This patch uses the values of the SLIT table to get the distance between
NUMA nodes. If ACPI does not provide a SLIT table, the value 10 is used
for the same node and 20 for a different node.

I would like to ask for any comments from developers.
I will make the modified patch after Xen 3.4 is released.

Thanks,
--
Yuji Shimada


Signed-off-by: Yuji Shimada <shimada-yxb@xxxxxxxxxxxxxxx>

diff -r 5a60eb7fad79 xen/arch/x86/srat.c
--- a/xen/arch/x86/srat.c       Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/arch/x86/srat.c       Tue Apr 07 10:40:06 2009 +0900
@@ -307,7 +307,7 @@
        int index;
 
        if (!acpi_slit)
-               return a == b ? 10 : 20;
+               return a == b ? LOCAL_DISTANCE : REMOTE_DISTANCE;
        index = acpi_slit->locality_count * node_to_pxm(a);
        return acpi_slit->entry[index + node_to_pxm(b)];
 }
diff -r 5a60eb7fad79 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/common/page_alloc.c   Tue Apr 07 10:40:06 2009 +0900
@@ -332,6 +332,36 @@
     return needed;
 }
 
+/*
+ * find_nearest_node - find the nearest node
+ * @node: NUMA node number
+ * @used_node_mask: already used nodes
+ *
+ * It returns -1 if no node is found.
+ */
+static int find_nearest_node(unsigned int node, nodemask_t *used_node_mask)
+{
+    int n, val;
+    int min_val = INT_MAX;
+    int best_node = -1;
+
+    for_each_online_node(n) {
+        /* Don't want a node more than once */
+        if (node_isset(n, *used_node_mask))
+            continue;
+
+        /* Use the distance array to find the distance */
+        val = node_distance(node, n);
+
+        if (val < min_val) {
+            min_val = val;
+            best_node = n;
+        }
+    }
+
+    return best_node;
+}
+
 /* Allocate 2^@order contiguous pages. */
 static struct page_info *alloc_heap_pages(
     unsigned int zone_lo, unsigned int zone_hi,
@@ -342,6 +372,8 @@
     unsigned long request = 1UL << order;
     cpumask_t extra_cpus_mask, mask;
     struct page_info *pg;
+    int nearest_node;
+    nodemask_t used_node_mask;
 
     if ( node == NUMA_NO_NODE )
         node = cpu_to_node(smp_processor_id());
@@ -354,6 +386,9 @@
     if ( unlikely(order > MAX_ORDER) )
         return NULL;
 
+    nearest_node = node;
+    nodes_clear(used_node_mask);
+
     spin_lock(&heap_lock);
 
     /*
@@ -363,28 +398,33 @@
      */
     for ( i = 0; i < num_nodes; i++ )
     {
+        node_set(nearest_node, used_node_mask);
         zone = zone_hi;
         do {
             /* Check if target node can support the allocation. */
-            if ( !avail[node] || (avail[node][zone] < request) )
+            if ( !avail[nearest_node] || (avail[nearest_node][zone] < request) )
                 continue;
 
             /* Find smallest order which can satisfy the request. */
             for ( j = order; j <= MAX_ORDER; j++ )
-                if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
+                if ( (pg =
+                   page_list_remove_head(&heap(nearest_node, zone, j))) )
                     goto found;
         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
 
-        /* Pick next node, wrapping around if needed. */
-        if ( ++node == num_nodes )
-            node = 0;
+        /* Pick the next nearest node. */
+        if ( (nearest_node = find_nearest_node(node, &used_node_mask)) < 0 ) {
+            break;
+        }
     }
 
     /* No suitable memory blocks. Fail the request. */
     spin_unlock(&heap_lock);
     return NULL;
 
- found: 
+ found:
+    node = nearest_node;
+
     /* We may have to halve the chunk a number of times. */
     while ( j != order )
     {
diff -r 5a60eb7fad79 xen/include/asm-x86/numa.h
--- a/xen/include/asm-x86/numa.h        Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/include/asm-x86/numa.h        Tue Apr 07 10:40:06 2009 +0900
@@ -23,6 +23,8 @@
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 #define VIRTUAL_BUG_ON(x) 
 #define NODEMAPSIZE 0xfff
+#define LOCAL_DISTANCE 10
+#define REMOTE_DISTANCE 20
 
 extern void numa_add_cpu(int cpu);
 extern void numa_init_array(void);
@@ -67,10 +69,13 @@
 #define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
                                 NODE_DATA(nid)->node_spanned_pages)
 
+extern int __node_distance(int, int);
+#define node_distance(a, b) __node_distance(a,b)
 
 #else
 #define init_cpu_to_node() do {} while (0)
 #define clear_node_cpumask(cpu) do {} while (0)
+#define node_distance(a, b) ((a) == (b) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
 #endif
 
 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.