
[Xen-devel] [PATCH] 5/7 xen: Add basic NUMA support - Physinfo Stats



This patch exports the NUMA-specific information collected by the hypervisor
via the physinfo hypercall.  The additional information is also
integrated into the xm info command, which now displays the NUMA
information.
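
For anyone who wants the raw data rather than the formatted xm output, here
is a rough sketch (not part of the patch) of how the new fields could be read
through the Python bindings changed below.  The handle is assumed to already
be open, the same way XendNode.py holds one; the dictionary keys follow the
xc.c changes in this patch:

# Sketch only, not part of the patch.  'xc_handle' is assumed to be an
# already-open xen.lowlevel.xc handle, as used by XendNode.py.
def dump_numa_physinfo(xc_handle):
    info = xc_handle.physinfo()
    print 'nr_nodes:', info['nr_nodes']
    for chunk in info['mem_chunks']:        # one dict per memory chunk
        print 'node%d: %#018x-%#018x' % (chunk['node'],
                                         chunk['start_paddr'],
                                         chunk['end_paddr'])
    for node in range(info['nr_nodes']):    # node_to_cpu is a list of cpu lists
        print 'node%d cpus:' % node, info['node_to_cpu'][node]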

Here is a sample output of xm info on a Dual Opteron (2 Node, 2 CPU):

root@bebop:~ # xm info
system                 : Linux
host                   : bebop
release                : 2.6.12.6-xen0-smp
version                : #1 SMP Fri Dec 16 10:44:58 CST 2005
machine                : i686
nr_cpus                : 2
nr_nodes               : 2
sockets_per_node       : 2
cores_per_socket       : 1
threads_per_core       : 1
cpu_mhz                : 2193
hw_caps                : 078bfbff:e1d3fbff:00000000:00000010
total_memory           : 3583
free_memory            : 2907
mem_chunks             : node0:0x0000000000000000-0x000000000009ffff
                         node0:0x0000000000100000-0x000000007fffffff
                         node1:0x0000000080000000-0x00000000dfffffff
node_to_cpu            : node0:0
                         node1:1
xen_major              : 3
xen_minor              : 0
xen_extra              : .0
xen_caps               : xen-3.0-x86_32
platform_params        : virt_start=0xfc000000
xen_changeset          : Fri Dec 16 10:34:22 2005 -0500 8396:652d00e358e4
cc_compiler            : gcc version 3.3.5 (Debian 1:3.3.5-8ubuntu2)
cc_compile_by          : rharper
cc_compile_domain      : localdomain
cc_compile_date        : Fri Dec 16 13:51:48 CST 2005



nr_nodes               : 2

Note that this is now calculated from num_online_nodes, rather than a
hard-coded value of 1.

mem_chunks             : node0:0x0000000000000000-0x000000000009ffff
                         node0:0x0000000000100000-0x000000007fffffff
                         node1:0x0000000080000000-0x00000000dfffffff

We display the 64-bit address range of each memory chunk and the node to
which it belongs.
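
As a side note (not part of the patch), the chunk list returned by
xc.physinfo() can also be summed to give an approximate per-node memory
size; a small sketch, treating end_paddr as inclusive as in the output above:

# Sketch only: total bytes per node from xc.physinfo()['mem_chunks'],
# treating end_paddr as the inclusive end of the chunk.
def node_memory_bytes(info):
    totals = {}
    for chunk in info['mem_chunks']:
        size = chunk['end_paddr'] - chunk['start_paddr'] + 1
        totals[chunk['node']] = totals.get(chunk['node'], 0) + size
    return totals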

node_to_cpu            : node0:0
                         node1:1

This provides the node-to-cpu mapping.  The cpu value is a collapsed
range; for example, on a two-node 32-way system, the node_to_cpu value
might look like the following (the collapsing logic is sketched after the
example):

node_to_cpu            : node0:0-15
                         node1:16-31
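
The range collapsing itself is done by the helpers added to XendNode.py
below; for illustration only, the same idea in standalone form:

# Illustration only: collapse a list of cpu numbers into (start, end) pairs,
# e.g. [0, 1, 2, 3, 8, 9] -> [(0, 3), (8, 9)].
def collapse_cpus(cpus):
    if not cpus:
        return []
    cpus = list(cpus)
    cpus.sort()
    start = prev = cpus[0]
    pairs = []
    for cpu in cpus[1:]:
        if cpu - prev > 1:            # gap found, close the current range
            pairs.append((start, prev))
            start = cpu
        prev = cpu
    pairs.append((start, prev))
    return pairs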

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@xxxxxxxxxx


diffstat output:
 b/xen/include/public/numa_structs.h |   19 ++++++++
 tools/libxc/xc_misc.c               |    3 +
 tools/libxc/xenctrl.h               |    3 +
 tools/python/xen/lowlevel/xc/xc.c   |   64 +++++++++++++++++++++++++++--
 tools/python/xen/xend/XendNode.py   |   67 ++++++++++++++++++++++++++++++
 xen/arch/x86/dom0_ops.c             |   78 +++++++++++++++++++++++++++++++++++-
 xen/include/public/dom0_ops.h       |    4 +
 xen/include/xen/numa.h              |    7 ---
 8 files changed, 232 insertions(+), 13 deletions(-)

Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
Signed-off-by: Ryan Grimm <grimm@xxxxxxxxxx>
---
diff -r ce4e724a0cdd -r a9dc1db4006c tools/libxc/xc_misc.c
--- a/tools/libxc/xc_misc.c     Wed Dec 14 22:49:59 2005
+++ b/tools/libxc/xc_misc.c     Wed Dec 14 23:03:37 2005
@@ -56,6 +56,9 @@
     
     op.cmd = DOM0_PHYSINFO;
     op.interface_version = DOM0_INTERFACE_VERSION;
+    /* set pointers to caller's so memcpy doesn't clobber them */
+    op.u.physinfo.memory_chunks = put_info->memory_chunks;
+    op.u.physinfo.node_to_cpu = put_info->node_to_cpu;
 
     if ( (ret = do_dom0_op(xc_handle, &op)) != 0 )
         return ret;
diff -r ce4e724a0cdd -r a9dc1db4006c tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Wed Dec 14 22:49:59 2005
+++ b/tools/libxc/xenctrl.h     Wed Dec 14 23:03:37 2005
@@ -20,6 +20,7 @@
 #include <xen/sched_ctl.h>
 #include <xen/memory.h>
 #include <xen/acm.h>
+#include <xen/numa_structs.h>
 
 #ifdef __ia64__
 #define XC_PAGE_SHIFT           14
@@ -350,6 +351,8 @@
                        int clear);
 
 typedef dom0_physinfo_t xc_physinfo_t;
+typedef struct node_memory_chunk_s xc_memory_chunk_t;
+typedef uint64_t xc_node_to_cpu_t;
 int xc_physinfo(int xc_handle,
                 xc_physinfo_t *info);
 
diff -r ce4e724a0cdd -r a9dc1db4006c tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Wed Dec 14 22:49:59 2005
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Dec 14 23:03:37 2005
@@ -597,8 +597,19 @@
 {
     xc_physinfo_t info;
     char cpu_cap[128], *p=cpu_cap, *q=cpu_cap;
-    int i;
-    
+    int i,j;
+    PyObject *ret_obj, *memchunk_obj, *node_to_cpu_obj;
+
+    /* make space for mem chunks */
+    info.memory_chunks = 
+        (xc_memory_chunk_t *)malloc( sizeof(xc_memory_chunk_t) * 
+                                     PUBLIC_MAXCHUNKS );
+
+    /* make space for node_to_cpu mapping */
+    info.node_to_cpu = 
+        (xc_node_to_cpu_t *)malloc( sizeof(xc_node_to_cpu_t) *
+                                    PUBLIC_MAX_NUMNODES ); 
+
     if ( xc_physinfo(self->xc_handle, &info) != 0 )
         return PyErr_SetFromErrno(xc_error);
 
@@ -611,16 +622,59 @@
     }
     if(q>cpu_cap)
         *(q-1)=0;
-
-    return Py_BuildValue("{s:i,s:i,s:i,s:i,s:l,s:l,s:i,s:s}",
+    
+    ret_obj = Py_BuildValue("{s:i,s:i,s:i,s:l,s:l,s:i,s:s}",
                          "threads_per_core", info.threads_per_core,
                          "cores_per_socket", info.cores_per_socket,
                          "sockets_per_node", info.sockets_per_node,
-                         "nr_nodes",         info.nr_nodes,
                          "total_memory",     pages_to_mb(info.total_pages),
                          "free_memory",      pages_to_mb(info.free_pages),
                          "cpu_khz",          info.cpu_khz,
                          "hw_caps",          cpu_cap);
+     
+    /* memchunks */
+    memchunk_obj = PyList_New(0);
+ 
+    /* build list of each memchunk's attributes */
+    for ( i=0; i<info.nr_chunks; i++ ) 
+    {
+        PyList_Append(memchunk_obj, 
+                      Py_BuildValue("{s:i,s:K,s:K}",
+                      "node"       , info.memory_chunks[i].nid,
+                      "start_paddr", info.memory_chunks[i].start_paddr,
+                      "end_paddr"  , info.memory_chunks[i].end_paddr));
+    }
+    /* add list of attributes and nr_chunks to physinfo dictionary */
+    PyDict_SetItemString(ret_obj, "mem_chunks", memchunk_obj);
+    PyDict_SetItemString(ret_obj, "nr_chunks", 
+             Py_BuildValue("i", info.nr_chunks));
+ 
+    /* node to cpu mappings */
+    node_to_cpu_obj = PyList_New(0);
+    /* build list of node to cpu mappings */
+    for ( i=0; i<info.nr_nodes; i++ )
+    {
+        cpumap_t cpumap = (cpumap_t)info.node_to_cpu[i];
+        PyObject *cpus = PyList_New(0);
+ 
+        for ( j=0; cpumap != 0; j++ ) 
+        {
+            if ( cpumap & 1 )
+                PyList_Append(cpus, PyInt_FromLong(j));
+            cpumap >>=1;
+        }
+        PyList_Append(node_to_cpu_obj, cpus); 
+    }
+    /* add list of node to cpu mappings and nr_nodes to physinfo dictionary */
+    PyDict_SetItemString(ret_obj, "node_to_cpu",  node_to_cpu_obj);
+    PyDict_SetItemString(ret_obj, "nr_nodes", 
+             Py_BuildValue("i", info.nr_nodes));
+
+    /* free malloc'd memory */
+    free(info.memory_chunks);
+    free(info.node_to_cpu);
+ 
+    return ret_obj;
 }
 
 static PyObject *pyxc_xeninfo(XcObject *self)
diff -r ce4e724a0cdd -r a9dc1db4006c tools/python/xen/xend/XendNode.py
--- a/tools/python/xen/xend/XendNode.py Wed Dec 14 22:49:59 2005
+++ b/tools/python/xen/xend/XendNode.py Wed Dec 14 23:03:37 2005
@@ -56,6 +56,69 @@
                 ['version', ver],
                 ['machine', mch]]
 
+    def list_to_rangepairs(self,cmap):
+            cmap.sort()
+            pairs = []
+            x = y = 0
+            for i in range(0,len(cmap)):
+                try:
+                    if ((cmap[y+1] - cmap[i]) > 1):
+                        pairs.append((cmap[x],cmap[y]))
+                        x = y = i+1
+                    else:
+                        y = y + 1
+                # if we go off the end, then just add x to y
+                except IndexError:
+                    pairs.append((cmap[x],cmap[y]))
+
+            return pairs
+
+    def format_pairs(self,pairs):
+            if not pairs:
+                return "no cpus"
+            out = ""
+            for f,s in pairs:
+                if (f==s):
+                    out += '%d'%f
+                else:
+                    out += '%d-%d'%(f,s)
+                out += ','
+            # trim trailing ','
+            return out[:-1]
+
+    def list_to_strrange(self,list):
+        return self.format_pairs(self.list_to_rangepairs(list))
+
+    def format_memchunks(self, pinfo):
+        str=''
+        whitespace=''
+        try:
+            chunk=pinfo['mem_chunks']
+            for i in range(0, pinfo['nr_chunks']):
+                str+='%snode%d:0x%016x-0x%016x\n' % (whitespace,
+                                                    chunk[i]['node'],
+                                                    chunk[i]['start_paddr'], 
+                                                    chunk[i]['end_paddr']) 
+                whitespace='%25s' % ''
+        except:
+            str='none\n' 
+        return str[:-1]
+        
+    def format_node_to_cpu(self, pinfo):
+        str=''
+        whitespace=''
+        try:
+            node_to_cpu=pinfo['node_to_cpu']
+            for i in range(0, pinfo['nr_nodes']):
+                str+='%snode%d:%s\n' % (whitespace,
+                                        i, 
+                                      self.list_to_strrange(node_to_cpu[i]))
+                whitespace='%25s' % ''        
+        except:
+            str='none\n'
+        return str[:-1];
+
+
     def physinfo(self):
         info = self.xc.physinfo()
 
@@ -64,6 +127,8 @@
                            info['cores_per_socket'] *
                            info['threads_per_core'])
         info['cpu_mhz'] = info['cpu_khz'] / 1000
+        info['mem_chunks'] = self.format_memchunks(info)
+        info['node_to_cpu'] = self.format_node_to_cpu(info)
 
         ITEM_ORDER = ['nr_cpus',
                       'nr_nodes',
@@ -74,6 +139,8 @@
                       'hw_caps',
                       'total_memory',
                       'free_memory',
+                      'mem_chunks',
+                      'node_to_cpu'
                       ]
 
         return [[k, info[k]] for k in ITEM_ORDER]
diff -r ce4e724a0cdd -r a9dc1db4006c xen/arch/x86/dom0_ops.c
--- a/xen/arch/x86/dom0_ops.c   Wed Dec 14 22:49:59 2005
+++ b/xen/arch/x86/dom0_ops.c   Wed Dec 14 23:03:37 2005
@@ -21,6 +21,7 @@
 #include <asm/irq.h>
 #include <asm/processor.h>
 #include <public/sched_ctl.h>
+#include <xen/numa.h>
 
 #include <asm/mtrr.h>
 #include "mtrr/mtrr.h"
@@ -180,20 +181,93 @@
     case DOM0_PHYSINFO:
     {
         dom0_physinfo_t *pi = &op->u.physinfo;
+        int i;
+        u64 node_to_cpu_64[MAX_NUMNODES];
 
         pi->threads_per_core = smp_num_siblings;
         pi->cores_per_socket = boot_cpu_data.x86_num_cores;
         pi->sockets_per_node = 
             num_online_cpus() / (pi->threads_per_core * pi->cores_per_socket);
-        pi->nr_nodes         = 1;
         pi->total_pages      = total_pages;
         pi->free_pages       = avail_domheap_pages();
         pi->cpu_khz          = cpu_khz;
         memset(pi->hw_cap, 0, sizeof(pi->hw_cap));
         memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4);
+
+#ifdef CONFIG_NUMA
+        /* memory chunks */
+        pi->nr_chunks = num_memory_chunks;
+        DPRINTK("num_memory_chunks:%d\n", num_memory_chunks);
+        for ( i=0; i<num_memory_chunks; i++ ) {
+            DPRINTK("node%d:%"PRIx64"\n", node_memory_chunk[i].nid,
+                                          node_memory_chunk[i].start_paddr);
+            DPRINTK("node%d:%"PRIx64"\n", node_memory_chunk[i].nid,  
+                                          node_memory_chunk[i].end_paddr);
+        }
+
+        /* node to cpu mask */
+        pi->nr_nodes = nodes_detected;
+        for ( i=0; i<nodes_detected; i++ )
+            DPRINTK("node_to_cpu:%lx\n", node_to_cpumask[i].bits[0]);
+    
+        /* copy memory chunk structs to userspace */
+        ret = 0;
+        if ( copy_to_user(u_dom0_op->u.physinfo.memory_chunks, 
+                          node_memory_chunk, 
+                          sizeof(struct node_memory_chunk_s) * 
+                          num_memory_chunks) ) {
+            ret = -EFAULT;
+            break;
+        }
+            
+        /* copy cpu to node mapping to domU */
+        /* converting cpumask to u64 b/c userspace doesn't know about cpumask_t
+           and is accepting a u64 */
+        memset(node_to_cpu_64, 0, sizeof(node_to_cpu_64));
+        for ( i=0; i<nodes_detected; i++) {
+            int j = 0;
+            for ( j=0; j<num_online_cpus(); j++)
+                if ( cpu_isset(j, node_to_cpumask[i]) )
+                    node_to_cpu_64[i] |= (u64)1 << j;        
+        }
+        if ( copy_to_user(u_dom0_op->u.physinfo.node_to_cpu, 
+                          node_to_cpu_64, 
+                          sizeof(node_to_cpu_64[0]) * nodes_detected ) ) {
+            ret = -EFAULT;
+            break;
+        }
+#else
+        /* if no CONFIG_NUMA, construct a memory chunk of all memory 
+         * in system and node to all online cpus map */ 
+        pi->nr_chunks = 1; 
+        /* send over node_memory_chunk */
+        struct node_memory_chunk_s chunk;
+        chunk.start_paddr = 0;
+        chunk.end_paddr = total_pages * PAGE_SIZE;
+        chunk.nid = 1;
+        chunk.pxm = 1;
+        ret = 0;
+        if ( copy_to_user(u_dom0_op->u.physinfo.memory_chunks, 
+                          &chunk, 
+                          sizeof(struct node_memory_chunk_s)) ) {
+            ret = -EFAULT;
+            break;
+        }
+        
+        /* create node to cpu mapping of one node to all online cpus */
+        pi->nr_nodes = 1; 
+        node_to_cpu_64[0] = 0;
+        for ( i=0; i<num_online_cpus(); i++)
+            node_to_cpu_64[0] |= (u64)1 << i; 
+        if ( copy_to_user(u_dom0_op->u.physinfo.node_to_cpu, 
+                          node_to_cpu_64, sizeof(node_to_cpu_64[0])) ) {
+            ret = -EFAULT;
+            break;
+        }
+#endif    
         ret = 0;
         if ( copy_to_user(u_dom0_op, op, sizeof(*op)) )
-           ret = -EFAULT;
+            ret = -EFAULT;
     }
     break;
     
diff -r ce4e724a0cdd -r a9dc1db4006c xen/include/public/dom0_ops.h
--- a/xen/include/public/dom0_ops.h     Wed Dec 14 22:49:59 2005
+++ b/xen/include/public/dom0_ops.h     Wed Dec 14 23:03:37 2005
@@ -13,6 +13,7 @@
 
 #include "xen.h"
 #include "sched_ctl.h"
+#include "numa_structs.h"
 
 /*
  * Make sure you increment the interface version whenever you modify this file!
@@ -203,6 +204,9 @@
     unsigned long total_pages;
     unsigned long free_pages;
     uint32_t hw_cap[8];
+    uint32_t nr_chunks;
+    struct node_memory_chunk_s *memory_chunks;
+    cpumap_t *node_to_cpu;
 } dom0_physinfo_t;
 
 /*
diff -r ce4e724a0cdd -r a9dc1db4006c xen/include/xen/numa.h
--- a/xen/include/xen/numa.h    Wed Dec 14 22:49:59 2005
+++ b/xen/include/xen/numa.h    Wed Dec 14 23:03:37 2005
@@ -2,6 +2,7 @@
 #define _LINUX_NUMA_H
 
 #include <xen/config.h>
+#include <public/numa_structs.h>
 
 #ifdef CONFIG_DISCONTIGMEM
 #include <asm/numnodes.h>
@@ -20,12 +21,6 @@
 #define MAXCHUNKS    (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
 
 
-struct node_memory_chunk_s {
-   u64 start_paddr;
-   u64 end_paddr;
-   u8 pxm;     // proximity domain of node
-   u8 nid;     // which cnode contains this chunk?
-};
 
 extern struct node_memory_chunk_s node_memory_chunk[];
 extern int num_memory_chunks;
diff -r ce4e724a0cdd -r a9dc1db4006c xen/include/public/numa_structs.h
--- /dev/null   Wed Dec 14 22:49:59 2005
+++ b/xen/include/public/numa_structs.h Wed Dec 14 23:03:37 2005
@@ -0,0 +1,19 @@
+#ifndef __XEN_PUBLIC_NUMA_STRUCTS_H__
+
+#define __XEN_PUBLIC_NUMA_STRUCTS_H__
+
+#include "xen.h"
+
+/* define these for xc to use b/c MAX_NUMNODES and MAX_CHUNKS
+ * are not exposed in /public */
+#define PUBLIC_MAX_NUMNODES 16
+#define PUBLIC_MAXCHUNKS 32
+
+struct node_memory_chunk_s {
+   uint64_t start_paddr; /* physical address of chunk start */
+   uint64_t end_paddr;   /* physical address of chunk end */
+   uint8_t pxm;          /* proximity domain of node */
+   uint8_t nid;          /* which cnode contains this chunk? */
+};
+
+#endif
