[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] [XEN] Add basic NUMA/SRAT support to Xen from Linux 2.6.16.29.



# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Node ID f312c2d01d8b9a3c237543b65157da83696cbff5
# Parent  a1f987e9640f3824b15158be1ba0d426503e282f
[XEN] Add basic NUMA/SRAT support to Xen from Linux 2.6.16.29.
Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
---
 xen/arch/x86/Makefile                        |    2 
 xen/arch/x86/numa.c                          |  302 +++++++++++++++++++++++
 xen/arch/x86/setup.c                         |   34 ++
 xen/arch/x86/smpboot.c                       |    4 
 xen/arch/x86/srat.c                          |  325 +++++++++++++++++++++++++
 xen/drivers/acpi/Makefile                    |    1 
 xen/drivers/acpi/numa.c                      |  216 +++++++++++++++++
 xen/include/asm-x86/acpi.h                   |    3 
 xen/include/asm-x86/config.h                 |    5 
 xen/include/asm-x86/mach-generic/mach_apic.h |    6 
 xen/include/asm-x86/numa.h                   |   65 +++++
 xen/include/asm-x86/numnodes.h               |   26 ++
 xen/include/asm-x86/topology.h               |   40 +++
 xen/include/xen/config.h                     |    2 
 xen/include/xen/nodemask.h                   |  342 +++++++++++++++++++++++++++
 xen/include/xen/numa.h                       |   35 ++
 xen/include/xen/topology.h                   |   27 ++
 17 files changed, 1428 insertions(+), 7 deletions(-)

diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile     Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/arch/x86/Makefile     Wed Oct 25 12:25:54 2006 +0100
@@ -28,12 +28,14 @@ obj-y += mm.o
 obj-y += mm.o
 obj-y += mpparse.o
 obj-y += nmi.o
+obj-y += numa.o
 obj-y += physdev.o
 obj-y += rwlock.o
 obj-y += setup.o
 obj-y += shutdown.o
 obj-y += smp.o
 obj-y += smpboot.o
+obj-y += srat.o
 obj-y += string.o
 obj-y += sysctl.o
 obj-y += time.o
diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/arch/x86/setup.c      Wed Oct 25 12:25:54 2006 +0100
@@ -16,6 +16,7 @@
 #include <xen/percpu.h>
 #include <xen/hypercall.h>
 #include <xen/keyhandler.h>
+#include <xen/numa.h>
 #include <public/version.h>
 #include <asm/bitops.h>
 #include <asm/smp.h>
@@ -25,10 +26,12 @@
 #include <asm/desc.h>
 #include <asm/shadow.h>
 #include <asm/e820.h>
+#include <asm/numa.h>
 #include <acm/acm_hooks.h>
 
 extern void dmi_scan_machine(void);
 extern void generic_apic_probe(void);
+extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
 
 /*
  * opt_xenheap_megabytes: Size of Xen heap in megabytes, excluding the
@@ -59,6 +62,9 @@ boolean_param("watchdog", opt_watchdog);
 /* "acpi=noirq":  Disables ACPI interrupt routing.                  */
 static void parse_acpi_param(char *s);
 custom_param("acpi", parse_acpi_param);
+
+extern int numa_setup(char *s);
+custom_param("numa", numa_setup);
 
 /* **** Linux config option: propagated to domain0. */
 /* acpi_skip_timer_override: Skip IRQ0 overrides. */
@@ -255,6 +261,20 @@ static void __init init_idle_domain(void
     idle_vcpu[0] = this_cpu(curr_vcpu) = current;
 
     setup_idle_pagetable();
+}
+
+/*
+ * Record which NUMA node @cpu belongs to: look up the CPU's APIC ID,
+ * map it through apicid_to_node[], and store the result with
+ * numa_set_node().  A CPU whose APIC ID has no node entry falls back
+ * to node 0.
+ */
+static void srat_detect_node(int cpu)
+{
+   unsigned node;
+   u8 apicid = x86_cpu_to_apicid[cpu];
+
+   node = apicid_to_node[apicid];
+   if (node == NUMA_NO_NODE)
+      node = 0;
+   numa_set_node(cpu, node);
+
+   /* Only report the mapping when acpi_numa is positive. */
+   if (acpi_numa > 0)
+      printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
 }
 
 void __init __start_xen(multiboot_info_t *mbi)
@@ -485,6 +505,12 @@ void __init __start_xen(multiboot_info_t
 
     init_frametable();
 
+    acpi_boot_table_init();
+
+    acpi_numa_init();
+
+    numa_initmem_init(0, max_page);
+
     end_boot_allocator();
 
     /* Initialise the Xen heap, skipping RAM holes. */
@@ -536,8 +562,9 @@ void __init __start_xen(multiboot_info_t
 
     generic_apic_probe();
 
-    acpi_boot_table_init();
     acpi_boot_init();
+
+    init_cpu_to_node();
 
     if ( smp_found_config )
         get_smp_config();
@@ -589,6 +616,11 @@ void __init __start_xen(multiboot_info_t
             break;
         if ( !cpu_online(i) )
             __cpu_up(i);
+
+               /* setup cpu_to_node[] */
+        srat_detect_node(i);
+               /* setup node_to_cpumask based on cpu_to_node[] */
+        numa_add_cpu(i);        
     }
 
     printk("Brought up %ld CPUs\n", (long)num_online_cpus());
diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/arch/x86/smpboot.c    Wed Oct 25 12:25:54 2006 +0100
@@ -43,6 +43,8 @@
 #include <xen/delay.h>
 #include <xen/softirq.h>
 #include <xen/serial.h>
+#include <xen/numa.h>
+#include <asm/numa.h>
 #include <asm/current.h>
 #include <asm/mc146818rtc.h>
 #include <asm/desc.h>
@@ -628,7 +630,7 @@ static void map_cpu_to_logical_apicid(vo
 static void map_cpu_to_logical_apicid(void)
 {
        int cpu = smp_processor_id();
-       int apicid = logical_smp_processor_id();
+       int apicid = hard_smp_processor_id();
 
        cpu_2_logical_apicid[cpu] = apicid;
        map_cpu_to_node(cpu, apicid_to_node(apicid));
diff -r a1f987e9640f -r f312c2d01d8b xen/drivers/acpi/Makefile
--- a/xen/drivers/acpi/Makefile Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/drivers/acpi/Makefile Wed Oct 25 12:25:54 2006 +0100
@@ -1,1 +1,2 @@ obj-y += tables.o
 obj-y += tables.o
+obj-y += numa.o
diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/acpi.h
--- a/xen/include/asm-x86/acpi.h        Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/include/asm-x86/acpi.h        Wed Oct 25 12:25:54 2006 +0100
@@ -157,6 +157,8 @@ static inline void check_acpi_pci(void) 
 
 static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
 static inline int acpi_irq_balance_set(char *str) { return 0; }
+extern int acpi_scan_nodes(u64 start, u64 end);
+extern int acpi_numa;
 
 #ifdef CONFIG_ACPI_SLEEP
 
@@ -173,5 +175,6 @@ extern void acpi_reserve_bootmem(void);
 #endif /*CONFIG_ACPI_SLEEP*/
 
 extern u8 x86_acpiid_to_apicid[];
+#define MAX_LOCAL_APIC 256
 
 #endif /*_ASM_ACPI_H*/
diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h      Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/include/asm-x86/config.h      Wed Oct 25 12:25:54 2006 +0100
@@ -24,6 +24,11 @@
 #define CONFIG_X86_IO_APIC 1
 #define CONFIG_HPET_TIMER 1
 #define CONFIG_X86_MCE_P4THERMAL 1
+#define CONFIG_ACPI_NUMA 1
+#define CONFIG_NUMA 1
+#define CONFIG_ACPI_SRAT 1
+#define CONFIG_DISCONTIGMEM 1
+#define CONFIG_NUMA_EMU 1
 
 /* Intel P4 currently has largest cache line (L2 line size is 128 bytes). */
 #define CONFIG_X86_L1_CACHE_SHIFT 7
diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/mach-generic/mach_apic.h
--- a/xen/include/asm-x86/mach-generic/mach_apic.h      Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/include/asm-x86/mach-generic/mach_apic.h      Wed Oct 25 12:25:54 2006 +0100
@@ -22,11 +22,7 @@ static inline void enable_apic_mode(void
        return;
 }
 
-/* No sane NUMA support right now. We should parse ACPI SRAT. */
-static inline int apicid_to_node(int logical_apicid)
-{
-       return 0;
-}
+/* Map an APIC ID to its NUMA node through the apicid_to_node[] table.
+   The argument is parenthesized so the (u8) cast applies to the whole
+   expression passed in, not just its first operand. */
+#define apicid_to_node(apicid) ((int)apicid_to_node[(u8)(apicid)])
 
 extern u8 bios_cpu_apicid[];
 static inline int cpu_present_to_apicid(int mps_cpu)
diff -r a1f987e9640f -r f312c2d01d8b xen/include/xen/config.h
--- a/xen/include/xen/config.h  Wed Oct 25 11:51:23 2006 +0100
+++ b/xen/include/xen/config.h  Wed Oct 25 12:25:54 2006 +0100
@@ -50,5 +50,7 @@
 #endif /* !__ASSEMBLY__ */
 
 #define fastcall
+#define __cpuinitdata
+#define __cpuinit
 
 #endif /* __XEN_CONFIG_H__ */
diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/numa.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/numa.c       Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,302 @@
+/* 
+ * Generic VM initialization for x86-64 NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Adapted for Xen: Ryan Harper <ryanh@xxxxxxxxxx>
+ */ 
+
+#include <xen/mm.h>
+#include <xen/string.h>
+#include <xen/init.h>
+#include <xen/ctype.h>
+#include <xen/nodemask.h>
+#include <xen/numa.h>
+#include <xen/keyhandler.h>
+#include <xen/time.h>
+
+#include <asm/numa.h>
+#include <asm/acpi.h>
+
+#ifndef Dprintk
+#define Dprintk(x...)
+#endif
+
+/* from proto.h */
+#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))
+
+/* Per-node descriptors; setup_node_bootmem() fills them in through
+   NODE_DATA() (presumably an index into this array - confirm in
+   asm/numa.h). */
+struct node_data node_data[MAX_NUMNODES];
+
+/* Address -> node lookup: memnodemap[addr >> shift] holds the node index,
+   as written by populate_memnodemap(); memnode_shift is the chosen shift. */
+int memnode_shift;
+u8  memnodemap[NODEMAPSIZE];
+
+/* CPU -> node; NUMA_NO_NODE until a node has been assigned. */
+unsigned int cpu_to_node[NR_CPUS] __read_mostly = {
+       [0 ... NR_CPUS-1] = NUMA_NO_NODE
+};
+/* APIC ID -> node; NUMA_NO_NODE where no mapping is known. */
+unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+       [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+/* Node -> mask of CPUs; bits are set by numa_add_cpu(). */
+cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
+
+/* Online nodes; the initializer sets only the lowest bit of word 0. */
+nodemask_t node_online_map = { { [0] = 1UL } };
+
+/* Set to 1 by the "numa=off" boot option. */
+int numa_off __initdata;
+
+/* -1: SRAT disabled ("numa=noacpi") or rejected (bad_srat());
+    1: a SRAT processor affinity entry has been accepted. */
+int acpi_numa __initdata;
+
+/*
+ * Given a shift value, try to populate memnodemap[] so that
+ * memnodemap[addr >> shift] is the index of the node containing addr.
+ * Nodes with start >= end are skipped.
+ * Returns :
+ * 1 if OK
+ * 0 if memnodemap[] too small (or shift too small)
+ * -1 if shift >= 64, if nodes overlap in one slot (shift too big),
+ *    or if no node had any range to map
+ */
+static int __init
+populate_memnodemap(const struct node *nodes, int numnodes, int shift)
+{
+       int i; 
+       int res = -1;
+       unsigned long addr, end;
+
+       if (shift >= 64)
+               return -1;
+       /* 0xff marks a slot that no node has claimed yet. */
+       memset(memnodemap, 0xff, sizeof(memnodemap));
+       for (i = 0; i < numnodes; i++) {
+               addr = nodes[i].start;
+               end = nodes[i].end;
+               if (addr >= end)
+                       continue;
+               if ((end >> shift) >= NODEMAPSIZE)
+                       return 0;
+               do {
+                       /* Slot already claimed: the granule is shared. */
+                       if (memnodemap[addr >> shift] != 0xff)
+                               return -1;
+                       memnodemap[addr >> shift] = i;
+                       addr += (1UL << shift);
+               } while (addr < end);
+               res = 1;
+       } 
+       return res;
+}
+
+/*
+ * Pick the shift used to index memnodemap[] for the given node ranges.
+ * Starting from 20, the shift is raised for as long as
+ * populate_memnodemap() does not return -1 for the next larger value;
+ * memnodemap[] is then filled once more with the chosen shift.
+ * Returns the shift, or -1 if that final populate does not return 1.
+ */
+int __init compute_hash_shift(struct node *nodes, int numnodes)
+{
+       int shift = 20;
+
+       while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
+               shift++;
+
+       printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
+               shift);
+
+       /* The probing above left memnodemap[] filled for shift + 1;
+          redo it for the shift actually selected. */
+       if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+               printk(KERN_INFO
+       "Your memory is not aligned you need to rebuild your kernel "
+       "with a bigger NODEMAPSIZE shift=%d\n",
+                       shift);
+               return -1;
+       }
+       return shift;
+}
+
+/*
+ * Initialize NODE_DATA(nodeid) from the byte addresses start/end and mark
+ * the node online.  Both addresses are converted to page frame numbers;
+ * the spanned page count is their difference.
+ */
+void __init setup_node_bootmem(int nodeid, u64 start, u64 end)
+{ 
+       unsigned long start_pfn, end_pfn;
+
+       start_pfn = start >> PAGE_SHIFT;
+       end_pfn = end >> PAGE_SHIFT;
+
+       NODE_DATA(nodeid)->node_id = nodeid;
+       NODE_DATA(nodeid)->node_start_pfn = start_pfn;
+       NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
+
+       node_set_online(nodeid);
+} 
+
+/*
+ * Assign a node to every CPU slot that does not have one yet, handing
+ * out the online nodes in round-robin order.
+ */
+void __init numa_init_array(void)
+{
+       int rr, i;
+       /* There are unfortunately some poorly designed mainboards around
+          that only connect memory to a single CPU. This breaks the 1:1
+          cpu->node mapping. To avoid this fill in the mapping for all
+          possible CPUs, as the number of CPUs is not known yet.
+          We round robin the existing nodes. */
+       rr = first_node(node_online_map);
+       for (i = 0; i < NR_CPUS; i++) {
+               if (cpu_to_node[i] != NUMA_NO_NODE)
+                       continue;
+               numa_set_node(i, rr);
+               rr = next_node(rr, node_online_map);
+               /* Past the last online node: start over from the first. */
+               if (rr == MAX_NUMNODES)
+                       rr = first_node(node_online_map);
+       }
+
+}
+
+#ifdef CONFIG_NUMA_EMU
+/* default to faking a single node as fallback for non-NUMA hardware */
+int numa_fake __initdata = 1;
+
+/*
+ * NUMA emulation: split the memory [start_pfn, end_pfn) into numa_fake
+ * fake nodes, build the memnodemap[] hash for them, register each node
+ * and spread the CPUs over them.  The caller must ensure numa_fake is
+ * non-zero (it is used as a divisor).
+ * Returns 0 on success, -1 if no hash shift could be found.
+ */
+static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+{
+       int i;
+       struct node nodes[MAX_NUMNODES];
+       /* Byte sizes and addresses are computed in 64 bits so that they
+          are not truncated where unsigned long is only 32 bits wide. */
+       u64 sz = ((u64)(end_pfn - start_pfn) << PAGE_SHIFT) / numa_fake;
+
+       /* Kludge needed for the hash function: when the per-node size is
+          not a power of two, replace it by a power of two below it. */
+       if (hweight64(sz) > 1) {
+               u64 x = 1;
+               while ((x << 1) < sz)
+                       x <<= 1;
+               if (x < sz/2)
+                       printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
+               sz = x;
+       }
+
+       memset(&nodes,0,sizeof(nodes));
+       for (i = 0; i < numa_fake; i++) {
+               nodes[i].start = ((u64)start_pfn << PAGE_SHIFT) + i*sz;
+               /* The last node takes whatever is left up to end_pfn. */
+               if (i == numa_fake-1)
+                       sz = ((u64)end_pfn << PAGE_SHIFT) - nodes[i].start;
+               nodes[i].end = nodes[i].start + sz;
+               printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
+                      i,
+                      nodes[i].start, nodes[i].end,
+                      (nodes[i].end - nodes[i].start) >> 20);
+               node_set_online(i);
+       }
+       memnode_shift = compute_hash_shift(nodes, numa_fake);
+       if (memnode_shift < 0) {
+               memnode_shift = 0;
+               printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
+               return -1;
+       }
+       for_each_online_node(i)
+               setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+       numa_init_array();
+       return 0;
+}
+#endif
+
+/*
+ * Set up the NUMA memory layout for the page range [start_pfn, end_pfn).
+ * Tried in order: the ACPI SRAT (unless numa_off is set), NUMA emulation
+ * (if numa_fake is non-zero), and finally a single dummy node 0 that
+ * covers all memory and to which every CPU is mapped.
+ */
+void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+       int i;
+       /* Widen before shifting so that byte addresses above 4GB are not
+          truncated where unsigned long is only 32 bits wide. */
+       u64 start = (u64)start_pfn << PAGE_SHIFT;
+       u64 end = (u64)end_pfn << PAGE_SHIFT;
+
+#ifdef CONFIG_ACPI_NUMA
+       if (!numa_off && !acpi_scan_nodes(start, end))
+               return;
+#endif
+
+#ifdef CONFIG_NUMA_EMU
+       /* fake a numa node for non-numa hardware */
+       if (numa_fake && !numa_emulation(start_pfn, end_pfn))
+               return;
+#endif
+
+       printk(KERN_INFO "%s\n",
+              numa_off ? "NUMA turned off" : "No NUMA configuration found");
+
+       printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
+              start, end);
+       /* setup dummy node covering all memory */
+       memnode_shift = 63;
+       memnodemap[0] = 0;
+       nodes_clear(node_online_map);
+       node_set_online(0);
+       for (i = 0; i < NR_CPUS; i++)
+               numa_set_node(i, 0);
+       node_to_cpumask[0] = cpumask_of_cpu(0);
+       setup_node_bootmem(0, start, end);
+}
+
+/* Set @cpu's bit in the CPU mask of the node that cpu_to_node(cpu) reports. */
+__cpuinit void numa_add_cpu(int cpu)
+{
+       set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+} 
+
+/* Record @node as the NUMA node of @cpu in cpu_to_node[]. */
+void __cpuinit numa_set_node(int cpu, int node)
+{
+       cpu_to_node[cpu] = node;
+}
+
+/*
+ * Parse the "numa=" boot parameter:
+ *   off     - set numa_off
+ *   fake=N  - emulate N nodes (CONFIG_NUMA_EMU), capped at MAX_NUMNODES
+ *   noacpi  - set acpi_numa to -1 so the SRAT is ignored (CONFIG_ACPI_NUMA)
+ * Always returns 1.
+ */
+__init int numa_setup(char *opt)
+{
+       if (!strncmp(opt,"off",3))
+               numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+       if(!strncmp(opt, "fake=", 5)) {
+               /* Clamp while the value is still unsigned long: narrowing
+                  to int first would let a huge value turn negative and
+                  slip past the limit. */
+               unsigned long fake = simple_strtoul(opt+5,NULL,0);
+               if (fake > MAX_NUMNODES)
+                       fake = MAX_NUMNODES;
+               numa_fake = fake;
+       }
+#endif
+#ifdef CONFIG_ACPI_NUMA
+       if (!strncmp(opt,"noacpi",6))
+               acpi_numa = -1;
+#endif
+       return 1;
+}
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ */
+void __init init_cpu_to_node(void)
+{
+       int i;
+       for (i = 0; i < NR_CPUS; i++) {
+               u8 apicid = x86_cpu_to_apicid[i];
+               /* No usable APIC ID for this CPU slot. */
+               if (apicid == BAD_APICID)
+                       continue;
+               /* No node known for this APIC ID: keep the existing entry. */
+               if (apicid_to_node[apicid] == NUMA_NO_NODE)
+                       continue;
+               numa_set_node(i,apicid_to_node[apicid]);
+       }
+}
+
+EXPORT_SYMBOL(cpu_to_node);
+EXPORT_SYMBOL(node_to_cpumask);
+EXPORT_SYMBOL(memnode_shift);
+EXPORT_SYMBOL(memnodemap);
+EXPORT_SYMBOL(node_data);
+
+/*
+ * Keyhandler: dump NUMA state to the console.  Prints the current time,
+ * then for each online node its id, start pfn and spanned page count
+ * plus a phys_to_nid() sanity check, and finally the node of each
+ * online CPU.
+ */
+static void dump_numa(unsigned char key)
+{
+       s_time_t now = NOW();
+       int i;
+
+       printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
+                 (u32)(now>>32), (u32)now);
+
+       for_each_online_node(i) {
+               /* Probe address: the page after the node's first page. */
+               unsigned long pa = (NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT;
+               printk("idx%d -> NODE%d start->%lu size->%lu\n",
+                         i, NODE_DATA(i)->node_id,
+                         NODE_DATA(i)->node_start_pfn,
+                         NODE_DATA(i)->node_spanned_pages);
+               /* sanity check phys_to_nid() */
+               printk("phys_to_nid(%lx) -> %d should be %d\n", pa, phys_to_nid(pa),
+                         NODE_DATA(i)->node_id);
+       }
+       for_each_online_cpu(i)
+               printk("CPU%d -> NODE%d\n", i, cpu_to_node[i]);
+}
+
+/* Initcall: bind the 'u' debug key to dump_numa().  Always returns 0. */
+static __init int register_numa_trigger(void)
+{
+       register_keyhandler('u', dump_numa, "dump numa info");
+       return 0;
+}
+__initcall(register_numa_trigger);
+
diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/srat.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/srat.c       Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,325 @@
+/*
+ * ACPI 3.0 based NUMA setup
+ * Copyright 2004 Andi Kleen, SuSE Labs.
+ *
+ * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+ *
+ * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+ * Assumes all memory regions belonging to a single proximity domain
+ * are in one chunk. Holes between them will be included in the node.
+ * 
+ * Adapted for Xen: Ryan Harper <ryanh@xxxxxxxxxx>
+ */
+
+#if 0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/proto.h>
+#include <xen/bitmap.h>
+#include <xen/numa.h>
+#include <xen/topology.h>
+#include <asm/e820.h>
+#endif
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/inttypes.h>
+#include <xen/nodemask.h>
+#include <xen/acpi.h>
+
+#include <asm/numa.h>
+#include <asm/page.h>
+
+/* SLIT accepted by acpi_numa_slit_init(); NULL if none was accepted. */
+static struct acpi_table_slit *acpi_slit;
+
+/* Nodes for which a memory range has been recorded in nodes[]. */
+static nodemask_t nodes_parsed __initdata;
+/* Node ids already handed out by setup_node(). */
+static nodemask_t nodes_found __initdata;
+/* Memory range of each node, built from the SRAT memory affinity entries. */
+static struct node nodes[MAX_NUMNODES] __initdata;
+/* Proximity domain -> node id; 0xff means no node assigned yet. */
+static u8 pxm2node[256] = { [0 ... 255] = 0xff };
+
+/* Too small nodes confuse the VM badly. Usually they result
+   from BIOS bugs. */
+#define NODE_MIN_SIZE (4*1024*1024)
+
+static int node_to_pxm(int n);
+
+/*
+ * Translate an ACPI proximity domain into a node number.
+ * Returns -1 when @pxm is outside 0..255 or when its pxm2node[] entry
+ * is still the 0xff "unassigned" marker.
+ */
+int pxm_to_node(int pxm)
+{
+       if ((unsigned)pxm >= 256)
+               return -1;
+       /* Extend 0xff to (int)-1 */
+       return (signed char)pxm2node[pxm];
+}
+
+/*
+ * Return the node number for proximity domain @pxm, assigning the first
+ * unused node id the first time the domain is seen.
+ * Returns -1 when MAX_NUMNODES node ids have already been handed out.
+ * @pxm indexes pxm2node[] directly; the callers in this file pass the
+ * proximity_domain field of a SRAT entry.
+ */
+static __init int setup_node(int pxm)
+{
+       unsigned node = pxm2node[pxm];
+       if (node == 0xff) {
+               if (nodes_weight(nodes_found) >= MAX_NUMNODES)
+                       return -1;
+               node = first_unset_node(nodes_found); 
+               node_set(node, nodes_found);
+               pxm2node[pxm] = node;
+       }
+       return pxm2node[pxm];
+}
+
+/*
+ * Look for an already-parsed node whose range intersects [start, end).
+ * Nodes with an empty range (start == end) are ignored.
+ * Returns the index of the first such node, or -1 if there is none.
+ */
+static __init int conflicting_nodes(u64 start, u64 end)
+{
+       int i;
+       for_each_node_mask(i, nodes_parsed) {
+               struct node *nd = &nodes[i];
+               if (nd->start == nd->end)
+                       continue;
+               if (nd->end > start && nd->start < end)
+                       return i;
+               if (nd->end == end && nd->start == start)
+                       return i;
+       }
+       return -1;
+}
+
+/*
+ * Clamp the range of node @i to [start, end].  A node lying entirely
+ * outside that window collapses to an empty range (start == end).
+ */
+static __init void cutoff_node(int i, u64 start, u64 end)
+{
+       struct node *nd = &nodes[i];
+       if (nd->start < start) {
+               nd->start = start;
+               if (nd->end < nd->start)
+                       nd->start = nd->end;
+       }
+       if (nd->end > end) {
+               nd->end = end;
+               if (nd->start > nd->end)
+                       nd->start = nd->end;
+       }
+}
+
+/*
+ * Give up on the SRAT: log it, set acpi_numa to -1 and reset every
+ * apicid_to_node[] entry to NUMA_NO_NODE.
+ */
+static __init void bad_srat(void)
+{
+       int i;
+       printk(KERN_ERR "SRAT: SRAT not used.\n");
+       acpi_numa = -1;
+       for (i = 0; i < MAX_LOCAL_APIC; i++)
+               apicid_to_node[i] = NUMA_NO_NODE;
+}
+
+/* Non-zero when SRAT entries must be ignored: numa_off is set or
+   acpi_numa is negative. */
+static __init inline int srat_disabled(void)
+{
+       return numa_off || acpi_numa < 0;
+}
+
+/*
+ * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
+ * up the NUMA heuristics which wants the local node to have a smaller
+ * distance than the others.
+ * Do some quick checks here and only use the SLIT if it passes.
+ *
+ * Returns 1 when every diagonal entry equals 10 and every off-diagonal
+ * entry is greater than 10, otherwise 0.
+ */
+static __init int slit_valid(struct acpi_table_slit *slit)
+{
+       int i, j;
+       int d = slit->localities;
+       for (i = 0; i < d; i++) {
+               for (j = 0; j < d; j++)  {
+                       u8 val = slit->entry[d*i + j];
+                       if (i == j) {
+                               if (val != 10)
+                                       return 0;
+                       } else if (val <= 10)
+                               return 0;
+               }
+       }
+       return 1;
+}
+
+/* Callback for SLIT parsing: remember the table in acpi_slit, but only
+   if it passes slit_valid(). */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+       if (!slit_valid(slit)) {
+               printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
+               return;
+       }
+       acpi_slit = slit;
+}
+
+/*
+ * Callback for Proximity Domain -> LAPIC mapping.
+ * For an enabled SRAT processor affinity entry, assign (or look up) the
+ * node of its proximity domain, record it in apicid_to_node[] and set
+ * acpi_numa to 1.  A wrongly sized entry, or running out of node ids,
+ * makes the whole SRAT unusable (bad_srat()).
+ */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
+{
+       int pxm, node;
+       if (srat_disabled())
+               return;
+       if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
+               bad_srat();
+               return;
+       }
+       if (pa->flags.enabled == 0)
+               return;
+       pxm = pa->proximity_domain;
+       node = setup_node(pxm);
+       if (node < 0) {
+               printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+               bad_srat();
+               return;
+       }
+       apicid_to_node[pa->apic_id] = node;
+       acpi_numa = 1;
+       printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+              pxm, pa->apic_id, node);
+}
+
+/*
+ * Callback for parsing of the Proximity Domain <-> Memory Area mappings.
+ * For an enabled SRAT memory affinity entry, assign (or look up) the node
+ * of its proximity domain and merge the entry's address range into that
+ * node's range in nodes[].  An overlap with the same node only produces
+ * a warning; an overlap with a different node, a wrongly sized entry, or
+ * running out of node ids makes the whole SRAT unusable (bad_srat()).
+ */
+void __init
+acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
+{
+       struct node *nd;
+       u64 start, end;
+       int node, pxm;
+       int i;
+
+       if (srat_disabled())
+               return;
+       if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) {
+               bad_srat();
+               return;
+       }
+       if (ma->flags.enabled == 0)
+               return;
+       /* Base and length arrive as separate 32-bit halves. */
+       start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
+       end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
+       pxm = ma->proximity_domain;
+       node = setup_node(pxm);
+       if (node < 0) {
+               printk(KERN_ERR "SRAT: Too many proximity domains.\n");
+               bad_srat();
+               return;
+       }
+       /* It is fine to add this area to the nodes data it will be used later*/
+       if (ma->flags.hot_pluggable == 1)
+               printk(KERN_INFO "SRAT: hot plug zone found %"PRIx64" - %"PRIx64" \n",
+                               start, end);
+       i = conflicting_nodes(start, end);
+       if (i == node) {
+               printk(KERN_WARNING
+               "SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with itself (%"
+               PRIx64"-%"PRIx64")\n", pxm, start, end, nodes[i].start, nodes[i].end);
+       } else if (i >= 0) {
+               printk(KERN_ERR
+                      "SRAT: PXM %d (%"PRIx64"-%"PRIx64") overlaps with PXM %d (%"
+                      PRIx64"-%"PRIx64")\n", pxm, start, end, node_to_pxm(i),
+                          nodes[i].start, nodes[i].end);
+               bad_srat();
+               return;
+       }
+       nd = &nodes[node];
+       if (!node_test_and_set(node, nodes_parsed)) {
+               /* First range seen for this node. */
+               nd->start = start;
+               nd->end = end;
+       } else {
+               /* Grow the node's range to span the new area as well. */
+               if (start < nd->start)
+                       nd->start = start;
+               if (nd->end < end)
+                       nd->end = end;
+       }
+       printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"\n", node, pxm,
+              nd->start, nd->end);
+}
+
+/* Sanity check to catch more bad SRATs (they are amazingly common).
+   Make sure the PXMs cover all memory.
+   Returns 1 if the pages spanned by the parsed nodes come close enough
+   to max_page, 0 otherwise.
+   Marked __init because it reads the __initdata tables nodes_parsed and
+   nodes[] and is only called from acpi_scan_nodes(). */
+static __init int nodes_cover_memory(void)
+{
+       int i;
+       u64 pxmram, e820ram;
+
+       pxmram = 0;
+       for_each_node_mask(i, nodes_parsed) {
+               u64 s = nodes[i].start >> PAGE_SHIFT;
+               u64 e = nodes[i].end >> PAGE_SHIFT;
+               pxmram += e - s;
+       }
+
+       e820ram = max_page;
+       /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
+       /* NOTE(review): both operands are page counts, so the slack below
+          is 1M pages rather than 1MB - confirm this is intended. */
+       if ((long)(e820ram - pxmram) >= 1*1024*1024) {
+               printk(KERN_ERR "SRAT: PXMs only cover %"PRIu64"MB of your %"
+                       PRIu64"MB e820 RAM. Not used.\n",
+                       (pxmram << PAGE_SHIFT) >> 20,
+                       (e820ram << PAGE_SHIFT) >> 20);
+               return 0;
+       }
+       return 1;
+}
+
+/*
+ * Forget a node: clear it from nodes_parsed and reset every
+ * apicid_to_node[] entry that points at it to NUMA_NO_NODE.
+ * Marked __init because it modifies the __initdata mask nodes_parsed
+ * and is only called from acpi_scan_nodes().
+ */
+static __init void unparse_node(int node)
+{
+       int i;
+       node_clear(node, nodes_parsed);
+       for (i = 0; i < MAX_LOCAL_APIC; i++) {
+               if (apicid_to_node[i] == node)
+                       apicid_to_node[i] = NUMA_NO_NODE;
+       }
+}
+
+/* Architecture hook with nothing to do here (presumably called by the
+   generic ACPI NUMA code - confirm against drivers/acpi/numa.c). */
+void __init acpi_numa_arch_fixup(void) {}
+
+/*
+ * Use the information discovered above to actually set up the nodes.
+ * Node ranges are first clamped to [start, end] and nodes smaller than
+ * NODE_MIN_SIZE are dropped.  Then, provided SRAT data was accepted and
+ * covers memory, the memnodemap[] hash is built, every parsed node is
+ * registered, and CPUs are (re)distributed over the surviving nodes.
+ * Returns 0 on success, -1 if the SRAT is absent or unusable.
+ */
+int __init acpi_scan_nodes(u64 start, u64 end)
+{
+       int i;
+
+       /* First clean up the node list */
+       for (i = 0; i < MAX_NUMNODES; i++) {
+               cutoff_node(i, start, end);
+               if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
+                       unparse_node(i);
+       }
+
+       if (acpi_numa <= 0)
+               return -1;
+
+       if (!nodes_cover_memory()) {
+               bad_srat();
+               return -1;
+       }
+
+       memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
+       if (memnode_shift < 0) {
+               printk(KERN_ERR
+                    "SRAT: No NUMA node hash function found. Contact maintainer\n");
+               bad_srat();
+               return -1;
+       }
+
+       /* Finally register nodes */
+       for_each_node_mask(i, nodes_parsed)
+               setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+       /* Unmap CPUs whose node did not survive the cleanup above, so
+          that numa_init_array() hands them a valid node. */
+       for (i = 0; i < NR_CPUS; i++) {
+               if (cpu_to_node[i] == NUMA_NO_NODE)
+                       continue;
+               if (!node_isset(cpu_to_node[i], nodes_parsed))
+                       numa_set_node(i, NUMA_NO_NODE);
+       }
+       numa_init_array();
+       return 0;
+}
+
+/*
+ * Reverse lookup of pxm2node[]: return a proximity domain that maps to
+ * node @n.  Returns 0 when no entry matches.
+ */
+static int node_to_pxm(int n)
+{
+       int i;
+       /* Fast path: the domain number equals the node number. */
+       if (pxm2node[n] == n)
+               return n;
+       for (i = 0; i < 256; i++)
+               if (pxm2node[i] == n)
+                       return i;
+       return 0;
+}
+
+/*
+ * Distance between nodes @a and @b.  Without an accepted SLIT this is
+ * 10 for a == b and 20 otherwise; with one, it is the SLIT entry selected
+ * by the two nodes' proximity domains.
+ */
+int __node_distance(int a, int b)
+{
+       int index;
+
+       if (!acpi_slit)
+               return a == b ? 10 : 20;
+       /* The SLIT is a localities x localities matrix stored row by row. */
+       index = acpi_slit->localities * node_to_pxm(a);
+       return acpi_slit->entry[index + node_to_pxm(b)];
+}
+
+EXPORT_SYMBOL(__node_distance);
diff -r a1f987e9640f -r f312c2d01d8b xen/drivers/acpi/numa.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/drivers/acpi/numa.c   Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,216 @@
+/*
+ *  acpi_numa.c - ACPI NUMA support
+ *
+ *  Copyright (C) 2002 Takayoshi Kochi <t-kochi@xxxxxxxxxxxxx>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ */
+#if 0
+#include <linux/module.h>
+#include <linux/kernel.h>
+#endif
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/errno.h>
+#include <xen/acpi.h>
+#include <xen/numa.h>
+#include <acpi/acpi_bus.h>
+#include <acpi/acmacros.h>
+#include <asm/page.h> /* __va() */
+
+#define ACPI_NUMA      0x80000000
+#define _COMPONENT     ACPI_NUMA
+ACPI_MODULE_NAME("numa")
+
+extern int __init acpi_table_parse_madt_family(enum acpi_table_id id,
+                                              unsigned long madt_size,
+                                              int entry_id,
+                                              acpi_madt_entry_handler handler,
+                                              unsigned int max_entries);
+
+/*
+ * Report one SRAT sub-table entry.
+ *
+ * Processor and memory affinity entries are dumped through
+ * ACPI_DEBUG_PRINT, and only when ACPI_DEBUG_OUTPUT is defined; any other
+ * entry type produces a warning.  A NULL header is silently ignored.
+ */
+void __init acpi_table_print_srat_entry(acpi_table_entry_header * header)
+{
+
+       ACPI_FUNCTION_NAME("acpi_table_print_srat_entry");
+
+       if (!header)
+               return;
+
+       switch (header->type) {
+
+       case ACPI_SRAT_PROCESSOR_AFFINITY:
+#ifdef ACPI_DEBUG_OUTPUT
+               {
+                       struct acpi_table_processor_affinity *p =
+                           (struct acpi_table_processor_affinity *)header;
+                       ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+                                         "SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n",
+                                         p->apic_id, p->lsapic_eid,
+                                         p->proximity_domain,
+                                         p->flags.enabled ?
+                                         "enabled" : "disabled"));
+               }
+#endif                         /* ACPI_DEBUG_OUTPUT */
+               break;
+
+       case ACPI_SRAT_MEMORY_AFFINITY:
+#ifdef ACPI_DEBUG_OUTPUT
+               {
+                       struct acpi_table_memory_affinity *p =
+                           (struct acpi_table_memory_affinity *)header;
+                       ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+                                         "SRAT Memory (0x%08x%08x length 0x%08x%08x type 0x%x) in proximity domain %d %s%s\n",
+                                         p->base_addr_hi, p->base_addr_lo,
+                                         p->length_hi, p->length_lo,
+                                         p->memory_type, p->proximity_domain,
+                                         p->flags.enabled ?
+                                         "enabled" : "disabled",
+                                         p->flags.hot_pluggable ?
+                                         " hot-pluggable" : ""));
+               }
+#endif                         /* ACPI_DEBUG_OUTPUT */
+               break;
+
+       default:
+               printk(KERN_WARNING PREFIX
+                      "Found unsupported SRAT entry (type = 0x%x)\n",
+                      header->type);
+               break;
+       }
+}
+
+/*
+ * acpi_table_parse() callback for the SLIT.  Maps the table through __va()
+ * and hands it to acpi_numa_slit_init(); returns -EINVAL when no address
+ * or size was supplied, 0 otherwise.
+ */
+static int __init acpi_parse_slit(unsigned long phys_addr, unsigned long size)
+{
+       struct acpi_table_slit *slit;
+
+       if (!phys_addr || !size)
+               return -EINVAL;
+
+       slit = (struct acpi_table_slit *)__va(phys_addr);
+
+       acpi_numa_slit_init(slit);
+
+       return 0;
+}
+
+/*
+ * Per-entry handler for SRAT processor affinity records: print the entry,
+ * then pass it to acpi_numa_processor_affinity_init().  Returns -EINVAL
+ * for a NULL entry and 0 otherwise; end is not used.
+ */
+static int __init
+acpi_parse_processor_affinity(acpi_table_entry_header * header,
+                             const unsigned long end)
+{
+       struct acpi_table_processor_affinity *pa =
+           (struct acpi_table_processor_affinity *)header;
+
+       if (!pa)
+               return -EINVAL;
+
+       acpi_table_print_srat_entry(header);
+
+       /* the architecture-dependent part does the real work */
+       acpi_numa_processor_affinity_init(pa);
+
+       return 0;
+}
+
+/*
+ * Per-entry handler for SRAT memory affinity records: print the entry,
+ * then pass it to acpi_numa_memory_affinity_init().  Returns -EINVAL for
+ * a NULL entry and 0 otherwise; end is not used.
+ */
+static int __init
+acpi_parse_memory_affinity(acpi_table_entry_header * header,
+                          const unsigned long end)
+{
+       struct acpi_table_memory_affinity *ma =
+           (struct acpi_table_memory_affinity *)header;
+
+       if (!ma)
+               return -EINVAL;
+
+       acpi_table_print_srat_entry(header);
+
+       /* the architecture-dependent part does the real work */
+       acpi_numa_memory_affinity_init(ma);
+
+       return 0;
+}
+
+/*
+ * acpi_table_parse() callback for the SRAT itself.  It only checks that a
+ * table address and size were supplied (-EINVAL if not); the individual
+ * entries are walked separately through acpi_table_parse_srat().
+ */
+static int __init acpi_parse_srat(unsigned long phys_addr, unsigned long size)
+{
+       if (!phys_addr || !size)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Walk SRAT sub-table entries of type id with handler by delegating to
+ * acpi_table_parse_madt_family(), passing the SRAT table id and the size
+ * of the SRAT header.  max_entries is forwarded unchanged and the callee's
+ * result is returned as is.
+ */
+int __init
+acpi_table_parse_srat(enum acpi_srat_entry_id id,
+                     acpi_madt_entry_handler handler, unsigned int max_entries)
+{
+       const unsigned long srat_hdr_size = sizeof(struct acpi_table_srat);
+
+       return acpi_table_parse_madt_family(ACPI_SRAT, srat_hdr_size, id,
+                                           handler, max_entries);
+}
+
+/*
+ * Parse the ACPI NUMA tables.
+ *
+ * When acpi_table_parse() reports an SRAT, its processor and memory
+ * affinity entries are passed to the per-entry handlers.  The SLIT is
+ * parsed afterwards and acpi_numa_arch_fixup() runs last.  Parse results
+ * are not propagated: the function always returns 0.
+ */
+int __init acpi_numa_init(void)
+{
+       int result;
+
+       /* SRAT: Static Resource Affinity Table */
+       result = acpi_table_parse(ACPI_SRAT, acpi_parse_srat);
+
+       if (result > 0) {
+               result = acpi_table_parse_srat(ACPI_SRAT_PROCESSOR_AFFINITY,
+                                              acpi_parse_processor_affinity,
+                                              NR_CPUS);
+               /* IA64 specific */
+               result = acpi_table_parse_srat(ACPI_SRAT_MEMORY_AFFINITY,
+                                              acpi_parse_memory_affinity,
+                                              NR_NODE_MEMBLKS);
+       }
+
+       /* SLIT: System Locality Information Table */
+       result = acpi_table_parse(ACPI_SLIT, acpi_parse_slit);
+
+       acpi_numa_arch_fixup();
+       return 0;
+}
+
+#if 0
+/*
+ * Compiled out.  Starting at handle h, evaluate "_PXM" on the handle and
+ * then on each successive parent; return the first value obtained, or -1
+ * once acpi_get_parent() fails.
+ */
+int acpi_get_pxm(acpi_handle h)
+{
+       unsigned long pxm;
+       acpi_status status;
+       acpi_handle handle;
+       acpi_handle phandle = h;
+
+       do {
+               handle = phandle;
+               status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm);
+               if (ACPI_SUCCESS(status))
+                       return (int)pxm;
+               status = acpi_get_parent(handle, &phandle);
+       } while (ACPI_SUCCESS(status));
+       return -1;
+}
+
+EXPORT_SYMBOL(acpi_get_pxm);
+#endif
diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/numa.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/numa.h        Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,65 @@
+#ifndef _ASM_X8664_NUMA_H 
+#define _ASM_X8664_NUMA_H 1
+
+#include <xen/nodemask.h>
+#include <xen/topology.h>
+#include <asm/numnodes.h>
+#include <asm/smp.h>
+
+/*
+ * Extent of one node as a pair of u64 bounds.  Arrays of these are passed
+ * to compute_hash_shift(); presumably physical addresses -- confirm
+ * against the SRAT parsing code.
+ */
+struct node { 
+       u64 start,end; 
+};
+
+extern int compute_hash_shift(struct node *nodes, int numnodes);
+extern int pxm_to_node(int nid);
+
+#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
+#define VIRTUAL_BUG_ON(x) 
+#define NODEMAPSIZE 0xfff
+
+extern void numa_add_cpu(int cpu);
+extern void numa_init_array(void);
+extern int numa_off;
+
+extern void numa_set_node(int cpu, int node);
+
+extern void setup_node_bootmem(int nodeid, u64 start, u64 end);
+extern unsigned char apicid_to_node[256];
+#ifdef CONFIG_NUMA
+extern void __init init_cpu_to_node(void);
+
+/* Clear cpu's bit in the cpumask of the node that cpu currently maps to. */
+static inline void clear_node_cpumask(int cpu)
+{
+       unsigned int nid = cpu_to_node(cpu);
+
+       clear_bit(cpu, &node_to_cpumask[nid]);
+}
+
+/* Simple perfect hash to map physical addresses to node numbers */
+extern int memnode_shift; 
+extern u8  memnodemap[NODEMAPSIZE]; 
+
+extern struct node_data node_data[];
+
+/*
+ * Map a physical address to a node id by indexing memnodemap[] with
+ * addr >> memnode_shift.  VIRTUAL_BUG_ON() is defined empty in this
+ * header, so the two sanity checks compile to nothing.
+ */
+static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
+{
+       unsigned node;
+
+       VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
+       node = memnodemap[addr >> memnode_shift];
+       VIRTUAL_BUG_ON(node >= MAX_NUMNODES || !node_data[node]);
+
+       return node;
+}
+
+#define NODE_DATA(nid)         (&(node_data[nid]))
+
+#define node_start_pfn(nid)    (NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
+                                NODE_DATA(nid)->node_spanned_pages)
+
+
+#else
+#define init_cpu_to_node() do {} while (0)
+#define clear_node_cpumask(cpu) do {} while (0)
+#endif
+
+#define NUMA_NO_NODE 0xff
+
+#endif
diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/numnodes.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/numnodes.h    Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,26 @@
+#ifndef _ASM_MAX_NUMNODES_H
+#define _ASM_MAX_NUMNODES_H
+
+#include <xen/config.h>
+
+#if defined(__i386__)
+#ifdef CONFIG_X86_NUMAQ
+
+/* Max 16 Nodes */
+#define NODES_SHIFT    4
+
+#elif defined(CONFIG_ACPI_SRAT)
+
+/* Max 8 Nodes */
+#define NODES_SHIFT    3
+
+#endif /* CONFIG_X86_NUMAQ */
+
+
+#endif /* __i386__ */
+
+#if defined(CONFIG_NUMA) && defined(__x86_64__)
+#define NODES_SHIFT  6
+#endif /* __x86_64__ */
+
+#endif /* _ASM_MAX_NUMNODES_H */
diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/topology.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/topology.h    Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2006, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Ryan Harper <ryanh@xxxxxxxxxx>
+ */
+
+#ifndef _ASM_X86_TOPOLOGY_H
+#define _ASM_X86_TOPOLOGY_H
+
+#include <xen/config.h>
+#include <xen/bitops.h>
+
+extern cpumask_t cpu_online_map;
+
+extern unsigned int cpu_to_node[];
+extern cpumask_t     node_to_cpumask[];
+
+/* Lookups backed by the cpu_to_node[] and node_to_cpumask[] arrays. */
+#define cpu_to_node(cpu)               (cpu_to_node[cpu])
+#define parent_node(node)              (node)
+/*
+ * node_to_cpumask[] holds cpumask_t objects, which cannot be handed to
+ * __ffs() directly; go through the cpumask accessor instead.
+ * NOTE(review): relies on first_cpu() from <xen/cpumask.h> being visible
+ * wherever this macro is expanded -- confirm.
+ */
+#define node_to_first_cpu(node)  (first_cpu(node_to_cpumask[node]))
+#define node_to_cpumask(node)    (node_to_cpumask[node])
+
+#endif  /* _ASM_X86_TOPOLOGY_H */
diff -r a1f987e9640f -r f312c2d01d8b xen/include/xen/nodemask.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/xen/nodemask.h        Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,342 @@
+#ifndef __LINUX_NODEMASK_H
+#define __LINUX_NODEMASK_H
+
+/*
+ * Nodemasks provide a bitmap suitable for representing the
+ * set of Node's in a system, one bit position per Node number.
+ *
+ * See detailed comments in the file linux/bitmap.h describing the
+ * data type on which these nodemasks are based.
+ *
+ * For details of nodemask_scnprintf() and nodemask_parse(),
+ * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
+ *
+ * The available nodemask operations are:
+ *
+ * void node_set(node, mask)           turn on bit 'node' in mask
+ * void node_clear(node, mask)         turn off bit 'node' in mask
+ * void nodes_setall(mask)             set all bits
+ * void nodes_clear(mask)              clear all bits
+ * int node_isset(node, mask)          true iff bit 'node' set in mask
+ * int node_test_and_set(node, mask)   test and set bit 'node' in mask
+ *
+ * void nodes_and(dst, src1, src2)     dst = src1 & src2  [intersection]
+ * void nodes_or(dst, src1, src2)      dst = src1 | src2  [union]
+ * void nodes_xor(dst, src1, src2)     dst = src1 ^ src2
+ * void nodes_andnot(dst, src1, src2)  dst = src1 & ~src2
+ * void nodes_complement(dst, src)     dst = ~src
+ *
+ * int nodes_equal(mask1, mask2)       Does mask1 == mask2?
+ * int nodes_intersects(mask1, mask2)  Do mask1 and mask2 intersect?
+ * int nodes_subset(mask1, mask2)      Is mask1 a subset of mask2?
+ * int nodes_empty(mask)               Is mask empty (no bits sets)?
+ * int nodes_full(mask)                        Is mask full (all bits sets)?
+ * int nodes_weight(mask)              Hamming weight - number of set bits
+ *
+ * void nodes_shift_right(dst, src, n) Shift right
+ * void nodes_shift_left(dst, src, n)  Shift left
+ *
+ * int first_node(mask)                        Number lowest set bit, or MAX_NUMNODES
+ * int next_node(node, mask)           Next node past 'node', or MAX_NUMNODES
+ * int first_unset_node(mask)          First node not set in mask, or 
+ *                                     MAX_NUMNODES.
+ *
+ * nodemask_t nodemask_of_node(node)   Return nodemask with bit 'node' set
+ * NODE_MASK_ALL                       Initializer - all bits set
+ * NODE_MASK_NONE                      Initializer - no bits set
+ * unsigned long *nodes_addr(mask)     Array of unsigned long's in mask
+ *
+ * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
+ * int nodemask_parse(ubuf, ulen, mask)        Parse ascii string as nodemask
+ *
+ * for_each_node_mask(node, mask)      for-loop node over mask
+ *
+ * int num_online_nodes()              Number of online Nodes
+ * int num_possible_nodes()            Number of all possible Nodes
+ *
+ * int node_online(node)               Is some node online?
+ * int node_possible(node)             Is some node possible?
+ *
+ * int any_online_node(mask)           First online node in mask
+ *
+ * node_set_online(node)               set bit 'node' in node_online_map
+ * node_set_offline(node)              clear bit 'node' in node_online_map
+ *
+ * for_each_node(node)                 for-loop node over node_possible_map
+ * for_each_online_node(node)          for-loop node over node_online_map
+ *
+ * Subtlety:
+ * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway)
+ *    to generate slightly worse code.  So use a simple one-line #define
+ *    for node_isset(), instead of wrapping an inline inside a macro, the
+ *    way we do the other calls.
+ */
+
+#if 0
+#include <linux/threads.h>
+#include <asm/bug.h>
+#endif
+#include <xen/kernel.h>
+#include <xen/bitmap.h>
+#include <xen/numa.h>
+
+typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
+extern nodemask_t _unused_nodemask_arg_;
+
+#define node_set(node, dst) __node_set((node), &(dst))
+static inline void __node_set(int node, volatile nodemask_t *dstp)
+{
+       set_bit(node, dstp->bits);
+}
+
+#define node_clear(node, dst) __node_clear((node), &(dst))
+static inline void __node_clear(int node, volatile nodemask_t *dstp)
+{
+       clear_bit(node, dstp->bits);
+}
+
+#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
+static inline void __nodes_setall(nodemask_t *dstp, int nbits)
+{
+       bitmap_fill(dstp->bits, nbits);
+}
+
+#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
+static inline void __nodes_clear(nodemask_t *dstp, int nbits)
+{
+       bitmap_zero(dstp->bits, nbits);
+}
+
+/* No static inline type checking - see Subtlety (1) above. */
+#define node_isset(node, nodemask) test_bit((node), (nodemask).bits)
+
+#define node_test_and_set(node, nodemask) \
+                       __node_test_and_set((node), &(nodemask))
+static inline int __node_test_and_set(int node, nodemask_t *addr)
+{
+       return test_and_set_bit(node, addr->bits);
+}
+
+#define nodes_and(dst, src1, src2) \
+                       __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_or(dst, src1, src2) \
+                       __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_xor(dst, src1, src2) \
+                       __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_andnot(dst, src1, src2) \
+                       __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_complement(dst, src) \
+                       __nodes_complement(&(dst), &(src), MAX_NUMNODES)
+static inline void __nodes_complement(nodemask_t *dstp,
+                                       const nodemask_t *srcp, int nbits)
+{
+       bitmap_complement(dstp->bits, srcp->bits, nbits);
+}
+
+#define nodes_equal(src1, src2) \
+                       __nodes_equal(&(src1), &(src2), MAX_NUMNODES)
+static inline int __nodes_equal(const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       return bitmap_equal(src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_intersects(src1, src2) \
+                       __nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
+static inline int __nodes_intersects(const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       return bitmap_intersects(src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_subset(src1, src2) \
+                       __nodes_subset(&(src1), &(src2), MAX_NUMNODES)
+static inline int __nodes_subset(const nodemask_t *src1p,
+                                       const nodemask_t *src2p, int nbits)
+{
+       return bitmap_subset(src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
+static inline int __nodes_empty(const nodemask_t *srcp, int nbits)
+{
+       return bitmap_empty(srcp->bits, nbits);
+}
+
+#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
+static inline int __nodes_full(const nodemask_t *srcp, int nbits)
+{
+       return bitmap_full(srcp->bits, nbits);
+}
+
+#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
+static inline int __nodes_weight(const nodemask_t *srcp, int nbits)
+{
+       return bitmap_weight(srcp->bits, nbits);
+}
+
+/* dst = src shifted right by n bits, computed over MAX_NUMNODES bits. */
+#define nodes_shift_right(dst, src, n) \
+                       __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
+static inline void __nodes_shift_right(nodemask_t *dstp,
+                                       const nodemask_t *srcp, int n,
+                                       int nbits)
+{
+       bitmap_shift_right(dstp->bits, srcp->bits, n, nbits);
+}
+
+/* dst = src shifted left by n bits, computed over MAX_NUMNODES bits. */
+#define nodes_shift_left(dst, src, n) \
+                       __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES)
+static inline void __nodes_shift_left(nodemask_t *dstp,
+                                       const nodemask_t *srcp, int n,
+                                       int nbits)
+{
+       bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
+}
+
+/* FIXME: better would be to fix all architectures to never return
+          > MAX_NUMNODES, then the silly min_ts could be dropped. */
+
+/*
+ * Number of the lowest set bit in src.  The find_first_bit() result is
+ * clamped with min_t() so the value never exceeds MAX_NUMNODES.
+ */
+#define first_node(src) __first_node(&(src))
+static inline int __first_node(const nodemask_t *srcp)
+{
+       return min_t(int, MAX_NUMNODES,
+                    find_first_bit(srcp->bits, MAX_NUMNODES));
+}
+
+/*
+ * Number of the next set bit in src after position n.  The search starts
+ * at n+1 and the result is clamped with min_t() to at most MAX_NUMNODES.
+ */
+#define next_node(n, src) __next_node((n), &(src))
+static inline int __next_node(int n, const nodemask_t *srcp)
+{
+       return min_t(int, MAX_NUMNODES,
+                    find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
+}
+
+#define nodemask_of_node(node)                                         \
+({                                                                     \
+       typeof(_unused_nodemask_arg_) m;                                \
+       if (sizeof(m) == sizeof(unsigned long)) {                       \
+               m.bits[0] = 1UL<<(node);                                \
+       } else {                                                        \
+               nodes_clear(m);                                         \
+               node_set((node), m);                                    \
+       }                                                               \
+       m;                                                              \
+})
+
+#define first_unset_node(mask) __first_unset_node(&(mask))
+static inline int __first_unset_node(const nodemask_t *maskp)
+{
+       return min_t(int,MAX_NUMNODES,
+                       find_first_zero_bit(maskp->bits, MAX_NUMNODES));
+}
+
+#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)
+
+#if MAX_NUMNODES <= BITS_PER_LONG
+
+#define NODE_MASK_ALL                                                  \
+((nodemask_t) { {                                                      \
+       [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD           \
+} })
+
+#else
+
+#define NODE_MASK_ALL                                                  \
+((nodemask_t) { {                                                      \
+       [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL,                   \
+       [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD           \
+} })
+
+#endif
+
+#define NODE_MASK_NONE                                                 \
+((nodemask_t) { {                                                      \
+       [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] =  0UL                    \
+} })
+
+#define nodes_addr(src) ((src).bits)
+
+#if 0
+#define nodemask_scnprintf(buf, len, src) \
+                       __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES)
+static inline int __nodemask_scnprintf(char *buf, int len,
+                                       const nodemask_t *srcp, int nbits)
+{
+       return bitmap_scnprintf(buf, len, srcp->bits, nbits);
+}
+
+#define nodemask_parse(ubuf, ulen, dst) \
+                       __nodemask_parse((ubuf), (ulen), &(dst), MAX_NUMNODES)
+static inline int __nodemask_parse(const char __user *buf, int len,
+                                       nodemask_t *dstp, int nbits)
+{
+       return bitmap_parse(buf, len, dstp->bits, nbits);
+}
+#endif
+
+#if MAX_NUMNODES > 1
+#define for_each_node_mask(node, mask)                 \
+       for ((node) = first_node(mask);                 \
+               (node) < MAX_NUMNODES;                  \
+               (node) = next_node((node), (mask)))
+#else /* MAX_NUMNODES == 1 */
+#define for_each_node_mask(node, mask)                 \
+       if (!nodes_empty(mask))                         \
+               for ((node) = 0; (node) < 1; (node)++)
+#endif /* MAX_NUMNODES */
+
+/*
+ * The following particular system nodemasks and operations
+ * on them manage all possible and online nodes.
+ */
+
+extern nodemask_t node_online_map;
+extern nodemask_t node_possible_map;
+
+#if MAX_NUMNODES > 1
+#define num_online_nodes()     nodes_weight(node_online_map)
+#define num_possible_nodes()   nodes_weight(node_possible_map)
+#define node_online(node)      node_isset((node), node_online_map)
+#define node_possible(node)    node_isset((node), node_possible_map)
+#else
+#define num_online_nodes()     1
+#define num_possible_nodes()   1
+#define node_online(node)      ((node) == 0)
+#define node_possible(node)    ((node) == 0)
+#endif
+
+/*
+ * First online node in mask.  node starts out as MAX_NUMNODES so the
+ * result is defined even when for_each_node_mask() never assigns it: the
+ * MAX_NUMNODES == 1 variant skips its loop entirely for an empty mask.
+ * MAX_NUMNODES is also what the multi-node loop leaves behind when no
+ * node in mask is online.
+ */
+#define any_online_node(mask)                  \
+({                                             \
+       int node = MAX_NUMNODES;                \
+       for_each_node_mask(node, (mask))        \
+               if (node_online(node))          \
+                       break;                  \
+       node;                                   \
+})
+
+#define node_set_online(node)     set_bit((node), node_online_map.bits)
+#define node_set_offline(node)    clear_bit((node), node_online_map.bits)
+
+#define for_each_node(node)       for_each_node_mask((node), node_possible_map)
+#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
+
+#endif /* __LINUX_NODEMASK_H */
diff -r a1f987e9640f -r f312c2d01d8b xen/include/xen/numa.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/xen/numa.h    Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,35 @@
+#ifndef _XEN_NUMA_H
+#define _XEN_NUMA_H
+
+#include <xen/config.h>
+
+#ifdef CONFIG_DISCONTIGMEM
+#include <asm/numnodes.h>
+#endif
+
+#ifndef NODES_SHIFT
+#define NODES_SHIFT     0
+#endif
+
+#define MAX_NUMNODES    (1 << NODES_SHIFT)
+#define NUMA_NO_NODE    0xff
+
+#define MAX_PXM_DOMAINS    256   /* 1 byte and no promises about values */
+#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
+#define MAX_CHUNKS_PER_NODE   4
+#define MAXCHUNKS    (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
+
+/* needed for drivers/acpi/numa.c */
+#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+
+extern unsigned int cpu_to_node[];
+#include <xen/cpumask.h>
+extern cpumask_t node_to_cpumask[];
+
+/*
+ * Per-node descriptor.  node_start_pfn and node_spanned_pages are the
+ * fields read by the node_start_pfn()/node_end_pfn() macros in
+ * asm-x86/numa.h, where the end pfn is computed as start + spanned pages.
+ */
+typedef struct node_data {
+    unsigned long node_start_pfn;
+    unsigned long node_spanned_pages;
+    unsigned int  node_id;
+} node_data_t;
+
+#endif /* _XEN_NUMA_H */
diff -r a1f987e9640f -r f312c2d01d8b xen/include/xen/topology.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/xen/topology.h        Wed Oct 25 12:25:54 2006 +0100
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2006, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _XEN_TOPOLOGY_H
+#define _XEN_TOPOLOGY_H
+
+#include <asm/topology.h>
+
+#endif /* _XEN_TOPOLOGY_H */

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.