[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] [xen-unstable] [XEN] Add basic NUMA/SRAT support to Xen from Linux 2.6.16.29.
# HG changeset patch # User kfraser@xxxxxxxxxxxxxxxxxxxxx # Node ID f312c2d01d8b9a3c237543b65157da83696cbff5 # Parent a1f987e9640f3824b15158be1ba0d426503e282f [XEN] Add basic NUMA/SRAT support to Xen from Linux 2.6.16.29. Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx> --- xen/arch/x86/Makefile | 2 xen/arch/x86/numa.c | 302 +++++++++++++++++++++++ xen/arch/x86/setup.c | 34 ++ xen/arch/x86/smpboot.c | 4 xen/arch/x86/srat.c | 325 +++++++++++++++++++++++++ xen/drivers/acpi/Makefile | 1 xen/drivers/acpi/numa.c | 216 +++++++++++++++++ xen/include/asm-x86/acpi.h | 3 xen/include/asm-x86/config.h | 5 xen/include/asm-x86/mach-generic/mach_apic.h | 6 xen/include/asm-x86/numa.h | 65 +++++ xen/include/asm-x86/numnodes.h | 26 ++ xen/include/asm-x86/topology.h | 40 +++ xen/include/xen/config.h | 2 xen/include/xen/nodemask.h | 342 +++++++++++++++++++++++++++ xen/include/xen/numa.h | 35 ++ xen/include/xen/topology.h | 27 ++ 17 files changed, 1428 insertions(+), 7 deletions(-) diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/Makefile --- a/xen/arch/x86/Makefile Wed Oct 25 11:51:23 2006 +0100 +++ b/xen/arch/x86/Makefile Wed Oct 25 12:25:54 2006 +0100 @@ -28,12 +28,14 @@ obj-y += mm.o obj-y += mm.o obj-y += mpparse.o obj-y += nmi.o +obj-y += numa.o obj-y += physdev.o obj-y += rwlock.o obj-y += setup.o obj-y += shutdown.o obj-y += smp.o obj-y += smpboot.o +obj-y += srat.o obj-y += string.o obj-y += sysctl.o obj-y += time.o diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/setup.c --- a/xen/arch/x86/setup.c Wed Oct 25 11:51:23 2006 +0100 +++ b/xen/arch/x86/setup.c Wed Oct 25 12:25:54 2006 +0100 @@ -16,6 +16,7 @@ #include <xen/percpu.h> #include <xen/hypercall.h> #include <xen/keyhandler.h> +#include <xen/numa.h> #include <public/version.h> #include <asm/bitops.h> #include <asm/smp.h> @@ -25,10 +26,12 @@ #include <asm/desc.h> #include <asm/shadow.h> #include <asm/e820.h> +#include <asm/numa.h> #include <acm/acm_hooks.h> extern void dmi_scan_machine(void); extern void 
generic_apic_probe(void); +extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn); /* * opt_xenheap_megabytes: Size of Xen heap in megabytes, excluding the @@ -59,6 +62,9 @@ boolean_param("watchdog", opt_watchdog); /* "acpi=noirq": Disables ACPI interrupt routing. */ static void parse_acpi_param(char *s); custom_param("acpi", parse_acpi_param); + +extern int numa_setup(char *s); +custom_param("numa", numa_setup); /* **** Linux config option: propagated to domain0. */ /* acpi_skip_timer_override: Skip IRQ0 overrides. */ @@ -255,6 +261,20 @@ static void __init init_idle_domain(void idle_vcpu[0] = this_cpu(curr_vcpu) = current; setup_idle_pagetable(); +} + +static void srat_detect_node(int cpu) +{ + unsigned node; + u8 apicid = x86_cpu_to_apicid[cpu]; + + node = apicid_to_node[apicid]; + if (node == NUMA_NO_NODE) + node = 0; + numa_set_node(cpu, node); + + if (acpi_numa > 0) + printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node); } void __init __start_xen(multiboot_info_t *mbi) @@ -485,6 +505,12 @@ void __init __start_xen(multiboot_info_t init_frametable(); + acpi_boot_table_init(); + + acpi_numa_init(); + + numa_initmem_init(0, max_page); + end_boot_allocator(); /* Initialise the Xen heap, skipping RAM holes. 
*/ @@ -536,8 +562,9 @@ void __init __start_xen(multiboot_info_t generic_apic_probe(); - acpi_boot_table_init(); acpi_boot_init(); + + init_cpu_to_node(); if ( smp_found_config ) get_smp_config(); @@ -589,6 +616,11 @@ void __init __start_xen(multiboot_info_t break; if ( !cpu_online(i) ) __cpu_up(i); + + /* setup cpu_to_node[] */ + srat_detect_node(i); + /* setup node_to_cpumask based on cpu_to_node[] */ + numa_add_cpu(i); } printk("Brought up %ld CPUs\n", (long)num_online_cpus()); diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/smpboot.c --- a/xen/arch/x86/smpboot.c Wed Oct 25 11:51:23 2006 +0100 +++ b/xen/arch/x86/smpboot.c Wed Oct 25 12:25:54 2006 +0100 @@ -43,6 +43,8 @@ #include <xen/delay.h> #include <xen/softirq.h> #include <xen/serial.h> +#include <xen/numa.h> +#include <asm/numa.h> #include <asm/current.h> #include <asm/mc146818rtc.h> #include <asm/desc.h> @@ -628,7 +630,7 @@ static void map_cpu_to_logical_apicid(vo static void map_cpu_to_logical_apicid(void) { int cpu = smp_processor_id(); - int apicid = logical_smp_processor_id(); + int apicid = hard_smp_processor_id(); cpu_2_logical_apicid[cpu] = apicid; map_cpu_to_node(cpu, apicid_to_node(apicid)); diff -r a1f987e9640f -r f312c2d01d8b xen/drivers/acpi/Makefile --- a/xen/drivers/acpi/Makefile Wed Oct 25 11:51:23 2006 +0100 +++ b/xen/drivers/acpi/Makefile Wed Oct 25 12:25:54 2006 +0100 @@ -1,1 +1,2 @@ obj-y += tables.o obj-y += tables.o +obj-y += numa.o diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/acpi.h --- a/xen/include/asm-x86/acpi.h Wed Oct 25 11:51:23 2006 +0100 +++ b/xen/include/asm-x86/acpi.h Wed Oct 25 12:25:54 2006 +0100 @@ -157,6 +157,8 @@ static inline void check_acpi_pci(void) static inline void acpi_noirq_set(void) { acpi_noirq = 1; } static inline int acpi_irq_balance_set(char *str) { return 0; } +extern int acpi_scan_nodes(u64 start, u64 end); +extern int acpi_numa; #ifdef CONFIG_ACPI_SLEEP @@ -173,5 +175,6 @@ extern void acpi_reserve_bootmem(void); #endif /*CONFIG_ACPI_SLEEP*/ 
extern u8 x86_acpiid_to_apicid[]; +#define MAX_LOCAL_APIC 256 #endif /*_ASM_ACPI_H*/ diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/config.h --- a/xen/include/asm-x86/config.h Wed Oct 25 11:51:23 2006 +0100 +++ b/xen/include/asm-x86/config.h Wed Oct 25 12:25:54 2006 +0100 @@ -24,6 +24,11 @@ #define CONFIG_X86_IO_APIC 1 #define CONFIG_HPET_TIMER 1 #define CONFIG_X86_MCE_P4THERMAL 1 +#define CONFIG_ACPI_NUMA 1 +#define CONFIG_NUMA 1 +#define CONFIG_ACPI_SRAT 1 +#define CONFIG_DISCONTIGMEM 1 +#define CONFIG_NUMA_EMU 1 /* Intel P4 currently has largest cache line (L2 line size is 128 bytes). */ #define CONFIG_X86_L1_CACHE_SHIFT 7 diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/mach-generic/mach_apic.h --- a/xen/include/asm-x86/mach-generic/mach_apic.h Wed Oct 25 11:51:23 2006 +0100 +++ b/xen/include/asm-x86/mach-generic/mach_apic.h Wed Oct 25 12:25:54 2006 +0100 @@ -22,11 +22,7 @@ static inline void enable_apic_mode(void return; } -/* No sane NUMA support right now. We should parse ACPI SRAT. */ -static inline int apicid_to_node(int logical_apicid) -{ - return 0; -} +#define apicid_to_node(apicid) ((int)apicid_to_node[(u8)apicid]) extern u8 bios_cpu_apicid[]; static inline int cpu_present_to_apicid(int mps_cpu) diff -r a1f987e9640f -r f312c2d01d8b xen/include/xen/config.h --- a/xen/include/xen/config.h Wed Oct 25 11:51:23 2006 +0100 +++ b/xen/include/xen/config.h Wed Oct 25 12:25:54 2006 +0100 @@ -50,5 +50,7 @@ #endif /* !__ASSEMBLY__ */ #define fastcall +#define __cpuinitdata +#define __cpuinit #endif /* __XEN_CONFIG_H__ */ diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/numa.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/numa.c Wed Oct 25 12:25:54 2006 +0100 @@ -0,0 +1,302 @@ +/* + * Generic VM initialization for x86-64 NUMA setups. + * Copyright 2002,2003 Andi Kleen, SuSE Labs. 
+ * Adapted for Xen: Ryan Harper <ryanh@xxxxxxxxxx> + */ + +#include <xen/mm.h> +#include <xen/string.h> +#include <xen/init.h> +#include <xen/ctype.h> +#include <xen/nodemask.h> +#include <xen/numa.h> +#include <xen/keyhandler.h> +#include <xen/time.h> + +#include <asm/numa.h> +#include <asm/acpi.h> + +#ifndef Dprintk +#define Dprintk(x...) +#endif + +/* from proto.h */ +#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1))) + +struct node_data node_data[MAX_NUMNODES]; + +int memnode_shift; +u8 memnodemap[NODEMAPSIZE]; + +unsigned int cpu_to_node[NR_CPUS] __read_mostly = { + [0 ... NR_CPUS-1] = NUMA_NO_NODE +}; +unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; +cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; + +nodemask_t node_online_map = { { [0] = 1UL } }; + +int numa_off __initdata; + +int acpi_numa __initdata; + +/* + * Given a shift value, try to populate memnodemap[] + * Returns : + * 1 if OK + * 0 if memnodmap[] too small (of shift too small) + * -1 if node overlap or lost ram (shift too big) + */ +static int __init +populate_memnodemap(const struct node *nodes, int numnodes, int shift) +{ + int i; + int res = -1; + unsigned long addr, end; + + if (shift >= 64) + return -1; + memset(memnodemap, 0xff, sizeof(memnodemap)); + for (i = 0; i < numnodes; i++) { + addr = nodes[i].start; + end = nodes[i].end; + if (addr >= end) + continue; + if ((end >> shift) >= NODEMAPSIZE) + return 0; + do { + if (memnodemap[addr >> shift] != 0xff) + return -1; + memnodemap[addr >> shift] = i; + addr += (1UL << shift); + } while (addr < end); + res = 1; + } + return res; +} + +int __init compute_hash_shift(struct node *nodes, int numnodes) +{ + int shift = 20; + + while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) + shift++; + + printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", + shift); + + if (populate_memnodemap(nodes, numnodes, shift) != 1) { + printk(KERN_INFO + "Your memory is not aligned you 
need to rebuild your kernel " + "with a bigger NODEMAPSIZE shift=%d\n", + shift); + return -1; + } + return shift; +} + +/* initialize NODE_DATA given nodeid and start/end */ +void __init setup_node_bootmem(int nodeid, u64 start, u64 end) +{ + unsigned long start_pfn, end_pfn; + + start_pfn = start >> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + + NODE_DATA(nodeid)->node_id = nodeid; + NODE_DATA(nodeid)->node_start_pfn = start_pfn; + NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; + + node_set_online(nodeid); +} + +void __init numa_init_array(void) +{ + int rr, i; + /* There are unfortunately some poorly designed mainboards around + that only connect memory to a single CPU. This breaks the 1:1 cpu->node + mapping. To avoid this fill in the mapping for all possible + CPUs, as the number of CPUs is not known yet. + We round robin the existing nodes. */ + rr = first_node(node_online_map); + for (i = 0; i < NR_CPUS; i++) { + if (cpu_to_node[i] != NUMA_NO_NODE) + continue; + numa_set_node(i, rr); + rr = next_node(rr, node_online_map); + if (rr == MAX_NUMNODES) + rr = first_node(node_online_map); + } + +} + +#ifdef CONFIG_NUMA_EMU +/* default to faking a single node as fallback for non-NUMA hardware */ +int numa_fake __initdata = 1; + +/* Numa emulation */ +static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + struct node nodes[MAX_NUMNODES]; + unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; + + /* Kludge needed for the hash function */ + if (hweight64(sz) > 1) { + unsigned long x = 1; + while ((x << 1) < sz) + x <<= 1; + if (x < sz/2) + printk(KERN_ERR "Numa emulation unbalanced. 
Complain to maintainer\n"); + sz = x; + } + + memset(&nodes,0,sizeof(nodes)); + for (i = 0; i < numa_fake; i++) { + nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; + if (i == numa_fake-1) + sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; + nodes[i].end = nodes[i].start + sz; + printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n", + i, + nodes[i].start, nodes[i].end, + (nodes[i].end - nodes[i].start) >> 20); + node_set_online(i); + } + memnode_shift = compute_hash_shift(nodes, numa_fake); + if (memnode_shift < 0) { + memnode_shift = 0; + printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); + return -1; + } + for_each_online_node(i) + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + numa_init_array(); + return 0; +} +#endif + +void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + +#ifdef CONFIG_ACPI_NUMA + if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, + end_pfn << PAGE_SHIFT)) + return; +#endif + +#ifdef CONFIG_NUMA_EMU + /* fake a numa node for non-numa hardware */ + if (numa_fake && !numa_emulation(start_pfn, end_pfn)) + return; +#endif + + printk(KERN_INFO "%s\n", + numa_off ? 
"NUMA turned off" : "No NUMA configuration found"); + + printk(KERN_INFO "Faking a node at %016lx-%016lx\n", + start_pfn << PAGE_SHIFT, + end_pfn << PAGE_SHIFT); + /* setup dummy node covering all memory */ + memnode_shift = 63; + memnodemap[0] = 0; + nodes_clear(node_online_map); + node_set_online(0); + for (i = 0; i < NR_CPUS; i++) + numa_set_node(i, 0); + node_to_cpumask[0] = cpumask_of_cpu(0); + setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); +} + +__cpuinit void numa_add_cpu(int cpu) +{ + set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); +} + +void __cpuinit numa_set_node(int cpu, int node) +{ + cpu_to_node[cpu] = node; +} + +/* [numa=off] */ +__init int numa_setup(char *opt) +{ + if (!strncmp(opt,"off",3)) + numa_off = 1; +#ifdef CONFIG_NUMA_EMU + if(!strncmp(opt, "fake=", 5)) { + numa_fake = simple_strtoul(opt+5,NULL,0); ; + if (numa_fake >= MAX_NUMNODES) + numa_fake = MAX_NUMNODES; + } +#endif +#ifdef CONFIG_ACPI_NUMA + if (!strncmp(opt,"noacpi",6)) + acpi_numa = -1; +#endif + return 1; +} + +/* + * Setup early cpu_to_node. + * + * Populate cpu_to_node[] only if x86_cpu_to_apicid[], + * and apicid_to_node[] tables have valid entries for a CPU. + * This means we skip cpu_to_node[] initialisation for NUMA + * emulation and faking node case (when running a kernel compiled + * for NUMA on a non NUMA box), which is OK as cpu_to_node[] + * is already initialized in a round robin manner at numa_init_array, + * prior to this call, and this initialization is good enough + * for the fake NUMA cases. 
+ */ +void __init init_cpu_to_node(void) +{ + int i; + for (i = 0; i < NR_CPUS; i++) { + u8 apicid = x86_cpu_to_apicid[i]; + if (apicid == BAD_APICID) + continue; + if (apicid_to_node[apicid] == NUMA_NO_NODE) + continue; + numa_set_node(i,apicid_to_node[apicid]); + } +} + +EXPORT_SYMBOL(cpu_to_node); +EXPORT_SYMBOL(node_to_cpumask); +EXPORT_SYMBOL(memnode_shift); +EXPORT_SYMBOL(memnodemap); +EXPORT_SYMBOL(node_data); + +static void dump_numa(unsigned char key) +{ + s_time_t now = NOW(); + int i; + + printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key, + (u32)(now>>32), (u32)now); + + for_each_online_node(i) { + unsigned long pa = (NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT; + printk("idx%d -> NODE%d start->%lu size->%lu\n", + i, NODE_DATA(i)->node_id, + NODE_DATA(i)->node_start_pfn, + NODE_DATA(i)->node_spanned_pages); + /* sanity check phys_to_nid() */ + printk("phys_to_nid(%lx) -> %d should be %d\n", pa, phys_to_nid(pa), + NODE_DATA(i)->node_id); + } + for_each_online_cpu(i) + printk("CPU%d -> NODE%d\n", i, cpu_to_node[i]); +} + +static __init int register_numa_trigger(void) +{ + register_keyhandler('u', dump_numa, "dump numa info"); + return 0; +} +__initcall(register_numa_trigger); + diff -r a1f987e9640f -r f312c2d01d8b xen/arch/x86/srat.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/srat.c Wed Oct 25 12:25:54 2006 +0100 @@ -0,0 +1,325 @@ +/* + * ACPI 3.0 based NUMA setup + * Copyright 2004 Andi Kleen, SuSE Labs. + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. 
+ * + * Adapted for Xen: Ryan Harper <ryanh@xxxxxxxxxx> + */ + +#if 0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/proto.h> +#include <xen/bitmap.h> +#include <xen/numa.h> +#include <xen/topology.h> +#include <asm/e820.h> +#endif +#include <xen/init.h> +#include <xen/mm.h> +#include <xen/inttypes.h> +#include <xen/nodemask.h> +#include <xen/acpi.h> + +#include <asm/numa.h> +#include <asm/page.h> + +static struct acpi_table_slit *acpi_slit; + +static nodemask_t nodes_parsed __initdata; +static nodemask_t nodes_found __initdata; +static struct node nodes[MAX_NUMNODES] __initdata; +static u8 pxm2node[256] = { [0 ... 255] = 0xff }; + +/* Too small nodes confuse the VM badly. Usually they result + from BIOS bugs. */ +#define NODE_MIN_SIZE (4*1024*1024) + +static int node_to_pxm(int n); + +int pxm_to_node(int pxm) +{ + if ((unsigned)pxm >= 256) + return -1; + /* Extend 0xff to (int)-1 */ + return (signed char)pxm2node[pxm]; +} + +static __init int setup_node(int pxm) +{ + unsigned node = pxm2node[pxm]; + if (node == 0xff) { + if (nodes_weight(nodes_found) >= MAX_NUMNODES) + return -1; + node = first_unset_node(nodes_found); + node_set(node, nodes_found); + pxm2node[pxm] = node; + } + return pxm2node[pxm]; +} + +static __init int conflicting_nodes(u64 start, u64 end) +{ + int i; + for_each_node_mask(i, nodes_parsed) { + struct node *nd = &nodes[i]; + if (nd->start == nd->end) + continue; + if (nd->end > start && nd->start < end) + return i; + if (nd->end == end && nd->start == start) + return i; + } + return -1; +} + +static __init void cutoff_node(int i, u64 start, u64 end) +{ + struct node *nd = &nodes[i]; + if (nd->start < start) { + nd->start = start; + if (nd->end < nd->start) + nd->start = nd->end; + } + if (nd->end > end) { + nd->end = end; + if (nd->start > nd->end) + nd->start = nd->end; + } +} + +static __init void bad_srat(void) +{ + int i; + printk(KERN_ERR "SRAT: SRAT not used.\n"); + acpi_numa = -1; + for (i = 0; i < MAX_LOCAL_APIC; 
i++) + apicid_to_node[i] = NUMA_NO_NODE; +} + +static __init inline int srat_disabled(void) +{ + return numa_off || acpi_numa < 0; +} + +/* + * A lot of BIOS fill in 10 (= no distance) everywhere. This messes + * up the NUMA heuristics which wants the local node to have a smaller + * distance than the others. + * Do some quick checks here and only use the SLIT if it passes. + */ +static __init int slit_valid(struct acpi_table_slit *slit) +{ + int i, j; + int d = slit->localities; + for (i = 0; i < d; i++) { + for (j = 0; j < d; j++) { + u8 val = slit->entry[d*i + j]; + if (i == j) { + if (val != 10) + return 0; + } else if (val <= 10) + return 0; + } + } + return 1; +} + +/* Callback for SLIT parsing */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ + if (!slit_valid(slit)) { + printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); + return; + } + acpi_slit = slit; +} + +/* Callback for Proximity Domain -> LAPIC mapping */ +void __init +acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) +{ + int pxm, node; + if (srat_disabled()) + return; + if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) { bad_srat(); + return; + } + if (pa->flags.enabled == 0) + return; + pxm = pa->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + apicid_to_node[pa->apic_id] = node; + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", + pxm, pa->apic_id, node); +} + +/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ +void __init +acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) +{ + struct node *nd; + u64 start, end; + int node, pxm; + int i; + + if (srat_disabled()) + return; + if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) { + bad_srat(); + return; + } + if (ma->flags.enabled == 0) + return; + start = ma->base_addr_lo | 
((u64)ma->base_addr_hi << 32); + end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); + pxm = ma->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains.\n"); + bad_srat(); + return; + } + /* It is fine to add this area to the nodes data it will be used later*/ + if (ma->flags.hot_pluggable == 1) + printk(KERN_INFO "SRAT: hot plug zone found %"PRIx64" - %"PRIx64" \n", + start, end); + i = conflicting_nodes(start, end); + if (i == node) { + printk(KERN_WARNING + "SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with itself (%" + PRIx64"-%"PRIx64")\n", pxm, start, end, nodes[i].start, nodes[i].end); + } else if (i >= 0) { + printk(KERN_ERR + "SRAT: PXM %d (%"PRIx64"-%"PRIx64") overlaps with PXM %d (%" + PRIx64"-%"PRIx64")\n", pxm, start, end, node_to_pxm(i), + nodes[i].start, nodes[i].end); + bad_srat(); + return; + } + nd = &nodes[node]; + if (!node_test_and_set(node, nodes_parsed)) { + nd->start = start; + nd->end = end; + } else { + if (start < nd->start) + nd->start = start; + if (nd->end < end) + nd->end = end; + } + printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"\n", node, pxm, + nd->start, nd->end); +} + +/* Sanity check to catch more bad SRATs (they are amazingly common). + Make sure the PXMs cover all memory. */ +static int nodes_cover_memory(void) +{ + int i; + u64 pxmram, e820ram; + + pxmram = 0; + for_each_node_mask(i, nodes_parsed) { + u64 s = nodes[i].start >> PAGE_SHIFT; + u64 e = nodes[i].end >> PAGE_SHIFT; + pxmram += e - s; + } + + e820ram = max_page; + /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ + if ((long)(e820ram - pxmram) >= 1*1024*1024) { + printk(KERN_ERR "SRAT: PXMs only cover %"PRIu64"MB of your %" + PRIu64"MB e820 RAM. 
Not used.\n", + (pxmram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return 0; + } + return 1; +} + +static void unparse_node(int node) +{ + int i; + node_clear(node, nodes_parsed); + for (i = 0; i < MAX_LOCAL_APIC; i++) { + if (apicid_to_node[i] == node) + apicid_to_node[i] = NUMA_NO_NODE; + } +} + +void __init acpi_numa_arch_fixup(void) {} + +/* Use the information discovered above to actually set up the nodes. */ +int __init acpi_scan_nodes(u64 start, u64 end) +{ + int i; + + /* First clean up the node list */ + for (i = 0; i < MAX_NUMNODES; i++) { + cutoff_node(i, start, end); + if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) + unparse_node(i); + } + + if (acpi_numa <= 0) + return -1; + + if (!nodes_cover_memory()) { + bad_srat(); + return -1; + } + + memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES); + if (memnode_shift < 0) { + printk(KERN_ERR + "SRAT: No NUMA node hash function found. Contact maintainer\n"); + bad_srat(); + return -1; + } + + /* Finally register nodes */ + for_each_node_mask(i, nodes_parsed) + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + for (i = 0; i < NR_CPUS; i++) { + if (cpu_to_node[i] == NUMA_NO_NODE) + continue; + if (!node_isset(cpu_to_node[i], nodes_parsed)) + numa_set_node(i, NUMA_NO_NODE); + } + numa_init_array(); + return 0; +} + +static int node_to_pxm(int n) +{ + int i; + if (pxm2node[n] == n) + return n; + for (i = 0; i < 256; i++) + if (pxm2node[i] == n) + return i; + return 0; +} + +int __node_distance(int a, int b) +{ + int index; + + if (!acpi_slit) + return a == b ? 
10 : 20; + index = acpi_slit->localities * node_to_pxm(a); + return acpi_slit->entry[index + node_to_pxm(b)]; +} + +EXPORT_SYMBOL(__node_distance); diff -r a1f987e9640f -r f312c2d01d8b xen/drivers/acpi/numa.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/drivers/acpi/numa.c Wed Oct 25 12:25:54 2006 +0100 @@ -0,0 +1,216 @@ +/* + * acpi_numa.c - ACPI NUMA support + * + * Copyright (C) 2002 Takayoshi Kochi <t-kochi@xxxxxxxxxxxxx> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + */ +#if 0 +#include <linux/module.h> +#include <linux/kernel.h> +#endif +#include <xen/config.h> +#include <xen/init.h> +#include <xen/types.h> +#include <xen/errno.h> +#include <xen/acpi.h> +#include <xen/numa.h> +#include <acpi/acpi_bus.h> +#include <acpi/acmacros.h> +#include <asm/page.h> /* __va() */ + +#define ACPI_NUMA 0x80000000 +#define _COMPONENT ACPI_NUMA +ACPI_MODULE_NAME("numa") + +extern int __init acpi_table_parse_madt_family(enum acpi_table_id id, + unsigned long madt_size, + int entry_id, + acpi_madt_entry_handler handler, + unsigned int max_entries); + +void __init acpi_table_print_srat_entry(acpi_table_entry_header * header) +{ + + ACPI_FUNCTION_NAME("acpi_table_print_srat_entry"); + + if (!header) + return; + + switch (header->type) { + + case ACPI_SRAT_PROCESSOR_AFFINITY: +#ifdef ACPI_DEBUG_OUTPUT + { + struct acpi_table_processor_affinity *p = + (struct acpi_table_processor_affinity *)header; + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n", + p->apic_id, p->lsapic_eid, + p->proximity_domain, + p->flags. + enabled ? "enabled" : "disabled")); + } +#endif /* ACPI_DEBUG_OUTPUT */ + break; + + case ACPI_SRAT_MEMORY_AFFINITY: +#ifdef ACPI_DEBUG_OUTPUT + { + struct acpi_table_memory_affinity *p = + (struct acpi_table_memory_affinity *)header; + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "SRAT Memory (0x%08x%08x length 0x%08x%08x type 0x%x) in proximity domain %d %s%s\n", + p->base_addr_hi, p->base_addr_lo, + p->length_hi, p->length_lo, + p->memory_type, p->proximity_domain, + p->flags. + enabled ? "enabled" : "disabled", + p->flags. + hot_pluggable ? 
" hot-pluggable" : + "")); + } +#endif /* ACPI_DEBUG_OUTPUT */ + break; + + default: + printk(KERN_WARNING PREFIX + "Found unsupported SRAT entry (type = 0x%x)\n", + header->type); + break; + } +} + +static int __init acpi_parse_slit(unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_slit *slit; + u32 localities; + + if (!phys_addr || !size) + return -EINVAL; + + slit = (struct acpi_table_slit *)__va(phys_addr); + + /* downcast just for %llu vs %lu for i386/ia64 */ + localities = (u32) slit->localities; + + acpi_numa_slit_init(slit); + + return 0; +} + +static int __init +acpi_parse_processor_affinity(acpi_table_entry_header * header, + const unsigned long end) +{ + struct acpi_table_processor_affinity *processor_affinity; + + processor_affinity = (struct acpi_table_processor_affinity *)header; + if (!processor_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(header); + + /* let architecture-dependent part to do it */ + acpi_numa_processor_affinity_init(processor_affinity); + + return 0; +} + +static int __init +acpi_parse_memory_affinity(acpi_table_entry_header * header, + const unsigned long end) +{ + struct acpi_table_memory_affinity *memory_affinity; + + memory_affinity = (struct acpi_table_memory_affinity *)header; + if (!memory_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(header); + + /* let architecture-dependent part to do it */ + acpi_numa_memory_affinity_init(memory_affinity); + + return 0; +} + +static int __init acpi_parse_srat(unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_srat *srat; + + if (!phys_addr || !size) + return -EINVAL; + + srat = (struct acpi_table_srat *)__va(phys_addr); + + return 0; +} + +int __init +acpi_table_parse_srat(enum acpi_srat_entry_id id, + acpi_madt_entry_handler handler, unsigned int max_entries) +{ + return acpi_table_parse_madt_family(ACPI_SRAT, + sizeof(struct acpi_table_srat), id, + handler, max_entries); +} + +int __init acpi_numa_init(void) +{ + int 
result; + + /* SRAT: Static Resource Affinity Table */ + result = acpi_table_parse(ACPI_SRAT, acpi_parse_srat); + + if (result > 0) { + result = acpi_table_parse_srat(ACPI_SRAT_PROCESSOR_AFFINITY, + acpi_parse_processor_affinity, + NR_CPUS); + result = acpi_table_parse_srat(ACPI_SRAT_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS); // IA64 specific + } + + /* SLIT: System Locality Information Table */ + result = acpi_table_parse(ACPI_SLIT, acpi_parse_slit); + + acpi_numa_arch_fixup(); + return 0; +} + +#if 0 +int acpi_get_pxm(acpi_handle h) +{ + unsigned long pxm; + acpi_status status; + acpi_handle handle; + acpi_handle phandle = h; + + do { + handle = phandle; + status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm); + if (ACPI_SUCCESS(status)) + return (int)pxm; + status = acpi_get_parent(handle, &phandle); + } while (ACPI_SUCCESS(status)); + return -1; +} + +EXPORT_SYMBOL(acpi_get_pxm); +#endif diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/numa.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/asm-x86/numa.h Wed Oct 25 12:25:54 2006 +0100 @@ -0,0 +1,65 @@ +#ifndef _ASM_X8664_NUMA_H +#define _ASM_X8664_NUMA_H 1 + +#include <xen/nodemask.h> +#include <xen/topology.h> +#include <asm/numnodes.h> +#include <asm/smp.h> + +struct node { + u64 start,end; +}; + +extern int compute_hash_shift(struct node *nodes, int numnodes); +extern int pxm_to_node(int nid); + +#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) +#define VIRTUAL_BUG_ON(x) +#define NODEMAPSIZE 0xfff + +extern void numa_add_cpu(int cpu); +extern void numa_init_array(void); +extern int numa_off; + +extern void numa_set_node(int cpu, int node); + +extern void setup_node_bootmem(int nodeid, u64 start, u64 end); +extern unsigned char apicid_to_node[256]; +#ifdef CONFIG_NUMA +extern void __init init_cpu_to_node(void); + +static inline void clear_node_cpumask(int cpu) +{ + clear_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); +} + +/* Simple perfect hash to map physical 
addresses to node numbers */ +extern int memnode_shift; +extern u8 memnodemap[NODEMAPSIZE]; + +extern struct node_data node_data[]; + +static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) +{ + unsigned nid; + VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE); + nid = memnodemap[addr >> memnode_shift]; + VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); + return nid; +} + +#define NODE_DATA(nid) (&(node_data[nid])) + +#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) +#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ + NODE_DATA(nid)->node_spanned_pages) + + +#else +#define init_cpu_to_node() do {} while (0) +#define clear_node_cpumask(cpu) do {} while (0) +#endif + +#define NUMA_NO_NODE 0xff + +#endif diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/numnodes.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/asm-x86/numnodes.h Wed Oct 25 12:25:54 2006 +0100 @@ -0,0 +1,26 @@ +#ifndef _ASM_MAX_NUMNODES_H +#define _ASM_MAX_NUMNODES_H + +#include <xen/config.h> + +#if defined(__i386__) +#ifdef CONFIG_X86_NUMAQ + +/* Max 16 Nodes */ +#define NODES_SHIFT 4 + +#elif defined(CONFIG_ACPI_SRAT) + +/* Max 8 Nodes */ +#define NODES_SHIFT 3 + +#endif /* CONFIG_X86_NUMAQ */ + + +#endif /* __i386__ */ + +#if defined(CONFIG_NUMA) && defined(__x86_64__) +#define NODES_SHIFT 6 +#endif /* __x86_64__ */ + +#endif /* _ASM_MAX_NUMNODES_H */ diff -r a1f987e9640f -r f312c2d01d8b xen/include/asm-x86/topology.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/asm-x86/topology.h Wed Oct 25 12:25:54 2006 +0100 @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2006, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Ryan Harper <ryanh@xxxxxxxxxx> + */ + +#ifndef _ASM_X86_TOPOLOGY_H +#define _ASM_X86_TOPOLOGY_H + +#include <xen/config.h> +#include <xen/bitops.h> + +extern cpumask_t cpu_online_map; + +extern unsigned int cpu_to_node[]; +extern cpumask_t node_to_cpumask[]; + +#define cpu_to_node(cpu) (cpu_to_node[cpu]) +#define parent_node(node) (node) +#define node_to_first_cpu(node) (__ffs(node_to_cpumask[node])) +#define node_to_cpumask(node) (node_to_cpumask[node]) + +#endif /* _ASM_X86_TOPOLOGY_H */ diff -r a1f987e9640f -r f312c2d01d8b xen/include/xen/nodemask.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/xen/nodemask.h Wed Oct 25 12:25:54 2006 +0100 @@ -0,0 +1,342 @@ +#ifndef __LINUX_NODEMASK_H +#define __LINUX_NODEMASK_H + +/* + * Nodemasks provide a bitmap suitable for representing the + * set of Node's in a system, one bit position per Node number. + * + * See detailed comments in the file linux/bitmap.h describing the + * data type on which these nodemasks are based. + * + * For details of nodemask_scnprintf() and nodemask_parse(), + * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c. 
+ * + * The available nodemask operations are: + * + * void node_set(node, mask) turn on bit 'node' in mask + * void node_clear(node, mask) turn off bit 'node' in mask + * void nodes_setall(mask) set all bits + * void nodes_clear(mask) clear all bits + * int node_isset(node, mask) true iff bit 'node' set in mask + * int node_test_and_set(node, mask) test and set bit 'node' in mask + * + * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] + * void nodes_or(dst, src1, src2) dst = src1 | src2 [union] + * void nodes_xor(dst, src1, src2) dst = src1 ^ src2 + * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2 + * void nodes_complement(dst, src) dst = ~src + * + * int nodes_equal(mask1, mask2) Does mask1 == mask2? + * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? + * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2? + * int nodes_empty(mask) Is mask empty (no bits set)? + * int nodes_full(mask) Is mask full (all bits set)? + * int nodes_weight(mask) Hamming weight - number of set bits + * + * void nodes_shift_right(dst, src, n) Shift right + * void nodes_shift_left(dst, src, n) Shift left + * + * int first_node(mask) Number lowest set bit, or MAX_NUMNODES + * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES + * int first_unset_node(mask) First node not set in mask, or + * MAX_NUMNODES. + * + * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set + * NODE_MASK_ALL Initializer - all bits set + * NODE_MASK_NONE Initializer - no bits set + * unsigned long *nodes_addr(mask) Array of unsigned long's in mask + * + * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing + * int nodemask_parse(ubuf, ulen, mask) Parse ascii string as nodemask + * + * for_each_node_mask(node, mask) for-loop node over mask + * + * int num_online_nodes() Number of online Nodes + * int num_possible_nodes() Number of all possible Nodes + * + * int node_online(node) Is some node online? 
+ * int node_possible(node) Is some node possible? + * + * int any_online_node(mask) First online node in mask + * + * node_set_online(node) set bit 'node' in node_online_map + * node_set_offline(node) clear bit 'node' in node_online_map + * + * for_each_node(node) for-loop node over node_possible_map + * for_each_online_node(node) for-loop node over node_online_map + * + * Subtlety: + * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway) + * to generate slightly worse code. So use a simple one-line #define + * for node_isset(), instead of wrapping an inline inside a macro, the + * way we do the other calls. + */ + +#if 0 +#include <linux/threads.h> +#include <asm/bug.h> +#endif +#include <xen/kernel.h> +#include <xen/bitmap.h> +#include <xen/numa.h> + +typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; +extern nodemask_t _unused_nodemask_arg_; + +#define node_set(node, dst) __node_set((node), &(dst)) +static inline void __node_set(int node, volatile nodemask_t *dstp) +{ + set_bit(node, dstp->bits); +} + +#define node_clear(node, dst) __node_clear((node), &(dst)) +static inline void __node_clear(int node, volatile nodemask_t *dstp) +{ + clear_bit(node, dstp->bits); +} + +#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) +static inline void __nodes_setall(nodemask_t *dstp, int nbits) +{ + bitmap_fill(dstp->bits, nbits); +} + +#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) +static inline void __nodes_clear(nodemask_t *dstp, int nbits) +{ + bitmap_zero(dstp->bits, nbits); +} + +/* No static inline type checking - see Subtlety (1) above. 
*/ +#define node_isset(node, nodemask) test_bit((node), (nodemask).bits) + +#define node_test_and_set(node, nodemask) \ + __node_test_and_set((node), &(nodemask)) +static inline int __node_test_and_set(int node, nodemask_t *addr) +{ + return test_and_set_bit(node, addr->bits); +} + +#define nodes_and(dst, src1, src2) \ + __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) +static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define nodes_or(dst, src1, src2) \ + __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) +static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define nodes_xor(dst, src1, src2) \ + __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) +static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define nodes_andnot(dst, src1, src2) \ + __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) +static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define nodes_complement(dst, src) \ + __nodes_complement(&(dst), &(src), MAX_NUMNODES) +static inline void __nodes_complement(nodemask_t *dstp, + const nodemask_t *srcp, int nbits) +{ + bitmap_complement(dstp->bits, srcp->bits, nbits); +} + +#define nodes_equal(src1, src2) \ + __nodes_equal(&(src1), &(src2), MAX_NUMNODES) +static inline int __nodes_equal(const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + return bitmap_equal(src1p->bits, src2p->bits, nbits); +} + +#define nodes_intersects(src1, src2) \ + __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) +static inline int 
__nodes_intersects(const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + return bitmap_intersects(src1p->bits, src2p->bits, nbits); +} + +#define nodes_subset(src1, src2) \ + __nodes_subset(&(src1), &(src2), MAX_NUMNODES) +static inline int __nodes_subset(const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + return bitmap_subset(src1p->bits, src2p->bits, nbits); +} + +#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) +static inline int __nodes_empty(const nodemask_t *srcp, int nbits) +{ + return bitmap_empty(srcp->bits, nbits); +} + +#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) +static inline int __nodes_full(const nodemask_t *srcp, int nbits) +{ + return bitmap_full(srcp->bits, nbits); +} + +#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) +static inline int __nodes_weight(const nodemask_t *srcp, int nbits) +{ + return bitmap_weight(srcp->bits, nbits); +} + +#define nodes_shift_right(dst, src, n) \ + __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) +static inline void __nodes_shift_right(nodemask_t *dstp, + const nodemask_t *srcp, int n, int nbits) +{ + bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); +} + +#define nodes_shift_left(dst, src, n) \ + __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) +static inline void __nodes_shift_left(nodemask_t *dstp, + const nodemask_t *srcp, int n, int nbits) +{ + bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); +} + +/* FIXME: better would be to fix all architectures to never return + > MAX_NUMNODES, then the silly min_ts could be dropped. 
*/ + +#define first_node(src) __first_node(&(src)) +static inline int __first_node(const nodemask_t *srcp) +{ + return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); +} + +#define next_node(n, src) __next_node((n), &(src)) +static inline int __next_node(int n, const nodemask_t *srcp) +{ + return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); +} + +#define nodemask_of_node(node) \ +({ \ + typeof(_unused_nodemask_arg_) m; \ + if (sizeof(m) == sizeof(unsigned long)) { \ + m.bits[0] = 1UL<<(node); \ + } else { \ + nodes_clear(m); \ + node_set((node), m); \ + } \ + m; \ +}) + +#define first_unset_node(mask) __first_unset_node(&(mask)) +static inline int __first_unset_node(const nodemask_t *maskp) +{ + return min_t(int,MAX_NUMNODES, + find_first_zero_bit(maskp->bits, MAX_NUMNODES)); +} + +#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) + +#if MAX_NUMNODES <= BITS_PER_LONG + +#define NODE_MASK_ALL \ +((nodemask_t) { { \ + [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ +} }) + +#else + +#define NODE_MASK_ALL \ +((nodemask_t) { { \ + [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \ + [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ +} }) + +#endif + +#define NODE_MASK_NONE \ +((nodemask_t) { { \ + [0 ... 
BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL \ +} }) + +#define nodes_addr(src) ((src).bits) + +#if 0 +#define nodemask_scnprintf(buf, len, src) \ + __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES) +static inline int __nodemask_scnprintf(char *buf, int len, + const nodemask_t *srcp, int nbits) +{ + return bitmap_scnprintf(buf, len, srcp->bits, nbits); +} + +#define nodemask_parse(ubuf, ulen, dst) \ + __nodemask_parse((ubuf), (ulen), &(dst), MAX_NUMNODES) +static inline int __nodemask_parse(const char __user *buf, int len, + nodemask_t *dstp, int nbits) +{ + return bitmap_parse(buf, len, dstp->bits, nbits); +} +#endif + +#if MAX_NUMNODES > 1 +#define for_each_node_mask(node, mask) \ + for ((node) = first_node(mask); \ + (node) < MAX_NUMNODES; \ + (node) = next_node((node), (mask))) +#else /* MAX_NUMNODES == 1 */ +#define for_each_node_mask(node, mask) \ + if (!nodes_empty(mask)) \ + for ((node) = 0; (node) < 1; (node)++) +#endif /* MAX_NUMNODES */ + +/* + * The following particular system nodemasks and operations + * on them manage all possible and online nodes. 
+ */ + +extern nodemask_t node_online_map; +extern nodemask_t node_possible_map; + +#if MAX_NUMNODES > 1 +#define num_online_nodes() nodes_weight(node_online_map) +#define num_possible_nodes() nodes_weight(node_possible_map) +#define node_online(node) node_isset((node), node_online_map) +#define node_possible(node) node_isset((node), node_possible_map) +#else +#define num_online_nodes() 1 +#define num_possible_nodes() 1 +#define node_online(node) ((node) == 0) +#define node_possible(node) ((node) == 0) +#endif + +#define any_online_node(mask) \ +({ \ + int node; \ + for_each_node_mask(node, (mask)) \ + if (node_online(node)) \ + break; \ + node; \ +}) + +#define node_set_online(node) set_bit((node), node_online_map.bits) +#define node_set_offline(node) clear_bit((node), node_online_map.bits) + +#define for_each_node(node) for_each_node_mask((node), node_possible_map) +#define for_each_online_node(node) for_each_node_mask((node), node_online_map) + +#endif /* __LINUX_NODEMASK_H */ diff -r a1f987e9640f -r f312c2d01d8b xen/include/xen/numa.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/xen/numa.h Wed Oct 25 12:25:54 2006 +0100 @@ -0,0 +1,35 @@ +#ifndef _XEN_NUMA_H +#define _XEN_NUMA_H + +#include <xen/config.h> + +#ifdef CONFIG_DISCONTIGMEM +#include <asm/numnodes.h> +#endif + +#ifndef NODES_SHIFT +#define NODES_SHIFT 0 +#endif + +#define MAX_NUMNODES (1 << NODES_SHIFT) +#define NUMA_NO_NODE 0xff + +#define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */ +#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) +#define MAX_CHUNKS_PER_NODE 4 +#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) + +/* needed for drivers/acpi/numa.c */ +#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) + +extern unsigned int cpu_to_node[]; +#include <xen/cpumask.h> +extern cpumask_t node_to_cpumask[]; + +typedef struct node_data { + unsigned long node_start_pfn; + unsigned long node_spanned_pages; + unsigned int node_id; +} node_data_t; + +#endif /* _XEN_NUMA_H */ diff -r 
a1f987e9640f -r f312c2d01d8b xen/include/xen/topology.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/xen/topology.h Wed Oct 25 12:25:54 2006 +0100 @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2006, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#ifndef _XEN_TOPOLOGY_H +#define _XEN_TOPOLOGY_H + +#include <asm/topology.h> + +#endif /* _XEN_TOPOLOGY_H */ _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |