[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 1/5 v2] x86: allow specifying the NUMA nodes Dom0 should run on



... by introducing a "dom0_nodes" option augmenting the "dom0_mem" and
"dom0_max_vcpus" ones.

Note that this gives meaning to MEMF_exact_node specified alone (i.e.
implicitly combined with NUMA_NO_NODE): In such a case any node inside
the domain's node mask is acceptable, but no other node. This changed
behavior is (implicitly) being exposed through the memop hypercalls.

Note further that this change doesn't take care of moving the initrd
image into memory matching Dom0's affinity when the initrd doesn't get
copied (because of being part of the initial mapping) anyway.

And note finally that this doesn't get us meaningfully closer to
handing vNUMA information to Dom0 (which will require the current
striping of allocations to become node-specific in order for the passed
on information to be meaningful).

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
Acked-by: Ian Campbell <ian.campbell@xxxxxxxxxx>
---
v2: Use MAX_NUMNODES sized array for storing PXMs. Implement relaxed
    mode. Minor other cleanup.

--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -540,6 +540,18 @@ any dom0 autoballooning feature present 
 _xl.conf(5)_ man page or [Xen Best
 Practices](http://wiki.xen.org/wiki/Xen_Best_Practices#Xen_dom0_dedicated_memory_and_preventing_dom0_memory_ballooning).
 
+### dom0\_nodes
+
+> `= List of [ <integer> | relaxed | strict ]`
+
+> Default: `strict`
+
+Specify the NUMA nodes to place Dom0 on. Defaults for vCPU-s created
+and memory assigned to Dom0 will be adjusted to match the node
+restrictions set up here. Note that the values to be specified here are
+ACPI PXM ones, not Xen internal node numbers. `relaxed` sets up vCPU
+affinities to prefer, but not be limited to, the specified node(s).
+
 ### dom0\_shadow
 > `= <boolean>`
 
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -100,11 +100,70 @@ static void __init parse_dom0_max_vcpus(
 }
 custom_param("dom0_max_vcpus", parse_dom0_max_vcpus);
 
+static __initdata unsigned int dom0_nr_pxms;
+static __initdata unsigned int dom0_pxms[MAX_NUMNODES] =
+    { [0 ... MAX_NUMNODES - 1] = ~0 };
+static __initdata bool_t dom0_affinity_relaxed;
+
+static void __init parse_dom0_nodes(const char *s)
+{
+    do {
+        if ( isdigit(*s) )
+            dom0_pxms[dom0_nr_pxms] = simple_strtoul(s, &s, 0);
+        else if ( !strncmp(s, "relaxed", 7) && (!s[7] || s[7] == ',') )
+        {
+            dom0_affinity_relaxed = 1;
+            s += 7;
+        }
+        else if ( !strncmp(s, "strict", 6) && (!s[6] || s[6] == ',') )
+        {
+            dom0_affinity_relaxed = 0;
+            s += 6;
+        }
+        else
+            break;
+    } while ( ++dom0_nr_pxms < ARRAY_SIZE(dom0_pxms) && *s++ == ',' );
+}
+custom_param("dom0_nodes", parse_dom0_nodes);
+
+static cpumask_t __initdata dom0_cpus;
+
+static struct vcpu *__init setup_dom0_vcpu(struct domain *d,
+                                           unsigned int vcpu_id,
+                                           unsigned int cpu)
+{
+    struct vcpu *v = alloc_vcpu(d, vcpu_id, cpu);
+
+    if ( v )
+    {
+        if ( !d->is_pinned && !dom0_affinity_relaxed )
+            cpumask_copy(v->cpu_hard_affinity, &dom0_cpus);
+        cpumask_copy(v->cpu_soft_affinity, &dom0_cpus);
+    }
+
+    return v;
+}
+
+static nodemask_t __initdata dom0_nodes;
+
 unsigned int __init dom0_max_vcpus(void)
 {
-    unsigned max_vcpus;
+    unsigned int i, max_vcpus;
+    nodeid_t node;
+
+    for ( i = 0; i < dom0_nr_pxms; ++i )
+        if ( (node = pxm_to_node(dom0_pxms[i])) != NUMA_NO_NODE )
+            node_set(node, dom0_nodes);
+    nodes_and(dom0_nodes, dom0_nodes, node_online_map);
+    if ( nodes_empty(dom0_nodes) )
+        dom0_nodes = node_online_map;
+    for_each_node_mask ( node, dom0_nodes )
+        cpumask_or(&dom0_cpus, &dom0_cpus, &node_to_cpumask(node));
+    cpumask_and(&dom0_cpus, &dom0_cpus, cpupool0->cpu_valid);
+    if ( cpumask_empty(&dom0_cpus) )
+        cpumask_copy(&dom0_cpus, cpupool0->cpu_valid);
 
-    max_vcpus = num_cpupool_cpus(cpupool0);
+    max_vcpus = cpumask_weight(&dom0_cpus);
     if ( opt_dom0_max_vcpus_min > max_vcpus )
         max_vcpus = opt_dom0_max_vcpus_min;
     if ( opt_dom0_max_vcpus_max < max_vcpus )
@@ -119,12 +178,15 @@ struct vcpu *__init alloc_dom0_vcpu0(str
 {
     unsigned int max_vcpus = dom0_max_vcpus();
 
+    dom0->node_affinity = dom0_nodes;
+    dom0->auto_node_affinity = !dom0_nr_pxms;
+
     dom0->vcpu = xzalloc_array(struct vcpu *, max_vcpus);
     if ( !dom0->vcpu )
         return NULL;
     dom0->max_vcpus = max_vcpus;
 
-    return alloc_vcpu(dom0, 0, 0);
+    return setup_dom0_vcpu(dom0, 0, cpumask_first(&dom0_cpus));
 }
 
 #ifdef CONFIG_SHADOW_PAGING
@@ -156,7 +218,7 @@ static struct page_info * __init alloc_c
     struct domain *d, unsigned long max_pages)
 {
     static unsigned int __initdata last_order = MAX_ORDER;
-    static unsigned int __initdata memflags = MEMF_no_dma;
+    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
     struct page_info *page;
     unsigned int order = get_order_from_pages(max_pages), free_order;
 
@@ -190,7 +252,7 @@ static struct page_info * __init alloc_c
 
         if ( d->tot_pages + (1 << order) > d->max_pages )
             continue;
-        pg2 = alloc_domheap_pages(d, order, 0);
+        pg2 = alloc_domheap_pages(d, order, MEMF_exact_node);
         if ( pg2 > page )
         {
             free_domheap_pages(page, free_order);
@@ -217,10 +279,14 @@ static unsigned long __init dom0_paging_
 static unsigned long __init compute_dom0_nr_pages(
     struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
 {
-    unsigned long avail = avail_domheap_pages() + initial_images_nrpages();
-    unsigned long nr_pages, min_pages, max_pages;
+    nodeid_t node;
+    unsigned long avail = 0, nr_pages, min_pages, max_pages;
     bool_t need_paging;
 
+    for_each_node_mask ( node, dom0_nodes )
+        avail += avail_domheap_pages_region(node, 0, 0) +
+                 initial_images_nrpages(node);
+
     /* Reserve memory for further dom0 vcpu-struct allocations... */
     avail -= (d->max_vcpus - 1UL)
              << get_order_from_bytes(sizeof(struct vcpu));
@@ -1230,11 +1296,11 @@ int __init construct_dom0(
 
     printk("Dom0 has maximum %u VCPUs\n", d->max_vcpus);
 
-    cpu = cpumask_first(cpupool0->cpu_valid);
+    cpu = v->processor;
     for ( i = 1; i < d->max_vcpus; i++ )
     {
-        cpu = cpumask_cycle(cpu, cpupool0->cpu_valid);
-        (void)alloc_vcpu(d, i, cpu);
+        cpu = cpumask_cycle(cpu, &dom0_cpus);
+        setup_dom0_vcpu(d, i, cpu);
     }
 
     /*
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -140,13 +140,21 @@ static void __init parse_acpi_param(char
 static const module_t *__initdata initial_images;
 static unsigned int __initdata nr_initial_images;
 
-unsigned long __init initial_images_nrpages(void)
+unsigned long __init initial_images_nrpages(nodeid_t node)
 {
+    unsigned long node_start = node_start_pfn(node);
+    unsigned long node_end = node_end_pfn(node);
     unsigned long nr;
     unsigned int i;
 
     for ( nr = i = 0; i < nr_initial_images; ++i )
-        nr += PFN_UP(initial_images[i].mod_end);
+    {
+        unsigned long start = initial_images[i].mod_start;
+        unsigned long end = start + PFN_UP(initial_images[i].mod_end);
+
+        if ( end > node_start && node_end > start )
+            nr += min(node_end, end) - max(node_start, start);
+    }
 
     return nr;
 }
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -581,7 +581,7 @@ static struct page_info *alloc_heap_page
     struct domain *d)
 {
     unsigned int i, j, zone = 0, nodemask_retry = 0;
-    nodeid_t first_node, node = MEMF_get_node(memflags);
+    nodeid_t first_node, node = MEMF_get_node(memflags), req_node = node;
     unsigned long request = 1UL << order;
     struct page_info *pg;
     nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map;
@@ -593,7 +593,6 @@ static struct page_info *alloc_heap_page
 
     if ( node == NUMA_NO_NODE )
     {
-        memflags &= ~MEMF_exact_node;
         if ( d != NULL )
         {
             node = next_node(d->last_alloc_node, nodemask);
@@ -654,7 +653,7 @@ static struct page_info *alloc_heap_page
                     goto found;
         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
 
-        if ( memflags & MEMF_exact_node )
+        if ( (memflags & MEMF_exact_node) && req_node != NUMA_NO_NODE )
             goto not_found;
 
         /* Pick next node. */
@@ -671,7 +670,7 @@ static struct page_info *alloc_heap_page
         if ( node == first_node )
         {
             /* When we have tried all in nodemask, we fall back to others. */
-            if ( nodemask_retry++ )
+            if ( (memflags & MEMF_exact_node) || nodemask_retry++ )
                 goto not_found;
             nodes_andnot(nodemask, node_online_map, nodemask);
             first_node = node = first_node(nodemask);
--- a/xen/include/asm-x86/setup.h
+++ b/xen/include/asm-x86/setup.h
@@ -2,6 +2,7 @@
 #define __X86_SETUP_H_
 
 #include <xen/multiboot.h>
+#include <asm/numa.h>
 
 extern unsigned long xenheap_initial_phys_start;
 
@@ -32,7 +33,7 @@ int construct_dom0(
     void *(*bootstrap_map)(const module_t *),
     char *cmdline);
 
-unsigned long initial_images_nrpages(void);
+unsigned long initial_images_nrpages(nodeid_t node);
 void discard_initial_images(void);
 
 unsigned int dom0_max_vcpus(void);


Attachment: x86-Dom0-nodes.patch
Description: Text document

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.