[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen master] x86: allow specifying the NUMA nodes Dom0 should run on



commit 705313c093b7c6eb72e2842c22a9f05201924ab4
Author:     Jan Beulich <jbeulich@xxxxxxxx>
AuthorDate: Fri Mar 6 17:26:30 2015 +0100
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Fri Mar 6 17:26:30 2015 +0100

    x86: allow specifying the NUMA nodes Dom0 should run on
    
    ... by introducing a "dom0_nodes" option augmenting the "dom0_mem" and
    "dom0_max_vcpus" ones.
    
    Note that this gives meaning to MEMF_exact_node specified alone (i.e.
    implicitly combined with NUMA_NO_NODE): In such a case any node inside
    the domain's node mask is acceptable, but no other node. This changed
    behavior is (implicitly) being exposed through the memop hypercalls.
    
    Note further that this change doesn't take care of moving the initrd
    image into memory matching Dom0's affinity when the initrd doesn't get
    copied (because of being part of the initial mapping) anyway.
    
    And note finally that this doesn't get us meaningfully closer to
    handing vNUMA information to Dom0 (which will require the current
    striping of allocations to become node-specific in order for the passed
    on information to be meaningful).
    
    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    Acked-by: Ian Campbell <ian.campbell@xxxxxxxxxx>
    Reviewed-by: Dario Faggioli <dario.faggioli@xxxxxxxxx>
    Reviewed-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
 docs/misc/xen-command-line.markdown |   12 +++++
 xen/arch/x86/domain_build.c         |   88 ++++++++++++++++++++++++++++++----
 xen/arch/x86/setup.c                |   12 ++++-
 xen/common/page_alloc.c             |    7 +--
 xen/include/asm-x86/setup.h         |    3 +-
 5 files changed, 104 insertions(+), 18 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown 
b/docs/misc/xen-command-line.markdown
index 63871cb..e125212 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -551,6 +551,18 @@ any dom0 autoballooning feature present in your toolstack. 
See the
 _xl.conf(5)_ man page or [Xen Best
 
Practices](http://wiki.xen.org/wiki/Xen_Best_Practices#Xen_dom0_dedicated_memory_and_preventing_dom0_memory_ballooning).
 
+### dom0\_nodes
+
+> `= List of [ <integer> | relaxed | strict ]`
+
+> Default: `strict`
+
+Specify the NUMA nodes to place Dom0 on. Defaults for vCPU-s created
+and memory assigned to Dom0 will be adjusted to match the node
+restrictions set up here. Note that the values to be specified here are
+ACPI PXM ones, not Xen internal node numbers. `relaxed` sets up vCPU
+affinities to prefer but be not limited to the specified node(s).
+
 ### dom0\_shadow
 > `= <boolean>`
 
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index a7d55ca..6d7a592 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -100,11 +100,70 @@ static void __init parse_dom0_max_vcpus(const char *s)
 }
 custom_param("dom0_max_vcpus", parse_dom0_max_vcpus);
 
-unsigned int __init dom0_max_vcpus(void)
+static __initdata unsigned int dom0_nr_pxms;
+static __initdata unsigned int dom0_pxms[MAX_NUMNODES] =
+    { [0 ... MAX_NUMNODES - 1] = ~0 };
+static __initdata bool_t dom0_affinity_relaxed;
+
+static void __init parse_dom0_nodes(const char *s)
 {
-    unsigned max_vcpus;
+    do {
+        if ( isdigit(*s) )
+            dom0_pxms[dom0_nr_pxms] = simple_strtoul(s, &s, 0);
+        else if ( !strncmp(s, "relaxed", 7) && (!s[7] || s[7] == ',') )
+        {
+            dom0_affinity_relaxed = 1;
+            s += 7;
+        }
+        else if ( !strncmp(s, "strict", 6) && (!s[6] || s[6] == ',') )
+        {
+            dom0_affinity_relaxed = 0;
+            s += 6;
+        }
+        else
+            break;
+    } while ( ++dom0_nr_pxms < ARRAY_SIZE(dom0_pxms) && *s++ == ',' );
+}
+custom_param("dom0_nodes", parse_dom0_nodes);
+
+static cpumask_t __initdata dom0_cpus;
 
-    max_vcpus = num_cpupool_cpus(cpupool0);
+static struct vcpu *__init setup_dom0_vcpu(struct domain *d,
+                                           unsigned int vcpu_id,
+                                           unsigned int cpu)
+{
+    struct vcpu *v = alloc_vcpu(d, vcpu_id, cpu);
+
+    if ( v )
+    {
+        if ( !d->is_pinned && !dom0_affinity_relaxed )
+            cpumask_copy(v->cpu_hard_affinity, &dom0_cpus);
+        cpumask_copy(v->cpu_soft_affinity, &dom0_cpus);
+    }
+
+    return v;
+}
+
+static nodemask_t __initdata dom0_nodes;
+
+unsigned int __init dom0_max_vcpus(void)
+{
+    unsigned int i, max_vcpus;
+    nodeid_t node;
+
+    for ( i = 0; i < dom0_nr_pxms; ++i )
+        if ( (node = pxm_to_node(dom0_pxms[i])) != NUMA_NO_NODE )
+            node_set(node, dom0_nodes);
+    nodes_and(dom0_nodes, dom0_nodes, node_online_map);
+    if ( nodes_empty(dom0_nodes) )
+        dom0_nodes = node_online_map;
+    for_each_node_mask ( node, dom0_nodes )
+        cpumask_or(&dom0_cpus, &dom0_cpus, &node_to_cpumask(node));
+    cpumask_and(&dom0_cpus, &dom0_cpus, cpupool0->cpu_valid);
+    if ( cpumask_empty(&dom0_cpus) )
+        cpumask_copy(&dom0_cpus, cpupool0->cpu_valid);
+
+    max_vcpus = cpumask_weight(&dom0_cpus);
     if ( opt_dom0_max_vcpus_min > max_vcpus )
         max_vcpus = opt_dom0_max_vcpus_min;
     if ( opt_dom0_max_vcpus_max < max_vcpus )
@@ -119,12 +178,15 @@ struct vcpu *__init alloc_dom0_vcpu0(struct domain *dom0)
 {
     unsigned int max_vcpus = dom0_max_vcpus();
 
+    dom0->node_affinity = dom0_nodes;
+    dom0->auto_node_affinity = !dom0_nr_pxms;
+
     dom0->vcpu = xzalloc_array(struct vcpu *, max_vcpus);
     if ( !dom0->vcpu )
         return NULL;
     dom0->max_vcpus = max_vcpus;
 
-    return alloc_vcpu(dom0, 0, 0);
+    return setup_dom0_vcpu(dom0, 0, cpumask_first(&dom0_cpus));
 }
 
 #ifdef CONFIG_SHADOW_PAGING
@@ -156,7 +218,7 @@ static struct page_info * __init alloc_chunk(
     struct domain *d, unsigned long max_pages)
 {
     static unsigned int __initdata last_order = MAX_ORDER;
-    static unsigned int __initdata memflags = MEMF_no_dma;
+    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
     struct page_info *page;
     unsigned int order = get_order_from_pages(max_pages), free_order;
 
@@ -190,7 +252,7 @@ static struct page_info * __init alloc_chunk(
 
         if ( d->tot_pages + (1 << order) > d->max_pages )
             continue;
-        pg2 = alloc_domheap_pages(d, order, 0);
+        pg2 = alloc_domheap_pages(d, order, MEMF_exact_node);
         if ( pg2 > page )
         {
             free_domheap_pages(page, free_order);
@@ -217,10 +279,14 @@ static unsigned long __init dom0_paging_pages(const 
struct domain *d,
 static unsigned long __init compute_dom0_nr_pages(
     struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
 {
-    unsigned long avail = avail_domheap_pages() + initial_images_nrpages();
-    unsigned long nr_pages, min_pages, max_pages;
+    nodeid_t node;
+    unsigned long avail = 0, nr_pages, min_pages, max_pages;
     bool_t need_paging;
 
+    for_each_node_mask ( node, dom0_nodes )
+        avail += avail_domheap_pages_region(node, 0, 0) +
+                 initial_images_nrpages(node);
+
     /* Reserve memory for further dom0 vcpu-struct allocations... */
     avail -= (d->max_vcpus - 1UL)
              << get_order_from_bytes(sizeof(struct vcpu));
@@ -1230,11 +1296,11 @@ int __init construct_dom0(
 
     printk("Dom0 has maximum %u VCPUs\n", d->max_vcpus);
 
-    cpu = cpumask_first(cpupool0->cpu_valid);
+    cpu = v->processor;
     for ( i = 1; i < d->max_vcpus; i++ )
     {
-        cpu = cpumask_cycle(cpu, cpupool0->cpu_valid);
-        (void)alloc_vcpu(d, i, cpu);
+        cpu = cpumask_cycle(cpu, &dom0_cpus);
+        setup_dom0_vcpu(d, i, cpu);
     }
 
     /*
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 7593533..384df03 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -140,13 +140,21 @@ static void __init parse_acpi_param(char *s)
 static const module_t *__initdata initial_images;
 static unsigned int __initdata nr_initial_images;
 
-unsigned long __init initial_images_nrpages(void)
+unsigned long __init initial_images_nrpages(nodeid_t node)
 {
+    unsigned long node_start = node_start_pfn(node);
+    unsigned long node_end = node_end_pfn(node);
     unsigned long nr;
     unsigned int i;
 
     for ( nr = i = 0; i < nr_initial_images; ++i )
-        nr += PFN_UP(initial_images[i].mod_end);
+    {
+        unsigned long start = initial_images[i].mod_start;
+        unsigned long end = start + PFN_UP(initial_images[i].mod_end);
+
+        if ( end > node_start && node_end > start )
+            nr += min(node_end, end) - max(node_start, start);
+    }
 
     return nr;
 }
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index d96d25b..9060386 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -581,7 +581,7 @@ static struct page_info *alloc_heap_pages(
     struct domain *d)
 {
     unsigned int i, j, zone = 0, nodemask_retry = 0;
-    nodeid_t first_node, node = MEMF_get_node(memflags);
+    nodeid_t first_node, node = MEMF_get_node(memflags), req_node = node;
     unsigned long request = 1UL << order;
     struct page_info *pg;
     nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map;
@@ -593,7 +593,6 @@ static struct page_info *alloc_heap_pages(
 
     if ( node == NUMA_NO_NODE )
     {
-        memflags &= ~MEMF_exact_node;
         if ( d != NULL )
         {
             node = next_node(d->last_alloc_node, nodemask);
@@ -654,7 +653,7 @@ static struct page_info *alloc_heap_pages(
                     goto found;
         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
 
-        if ( memflags & MEMF_exact_node )
+        if ( (memflags & MEMF_exact_node) && req_node != NUMA_NO_NODE )
             goto not_found;
 
         /* Pick next node. */
@@ -671,7 +670,7 @@ static struct page_info *alloc_heap_pages(
         if ( node == first_node )
         {
             /* When we have tried all in nodemask, we fall back to others. */
-            if ( nodemask_retry++ )
+            if ( (memflags & MEMF_exact_node) || nodemask_retry++ )
                 goto not_found;
             nodes_andnot(nodemask, node_online_map, nodemask);
             first_node = node = first_node(nodemask);
diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h
index 762eb02..08bc23a 100644
--- a/xen/include/asm-x86/setup.h
+++ b/xen/include/asm-x86/setup.h
@@ -2,6 +2,7 @@
 #define __X86_SETUP_H_
 
 #include <xen/multiboot.h>
+#include <asm/numa.h>
 
 extern unsigned long xenheap_initial_phys_start;
 
@@ -32,7 +33,7 @@ int construct_dom0(
     void *(*bootstrap_map)(const module_t *),
     char *cmdline);
 
-unsigned long initial_images_nrpages(void);
+unsigned long initial_images_nrpages(nodeid_t node);
 void discard_initial_images(void);
 
 unsigned int dom0_max_vcpus(void);
--
generated by git-patchbot for /home/xen/git/xen.git#master

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.