[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH 1/5] x86: allow specifying the NUMA nodes Dom0 should run on



On 26/02/15 13:52, Jan Beulich wrote:
> ... by introducing a "dom0_nodes" option augmenting the "dom0_mem" and
> "dom0_max_vcpus" ones.
>
> Note that this gives meaning to MEMF_exact_node specified alone (i.e.
> implicitly combined with NUMA_NO_NODE): In such a case any node inside
> the domain's node mask is acceptable, but no other node. This changed
> behavior is (implicitly) being exposed through the memop hypercalls.
>
> Note further that this change doesn't take care of moving the initrd
> image into memory matching Dom0's affinity when the initrd doesn't get
> copied (because of being part of the initial mapping) anyway.
>
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>

A couple of comments, but Reviewed-by: Andrew Cooper
<andrew.cooper3@xxxxxxxxxx>

> ---
> I'm uncertain whether range restricting the PXMs permitted for Dom0 is
> the right approach (matching what other NUMA code did until recently),
> or whether we would instead want to simply limit the number of PXMs we
> can handler there (i.e. using a static array instead of a static
> bitmap).

I am not quite sure what you mean by this.

>
> --- a/docs/misc/xen-command-line.markdown
> +++ b/docs/misc/xen-command-line.markdown
> @@ -540,6 +540,15 @@ any dom0 autoballooning feature present 
>  _xl.conf(5)_ man page or [Xen Best
>  
> Practices](http://wiki.xen.org/wiki/Xen_Best_Practices#Xen_dom0_dedicated_memory_and_preventing_dom0_memory_ballooning).
>  
> +### dom0\_nodes
> +
> +> `= <integer>[,...]`
> +
> +Specify the NUMA nodes to place Dom0 on. Defaults for vCPU-s created
> +and memory assigned to Dom0 will be adjusted to match the node
> +restrictions set up here. Note that the values to be specified here are
> +ACPI PXM ones, not Xen internal node numbers.
> +
>  ### dom0\_shadow
>  > `= <boolean>`
>  
> --- a/xen/arch/x86/domain_build.c
> +++ b/xen/arch/x86/domain_build.c
> @@ -100,11 +100,58 @@ static void __init parse_dom0_max_vcpus(
>  }
>  custom_param("dom0_max_vcpus", parse_dom0_max_vcpus);
>  
> +#define MAX_DOM0_PXM 255
> +static __initdata DECLARE_BITMAP(dom0_pxms, MAX_DOM0_PXM + 1);
> +
> +static void __init parse_dom0_nodes(const char *s)
> +{
> +    do {
> +        unsigned int pxm = simple_strtoul(s, &s, 0);
> +
> +        if ( pxm <= MAX_DOM0_PXM )
> +            __set_bit(pxm, dom0_pxms);
> +    } while ( *s++ == ',' );
> +}
> +custom_param("dom0_nodes", parse_dom0_nodes);
> +
> +static cpumask_t __initdata dom0_cpus;
> +
> +static struct vcpu *__init setup_vcpu(struct domain *d, unsigned int vcpu_id,
> +                                      unsigned int cpu)

I would be tempted to name this setup_dom0_vcpu() to bring it in line
with other dom0 construction functions, and to make it clear that it is
not for use on general domains.

> +{
> +    struct vcpu *v = alloc_vcpu(d, vcpu_id, cpu);
> +
> +    if ( v )
> +    {
> +        if ( !d->is_pinned )
> +            cpumask_copy(v->cpu_hard_affinity, &dom0_cpus);
> +        cpumask_copy(v->cpu_soft_affinity, &dom0_cpus);
> +    }
> +
> +    return v;
> +}
> +
> +static nodemask_t __initdata dom0_nodes;
> +
>  unsigned int __init dom0_max_vcpus(void)
>  {
> -    unsigned max_vcpus;
> +    unsigned int pxm, max_vcpus;
> +    nodeid_t node;
> +
> +    for ( pxm = 0; pxm <= MAX_DOM0_PXM; ++pxm )
> +        if ( test_bit(pxm, dom0_pxms) &&
> +             (node = pxm_to_node(pxm)) != NUMA_NO_NODE )
> +            node_set(node, dom0_nodes);

for_each_set_bit() ?  It would simply the above loop slightly, and is
more efficient.

> +    nodes_and(dom0_nodes, dom0_nodes, node_online_map);
> +    if ( nodes_empty(dom0_nodes) )
> +        dom0_nodes = node_online_map;
> +    for_each_node_mask ( node, dom0_nodes )
> +        cpumask_or(&dom0_cpus, &dom0_cpus, &node_to_cpumask(node));
> +    cpumask_and(&dom0_cpus, &dom0_cpus, cpupool0->cpu_valid);
> +    if ( cpumask_empty(&dom0_cpus) )
> +        cpumask_copy(&dom0_cpus, cpupool0->cpu_valid);
>  
> -    max_vcpus = num_cpupool_cpus(cpupool0);
> +    max_vcpus = cpumask_weight(&dom0_cpus);
>      if ( opt_dom0_max_vcpus_min > max_vcpus )
>          max_vcpus = opt_dom0_max_vcpus_min;
>      if ( opt_dom0_max_vcpus_max < max_vcpus )
> @@ -119,12 +166,15 @@ struct vcpu *__init alloc_dom0_vcpu0(str
>  {
>      unsigned int max_vcpus = dom0_max_vcpus();
>  
> +    dom0->node_affinity = dom0_nodes;
> +    dom0->auto_node_affinity = !!bitmap_empty(dom0_pxms, MAX_DOM0_PXM + 1);
> +
>      dom0->vcpu = xzalloc_array(struct vcpu *, max_vcpus);
>      if ( !dom0->vcpu )
>          return NULL;
>      dom0->max_vcpus = max_vcpus;
>  
> -    return alloc_vcpu(dom0, 0, 0);
> +    return setup_vcpu(dom0, 0, cpumask_first(&dom0_cpus));
>  }
>  
>  #ifdef CONFIG_SHADOW_PAGING
> @@ -156,7 +206,7 @@ static struct page_info * __init alloc_c
>      struct domain *d, unsigned long max_pages)
>  {
>      static unsigned int __initdata last_order = MAX_ORDER;
> -    static unsigned int __initdata memflags = MEMF_no_dma;
> +    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
>      struct page_info *page;
>      unsigned int order = get_order_from_pages(max_pages), free_order;
>  
> @@ -190,7 +240,7 @@ static struct page_info * __init alloc_c
>  
>          if ( d->tot_pages + (1 << order) > d->max_pages )
>              continue;
> -        pg2 = alloc_domheap_pages(d, order, 0);
> +        pg2 = alloc_domheap_pages(d, order, MEMF_exact_node);
>          if ( pg2 > page )
>          {
>              free_domheap_pages(page, free_order);
> @@ -217,10 +267,14 @@ static unsigned long __init dom0_paging_
>  static unsigned long __init compute_dom0_nr_pages(
>      struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
>  {
> -    unsigned long avail = avail_domheap_pages() + initial_images_nrpages();
> -    unsigned long nr_pages, min_pages, max_pages;
> +    nodeid_t node;
> +    unsigned long avail = 0, nr_pages, min_pages, max_pages;
>      bool_t need_paging;
>  
> +    for_each_node_mask ( node, dom0_nodes )
> +        avail += avail_domheap_pages_region(node, 0, 0) +
> +                 initial_images_nrpages(node);
> +
>      /* Reserve memory for further dom0 vcpu-struct allocations... */
>      avail -= (d->max_vcpus - 1UL)
>               << get_order_from_bytes(sizeof(struct vcpu));
> @@ -1230,11 +1284,11 @@ int __init construct_dom0(
>  
>      printk("Dom0 has maximum %u VCPUs\n", d->max_vcpus);
>  
> -    cpu = cpumask_first(cpupool0->cpu_valid);
> +    cpu = v->processor;
>      for ( i = 1; i < d->max_vcpus; i++ )
>      {
> -        cpu = cpumask_cycle(cpu, cpupool0->cpu_valid);
> -        (void)alloc_vcpu(d, i, cpu);
> +        cpu = cpumask_cycle(cpu, &dom0_cpus);
> +        setup_vcpu(d, i, cpu);

I know this is a preexisting bug, but you might want to fix it as you
are changing the affected codepath.

Construction of dom0 should fail if any alloc_vcpu() call fails (and now
setup_vcpu()).   Nothing currently catches a failure to allocate vcpu
1..$N, and this patch introduces a way for dom0 be more
memory-constrained than previously.

>      }
>  
>      /*
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -140,13 +140,21 @@ static void __init parse_acpi_param(char
>  static const module_t *__initdata initial_images;
>  static unsigned int __initdata nr_initial_images;
>  
> -unsigned long __init initial_images_nrpages(void)
> +unsigned long __init initial_images_nrpages(nodeid_t node)
>  {
> +    unsigned long node_start = node_start_pfn(node);
> +    unsigned long node_end = node_end_pfn(node);
>      unsigned long nr;
>      unsigned int i;
>  
>      for ( nr = i = 0; i < nr_initial_images; ++i )
> -        nr += PFN_UP(initial_images[i].mod_end);
> +    {
> +        unsigned long start = initial_images[i].mod_start;
> +        unsigned long end = start + PFN_UP(initial_images[i].mod_end);
> +
> +        if ( end > node_start && node_end > start )
> +            nr += min(node_end, end) - max(node_start, start);
> +    }
>  
>      return nr;
>  }
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -581,7 +581,7 @@ static struct page_info *alloc_heap_page
>      struct domain *d)
>  {
>      unsigned int i, j, zone = 0, nodemask_retry = 0;
> -    nodeid_t first_node, node = MEMF_get_node(memflags);
> +    nodeid_t first_node, node = MEMF_get_node(memflags), req_node = node;
>      unsigned long request = 1UL << order;
>      struct page_info *pg;
>      nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map;
> @@ -593,7 +593,6 @@ static struct page_info *alloc_heap_page
>  
>      if ( node == NUMA_NO_NODE )
>      {
> -        memflags &= ~MEMF_exact_node;
>          if ( d != NULL )
>          {
>              node = next_node(d->last_alloc_node, nodemask);
> @@ -654,7 +653,7 @@ static struct page_info *alloc_heap_page
>                      goto found;
>          } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
>  
> -        if ( memflags & MEMF_exact_node )
> +        if ( (memflags & MEMF_exact_node) && req_node != NUMA_NO_NODE )
>              goto not_found;
>  
>          /* Pick next node. */
> @@ -671,7 +670,7 @@ static struct page_info *alloc_heap_page
>          if ( node == first_node )
>          {
>              /* When we have tried all in nodemask, we fall back to others. */
> -            if ( nodemask_retry++ )
> +            if ( (memflags & MEMF_exact_node) || nodemask_retry++ )
>                  goto not_found;
>              nodes_andnot(nodemask, node_online_map, nodemask);
>              first_node = node = first_node(nodemask);
> --- a/xen/include/asm-x86/setup.h
> +++ b/xen/include/asm-x86/setup.h
> @@ -2,6 +2,7 @@
>  #define __X86_SETUP_H_
>  
>  #include <xen/multiboot.h>
> +#include <asm/numa.h>
>  
>  extern unsigned long xenheap_initial_phys_start;
>  
> @@ -32,7 +33,7 @@ int construct_dom0(
>      void *(*bootstrap_map)(const module_t *),
>      char *cmdline);
>  
> -unsigned long initial_images_nrpages(void);
> +unsigned long initial_images_nrpages(nodeid_t node);
>  void discard_initial_images(void);
>  
>  unsigned int dom0_max_vcpus(void);
>
>



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.