 
	
| [Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [PATCH 1/5] x86: allow specifying the NUMA nodes Dom0 should run on
 On 26/02/15 13:52, Jan Beulich wrote: > ... by introducing a "dom0_nodes" option augmenting the "dom0_mem" and > "dom0_max_vcpus" ones. > > Note that this gives meaning to MEMF_exact_node specified alone (i.e. > implicitly combined with NUMA_NO_NODE): In such a case any node inside > the domain's node mask is acceptable, but no other node. This changed > behavior is (implicitly) being exposed through the memop hypercalls. > > Note further that this change doesn't take care of moving the initrd > image into memory matching Dom0's affinity when the initrd doesn't get > copied (because of being part of the initial mapping) anyway. > > Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> A couple of comments, but Reviewed-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> > --- > I'm uncertain whether range restricting the PXMs permitted for Dom0 is > the right approach (matching what other NUMA code did until recently), > or whether we would instead want to simply limit the number of PXMs we > can handler there (i.e. using a static array instead of a static > bitmap). I am not quite sure what you mean by this. > > --- a/docs/misc/xen-command-line.markdown > +++ b/docs/misc/xen-command-line.markdown > @@ -540,6 +540,15 @@ any dom0 autoballooning feature present > _xl.conf(5)_ man page or [Xen Best > > Practices](http://wiki.xen.org/wiki/Xen_Best_Practices#Xen_dom0_dedicated_memory_and_preventing_dom0_memory_ballooning). > > +### dom0\_nodes > + > +> `= <integer>[,...]` > + > +Specify the NUMA nodes to place Dom0 on. Defaults for vCPU-s created > +and memory assigned to Dom0 will be adjusted to match the node > +restrictions set up here. Note that the values to be specified here are > +ACPI PXM ones, not Xen internal node numbers. > + > ### dom0\_shadow > > `= <boolean>` > > --- a/xen/arch/x86/domain_build.c > +++ b/xen/arch/x86/domain_build.c > @@ -100,11 +100,58 @@ static void __init parse_dom0_max_vcpus( > } > custom_param("dom0_max_vcpus", parse_dom0_max_vcpus); > > +#define MAX_DOM0_PXM 255 > +static __initdata DECLARE_BITMAP(dom0_pxms, MAX_DOM0_PXM + 1); > + > +static void __init parse_dom0_nodes(const char *s) > +{ > + do { > + unsigned int pxm = simple_strtoul(s, &s, 0); > + > + if ( pxm <= MAX_DOM0_PXM ) > + __set_bit(pxm, dom0_pxms); > + } while ( *s++ == ',' ); > +} > +custom_param("dom0_nodes", parse_dom0_nodes); > + > +static cpumask_t __initdata dom0_cpus; > + > +static struct vcpu *__init setup_vcpu(struct domain *d, unsigned int vcpu_id, > + unsigned int cpu) I would be tempted to name this setup_dom0_vcpu() to bring it in line with other dom0 construction functions, and to make it clear that it is not for use on general domains. > +{ > + struct vcpu *v = alloc_vcpu(d, vcpu_id, cpu); > + > + if ( v ) > + { > + if ( !d->is_pinned ) > + cpumask_copy(v->cpu_hard_affinity, &dom0_cpus); > + cpumask_copy(v->cpu_soft_affinity, &dom0_cpus); > + } > + > + return v; > +} > + > +static nodemask_t __initdata dom0_nodes; > + > unsigned int __init dom0_max_vcpus(void) > { > - unsigned max_vcpus; > + unsigned int pxm, max_vcpus; > + nodeid_t node; > + > + for ( pxm = 0; pxm <= MAX_DOM0_PXM; ++pxm ) > + if ( test_bit(pxm, dom0_pxms) && > + (node = pxm_to_node(pxm)) != NUMA_NO_NODE ) > + node_set(node, dom0_nodes); for_each_set_bit() ? It would simply the above loop slightly, and is more efficient. > + nodes_and(dom0_nodes, dom0_nodes, node_online_map); > + if ( nodes_empty(dom0_nodes) ) > + dom0_nodes = node_online_map; > + for_each_node_mask ( node, dom0_nodes ) > + cpumask_or(&dom0_cpus, &dom0_cpus, &node_to_cpumask(node)); > + cpumask_and(&dom0_cpus, &dom0_cpus, cpupool0->cpu_valid); > + if ( cpumask_empty(&dom0_cpus) ) > + cpumask_copy(&dom0_cpus, cpupool0->cpu_valid); > > - max_vcpus = num_cpupool_cpus(cpupool0); > + max_vcpus = cpumask_weight(&dom0_cpus); > if ( opt_dom0_max_vcpus_min > max_vcpus ) > max_vcpus = opt_dom0_max_vcpus_min; > if ( opt_dom0_max_vcpus_max < max_vcpus ) > @@ -119,12 +166,15 @@ struct vcpu *__init alloc_dom0_vcpu0(str > { > unsigned int max_vcpus = dom0_max_vcpus(); > > + dom0->node_affinity = dom0_nodes; > + dom0->auto_node_affinity = !!bitmap_empty(dom0_pxms, MAX_DOM0_PXM + 1); > + > dom0->vcpu = xzalloc_array(struct vcpu *, max_vcpus); > if ( !dom0->vcpu ) > return NULL; > dom0->max_vcpus = max_vcpus; > > - return alloc_vcpu(dom0, 0, 0); > + return setup_vcpu(dom0, 0, cpumask_first(&dom0_cpus)); > } > > #ifdef CONFIG_SHADOW_PAGING > @@ -156,7 +206,7 @@ static struct page_info * __init alloc_c > struct domain *d, unsigned long max_pages) > { > static unsigned int __initdata last_order = MAX_ORDER; > - static unsigned int __initdata memflags = MEMF_no_dma; > + static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node; > struct page_info *page; > unsigned int order = get_order_from_pages(max_pages), free_order; > > @@ -190,7 +240,7 @@ static struct page_info * __init alloc_c > > if ( d->tot_pages + (1 << order) > d->max_pages ) > continue; > - pg2 = alloc_domheap_pages(d, order, 0); > + pg2 = alloc_domheap_pages(d, order, MEMF_exact_node); > if ( pg2 > page ) > { > free_domheap_pages(page, free_order); > @@ -217,10 +267,14 @@ static unsigned long __init dom0_paging_ > static unsigned long __init compute_dom0_nr_pages( > struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len) > { > - unsigned long avail = avail_domheap_pages() + initial_images_nrpages(); > - unsigned long nr_pages, min_pages, max_pages; > + nodeid_t node; > + unsigned long avail = 0, nr_pages, min_pages, max_pages; > bool_t need_paging; > > + for_each_node_mask ( node, dom0_nodes ) > + avail += avail_domheap_pages_region(node, 0, 0) + > + initial_images_nrpages(node); > + > /* Reserve memory for further dom0 vcpu-struct allocations... */ > avail -= (d->max_vcpus - 1UL) > << get_order_from_bytes(sizeof(struct vcpu)); > @@ -1230,11 +1284,11 @@ int __init construct_dom0( > > printk("Dom0 has maximum %u VCPUs\n", d->max_vcpus); > > - cpu = cpumask_first(cpupool0->cpu_valid); > + cpu = v->processor; > for ( i = 1; i < d->max_vcpus; i++ ) > { > - cpu = cpumask_cycle(cpu, cpupool0->cpu_valid); > - (void)alloc_vcpu(d, i, cpu); > + cpu = cpumask_cycle(cpu, &dom0_cpus); > + setup_vcpu(d, i, cpu); I know this is a preexisting bug, but you might want to fix it as you are changing the affected codepath. Construction of dom0 should fail if any alloc_vcpu() call fails (and now setup_vcpu()). Nothing currently catches a failure to allocate vcpu 1..$N, and this patch introduces a way for dom0 be more memory-constrained than previously. > } > > /* > --- a/xen/arch/x86/setup.c > +++ b/xen/arch/x86/setup.c > @@ -140,13 +140,21 @@ static void __init parse_acpi_param(char > static const module_t *__initdata initial_images; > static unsigned int __initdata nr_initial_images; > > -unsigned long __init initial_images_nrpages(void) > +unsigned long __init initial_images_nrpages(nodeid_t node) > { > + unsigned long node_start = node_start_pfn(node); > + unsigned long node_end = node_end_pfn(node); > unsigned long nr; > unsigned int i; > > for ( nr = i = 0; i < nr_initial_images; ++i ) > - nr += PFN_UP(initial_images[i].mod_end); > + { > + unsigned long start = initial_images[i].mod_start; > + unsigned long end = start + PFN_UP(initial_images[i].mod_end); > + > + if ( end > node_start && node_end > start ) > + nr += min(node_end, end) - max(node_start, start); > + } > > return nr; > } > --- a/xen/common/page_alloc.c > +++ b/xen/common/page_alloc.c > @@ -581,7 +581,7 @@ static struct page_info *alloc_heap_page > struct domain *d) > { > unsigned int i, j, zone = 0, nodemask_retry = 0; > - nodeid_t first_node, node = MEMF_get_node(memflags); > + nodeid_t first_node, node = MEMF_get_node(memflags), req_node = node; > unsigned long request = 1UL << order; > struct page_info *pg; > nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map; > @@ -593,7 +593,6 @@ static struct page_info *alloc_heap_page > > if ( node == NUMA_NO_NODE ) > { > - memflags &= ~MEMF_exact_node; > if ( d != NULL ) > { > node = next_node(d->last_alloc_node, nodemask); > @@ -654,7 +653,7 @@ static struct page_info *alloc_heap_page > goto found; > } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */ > > - if ( memflags & MEMF_exact_node ) > + if ( (memflags & MEMF_exact_node) && req_node != NUMA_NO_NODE ) > goto not_found; > > /* Pick next node. */ > @@ -671,7 +670,7 @@ static struct page_info *alloc_heap_page > if ( node == first_node ) > { > /* When we have tried all in nodemask, we fall back to others. */ > - if ( nodemask_retry++ ) > + if ( (memflags & MEMF_exact_node) || nodemask_retry++ ) > goto not_found; > nodes_andnot(nodemask, node_online_map, nodemask); > first_node = node = first_node(nodemask); > --- a/xen/include/asm-x86/setup.h > +++ b/xen/include/asm-x86/setup.h > @@ -2,6 +2,7 @@ > #define __X86_SETUP_H_ > > #include <xen/multiboot.h> > +#include <asm/numa.h> > > extern unsigned long xenheap_initial_phys_start; > > @@ -32,7 +33,7 @@ int construct_dom0( > void *(*bootstrap_map)(const module_t *), > char *cmdline); > > -unsigned long initial_images_nrpages(void); > +unsigned long initial_images_nrpages(nodeid_t node); > void discard_initial_images(void); > > unsigned int dom0_max_vcpus(void); > > _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel 
 
 | 
|  | Lists.xenproject.org is hosted with RackSpace, monitoring our |