Xen project Mailing List

[Xen-devel] [PATCH RESEND 12/12] xl: numa-sched: enable specifying node-affinity in VM config file

in a similar way to how it is possible to specify vcpu-affinity. Manual page is updated accordingly. Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx> --- docs/man/xl.cfg.pod.5 | 70 +++++++++++++++++++++++++++++++++------ tools/libxl/libxl_dom.c | 18 ++++++---- tools/libxl/libxl_numa.c | 14 +------- tools/libxl/libxl_utils.h | 12 ++++++- tools/libxl/xl_cmdimpl.c | 80 ++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 159 insertions(+), 35 deletions(-) diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5 index 1c98cb4..1212426 100644 --- a/docs/man/xl.cfg.pod.5 +++ b/docs/man/xl.cfg.pod.5 @@ -144,18 +144,64 @@ run on cpu #3 of the host. =back If this option is not specified, no vcpu to cpu pinning is established, -and the vcpus of the guest can run on all the cpus of the host. - -If we are on a NUMA machine (i.e., if the host has more than one NUMA -node) and this option is not specified, libxl automatically tries to -place the guest on the least possible number of nodes. That, however, -will not affect vcpu pinning, so the guest will still be able to run on -all the cpus, it will just prefer the ones from the node it has been -placed on. A heuristic approach is used for choosing the best node (or -set of nodes), with the goals of maximizing performance for the guest -and, at the same time, achieving efficient utilization of host cpus -and memory. See F<docs/misc/xl-numa-placement.markdown> for more -details. +and the vcpus of the guest can run on all the cpus of the host. If this +option is specified, and no B<nodes=> option is present, the vcpu pinning +mask of each vcpu is utilized to compute its vcpu node-affinity, and the +union of all the vcpus' node-affinities is what constitutes the domain +node-affinity (which drives memory allocations). + +=back + +=item B<nodes="NODE-LIST"> + +List of NUMA nodes on which the memory for the guest is allocated. 
This +also means (starting from Xen 4.3 and if the credit scheduler is used) +the vcpus of the domain prefer to run on those NUMA nodes. By default, +xl (via libxl) guesses. A C<NODE-LIST> may be specified as follows: + +=item "all" + +To specify no particular preference and prevent xl from automatically picking +a (set of) NUMA node(s). In practice, using "all", this domain will have +no NUMA node-affinity and its memory will be spread on all the host's +NUMA nodes. + +=item "0-3,5,^1" + +To specify a NUMA node-affinity with the host NUMA nodes 0,2,3,5. +Combining this with "all" is possible, meaning "all,^7" results in the +memory being allocated on all the host NUMA nodes except node 7, as well +as trying to avoid running the domain's vcpus on the pcpus that belong to +node 7. + +=item ["1", "4"] (or [1, 4]) + +To ask for specific vcpu to NUMA node mapping. That means (in this example), +memory will be allocated on host NUMA nodes 1 and 4 but, at the same time, +vcpu #0 of the guest prefers to run on the pcpus of host NUMA node 1, while +vcpu #1 on the pcpus of host NUMA node 4. + +=back + +If this option is not specified, xl picks up a NUMA node (or a set of NUMA +nodes), according to some heuristics, and uses that as the NUMA node-affinity +for the guest. + +If we are on a NUMA machine (i.e., if the host has more than one NUMA node) +and this option is not specified, libxl automatically tries to place the +guest on the least possible number of nodes. A heuristic approach is used +for choosing the best node (or set of nodes), with the goals of maximizing +performance for the guest and, at the same time, achieving efficient +utilization of host cpus and memory. In this case, all the vcpus of the +guest will have the same vcpu node-affinity. 
+ +Notice that, independently from whether the node-affinity is specified +via this parameter, or automatically decided by libxl, that does not affect +vcpu pinning, so the guest will still be able to run on all the cpus to +which its vcpus are pinned, or all the cpus, if no B<cpus=> option is +provided. + +See F<docs/misc/xl-numa-placement.markdown> for more details. =back diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c index 1812bdc..bc4cf9a 100644 --- a/tools/libxl/libxl_dom.c +++ b/tools/libxl/libxl_dom.c @@ -215,19 +215,21 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid, } /* - * Check if the domain has any CPU affinity. If not, try to build - * up one. In case numa_place_domain() find at least a suitable - * candidate, it will affect info->nodemap accordingly; if it - * does not, it just leaves it as it is. This means (unless - * some weird error manifests) the subsequent call to - * libxl_domain_set_nodeaffinity() will do the actual placement, + * Check if the domain has any pinning or node-affinity and, if not, try + * to build up one. + * + * In case numa_place_domain() find at least a suitable candidate, it will + * affect info->nodemap accordingly; if it does not, it just leaves it as + * it is. This means (unless some weird error manifests) the subsequent + * call to libxl_domain_set_nodeaffinity() will do the actual placement, * whatever that turns out to be. 
*/ if (libxl_defbool_val(info->numa_placement)) { - if (!libxl_bitmap_is_full(&info->cpumap)) { + if (!libxl_bitmap_is_full(&info->cpumap) || + !libxl_bitmap_is_full(&info->nodemap)) { LOG(ERROR, "Can run NUMA placement only if no vcpu " - "affinity is specified"); + "pinning or node-affinity is specified"); return ERROR_INVAL; } diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c index 20c99ac..1026579 100644 --- a/tools/libxl/libxl_numa.c +++ b/tools/libxl/libxl_numa.c @@ -184,7 +184,7 @@ static int nr_vcpus_on_nodes(libxl__gc *gc, libxl_cputopology *tinfo, int vcpus_on_node[]) { libxl_dominfo *dinfo = NULL; - libxl_bitmap dom_nodemap, nodes_counted; + libxl_bitmap nodes_counted; int nr_doms, nr_cpus; int i, j, k; @@ -197,12 +197,6 @@ static int nr_vcpus_on_nodes(libxl__gc *gc, libxl_cputopology *tinfo, return ERROR_FAIL; } - if (libxl_node_bitmap_alloc(CTX, &dom_nodemap, 0) < 0) { - libxl_bitmap_dispose(&nodes_counted); - libxl_dominfo_list_free(dinfo, nr_doms); - return ERROR_FAIL; - } - for (i = 0; i < nr_doms; i++) { libxl_vcpuinfo *vinfo; int nr_dom_vcpus; @@ -211,9 +205,6 @@ static int nr_vcpus_on_nodes(libxl__gc *gc, libxl_cputopology *tinfo, if (vinfo == NULL) continue; - /* Retrieve the domain's node-affinity map */ - libxl_domain_get_nodeaffinity(CTX, dinfo[i].domid, &dom_nodemap); - for (j = 0; j < nr_dom_vcpus; j++) { /* * For each vcpu of each domain, it must have both vcpu-affinity @@ -225,7 +216,7 @@ static int nr_vcpus_on_nodes(libxl__gc *gc, libxl_cputopology *tinfo, int node = tinfo[k].node; if (libxl_bitmap_test(suitable_cpumap, k) && - libxl_bitmap_test(&dom_nodemap, node) && + libxl_bitmap_test(&vinfo[j].nodemap, node) && !libxl_bitmap_test(&nodes_counted, node)) { libxl_bitmap_set(&nodes_counted, node); vcpus_on_node[node]++; @@ -236,7 +227,6 @@ static int nr_vcpus_on_nodes(libxl__gc *gc, libxl_cputopology *tinfo, libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus); } - libxl_bitmap_dispose(&dom_nodemap); 
libxl_bitmap_dispose(&nodes_counted); libxl_dominfo_list_free(dinfo, nr_doms); return 0; diff --git a/tools/libxl/libxl_utils.h b/tools/libxl/libxl_utils.h index 7b84e6a..cac057c 100644 --- a/tools/libxl/libxl_utils.h +++ b/tools/libxl/libxl_utils.h @@ -90,7 +90,7 @@ static inline void libxl_bitmap_set_none(libxl_bitmap *bitmap) { memset(bitmap->map, 0, bitmap->size); } -static inline int libxl_bitmap_cpu_valid(libxl_bitmap *bitmap, int bit) +static inline int libxl_bitmap_valid(libxl_bitmap *bitmap, int bit) { return bit >= 0 && bit < (bitmap->size * 8); } @@ -125,6 +125,16 @@ static inline int libxl_node_bitmap_alloc(libxl_ctx *ctx, return libxl_bitmap_alloc(ctx, nodemap, max_nodes); } +static inline int libxl_bitmap_cpu_valid(libxl_bitmap *cpumap, int cpu) +{ + return libxl_bitmap_valid(cpumap, cpu); +} + +static inline int libxl_bitmap_node_valid(libxl_bitmap *nodemap, int node) +{ + return libxl_bitmap_valid(nodemap, node); +} + /* Populate cpumap with the cpus spanned by the nodes in nodemap */ int libxl_nodemap_to_cpumap(libxl_ctx *ctx, const libxl_bitmap *nodemap, diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c index 1659259..a035162 100644 --- a/tools/libxl/xl_cmdimpl.c +++ b/tools/libxl/xl_cmdimpl.c @@ -76,8 +76,9 @@ xlchild children[child_max]; static const char *common_domname; static int fd_lock = -1; -/* Stash for specific vcpu to pcpu mappping */ +/* Stash for specific vcpu to pcpu and vcpu to node mappping */ static int *vcpu_to_pcpu; +static int *vcpu_to_node; static const char savefileheader_magic[32]= "Xen saved domain, xl format\n \0 \r"; @@ -670,7 +671,7 @@ static void parse_config_data(const char *config_source, const char *buf; long l; XLU_Config *config; - XLU_ConfigList *cpus, *vbds, *nics, *pcis, *cvfbs, *cpuids, *vtpms; + XLU_ConfigList *cpus, *nodes, *vbds, *nics, *pcis, *cvfbs, *cpuids, *vtpms; XLU_ConfigList *ioports, *irqs, *iomem; int num_ioports, num_irqs, num_iomem; int pci_power_mgmt = 0; @@ -846,6 +847,53 @@ 
static void parse_config_data(const char *config_source, libxl_defbool_set(&b_info->numa_placement, false); } + if (!xlu_cfg_get_list (config, "nodes", &nodes, 0, 1)) { + int n_cpus = 0; + + if (libxl_node_bitmap_alloc(ctx, &b_info->nodemap, 0)) { + fprintf(stderr, "Unable to allocate nodemap\n"); + exit(1); + } + + /* + * As above, use a temporary storage for the single vcpus' + * node-affinities. + */ + vcpu_to_node = xmalloc(sizeof(int) * b_info->max_vcpus); + memset(vcpu_to_node, -1, sizeof(int) * b_info->max_vcpus); + + libxl_bitmap_set_none(&b_info->nodemap); + while ((buf = xlu_cfg_get_listitem(nodes, n_cpus)) != NULL) { + i = atoi(buf); + if (!libxl_bitmap_node_valid(&b_info->nodemap, i)) { + fprintf(stderr, "node %d illegal\n", i); + exit(1); + } + libxl_bitmap_set(&b_info->nodemap, i); + if (n_cpus < b_info->max_vcpus) + vcpu_to_node[n_cpus] = i; + n_cpus++; + } + + /* We have a nodemap, disable automatic placement */ + libxl_defbool_set(&b_info->numa_placement, false); + } + else if (!xlu_cfg_get_string (config, "nodes", &buf, 0)) { + char *buf2 = strdup(buf); + + if (libxl_node_bitmap_alloc(ctx, &b_info->nodemap, 0)) { + fprintf(stderr, "Unable to allocate nodemap\n"); + exit(1); + } + + libxl_bitmap_set_none(&b_info->nodemap); + if (parse_bitmap_range(buf2, &b_info->nodemap)) + exit(1); + free(buf2); + + libxl_defbool_set(&b_info->numa_placement, false); + } + if (!xlu_cfg_get_long (config, "memory", &l, 0)) { b_info->max_memkb = l * 1024; b_info->target_memkb = b_info->max_memkb; @@ -2205,6 +2253,34 @@ start: free(vcpu_to_pcpu); vcpu_to_pcpu = NULL; } + /* And do the same for single vcpu to node-affinity mapping */ + if (vcpu_to_node) { + libxl_bitmap vcpu_nodemap; + + ret = libxl_node_bitmap_alloc(ctx, &vcpu_nodemap, 0); + if (ret) + goto error_out; + for (i = 0; i < d_config.b_info.max_vcpus; i++) { + + if (vcpu_to_node[i] != -1) { + libxl_bitmap_set_none(&vcpu_nodemap); + libxl_bitmap_set(&vcpu_nodemap, vcpu_to_node[i]); + } else { + 
libxl_bitmap_set_any(&vcpu_nodemap); + } + if (libxl_set_vcpunodeaffinity(ctx, domid, i, &vcpu_nodemap)) { + fprintf(stderr, "setting node-affinity failed" + " on vcpu `%d'.\n", i); + libxl_bitmap_dispose(&vcpu_nodemap); + free(vcpu_to_node); + ret = ERROR_FAIL; + goto error_out; + } + } + libxl_bitmap_dispose(&vcpu_nodemap); + free(vcpu_to_node); vcpu_to_node = NULL; + } + ret = libxl_userdata_store(ctx, domid, "xl", config_data, config_len); if (ret) { _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.