
[Xen-devel] [PATCH RESEND 11/12] xl: numa-sched: enable getting/specifying per-vcpu node-affinity



by showing it, upon request (the '-n' switch), as part of the
output of `xl vcpu-list'. Such output looks like this:

# xl vcpu-list -n

Name          ID  VCPU   CPU State   Time(s) CPU Affinity / NUMA Affinity
Domain-0       0     0    8   -b-       4.4  any cpu / any node
Domain-0       0     1    0   -b-       1.2  any cpu / any node
vm-test        1     0    8   -b-       2.4  any cpu / 1
vm-test        1     1   13   -b-       3.3  any cpu / 1

The "vcpu-pinning / vcpu-affinity" part may indeed look
weird, as it is different from the other fields.
Unfortunately, it's very hard to properly format it in
columns, since there is no such things as minimal or
maximal lengths for the bitmaps being printed.

Specifying a particular node-affinity happens via a new xl
command, `xl vcpu-node-affinity'. Introducing a "numa mode"
to `xl vcpu-pin' was the other option, but doing that would
probably fuel the confusion between vcpu pinning and NUMA
node-affinity, and that is certainly something we do not
want.
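
For example (hypothetical domain name and node numbers, following the
syntax described in the manual page hunk below), asking for vcpu 1 of
vm-test to prefer NUMA node 1, or for all of its vcpus to prefer nodes
0-1, would look like this:

# xl vcpu-node-affinity vm-test 1 1
# xl vcpu-node-affinity vm-test all 0-1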

While at it, the implementation of `xl vcpu-list' is reworked
a little bit, making it more similar to that of `xl list'
and more compliant with the libxl programming paradigm (e.g.,
regarding allocating at the beginning and freeing on exit).
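
For reference, the shape being converged on is roughly the following
(just a sketch of the pattern, not code from this patch; it assumes
xl_cmdimpl.c's usual includes and xl's global libxl context `ctx', and
the command name is made up):

int main_somecmd(int argc, char **argv)
{
    /* hypothetical command handler, shown only to illustrate the pattern */
    libxl_physinfo physinfo;
    int rc = -1;

    /* initialize/allocate everything that needs disposing up front */
    libxl_physinfo_init(&physinfo);

    if (libxl_get_physinfo(ctx, &physinfo) != 0) {
        fprintf(stderr, "libxl_physinfo failed.\n");
        goto out;          /* every error path funnels through "out" */
    }

    /* ... the actual work, possibly more "goto out" on failure ... */

    rc = 0;
 out:
    libxl_physinfo_dispose(&physinfo);   /* single cleanup point */
    return rc;
}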

xl manual page is updated accordingly.

Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>
---
 docs/man/xl.pod.1         |   25 +++++
 tools/libxl/xl.h          |    1 
 tools/libxl/xl_cmdimpl.c  |  210 ++++++++++++++++++++++++++++++++++++---------
 tools/libxl/xl_cmdtable.c |    9 ++
 4 files changed, 201 insertions(+), 44 deletions(-)

diff --git a/docs/man/xl.pod.1 b/docs/man/xl.pod.1
index e7b9de2..e1894b4 100644
--- a/docs/man/xl.pod.1
+++ b/docs/man/xl.pod.1
@@ -614,11 +614,21 @@ quietly ignored.
 Some guests may need to actually bring the newly added CPU online
 after B<vcpu-set>, go to B<SEE ALSO> section for information.
 
-=item B<vcpu-list> [I<domain-id>]
+=item B<vcpu-list> [I<OPTIONS>] [I<domain-id>, ...]
 
 Lists VCPU information for a specific domain.  If no domain is
 specified, VCPU information for all domains will be provided.
 
+B<OPTIONS>
+
+=over 4
+
+=item B<-n>, B<--numa>
+
+Print the NUMA node-affinity of each VCPU.
+
+=back
+
 =item B<vcpu-pin> I<domain-id> I<vcpu> I<cpus>
 
 Pins the VCPU to only run on the specific CPUs.  The keyword
@@ -630,6 +640,19 @@ different run state is appropriate.  Pinning can be used to restrict
 this, by ensuring certain VCPUs can only run on certain physical
 CPUs.
 
+=item B<vcpu-node-affinity> I<domain-id> I<vcpu> I<nodes>
+
+Specifies the I<nodes> on which the I<vcpu> prefers to run. The
+keyword B<all> can be used to apply the I<nodes> list to all the
+VCPUs in the domain.
+
+Normally VCPUs can float within the set of CPUs that they are pinned
+to (see B<vcpu-pin> above). Specifying a node-affinity does not change
+that, but, if using the credit scheduler, the VCPUs will strongly prefer
+running on the NUMA nodes they have node-affinity with. If using any
+scheduler other than credit, node-affinity is just ignored (credit2
+support is planned but not ready yet).
+
 =item B<vm-list>
 
 Prints information about guests. This list excludes information about
diff --git a/tools/libxl/xl.h b/tools/libxl/xl.h
index e005c39..b708199 100644
--- a/tools/libxl/xl.h
+++ b/tools/libxl/xl.h
@@ -59,6 +59,7 @@ int main_create(int argc, char **argv);
 int main_config_update(int argc, char **argv);
 int main_button_press(int argc, char **argv);
 int main_vcpupin(int argc, char **argv);
+int main_vcpuaff(int argc, char **argv);
 int main_vcpuset(int argc, char **argv);
 int main_memmax(int argc, char **argv);
 int main_memset(int argc, char **argv);
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index f6239d5..1659259 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -542,10 +542,32 @@ static int parse_range(const char *str, unsigned long *a, unsigned long *b)
 }
 
 /*
- * Add or removes a specific set of cpus (specified in str, either as
- * single cpus or as entire NUMA nodes) to/from cpumap.
+ * Set or reset some bits from bitmap, as specified in str.
+ *
+ * If str is "all", bitmap gets fully set. If str contains a specific
+ * value or range (e.g., "2" or "4-6"), the corresponding bits are set (or
+ * unset, if the value/range is prefixed with a "^"). In this case, we don't
+ * really care whether the value range is supposed to represent cpus or nodes
+ * (that is to say whether bitmap is supposed to be a nodemap or a cpumap),
+ * we just set and unset bits.
+ *
+ * In case str is prefixed by "nodes:" (or "^nodes:") we assume that the
+ * caller is dealing with a cpumap and wants to specify what bits to (re)set
+ * NUMA node-wise. We therefore act as follows:
+ *  - consider the value/range a node or a set of nodes;
+ *  - obtain the cpumap corresponding to such node/set of nodes;
+ *  - set (or reset, in the "^" case) in bitmap what ended up
+ *    in such cpumap.
+ *
+ * This means that, for instance, if calling this with str="3-6", bits
+ * 3-6 in bitmap will be set, and it's up to the caller to know whether
+ * that meant cpus 3,4,5,6 or NUMA nodes 3,4,5,6. On the other hand, if
+ * calling this with str="nodes:2", bits corresponding to all the cpus
+ * of NUMA node 2 will be the ones that are set in bitmap.
+ *
+ * This may look tricky, but it avoids quite a bit of code duplication.
  */
-static int update_cpumap_range(const char *str, libxl_bitmap *cpumap)
+static int update_bitmap_range(const char *str, libxl_bitmap *bitmap)
 {
     unsigned long ida, idb;
     libxl_bitmap node_cpumap;
@@ -571,7 +593,7 @@ static int update_cpumap_range(const char *str, libxl_bitmap *cpumap)
     }
 
     if (STR_HAS_PREFIX(str, "all")) {
-        libxl_bitmap_set_any(cpumap);
+        libxl_bitmap_set_any(bitmap);
         goto out;
     }
 
@@ -595,13 +617,13 @@ static int update_cpumap_range(const char *str, libxl_bitmap *cpumap)
 
             /* Add/Remove all the cpus in the node cpumap */
             libxl_for_each_set_bit(i, node_cpumap) {
-                is_not ? libxl_bitmap_reset(cpumap, i) :
-                         libxl_bitmap_set(cpumap, i);
+                is_not ? libxl_bitmap_reset(bitmap, i) :
+                         libxl_bitmap_set(bitmap, i);
             }
         } else {
             /* Add/Remove this cpu */
-            is_not ? libxl_bitmap_reset(cpumap, ida) :
-                     libxl_bitmap_set(cpumap, ida);
+            is_not ? libxl_bitmap_reset(bitmap, ida) :
+                     libxl_bitmap_set(bitmap, ida);
         }
         ida++;
     }
@@ -612,18 +634,25 @@ static int update_cpumap_range(const char *str, libxl_bitmap *cpumap)
 }
 
 /*
- * Takes a string representing a set of cpus (specified either as
- * single cpus or as eintire NUMA nodes) and turns it into the
- * corresponding libxl_bitmap (in cpumap).
+ * Takes a string representing one or more values or ranges of values
+ * separated by ',' (e.g., something like "3,2-6,8"), splits it into
+ * substrings and, for each of them, updates bitmap accordingly. This may
+ * result in bits in bitmap being both set and reset, since it is possible
+ * to prefix both values and ranges with "^" (meaning "not").
+ *
+ * It is also possible for the caller to indicate that a specific value or
+ * range should be treated specially, i.e., resulting in more than just one
+ * bit (or the specified range of bits) being set or reset. For details on
+ * that, see update_bitmap_range() above.
  */
-static int vcpupin_parse(char *cpu, libxl_bitmap *cpumap)
+static int parse_bitmap_range(char *cpu, libxl_bitmap *bitmap)
 {
     char *ptr, *saveptr = NULL;
     int rc = 0;
 
     for (ptr = strtok_r(cpu, ",", &saveptr); ptr;
          ptr = strtok_r(NULL, ",", &saveptr)) {
-        rc = update_cpumap_range(ptr, cpumap);
+        rc = update_bitmap_range(ptr, bitmap);
         if (rc)
             break;
     }
@@ -810,7 +839,7 @@ static void parse_config_data(const char *config_source,
         }
 
         libxl_bitmap_set_none(&b_info->cpumap);
-        if (vcpupin_parse(buf2, &b_info->cpumap))
+        if (parse_bitmap_range(buf2, &b_info->cpumap))
             exit(1);
         free(buf2);
 
@@ -4483,7 +4512,9 @@ int main_button_press(int argc, char **argv)
 
 static void print_vcpuinfo(uint32_t tdomid,
                            const libxl_vcpuinfo *vcpuinfo,
-                           uint32_t nr_cpus)
+                           uint32_t nr_cpus,
+                           uint32_t nr_nodes,
+                           int numa)
 {
     char *domname;
 
@@ -4505,10 +4536,18 @@ static void print_vcpuinfo(uint32_t tdomid,
     printf("%9.1f  ", ((float)vcpuinfo->vcpu_time / 1e9));
     /* CPU AFFINITY */
     print_cpumap(vcpuinfo->cpumap.map, nr_cpus, stdout);
+    /* NUMA Affinity */
+    if (numa) {
+        printf(" / ");
+        print_nodemap(vcpuinfo->nodemap.map, nr_nodes, stdout);
+    }
     printf("\n");
 }
 
-static void print_domain_vcpuinfo(uint32_t domid, uint32_t nr_cpus)
+static void print_domain_vcpuinfo(uint32_t domid,
+                                  uint32_t nr_cpus,
+                                  uint32_t nr_nodes,
+                                  int numa)
 {
     libxl_vcpuinfo *vcpuinfo;
     int i, nb_vcpu, nrcpus;
@@ -4521,55 +4560,67 @@ static void print_domain_vcpuinfo(uint32_t domid, uint32_t nr_cpus)
     }
 
     for (i = 0; i < nb_vcpu; i++) {
-        print_vcpuinfo(domid, &vcpuinfo[i], nr_cpus);
+        print_vcpuinfo(domid, &vcpuinfo[i], nr_cpus, nr_nodes, numa);
     }
 
     libxl_vcpuinfo_list_free(vcpuinfo, nb_vcpu);
 }
 
-static void vcpulist(int argc, char **argv)
+int main_vcpulist(int argc, char **argv)
 {
     libxl_dominfo *dominfo;
     libxl_physinfo physinfo;
-    int i, nb_domain;
+    int opt;
+    int numa = 0;
+    static struct option opts[] = {
+        {"numa", 0, 0, 'n'},
+        COMMON_LONG_OPTS,
+        {0, 0, 0, 0}
+    };
+    int rc = -1;
+
+    SWITCH_FOREACH_OPT(opt, "n", opts, "vcpu-list", 0) {
+    case 'n':
+        numa = 1;
+        break;
+    }
 
     if (libxl_get_physinfo(ctx, &physinfo) != 0) {
         fprintf(stderr, "libxl_physinfo failed.\n");
-        goto vcpulist_out;
+        goto out;
     }
 
-    printf("%-32s %5s %5s %5s %5s %9s %s\n",
+    printf("%-32s %5s %5s %5s %5s %9s %s",
            "Name", "ID", "VCPU", "CPU", "State", "Time(s)", "CPU Affinity");
-    if (!argc) {
+    if (numa)
+        printf(" / NUMA Affinity");
+    printf("\n");
+
+    if (optind >= argc) {
+        int i, nb_domain;
+
         if (!(dominfo = libxl_list_domain(ctx, &nb_domain))) {
             fprintf(stderr, "libxl_list_domain failed.\n");
-            goto vcpulist_out;
+            goto out;
         }
 
-        for (i = 0; i<nb_domain; i++)
-            print_domain_vcpuinfo(dominfo[i].domid, physinfo.nr_cpus);
+        for (i = 0; i < nb_domain; i++)
+            print_domain_vcpuinfo(dominfo[i].domid, physinfo.nr_cpus,
+                                  physinfo.nr_nodes, numa);
 
         libxl_dominfo_list_free(dominfo, nb_domain);
     } else {
-        for (; argc > 0; ++argv, --argc) {
-            uint32_t domid = find_domain(*argv);
-            print_domain_vcpuinfo(domid, physinfo.nr_cpus);
+        for (; argc > optind; ++optind) {
+            uint32_t domid = find_domain(argv[optind]);
+            print_domain_vcpuinfo(domid, physinfo.nr_cpus,
+                                  physinfo.nr_nodes, numa);
         }
     }
-  vcpulist_out:
-    libxl_physinfo_dispose(&physinfo);
-}
-
-int main_vcpulist(int argc, char **argv)
-{
-    int opt;
-
-    SWITCH_FOREACH_OPT(opt, "", NULL, "vcpu-list", 0) {
-        /* No options */
-    }
 
-    vcpulist(argc - optind, argv + optind);
-    return 0;
+    rc = 0;
+ out:
+    libxl_physinfo_dispose(&physinfo);
+    return rc;
 }
 
 static int vcpupin(uint32_t domid, const char *vcpu, char *cpu)
@@ -4595,7 +4646,7 @@ static int vcpupin(uint32_t domid, const char *vcpu, char *cpu)
     if (libxl_cpu_bitmap_alloc(ctx, &cpumap, 0))
         goto out;
 
-    if (vcpupin_parse(cpu, &cpumap))
+    if (parse_bitmap_range(cpu, &cpumap))
         goto out;
 
     if (dryrun_only) {
@@ -4646,6 +4697,69 @@ static int vcpupin(uint32_t domid, const char *vcpu, char *cpu)
     return rc;
 }
 
+static int vcpuaff(uint32_t domid, const char *vcpu, char *node)
+{
+    libxl_bitmap nodemap;
+    uint32_t vcpuid;
+    char *endptr;
+    int rc = -1;
+
+    libxl_bitmap_init(&nodemap);
+
+    vcpuid = strtoul(vcpu, &endptr, 10);
+    if (vcpu == endptr) {
+        if (strcmp(vcpu, "all")) {
+            fprintf(stderr, "Error: Invalid argument\n");
+            goto out;
+        }
+        vcpuid = -1;
+    }
+
+    if (libxl_node_bitmap_alloc(ctx, &nodemap, 0))
+        goto out;
+
+    if (parse_bitmap_range(node, &nodemap))
+        goto out;
+
+    if (dryrun_only) {
+        int nr_nodes = 0;
+        libxl_numainfo *info = libxl_get_numainfo(ctx, &nr_nodes);
+
+        if (!info) {
+            fprintf(stderr, "libxl_get_numainfo failed\n");
+            goto out;
+        }
+        libxl_numainfo_list_free(info, nr_nodes);
+
+        fprintf(stdout, "nodemap: ");
+        print_nodemap(nodemap.map, nr_nodes, stdout);
+        fprintf(stdout, "\n");
+
+        if (ferror(stdout) || fflush(stdout)) {
+            perror("stdout");
+            exit(-1);
+        }
+
+        rc = 0;
+        goto out;
+    }
+
+    if (vcpuid != -1) {
+        if (libxl_set_vcpunodeaffinity(ctx, domid, vcpuid, &nodemap))
+            fprintf(stderr, "Could not set node-affinity for vcpu `%u'\n",
+                    vcpuid);
+        goto out;
+    }
+
+    if (libxl_domain_set_nodeaffinity(ctx, domid, &nodemap))
+        fprintf(stderr, "libxl_domain_set_nodeaffinity failed\n");
+
+    rc = 0;
+ out:
+    libxl_bitmap_dispose(&nodemap);
+    return rc;
+}
+
 int main_vcpupin(int argc, char **argv)
 {
     int opt;
@@ -4657,6 +4771,18 @@ int main_vcpupin(int argc, char **argv)
     return vcpupin(find_domain(argv[optind]), argv[optind+1] , argv[optind+2]);
 }
 
+int main_vcpuaff(int argc, char **argv)
+{
+    int opt;
+
+    SWITCH_FOREACH_OPT(opt, "", NULL, "vcpu-node-affinity", 3) {
+        /* No options */
+    }
+
+    vcpuaff(find_domain(argv[optind]), argv[optind+1] , argv[optind+2]);
+    return 0;
+}
+
 static void vcpuset(uint32_t domid, const char* nr_vcpus, int check_host)
 {
     char *endptr;
diff --git a/tools/libxl/xl_cmdtable.c b/tools/libxl/xl_cmdtable.c
index d3dcbf0..d3dd7e6 100644
--- a/tools/libxl/xl_cmdtable.c
+++ b/tools/libxl/xl_cmdtable.c
@@ -208,13 +208,20 @@ struct cmd_spec cmd_table[] = {
     { "vcpu-list",
       &main_vcpulist, 0, 0,
       "List the VCPUs for all/some domains",
-      "[Domain, ...]",
+      "[option] [Domain, ...]",
+      "-n, --numa         Show NUMA node-affinity",
+
     },
     { "vcpu-pin",
       &main_vcpupin, 1, 1,
       "Set which CPUs a VCPU can use",
       "<Domain> <VCPU|all> <CPUs|all>",
     },
+    { "vcpu-node-affinity",
+      &main_vcpuaff, 1, 1,
+      "Set on which CPUs a VCPU prefers to run",
+      "<Domain> <VCPU|all> <CPUs|all>",
+    },
     { "vcpu-set",
       &main_vcpuset, 0, 1,
       "Set the number of active VCPUs allowed for the domain",

