
[Xen-devel] [PATCH v5 4/8] vnuma topology parsing routines



Parses the vNUMA topology from the guest config: the number of
vnodes and their memory ranges. If no vNUMA configuration is
defined, initializes vNUMA with a single node and a default
topology.
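
For illustration, a guest config exercising these options might look
like the following (the option names match what parse_vnuma_config
below recognizes; the values are hypothetical):

    # assumes a guest with maxmem = 2048 (MB) and vcpus = 4
    vnodes = 2
    vnumamem = ["1024", "1024"]         # per-vnode memory, in MB
    vdistance = ["10", "20"]            # same-node, other-node distance
    numa_cpumask = ["0", "0", "1", "1"] # vcpu -> vnode assignment
    vnuma_vnodemap = ["0", "1"]         # vnode -> pnode mapping
    vnuma_autoplacement = 0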

Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
---
 tools/libxl/libxl_vnuma.h |   11 ++
 tools/libxl/xl_cmdimpl.c  |  406 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 417 insertions(+)
 create mode 100644 tools/libxl/libxl_vnuma.h

diff --git a/tools/libxl/libxl_vnuma.h b/tools/libxl/libxl_vnuma.h
new file mode 100644
index 0000000..f1568ae
--- /dev/null
+++ b/tools/libxl/libxl_vnuma.h
@@ -0,0 +1,11 @@
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#define VNUMA_NO_NODE ~((unsigned int)0)
+
+/*
+ * Min vNUMA node size is set to 64MB even though current Linux
+ * allows 32MB, leaving some slack. Will be adjusted to match Linux.
+ */
+#define MIN_VNODE_SIZE  64U
+
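+/* Hard cap on the number of virtual NUMA nodes accepted from config. */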
+#define MAX_VNUMA_NODES ((unsigned int)1 << 10)
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index 5195914..59855ed 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -40,6 +40,7 @@
 #include "libxl_json.h"
 #include "libxlutil.h"
 #include "xl.h"
+#include "libxl_vnuma.h"
 
 /* For calls which return an errno on failure */
 #define CHK_ERRNOVAL( call ) ({                                         \
@@ -725,6 +726,403 @@ static void parse_top_level_sdl_options(XLU_Config *config,
     xlu_cfg_replace_string (config, "xauthority", &sdl->xauthority, 0);
 }
 
+
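+/*
+ * Parse list item i as an unsigned decimal value and return it as an
+ * int; returns -EINVAL if the item is missing, malformed, or does not
+ * fit in 16 bits.
+ */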
+static int get_list_item_uint(XLU_ConfigList *list, unsigned int i)
+{
+    const char *buf;
+    char *ep;
+    unsigned long ul;
+    int rc = -EINVAL;
+    buf = xlu_cfg_get_listitem(list, i);
+    if (!buf)
+        return rc;
+    ul = strtoul(buf, &ep, 10);
+    if (ep == buf)
+        return rc;
+    if (ul >= UINT16_MAX)
+        return rc;
+    return (int)ul;
+}
+
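+/*
+ * Fill the flat nr_vnodes x nr_vnodes distance matrix with samenode on
+ * the diagonal and othernode everywhere else.
+ */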
+static void vdistance_set(unsigned int *vdistance,
+                                unsigned int nr_vnodes,
+                                unsigned int samenode,
+                                unsigned int othernode)
+{
+    unsigned int idx, slot;
+    for (idx = 0; idx < nr_vnodes; idx++)
+        for (slot = 0; slot < nr_vnodes; slot++)
+            *(vdistance + slot * nr_vnodes + idx) =
+                idx == slot ? samenode : othernode;
+}
+
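+/* Default vcpu-to-vnode assignment: round-robin across vnodes. */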
+static void vcputovnode_default(unsigned int *cpu_to_node,
+                                unsigned int nr_vnodes,
+                                unsigned int max_vcpus)
+{
+    unsigned int cpu;
+    for (cpu = 0; cpu < max_vcpus; cpu++)
+        cpu_to_node[cpu] = cpu % nr_vnodes;
+}
+
+/* Split domain memory between vNUMA nodes equally */
+static int split_vnumamem(libxl_domain_build_info *b_info)
+{
+    unsigned long long vnodemem = 0;
+    unsigned long n;
+    unsigned int i;
+
+    /* In MBytes */
+    if (b_info->nr_nodes == 0)
+        return -1;
+    vnodemem = (b_info->max_memkb >> 10) / b_info->nr_nodes;
+    if (vnodemem < MIN_VNODE_SIZE)
+        return -1;
+    /* remainder in MBytes */
+    n = (b_info->max_memkb >> 10) % b_info->nr_nodes;
+    /* get final sizes in MBytes */
+    for (i = 0; i < (b_info->nr_nodes - 1); i++)
+        b_info->numa_memszs[i] = vnodemem;
+    /* add the remainder to the last node */
+    b_info->numa_memszs[i] = vnodemem + n;
+    return 0;
+}
+
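+/* By default no vnode is bound to a specific physical node. */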
+static void vnode_to_pnode_default(unsigned int *vnode_to_pnode,
+                                   unsigned int nr_vnodes)
+{
+    unsigned int i;
+    for (i = 0; i < nr_vnodes; i++)
+        vnode_to_pnode[i] = VNUMA_NO_NODE;
+}
+
+/*
+ * init vNUMA to "zero config" with one node and all other
+ * topology parameters set to default.
+ */
+static int vnuma_zero_config(libxl_domain_build_info *b_info)
+{
+    b_info->nr_nodes = 1;
+    /* all memory goes to this one vnode */
+    if (!(b_info->numa_memszs = (uint64_t *)calloc(b_info->nr_nodes,
+                                sizeof(*b_info->numa_memszs))))
+        goto bad_vnumazerocfg;
+
+    if (!(b_info->cpu_to_node = (unsigned int *)calloc(b_info->max_vcpus,
+                                sizeof(*b_info->cpu_to_node))))
+        goto bad_vnumazerocfg;
+
+    if (!(b_info->distance = (unsigned int *)calloc(b_info->nr_nodes *
+                                b_info->nr_nodes, sizeof(*b_info->distance))))
+        goto bad_vnumazerocfg;
+
+    if (!(b_info->vnode_to_pnode = (unsigned int *)calloc(b_info->nr_nodes,
+                                sizeof(*b_info->vnode_to_pnode))))
+        goto bad_vnumazerocfg;
+
+    b_info->numa_memszs[0] = b_info->max_memkb >> 10;
+
+    /* all vcpus assigned to this vnode */
+    vcputovnode_default(b_info->cpu_to_node, b_info->nr_nodes,
+                        b_info->max_vcpus);
+
+    /* default vdistance is 10 */
+    vdistance_set(b_info->distance, b_info->nr_nodes, 10, 10);
+
+    /* VNUMA_NO_NODE for vnode_to_pnode */
+    vnode_to_pnode_default(b_info->vnode_to_pnode, b_info->nr_nodes);
+
+    /*
+     * The domain will be placed on physical nodes chosen by automatic
+     * NUMA placement; VNUMA_NO_NODE means no exact physical node is
+     * requested.
+     */
+    libxl_defbool_set(&b_info->vnuma_autoplacement, true);
+    return 0;
+
+ bad_vnumazerocfg:
+    return -1;
+}
+
+/* Free vNUMA state; the caller is expected to exit on failure. */
+static void free_vnuma_info(libxl_domain_build_info *b_info)
+{
+    free(b_info->numa_memszs);
+    free(b_info->distance);
+    free(b_info->cpu_to_node);
+    free(b_info->vnode_to_pnode);
+    b_info->nr_nodes = 0;
+}
+
+/*
+static int vdistance_parse(char *vdistcfg, unsigned int *vdistance,
+                            unsigned int nr_vnodes)
+{
+    char *endptr, *toka, *tokb, *saveptra = NULL, *saveptrb = NULL;
+    unsigned int *vdist_tmp = NULL;
+    int rc = 0;
+    unsigned int i, j, parsed = 0;
+    unsigned long dist;
+
+    rc = -EINVAL;
+    if (vdistance == NULL) {
+        return rc;
+    }
+    vdist_tmp = (unsigned int *)malloc(nr_vnodes * nr_vnodes *
+                                       sizeof(*vdistance));
+    if (vdist_tmp == NULL)
+        return rc;
+
+    i = j = 0;
+    for (toka = strtok_r(vdistcfg, ",", &saveptra); toka;
+        toka = strtok_r(NULL, ",", &saveptra)) {
+        if ( i >= nr_vnodes )
+            goto vdist_parse_err;
+        for (tokb = strtok_r(toka, " ", &saveptrb); tokb;
+            tokb = strtok_r(NULL, " ", &saveptrb)) {
+            if (j >= nr_vnodes)
+                goto vdist_parse_err;
+            dist = strtol(tokb, &endptr, 10);
+            if (dist > UINT16_MAX || dist < 0)
+                goto vdist_parse_err;
+            if (tokb == endptr)
+                goto vdist_parse_err;
+            *(vdist_tmp + j*nr_vnodes + i) = dist;
+            parsed++;
+            j++;
+        }
+        i++;
+        j = 0;
+    }
+    rc = parsed;
+    memcpy(vdistance, vdist_tmp, nr_vnodes * nr_vnodes * sizeof(*vdistance));
+
+ vdist_parse_err:
+    free(vdist_tmp);
+    return rc;
+}
+*/
+
+static void parse_vnuma_config(XLU_Config *config,
+                               libxl_domain_build_info *b_info)
+{
+    XLU_ConfigList *vnumamemcfg;
+    XLU_ConfigList *vdistancecfg, *vnodemap, *vcpumap;
+    int nr_vnuma_regions;
+    int nr_vdist, nr_vnodemap, nr_vcpumap, i;
+    unsigned long long vnuma_memparsed = 0;
+    long l;
+    unsigned long ul;
+    const char *buf;
+
+    if (!xlu_cfg_get_long (config, "vnodes", &l, 0)) {
+        if (l < 0 || l > MAX_VNUMA_NODES) {
+            fprintf(stderr, "Too many vnuma nodes, max %d is allowed.\n", 
MAX_VNUMA_NODES);
+            goto bad_vnuma_config;
+        }
+        b_info->nr_nodes = l;
+
+        xlu_cfg_get_defbool(config, "vnuma_autoplacement",
+                            &b_info->vnuma_autoplacement, 0);
+
+        /* Only construct nodes with at least one vcpu for now */
+        if (b_info->nr_nodes != 0 && b_info->max_vcpus >= b_info->nr_nodes) {
+            if (!xlu_cfg_get_list(config, "vnumamem",
+                                  &vnumamemcfg, &nr_vnuma_regions, 0)) {
+
+                if (nr_vnuma_regions != b_info->nr_nodes) {
+                    fprintf(stderr, "Number of numa regions (vnumamem = %d) is 
incorrect (should be %d).\n",
+                            nr_vnuma_regions, b_info->nr_nodes);
+                    goto bad_vnuma_config;
+                }
+
+                b_info->numa_memszs = calloc(b_info->nr_nodes,
+                                              sizeof(*b_info->numa_memszs));
+                if (b_info->numa_memszs == NULL) {
+                    fprintf(stderr, "Unable to allocate memory for vnuma 
ranges.\n");
+                    goto bad_vnuma_config;
+                }
+
+                char *ep;
+                /*
+                 * Parse only nr_nodes entries even if more or fewer
+                 * regions were specified; fewer entries are dealt with
+                 * later, extra entries are discarded.
+                 */
+                for (i = 0; i < b_info->nr_nodes; i++) {
+                    buf = xlu_cfg_get_listitem(vnumamemcfg, i);
+                    if (!buf) {
+                        fprintf(stderr,
+                                "xl: Unable to get element %d in vnuma memory 
list.\n", i);
+                        break;
+                    }
+                    ul = strtoul(buf, &ep, 10);
+                    if (ep == buf) {
+                        fprintf(stderr,
+                                "xl: Invalid argument parsing vnumamem: 
%s.\n", buf);
+                        break;
+                    }
+
+                    /* node size must be at least MIN_VNODE_SIZE; Linux itself allows 32MB */
+                    if (ul >= UINT32_MAX || ul < MIN_VNODE_SIZE) {
+                        fprintf(stderr, "xl: vnuma memory %lu is not within %u 
- %u range.\n",
+                                ul, MIN_VNODE_SIZE, UINT32_MAX);
+                        break;
+                    }
+
+                    /* memory in MBytes */
+                    b_info->numa_memszs[i] = ul;
+                }
+
+                /* Total memory for vNUMA parsed to verify */
+                for (i = 0; i < nr_vnuma_regions; i++)
+                    vnuma_memparsed += b_info->numa_memszs[i];
+
+                /*
+                 * numa_memszs[] is in MB and max_memkb in KB, hence the
+                 * shift; the sum over vnodes must match the domain total.
+                 */
+                if ((vnuma_memparsed << 10) != (b_info->max_memkb)) {
+                    fprintf(stderr, "xl: vnuma memory is not the same as domain memory size.\n");
+                    goto bad_vnuma_config;
+                }
+            } else {
+                b_info->numa_memszs = calloc(b_info->nr_nodes,
+                                              sizeof(*b_info->numa_memszs));
+                if (b_info->numa_memszs == NULL) {
+                    fprintf(stderr, "Unable to allocate memory for vnuma 
ranges.\n");
+                    goto bad_vnuma_config;
+                }
+
+                fprintf(stderr, "WARNING: vNUMA memory ranges were not 
specified.\n");
+                fprintf(stderr, "Using default equal vnode memory size %lu 
Kbytes to cover %lu Kbytes.\n",
+                                b_info->max_memkb / b_info->nr_nodes, 
b_info->max_memkb);
+
+                if (split_vnumamem(b_info) < 0) {
+                    fprintf(stderr, "Could not split vnuma memory into equal 
chunks.\n");
+                    goto bad_vnuma_config;
+                }
+            }
+
+            b_info->distance = calloc(b_info->nr_nodes * b_info->nr_nodes,
+                                       sizeof(*b_info->distance));
+            if (b_info->distance == NULL)
+                goto bad_vnuma_config;
+
+            if (!xlu_cfg_get_list(config, "vdistance", &vdistancecfg, 
&nr_vdist, 0)) {
+                int d1, d2;
+                /*
+                 * The first value is the same-node distance, the second
+                 * is used for all other distances. This is currently
+                 * required to avoid a non-symmetric distance table, as
+                 * that may break recent kernels.
+                 * TODO: better way to handle an extended distance table,
+                 * possibly OS specific.
+                 */
+                 d1 = get_list_item_uint(vdistancecfg, 0);
+                 d2 = get_list_item_uint(vdistancecfg, 1);
+
+                 if (d1 >= 0 && d2 >= 0 && d1 < d2) {
+                    vdistance_set(b_info->distance, b_info->nr_nodes, d1, d2);
+                 } else {
+                    fprintf(stderr, "WARNING: vnuma distance values are 
incorrect.\n");
+                    goto bad_vnuma_config;
+                 }
+
+            } else {
+                fprintf(stderr, "Could not parse vnuma distances.\n");
+                vdistance_set(b_info->distance, b_info->nr_nodes, 10, 20);
+            }
+
+            b_info->cpu_to_node = (unsigned int *)calloc(b_info->max_vcpus,
+                                     sizeof(*b_info->cpu_to_node));
+            if (b_info->cpu_to_node == NULL)
+                goto bad_vnuma_config;
+
+            if (!xlu_cfg_get_list(config, "numa_cpumask",
+                                  &vcpumap, &nr_vcpumap, 0)) {
+                if (nr_vcpumap == b_info->max_vcpus) {
+                    int vnode;
+                    unsigned int vcpumask = 0, vmask;
+                    vmask = ~(~0U << nr_vcpumap);
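+                    /*
+                     * vmask has one bit per vcpu; vcpumask accumulates a
+                     * bit for each vcpu assigned to a valid vnode, so e.g.
+                     * with nr_vcpumap = 4 full coverage means
+                     * (vmask & vcpumask) + 1 == 1 << 4.
+                     */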
+                    for (i = 0; i < nr_vcpumap; i++) {
+                        vnode = get_list_item_uint(vcpumap, i);
+                        if (vnode >= 0 && vnode < b_info->nr_nodes) {
+                            vcpumask  |= (1 << i);
+                            b_info->cpu_to_node[i] = vnode;
+                        }
+                    }
+
+                    /* Did the mask cover all vcpus? */
+                    if (((vmask & vcpumask) + 1) != (1 << nr_vcpumap)) {
+                        fprintf(stderr, "WARNING: Not all vcpus were covered in numa_cpumask.\n");
+                        goto bad_vnuma_config;
+                    }
+                } else {
+                    fprintf(stderr, "WARNING:  Bad vnuma_vcpumap.\n");
+                    goto bad_vnuma_config;
+                }
+            }
+            else
+                vcputovnode_default(b_info->cpu_to_node,
+                                    b_info->nr_nodes,
+                                    b_info->max_vcpus);
+
+            /* Is there a mapping to physical NUMA nodes? */
+            b_info->vnode_to_pnode = (unsigned int *)calloc(b_info->nr_nodes,
+                                            sizeof(*b_info->vnode_to_pnode));
+            if (b_info->vnode_to_pnode == NULL)
+                goto bad_vnuma_config;
+            if (!xlu_cfg_get_list(config, "vnuma_vnodemap",&vnodemap,
+                                                    &nr_vnodemap, 0)) {
+                /*
+                * If not specified or incorrect, the mapping will be defined
+                * later based on the machine architecture, configuration
+                * and memory available when creating the domain.
+                */
+                if (nr_vnodemap == b_info->nr_nodes) {
+                    int pnode;
+                    unsigned int vnodemask = 0, smask;
+                    smask = ~(~0U << b_info->nr_nodes);
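+                    /*
+                     * smask has one bit per vnode; vnodemask accumulates a
+                     * bit for each vnode with a valid pnode, mirroring the
+                     * numa_cpumask coverage check above.
+                     */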
+                    for (i = 0; i < b_info->nr_nodes; i++) {
+                        pnode = get_list_item_uint(vnodemap, i);
+                        if (pnode >= 0) {
+                            vnodemask |= (1 << i);
+                            b_info->vnode_to_pnode[i] = pnode;
+                        }
+                    }
+
+                    /* Did the mask cover all vnodes? */
+                    if (((vnodemask & smask) + 1) != (1 << nr_vnodemap)) {
+                        fprintf(stderr, "WARNING: Not all vnodes were covered in vnuma_vnodemap.\n");
+
+                        if (libxl_defbool_val(b_info->vnuma_autoplacement)) {
+                            fprintf(stderr, "Automatic placement will be used 
for vnodes.\n");
+                            vnode_to_pnode_default(b_info->vnode_to_pnode, 
b_info->nr_nodes);
+                        } else
+                            goto bad_vnuma_config;
+                    }
+                } else {
+                    fprintf(stderr, "WARNING: Incorrect vnuma_vnodemap.\n");
+
+                    if (libxl_defbool_val(b_info->vnuma_autoplacement)) {
+                        fprintf(stderr, "Automatic placement will be used for 
vnodes.\n");
+                        vnode_to_pnode_default(b_info->vnode_to_pnode, 
b_info->nr_nodes);
+                    } else
+                        goto bad_vnuma_config;
+                }
+            } else {
+                fprintf(stderr, "WARNING: Missing vnuma_vnodemap.\n");
+
+                if (libxl_defbool_val(b_info->vnuma_autoplacement)) {
+                    fprintf(stderr, "Automatic placement will be used for 
vnodes.\n");
+                    vnode_to_pnode_default(b_info->vnode_to_pnode, 
b_info->nr_nodes);
+                } else
+                    goto bad_vnuma_config;
+            }
+        }
+        else if (vnuma_zero_config(b_info))
+            goto bad_vnuma_config;
+    }
+    /* If vnuma topology is not defined for the domain, init a single node */
+    else if (vnuma_zero_config(b_info))
+        goto bad_vnuma_config;
+    return;
+
+ bad_vnuma_config:
+    free_vnuma_info(b_info);
+    exit(1);
+}
+
 static void parse_config_data(const char *config_source,
                               const char *config_data,
                               int config_len,
@@ -1081,6 +1479,14 @@ static void parse_config_data(const char *config_source,
             exit(1);
         }
 
+        libxl_defbool_set(&b_info->vnuma_autoplacement, false);
+
+        /*
+         * If there is no vnuma in the config, a "zero" vnuma config
+         * will be initialized, with one node and other defaults.
+         */
+        parse_vnuma_config(config, b_info);
+
         xlu_cfg_replace_string (config, "bootloader", 
&b_info->u.pv.bootloader, 0);
         switch (xlu_cfg_get_list_as_string_list(config, "bootloader_args",
                                       &b_info->u.pv.bootloader_args, 1))
-- 
1.7.10.4

