
[Xen-devel] [PATCH 4/7] libxl: vNUMA supporting interface.



* Provides verification and construction of the vnode-to-pnode mapping.
It will be used for vNUMA node allocation when running on a NUMA
machine; if the supplied mapping is usable, automatic NUMA placement is disabled;
* Verifies the correctness of the memory block pfns for a Linux guest
by requesting the e820 map for that domain;
* Passes the domain vNUMA topology to Xen (an illustrative sketch of the
call follows below);
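
As a purely illustrative sketch (not part of this patch), the snippet below
shows how a toolstack caller might hand a made-up two-vnode topology to Xen
through the new libxl_domain_setvnodes() interface. The memory block values
and the row-major vdistance layout are assumptions made for this example only:

#include <libxl.h>

/*
 * Illustrative sketch only: a made-up two-vnode topology passed through
 * libxl_domain_setvnodes().  The row-major layout of vdistance is an
 * assumption for this example.
 */
static int example_set_two_vnodes(libxl_ctx *ctx, uint32_t domid)
{
    vnuma_memblk_t memblks[2] = {
        { .start = 0,            .end = 512ULL << 20  },  /* vnode 0: 512MB */
        { .start = 512ULL << 20, .end = 1024ULL << 20 },  /* vnode 1: 512MB */
    };
    unsigned int vdistance[4]      = { 10, 20, 20, 10 };  /* local 10, remote 20 */
    unsigned int vcpu_to_vnode[2]  = { 0, 1 };            /* one vcpu per vnode */
    unsigned int vnode_to_pnode[2] = { 0, 0 };            /* both vnodes on pnode 0 */

    return libxl_domain_setvnodes(ctx, domid, 2 /* nr_vnodes */,
                                  2 /* nr_vcpus */, memblks, vdistance,
                                  vcpu_to_vnode, vnode_to_pnode);
}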

TODO:
add an additional check, with regard to vcpu pinning, before disabling the
automatic NUMA placement mechanism;

Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>

---
Changes since RFC v2:
- added the vnode_to_pnode map and its verification;
- if the vnode_to_pnode map can be used, turn off
automatic NUMA placement;
- removed bogus memory block pfn alignment;
---
 tools/libxl/libxl.c          |   19 +++++
 tools/libxl/libxl.h          |   18 ++++
 tools/libxl/libxl_arch.h     |    8 ++
 tools/libxl/libxl_dom.c      |  186 +++++++++++++++++++++++++++++++++++++++++-
 tools/libxl/libxl_internal.h |    3 +
 tools/libxl/libxl_types.idl  |    5 +-
 tools/libxl/libxl_x86.c      |   53 ++++++++++++
 7 files changed, 290 insertions(+), 2 deletions(-)

diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 29e66f2..5f11641 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -4306,6 +4306,25 @@ static int libxl__set_vcpuonline_qmp(libxl__gc *gc, uint32_t domid,
     }
     return 0;
 }
+
+int libxl_domain_setvnodes(libxl_ctx *ctx,
+                           uint32_t domid,
+                           uint16_t nr_vnodes,
+                           uint16_t nr_vcpus,
+                           vnuma_memblk_t *vnuma_memblks,
+                           unsigned int *vdistance,
+                           unsigned int *vcpu_to_vnode,
+                           unsigned int *vnode_to_pnode)
+{
+    GC_INIT(ctx);
+    int ret;
+
+    ret = xc_domain_setvnodes(ctx->xch, domid, nr_vnodes,
+                              nr_vcpus, vnuma_memblks,
+                              vdistance, vcpu_to_vnode,
+                              vnode_to_pnode);
+    GC_FREE;
+    return ret;
+}
 
 int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap)
 {
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 1c6675d..ceb4e38 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -281,6 +281,7 @@
 #include <netinet/in.h>
 #include <sys/wait.h> /* for pid_t */
 
+#include <xen/memory.h>
 #include <xentoollog.h>
 
 #include <libxl_uuid.h>
@@ -376,6 +377,14 @@
 #define LIBXL_EXTERNAL_CALLERS_ONLY /* disappears for callers outside libxl */
 #endif
 
+/*
+ * LIBXL_HAVE_BUILDINFO_VNUMA indicates that a vNUMA topology will be
+ * built for the guest upon request, based on the VM configuration.
+ * libxl will try to find the best placement of vNUMA
+ * nodes on the physical NUMA nodes.
+ */
+#define LIBXL_HAVE_BUILDINFO_VNUMA 1
+
 typedef uint8_t libxl_mac[6];
 #define LIBXL_MAC_FMT "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx"
 #define LIBXL_MAC_FMTLEN ((2*6)+5) /* 6 hex bytes plus 5 colons */
@@ -753,6 +762,15 @@ void libxl_vcpuinfo_list_free(libxl_vcpuinfo *, int nr_vcpus);
 void libxl_device_vtpm_list_free(libxl_device_vtpm*, int nr_vtpms);
 void libxl_vtpminfo_list_free(libxl_vtpminfo *, int nr_vtpms);
 
+int libxl_domain_setvnodes(libxl_ctx *ctx,
+                           uint32_t domid,
+                           uint16_t nr_vnodes,
+                           uint16_t nr_vcpus,
+                           vnuma_memblk_t *vnuma_memblks,
+                           unsigned int *vdistance,
+                           unsigned int *vcpu_to_vnode,
+                           unsigned int *vnode_to_pnode);
+
 /*
  * Devices
  * =======
diff --git a/tools/libxl/libxl_arch.h b/tools/libxl/libxl_arch.h
index abe6685..442aaec 100644
--- a/tools/libxl/libxl_arch.h
+++ b/tools/libxl/libxl_arch.h
@@ -19,4 +19,12 @@
 int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
                uint32_t domid);
 
+int libxl__vnuma_align_mem(libxl__gc *gc,
+                           uint32_t domid,
+                           struct libxl_domain_build_info *b_info,
+                           vnuma_memblk_t *memblks);
+
+int libxl__vnodemap_is_usable(libxl__gc *gc,
+                              libxl_domain_build_info *info);
+
 #endif
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 356f920..12dc12a 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -201,6 +201,91 @@ static int numa_place_domain(libxl__gc *gc, uint32_t domid,
     return rc;
 }
 
+/* Prepares the vnode to pnode map for domain vNUMA memory allocation */
+int libxl__init_vnodemap(libxl__gc *gc, uint32_t domid,
+                         libxl_domain_build_info *info)
+{
+    int i, n, nr_nodes, rc;
+    uint64_t *mems;
+    unsigned long long *claim = NULL;
+    libxl_numainfo *ninfo = NULL;
+
+    rc = -EINVAL;
+    if (info->vnode_to_pnode == NULL) {
+        info->vnode_to_pnode = calloc(info->nr_vnodes,
+                                      sizeof(*info->vnode_to_pnode));
+        if (info->vnode_to_pnode == NULL)
+            return rc;
+    }
+    else
+        return 0;
+
+    /*
+     * If this is not a NUMA machine, the vnode_to_pnode map will
+     * be initialized with VNUMA_NO_NODE.
+     */
+
+    /* Get NUMA info */
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL || nr_nodes == 0) {
+        for (i = 0; i < info->nr_vnodes; i++)
+            info->vnode_to_pnode[i] = VNUMA_NO_NODE;
+        LOG(DEBUG, "No hardware NUMA found");
+        goto vnmapout;
+    }
+    claim = calloc(info->nr_vnodes, sizeof(*claim));
+    if (claim == NULL)
+        return rc;
+
+    for (i = 0; i < info->nr_vnodes; i++)
+        info->vnode_to_pnode[i] = VNUMA_NO_NODE;
+
+    /*
+     * Check whether any hardware NUMA nodes were selected; otherwise
+     * VNUMA_NO_NODE stays set and the default allocation is used.
+     */
+    if (libxl_bitmap_is_empty(&info->nodemap)) {
+        free(claim);
+        return 0;
+    }
+    mems = info->vnuma_memszs;
+
+    /* Check whether all vnodes fit on a single physical node */
+    libxl_for_each_set_bit(n, info->nodemap) {
+        if (ninfo[n].free / 1024 >= info->max_memkb) {
+            /*
+             * All of the domain's v-nodes fit on one p-node; that p-node
+             * is the best candidate selected by automatic NUMA placement.
+             */
+            for (i = 0; i < info->nr_vnodes; i++)
+                info->vnode_to_pnode[i] = n;
+            free(claim);
+            return 0;
+        }
+    }
+    /*
+     * TODO: improve the algorithm.  The current one simply first-fits the
+     * vnodes; it would be nice to also sort them by size.
+     * If no p-node is found, the vnode stays set to VNUMA_NO_NODE.
+     */
+    libxl_for_each_set_bit(n, info->nodemap) {
+        for (i = 0; i < info->nr_vnodes; i++) {
+            if ((claim[n] + (mems[i] << 20)) <= ninfo[n].free &&
+                /* vnode not assigned yet */
+                info->vnode_to_pnode[i] == VNUMA_NO_NODE) {
+                info->vnode_to_pnode[i] = n;
+                claim[n] += (mems[i] << 20);
+            }
+        }
+    }
+    rc = 0;
+vnmapout:
+    free(claim);
+    return rc;
+}
+
 int libxl__build_pre(libxl__gc *gc, uint32_t domid,
               libxl_domain_config *d_config, libxl__domain_build_state *state)
 {
@@ -209,8 +294,29 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
     char *xs_domid, *con_domid;
     int rc;
 
+    rc = -EINVAL;
     xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus);
 
+    /*
+     * If a vNUMA vnode_to_pnode map is defined, determine whether we can
+     * disable automatic NUMA placement and place vnodes on the specified
+     * pnodes.  For now, if a vcpu affinity is specified, the specified
+     * vnode to pnode map will be used.
+     */
+    if (info->nr_vnodes != 0) {
+        if (libxl__vnodemap_is_usable(gc, info)) {
+            LOG(DETAIL, "vNUMA automatic placement disabled");
+            libxl_defbool_set(&info->numa_placement, false);
+        } else {
+            /* Release the map as it is unusable */
+            free(info->vnode_to_pnode);
+            info->vnode_to_pnode = NULL;
+            LOG(DETAIL, "vNUMA will use the default vnode to pnode map");
+        }
+    }
+
     /*
      * Check if the domain has any CPU affinity. If not, try to build
      * up one. In case numa_place_domain() find at least a suitable
@@ -232,6 +338,26 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
         if (rc)
             return rc;
     }
+    if (info->nr_vnodes != 0) {
+        /* The memory blocks are formed here from the per-vnode sizes */
+        vnuma_memblk_t *memblks = libxl__calloc(gc, info->nr_vnodes,
+                                                sizeof(*memblks));
+
+        libxl__vnuma_align_mem(gc, domid, info, memblks);
+        /* Construct the vnode to pnode mapping if possible */
+        if (libxl__init_vnodemap(gc, domid, info) < 0) {
+            LOG(DEBUG, "Failed to construct the vnode to pnode map");
+            info->nr_vnodes = 0;
+        }
+        /* Plumb the domain with the vNUMA topology */
+        libxl_domain_setvnodes(ctx, domid, info->nr_vnodes,
+                               info->max_vcpus, memblks,
+                               info->vdistance, info->vcpu_to_vnode,
+                               info->vnode_to_pnode);
+    }
+    else
+        LOG(DEBUG, "Will not construct a vNUMA topology with 0 nodes");
+
     libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap);
     libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap);
 
@@ -253,6 +379,48 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
     return rc;
 }
 
+int libxl__vnodemap_is_usable(libxl__gc *gc, libxl_domain_build_info *info)
+{
+    int rc, nr_nodes, i;
+    libxl_numainfo *ninfo = NULL;
+    unsigned long long *claim;
+    unsigned int node;
+    uint64_t *mems;
+
+    rc = 0;
+    if (info->vnode_to_pnode == NULL)
+        return rc;
+    /*
+     * Cannot use the specified mapping if this is not a NUMA machine.
+     */
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL)
+        return rc;
+    mems = info->vnuma_memszs;
+    claim = calloc(info->nr_vnodes, sizeof(*claim));
+    if (claim == NULL)
+        return rc;
+    /* Sum the memory requests on a per-pnode basis */
+    for (i = 0; i < info->nr_vnodes; i++) {
+        node = info->vnode_to_pnode[i];
+        /* Correct pnode number? */
+        if (node < nr_nodes)
+            claim[node] += (mems[i] << 20);
+        else
+            goto vmapu;
+    }
+    for (i = 0; i < nr_nodes; i++)
+        if (claim[i] > ninfo[i].free)
+            /* Cannot satisfy the user request, fall back to the default */
+            goto vmapu;
+    rc = 1;
+vmapu:
+    free(claim);
+    return rc;
+}
+
 int libxl__build_post(libxl__gc *gc, uint32_t domid,
                       libxl_domain_build_info *info,
                       libxl__domain_build_state *state,
@@ -375,7 +543,23 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid,
             }
         }
     }
-
+    if (info->nr_vnodes != 0) {
+        dom->nr_vnodes = info->nr_vnodes;
+        dom->vnode_to_pnode = malloc(dom->nr_vnodes *
+                                     sizeof(*dom->vnode_to_pnode));
+        dom->vnuma_memszs = malloc(dom->nr_vnodes *
+                                   sizeof(*dom->vnuma_memszs));
+        if (dom->vnuma_memszs == NULL || dom->vnode_to_pnode == NULL) {
+            LOGE(ERROR, "Failed to allocate memory for the vNUMA domain image");
+            dom->nr_vnodes = 0;
+            info->nr_vnodes = 0;
+            free(dom->vnode_to_pnode);
+            free(dom->vnuma_memszs);
+            dom->vnode_to_pnode = NULL;
+            dom->vnuma_memszs = NULL;
+            goto out;
+        }
+        memcpy(dom->vnuma_memszs, info->vnuma_memszs,
+               sizeof(*dom->vnuma_memszs) * dom->nr_vnodes);
+        memcpy(dom->vnode_to_pnode, info->vnode_to_pnode,
+               sizeof(*dom->vnode_to_pnode) * dom->nr_vnodes);
+    }
+
     dom->flags = flags;
     dom->console_evtchn = state->console_port;
     dom->console_domid = state->console_domid;
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 165dc00..19ac0fe 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2710,6 +2710,7 @@ static inline void libxl__ctx_unlock(libxl_ctx *ctx) {
 #define CTX_LOCK (libxl__ctx_lock(CTX))
 #define CTX_UNLOCK (libxl__ctx_unlock(CTX))
 
+#define VNUMA_NO_NODE (~(unsigned int)0)
+
 /*
  * Automatic NUMA placement
  *
@@ -2833,6 +2834,8 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
     libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
 }
 
+int libxl__init_vnodemap(libxl__gc *gc, uint32_t domid,
+                         libxl_domain_build_info *info);
+
 /*
  * Inserts "elm_new" into the sorted list "head".
  *
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index d2cea8a..5418966 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -292,7 +292,10 @@ libxl_domain_build_info = Struct("domain_build_info",[
     ("disable_migrate", libxl_defbool),
     ("cpuid",           libxl_cpuid_policy_list),
     ("blkdev_start",    string),
-    
+    ("vnuma_memszs",    Array(uint64, "nr_vnodes")),
+    ("vcpu_to_vnode",   Array(uint32, "nr_vnodemap")),
+    ("vdistance",       Array(uint32, "nr_vdist")),
+    ("vnode_to_pnode",  Array(uint32, "nr_vnode_to_pnode")),
     ("device_model_version", libxl_device_model_version),
     ("device_model_stubdomain", libxl_defbool),
     # if you set device_model you must set device_model_version too
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index a78c91d..01edc2b 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -308,3 +308,56 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
 
     return ret;
 }
+
+/*
+ * Checks for the beginning and end of RAM in the e820 map for the domain
+ * and aligns the start of the first and the end of the last vNUMA memory
+ * block to that map. vnode memory sizes are passed here in megabytes.
+ */
+int libxl__vnuma_align_mem(libxl__gc *gc,
+                           uint32_t domid,
+                           /* IN: mem sizes in megabytes */
+                           libxl_domain_build_info *b_info,
+                           /* OUT: linux numa blocks in pfn */
+                           vnuma_memblk_t *memblks)
+{
+#ifndef roundup
+#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
+#endif
+    int i, rc;
+    unsigned long shift = 0;
+    unsigned long end_max;
+    uint32_t nr;
+    struct e820entry map[E820MAX];
+
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+    rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX);
+    if (rc < 0) {
+        errno = rc;
+        return -EINVAL;
+    }
+    nr = rc;
+    rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb,
+                       (b_info->max_memkb - b_info->target_memkb) +
+                       b_info->u.pv.slack_memkb);
+    if (rc)
+        return -EINVAL;
+    end_max = map[nr-1].addr + map[nr-1].size;
+    shift = 0;
+    memset(memblks, 0, sizeof(*memblks) * b_info->nr_vnodes);
+    memblks[0].start = map[0].addr;
+
+    for (i = 0; i < b_info->nr_vnodes; i++) {
+        memblks[i].start += shift;
+        memblks[i].end += shift + (b_info->vnuma_memszs[i] << 20);
+        shift = memblks[i].end;
+        memblks[i].start = roundup(memblks[i].start, 1024 * 4);
+        LIBXL__LOG(ctx, LIBXL__LOG_DEBUG,
+                   "start = %#010lx, end = %#010lx, size MB = %#010lx\n",
+                   memblks[i].start, memblks[i].end, b_info->vnuma_memszs[i]);
+    }
+
+    if (memblks[i-1].end > end_max)
+        memblks[i-1].end = end_max;
+    return 0;
+}
-- 
1.7.10.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

