[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v7 1/5] sysctl: Make XEN_SYSCTL_numainfo a little more efficient



A number of changes to XEN_SYSCTL_numainfo interface:

* Make sysctl NUMA topology query use fewer copies by combining some
  fields into a single structure and copying distances for each node
  in a single copy.
* NULL meminfo and distance handles are a request for the maximum number
  of nodes (num_nodes). If those handles are valid and num_nodes is
  smaller than the number of nodes in the system then -ENOBUFS is
  returned (and the correct num_nodes is provided).
* Instead of using max_node_index for passing the number of nodes, keep this
  value in num_nodes: almost all uses of max_node_index required adding
  or subtracting one to eventually get to the number of nodes anyway.
* Replace INVALID_NUMAINFO_ID with XEN_INVALID_MEM_SZ and add
  XEN_INVALID_NODE_DIST.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
Acked-by: Ian Campbell <ian.campbell@xxxxxxxxxx>
---

Changes in v7:
* Allow one of the arguments to the NUMA info sysctls to be NULL, in which case
  only the non-NULL buffer will be filled in by the hypervisor (changes in
  sysctl.[ch])

 tools/libxl/libxl.c               |   66 +++++++++++++----------------
 tools/python/xen/lowlevel/xc/xc.c |   58 ++++++++++++--------------
 xen/common/sysctl.c               |   84 +++++++++++++++++++++++--------------
 xen/include/public/sysctl.h       |   54 ++++++++++++++----------
 4 files changed, 141 insertions(+), 121 deletions(-)

diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 511eef1..2ff46b4 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -5156,65 +5156,59 @@ libxl_numainfo *libxl_get_numainfo(libxl_ctx *ctx, int 
*nr)
 {
     GC_INIT(ctx);
     xc_numainfo_t ninfo;
-    DECLARE_HYPERCALL_BUFFER(xc_node_to_memsize_t, memsize);
-    DECLARE_HYPERCALL_BUFFER(xc_node_to_memfree_t, memfree);
-    DECLARE_HYPERCALL_BUFFER(uint32_t, node_dists);
+    DECLARE_HYPERCALL_BUFFER(xen_sysctl_meminfo_t, meminfo);
+    DECLARE_HYPERCALL_BUFFER(uint32_t, distance);
     libxl_numainfo *ret = NULL;
-    int i, j, max_nodes;
+    int i, j;
 
-    max_nodes = libxl_get_max_nodes(ctx);
-    if (max_nodes < 0)
-    {
+    set_xen_guest_handle(ninfo.meminfo, HYPERCALL_BUFFER_NULL);
+    set_xen_guest_handle(ninfo.distance, HYPERCALL_BUFFER_NULL);
+    if (xc_numainfo(ctx->xch, &ninfo) != 0) {
         LIBXL__LOG(ctx, XTL_ERROR, "Unable to determine number of NODES");
         ret = NULL;
         goto out;
     }
 
-    memsize = xc_hypercall_buffer_alloc
-        (ctx->xch, memsize, sizeof(*memsize) * max_nodes);
-    memfree = xc_hypercall_buffer_alloc
-        (ctx->xch, memfree, sizeof(*memfree) * max_nodes);
-    node_dists = xc_hypercall_buffer_alloc
-        (ctx->xch, node_dists, sizeof(*node_dists) * max_nodes * max_nodes);
-    if ((memsize == NULL) || (memfree == NULL) || (node_dists == NULL)) {
+    meminfo = xc_hypercall_buffer_alloc(ctx->xch, meminfo,
+                                        sizeof(*meminfo) * ninfo.num_nodes);
+    distance = xc_hypercall_buffer_alloc(ctx->xch, distance,
+                                         sizeof(*distance) *
+                                         ninfo.num_nodes * ninfo.num_nodes);
+    if ((meminfo == NULL) || (distance == NULL)) {
         LIBXL__LOG_ERRNOVAL(ctx, XTL_ERROR, ENOMEM,
                             "Unable to allocate hypercall arguments");
         goto fail;
     }
 
-    set_xen_guest_handle(ninfo.node_to_memsize, memsize);
-    set_xen_guest_handle(ninfo.node_to_memfree, memfree);
-    set_xen_guest_handle(ninfo.node_to_node_distance, node_dists);
-    ninfo.max_node_index = max_nodes - 1;
+    set_xen_guest_handle(ninfo.meminfo, meminfo);
+    set_xen_guest_handle(ninfo.distance, distance);
     if (xc_numainfo(ctx->xch, &ninfo) != 0) {
         LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "getting numainfo");
         goto fail;
     }
 
-    if (ninfo.max_node_index < max_nodes - 1)
-        max_nodes = ninfo.max_node_index + 1;
+    *nr = ninfo.num_nodes;
 
-    *nr = max_nodes;
+    ret = libxl__zalloc(NOGC, sizeof(libxl_numainfo) * ninfo.num_nodes);
+    for (i = 0; i < ninfo.num_nodes; i++)
+        ret[i].dists = libxl__calloc(NOGC, ninfo.num_nodes, sizeof(*distance));
 
-    ret = libxl__zalloc(NOGC, sizeof(libxl_numainfo) * max_nodes);
-    for (i = 0; i < max_nodes; i++)
-        ret[i].dists = libxl__calloc(NOGC, max_nodes, sizeof(*node_dists));
-
-    for (i = 0; i < max_nodes; i++) {
-#define V(mem, i) (mem[i] == INVALID_NUMAINFO_ID) ? \
-    LIBXL_NUMAINFO_INVALID_ENTRY : mem[i]
-        ret[i].size = V(memsize, i);
-        ret[i].free = V(memfree, i);
-        ret[i].num_dists = max_nodes;
-        for (j = 0; j < ret[i].num_dists; j++)
-            ret[i].dists[j] = V(node_dists, i * max_nodes + j);
+    for (i = 0; i < ninfo.num_nodes; i++) {
+#define V(val, invalid) (val == invalid) ? \
+       LIBXL_NUMAINFO_INVALID_ENTRY : val
+        ret[i].size = V(meminfo[i].memsize, XEN_INVALID_MEM_SZ);
+        ret[i].free = V(meminfo[i].memfree, XEN_INVALID_MEM_SZ);
+        ret[i].num_dists = ninfo.num_nodes;
+        for (j = 0; j < ret[i].num_dists; j++) {
+            unsigned idx = i * ninfo.num_nodes + j;
+            ret[i].dists[j] = V(distance[idx], XEN_INVALID_NODE_DIST);
+        }
 #undef V
     }
 
  fail:
-    xc_hypercall_buffer_free(ctx->xch, memsize);
-    xc_hypercall_buffer_free(ctx->xch, memfree);
-    xc_hypercall_buffer_free(ctx->xch, node_dists);
+    xc_hypercall_buffer_free(ctx->xch, meminfo);
+    xc_hypercall_buffer_free(ctx->xch, distance);
 
  out:
     GC_FREE;
diff --git a/tools/python/xen/lowlevel/xc/xc.c 
b/tools/python/xen/lowlevel/xc/xc.c
index 5e81c4a..ba66d55 100644
--- a/tools/python/xen/lowlevel/xc/xc.c
+++ b/tools/python/xen/lowlevel/xc/xc.c
@@ -1297,55 +1297,52 @@ out:
 
 static PyObject *pyxc_numainfo(XcObject *self)
 {
-#define MAX_NODE_INDEX 31
     xc_numainfo_t ninfo = { 0 };
-    int i, j, max_node_index;
+    unsigned i, j;
     uint64_t free_heap;
     PyObject *ret_obj = NULL, *node_to_node_dist_list_obj;
     PyObject *node_to_memsize_obj, *node_to_memfree_obj;
     PyObject *node_to_dma32_mem_obj, *node_to_node_dist_obj;
-    DECLARE_HYPERCALL_BUFFER(xc_node_to_memsize_t, node_memsize);
-    DECLARE_HYPERCALL_BUFFER(xc_node_to_memfree_t, node_memfree);
-    DECLARE_HYPERCALL_BUFFER(xc_node_to_node_dist_t, nodes_dist);
+    DECLARE_HYPERCALL_BUFFER(xen_sysctl_meminfo_t, meminfo);
+    DECLARE_HYPERCALL_BUFFER(uint32_t, distance);
 
-    node_memsize = xc_hypercall_buffer_alloc(self->xc_handle, node_memsize, 
sizeof(*node_memsize)*(MAX_NODE_INDEX+1));
-    if ( node_memsize == NULL )
+    set_xen_guest_handle(ninfo.meminfo, HYPERCALL_BUFFER_NULL);
+    set_xen_guest_handle(ninfo.distance, HYPERCALL_BUFFER_NULL);
+    if ( xc_numainfo(self->xc_handle, &ninfo) != 0 )
         goto out;
-    node_memfree = xc_hypercall_buffer_alloc(self->xc_handle, node_memfree, 
sizeof(*node_memfree)*(MAX_NODE_INDEX+1));
-    if ( node_memfree == NULL )
+
+    meminfo = xc_hypercall_buffer_alloc(self->xc_handle, meminfo,
+                                        sizeof(*meminfo) * ninfo.num_nodes);
+    if ( meminfo == NULL )
         goto out;
-    nodes_dist = xc_hypercall_buffer_alloc(self->xc_handle, nodes_dist, 
sizeof(*nodes_dist)*(MAX_NODE_INDEX+1)*(MAX_NODE_INDEX+1));
-    if ( nodes_dist == NULL )
+    distance = xc_hypercall_buffer_alloc(self->xc_handle, distance,
+                                         sizeof(*distance) *
+                                         ninfo.num_nodes * ninfo.num_nodes);
+    if ( distance == NULL )
         goto out;
 
-    set_xen_guest_handle(ninfo.node_to_memsize, node_memsize);
-    set_xen_guest_handle(ninfo.node_to_memfree, node_memfree);
-    set_xen_guest_handle(ninfo.node_to_node_distance, nodes_dist);
-    ninfo.max_node_index = MAX_NODE_INDEX;
-
+    set_xen_guest_handle(ninfo.meminfo, meminfo);
+    set_xen_guest_handle(ninfo.distance, distance);
     if ( xc_numainfo(self->xc_handle, &ninfo) != 0 )
         goto out;
 
-    max_node_index = ninfo.max_node_index;
-    if ( max_node_index > MAX_NODE_INDEX )
-        max_node_index = MAX_NODE_INDEX;
-
     /* Construct node-to-* lists. */
     node_to_memsize_obj = PyList_New(0);
     node_to_memfree_obj = PyList_New(0);
     node_to_dma32_mem_obj = PyList_New(0);
     node_to_node_dist_list_obj = PyList_New(0);
-    for ( i = 0; i <= max_node_index; i++ )
+    for ( i = 0; i < ninfo.num_nodes; i++ )
     {
         PyObject *pyint;
+        unsigned invalid_node;
 
         /* Total Memory */
-        pyint = PyInt_FromLong(node_memsize[i] >> 20); /* MB */
+        pyint = PyInt_FromLong(meminfo[i].memsize >> 20); /* MB */
         PyList_Append(node_to_memsize_obj, pyint);
         Py_DECREF(pyint);
 
         /* Free Memory */
-        pyint = PyInt_FromLong(node_memfree[i] >> 20); /* MB */
+        pyint = PyInt_FromLong(meminfo[i].memfree >> 20); /* MB */
         PyList_Append(node_to_memfree_obj, pyint);
         Py_DECREF(pyint);
 
@@ -1357,10 +1354,11 @@ static PyObject *pyxc_numainfo(XcObject *self)
 
         /* Node to Node Distance */
         node_to_node_dist_obj = PyList_New(0);
-        for ( j = 0; j <= max_node_index; j++ )
+        invalid_node = (meminfo[i].memsize == XEN_INVALID_MEM_SZ);
+        for ( j = 0; j < ninfo.num_nodes; j++ )
         {
-            uint32_t dist = nodes_dist[i*(max_node_index+1) + j];
-            if ( dist == ~0u )
+            uint32_t dist = distance[i * ninfo.num_nodes + j];
+            if ( invalid_node || (dist == XEN_INVALID_NODE_DIST) )
             {
                 PyList_Append(node_to_node_dist_obj, Py_None);
             }
@@ -1375,7 +1373,7 @@ static PyObject *pyxc_numainfo(XcObject *self)
         Py_DECREF(node_to_node_dist_obj);
     }
 
-    ret_obj = Py_BuildValue("{s:i}", "max_node_index", max_node_index);
+    ret_obj = Py_BuildValue("{s:i}", "max_node_index", ninfo.num_nodes + 1);
 
     PyDict_SetItemString(ret_obj, "node_memsize", node_to_memsize_obj);
     Py_DECREF(node_to_memsize_obj);
@@ -1391,11 +1389,9 @@ static PyObject *pyxc_numainfo(XcObject *self)
     Py_DECREF(node_to_node_dist_list_obj);
 
 out:
-    xc_hypercall_buffer_free(self->xc_handle, node_memsize);
-    xc_hypercall_buffer_free(self->xc_handle, node_memfree);
-    xc_hypercall_buffer_free(self->xc_handle, nodes_dist);
+    xc_hypercall_buffer_free(self->xc_handle, meminfo);
+    xc_hypercall_buffer_free(self->xc_handle, distance);
     return ret_obj ? ret_obj : pyxc_error_to_exception(self->xc_handle);
-#undef MAX_NODE_INDEX
 }
 
 static PyObject *pyxc_xeninfo(XcObject *self)
diff --git a/xen/common/sysctl.c b/xen/common/sysctl.c
index 70413cc..b025a90 100644
--- a/xen/common/sysctl.c
+++ b/xen/common/sysctl.c
@@ -274,54 +274,76 @@ long do_sysctl(XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) 
u_sysctl)
 
     case XEN_SYSCTL_numainfo:
     {
-        uint32_t i, j, max_node_index, last_online_node;
+        unsigned int i, j, num_nodes;
         xen_sysctl_numainfo_t *ni = &op->u.numainfo;
+        bool_t do_meminfo = !guest_handle_is_null(ni->meminfo);
+        bool_t do_distance = !guest_handle_is_null(ni->distance);
 
-        last_online_node = last_node(node_online_map);
-        max_node_index = min_t(uint32_t, ni->max_node_index, last_online_node);
-        ni->max_node_index = last_online_node;
+        num_nodes = last_node(node_online_map) + 1;
 
-        for ( i = 0; i <= max_node_index; i++ )
+        if ( do_meminfo || do_distance )
         {
-            if ( !guest_handle_is_null(ni->node_to_memsize) )
+            if ( ni->num_nodes < num_nodes )
             {
-                uint64_t memsize = node_online(i) ?
-                                   node_spanned_pages(i) << PAGE_SHIFT : 0ul;
-                if ( copy_to_guest_offset(ni->node_to_memsize, i, &memsize, 1) 
)
-                    break;
-            }
-            if ( !guest_handle_is_null(ni->node_to_memfree) )
-            {
-                uint64_t memfree = node_online(i) ?
-                                   avail_node_heap_pages(i) << PAGE_SHIFT : 
0ul;
-                if ( copy_to_guest_offset(ni->node_to_memfree, i, &memfree, 1) 
)
-                    break;
+                ret = -ENOBUFS;
+                i = num_nodes;
             }
+            else
+                i = 0;
 
-            if ( !guest_handle_is_null(ni->node_to_node_distance) )
+            for ( ; i < num_nodes; i++ )
             {
-                for ( j = 0; j <= max_node_index; j++)
+                xen_sysctl_meminfo_t meminfo;
+                static uint32_t distance[MAX_NUMNODES];
+
+                if ( do_meminfo )
                 {
-                    uint32_t distance = ~0u;
-                    if ( node_online(i) && node_online(j) )
+                    if ( node_online(i) )
+                    {
+                        meminfo.memsize = node_spanned_pages(i) << PAGE_SHIFT;
+                        meminfo.memfree = avail_node_heap_pages(i) << 
PAGE_SHIFT;
+                    }
+                    else
+                        meminfo.memsize = meminfo.memfree = XEN_INVALID_MEM_SZ;
+
+                    if ( copy_to_guest_offset(ni->meminfo, i, &meminfo, 1) )
                     {
-                        u8 d = __node_distance(i, j);
+                        ret = -EFAULT;
+                        break;
+                    }
+                }
 
-                        if ( d != NUMA_NO_DISTANCE )
-                            distance = d;
+                if ( do_distance )
+                {
+                    for ( j = 0; j < num_nodes; j++ )
+                    {
+                        distance[j] = __node_distance(i, j);
+                        if ( distance[j] == NUMA_NO_DISTANCE )
+                            distance[j] = XEN_INVALID_NODE_DIST;
                     }
-                    if ( copy_to_guest_offset(
-                        ni->node_to_node_distance,
-                        i*(max_node_index+1) + j, &distance, 1) )
+
+                    if ( copy_to_guest_offset(ni->distance, i * num_nodes,
+                                              distance, num_nodes) )
+                    {
+                        ret = -EFAULT;
                         break;
+                    }
                 }
-                if ( j <= max_node_index )
-                    break;
             }
         }
+        else
+            i = num_nodes;
 
-        ret = ((i <= max_node_index) || copy_to_guest(u_sysctl, op, 1))
-            ? -EFAULT : 0;
+        if ( (!ret || (ret == -ENOBUFS)) && (ni->num_nodes != i) )
+        {
+            ni->num_nodes = i;
+            if ( __copy_field_to_guest(u_sysctl, op,
+                                       u.numainfo.num_nodes) )
+            {
+                ret = -EFAULT;
+                break;
+            }
+        }
     }
     break;
 
diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h
index 711441f..021d505 100644
--- a/xen/include/public/sysctl.h
+++ b/xen/include/public/sysctl.h
@@ -494,34 +494,42 @@ typedef struct xen_sysctl_cputopoinfo 
xen_sysctl_cputopoinfo_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cputopoinfo_t);
 
 /* XEN_SYSCTL_numainfo */
-#define INVALID_NUMAINFO_ID (~0U)
+#define XEN_INVALID_MEM_SZ     (~0U)
+#define XEN_INVALID_NODE_DIST  (~0U)
+
+struct xen_sysctl_meminfo {
+    uint64_t memsize;
+    uint64_t memfree;
+};
+typedef struct xen_sysctl_meminfo xen_sysctl_meminfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_meminfo_t);
+
+/*
+ * IN:
+ *  - Both 'meminfo' and 'distance' handles being null is a request
+ *    for maximum value of 'num_nodes'.
+ *  - Otherwise it's the number of entries in 'meminfo' and square root
+ *    of number of entries in 'distance' (when corresponding handle is
+ *    non-null)
+ *
+ * OUT:
+ *  - If 'num_nodes' is less than the number Xen needs to write, -ENOBUFS shall
+ *    be returned and 'num_nodes' updated to reflect the intended number.
+ *  - On success, 'num_nodes' shall indicate the number of entries written, 
which
+ *    may be less than the maximum.
+ */
+
 struct xen_sysctl_numainfo {
-    /*
-     * IN: maximum addressable entry in the caller-provided arrays.
-     * OUT: largest node identifier in the system.
-     * If OUT is greater than IN then the arrays are truncated!
-     */
-    uint32_t max_node_index;
+    uint32_t num_nodes;
 
-    /* NB. Entries are 0 if node is not present. */
-    XEN_GUEST_HANDLE_64(uint64) node_to_memsize;
-    XEN_GUEST_HANDLE_64(uint64) node_to_memfree;
+    XEN_GUEST_HANDLE_64(xen_sysctl_meminfo_t) meminfo;
 
     /*
-     * Array, of size (max_node_index+1)^2, listing memory access distances
-     * between nodes. If an entry has no node distance information (e.g., node 
-     * not present) then the value ~0u is written.
-     * 
-     * Note that the array rows must be indexed by multiplying by the minimum 
-     * of the caller-provided max_node_index and the returned value of
-     * max_node_index. That is, if the largest node index in the system is
-     * smaller than the caller can handle, a smaller 2-d array is constructed
-     * within the space provided by the caller. When this occurs, trailing
-     * space provided by the caller is not modified. If the largest node index
-     * in the system is larger than the caller can handle, then a 2-d array of
-     * the maximum size handleable by the caller is constructed.
+     * Distance between nodes 'i' and 'j' is stored in index 'i*N + j',
+     * where N is the number of nodes that will be returned in 'num_nodes'
+     * (i.e. not 'num_nodes' provided by the caller)
      */
-    XEN_GUEST_HANDLE_64(uint32) node_to_node_distance;
+    XEN_GUEST_HANDLE_64(uint32) distance;
 };
 typedef struct xen_sysctl_numainfo xen_sysctl_numainfo_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_numainfo_t);
-- 
1.7.1


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.