[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 1 of 4 RFC] xl/remus : Network buffering setup helper functions



The functions contained in xl_netbuf.c are built on top of the libnl3 API.
They set up the infrastructure required for network output
buffering in Remus.  In xend, the Remus python code performed this setup
by invoking shell commands.  This code is built purely on top of APIs
supplied by libnl3.0.

There are two public helper functions:
1) remus_install_netbuf_on_dev to install a network buffer on a vif.
It's basically equivalent to:
a)find free ifb (say ifb0)
b)ip link set ifb0 up
c)tc qdisc add dev vif1.0 ingress
d)tc filter add dev vif1.0 parent ffff: proto ip pref 10 u32 match u32 0 0 \
        action mirred egress redirect dev ifb0
e)tc qdisc add dev ifb0 root plug
f)get handle to plug qdisc and control it programmatically after suspending
  a VM/receiving checkpoint ack from remote


2) remus_uninstall_netbufs to remove all
network buffers and ingress qdiscs from the supplied interface list.

I have managed to code up 5 of the 6 steps mentioned above, purely in C, by
building on top of libnl3. There is currently no support in libnl3 to
implement step (d) (redirection), and it is pretty complicated to implement.
So I am currently resorting to the system() lib call to get the job done.

N.B. This implementation is specific to xl (the cmdline utility). Other
toolstacks may choose to reuse this code or do this setup in their own way
(scripts/shell commands, etc.).

Signed-off-by: Shriram Rajagopalan <rshriram@xxxxxxxxx>

diff -r 711fe8ed54e4 -r 3ae38cbe535c tools/libxl/xl.h
--- a/tools/libxl/xl.h  Wed Jul 24 22:48:09 2013 -0700
+++ b/tools/libxl/xl.h  Wed Jul 24 22:55:00 2013 -0700
@@ -161,6 +161,9 @@ enum output_format {
 extern enum output_format default_output_format;
 
 extern void printf_info_sexp(int domid, libxl_domain_config *d_config);
+extern char *remus_install_netbuf_on_dev(char *vifname);
+extern void remus_uninstall_netbufs(char **vifs, int num_vifs,
+                                    char **ifbs, int num_ifbs);
 
 #define XL_GLOBAL_CONFIG XEN_CONFIG_DIR "/xl.conf"
 #define XL_LOCK_FILE XEN_LOCK_DIR "/xl"
diff -r 711fe8ed54e4 -r 3ae38cbe535c tools/libxl/xl_netbuf.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxl/xl_netbuf.c   Wed Jul 24 22:55:00 2013 -0700
@@ -0,0 +1,441 @@
+/*
+ * xl command line utility - helper functions for setting up 
+ *  remus network buffering.
+ */
+
+#include "libxl_osdeps.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <regex.h>
+#include <xentoollog.h>
+
+#include <netlink/cache.h>
+#include <netlink/socket.h>
+#include <netlink/attr.h>
+#include <netlink/route/link.h>
+#include <netlink/route/route.h>
+#include <netlink/route/qdisc.h>
+#include <netlink/route/qdisc/plug.h>
+
+#include "libxl.h"
+#include "libxl_utils.h"
+#include "xl.h"
+
+/*
+ * Report a bad argument on stderr, tagged with the calling function name
+ * and line number, then return y from the ENCLOSING function.
+ * x is only stringified into the message; it is never evaluated.
+ * Uses the standard C99 __func__ rather than the GNU-specific __FUNCTION__.
+ */
+#define RET_BAD_ARG(x, y)                                               \
+    do {                                                                \
+        fprintf(stderr, "%s:%d bad argument "#x"\n", __func__, __LINE__); \
+        return y;                                                       \
+    } while (0)
+
+/*
+ * Scan the link cache for an ifb device (name matching ^ifb[0-9]+$) that
+ * is not already in use. An ifb counts as free when it has no root qdisc
+ * in the qdisc cache, or when its root qdisc is one of pfifo_fast, mq or
+ * ingress.
+ *
+ * Returns the link obtained through rtnl_link_get() (the caller releases
+ * it with rtnl_link_put()), or NULL when an argument is NULL, the regex
+ * cannot be compiled, or no free ifb exists.
+ */
+static struct rtnl_link *get_free_ifbdev(struct nl_cache *link_cache,
+                                         struct nl_cache *qdisc_cache)
+{
+    struct nl_object *o = NULL;
+    struct rtnl_link *temp = NULL, *ifb = NULL;
+    struct rtnl_qdisc *qdisc = NULL;
+    char *ifbname = NULL;
+    regex_t ifbregex;
+    int ifindex;
+
+    if (!link_cache) RET_BAD_ARG(link_cache, NULL);
+    if (!qdisc_cache) RET_BAD_ARG(qdisc_cache, NULL);
+
+    if (regcomp(&ifbregex, "^ifb[0-9]+$", REG_EXTENDED|REG_NOSUB)) {
+        fprintf(stderr, "Failed to alloc regex while "
+                "searching for free ifbs\n");
+        return NULL;
+    }
+
+    for (o = nl_cache_get_first(link_cache); o; o = nl_cache_get_next(o)) {
+        int in_use = 0;
+
+        temp = (struct rtnl_link *)o;
+        ifbname = rtnl_link_get_name(temp);
+
+        if (!ifbname || regexec(&ifbregex, ifbname, 0, NULL, 0))
+            continue;
+
+        ifindex = rtnl_link_get_ifindex(temp);
+        if (!ifindex) continue;
+
+        /* found an IFB. check if it has a qdisc on root */
+        qdisc = rtnl_qdisc_get_by_parent(qdisc_cache, ifindex, TC_H_ROOT);
+        if (qdisc) {
+            char *kind = rtnl_tc_get_kind(TC_CAST(qdisc));
+
+            /*
+             * Examine kind BEFORE dropping our reference, since the string
+             * is obtained from the qdisc object. A qdisc that reports no
+             * kind cannot be recognised as a default one, so treat the
+             * device as busy instead of passing NULL to strcmp().
+             */
+            in_use = !kind ||
+                     (strcmp(kind, "pfifo_fast") && strcmp(kind, "mq")
+                      && strcmp(kind, "ingress"));
+            rtnl_qdisc_put(qdisc);
+        }
+        if (in_use)
+            continue;
+
+        ifb = rtnl_link_get(link_cache, ifindex);
+        if (!ifb)
+            continue; /* lookup failed; do not claim a device we lack */
+
+        printf("Acquired IFB dev %s for network buffering\n", ifbname);
+        break;
+    }
+
+    if (!ifb)
+        fprintf(stderr, "No free ifb device available\n");
+
+    regfree(&ifbregex);
+    return ifb;
+}
+
+/*
+ * Bring the interface named ifname up (ifup != 0) or down (ifup == 0) by
+ * sending an RTM_NEWLINK request on sock and waiting for the ack.
+ *
+ * Returns 0 on success, -1 on bad arguments, allocation/encoding failure
+ * or when the request is rejected.
+ */
+static int do_net_ifupdown(struct nl_sock *sock, char *ifname, int ifup)
+{
+    struct nl_msg *msg;
+    struct nlmsghdr *nlh;
+    struct ifinfomsg ifm;
+    int ret;
+
+    if (!sock) RET_BAD_ARG(sock, -1);
+    if (!ifname || !strlen(ifname)) RET_BAD_ARG(ifname, -1);
+
+    msg = nlmsg_alloc(); /* allocates 4K msg by default */
+    if (!msg) {
+        fprintf(stderr,
+                "failed to allocate nl_msg for bringing %s interface %s\n",
+                ifup ? "up" : "down", ifname);
+        return -1;
+    }
+
+    nlh = nlmsg_hdr(msg);
+    nlh->nlmsg_type = RTM_NEWLINK;
+    nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+
+    /*
+     * Zero the whole header first: the device is identified by the
+     * IFLA_IFNAME attribute below, so ifi_index and ifi_type must not
+     * carry stack garbage.
+     */
+    memset(&ifm, 0, sizeof(ifm));
+    ifm.ifi_family = AF_UNSPEC;
+    ifm.ifi_change = IFF_UP;            /* only the IFF_UP bit is touched */
+    ifm.ifi_flags = ifup ? IFF_UP : 0;
+
+    if (nlmsg_append(msg, &ifm, sizeof(ifm), NLMSG_ALIGNTO) < 0)
+        goto nla_put_failure;
+    NLA_PUT_STRING(msg, IFLA_IFNAME, ifname); /* jumps to nla_put_failure */
+
+    ret = nl_send_sync(sock, msg); /* this call also frees msg */
+
+    if (!ret) {
+        printf("IFB dev %s link %s\n", ifname, ifup ? "up" : "down");
+        return 0;
+    } else {
+        fprintf(stderr, "Unable to bring %s interface %s : %s\n",
+                ifup ? "up" : "down",
+                ifname, nl_geterror(ret));
+        return -1;
+    }
+
+ nla_put_failure:
+    fprintf(stderr, "failed to create nl_msg for bringing %s interface %s\n",
+            ifup ? "up" : "down", ifname);
+    nlmsg_free(msg);
+    return -1;
+}
+
+/*
+ * Redirect all traffic arriving on fromdev's ingress qdisc to todev by
+ * running the tc command line tool through system().
+ *
+ * Returns 0 when the command ran and exited with status 0, -1 otherwise
+ * (bad argument, out of memory, system() failure or non-zero status).
+ */
+static int tc_redirect_traffic(char *fromdev, char *todev)
+{
+    /* command:
+     * tc filter add dev vif1.0 parent ffff: proto ip pref 10
+     *    u32 match u32 0 0 action mirred egress redirect dev ifb0
+     */
+    char *tc_cmd = NULL;
+    int status;
+
+    if (!fromdev) RET_BAD_ARG(fromdev, -1);
+    if (!todev) RET_BAD_ARG(todev, -1);
+
+    /*
+     * NOTE(review): fromdev/todev are interpolated verbatim into a shell
+     * command line. Callers must only pass trusted interface names;
+     * consider fork()+execvp() with an argv array to avoid the shell.
+     */
+    if (asprintf(&tc_cmd,
+                 "exec tc filter add dev %s parent ffff: "
+                 "proto ip pref 10 u32 match u32 0 0 "
+                 "act mirred egress redirect dev %s",
+                 fromdev, todev) < 0) {
+        fprintf(stderr, "Failed to compose tc redirection command (%s->%s)\n",
+                fromdev, todev);
+        return -1;
+    }
+
+    status = system(tc_cmd);
+    free(tc_cmd);
+
+    /*
+     * system() returns -1 when the child could not be created, otherwise
+     * the wait status of the shell. errno says nothing about a command
+     * that merely exited non-zero, so any non-zero status is a failure.
+     */
+    if (status == -1) {
+        fprintf(stderr, "Failed to run tc redirection command (%s->%s)\n",
+                fromdev, todev);
+        return -1;
+    }
+    if (status != 0) {
+        fprintf(stderr, "tc redirection command failed (%s->%s), "
+                "wait status %d\n", fromdev, todev, status);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Build a qdisc request of the given kind, attached to interface under
+ * parent, and submit it on sock: an exclusive create when add is
+ * non-zero, a delete otherwise.
+ *
+ * Returns the result of rtnl_qdisc_add()/rtnl_qdisc_delete(), or -1 on a
+ * NULL argument or allocation failure.
+ */
+static int do_qdisc_add_or_del(struct nl_sock *sock,
+                               struct rtnl_link *interface,
+                               char *kind, uint32_t parent, int add)
+{
+    struct rtnl_qdisc *req;
+    struct rtnl_tc *tc;
+    int rc;
+
+    if (!sock) RET_BAD_ARG(sock, -1);
+    if (!interface) RET_BAD_ARG(interface, -1);
+    if (!kind) RET_BAD_ARG(kind, -1);
+
+    req = rtnl_qdisc_alloc();
+    if (req == NULL) {
+        fprintf(stderr, "Failed to allocate %s qdisc\n", kind);
+        return -1;
+    }
+
+    /* Describe which qdisc, on which device, the request refers to. */
+    tc = TC_CAST(req);
+    rtnl_tc_set_link(tc, interface);
+    rtnl_tc_set_parent(tc, parent);
+    rtnl_tc_set_kind(tc, kind);
+
+    /* Hand the request to the kernel and wait for its answer. */
+    rc = add ? rtnl_qdisc_add(sock, req, NLM_F_CREATE|NLM_F_EXCL)
+             : rtnl_qdisc_delete(sock, req);
+
+    rtnl_qdisc_put(req);
+    return rc;
+}
+
+/* Bring interface ifname up; returns do_net_ifupdown()'s result. */
+static int netdev_ifup(struct nl_sock *sock, char *ifname)
+{
+    const int bring_up = 1;
+
+    return do_net_ifupdown(sock, ifname, bring_up);
+}
+
+/* Bring interface ifname down; returns do_net_ifupdown()'s result. */
+static int netdev_ifdown(struct nl_sock *sock, char *ifname)
+{
+    const int bring_up = 0;
+
+    return do_net_ifupdown(sock, ifname, bring_up);
+}
+
+/*
+ * Install a "plug" qdisc as the root qdisc of ifb.
+ *
+ * This duplicates do_qdisc_add_or_del() because the plug qdisc needs an
+ * initial buffer limit of at least 4MB set before it is submitted. The
+ * default limit of 100 packets (set by the sch_plug kernel module) is too
+ * small to hold all packets from a VM during an epoch, which results in
+ * packet drops.
+ *
+ * Returns the rtnl_qdisc_add() result, the negative error from setting
+ * the limit, or -1 on a NULL argument or allocation failure.
+ */
+#define PLUG_QDISC_LIMIT (4 * 1024 * 1024)
+static int add_plug_qdisc(struct nl_sock *sock, struct rtnl_link *ifb)
+{
+    char *kind = "plug";
+    struct rtnl_qdisc *plug;
+    struct rtnl_tc *tc;
+    int rc;
+
+    if (!sock) RET_BAD_ARG(sock, -1);
+    if (!ifb) RET_BAD_ARG(ifb, -1);
+
+    plug = rtnl_qdisc_alloc();
+    if (plug == NULL) {
+        fprintf(stderr, "Failed to allocate %s qdisc\n", kind);
+        return -1;
+    }
+
+    tc = TC_CAST(plug);
+    rtnl_tc_set_link(tc, ifb);
+    rtnl_tc_set_parent(tc, TC_H_ROOT);
+    rtnl_tc_set_kind(tc, kind);
+
+    rc = rtnl_qdisc_plug_set_limit(plug, PLUG_QDISC_LIMIT);
+    if (rc < 0) {
+        fprintf(stderr, "Unable to change plug buffer size: %s\n",
+                nl_geterror(rc));
+    } else {
+        /* Limit accepted: submit to the kernel and wait for the reply. */
+        rc = rtnl_qdisc_add(sock, plug, NLM_F_CREATE|NLM_F_EXCL);
+    }
+
+    rtnl_qdisc_put(plug);
+    return rc;
+}
+
+/* Request deletion of the root "plug" qdisc on ifb. */
+static int del_plug_qdisc(struct nl_sock *sock, struct rtnl_link *ifb)
+{
+    const int add = 0;
+
+    return do_qdisc_add_or_del(sock, ifb, "plug", TC_H_ROOT, add);
+}
+
+/*
+ * Add an ingress qdisc to vif. An ingress qdisc that is already installed
+ * (-NLE_EXIST) is treated as success.
+ */
+static int add_ingress_qdisc(struct nl_sock *sock, struct rtnl_link *vif)
+{
+    int rc = do_qdisc_add_or_del(sock, vif, "ingress", TC_H_INGRESS, 1);
+
+    return (rc == -NLE_EXIST) ? 0 : rc;
+}
+
+/* Request deletion of the ingress qdisc on vif. */
+static int del_ingress_qdisc(struct nl_sock *sock, struct rtnl_link *vif)
+{
+    const int add = 0;
+
+    return do_qdisc_add_or_del(sock, vif, "ingress", TC_H_INGRESS, add);
+}
+
+/*
+ * When Remus (xend version) installs a network buffer on a guest vif,
+ * it does the following (using a mix of shell commands and netlink messages)
+ *
+ *  ip link set dev ifb0 up
+ *  tc qdisc add dev vif1.0 ingress
+ *  tc filter add dev vif1.0 parent ffff: proto ip \
+ *    prio 10 u32 match u32 0 0 action mirred egress redirect dev ifb0
+ *  send netlink message to kernel to add plug_qdisc to ifb0 and get
+ *     handle
+ *  use handle to control operations of plug qdisc.
+ *
+ * So order of operations when installing a network buffer on vif1.0
+ * 1. find a free ifb and bring up the device
+ * 2. add ingress qdisc to vif1.0 (to capture outgoing packets from guest)
+ * 3. redirect traffic from vif1.0 to ifb device
+ * 4. install plug_qdisc on ifb device, with which we can buffer/release
+ *    guest's network output from vif1.0
+ *
+ * On failure, only the steps that this call completed are rolled back.
+ *
+ * Return value: ifb dev name on success (a strdup()ed string the caller
+ * must free()). NULL on failure.
+ */
+char *remus_install_netbuf_on_dev(char *vifname)
+{
+    struct nl_cache *link_cache = NULL, *qdisc_cache = NULL;
+    struct rtnl_link *vif = NULL;
+    struct rtnl_link *ifb = NULL;
+    struct nl_sock *sock = NULL;
+    char *ifbname = NULL;
+    char *ifbdev = NULL;
+    int ifb_up = 0;         /* step 1 completed: ifb was brought up */
+    int ingress_added = 0;  /* step 2 completed: ingress qdisc on vif */
+    int ret = -1;
+
+    if (!vifname) return NULL;
+
+    sock = nl_socket_alloc();
+    if (!sock) {
+        fprintf(stderr, "failed to allocate libnl socket\n");
+        return NULL;
+    }
+
+    if ((ret = nl_connect(sock, NETLINK_ROUTE)) < 0) {
+        fprintf(stderr, "failed to connect libnl socket : %s\n",
+                nl_geterror(ret));
+        goto end;
+    }
+
+    /* get the link link_cache (equivalent to ifconfig -a -s) */
+    if ((ret = rtnl_link_alloc_cache(sock, AF_UNSPEC, &link_cache)) < 0) {
+        fprintf(stderr, "failed to allocate link cache : %s\n",
+                nl_geterror(ret));
+        goto end;
+    }
+
+    /* similarly, get the qdisc cache, i.e. list of all qdiscs
+     * installed currently on various network interfaces.
+     */
+    if ((ret = rtnl_qdisc_alloc_cache(sock, &qdisc_cache)) < 0) {
+        fprintf(stderr, "failed to allocate qdisc cache : %s\n",
+                nl_geterror(ret));
+        goto end;
+    }
+
+    ret = -1;
+    vif = rtnl_link_get_by_name(link_cache, vifname);
+    if (!vif) {
+        /* link does not exist */
+        fprintf(stderr, "interface %s does not exist\n", vifname);
+        goto end;
+    }
+
+    /* 1. find a free ifb and bring up the device */
+    ifb = get_free_ifbdev(link_cache, qdisc_cache);
+    if (!ifb) goto end;
+    ifbdev = rtnl_link_get_name(ifb);
+
+    ret = netdev_ifup(sock, ifbdev);
+    if (ret) goto end;
+    ifb_up = 1;
+
+    /* 2. add ingress qdisc to vif1.0 (to catch outgoing packets from guest) */
+    ret = add_ingress_qdisc(sock, vif);
+    if (ret) goto end;
+    ingress_added = 1;
+
+    /* 3. redirect traffic from vif1.0 to ifb device */
+    ret = tc_redirect_traffic(vifname, ifbdev);
+    if (ret) goto end;
+
+    /* 4. install plug_qdisc on ifb device, with which we can buffer/release
+     *    guest's network output from vif1.0
+     */
+    ret = add_plug_qdisc(sock, ifb);
+    if (ret) {
+        fprintf(stderr, "Unable to add plug qdisc: %s\n", nl_geterror(ret));
+        goto end;
+    }
+
+    ifbname = strdup(ifbdev);
+    if (!ifbname) {
+        perror("Failed to allocate mem for ifbname!");
+        ret = -1; /* fall through to the rollback below */
+        goto end;
+    }
+    fprintf(stderr, "Added plug qdisc to %s\n", ifbname);
+
+end:
+    if (ret) {
+        /* Undo only what this call set up. */
+        if (ingress_added)
+            del_ingress_qdisc(sock, vif);
+        if (ifb_up) {
+            /*
+             * To release ifb dev, first bring it down and then remove
+             * the qdisc. Otherwise, the kernel will replace plug with
+             * pfifo_fast and it will remain attached to the ifb even
+             * if we bring down the device.
+             */
+            netdev_ifdown(sock, ifbdev);
+            del_plug_qdisc(sock, ifb);
+        }
+    }
+
+    if (vif) rtnl_link_put(vif);
+    if (ifb) rtnl_link_put(ifb);
+    if (link_cache) nl_cache_free(link_cache);
+    if (qdisc_cache) nl_cache_free(qdisc_cache);
+    /* nl_close() only closes the fd; the handle itself must be freed too. */
+    nl_close(sock);
+    nl_socket_free(sock);
+    return ifbname;
+}
+
+/*
+ * Cleanup code. Remove ingress qdisc from the supplied list of vif
+ * interfaces. Bring down the ifb devices and remove the plug qdiscs
+ * from each of them, thereby releasing the ifbs back into the ifb pool.
+ *
+ * vifs/ifbs may each be NULL (that list is then skipped), and individual
+ * entries may be NULL. Interfaces that can no longer be looked up are
+ * skipped silently. Failures of the individual teardown steps are not
+ * reported to the caller.
+ */
+void remus_uninstall_netbufs(char **vifs, int num_vifs,
+                             char **ifbs, int num_ifbs)
+{
+    struct rtnl_link *netdev;
+    struct nl_sock *sock = NULL;
+    int i, ret;
+
+    if (!vifs && !ifbs) return;
+
+    sock = nl_socket_alloc();
+    if (!sock) {
+        fprintf(stderr, "failed to allocate libnl socket "
+                "while uninstalling netbufs\n");
+        return;
+    }
+
+    if ((ret = nl_connect(sock, NETLINK_ROUTE)) < 0) {
+        fprintf(stderr, "failed to connect libnl socket "
+                "while uninstalling netbufs : %s\n", nl_geterror(ret));
+        nl_socket_free(sock);
+        return;
+    }
+
+    /* We cant use link caches when tearing down network buffering because
+     * someone may have destroyed the domU, thus tearing down the
+     * interfaces itself. As a safer alternative, we will resort to
+     * un-cached direct queries to the kernel for each vif interface.
+     *
+     * Each successful rtnl_link_get_kernel() stores a fresh link object
+     * in netdev, so it is released right after use instead of being
+     * overwritten (and leaked) by the next query.
+     */
+    for (i = 0; ifbs && i < num_ifbs; i++) {
+        if (!ifbs[i])
+            continue;
+        netdev = NULL;
+        if (rtnl_link_get_kernel(sock, 0, ifbs[i], &netdev) < 0)
+            continue;
+        netdev_ifdown(sock, ifbs[i]);
+        del_plug_qdisc(sock, netdev);
+        rtnl_link_put(netdev);
+    }
+
+    for (i = 0; vifs && i < num_vifs; i++) {
+        if (!vifs[i])
+            continue;
+        netdev = NULL;
+        if (rtnl_link_get_kernel(sock, 0, vifs[i], &netdev) < 0)
+            continue;
+        del_ingress_qdisc(sock, netdev);
+        rtnl_link_put(netdev);
+    }
+
+    /* nl_close() only closes the fd; the handle itself must be freed too. */
+    nl_close(sock);
+    nl_socket_free(sock);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.