[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [patch/rfc] xenbus and driver issues with kexec


  • To: Xen devel list <xen-devel@xxxxxxxxxxxxxxxxxxx>
  • From: Gerd Hoffmann <kraxel@xxxxxxx>
  • Date: Mon, 14 Aug 2006 17:17:52 +0200
  • Delivery-date: Mon, 14 Aug 2006 08:20:44 -0700
  • List-id: Xen developer discussion <xen-devel.lists.xensource.com>

  Hi,

There are a number of problems in xenbus and the paravirtualized drivers
when it comes to kexec'ing a domU kernel.  It's not specific to kexec,
but to handing over devices to someone else within the same domU.  The
same issues will come up when someone tries to combine mini-os and grub
to build some kind of in-domain bootloader.

Fundamental issue is that there is no way to cleanly shutdown the
frontend/backend drivers without destroying the domain and the backend
devices.  The patch addresses some but not all of the problems arising
from that.  It also has some debug stuff left in and needs some more
polishing before final submission.

Here are the changes:

  (1) ballon: print unconditionally driver_pages in /proc/xen/balloon.
  (2) xenbus: unregister watches in xs_suspend()
  (3) blk/netback: allow Closing -> Initialising state transition.
  (4) blk/netfront: shutdown (but *not* destroy) devices
      when devices_shutdown is called, by jumping from Closing to
      Initialising state.
  (5) Some cleanups along the way.

Not solved yet:

The netfront driver leaks pages in case it is shutdown while the
interface is active.  Problem is that it can't get back the rx buffers,
the netback driver sits on them and doesn't release them until it has
some packet data to fill in.  The patch adds a netif_release_rx_bufs()
function, but it doesn't work reliable, usually it gets back only a few
pages.

To reproduce this you don't need to play with kexec.  Just apply the
balloon driver patch.  Boot a domU with the new kernel.  Configure the
eth0 interface, "xm network-detach ...", "xm network-attach", repeat.
Watch driver_pages grow in /proc/xen/balloon.

cheers,

  Gerd

PS: documentation and complete domU kexec patches are available from
    http://www.suse.de/~kraxel/xen/kexec.html

-- 
Gerd Hoffmann <kraxel@xxxxxxx>
http://www.suse.de/~kraxel/julika-dora.jpeg
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c
--- a/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c        Sun Aug 13 
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c        Mon Aug 14 
12:49:11 2006 +0200
@@ -440,15 +440,17 @@ static int balloon_read(char *page, char
                "Requested target:   %8lu kB\n"
                "Low-mem balloon:    %8lu kB\n"
                "High-mem balloon:   %8lu kB\n"
+               "Driver pages:       %8lu kB\n"
                "Xen hard limit:     ",
                PAGES2KB(current_pages), PAGES2KB(target_pages), 
-               PAGES2KB(balloon_low), PAGES2KB(balloon_high));
+               PAGES2KB(balloon_low), PAGES2KB(balloon_high),
+               PAGES2KB(driver_pages));
 
        if (hard_limit != ~0UL) {
                len += sprintf(
                        page + len, 
-                       "%8lu kB (inc. %8lu kB driver headroom)\n",
-                       PAGES2KB(hard_limit), PAGES2KB(driver_pages));
+                       "%8lu kB\n",
+                       PAGES2KB(hard_limit));
        } else {
                len += sprintf(
                        page + len,
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Sun Aug 13 09:44:07 
2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Mon Aug 14 12:49:11 
2006 +0200
@@ -305,6 +305,11 @@ static void frontend_changed(struct xenb
 
        switch (frontend_state) {
        case XenbusStateInitialising:
+               if (dev->state == XenbusStateClosing) {
+                       printk("%s: %s: prepare for reconnect\n",
+                              __FUNCTION__, dev->nodename);
+                       xenbus_switch_state(dev, XenbusStateInitWait);
+               }
                break;
 
        case XenbusStateInitialised:
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c      Sun Aug 13 
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c      Mon Aug 14 
12:49:11 2006 +0200
@@ -272,10 +272,13 @@ static void backend_changed(struct xenbu
                        xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
 
                down(&bd->bd_sem);
-               if (info->users > 0)
+               if (info->users)
+                       printk("%-20s: %s: %d user(s) left\n", __FUNCTION__,
+                              dev->nodename, info->users);
+               if (info->users > 0 && !dev->shutdown_in_progress) {
                        xenbus_dev_error(dev, -EBUSY,
                                         "Device in use; refusing to close");
-               else
+               } else
                        blkfront_closing(dev);
                up(&bd->bd_sem);
                bdput(bd);
@@ -359,7 +362,7 @@ static void blkfront_closing(struct xenb
 
        xlvbd_del(info);
 
-       xenbus_switch_state(dev, XenbusStateClosed);
+       xenbus_closing_done(dev);
 }
 
 
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Sun Aug 13 09:44:07 
2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Mon Aug 14 12:49:11 
2006 +0200
@@ -228,10 +228,25 @@ static void frontend_changed(struct xenb
 
        switch (frontend_state) {
        case XenbusStateInitialising:
+               if (dev->state == XenbusStateClosing) {
+                       printk("%s: %s: prepare for reconnect\n",
+                              __FUNCTION__, dev->nodename);
+                       if (be->netif) {
+                               netif_disconnect(be->netif);
+                               be->netif = NULL;
+                       }
+                       xenbus_switch_state(dev, XenbusStateInitWait);
+               }
+               break;
+
        case XenbusStateInitialised:
                break;
 
        case XenbusStateConnected:
+               if (!be->netif) {
+                       /* reconnect: setup be->netif */
+                       backend_changed(&be->backend_watch, NULL, 0);
+               }
                maybe_connect(be);
                break;
 
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/netfront/Makefile
--- a/linux-2.6-xen-sparse/drivers/xen/netfront/Makefile        Sun Aug 13 
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/netfront/Makefile        Mon Aug 14 
12:49:11 2006 +0200
@@ -1,3 +1,4 @@
+EXTRA_CFLAGS += -DDEBUG=1
 
 obj-$(CONFIG_XEN_NETDEV_FRONTEND)      := xennet.o
 
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c      Sun Aug 13 
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c      Mon Aug 14 
12:49:11 2006 +0200
@@ -436,7 +436,7 @@ static void backend_changed(struct xenbu
        struct netfront_info *np = dev->dev.driver_data;
        struct net_device *netdev = np->netdev;
 
-       DPRINTK("\n");
+       DPRINTK("%s\n", xenbus_strstate(backend_state));
 
        switch (backend_state) {
        case XenbusStateInitialising:
@@ -447,9 +447,11 @@ static void backend_changed(struct xenbu
                break;
 
        case XenbusStateInitWait:
-               network_connect(netdev);
-               xenbus_switch_state(dev, XenbusStateConnected);
-               (void)send_fake_arp(netdev);
+               if (!dev->shutdown_in_progress) {
+                       network_connect(netdev);
+                       xenbus_switch_state(dev, XenbusStateConnected);
+                       (void)send_fake_arp(netdev);
+               }
                break;
 
        case XenbusStateClosing:
@@ -492,6 +494,7 @@ static int network_open(struct net_devic
 {
        struct netfront_info *np = netdev_priv(dev);
 
+       DPRINTK("%s\n", np->xbdev->nodename);
        memset(&np->stats, 0, sizeof(np->stats));
 
        network_alloc_rx_buffers(dev);
@@ -651,6 +654,8 @@ no_skb:
            ((np->rx_target *= 2) > np->rx_max_target))
                np->rx_target = np->rx_max_target;
 
+//     printk("%s: %d bufs, target %d\n", __FUNCTION__, i, np->rx_target);
+       
  refill:
        for (i = 0; ; i++) {
                if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
@@ -1223,6 +1228,7 @@ err:
 
        /* Some pages are no longer absent... */
        balloon_update_driver_allowance(-pages_done);
+//     printk("%s: %d driver pages\n", __FUNCTION__, work_done);
 
        /* Do all the remapping work, and M2P updates, in one big hypercall. */
        if (likely(pages_done)) {
@@ -1283,10 +1289,80 @@ err:
        return more_to_do;
 }
 
+static void netif_release_rx_bufs(struct netfront_info *np)
+{
+       struct mmu_update      *mmu = np->rx_mmu;
+       struct multicall_entry *mcl = np->rx_mcl;
+       struct sk_buff *skb;
+       unsigned long mfn;
+       int bufs = 0, gnttab = 0, unused = 0;
+       int id, ref;
+
+       spin_lock(&np->rx_lock);
+
+       for (id = 0; id < NET_RX_RING_SIZE; id++) {
+               if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) {
+                       unused++;
+                       continue;
+               }
+               if ((mfn = gnttab_end_foreign_transfer_ref(ref)) == 0) {
+                       gnttab++;
+                       continue;
+               }
+
+               gnttab_release_grant_reference(&np->gref_rx_head, ref);
+               np->grant_rx_ref[id] = GRANT_INVALID_REF;
+
+               skb = np->rx_skbs[id];
+               add_id_to_freelist(np->rx_skbs, id);
+
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       /* Remap the page. */
+                       MULTI_update_va_mapping(mcl, (unsigned long)skb->head,
+                                               pfn_pte_ma(mfn, PAGE_KERNEL),
+                                               0);
+                       mcl++;
+                       mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
+                               | MMU_MACHPHYS_UPDATE;
+                       mmu->val = __pa(skb->head) >> PAGE_SHIFT;
+                       mmu++;
+
+                       set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
+                                           mfn);
+               }
+               bufs++;
+
+#if 0 /* FIXME */
+               dev_kfree_skb(skb);
+#endif
+       }
+
+       printk("%s: %d released ok, %d gnttab errs, %d unused slots\n",
+              __FUNCTION__, bufs, gnttab, unused);
+       if (0 == bufs)
+               return;
+
+       /* Some pages are no longer absent... */
+       balloon_update_driver_allowance(-bufs);
+
+       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+               /* Do all the remapping work, and M2P updates, in one big 
hypercall. */
+               mcl->op = __HYPERVISOR_mmu_update;
+               mcl->args[0] = (unsigned long)np->rx_mmu;
+               mcl->args[1] = mmu - np->rx_mmu;
+               mcl->args[2] = 0;
+               mcl->args[3] = DOMID_SELF;
+               mcl++;
+               HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
+       }
+
+       spin_unlock(&np->rx_lock);
+}
 
 static int network_close(struct net_device *dev)
 {
        struct netfront_info *np = netdev_priv(dev);
+       DPRINTK("%s\n", np->xbdev->nodename);
        netif_stop_queue(np->netdev);
        return 0;
 }
@@ -1425,6 +1501,8 @@ static void netif_uninit(struct net_devi
 static void netif_uninit(struct net_device *dev)
 {
        struct netfront_info *np = netdev_priv(dev);
+       DPRINTK("%s\n", np->xbdev->nodename);
+       netif_release_rx_bufs(np);
        gnttab_free_grant_references(np->gref_tx_head);
        gnttab_free_grant_references(np->gref_rx_head);
 }
@@ -1720,11 +1798,11 @@ static void netfront_closing(struct xenb
 {
        struct netfront_info *info = dev->dev.driver_data;
 
-       DPRINTK("netfront_closing: %s removed\n", dev->nodename);
+       DPRINTK("%s\n", dev->nodename);
 
        close_netdev(info);
-
-       xenbus_switch_state(dev, XenbusStateClosed);
+//     netif_release_rx_bufs(info);
+       xenbus_closing_done(dev);
 }
 
 
@@ -1740,9 +1818,9 @@ static int __devexit netfront_remove(str
        return 0;
 }
 
-
 static void close_netdev(struct netfront_info *info)
 {
+       DPRINTK("%s\n", info->xbdev->nodename);
        del_timer_sync(&info->rx_refill_timer);
 
        xennet_sysfs_delif(info->netdev);
@@ -1752,6 +1830,7 @@ static void close_netdev(struct netfront
 
 static void netif_disconnect_backend(struct netfront_info *info)
 {
+       DPRINTK("%s\n", info->xbdev->nodename);
        /* Stop old i/f to prevent errors whilst we rebuild the state. */
        spin_lock_irq(&info->tx_lock);
        spin_lock(&info->rx_lock);
@@ -1774,6 +1853,7 @@ static void netif_disconnect_backend(str
 
 static void netif_free(struct netfront_info *info)
 {
+       DPRINTK("%s\n", info->xbdev->nodename);
        close_netdev(info);
        netif_disconnect_backend(info);
        free_netdev(info->netdev);
@@ -1797,9 +1877,9 @@ static struct xenbus_device_id netfront_
 
 
 static struct xenbus_driver netfront = {
+       .ids = netfront_ids,
        .name = "vif",
        .owner = THIS_MODULE,
-       .ids = netfront_ids,
        .probe = netfront_probe,
        .remove = __devexit_p(netfront_remove),
        .resume = netfront_resume,
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c
--- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c   Sun Aug 13 
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c   Mon Aug 14 
12:49:11 2006 +0200
@@ -41,6 +41,21 @@ extern char *kasprintf(const char *fmt, 
 #define DPRINTK(fmt, args...) \
     pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, 
##args)
 
+char *xenbus_strstate(enum xenbus_state state)
+{
+       static char *name[] = {
+               [ XenbusStateUnknown      ] = "Unknown",
+               [ XenbusStateInitialising ] = "Initialising",
+               [ XenbusStateInitWait     ] = "InitWait",
+               [ XenbusStateInitialised  ] = "Initialised",
+               [ XenbusStateConnected    ] = "Connected",
+               [ XenbusStateClosing      ] = "Closing",
+               [ XenbusStateClosed       ] = "Closed",
+       };
+       return state < sizeof(name)/sizeof(name[0])
+               ? name[state] : "INVALID";
+}
+
 int xenbus_watch_path(struct xenbus_device *dev, const char *path,
                      struct xenbus_watch *watch,
                      void (*callback)(struct xenbus_watch *,
@@ -124,6 +139,23 @@ int xenbus_switch_state(struct xenbus_de
 }
 EXPORT_SYMBOL_GPL(xenbus_switch_state);
 
+int xenbus_closing_done(struct xenbus_device *dev)
+{
+       if (dev->shutdown_in_progress) {
+               /* frontend (we) triggered closing, because of 
devices_shutdown(),
+                * prepare for possible re-connect */
+               printk("%-20s: %s: => Initialising\n", __FUNCTION__, 
dev->nodename);
+               xenbus_switch_state(dev, XenbusStateInitialising);
+       } else {
+               /* backend triggered closing, probably the device went away,
+                * thus we are going to close down too. */
+               printk("%-20s: %s: => Closed\n", __FUNCTION__, dev->nodename);
+               xenbus_switch_state(dev, XenbusStateClosed);
+       }
+       complete(&dev->down);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_closing_done);
 
 /**
  * Return the path to the error node for the given device, or NULL on failure.
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c
--- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c    Sun Aug 13 
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c    Mon Aug 14 
12:49:11 2006 +0200
@@ -60,6 +60,15 @@ static struct notifier_block *xenstore_c
 
 static void wait_for_devices(struct xenbus_driver *xendrv);
 
+static int xenbus_probe_frontend(const char *type, const char *name);
+static int xenbus_uevent_backend(struct device *dev, char **envp,
+                                int num_envp, char *buffer, int buffer_size);
+static int xenbus_probe_backend(const char *type, const char *domid);
+
+static int xenbus_dev_probe(struct device *_dev);
+static int xenbus_dev_remove(struct device *_dev);
+static void xenbus_dev_shutdown(struct device *_dev);
+
 /* If something in array of ids matches this device, return it. */
 static const struct xenbus_device_id *
 match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
@@ -168,15 +177,17 @@ static int read_frontend_details(struct 
 
 
 /* Bus type for frontend drivers. */
-static int xenbus_probe_frontend(const char *type, const char *name);
 static struct xen_bus_type xenbus_frontend = {
        .root = "device",
        .levels = 2,            /* device/type/<id> */
        .get_bus_id = frontend_bus_id,
        .probe = xenbus_probe_frontend,
        .bus = {
-               .name  = "xen",
-               .match = xenbus_match,
+               .name     = "xen",
+               .match    = xenbus_match,
+               .probe    = xenbus_dev_probe,
+               .remove   = xenbus_dev_remove,
+               .shutdown = xenbus_dev_shutdown,
        },
        .dev = {
                .bus_id = "xen",
@@ -221,18 +232,18 @@ static int backend_bus_id(char bus_id[BU
        return 0;
 }
 
-static int xenbus_uevent_backend(struct device *dev, char **envp,
-                                int num_envp, char *buffer, int buffer_size);
-static int xenbus_probe_backend(const char *type, const char *domid);
 static struct xen_bus_type xenbus_backend = {
        .root = "backend",
        .levels = 3,            /* backend/type/<frontend>/<id> */
        .get_bus_id = backend_bus_id,
        .probe = xenbus_probe_backend,
        .bus = {
-               .name  = "xen-backend",
-               .match = xenbus_match,
-               .uevent = xenbus_uevent_backend,
+               .name     = "xen-backend",
+               .match    = xenbus_match,
+               .probe    = xenbus_dev_probe,
+               .remove   = xenbus_dev_remove,
+//             .shutdown = xenbus_dev_shutdown,
+               .uevent   = xenbus_uevent_backend,
        },
        .dev = {
                .bus_id = "xen-backend",
@@ -391,6 +402,26 @@ static int xenbus_dev_remove(struct devi
        return 0;
 }
 
+static void xenbus_dev_shutdown(struct device *_dev)
+{
+       struct xenbus_device *dev = to_xenbus_device(_dev);
+       unsigned long timeout = 5*HZ;
+
+       get_device(&dev->dev);
+       if (dev->state != XenbusStateConnected) {
+               printk("%s: %s: %s != Connected, skipping\n", __FUNCTION__,
+                      dev->nodename, xenbus_strstate(dev->state));
+               goto out;
+       }
+       dev->shutdown_in_progress = 1;
+       xenbus_switch_state(dev, XenbusStateClosing);
+       timeout = wait_for_completion_timeout(&dev->down, timeout);
+       if (!timeout)
+               printk("%s: %s timeout closing device\n", __FUNCTION__, 
dev->nodename);
+ out:
+       put_device(&dev->dev);
+}
+
 static int xenbus_register_driver_common(struct xenbus_driver *drv,
                                         struct xen_bus_type *bus)
 {
@@ -399,8 +430,6 @@ static int xenbus_register_driver_common
        drv->driver.name = drv->name;
        drv->driver.bus = &bus->bus;
        drv->driver.owner = drv->owner;
-       drv->driver.probe = xenbus_dev_probe;
-       drv->driver.remove = xenbus_dev_remove;
 
        mutex_lock(&xenwatch_mutex);
        ret = driver_register(&drv->driver);
@@ -575,6 +604,7 @@ static int xenbus_probe_node(struct xen_
        tmpstring += strlen(tmpstring) + 1;
        strcpy(tmpstring, type);
        xendev->devicetype = tmpstring;
+       init_completion(&xendev->down);
 
        xendev->dev.parent = &bus->dev;
        xendev->dev.bus = &bus->bus;
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c
--- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c       Sun Aug 13 
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c       Mon Aug 14 
12:49:11 2006 +0200
@@ -665,7 +665,18 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watc
 
 void xs_suspend(void)
 {
+       struct xenbus_watch *watch;
+       char token[sizeof(watch) * 2 + 1];
+
        down_write(&xs_state.suspend_mutex);
+
+       /* No need for watches_lock: the suspend_mutex is sufficient. */
+       list_for_each_entry(watch, &watches, list) {
+               sprintf(token, "%lX", (long)watch);
+               xs_unwatch(watch->node, token);
+//             printk("%s: unwatch %s\n", __FUNCTION__, watch->node);
+       }
+
        mutex_lock(&xs_state.request_mutex);
 }
 
diff -r befab551b0e1 linux-2.6-xen-sparse/include/xen/xenbus.h
--- a/linux-2.6-xen-sparse/include/xen/xenbus.h Sun Aug 13 09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/include/xen/xenbus.h Mon Aug 14 12:49:11 2006 +0200
@@ -37,6 +37,7 @@
 #include <linux/device.h>
 #include <linux/notifier.h>
 #include <linux/mutex.h>
+#include <linux/completion.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/grant_table.h>
 #include <xen/interface/io/xenbus.h>
@@ -74,6 +75,9 @@ struct xenbus_device {
        struct xenbus_watch otherend_watch;
        struct device dev;
        enum xenbus_state state;
+
+       struct completion down;
+       int shutdown_in_progress;
 };
 
 static inline struct xenbus_device *to_xenbus_device(struct device *dev)
@@ -296,4 +300,7 @@ void xenbus_dev_fatal(struct xenbus_devi
                      ...);
 
 
+int xenbus_closing_done(struct xenbus_device *dev);
+char *xenbus_strstate(enum xenbus_state state);
+
 #endif /* _XEN_XENBUS_H */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.