[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [patch/rfc] xenbus and driver issues with kexec
Hi, There are a number of problems in xenbus and the paravirtualized drivers when it comes to kexec'ing a domU kernel. It's not specific to kexec, but to handing over devices to someone else within the same domU. The same issues will come up when someone tries to combine mini-os and grub to build some kind of in-domain bootloader. Fundamental issue is that there is no way to cleanly shutdown the frontend/backend drivers without destroying the domain and the backend devices. The patch addresses some but not all of the problems arising from that. It also has some debug stuff left in and needs some more polishing before final submission. Here are the changes: (1) ballon: print unconditionally driver_pages in /proc/xen/balloon. (2) xenbus: unregister watches in xs_suspend() (3) blk/netback: allow Closing -> Initialising state transition. (4) blk/netfront: shutdown (but *not* destroy) devices when devices_shutdown is called, by jumping from Closing to Initialising state. (5) Some cleanups along the way. Not solved yet: The netfront driver leaks pages in case it is shutdown while the interface is active. Problem is that it can't get back the rx buffers, the netback driver sits on them and doesn't release them until it has some packet data to fill in. The patch adds a netif_release_rx_bufs() function, but it doesn't work reliable, usually it gets back only a few pages. To reproduce this you don't need to play with kexec. Just apply the balloon driver patch. Boot a domU with the new kernel. Configure the eth0 interface, "xm network-detach ...", "xm network-attach", repeat. Watch driver_pages grow in /proc/xen/balloon. cheers, Gerd PS: documentation and complete domU kexec patches are available from http://www.suse.de/~kraxel/xen/kexec.html -- Gerd Hoffmann <kraxel@xxxxxxx> http://www.suse.de/~kraxel/julika-dora.jpeg diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c --- a/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c Sun Aug 13 09:44:07 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c Mon Aug 14 12:49:11 2006 +0200 @@ -440,15 +440,17 @@ static int balloon_read(char *page, char "Requested target: %8lu kB\n" "Low-mem balloon: %8lu kB\n" "High-mem balloon: %8lu kB\n" + "Driver pages: %8lu kB\n" "Xen hard limit: ", PAGES2KB(current_pages), PAGES2KB(target_pages), - PAGES2KB(balloon_low), PAGES2KB(balloon_high)); + PAGES2KB(balloon_low), PAGES2KB(balloon_high), + PAGES2KB(driver_pages)); if (hard_limit != ~0UL) { len += sprintf( page + len, - "%8lu kB (inc. %8lu kB driver headroom)\n", - PAGES2KB(hard_limit), PAGES2KB(driver_pages)); + "%8lu kB\n", + PAGES2KB(hard_limit)); } else { len += sprintf( page + len, diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c --- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Sun Aug 13 09:44:07 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Mon Aug 14 12:49:11 2006 +0200 @@ -305,6 +305,11 @@ static void frontend_changed(struct xenb switch (frontend_state) { case XenbusStateInitialising: + if (dev->state == XenbusStateClosing) { + printk("%s: %s: prepare for reconnect\n", + __FUNCTION__, dev->nodename); + xenbus_switch_state(dev, XenbusStateInitWait); + } break; case XenbusStateInitialised: diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Sun Aug 13 09:44:07 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Mon Aug 14 12:49:11 2006 +0200 @@ -272,10 +272,13 @@ static void backend_changed(struct xenbu xenbus_dev_fatal(dev, -ENODEV, "bdget failed"); down(&bd->bd_sem); - if (info->users > 0) + if (info->users) + printk("%-20s: %s: %d user(s) left\n", __FUNCTION__, + dev->nodename, info->users); + if (info->users > 0 && !dev->shutdown_in_progress) { xenbus_dev_error(dev, -EBUSY, "Device in use; refusing to close"); - else + } else blkfront_closing(dev); up(&bd->bd_sem); bdput(bd); @@ -359,7 +362,7 @@ static void blkfront_closing(struct xenb xlvbd_del(info); - xenbus_switch_state(dev, XenbusStateClosed); + xenbus_closing_done(dev); } diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c --- a/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Sun Aug 13 09:44:07 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Mon Aug 14 12:49:11 2006 +0200 @@ -228,10 +228,25 @@ static void frontend_changed(struct xenb switch (frontend_state) { case XenbusStateInitialising: + if (dev->state == XenbusStateClosing) { + printk("%s: %s: prepare for reconnect\n", + __FUNCTION__, dev->nodename); + if (be->netif) { + netif_disconnect(be->netif); + be->netif = NULL; + } + xenbus_switch_state(dev, XenbusStateInitWait); + } + break; + case XenbusStateInitialised: break; case XenbusStateConnected: + if (!be->netif) { + /* reconnect: setup be->netif */ + backend_changed(&be->backend_watch, NULL, 0); + } maybe_connect(be); break; diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/netfront/Makefile --- a/linux-2.6-xen-sparse/drivers/xen/netfront/Makefile Sun Aug 13 09:44:07 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/Makefile Mon Aug 14 12:49:11 2006 +0200 @@ -1,3 +1,4 @@ +EXTRA_CFLAGS += -DDEBUG=1 obj-$(CONFIG_XEN_NETDEV_FRONTEND) := xennet.o diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Sun Aug 13 09:44:07 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Mon Aug 14 12:49:11 2006 +0200 @@ -436,7 +436,7 @@ static void backend_changed(struct xenbu struct netfront_info *np = dev->dev.driver_data; struct net_device *netdev = np->netdev; - DPRINTK("\n"); + DPRINTK("%s\n", xenbus_strstate(backend_state)); switch (backend_state) { case XenbusStateInitialising: @@ -447,9 +447,11 @@ static void backend_changed(struct xenbu break; case XenbusStateInitWait: - network_connect(netdev); - xenbus_switch_state(dev, XenbusStateConnected); - (void)send_fake_arp(netdev); + if (!dev->shutdown_in_progress) { + network_connect(netdev); + xenbus_switch_state(dev, XenbusStateConnected); + (void)send_fake_arp(netdev); + } break; case XenbusStateClosing: @@ -492,6 +494,7 @@ static int network_open(struct net_devic { struct netfront_info *np = netdev_priv(dev); + DPRINTK("%s\n", np->xbdev->nodename); memset(&np->stats, 0, sizeof(np->stats)); network_alloc_rx_buffers(dev); @@ -651,6 +654,8 @@ no_skb: ((np->rx_target *= 2) > np->rx_max_target)) np->rx_target = np->rx_max_target; +// printk("%s: %d bufs, target %d\n", __FUNCTION__, i, np->rx_target); + refill: for (i = 0; ; i++) { if ((skb = __skb_dequeue(&np->rx_batch)) == NULL) @@ -1223,6 +1228,7 @@ err: /* Some pages are no longer absent... */ balloon_update_driver_allowance(-pages_done); +// printk("%s: %d driver pages\n", __FUNCTION__, work_done); /* Do all the remapping work, and M2P updates, in one big hypercall. */ if (likely(pages_done)) { @@ -1283,10 +1289,80 @@ err: return more_to_do; } +static void netif_release_rx_bufs(struct netfront_info *np) +{ + struct mmu_update *mmu = np->rx_mmu; + struct multicall_entry *mcl = np->rx_mcl; + struct sk_buff *skb; + unsigned long mfn; + int bufs = 0, gnttab = 0, unused = 0; + int id, ref; + + spin_lock(&np->rx_lock); + + for (id = 0; id < NET_RX_RING_SIZE; id++) { + if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) { + unused++; + continue; + } + if ((mfn = gnttab_end_foreign_transfer_ref(ref)) == 0) { + gnttab++; + continue; + } + + gnttab_release_grant_reference(&np->gref_rx_head, ref); + np->grant_rx_ref[id] = GRANT_INVALID_REF; + + skb = np->rx_skbs[id]; + add_id_to_freelist(np->rx_skbs, id); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Remap the page. */ + MULTI_update_va_mapping(mcl, (unsigned long)skb->head, + pfn_pte_ma(mfn, PAGE_KERNEL), + 0); + mcl++; + mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT) + | MMU_MACHPHYS_UPDATE; + mmu->val = __pa(skb->head) >> PAGE_SHIFT; + mmu++; + + set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT, + mfn); + } + bufs++; + +#if 0 /* FIXME */ + dev_kfree_skb(skb); +#endif + } + + printk("%s: %d released ok, %d gnttab errs, %d unused slots\n", + __FUNCTION__, bufs, gnttab, unused); + if (0 == bufs) + return; + + /* Some pages are no longer absent... */ + balloon_update_driver_allowance(-bufs); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Do all the remapping work, and M2P updates, in one big hypercall. */ + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)np->rx_mmu; + mcl->args[1] = mmu - np->rx_mmu; + mcl->args[2] = 0; + mcl->args[3] = DOMID_SELF; + mcl++; + HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl); + } + + spin_unlock(&np->rx_lock); +} static int network_close(struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); + DPRINTK("%s\n", np->xbdev->nodename); netif_stop_queue(np->netdev); return 0; } @@ -1425,6 +1501,8 @@ static void netif_uninit(struct net_devi static void netif_uninit(struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); + DPRINTK("%s\n", np->xbdev->nodename); + netif_release_rx_bufs(np); gnttab_free_grant_references(np->gref_tx_head); gnttab_free_grant_references(np->gref_rx_head); } @@ -1720,11 +1798,11 @@ static void netfront_closing(struct xenb { struct netfront_info *info = dev->dev.driver_data; - DPRINTK("netfront_closing: %s removed\n", dev->nodename); + DPRINTK("%s\n", dev->nodename); close_netdev(info); - - xenbus_switch_state(dev, XenbusStateClosed); +// netif_release_rx_bufs(info); + xenbus_closing_done(dev); } @@ -1740,9 +1818,9 @@ static int __devexit netfront_remove(str return 0; } - static void close_netdev(struct netfront_info *info) { + DPRINTK("%s\n", info->xbdev->nodename); del_timer_sync(&info->rx_refill_timer); xennet_sysfs_delif(info->netdev); @@ -1752,6 +1830,7 @@ static void close_netdev(struct netfront static void netif_disconnect_backend(struct netfront_info *info) { + DPRINTK("%s\n", info->xbdev->nodename); /* Stop old i/f to prevent errors whilst we rebuild the state. */ spin_lock_irq(&info->tx_lock); spin_lock(&info->rx_lock); @@ -1774,6 +1853,7 @@ static void netif_disconnect_backend(str static void netif_free(struct netfront_info *info) { + DPRINTK("%s\n", info->xbdev->nodename); close_netdev(info); netif_disconnect_backend(info); free_netdev(info->netdev); @@ -1797,9 +1877,9 @@ static struct xenbus_device_id netfront_ static struct xenbus_driver netfront = { + .ids = netfront_ids, .name = "vif", .owner = THIS_MODULE, - .ids = netfront_ids, .probe = netfront_probe, .remove = __devexit_p(netfront_remove), .resume = netfront_resume, diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c Sun Aug 13 09:44:07 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c Mon Aug 14 12:49:11 2006 +0200 @@ -41,6 +41,21 @@ extern char *kasprintf(const char *fmt, #define DPRINTK(fmt, args...) \ pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args) +char *xenbus_strstate(enum xenbus_state state) +{ + static char *name[] = { + [ XenbusStateUnknown ] = "Unknown", + [ XenbusStateInitialising ] = "Initialising", + [ XenbusStateInitWait ] = "InitWait", + [ XenbusStateInitialised ] = "Initialised", + [ XenbusStateConnected ] = "Connected", + [ XenbusStateClosing ] = "Closing", + [ XenbusStateClosed ] = "Closed", + }; + return state < sizeof(name)/sizeof(name[0]) + ? name[state] : "INVALID"; +} + int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, @@ -124,6 +139,23 @@ int xenbus_switch_state(struct xenbus_de } EXPORT_SYMBOL_GPL(xenbus_switch_state); +int xenbus_closing_done(struct xenbus_device *dev) +{ + if (dev->shutdown_in_progress) { + /* frontend (we) triggered closing, because of devices_shutdown(), + * prepare for possible re-connect */ + printk("%-20s: %s: => Initialising\n", __FUNCTION__, dev->nodename); + xenbus_switch_state(dev, XenbusStateInitialising); + } else { + /* backend triggered closing, probably the device went away, + * thus we are going to close down too. */ + printk("%-20s: %s: => Closed\n", __FUNCTION__, dev->nodename); + xenbus_switch_state(dev, XenbusStateClosed); + } + complete(&dev->down); + return 0; +} +EXPORT_SYMBOL_GPL(xenbus_closing_done); /** * Return the path to the error node for the given device, or NULL on failure. diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Sun Aug 13 09:44:07 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Mon Aug 14 12:49:11 2006 +0200 @@ -60,6 +60,15 @@ static struct notifier_block *xenstore_c static void wait_for_devices(struct xenbus_driver *xendrv); +static int xenbus_probe_frontend(const char *type, const char *name); +static int xenbus_uevent_backend(struct device *dev, char **envp, + int num_envp, char *buffer, int buffer_size); +static int xenbus_probe_backend(const char *type, const char *domid); + +static int xenbus_dev_probe(struct device *_dev); +static int xenbus_dev_remove(struct device *_dev); +static void xenbus_dev_shutdown(struct device *_dev); + /* If something in array of ids matches this device, return it. */ static const struct xenbus_device_id * match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) @@ -168,15 +177,17 @@ static int read_frontend_details(struct /* Bus type for frontend drivers. */ -static int xenbus_probe_frontend(const char *type, const char *name); static struct xen_bus_type xenbus_frontend = { .root = "device", .levels = 2, /* device/type/<id> */ .get_bus_id = frontend_bus_id, .probe = xenbus_probe_frontend, .bus = { - .name = "xen", - .match = xenbus_match, + .name = "xen", + .match = xenbus_match, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, + .shutdown = xenbus_dev_shutdown, }, .dev = { .bus_id = "xen", @@ -221,18 +232,18 @@ static int backend_bus_id(char bus_id[BU return 0; } -static int xenbus_uevent_backend(struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size); -static int xenbus_probe_backend(const char *type, const char *domid); static struct xen_bus_type xenbus_backend = { .root = "backend", .levels = 3, /* backend/type/<frontend>/<id> */ .get_bus_id = backend_bus_id, .probe = xenbus_probe_backend, .bus = { - .name = "xen-backend", - .match = xenbus_match, - .uevent = xenbus_uevent_backend, + .name = "xen-backend", + .match = xenbus_match, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, +// .shutdown = xenbus_dev_shutdown, + .uevent = xenbus_uevent_backend, }, .dev = { .bus_id = "xen-backend", @@ -391,6 +402,26 @@ static int xenbus_dev_remove(struct devi return 0; } +static void xenbus_dev_shutdown(struct device *_dev) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + unsigned long timeout = 5*HZ; + + get_device(&dev->dev); + if (dev->state != XenbusStateConnected) { + printk("%s: %s: %s != Connected, skipping\n", __FUNCTION__, + dev->nodename, xenbus_strstate(dev->state)); + goto out; + } + dev->shutdown_in_progress = 1; + xenbus_switch_state(dev, XenbusStateClosing); + timeout = wait_for_completion_timeout(&dev->down, timeout); + if (!timeout) + printk("%s: %s timeout closing device\n", __FUNCTION__, dev->nodename); + out: + put_device(&dev->dev); +} + static int xenbus_register_driver_common(struct xenbus_driver *drv, struct xen_bus_type *bus) { @@ -399,8 +430,6 @@ static int xenbus_register_driver_common drv->driver.name = drv->name; drv->driver.bus = &bus->bus; drv->driver.owner = drv->owner; - drv->driver.probe = xenbus_dev_probe; - drv->driver.remove = xenbus_dev_remove; mutex_lock(&xenwatch_mutex); ret = driver_register(&drv->driver); @@ -575,6 +604,7 @@ static int xenbus_probe_node(struct xen_ tmpstring += strlen(tmpstring) + 1; strcpy(tmpstring, type); xendev->devicetype = tmpstring; + init_completion(&xendev->down); xendev->dev.parent = &bus->dev; xendev->dev.bus = &bus->bus; diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c Sun Aug 13 09:44:07 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c Mon Aug 14 12:49:11 2006 +0200 @@ -665,7 +665,18 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watc void xs_suspend(void) { + struct xenbus_watch *watch; + char token[sizeof(watch) * 2 + 1]; + down_write(&xs_state.suspend_mutex); + + /* No need for watches_lock: the suspend_mutex is sufficient. */ + list_for_each_entry(watch, &watches, list) { + sprintf(token, "%lX", (long)watch); + xs_unwatch(watch->node, token); +// printk("%s: unwatch %s\n", __FUNCTION__, watch->node); + } + mutex_lock(&xs_state.request_mutex); } diff -r befab551b0e1 linux-2.6-xen-sparse/include/xen/xenbus.h --- a/linux-2.6-xen-sparse/include/xen/xenbus.h Sun Aug 13 09:44:07 2006 +0100 +++ b/linux-2.6-xen-sparse/include/xen/xenbus.h Mon Aug 14 12:49:11 2006 +0200 @@ -37,6 +37,7 @@ #include <linux/device.h> #include <linux/notifier.h> #include <linux/mutex.h> +#include <linux/completion.h> #include <xen/interface/xen.h> #include <xen/interface/grant_table.h> #include <xen/interface/io/xenbus.h> @@ -74,6 +75,9 @@ struct xenbus_device { struct xenbus_watch otherend_watch; struct device dev; enum xenbus_state state; + + struct completion down; + int shutdown_in_progress; }; static inline struct xenbus_device *to_xenbus_device(struct device *dev) @@ -296,4 +300,7 @@ void xenbus_dev_fatal(struct xenbus_devi ...); +int xenbus_closing_done(struct xenbus_device *dev); +char *xenbus_strstate(enum xenbus_state state); + #endif /* _XEN_XENBUS_H */ _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |