[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] [HVM][QEMU] Save/restore: enable HVM live migration



# HG changeset patch
# User Tim Deegan <Tim.Deegan@xxxxxxxxxxxxx>
# Date 1174045190 0
# Node ID 8e76e1b95b127c2bfca94cb3cb660c54bcced8b7
# Parent  422a61ebac541a40d60eee66e5ddf87d4855201e
[HVM][QEMU] Save/restore: enable HVM live migration
by getting page-dirtying bitmaps from qemu-dm as well as from xen.
Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxxxxx>
---
 tools/ioemu/target-i386-dm/exec-dm.c |   16 ++++
 tools/ioemu/xenstore.c               |  127 ++++++++++++++++++++++++++++++++++
 tools/libxc/Makefile                 |    2 
 tools/libxc/xc_hvm_save.c            |   39 +++++++++-
 tools/libxc/xenguest.h               |    6 +
 tools/libxc/xg_private.c             |    4 -
 tools/xcutils/Makefile               |    6 -
 tools/xcutils/xc_save.c              |  129 ++++++++++++++++++++++++++++++++++-
 8 files changed, 314 insertions(+), 15 deletions(-)

diff -r 422a61ebac54 -r 8e76e1b95b12 tools/ioemu/target-i386-dm/exec-dm.c
--- a/tools/ioemu/target-i386-dm/exec-dm.c      Fri Mar 16 10:42:25 2007 +0000
+++ b/tools/ioemu/target-i386-dm/exec-dm.c      Fri Mar 16 11:39:50 2007 +0000
@@ -450,6 +450,9 @@ static inline int paddr_is_ram(target_ph
 #define phys_ram_addr(x) (phys_ram_base + (x))
 #endif
 
+extern unsigned long *logdirty_bitmap;
+extern unsigned long logdirty_bitmap_size;
+
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, 
                             int len, int is_write)
 {
@@ -485,9 +488,20 @@ void cpu_physical_memory_rw(target_phys_
                     l = 1;
                 }
             } else if (paddr_is_ram(addr)) {
-                /* Reading from RAM */
+                /* Writing to RAM */
                 ptr = phys_ram_addr(addr);
                 memcpy(ptr, buf, l);
+                if (logdirty_bitmap != NULL) {
+                    /* Record that we have dirtied this frame */
+                    unsigned long pfn = addr >> TARGET_PAGE_BITS;
+                    if (pfn / 8 >= logdirty_bitmap_size) {
+                        fprintf(logfile, "dirtying pfn %x >= bitmap size %x\n",
+                                pfn, logdirty_bitmap_size * 8);
+                    } else {
+                        logdirty_bitmap[pfn / HOST_LONG_BITS]
+                            |= 1UL << pfn % HOST_LONG_BITS;
+                    }
+                }
 #ifdef __ia64__
                 sync_icache(ptr, l);
 #endif 
diff -r 422a61ebac54 -r 8e76e1b95b12 tools/ioemu/xenstore.c
--- a/tools/ioemu/xenstore.c    Fri Mar 16 10:42:25 2007 +0000
+++ b/tools/ioemu/xenstore.c    Fri Mar 16 11:39:50 2007 +0000
@@ -11,6 +11,11 @@
 #include "vl.h"
 #include "block_int.h"
 #include <unistd.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
 
 static struct xs_handle *xsh = NULL;
 static char *hd_filename[MAX_DISKS];
@@ -183,6 +188,13 @@ void xenstore_parse_domain_config(int do
        }
     }
 
+    /* Set a watch for log-dirty requests from the migration tools */
+    if (pasprintf(&buf, "%s/logdirty/next-active", path) != -1) {
+        xs_watch(xsh, buf, "logdirty");
+        fprintf(logfile, "Watching %s\n", buf);
+    }
+
+
  out:
     free(type);
     free(params);
@@ -201,6 +213,116 @@ int xenstore_fd(void)
     return -1;
 }
 
+unsigned long *logdirty_bitmap = NULL;
+unsigned long logdirty_bitmap_size;
+extern int vga_ram_size, bios_size;
+
+/* Handle a "logdirty" watch: on first use attach the shared bitmap segment,
+ * then switch logdirty_bitmap to the requested buffer and acknowledge. */
+void xenstore_process_logdirty_event(void)
+{
+    char *act;
+    static char *active_path = NULL;
+    static char *next_active_path = NULL;
+    static char *seg = NULL;
+    unsigned int len;
+    int i;
+
+    fprintf(logfile, "Triggered log-dirty buffer switch\n");
+
+    if (!seg) {
+        char *path, *p, *key_ascii, key_terminated[17] = {0,};
+        key_t key;
+        int shmid;
+
+        /* Find and map the shared memory segment for log-dirty bitmaps */
+        if (!(path = xs_get_domain_path(xsh, domid))) {
+            fprintf(logfile, "Log-dirty: can't get domain path in store\n");
+            exit(1);
+        }
+        if (!(path = realloc(path, strlen(path)
+                             + strlen("/logdirty/next-active") + 1))) {
+            fprintf(logfile, "Log-dirty: out of memory\n");
+            exit(1);
+        }
+        strcat(path, "/logdirty/");
+        p = path + strlen(path);
+        strcpy(p, "key");
+        key_ascii = xs_read(xsh, XBT_NULL, path, &len);
+        if (!key_ascii) {
+            /* No key yet: wait for the next watch */
+            free(path);
+            return;
+        }
+        strncpy(key_terminated, key_ascii, 16);
+        free(key_ascii);
+        key = (key_t) strtoull(key_terminated, NULL, 16);
+
+        /* Figure out how big the log-dirty bitmaps are */
+        logdirty_bitmap_size = ((phys_ram_size + 0x20
+                                 - (vga_ram_size + bios_size))
+                                >> (TARGET_PAGE_BITS)); /* nr of bits in map*/
+        if (logdirty_bitmap_size > HVM_BELOW_4G_MMIO_START >> TARGET_PAGE_BITS)
+            logdirty_bitmap_size +=
+                HVM_BELOW_4G_MMIO_LENGTH >> TARGET_PAGE_BITS; /* still bits */
+        logdirty_bitmap_size = ((logdirty_bitmap_size + HOST_LONG_BITS - 1)
+                                / HOST_LONG_BITS); /* longs */
+        logdirty_bitmap_size *= sizeof (unsigned long); /* bytes */
+
+        /* Map the shared-memory segment */
+        if ((shmid = shmget(key,
+                            2 * logdirty_bitmap_size,
+                            S_IRUSR|S_IWUSR)) == -1
+            || (seg = shmat(shmid, NULL, 0)) == (void *)-1) {
+            fprintf(logfile, "Log-dirty: can't map segment %16.16llx (%s)\n",
+                    (unsigned long long) key, strerror(errno));
+            exit(1);
+        }
+        fprintf(logfile, "Log-dirty: mapped segment at %p\n", seg);
+
+        /* Double-check that the bitmaps are the size we expect */
+        if (logdirty_bitmap_size != *(uint32_t *)seg) {
+            fprintf(logfile, "Log-dirty: got %lu, calc %lu\n",
+                    (unsigned long) *(uint32_t *)seg, logdirty_bitmap_size);
+            /* Forget the segment so the next watch retries from scratch */
+            shmdt(seg);
+            seg = NULL;
+            free(path);
+            return;
+        }
+
+        /* Remember the paths for the next-active and active entries */
+        strcpy(p, "active");
+        active_path = strdup(path);
+        strcpy(p, "next-active");
+        next_active_path = strdup(path);
+        if (!active_path || !next_active_path) {
+            fprintf(logfile, "Log-dirty: out of memory\n");
+            exit(1);
+        }
+        free(path);
+    }
+
+    /* Read the required active buffer from the store */
+    act = xs_read(xsh, XBT_NULL, next_active_path, &len);
+    if (!act) {
+        fprintf(logfile, "Log-dirty: can't read next-active\n");
+        exit(1);
+    }
+
+    /* Switch buffers */
+    i = act[0] - '0';
+    if (i != 0 && i != 1) {
+        fprintf(logfile, "Log-dirty: bad next-active entry: %s\n", act);
+        exit(1);
+    }
+    logdirty_bitmap = (unsigned long *)(seg + i * logdirty_bitmap_size);
+
+    /* Ack that we've switched */
+    xs_write(xsh, XBT_NULL, active_path, act, len);
+    free(act);
+}
+
 void xenstore_process_event(void *opaque)
 {
     char **vec, *image = NULL;
@@ -209,6 +331,11 @@ void xenstore_process_event(void *opaque
     vec = xs_read_watch(xsh, &num);
     if (!vec)
        return;
+
+    if (!strcmp(vec[XS_WATCH_TOKEN], "logdirty")) {
+        xenstore_process_logdirty_event();
+        goto out;
+    }
 
     if (strncmp(vec[XS_WATCH_TOKEN], "hd", 2) ||
        strlen(vec[XS_WATCH_TOKEN]) != 3)
diff -r 422a61ebac54 -r 8e76e1b95b12 tools/libxc/Makefile
--- a/tools/libxc/Makefile      Fri Mar 16 10:42:25 2007 +0000
+++ b/tools/libxc/Makefile      Fri Mar 16 11:39:50 2007 +0000
@@ -57,7 +57,7 @@ GUEST_SRCS-$(CONFIG_IA64)    += xc_dom_c
 
 CFLAGS   += -Werror -Wmissing-prototypes
 CFLAGS   += -fno-strict-aliasing
-CFLAGS   += $(INCLUDES) -I.
+CFLAGS   += $(INCLUDES) -I. -I../xenstore
 
 # Needed for posix_fadvise64() in xc_linux.c
 CFLAGS-$(CONFIG_Linux) += -D_GNU_SOURCE
diff -r 422a61ebac54 -r 8e76e1b95b12 tools/libxc/xc_hvm_save.c
--- a/tools/libxc/xc_hvm_save.c Fri Mar 16 10:42:25 2007 +0000
+++ b/tools/libxc/xc_hvm_save.c Fri Mar 16 11:39:50 2007 +0000
@@ -54,6 +54,11 @@ static unsigned long hvirt_start;
 /* #levels of page tables used by the current guest */
 static unsigned int pt_levels;
 
+/* Shared-memory bitmaps for getting log-dirty bits from qemu */
+static unsigned long *qemu_bitmaps[2];
+static int qemu_active;
+static int qemu_non_active;
+
 int xc_hvm_drain_io(int handle, domid_t dom)
 {
     DECLARE_HYPERCALL;
@@ -77,7 +82,8 @@ int xc_hvm_drain_io(int handle, domid_t 
 */
 
 #define BITS_PER_LONG (sizeof(unsigned long) * 8)
-#define BITMAP_SIZE   ((pfn_array_size + BITS_PER_LONG - 1) / 8)
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+#define BITMAP_SIZE   (BITS_TO_LONGS(pfn_array_size) * sizeof(unsigned long))
 
 #define BITMAP_ENTRY(_nr,_bmap) \
    ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
@@ -123,6 +129,7 @@ static inline int permute( int i, int nr
 
     return i;
 }
+
 
 static uint64_t tv_to_us(struct timeval *new)
 {
@@ -277,7 +284,9 @@ static int suspend_and_state(int (*suspe
 }
 
 int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
-                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
+                uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+                void *(*init_qemu_maps)(int, unsigned), 
+                void (*qemu_flip_buffer)(int, int))
 {
     xc_dominfo_t info;
 
@@ -392,8 +401,6 @@ int xc_hvm_save(int xc_handle, int io_fd
             "nr_pages=0x%lx\n", info.max_memkb, max_mfn, info.nr_pages); 
 
     if (live) {
-        ERROR("hvm domain doesn't support live migration now.\n");
-        goto out;
         
         if (xc_shadow_control(xc_handle, dom,
                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
@@ -453,6 +460,15 @@ int xc_hvm_save(int xc_handle, int io_fd
     to_skip = malloc(BITMAP_SIZE);
 
 
+    if (live) {
+        /* Get qemu-dm logging dirty pages too */
+        void *seg = init_qemu_maps(dom, BITMAP_SIZE);
+        qemu_bitmaps[0] = seg;
+        qemu_bitmaps[1] = seg + BITMAP_SIZE;
+        qemu_active = 0;
+        qemu_non_active = 1;
+    }
+
     hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
     if ( hvm_buf_size == -1 )
     {
@@ -677,10 +693,23 @@ int xc_hvm_save(int xc_handle, int io_fd
                 goto out;
             }
 
+            /* Pull in the dirty bits from qemu too */
+            if (!last_iter) {
+                qemu_active = qemu_non_active;
+                qemu_non_active = qemu_active ? 0 : 1;
+                qemu_flip_buffer(dom, qemu_active);
+                for (j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++) {
+                    to_send[j] |= qemu_bitmaps[qemu_non_active][j];
+                    qemu_bitmaps[qemu_non_active][j] = 0;
+                }
+            } else {
+                for (j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++) 
+                    to_send[j] |= qemu_bitmaps[qemu_active][j];
+            }
+
             sent_last_iter = sent_this_iter;
 
             print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
-
         }
 
 
diff -r 422a61ebac54 -r 8e76e1b95b12 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Fri Mar 16 10:42:25 2007 +0000
+++ b/tools/libxc/xenguest.h    Fri Mar 16 11:39:50 2007 +0000
@@ -32,8 +32,10 @@ int xc_linux_save(int xc_handle, int io_
  * @return 0 on success, -1 on failure
  */
 int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
-                  uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
-                  int (*suspend)(int domid));
+                uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
+                int (*suspend)(int domid),  
+                void *(*init_qemu_maps)(int, unsigned), 
+                void (*qemu_flip_buffer)(int, int));
 
 /**
  * This function will restore a saved domain running Linux.
diff -r 422a61ebac54 -r 8e76e1b95b12 tools/libxc/xg_private.c
--- a/tools/libxc/xg_private.c  Fri Mar 16 10:42:25 2007 +0000
+++ b/tools/libxc/xg_private.c  Fri Mar 16 11:39:50 2007 +0000
@@ -201,7 +201,9 @@ __attribute__((weak))
 __attribute__((weak)) 
     int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                     uint32_t max_factor, uint32_t flags,
-                    int (*suspend)(int domid))
+                    int (*suspend)(int domid), 
+                    void *(*init_qemu_maps)(int, unsigned), 
+                    void (*qemu_flip_buffer)(int, int))
 {
     errno = ENOSYS;
     return -1;
diff -r 422a61ebac54 -r 8e76e1b95b12 tools/xcutils/Makefile
--- a/tools/xcutils/Makefile    Fri Mar 16 10:42:25 2007 +0000
+++ b/tools/xcutils/Makefile    Fri Mar 16 11:39:50 2007 +0000
@@ -13,7 +13,7 @@ include $(XEN_ROOT)/tools/Rules.mk
 
 PROGRAMS_INSTALL_DIR = /usr/$(LIBDIR)/xen/bin
 
-INCLUDES += -I $(XEN_LIBXC)
+INCLUDES += -I $(XEN_LIBXC) -I $(XEN_XENSTORE)
 
 CFLAGS += -Werror -fno-strict-aliasing
 CFLAGS += $(INCLUDES)
@@ -22,9 +22,9 @@ CFLAGS += -Wp,-MD,.$(@F).d
 CFLAGS += -Wp,-MD,.$(@F).d
 PROG_DEP = .*.d
 
-PROGRAMS               = xc_restore xc_save readnotes
+PROGRAMS = xc_restore xc_save readnotes
 
-LDLIBS                 = -L$(XEN_LIBXC) -lxenguest -lxenctrl
+LDLIBS   = -L$(XEN_LIBXC) -L$(XEN_XENSTORE) -lxenguest -lxenctrl -lxenstore
 
 .PHONY: all
 all: build
diff -r 422a61ebac54 -r 8e76e1b95b12 tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c   Fri Mar 16 10:42:25 2007 +0000
+++ b/tools/xcutils/xc_save.c   Fri Mar 16 11:39:50 2007 +0000
@@ -12,7 +12,13 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
 
+#include <xs.h>
 #include <xenctrl.h>
 #include <xenguest.h>
 
@@ -29,6 +35,123 @@ static int suspend(int domid)
 
     return (fgets(ans, sizeof(ans), stdin) != NULL &&
             !strncmp(ans, "done\n", 5));
+}
+
+/* For HVM guests, there are two sources of dirty pages: the Xen shadow
+ * log-dirty bitmap, which we get with a hypercall, and qemu's version.
+ * The protocol for getting page-dirtying data from qemu uses a
+ * double-buffered shared memory interface directly between xc_save and
+ * qemu-dm. 
+ *
+ * xc_save calculates the size of the bitmaps and notifies qemu-dm 
+ * through the store that it wants to share the bitmaps.  qemu-dm then 
+ * starts filling in the 'active' buffer. 
+ *
+ * To change the buffers over, xc_save writes the other buffer number to
+ * the store and waits for qemu to acknowledge that it is now writing to
+ * the new active buffer.  xc_save can then process and clear the old
+ * active buffer. */
+
+static char *qemu_active_path;
+static char *qemu_next_active_path;
+static struct xs_handle *xs;
+
+/* Get qemu to change buffers: write the wanted buffer number to the store
+ * and block until qemu acknowledges it (5s timeout per watch event). */
+static void qemu_flip_buffer(int domid, int next_active)
+{
+    char digit = '0' + next_active;
+    unsigned int len;
+    char *active_str, **watch;
+    struct timeval tv;
+    fd_set fdset;
+    int switched;
+
+    /* Tell qemu that we want it to start writing log-dirty bits to the
+     * other buffer */
+    if (!xs_write(xs, XBT_NULL, qemu_next_active_path, &digit, 1))
+        errx(1, "can't write next-active to store path (%s)\n",
+             qemu_next_active_path);
+
+    /* Wait a while for qemu to signal that it has switched to the new
+     * active buffer */
+    do {
+        tv.tv_sec = 5;
+        tv.tv_usec = 0;
+        FD_ZERO(&fdset);
+        FD_SET(xs_fileno(xs), &fdset);
+        if ((select(xs_fileno(xs) + 1, &fdset, NULL, NULL, &tv)) != 1)
+            errx(1, "timed out waiting for qemu to switch buffers\n");
+        watch = xs_read_watch(xs, &len);
+        free(watch);
+
+        /* The watch may fire before the value is right: re-check it, and
+         * release the string xs_read() allocated on every pass */
+        active_str = xs_read(xs, XBT_NULL, qemu_active_path, &len);
+        switched = (active_str != NULL && active_str[0] - '0' == next_active);
+        free(active_str);
+    } while (!switched);
+}
+
+/* Create the zeroed shared segment for qemu's two dirty bitmaps, publish its
+ * key in the store and start qemu on buffer 0.  Exits on any failure. */
+static void * init_qemu_maps(int domid, unsigned int bitmap_size)
+{
+    key_t key;
+    char key_ascii[17] = {0,};
+    int shmid = -1;
+    void *seg;
+    char *path, *p;
+
+    /* Make a shared-memory segment */
+    while (shmid == -1) {
+        key = rand(); /* No security, just a sequence of numbers */
+        if (key == IPC_PRIVATE)
+            continue; /* qemu could never look such a segment up by key */
+        shmid = shmget(key, 2 * bitmap_size,
+                       IPC_CREAT|IPC_EXCL|S_IRUSR|S_IWUSR);
+        if (shmid == -1 && errno != EEXIST)
+            err(1, "can't get shmem to talk to qemu-dm");
+    }
+
+    /* Map it into our address space */
+    seg = shmat(shmid, NULL, 0);
+    if (seg == (void *) -1)
+        err(1, "can't map shmem to talk to qemu-dm");
+    memset(seg, 0, 2 * bitmap_size);
+
+    /* Write the size of it into the first 32 bits */
+    *(uint32_t *)seg = bitmap_size;
+
+    /* Tell qemu about it */
+    if ((xs = xs_daemon_open()) == NULL)
+        errx(1, "Couldn't contact xenstore");
+    if (!(path = xs_get_domain_path(xs, domid)))
+        errx(1, "can't get domain path in store");
+    if (!(path = realloc(path, strlen(path)
+                         + strlen("/logdirty/next-active") + 1)))
+        errx(1, "no memory for constructing xenstore path");
+    strcat(path, "/logdirty/");
+    p = path + strlen(path);
+
+    strcpy(p, "key");
+    snprintf(key_ascii, 17, "%16.16llx", (unsigned long long) key);
+    if (!xs_write(xs, XBT_NULL, path, key_ascii, 16))
+        errx(1, "can't write key (%s) to store path (%s)\n", key_ascii, path);
+
+    /* Watch for qemu's indication of the active buffer, and request it
+     * to start writing to buffer 0 */
+    strcpy(p, "active");
+    if (!xs_watch(xs, path, "qemu-active-buffer"))
+        errx(1, "can't set watch in store (%s)\n", path);
+    if (!(qemu_active_path = strdup(path)))
+        errx(1, "no memory for copying xenstore path");
+    strcpy(p, "next-active");
+    if (!(qemu_next_active_path = strdup(path)))
+        errx(1, "no memory for copying xenstore path");
+    qemu_flip_buffer(domid, 0);
+    free(path);
+    return seg;
 }
 
 
@@ -52,9 +175,11 @@ main(int argc, char **argv)
     flags = atoi(argv[5]);
 
     if (flags & XCFLAGS_HVM)
-        ret = xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend);
+        ret = xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
+                          &suspend, &init_qemu_maps, &qemu_flip_buffer);
     else 
-        ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
&suspend);
+        ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
+                            &suspend);
 
     xc_interface_close(xc_fd);
 

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.