[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [RFC][PATCH 05/13] Kemari: Kemari sender



This is an updated version of the following patch.  It uses an event
channel instead of a signal to notify buffer flips and to request that
the QEMU state be saved.

http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00749.html

Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@xxxxxxxxxxxxx>
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@xxxxxxxxxxxxx>
---
 tools/libxc/xc_dom_kemari_save.c | 1114 +++++++++++++++++++++++++++++++++++++++
 tools/xcutils/xc_kemari_save.c   |  525 ++++++++++++++++++
 2 files changed, 1639 insertions(+)

diff -r b249f3e979a5 -r 06b950859c92 tools/libxc/xc_dom_kemari_save.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_dom_kemari_save.c  Tue Mar 24 15:11:38 2009 +0900
@@ -0,0 +1,1114 @@
+/******************************************************************************
+ * xc_dom_kemari_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * This source code is based on xc_domain_save.c.
+ * Copied BITS_PER_LONG, BITS_TO_LONGS, BITMAP_SIZE, BITMAP_SHIFT,
+ * RATE_IS_MAX, test_bit, clear_bit, set_bit, tv_delta, noncached_write,
+ * initialize_mbit_rate, and ratewrite from xc_domain_save.c
+ *
+ * Copyright (c) 2003, K A Fraser.
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xc_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/params.h>
+#include "xc_e820.h"
+
+#ifdef  __MINIOS__
+/*
+ * Caution: atomicity of following alternative libc functions are broken.
+ */
+/*
+ * Minimal sendfile(2) replacement for stub domains: copy exactly @count
+ * bytes from @in_fd to @out_fd through a small bounce buffer.  Unlike
+ * the real sendfile(), @offset is not supported.
+ * Returns the number of bytes copied, or -1 (errno set) on error.
+ */
+static ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+    char buf[1024];
+    int len, wrote_len = 0;
+
+    if (offset != NULL) {
+        ERROR("Sorry sendfile for stubdomain should not have offset");
+        errno = EIO;
+        return -1;
+    }
+
+    while (count > 0) {
+        len = (count < sizeof(buf))?count:sizeof(buf);
+        len = read(in_fd, buf, len);
+        if (len < 0)
+            return -1;
+        if (len == 0) {
+            /* Premature EOF: fewer than @count bytes were available.
+             * Fail instead of spinning forever on read() returning 0
+             * (count would never reach zero). */
+            errno = EIO;
+            return -1;
+        }
+        if (write_exact(out_fd, buf, len))
+            return -1;
+        wrote_len += len;
+        count -= len;
+    }
+    return wrote_len;
+}
+
+#define IOV_MAX 1024
+struct iovec {
+    void *iov_base; /* Base address. */
+    size_t iov_len; /* Length. */
+};
+/*
+ * Minimal writev(2) replacement for stub domains.  Writes each iovec in
+ * turn; stops early after a short write and returns the total written
+ * so far.  Returns -1 (errno set) on error.  Not atomic, unlike the
+ * real writev() -- see the caution at the top of this #ifdef block.
+ */
+static ssize_t writev(int d, const struct iovec *iov, int iovcnt)
+{
+    int i;
+    int len, wrote_len;
+
+    if (iovcnt < 0 || iovcnt > IOV_MAX) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    for (i = 0, wrote_len = 0; i < iovcnt; i++) {
+        len = write(d, iov[i].iov_base, iov[i].iov_len);
+        if (len < 0)
+            return -1;
+
+        wrote_len += len;
+        if (wrote_len < 0) { /* integer overflow */
+            errno = EINVAL;
+            return -1;
+        }
+
+        /* NOTE(review): len is int, iov_len is size_t -- the comparison
+         * promotes len to unsigned, which is safe because len >= 0 here. */
+        if (len != iov[i].iov_len)
+            return wrote_len;
+    }
+
+    return wrote_len;
+}
+#else  /* !__MINIOS__ */
+#include <sys/sendfile.h>
+#include <sys/uio.h>
+#endif  /* __MINIOS__ */
+
+/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
+static unsigned long *qemu_bitmaps[2];
+static int qemu_active;
+static int qemu_non_active;
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* page frame numbers */
+static unsigned long *pfn_type = NULL;
+
+/* The new domain's shared-info frame number. */
+static unsigned long shared_info_frame;
+
+/*
+ * guest memory
+ */
+#define GUEST_MEM_ENTRY_SIZE    1024 /* up to 4MB at a time. */
+static unsigned char ** guest_memory = NULL;
+static unsigned long ** guest_memory_status = NULL;
+static unsigned long guest_memory_size = 0;
+
+/*
+ * Map one GUEST_MEM_ENTRY_SIZE-page window of guest memory, starting at
+ * pfn base*GUEST_MEM_ENTRY_SIZE, into guest_memory[base].
+ * Returns 0 on success, 1 if the whole window is unpopulated (all
+ * XTAB, leaving guest_memory[base] == NULL), and -1 on map failure.
+ */
+static inline int map_guest_mem(int xc_handle, uint32_t domid,
+    unsigned long base)
+{
+    int j;
+    unsigned char * region_base;
+    unsigned long * pfn_base;
+
+    pfn_base = guest_memory_status[base];
+
+    /* The array holds GUEST_MEM_ENTRY_SIZE unsigned longs; the old code
+     * cleared only GUEST_MEM_ENTRY_SIZE *bytes*.  Harmless today since
+     * the loop below rewrites every entry, but make the size match the
+     * object being cleared. */
+    memset(pfn_base, 0, GUEST_MEM_ENTRY_SIZE * sizeof(*pfn_base));
+    for (j = 0; j < GUEST_MEM_ENTRY_SIZE; j++) {
+        pfn_base[j] = base * GUEST_MEM_ENTRY_SIZE + j;
+    }
+    region_base = xc_map_foreign_batch(
+        xc_handle, domid, PROT_READ, pfn_base, GUEST_MEM_ENTRY_SIZE);
+    if ( region_base == NULL )
+    {
+        PERROR("map failed at guest memory frame 0x%lx - 0x%lx (%lu)",
+            base * GUEST_MEM_ENTRY_SIZE, (base + 1)* GUEST_MEM_ENTRY_SIZE - 1,
+            base);
+        return -1;
+    }
+
+    /* Look for and skip completely empty batches. */
+    for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ )
+        pfn_base[j] &= XEN_DOMCTL_PFINFO_LTAB_MASK;
+    for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ )
+        if ( pfn_base[j] != XEN_DOMCTL_PFINFO_XTAB )
+            break;
+    if ( j == GUEST_MEM_ENTRY_SIZE )
+    {
+        munmap(region_base, GUEST_MEM_ENTRY_SIZE*PAGE_SIZE);
+        guest_memory[base] = NULL;
+        return 1;
+    }
+
+    guest_memory[base] = region_base;
+
+    return 0;
+}
+
+/*
+ * Return a pointer to the mapped contents of guest pfn @mfn, remapping
+ * the containing GUEST_MEM_ENTRY_SIZE-page window on demand.  Returns
+ * NULL on failure or if the window turns out to be unpopulated.
+ */
+static inline unsigned char * search_guest_mem(int xc_handle, uint32_t domid,
+    unsigned long mfn)
+{
+    unsigned long base = mfn / GUEST_MEM_ENTRY_SIZE;
+    unsigned long offset = mfn % GUEST_MEM_ENTRY_SIZE;
+
+    if (base >= guest_memory_size) {
+        ERROR("Error base(%lu) is greater than guest_memory_size(%lu)\n",
+            base, guest_memory_size);
+        return NULL;
+    }
+
+    if ( guest_memory_status[base][offset]  == XEN_DOMCTL_PFINFO_XTAB ) {
+        /* reload XTAB place */
+        munmap(guest_memory[base], GUEST_MEM_ENTRY_SIZE*PAGE_SIZE);
+        guest_memory[base] = NULL;
+        DPRINTF("guest_memory[%lu] (frame 0x%lx - 0x%lx) will be remapped\n",
+            base, base * GUEST_MEM_ENTRY_SIZE,
+            (base + 1) * GUEST_MEM_ENTRY_SIZE - 1);
+    }
+
+    if (guest_memory[base] == NULL)
+        /* map_guest_mem() takes the window index: pass base, not the
+         * page offset inside the window (the old code passed offset and
+         * so remapped the wrong region of guest memory). */
+        if (map_guest_mem(xc_handle, domid, base))
+            return NULL;
+
+    return guest_memory[base] + offset * PAGE_SIZE;
+    /* Since I don't care of XEN_DOMCTL_PFINFO_LTAB_MASK,
+        this program may cause some accidents. */
+}
+
+/*
+ * Allocate and populate the guest_memory / guest_memory_status window
+ * arrays (one GUEST_MEM_ENTRY_SIZE-page window per entry) and map every
+ * window of the guest's memory.  Returns 0 on success, -1 on failure.
+ */
+static inline int init_guest_mem(int xc_handle, uint32_t dom)
+{
+    int i;
+
+    guest_memory_size = p2m_size / GUEST_MEM_ENTRY_SIZE + 1;
+    DPRINTF("guest_memory_size: %lu\n", guest_memory_size);
+
+    /* mapped memory */
+    guest_memory = xg_memalign(PAGE_SIZE,
+        guest_memory_size * sizeof(guest_memory[0]));
+    if (guest_memory == NULL)
+    {
+        PERROR("failed to allocate guest_memory");
+        return -1;
+    }
+    if ( lock_pages(guest_memory, guest_memory_size * sizeof(guest_memory[0])))
+    {
+        ERROR("Unable to lock guest_memory array");
+        return -1;
+    }
+
+    /* memory status */
+    guest_memory_status   = xg_memalign(PAGE_SIZE,
+        guest_memory_size * sizeof(guest_memory_status[0]));
+    if ( guest_memory_status == NULL )
+    {
+        ERROR("failed to alloc memory for guest_memory_status");
+        errno = ENOMEM;
+        return -1;
+    }
+    if ( lock_pages(guest_memory_status,
+        guest_memory_size * sizeof(guest_memory_status[0])))
+    {
+        ERROR("Unable to lock guest_memory_status array");
+        return -1;
+    }
+
+    for (i = 0; i < guest_memory_size; i++) {
+        guest_memory_status[i] = xg_memalign(PAGE_SIZE,
+            GUEST_MEM_ENTRY_SIZE * sizeof(guest_memory_status[0][0]));
+        if (guest_memory_status[i] == NULL) {
+            ERROR("failed to alloc memory for guest_memory_status[%d]", i);
+            errno = ENOMEM;
+            return -1;
+        }
+        /* Lock the per-window pfn array just allocated.  The old code
+         * re-locked the outer guest_memory_status array here, with the
+         * wrong size, leaving the per-window arrays unlocked. */
+        if ( lock_pages(guest_memory_status[i],
+            GUEST_MEM_ENTRY_SIZE * sizeof(guest_memory_status[0][0])))
+        {
+            ERROR("Unable to lock guest_memory_status[%d]", i);
+            return -1;
+        }
+    }
+
+    for (i = 0; i < guest_memory_size; i++)
+        if (map_guest_mem(xc_handle, dom, i) < 0)
+            return -1;
+
+    return 0;
+}
+
+/*
+ * Write all of @iov to @fd, treating anything other than a complete
+ * writev() as failure.  Returns 0 on success, -1 otherwise.
+ */
+static int writev_exact(int fd, const struct iovec *iov, size_t count)
+{
+    int i;
+    size_t sum;
+    /* Total byte count the iovecs describe, for the all-or-nothing check. */
+    for (i = 0, sum = 0; i < count; i++)
+        sum += iov[i].iov_len;
+
+    /* NOTE(review): a short write is reported as failure, but any bytes
+     * already on the wire cannot be recovered by the caller. */
+    if (writev(fd, iov, count) != sum)
+        return -1;
+    else
+        return 0;
+}
+
+/* grep fodder: machine_to_phys */
+
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+/* Size in bytes of a bitmap holding one bit per guest pfn. */
+#define BITMAP_SIZE   (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
+
+#define BITMAP_ENTRY(_nr,_bmap) \
+   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+/* Test bit @nr in bitmap @addr. */
+static inline int test_bit (int nr, volatile void * addr)
+{
+    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+/* Clear bit @nr in bitmap @addr (read-modify-write, not atomic). */
+static inline void clear_bit (int nr, volatile void * addr)
+{
+    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+/* Set bit @nr in bitmap @addr (read-modify-write, not atomic). */
+static inline void set_bit ( int nr, volatile void * addr)
+{
+    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
+}
+
+/* Elapsed time new - old, in microseconds. */
+static uint64_t tv_delta(struct timeval *new, struct timeval *old)
+{
+    return (((new->tv_sec - old->tv_sec)*1000000) +
+            (new->tv_usec - old->tv_usec));
+}
+
+/*
+ * write_exact() wrapper that periodically drops the page cache for @fd
+ * so a long-running save does not evict useful dom0 cache.  Returns
+ * @len on success and -1 on failure, mirroring write(2)-style callers
+ * that compare the result against the requested length.
+ */
+static int noncached_write(int fd, void *buffer, int len)
+{
+    static int write_count = 0;
+    int rc = (write_exact(fd, buffer, len) == 0) ? len : -1;
+
+    write_count += len;
+    if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
+    {
+        /* Time to discard cache - dont care if this fails */
+        discard_file_cache(fd, 0 /* no flush */);
+        write_count = 0;
+    }
+
+    return rc;
+}
+
+#ifdef ADAPTIVE_SAVE
+
+/*
+** We control the rate at which we transmit (or save) to minimize impact
+** on running domains (including the target if we're doing live migrate).
+*/
+
+#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
+#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
+
+/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
+#define RATE_TO_BTU      781250
+
+/* Amount in bytes we allow ourselves to send in a burst */
+#define BURST_BUDGET (100*1024)
+
+/* We keep track of the current and previous transmission rate */
+static int mbit_rate, ombit_rate = 0;
+
+/* Have we reached the maximum transmission rate? */
+#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
+
+static inline void initialize_mbit_rate()
+{
+    mbit_rate = START_MBIT_RATE;
+}
+
+/*
+ * Rate-limited write: spend from a byte budget that is replenished
+ * every burst_time_us microseconds according to mbit_rate, sleeping
+ * when the budget is exhausted, then delegate to noncached_write().
+ * State (budget, timing) is static, so this is single-stream only.
+ */
+static int ratewrite(int io_fd, void *buf, int n)
+{
+    static int budget = 0;
+    static int burst_time_us = -1;
+    static struct timeval last_put = { 0 };
+    struct timeval now;
+    struct timespec delay;
+    long long delta;
+
+    if ( START_MBIT_RATE == 0 )
+        return noncached_write(io_fd, buf, n);
+
+    budget -= n;
+    if ( budget < 0 )
+    {
+        if ( mbit_rate != ombit_rate )
+        {
+            /* Rate changed (see print_stats): recompute the slot time. */
+            burst_time_us = RATE_TO_BTU / mbit_rate;
+            ombit_rate = mbit_rate;
+            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
+                    mbit_rate, BURST_BUDGET, burst_time_us);
+        }
+        if ( last_put.tv_sec == 0 )
+        {
+            /* First use: seed the clock and grant one burst. */
+            budget += BURST_BUDGET;
+            gettimeofday(&last_put, NULL);
+        }
+        else
+        {
+            while ( budget < 0 )
+            {
+                gettimeofday(&now, NULL);
+                delta = tv_delta(&now, &last_put);
+                /* Credit one burst for every elapsed slot. */
+                while ( delta > burst_time_us )
+                {
+                    budget += BURST_BUDGET;
+                    last_put.tv_usec += burst_time_us;
+                    if ( last_put.tv_usec > 1000000 )
+                    {
+                        last_put.tv_usec -= 1000000;
+                        last_put.tv_sec++;
+                    }
+                    delta -= burst_time_us;
+                }
+                if ( budget > 0 )
+                    break;
+                /* Still over budget: sleep out the rest of the slot. */
+                delay.tv_sec = 0;
+                delay.tv_nsec = 1000 * (burst_time_us - delta);
+                while ( delay.tv_nsec > 0 )
+                    if ( nanosleep(&delay, &delay) == 0 )
+                        break;
+            }
+        }
+    }
+    return noncached_write(io_fd, buf, n);
+}
+
+#else /* ! ADAPTIVE SAVE */
+
+#define RATE_IS_MAX() (0)
+#define ratewrite(_io_fd, _buf, _n) noncached_write((_io_fd), (_buf), (_n))
+#define initialize_mbit_rate()
+
+#endif
+
+/*
+ * Sample wall-clock and dom0/domU CPU usage, optionally print transfer
+ * statistics for this iteration, and (under ADAPTIVE_SAVE) raise
+ * mbit_rate to track the observed dirtying rate.  Keeps its previous
+ * sample in static state; always returns 0.
+ */
+static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
+                       xc_shadow_op_stats_t *stats, int print)
+{
+    static struct timeval wall_last;
+    static long long      d0_cpu_last;
+    static long long      d1_cpu_last;
+
+    struct timeval        wall_now;
+    long long             wall_delta;
+    long long             d0_cpu_now, d0_cpu_delta;
+    long long             d1_cpu_now, d1_cpu_delta;
+
+    gettimeofday(&wall_now, NULL);
+
+    /* CPU usage in milliseconds; vcpu 0 only (see FIXME). */
+    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
+    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
+
+    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
+        DPRINTF("ARRHHH!!\n");
+
+    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
+    if ( wall_delta == 0 )
+        wall_delta = 1;    /* avoid division by zero below */
+
+    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
+    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
+
+    if ( print )
+        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
+                "dirtied %dMb/s %" PRId32 " pages\n",
+                wall_delta,
+                (int)((d0_cpu_delta*100)/wall_delta),
+                (int)((d1_cpu_delta*100)/wall_delta),
+                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
+                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
+                stats->dirty_count);
+
+#ifdef ADAPTIVE_SAVE
+    /* Track the dirtying rate (plus headroom) up to MAX_MBIT_RATE. */
+    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
+    {
+        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
+            + 50;
+        if ( mbit_rate > MAX_MBIT_RATE )
+            mbit_rate = MAX_MBIT_RATE;
+    }
+#endif
+
+    d0_cpu_last = d0_cpu_now;
+    d1_cpu_last = d1_cpu_now;
+    wall_last   = wall_now;
+
+    return 0;
+}
+
+/*
+ * Send the qemu-dm state image (/dev/shm/qemu-save.<dom>) down @io_fd
+ * as a chunk header { -1, image_size } followed by the raw image.
+ * Returns 0 on success, -1 on failure.
+ */
+static int send_qemu_image(int xc_handle, int io_fd, uint32_t dom)
+{
+    char path[128];
+    struct stat st;
+    struct {
+        int minusfour;
+        uint32_t image_size;
+    } chunk = { -1, 0 };
+    int qemu_fd = -1;
+    int rc = -1;
+
+    snprintf(path, sizeof(path), "/dev/shm/qemu-save.%d", dom);
+    if ((qemu_fd = open(path, O_RDONLY)) == -1)
+    {
+        PERROR("Error when opening qemu image %s", path);
+        goto out;
+    }
+
+    if (fstat(qemu_fd, &st) == -1)
+    {
+        PERROR("Error fstat qemu file %s", path);
+        goto out;
+    }
+    chunk.image_size = st.st_size;
+
+    if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing header for qemu image");
+        goto out;
+    }
+
+    if ( sendfile(io_fd, qemu_fd, NULL, chunk.image_size) !=
+        chunk.image_size)
+    {
+        PERROR("Error when writing qemu image");
+        goto out;
+    }
+
+    rc = 0;
+out:
+    /* Close on all paths: the old code leaked qemu_fd on every error
+     * taken after a successful open(). */
+    if (qemu_fd != -1)
+        close(qemu_fd);
+    return rc;
+}
+
+/*
+ * Send the HVM_PARAM_IDENT_PT (chunk id -3) and HVM_PARAM_VM86_TSS
+ * (chunk id -4) parameters as tagged chunks; a parameter whose value
+ * is zero is skipped.  Returns 0 on success, -1 on write failure.
+ */
+static int send_hvm_params(int xc_handle, int io_fd, uint32_t dom)
+{
+    struct {
+        int id;
+        uint32_t pad;
+        uint64_t data;
+    } chunk = { 0, 0 };  /* remaining member (data) is zero-initialized */
+
+    chunk.id = -3;
+    /* NOTE(review): the xc_get_hvm_param() return value is ignored; on
+     * failure chunk.data stays 0 and the chunk is silently skipped. */
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
+                     &chunk.data);
+
+    if ( (chunk.data != 0) &&
+         write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing the ident_pt for EPT guest");
+        return -1;
+    }
+
+    chunk.id = -4;
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
+                     &chunk.data);
+
+    if ( (chunk.data != 0) &&
+         write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing the vm86 TSS for guest");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Fetch the domain's HVM context from Xen into the ring's context
+ * buffer, then write it to @io_fd as a uint32_t record length followed
+ * by the raw context.  Returns 0 on success, -1 on failure.
+ */
+static int send_hvm_context(int xc_handle, int io_fd,
+                            struct kemari_ring *ring, uint32_t dom)
+{
+    uint32_t buf_size = ring->hvm_ctxt.buf_size;
+    /* Initial value from the ring is overwritten below on success. */
+    uint32_t rec_size = ring->hvm_ctxt.rec_size;
+    uint8_t *hvm_buf = (uint8_t *)ring + ring->hvm_ctxt.buf_offset;
+    int rc = -1;
+
+    /* Get HVM context from Xen and save it too */
+    /* NOTE(review): rec_size is uint32_t, so the int -1 error return of
+     * xc_domain_hvm_getcontext() is caught via unsigned conversion. */
+    if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
+                                              buf_size)) == -1 )
+    {
+        ERROR("HVM:Could not get hvm buffer");
+        goto out;
+    }
+
+    if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
+    {
+        PERROR("error write hvm buffer size");
+        goto out;
+    }
+
+    if ( write_exact(io_fd, hvm_buf, rec_size) )
+    {
+        PERROR("write HVM info failed!\n");
+        goto out;
+    }
+    rc = 0;
+
+out:
+    return rc;
+}
+
+int xc_kemari_save(int xc_handle, int io_fd, uint32_t dom,
+                   void *kemari_ring, uint32_t flags,
+                   int hvm, void *(*init_qemu_maps)(int, unsigned))
+{
+    int rc = 1, i, j, iter = 0;
+    int debug = (flags & XCFLAGS_DEBUG);
+    int sent_last_iter, skip_this_iter;
+    xc_dominfo_t info;
+    struct kemari_ring *ring = (struct kemari_ring *)kemari_ring;
+
+    /* base of the region in which domain memory is mapped */
+    unsigned char *region_base = NULL;
+
+    /* bitmap of pages:
+       - that should be sent this iteration (unless later marked as skip);
+       - to skip this iteration because already dirty;
+       - to fixup by sending at the end if not already resent; */
+    unsigned long *to_send = NULL, *to_fix = NULL;
+
+    xc_shadow_op_stats_t stats;
+
+    unsigned long needed_to_fix = 0;
+    unsigned long total_sent    = 0;
+
+    /* HVM: magic frames for ioreqs and xenstore comms. */
+    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+    /* callback irq */
+    uint64_t callback_irq = 0;
+
+    if ( !hvm )
+    {
+        ERROR("HVM domain is required for the kemari migration.");
+        return 1;
+    }
+
+    initialize_mbit_rate();
+
+    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
+    {
+        ERROR("Could not get domain info");
+        return 1;
+    }
+
+    shared_info_frame = info.shared_info_frame;
+    DPRINTF("xc_kemari_save: shared_info_frame: %lu\n", shared_info_frame);
+
+    /* Get the size of the P2M table */
+    p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1;
+    DPRINTF("xc_kemari_save: p2m_size: %lu\n", p2m_size);
+
+    /* Domain is still running at this point */
+    {
+        /* Get qemu-dm logging dirty pages too */
+        void *seg = init_qemu_maps(dom, BITMAP_SIZE);
+        qemu_bitmaps[0] = seg;
+        qemu_bitmaps[1] = seg + BITMAP_SIZE;
+        qemu_active = 0;
+        qemu_non_active = 1;
+    }
+
+    /* pretend we sent all the pages last iteration */
+    sent_last_iter = p2m_size;
+
+    /* Setup to_send / to_fix bitmaps */
+    to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
+    to_fix  = calloc(1, BITMAP_SIZE);
+
+    if ( !to_send || !to_fix )
+    {
+        ERROR("Couldn't allocate to_send array");
+        goto out;
+    }
+
+    memset(to_send, 0xff, BITMAP_SIZE);
+
+    if ( lock_pages(to_send, BITMAP_SIZE) )
+    {
+        ERROR("Unable to lock to_send");
+        return 1;
+    }
+
+    pfn_type   = xg_memalign(PAGE_SIZE, ROUNDUP(
+                              MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
+    if ( pfn_type == NULL )
+    {
+        ERROR("failed to alloc memory for pfn_type arrays");
+        errno = ENOMEM;
+        goto out;
+    }
+    memset(pfn_type, 0,
+           ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
+
+    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
+    {
+        ERROR("Unable to lock pfn_type array");
+        goto out;
+    }
+
+    /* Start writing out the saved-domain record. */
+    if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+    {
+        PERROR("write: p2m_size");
+        goto out;
+    }
+
+    /* send shared_info_frame */
+    if ( write_exact(io_fd, &shared_info_frame, sizeof(unsigned long)) )
+    {
+        PERROR("write: shared_info_frame");
+        goto out;
+    }
+
+    /* Save magic-page locations. */
+    memset(magic_pfns, 0, sizeof(magic_pfns));
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
+                     &magic_pfns[0]);
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
+                     &magic_pfns[1]);
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
+                     &magic_pfns[2]);
+    DPRINTF("kemari_restore: magic_pfns 0: %lld, 1: %lld, 2: %lld\n",
+        magic_pfns[0], magic_pfns[1], magic_pfns[2]);
+    if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+    {
+        PERROR("Error when writing to state file (7)");
+        goto out;
+    }
+
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_CALLBACK_IRQ,
+                     &callback_irq);
+    DPRINTF("kemari_restore: callback irq %llx", callback_irq);
+    if ( write_exact(io_fd, &callback_irq, sizeof(callback_irq)) )
+    {
+        PERROR("Error when writing to state file (8)");
+        goto out;
+    }
+
+    print_stats(xc_handle, dom, 0, &stats, 0);
+
+    /* Now write out each data page, canonicalising page tables as we go... */
+    {
+        unsigned int prev_pc, sent_this_iter, N, batch, run;
+
+        iter++;
+        sent_this_iter = 0;
+        skip_this_iter = 0;
+        prev_pc = 0;
+        N = 0;
+
+        DPRINTF("Saving memory pages: iter %d   0%%", iter);
+
+        while ( N < p2m_size )
+        {
+            unsigned int this_pc = (N * 100) / p2m_size;
+
+            if ( (this_pc - prev_pc) >= 5 )
+            {
+                DPRINTF("\b\b\b\b%3d%%", this_pc);
+                prev_pc = this_pc;
+            }
+
+            /* load pfn_type[] with the mfn of all the pages we're doing in
+               this batch. */
+            for  ( batch = 0;
+                   (batch < MAX_BATCH_SIZE) && (N < p2m_size);
+                   N++ )
+            {
+                int n = N;
+
+                if ( debug )
+                {
+                    DPRINTF("%d pfn= %08lx mfn= %08lx %d",
+                            iter, (unsigned long)n,
+                            (long unsigned int)0,
+                            test_bit(n, to_send));
+                    DPRINTF("\n");
+                }
+
+                if ( !( (test_bit(n, to_send)) || (test_bit(n, to_fix))) )
+                    continue;
+
+                /*
+                ** we get here if:
+                **  1. page is marked to_send & hasn't already been re-dirtied
+                **  2. add in pages that still need fixup (net bufs)
+                */
+
+                /* Hypercall interfaces operate in PFNs for HVM guests
+                * and MFNs for PV guests */
+                pfn_type[batch] = n;
+
+                if ( !is_mapped(pfn_type[batch]) )
+                {
+                    /*
+                    ** not currently in psuedo-physical map -- set bit
+                    ** in to_fix since we must send this page in last_iter
+                    ** unless its sent sooner anyhow, or it never enters
+                    ** pseudo-physical map (e.g. for ballooned down doms)
+                    */
+                    set_bit(n, to_fix);
+                    continue;
+                }
+
+                if ( test_bit(n, to_fix) &&
+                     !test_bit(n, to_send) )
+                {
+                    needed_to_fix++;
+                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
+                            iter, n, pfn_type[batch]);
+                }
+
+                clear_bit(n, to_fix);
+
+                batch++;
+            }
+
+            if ( batch == 0 )
+                goto skip; /* vanishingly unlikely... */
+
+            region_base = xc_map_foreign_batch(
+                xc_handle, dom, PROT_READ, pfn_type, batch);
+            if ( region_base == NULL )
+            {
+                ERROR("map batch failed");
+                goto out;
+            }
+
+            {
+                /* Look for and skip completely empty batches. */
+                for ( j = 0; j < batch; j++ )
+                    if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+                         XEN_DOMCTL_PFINFO_XTAB )
+                        break;
+                if ( j == batch )
+                {
+                    munmap(region_base, batch*PAGE_SIZE);
+                    continue; /* bail on this batch: no valid pages */
+                }
+            }
+
+            if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
+            {
+                PERROR("Error when writing to state file (2)");
+                goto out;
+            }
+
+            if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
+            {
+                PERROR("Error when writing to state file (3)");
+                goto out;
+            }
+
+            /* entering this loop, pfn_type is now in pfns (Not mfns) */
+            run = 0;
+            for ( j = 0; j < batch; j++ )
+            {
+                unsigned long pfn, pagetype;
+
+                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype != 0 )
+                {
+                    /* If the page is not a normal data page, write out any
+                       run of pages we may have previously acumulated */
+                    if ( run )
+                    {
+                        if ( ratewrite(io_fd,
+                                       (char*)region_base+(PAGE_SIZE*(j-run)),
+                                       PAGE_SIZE*run) != PAGE_SIZE*run )
+                        {
+                            ERROR("Error when writing to state file (4a)"
+                                  " (errno %d)", errno);
+                            goto out;
+                        }
+                        run = 0;
+                    }
+                }
+
+                /* skip pages that aren't present */
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    continue;
+
+                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+                {
+                    DPRINTF("canonicalize_pagetable pagetype = %lx pfn = %lu\n", pagetype, pfn);
+                }
+                else
+                {
+                    /* We have a normal page: accumulate it for writing. */
+                    run++;
+                }
+            } /* end of the write out for this batch */
+
+            if ( run )
+            {
+                /* write out the last accumulated run of pages */
+                if ( ratewrite(io_fd,
+                               (char*)region_base+(PAGE_SIZE*(j-run)),
+                               PAGE_SIZE*run) != PAGE_SIZE*run )
+                {
+                    ERROR("Error when writing to state file (4c)"
+                          " (errno %d)", errno);
+                    goto out;
+                }
+            }
+
+            sent_this_iter += batch;
+
+            munmap(region_base, batch*PAGE_SIZE);
+
+        } /* end of this while loop for this iteration */
+
+      skip:
+
+        total_sent += sent_this_iter;
+
+        DPRINTF("\r %d: sent %d, skipped %d, ",
+                iter, sent_this_iter, skip_this_iter );
+
+        {
+            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+
+            DPRINTF("Total pages sent= %ld (%.2fx)\n",
+                    total_sent, ((float)total_sent)/p2m_size );
+            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
+        }
+    } /* end of infinite for loop */
+
+    DPRINTF("All memory is saved\n");
+
+    if (send_hvm_params(xc_handle, io_fd, dom) < 0)
+        goto out;
+
+    /* Zero terminate */
+    i = 0;
+    if ( write_exact(io_fd, &i, sizeof(int)) )
+    {
+        PERROR("Error when writing to state file (6')");
+        goto out;
+    }
+
+    if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0)
+        goto out;
+
+    if (!debug)
+    {
+        int rcv_status;
+        if ( read_exact(io_fd, &rcv_status, sizeof(int))) {
+            ERROR("Error when reading receiver status");
+            goto out;
+        }
+        DPRINTF("status received: %d\n", rcv_status);
+    }
+
+    if (init_guest_mem(xc_handle, dom) < 0)
+        goto out;
+
+    /* HVM guests are done now */
+    rc = 0;
+
+ out:
+
+    /* Flush last write and discard cache for file. */
+    discard_file_cache(io_fd, 1 /* flush */);
+
+    free(to_send);
+    free(to_fix);
+
+    DPRINTF("Save exit rc=%d\n",rc);
+
+    return !!rc;
+}
+
+
+int xc_kemari_update(int xc_handle, int io_fd, uint32_t dom,
+                     void *kemari_ring, uint32_t flags,
+                     void (*qemu_save_image)(int),
+                     void (*qemu_end_flip)(void),
+                     void (*qemu_end_save)(void),
+                     void (*qemu_image_sent)(void))
+{
+    int rc = 1, k;
+    int debug = (flags & XCFLAGS_DEBUG);
+    uint32_t i, j, index = 0;
+    unsigned int batch = 0;
+    struct kemari_ring *ring = (struct kemari_ring *)kemari_ring;
+    struct kemari_ent *buf;
+    struct iovec iov[MAX_BATCH_SIZE + 2]; /* 2 for batch and pfn_type */
+    int iovcnt = 2;
+
+#define ADD_IOV(base, len) do {                                         \
+    iov[iovcnt].iov_base = base;                                        \
+    iov[iovcnt].iov_len = len;                                          \
+    iovcnt++;                                                           \
+} while (0)
+
+
+
+    /* flip active qemu */
+    qemu_active = qemu_non_active;
+    qemu_non_active = qemu_active ? 0 : 1;
+    qemu_save_image(qemu_active);
+
+    /*
+     * main iteration starts from here
+     */
+    while (ring->cons < ring->prod) {
+
+        kemari_ring_read(ring, &buf);
+
+        for (i = buf->u.index.start, j = buf->u.index.end; i < j; i++) {
+
+            int next, offset = 0;
+
+            index = i * BITS_PER_LONG;
+
+            kemari_ring_read(ring, &buf);
+
+            while (buf->u.dirty_bitmap && offset < BITS_PER_LONG) {
+                int n;
+                next = ffs(buf->u.dirty_bitmap);
+                buf->u.dirty_bitmap >>= next;
+                offset += next;
+                n = offset + index - 1;
+                ADD_IOV(search_guest_mem(xc_handle, dom, n), PAGE_SIZE);
+                pfn_type[batch] = n;
+                batch++;
+            }
+
+            if ((batch + BITS_PER_LONG - 1 < MAX_BATCH_SIZE) &&
+                !(ring->cons == ring->prod))
+                continue;
+
+            /* Pull in the dirty bits from qemu-dm too */
+            qemu_end_flip();
+            for ( k = 0; k < BITMAP_SIZE / BITS_PER_LONG; k++) {
+                if (qemu_bitmaps[qemu_non_active][k] != 0) {
+                    unsigned int bmp = qemu_bitmaps[qemu_non_active][k];
+
+                    index = k * BITS_PER_LONG;
+                    while (bmp && offset < BITS_PER_LONG) {
+                        int n, next, offset = 0;
+                        next = ffs(bmp);
+                        bmp >>= next;
+                        offset += next;
+                        n = offset + index - 1;
+
+                        ADD_IOV(search_guest_mem(xc_handle, dom, n), PAGE_SIZE);
+                        pfn_type[batch] = n;
+                        batch++;
+                    }
+                    qemu_bitmaps[qemu_non_active][k] = 0;
+                }
+                if (batch >= MAX_BATCH_SIZE) {
+                    ERROR("Sorry, reached MAX_BATCH_SIZE.  "
+                        "We will fix this lator.");
+                    goto out;
+                }
+            }
+
+            PPRINTF("batch %d\n", batch);
+
+            /* send pages */
+            iov[0].iov_base = &batch;
+            iov[0].iov_len = sizeof(batch);
+
+            iov[1].iov_base = pfn_type;
+            iov[1].iov_len = sizeof(pfn_type[0]) * batch;
+
+            for (k = 0; k < iovcnt / IOV_MAX + 1; k++) {
+                int count = (iovcnt<IOV_MAX*(k+1))?(iovcnt-IOV_MAX*k):IOV_MAX;
+                if (writev_exact(io_fd, &iov[IOV_MAX * k], count)) {
+                    ERROR("Error when writing pages state file (2--4)"
+                          " (errno %d)", errno);
+                    goto out;
+                }
+            }
+
+            batch = 0;
+        }
+    }
+
+    if (send_hvm_params(xc_handle, io_fd, dom) < 0)
+        goto out;
+    qemu_end_save();
+    if (!debug && send_qemu_image(xc_handle, io_fd, dom) < 0)
+        goto out;
+    qemu_image_sent();
+
+    /* Zero terminate */
+    i = 0;
+    if ( write_exact(io_fd, &i, sizeof(int)) )
+    {
+        PERROR("Error when writing to state file (6')");
+        goto out;
+    }
+
+    if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0)
+        goto out;
+
+    if (!debug)
+    {
+        int rcv_status;
+        if ( read_exact(io_fd, &rcv_status, sizeof(int))) {
+            ERROR("Error when reading receiver status");
+            goto out;
+        }
+    }
+
+    rc = 0;
+out:
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r b249f3e979a5 -r 06b950859c92 tools/xcutils/xc_kemari_save.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xcutils/xc_kemari_save.c    Tue Mar 24 15:11:38 2009 +0900
@@ -0,0 +1,525 @@
+/*
+ * xc_kemari_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2008-2009 Nippon Telegraph and Telephone Corporation.
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file "COPYING" in the main directory of
+ * this archive for more details.
+ *
+ * This source code is based on xc_save.c.
+ * Copied qemu_destroy_buffer and init_qemu_maps from xc_save.c.
+ *
+ * Copyright (C) 2005 by Christian Limpach
+ *
+ */
+
+
+#include <err.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <xs.h>
+#include <xenctrl.h>
+#include <xenguest.h>
+#include <xc_private.h>
+#include <xen/kemari.h>
+
+static volatile sig_atomic_t run = 1;
+static int xc_handle, xce_handle, io_fd;
+static struct kemari_ring *ring = NULL;
+static uint32_t kemari_ring_size = 0;
+static int qemu_port;
+static int is_finalized = 0;
+static int domid;
+
+/* For HVM guests, there are two sources of dirty pages: the Xen shadow
+ * log-dirty bitmap, which we get with a hypercall, and qemu's version.
+ * The protocol for getting page-dirtying data from qemu uses a
+ * double-buffered shared memory interface directly between xc_save and
+ * qemu-dm.
+ *
+ * xc_save calculates the size of the bitmaps and notifies qemu-dm
+ * through the store that it wants to share the bitmaps.  qemu-dm then
+ * starts filling in the 'active' buffer.
+ *
+ * To change the buffers over, xc_save writes the other buffer number to
+ * the store and waits for qemu to acknowledge that it is now writing to
+ * the new active buffer.  xc_save can then process and clear the old
+ * active buffer. */
+
+static char *qemu_active_path;
+static char *qemu_next_active_path;
+static int qemu_shmid = -1;
+static struct xs_handle *xs;
+
+
+/* Mark the shared-memory segment for destruction (idempotent: safe to
+ * call more than once; registered with atexit() in init_qemu_maps). */
+static void qemu_destroy_buffer(void)
+{
+    int shmid = qemu_shmid;
+
+    qemu_shmid = -1;
+    if (shmid != -1)
+        shmctl(shmid, IPC_RMID, NULL);
+}
+
+/* Control bytes shared with qemu-dm; points just past the two dirty
+ * bitmaps in the shm segment mapped by init_qemu_maps().
+ *   [0] = bitmap index qemu should write to next
+ *   [1] = set by qemu to acknowledge the buffer flip
+ *   [2] = set by qemu when its device-state save is complete */
+static char *kemari_qemu_info = NULL;
+/* Request that qemu-dm flip to dirty bitmap 'next_active' and save its
+ * device state; the flip is acknowledged later via qemu_end_flip(). */
+static void qemu_save_image(int next_active)
+{
+    kemari_qemu_info[0] = next_active;
+    kemari_qemu_info[1] = 0;
+    xen_wmb();  /* publish the bytes before kicking the event channel */
+    xc_evtchn_notify(xce_handle, qemu_port);
+}
+
+/* Busy-wait until qemu-dm acknowledges the buffer flip by setting info
+ * byte [1] (cleared in qemu_save_image()).  The xen_rmb() in the loop
+ * body orders the re-reads; NOTE(review): kemari_qemu_info is not
+ * volatile-qualified — confirm xen_rmb() also acts as a compiler
+ * barrier on all supported toolchains, or the load may be hoisted. */
+static void qemu_end_flip(void)
+{
+    while (kemari_qemu_info[1] == 0)
+        xen_rmb();
+}
+
+/* Busy-wait until qemu-dm signals that its device-state save finished
+ * (info byte [2] becomes nonzero).  The flag is re-armed afterwards by
+ * qemu_image_sent(); same volatile caveat as qemu_end_flip(). */
+static void qemu_end_save(void)
+{
+    while (kemari_qemu_info[2] == 0)
+        xen_rmb();
+}
+
+/* Re-arm the save handshake once the QEMU image has been transmitted:
+ * clear info byte [2] so the next qemu_end_save() blocks until qemu-dm
+ * completes a fresh save. */
+static void qemu_image_sent(void)
+{
+    /* after QEMU image sent */
+    kemari_qemu_info[2] = 0;
+    xen_wmb();  /* make the cleared flag visible to qemu-dm */
+}
+
+/* Create the double-buffered dirty-bitmap segment shared with qemu-dm
+ * and wire up the xenstore handshake described at the top of the file.
+ *
+ * Segment layout (total 2 * bitmap_size + PAGE_SIZE bytes):
+ *   [0, bitmap_size)               dirty bitmap buffer 0
+ *   [bitmap_size, 2*bitmap_size)   dirty bitmap buffer 1
+ *   [2*bitmap_size, ...)           kemari_qemu_info control bytes
+ * The first 32 bits of the segment carry bitmap_size so qemu-dm can
+ * find the control area.  The shm key is advertised to qemu-dm under
+ * .../logdirty/key in xenstore.  Exits via errx() on any failure;
+ * returns the mapped segment on success. */
+static void *init_qemu_maps(int domid, unsigned int bitmap_size)
+{
+    key_t key;
+    char key_ascii[17] = {0,};
+    void *seg;
+    char *path, *p;
+
+    /* Make a shared-memory segment */
+    do {
+        key = rand(); /* No security, just a sequence of numbers */
+        qemu_shmid = shmget(key, 2 * bitmap_size + PAGE_SIZE,
+                       IPC_CREAT|IPC_EXCL|S_IRUSR|S_IWUSR);
+        if (qemu_shmid == -1 && errno != EEXIST)
+            errx(1, "can't get shmem to talk to qemu-dm");
+    } while (qemu_shmid == -1);  /* retry only on key collision (EEXIST) */
+
+    /* Remember to tidy up after ourselves */
+    atexit(qemu_destroy_buffer);
+
+    /* Map it into our address space */
+    seg = shmat(qemu_shmid, NULL, 0);
+    if (seg == (void *) -1)
+        errx(1, "can't map shmem to talk to qemu-dm");
+    memset(seg, 0, 2 * bitmap_size + PAGE_SIZE);
+
+    /* Write the size of it into the first 32 bits */
+    *(uint32_t *)seg = bitmap_size;
+
+    /* Tell qemu about it */
+    if ((xs = xs_daemon_open()) == NULL)
+        errx(1, "Couldn't contact xenstore");
+    if (!(path = strdup("/local/domain/0/device-model/")))
+        errx(1, "can't get domain path in store");
+    /* Room for up to a 10-digit domid plus the longest leaf name. */
+    if (!(path = realloc(path, strlen(path)
+                         + 10
+                         + strlen("/logdirty/next-active") + 1)))
+        errx(1, "no memory for constructing xenstore path");
+    snprintf(path + strlen(path), 11, "%i", domid);
+    strcat(path, "/logdirty/");
+    p = path + strlen(path);  /* leaf names are appended at 'p' below */
+
+    strcpy(p, "key");
+    snprintf(key_ascii, 17, "%16.16llx", (unsigned long long) key);
+    if (!xs_write(xs, XBT_NULL, path, key_ascii, 16))
+        errx(1, "can't write key (%s) to store path (%s)\n", key_ascii, path);
+
+    /* Watch for qemu's indication of the active buffer, and request it
+     * to start writing to buffer 0 */
+    strcpy(p, "active");
+    if (!xs_watch(xs, path, "qemu-active-buffer"))
+        errx(1, "can't set watch in store (%s)\n", path);
+    if (!(qemu_active_path = strdup(path)))
+        errx(1, "no memory for copying xenstore path");
+
+    strcpy(p, "next-active");
+    if (!(qemu_next_active_path = strdup(path)))
+        errx(1, "no memory for copying xenstore path");
+
+    /* Control bytes live directly after the two bitmaps. */
+    kemari_qemu_info = seg + 2 * bitmap_size;
+    xen_wmb();
+    qemu_save_image(0);  /* kick off the protocol with buffer 0 */
+
+    free(path);
+    return seg;
+}
+
+/* Termination-signal handler: request a clean exit from main()'s event
+ * loop.  Only touches the sig_atomic_t flag, so it is async-signal-safe.
+ * Note that finalize() destroys the domain when run == 0. */
+static void close_handler(int sig_type)
+{
+    run = 0;
+}
+
+/* Service one pending kemari event: consume the pending event-channel
+ * port, run a full update round (dirty pages + qemu state, driven by
+ * the four qemu_* callbacks), then unmask the port so the next event
+ * can fire.  On update failure the domain is paused before returning,
+ * presumably to stop a diverged guest — TODO confirm intent.
+ * Returns 0 on success, 1 on any failure. */
+static int handle_event(int domid, unsigned int flags)
+{
+    int ret = 1, rcv_port;
+
+    if ((rcv_port = xc_evtchn_pending(xce_handle)) < 0) {
+        ERROR("Failed to read from event fd");
+        goto out;
+    }
+
+    if (xc_kemari_update(xc_handle, io_fd, domid, ring, flags,
+       qemu_save_image, qemu_end_flip, qemu_end_save, qemu_image_sent) != 0) {
+        xc_domain_pause(xc_handle, domid);
+        ERROR("xc_kemari_update failed");
+        goto out;
+    }
+
+    /* Unmask only after the update is on the wire. */
+    if (xc_evtchn_unmask(xce_handle, rcv_port) < 0) {
+        ERROR("Failed to write to event fd");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Install 'handler' for every termination signal we care about
+ * (QUIT, INT, HUP, TERM), with an empty mask and no flags. */
+static void set_signal_handler(void (*handler)(int))
+{
+    static const int signals[] = { SIGQUIT, SIGINT, SIGHUP, SIGTERM };
+    struct sigaction act;
+    unsigned int i;
+
+    act.sa_handler = handler;
+    sigemptyset(&act.sa_mask);
+    act.sa_flags = 0;
+    for (i = 0; i < sizeof(signals) / sizeof(signals[0]); i++)
+        sigaction(signals[i], &act, 0);
+}
+
+/* Attach the event channels of every device of one xenstore class to
+ * kemari in KEMARI_TAP_OUT mode.  'class' is the device directory name
+ * ("vbd" or "vif"); 'tag' labels diagnostics ("blk" or "net").
+ * Devices without an event-channel node are silently skipped.
+ * Returns 0 on success, or the first nonzero xc_kemari_control result.
+ * Exits via errx() if the device directory cannot be listed. */
+static int attach_class_ports(struct xs_handle *xs_handle, int domid,
+                              const char *class, const char *tag)
+{
+    char **list, *data;
+    unsigned int list_size, data_size, i;
+    char path[128];
+    uint32_t port;
+    int ret = 0;
+
+    snprintf(path, sizeof(path), "/local/domain/%d/device/%s", domid, class);
+    list = xs_directory(xs_handle, XBT_NULL, path, &list_size);
+    if (list == NULL)
+        errx(1, "xs_directory (%s) failed", path);
+
+    for (i = 0; i < list_size; i++) {
+        snprintf(path, sizeof(path),
+            "/local/domain/%d/device/%s/%s/event-channel",
+            domid, class, list[i]);
+        data = xs_read(xs_handle, XBT_NULL, path, &data_size);
+        if (data == NULL)
+            continue;   /* no event channel for this device */
+        port = strtoul(data, NULL, 10);
+        free(data);     /* fixed: was leaked on the error path below */
+        if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_attach,
+                                 &port, NULL,
+                                 NULL, KEMARI_TAP_OUT)) != 0) {
+            ERROR("Error when attaching %s_port (%d) on kemari", tag, port);
+            break;
+        }
+        DPRINTF("%s_port %d attached\n", tag, port);
+    }
+    free(list);         /* fixed: was leaked when an attach failed */
+
+    return ret;
+}
+
+/* Attach the outbound event channels of all block (vbd) and network
+ * (vif) devices of 'domid' to kemari so their events are tapped.
+ * Returns 0 on success, nonzero on the first attach failure; exits
+ * via errx() on xenstore errors. */
+static int attach_ports(int domid)
+{
+    struct xs_handle *xs_handle;
+    int ret;
+
+    if ((xs_handle = xs_daemon_open()) == NULL)
+        errx(1, "Couldn't contact xenstore");
+
+    ret = attach_class_ports(xs_handle, domid, "vbd", "blk");
+    if (ret == 0)
+        ret = attach_class_ports(xs_handle, domid, "vif", "net");
+
+    xs_daemon_close(xs_handle);
+
+    return ret;
+}
+
+/* Look up the kemari event-channel port that qemu-dm advertises in
+ * xenstore and bind it as an interdomain channel.  Returns the local
+ * bound port, or -1 when the xenstore node is missing; exits via
+ * errx() if xenstore is unreachable or the bind fails. */
+static int get_qemu_port(unsigned int domid)
+{
+    struct xs_handle *xsh;
+    char node[128];
+    char *value;
+    unsigned int value_len;
+    int remote_port, local_port = -1;
+
+    if ((xsh = xs_daemon_open()) == NULL)
+        errx(1, "Couldn't contact xenstore");
+
+    snprintf(node, sizeof(node),
+        "/local/domain/%u/kemari/event-channel", domid);
+
+    value = xs_read(xsh, XBT_NULL, node, &value_len);
+    if (value == NULL) {
+        ERROR("Could not find QEMU port for domid %d", domid);
+        goto out;
+    }
+    remote_port = strtoul(value, NULL, 10);
+    free(value);
+
+    local_port = xc_evtchn_bind_interdomain(xce_handle, DOMID_SELF,
+                                            remote_port);
+    if (local_port < 0)
+        errx(1, "Port assigned by Xen is strange: %d", local_port);
+
+    DPRINTF("qemu_port: %d %d\n", remote_port, local_port);
+
+out:
+    xs_daemon_close(xsh);
+
+    return local_port;
+}
+
+/* Tear down kemari state exactly once: unmap the ring, disable kemari
+ * and shadow log-dirty mode in Xen, destroy the domain when exiting on
+ * a termination signal (run == 0), and close the control interface.
+ * Registered with atexit() and also called from main()'s exit path. */
+static void finalize(void)
+{
+    int ret;
+
+    if (is_finalized)
+        return;
+
+    /* Stop termination signals from interrupting the teardown. */
+    set_signal_handler(SIG_IGN);
+    if (ring != NULL)
+        munmap(ring, kemari_ring_size * PAGE_SIZE);
+
+    if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_off,
+                            NULL, NULL, NULL, 0)) != 0) {
+        ERROR("Error when turning off kemari");
+    } else {
+        /* fixed typo in log message: "successufully" */
+        DPRINTF("successfully execute KEMARI_OP_off\n");
+    }
+
+    if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                           NULL, 0, NULL, 0, NULL) < 0 ) {
+        ERROR("Warning - couldn't disable shadow mode");
+    }
+
+    /* run is cleared only by close_handler(): a signal-driven exit
+     * destroys the (no longer protected) domain as well. */
+    if (!run)
+        xc_domain_destroy(xc_handle, domid);
+
+    xc_interface_close(xc_handle);
+
+    is_finalized = 1;
+}
+
+/*
+ * Kemari sender entry point.
+ *
+ * usage: xc_kemari_save iofd domid maxit maxf flags
+ *
+ * Enables log-dirty mode and the kemari ring, performs the initial full
+ * save via xc_kemari_save(), then loops forwarding incremental updates
+ * each time the guest kicks the kemari event channel.  Runs until a
+ * termination signal arrives (close_handler()) or an error occurs.
+ */
+int
+main(int argc, char **argv)
+{
+    unsigned int maxit, max_f, flags;
+    int ret = 1;     /* fixed: pessimistic default so failures before the
+                      * save loop no longer exit with status 0 */
+    int evtchn_fd;
+    int port;        /* fixed: was uint32_t, so the 'port < 0' error
+                      * check after bind_interdomain was always false */
+    uint32_t kemari_port;
+    uint64_t kemari_mfn;
+    fd_set inset;
+
+    if (argc != 6)
+        errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);
+
+    xc_handle = xc_interface_open();
+    if (xc_handle < 0)
+        errx(1, "failed to open control interface");
+
+    io_fd = atoi(argv[1]);
+    domid = atoi(argv[2]);
+    maxit = atoi(argv[3]);
+    max_f = atoi(argv[4]);
+    flags = atoi(argv[5]);
+
+    set_signal_handler(close_handler);
+    atexit(finalize);
+
+    if (io_fd == -1) /* means test mode */
+    {
+        io_fd = open("/dev/null", O_RDWR);
+        flags |= XCFLAGS_DEBUG;
+    }
+    else
+    {
+        /* Replication updates are latency-sensitive: disable Nagle. */
+        int one = 1;
+        if (setsockopt(io_fd, IPPROTO_TCP, TCP_NODELAY,
+                       &one, sizeof(one)) < 0) {
+            ERROR("failed to set TCP_NODELAY");
+        }
+    }
+
+    if ((xce_handle = xc_evtchn_open()) < 0) {
+        errx(1, "failed to open control interface");
+    }
+
+    evtchn_fd = xc_evtchn_fd(xce_handle);
+
+    if ((qemu_port = get_qemu_port(domid)) < 0)
+        errx(1, "failed to get qemu port");
+
+    if ( xc_shadow_control(xc_handle, domid,
+                           XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                           NULL, 0, NULL, 0, NULL) < 0 )
+    {
+        int frc;
+        /* log-dirty already enabled? There's no test op,
+           so attempt to disable then reenable it */
+        frc = xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                                NULL, 0, NULL, 0, NULL);
+        if ( frc >= 0 )
+        {
+            frc = xc_shadow_control(xc_handle, domid,
+                                    XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                                    NULL, 0, NULL, 0, NULL);
+        }
+
+        if ( frc < 0 )
+        {
+            err(errno, "Couldn't enable shadow mode (rc %d)", frc);
+        }
+    }
+
+    /* fixed: '!= 0' previously bound inside the assignment, so ret held
+     * only 0/1 rather than the code xc_kemari_control() returned */
+    if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_enable,
+                                 &kemari_port, &kemari_ring_size,
+                                 &kemari_mfn, 0)) != 0) {
+        errx(1, "Error when turning on kemari");
+    }
+
+    /* fixed: cast matches %llu whatever uint64_t is typedef'd to */
+    DPRINTF("kemari_port=%u, kemari_mfn=%llu, kemari_ring_size=%u\n",
+           kemari_port, (unsigned long long)kemari_mfn, kemari_ring_size);
+
+    ret = 1;  /* fixed: the failures below used to leave ret == 0 */
+    if (attach_ports(domid) != 0) {
+        ERROR("attaching port failed ");
+        goto out;
+    }
+
+    if ((port = xc_evtchn_bind_interdomain(xce_handle, domid,
+                                           kemari_port)) < 0) {
+        ERROR("xc_evtchn_bind_interdomain failed ");
+        goto out;
+    }
+
+    /* fixed: repaired line wrapped in transit (PROT_WRITE continuation) */
+    if ((ring = xc_map_foreign_range(xc_handle, DOMID_XEN,
+                                     kemari_ring_size * PAGE_SIZE,
+                                     PROT_READ | PROT_WRITE,
+                                     kemari_mfn)) == 0) {
+        ERROR("xc_map_foreign_range failed");
+        goto out;
+    }
+
+    if (xc_domain_pause(xc_handle, domid) < 0) {
+        ERROR("Domain appears not to have paused");
+        goto out;
+    }
+
+    /* Initial full save while the domain is paused. */
+    ret = xc_kemari_save(xc_handle, io_fd, domid, ring, flags,
+                         !!(flags & XCFLAGS_HVM),
+                         &init_qemu_maps);
+    if (ret != 0) {
+        ERROR("xc_kemari_save failed");
+        goto out;
+    }
+
+    if (xc_domain_unpause(xc_handle, domid) < 0) {
+        ret = 1;
+        ERROR("Domain appears not to have unpaused");
+        goto out;
+    }
+
+    DPRINTF("running start");
+
+    while (run) {
+
+        /* fixed: select() leaves the fd_set unspecified on error/EINTR,
+         * so it must be rebuilt on every iteration */
+        FD_ZERO(&inset);
+        FD_SET(evtchn_fd, &inset);
+
+        if (select(evtchn_fd + 1, &inset, NULL, NULL, NULL) < 0) {
+            if (errno == EINTR)
+                continue;
+            ret = 1;
+            ERROR("Error when waiting events by select()");
+            break;
+        }
+
+        if (evtchn_fd != -1 && FD_ISSET(evtchn_fd, &inset)) {
+
+            /* One incremental update round. */
+            if ((ret = handle_event(domid, flags)) != 0) {
+                ERROR("Error when handling events");
+                break;
+            }
+
+            /* Tell the guest the update is committed... */
+            if (xc_evtchn_notify(xce_handle, port) < 0) {
+                ret = 1;
+                ERROR("xc_evtchn_notify failed");
+                break;
+            }
+
+            /* ...and let it run again. */
+            if(xc_domain_unpause(xc_handle, domid) < 0) {
+                ret = 1;
+                ERROR("xc_domain_unpause");
+                break;
+            }
+        }
+    }
+
+ out:
+    close(io_fd);
+    finalize();
+
+    return ret;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.