[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] [HVM] Save/restore: merge xc_linux_save and xc_hvm_save



# HG changeset patch
# User Tim Deegan <Tim.Deegan@xxxxxxxxxxxxx>
# Date 1176299114 -3600
# Node ID 90a6af455bbd335f1c4f9809e568d1b0af8c074c
# Parent  6e7ef794cdbc1e9ad83e93945a4ece01daebeb0a
[HVM] Save/restore: merge xc_linux_save and xc_hvm_save
into xc_domain_save, like we did for xc_domain_restore
Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxxxxx>
---
 tools/libxc/xc_hvm_save.c             |  755 ---------------
 tools/libxc/xc_linux_save.c           | 1414 -----------------------------
 tools/libxc/Makefile                  |    4 
 tools/libxc/ia64/xc_ia64_linux_save.c |    6 
 tools/libxc/xc_domain_save.c          | 1609 ++++++++++++++++++++++++++++++++++
 tools/libxc/xenguest.h                |   19 
 tools/libxc/xg_private.c              |   11 
 tools/xcutils/xc_save.c               |    9 
 8 files changed, 1624 insertions(+), 2203 deletions(-)

diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/Makefile
--- a/tools/libxc/Makefile      Wed Apr 11 09:29:00 2007 +0100
+++ b/tools/libxc/Makefile      Wed Apr 11 14:45:14 2007 +0100
@@ -26,8 +26,8 @@ CTRL_SRCS-$(CONFIG_X86_Linux) += xc_ptra
 
 GUEST_SRCS-y :=
 GUEST_SRCS-y += xg_private.c
-GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_linux_save.c
-GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_save.c
+GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c
+GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
 
 # symlink libelf from xen/common/libelf/
 LIBELF_SRCS := libelf-tools.c libelf-loader.c
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/ia64/xc_ia64_linux_save.c
--- a/tools/libxc/ia64/xc_ia64_linux_save.c     Wed Apr 11 09:29:00 2007 +0100
+++ b/tools/libxc/ia64/xc_ia64_linux_save.c     Wed Apr 11 14:45:14 2007 +0100
@@ -134,8 +134,10 @@ retry:
 }
 
 int
-xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
-              uint32_t max_factor, uint32_t flags, int (*suspend)(int))
+xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+               uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+               int hvm, void *(*init_qemu_maps)(int, unsigned),
+               void (*qemu_flip_buffer)(int, int))
 {
     DECLARE_DOMCTL;
     xc_dominfo_t info;
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/xc_domain_save.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_domain_save.c      Wed Apr 11 14:45:14 2007 +0100
@@ -0,0 +1,1609 @@
+/******************************************************************************
+ * xc_linux_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xc_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/params.h>
+#include <xen/hvm/e820.h>
+
+/*
+** Default values for important tuning parameters. Can override by passing
+** non-zero replacement values to xc_domain_save().
+**
+** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
+**
+*/
+#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
+#define DEF_MAX_FACTOR   3   /* never send more than 3x p2m_size  */
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
+static unsigned long *qemu_bitmaps[2];
+static int qemu_active;
+static int qemu_non_active;
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* Live mapping of the table mapping each PFN to its current MFN. */
+static xen_pfn_t *live_p2m = NULL;
+
+/* Live mapping of system MFN to PFN table. */
+static xen_pfn_t *live_m2p = NULL;
+static unsigned long m2p_mfn0;
+
+/* grep fodder: machine_to_phys */
+
+#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
+
+/*
+ * Returns TRUE if the given machine frame number has a unique mapping
+ * in the guest's pseudophysical map.
+ */
+#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
+    (((_mfn) < (max_mfn)) &&                    \
+     ((mfn_to_pfn(_mfn) < (p2m_size)) &&        \
+      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))
+
+/* Returns TRUE if MFN is successfully converted to a PFN. */
+#define translate_mfn_to_pfn(_pmfn)                             \
+({                                                              \
+    unsigned long mfn = *(_pmfn);                               \
+    int _res = 1;                                               \
+    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )                       \
+        _res = 0;                                               \
+    else                                                        \
+        *(_pmfn) = mfn_to_pfn(mfn);                             \
+    _res;                                                       \
+})
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+#define BITMAP_SIZE   (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
+
+#define BITMAP_ENTRY(_nr,_bmap) \
+   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+static inline int test_bit (int nr, volatile void * addr)
+{
+    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+static inline void clear_bit (int nr, volatile void * addr)
+{
+    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+static inline void set_bit ( int nr, volatile void * addr)
+{
+    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
+}
+
+/* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
+static inline unsigned int hweight32(unsigned int w)
+{
+    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
+    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
+    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
+    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
+    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
+}
+
+static inline int count_bits ( int nr, volatile void *addr)
+{
+    int i, count = 0;
+    volatile unsigned long *p = (volatile unsigned long *)addr;
+    /* We know that the array is padded to unsigned long. */
+    for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
+        count += hweight32(*p);
+    return count;
+}
+
+static inline int permute( int i, int nr, int order_nr  )
+{
+    /* Need a simple permutation function so that we scan pages in a
+       pseudo random order, enabling us to get a better estimate of
+       the domain's page dirtying rate as we go (there are often
+       contiguous ranges of pfns that have similar behaviour, and we
+       want to mix them up. */
+
+    /* e.g. nr->oder 15->4 16->4 17->5 */
+    /* 512MB domain, 128k pages, order 17 */
+
+    /*
+      QPONMLKJIHGFEDCBA
+             QPONMLKJIH
+      GFEDCBA
+     */
+
+    /*
+      QPONMLKJIHGFEDCBA
+                  EDCBA
+             QPONM
+      LKJIHGF
+      */
+
+    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
+    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
+
+    return i;
+}
+
+static uint64_t tv_to_us(struct timeval *new)
+{
+    return (new->tv_sec * 1000000) + new->tv_usec;
+}
+
+static uint64_t llgettimeofday(void)
+{
+    struct timeval now;
+    gettimeofday(&now, NULL);
+    return tv_to_us(&now);
+}
+
+static uint64_t tv_delta(struct timeval *new, struct timeval *old)
+{
+    return (((new->tv_sec - old->tv_sec)*1000000) +
+            (new->tv_usec - old->tv_usec));
+}
+
+static int noncached_write(int fd, int live, void *buffer, int len) 
+{
+    static int write_count = 0;
+
+    int rc = write(fd,buffer,len);
+
+    write_count += len;
+    if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
+    {
+        /* Time to discard cache - dont care if this fails */
+        discard_file_cache(fd, 0 /* no flush */);
+        write_count = 0;
+    }
+
+    return rc;
+}
+
+#ifdef ADAPTIVE_SAVE
+
+/*
+** We control the rate at which we transmit (or save) to minimize impact
+** on running domains (including the target if we're doing live migrate).
+*/
+
+#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
+#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
+
+/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
+#define RATE_TO_BTU      781250
+
+/* Amount in bytes we allow ourselves to send in a burst */
+#define BURST_BUDGET (100*1024)
+
+/* We keep track of the current and previous transmission rate */
+static int mbit_rate, ombit_rate = 0;
+
+/* Have we reached the maximum transmission rate? */
+#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
+
+static inline void initialize_mbit_rate()
+{
+    mbit_rate = START_MBIT_RATE;
+}
+
+static int ratewrite(int io_fd, int live, void *buf, int n)
+{
+    static int budget = 0;
+    static int burst_time_us = -1;
+    static struct timeval last_put = { 0 };
+    struct timeval now;
+    struct timespec delay;
+    long long delta;
+
+    if ( START_MBIT_RATE == 0 )
+        return noncached_write(io_fd, live, buf, n);
+
+    budget -= n;
+    if ( budget < 0 )
+    {
+        if ( mbit_rate != ombit_rate )
+        {
+            burst_time_us = RATE_TO_BTU / mbit_rate;
+            ombit_rate = mbit_rate;
+            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
+                    mbit_rate, BURST_BUDGET, burst_time_us);
+        }
+        if ( last_put.tv_sec == 0 )
+        {
+            budget += BURST_BUDGET;
+            gettimeofday(&last_put, NULL);
+        }
+        else
+        {
+            while ( budget < 0 )
+            {
+                gettimeofday(&now, NULL);
+                delta = tv_delta(&now, &last_put);
+                while ( delta > burst_time_us )
+                {
+                    budget += BURST_BUDGET;
+                    last_put.tv_usec += burst_time_us;
+                    if ( last_put.tv_usec > 1000000 
+                    {
+                        last_put.tv_usec -= 1000000;
+                        last_put.tv_sec++;
+                    }
+                    delta -= burst_time_us;
+                }
+                if ( budget > 0 )
+                    break;
+                delay.tv_sec = 0;
+                delay.tv_nsec = 1000 * (burst_time_us - delta);
+                while ( delay.tv_nsec > 0 )
+                    if ( nanosleep(&delay, &delay) == 0 )
+                        break;
+            }
+        }
+    }
+    return noncached_write(io_fd, live, buf, n);
+}
+
+#else /* ! ADAPTIVE SAVE */
+
+#define RATE_IS_MAX() (0)
+#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), 
(_buf), (_n))
+#define initialize_mbit_rate()
+
+#endif
+
+static inline ssize_t write_exact(int fd, void *buf, size_t count)
+{
+    return (write(fd, buf, count) == count);
+}
+
+static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
+                       xc_shadow_op_stats_t *stats, int print)
+{
+    static struct timeval wall_last;
+    static long long      d0_cpu_last;
+    static long long      d1_cpu_last;
+
+    struct timeval        wall_now;
+    long long             wall_delta;
+    long long             d0_cpu_now, d0_cpu_delta;
+    long long             d1_cpu_now, d1_cpu_delta;
+
+    gettimeofday(&wall_now, NULL);
+
+    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
+    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
+
+    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
+        DPRINTF("ARRHHH!!\n");
+
+    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
+    if ( wall_delta == 0 )
+        wall_delta = 1;
+
+    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
+    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
+
+    if ( print )
+        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
+                "dirtied %dMb/s %" PRId32 " pages\n",
+                wall_delta,
+                (int)((d0_cpu_delta*100)/wall_delta),
+                (int)((d1_cpu_delta*100)/wall_delta),
+                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
+                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
+                stats->dirty_count);
+
+#ifdef ADAPTIVE_SAVE
+    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
+    {
+        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
+            + 50;
+        if ( mbit_rate > MAX_MBIT_RATE )
+            mbit_rate = MAX_MBIT_RATE;
+    }
+#endif
+
+    d0_cpu_last = d0_cpu_now;
+    d1_cpu_last = d1_cpu_now;
+    wall_last   = wall_now;
+
+    return 0;
+}
+
+
+static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
+                          unsigned long *arr, int runs)
+{
+    long long start, now;
+    xc_shadow_op_stats_t stats;
+    int j;
+
+    start = llgettimeofday();
+
+    for ( j = 0; j < runs; j++ )
+    {
+        int i;
+
+        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+                          arr, p2m_size, NULL, 0, NULL);
+        DPRINTF("#Flush\n");
+        for ( i = 0; i < 40; i++ )
+        {
+            usleep(50000);
+            now = llgettimeofday();
+            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
+                              NULL, 0, NULL, 0, &stats);
+            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
+                    ((now-start)+500)/1000,
+                    stats.fault_count, stats.dirty_count);
+        }
+    }
+
+    return -1;
+}
+
+
+static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+                             int dom, xc_dominfo_t *info,
+                             vcpu_guest_context_t *ctxt)
+{
+    int i = 0;
+
+    if ( !(*suspend)(dom) )
+    {
+        ERROR("Suspend request failed");
+        return -1;
+    }
+
+ retry:
+
+    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
+    {
+        ERROR("Could not get domain info");
+        return -1;
+    }
+
+    if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
+        ERROR("Could not get vcpu context");
+
+
+    if ( info->dying )
+    {
+        ERROR("domain is dying");
+        return -1;
+    }
+
+    if ( info->crashed )
+    {
+        ERROR("domain has crashed");
+        return -1;
+    }
+
+    if ( info->shutdown )
+    {
+        switch ( info->shutdown_reason )
+        {
+        case SHUTDOWN_poweroff:
+        case SHUTDOWN_reboot:
+            ERROR("domain has shut down");
+            return -1;
+        case SHUTDOWN_suspend:
+            return 0;
+        case SHUTDOWN_crash:
+            ERROR("domain has crashed");
+            return -1;
+        }
+    }
+
+    if ( info->paused )
+    {
+        /* Try unpausing domain, wait, and retest. */
+        xc_domain_unpause( xc_handle, dom );
+        ERROR("Domain was paused. Wait and re-test.");
+        usleep(10000); /* 10ms */
+        goto retry;
+    }
+
+    if ( ++i < 100 )
+    {
+        ERROR("Retry suspend domain");
+        usleep(10000); /* 10ms */
+        goto retry;
+    }
+
+    ERROR("Unable to suspend domain.");
+
+    return -1;
+}
+
+/*
+** Map the top-level page of MFNs from the guest. The guest might not have
+** finished resuming from a previous restore operation, so we wait a while for
+** it to update the MFN to a reasonable value.
+*/
+static void *map_frame_list_list(int xc_handle, uint32_t dom,
+                                 shared_info_t *shinfo)
+{
+    int count = 100;
+    void *p;
+
+    while ( count-- && (shinfo->arch.pfn_to_mfn_frame_list_list == 0) )
+        usleep(10000);
+
+    if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 )
+    {
+        ERROR("Timed out waiting for frame list updated.");
+        return NULL;
+    }
+
+    p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
+                             shinfo->arch.pfn_to_mfn_frame_list_list);
+    if ( p == NULL )
+        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
+
+    return p;
+}
+
+/*
+** During transfer (or in the state file), all page-table pages must be
+** converted into a 'canonical' form where references to actual mfns
+** are replaced with references to the corresponding pfns.
+**
+** This function performs the appropriate conversion, taking into account
+** which entries do not require canonicalization (in particular, those
+** entries which map the virtual address reserved for the hypervisor).
+*/
+static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
+                           const void *spage, void *dpage)
+{
+
+    int i, pte_last, xen_start, xen_end, race = 0; 
+    uint64_t pte;
+
+    /*
+    ** We need to determine which entries in this page table hold
+    ** reserved hypervisor mappings. This depends on the current
+    ** page table type as well as the number of paging levels.
+    */
+    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
+
+    if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
+        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
+
+    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
+        xen_start = L3_PAGETABLE_ENTRIES_PAE;
+
+    /*
+    ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
+    ** We can spot this by looking for the guest linear mapping which
+    ** Xen always ensures is present in that L2. Guests must ensure
+    ** that this check will fail for other L2s.
+    */
+    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
+    {
+        int hstart;
+        uint64_t he;
+
+        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+        he = ((const uint64_t *) spage)[hstart];
+
+        if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+        {
+            /* hvirt starts with xen stuff... */
+            xen_start = hstart;
+        }
+        else if ( hvirt_start != 0xf5800000 )
+        {
+            /* old L2s from before hole was shrunk... */
+            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+            he = ((const uint64_t *) spage)[hstart];
+            if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+                xen_start = hstart;
+        }
+    }
+
+    if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
+    {
+        /*
+        ** XXX SMH: should compute these from hvirt_start (which we have)
+        ** and hvirt_end (which we don't)
+        */
+        xen_start = 256;
+        xen_end   = 272;
+    }
+
+    /* Now iterate through the page table, canonicalizing each PTE */
+    for (i = 0; i < pte_last; i++ )
+    {
+        unsigned long pfn, mfn;
+
+        if ( pt_levels == 2 )
+            pte = ((const uint32_t*)spage)[i];
+        else
+            pte = ((const uint64_t*)spage)[i];
+
+        if ( (i >= xen_start) && (i < xen_end) )
+            pte = 0;
+
+        if ( pte & _PAGE_PRESENT )
+        {
+            mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
+            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
+            {
+                /* This will happen if the type info is stale which
+                   is quite feasible under live migration */
+                pfn  = 0;  /* zap it - we'll retransmit this page later */
+                race = 1;  /* inform the caller of race; fatal if !live */ 
+            }
+            else
+                pfn = mfn_to_pfn(mfn);
+
+            pte &= ~MADDR_MASK_X86;
+            pte |= (uint64_t)pfn << PAGE_SHIFT;
+
+            /*
+             * PAE guest L3Es can contain these flags when running on
+             * a 64bit hypervisor. We zap these here to avoid any
+             * surprise at restore time...
+             */
+            if ( (pt_levels == 3) &&
+                 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
+                 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
+                pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
+        }
+
+        if ( pt_levels == 2 )
+            ((uint32_t*)dpage)[i] = pte;
+        else
+            ((uint64_t*)dpage)[i] = pte;
+    }
+
+    return race;
+}
+
+static xen_pfn_t *xc_map_m2p(int xc_handle,
+                                 unsigned long max_mfn,
+                                 int prot)
+{
+    struct xen_machphys_mfn_list xmml;
+    privcmd_mmap_entry_t *entries;
+    unsigned long m2p_chunks, m2p_size;
+    xen_pfn_t *m2p;
+    xen_pfn_t *extent_start;
+    int i, rc;
+
+    m2p_size   = M2P_SIZE(max_mfn);
+    m2p_chunks = M2P_CHUNKS(max_mfn);
+
+    xmml.max_extents = m2p_chunks;
+    if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
+    {
+        ERROR("failed to allocate space for m2p mfns");
+        return NULL;
+    }
+    set_xen_guest_handle(xmml.extent_start, extent_start);
+
+    if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
+         (xmml.nr_extents != m2p_chunks) )
+    {
+        ERROR("xc_get_m2p_mfns");
+        return NULL;
+    }
+
+    if ( (m2p = mmap(NULL, m2p_size, prot,
+                     MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
+    {
+        ERROR("failed to mmap m2p");
+        return NULL;
+    }
+
+    if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
+    {
+        ERROR("failed to allocate space for mmap entries");
+        return NULL;
+    }
+
+    for ( i = 0; i < m2p_chunks; i++ )
+    {
+        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
+        entries[i].mfn = extent_start[i];
+        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
+    }
+
+    if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
+                                     entries, m2p_chunks)) < 0 )
+    {
+        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
+        return NULL;
+    }
+
+    m2p_mfn0 = entries[0].mfn;
+
+    free(extent_start);
+    free(entries);
+
+    return m2p;
+}
+
+
+static xen_pfn_t *map_and_save_p2m_table(int xc_handle, 
+                                         int io_fd, 
+                                         uint32_t dom,
+                                         vcpu_guest_context_t *ctxt,
+                                         unsigned long p2m_size,
+                                         shared_info_t *live_shinfo)
+{
+    /* Double and single indirect references to the live P2M table */
+    xen_pfn_t *live_p2m_frame_list_list = NULL;
+    xen_pfn_t *live_p2m_frame_list = NULL;
+
+    /* A copy of the pfn-to-mfn table frame list. */
+    xen_pfn_t *p2m_frame_list = NULL;
+
+    /* The mapping of the live p2m table itself */
+    xen_pfn_t *p2m = NULL;
+
+    int i, success = 0;
+
+    live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
+                                                   live_shinfo);
+    if ( !live_p2m_frame_list_list )
+        goto out;
+
+    live_p2m_frame_list =
+        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
+                             live_p2m_frame_list_list,
+                             P2M_FLL_ENTRIES);
+    if ( !live_p2m_frame_list )
+    {
+        ERROR("Couldn't map p2m_frame_list");
+        goto out;
+    }
+
+
+    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
+       the guest must not change which frames are used for this purpose.
+       (its not clear why it would want to change them, and we'll be OK
+       from a safety POV anyhow. */
+
+    p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
+                               live_p2m_frame_list,
+                               P2M_FL_ENTRIES);
+    if ( !p2m )
+    {
+        ERROR("Couldn't map p2m table");
+        goto out;
+    }
+    live_p2m = p2m; /* So that translation macros will work */
+    
+    /* Get a local copy of the live_P2M_frame_list */
+    if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) )
+    {
+        ERROR("Couldn't allocate p2m_frame_list array");
+        goto out;
+    }
+    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
+
+    /* Canonicalise the pfn-to-mfn table frame-number list. */
+    for ( i = 0; i < p2m_size; i += fpp )
+    {
+        if ( !translate_mfn_to_pfn(&p2m_frame_list[i/fpp]) )
+        {
+            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
+            ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
+                  (uint64_t)p2m_frame_list[i/fpp]);
+            goto out;
+        }
+    }
+
+    /*
+     * Write an extended-info structure to inform the restore code that
+     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
+     * slow paths in the restore code.
+     */
+    if ( (pt_levels == 3) &&
+         (ctxt->vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) )
+    {
+        unsigned long signature = ~0UL;
+        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
+        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
+        char chunk_sig[]  = "vcpu";
+        if ( !write_exact(io_fd, &signature, sizeof(signature)) ||
+             !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
+             !write_exact(io_fd, &chunk_sig, 4) ||
+             !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
+             !write_exact(io_fd, ctxt,       sizeof(*ctxt)) )
+        {
+            ERROR("write: extended info");
+            goto out;
+        }
+    }
+
+    if ( !write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE) )
+    {
+        ERROR("write: p2m_frame_list");
+        goto out;
+    }    
+
+    success = 1;
+
+ out:
+    
+    if ( !success && p2m )
+        munmap(p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
+
+    if ( live_p2m_frame_list_list )
+        munmap(live_p2m_frame_list_list, PAGE_SIZE);
+
+    if ( live_p2m_frame_list )
+        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
+
+    if ( p2m_frame_list ) 
+        free(p2m_frame_list);
+
+    return success ? p2m : NULL;
+}
+
+
+
+int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+                   uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+                   int hvm, void *(*init_qemu_maps)(int, unsigned), 
+                   void (*qemu_flip_buffer)(int, int))
+{
+    xc_dominfo_t info;
+
+    int rc = 1, i, j, last_iter, iter = 0;
+    int live  = (flags & XCFLAGS_LIVE);
+    int debug = (flags & XCFLAGS_DEBUG);
+    int race = 0, sent_last_iter, skip_this_iter;
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    /* A table containing the type of each PFN (/not/ MFN!). */
+    unsigned long *pfn_type = NULL;
+    unsigned long *pfn_batch = NULL;
+
+    /* A copy of one frame of guest memory. */
+    char page[PAGE_SIZE];
+
+    /* Live mapping of shared info structure */
+    shared_info_t *live_shinfo = NULL;
+
+    /* base of the region in which domain memory is mapped */
+    unsigned char *region_base = NULL;
+
+    /* power of 2 order of p2m_size */
+    int order_nr;
+
+    /* bitmap of pages:
+       - that should be sent this iteration (unless later marked as skip);
+       - to skip this iteration because already dirty;
+       - to fixup by sending at the end if not already resent; */
+    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
+
+    xc_shadow_op_stats_t stats;
+
+    unsigned long needed_to_fix = 0;
+    unsigned long total_sent    = 0;
+
+    uint64_t vcpumap = 1ULL;
+
+    /* HVM: a buffer for holding HVM context */
+    uint32_t hvm_buf_size = 0;
+    uint8_t *hvm_buf = NULL;
+
+    /* HVM: magic frames for ioreqs and xenstore comms. */
+    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+    /* If no explicit control parameters given, use defaults */
+    max_iters  = max_iters  ? : DEF_MAX_ITERS;
+    max_factor = max_factor ? : DEF_MAX_FACTOR;
+
+    initialize_mbit_rate();
+
+    if ( !get_platform_info(xc_handle, dom,
+                            &max_mfn, &hvirt_start, &pt_levels) )
+    {
+        ERROR("Unable to get platform info.");
+        return 1;
+    }
+
+    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
+    {
+        ERROR("Could not get domain info");
+        return 1;
+    }
+
+    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
+    {
+        ERROR("Could not get vcpu context");
+        goto out;
+    }
+    shared_info_frame = info.shared_info_frame;
+
+    /* Map the shared info frame */
+    if ( !hvm )
+    {
+        live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                           PROT_READ, shared_info_frame);
+        if ( !live_shinfo )
+        {
+            ERROR("Couldn't map live_shinfo");
+            goto out;
+        }
+    }
+
+    /* Get the size of the P2M table */
+    p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom);
+
+    /* Domain is still running at this point */
+    if ( live )
+    {
+        /* Live suspend. Enable log-dirty mode. */
+        if ( xc_shadow_control(xc_handle, dom,
+                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                               NULL, 0, NULL, 0, NULL) < 0 )
+        {
+            ERROR("Couldn't enable shadow mode");
+            goto out;
+        }
+
+        if ( hvm )
+        {
+            /* Get qemu-dm logging dirty pages too */
+            void *seg = init_qemu_maps(dom, BITMAP_SIZE);
+            qemu_bitmaps[0] = seg;
+            qemu_bitmaps[1] = seg + BITMAP_SIZE;
+            qemu_active = 0;
+            qemu_non_active = 1;
+        }
+    }
+    else
+    {
+        /* This is a non-live suspend. Suspend the domain .*/
+        if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
+        {
+            ERROR("Domain appears not to have suspended");
+            goto out;
+        }
+    }
+
+    last_iter = !live;
+
+    /* pretend we sent all the pages last iteration */
+    sent_last_iter = p2m_size;
+
+    /* calculate the power of 2 order of p2m_size, e.g.
+       15->4 16->4 17->5 */
+    for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
+        continue;
+
+    /* Setup to_send / to_fix and to_skip bitmaps */
+    to_send = malloc(BITMAP_SIZE);
+    to_fix  = calloc(1, BITMAP_SIZE);
+    to_skip = malloc(BITMAP_SIZE);
+
+    if ( !to_send || !to_fix || !to_skip )
+    {
+        ERROR("Couldn't allocate to_send array");
+        goto out;
+    }
+
+    memset(to_send, 0xff, BITMAP_SIZE);
+
+    if ( lock_pages(to_send, BITMAP_SIZE) )
+    {
+        ERROR("Unable to lock to_send");
+        return 1;
+    }
+
+    /* (to fix is local only) */
+    if ( lock_pages(to_skip, BITMAP_SIZE) )
+    {
+        ERROR("Unable to lock to_skip");
+        return 1;
+    }
+
+    if ( hvm ) 
+    {
+        /* Need another buffer for HVM context */
+        hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
+        if ( hvm_buf_size == -1 )
+        {
+            ERROR("Couldn't get HVM context size from Xen");
+            goto out;
+        }
+        hvm_buf = malloc(hvm_buf_size);
+        if ( !hvm_buf )
+        {
+            ERROR("Couldn't allocate memory");
+            goto out;
+        }
+    }
+
+    analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
+
+    /* We want zeroed memory so use calloc rather than malloc. */
+    pfn_type   = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
+    pfn_batch  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
+    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
+    {
+        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
+    {
+        ERROR("Unable to lock");
+        goto out;
+    }
+
+    /* Setup the mfn_to_pfn table mapping */
+    if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
+    {
+        ERROR("Failed to map live M2P table");
+        goto out;
+    }
+
+    /* Start writing out the saved-domain record. */
+    if ( !write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+    {
+        ERROR("write: p2m_size");
+        goto out;
+    }
+
+    if ( !hvm )
+    {
+        int err = 0;
+        unsigned long mfn;
+
+        /* Map the P2M table, and write the list of P2M frames */
+        live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom, 
+                                          &ctxt, p2m_size, live_shinfo);
+        if ( live_p2m == NULL )
+        {
+            ERROR("Failed to map/save the p2m frame list");
+            goto out;
+        }
+
+        /*
+         * Quick belt and braces sanity check.
+         */
+        
+        for ( i = 0; i < p2m_size; i++ )
+        {
+            mfn = live_p2m[i];
+            if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
+            {
+                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
+                        mfn, mfn_to_pfn(mfn));
+                err++;
+            }
+        }
+        DPRINTF("Had %d unexplained entries in p2m table\n", err);
+    }
+
+    print_stats(xc_handle, dom, 0, &stats, 0);
+
+    /* Now write out each data page, canonicalising page tables as we go... */
+    for ( ; ; )
+    {
+        unsigned int prev_pc, sent_this_iter, N, batch;
+
+        iter++;
+        sent_this_iter = 0;
+        skip_this_iter = 0;
+        prev_pc = 0;
+        N = 0;
+
+        DPRINTF("Saving memory pages: iter %d   0%%", iter);
+
+        while ( N < p2m_size )
+        {
+            unsigned int this_pc = (N * 100) / p2m_size;
+            int rc;
+
+            if ( (this_pc - prev_pc) >= 5 )
+            {
+                DPRINTF("\b\b\b\b%3d%%", this_pc);
+                prev_pc = this_pc;
+            }
+
+            if ( !last_iter )
+            {
+                /* Slightly wasteful to peek the whole array evey time,
+                   but this is fast enough for the moment. */
+                rc = xc_shadow_control(
+                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip, 
+                    p2m_size, NULL, 0, NULL);
+                if ( rc != p2m_size )
+                {
+                    ERROR("Error peeking shadow bitmap");
+                    goto out;
+                }
+            }
+
+            /* load pfn_type[] with the mfn of all the pages we're doing in
+               this batch. */
+            for  ( batch = 0;
+                   (batch < MAX_BATCH_SIZE) && (N < p2m_size);
+                   N++ )
+            {
+                int n = permute(N, p2m_size, order_nr);
+
+                if ( debug )
+                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
+                            iter, (unsigned long)n, hvm ? 0 : live_p2m[n],
+                            test_bit(n, to_send),
+                            hvm ? 0 : mfn_to_pfn(live_p2m[n]&0xFFFFF));
+
+                if ( !last_iter &&
+                     test_bit(n, to_send) &&
+                     test_bit(n, to_skip) )
+                    skip_this_iter++; /* stats keeping */
+
+                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
+                       (test_bit(n, to_send) && last_iter) ||
+                       (test_bit(n, to_fix)  && last_iter)) )
+                    continue;
+
+                /* Skip PFNs that aren't really there */
+                if ( hvm && ((n >= 0xa0 && n < 0xc0) /* VGA hole */
+                             || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) 
+                                 && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ )
+                    continue;
+
+                /*
+                ** we get here if:
+                **  1. page is marked to_send & hasn't already been re-dirtied
+                **  2. (ignore to_skip in last iteration)
+                **  3. add in pages that still need fixup (net bufs)
+                */
+
+                pfn_batch[batch] = n;
+
+                /* Hypercall interfaces operate in PFNs for HVM guests
+                * and MFNs for PV guests */
+                if ( hvm ) 
+                    pfn_type[batch] = n;
+                else
+                    pfn_type[batch] = live_p2m[n];
+                    
+                if ( !is_mapped(pfn_type[batch]) )
+                {
+                    /*
+                    ** not currently in psuedo-physical map -- set bit
+                    ** in to_fix since we must send this page in last_iter
+                    ** unless its sent sooner anyhow, or it never enters
+                    ** pseudo-physical map (e.g. for ballooned down doms)
+                    */
+                    set_bit(n, to_fix);
+                    continue;
+                }
+
+                if ( last_iter &&
+                     test_bit(n, to_fix) &&
+                     !test_bit(n, to_send) )
+                {
+                    needed_to_fix++;
+                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
+                            iter, n, pfn_type[batch]);
+                }
+                
+                clear_bit(n, to_fix);
+                
+                batch++;
+            }
+
+            if ( batch == 0 )
+                goto skip; /* vanishingly unlikely... */
+
+            region_base = xc_map_foreign_batch(
+                xc_handle, dom, PROT_READ, pfn_type, batch);
+            if ( region_base == NULL )
+            {
+                ERROR("map batch failed");
+                goto out;
+            }
+
+            if ( !hvm )
+            {
+                /* Get page types */
+                for ( j = 0; j < batch; j++ )
+                    ((uint32_t *)pfn_type)[j] = pfn_type[j];
+                if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
+                                           (uint32_t *)pfn_type) )
+                {
+                    ERROR("get_pfn_type_batch failed");
+                    goto out;
+                }
+                for ( j = batch-1; j >= 0; j-- )
+                    pfn_type[j] = ((uint32_t *)pfn_type)[j];
+
+                for ( j = 0; j < batch; j++ )
+                {
+                    
+                    if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
+                         XEN_DOMCTL_PFINFO_XTAB )
+                    {
+                        DPRINTF("type fail: page %i mfn %08lx\n", 
+                                j, pfn_type[j]);
+                        continue;
+                    }
+                    
+                    if ( debug )
+                        DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
+                                " sum= %08lx\n",
+                                iter,
+                                (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
+                                pfn_batch[j],
+                                pfn_type[j],
+                                mfn_to_pfn(pfn_type[j] &
+                                           ~XEN_DOMCTL_PFINFO_LTAB_MASK),
+                                csum_page(region_base + (PAGE_SIZE*j)));
+                    
+                    /* canonicalise mfn->pfn */
+                    pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
+                        pfn_batch[j];
+                }
+            }
+
+            if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
+            {
+                ERROR("Error when writing to state file (2) (errno %d)",
+                      errno);
+                goto out;
+            }
+
+            if ( !write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
+            {
+                ERROR("Error when writing to state file (3) (errno %d)",
+                      errno);
+                goto out;
+            }
+
+            /* entering this loop, pfn_type is now in pfns (Not mfns) */
+            for ( j = 0; j < batch; j++ )
+            {
+                unsigned long pfn, pagetype;
+                void *spage = (char *)region_base + (PAGE_SIZE*j);
+
+                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                /* write out pages in batch */
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    continue;
+
+                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+                {
+                    /* We have a pagetable page: need to rewrite it. */
+                    race = 
+                        canonicalize_pagetable(pagetype, pfn, spage, page); 
+
+                    if ( race && !live )
+                    {
+                        ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
+                              pagetype);
+                        goto out;
+                    }
+
+                    if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
+                    {
+                        ERROR("Error when writing to state file (4)"
+                              " (errno %d)", errno);
+                        goto out;
+                    }
+                }
+                else
+                {
+                    /* We have a normal page: just write it directly. */
+                    if ( ratewrite(io_fd, live, spage, PAGE_SIZE) !=
+                         PAGE_SIZE )
+                    {
+                        ERROR("Error when writing to state file (5)"
+                              " (errno %d)", errno);
+                        goto out;
+                    }
+                }
+            } /* end of the write out for this batch */
+
+            sent_this_iter += batch;
+
+            munmap(region_base, batch*PAGE_SIZE);
+
+        } /* end of this while loop for this iteration */
+
+      skip:
+
+        total_sent += sent_this_iter;
+
+        DPRINTF("\r %d: sent %d, skipped %d, ",
+                iter, sent_this_iter, skip_this_iter );
+
+        if ( last_iter )
+        {
+            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+
+            DPRINTF("Total pages sent= %ld (%.2fx)\n",
+                    total_sent, ((float)total_sent)/p2m_size );
+            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
+        }
+
+        if ( last_iter && debug )
+        {
+            int minusone = -1;
+            memset(to_send, 0xff, BITMAP_SIZE);
+            debug = 0;
+            DPRINTF("Entering debug resend-all mode\n");
+
+            /* send "-1" to put receiver into debug mode */
+            if ( !write_exact(io_fd, &minusone, sizeof(int)) )
+            {
+                ERROR("Error when writing to state file (6) (errno %d)",
+                      errno);
+                goto out;
+            }
+
+            continue;
+        }
+
+        if ( last_iter )
+            break;
+
+        if ( live )
+        {
+            if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
+                 (iter >= max_iters) ||
+                 (sent_this_iter+skip_this_iter < 50) ||
+                 (total_sent > p2m_size*max_factor) )
+            {
+                DPRINTF("Start last iteration\n");
+                last_iter = 1;
+
+                if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
+                                       &ctxt) )
+                {
+                    ERROR("Domain appears not to have suspended");
+                    goto out;
+                }
+
+                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
+                        info.shared_info_frame,
+                        (unsigned long)ctxt.user_regs.eip,
+                        (unsigned long)ctxt.user_regs.edx);
+            }
+
+            if ( xc_shadow_control(xc_handle, dom, 
+                                   XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 
+                                   p2m_size, NULL, 0, &stats) != p2m_size )
+            {
+                ERROR("Error flushing shadow PT");
+                goto out;
+            }
+
+            if ( hvm ) 
+            {
+                /* Pull in the dirty bits from qemu-dm too */
+                if ( !last_iter )
+                {
+                    qemu_active = qemu_non_active;
+                    qemu_non_active = qemu_active ? 0 : 1;
+                    qemu_flip_buffer(dom, qemu_active);
+                    for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
+                    {
+                        to_send[j] |= qemu_bitmaps[qemu_non_active][j];
+                        qemu_bitmaps[qemu_non_active][j] = 0;
+                    }
+                }
+                else
+                {
+                    for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
+                        to_send[j] |= qemu_bitmaps[qemu_active][j];
+                }
+            }
+
+            sent_last_iter = sent_this_iter;
+
+            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
+
+        }
+    } /* end of infinite for loop */
+
+    DPRINTF("All memory is saved\n");
+
+    {
+        struct {
+            int minustwo;
+            int max_vcpu_id;
+            uint64_t vcpumap;
+        } chunk = { -2, info.max_vcpu_id };
+
+        if ( info.max_vcpu_id >= 64 )
+        {
+            ERROR("Too many VCPUS in guest!");
+            goto out;
+        }
+
+        for ( i = 1; i <= info.max_vcpu_id; i++ )
+        {
+            xc_vcpuinfo_t vinfo;
+            if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
+                 vinfo.online )
+                vcpumap |= 1ULL << i;
+        }
+
+        chunk.vcpumap = vcpumap;
+        if ( !write_exact(io_fd, &chunk, sizeof(chunk)) )
+        {
+            ERROR("Error when writing to state file (errno %d)", errno);
+            goto out;
+        }
+    }
+
+    /* Zero terminate */
+    i = 0;
+    if ( !write_exact(io_fd, &i, sizeof(int)) )
+    {
+        ERROR("Error when writing to state file (6') (errno %d)", errno);
+        goto out;
+    }
+
+    if ( hvm ) 
+    {
+        uint32_t rec_size;
+
+        /* Save magic-page locations. */
+        memset(magic_pfns, 0, sizeof(magic_pfns));
+        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
+                         (unsigned long *)&magic_pfns[0]);
+        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
+                         (unsigned long *)&magic_pfns[1]);
+        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
+                         (unsigned long *)&magic_pfns[2]);
+        if ( !write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+        {
+            ERROR("Error when writing to state file (7)");
+            goto out;
+        }
+
+        /* Save vcpu contexts */
+
+        for ( i = 0; i <= info.max_vcpu_id; i++ )
+        {
+            if ( !(vcpumap & (1ULL << i)) )
+                continue;
+            
+            if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
+            {
+                ERROR("HVM:Could not get vcpu context");
+                goto out;
+            }
+            
+            DPRINTF("write vcpu %d context.\n", i); 
+            if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
+            {
+                ERROR("write vcpu context failed!\n");
+                goto out;
+            }
+        }
+
+        /* Get HVM context from Xen and save it too */
+        if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, 
+                                                  hvm_buf_size)) == -1 )
+        {
+            ERROR("HVM:Could not get hvm buffer");
+            goto out;
+        }
+        
+        if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
+        {
+            ERROR("error write hvm buffer size");
+            goto out;
+        }
+        
+        if ( !write_exact(io_fd, hvm_buf, rec_size) )
+        {
+            ERROR("write HVM info failed!\n");
+            goto out;
+        }
+        
+        /* HVM guests are done now */
+        rc = 0;
+        goto out;
+    }
+
+    /* PV guests only from now on */
+
+    /* Send through a list of all the PFNs that were not in map at the close */
+    {
+        unsigned int i,j;
+        unsigned long pfntab[1024];
+
+        for ( i = 0, j = 0; i < p2m_size; i++ )
+        {
+            if ( !is_mapped(live_p2m[i]) )
+                j++;
+        }
+
+        if ( !write_exact(io_fd, &j, sizeof(unsigned int)) )
+        {
+            ERROR("Error when writing to state file (6a) (errno %d)", errno);
+            goto out;
+        }
+
+        for ( i = 0, j = 0; i < p2m_size; )
+        {
+            if ( !is_mapped(live_p2m[i]) )
+                pfntab[j++] = i;
+
+            i++;
+            if ( (j == 1024) || (i == p2m_size) )
+            {
+                if ( !write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
+                {
+                    ERROR("Error when writing to state file (6b) (errno %d)",
+                          errno);
+                    goto out;
+                }
+                j = 0;
+            }
+        }
+    }
+
+    /* Canonicalise the suspend-record frame number. */
+    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
+    {
+        ERROR("Suspend record is not in range of pseudophys map");
+        goto out;
+    }
+
+    for ( i = 0; i <= info.max_vcpu_id; i++ )
+    {
+        if ( !(vcpumap & (1ULL << i)) )
+            continue;
+
+        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
+        {
+            ERROR("No context for VCPU%d", i);
+            goto out;
+        }
+
+        /* Canonicalise each GDT frame number. */
+        for ( j = 0; (512*j) < ctxt.gdt_ents; j++ )
+        {
+            if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) )
+            {
+                ERROR("GDT frame is not in range of pseudophys map");
+                goto out;
+            }
+        }
+
+        /* Canonicalise the page table base pointer. */
+        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) )
+        {
+            ERROR("PT base is not in range of pseudophys map");
+            goto out;
+        }
+        ctxt.ctrlreg[3] = 
+            xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));
+
+        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
+        if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
+        {
+            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) )
+            {
+                ERROR("PT base is not in range of pseudophys map");
+                goto out;
+            }
+            /* Least-significant bit means 'valid PFN'. */
+            ctxt.ctrlreg[1] = 1 |
+                xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1])));
+        }
+
+        if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) )
+        {
+            ERROR("Error when writing to state file (1) (errno %d)", errno);
+            goto out;
+        }
+    }
+
+    /*
+     * Reset the MFN to be a known-invalid value. See map_frame_list_list().
+     */
+    memcpy(page, live_shinfo, PAGE_SIZE);
+    ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;
+    if ( !write_exact(io_fd, page, PAGE_SIZE) )
+    {
+        ERROR("Error when writing to state file (1) (errno %d)", errno);
+        goto out;
+    }
+
+    /* Success! */
+    rc = 0;
+
+ out:
+
+    if ( live )
+    {
+        if ( xc_shadow_control(xc_handle, dom, 
+                               XEN_DOMCTL_SHADOW_OP_OFF,
+                               NULL, 0, NULL, 0, NULL) < 0 )
+            DPRINTF("Warning - couldn't disable shadow mode");
+    }
+
+    /* Flush last write and discard cache for file. */
+    discard_file_cache(io_fd, 1 /* flush */);
+
+    if ( live_shinfo )
+        munmap(live_shinfo, PAGE_SIZE);
+
+    if ( live_p2m )
+        munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
+
+    if ( live_m2p )
+        munmap(live_m2p, M2P_SIZE(max_mfn));
+
+    free(pfn_type);
+    free(pfn_batch);
+    free(to_send);
+    free(to_fix);
+    free(to_skip);
+
+    DPRINTF("Save exit rc=%d\n",rc);
+
+    return !!rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/xc_hvm_save.c
--- a/tools/libxc/xc_hvm_save.c Wed Apr 11 09:29:00 2007 +0100
+++ /dev/null   Thu Jan 01 00:00:00 1970 +0000
@@ -1,755 +0,0 @@
-/******************************************************************************
- * xc_hvm_save.c
- *
- * Save the state of a running HVM guest.
- *
- * Copyright (c) 2003, K A Fraser.
- * Copyright (c) 2006 Intel Corperation
- * rewriten for hvm guest by Zhai Edwin <edwin.zhai@xxxxxxxxx>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- */
-
-#include <inttypes.h>
-#include <time.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/time.h>
-
-#include "xc_private.h"
-#include "xg_private.h"
-#include "xg_save_restore.h"
-
-#include <xen/hvm/e820.h>
-#include <xen/hvm/params.h>
-
-/*
-** Default values for important tuning parameters. Can override by passing
-** non-zero replacement values to xc_hvm_save().
-**
-** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
-**
-*/
-#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
-#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */
-
-/* Shared-memory bitmaps for getting log-dirty bits from qemu */
-static unsigned long *qemu_bitmaps[2];
-static int qemu_active;
-static int qemu_non_active;
-
-/*
-** During (live) save/migrate, we maintain a number of bitmaps to track
-** which pages we have to send, to fixup, and to skip.
-*/
-
-#define BITS_PER_LONG (sizeof(unsigned long) * 8)
-#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
-#define BITMAP_SIZE   (BITS_TO_LONGS(pfn_array_size) * sizeof(unsigned long))
-
-#define BITMAP_ENTRY(_nr,_bmap) \
-   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
-
-#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
-
-static inline int test_bit (int nr, volatile void * addr)
-{
-    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
-}
-
-static inline void clear_bit (int nr, volatile void * addr)
-{
-    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
-}
-
-static inline int permute( int i, int nr, int order_nr  )
-{
-    /* Need a simple permutation function so that we scan pages in a
-       pseudo random order, enabling us to get a better estimate of
-       the domain's page dirtying rate as we go (there are often
-       contiguous ranges of pfns that have similar behaviour, and we
-       want to mix them up. */
-
-    /* e.g. nr->oder 15->4 16->4 17->5 */
-    /* 512MB domain, 128k pages, order 17 */
-
-    /*
-      QPONMLKJIHGFEDCBA
-             QPONMLKJIH
-      GFEDCBA
-     */
-
-    /*
-      QPONMLKJIHGFEDCBA
-                  EDCBA
-             QPONM
-      LKJIHGF
-      */
-
-    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
-    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
-
-    return i;
-}
-
-
-static uint64_t tv_to_us(struct timeval *new)
-{
-    return (new->tv_sec * 1000000) + new->tv_usec;
-}
-
-static uint64_t llgettimeofday(void)
-{
-    struct timeval now;
-    gettimeofday(&now, NULL);
-    return tv_to_us(&now);
-}
-
-static uint64_t tv_delta(struct timeval *new, struct timeval *old)
-{
-    return (((new->tv_sec - old->tv_sec)*1000000) +
-            (new->tv_usec - old->tv_usec));
-}
-
-
-#define RATE_IS_MAX() (0)
-#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
-#define initialize_mbit_rate()
-
-static inline ssize_t write_exact(int fd, void *buf, size_t count)
-{
-    return (write(fd, buf, count) == count);
-}
-
-static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
-                       xc_shadow_op_stats_t *stats, int print)
-{
-    static struct timeval wall_last;
-    static long long      d0_cpu_last;
-    static long long      d1_cpu_last;
-
-    struct timeval        wall_now;
-    long long             wall_delta;
-    long long             d0_cpu_now, d0_cpu_delta;
-    long long             d1_cpu_now, d1_cpu_delta;
-
-    gettimeofday(&wall_now, NULL);
-
-    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
-    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
-
-    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
-        DPRINTF("ARRHHH!!\n");
-
-    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
-    if ( wall_delta == 0 )
-        wall_delta = 1;
-
-    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
-    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
-
-    if ( print )
-        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
-                "dirtied %dMb/s %" PRId32 " pages\n",
-                wall_delta,
-                (int)((d0_cpu_delta*100)/wall_delta),
-                (int)((d1_cpu_delta*100)/wall_delta),
-                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
-                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
-                stats->dirty_count);
-
-    d0_cpu_last = d0_cpu_now;
-    d1_cpu_last = d1_cpu_now;
-    wall_last   = wall_now;
-
-    return 0;
-}
-
-static int analysis_phase(int xc_handle, uint32_t domid, int pfn_array_size,
-                          unsigned long *arr, int runs)
-{
-    long long start, now;
-    xc_shadow_op_stats_t stats;
-    int j;
-
-    start = llgettimeofday();
-
-    for ( j = 0; j < runs; j++ )
-    {
-        int i;
-
-        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
-                          arr, pfn_array_size, NULL, 0, NULL);
-        DPRINTF("#Flush\n");
-        for ( i = 0; i < 40; i++ )
-        {
-            usleep(50000);
-            now = llgettimeofday();
-            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
-                              NULL, 0, NULL, 0, &stats);
-            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
-                    ((now-start)+500)/1000,
-                    stats.fault_count, stats.dirty_count);
-        }
-    }
-
-    return -1;
-}
-
-static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
-                             int dom, xc_dominfo_t *info,
-                             vcpu_guest_context_t *ctxt)
-{
-    int i = 0;
-
-    if ( !(*suspend)(dom) )
-    {
-        ERROR("Suspend request failed");
-        return -1;
-    }
-
- retry:
-
-    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
-    {
-        ERROR("Could not get domain info");
-        return -1;
-    }
-
-    if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
-        ERROR("Could not get vcpu context");
-
-    if ( info->shutdown && (info->shutdown_reason == SHUTDOWN_suspend) )
-        return 0; /* success */
-
-    if ( info->paused )
-    {
-        /* Try unpausing domain, wait, and retest. */
-        xc_domain_unpause( xc_handle, dom );
-        ERROR("Domain was paused. Wait and re-test.");
-        usleep(10000);  /* 10ms */
-        goto retry;
-    }
-
-    if ( ++i < 100 )
-    {
-        ERROR("Retry suspend domain.");
-        usleep(10000); /* 10ms */
-        goto retry;
-    }
-
-    ERROR("Unable to suspend domain.");
-
-    return -1;
-}
-
-int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
-                uint32_t max_factor, uint32_t flags, int (*suspend)(int),
-                void *(*init_qemu_maps)(int, unsigned), 
-                void (*qemu_flip_buffer)(int, int))
-{
-    xc_dominfo_t info;
-
-    int rc = 1, i, j, last_iter, iter = 0;
-    int live  = !!(flags & XCFLAGS_LIVE);
-    int debug = !!(flags & XCFLAGS_DEBUG);
-    int sent_last_iter, skip_this_iter;
-
-    /* The highest guest-physical frame number used by the current guest */
-    unsigned long max_pfn;
-
-    /* The size of an array big enough to contain all guest pfns */
-    unsigned long pfn_array_size;
-
-    /* Magic frames: ioreqs and xenstore comms. */
-    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
-
-    /* A copy of the CPU context of the guest. */
-    vcpu_guest_context_t ctxt;
-
-    /* A table containg the PFNs (/not/ MFN!) to map. */
-    xen_pfn_t *pfn_batch = NULL;
-
-    /* A copy of hvm domain context buffer*/
-    uint32_t hvm_buf_size;
-    uint8_t *hvm_buf = NULL;
-
-    /* base of the region in which domain memory is mapped */
-    unsigned char *region_base = NULL;
-
-    uint32_t rec_size, nr_vcpus;
-
-    /* power of 2 order of pfn_array_size */
-    int order_nr;
-
-    /* bitmap of pages:
-       - that should be sent this iteration (unless later marked as skip);
-       - to skip this iteration because already dirty; */
-    unsigned long *to_send = NULL, *to_skip = NULL;
-
-    xc_shadow_op_stats_t stats;
-
-    unsigned long total_sent = 0;
-
-    uint64_t vcpumap = 1ULL;
-
-    DPRINTF("xc_hvm_save: dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, "
-            "live=%d, debug=%d.\n", dom, max_iters, max_factor, flags,
-            live, debug);
-    
-    /* If no explicit control parameters given, use defaults */
-    max_iters  = max_iters  ? : DEF_MAX_ITERS;
-    max_factor = max_factor ? : DEF_MAX_FACTOR;
-
-    initialize_mbit_rate();
-
-    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
-    {
-        ERROR("HVM: Could not get domain info");
-        return 1;
-    }
-    nr_vcpus = info.nr_online_vcpus;
-
-    if ( mlock(&ctxt, sizeof(ctxt)) )
-    {
-        ERROR("HVM: Unable to mlock ctxt");
-        return 1;
-    }
-
-    /* Only have to worry about vcpu 0 even for SMP */
-    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
-    {
-        ERROR("HVM: Could not get vcpu context");
-        goto out;
-    }
-
-    DPRINTF("saved hvm domain info: max_memkb=0x%lx, nr_pages=0x%lx\n",
-            info.max_memkb, info.nr_pages); 
-
-    if ( live )
-    {
-        /* Live suspend. Enable log-dirty mode. */
-        if ( xc_shadow_control(xc_handle, dom,
-                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
-                               NULL, 0, NULL, 0, NULL) < 0 )
-        {
-            ERROR("Couldn't enable shadow mode");
-            goto out;
-        }
-    }
-    else
-    {
-        /* This is a non-live suspend. Suspend the domain .*/
-        if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
-        {
-            ERROR("HVM Domain appears not to have suspended");
-            goto out;
-        }
-    }
-
-    last_iter = !live;
-
-    max_pfn = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom);
-
-    DPRINTF("after 1st handle hvm domain max_pfn=0x%lx, "
-            "max_memkb=0x%lx, live=%d.\n",
-            max_pfn, info.max_memkb, live);
-
-    /* Size of any array that covers 0 ... max_pfn */
-    pfn_array_size = max_pfn + 1;
-    if ( !write_exact(io_fd, &pfn_array_size, sizeof(unsigned long)) )
-    {
-        ERROR("Error when writing to state file (1)");
-        goto out;
-    }
-
-    /* pretend we sent all the pages last iteration */
-    sent_last_iter = pfn_array_size;
-
-    /* calculate the power of 2 order of pfn_array_size, e.g.
-       15->4 16->4 17->5 */
-    for ( i = pfn_array_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
-        continue;
-
-    /* Setup to_send / to_fix and to_skip bitmaps */
-    to_send = malloc(BITMAP_SIZE);
-    to_skip = malloc(BITMAP_SIZE);
-
-    if ( live )
-    {
-        /* Get qemu-dm logging dirty pages too */
-        void *seg = init_qemu_maps(dom, BITMAP_SIZE);
-        qemu_bitmaps[0] = seg;
-        qemu_bitmaps[1] = seg + BITMAP_SIZE;
-        qemu_active = 0;
-        qemu_non_active = 1;
-    }
-
-    hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
-    if ( hvm_buf_size == -1 )
-    {
-        ERROR("Couldn't get HVM context size from Xen");
-        goto out;
-    }
-    hvm_buf = malloc(hvm_buf_size);
-
-    if ( !to_send || !to_skip || !hvm_buf )
-    {
-        ERROR("Couldn't allocate memory");
-        goto out;
-    }
-
-    memset(to_send, 0xff, BITMAP_SIZE);
-
-    if ( lock_pages(to_send, BITMAP_SIZE) )
-    {
-        ERROR("Unable to lock to_send");
-        return 1;
-    }
-
-    /* (to fix is local only) */
-    if ( lock_pages(to_skip, BITMAP_SIZE) )
-    {
-        ERROR("Unable to lock to_skip");
-        return 1;
-    }
-
-    analysis_phase(xc_handle, dom, pfn_array_size, to_skip, 0);
-
-    /* We want zeroed memory so use calloc rather than malloc. */
-    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
-    if ( pfn_batch == NULL )
-    {
-        ERROR("failed to alloc memory for pfn_batch array");
-        errno = ENOMEM;
-        goto out;
-    }
-
-    for ( ; ; )
-    {
-        unsigned int prev_pc, sent_this_iter, N, batch;
-
-        iter++;
-        sent_this_iter = 0;
-        skip_this_iter = 0;
-        prev_pc = 0;
-        N=0;
-
-        DPRINTF("Saving memory pages: iter %d   0%%", iter);
-
-        while ( N < pfn_array_size )
-        {
-            unsigned int this_pc = (N * 100) / pfn_array_size;
-            int rc;
-
-            if ( (this_pc - prev_pc) >= 5 )
-            {
-                DPRINTF("\b\b\b\b%3d%%", this_pc);
-                prev_pc = this_pc;
-            }
-
-            if ( !last_iter )
-            {
-                /* Slightly wasteful to peek the whole array evey time,
-                   but this is fast enough for the moment. */
-                rc = xc_shadow_control(
-                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip, 
-                    pfn_array_size, NULL, 0, NULL);
-                if ( rc != pfn_array_size )
-                {
-                    ERROR("Error peeking shadow bitmap");
-                    goto out;
-                }
-            }
-
-            /* load pfn_batch[] with the mfn of all the pages we're doing in
-               this batch. */
-            for ( batch = 0;
-                  (batch < MAX_BATCH_SIZE) && (N < pfn_array_size);
-                  N++ )
-            {
-                int n = permute(N, pfn_array_size, order_nr);
-
-                if ( 0 && debug )
-                    DPRINTF("%d pfn= %08lx %d \n",
-                            iter, (unsigned long)n, test_bit(n, to_send));
-
-                if ( !last_iter &&
-                     test_bit(n, to_send) &&
-                     test_bit(n, to_skip) )
-                    skip_this_iter++; /* stats keeping */
-
-                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
-                       (test_bit(n, to_send) && last_iter)) )
-                    continue;
-
-                /* Skip PFNs that aren't really there */
-                if ( (n >= 0xa0 && n < 0xc0) /* VGA hole */
-                     || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) &&
-                         n < (1ULL << 32) >> PAGE_SHIFT) /* 4G MMIO hole */ )
-                    continue;
-
-                /*
-                ** we get here if:
-                **  1. page is marked to_send & hasn't already been re-dirtied
-                **  2. (ignore to_skip in last iteration)
-                */
-
-                pfn_batch[batch] = n;
-
-                batch++;
-            }
-
-            if ( batch == 0 )
-                goto skip; /* vanishingly unlikely... */
-
-            region_base = xc_map_foreign_batch(
-                xc_handle, dom, PROT_READ, pfn_batch, batch);
-            if ( region_base == 0 )
-            {
-                ERROR("map batch failed");
-                goto out;
-            }
-
-            /* write num of pfns */
-            if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
-            {
-                ERROR("Error when writing to state file (2)");
-                goto out;
-            }
-
-            /* write all the pfns */
-            if ( !write_exact(io_fd, pfn_batch, sizeof(unsigned long)*batch) )
-            {
-                ERROR("Error when writing to state file (3)");
-                goto out;
-            }
-
-            for ( j = 0; j < batch; j++ )
-            {
-                if ( pfn_batch[j] & XEN_DOMCTL_PFINFO_LTAB_MASK )
-                    continue;
-                if ( ratewrite(io_fd, region_base + j*PAGE_SIZE,
-                               PAGE_SIZE) != PAGE_SIZE )
-                {
-                    ERROR("ERROR when writing to state file (4)");
-                    goto out;
-                }
-            }
-
-            sent_this_iter += batch;
-
-            munmap(region_base, batch*PAGE_SIZE);
-
-        } /* end of this while loop for this iteration */
-
-      skip:
-
-        total_sent += sent_this_iter;
-
-        DPRINTF("\r %d: sent %d, skipped %d, ",
-                iter, sent_this_iter, skip_this_iter );
-
-        if ( last_iter )
-        {
-            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
-            DPRINTF("Total pages sent= %ld (%.2fx)\n",
-                    total_sent, ((float)total_sent)/pfn_array_size );
-        }
-
-        if ( last_iter && debug )
-        {
-            int minusone = -1;
-            memset(to_send, 0xff, BITMAP_SIZE);
-            debug = 0;
-            DPRINTF("Entering debug resend-all mode\n");
-
-            /* send "-1" to put receiver into debug mode */
-            if ( !write_exact(io_fd, &minusone, sizeof(int)) )
-            {
-                ERROR("Error when writing to state file (6)");
-                goto out;
-            }
-
-            continue;
-        }
-
-        if ( last_iter )
-            break;
-
-        if ( live )
-        {
-            if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
-                 (iter >= max_iters) ||
-                 (sent_this_iter+skip_this_iter < 50) ||
-                 (total_sent > pfn_array_size*max_factor) )
-            {
-                DPRINTF("Start last iteration for HVM domain\n");
-                last_iter = 1;
-
-                if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
-                                       &ctxt))
-                {
-                    ERROR("Domain appears not to have suspended");
-                    goto out;
-                }
-
-                DPRINTF("SUSPEND eip %08lx edx %08lx\n",
-                        (unsigned long)ctxt.user_regs.eip,
-                        (unsigned long)ctxt.user_regs.edx);
-            }
-
-            if ( xc_shadow_control(xc_handle, dom, 
-                                   XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 
-                                   pfn_array_size, NULL, 
-                                   0, &stats) != pfn_array_size )
-            {
-                ERROR("Error flushing shadow PT");
-                goto out;
-            }
-
-            /* Pull in the dirty bits from qemu too */
-            if ( !last_iter )
-            {
-                qemu_active = qemu_non_active;
-                qemu_non_active = qemu_active ? 0 : 1;
-                qemu_flip_buffer(dom, qemu_active);
-                for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
-                {
-                    to_send[j] |= qemu_bitmaps[qemu_non_active][j];
-                    qemu_bitmaps[qemu_non_active][j] = 0;
-                }
-            }
-            else
-            {
-                for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
-                    to_send[j] |= qemu_bitmaps[qemu_active][j];
-            }
-
-            sent_last_iter = sent_this_iter;
-
-            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
-        }
-    } /* end of while 1 */
-
-
-    DPRINTF("All HVM memory is saved\n");
-
-    {
-        struct {
-            int minustwo;
-            int max_vcpu_id;
-            uint64_t vcpumap;
-        } chunk = { -2, info.max_vcpu_id };
-
-        if (info.max_vcpu_id >= 64) {
-            ERROR("Too many VCPUS in guest!");
-            goto out;
-        }
-
-        for (i = 1; i <= info.max_vcpu_id; i++) {
-            xc_vcpuinfo_t vinfo;
-            if ((xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
-                vinfo.online)
-                vcpumap |= 1ULL << i;
-        }
-
-        chunk.vcpumap = vcpumap;
-        if(!write_exact(io_fd, &chunk, sizeof(chunk))) {
-            ERROR("Error when writing to state file (errno %d)", errno);
-            goto out;
-        }
-    }
-
-    /* Zero terminate */
-    i = 0;
-    if ( !write_exact(io_fd, &i, sizeof(int)) )
-    {
-        ERROR("Error when writing to state file (6)");
-        goto out;
-    }
-
-    /* Save magic-page locations. */
-    memset(magic_pfns, 0, sizeof(magic_pfns));
-    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
-                     (unsigned long *)&magic_pfns[0]);
-    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
-                     (unsigned long *)&magic_pfns[1]);
-    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
-                     (unsigned long *)&magic_pfns[2]);
-    if ( !write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
-    {
-        ERROR("Error when writing to state file (7)");
-        goto out;
-    }
-
-    /* save vcpu/vmcs contexts */
-    for ( i = 0; i < nr_vcpus; i++ )
-    {
-        if ( !(vcpumap & (1ULL << i)) )
-            continue;
-
-        if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
-        {
-            ERROR("HVM:Could not get vcpu context");
-            goto out;
-        }
-
-        DPRINTF("write vcpu %d context.\n", i); 
-        if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
-        {
-            ERROR("write vcpu context failed!\n");
-            goto out;
-        }
-    }
-
-    if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, 
-                                              hvm_buf_size)) == -1 )
-    {
-        ERROR("HVM:Could not get hvm buffer");
-        goto out;
-    }
-
-    if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
-    {
-        ERROR("error write hvm buffer size");
-        goto out;
-    }
-
-    if ( !write_exact(io_fd, hvm_buf, rec_size) )
-    {
-        ERROR("write HVM info failed!\n");
-        goto out;
-    }
-
-    /* Success! */
-    rc = 0;
-
- out:
-
-    if ( live )
-    {
-        if ( xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF,
-                               NULL, 0, NULL, 0, NULL) < 0 )
-            DPRINTF("Warning - couldn't disable shadow mode");
-    }
-
-    free(hvm_buf);
-    free(pfn_batch);
-    free(to_send);
-    free(to_skip);
-
-    return !!rc;
-}
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c       Wed Apr 11 09:29:00 2007 +0100
+++ /dev/null   Thu Jan 01 00:00:00 1970 +0000
@@ -1,1414 +0,0 @@
-/******************************************************************************
- * xc_linux_save.c
- *
- * Save the state of a running Linux session.
- *
- * Copyright (c) 2003, K A Fraser.
- */
-
-#include <inttypes.h>
-#include <time.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/time.h>
-
-#include "xc_private.h"
-#include "xc_dom.h"
-#include "xg_private.h"
-#include "xg_save_restore.h"
-
-/*
-** Default values for important tuning parameters. Can override by passing
-** non-zero replacement values to xc_linux_save().
-**
-** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
-**
-*/
-#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
-#define DEF_MAX_FACTOR   3   /* never send more than 3x p2m_size  */
-
-/* max mfn of the whole machine */
-static unsigned long max_mfn;
-
-/* virtual starting address of the hypervisor */
-static unsigned long hvirt_start;
-
-/* #levels of page tables used by the current guest */
-static unsigned int pt_levels;
-
-/* number of pfns this guest has (i.e. number of entries in the P2M) */
-static unsigned long p2m_size;
-
-/* Live mapping of the table mapping each PFN to its current MFN. */
-static xen_pfn_t *live_p2m = NULL;
-
-/* Live mapping of system MFN to PFN table. */
-static xen_pfn_t *live_m2p = NULL;
-static unsigned long m2p_mfn0;
-
-/* grep fodder: machine_to_phys */
-
-#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
-
-/*
- * Returns TRUE if the given machine frame number has a unique mapping
- * in the guest's pseudophysical map.
- */
-#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
-    (((_mfn) < (max_mfn)) &&                    \
-     ((mfn_to_pfn(_mfn) < (p2m_size)) &&        \
-      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))
-
-/* Returns TRUE if MFN is successfully converted to a PFN. */
-#define translate_mfn_to_pfn(_pmfn)                             \
-({                                                              \
-    unsigned long mfn = *(_pmfn);                               \
-    int _res = 1;                                               \
-    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )                       \
-        _res = 0;                                               \
-    else                                                        \
-        *(_pmfn) = mfn_to_pfn(mfn);                             \
-    _res;                                                       \
-})
-
-/*
-** During (live) save/migrate, we maintain a number of bitmaps to track
-** which pages we have to send, to fixup, and to skip.
-*/
-
-#define BITS_PER_LONG (sizeof(unsigned long) * 8)
-#define BITMAP_SIZE   ((p2m_size + BITS_PER_LONG - 1) / 8)
-
-#define BITMAP_ENTRY(_nr,_bmap) \
-   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
-
-#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
-
-static inline int test_bit (int nr, volatile void * addr)
-{
-    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
-}
-
-static inline void clear_bit (int nr, volatile void * addr)
-{
-    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
-}
-
-static inline void set_bit ( int nr, volatile void * addr)
-{
-    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
-}
-
-/* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
-static inline unsigned int hweight32(unsigned int w)
-{
-    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
-    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
-    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
-    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
-    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
-}
-
-static inline int count_bits ( int nr, volatile void *addr)
-{
-    int i, count = 0;
-    volatile unsigned long *p = (volatile unsigned long *)addr;
-    /* We know that the array is padded to unsigned long. */
-    for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
-        count += hweight32(*p);
-    return count;
-}
-
-static inline int permute( int i, int nr, int order_nr  )
-{
-    /* Need a simple permutation function so that we scan pages in a
-       pseudo random order, enabling us to get a better estimate of
-       the domain's page dirtying rate as we go (there are often
-       contiguous ranges of pfns that have similar behaviour, and we
-       want to mix them up. */
-
-    /* e.g. nr->oder 15->4 16->4 17->5 */
-    /* 512MB domain, 128k pages, order 17 */
-
-    /*
-      QPONMLKJIHGFEDCBA
-             QPONMLKJIH
-      GFEDCBA
-     */
-
-    /*
-      QPONMLKJIHGFEDCBA
-                  EDCBA
-             QPONM
-      LKJIHGF
-      */
-
-    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
-    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
-
-    return i;
-}
-
-static uint64_t tv_to_us(struct timeval *new)
-{
-    return (new->tv_sec * 1000000) + new->tv_usec;
-}
-
-static uint64_t llgettimeofday(void)
-{
-    struct timeval now;
-    gettimeofday(&now, NULL);
-    return tv_to_us(&now);
-}
-
-static uint64_t tv_delta(struct timeval *new, struct timeval *old)
-{
-    return (((new->tv_sec - old->tv_sec)*1000000) +
-            (new->tv_usec - old->tv_usec));
-}
-
-static int noncached_write(int fd, int live, void *buffer, int len) 
-{
-    static int write_count = 0;
-
-    int rc = write(fd,buffer,len);
-
-    write_count += len;
-    if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
-    {
-        /* Time to discard cache - dont care if this fails */
-        discard_file_cache(fd, 0 /* no flush */);
-        write_count = 0;
-    }
-
-    return rc;
-}
-
-#ifdef ADAPTIVE_SAVE
-
-/*
-** We control the rate at which we transmit (or save) to minimize impact
-** on running domains (including the target if we're doing live migrate).
-*/
-
-#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
-#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
-
-/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
-#define RATE_TO_BTU      781250
-
-/* Amount in bytes we allow ourselves to send in a burst */
-#define BURST_BUDGET (100*1024)
-
-/* We keep track of the current and previous transmission rate */
-static int mbit_rate, ombit_rate = 0;
-
-/* Have we reached the maximum transmission rate? */
-#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
-
-static inline void initialize_mbit_rate()
-{
-    mbit_rate = START_MBIT_RATE;
-}
-
-static int ratewrite(int io_fd, int live, void *buf, int n)
-{
-    static int budget = 0;
-    static int burst_time_us = -1;
-    static struct timeval last_put = { 0 };
-    struct timeval now;
-    struct timespec delay;
-    long long delta;
-
-    if ( START_MBIT_RATE == 0 )
-        return noncached_write(io_fd, live, buf, n);
-
-    budget -= n;
-    if ( budget < 0 )
-    {
-        if ( mbit_rate != ombit_rate )
-        {
-            burst_time_us = RATE_TO_BTU / mbit_rate;
-            ombit_rate = mbit_rate;
-            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
-                    mbit_rate, BURST_BUDGET, burst_time_us);
-        }
-        if ( last_put.tv_sec == 0 )
-        {
-            budget += BURST_BUDGET;
-            gettimeofday(&last_put, NULL);
-        }
-        else
-        {
-            while ( budget < 0 )
-            {
-                gettimeofday(&now, NULL);
-                delta = tv_delta(&now, &last_put);
-                while ( delta > burst_time_us )
-                {
-                    budget += BURST_BUDGET;
-                    last_put.tv_usec += burst_time_us;
-                    if ( last_put.tv_usec > 1000000 
-                    {
-                        last_put.tv_usec -= 1000000;
-                        last_put.tv_sec++;
-                    }
-                    delta -= burst_time_us;
-                }
-                if ( budget > 0 )
-                    break;
-                delay.tv_sec = 0;
-                delay.tv_nsec = 1000 * (burst_time_us - delta);
-                while ( delay.tv_nsec > 0 )
-                    if ( nanosleep(&delay, &delay) == 0 )
-                        break;
-            }
-        }
-    }
-    return noncached_write(io_fd, live, buf, n);
-}
-
-#else /* ! ADAPTIVE SAVE */
-
-#define RATE_IS_MAX() (0)
-#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), 
(_buf), (_n))
-#define initialize_mbit_rate()
-
-#endif
-
-static inline ssize_t write_exact(int fd, void *buf, size_t count)
-{
-    return (write(fd, buf, count) == count);
-}
-
-static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
-                       xc_shadow_op_stats_t *stats, int print)
-{
-    static struct timeval wall_last;
-    static long long      d0_cpu_last;
-    static long long      d1_cpu_last;
-
-    struct timeval        wall_now;
-    long long             wall_delta;
-    long long             d0_cpu_now, d0_cpu_delta;
-    long long             d1_cpu_now, d1_cpu_delta;
-
-    gettimeofday(&wall_now, NULL);
-
-    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
-    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
-
-    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
-        DPRINTF("ARRHHH!!\n");
-
-    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
-    if ( wall_delta == 0 )
-        wall_delta = 1;
-
-    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
-    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
-
-    if ( print )
-        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
-                "dirtied %dMb/s %" PRId32 " pages\n",
-                wall_delta,
-                (int)((d0_cpu_delta*100)/wall_delta),
-                (int)((d1_cpu_delta*100)/wall_delta),
-                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
-                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
-                stats->dirty_count);
-
-#ifdef ADAPTIVE_SAVE
-    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
-    {
-        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
-            + 50;
-        if ( mbit_rate > MAX_MBIT_RATE )
-            mbit_rate = MAX_MBIT_RATE;
-    }
-#endif
-
-    d0_cpu_last = d0_cpu_now;
-    d1_cpu_last = d1_cpu_now;
-    wall_last   = wall_now;
-
-    return 0;
-}
-
-
-static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
-                          unsigned long *arr, int runs)
-{
-    long long start, now;
-    xc_shadow_op_stats_t stats;
-    int j;
-
-    start = llgettimeofday();
-
-    for ( j = 0; j < runs; j++ )
-    {
-        int i;
-
-        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
-                          arr, p2m_size, NULL, 0, NULL);
-        DPRINTF("#Flush\n");
-        for ( i = 0; i < 40; i++ )
-        {
-            usleep(50000);
-            now = llgettimeofday();
-            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
-                              NULL, 0, NULL, 0, &stats);
-            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
-                    ((now-start)+500)/1000,
-                    stats.fault_count, stats.dirty_count);
-        }
-    }
-
-    return -1;
-}
-
-
-static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
-                             int dom, xc_dominfo_t *info,
-                             vcpu_guest_context_t *ctxt)
-{
-    int i = 0;
-
-    if ( !(*suspend)(dom) )
-    {
-        ERROR("Suspend request failed");
-        return -1;
-    }
-
- retry:
-
-    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
-    {
-        ERROR("Could not get domain info");
-        return -1;
-    }
-
-    if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
-        ERROR("Could not get vcpu context");
-
-
-    if ( info->dying )
-    {
-        ERROR("domain is dying");
-        return -1;
-    }
-
-    if ( info->crashed )
-    {
-        ERROR("domain has crashed");
-        return -1;
-    }
-
-    if ( info->shutdown )
-    {
-        switch ( info->shutdown_reason )
-        {
-        case SHUTDOWN_poweroff:
-        case SHUTDOWN_reboot:
-            ERROR("domain has shut down");
-            return -1;
-        case SHUTDOWN_suspend:
-            return 0;
-        case SHUTDOWN_crash:
-            ERROR("domain has crashed");
-            return -1;
-        }
-    }
-
-    if ( info->paused )
-    {
-        /* Try unpausing domain, wait, and retest. */
-        xc_domain_unpause( xc_handle, dom );
-        ERROR("Domain was paused. Wait and re-test.");
-        usleep(10000); /* 10ms */
-        goto retry;
-    }
-
-    if ( ++i < 100 )
-    {
-        ERROR("Retry suspend domain");
-        usleep(10000); /* 10ms */
-        goto retry;
-    }
-
-    ERROR("Unable to suspend domain.");
-
-    return -1;
-}
-
-/*
-** Map the top-level page of MFNs from the guest. The guest might not have
-** finished resuming from a previous restore operation, so we wait a while for
-** it to update the MFN to a reasonable value.
-*/
-static void *map_frame_list_list(int xc_handle, uint32_t dom,
-                                 shared_info_t *shinfo)
-{
-    int count = 100;
-    void *p;
-
-    while ( count-- && (shinfo->arch.pfn_to_mfn_frame_list_list == 0) )
-        usleep(10000);
-
-    if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 )
-    {
-        ERROR("Timed out waiting for frame list updated.");
-        return NULL;
-    }
-
-    p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
-                             shinfo->arch.pfn_to_mfn_frame_list_list);
-    if ( p == NULL )
-        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
-
-    return p;
-}
-
-/*
-** During transfer (or in the state file), all page-table pages must be
-** converted into a 'canonical' form where references to actual mfns
-** are replaced with references to the corresponding pfns.
-**
-** This function performs the appropriate conversion, taking into account
-** which entries do not require canonicalization (in particular, those
-** entries which map the virtual address reserved for the hypervisor).
-*/
-static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
-                           const void *spage, void *dpage)
-{
-
-    int i, pte_last, xen_start, xen_end, race = 0; 
-    uint64_t pte;
-
-    /*
-    ** We need to determine which entries in this page table hold
-    ** reserved hypervisor mappings. This depends on the current
-    ** page table type as well as the number of paging levels.
-    */
-    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
-
-    if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
-        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
-
-    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
-        xen_start = L3_PAGETABLE_ENTRIES_PAE;
-
-    /*
-    ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
-    ** We can spot this by looking for the guest linear mapping which
-    ** Xen always ensures is present in that L2. Guests must ensure
-    ** that this check will fail for other L2s.
-    */
-    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
-    {
-        int hstart;
-        uint64_t he;
-
-        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
-        he = ((const uint64_t *) spage)[hstart];
-
-        if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
-        {
-            /* hvirt starts with xen stuff... */
-            xen_start = hstart;
-        }
-        else if ( hvirt_start != 0xf5800000 )
-        {
-            /* old L2s from before hole was shrunk... */
-            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
-            he = ((const uint64_t *) spage)[hstart];
-            if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
-                xen_start = hstart;
-        }
-    }
-
-    if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
-    {
-        /*
-        ** XXX SMH: should compute these from hvirt_start (which we have)
-        ** and hvirt_end (which we don't)
-        */
-        xen_start = 256;
-        xen_end   = 272;
-    }
-
-    /* Now iterate through the page table, canonicalizing each PTE */
-    for (i = 0; i < pte_last; i++ )
-    {
-        unsigned long pfn, mfn;
-
-        if ( pt_levels == 2 )
-            pte = ((const uint32_t*)spage)[i];
-        else
-            pte = ((const uint64_t*)spage)[i];
-
-        if ( (i >= xen_start) && (i < xen_end) )
-            pte = 0;
-
-        if ( pte & _PAGE_PRESENT )
-        {
-            mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
-            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
-            {
-                /* This will happen if the type info is stale which
-                   is quite feasible under live migration */
-                pfn  = 0;  /* zap it - we'll retransmit this page later */
-                race = 1;  /* inform the caller of race; fatal if !live */ 
-            }
-            else
-                pfn = mfn_to_pfn(mfn);
-
-            pte &= ~MADDR_MASK_X86;
-            pte |= (uint64_t)pfn << PAGE_SHIFT;
-
-            /*
-             * PAE guest L3Es can contain these flags when running on
-             * a 64bit hypervisor. We zap these here to avoid any
-             * surprise at restore time...
-             */
-            if ( (pt_levels == 3) &&
-                 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
-                 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
-                pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
-        }
-
-        if ( pt_levels == 2 )
-            ((uint32_t*)dpage)[i] = pte;
-        else
-            ((uint64_t*)dpage)[i] = pte;
-    }
-
-    return race;
-}
-
-static xen_pfn_t *xc_map_m2p(int xc_handle,
-                                 unsigned long max_mfn,
-                                 int prot)
-{
-    struct xen_machphys_mfn_list xmml;
-    privcmd_mmap_entry_t *entries;
-    unsigned long m2p_chunks, m2p_size;
-    xen_pfn_t *m2p;
-    xen_pfn_t *extent_start;
-    int i, rc;
-
-    m2p_size   = M2P_SIZE(max_mfn);
-    m2p_chunks = M2P_CHUNKS(max_mfn);
-
-    xmml.max_extents = m2p_chunks;
-    if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
-    {
-        ERROR("failed to allocate space for m2p mfns");
-        return NULL;
-    }
-    set_xen_guest_handle(xmml.extent_start, extent_start);
-
-    if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
-         (xmml.nr_extents != m2p_chunks) )
-    {
-        ERROR("xc_get_m2p_mfns");
-        return NULL;
-    }
-
-    if ( (m2p = mmap(NULL, m2p_size, prot,
-                     MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
-    {
-        ERROR("failed to mmap m2p");
-        return NULL;
-    }
-
-    if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
-    {
-        ERROR("failed to allocate space for mmap entries");
-        return NULL;
-    }
-
-    for ( i = 0; i < m2p_chunks; i++ )
-    {
-        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
-        entries[i].mfn = extent_start[i];
-        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
-    }
-
-    if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
-                                     entries, m2p_chunks)) < 0 )
-    {
-        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
-        return NULL;
-    }
-
-    m2p_mfn0 = entries[0].mfn;
-
-    free(extent_start);
-    free(entries);
-
-    return m2p;
-}
-
-int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
-                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
-{
-    xc_dominfo_t info;
-
-    int rc = 1, i, j, last_iter, iter = 0;
-    int live  = (flags & XCFLAGS_LIVE);
-    int debug = (flags & XCFLAGS_DEBUG);
-    int race = 0, sent_last_iter, skip_this_iter;
-
-    /* The new domain's shared-info frame number. */
-    unsigned long shared_info_frame;
-
-    /* A copy of the CPU context of the guest. */
-    vcpu_guest_context_t ctxt;
-
-    /* A table containg the type of each PFN (/not/ MFN!). */
-    unsigned long *pfn_type = NULL;
-    unsigned long *pfn_batch = NULL;
-
-    /* A temporary mapping, and a copy, of one frame of guest memory. */
-    char page[PAGE_SIZE];
-
-    /* Double and single indirect references to the live P2M table */
-    xen_pfn_t *live_p2m_frame_list_list = NULL;
-    xen_pfn_t *live_p2m_frame_list = NULL;
-
-    /* A copy of the pfn-to-mfn table frame list. */
-    xen_pfn_t *p2m_frame_list = NULL;
-
-    /* Live mapping of shared info structure */
-    shared_info_t *live_shinfo = NULL;
-
-    /* base of the region in which domain memory is mapped */
-    unsigned char *region_base = NULL;
-
-    /* power of 2 order of p2m_size */
-    int order_nr;
-
-    /* bitmap of pages:
-       - that should be sent this iteration (unless later marked as skip);
-       - to skip this iteration because already dirty;
-       - to fixup by sending at the end if not already resent; */
-    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
-
-    xc_shadow_op_stats_t stats;
-
-    unsigned long needed_to_fix = 0;
-    unsigned long total_sent    = 0;
-
-    uint64_t vcpumap = 1ULL;
-
-    /* If no explicit control parameters given, use defaults */
-    max_iters  = max_iters  ? : DEF_MAX_ITERS;
-    max_factor = max_factor ? : DEF_MAX_FACTOR;
-
-    initialize_mbit_rate();
-
-    if ( !get_platform_info(xc_handle, dom,
-                            &max_mfn, &hvirt_start, &pt_levels) )
-    {
-        ERROR("Unable to get platform info.");
-        return 1;
-    }
-
-    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
-    {
-        ERROR("Could not get domain info");
-        return 1;
-    }
-
-    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
-    {
-        ERROR("Could not get vcpu context");
-        goto out;
-    }
-    shared_info_frame = info.shared_info_frame;
-
-    /* Map the shared info frame */
-    if ( !(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
-                                              PROT_READ, shared_info_frame)) )
-    {
-        ERROR("Couldn't map live_shinfo");
-        goto out;
-    }
-
-    p2m_size = live_shinfo->arch.max_pfn;
-
-    live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
-                                                   live_shinfo);
-    if ( !live_p2m_frame_list_list )
-        goto out;
-
-    live_p2m_frame_list =
-        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
-                             live_p2m_frame_list_list,
-                             P2M_FLL_ENTRIES);
-    if ( !live_p2m_frame_list )
-    {
-        ERROR("Couldn't map p2m_frame_list");
-        goto out;
-    }
-
-    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
-       the guest must not change which frames are used for this purpose.
-       (its not clear why it would want to change them, and we'll be OK
-       from a safety POV anyhow. */
-
-    live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
-                                    live_p2m_frame_list,
-                                    P2M_FL_ENTRIES);
-    if ( !live_p2m )
-    {
-        ERROR("Couldn't map p2m table");
-        goto out;
-    }
-
-    /* Setup the mfn_to_pfn table mapping */
-    if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
-    {
-        ERROR("Failed to map live M2P table");
-        goto out;
-    }
-
-
-    /* Get a local copy of the live_P2M_frame_list */
-    if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) )
-    {
-        ERROR("Couldn't allocate p2m_frame_list array");
-        goto out;
-    }
-    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
-
-    /* Canonicalise the pfn-to-mfn table frame-number list. */
-    for ( i = 0; i < p2m_size; i += fpp )
-    {
-        if ( !translate_mfn_to_pfn(&p2m_frame_list[i/fpp]) )
-        {
-            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
-            ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
-                  (uint64_t)p2m_frame_list[i/fpp]);
-            goto out;
-        }
-    }
-
-    /* Domain is still running at this point */
-    if ( live )
-    {
-        /* Live suspend. Enable log-dirty mode. */
-        if ( xc_shadow_control(xc_handle, dom,
-                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
-                               NULL, 0, NULL, 0, NULL) < 0 )
-        {
-            ERROR("Couldn't enable shadow mode");
-            goto out;
-        }
-    }
-    else
-    {
-        /* This is a non-live suspend. Suspend the domain .*/
-        if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
-        {
-            ERROR("Domain appears not to have suspended");
-            goto out;
-        }
-    }
-
-    last_iter = !live;
-
-    /* pretend we sent all the pages last iteration */
-    sent_last_iter = p2m_size;
-
-    /* calculate the power of 2 order of p2m_size, e.g.
-       15->4 16->4 17->5 */
-    for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
-        continue;
-
-    /* Setup to_send / to_fix and to_skip bitmaps */
-    to_send = malloc(BITMAP_SIZE);
-    to_fix  = calloc(1, BITMAP_SIZE);
-    to_skip = malloc(BITMAP_SIZE);
-
-    if ( !to_send || !to_fix || !to_skip )
-    {
-        ERROR("Couldn't allocate to_send array");
-        goto out;
-    }
-
-    memset(to_send, 0xff, BITMAP_SIZE);
-
-    if ( lock_pages(to_send, BITMAP_SIZE) )
-    {
-        ERROR("Unable to lock to_send");
-        return 1;
-    }
-
-    /* (to fix is local only) */
-    if ( lock_pages(to_skip, BITMAP_SIZE) )
-    {
-        ERROR("Unable to lock to_skip");
-        return 1;
-    }
-
-    analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
-
-    /* We want zeroed memory so use calloc rather than malloc. */
-    pfn_type   = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
-    pfn_batch  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
-    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
-    {
-        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
-        errno = ENOMEM;
-        goto out;
-    }
-
-    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
-    {
-        ERROR("Unable to lock");
-        goto out;
-    }
-
-    /*
-     * Quick belt and braces sanity check.
-     */
-    {
-        int err=0;
-        unsigned long mfn;
-        for ( i = 0; i < p2m_size; i++ )
-        {
-            mfn = live_p2m[i];
-            if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
-            {
-                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
-                        mfn, mfn_to_pfn(mfn));
-                err++;
-            }
-        }
-        DPRINTF("Had %d unexplained entries in p2m table\n", err);
-    }
-
-    /* Start writing out the saved-domain record. */
-    if ( !write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
-    {
-        ERROR("write: p2m_size");
-        goto out;
-    }
-
-    /*
-     * Write an extended-info structure to inform the restore code that
-     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
-     * slow paths in the restore code.
-     */
-    if ( (pt_levels == 3) &&
-         (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) )
-    {
-        unsigned long signature = ~0UL;
-        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
-        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
-        char chunk_sig[]  = "vcpu";
-        if ( !write_exact(io_fd, &signature, sizeof(signature)) ||
-             !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
-             !write_exact(io_fd, &chunk_sig, 4) ||
-             !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
-             !write_exact(io_fd, &ctxt,      sizeof(ctxt)) )
-        {
-            ERROR("write: extended info");
-            goto out;
-        }
-    }
-
-    if ( !write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE) )
-    {
-        ERROR("write: p2m_frame_list");
-        goto out;
-    }
-
-    print_stats(xc_handle, dom, 0, &stats, 0);
-
-    /* Now write out each data page, canonicalising page tables as we go... */
-    for ( ; ; )
-    {
-        unsigned int prev_pc, sent_this_iter, N, batch;
-
-        iter++;
-        sent_this_iter = 0;
-        skip_this_iter = 0;
-        prev_pc = 0;
-        N = 0;
-
-        DPRINTF("Saving memory pages: iter %d   0%%", iter);
-
-        while ( N < p2m_size )
-        {
-            unsigned int this_pc = (N * 100) / p2m_size;
-            int rc;
-
-            if ( (this_pc - prev_pc) >= 5 )
-            {
-                DPRINTF("\b\b\b\b%3d%%", this_pc);
-                prev_pc = this_pc;
-            }
-
-            if ( !last_iter )
-            {
-                /* Slightly wasteful to peek the whole array evey time,
-                   but this is fast enough for the moment. */
-                rc = xc_shadow_control(
-                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip, 
-                    p2m_size, NULL, 0, NULL);
-                if ( rc != p2m_size )
-                {
-                    ERROR("Error peeking shadow bitmap");
-                    goto out;
-                }
-            }
-
-            /* load pfn_type[] with the mfn of all the pages we're doing in
-               this batch. */
-            for  ( batch = 0;
-                   (batch < MAX_BATCH_SIZE) && (N < p2m_size);
-                   N++ )
-            {
-                int n = permute(N, p2m_size, order_nr);
-
-                if ( debug )
-                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
-                            iter, (unsigned long)n, live_p2m[n],
-                            test_bit(n, to_send),
-                            mfn_to_pfn(live_p2m[n]&0xFFFFF));
-
-                if ( !last_iter &&
-                     test_bit(n, to_send) &&
-                     test_bit(n, to_skip) )
-                    skip_this_iter++; /* stats keeping */
-
-                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
-                       (test_bit(n, to_send) && last_iter) ||
-                       (test_bit(n, to_fix)  && last_iter)) )
-                    continue;
-
-                /*
-                ** we get here if:
-                **  1. page is marked to_send & hasn't already been re-dirtied
-                **  2. (ignore to_skip in last iteration)
-                **  3. add in pages that still need fixup (net bufs)
-                */
-
-                pfn_batch[batch] = n;
-                pfn_type[batch]  = live_p2m[n];
-
-                if ( !is_mapped(pfn_type[batch]) )
-                {
-                    /*
-                    ** not currently in psuedo-physical map -- set bit
-                    ** in to_fix since we must send this page in last_iter
-                    ** unless its sent sooner anyhow, or it never enters
-                    ** pseudo-physical map (e.g. for ballooned down domains)
-                    */
-                    set_bit(n, to_fix);
-                    continue;
-                }
-
-                if ( last_iter &&
-                     test_bit(n, to_fix) &&
-                     !test_bit(n, to_send) )
-                {
-                    needed_to_fix++;
-                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
-                            iter, n, pfn_type[batch]);
-                }
-
-                clear_bit(n, to_fix);
-
-                batch++;
-            }
-
-            if ( batch == 0 )
-                goto skip; /* vanishingly unlikely... */
-
-            region_base = xc_map_foreign_batch(
-                xc_handle, dom, PROT_READ, pfn_type, batch);
-            if ( region_base == NULL )
-            {
-                ERROR("map batch failed");
-                goto out;
-            }
-
-            for ( j = 0; j < batch; j++ )
-                ((uint32_t *)pfn_type)[j] = pfn_type[j];
-            if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
-                                       (uint32_t *)pfn_type) )
-            {
-                ERROR("get_pfn_type_batch failed");
-                goto out;
-            }
-            for ( j = batch-1; j >= 0; j-- )
-                pfn_type[j] = ((uint32_t *)pfn_type)[j];
-
-            for ( j = 0; j < batch; j++ )
-            {
-
-                if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
-                     XEN_DOMCTL_PFINFO_XTAB )
-                {
-                    DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
-                    continue;
-                }
-
-                if ( debug )
-                    DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
-                            " sum= %08lx\n",
-                            iter,
-                            (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
-                            pfn_batch[j],
-                            pfn_type[j],
-                            mfn_to_pfn(pfn_type[j] &
-                                       ~XEN_DOMCTL_PFINFO_LTAB_MASK),
-                            csum_page(region_base + (PAGE_SIZE*j)));
-
-                /* canonicalise mfn->pfn */
-                pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
-                    pfn_batch[j];
-            }
-
-            if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
-            {
-                ERROR("Error when writing to state file (2) (errno %d)",
-                      errno);
-                goto out;
-            }
-
-            if ( !write_exact(io_fd, pfn_type, sizeof(unsigned long)*j) )
-            {
-                ERROR("Error when writing to state file (3) (errno %d)",
-                      errno);
-                goto out;
-            }
-
-            /* entering this loop, pfn_type is now in pfns (Not mfns) */
-            for ( j = 0; j < batch; j++ )
-            {
-                unsigned long pfn, pagetype;
-                void *spage = (char *)region_base + (PAGE_SIZE*j);
-
-                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
-                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
-
-                /* write out pages in batch */
-                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
-                    continue;
-
-                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
-                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
-                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
-                {
-                    /* We have a pagetable page: need to rewrite it. */
-                    race = 
-                        canonicalize_pagetable(pagetype, pfn, spage, page); 
-
-                    if ( race && !live )
-                    {
-                        ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
-                              pagetype);
-                        goto out;
-                    }
-
-                    if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
-                    {
-                        ERROR("Error when writing to state file (4)"
-                              " (errno %d)", errno);
-                        goto out;
-                    }
-                }
-                else
-                {
-                    /* We have a normal page: just write it directly. */
-                    if ( ratewrite(io_fd, live, spage, PAGE_SIZE) !=
-                         PAGE_SIZE )
-                    {
-                        ERROR("Error when writing to state file (5)"
-                              " (errno %d)", errno);
-                        goto out;
-                    }
-                }
-            } /* end of the write out for this batch */
-
-            sent_this_iter += batch;
-
-            munmap(region_base, batch*PAGE_SIZE);
-
-        } /* end of this while loop for this iteration */
-
-      skip:
-
-        total_sent += sent_this_iter;
-
-        DPRINTF("\r %d: sent %d, skipped %d, ",
-                iter, sent_this_iter, skip_this_iter );
-
-        if ( last_iter )
-        {
-            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
-
-            DPRINTF("Total pages sent= %ld (%.2fx)\n",
-                    total_sent, ((float)total_sent)/p2m_size );
-            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
-        }
-
-        if ( last_iter && debug )
-        {
-            int minusone = -1;
-            memset(to_send, 0xff, BITMAP_SIZE);
-            debug = 0;
-            DPRINTF("Entering debug resend-all mode\n");
-
-            /* send "-1" to put receiver into debug mode */
-            if ( !write_exact(io_fd, &minusone, sizeof(int)) )
-            {
-                ERROR("Error when writing to state file (6) (errno %d)",
-                      errno);
-                goto out;
-            }
-
-            continue;
-        }
-
-        if ( last_iter )
-            break;
-
-        if ( live )
-        {
-            if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
-                 (iter >= max_iters) ||
-                 (sent_this_iter+skip_this_iter < 50) ||
-                 (total_sent > p2m_size*max_factor) )
-            {
-                DPRINTF("Start last iteration\n");
-                last_iter = 1;
-
-                if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
-                                       &ctxt) )
-                {
-                    ERROR("Domain appears not to have suspended");
-                    goto out;
-                }
-
-                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
-                        info.shared_info_frame,
-                        (unsigned long)ctxt.user_regs.eip,
-                        (unsigned long)ctxt.user_regs.edx);
-            }
-
-            if ( xc_shadow_control(xc_handle, dom, 
-                                   XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 
-                                   p2m_size, NULL, 0, &stats) != p2m_size )
-            {
-                ERROR("Error flushing shadow PT");
-                goto out;
-            }
-
-            sent_last_iter = sent_this_iter;
-
-            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
-
-        }
-    } /* end of infinite for loop */
-
-    DPRINTF("All memory is saved\n");
-
-    {
-        struct {
-            int minustwo;
-            int max_vcpu_id;
-            uint64_t vcpumap;
-        } chunk = { -2, info.max_vcpu_id };
-
-        if ( info.max_vcpu_id >= 64 )
-        {
-            ERROR("Too many VCPUS in guest!");
-            goto out;
-        }
-
-        for ( i = 1; i <= info.max_vcpu_id; i++ )
-        {
-            xc_vcpuinfo_t vinfo;
-            if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
-                 vinfo.online )
-                vcpumap |= 1ULL << i;
-        }
-
-        chunk.vcpumap = vcpumap;
-        if ( !write_exact(io_fd, &chunk, sizeof(chunk)) )
-        {
-            ERROR("Error when writing to state file (errno %d)", errno);
-            goto out;
-        }
-    }
-
-    /* Zero terminate */
-    i = 0;
-    if ( !write_exact(io_fd, &i, sizeof(int)) )
-    {
-        ERROR("Error when writing to state file (6') (errno %d)", errno);
-        goto out;
-    }
-
-    /* Send through a list of all the PFNs that were not in map at the close */
-    {
-        unsigned int i,j;
-        unsigned long pfntab[1024];
-
-        for ( i = 0, j = 0; i < p2m_size; i++ )
-        {
-            if ( !is_mapped(live_p2m[i]) )
-                j++;
-        }
-
-        if ( !write_exact(io_fd, &j, sizeof(unsigned int)) )
-        {
-            ERROR("Error when writing to state file (6a) (errno %d)", errno);
-            goto out;
-        }
-
-        for ( i = 0, j = 0; i < p2m_size; )
-        {
-            if ( !is_mapped(live_p2m[i]) )
-                pfntab[j++] = i;
-
-            i++;
-            if ( (j == 1024) || (i == p2m_size) )
-            {
-                if ( !write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
-                {
-                    ERROR("Error when writing to state file (6b) (errno %d)",
-                          errno);
-                    goto out;
-                }
-                j = 0;
-            }
-        }
-    }
-
-    /* Canonicalise the suspend-record frame number. */
-    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
-    {
-        ERROR("Suspend record is not in range of pseudophys map");
-        goto out;
-    }
-
-    for ( i = 0; i <= info.max_vcpu_id; i++ )
-    {
-        if ( !(vcpumap & (1ULL << i)) )
-            continue;
-
-        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
-        {
-            ERROR("No context for VCPU%d", i);
-            goto out;
-        }
-
-        /* Canonicalise each GDT frame number. */
-        for ( j = 0; (512*j) < ctxt.gdt_ents; j++ )
-        {
-            if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) )
-            {
-                ERROR("GDT frame is not in range of pseudophys map");
-                goto out;
-            }
-        }
-
-        /* Canonicalise the page table base pointer. */
-        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) )
-        {
-            ERROR("PT base is not in range of pseudophys map");
-            goto out;
-        }
-        ctxt.ctrlreg[3] = 
-            xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));
-
-        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
-        if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
-        {
-            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) )
-            {
-                ERROR("PT base is not in range of pseudophys map");
-                goto out;
-            }
-            /* Least-significant bit means 'valid PFN'. */
-            ctxt.ctrlreg[1] = 1 |
-                xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1])));
-        }
-
-        if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) )
-        {
-            ERROR("Error when writing to state file (1) (errno %d)", errno);
-            goto out;
-        }
-    }
-
-    /*
-     * Reset the MFN to be a known-invalid value. See map_frame_list_list().
-     */
-    memcpy(page, live_shinfo, PAGE_SIZE);
-    ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;
-    if ( !write_exact(io_fd, page, PAGE_SIZE) )
-    {
-        ERROR("Error when writing to state file (1) (errno %d)", errno);
-        goto out;
-    }
-
-    /* Success! */
-    rc = 0;
-
- out:
-
-    if ( live )
-    {
-        if ( xc_shadow_control(xc_handle, dom, 
-                               XEN_DOMCTL_SHADOW_OP_OFF,
-                               NULL, 0, NULL, 0, NULL) < 0 )
-            DPRINTF("Warning - couldn't disable shadow mode");
-    }
-
-    /* Flush last write and discard cache for file. */
-    discard_file_cache(io_fd, 1 /* flush */);
-
-    if ( live_shinfo )
-        munmap(live_shinfo, PAGE_SIZE);
-
-    if ( live_p2m_frame_list_list )
-        munmap(live_p2m_frame_list_list, PAGE_SIZE);
-
-    if ( live_p2m_frame_list )
-        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
-
-    if ( live_p2m )
-        munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
-
-    if ( live_m2p )
-        munmap(live_m2p, M2P_SIZE(max_mfn));
-
-    free(pfn_type);
-    free(pfn_batch);
-    free(to_send);
-    free(to_fix);
-    free(to_skip);
-
-    DPRINTF("Save exit rc=%d\n",rc);
-
-    return !!rc;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Wed Apr 11 09:29:00 2007 +0100
+++ b/tools/libxc/xenguest.h    Wed Apr 11 14:45:14 2007 +0100
@@ -16,26 +16,19 @@
 
 
 /**
- * This function will save a domain running Linux.
+ * This function will save a running domain.
  *
  * @parm xc_handle a handle to an open hypervisor interface
  * @parm fd the file descriptor to save a domain to
  * @parm dom the id of the domain
  * @return 0 on success, -1 on failure
  */
-int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
-                  uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
-                  int (*suspend)(int domid));
+int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+                   uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
+                   int (*suspend)(int domid), int hvm,
+                   void *(*init_qemu_maps)(int, unsigned),  /* HVM only */
+                   void (*qemu_flip_buffer)(int, int));     /* HVM only */
 
-/**
- * This function will save a hvm domain running unmodified guest.
- * @return 0 on success, -1 on failure
- */
-int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
-                uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
-                int (*suspend)(int domid),  
-                void *(*init_qemu_maps)(int, unsigned), 
-                void (*qemu_flip_buffer)(int, int));
 
 /**
  * This function will restore a saved domain.
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/libxc/xg_private.c
--- a/tools/libxc/xg_private.c  Wed Apr 11 09:29:00 2007 +0100
+++ b/tools/libxc/xg_private.c  Wed Apr 11 14:45:14 2007 +0100
@@ -193,17 +193,6 @@ __attribute__((weak))
                      uint32_t domid,
                      int memsize,
                      const char *image_name)
-{
-    errno = ENOSYS;
-    return -1;
-}
-
-__attribute__((weak)) 
-    int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
-                    uint32_t max_factor, uint32_t flags,
-                    int (*suspend)(int domid), 
-                    void *(*init_qemu_maps)(int, unsigned), 
-                    void (*qemu_flip_buffer)(int, int))
 {
     errno = ENOSYS;
     return -1;
diff -r 6e7ef794cdbc -r 90a6af455bbd tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c   Wed Apr 11 09:29:00 2007 +0100
+++ b/tools/xcutils/xc_save.c   Wed Apr 11 14:45:14 2007 +0100
@@ -174,12 +174,9 @@ main(int argc, char **argv)
     max_f = atoi(argv[4]);
     flags = atoi(argv[5]);
 
-    if (flags & XCFLAGS_HVM)
-        ret = xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
-                          &suspend, &init_qemu_maps, &qemu_flip_buffer);
-    else 
-        ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
-                            &suspend);
+    ret = xc_domain_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
+                         &suspend, !!(flags & XCFLAGS_HVM),
+                         &init_qemu_maps, &qemu_flip_buffer);
 
     xc_interface_close(xc_fd);
 

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.