[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] [HVM] Save/restore: merge xc_linux_restore and xc_hvm_restore



# HG changeset patch
# User Tim Deegan <Tim.Deegan@xxxxxxxxxxxxx>
# Date 1175782282 -3600
# Node ID e518f2fbdd724ca7b21789d2d075c7ee8665ddaa
# Parent  602d061ff51f50d7b46bd5ca78c4b70fbe809d20
[HVM] Save/restore: merge xc_linux_restore and xc_hvm_restore
into one function (and one file) since they share a lot of code
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
---
 tools/libxc/xc_hvm_restore.c    |  351 ------------
 tools/libxc/xc_linux_restore.c  |  955 -----------------------------------
 tools/libxc/Makefile            |    4 
 tools/libxc/xc_domain_restore.c | 1086 ++++++++++++++++++++++++++++++++++++++++
 tools/libxc/xc_hvm_save.c       |   57 +-
 tools/libxc/xenguest.h          |   22 
 tools/libxc/xg_private.c        |   10 
 tools/xcutils/xc_restore.c      |   10 
 8 files changed, 1137 insertions(+), 1358 deletions(-)

diff -r 602d061ff51f -r e518f2fbdd72 tools/libxc/Makefile
--- a/tools/libxc/Makefile      Thu Apr 05 10:43:50 2007 +0100
+++ b/tools/libxc/Makefile      Thu Apr 05 15:11:22 2007 +0100
@@ -26,8 +26,8 @@ CTRL_SRCS-$(CONFIG_X86_Linux) += xc_ptra
 
 GUEST_SRCS-y :=
 GUEST_SRCS-y += xg_private.c
-GUEST_SRCS-$(CONFIG_MIGRATE) += xc_linux_restore.c xc_linux_save.c
-GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_restore.c xc_hvm_save.c
+GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_linux_save.c
+GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_save.c
 
 # symlink libelf from xen/common/libelf/
 LIBELF_SRCS := libelf-tools.c libelf-loader.c
diff -r 602d061ff51f -r e518f2fbdd72 tools/libxc/xc_domain_restore.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_domain_restore.c   Thu Apr 05 15:11:22 2007 +0100
@@ -0,0 +1,1086 @@
+/******************************************************************************
+ * xc_domain_restore.c
+ *
+ * Restore the state of a guest session.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ * Copyright (c) 2006, Intel Corporation
+ * Copyright (c) 2007, XenSource Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "xg_private.h"
+#include "xg_save_restore.h"
+#include "xc_dom.h"
+
+#include <xen/hvm/ioreq.h>
+#include <xen/hvm/params.h>
+
+/* max mfn of the current host machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */
+static unsigned long nr_pfns;
+
+/* Live mapping of the table mapping each PFN to its current MFN. */
+static xen_pfn_t *live_p2m = NULL;
+
+/* A table mapping each PFN to its new MFN. */
+static xen_pfn_t *p2m = NULL;
+
+/* A table of P2M mappings in the current region */
+static xen_pfn_t *p2m_batch = NULL;
+
+static ssize_t
+read_exact(int fd, void *buf, size_t count)
+{
+    int r = 0, s;
+    unsigned char *b = buf;
+
+    while (r < count) {
+        s = read(fd, &b[r], count - r);
+        if ((s == -1) && (errno == EINTR))
+            continue;
+        if (s <= 0) {
+            break;
+        }
+        r += s;
+    }
+
+    return (r == count) ? 1 : 0;
+}
+
+/*
+** In the state file (or during transfer), all page-table pages are
+** converted into a 'canonical' form where references to actual mfns
+** are replaced with references to the corresponding pfns.
+** This function inverts that operation, replacing the pfn values with
+** the (now known) appropriate mfn values.
+*/
+static int uncanonicalize_pagetable(int xc_handle, uint32_t dom, 
+                                    unsigned long type, void *page)
+{
+    int i, pte_last;
+    unsigned long pfn;
+    uint64_t pte;
+    int nr_mfns = 0; 
+
+    pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);
+
+    /* First pass: work out how many (if any) MFNs we need to alloc */
+    for(i = 0; i < pte_last; i++) {
+        
+        if(pt_levels == 2)
+            pte = ((uint32_t *)page)[i];
+        else
+            pte = ((uint64_t *)page)[i];
+        
+        /* XXX SMH: below needs fixing for PROT_NONE etc */
+        if(!(pte & _PAGE_PRESENT))
+            continue; 
+        
+        pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
+        
+        if(pfn >= p2m_size) {
+            /* This "page table page" is probably not one; bail. */
+            ERROR("Frame number in type %lu page table is out of range: "
+                  "i=%d pfn=0x%lx p2m_size=%lu",
+                  type >> 28, i, pfn, p2m_size);
+            return 0;
+        }
+        
+        if(p2m[pfn] == INVALID_P2M_ENTRY) {
+            /* Have a 'valid' PFN without a matching MFN - need to alloc */
+            p2m_batch[nr_mfns++] = pfn; 
+        }
+    }
+    
+    
+    /* Allocate the requistite number of mfns */
+    if (nr_mfns && xc_domain_memory_populate_physmap(
+            xc_handle, dom, nr_mfns, 0, 0, p2m_batch) != 0) { 
+        ERROR("Failed to allocate memory for batch.!\n"); 
+        errno = ENOMEM;
+        return 0; 
+    }
+    
+    /* Second pass: uncanonicalize each present PTE */
+    nr_mfns = 0;
+    for(i = 0; i < pte_last; i++) {
+
+        if(pt_levels == 2)
+            pte = ((uint32_t *)page)[i];
+        else
+            pte = ((uint64_t *)page)[i];
+        
+        /* XXX SMH: below needs fixing for PROT_NONE etc */
+        if(!(pte & _PAGE_PRESENT))
+            continue;
+        
+        pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
+        
+        if(p2m[pfn] == INVALID_P2M_ENTRY)
+            p2m[pfn] = p2m_batch[nr_mfns++];
+
+        pte &= ~MADDR_MASK_X86;
+        pte |= (uint64_t)p2m[pfn] << PAGE_SHIFT;
+
+        if(pt_levels == 2)
+            ((uint32_t *)page)[i] = (uint32_t)pte;
+        else
+            ((uint64_t *)page)[i] = (uint64_t)pte;
+    }
+
+    return 1;
+}
+
+
+/* Load the p2m frame list, plus potential extended info chunk */
+static xen_pfn_t * load_p2m_frame_list(int io_fd, int *pae_extended_cr3)
+{
+    xen_pfn_t *p2m_frame_list;
+    vcpu_guest_context_t ctxt;
+
+    if (!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
+        ERROR("Couldn't allocate p2m_frame_list array");
+        return NULL;
+    }
+    
+    /* Read first entry of P2M list, or extended-info signature (~0UL). */
+    if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) {
+            ERROR("read extended-info signature failed");
+            return NULL;
+        }
+    
+    if (p2m_frame_list[0] == ~0UL) {
+        uint32_t tot_bytes;
+        
+        /* Next 4 bytes: total size of following extended info. */
+        if (!read_exact(io_fd, &tot_bytes, sizeof(tot_bytes))) {
+            ERROR("read extended-info size failed");
+            return NULL;
+        }
+        
+        while (tot_bytes) {
+            uint32_t chunk_bytes;
+            char     chunk_sig[4];
+            
+            /* 4-character chunk signature + 4-byte remaining chunk size. */
+            if (!read_exact(io_fd, chunk_sig, sizeof(chunk_sig)) ||
+                !read_exact(io_fd, &chunk_bytes, sizeof(chunk_bytes))) {
+                ERROR("read extended-info chunk signature failed");
+                return NULL;
+            }
+            tot_bytes -= 8;
+            
+            /* VCPU context structure? */
+            if (!strncmp(chunk_sig, "vcpu", 4)) {
+                if (!read_exact(io_fd, &ctxt, sizeof(ctxt))) {
+                    ERROR("read extended-info vcpu context failed");
+                    return NULL;
+                }
+                tot_bytes   -= sizeof(struct vcpu_guest_context);
+                chunk_bytes -= sizeof(struct vcpu_guest_context);
+                
+                if (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))
+                    *pae_extended_cr3 = 1;
+            }
+            
+            /* Any remaining bytes of this chunk: read and discard. */
+            while (chunk_bytes) {
+                unsigned long sz = chunk_bytes;
+                if ( sz > P2M_FL_SIZE )
+                    sz = P2M_FL_SIZE;
+                if (!read_exact(io_fd, p2m_frame_list, sz)) {
+                    ERROR("read-and-discard extended-info chunk bytes failed");
+                    return NULL;
+                }
+                chunk_bytes -= sz;
+                tot_bytes   -= sz;
+            }
+        }
+        
+        /* Now read the real first entry of P2M list. */
+        if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) {
+            ERROR("read first entry of p2m_frame_list failed");
+            return NULL;
+        }
+    }
+    
+    /* First entry is already read into the p2m array. */
+    if (!read_exact(io_fd, &p2m_frame_list[1], P2M_FL_SIZE - sizeof(long))) {
+            ERROR("read p2m_frame_list failed");
+            return NULL;
+    }
+    
+    return p2m_frame_list;
+}
+
+
+
+int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
+                      unsigned int store_evtchn, unsigned long *store_mfn,
+                      unsigned int console_evtchn, unsigned long *console_mfn,
+                      unsigned int hvm, unsigned int pae)
+{
+    DECLARE_DOMCTL;
+    int rc = 1, i, j, n, m, pae_extended_cr3 = 0;
+    unsigned long mfn, pfn;
+    unsigned int prev_pc, this_pc;
+    int verify = 0;
+    int nraces = 0;
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+    unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
+    shared_info_t *shared_info = (shared_info_t *)shared_info_page;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    /* A table containing the type of each PFN (/not/ MFN!). */
+    unsigned long *pfn_type = NULL;
+
+    /* A table of MFNs to map in the current region */
+    xen_pfn_t *region_mfn = NULL;
+
+    /* Types of the pfns in the current region */
+    unsigned long region_pfn_type[MAX_BATCH_SIZE];
+
+    /* A temporary mapping, and a copy, of one frame of guest memory. */
+    unsigned long *page = NULL;
+
+    /* A copy of the pfn-to-mfn table frame list. */
+    xen_pfn_t *p2m_frame_list = NULL;
+    
+    /* A temporary mapping of the guest's start_info page. */
+    start_info_t *start_info;
+
+    /* Our mapping of the current region (batch) */
+    char *region_base;
+
+    xc_mmu_t *mmu = NULL;
+
+    /* used by debug verify code */
+    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
+
+    struct mmuext_op pin[MAX_PIN_BATCH];
+    unsigned int nr_pins;
+
+    uint64_t vcpumap = 1ULL;
+    unsigned int max_vcpu_id = 0;
+    int new_ctxt_format = 0;
+
+    /* Magic frames in HVM guests: ioreqs and xenstore comms. */
+    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+    /* Buffer for holding HVM context */
+    uint8_t *hvm_buf = NULL;
+
+    /* For info only */
+    nr_pfns = 0;
+
+    if ( !read_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+    {
+        ERROR("read: p2m_size");
+        goto out;
+    }
+    DPRINTF("xc_domain_restore start: p2m_size = %lx\n", p2m_size);
+
+    if ( !hvm )
+    {
+        /*
+         * XXX For now, 32bit dom0's can only save/restore 32bit domUs
+         * on 64bit hypervisors.
+         */
+        memset(&domctl, 0, sizeof(domctl));
+        domctl.domain = dom;
+        domctl.cmd    = XEN_DOMCTL_set_address_size;
+        domctl.u.address_size.size = sizeof(unsigned long) * 8;
+        rc = do_domctl(xc_handle, &domctl);
+        if ( rc != 0 ) {
+            ERROR("Unable to set guest address size.");
+            goto out;
+        }
+        rc = 1;
+    }
+
+    if(!get_platform_info(xc_handle, dom,
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERROR("Unable to get platform info.");
+        return 1;
+    }
+
+    if (lock_pages(&ctxt, sizeof(ctxt))) {
+        /* needed for build domctl, but might as well do early */
+        ERROR("Unable to lock ctxt");
+        return 1;
+    }
+
+    /* Load the p2m frame list, plus potential extended info chunk */
+    if ( !hvm ) 
+    {
+        p2m_frame_list = load_p2m_frame_list(io_fd, &pae_extended_cr3);
+        if ( !p2m_frame_list )
+            goto out;
+    }
+
+    /* We want zeroed memory so use calloc rather than malloc. */
+    p2m        = calloc(p2m_size, sizeof(xen_pfn_t));
+    pfn_type   = calloc(p2m_size, sizeof(unsigned long));
+    region_mfn = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
+    p2m_batch  = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
+
+    if ((p2m == NULL) || (pfn_type == NULL) ||
+        (region_mfn == NULL) || (p2m_batch == NULL)) {
+        ERROR("memory alloc failed");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    if (lock_pages(region_mfn, sizeof(xen_pfn_t) * MAX_BATCH_SIZE)) {
+        ERROR("Could not lock region_mfn");
+        goto out;
+    }
+
+    if (lock_pages(p2m_batch, sizeof(xen_pfn_t) * MAX_BATCH_SIZE)) {
+        ERROR("Could not lock p2m_batch");
+        goto out;
+    }
+
+    /* Get the domain's shared-info frame. */
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)dom;
+    if (xc_domctl(xc_handle, &domctl) < 0) {
+        ERROR("Could not get information on new domain");
+        goto out;
+    }
+    shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;
+
+    /* Mark all PFNs as invalid; we allocate on demand */
+    for ( pfn = 0; pfn < p2m_size; pfn++ )
+        p2m[pfn] = INVALID_P2M_ENTRY;
+
+    if(!(mmu = xc_init_mmu_updates(xc_handle, dom))) {
+        ERROR("Could not initialise for MMU updates");
+        goto out;
+    }
+
+    DPRINTF("Reloading memory pages:   0%%\n");
+
+    /*
+     * Now simply read each saved frame into its new machine frame.
+     * We uncanonicalise page tables as we go.
+     */
+    prev_pc = 0;
+
+    n = m = 0;
+    while (1) {
+
+        int j, nr_mfns = 0; 
+
+        this_pc = (n * 100) / p2m_size;
+        if ( (this_pc - prev_pc) >= 5 )
+        {
+            PPRINTF("\b\b\b\b%3d%%", this_pc);
+            prev_pc = this_pc;
+        }
+
+        if (!read_exact(io_fd, &j, sizeof(int))) {
+            ERROR("Error when reading batch size");
+            goto out;
+        }
+
+        PPRINTF("batch %d\n",j);
+
+        if (j == -1) {
+            verify = 1;
+            DPRINTF("Entering page verify mode\n");
+            continue;
+        }
+
+        if (j == -2) {
+            new_ctxt_format = 1;
+            if (!read_exact(io_fd, &max_vcpu_id, sizeof(int)) ||
+                (max_vcpu_id >= 64) ||
+                !read_exact(io_fd, &vcpumap, sizeof(uint64_t))) {
+                ERROR("Error when reading max_vcpu_id");
+                goto out;
+            }
+            continue;
+        }
+
+        if (j == 0)
+            break;  /* our work here is done */
+
+        if (j > MAX_BATCH_SIZE) {
+            ERROR("Max batch size exceeded. Giving up.");
+            goto out;
+        }
+
+        if (!read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long))) {
+            ERROR("Error when reading region pfn types");
+            goto out;
+        }
+
+        /* First pass for this batch: work out how much memory to alloc */
+        nr_mfns = 0; 
+        for ( i = 0; i < j; i++ )
+        {
+            unsigned long pfn, pagetype;
+            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+            if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) && 
+                 (p2m[pfn] == INVALID_P2M_ENTRY) )
+            {
+                /* Have a live PFN which hasn't had an MFN allocated */
+                p2m_batch[nr_mfns++] = pfn; 
+            }
+        } 
+
+
+        /* Now allocate a bunch of mfns for this batch */
+        if (nr_mfns && xc_domain_memory_populate_physmap(
+                xc_handle, dom, nr_mfns, 0, 0, p2m_batch) != 0) { 
+            ERROR("Failed to allocate memory for batch.!\n"); 
+            errno = ENOMEM;
+            goto out;
+        }
+
+        /* Second pass for this batch: update p2m[] and region_mfn[] */
+        nr_mfns = 0; 
+        for ( i = 0; i < j; i++ )
+        {
+            unsigned long pfn, pagetype;
+            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+            if ( pagetype == XEN_DOMCTL_PFINFO_XTAB)
+                region_mfn[i] = ~0UL; /* map will fail but we don't care */
+            else 
+            {
+                if (p2m[pfn] == INVALID_P2M_ENTRY) {
+                    /* We just allocated a new mfn above; update p2m */
+                    p2m[pfn] = p2m_batch[nr_mfns++]; 
+                    nr_pfns++; 
+                }
+
+                /* setup region_mfn[] for batch map.
+                 * For HVM guests, this interface takes PFNs, not MFNs */
+                region_mfn[i] = hvm ? pfn : p2m[pfn]; 
+            }
+        } 
+
+        /* Map relevant mfns */
+        region_base = xc_map_foreign_batch(
+            xc_handle, dom, PROT_WRITE, region_mfn, j);
+
+        if ( region_base == NULL )
+        {
+            ERROR("map batch failed");
+            goto out;
+        }
+
+        for ( i = 0; i < j; i++ )
+        {
+            void *page;
+            unsigned long pagetype;
+
+            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+            if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                /* a bogus/unmapped page: skip it */
+                continue;
+
+            if ( pfn > p2m_size )
+            {
+                ERROR("pfn out of range");
+                goto out;
+            }
+
+            pfn_type[pfn] = pagetype;
+
+            mfn = p2m[pfn];
+
+            /* In verify mode, we use a copy; otherwise we work in place */
+            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
+
+            if (!read_exact(io_fd, page, PAGE_SIZE)) {
+                ERROR("Error when reading page (type was %lx)", pagetype);
+                goto out;
+            }
+
+            pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+            if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && 
+                 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+            {
+                /*
+                ** A page table page - need to 'uncanonicalize' it, i.e.
+                ** replace all the references to pfns with the corresponding
+                ** mfns for the new domain.
+                **
+                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
+                ** so we may need to update the p2m after the main loop.
+                ** Hence we defer canonicalization of L1s until then.
+                */
+                if ((pt_levels != 3) ||
+                    pae_extended_cr3 ||
+                    (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
+
+                    if (!uncanonicalize_pagetable(xc_handle, dom, 
+                                                  pagetype, page)) {
+                        /*
+                        ** Failing to uncanonicalize a page table can be ok
+                        ** under live migration since the pages type may have
+                        ** changed by now (and we'll get an update later).
+                        */
+                        DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
+                                pagetype >> 28, pfn, mfn);
+                        nraces++;
+                        continue;
+                    } 
+                }
+            }
+            else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
+            {
+                ERROR("Bogus page type %lx page table is out of range: "
+                    "i=%d p2m_size=%lu", pagetype, i, p2m_size);
+                goto out;
+
+            }
+
+
+            if (verify) {
+
+                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
+
+                if (res) {
+
+                    int v;
+
+                    DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
+                            "actualcs=%08lx\n", pfn, pfn_type[pfn],
+                            csum_page(region_base + i*PAGE_SIZE),
+                            csum_page(buf));
+
+                    for (v = 0; v < 4; v++) {
+
+                        unsigned long *p = (unsigned long *)
+                            (region_base + i*PAGE_SIZE);
+                        if (buf[v] != p[v])
+                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
+                    }
+                }
+            }
+
+            if (!hvm 
+                && xc_add_mmu_update(xc_handle, mmu,
+                                     (((unsigned long long)mfn) << PAGE_SHIFT)
+                                     | MMU_MACHPHYS_UPDATE, pfn)) {
+                ERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
+                goto out;
+            }
+        } /* end of 'batch' for loop */
+
+        munmap(region_base, j*PAGE_SIZE);
+        n+= j; /* crude stats */
+
+        /* 
+         * Discard cache for portion of file read so far up to last
+         *  page boundary every 16MB or so.
+         */
+        m += j;
+        if ( m > MAX_PAGECACHE_USAGE )
+        {
+            discard_file_cache(io_fd, 0 /* no flush */);
+            m = 0;
+        }
+    }
+
+    /*
+     * Ensure we flush all machphys updates before potential PAE-specific
+     * reallocations below.
+     */
+    if (!hvm && xc_finish_mmu_updates(xc_handle, mmu)) {
+        ERROR("Error doing finish_mmu_updates()");
+        goto out;
+    }
+
+    DPRINTF("Received all pages (%d races)\n", nraces);
+
+    if ( hvm ) 
+    {
+        uint32_t rec_len;
+
+        /* Set HVM-specific parameters */
+        if ( !read_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+        {
+            ERROR("error reading magic page addresses");
+            goto out;
+        }
+        
+        /* These comms pages need to be zeroed at the start of day */
+        if ( xc_clear_domain_page(xc_handle, dom, magic_pfns[0]) ||
+             xc_clear_domain_page(xc_handle, dom, magic_pfns[1]) ||
+             xc_clear_domain_page(xc_handle, dom, magic_pfns[2]) )
+        {
+            ERROR("error zeroing magic pages");
+            goto out;
+        }
+        
+        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, magic_pfns[0]);
+        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, magic_pfns[1]);
+        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, magic_pfns[2]);
+        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_PAE_ENABLED, pae);
+        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_EVTCHN, store_evtchn);
+        *store_mfn = magic_pfns[2];
+
+        /* Read vcpu contexts */
+        for (i = 0; i <= max_vcpu_id; i++) 
+        {
+            if (!(vcpumap & (1ULL << i)))
+                continue;
+
+            if ( !read_exact(io_fd, &(ctxt), sizeof(ctxt)) )
+            {
+                ERROR("error read vcpu context.\n");
+                goto out;
+            }
+            
+            if ( (rc = xc_vcpu_setcontext(xc_handle, dom, i, &ctxt)) )
+            {
+                ERROR("Could not set vcpu context, rc=%d", rc);
+                goto out;
+            }
+            rc = 1;
+        }
+
+        /* Read HVM context */
+        if ( !read_exact(io_fd, &rec_len, sizeof(uint32_t)) )
+        {
+            ERROR("error read hvm context size!\n");
+            goto out;
+        }
+        
+        hvm_buf = malloc(rec_len);
+        if ( hvm_buf == NULL )
+        {
+            ERROR("memory alloc for hvm context buffer failed");
+            errno = ENOMEM;
+            goto out;
+        }
+        
+        if ( !read_exact(io_fd, hvm_buf, rec_len) )
+        {
+            ERROR("error loading the HVM context");
+            goto out;
+        }
+        
+        rc = xc_domain_hvm_setcontext(xc_handle, dom, hvm_buf, rec_len);
+        if ( rc ) 
+            ERROR("error setting the HVM context");
+       
+        goto out;
+    }
+
+    /* Non-HVM guests only from here on */
+
+    if ((pt_levels == 3) && !pae_extended_cr3) {
+
+        /*
+        ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
+        ** is a little awkward and involves (a) finding all such PGDs and
+        ** replacing them with 'lowmem' versions; (b) upating the p2m[]
+        ** with the new info; and (c) canonicalizing all the L1s using the
+        ** (potentially updated) p2m[].
+        **
+        ** This is relatively slow (and currently involves two passes through
+        ** the pfn_type[] array), but at least seems to be correct. May wish
+        ** to consider more complex approaches to optimize this later.
+        */
+
+        int j, k;
+        
+        /* First pass: find all L3TABs current in > 4G mfns and get new mfns */
+        for ( i = 0; i < p2m_size; i++ )
+        {
+            if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
+                  XEN_DOMCTL_PFINFO_L3TAB) &&
+                 (p2m[i] > 0xfffffUL) )
+            {
+                unsigned long new_mfn;
+                uint64_t l3ptes[4];
+                uint64_t *l3tab;
+
+                l3tab = (uint64_t *)
+                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                         PROT_READ, p2m[i]);
+
+                for(j = 0; j < 4; j++)
+                    l3ptes[j] = l3tab[j];
+
+                munmap(l3tab, PAGE_SIZE);
+
+                if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
+                    ERROR("Couldn't get a page below 4GB :-(");
+                    goto out;
+                }
+
+                p2m[i] = new_mfn;
+                if (xc_add_mmu_update(xc_handle, mmu,
+                                      (((unsigned long long)new_mfn)
+                                       << PAGE_SHIFT) |
+                                      MMU_MACHPHYS_UPDATE, i)) {
+                    ERROR("Couldn't m2p on PAE root pgdir");
+                    goto out;
+                }
+
+                l3tab = (uint64_t *)
+                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                         PROT_READ | PROT_WRITE, p2m[i]);
+
+                for(j = 0; j < 4; j++)
+                    l3tab[j] = l3ptes[j];
+
+                munmap(l3tab, PAGE_SIZE);
+
+            }
+        }
+
+        /* Second pass: find all L1TABs and uncanonicalize them */
+        j = 0;
+
+        for ( i = 0; i < p2m_size; i++ )
+        {
+            if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
+                  XEN_DOMCTL_PFINFO_L1TAB) )
+            {
+                region_mfn[j] = p2m[i];
+                j++;
+            }
+
+            if(i == (p2m_size-1) || j == MAX_BATCH_SIZE) {
+
+                if (!(region_base = xc_map_foreign_batch(
+                          xc_handle, dom, PROT_READ | PROT_WRITE,
+                          region_mfn, j))) {
+                    ERROR("map batch failed");
+                    goto out;
+                }
+
+                for(k = 0; k < j; k++) {
+                    if(!uncanonicalize_pagetable(xc_handle, dom, 
+                                                 XEN_DOMCTL_PFINFO_L1TAB,
+                                                 region_base + k*PAGE_SIZE)) {
+                        ERROR("failed uncanonicalize pt!");
+                        goto out;
+                    }
+                }
+
+                munmap(region_base, j*PAGE_SIZE);
+                j = 0;
+            }
+        }
+
+        if (xc_finish_mmu_updates(xc_handle, mmu)) {
+            ERROR("Error doing finish_mmu_updates()");
+            goto out;
+        }
+    }
+
+    /*
+     * Pin page tables. Do this after writing to them as otherwise Xen
+     * will barf when doing the type-checking.
+     */
+    nr_pins = 0;
+    for ( i = 0; i < p2m_size; i++ )
+    {
+        if ( (pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
+            continue;
+
+        switch ( pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
+        {
+        case XEN_DOMCTL_PFINFO_L1TAB:
+            pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
+            break;
+
+        case XEN_DOMCTL_PFINFO_L2TAB:
+            pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
+            break;
+
+        case XEN_DOMCTL_PFINFO_L3TAB:
+            pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
+            break;
+
+        case XEN_DOMCTL_PFINFO_L4TAB:
+            pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
+            break;
+
+        default:
+            continue;
+        }
+
+        pin[nr_pins].arg1.mfn = p2m[i];
+        nr_pins++;
+
+        /* Batch full? Then flush. */
+        if (nr_pins == MAX_PIN_BATCH) {
+            if (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) {
+                ERROR("Failed to pin batch of %d page tables", nr_pins);
+                goto out;
+            }
+            nr_pins = 0;
+        }
+    }
+
+    /* Flush final partial batch. */
+    if ((nr_pins != 0) && (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0)) {
+        ERROR("Failed to pin batch of %d page tables", nr_pins);
+        goto out;
+    }
+
+    DPRINTF("\b\b\b\b100%%\n");
+    DPRINTF("Memory reloaded (%ld pages)\n", nr_pfns);
+
+    /* Get the list of PFNs that are not in the psuedo-phys map */
+    {
+        unsigned int count;
+        unsigned long *pfntab;
+        int nr_frees, rc;
+
+        if (!read_exact(io_fd, &count, sizeof(count))) {
+            ERROR("Error when reading pfn count");
+            goto out;
+        }
+
+        if(!(pfntab = malloc(sizeof(unsigned long) * count))) {
+            ERROR("Out of memory");
+            goto out;
+        }
+
+        if (!read_exact(io_fd, pfntab, sizeof(unsigned long)*count)) {
+            ERROR("Error when reading pfntab");
+            goto out;
+        }
+
+        nr_frees = 0; 
+        for (i = 0; i < count; i++) {
+
+            unsigned long pfn = pfntab[i];
+
+            if(p2m[pfn] != INVALID_P2M_ENTRY) {
+                /* pfn is not in physmap now, but was at some point during 
+                   the save/migration process - need to free it */
+                pfntab[nr_frees++] = p2m[pfn];
+                p2m[pfn]  = INVALID_P2M_ENTRY; // not in pseudo-physical map
+            }
+        }
+
+        if (nr_frees > 0) {
+
+            struct xen_memory_reservation reservation = {
+                .nr_extents   = nr_frees,
+                .extent_order = 0,
+                .domid        = dom
+            };
+            set_xen_guest_handle(reservation.extent_start, pfntab);
+
+            if ((rc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
+                                   &reservation)) != nr_frees) {
+                ERROR("Could not decrease reservation : %d", rc);
+                goto out;
+            } else
+                DPRINTF("Decreased reservation by %d pages\n", count);
+        }
+    }
+
+    for (i = 0; i <= max_vcpu_id; i++) {
+        if (!(vcpumap & (1ULL << i)))
+            continue;
+
+        if (!read_exact(io_fd, &ctxt, sizeof(ctxt))) {
+            ERROR("Error when reading ctxt %d", i);
+            goto out;
+        }
+
+        if ( !new_ctxt_format )
+            ctxt.flags |= VGCF_online;
+
+        if (i == 0) {
+            /*
+             * Uncanonicalise the suspend-record frame number and poke
+             * resume record.
+             */
+            pfn = ctxt.user_regs.edx;
+            if ((pfn >= p2m_size) ||
+                (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) {
+                ERROR("Suspend record frame number is bad");
+                goto out;
+            }
+            ctxt.user_regs.edx = mfn = p2m[pfn];
+            start_info = xc_map_foreign_range(
+                xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
+            start_info->nr_pages = p2m_size;
+            start_info->shared_info = shared_info_frame << PAGE_SHIFT;
+            start_info->flags = 0;
+            *store_mfn = start_info->store_mfn = p2m[start_info->store_mfn];
+            start_info->store_evtchn = store_evtchn;
+            start_info->console.domU.mfn = p2m[start_info->console.domU.mfn];
+            start_info->console.domU.evtchn = console_evtchn;
+            *console_mfn = start_info->console.domU.mfn;
+            munmap(start_info, PAGE_SIZE);
+        }
+
+        /* Uncanonicalise each GDT frame number. */
+        if (ctxt.gdt_ents > 8192) {
+            ERROR("GDT entry count out of range");
+            goto out;
+        }
+
+        for (j = 0; (512*j) < ctxt.gdt_ents; j++) {
+            pfn = ctxt.gdt_frames[j];
+            if ((pfn >= p2m_size) ||
+                (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) {
+                ERROR("GDT frame number is bad");
+                goto out;
+            }
+            ctxt.gdt_frames[j] = p2m[pfn];
+        }
+
+        /* Uncanonicalise the page table base pointer. */
+        pfn = xen_cr3_to_pfn(ctxt.ctrlreg[3]);
+
+        if (pfn >= p2m_size) {
+            ERROR("PT base is bad: pfn=%lu p2m_size=%lu type=%08lx",
+                  pfn, p2m_size, pfn_type[pfn]);
+            goto out;
+        }
+
+        if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
+             ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) {
+            ERROR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
+                  pfn, p2m_size, pfn_type[pfn],
+                  (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+            goto out;
+        }
+
+        ctxt.ctrlreg[3] = xen_pfn_to_cr3(p2m[pfn]);
+
+        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
+        if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
+        {
+            pfn = xen_cr3_to_pfn(ctxt.ctrlreg[1]);
+
+            if (pfn >= p2m_size) {
+                ERROR("User PT base is bad: pfn=%lu p2m_size=%lu type=%08lx",
+                      pfn, p2m_size, pfn_type[pfn]);
+                goto out;
+            }
+
+            if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
+                 ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) {
+                ERROR("User PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
+                      pfn, p2m_size, pfn_type[pfn],
+                      (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+                goto out;
+            }
+
+            ctxt.ctrlreg[1] = xen_pfn_to_cr3(p2m[pfn]);
+        }
+
+        domctl.cmd = XEN_DOMCTL_setvcpucontext;
+        domctl.domain = (domid_t)dom;
+        domctl.u.vcpucontext.vcpu = i;
+        set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt);
+        rc = xc_domctl(xc_handle, &domctl);
+        if (rc != 0) {
+            ERROR("Couldn't build vcpu%d", i);
+            goto out;
+        }
+        rc = 1;
+    }
+
+    if (!read_exact(io_fd, shared_info_page, PAGE_SIZE)) {
+        ERROR("Error when reading shared info page");
+        goto out;
+    }
+
+    /* clear any pending events and the selector */
+    memset(&(shared_info->evtchn_pending[0]), 0,
+           sizeof (shared_info->evtchn_pending));
+    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
+        shared_info->vcpu_info[i].evtchn_pending_sel = 0;
+
+    /* Copy saved contents of shared-info page. No checking needed. */
+    page = xc_map_foreign_range(
+        xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
+    memcpy(page, shared_info, PAGE_SIZE);
+    munmap(page, PAGE_SIZE);
+
+    /* Uncanonicalise the pfn-to-mfn table frame-number list. */
+    for (i = 0; i < P2M_FL_ENTRIES; i++) {
+        pfn = p2m_frame_list[i];
+        if ((pfn >= p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) {
+            ERROR("PFN-to-MFN frame number is bad");
+            goto out;
+        }
+
+        p2m_frame_list[i] = p2m[pfn];
+    }
+
+    /* Copy the P2M we've constructed to the 'live' P2M */
+    if (!(live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE,
+                                          p2m_frame_list, P2M_FL_ENTRIES))) {
+        ERROR("Couldn't map p2m table");
+        goto out;
+    }
+
+    memcpy(live_p2m, p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
+    munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
+
+    DPRINTF("Domain ready to be built.\n");
+    rc = 0;
+
+ out:
+    if ( (rc != 0) && (dom != 0) )
+        xc_domain_destroy(xc_handle, dom);
+    free(mmu);
+    free(p2m);
+    free(pfn_type);
+    free(hvm_buf);
+
+    /* discard cache for save file  */
+    discard_file_cache(io_fd, 1 /*flush*/);
+
+    DPRINTF("Restore exit with rc=%d\n", rc);
+    
+    return rc;
+}
diff -r 602d061ff51f -r e518f2fbdd72 tools/libxc/xc_hvm_restore.c
--- a/tools/libxc/xc_hvm_restore.c      Thu Apr 05 10:43:50 2007 +0100
+++ /dev/null   Thu Jan 01 00:00:00 1970 +0000
@@ -1,351 +0,0 @@
-/******************************************************************************
- * xc_hvm_restore.c
- *
- * Restore the state of a HVM guest.
- *
- * Copyright (c) 2003, K A Fraser.
- * Copyright (c) 2006 Intel Corperation
- * rewriten for hvm guest by Zhai Edwin <edwin.zhai@xxxxxxxxx>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- */
-
-#include <stdlib.h>
-#include <unistd.h>
-
-#include "xg_private.h"
-#include "xg_save_restore.h"
-
-#include <xen/hvm/ioreq.h>
-#include <xen/hvm/params.h>
-#include <xen/hvm/e820.h>
-
-static ssize_t
-read_exact(int fd, void *buf, size_t count)
-{
-    int r = 0, s;
-    unsigned char *b = buf;
-
-    while ( r < count )
-    {
-        s = read(fd, &b[r], count - r);
-        if ( (s == -1) && (errno == EINTR) )
-            continue;
-        if ( s <= 0 )
-            break;
-        r += s;
-    }
-
-    return (r == count) ? 1 : 0;
-}
-
-#define BPL (sizeof(long)*8)
-#define test_bit(bit, map) !!((map)[(bit)/BPL] & (1UL << ((bit) % BPL)))
-#define set_bit(bit, map)  ((map)[(bit)/BPL] |= (1UL << ((bit) % BPL)))
-static int test_and_set_bit(unsigned long nr, unsigned long *map)
-{
-    int rc = test_bit(nr, map);
-    if ( !rc )
-        set_bit(nr, map);
-    return rc;
-}
-
-int xc_hvm_restore(int xc_handle, int io_fd, uint32_t dom,
-                   unsigned int store_evtchn, unsigned long *store_mfn,
-                   unsigned int pae, unsigned int apic)
-{
-    DECLARE_DOMCTL;
-
-    /* A copy of the CPU context of the guest. */
-    vcpu_guest_context_t ctxt;
-
-    char *region_base;
-
-    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
-
-    xc_dominfo_t info;
-    unsigned int rc = 1, n, i;
-    uint32_t rec_len, nr_vcpus;
-    uint8_t *hvm_buf = NULL;
-
-    /* Magic frames: ioreqs and xenstore comms. */
-    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
-
-    unsigned long pfn;
-    int verify = 0;
-
-    /* Types of the pfns in the current region */
-    unsigned long region_pfn_type[MAX_BATCH_SIZE];
-    xen_pfn_t pfn_alloc_batch[MAX_BATCH_SIZE];
-    unsigned int pfn_alloc_batch_size;
-
-    /* The size of an array big enough to contain all guest pfns */
-    unsigned long max_pfn = 0xfffffUL; /* initial memory map guess: 4GB */
-    unsigned long *pfn_bitmap = NULL, *new_pfn_bitmap;
-
-    DPRINTF("xc_hvm_restore:dom=%d, store_evtchn=%d, "
-            "pae=%u, apic=%u.\n", dom, store_evtchn, pae, apic);
-
-    DPRINTF("xc_hvm_restore start: max_pfn = %lx\n", max_pfn);
-
-    if ( mlock(&ctxt, sizeof(ctxt)) )
-    {
-        /* needed for build dom0 op, but might as well do early */
-        ERROR("Unable to mlock ctxt");
-        return 1;
-    }
-
-    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
-    {
-        ERROR("Could not get domain info");
-        return 1;
-    }
-
-    domctl.cmd = XEN_DOMCTL_getdomaininfo;
-    domctl.domain = (domid_t)dom;
-    if ( xc_domctl(xc_handle, &domctl) < 0 )
-    {
-        ERROR("Could not get information on new domain");
-        goto out;
-    }
-
-    pfn_bitmap = calloc((max_pfn+1)/8, 1);
-    if ( pfn_bitmap == NULL )
-    {
-        ERROR("Could not allocate pfn bitmap");
-        goto out;
-    }
-
-    n = 0;
-    for ( ; ; )
-    {
-        int j;
-
-        if ( !read_exact(io_fd, &j, sizeof(int)) )
-        {
-            ERROR("HVM restore Error when reading batch size");
-            goto out;
-        }
-
-        PPRINTF("batch %d\n",j);
-
-        if ( j == -1 )
-        {
-            verify = 1;
-            DPRINTF("Entering page verify mode\n");
-            continue;
-        }
-
-        if ( j == 0 )
-            break;  /* our work here is done */
-
-        if ( j > MAX_BATCH_SIZE )
-        {
-            ERROR("Max batch size exceeded. Giving up.");
-            goto out;
-        }
-
-        if ( !read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) )
-        {
-            ERROR("Error when reading region pfn types");
-            goto out;
-        }
-
-        pfn_alloc_batch_size = 0;
-        for ( i = 0; i < j; i++ )
-        {
-            pfn = region_pfn_type[i];
-            if ( pfn & XEN_DOMCTL_PFINFO_LTAB_MASK )
-                continue;
-
-            while ( pfn > max_pfn )
-            {
-                if ( max_pfn >= 0xfffffff )
-                {
-                    ERROR("Maximum PFN beyond reason (1TB) %lx\n", pfn);
-                    goto out;
-                }
-                max_pfn = 2*max_pfn + 1;
-                new_pfn_bitmap = realloc(pfn_bitmap, (max_pfn+1)/8);
-                if ( new_pfn_bitmap == NULL )
-                {
-                    ERROR("Could not realloc pfn bitmap for max_pfn=%lx\n",
-                          max_pfn);
-                    goto out;
-                }
-                pfn_bitmap = new_pfn_bitmap;
-                memset(&pfn_bitmap[(max_pfn+1)/(2*BPL)], 0, (max_pfn+1)/(2*8));
-            }
-
-            if ( !test_and_set_bit(pfn, pfn_bitmap) )
-                pfn_alloc_batch[pfn_alloc_batch_size++] = pfn;
-        }
-
-        if ( pfn_alloc_batch_size != 0 )
-        {
-             rc = xc_domain_memory_populate_physmap(
-                 xc_handle, dom, pfn_alloc_batch_size, 0, 0, pfn_alloc_batch);
-             if ( rc != 0 )
-             {
-                 PERROR("Could not allocate %u pages for HVM guest.\n",
-                        pfn_alloc_batch_size);
-                 goto out;
-             }
-        }
-
-        region_base = xc_map_foreign_batch(
-            xc_handle, dom, PROT_WRITE, region_pfn_type, j);
-
-        for ( i = 0; i < j; i++ )
-        {
-            void *page;
-
-            pfn = region_pfn_type[i];
-            if ( pfn & XEN_DOMCTL_PFINFO_LTAB_MASK )
-                continue;
-
-            /* In verify mode, we use a copy; otherwise we work in place */
-            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
-
-            if ( !read_exact(io_fd, page, PAGE_SIZE) )
-            {
-                ERROR("Error when reading page (%x)", i);
-                goto out;
-            }
-
-            if ( verify )
-            {
-                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
-                if ( res )
-                {
-                    int v;
-
-                    DPRINTF("************** pfn=%lx gotcs=%08lx "
-                            "actualcs=%08lx\n", pfn, 
-                            csum_page(region_base + i*PAGE_SIZE),
-                            csum_page(buf));
-
-                    for ( v = 0; v < 4; v++ )
-                    {
-                        unsigned long *p = (unsigned long *)
-                            (region_base + i*PAGE_SIZE);
-                        if (buf[v] != p[v])
-                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
-                    }
-                }
-            }
-
-        } /* end of 'batch' for loop */
-
-        munmap(region_base, j*PAGE_SIZE);
-        n += j; /* crude stats */
-    }
-    
-    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_PAE_ENABLED, pae);
-    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_EVTCHN, store_evtchn);
-
-    if ( !read_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
-    {
-        ERROR("error reading magic page addresses\n");
-        goto out;
-    }
-
-    if ( xc_clear_domain_page(xc_handle, dom, magic_pfns[0]) ||
-         xc_clear_domain_page(xc_handle, dom, magic_pfns[1]) ||
-         xc_clear_domain_page(xc_handle, dom, magic_pfns[2]) )
-    {
-        rc = -1;
-        goto out;
-    }
-
-    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, magic_pfns[0]);
-    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, magic_pfns[1]);
-    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, magic_pfns[2]);
-    *store_mfn = magic_pfns[2];
-    DPRINTF("hvm restore: calculate new store_mfn=0x%lx.\n", *store_mfn);
-
-    if ( !read_exact(io_fd, &nr_vcpus, sizeof(uint32_t)) )
-    {
-        ERROR("error read nr vcpu !\n");
-        goto out;
-    }
-    DPRINTF("hvm restore:get nr_vcpus=%d.\n", nr_vcpus);
-
-    for ( i = 0; i < nr_vcpus; i++ )
-    {
-        if ( !read_exact(io_fd, &rec_len, sizeof(uint32_t)) )
-        {
-            ERROR("error read vcpu context size!\n");
-            goto out;
-        }
-        if ( rec_len != sizeof(ctxt) )
-        {
-            ERROR("vcpu context size dismatch!\n");
-            goto out;
-        }
-
-        if ( !read_exact(io_fd, &(ctxt), sizeof(ctxt)) )
-        {
-            ERROR("error read vcpu context.\n");
-            goto out;
-        }
-
-        if ( (rc = xc_vcpu_setcontext(xc_handle, dom, i, &ctxt)) )
-        {
-            ERROR("Could not set vcpu context, rc=%d", rc);
-            goto out;
-        }
-    }
-
-    /* restore hvm context including pic/pit/shpage */
-    if ( !read_exact(io_fd, &rec_len, sizeof(uint32_t)) )
-    {
-        ERROR("error read hvm context size!\n");
-        goto out;
-    }
-
-    hvm_buf = malloc(rec_len);
-    if ( hvm_buf == NULL )
-    {
-        ERROR("memory alloc for hvm context buffer failed");
-        errno = ENOMEM;
-        goto out;
-    }
-
-    if ( !read_exact(io_fd, hvm_buf, rec_len) )
-    {
-        ERROR("error read hvm buffer!\n");
-        goto out;
-    }
-
-    if ( (rc = xc_domain_hvm_setcontext(xc_handle, dom, hvm_buf, rec_len)) )
-    {
-        ERROR("error set hvm buffer!\n");
-        goto out;
-    }
-
-    rc = 0;
-    goto out;
-
- out:
-    if ( (rc != 0) && (dom != 0) )
-        xc_domain_destroy(xc_handle, dom);
-    free(hvm_buf);
-    free(pfn_bitmap);
-
-    DPRINTF("Restore exit with rc=%d\n", rc);
-
-    return rc;
-}
diff -r 602d061ff51f -r e518f2fbdd72 tools/libxc/xc_hvm_save.c
--- a/tools/libxc/xc_hvm_save.c Thu Apr 05 10:43:50 2007 +0100
+++ b/tools/libxc/xc_hvm_save.c Thu Apr 05 15:11:22 2007 +0100
@@ -305,6 +305,8 @@ int xc_hvm_save(int xc_handle, int io_fd
 
     unsigned long total_sent = 0;
 
+    uint64_t vcpumap = 1ULL;
+
     DPRINTF("xc_hvm_save: dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, "
             "live=%d, debug=%d.\n", dom, max_iters, max_factor, flags,
             live, debug);
@@ -371,6 +373,12 @@ int xc_hvm_save(int xc_handle, int io_fd
 
     /* Size of any array that covers 0 ... max_pfn */
     pfn_array_size = max_pfn + 1;
+    if ( !write_exact(io_fd, &pfn_array_size, sizeof(unsigned long)) )
+    {
+        ERROR("Error when writing to state file (1)");
+        goto out;
+    }
+    
 
     /* pretend we sent all the pages last iteration */
     sent_last_iter = pfn_array_size;
@@ -644,6 +652,32 @@ int xc_hvm_save(int xc_handle, int io_fd
 
     DPRINTF("All HVM memory is saved\n");
 
+    {
+        struct {
+            int minustwo;
+            int max_vcpu_id;
+            uint64_t vcpumap;
+        } chunk = { -2, info.max_vcpu_id };
+
+        if (info.max_vcpu_id >= 64) {
+            ERROR("Too many VCPUS in guest!");
+            goto out;
+        }
+
+        for (i = 1; i <= info.max_vcpu_id; i++) {
+            xc_vcpuinfo_t vinfo;
+            if ((xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
+                vinfo.online)
+                vcpumap |= 1ULL << i;
+        }
+
+        chunk.vcpumap = vcpumap;
+        if(!write_exact(io_fd, &chunk, sizeof(chunk))) {
+            ERROR("Error when writing to state file (errno %d)", errno);
+            goto out;
+        }
+    }
+
     /* Zero terminate */
     i = 0;
     if ( !write_exact(io_fd, &i, sizeof(int)) )
@@ -666,33 +700,22 @@ int xc_hvm_save(int xc_handle, int io_fd
         goto out;
     }
 
-    /* save vcpu/vmcs context */
-    if ( !write_exact(io_fd, &nr_vcpus, sizeof(uint32_t)) )
-    {
-        ERROR("error write nr vcpus");
-        goto out;
-    }
-
-    /*XXX: need a online map to exclude down cpu */
+    /* save vcpu/vmcs contexts */
     for ( i = 0; i < nr_vcpus; i++ )
     {
+        if (!(vcpumap & (1ULL << i)))
+            continue;
+
         if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
         {
             ERROR("HVM:Could not get vcpu context");
             goto out;
         }
 
-        rec_size = sizeof(ctxt);
-        DPRINTF("write %d vcpucontext of total %d.\n", i, nr_vcpus); 
-        if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
-        {
-            ERROR("error write vcpu ctxt size");
-            goto out;
-        }
-
+        DPRINTF("write vcpu %d context.\n", i); 
         if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
         {
-            ERROR("write vmcs failed!\n");
+            ERROR("write vcpu context failed!\n");
             goto out;
         }
     }
diff -r 602d061ff51f -r e518f2fbdd72 tools/libxc/xc_linux_restore.c
--- a/tools/libxc/xc_linux_restore.c    Thu Apr 05 10:43:50 2007 +0100
+++ /dev/null   Thu Jan 01 00:00:00 1970 +0000
@@ -1,955 +0,0 @@
-/******************************************************************************
- * xc_linux_restore.c
- *
- * Restore the state of a Linux session.
- *
- * Copyright (c) 2003, K A Fraser.
- */
-
-#include <stdlib.h>
-#include <unistd.h>
-
-#include "xg_private.h"
-#include "xg_save_restore.h"
-#include "xc_dom.h"
-
-/* max mfn of the current host machine */
-static unsigned long max_mfn;
-
-/* virtual starting address of the hypervisor */
-static unsigned long hvirt_start;
-
-/* #levels of page tables used by the current guest */
-static unsigned int pt_levels;
-
-/* number of pfns this guest has (i.e. number of entries in the P2M) */
-static unsigned long p2m_size;
-
-/* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */
-static unsigned long nr_pfns;
-
-/* Live mapping of the table mapping each PFN to its current MFN. */
-static xen_pfn_t *live_p2m = NULL;
-
-/* A table mapping each PFN to its new MFN. */
-static xen_pfn_t *p2m = NULL;
-
-/* A table of P2M mappings in the current region */
-static xen_pfn_t *p2m_batch = NULL;
-
-static ssize_t
-read_exact(int fd, void *buf, size_t count)
-{
-    int r = 0, s;
-    unsigned char *b = buf;
-
-    while (r < count) {
-        s = read(fd, &b[r], count - r);
-        if ((s == -1) && (errno == EINTR))
-            continue;
-        if (s <= 0) {
-            break;
-        }
-        r += s;
-    }
-
-    return (r == count) ? 1 : 0;
-}
-
-/*
-** In the state file (or during transfer), all page-table pages are
-** converted into a 'canonical' form where references to actual mfns
-** are replaced with references to the corresponding pfns.
-** This function inverts that operation, replacing the pfn values with
-** the (now known) appropriate mfn values.
-*/
-static int uncanonicalize_pagetable(int xc_handle, uint32_t dom, 
-                                    unsigned long type, void *page)
-{
-    int i, pte_last;
-    unsigned long pfn;
-    uint64_t pte;
-    int nr_mfns = 0; 
-
-    pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);
-
-    /* First pass: work out how many (if any) MFNs we need to alloc */
-    for(i = 0; i < pte_last; i++) {
-        
-        if(pt_levels == 2)
-            pte = ((uint32_t *)page)[i];
-        else
-            pte = ((uint64_t *)page)[i];
-        
-        /* XXX SMH: below needs fixing for PROT_NONE etc */
-        if(!(pte & _PAGE_PRESENT))
-            continue; 
-        
-        pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
-        
-        if(pfn >= p2m_size) {
-            /* This "page table page" is probably not one; bail. */
-            ERROR("Frame number in type %lu page table is out of range: "
-                  "i=%d pfn=0x%lx p2m_size=%lu",
-                  type >> 28, i, pfn, p2m_size);
-            return 0;
-        }
-        
-        if(p2m[pfn] == INVALID_P2M_ENTRY) {
-            /* Have a 'valid' PFN without a matching MFN - need to alloc */
-            p2m_batch[nr_mfns++] = pfn; 
-        }
-    }
-    
-    
-    /* Alllocate the requistite number of mfns */
-    if (nr_mfns && xc_domain_memory_populate_physmap(
-            xc_handle, dom, nr_mfns, 0, 0, p2m_batch) != 0) { 
-        ERROR("Failed to allocate memory for batch.!\n"); 
-        errno = ENOMEM;
-        return 0; 
-    }
-    
-    /* Second pass: uncanonicalize each present PTE */
-    nr_mfns = 0;
-    for(i = 0; i < pte_last; i++) {
-
-        if(pt_levels == 2)
-            pte = ((uint32_t *)page)[i];
-        else
-            pte = ((uint64_t *)page)[i];
-        
-        /* XXX SMH: below needs fixing for PROT_NONE etc */
-        if(!(pte & _PAGE_PRESENT))
-            continue;
-        
-        pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
-        
-        if(p2m[pfn] == INVALID_P2M_ENTRY)
-            p2m[pfn] = p2m_batch[nr_mfns++];
-
-        pte &= ~MADDR_MASK_X86;
-        pte |= (uint64_t)p2m[pfn] << PAGE_SHIFT;
-
-        if(pt_levels == 2)
-            ((uint32_t *)page)[i] = (uint32_t)pte;
-        else
-            ((uint64_t *)page)[i] = (uint64_t)pte;
-    }
-
-    return 1;
-}
-
-
-int xc_linux_restore(int xc_handle, int io_fd, uint32_t dom,
-                     unsigned int store_evtchn, unsigned long *store_mfn,
-                     unsigned int console_evtchn, unsigned long *console_mfn)
-{
-    DECLARE_DOMCTL;
-    int rc = 1, i, j, n, m, pae_extended_cr3 = 0;
-    unsigned long mfn, pfn;
-    unsigned int prev_pc, this_pc;
-    int verify = 0;
-    int nraces = 0;
-
-    /* The new domain's shared-info frame number. */
-    unsigned long shared_info_frame;
-    unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
-    shared_info_t *shared_info = (shared_info_t *)shared_info_page;
-
-    /* A copy of the CPU context of the guest. */
-    vcpu_guest_context_t ctxt;
-
-    /* A table containing the type of each PFN (/not/ MFN!). */
-    unsigned long *pfn_type = NULL;
-
-    /* A table of MFNs to map in the current region */
-    xen_pfn_t *region_mfn = NULL;
-
-    /* Types of the pfns in the current region */
-    unsigned long region_pfn_type[MAX_BATCH_SIZE];
-
-    /* A temporary mapping, and a copy, of one frame of guest memory. */
-    unsigned long *page = NULL;
-
-    /* A copy of the pfn-to-mfn table frame list. */
-    xen_pfn_t *p2m_frame_list = NULL;
-
-    /* A temporary mapping of the guest's start_info page. */
-    start_info_t *start_info;
-
-    /* Our mapping of the current region (batch) */
-    char *region_base;
-
-    xc_mmu_t *mmu = NULL;
-
-    /* used by debug verify code */
-    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
-
-    struct mmuext_op pin[MAX_PIN_BATCH];
-    unsigned int nr_pins;
-
-    uint64_t vcpumap = 1ULL;
-    unsigned int max_vcpu_id = 0;
-    int new_ctxt_format = 0;
-
-    /* For info only */
-    nr_pfns = 0;
-
-    if ( !read_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
-    {
-        ERROR("read: p2m_size");
-        goto out;
-    }
-    DPRINTF("xc_linux_restore start: p2m_size = %lx\n", p2m_size);
-
-    /*
-     * XXX For now, 32bit dom0's can only save/restore 32bit domUs
-     * on 64bit hypervisors.
-     */
-    memset(&domctl, 0, sizeof(domctl));
-    domctl.domain = dom;
-    domctl.cmd    = XEN_DOMCTL_set_address_size;
-    domctl.u.address_size.size = sizeof(unsigned long) * 8;
-    rc = do_domctl(xc_handle, &domctl);
-    if ( rc != 0 ) {
-       ERROR("Unable to set guest address size.");
-       goto out;
-    }
-
-    if(!get_platform_info(xc_handle, dom,
-                          &max_mfn, &hvirt_start, &pt_levels)) {
-        ERROR("Unable to get platform info.");
-        return 1;
-    }
-
-    if (lock_pages(&ctxt, sizeof(ctxt))) {
-        /* needed for build domctl, but might as well do early */
-        ERROR("Unable to lock ctxt");
-        return 1;
-    }
-
-    if (!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
-        ERROR("Couldn't allocate p2m_frame_list array");
-        goto out;
-    }
-
-    /* Read first entry of P2M list, or extended-info signature (~0UL). */
-    if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) {
-        ERROR("read extended-info signature failed");
-        goto out;
-    }
-
-    if (p2m_frame_list[0] == ~0UL) {
-        uint32_t tot_bytes;
-
-        /* Next 4 bytes: total size of following extended info. */
-        if (!read_exact(io_fd, &tot_bytes, sizeof(tot_bytes))) {
-            ERROR("read extended-info size failed");
-            goto out;
-        }
-
-        while (tot_bytes) {
-            uint32_t chunk_bytes;
-            char     chunk_sig[4];
-
-            /* 4-character chunk signature + 4-byte remaining chunk size. */
-            if (!read_exact(io_fd, chunk_sig, sizeof(chunk_sig)) ||
-                !read_exact(io_fd, &chunk_bytes, sizeof(chunk_bytes))) {
-                ERROR("read extended-info chunk signature failed");
-                goto out;
-            }
-            tot_bytes -= 8;
-
-            /* VCPU context structure? */
-            if (!strncmp(chunk_sig, "vcpu", 4)) {
-                if (!read_exact(io_fd, &ctxt, sizeof(ctxt))) {
-                    ERROR("read extended-info vcpu context failed");
-                    goto out;
-                }
-                tot_bytes   -= sizeof(struct vcpu_guest_context);
-                chunk_bytes -= sizeof(struct vcpu_guest_context);
-
-                if (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))
-                    pae_extended_cr3 = 1;
-            }
-
-            /* Any remaining bytes of this chunk: read and discard. */
-            while (chunk_bytes) {
-                unsigned long sz = chunk_bytes;
-                if ( sz > P2M_FL_SIZE )
-                    sz = P2M_FL_SIZE;
-                if (!read_exact(io_fd, p2m_frame_list, sz)) {
-                    ERROR("read-and-discard extended-info chunk bytes failed");
-                    goto out;
-                }
-                chunk_bytes -= sz;
-                tot_bytes   -= sz;
-            }
-        }
-
-        /* Now read the real first entry of P2M list. */
-        if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) {
-            ERROR("read first entry of p2m_frame_list failed");
-            goto out;
-        }
-    }
-
-    /* First entry is already read into the p2m array. */
-    if (!read_exact(io_fd, &p2m_frame_list[1], P2M_FL_SIZE - sizeof(long))) {
-        ERROR("read p2m_frame_list failed");
-        goto out;
-    }
-
-    /* We want zeroed memory so use calloc rather than malloc. */
-    p2m        = calloc(p2m_size, sizeof(xen_pfn_t));
-    pfn_type   = calloc(p2m_size, sizeof(unsigned long));
-    region_mfn = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
-    p2m_batch  = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
-
-    if ((p2m == NULL) || (pfn_type == NULL) ||
-        (region_mfn == NULL) || (p2m_batch == NULL)) {
-        ERROR("memory alloc failed");
-        errno = ENOMEM;
-        goto out;
-    }
-
-    if (lock_pages(region_mfn, sizeof(xen_pfn_t) * MAX_BATCH_SIZE)) {
-        ERROR("Could not lock region_mfn");
-        goto out;
-    }
-
-    if (lock_pages(p2m_batch, sizeof(xen_pfn_t) * MAX_BATCH_SIZE)) {
-        ERROR("Could not lock p2m_batch");
-        goto out;
-    }
-
-    /* Get the domain's shared-info frame. */
-    domctl.cmd = XEN_DOMCTL_getdomaininfo;
-    domctl.domain = (domid_t)dom;
-    if (xc_domctl(xc_handle, &domctl) < 0) {
-        ERROR("Could not get information on new domain");
-        goto out;
-    }
-    shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;
-
-    /* Mark all PFNs as invalid; we allocate on demand */
-    for ( pfn = 0; pfn < p2m_size; pfn++ )
-        p2m[pfn] = INVALID_P2M_ENTRY;
-
-    if(!(mmu = xc_init_mmu_updates(xc_handle, dom))) {
-        ERROR("Could not initialise for MMU updates");
-        goto out;
-    }
-
-    DPRINTF("Reloading memory pages:   0%%\n");
-
-    /*
-     * Now simply read each saved frame into its new machine frame.
-     * We uncanonicalise page tables as we go.
-     */
-    prev_pc = 0;
-
-    n = m = 0;
-    while (1) {
-
-        int j, nr_mfns = 0; 
-
-        this_pc = (n * 100) / p2m_size;
-        if ( (this_pc - prev_pc) >= 5 )
-        {
-            PPRINTF("\b\b\b\b%3d%%", this_pc);
-            prev_pc = this_pc;
-        }
-
-        if (!read_exact(io_fd, &j, sizeof(int))) {
-            ERROR("Error when reading batch size");
-            goto out;
-        }
-
-        PPRINTF("batch %d\n",j);
-
-        if (j == -1) {
-            verify = 1;
-            DPRINTF("Entering page verify mode\n");
-            continue;
-        }
-
-        if (j == -2) {
-            new_ctxt_format = 1;
-            if (!read_exact(io_fd, &max_vcpu_id, sizeof(int)) ||
-                (max_vcpu_id >= 64) ||
-                !read_exact(io_fd, &vcpumap, sizeof(uint64_t))) {
-                ERROR("Error when reading max_vcpu_id");
-                goto out;
-            }
-            continue;
-        }
-
-        if (j == 0)
-            break;  /* our work here is done */
-
-        if (j > MAX_BATCH_SIZE) {
-            ERROR("Max batch size exceeded. Giving up.");
-            goto out;
-        }
-
-        if (!read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long))) {
-            ERROR("Error when reading region pfn types");
-            goto out;
-        }
-
-        /* First pass for this batch: work out how much memory to alloc */
-        nr_mfns = 0; 
-        for ( i = 0; i < j; i++ )
-        {
-            unsigned long pfn, pagetype;
-            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
-            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
-
-            if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) && 
-                 (p2m[pfn] == INVALID_P2M_ENTRY) )
-            {
-                /* Have a live PFN which hasn't had an MFN allocated */
-                p2m_batch[nr_mfns++] = pfn; 
-            }
-        } 
-
-
-        /* Now allocate a bunch of mfns for this batch */
-        if (nr_mfns && xc_domain_memory_populate_physmap(
-                xc_handle, dom, nr_mfns, 0, 0, p2m_batch) != 0) { 
-            ERROR("Failed to allocate memory for batch.!\n"); 
-            errno = ENOMEM;
-            goto out;
-        }
-
-        /* Second pass for this batch: update p2m[] and region_mfn[] */
-        nr_mfns = 0; 
-        for ( i = 0; i < j; i++ )
-        {
-            unsigned long pfn, pagetype;
-            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
-            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
-
-            if ( pagetype == XEN_DOMCTL_PFINFO_XTAB)
-                region_mfn[i] = ~0UL; /* map will fail but we don't care */
-            else 
-            {
-                if (p2m[pfn] == INVALID_P2M_ENTRY) {
-                    /* We just allocated a new mfn above; update p2m */
-                    p2m[pfn] = p2m_batch[nr_mfns++]; 
-                    nr_pfns++; 
-                }
-
-                /* setup region_mfn[] for batch map */
-                region_mfn[i] = p2m[pfn]; 
-            }
-        } 
-
-        /* Map relevant mfns */
-        region_base = xc_map_foreign_batch(
-            xc_handle, dom, PROT_WRITE, region_mfn, j);
-
-        if ( region_base == NULL )
-        {
-            ERROR("map batch failed");
-            goto out;
-        }
-
-        for ( i = 0; i < j; i++ )
-        {
-            void *page;
-            unsigned long pagetype;
-
-            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
-            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
-
-            if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
-                /* a bogus/unmapped page: skip it */
-                continue;
-
-            if ( pfn > p2m_size )
-            {
-                ERROR("pfn out of range");
-                goto out;
-            }
-
-            pfn_type[pfn] = pagetype;
-
-            mfn = p2m[pfn];
-
-            /* In verify mode, we use a copy; otherwise we work in place */
-            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
-
-            if (!read_exact(io_fd, page, PAGE_SIZE)) {
-                ERROR("Error when reading page (type was %lx)", pagetype);
-                goto out;
-            }
-
-            pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
-            if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && 
-                 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
-            {
-                /*
-                ** A page table page - need to 'uncanonicalize' it, i.e.
-                ** replace all the references to pfns with the corresponding
-                ** mfns for the new domain.
-                **
-                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
-                ** so we may need to update the p2m after the main loop.
-                ** Hence we defer canonicalization of L1s until then.
-                */
-                if ((pt_levels != 3) ||
-                    pae_extended_cr3 ||
-                    (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
-
-                    if (!uncanonicalize_pagetable(xc_handle, dom, 
-                                                  pagetype, page)) {
-                        /*
-                        ** Failing to uncanonicalize a page table can be ok
-                        ** under live migration since the pages type may have
-                        ** changed by now (and we'll get an update later).
-                        */
-                        DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
-                                pagetype >> 28, pfn, mfn);
-                        nraces++;
-                        continue;
-                    } 
-                }
-            }
-            else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
-            {
-                ERROR("Bogus page type %lx page table is out of range: "
-                    "i=%d p2m_size=%lu", pagetype, i, p2m_size);
-                goto out;
-
-            }
-
-
-            if (verify) {
-
-                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
-
-                if (res) {
-
-                    int v;
-
-                    DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
-                            "actualcs=%08lx\n", pfn, pfn_type[pfn],
-                            csum_page(region_base + i*PAGE_SIZE),
-                            csum_page(buf));
-
-                    for (v = 0; v < 4; v++) {
-
-                        unsigned long *p = (unsigned long *)
-                            (region_base + i*PAGE_SIZE);
-                        if (buf[v] != p[v])
-                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
-                    }
-                }
-            }
-
-            if (xc_add_mmu_update(xc_handle, mmu,
-                                  (((unsigned long long)mfn) << PAGE_SHIFT)
-                                  | MMU_MACHPHYS_UPDATE, pfn)) {
-                ERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
-                goto out;
-            }
-        } /* end of 'batch' for loop */
-
-        munmap(region_base, j*PAGE_SIZE);
-        n+= j; /* crude stats */
-
-        /* 
-         * Discard cache for portion of file read so far up to last
-         *  page boundary every 16MB or so.
-         */
-        m += j;
-        if ( m > MAX_PAGECACHE_USAGE )
-        {
-            discard_file_cache(io_fd, 0 /* no flush */);
-            m = 0;
-        }
-    }
-
-    /*
-     * Ensure we flush all machphys updates before potential PAE-specific
-     * reallocations below.
-     */
-    if (xc_finish_mmu_updates(xc_handle, mmu)) {
-        ERROR("Error doing finish_mmu_updates()");
-        goto out;
-    }
-
-    DPRINTF("Received all pages (%d races)\n", nraces);
-
-    if ((pt_levels == 3) && !pae_extended_cr3) {
-
-        /*
-        ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
-        ** is a little awkward and involves (a) finding all such PGDs and
-        ** replacing them with 'lowmem' versions; (b) upating the p2m[]
-        ** with the new info; and (c) canonicalizing all the L1s using the
-        ** (potentially updated) p2m[].
-        **
-        ** This is relatively slow (and currently involves two passes through
-        ** the pfn_type[] array), but at least seems to be correct. May wish
-        ** to consider more complex approaches to optimize this later.
-        */
-
-        int j, k;
-        
-        /* First pass: find all L3TABs current in > 4G mfns and get new mfns */
-        for ( i = 0; i < p2m_size; i++ )
-        {
-            if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
-                  XEN_DOMCTL_PFINFO_L3TAB) &&
-                 (p2m[i] > 0xfffffUL) )
-            {
-                unsigned long new_mfn;
-                uint64_t l3ptes[4];
-                uint64_t *l3tab;
-
-                l3tab = (uint64_t *)
-                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
-                                         PROT_READ, p2m[i]);
-
-                for(j = 0; j < 4; j++)
-                    l3ptes[j] = l3tab[j];
-
-                munmap(l3tab, PAGE_SIZE);
-
-                if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
-                    ERROR("Couldn't get a page below 4GB :-(");
-                    goto out;
-                }
-
-                p2m[i] = new_mfn;
-                if (xc_add_mmu_update(xc_handle, mmu,
-                                      (((unsigned long long)new_mfn)
-                                       << PAGE_SHIFT) |
-                                      MMU_MACHPHYS_UPDATE, i)) {
-                    ERROR("Couldn't m2p on PAE root pgdir");
-                    goto out;
-                }
-
-                l3tab = (uint64_t *)
-                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
-                                         PROT_READ | PROT_WRITE, p2m[i]);
-
-                for(j = 0; j < 4; j++)
-                    l3tab[j] = l3ptes[j];
-
-                munmap(l3tab, PAGE_SIZE);
-
-            }
-        }
-
-        /* Second pass: find all L1TABs and uncanonicalize them */
-        j = 0;
-
-        for ( i = 0; i < p2m_size; i++ )
-        {
-            if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
-                  XEN_DOMCTL_PFINFO_L1TAB) )
-            {
-                region_mfn[j] = p2m[i];
-                j++;
-            }
-
-            if(i == (p2m_size-1) || j == MAX_BATCH_SIZE) {
-
-                if (!(region_base = xc_map_foreign_batch(
-                          xc_handle, dom, PROT_READ | PROT_WRITE,
-                          region_mfn, j))) {
-                    ERROR("map batch failed");
-                    goto out;
-                }
-
-                for(k = 0; k < j; k++) {
-                    if(!uncanonicalize_pagetable(xc_handle, dom, 
-                                                 XEN_DOMCTL_PFINFO_L1TAB,
-                                                 region_base + k*PAGE_SIZE)) {
-                        ERROR("failed uncanonicalize pt!");
-                        goto out;
-                    }
-                }
-
-                munmap(region_base, j*PAGE_SIZE);
-                j = 0;
-            }
-        }
-
-        if (xc_finish_mmu_updates(xc_handle, mmu)) {
-            ERROR("Error doing finish_mmu_updates()");
-            goto out;
-        }
-    }
-
-    /*
-     * Pin page tables. Do this after writing to them as otherwise Xen
-     * will barf when doing the type-checking.
-     */
-    nr_pins = 0;
-    for ( i = 0; i < p2m_size; i++ )
-    {
-        if ( (pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
-            continue;
-
-        switch ( pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
-        {
-        case XEN_DOMCTL_PFINFO_L1TAB:
-            pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
-            break;
-
-        case XEN_DOMCTL_PFINFO_L2TAB:
-            pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
-            break;
-
-        case XEN_DOMCTL_PFINFO_L3TAB:
-            pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
-            break;
-
-        case XEN_DOMCTL_PFINFO_L4TAB:
-            pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
-            break;
-
-        default:
-            continue;
-        }
-
-        pin[nr_pins].arg1.mfn = p2m[i];
-        nr_pins++;
-
-        /* Batch full? Then flush. */
-        if (nr_pins == MAX_PIN_BATCH) {
-            if (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) {
-                ERROR("Failed to pin batch of %d page tables", nr_pins);
-                goto out;
-            }
-            nr_pins = 0;
-        }
-    }
-
-    /* Flush final partial batch. */
-    if ((nr_pins != 0) && (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0)) {
-        ERROR("Failed to pin batch of %d page tables", nr_pins);
-        goto out;
-    }
-
-    DPRINTF("\b\b\b\b100%%\n");
-    DPRINTF("Memory reloaded (%ld pages)\n", nr_pfns);
-
-    /* Get the list of PFNs that are not in the psuedo-phys map */
-    {
-        unsigned int count;
-        unsigned long *pfntab;
-        int nr_frees, rc;
-
-        if (!read_exact(io_fd, &count, sizeof(count))) {
-            ERROR("Error when reading pfn count");
-            goto out;
-        }
-
-        if(!(pfntab = malloc(sizeof(unsigned long) * count))) {
-            ERROR("Out of memory");
-            goto out;
-        }
-
-        if (!read_exact(io_fd, pfntab, sizeof(unsigned long)*count)) {
-            ERROR("Error when reading pfntab");
-            goto out;
-        }
-
-        nr_frees = 0; 
-        for (i = 0; i < count; i++) {
-
-            unsigned long pfn = pfntab[i];
-
-            if(p2m[pfn] != INVALID_P2M_ENTRY) {
-                /* pfn is not in physmap now, but was at some point during 
-                   the save/migration process - need to free it */
-                pfntab[nr_frees++] = p2m[pfn];
-                p2m[pfn]  = INVALID_P2M_ENTRY; // not in pseudo-physical map
-            }
-        }
-
-        if (nr_frees > 0) {
-
-            struct xen_memory_reservation reservation = {
-                .nr_extents   = nr_frees,
-                .extent_order = 0,
-                .domid        = dom
-            };
-            set_xen_guest_handle(reservation.extent_start, pfntab);
-
-            if ((rc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
-                                   &reservation)) != nr_frees) {
-                ERROR("Could not decrease reservation : %d", rc);
-                goto out;
-            } else
-                DPRINTF("Decreased reservation by %d pages\n", count);
-        }
-    }
-
-    for (i = 0; i <= max_vcpu_id; i++) {
-        if (!(vcpumap & (1ULL << i)))
-            continue;
-
-        if (!read_exact(io_fd, &ctxt, sizeof(ctxt))) {
-            ERROR("Error when reading ctxt %d", i);
-            goto out;
-        }
-
-        if ( !new_ctxt_format )
-            ctxt.flags |= VGCF_online;
-
-        if (i == 0) {
-            /*
-             * Uncanonicalise the suspend-record frame number and poke
-             * resume record.
-             */
-            pfn = ctxt.user_regs.edx;
-            if ((pfn >= p2m_size) ||
-                (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) {
-                ERROR("Suspend record frame number is bad");
-                goto out;
-            }
-            ctxt.user_regs.edx = mfn = p2m[pfn];
-            start_info = xc_map_foreign_range(
-                xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
-            start_info->nr_pages = p2m_size;
-            start_info->shared_info = shared_info_frame << PAGE_SHIFT;
-            start_info->flags = 0;
-            *store_mfn = start_info->store_mfn = p2m[start_info->store_mfn];
-            start_info->store_evtchn = store_evtchn;
-            start_info->console.domU.mfn = p2m[start_info->console.domU.mfn];
-            start_info->console.domU.evtchn = console_evtchn;
-            *console_mfn = start_info->console.domU.mfn;
-            munmap(start_info, PAGE_SIZE);
-        }
-
-        /* Uncanonicalise each GDT frame number. */
-        if (ctxt.gdt_ents > 8192) {
-            ERROR("GDT entry count out of range");
-            goto out;
-        }
-
-        for (j = 0; (512*j) < ctxt.gdt_ents; j++) {
-            pfn = ctxt.gdt_frames[j];
-            if ((pfn >= p2m_size) ||
-                (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) {
-                ERROR("GDT frame number is bad");
-                goto out;
-            }
-            ctxt.gdt_frames[j] = p2m[pfn];
-        }
-
-        /* Uncanonicalise the page table base pointer. */
-        pfn = xen_cr3_to_pfn(ctxt.ctrlreg[3]);
-
-        if (pfn >= p2m_size) {
-            ERROR("PT base is bad: pfn=%lu p2m_size=%lu type=%08lx",
-                  pfn, p2m_size, pfn_type[pfn]);
-            goto out;
-        }
-
-        if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
-             ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) {
-            ERROR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
-                  pfn, p2m_size, pfn_type[pfn],
-                  (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
-            goto out;
-        }
-
-        ctxt.ctrlreg[3] = xen_pfn_to_cr3(p2m[pfn]);
-
-        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
-        if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
-        {
-            pfn = xen_cr3_to_pfn(ctxt.ctrlreg[1]);
-
-            if (pfn >= p2m_size) {
-                ERROR("User PT base is bad: pfn=%lu p2m_size=%lu type=%08lx",
-                      pfn, p2m_size, pfn_type[pfn]);
-                goto out;
-            }
-
-            if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
-                 ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) {
-                ERROR("User PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
-                      pfn, p2m_size, pfn_type[pfn],
-                      (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
-                goto out;
-            }
-
-            ctxt.ctrlreg[1] = xen_pfn_to_cr3(p2m[pfn]);
-        }
-
-        domctl.cmd = XEN_DOMCTL_setvcpucontext;
-        domctl.domain = (domid_t)dom;
-        domctl.u.vcpucontext.vcpu = i;
-        set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt);
-        rc = xc_domctl(xc_handle, &domctl);
-        if (rc != 0) {
-            ERROR("Couldn't build vcpu%d", i);
-            goto out;
-        }
-    }
-
-    if (!read_exact(io_fd, shared_info_page, PAGE_SIZE)) {
-        ERROR("Error when reading shared info page");
-        goto out;
-    }
-
-    /* clear any pending events and the selector */
-    memset(&(shared_info->evtchn_pending[0]), 0,
-           sizeof (shared_info->evtchn_pending));
-    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
-        shared_info->vcpu_info[i].evtchn_pending_sel = 0;
-
-    /* Copy saved contents of shared-info page. No checking needed. */
-    page = xc_map_foreign_range(
-        xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
-    memcpy(page, shared_info, PAGE_SIZE);
-    munmap(page, PAGE_SIZE);
-
-    /* Uncanonicalise the pfn-to-mfn table frame-number list. */
-    for (i = 0; i < P2M_FL_ENTRIES; i++) {
-        pfn = p2m_frame_list[i];
-        if ((pfn >= p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) {
-            ERROR("PFN-to-MFN frame number is bad");
-            goto out;
-        }
-
-        p2m_frame_list[i] = p2m[pfn];
-    }
-
-    /* Copy the P2M we've constructed to the 'live' P2M */
-    if (!(live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE,
-                                          p2m_frame_list, P2M_FL_ENTRIES))) {
-        ERROR("Couldn't map p2m table");
-        goto out;
-    }
-
-    memcpy(live_p2m, p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
-    munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
-
-    DPRINTF("Domain ready to be built.\n");
-
- out:
-    if ( (rc != 0) && (dom != 0) )
-        xc_domain_destroy(xc_handle, dom);
-    free(mmu);
-    free(p2m);
-    free(pfn_type);
-
-    /* discard cache for save file  */
-    discard_file_cache(io_fd, 1 /*flush*/);
-
-    DPRINTF("Restore exit with rc=%d\n", rc);
-    
-    return rc;
-}
diff -r 602d061ff51f -r e518f2fbdd72 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Thu Apr 05 10:43:50 2007 +0100
+++ b/tools/libxc/xenguest.h    Thu Apr 05 15:11:22 2007 +0100
@@ -38,29 +38,21 @@ int xc_hvm_save(int xc_handle, int io_fd
                 void (*qemu_flip_buffer)(int, int));
 
 /**
- * This function will restore a saved domain running Linux.
+ * This function will restore a saved domain.
  *
  * @parm xc_handle a handle to an open hypervisor interface
  * @parm fd the file descriptor to restore a domain from
  * @parm dom the id of the domain
  * @parm store_evtchn the store event channel for this domain to use
  * @parm store_mfn returned with the mfn of the store page
+ * @parm hvm non-zero if this is a HVM restore
+ * @parm pae non-zero if this HVM domain has PAE support enabled
  * @return 0 on success, -1 on failure
  */
-int xc_linux_restore(int xc_handle, int io_fd, uint32_t dom,
-                     unsigned int store_evtchn, unsigned long *store_mfn,
-                     unsigned int console_evtchn, unsigned long *console_mfn);
-
-/**
- * This function will restore a saved hvm domain running unmodified guest.
- *
- * @parm store_mfn pass mem size & returned with the mfn of the store page
- * @return 0 on success, -1 on failure
- */
-int xc_hvm_restore(int xc_handle, int io_fd, uint32_t dom,
-                   unsigned int store_evtchn,
-                   unsigned long *store_mfn, 
-                   unsigned int pae, unsigned int apic);
+int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
+                      unsigned int store_evtchn, unsigned long *store_mfn,
+                      unsigned int console_evtchn, unsigned long *console_mfn,
+                      unsigned int hvm, unsigned int pae);
 
 /**
  * This function will create a domain for a paravirtualized Linux
diff -r 602d061ff51f -r e518f2fbdd72 tools/libxc/xg_private.c
--- a/tools/libxc/xg_private.c  Thu Apr 05 10:43:50 2007 +0100
+++ b/tools/libxc/xg_private.c  Thu Apr 05 15:11:22 2007 +0100
@@ -204,16 +204,6 @@ __attribute__((weak))
                     int (*suspend)(int domid), 
                     void *(*init_qemu_maps)(int, unsigned), 
                     void (*qemu_flip_buffer)(int, int))
-{
-    errno = ENOSYS;
-    return -1;
-}
-
-__attribute__((weak)) 
-    int xc_hvm_restore(int xc_handle, int io_fd, uint32_t dom,
-                       unsigned int store_evtchn,
-                       unsigned long *store_mfn,
-                       unsigned int pae, unsigned int apic)
 {
     errno = ENOSYS;
     return -1;
diff -r 602d061ff51f -r e518f2fbdd72 tools/xcutils/xc_restore.c
--- a/tools/xcutils/xc_restore.c        Thu Apr 05 10:43:50 2007 +0100
+++ b/tools/xcutils/xc_restore.c        Thu Apr 05 15:11:22 2007 +0100
@@ -39,14 +39,8 @@ main(int argc, char **argv)
     pae  = atoi(argv[6]);
     apic = atoi(argv[7]);
 
-    if ( hvm )
-        ret = xc_hvm_restore(xc_fd, io_fd, domid,
-                             store_evtchn, &store_mfn,
-                             pae, apic);
-    else
-        ret = xc_linux_restore(xc_fd, io_fd, domid,
-                               store_evtchn, &store_mfn,
-                               console_evtchn, &console_mfn);
+    ret = xc_domain_restore(xc_fd, io_fd, domid, store_evtchn, &store_mfn,
+                            console_evtchn, &console_mfn, hvm, pae);
 
     if ( ret == 0 )
     {

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.