[Xen-changelog] [xen-unstable] Remus: Make xc_domain_restore loop until the fd is closed.



# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1257794043 0
# Node ID 0f893b8f7c152990cb3be458427827506a500f44
# Parent  07f6d9047af4baada47b03c396ed9761cae3b92c
Remus: Make xc_domain_restore loop until the fd is closed.

The image tail, containing the final PFN table, the VCPU contexts and
the shared_info page, is buffered, and then the read loop is restarted.
After the first pass, incoming pages are buffered until the next tail
has been read, which completes a new consistent checkpoint. At that
point the buffered memory changes are applied and the loop begins
again. When a read from the fd finally fails (the sender has closed
the connection), the most recently buffered tail is processed.
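
A minimal C sketch of this buffered-checkpoint loop follows. The
helper names and the tail_t layout are illustrative stand-ins for the
pagebuf_get()/buffer_tail()/apply_batch() machinery in the patch
below, not actual libxc functions:

    typedef struct {
        /* final PFN table, VCPU contexts, shared_info page ... */
        int placeholder;
    } tail_t;

    int  read_page_stream(int fd);      /* batches until a 0-count batch */
    int  read_tail(int fd, tail_t *t);  /* pfntab + vcpus + shared_info  */
    void apply_buffered_pages(void);    /* commit buffered page data     */
    void apply_tail(const tail_t *t);   /* pfntab frees, vcpus, etc.     */

    static int restore_until_eof(int fd)
    {
        tail_t cur, next;

        /* Pass 1: load pages as they arrive, then buffer the tail. */
        if (read_page_stream(fd) < 0 || read_tail(fd, &cur) < 0)
            return -1;

        /* Passes 2..n: buffer an entire checkpoint (pages + tail)
         * before touching guest memory, so a mid-stream failure
         * leaves the previous consistent checkpoint intact. */
        while (read_page_stream(fd) == 0 && read_tail(fd, &next) == 0) {
            apply_buffered_pages();
            cur = next;         /* newest complete tail wins */
        }

        /* The fd read failed: the sender is gone.  Finish the
         * restore from the last complete tail. */
        apply_tail(&cur);
        return 0;
    }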

Signed-off-by: Brendan Cully <brendan@xxxxxxxxx>
---
 tools/libxc/xc_domain_restore.c         |  771 ++++++++++++++++++++------------
 tools/python/xen/xend/XendCheckpoint.py |    2 
 2 files changed, 503 insertions(+), 270 deletions(-)

diff -r 07f6d9047af4 -r 0f893b8f7c15 tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c   Mon Nov 09 19:06:25 2009 +0000
+++ b/tools/libxc/xc_domain_restore.c   Mon Nov 09 19:14:03 2009 +0000
@@ -624,6 +624,410 @@ static xen_pfn_t *load_p2m_frame_list(
     return p2m_frame_list;
 }
 
+typedef struct {
+    unsigned int pfncount;
+    unsigned long* pfntab;
+    unsigned int vcpucount;
+    unsigned char* vcpubuf;
+    unsigned char shared_info_page[PAGE_SIZE];
+} tailbuf_t;
+
+static int buffer_tail(tailbuf_t* buf, int fd, unsigned int max_vcpu_id,
+                      uint64_t vcpumap, int ext_vcpucontext)
+{
+    unsigned int i;
+    size_t pfnlen, vcpulen;
+
+    /* TODO: handle changing pfntab and vcpu counts */
+    /* PFN tab */
+    if ( read_exact(fd, &buf->pfncount, sizeof(buf->pfncount)) ||
+         (buf->pfncount > (1U << 28)) ) /* up to 1TB of address space */
+    {
+        ERROR("Error when reading pfn count");
+        return -1;
+    }
+    pfnlen = sizeof(unsigned long) * buf->pfncount;
+    if ( !(buf->pfntab) ) {
+        if ( !(buf->pfntab = malloc(pfnlen)) ) {
+            ERROR("Error allocating PFN tail buffer");
+            return -1;
+        }
+    }
+    // DPRINTF("Reading PFN tab: %d bytes\n", pfnlen);
+    if ( read_exact(fd, buf->pfntab, pfnlen) ) {
+        ERROR("Error when reading pfntab");
+        goto free_pfntab;
+    }
+
+    /* VCPU contexts */
+    buf->vcpucount = 0;
+    for (i = 0; i <= max_vcpu_id; i++) {
+        // DPRINTF("vcpumap: %llx, cpu: %d, bit: %llu\n", vcpumap, i, (vcpumap 
% (1ULL << i)));
+        if ( (!(vcpumap & (1ULL << i))) )
+            continue;
+        buf->vcpucount++;
+    }
+    // DPRINTF("VCPU count: %d\n", buf->vcpucount);
+    vcpulen = ((guest_width == 8) ? sizeof(vcpu_guest_context_x86_64_t)
+               : sizeof(vcpu_guest_context_x86_32_t)) * buf->vcpucount;
+    if ( ext_vcpucontext )
+        vcpulen += 128 * buf->vcpucount;
+
+    if ( !(buf->vcpubuf) ) {
+        if ( !(buf->vcpubuf = malloc(vcpulen)) ) {
+            ERROR("Error allocating VCPU ctxt tail buffer");
+            goto free_pfntab;
+        }
+    }
+    // DPRINTF("Reading VCPUS: %d bytes\n", vcpulen);
+    if ( read_exact(fd, buf->vcpubuf, vcpulen) ) {
+        ERROR("Error when reading ctxt");
+        goto free_vcpus;
+    }
+
+    /* load shared_info_page */
+    // DPRINTF("Reading shared info: %lu bytes\n", PAGE_SIZE);
+    if ( read_exact(fd, buf->shared_info_page, PAGE_SIZE) ) {
+        ERROR("Error when reading shared info page");
+        goto free_vcpus;
+    }
+
+    return 0;
+
+  free_vcpus:
+    if (buf->vcpubuf) {
+        free (buf->vcpubuf);
+        buf->vcpubuf = NULL;
+    }
+  free_pfntab:
+    if (buf->pfntab) {
+        free (buf->pfntab);
+        buf->pfntab = NULL;
+    }
+
+    return -1;
+}
+
+static void tailbuf_free(tailbuf_t* buf)
+{
+    if (buf->vcpubuf) {
+        free(buf->vcpubuf);
+        buf->vcpubuf = NULL;
+    }
+    if (buf->pfntab) {
+        free(buf->pfntab);
+        buf->pfntab = NULL;
+    }
+}
+
+typedef struct {
+    void* pages;
+    /* pages is of length nr_physpages, pfn_types is of length nr_pages */
+    unsigned int nr_physpages, nr_pages;
+
+    /* Types of the pfns in the current region */
+    unsigned long* pfn_types;
+
+    int verify;
+
+    int new_ctxt_format;
+    int max_vcpu_id;
+    uint64_t vcpumap;
+    uint64_t identpt;
+    uint64_t vm86_tss;
+} pagebuf_t;
+
+static int pagebuf_init(pagebuf_t* buf)
+{
+    memset(buf, 0, sizeof(*buf));
+    return 0;
+}
+
+static void pagebuf_free(pagebuf_t* buf)
+{
+    if (buf->pages) {
+        free(buf->pages);
+        buf->pages = NULL;
+    }
+    if(buf->pfn_types) {
+        free(buf->pfn_types);
+        buf->pfn_types = NULL;
+    }
+}
+
+static int pagebuf_get_one(pagebuf_t* buf, int fd, int xch, uint32_t dom)
+{
+    int count, countpages, oldcount, i;
+    void* ptmp;
+
+    if ( read_exact(fd, &count, sizeof(count)) )
+    {
+        ERROR("Error when reading batch size");
+        return -1;
+    }
+
+    // DPRINTF("reading batch of %d pages\n", count);
+
+    if (!count) {
+        // DPRINTF("Last batch read\n");
+        return 0;
+    } else if (count == -1) {
+        DPRINTF("Entering page verify mode\n");
+        buf->verify = 1;
+        return pagebuf_get_one(buf, fd, xch, dom);
+    } else if (count == -2) {
+        buf->new_ctxt_format = 1;
+        if ( read_exact(fd, &buf->max_vcpu_id, sizeof(buf->max_vcpu_id)) ||
+             buf->max_vcpu_id >= 64 || read_exact(fd, &buf->vcpumap,
+                                                  sizeof(uint64_t)) ) {
+            ERROR("Error when reading max_vcpu_id");
+            return -1;
+        }
+        // DPRINTF("Max VCPU ID: %d, vcpumap: %llx\n", buf->max_vcpu_id, 
buf->vcpumap);
+        return pagebuf_get_one(buf, fd, xch, dom);
+    } else if (count == -3) {
+        /* Skip padding 4 bytes then read the EPT identity PT location. */
+        if ( read_exact(fd, &buf->identpt, sizeof(uint32_t)) ||
+             read_exact(fd, &buf->identpt, sizeof(uint64_t)) )
+        {
+            ERROR("error read the address of the EPT identity map");
+            return -1;
+        }
+        // DPRINTF("EPT identity map address: %llx\n", buf->identpt);
+        return pagebuf_get_one(buf, fd, xch, dom);
+    } else if ( count == -4 )  {
+        /* Skip padding 4 bytes then read the vm86 TSS location. */
+        if ( read_exact(fd, &buf->vm86_tss, sizeof(uint32_t)) ||
+             read_exact(fd, &buf->vm86_tss, sizeof(uint64_t)) )
+        {
+            ERROR("error read the address of the vm86 TSS");
+            return -1;
+        }
+        // DPRINTF("VM86 TSS location: %llx\n", buf->vm86_tss);
+        return pagebuf_get_one(buf, fd, xch, dom);
+    } else if ( count == -5 ) {
+        DPRINTF("xc_domain_restore start tmem\n");
+        if ( xc_tmem_restore(xch, dom, fd) ) {
+            ERROR("error reading/restoring tmem");
+            return -1;
+        }
+        return pagebuf_get_one(buf, fd, xch, dom);
+    }
+    else if ( count == -6 ) {
+        if ( xc_tmem_restore_extra(xch, dom, fd) ) {
+            ERROR("error reading/restoring tmem extra");
+            return -1;
+        }
+        return pagebuf_get_one(buf, fd, xch, dom);
+    } else if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
+        ERROR("Max batch size exceeded (%d). Giving up.", count);
+        return -1;
+    }
+
+    oldcount = buf->nr_pages;
+    buf->nr_pages += count;
+    if (!buf->pfn_types) {
+        if (!(buf->pfn_types = malloc(buf->nr_pages * sizeof(*(buf->pfn_types))))) {
+            ERROR("Could not allocate PFN type buffer");
+            return -1;
+        }
+    } else {
+        if (!(ptmp = realloc(buf->pfn_types, buf->nr_pages * sizeof(*(buf->pfn_types))))) {
+            ERROR("Could not reallocate PFN type buffer");
+            return -1;
+        }
+        buf->pfn_types = ptmp;
+    }
+    if ( read_exact(fd, buf->pfn_types + oldcount, count * sizeof(*(buf->pfn_types)))) {
+        ERROR("Error when reading region pfn types");
+        return -1;
+    }
+
+    countpages = count;
+    for (i = oldcount; i < buf->nr_pages; ++i)
+        if ((buf->pfn_types[i] & XEN_DOMCTL_PFINFO_LTAB_MASK) == XEN_DOMCTL_PFINFO_XTAB)
+            --countpages;
+
+    if (!countpages)
+        return count;
+
+    oldcount = buf->nr_physpages;
+    buf->nr_physpages += countpages;
+    if (!buf->pages) {
+        if (!(buf->pages = malloc(buf->nr_physpages * PAGE_SIZE))) {
+            ERROR("Could not allocate page buffer");
+            return -1;
+        }
+    } else {
+        if (!(ptmp = realloc(buf->pages, buf->nr_physpages * PAGE_SIZE))) {
+            ERROR("Could not reallocate page buffer");
+            return -1;
+        }
+        buf->pages = ptmp;
+    }
+    if ( read_exact(fd, buf->pages + oldcount * PAGE_SIZE, countpages * PAGE_SIZE) ) {
+        ERROR("Error when reading pages");
+        return -1;
+    }
+
+    return count;
+}
+
+static int pagebuf_get(pagebuf_t* buf, int fd, int xch, uint32_t dom)
+{
+    int rc;
+
+    buf->nr_physpages = buf->nr_pages = 0;
+
+    do {
+        rc = pagebuf_get_one(buf, fd, xch, dom);
+    } while (rc > 0);
+
+    if (rc < 0)
+        pagebuf_free(buf);
+
+    return rc;
+}
+
+static int apply_batch(int xc_handle, uint32_t dom, xen_pfn_t* region_mfn,
+                       unsigned long* pfn_type, int pae_extended_cr3,
+                       unsigned int hvm, struct xc_mmu* mmu,
+                       pagebuf_t* pagebuf, int curbatch, int superpages)
+{
+    int i, j, curpage;
+    /* used by debug verify code */
+    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
+    /* Our mapping of the current region (batch) */
+    char *region_base;
+    /* A temporary mapping, and a copy, of one frame of guest memory. */
+    unsigned long *page = NULL;
+    int nraces = 0;
+
+    unsigned long mfn, pfn, pagetype;
+
+    j = pagebuf->nr_pages - curbatch;
+    if (j > MAX_BATCH_SIZE)
+        j = MAX_BATCH_SIZE;
+
+    if (allocate_physmem(xc_handle, dom, &pagebuf->pfn_types[curbatch],
+                         j, hvm, region_mfn, superpages) != 0)
+    {
+        ERROR("allocate_physmem() failed\n");
+        return -1;
+    }
+
+    /* Map relevant mfns */
+    region_base = xc_map_foreign_batch(
+        xc_handle, dom, PROT_WRITE, region_mfn, j);
+
+    if ( region_base == NULL )
+    {
+        ERROR("map batch failed");
+        return -1;
+    }
+
+    for ( i = 0, curpage = -1; i < j; i++ )
+    {
+        pfn      = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+        pagetype = pagebuf->pfn_types[i + curbatch] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+        if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+            /* a bogus/unmapped page: skip it */
+            continue;
+
+        ++curpage;
+
+        if ( pfn > p2m_size )
+        {
+            ERROR("pfn out of range");
+            return -1;
+        }
+
+        pfn_type[pfn] = pagetype;
+
+        mfn = p2m[pfn];
+
+        /* In verify mode, we use a copy; otherwise we work in place */
+        page = pagebuf->verify ? (void *)buf : (region_base + i*PAGE_SIZE);
+
+        memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE, PAGE_SIZE);
+
+        pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+        if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+             (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+        {
+            /*
+            ** A page table page - need to 'uncanonicalize' it, i.e.
+            ** replace all the references to pfns with the corresponding
+            ** mfns for the new domain.
+            **
+            ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
+            ** so we may need to update the p2m after the main loop.
+            ** Hence we defer canonicalization of L1s until then.
+            */
+            if ((pt_levels != 3) ||
+                pae_extended_cr3 ||
+                (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
+
+                if (!uncanonicalize_pagetable(xc_handle, dom,
+                                              pagetype, page, superpages)) {
+                    /*
+                    ** Failing to uncanonicalize a page table can be ok
+                    ** under live migration since the pages type may have
+                    ** changed by now (and we'll get an update later).
+                    */
+                    DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
+                            pagetype >> 28, pfn, mfn);
+                    nraces++;
+                    continue;
+                }
+            }
+        }
+        else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
+        {
+            ERROR("Bogus page type %lx page table is out of range: "
+                  "i=%d p2m_size=%lu", pagetype, i, p2m_size);
+            return -1;
+        }
+
+        if ( pagebuf->verify )
+        {
+            int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
+            if ( res )
+            {
+                int v;
+
+                DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
+                        "actualcs=%08lx\n", pfn, pagebuf->pfn_types[pfn],
+                        csum_page(region_base + (i + curbatch)*PAGE_SIZE),
+                        csum_page(buf));
+
+                for ( v = 0; v < 4; v++ )
+                {
+                    unsigned long *p = (unsigned long *)
+                        (region_base + i*PAGE_SIZE);
+                    if ( buf[v] != p[v] )
+                        DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
+                }
+            }
+        }
+
+        if ( !hvm &&
+             xc_add_mmu_update(xc_handle, mmu,
+                               (((unsigned long long)mfn) << PAGE_SHIFT)
+                               | MMU_MACHPHYS_UPDATE, pfn) )
+        {
+            ERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
+            return -1;
+        }
+    } /* end of 'batch' for loop */
+
+    munmap(region_base, j*PAGE_SIZE);
+
+    return nraces;
+}
+
 int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
                       unsigned int store_evtchn, unsigned long *store_mfn,
                       unsigned int console_evtchn, unsigned long *console_mfn,
@@ -633,7 +1037,6 @@ int xc_domain_restore(int xc_handle, int
     int rc = 1, frc, i, j, n, m, pae_extended_cr3 = 0, ext_vcpucontext = 0;
     unsigned long mfn, pfn;
     unsigned int prev_pc, this_pc;
-    int verify = 0;
     int nraces = 0;
 
     /* The new domain's shared-info frame number. */
@@ -652,9 +1055,6 @@ int xc_domain_restore(int xc_handle, int
     /* A table of MFNs to map in the current region */
     xen_pfn_t *region_mfn = NULL;
 
-    /* Types of the pfns in the current region */
-    unsigned long region_pfn_type[MAX_BATCH_SIZE];
-
     /* A copy of the pfn-to-mfn table frame list. */
     xen_pfn_t *p2m_frame_list = NULL;
     
@@ -666,9 +1066,6 @@ int xc_domain_restore(int xc_handle, int
 
     struct xc_mmu *mmu = NULL;
 
-    /* used by debug verify code */
-    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
-
     struct mmuext_op pin[MAX_PIN_BATCH];
     unsigned int nr_pins;
 
@@ -681,6 +1078,14 @@ int xc_domain_restore(int xc_handle, int
 
     /* Buffer for holding HVM context */
     uint8_t *hvm_buf = NULL;
+
+    int completed = 0;
+    pagebuf_t pagebuf;
+    tailbuf_t tailbuf, tmptail;
+    void* vcpup;
+
+    pagebuf_init(&pagebuf);
+    memset(&tailbuf, 0, sizeof(tailbuf));
 
     /* For info only */
     nr_pfns = 0;
@@ -784,9 +1189,10 @@ int xc_domain_restore(int xc_handle, int
     prev_pc = 0;
 
     n = m = 0;
+ loadpages:
     for ( ; ; )
     {
-        int j; 
+        int j, curbatch;
 
         this_pc = (n * 100) / p2m_size;
         if ( (this_pc - prev_pc) >= 5 )
@@ -795,221 +1201,49 @@ int xc_domain_restore(int xc_handle, int
             prev_pc = this_pc;
         }
 
-        if ( read_exact(io_fd, &j, sizeof(int)) )
-        {
-            ERROR("Error when reading batch size");
-            goto out;
-        }
+        if ( !completed ) {
+            pagebuf.nr_physpages = pagebuf.nr_pages = 0;
+            if ( pagebuf_get_one(&pagebuf, io_fd, xc_handle, dom) < 0 ) {
+                ERROR("Error when reading batch\n");
+                goto out;
+            }
+        }
+        j = pagebuf.nr_pages;
 
         PPRINTF("batch %d\n",j);
 
-        if ( j == -1 )
-        {
-            verify = 1;
-            DPRINTF("Entering page verify mode\n");
-            continue;
-        }
-
-        if ( j == -2 )
-        {
-            new_ctxt_format = 1;
-            if ( read_exact(io_fd, &max_vcpu_id, sizeof(int)) ||
-                 (max_vcpu_id >= 64) ||
-                 read_exact(io_fd, &vcpumap, sizeof(uint64_t)) )
-            {
-                ERROR("Error when reading max_vcpu_id");
+        if ( j == 0 ) {
+            /* catch vcpu updates */
+            if (pagebuf.new_ctxt_format) {
+                vcpumap = pagebuf.vcpumap;
+                max_vcpu_id = pagebuf.max_vcpu_id;
+            }
+            /* should this be deferred? does it change? */
+            if ( pagebuf.identpt )
+                xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, pagebuf.identpt);
+            if ( pagebuf.vm86_tss )
+                xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, pagebuf.vm86_tss);
+            break;  /* our work here is done */
+        }
+
+        /* break pagebuf into batches */
+        curbatch = 0;
+        while ( curbatch < j ) {
+            int brc;
+
+            brc = apply_batch(xc_handle, dom, region_mfn, pfn_type,
+                              pae_extended_cr3, hvm, mmu, &pagebuf, curbatch, superpages);
+            if ( brc < 0 )
                 goto out;
-            }
-            continue;
-        }
-
-        if ( j == -3 )
-        {
-            uint64_t ident_pt;
-
-            /* Skip padding 4 bytes then read the EPT identity PT location. */
-            if ( read_exact(io_fd, &ident_pt, sizeof(uint32_t)) ||
-                 read_exact(io_fd, &ident_pt, sizeof(uint64_t)) )
-            {
-                ERROR("error read the address of the EPT identity map");
-                goto out;
-            }
-
-            xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, ident_pt);
-            continue;
-        }
-
-        if ( j == -4 )
-        {
-            uint64_t vm86_tss;
-
-            /* Skip padding 4 bytes then read the vm86 TSS location. */
-            if ( read_exact(io_fd, &vm86_tss, sizeof(uint32_t)) ||
-                 read_exact(io_fd, &vm86_tss, sizeof(uint64_t)) )
-            {
-                ERROR("error read the address of the vm86 TSS");
-                goto out;
-            }
-
-            xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
-            continue;
-        }
-
-        if ( j == -5 )
-        {
-            DPRINTF("xc_domain_restore start tmem\n");
-            if ( xc_tmem_restore(xc_handle, dom, io_fd) )
-            {
-                ERROR("error reading/restoring tmem");
-                goto out;
-            }
-            continue;
-        }
-
-        if ( j == -6 )
-        {
-            if ( xc_tmem_restore_extra(xc_handle, dom, io_fd) )
-            {
-                ERROR("error reading/restoring tmem extra");
-                goto out;
-            }
-            continue;
-        }
-
-        if ( j == 0 )
-            break;  /* our work here is done */
-
-        if ( (j > MAX_BATCH_SIZE) || (j < 0) )
-        {
-            ERROR("Max batch size exceeded. Giving up.");
-            goto out;
-        }
-
-        if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) )
-        {
-            ERROR("Error when reading region pfn types");
-            goto out;
-        }
-
-        if (allocate_physmem(xc_handle, dom, region_pfn_type,
-                             j, hvm, region_mfn, superpages) != 0)
-            goto out;
-
-        /* Map relevant mfns */
-        region_base = xc_map_foreign_batch(
-            xc_handle, dom, PROT_WRITE, region_mfn, j);
-
-        if ( region_base == NULL )
-        {
-            ERROR("map batch failed");
-            goto out;
-        }
-
-        for ( i = 0; i < j; i++ )
-        {
-            void *page;
-            unsigned long pagetype;
-
-            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
-            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
-
-            if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
-                /* a bogus/unmapped page: skip it */
-                continue;
-
-            if ( pfn > p2m_size )
-            {
-                ERROR("pfn out of range");
-                goto out;
-            }
-
-            pfn_type[pfn] = pagetype;
-
-            mfn = p2m[pfn];
-
-            /* In verify mode, we use a copy; otherwise we work in place */
-            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
-
-            if ( read_exact(io_fd, page, PAGE_SIZE) )
-            {
-                ERROR("Error when reading page (type was %lx)", pagetype);
-                goto out;
-            }
-
-            pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
-            if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && 
-                 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
-            {
-                /*
-                ** A page table page - need to 'uncanonicalize' it, i.e.
-                ** replace all the references to pfns with the corresponding
-                ** mfns for the new domain.
-                **
-                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
-                ** so we may need to update the p2m after the main loop.
-                ** Hence we defer canonicalization of L1s until then.
-                */
-                if ((pt_levels != 3) ||
-                    pae_extended_cr3 ||
-                    (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
-
-                    if (!uncanonicalize_pagetable(xc_handle, dom, 
-                                                  pagetype, page, superpages)) {
-                        /*
-                        ** Failing to uncanonicalize a page table can be ok
-                        ** under live migration since the pages type may have
-                        ** changed by now (and we'll get an update later).
-                        */
-                        DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
-                                pagetype >> 28, pfn, mfn);
-                        nraces++;
-                        continue;
-                    } 
-                }
-            }
-            else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
-            {
-                ERROR("Bogus page type %lx page table is out of range: "
-                    "i=%d p2m_size=%lu", pagetype, i, p2m_size);
-                goto out;
-
-            }
-
-            if ( verify )
-            {
-                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
-                if ( res )
-                {
-                    int v;
-
-                    DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
-                            "actualcs=%08lx\n", pfn, pfn_type[pfn],
-                            csum_page(region_base + i*PAGE_SIZE),
-                            csum_page(buf));
-
-                    for ( v = 0; v < 4; v++ )
-                    {
-                        unsigned long *p = (unsigned long *)
-                            (region_base + i*PAGE_SIZE);
-                        if ( buf[v] != p[v] )
-                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
-                    }
-                }
-            }
-
-            if ( !hvm &&
-                 xc_add_mmu_update(xc_handle, mmu,
-                                   (((unsigned long long)mfn) << PAGE_SHIFT)
-                                   | MMU_MACHPHYS_UPDATE, pfn) )
-            {
-                ERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
-                goto out;
-            }
-        } /* end of 'batch' for loop */
-
-        munmap(region_base, j*PAGE_SIZE);
-        n+= j; /* crude stats */
+
+            nraces += brc;
+
+            curbatch += MAX_BATCH_SIZE;
+        }
+
+        pagebuf.nr_physpages = pagebuf.nr_pages = 0;
+
+        n += j; /* crude stats */
 
         /* 
          * Discard cache for portion of file read so far up to last
@@ -1033,7 +1267,7 @@ int xc_domain_restore(int xc_handle, int
         goto out;
     }
 
-    DPRINTF("Received all pages (%d races)\n", nraces);
+    // DPRINTF("Received all pages (%d races)\n", nraces);
 
     if ( hvm ) 
     {
@@ -1107,6 +1341,34 @@ int xc_domain_restore(int xc_handle, int
 
     /* Non-HVM guests only from here on */
 
+    if ( !completed ) {
+        if ( buffer_tail(&tailbuf, io_fd, max_vcpu_id, vcpumap,
+                         ext_vcpucontext) < 0 ) {
+            ERROR ("error buffering image tail");
+            goto out;
+        }
+        completed = 1;
+    }
+
+    // DPRINTF("Buffered checkpoint\n");
+
+    if ( pagebuf_get(&pagebuf, io_fd, xc_handle, dom) ) {
+        ERROR("error when buffering batch, finishing\n");
+        goto finish;
+    }
+    memset(&tmptail, 0, sizeof(tmptail));
+    if ( buffer_tail(&tmptail, io_fd, max_vcpu_id, vcpumap,
+                     ext_vcpucontext) < 0 ) {
+        ERROR ("error buffering image tail, finishing");
+        goto finish;
+    }
+    tailbuf_free(&tailbuf);
+    memcpy(&tailbuf, &tmptail, sizeof(tailbuf));
+
+    goto loadpages;
+
+  finish:
+
     if ( (pt_levels == 3) && !pae_extended_cr3 )
     {
         /*
@@ -1275,39 +1537,17 @@ int xc_domain_restore(int xc_handle, int
 
     /* Get the list of PFNs that are not in the psuedo-phys map */
     {
-        unsigned int count = 0;
-        unsigned long *pfntab;
-        int nr_frees;
-
-        if ( read_exact(io_fd, &count, sizeof(count)) ||
-             (count > (1U << 28)) ) /* up to 1TB of address space */
-        {
-            ERROR("Error when reading pfn count (= %u)", count);
-            goto out;
-        }
-
-        if ( !(pfntab = malloc(sizeof(unsigned long) * count)) )
-        {
-            ERROR("Out of memory");
-            goto out;
-        }
-
-        if ( read_exact(io_fd, pfntab, sizeof(unsigned long)*count) )
-        {
-            ERROR("Error when reading pfntab");
-            goto out;
-        }
-
-        nr_frees = 0; 
-        for ( i = 0; i < count; i++ )
-        {
-            unsigned long pfn = pfntab[i];
+        int nr_frees = 0;
+
+        for ( i = 0; i < tailbuf.pfncount; i++ )
+        {
+            unsigned long pfn = tailbuf.pfntab[i];
 
             if ( p2m[pfn] != INVALID_P2M_ENTRY )
             {
-                /* pfn is not in physmap now, but was at some point during 
+                /* pfn is not in physmap now, but was at some point during
                    the save/migration process - need to free it */
-                pfntab[nr_frees++] = p2m[pfn];
+                tailbuf.pfntab[nr_frees++] = p2m[pfn];
                 p2m[pfn]  = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
             }
         }
@@ -1319,7 +1559,7 @@ int xc_domain_restore(int xc_handle, int
                 .extent_order = 0,
                 .domid        = dom
             };
-            set_xen_guest_handle(reservation.extent_start, pfntab);
+            set_xen_guest_handle(reservation.extent_start, tailbuf.pfntab);
 
             if ( (frc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
                                      &reservation)) != nr_frees )
@@ -1328,7 +1568,7 @@ int xc_domain_restore(int xc_handle, int
                 goto out;
             }
             else
-                DPRINTF("Decreased reservation by %d pages\n", count);
+                DPRINTF("Decreased reservation by %d pages\n", 
tailbuf.pfncount);
         }
     }
 
@@ -1338,18 +1578,17 @@ int xc_domain_restore(int xc_handle, int
         return 1;
     }
 
+    vcpup = tailbuf.vcpubuf;
     for ( i = 0; i <= max_vcpu_id; i++ )
     {
         if ( !(vcpumap & (1ULL << i)) )
             continue;
 
-        if ( read_exact(io_fd, &ctxt, ((guest_width == 8)
-                                       ? sizeof(ctxt.x64)
-                                       : sizeof(ctxt.x32))) )
-        {
-            ERROR("Error when reading ctxt %d", i);
-            goto out;
-        }
+        memcpy(&ctxt, vcpup, ((guest_width == 8) ? sizeof(ctxt.x64)
+                              : sizeof(ctxt.x32)));
+        vcpup += (guest_width == 8) ? sizeof(ctxt.x64) : sizeof(ctxt.x32);
+
+        DPRINTF("read VCPU %d\n", i);
 
         if ( !new_ctxt_format )
             SET_FIELD(&ctxt, flags, GET_FIELD(&ctxt, flags) | VGCF_online);
@@ -1454,12 +1693,8 @@ int xc_domain_restore(int xc_handle, int
 
         if ( !ext_vcpucontext )
             continue;
-        if ( read_exact(io_fd, &domctl.u.ext_vcpucontext, 128) ||
-             (domctl.u.ext_vcpucontext.vcpu != i) )
-        {
-            ERROR("Error when reading extended ctxt %d", i);
-            goto out;
-        }
+        memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
+        vcpup += 128;
         domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
         domctl.domain = dom;
         frc = xc_domctl(xc_handle, &domctl);
@@ -1470,11 +1705,9 @@ int xc_domain_restore(int xc_handle, int
         }
     }
 
-    if ( read_exact(io_fd, shared_info_page, PAGE_SIZE) )
-    {
-        ERROR("Error when reading shared info page");
-        goto out;
-    }
+    memcpy(shared_info_page, tailbuf.shared_info_page, PAGE_SIZE);
+
+    DPRINTF("Completed checkpoint load\n");
 
     /* Restore contents of shared-info page. No checking needed. */
     new_shared_info = xc_map_foreign_range(
diff -r 07f6d9047af4 -r 0f893b8f7c15 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py   Mon Nov 09 19:06:25 2009 +0000
+++ b/tools/python/xen/xend/XendCheckpoint.py   Mon Nov 09 19:14:03 2009 +0000
@@ -345,7 +345,7 @@ def restore(xd, fd, dominfo = None, paus
 
         restore_image.setCpuid()
 
-        os.read(fd, 1)           # Wait for source to close connection
+        # xc_restore will wait for source to close connection
         
         dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
 

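For reference, the batch-count markers that the new pagebuf_get_one()
dispatches on, as a hedged sketch (the enum and its names are invented
here for illustration; only the numeric values appear in the stream
format):

    enum chunk_marker {
        CHUNK_END_OF_PAGES  =  0,  /* no more page batches this pass   */
        CHUNK_VERIFY_MODE   = -1,  /* switch restore into verify mode  */
        CHUNK_VCPU_INFO     = -2,  /* max_vcpu_id and vcpumap follow   */
        CHUNK_EPT_IDENT_PT  = -3,  /* 4 pad bytes + 64-bit address     */
        CHUNK_VM86_TSS      = -4,  /* 4 pad bytes + 64-bit address     */
        CHUNK_TMEM          = -5,  /* tmem state follows               */
        CHUNK_TMEM_EXTRA    = -6   /* extra tmem state follows         */
        /* 1..MAX_BATCH_SIZE: pfn-type array + page data follow */
    };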
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog