
[Xen-devel] [PATCH 2 of 9] Make xc_domain_restore loop until the fd is closed



# HG changeset patch
# User Brendan Cully <brendan@xxxxxxxxx>
# Date 1240355507 25200
# Node ID f5c0d3208d8ae9183391398d52c9be5969da24ec
# Parent  904729ffa2692482c77e7da5828c4b218a3a51c2
Make xc_domain_restore loop until the fd is closed.
The image tail, containing the final PFN table, the VCPU contexts and
the shared_info_page, is buffered, and then the read loop is restarted.
After the first pass, incoming pages are buffered until the next tail
has been read, completing a new consistent checkpoint. At that point
the buffered memory changes are applied and the loop begins again.
When a read from the fd fails, the most recently buffered tail is
processed.
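
In outline, the receive loop now behaves as below (a simplified sketch
of the control flow only; buffer_tail(), tailbuf_free() and
pagebuf_get() are the helpers added by this patch, while load_pages(),
apply_pages() and process_tail() are illustrative stand-ins for the
existing page-load, batch-apply and tail-processing paths):

    load_pages(io_fd);                        /* first pass: apply live */
    buffer_tail(&tailbuf, io_fd, ...);        /* first consistent tail */
    for (;;) {
        if (pagebuf_get(&pagebuf, io_fd) < 0) /* buffer next page set */
            break;                            /* fd closed: stop */
        if (buffer_tail(&tmptail, io_fd, ...) < 0)
            break;                            /* incomplete checkpoint */
        apply_pages(&pagebuf);                /* commit buffered memory */
        tailbuf_free(&tailbuf);
        tailbuf = tmptail;                    /* adopt the newer tail */
    }
    process_tail(&tailbuf);                   /* PFN tab, VCPUs, shinfo */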

Signed-off-by: Brendan Cully <brendan@xxxxxxxxx>

diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -269,6 +269,438 @@
     return p2m_frame_list;
 }
 
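+/*
+ * Buffered image tail: the final PFN table, the VCPU contexts and the
+ * shared_info_page that close each checkpoint.
+ */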
+typedef struct {
+  unsigned int pfncount;
+  unsigned long* pfntab;
+  unsigned int vcpucount;
+  unsigned char* vcpubuf;
+  unsigned char shared_info_page[PAGE_SIZE];
+} tailbuf_t;
+
+static int buffer_tail(tailbuf_t* buf, int fd, unsigned int max_vcpu_id,
+                      uint64_t vcpumap, int ext_vcpucontext)
+{
+    unsigned int i;
+    size_t pfnlen, vcpulen;
+
+    /* TODO: handle changing pfntab and vcpu counts */
+    /* PFN tab */
+    if ( read_exact(fd, &buf->pfncount, sizeof(buf->pfncount)) ||
+        (buf->pfncount > (1U << 28)) ) /* up to 1TB of address space */
+    {
+       ERROR("Error when reading pfn count");
+       return -1;
+    }
+    pfnlen = sizeof(unsigned long) * buf->pfncount;
+    if ( !(buf->pfntab) ) {
+       if ( !(buf->pfntab = malloc(pfnlen)) ) {
+           ERROR("Error allocating PFN tail buffer");
+           return -1;
+       }
+    }
+    // DPRINTF("Reading PFN tab: %d bytes\n", pfnlen);
+    if ( read_exact(fd, buf->pfntab, pfnlen) ) {
+       ERROR("Error when reading pfntab");
+       goto free_pfntab;
+    }
+    
+    /* VCPU contexts */
+    buf->vcpucount = 0;
+    for (i = 0; i <= max_vcpu_id; i++) {
+      // DPRINTF("vcpumap: %llx, cpu: %d, bit: %llu\n", vcpumap, i, (vcpumap % (1ULL << i)));
+       if ( (!(vcpumap & (1ULL << i))) )
+           continue;
+       buf->vcpucount++;
+    }
+    // DPRINTF("VCPU count: %d\n", buf->vcpucount);
+    vcpulen = ((guest_width == 8) ? sizeof(vcpu_guest_context_x86_64_t)
+              : sizeof(vcpu_guest_context_x86_32_t)) * buf->vcpucount;
+    if ( ext_vcpucontext )
+      vcpulen += 128 * buf->vcpucount;
+
+    if ( !(buf->vcpubuf) ) {
+       if ( !(buf->vcpubuf = malloc(vcpulen)) ) {
+           ERROR("Error allocating VCPU ctxt tail buffer");
+           goto free_pfntab;
+       }
+    }
+    // DPRINTF("Reading VCPUS: %d bytes\n", vcpulen);
+    if ( read_exact(fd, buf->vcpubuf, vcpulen) ) {
+       ERROR("Error when reading ctxt");
+       goto free_vcpus;
+    }
+
+    /* load shared_info_page */
+    // DPRINTF("Reading shared info: %lu bytes\n", PAGE_SIZE);
+    if ( read_exact(fd, buf->shared_info_page, PAGE_SIZE) ) {
+       ERROR("Error when reading shared info page");
+       goto free_vcpus;
+    }
+
+    return 0;
+
+  free_vcpus:
+    if (buf->vcpubuf) {
+      free (buf->vcpubuf);
+      buf->vcpubuf = NULL;
+    }
+  free_pfntab:
+    if (buf->pfntab) {
+      free (buf->pfntab);
+      buf->pfntab = NULL;
+    }
+
+    return -1;
+}
+
+static void tailbuf_free(tailbuf_t* buf)
+{
+  if (buf->vcpubuf) {
+    free(buf->vcpubuf);
+    buf->vcpubuf = NULL;
+  }
+  if (buf->pfntab) {
+    free(buf->pfntab);
+    buf->pfntab = NULL;
+  }
+}
+
+typedef struct {
+  void* pages;
+  unsigned int nr_physpages, nr_pages; /* pages is of length nr_physpages, pfn_types is of length nr_pages */
+
+  /* Types of the pfns in the current region */
+  unsigned long* pfn_types;
+
+  int verify;
+
+  int new_ctxt_format;
+  int max_vcpu_id;
+  uint64_t vcpumap;
+  uint64_t identpt;
+  uint64_t vm86_tss;
+} pagebuf_t;
+
+static int pagebuf_init(pagebuf_t* buf)
+{
+  memset(buf, 0, sizeof(*buf));
+  return 0;
+}
+
+static void pagebuf_free(pagebuf_t* buf)
+{
+  if (buf->pages) {
+    free(buf->pages);
+    buf->pages = NULL;
+  }
+  if(buf->pfn_types) {
+    free(buf->pfn_types);
+    buf->pfn_types = NULL;
+  }
+}
+
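+/*
+ * Read one batch record from the stream. Negative batch sizes are
+ * control records: -1 turns on verify mode, -2 carries max_vcpu_id and
+ * the vcpumap, -3 the EPT identity-map address, -4 the vm86 TSS
+ * address. Returns the number of pfn_types read (0 marks the end of
+ * the page phase), or -1 on error.
+ */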
+static int pagebuf_get_one(pagebuf_t* buf, int fd)
+{
+  int count, countpages, oldcount, i;
+  void* ptmp;
+
+  if ( read_exact(fd, &count, sizeof(count)) )
+  {
+    ERROR("Error when reading batch size");
+    return -1;
+  }
+
+  // DPRINTF("reading batch of %d pages\n", count);
+
+  if (!count) {
+    DPRINTF("Last batch read\n");
+    return 0;
+  } else if (count == -1) {
+    DPRINTF("Entering page verify mode\n");
+    buf->verify = 1;
+    return pagebuf_get_one(buf, fd);
+  } else if (count == -2) {
+    buf->new_ctxt_format = 1;
+    if ( read_exact(fd, &buf->max_vcpu_id, sizeof(buf->max_vcpu_id)) ||
+        buf->max_vcpu_id >= 64 || read_exact(fd, &buf->vcpumap,
+                                            sizeof(uint64_t)) ) {
+      ERROR("Error when reading max_vcpu_id");
+      return -1;
+    }
+    // DPRINTF("Max VCPU ID: %d, vcpumap: %llx\n", buf->max_vcpu_id, buf->vcpumap);
+    return pagebuf_get_one(buf, fd);
+  } else if (count == -3) {
+    /* Skip padding 4 bytes then read the EPT identity PT location. */
+    if ( read_exact(fd, &buf->identpt, sizeof(uint32_t)) ||
+        read_exact(fd, &buf->identpt, sizeof(uint64_t)) )
+    {
+      ERROR("error reading the address of the EPT identity map");
+      return -1;
+    }
+    // DPRINTF("EPT identity map address: %llx\n", buf->identpt);
+    return pagebuf_get_one(buf, fd);
+  } else if ( count == -4 )  {
+    /* Skip padding 4 bytes then read the vm86 TSS location. */
+    if ( read_exact(fd, &buf->vm86_tss, sizeof(uint32_t)) ||
+        read_exact(fd, &buf->vm86_tss, sizeof(uint64_t)) )
+    {
+      ERROR("error reading the address of the vm86 TSS");
+      return -1;
+    }
+    // DPRINTF("VM86 TSS location: %llx\n", buf->vm86_tss);
+    return pagebuf_get_one(buf, fd);
+  } else if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
+    ERROR("Max batch size exceeded (%d). Giving up.", count);
+    return -1;
+  }
+
+  oldcount = buf->nr_pages;
+  buf->nr_pages += count;
+  if (!buf->pfn_types) {
+    if (!(buf->pfn_types = malloc(buf->nr_pages * sizeof(*(buf->pfn_types))))) {
+      ERROR("Could not allocate PFN type buffer");
+      return -1;
+    }
+  } else {
+    if (!(ptmp = realloc(buf->pfn_types, buf->nr_pages * sizeof(*(buf->pfn_types))))) {
+      ERROR("Could not reallocate PFN type buffer");
+      return -1;
+    }
+    buf->pfn_types = ptmp;
+  }
+  if ( read_exact(fd, buf->pfn_types + oldcount, count * sizeof(*(buf->pfn_types)))) {
+    ERROR("Error when reading region pfn types");
+    return -1;
+  }
+
+  countpages = count;
+  for (i = oldcount; i < buf->nr_pages; ++i)
+    if ((buf->pfn_types[i] & XEN_DOMCTL_PFINFO_LTAB_MASK) == XEN_DOMCTL_PFINFO_XTAB)
+      --countpages;
+
+  if (!countpages)
+    return count;
+
+  oldcount = buf->nr_physpages;
+  buf->nr_physpages += countpages;
+  if (!buf->pages) {
+    if (!(buf->pages = malloc(buf->nr_physpages * PAGE_SIZE))) {
+      ERROR("Could not allocate page buffer");
+      return -1;
+    }
+  } else {
+    if (!(ptmp = realloc(buf->pages, buf->nr_physpages * PAGE_SIZE))) {
+      ERROR("Could not reallocate page buffer");
+      return -1;
+    }
+    buf->pages = ptmp;
+  }
+  if ( read_exact(fd, buf->pages + oldcount * PAGE_SIZE, countpages * PAGE_SIZE) ) {
+    ERROR("Error when reading pages");
+    return -1;
+  }
+
+  return count;
+}
+
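+/* Buffer page batches until the zero-length end-of-checkpoint record. */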
+static int pagebuf_get(pagebuf_t* buf, int fd)
+{
+  int rc;
+
+  buf->nr_physpages = buf->nr_pages = 0;
+
+  do {
+    rc = pagebuf_get_one(buf, fd);
+  } while (rc > 0);
+
+  if (rc < 0)
+    pagebuf_free(buf);
+
+  return rc;
+}
+
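+/*
+ * Apply one MAX_BATCH_SIZE slice of the buffered pages to the domain:
+ * allocate any missing MFNs, map the batch, copy the page contents in
+ * and uncanonicalize page tables. Returns the number of PT update
+ * races tolerated, or -1 on error.
+ */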
+static int apply_batch(int xc_handle, uint32_t dom, xen_pfn_t* region_mfn,
+                      unsigned long* pfn_type, int pae_extended_cr3,
+                      unsigned int hvm, struct xc_mmu* mmu,
+                      pagebuf_t* pagebuf, int curbatch)
+{
+  int nr_mfns = 0;
+  int i, j, curpage;
+  /* used by debug verify code */
+  unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
+  /* Our mapping of the current region (batch) */
+  char *region_base;
+  /* A temporary mapping, and a copy, of one frame of guest memory. */
+  unsigned long *page = NULL;
+  int nraces = 0;
+
+  unsigned long mfn, pfn, pagetype;
+
+  j = pagebuf->nr_pages - curbatch;
+  if (j > MAX_BATCH_SIZE)
+    j = MAX_BATCH_SIZE;
+
+  /* First pass for this batch: work out how much memory to alloc */
+  for ( i = 0; i < j; i++ )
+  {
+    pfn      = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+    pagetype = pagebuf->pfn_types[i + curbatch] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+    if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) && 
+        (p2m[pfn] == INVALID_P2M_ENTRY) )
+    {
+      /* Have a live PFN which hasn't had an MFN allocated */
+      p2m_batch[nr_mfns++] = pfn; 
+      p2m[pfn]--;
+    }
+  } 
+
+  /* Now allocate a bunch of mfns for this batch */
+  if ( nr_mfns &&
+       (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0,
+                                         0, p2m_batch) != 0) )
+  { 
+    ERROR("Failed to allocate memory for batch!\n"); 
+    errno = ENOMEM;
+    return -1;
+  }
+
+  /* Second pass for this batch: update p2m[] and region_mfn[] */
+  nr_mfns = 0; 
+  for ( i = 0; i < j; i++ )
+  {
+    pfn      = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+    pagetype = pagebuf->pfn_types[i + curbatch] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+    if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+      region_mfn[i] = ~0UL; /* map will fail but we don't care */
+    else 
+    {
+      if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) )
+      {
+       /* We just allocated a new mfn above; update p2m */
+       p2m[pfn] = p2m_batch[nr_mfns++]; 
+       nr_pfns++; 
+      }
+
+      /* setup region_mfn[] for batch map.
+       * For HVM guests, this interface takes PFNs, not MFNs */
+      region_mfn[i] = hvm ? pfn : p2m[pfn]; 
+    }
+  }
+
+  /* Map relevant mfns */
+  region_base = xc_map_foreign_batch(
+    xc_handle, dom, PROT_WRITE, region_mfn, j);
+
+  if ( region_base == NULL )
+  {
+    ERROR("map batch failed");
+    return -1;
+  }
+
+  for ( i = 0, curpage = -1; i < j; i++ )
+  {
+    pfn      = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+    pagetype = pagebuf->pfn_types[i + curbatch] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+    if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+      /* a bogus/unmapped page: skip it */
+      continue;
+
+    ++curpage;
+
+    if ( pfn > p2m_size )
+    {
+      ERROR("pfn out of range");
+      return -1;
+    }
+
+    pfn_type[pfn] = pagetype;
+
+    mfn = p2m[pfn];
+
+    /* In verify mode, we use a copy; otherwise we work in place */
+    page = pagebuf->verify ? (void *)buf : (region_base + i*PAGE_SIZE);
+
+    memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE, PAGE_SIZE);
+
+    pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+    if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && 
+        (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+    {
+      /*
+      ** A page table page - need to 'uncanonicalize' it, i.e.
+      ** replace all the references to pfns with the corresponding
+      ** mfns for the new domain.
+      **
+      ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
+      ** so we may need to update the p2m after the main loop.
+      ** Hence we defer canonicalization of L1s until then.
+      */
+      if ((pt_levels != 3) ||
+         pae_extended_cr3 ||
+         (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
+
+       if (!uncanonicalize_pagetable(xc_handle, dom, 
+                                     pagetype, page)) {
+         /*
+         ** Failing to uncanonicalize a page table can be ok
+         ** under live migration since the pages type may have
+         ** changed by now (and we'll get an update later).
+         */
+         DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
+                 pagetype >> 28, pfn, mfn);
+         nraces++;
+         continue;
+       } 
+      }
+    }
+    else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
+    {
+      ERROR("Bogus page type %lx page table is out of range: "
+           "i=%d p2m_size=%lu", pagetype, i, p2m_size);
+      return -1;
+    }
+
+    if ( pagebuf->verify )
+    {
+      int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
+      if ( res )
+      {
+       int v;
+
+       DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
+               "actualcs=%08lx\n", pfn, pagebuf->pfn_types[i + curbatch],
+               csum_page(region_base + i*PAGE_SIZE),
+               csum_page(buf));
+
+       for ( v = 0; v < 4; v++ )
+       {
+         unsigned long *p = (unsigned long *)
+           (region_base + i*PAGE_SIZE);
+         if ( buf[v] != p[v] )
+           DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
+       }
+      }
+    }
+
+    if ( !hvm &&
+        xc_add_mmu_update(xc_handle, mmu,
+                          (((unsigned long long)mfn) << PAGE_SHIFT)
+                          | MMU_MACHPHYS_UPDATE, pfn) )
+    {
+      ERROR("failed machphys update mfn=%lx pfn=%lx", mfn, pfn);
+      return -1;
+    }
+  } /* end of 'batch' for loop */
+
+  munmap(region_base, j*PAGE_SIZE);
+
+  return nraces;
+}
+
 int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
                       unsigned int store_evtchn, unsigned long *store_mfn,
                       unsigned int console_evtchn, unsigned long *console_mfn,
@@ -278,7 +710,6 @@
     int rc = 1, frc, i, j, n, m, pae_extended_cr3 = 0, ext_vcpucontext = 0;
     unsigned long mfn, pfn;
     unsigned int prev_pc, this_pc;
-    int verify = 0;
     int nraces = 0;
 
     /* The new domain's shared-info frame number. */
@@ -297,9 +728,6 @@
     /* A table of MFNs to map in the current region */
     xen_pfn_t *region_mfn = NULL;
 
-    /* Types of the pfns in the current region */
-    unsigned long region_pfn_type[MAX_BATCH_SIZE];
-
     /* A copy of the pfn-to-mfn table frame list. */
     xen_pfn_t *p2m_frame_list = NULL;
     
@@ -311,9 +739,6 @@
 
     struct xc_mmu *mmu = NULL;
 
-    /* used by debug verify code */
-    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
-
     struct mmuext_op pin[MAX_PIN_BATCH];
     unsigned int nr_pins;
 
@@ -327,6 +752,14 @@
     /* Buffer for holding HVM context */
     uint8_t *hvm_buf = NULL;
 
+    int completed = 0;
+    pagebuf_t pagebuf;
+    tailbuf_t tailbuf, tmptail;
+    void* vcpup;
+
+    pagebuf_init(&pagebuf);
+    memset(&tailbuf, 0, sizeof(tailbuf));
+
     /* For info only */
     nr_pfns = 0;
 
@@ -435,9 +868,10 @@
     prev_pc = 0;
 
     n = m = 0;
+  loadpages:
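+    /* The checkpoint-buffering code below jumps back here once per checkpoint. */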
     for ( ; ; )
     {
-        int j, nr_mfns = 0; 
+        int j, curbatch; 
 
         this_pc = (n * 100) / p2m_size;
         if ( (this_pc - prev_pc) >= 5 )
@@ -446,248 +880,49 @@
             prev_pc = this_pc;
         }
 
-        if ( read_exact(io_fd, &j, sizeof(int)) )
-        {
-            ERROR("Error when reading batch size");
-            goto out;
-        }
+       if ( !completed ) {
+         pagebuf.nr_physpages = pagebuf.nr_pages = 0;
+         if ( pagebuf_get_one(&pagebuf, io_fd) < 0 ) {
+           ERROR("Error when reading batch\n");
+           goto out;
+         }
+       }
+       j = pagebuf.nr_pages;
 
         PPRINTF("batch %d\n",j);
 
-        if ( j == -1 )
-        {
-            verify = 1;
-            DPRINTF("Entering page verify mode\n");
-            continue;
-        }
+       if ( j == 0 ) {
+         /* catch vcpu updates */
+         if (pagebuf.new_ctxt_format) {
+           vcpumap = pagebuf.vcpumap;
+           max_vcpu_id = pagebuf.max_vcpu_id;
+         }
+         /* should this be deferred? does it change? */
+         if (pagebuf.identpt)
+           xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, pagebuf.identpt);
+         if (pagebuf.vm86_tss)
+           xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, pagebuf.vm86_tss);
+         break;  /* our work here is done */
+       }
 
-        if ( j == -2 )
-        {
-            new_ctxt_format = 1;
-            if ( read_exact(io_fd, &max_vcpu_id, sizeof(int)) ||
-                 (max_vcpu_id >= 64) ||
-                 read_exact(io_fd, &vcpumap, sizeof(uint64_t)) )
-            {
-                ERROR("Error when reading max_vcpu_id");
-                goto out;
-            }
-            continue;
-        }
+       /* break pagebuf into batches */
+       curbatch = 0;
+       while ( curbatch < j ) {
+         int brc;
 
-        if ( j == -3 )
-        {
-            uint64_t ident_pt;
+         brc = apply_batch(xc_handle, dom, region_mfn, pfn_type,
+                           pae_extended_cr3, hvm, mmu, &pagebuf, curbatch);
+         if ( brc < 0 )
+           goto out;
 
-            /* Skip padding 4 bytes then read the EPT identity PT location. */
-            if ( read_exact(io_fd, &ident_pt, sizeof(uint32_t)) ||
-                 read_exact(io_fd, &ident_pt, sizeof(uint64_t)) )
-            {
-                ERROR("error read the address of the EPT identity map");
-                goto out;
-            }
+         nraces += brc;
 
-            xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, ident_pt);
-            continue;
-        }
+         curbatch += MAX_BATCH_SIZE;
+       }
 
-        if ( j == -4 )
-        {
-            uint64_t vm86_tss;
+       pagebuf.nr_physpages = pagebuf.nr_pages = 0;
 
-            /* Skip padding 4 bytes then read the vm86 TSS location. */
-            if ( read_exact(io_fd, &vm86_tss, sizeof(uint32_t)) ||
-                 read_exact(io_fd, &vm86_tss, sizeof(uint64_t)) )
-            {
-                ERROR("error read the address of the vm86 TSS");
-                goto out;
-            }
-
-            xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
-            continue;
-        }
-
-        if ( j == 0 )
-            break;  /* our work here is done */
-
-        if ( (j > MAX_BATCH_SIZE) || (j < 0) )
-        {
-            ERROR("Max batch size exceeded. Giving up.");
-            goto out;
-        }
-
-        if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) )
-        {
-            ERROR("Error when reading region pfn types");
-            goto out;
-        }
-
-        /* First pass for this batch: work out how much memory to alloc */
-        nr_mfns = 0; 
-        for ( i = 0; i < j; i++ )
-        {
-            unsigned long pfn, pagetype;
-            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
-            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
-
-            if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) && 
-                 (p2m[pfn] == INVALID_P2M_ENTRY) )
-            {
-                /* Have a live PFN which hasn't had an MFN allocated */
-                p2m_batch[nr_mfns++] = pfn; 
-                p2m[pfn]--;
-            }
-        } 
-
-        /* Now allocate a bunch of mfns for this batch */
-        if ( nr_mfns &&
-             (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0,
-                                                0, p2m_batch) != 0) )
-        { 
-            ERROR("Failed to allocate memory for batch.!\n"); 
-            errno = ENOMEM;
-            goto out;
-        }
-
-        /* Second pass for this batch: update p2m[] and region_mfn[] */
-        nr_mfns = 0; 
-        for ( i = 0; i < j; i++ )
-        {
-            unsigned long pfn, pagetype;
-            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
-            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
-
-            if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
-                region_mfn[i] = ~0UL; /* map will fail but we don't care */
-            else 
-            {
-                if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) )
-                {
-                    /* We just allocated a new mfn above; update p2m */
-                    p2m[pfn] = p2m_batch[nr_mfns++]; 
-                    nr_pfns++; 
-                }
-
-                /* setup region_mfn[] for batch map.
-                 * For HVM guests, this interface takes PFNs, not MFNs */
-                region_mfn[i] = hvm ? pfn : p2m[pfn]; 
-            }
-        } 
-
-        /* Map relevant mfns */
-        region_base = xc_map_foreign_batch(
-            xc_handle, dom, PROT_WRITE, region_mfn, j);
-
-        if ( region_base == NULL )
-        {
-            ERROR("map batch failed");
-            goto out;
-        }
-
-        for ( i = 0; i < j; i++ )
-        {
-            void *page;
-            unsigned long pagetype;
-
-            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
-            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
-
-            if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
-                /* a bogus/unmapped page: skip it */
-                continue;
-
-            if ( pfn > p2m_size )
-            {
-                ERROR("pfn out of range");
-                goto out;
-            }
-
-            pfn_type[pfn] = pagetype;
-
-            mfn = p2m[pfn];
-
-            /* In verify mode, we use a copy; otherwise we work in place */
-            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
-
-            if ( read_exact(io_fd, page, PAGE_SIZE) )
-            {
-                ERROR("Error when reading page (type was %lx)", pagetype);
-                goto out;
-            }
-
-            pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
-            if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && 
-                 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
-            {
-                /*
-                ** A page table page - need to 'uncanonicalize' it, i.e.
-                ** replace all the references to pfns with the corresponding
-                ** mfns for the new domain.
-                **
-                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
-                ** so we may need to update the p2m after the main loop.
-                ** Hence we defer canonicalization of L1s until then.
-                */
-                if ((pt_levels != 3) ||
-                    pae_extended_cr3 ||
-                    (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
-
-                    if (!uncanonicalize_pagetable(xc_handle, dom, 
-                                                  pagetype, page)) {
-                        /*
-                        ** Failing to uncanonicalize a page table can be ok
-                        ** under live migration since the pages type may have
-                        ** changed by now (and we'll get an update later).
-                        */
-                        DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
-                                pagetype >> 28, pfn, mfn);
-                        nraces++;
-                        continue;
-                    } 
-                }
-            }
-            else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
-            {
-                ERROR("Bogus page type %lx page table is out of range: "
-                    "i=%d p2m_size=%lu", pagetype, i, p2m_size);
-                goto out;
-
-            }
-
-            if ( verify )
-            {
-                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
-                if ( res )
-                {
-                    int v;
-
-                    DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
-                            "actualcs=%08lx\n", pfn, pfn_type[pfn],
-                            csum_page(region_base + i*PAGE_SIZE),
-                            csum_page(buf));
-
-                    for ( v = 0; v < 4; v++ )
-                    {
-                        unsigned long *p = (unsigned long *)
-                            (region_base + i*PAGE_SIZE);
-                        if ( buf[v] != p[v] )
-                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
-                    }
-                }
-            }
-
-            if ( !hvm &&
-                 xc_add_mmu_update(xc_handle, mmu,
-                                   (((unsigned long long)mfn) << PAGE_SHIFT)
-                                   | MMU_MACHPHYS_UPDATE, pfn) )
-            {
-                ERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
-                goto out;
-            }
-        } /* end of 'batch' for loop */
-
-        munmap(region_base, j*PAGE_SIZE);
-        n+= j; /* crude stats */
+       n += j; /* crude stats */
 
         /* 
          * Discard cache for portion of file read so far up to last
@@ -785,6 +1020,32 @@
 
     /* Non-HVM guests only from here on */
 
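+    /*
+     * First time through, buffer the image tail that followed the pages.
+     * After that, keep buffering complete checkpoints (a page set plus a
+     * new tail); each complete checkpoint replaces the saved tail and
+     * restarts the page loop above. A failed read jumps to finish, where
+     * the last complete tail is applied.
+     */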
+    if (!completed) {
+      if ( buffer_tail(&tailbuf, io_fd, max_vcpu_id, vcpumap,
+                      ext_vcpucontext) < 0 ) {
+       ERROR("error buffering image tail");
+       goto out;
+      }
+      completed = 1;
+    }
+    
+    DPRINTF("Buffered checkpoint\n");
+    if (pagebuf_get(&pagebuf, io_fd)) {
+         ERROR("error when buffering batch, finishing\n");
+         goto finish;
+    }
+    if ( buffer_tail(&tmptail, io_fd, max_vcpu_id, vcpumap,
+                    ext_vcpucontext) < 0 ) {
+      ERROR("error buffering image tail, finishing");
+      goto finish;
+    }
+    tailbuf_free(&tailbuf);
+    memcpy(&tailbuf, &tmptail, sizeof(tailbuf));
+
+    goto loadpages;
+
+  finish:
+
     if ( (pt_levels == 3) && !pae_extended_cr3 )
     {
         /*
@@ -953,39 +1214,17 @@
 
     /* Get the list of PFNs that are not in the psuedo-phys map */
     {
-        unsigned int count = 0;
-        unsigned long *pfntab;
-        int nr_frees;
+        int nr_frees = 0;
 
-        if ( read_exact(io_fd, &count, sizeof(count)) ||
-             (count > (1U << 28)) ) /* up to 1TB of address space */
+       for ( i = 0; i < tailbuf.pfncount; i++ )
         {
-            ERROR("Error when reading pfn count (= %u)", count);
-            goto out;
-        }
-
-        if ( !(pfntab = malloc(sizeof(unsigned long) * count)) )
-        {
-            ERROR("Out of memory");
-            goto out;
-        }
-
-        if ( read_exact(io_fd, pfntab, sizeof(unsigned long)*count) )
-        {
-            ERROR("Error when reading pfntab");
-            goto out;
-        }
-
-        nr_frees = 0; 
-        for ( i = 0; i < count; i++ )
-        {
-            unsigned long pfn = pfntab[i];
+            unsigned long pfn = tailbuf.pfntab[i];
 
             if ( p2m[pfn] != INVALID_P2M_ENTRY )
             {
                 /* pfn is not in physmap now, but was at some point during 
                    the save/migration process - need to free it */
-                pfntab[nr_frees++] = p2m[pfn];
+               tailbuf.pfntab[nr_frees++] = p2m[pfn];
                 p2m[pfn]  = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
             }
         }
@@ -997,7 +1236,7 @@
                 .extent_order = 0,
                 .domid        = dom
             };
-            set_xen_guest_handle(reservation.extent_start, pfntab);
+            set_xen_guest_handle(reservation.extent_start, tailbuf.pfntab);
 
             if ( (frc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
                                      &reservation)) != nr_frees )
@@ -1006,7 +1245,7 @@
                 goto out;
             }
             else
-                DPRINTF("Decreased reservation by %d pages\n", count);
+                DPRINTF("Decreased reservation by %d pages\n", tailbuf.pfncount);
         }
     }
 
@@ -1016,18 +1255,17 @@
         return 1;
     }
 
+    vcpup = tailbuf.vcpubuf;
     for ( i = 0; i <= max_vcpu_id; i++ )
     {
         if ( !(vcpumap & (1ULL << i)) )
             continue;
 
-        if ( read_exact(io_fd, &ctxt, ((guest_width == 8)
-                                       ? sizeof(ctxt.x64)
-                                       : sizeof(ctxt.x32))) )
-        {
-            ERROR("Error when reading ctxt %d", i);
-            goto out;
-        }
+       memcpy(&ctxt, vcpup, ((guest_width == 8) ? sizeof(ctxt.x64)
+                             : sizeof(ctxt.x32)));
+       vcpup += (guest_width == 8) ? sizeof(ctxt.x64) : sizeof(ctxt.x32);
+
+       DPRINTF("read VCPU %d\n", i);
 
         if ( !new_ctxt_format )
             SET_FIELD(&ctxt, flags, GET_FIELD(&ctxt, flags) | VGCF_online);
@@ -1132,12 +1370,8 @@
 
         if ( !ext_vcpucontext )
             continue;
-        if ( read_exact(io_fd, &domctl.u.ext_vcpucontext, 128) ||
-             (domctl.u.ext_vcpucontext.vcpu != i) )
-        {
-            ERROR("Error when reading extended ctxt %d", i);
-            goto out;
-        }
+       memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
+       vcpup += 128;
         domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
         domctl.domain = dom;
         frc = xc_domctl(xc_handle, &domctl);
@@ -1148,11 +1382,9 @@
         }
     }
 
-    if ( read_exact(io_fd, shared_info_page, PAGE_SIZE) )
-    {
-        ERROR("Error when reading shared info page");
-        goto out;
-    }
+    memcpy(shared_info_page, tailbuf.shared_info_page, PAGE_SIZE);
+
+    DPRINTF("Completed checkpoint load\n");
 
     /* Restore contents of shared-info page. No checking needed. */
     new_shared_info = xc_map_foreign_range(
