[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [Patch v6 12/13] tools/libxc: noarch save code



Save a domain, calling domain type specific function at the appropriate
points.  This implements the xc_domain_save2() API function which is
equivalent to the existing xc_domain_save().

This writes the image and domain headers, and writes all the PAGE_DATA records
using a "live" process.

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>

---
v6:
 * Use writev() instead of write() for PAGE_DATA.
 * Introduce separate non-live suspend which avoids playing with logdirty.
 * Fix a bug which would clobber the penultimate logdirty bitmap.
---
 tools/libxc/saverestore/save.c |  687 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 686 insertions(+), 1 deletion(-)

diff --git a/tools/libxc/saverestore/save.c b/tools/libxc/saverestore/save.c
index f6ad734..653dc8e 100644
--- a/tools/libxc/saverestore/save.c
+++ b/tools/libxc/saverestore/save.c
@@ -1,11 +1,696 @@
+#include <assert.h>
+#include <arpa/inet.h>
+
 #include "common.h"
 
+/*
+ * Writes an Image header and Domain header into the stream.
+ */
+static int write_headers(struct xc_sr_context *ctx, uint16_t guest_type)
+{
+    xc_interface *xch = ctx->xch;
+    int32_t xen_version = xc_version(xch, XENVER_version, NULL);
+    struct xc_sr_ihdr ihdr =
+        {
+            .marker  = IHDR_MARKER,
+            .id      = htonl(IHDR_ID),
+            .version = htonl(IHDR_VERSION),
+            .options = htons(IHDR_OPT_LITTLE_ENDIAN),
+        };
+    struct xc_sr_dhdr dhdr =
+        {
+            .type       = guest_type,
+            .page_shift = XC_PAGE_SHIFT,
+            .xen_major  = (xen_version >> 16) & 0xffff,
+            .xen_minor  = (xen_version)       & 0xffff,
+        };
+
+    if ( xen_version < 0 )
+    {
+        PERROR("Unable to obtain Xen Version");
+        return -1;
+    }
+
+    if ( write_exact(ctx->fd, &ihdr, sizeof(ihdr)) )
+    {
+        PERROR("Unable to write Image Header to stream");
+        return -1;
+    }
+
+    if ( write_exact(ctx->fd, &dhdr, sizeof(dhdr)) )
+    {
+        PERROR("Unable to write Domain Header to stream");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Writes an END record into the stream.
+ */
+static int write_end_record(struct xc_sr_context *ctx)
+{
+    struct xc_sr_record end = { REC_TYPE_END, 0, NULL };
+
+    return write_record(ctx, &end);
+}
+
+/*
+ * Writes a batch of memory as a PAGE_DATA record into the stream.  The batch
+ * is constructed in ctx->save.batch_pfns.
+ *
+ * This function:
+ * - gets the types for each pfn in the batch.
+ * - for each pfn with real data:
+ *   - maps and attempts to localise the pages.
+ * - construct and writes a PAGE_DATA record into the stream.
+ */
+static int write_batch(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t *mfns = NULL, *types = NULL;
+    void *guest_mapping = NULL;
+    void **guest_data = NULL;
+    void **local_pages = NULL;
+    int *errors = NULL, rc = -1;
+    unsigned i, p, nr_pages = 0;
+    unsigned nr_pfns = ctx->save.nr_batch_pfns;
+    void *page, *orig_page;
+    uint64_t *rec_pfns = NULL;
+    struct iovec *iov = NULL; int iovcnt = 0;
+    struct xc_sr_rec_page_data_header hdr = { 0 };
+    struct xc_sr_record rec =
+    {
+        .type = REC_TYPE_PAGE_DATA,
+    };
+
+    assert(nr_pfns != 0);
+
+    /* Mfns of the batch pfns. */
+    mfns = malloc(nr_pfns * sizeof(*mfns));
+    /* Types of the batch pfns. */
+    types = malloc(nr_pfns * sizeof(*types));
+    /* Errors from attempting to map the mfns. */
+    errors = malloc(nr_pfns * sizeof(*errors));
+    /* Pointers to page data to send.  Either mapped mfns or local 
allocations. */
+    guest_data = calloc(nr_pfns, sizeof(*guest_data));
+    /* Pointers to locally allocated pages.  Need freeing. */
+    local_pages = calloc(nr_pfns, sizeof(*local_pages));
+    /* iovec[] for writev(). */
+    iov = malloc((nr_pfns + 4) * sizeof(*iov));
+
+    if ( !mfns || !types || !errors || !guest_data || !local_pages || !iov )
+    {
+        ERROR("Unable to allocate arrays for a batch of %u pages",
+              nr_pfns);
+        goto err;
+    }
+
+    for ( i = 0; i < nr_pfns; ++i )
+    {
+        types[i] = mfns[i] = ctx->save.ops.pfn_to_gfn(ctx, 
ctx->save.batch_pfns[i]);
+
+        /* Likely a ballooned page. */
+        if ( mfns[i] == INVALID_MFN )
+            set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
+    }
+
+    rc = xc_get_pfn_type_batch(xch, ctx->domid, nr_pfns, types);
+    if ( rc )
+    {
+        PERROR("Failed to get types for pfn batch");
+        goto err;
+    }
+    rc = -1;
+
+    for ( i = 0; i < nr_pfns; ++i )
+    {
+        switch ( types[i] )
+        {
+        case XEN_DOMCTL_PFINFO_BROKEN:
+        case XEN_DOMCTL_PFINFO_XALLOC:
+        case XEN_DOMCTL_PFINFO_XTAB:
+            continue;
+        }
+
+        mfns[nr_pages++] = mfns[i];
+    }
+
+    if ( nr_pages > 0 )
+    {
+        guest_mapping = xc_map_foreign_bulk(
+            xch, ctx->domid, PROT_READ, mfns, errors, nr_pages);
+        if ( !guest_mapping )
+        {
+            PERROR("Failed to map guest pages");
+            goto err;
+        }
+    }
+
+    for ( i = 0, p = 0; i < nr_pfns; ++i )
+    {
+        switch ( types[i] )
+        {
+        case XEN_DOMCTL_PFINFO_BROKEN:
+        case XEN_DOMCTL_PFINFO_XALLOC:
+        case XEN_DOMCTL_PFINFO_XTAB:
+            continue;
+        }
+
+        if ( errors[p] )
+        {
+            ERROR("Mapping of pfn %#lx (mfn %#lx) failed %d",
+                  ctx->save.batch_pfns[i], mfns[p], errors[p]);
+            goto err;
+        }
+
+        orig_page = page = guest_mapping + (p * PAGE_SIZE);
+        rc = ctx->save.ops.normalise_page(ctx, types[i], &page);
+        if ( rc )
+        {
+            if ( rc == -1 && errno == EAGAIN )
+            {
+                set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
+                types[i] = XEN_DOMCTL_PFINFO_XTAB;
+                --nr_pages;
+            }
+            else
+                goto err;
+        }
+        else
+            guest_data[i] = page;
+
+        if ( page != orig_page )
+            local_pages[i] = page;
+        rc = -1;
+
+        ++p;
+    }
+
+    rec_pfns = malloc(nr_pfns * sizeof(*rec_pfns));
+    if ( !rec_pfns )
+    {
+        ERROR("Unable to allocate %zu bytes of memory for page data pfn list",
+              nr_pfns * sizeof(*rec_pfns));
+        goto err;
+    }
+
+    hdr.count = nr_pfns;
+
+    rec.length = sizeof(hdr);
+    rec.length += nr_pfns * sizeof(*rec_pfns);
+    rec.length += nr_pages * PAGE_SIZE;
+
+    for ( i = 0; i < nr_pfns; ++i )
+        rec_pfns[i] = ((uint64_t)(types[i]) << 32) | ctx->save.batch_pfns[i];
+
+    iov[0].iov_base = &rec.type;
+    iov[0].iov_len = sizeof(rec.type);
+
+    iov[1].iov_base = &rec.length;
+    iov[1].iov_len = sizeof(rec.length);
+
+    iov[2].iov_base = &hdr;
+    iov[2].iov_len = sizeof(hdr);
+
+    iov[3].iov_base = rec_pfns;
+    iov[3].iov_len = nr_pfns * sizeof(*rec_pfns);
+
+    for ( i = 0, iovcnt = 4; i < nr_pfns; ++i )
+    {
+        if ( guest_data[i] )
+        {
+            iov[iovcnt].iov_base = guest_data[i];
+            iov[iovcnt].iov_len = PAGE_SIZE;
+            iovcnt++;
+            --nr_pages;
+        }
+    }
+
+    if ( writev_exact(ctx->fd, iov, iovcnt) )
+    {
+        PERROR("Failed to write page data to stream");
+        goto err;
+    }
+
+    /* Sanity check we have sent all the pages we expected to. */
+    assert(nr_pages == 0);
+    rc = ctx->save.nr_batch_pfns = 0;
+
+ err:
+    free(rec_pfns);
+    if ( guest_mapping )
+        munmap(guest_mapping, nr_pages * PAGE_SIZE);
+    for ( i = 0; local_pages && i < nr_pfns; ++i )
+            free(local_pages[i]);
+    free(iov);
+    free(local_pages);
+    free(guest_data);
+    free(errors);
+    free(types);
+    free(mfns);
+
+    return rc;
+}
+
+/*
+ * Flush a batch of pfns into the stream.
+ */
+static int flush_batch(struct xc_sr_context *ctx)
+{
+    int rc = 0;
+
+    if ( ctx->save.nr_batch_pfns == 0 )
+        return rc;
+
+    rc = write_batch(ctx);
+
+    if ( !rc )
+    {
+        VALGRIND_MAKE_MEM_UNDEFINED(ctx->save.batch_pfns,
+                                    MAX_BATCH_SIZE * 
sizeof(*ctx->save.batch_pfns));
+    }
+
+    return rc;
+}
+
+/*
+ * Add a single pfn to the batch, flushing the batch if full.
+ */
+static int add_to_batch(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    int rc = 0;
+
+    if ( ctx->save.nr_batch_pfns == MAX_BATCH_SIZE )
+        rc = flush_batch(ctx);
+
+    if ( rc == 0 )
+        ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn;
+
+    return rc;
+}
+
+/*
+ * Pause/suspend the domain, and refresh ctx->dominfo if required.
+ */
+static int suspend_domain(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+
+    /* TODO: Properly specify the return value from this callback.  All
+     * implementations currently appear to return 1 for success, whereas
+     * the legacy code checks for != 0. */
+    int cb_rc = ctx->save.callbacks->suspend(ctx->save.callbacks->data);
+
+    if ( cb_rc == 0 )
+    {
+        ERROR("save callback suspend() failed: %d", cb_rc);
+        return -1;
+    }
+
+    /* Refresh domain information. */
+    if ( (xc_domain_getinfo(xch, ctx->domid, 1, &ctx->dominfo) != 1) ||
+         (ctx->dominfo.domid != ctx->domid) )
+    {
+        PERROR("Unable to refresh domain information");
+        return -1;
+    }
+
+    /* Confirm the domain has actually been paused. */
+    if ( !ctx->dominfo.shutdown ||
+         (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) )
+    {
+        ERROR("Domain has not been suspended: shutdown %d, reason %d",
+              ctx->dominfo.shutdown, ctx->dominfo.shutdown_reason);
+        return -1;
+    }
+
+    IPRINTF("Domain now paused");
+
+    return 0;
+}
+
+/*
+ * Send all pages in the guests p2m.  Used as the first iteration of the live
+ * migration loop, and for a non-live save.
+ */
+static int send_all_pages(struct xc_sr_context *ctx)
+{
+    xen_pfn_t p;
+    int rc;
+
+    for ( p = 0; p < ctx->save.p2m_size; ++p )
+    {
+        rc = add_to_batch(ctx, p);
+        if ( rc )
+            return rc;
+    }
+
+    return flush_batch(ctx);
+}
+
+/*
+ * Send all domain memory.  This is the heart of the live migration loop.
+ */
+static int send_domain_memory_live(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    DECLARE_HYPERCALL_BUFFER(unsigned long, to_send);
+    xc_shadow_op_stats_t stats = { -1, -1 };
+    unsigned pages_written;
+    unsigned x = 0, max_iter = 5, dirty_threshold = 50;
+    xen_pfn_t p;
+    int rc = -1;
+
+    to_send = xc_hypercall_buffer_alloc_pages(
+        xch, to_send, NRPAGES(bitmap_size(ctx->save.p2m_size)));
+
+    ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE * 
sizeof(*ctx->save.batch_pfns));
+    ctx->save.deferred_pages = calloc(1, bitmap_size(ctx->save.p2m_size));
+
+    if ( !ctx->save.batch_pfns || !to_send || !ctx->save.deferred_pages )
+    {
+        ERROR("Unable to allocate memory for to_{send,fix}/batch bitmaps");
+        goto out;
+    }
+
+    /* This juggling is required if logdirty is already on, e.g. VRAM tracking 
*/
+    if ( xc_shadow_control(xch, ctx->domid,
+                           XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                           NULL, 0, NULL, 0, NULL) < 0 )
+    {
+        int frc = xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                                    NULL, 0, NULL, 0, NULL);
+        if ( frc >= 0 )
+            frc = xc_shadow_control(xch, ctx->domid,
+                                    XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                                    NULL, 0, NULL, 0, NULL);
+        if ( frc < 0 )
+        {
+            PERROR("Failed to enable logdirty: rc %d", frc);
+            goto out;
+        }
+    }
+
+    DPRINTF("Iteration %u", x);
+    rc = send_all_pages(ctx);
+    if ( rc )
+        goto out;
+    pages_written = ctx->save.p2m_size - 1;
+
+    for ( x = 1; x < max_iter ; ++x )
+    {
+        DPRINTF("Iteration %u", x);
+
+        if ( xc_shadow_control(
+                 xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+                 HYPERCALL_BUFFER(to_send), ctx->save.p2m_size,
+                 NULL, 0, &stats) != ctx->save.p2m_size )
+        {
+            PERROR("Failed to retrieve logdirty bitmap");
+            rc = -1;
+            goto out;
+        }
+        else
+            DPRINTF("  Wrote %u pages; stats: faults %"PRIu32", dirty %"PRIu32,
+                    pages_written, stats.fault_count, stats.dirty_count);
+        pages_written = 0;
+
+        for ( p = 0 ; p < ctx->save.p2m_size; ++p )
+        {
+            if ( test_bit(p, to_send) )
+            {
+                rc = add_to_batch(ctx, p);
+                if ( rc )
+                    goto out;
+                ++pages_written;
+            }
+        }
+
+        rc = flush_batch(ctx);
+        if ( rc )
+            goto out;
+
+        if ( stats.dirty_count < dirty_threshold )
+            break;
+    }
+
+    rc = suspend_domain(ctx);
+    if ( rc )
+        goto out;
+
+    if ( xc_shadow_control(
+             xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+             HYPERCALL_BUFFER(to_send), ctx->save.p2m_size,
+             NULL, 0, &stats) != ctx->save.p2m_size )
+    {
+        PERROR("Failed to retrieve logdirty bitmap");
+        rc = -1;
+        goto out;
+    }
+
+    for ( p = 0, pages_written = 0 ; p < ctx->save.p2m_size; ++p )
+    {
+        if ( test_bit(p, to_send) || test_bit(p, ctx->save.deferred_pages) )
+        {
+            rc = add_to_batch(ctx, p);
+            if ( rc )
+                goto out;
+            ++pages_written;
+        }
+    }
+
+    rc = flush_batch(ctx);
+    if ( rc )
+        goto out;
+
+    DPRINTF("  Wrote %u pages", pages_written);
+    IPRINTF("Sent all pages");
+
+    if ( ctx->save.debug )
+    {
+        struct xc_sr_record rec =
+        {
+            .type = REC_TYPE_VERIFY,
+            .length = 0,
+        };
+
+        DPRINTF("Enabling verify mode");
+
+        rc = write_record(ctx, &rec);
+        if ( rc )
+            goto out;
+
+        rc = send_all_pages(ctx);
+        if ( rc )
+            goto out;
+
+        DPRINTF("  Wrote %lu verification pages", ctx->save.p2m_size - 1);
+
+        if ( xc_shadow_control(
+                 xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_PEEK,
+                 HYPERCALL_BUFFER(to_send), ctx->save.p2m_size,
+                 NULL, 0, &stats) != ctx->save.p2m_size )
+        {
+            PERROR("Failed to retrieve logdirty bitmap");
+            rc = -1;
+            goto out;
+        }
+
+        DPRINTF("  Further stats: faults %"PRIu32", dirty %"PRIu32,
+                stats.fault_count, stats.dirty_count);
+    }
+
+  out:
+    xc_hypercall_buffer_free_pages(xch, to_send,
+                                   NRPAGES(bitmap_size(ctx->save.p2m_size)));
+    free(ctx->save.deferred_pages);
+    free(ctx->save.batch_pfns);
+    return rc;
+}
+
+/*
+ * Send all domain memory, pausing the domain first.  Generally used for
+ * suspend-to-file.
+ */
+static int send_domain_memory_nonlive(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    int rc = -1;
+
+    ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE *
+                                  sizeof(*ctx->save.batch_pfns));
+    ctx->save.deferred_pages = calloc(1, bitmap_size(ctx->save.p2m_size));
+
+    if ( !ctx->save.batch_pfns || !ctx->save.deferred_pages )
+    {
+        PERROR("Failed to allocate memory for nonlive tracking structures");
+        errno = ENOMEM;
+        goto err;
+    }
+
+    rc = suspend_domain(ctx);
+    if ( rc )
+        goto err;
+
+    rc = send_all_pages(ctx);
+    if ( rc )
+        goto err;
+
+    IPRINTF("Sent all %lu pages", ctx->save.p2m_size - 1);
+
+ err:
+    free(ctx->save.deferred_pages);
+    free(ctx->save.batch_pfns);
+
+    return rc;
+}
+
+/*
+ * Save a domain.
+ */
+static int save(struct xc_sr_context *ctx, uint16_t guest_type)
+{
+    xc_interface *xch = ctx->xch;
+    int rc, saved_rc = 0, saved_errno = 0;
+
+    IPRINTF("Saving domain %d, type %s",
+            ctx->domid, dhdr_type_to_str(guest_type));
+
+    rc = ctx->save.ops.setup(ctx);
+    if ( rc )
+        goto err;
+
+    rc = write_headers(ctx, guest_type);
+    if ( rc )
+        goto err;
+
+    rc = arch_write_saving_cpu(ctx);
+    if ( rc )
+        goto err;
+
+    rc = ctx->save.ops.start_of_stream(ctx);
+    if ( rc )
+        goto err;
+
+    if ( ctx->save.live )
+    {
+        DPRINTF("Starting live migrate");
+        rc = send_domain_memory_live(ctx);
+    }
+    else
+    {
+        DPRINTF("Starting nonlive save");
+        rc = send_domain_memory_nonlive(ctx);
+    }
+
+    if ( rc )
+        goto err;
+
+    /* Refresh domain information now it has paused. */
+    if ( (xc_domain_getinfo(xch, ctx->domid, 1, &ctx->dominfo) != 1) ||
+         (ctx->dominfo.domid != ctx->domid) )
+    {
+        PERROR("Unable to refresh domain information");
+        rc = -1;
+        goto err;
+    }
+    else if ( (!ctx->dominfo.shutdown ||
+               ctx->dominfo.shutdown_reason != SHUTDOWN_suspend ) &&
+              !ctx->dominfo.paused )
+    {
+        ERROR("Domain has not been suspended");
+        rc = -1;
+        goto err;
+    }
+
+    rc = ctx->save.ops.end_of_stream(ctx);
+    if ( rc )
+        goto err;
+
+    rc = write_end_record(ctx);
+    if ( rc )
+        goto err;
+
+    IPRINTF("Save successful");
+    goto done;
+
+ err:
+    saved_errno = errno;
+    saved_rc = rc;
+    PERROR("Save failed");
+
+ done:
+    xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                      NULL, 0, NULL, 0, NULL);
+
+    rc = ctx->save.ops.cleanup(ctx);
+    if ( rc )
+        PERROR("Failed to clean up");
+
+    if ( saved_rc )
+    {
+        rc = saved_rc;
+        errno = saved_errno;
+    }
+
+    return rc;
+};
+
 int xc_domain_save2(xc_interface *xch, int io_fd, uint32_t dom, uint32_t 
max_iters,
                     uint32_t max_factor, uint32_t flags,
                     struct save_callbacks* callbacks, int hvm)
 {
+    struct xc_sr_context ctx =
+        {
+            .xch = xch,
+            .fd = io_fd,
+        };
+
+    /* GCC 4.4 (of CentOS 6.x vintage) can' t initialise anonymous unions :( */
+    ctx.save.callbacks = callbacks;
+    ctx.save.live  = !!(flags & XCFLAGS_LIVE);
+    ctx.save.debug = !!(flags & XCFLAGS_DEBUG);
+
     IPRINTF("In experimental %s", __func__);
-    return -1;
+    DPRINTF("fd %d, dom %"PRIu32", max_iters %"PRIu32", max_factor %"PRIu32
+            ", flags %"PRIu32", hvm %d", io_fd, dom, max_iters, max_factor,
+            flags, hvm);
+
+    if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
+    {
+        PERROR("Failed to get domain info");
+        return -1;
+    }
+
+    if ( ctx.dominfo.domid != dom )
+    {
+        ERROR("Domain %"PRIu32" does not exist", dom);
+        return -1;
+    }
+
+    ctx.domid = dom;
+    IPRINTF("Saving domain %"PRIu32, dom);
+
+    ctx.save.p2m_size = xc_domain_maximum_gpfn(xch, dom) + 1;
+    if ( ctx.save.p2m_size > ~XEN_DOMCTL_PFINFO_LTAB_MASK )
+    {
+        errno = E2BIG;
+        ERROR("Cannot save this big a guest");
+        return -1;
+    }
+
+    if ( ctx.dominfo.hvm )
+    {
+        ctx.save.ops = save_ops_x86_hvm;
+        return save(&ctx, DHDR_TYPE_X86_HVM);
+    }
+    else
+    {
+        ctx.save.ops = save_ops_x86_pv;
+        return save(&ctx, DHDR_TYPE_X86_PV);
+    }
 }
 
 /*
-- 
1.7.10.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.