[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [PATCH v5 RFC 13/14] tools/libxc: noarch save code
At 06/12/2014 02:14 AM, Andrew Cooper Wrote: > Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> > Signed-off-by: Frediano Ziglio <frediano.ziglio@xxxxxxxxxx> > Signed-off-by: David Vrabel <david.vrabel@xxxxxxxxxx> > --- > tools/libxc/saverestore/save.c | 545 > +++++++++++++++++++++++++++++++++++++++- > 1 file changed, 544 insertions(+), 1 deletion(-) > > diff --git a/tools/libxc/saverestore/save.c b/tools/libxc/saverestore/save.c > index f6ad734..9ad43a5 100644 > --- a/tools/libxc/saverestore/save.c > +++ b/tools/libxc/saverestore/save.c > @@ -1,11 +1,554 @@ > +#include <assert.h> > +#include <arpa/inet.h> > + > #include "common.h" > > +/* > + * Writes an Image header and Domain header into the stream. > + */ > +static int write_headers(struct context *ctx, uint16_t guest_type) > +{ > + xc_interface *xch = ctx->xch; > + int32_t xen_version = xc_version(xch, XENVER_version, NULL); > + struct ihdr ihdr = > + { > + .marker = IHDR_MARKER, > + .id = htonl(IHDR_ID), > + .version = htonl(IHDR_VERSION), > + .options = htons(IHDR_OPT_LITTLE_ENDIAN), > + }; > + struct dhdr dhdr = > + { > + .type = guest_type, > + .page_shift = XC_PAGE_SHIFT, > + .xen_major = (xen_version >> 16) & 0xffff, > + .xen_minor = (xen_version) & 0xffff, > + }; > + > + if ( xen_version < 0 ) > + { > + PERROR("Unable to obtain Xen Version"); > + return -1; > + } > + > + if ( write_exact(ctx->fd, &ihdr, sizeof(ihdr)) ) > + { > + PERROR("Unable to write Image Header to stream"); > + return -1; > + } > + > + if ( write_exact(ctx->fd, &dhdr, sizeof(dhdr)) ) > + { > + PERROR("Unable to write Domain Header to stream"); > + return -1; > + } > + > + return 0; > +} > + > +/* > + * Writes an END record into the stream. > + */ > +static int write_end_record(struct context *ctx) > +{ > + struct record end = { REC_TYPE_END, 0, NULL }; > + > + return write_record(ctx, &end); > +} > + > +/* > + * Writes a batch of memory as a PAGE_DATA record into the stream. 
The batch > + * is constructed in ctx->save.batch_pfns. > + * > + * This function: > + * - gets the types for each pfn in the batch. > + * - for each pfn with real data: > + * - maps and attempts to localise the pages. > + * - construct and writes a PAGE_DATA record into the stream. > + */ > +static int write_batch(struct context *ctx) > +{ > + xc_interface *xch = ctx->xch; > + xen_pfn_t *mfns = NULL, *types = NULL; > + void *guest_mapping = NULL; > + void **guest_data = NULL; > + void **local_pages = NULL; > + int *errors = NULL, rc = -1; > + unsigned i, p, nr_pages = 0; > + unsigned nr_pfns = ctx->save.nr_batch_pfns; > + void *page, *orig_page; > + uint64_t *rec_pfns = NULL; > + struct rec_page_data_header hdr = { 0 }; > + struct record rec = > + { > + .type = REC_TYPE_PAGE_DATA, > + }; > + > + assert(nr_pfns != 0); > + > + /* Mfns of the batch pfns. */ > + mfns = malloc(nr_pfns * sizeof(*mfns)); > + /* Types of the batch pfns. */ > + types = malloc(nr_pfns * sizeof(*types)); > + /* Errors from attempting to map the mfns. */ > + errors = malloc(nr_pfns * sizeof(*errors)); > + /* Pointers to page data to send. Either mapped mfns or local > allocations. */ > + guest_data = calloc(nr_pfns, sizeof(*guest_data)); > + /* Pointers to locally allocated pages. Need freeing. */ > + local_pages = calloc(nr_pfns, sizeof(*local_pages)); This function is called many times, so we allocate and free this memory again and again, which may hurt performance. I think we can allocate these buffers once at the setup stage, and only clear guest_data/local_pages here. > + > + if ( !mfns || !types || !errors || !guest_data || !local_pages ) > + { > + ERROR("Unable to allocate arrays for a batch of %u pages", > + nr_pfns); > + goto err; > + } > + > + for ( i = 0; i < nr_pfns; ++i ) > + { > + types[i] = mfns[i] = ctx->ops.pfn_to_gfn(ctx, > ctx->save.batch_pfns[i]); > + > + /* Likely a ballooned page. 
*/ > + if ( mfns[i] == INVALID_MFN ) > + set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages); > + } > + > + rc = xc_get_pfn_type_batch(xch, ctx->domid, nr_pfns, types); > + if ( rc ) > + { > + PERROR("Failed to get types for pfn batch"); > + goto err; > + } > + rc = -1; > + > + for ( i = 0; i < nr_pfns; ++i ) > + { > + switch ( types[i] ) > + { > + case XEN_DOMCTL_PFINFO_BROKEN: > + case XEN_DOMCTL_PFINFO_XALLOC: > + case XEN_DOMCTL_PFINFO_XTAB: > + continue; > + } > + > + mfns[nr_pages++] = mfns[i]; > + } > + > + if ( nr_pages > 0 ) > + { > + guest_mapping = xc_map_foreign_bulk( > + xch, ctx->domid, PROT_READ, mfns, errors, nr_pages); > + if ( !guest_mapping ) > + { > + PERROR("Failed to map guest pages"); > + goto err; > + } To support Remus, we will map/unmap guest memory again and again, which also hurts performance. We could cache the guest mapping here. Thanks, Wen Congyang > + } > + > + for ( i = 0, p = 0; i < nr_pfns; ++i ) > + { > + switch ( types[i] ) > + { > + case XEN_DOMCTL_PFINFO_BROKEN: > + case XEN_DOMCTL_PFINFO_XALLOC: > + case XEN_DOMCTL_PFINFO_XTAB: > + continue; > + } > + > + if ( errors[p] ) > + { > + ERROR("Mapping of pfn %#lx (mfn %#lx) failed %d", > + ctx->save.batch_pfns[i], mfns[p], errors[p]); > + goto err; > + } > + > + orig_page = page = guest_mapping + (p * PAGE_SIZE); > + rc = ctx->save.ops.normalise_page(ctx, types[i], &page); > + if ( rc ) > + { > + if ( rc == -1 && errno == EAGAIN ) > + { > + set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages); > + types[i] = XEN_DOMCTL_PFINFO_XTAB; > + --nr_pages; > + } > + else > + goto err; > + } > + else > + guest_data[i] = page; > + > + if ( page != orig_page ) > + local_pages[i] = page; > + rc = -1; > + > + ++p; > + } > + > + rec_pfns = malloc(nr_pfns * sizeof(*rec_pfns)); > + if ( !rec_pfns ) > + { > + ERROR("Unable to allocate %zu bytes of memory for page data pfn > list", > + nr_pfns * sizeof(*rec_pfns)); > + goto err; > + } > + > + hdr.count = nr_pfns; > + > + rec.length = 
sizeof(hdr); > + rec.length += nr_pfns * sizeof(*rec_pfns); > + rec.length += nr_pages * PAGE_SIZE; > + > + for ( i = 0; i < nr_pfns; ++i ) > + rec_pfns[i] = ((uint64_t)(types[i]) << 32) | ctx->save.batch_pfns[i]; > + > + if ( write_record_header(ctx, &rec) || > + write_exact(ctx->fd, &hdr, sizeof(hdr)) || > + write_exact(ctx->fd, rec_pfns, nr_pfns * sizeof(*rec_pfns)) ) > + { > + PERROR("Failed to write page_type header to stream"); > + goto err; > + } > + > + for ( i = 0; i < nr_pfns; ++i ) > + { > + if ( guest_data[i] ) > + { > + if ( write_exact(ctx->fd, guest_data[i], PAGE_SIZE) ) > + { > + PERROR("Failed to write page into stream"); > + goto err; > + } > + > + --nr_pages; > + } > + } > + > + /* Sanity check we have sent all the pages we expected to. */ > + assert(nr_pages == 0); > + rc = ctx->save.nr_batch_pfns = 0; > + > + err: > + free(rec_pfns); > + if ( guest_mapping ) > + munmap(guest_mapping, nr_pages * PAGE_SIZE); > + for ( i = 0; local_pages && i < nr_pfns; ++i ) > + free(local_pages[i]); > + free(local_pages); > + free(guest_data); > + free(errors); > + free(types); > + free(mfns); > + > + return rc; > +} > + > +/* > + * Flush a batch of pfns into the stream. > + */ > +static int flush_batch(struct context *ctx) > +{ > + int rc = 0; > + > + if ( ctx->save.nr_batch_pfns == 0 ) > + return rc; > + > + rc = write_batch(ctx); > + > + if ( !rc ) > + { > + VALGRIND_MAKE_MEM_UNDEFINED(ctx->save.batch_pfns, > + MAX_BATCH_SIZE * > sizeof(*ctx->save.batch_pfns)); > + } > + > + return rc; > +} > + > +/* > + * Add a single pfn to the batch, flushing the batch if full. > + */ > +static int add_to_batch(struct context *ctx, xen_pfn_t pfn) > +{ > + int rc = 0; > + > + if ( ctx->save.nr_batch_pfns == MAX_BATCH_SIZE ) > + rc = flush_batch(ctx); > + > + if ( rc == 0 ) > + ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn; > + > + return rc; > +} > + > +/* > + * Pause the domain. 
> + */ > +static int pause_domain(struct context *ctx) > +{ > + xc_interface *xch = ctx->xch; > + int rc; > + > + if ( !ctx->dominfo.paused ) > + { > + /* TODO: Properly specify the return value from this callback. */ > + rc = (ctx->save.callbacks->suspend(ctx->save.callbacks->data) != 1); > + if ( rc ) > + { > + ERROR("Failed to suspend domain"); > + return rc; > + } > + } > + > + IPRINTF("Domain now paused"); > + return 0; > +} > + > +/* > + * Send all domain memory. This is the heart of the live migration loop. > + */ > +static int send_domain_memory(struct context *ctx) > +{ > + xc_interface *xch = ctx->xch; > + DECLARE_HYPERCALL_BUFFER(unsigned long, to_send); > + xc_shadow_op_stats_t stats = { -1, -1 }; > + unsigned pages_written; > + unsigned x, max_iter = 5, dirty_threshold = 50; > + xen_pfn_t p; > + int rc = -1; > + > + to_send = xc_hypercall_buffer_alloc_pages( > + xch, to_send, NRPAGES(bitmap_size(ctx->save.p2m_size))); > + > + ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE * > sizeof(*ctx->save.batch_pfns)); > + ctx->save.deferred_pages = calloc(1, bitmap_size(ctx->save.p2m_size)); > + > + if ( !ctx->save.batch_pfns || !to_send || !ctx->save.deferred_pages ) > + { > + ERROR("Unable to allocate memory for to_{send,fix}/batch bitmaps"); > + goto out; > + } > + > + if ( xc_shadow_control(xch, ctx->domid, > + XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, > + NULL, 0, NULL, 0, NULL) < 0 ) > + { > + PERROR("Failed to enable logdirty"); > + goto out; > + } > + > + for ( x = 0, pages_written = 0; x < max_iter ; ++x ) > + { > + if ( x == 0 ) > + { > + /* First iteration, send all pages. */ > + memset(to_send, 0xff, bitmap_size(ctx->save.p2m_size)); > + } > + else > + { > + /* Else consult the dirty bitmap. 
*/ > + if ( xc_shadow_control( > + xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN, > + HYPERCALL_BUFFER(to_send), ctx->save.p2m_size, > + NULL, 0, &stats) != ctx->save.p2m_size ) > + { > + PERROR("Failed to retrieve logdirty bitmap"); > + rc = -1; > + goto out; > + } > + else > + DPRINTF(" Wrote %u pages; stats: faults %"PRIu32", dirty > %"PRIu32, > + pages_written, stats.fault_count, stats.dirty_count); > + pages_written = 0; > + > + if ( stats.dirty_count < dirty_threshold ) > + break; > + } > + > + DPRINTF("Iteration %u", x); > + > + for ( p = 0 ; p < ctx->save.p2m_size; ++p ) > + { > + if ( test_bit(p, to_send) ) > + { > + rc = add_to_batch(ctx, p); > + if ( rc ) > + goto out; > + ++pages_written; > + } > + } > + > + rc = flush_batch(ctx); > + if ( rc ) > + goto out; > + } > + > + rc = pause_domain(ctx); > + if ( rc ) > + goto out; > + > + if ( xc_shadow_control( > + xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN, > + HYPERCALL_BUFFER(to_send), ctx->save.p2m_size, > + NULL, 0, &stats) != ctx->save.p2m_size ) > + { > + PERROR("Failed to retrieve logdirty bitmap"); > + rc = -1; > + goto out; > + } > + > + for ( p = 0, pages_written = 0 ; p < ctx->save.p2m_size; ++p ) > + { > + if ( test_bit(p, to_send) || test_bit(p, ctx->save.deferred_pages) ) > + { > + rc = add_to_batch(ctx, p); > + if ( rc ) > + goto out; > + ++pages_written; > + } > + } > + > + rc = flush_batch(ctx); > + if ( rc ) > + goto out; > + > + DPRINTF(" Wrote %u pages", pages_written); > + IPRINTF("Sent all pages"); > + > + out: > + xc_hypercall_buffer_free_pages(xch, to_send, > + NRPAGES(bitmap_size(ctx->save.p2m_size))); > + free(ctx->save.deferred_pages); > + free(ctx->save.batch_pfns); > + return rc; > +} > + > +/* > + * Save a domain. 
> + */ > +static int save(struct context *ctx, uint16_t guest_type) > +{ > + xc_interface *xch = ctx->xch; > + int rc, saved_rc = 0, saved_errno = 0; > + > + IPRINTF("Saving domain %d, type %s", > + ctx->domid, dhdr_type_to_str(guest_type)); > + > + rc = ctx->save.ops.setup(ctx); > + if ( rc ) > + goto err; > + > + rc = write_headers(ctx, guest_type); > + if ( rc ) > + goto err; > + > + rc = ctx->save.ops.start_of_stream(ctx); > + if ( rc ) > + goto err; > + > + rc = send_domain_memory(ctx); > + if ( rc ) > + goto err; > + > + /* Refresh domain information now it has paused. */ > + if ( (xc_domain_getinfo(xch, ctx->domid, 1, &ctx->dominfo) != 1) || > + (ctx->dominfo.domid != ctx->domid) ) > + { > + PERROR("Unable to refresh domain information"); > + rc = -1; > + goto err; > + } > + else if ( (!ctx->dominfo.shutdown || > + ctx->dominfo.shutdown_reason != SHUTDOWN_suspend ) && > + !ctx->dominfo.paused ) > + { > + ERROR("Domain has not been suspended"); > + rc = -1; > + goto err; > + } > + > + rc = ctx->save.ops.end_of_stream(ctx); > + if ( rc ) > + goto err; > + > + rc = write_end_record(ctx); > + if ( rc ) > + goto err; > + > + xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF, > + NULL, 0, NULL, 0, NULL); > + > + IPRINTF("Save successful"); > + goto done; > + > + err: > + saved_errno = errno; > + saved_rc = rc; > + PERROR("Save failed"); > + > + done: > + rc = ctx->save.ops.cleanup(ctx); > + if ( rc ) > + PERROR("Failed to clean up"); > + > + if ( saved_rc ) > + { > + rc = saved_rc; > + errno = saved_errno; > + } > + > + return rc; > +}; > + > int xc_domain_save2(xc_interface *xch, int io_fd, uint32_t dom, uint32_t > max_iters, > uint32_t max_factor, uint32_t flags, > struct save_callbacks* callbacks, int hvm) > { > + struct context ctx = > + { > + .xch = xch, > + .fd = io_fd, > + }; > + > + /* GCC 4.4 (of CentOS 6.x vintage) can' t initialise anonymous unions :( > */ > + ctx.save.callbacks = callbacks; > + > IPRINTF("In experimental %s", __func__); > - 
return -1; > + > + if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 ) > + { > + PERROR("Failed to get domain info"); > + return -1; > + } > + > + if ( ctx.dominfo.domid != dom ) > + { > + ERROR("Domain %d does not exist", dom); > + return -1; > + } > + > + ctx.domid = dom; > + IPRINTF("Saving domain %d", dom); > + > + ctx.save.p2m_size = xc_domain_maximum_gpfn(xch, dom) + 1; > + if ( ctx.save.p2m_size > ~XEN_DOMCTL_PFINFO_LTAB_MASK ) > + { > + errno = E2BIG; > + ERROR("Cannot save this big a guest"); > + return -1; > + } > + > + if ( ctx.dominfo.hvm ) > + { > + ctx.ops = common_ops_x86_hvm; > + ctx.save.ops = save_ops_x86_hvm; > + return save(&ctx, DHDR_TYPE_X86_HVM); > + } > + else > + { > + ctx.ops = common_ops_x86_pv; > + ctx.save.ops = save_ops_x86_pv; > + return save(&ctx, DHDR_TYPE_X86_PV); > + } > } > > /* > _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |