Xen project Mailing List

[Xen-changelog] [xen-unstable] tools/libxc: Remus Checkpoint Compression

From: Xen patchbot-unstable <patchbot@xxxxxxx>

Date: Sat, 10 Dec 2011 09:33:15 +0000

Delivery-date: Sat, 10 Dec 2011 09:33:24 +0000

List-id: BK change log <xen-changelog.lists.xensource.com>

# HG changeset patch # User Shriram Rajagopalan <rshriram@xxxxxxxxx> # Date 1322753775 0 # Node ID aa273f3a2082a5b0dfbd01cdde071cdeaf4bd180 # Parent 8cc7470020f72de82968eb9b4326fcf062895c1a tools/libxc: Remus Checkpoint Compression Instead of sending dirty pages of guest memory as-is, use a simple compression algorithm that sends a RLE-encoded XOR of the page against its last sent copy. A small LRU cache is used to hold recently dirtied pages. Pagetable pages are sent as-is, as they are canonicalized at sender side and uncanonicalized at receiver. [ Fixed up a conflict in sg_save_restore.h. I had to increase the ID values used from -11 and -12 to -12 and -13 because -11 had been taken by ..._HVM_VIRIDIAN in the meantime. -iwj ] Signed-off-by: Shriram Rajagopalan <rshriram@xxxxxxxxx> Acked-by: Brendan Cully <brendan@xxxxxxxxx> Acked-by: Ian Jackson <ian.jackson@xxxxxxxxxxxxx> Committed-by: Ian Jackson <ian.jackson@xxxxxxxxxxxxx> --- diff -r 8cc7470020f7 -r aa273f3a2082 tools/libxc/Makefile --- a/tools/libxc/Makefile Thu Dec 01 15:35:02 2011 +0000 +++ b/tools/libxc/Makefile Thu Dec 01 15:36:15 2011 +0000 @@ -42,7 +42,7 @@ GUEST_SRCS-y := GUEST_SRCS-y += xg_private.c xc_suspend.c GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c -GUEST_SRCS-$(CONFIG_MIGRATE) += xc_offline_page.c +GUEST_SRCS-$(CONFIG_MIGRATE) += xc_offline_page.c xc_compression.c GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c vpath %.c ../../xen/common/libelf diff -r 8cc7470020f7 -r aa273f3a2082 tools/libxc/xc_compression.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/libxc/xc_compression.c Thu Dec 01 15:36:15 2011 +0000 @@ -0,0 +1,552 @@ +/****************************************************************************** + * xc_compression.c + * + * Checkpoint Compression using Page Delta Algorithm. + * - A LRU cache of recently dirtied guest pages is maintained. + * - For each dirty guest page in the checkpoint, if a previous version of the + * page exists in the cache, XOR both pages and send the non-zero sections + * to the receiver. The cache is then updated with the newer copy of guest page. + * - The receiver will XOR the non-zero sections against its copy of the guest + * page, thereby bringing the guest page up-to-date with the sender side. + * + * Copyright (c) 2011 Shriram Rajagopalan (rshriram@xxxxxxxxx). + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/types.h> +#include <inttypes.h> +#include <errno.h> +#include "xc_private.h" +#include "xenctrl.h" +#include "xg_save_restore.h" +#include "xg_private.h" +#include "xc_dom.h" + +/* Page Cache for Delta Compression*/ +#define DELTA_CACHE_SIZE (XC_PAGE_SIZE * 8192) + +/* Internal page buffer to hold dirty pages of a checkpoint, + * to be compressed after the domain is resumed for execution. + */ +#define PAGE_BUFFER_SIZE (XC_PAGE_SIZE * 8192) + +struct cache_page +{ + char *page; + xen_pfn_t pfn; + struct cache_page *next; + struct cache_page *prev; +}; + +struct compression_ctx +{ + /* compression buffer - holds compressed data */ + char *compbuf; + unsigned long compbuf_size; + unsigned long compbuf_pos; + + /* Page buffer to hold pages to be compressed */ + char *inputbuf; + /* pfns of pages to be compressed */ + xen_pfn_t *sendbuf_pfns; + unsigned int pfns_len; + unsigned int pfns_index; + + /* Compression Cache (LRU) */ + char *cache_base; + struct cache_page **pfn2cache; + struct cache_page *cache; + struct cache_page *page_list_head; + struct cache_page *page_list_tail; + unsigned long dom_pfnlist_size; +}; + +#define RUNFLAG 0 +#define SKIPFLAG ((char)128) +#define FLAGMASK SKIPFLAG +#define LENMASK ((char)127) + +/* + * see xg_save_restore.h for details on the compressed stream format. + * delta size = 4 bytes. + * run header = 1 byte (1 bit for runtype, 7bits for run length). + * i.e maximum size of a run = 127 * 4 = 508 bytes. + * Worst case compression: Entire page has changed. + * In the worst case, the size of the compressed page is + * 8 runs of 508 bytes + 1 run of 32 bytes + 9 run headers + * = 4105 bytes. + * We could detect this worst case and send the entire page with a + * FULL_PAGE marker, reducing the total size to 4097 bytes. The cost + * of this size reduction is an additional memcpy, on top of two previous + * memcpy (to the compressed stream and the cache page in the for loop). + * + * We might as well sacrifice an extra 8 bytes instead of a memcpy. + */ +#define WORST_COMP_PAGE_SIZE (XC_PAGE_SIZE + 9) + +/* + * A zero length skip indicates full page. + */ +#define EMPTY_PAGE 0 +#define FULL_PAGE SKIPFLAG +#define FULL_PAGE_SIZE (XC_PAGE_SIZE + 1) +#define MAX_DELTAS (XC_PAGE_SIZE/sizeof(uint32_t)) + +/* + * Add a pagetable page or a new page (uncached) + * if srcpage is a pagetable page, cache_page is null. + * if srcpage is a page that was not previously in the cache, + * cache_page points to a free page slot in the cache where + * this new page can be copied to. + */ +static int add_full_page(comp_ctx *ctx, char *srcpage, char *cache_page) +{ + char *dest = (ctx->compbuf + ctx->compbuf_pos); + + if ( (ctx->compbuf_pos + FULL_PAGE_SIZE) > ctx->compbuf_size) + return -1; + + if (cache_page) + memcpy(cache_page, srcpage, XC_PAGE_SIZE); + dest[0] = FULL_PAGE; + memcpy(&dest[1], srcpage, XC_PAGE_SIZE); + ctx->compbuf_pos += FULL_PAGE_SIZE; + + return FULL_PAGE_SIZE; +} + +static int compress_page(comp_ctx *ctx, char *srcpage, char *cache_page) +{ + char *dest = (ctx->compbuf + ctx->compbuf_pos); + uint32_t *new, *old; + + int off, runptr = 0; + int wascopying = 0, copying = 0, bytes_skipped = 0; + int complen = 0, pageoff = 0, runbytes = 0; + + char runlen = 0; + + if ( (ctx->compbuf_pos + WORST_COMP_PAGE_SIZE) > ctx->compbuf_size) + return -1; + + /* + * There are no alignment issues here since srcpage is + * domU's page passed from xc_domain_save and cache_page is + * a ptr to cache page (cache is page aligned). + */ + new = (uint32_t*)srcpage; + old = (uint32_t*)cache_page; + + for (off = 0; off <= MAX_DELTAS; off++) + { + /* + * At (off == MAX_DELTAS), we are processing the last run + * in the page. Since there is no XORing, make wascopying != copying + * to satisfy the if-block below. + */ + copying = ((off < MAX_DELTAS) ? (old[off] != new[off]) : !wascopying); + + if (runlen) + { + /* switching between run types or current run is full */ + if ( (wascopying != copying) || (runlen == LENMASK) ) + { + runbytes = runlen * sizeof(uint32_t); + runlen |= (wascopying ? RUNFLAG : SKIPFLAG); + dest[complen++] = runlen; + + if (wascopying) /* RUNFLAG */ + { + pageoff = runptr * sizeof(uint32_t); + memcpy(dest + complen, srcpage + pageoff, runbytes); + memcpy(cache_page + pageoff, srcpage + pageoff, runbytes); + complen += runbytes; + } + else /* SKIPFLAG */ + { + bytes_skipped += runbytes; + } + + runlen = 0; + runptr = off; + } + } + runlen++; + wascopying = copying; + } + + /* + * Check for empty page. + */ + if (bytes_skipped == XC_PAGE_SIZE) + { + complen = 1; + dest[0] = EMPTY_PAGE; + } + ctx->compbuf_pos += complen; + + return complen; +} + +static +char *get_cache_page(comp_ctx *ctx, xen_pfn_t pfn, + int *israw) +{ + struct cache_page *item = NULL; + + item = ctx->pfn2cache[pfn]; + + if (!item) + { + *israw = 1; + + /* If the list is full, evict a page from the tail end. */ + item = ctx->page_list_tail; + if (item->pfn != INVALID_P2M_ENTRY) + ctx->pfn2cache[item->pfn] = NULL; + + item->pfn = pfn; + ctx->pfn2cache[pfn] = item; + } + + /* if requested item is in cache move to head of list */ + if (item != ctx->page_list_head) + { + if (item == ctx->page_list_tail) + { + /* item at tail of list. */ + ctx->page_list_tail = item->prev; + (ctx->page_list_tail)->next = NULL; + } + else + { + /* item in middle of list */ + item->prev->next = item->next; + item->next->prev = item->prev; + } + + item->prev = NULL; + item->next = ctx->page_list_head; + (ctx->page_list_head)->prev = item; + ctx->page_list_head = item; + } + + return (ctx->page_list_head)->page; +} + +/* Remove pagetable pages from cache and move to tail, as free pages */ +static +void invalidate_cache_page(comp_ctx *ctx, xen_pfn_t pfn) +{ + struct cache_page *item = NULL; + + item = ctx->pfn2cache[pfn]; + if (item) + { + if (item != ctx->page_list_tail) + { + /* item at head of list */ + if (item == ctx->page_list_head) + { + ctx->page_list_head = (ctx->page_list_head)->next; + (ctx->page_list_head)->prev = NULL; + } + else /* item in middle of list */ + { + item->prev->next = item->next; + item->next->prev = item->prev; + } + + item->next = NULL; + item->prev = ctx->page_list_tail; + (ctx->page_list_tail)->next = item; + ctx->page_list_tail = item; + } + ctx->pfn2cache[pfn] = NULL; + (ctx->page_list_tail)->pfn = INVALID_P2M_ENTRY; + } +} + +int xc_compression_add_page(xc_interface *xch, comp_ctx *ctx, + char *page, xen_pfn_t pfn, int israw) +{ + if (pfn > ctx->dom_pfnlist_size) + { + ERROR("Invalid pfn passed into " + "xc_compression_add_page %" PRIpfn "\n", pfn); + return -2; + } + + /* pagetable page */ + if (israw) + invalidate_cache_page(ctx, pfn); + ctx->sendbuf_pfns[ctx->pfns_len] = israw ? INVALID_P2M_ENTRY : pfn; + memcpy(ctx->inputbuf + ctx->pfns_len * XC_PAGE_SIZE, page, XC_PAGE_SIZE); + ctx->pfns_len++; + + /* check if we have run out of space. If so, + * we need to synchronously compress the pages and flush them out + */ + if (ctx->pfns_len == NRPAGES(PAGE_BUFFER_SIZE)) + return -1; + return 0; +} + +int xc_compression_compress_pages(xc_interface *xch, comp_ctx *ctx, + char *compbuf, unsigned long compbuf_size, + unsigned long *compbuf_len) +{ + char *cache_copy = NULL, *current_page = NULL; + int israw, rc = 1; + + if (!ctx->pfns_len || (ctx->pfns_index == ctx->pfns_len)) { + ctx->pfns_len = ctx->pfns_index = 0; + return 0; + } + + ctx->compbuf_pos = 0; + ctx->compbuf = compbuf; + ctx->compbuf_size = compbuf_size; + + for (; ctx->pfns_index < ctx->pfns_len; ctx->pfns_index++) + { + israw = 0; + cache_copy = NULL; + current_page = ctx->inputbuf + ctx->pfns_index * XC_PAGE_SIZE; + + if (ctx->sendbuf_pfns[ctx->pfns_index] == INVALID_P2M_ENTRY) + israw = 1; + else + cache_copy = get_cache_page(ctx, + ctx->sendbuf_pfns[ctx->pfns_index], + &israw); + + if (israw) + rc = (add_full_page(ctx, current_page, cache_copy) >= 0); + else + rc = (compress_page(ctx, current_page, cache_copy) >= 0); + + if ( !rc ) + { + /* Out of space in outbuf! flush and come back */ + rc = -1; + break; + } + } + if (compbuf_len) + *compbuf_len = ctx->compbuf_pos; + + return rc; +} + +inline +void xc_compression_reset_pagebuf(xc_interface *xch, comp_ctx *ctx) +{ + ctx->pfns_index = ctx->pfns_len = 0; +} + +int xc_compression_uncompress_page(xc_interface *xch, char *compbuf, + unsigned long compbuf_size, + unsigned long *compbuf_pos, char *destpage) +{ + unsigned long pos; + unsigned int len = 0, pagepos = 0; + char flag; + + pos = *compbuf_pos; + if (pos >= compbuf_size) + { + ERROR("Out of bounds exception in compression buffer (a):" + "read ptr:%lu, bufsize = %lu\n", + *compbuf_pos, compbuf_size); + return -1; + } + + switch (compbuf[pos]) + { + case EMPTY_PAGE: + pos++; + break; + + case FULL_PAGE: + { + /* Check if the input buffer has 4KB of data */ + if ((pos + FULL_PAGE_SIZE) > compbuf_size) + { + ERROR("Out of bounds exception in compression buffer (b):" + "read ptr = %lu, bufsize = %lu\n", + *compbuf_pos, compbuf_size); + return -1; + } + memcpy(destpage, &compbuf[pos + 1], XC_PAGE_SIZE); + pos += FULL_PAGE_SIZE; + } + break; + + default: /* Normal page with one or more runs */ + { + do + { + flag = compbuf[pos] & FLAGMASK; + len = (compbuf[pos] & LENMASK) * sizeof(uint32_t); + /* Sanity Check: Zero-length runs are allowed only for + * FULL_PAGE and EMPTY_PAGE. + */ + if (!len) + { + ERROR("Zero length run encountered for normal page: " + "buffer (d):read ptr = %lu, flag = %u, " + "bufsize = %lu, pagepos = %u\n", + pos, (unsigned int)flag, compbuf_size, pagepos); + return -1; + } + + pos++; + if (flag == RUNFLAG) + { + /* Check if the input buffer has len bytes of data + * and whether it would fit in the destination page. + */ + if (((pos + len) > compbuf_size) + || ((pagepos + len) > XC_PAGE_SIZE)) + { + ERROR("Out of bounds exception in compression " + "buffer (c):read ptr = %lu, runlen = %u, " + "bufsize = %lu, pagepos = %u\n", + pos, len, compbuf_size, pagepos); + return -1; + } + memcpy(&destpage[pagepos], &compbuf[pos], len); + pos += len; + } + pagepos += len; + } while ((pagepos < XC_PAGE_SIZE) && (pos < compbuf_size)); + + /* Make sure we have copied/skipped 4KB worth of data */ + if (pagepos != XC_PAGE_SIZE) + { + ERROR("Invalid data in compression buffer:" + "read ptr = %lu, bufsize = %lu, pagepos = %u\n", + pos, compbuf_size, pagepos); + return -1; + } + } + } + *compbuf_pos = pos; + return 0; +} + +void xc_compression_free_context(xc_interface *xch, comp_ctx *ctx) +{ + if (!ctx) return; + + if (ctx->inputbuf) + free(ctx->inputbuf); + if (ctx->sendbuf_pfns) + free(ctx->sendbuf_pfns); + if (ctx->cache_base) + free(ctx->cache_base); + if (ctx->pfn2cache) + free(ctx->pfn2cache); + if (ctx->cache) + free(ctx->cache); + free(ctx); +} + +comp_ctx *xc_compression_create_context(xc_interface *xch, + unsigned long p2m_size) +{ + unsigned long i; + comp_ctx *ctx = NULL; + unsigned long num_cache_pages = DELTA_CACHE_SIZE/XC_PAGE_SIZE; + + ctx = (comp_ctx *)malloc(sizeof(comp_ctx)); + if (!ctx) + { + ERROR("Failed to allocate compression_ctx\n"); + goto error; + } + memset(ctx, 0, sizeof(comp_ctx)); + + ctx->inputbuf = xc_memalign(xch, XC_PAGE_SIZE, PAGE_BUFFER_SIZE); + if (!ctx->inputbuf) + { + ERROR("Failed to allocate page buffer\n"); + goto error; + } + + ctx->cache_base = xc_memalign(xch, XC_PAGE_SIZE, DELTA_CACHE_SIZE); + if (!ctx->cache_base) + { + ERROR("Failed to allocate delta cache\n"); + goto error; + } + + ctx->sendbuf_pfns = malloc(NRPAGES(PAGE_BUFFER_SIZE) * + sizeof(xen_pfn_t)); + if (!ctx->sendbuf_pfns) + { + ERROR("Could not alloc sendbuf_pfns\n"); + goto error; + } + memset(ctx->sendbuf_pfns, -1, + NRPAGES(PAGE_BUFFER_SIZE) * sizeof(xen_pfn_t)); + + ctx->pfn2cache = calloc(p2m_size, sizeof(struct cache_page *)); + if (!ctx->pfn2cache) + { + ERROR("Could not alloc pfn2cache map\n"); + goto error; + } + + ctx->cache = malloc(num_cache_pages * sizeof(struct cache_page)); + if (!ctx->cache) + { + ERROR("Could not alloc compression cache\n"); + goto error; + } + + for (i = 0; i < num_cache_pages; i++) + { + ctx->cache[i].pfn = INVALID_P2M_ENTRY; + ctx->cache[i].page = ctx->cache_base + i * XC_PAGE_SIZE; + ctx->cache[i].prev = (i == 0) ? NULL : &(ctx->cache[i - 1]); + ctx->cache[i].next = ((i+1) == num_cache_pages)? NULL : + &(ctx->cache[i + 1]); + } + ctx->page_list_head = &(ctx->cache[0]); + ctx->page_list_tail = &(ctx->cache[num_cache_pages -1]); + ctx->dom_pfnlist_size = p2m_size; + + return ctx; +error: + xc_compression_free_context(xch, ctx); + return NULL; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 8cc7470020f7 -r aa273f3a2082 tools/libxc/xc_domain_restore.c --- a/tools/libxc/xc_domain_restore.c Thu Dec 01 15:35:02 2011 +0000 +++ b/tools/libxc/xc_domain_restore.c Thu Dec 01 15:36:15 2011 +0000 @@ -43,6 +43,7 @@ xen_pfn_t *p2m_batch; /* A table of P2M mappings in the current region. */ int completed; /* Set when a consistent image is available */ int last_checkpoint; /* Set when we should commit to the current checkpoint when it completes. */ + int compressing; /* Set when sender signals that pages would be sent compressed (for Remus) */ struct domain_info_context dinfo; }; @@ -663,6 +664,10 @@ /* pages is of length nr_physpages, pfn_types is of length nr_pages */ unsigned int nr_physpages, nr_pages; + /* checkpoint compression state */ + int compressing; + unsigned long compbuf_pos, compbuf_size; + /* Types of the pfns in the current region */ unsigned long* pfn_types; @@ -701,6 +706,7 @@ { int count, countpages, oldcount, i; void* ptmp; + unsigned long compbuf_size; if ( RDEXACT(fd, &count, sizeof(count)) ) { @@ -820,6 +826,40 @@ } return pagebuf_get_one(xch, ctx, buf, fd, dom); + case XC_SAVE_ID_ENABLE_COMPRESSION: + /* We cannot set compression flag directly in pagebuf structure, + * since this pagebuf still has uncompressed pages that are yet to + * be applied. We enable the compression field in pagebuf structure + * after receiving the first tailbuf. + */ + ctx->compressing = 1; + // DPRINTF("compression flag received"); + return pagebuf_get_one(xch, ctx, buf, fd, dom); + + case XC_SAVE_ID_COMPRESSED_DATA: + + /* read the length of compressed chunk coming in */ + if ( RDEXACT(fd, &compbuf_size, sizeof(unsigned long)) ) + { + PERROR("Error when reading compbuf_size"); + return -1; + } + if (!compbuf_size) return 1; + + buf->compbuf_size += compbuf_size; + if (!(ptmp = realloc(buf->pages, buf->compbuf_size))) { + ERROR("Could not (re)allocate compression buffer"); + return -1; + } + buf->pages = ptmp; + + if ( RDEXACT(fd, buf->pages + (buf->compbuf_size - compbuf_size), + compbuf_size) ) { + PERROR("Error when reading compression buffer"); + return -1; + } + return compbuf_size; + default: if ( (count > MAX_BATCH_SIZE) || (count < 0) ) { ERROR("Max batch size exceeded (%d). Giving up.", count); @@ -857,6 +897,13 @@ if (!countpages) return count; + /* If Remus Checkpoint Compression is turned on, we will only be + * receiving the pfn lists now. The compressed pages will come in later, + * following a <XC_SAVE_ID_COMPRESSED_DATA, compressedChunkSize> tuple. + */ + if (buf->compressing) + return pagebuf_get_one(xch, ctx, buf, fd, dom); + oldcount = buf->nr_physpages; buf->nr_physpages += countpages; if (!buf->pages) { @@ -885,6 +932,7 @@ int rc; buf->nr_physpages = buf->nr_pages = 0; + buf->compbuf_pos = buf->compbuf_size = 0; do { rc = pagebuf_get_one(xch, ctx, buf, fd, dom); @@ -1102,7 +1150,21 @@ /* In verify mode, we use a copy; otherwise we work in place */ page = pagebuf->verify ? (void *)buf : (region_base + i*PAGE_SIZE); - memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE, PAGE_SIZE); + /* Remus - page decompression */ + if (pagebuf->compressing) + { + if (xc_compression_uncompress_page(xch, pagebuf->pages, + pagebuf->compbuf_size, + &pagebuf->compbuf_pos, + (char *)page)) + { + ERROR("Failed to uncompress page (pfn=%lx)\n", pfn); + goto err_mapped; + } + } + else + memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE, + PAGE_SIZE); pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; @@ -1364,6 +1426,7 @@ if ( !ctx->completed ) { pagebuf.nr_physpages = pagebuf.nr_pages = 0; + pagebuf.compbuf_pos = pagebuf.compbuf_size = 0; if ( pagebuf_get_one(xch, ctx, &pagebuf, io_fd, dom) < 0 ) { PERROR("Error when reading batch"); goto out; @@ -1406,6 +1469,7 @@ } pagebuf.nr_physpages = pagebuf.nr_pages = 0; + pagebuf.compbuf_pos = pagebuf.compbuf_size = 0; n += j; /* crude stats */ @@ -1449,6 +1513,13 @@ */ if ( !ctx->last_checkpoint ) fcntl(io_fd, F_SETFL, orig_io_fd_flags | O_NONBLOCK); + + /* + * If sender had sent enable compression flag, switch to compressed + * checkpoints mode once the first checkpoint is received. + */ + if (ctx->compressing) + pagebuf.compressing = 1; } if (pagebuf.viridian != 0) diff -r 8cc7470020f7 -r aa273f3a2082 tools/libxc/xc_domain_save.c --- a/tools/libxc/xc_domain_save.c Thu Dec 01 15:35:02 2011 +0000 +++ b/tools/libxc/xc_domain_save.c Thu Dec 01 15:36:15 2011 +0000 @@ -218,6 +218,56 @@ return noncached_write(xch, ob, fd, buf, len); } +static int write_compressed(xc_interface *xch, comp_ctx *compress_ctx, + int dobuf, struct outbuf* ob, int fd) +{ + int rc = 0; + int header = sizeof(int) + sizeof(unsigned long); + int marker = XC_SAVE_ID_COMPRESSED_DATA; + unsigned long compbuf_len = 0; + + do + { + /* check for available space (atleast 8k) */ + if ((ob->pos + header + XC_PAGE_SIZE * 2) > ob->size) + { + if (outbuf_flush(xch, ob, fd) < 0) + { + ERROR("Error when flushing outbuf intermediate"); + return -1; + } + } + + rc = xc_compression_compress_pages(xch, compress_ctx, + ob->buf + ob->pos + header, + ob->size - ob->pos - header, + &compbuf_len); + if (!rc) + return 0; + + if (outbuf_hardwrite(xch, ob, fd, &marker, sizeof(marker)) < 0) + { + PERROR("Error when writing marker (errno %d)", errno); + return -1; + } + + if (outbuf_hardwrite(xch, ob, fd, &compbuf_len, sizeof(compbuf_len)) < 0) + { + PERROR("Error when writing compbuf_len (errno %d)", errno); + return -1; + } + + ob->pos += (size_t) compbuf_len; + if (!dobuf && outbuf_flush(xch, ob, fd) < 0) + { + ERROR("Error when writing compressed chunk"); + return -1; + } + } while (rc != 0); + + return 0; +} + struct time_stats { struct timeval wall; long long d0_cpu, d1_cpu; @@ -815,11 +865,35 @@ unsigned long mfn; - struct outbuf ob; + /* Without checkpoint compression, the dirty pages, pfn arrays + * and tailbuf (vcpu ctx, shared info page, etc.) are written + * directly to outbuf. All of this is done while the domain is + * suspended. + * + * When checkpoint compression is enabled, the dirty pages are + * buffered, compressed "after" the domain is resumed and then + * written to outbuf. Since tailbuf data are collected while a + * domain is suspended, they cannot be directly written to the + * outbuf as there is no dirty page data preceeding tailbuf. + * + * So,two output buffers are maintained. Tailbuf data goes into + * ob_tailbuf. The dirty pages are compressed after resuming the + * domain and written to ob_pagebuf. ob_tailbuf is then appended + * to ob_pagebuf and finally flushed out. + */ + struct outbuf ob_pagebuf, ob_tailbuf, *ob = NULL; struct save_ctx _ctx; struct save_ctx *ctx = &_ctx; struct domain_info_context *dinfo = &ctx->dinfo; + /* Compression context */ + comp_ctx *compress_ctx= NULL; + /* Even if XCFLAGS_CHECKPOINT_COMPRESS is set, we enable compression only + * after sending XC_SAVE_ID_ENABLE_COMPRESSION and the tailbuf for + * first time. + */ + int compressing = 0; + int completed = 0; if ( hvm && !callbacks->switch_qemu_logdirty ) @@ -829,7 +903,7 @@ return 1; } - outbuf_init(xch, &ob, OUTBUF_SIZE); + outbuf_init(xch, &ob_pagebuf, OUTBUF_SIZE); memset(ctx, 0, sizeof(*ctx)); @@ -917,6 +991,16 @@ } } + if ( flags & XCFLAGS_CHECKPOINT_COMPRESS ) + { + if (!(compress_ctx = xc_compression_create_context(xch, dinfo->p2m_size))) + { + ERROR("Failed to create compression context"); + goto out; + } + outbuf_init(xch, &ob_tailbuf, OUTBUF_SIZE/4); + } + last_iter = !live; /* pretend we sent all the pages last iteration */ @@ -1025,9 +1109,11 @@ } copypages: -#define wrexact(fd, buf, len) write_buffer(xch, last_iter, &ob, (fd), (buf), (len)) -#define wruncached(fd, live, buf, len) write_uncached(xch, last_iter, &ob, (fd), (buf), (len)) +#define wrexact(fd, buf, len) write_buffer(xch, last_iter, ob, (fd), (buf), (len)) +#define wruncached(fd, live, buf, len) write_uncached(xch, last_iter, ob, (fd), (buf), (len)) +#define wrcompressed(fd) write_compressed(xch, compress_ctx, last_iter, ob, (fd)) + ob = &ob_pagebuf; /* Holds pfn_types, pages/compressed pages */ /* Now write out each data page, canonicalising page tables as we go... */ for ( ; ; ) { @@ -1270,7 +1356,7 @@ { /* If the page is not a normal data page, write out any run of pages we may have previously acumulated */ - if ( run ) + if ( !compressing && run ) { if ( wruncached(io_fd, live, (char*)region_base+(PAGE_SIZE*(j-run)), @@ -1305,7 +1391,41 @@ goto out; } - if ( wruncached(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE ) + if (compressing) + { + int c_err; + /* Mark pagetable page to be sent uncompressed */ + c_err = xc_compression_add_page(xch, compress_ctx, page, + pfn, 1 /* raw page */); + if (c_err == -2) /* OOB PFN */ + { + ERROR("Could not add pagetable page " + "(pfn:%" PRIpfn "to page buffer\n", pfn); + goto out; + } + + if (c_err == -1) + { + /* + * We are out of buffer space to hold dirty + * pages. Compress and flush the current buffer + * to make space. This is a corner case, that + * slows down checkpointing as the compression + * happens while domain is suspended. Happens + * seldom and if you find this occuring + * frequently, increase the PAGE_BUFFER_SIZE + * in xc_compression.c. + */ + if (wrcompressed(io_fd) < 0) + { + ERROR("Error when writing compressed" + " data (4b)\n"); + goto out; + } + } + } + else if ( wruncached(io_fd, live, page, + PAGE_SIZE) != PAGE_SIZE ) { PERROR("Error when writing to state file (4b)" " (errno %d)", errno); @@ -1315,7 +1435,34 @@ else { /* We have a normal page: accumulate it for writing. */ - run++; + if (compressing) + { + int c_err; + /* For checkpoint compression, accumulate the page in the + * page buffer, to be compressed later. + */ + c_err = xc_compression_add_page(xch, compress_ctx, spage, + pfn, 0 /* not raw page */); + + if (c_err == -2) /* OOB PFN */ + { + ERROR("Could not add page " + "(pfn:%" PRIpfn "to page buffer\n", pfn); + goto out; + } + + if (c_err == -1) + { + if (wrcompressed(io_fd) < 0) + { + ERROR("Error when writing compressed" + " data (4c)\n"); + goto out; + } + } + } + else + run++; } } /* end of the write out for this batch */ @@ -1423,6 +1570,15 @@ DPRINTF("All memory is saved\n"); + /* After last_iter, buffer the rest of pagebuf & tailbuf data into a + * separate output buffer and flush it after the compressed page chunks. + */ + if (compressing) + { + ob = &ob_tailbuf; + ob->pos = 0; + } + { struct { int id; @@ -1534,6 +1690,25 @@ } } + /* Enable compression logic on both sides by sending this + * one time marker. + * NOTE: We could have simplified this procedure by sending + * the enable/disable compression flag before the beginning of + * the main for loop. But this would break compatibility for + * live migration code, with older versions of xen. So we have + * to enable it after the last_iter, when the XC_SAVE_ID_* + * elements are sent. + */ + if (!compressing && (flags & XCFLAGS_CHECKPOINT_COMPRESS)) + { + i = XC_SAVE_ID_ENABLE_COMPRESSION; + if ( wrexact(io_fd, &i, sizeof(int)) ) + { + PERROR("Error when writing enable_compression marker"); + goto out; + } + } + /* Zero terminate */ i = 0; if ( wrexact(io_fd, &i, sizeof(int)) ) @@ -1778,14 +1953,38 @@ if ( !rc && callbacks->postcopy ) callbacks->postcopy(callbacks->data); + /* guest has been resumed. Now we can compress data + * at our own pace. + */ + if (!rc && compressing) + { + ob = &ob_pagebuf; + if (wrcompressed(io_fd) < 0) + { + ERROR("Error when writing compressed data, after postcopy\n"); + rc = 1; + goto out; + } + /* Append the tailbuf data to the main outbuf */ + if ( wrexact(io_fd, ob_tailbuf.buf, ob_tailbuf.pos) ) + { + rc = 1; + PERROR("Error when copying tailbuf into outbuf"); + goto out; + } + } + /* Flush last write and discard cache for file. */ - if ( outbuf_flush(xch, &ob, io_fd) < 0 ) { + if ( outbuf_flush(xch, ob, io_fd) < 0 ) { PERROR("Error when flushing output buffer"); rc = 1; } discard_file_cache(xch, io_fd, 1 /* flush */); + /* Enable compression now, finally */ + compressing = (flags & XCFLAGS_CHECKPOINT_COMPRESS); + /* checkpoint_cb can spend arbitrarily long in between rounds */ if (!rc && callbacks->checkpoint && callbacks->checkpoint(callbacks->data) > 0) @@ -1827,6 +2026,9 @@ DPRINTF("Warning - couldn't disable qemu log-dirty mode"); } + if (compress_ctx) + xc_compression_free_context(xch, compress_ctx); + if ( live_shinfo ) munmap(live_shinfo, PAGE_SIZE); diff -r 8cc7470020f7 -r aa273f3a2082 tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Thu Dec 01 15:35:02 2011 +0000 +++ b/tools/libxc/xenctrl.h Thu Dec 01 15:36:15 2011 +0000 @@ -1937,4 +1937,64 @@ int verbose); /* Useful for callers who also use libelf. */ +/** + * Checkpoint Compression + */ +typedef struct compression_ctx comp_ctx; +comp_ctx *xc_compression_create_context(xc_interface *xch, + unsigned long p2m_size); +void xc_compression_free_context(xc_interface *xch, comp_ctx *ctx); + +/** + * Add a page to compression page buffer, to be compressed later. + * + * returns 0 if the page was successfully added to the page buffer + * + * returns -1 if there is no space in buffer. In this case, the + * application should call xc_compression_compress_pages to compress + * the buffer (or atleast part of it), thereby freeing some space in + * the page buffer. + * + * returns -2 if the pfn is out of bounds, where the bound is p2m_size + * parameter passed during xc_compression_create_context. + */ +int xc_compression_add_page(xc_interface *xch, comp_ctx *ctx, char *page, + unsigned long pfn, int israw); + +/** + * Delta compress pages in the compression buffer and inserts the + * compressed data into the supplied compression buffer compbuf, whose + * size is compbuf_size. + * After compression, the pages are copied to the internal LRU cache. + * + * This function compresses as many pages as possible into the + * supplied compression buffer. It maintains an internal iterator to + * keep track of pages in the input buffer that are yet to be compressed. + * + * returns -1 if the compression buffer has run out of space. + * returns 1 on success. + * returns 0 if no more pages are left to be compressed. + * When the return value is non-zero, compbuf_len indicates the actual + * amount of data present in compbuf (<=compbuf_size). + */ +int xc_compression_compress_pages(xc_interface *xch, comp_ctx *ctx, + char *compbuf, unsigned long compbuf_size, + unsigned long *compbuf_len); + +/** + * Resets the internal page buffer that holds dirty pages before compression. + * Also resets the iterators. + */ +void xc_compression_reset_pagebuf(xc_interface *xch, comp_ctx *ctx); + +/** + * Caller must supply the compression buffer (compbuf), + * its size (compbuf_size) and a reference to index variable (compbuf_pos) + * that is used internally. Each call pulls out one page from the compressed + * chunk and copies it to dest. + */ +int xc_compression_uncompress_page(xc_interface *xch, char *compbuf, + unsigned long compbuf_size, + unsigned long *compbuf_pos, char *dest); + #endif /* XENCTRL_H */ diff -r 8cc7470020f7 -r aa273f3a2082 tools/libxc/xenguest.h --- a/tools/libxc/xenguest.h Thu Dec 01 15:35:02 2011 +0000 +++ b/tools/libxc/xenguest.h Thu Dec 01 15:36:15 2011 +0000 @@ -27,6 +27,7 @@ #define XCFLAGS_DEBUG 2 #define XCFLAGS_HVM 4 #define XCFLAGS_STDVGA 8 +#define XCFLAGS_CHECKPOINT_COMPRESS 16 #define X86_64_B_SIZE 64 #define X86_32_B_SIZE 32 diff -r 8cc7470020f7 -r aa273f3a2082 tools/libxc/xg_save_restore.h --- a/tools/libxc/xg_save_restore.h Thu Dec 01 15:35:02 2011 +0000 +++ b/tools/libxc/xg_save_restore.h Thu Dec 01 15:36:15 2011 +0000 @@ -67,7 +67,7 @@ * * consists of p2m_size bytes comprising an array of xen_pfn_t sized entries. * - * BODY PHASE + * BODY PHASE - Format A (for live migration or Remus without compression) * ---------- * * A series of chunks with a common header: @@ -87,6 +87,122 @@ * * If chunk type is 0 then body phase is complete. * + * + * BODY PHASE - Format B (for Remus with compression) + * ---------- + * + * A series of chunks with a common header: + * int : chunk type + * + * If the chunk type is +ve then chunk contains array of PFNs corresponding + * to guest memory and type contains the number of PFNs in the batch: + * + * unsigned long[] : PFN array, length == number of pages in batch + * Each entry consists of XEN_DOMCTL_PFINFO_* + * in bits 31-28 and the PFN number in bits 27-0. + * + * If the chunk type is -ve then chunk consists of one of a number of + * metadata types. See definitions of XC_SAVE_ID_* below. + * + * If the chunk type is -ve and equals XC_SAVE_ID_COMPRESSED_DATA, then the + * chunk consists of compressed page data, in the following format: + * + * unsigned long : Size of the compressed chunk to follow + * compressed data : variable length data of size indicated above. + * This chunk consists of compressed page data. + * The number of pages in one chunk depends on + * the amount of space available in the sender's + * output buffer. + * + * Format of compressed data: + * compressed_data = <deltas>* + * delta = <marker, run*> + * marker = (RUNFLAG|SKIPFLAG) bitwise-or RUNLEN [1 byte marker] + * RUNFLAG = 0 + * SKIPFLAG = 1 << 7 + * RUNLEN = 7-bit unsigned value indicating number of WORDS in the run + * run = string of bytes of length sizeof(WORD) * RUNLEN + * + * If marker contains RUNFLAG, then RUNLEN * sizeof(WORD) bytes of data following + * the marker is copied into the target page at the appropriate offset indicated by + * the offset_ptr + * If marker contains SKIPFLAG, then the offset_ptr is advanced + * by RUNLEN * sizeof(WORD). + * + * If chunk type is 0 then body phase is complete. + * + * There can be one or more chunks with type XC_SAVE_ID_COMPRESSED_DATA, + * containing compressed pages. The compressed chunks are collated to form + * one single compressed chunk for the entire iteration. The number of pages + * present in this final compressed chunk will be equal to the total number + * of valid PFNs specified by the +ve chunks. + * + * At the sender side, compressed pages are inserted into the output stream + * in the same order as they would have been if compression logic was absent. + * + * Until last iteration, the BODY is sent in Format A, to maintain live + * migration compatibility with receivers of older Xen versions. + * At the last iteration, if Remus compression was enabled, the sender sends + * a trigger, XC_SAVE_ID_ENABLE_COMPRESSION to tell the receiver to parse the + * BODY in Format B from the next iteration onwards. + * + * An example sequence of chunks received in Format B: + * +16 +ve chunk + * unsigned long[16] PFN array + * +100 +ve chunk + * unsigned long[100] PFN array + * +50 +ve chunk + * unsigned long[50] PFN array + * + * XC_SAVE_ID_COMPRESSED_DATA TAG + * N Length of compressed data + * N bytes of DATA Decompresses to 166 pages + * + * XC_SAVE_ID_* other xc save chunks + * 0 END BODY TAG + * + * Corner case with checkpoint compression: + * At sender side, after pausing the domain, dirty pages are usually + * copied out to a temporary buffer. After the domain is resumed, + * compression is done and the compressed chunk(s) are sent, followed by + * other XC_SAVE_ID_* chunks. + * If the temporary buffer gets full while scanning for dirty pages, + * the sender stops buffering of dirty pages, compresses the temporary + * buffer and sends the compressed data with XC_SAVE_ID_COMPRESSED_DATA. + * The sender then resumes the buffering of dirty pages and continues + * scanning for the dirty pages. + * For e.g., assume that the temporary buffer can hold 4096 pages and + * there are 5000 dirty pages. The following is the sequence of chunks + * that the receiver will see: + * + * +1024 +ve chunk + * unsigned long[1024] PFN array + * +1024 +ve chunk + * unsigned long[1024] PFN array + * +1024 +ve chunk + * unsigned long[1024] PFN array + * +1024 +ve chunk + * unsigned long[1024] PFN array + * + * XC_SAVE_ID_COMPRESSED_DATA TAG + * N Length of compressed data + * N bytes of DATA Decompresses to 4096 pages + * + * +4 +ve chunk + * unsigned long[4] PFN array + * + * XC_SAVE_ID_COMPRESSED_DATA TAG + * M Length of compressed data + * M bytes of DATA Decompresses to 4 pages + * + * XC_SAVE_ID_* other xc save chunks + * 0 END BODY TAG + * + * In other words, XC_SAVE_ID_COMPRESSED_DATA can be interleaved with + * +ve chunks arbitrarily. But at the receiver end, the following condition + * always holds true until the end of BODY PHASE: + * num(PFN entries +ve chunks) >= num(pages received in compressed form) + * * TAIL PHASE * ---------- * @@ -135,6 +251,8 @@ #define XC_SAVE_ID_LAST_CHECKPOINT -9 /* Commit to restoring after completion of current iteration. */ #define XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION -10 #define XC_SAVE_ID_HVM_VIRIDIAN -11 +#define XC_SAVE_ID_COMPRESSED_DATA -12 /* Marker to indicate arrival of compressed data */ +#define XC_SAVE_ID_ENABLE_COMPRESSION -13 /* Marker to enable compression logic at receiver side */ /* ** We process save/restore/migrate in batches of pages; the below _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.