[Xen-changelog] [xen-unstable] Remus: Make checkpoint buffering HVM-aware
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1257794367 0
# Node ID f6091947ffed5707b60120fe44d3d038a3307591
# Parent  01c9cb566f61f143870600b30857562963aaee27
Remus: Make checkpoint buffering HVM-aware

Signed-off-by: Brendan Cully <brendan@xxxxxxxxx>
---
 tools/libxc/xc_domain_restore.c         |  387 +++++++++++++++++++++++---------
 tools/python/xen/xend/XendCheckpoint.py |   20 -
 2 files changed, 289 insertions(+), 118 deletions(-)

diff -r 01c9cb566f61 -r f6091947ffed tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c	Mon Nov 09 19:17:22 2009 +0000
+++ b/tools/libxc/xc_domain_restore.c	Mon Nov 09 19:19:27 2009 +0000
@@ -670,15 +670,204 @@ static xen_pfn_t *load_p2m_frame_list(
 }
 
 typedef struct {
-    unsigned int pfncount;
-    unsigned long* pfntab;
-    unsigned int vcpucount;
-    unsigned char* vcpubuf;
-    unsigned char shared_info_page[PAGE_SIZE];
+    int ishvm;
+    union {
+        struct tailbuf_pv {
+            unsigned int pfncount;
+            unsigned long* pfntab;
+            unsigned int vcpucount;
+            unsigned char* vcpubuf;
+            unsigned char shared_info_page[PAGE_SIZE];
+        } pv;
+        struct tailbuf_hvm {
+            uint64_t magicpfns[3];
+            uint32_t hvmbufsize, reclen;
+            uint8_t* hvmbuf;
+            struct {
+                uint32_t magic;
+                uint32_t version;
+                uint64_t len;
+            } qemuhdr;
+            uint32_t qemubufsize;
+            uint8_t* qemubuf;
+        } hvm;
+    } u;
 } tailbuf_t;
 
-static int buffer_tail(tailbuf_t* buf, int fd, unsigned int max_vcpu_id,
-                       uint64_t vcpumap, int ext_vcpucontext)
+/* read stream until EOF, growing buffer as necssary */
+static int compat_buffer_qemu(int fd, struct tailbuf_hvm *buf)
+{
+    uint8_t *qbuf, *tmp;
+    int blen = 0, dlen = 0;
+    int rc;
+
+    /* currently save records tend to be about 7K */
+    blen = 8192;
+    if ( !(qbuf = malloc(blen)) ) {
+        ERROR("Error allocating QEMU buffer");
+        return -1;
+    }
+
+    while( (rc = read(fd, qbuf+dlen, blen-dlen)) > 0 ) {
+        DPRINTF("Read %d bytes of QEMU data\n", rc);
+        dlen += rc;
+
+        if (dlen == blen) {
+            DPRINTF("%d-byte QEMU buffer full, reallocating...\n", dlen);
+            blen += 4096;
+            tmp = realloc(qbuf, blen);
+            if ( !tmp ) {
+                ERROR("Error growing QEMU buffer to %d bytes", blen);
+                free(qbuf);
+                return -1;
+            }
+            qbuf = tmp;
+        }
+    }
+
+    if ( rc < 0 ) {
+        ERROR("Error reading QEMU data");
+        free(qbuf);
+        return -1;
+    }
+
+    if ( memcmp(qbuf, "QEVM", 4) ) {
+        ERROR("Invalid QEMU magic: 0x%08x", *(unsigned long*)qbuf);
+        free(qbuf);
+        return -1;
+    }
+
+    buf->qemubuf = qbuf;
+    buf->qemubufsize = dlen;
+
+    return 0;
+}
+
+static int buffer_qemu(int fd, struct tailbuf_hvm *buf)
+{
+    uint32_t qlen;
+    uint8_t *tmp;
+
+    if ( read_exact(fd, &qlen, sizeof(qlen)) ) {
+        ERROR("Error reading QEMU header length");
+        return -1;
+    }
+
+    if ( qlen > buf->qemubufsize ) {
+        if ( buf->qemubuf) {
+            tmp = realloc(buf->qemubuf, qlen);
+            if ( tmp )
+                buf->qemubuf = tmp;
+            else {
+                ERROR("Error reallocating QEMU state buffer");
+                return -1;
+            }
+        } else {
+            buf->qemubuf = malloc(qlen);
+            if ( !buf->qemubuf ) {
+                ERROR("Error allocating QEMU state buffer");
+                return -1;
+            }
+        }
+    }
+    buf->qemubufsize = qlen;
+
+    if ( read_exact(fd, buf->qemubuf, buf->qemubufsize) ) {
+        ERROR("Error reading QEMU state");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int dump_qemu(uint32_t dom, struct tailbuf_hvm *buf)
+{
+    int saved_errno;
+    char path[256];
+    FILE *fp;
+
+    sprintf(path, "/var/lib/xen/qemu-save.%u", dom);
+    fp = fopen(path, "wb");
+    if ( !fp )
+        return -1;
+
+    DPRINTF("Writing %d bytes of QEMU data\n", buf->qemubufsize);
+    if ( fwrite(buf->qemubuf, 1, buf->qemubufsize, fp) != buf->qemubufsize) {
+        saved_errno = errno;
+        fclose(fp);
+        errno = saved_errno;
+        return -1;
+    }
+
+    fclose(fp);
+
+    return 0;
+}
+
+static int buffer_tail_hvm(struct tailbuf_hvm *buf, int fd,
+                           unsigned int max_vcpu_id, uint64_t vcpumap,
+                           int ext_vcpucontext)
+{
+    uint8_t *tmp;
+    unsigned char qemusig[21];
+
+    if ( read_exact(fd, buf->magicpfns, sizeof(buf->magicpfns)) ) {
+        ERROR("Error reading magic PFNs");
+        return -1;
+    }
+
+    if ( read_exact(fd, &buf->reclen, sizeof(buf->reclen)) ) {
+        ERROR("Error reading HVM params size");
+        return -1;
+    }
+
+    if ( buf->reclen > buf->hvmbufsize ) {
+        if ( buf->hvmbuf) {
+            tmp = realloc(buf->hvmbuf, buf->reclen);
+            if ( tmp ) {
+                buf->hvmbuf = tmp;
+                buf->hvmbufsize = buf->reclen;
+            } else {
+                ERROR("Error reallocating HVM param buffer");
+                return -1;
+            }
+        } else {
+            buf->hvmbuf = malloc(buf->reclen);
+            if ( !buf->hvmbuf ) {
+                ERROR("Error allocating HVM param buffer");
+                return -1;
+            }
+            buf->hvmbufsize = buf->reclen;
+        }
+    }
+
+    if ( read_exact(fd, buf->hvmbuf, buf->reclen) ) {
+        ERROR("Error reading HVM params");
+        return -1;
+    }
+
+    if ( read_exact(fd, qemusig, sizeof(qemusig)) ) {
+        ERROR("Error reading QEMU signature");
+        return -1;
+    }
+
+    /* The normal live-migration QEMU record has no length information.
+     * Short of reimplementing the QEMU parser, we're forced to just read
+     * until EOF. Remus gets around this by sending a different signature
+     * which includes a length prefix */
+    if ( !memcmp(qemusig, "QemuDeviceModelRecord", sizeof(qemusig)) )
+        return compat_buffer_qemu(fd, buf);
+    else if ( !memcmp(qemusig, "RemusDeviceModelState", sizeof(qemusig)) )
+        return buffer_qemu(fd, buf);
+
+    qemusig[20] = '\0';
+    ERROR("Invalid QEMU signature: %s", qemusig);
+    return -1;
+}
+
+static int buffer_tail_pv(struct tailbuf_pv *buf, int fd,
+                          unsigned int max_vcpu_id, uint64_t vcpumap,
+                          int ext_vcpucontext)
 {
     unsigned int i;
     size_t pfnlen, vcpulen;
@@ -753,16 +942,47 @@ static int buffer_tail(tailbuf_t* buf, i
     return -1;
 }
 
-static void tailbuf_free(tailbuf_t* buf)
-{
-    if (buf->vcpubuf) {
+static int buffer_tail(tailbuf_t *buf, int fd, unsigned int max_vcpu_id,
+                       uint64_t vcpumap, int ext_vcpucontext)
+{
+    if ( buf->ishvm )
+        return buffer_tail_hvm(&buf->u.hvm, fd, max_vcpu_id, vcpumap,
+                               ext_vcpucontext);
+    else
+        return buffer_tail_pv(&buf->u.pv, fd, max_vcpu_id, vcpumap,
+                              ext_vcpucontext);
+}
+
+static void tailbuf_free_hvm(struct tailbuf_hvm *buf)
+{
+    if ( buf->hvmbuf ) {
+        free(buf->hvmbuf);
+        buf->hvmbuf = NULL;
+    }
+    if ( buf->qemubuf ) {
+        free(buf->qemubuf);
+        buf->qemubuf = NULL;
+    }
+}
+
+static void tailbuf_free_pv(struct tailbuf_pv *buf)
+{
+    if ( buf->vcpubuf ) {
         free(buf->vcpubuf);
         buf->vcpubuf = NULL;
     }
-    if (buf->pfntab) {
+    if ( buf->pfntab ) {
         free(buf->pfntab);
         buf->pfntab = NULL;
     }
+}
+
+static void tailbuf_free(tailbuf_t *buf)
+{
+    if ( buf->ishvm )
+        tailbuf_free_hvm(&buf->u.hvm);
+    else
+        tailbuf_free_pv(&buf->u.pv);
 }
 
 typedef struct {
@@ -1118,18 +1338,13 @@ int xc_domain_restore(int xc_handle, int
     unsigned int max_vcpu_id = 0;
     int new_ctxt_format = 0;
 
-    /* Magic frames in HVM guests: ioreqs and xenstore comms. */
-    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
-
-    /* Buffer for holding HVM context */
-    uint8_t *hvm_buf = NULL;
-
     pagebuf_t pagebuf;
     tailbuf_t tailbuf, tmptail;
     void* vcpup;
 
     pagebuf_init(&pagebuf);
     memset(&tailbuf, 0, sizeof(tailbuf));
+    tailbuf.ishvm = hvm;
 
     /* For info only */
     nr_pfns = 0;
@@ -1313,78 +1528,6 @@ int xc_domain_restore(int xc_handle, int
 
     //    DPRINTF("Received all pages (%d races)\n", nraces);
 
-    if ( hvm )
-    {
-        uint32_t rec_len;
-
-        /* Set HVM-specific parameters */
-        if ( read_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
-        {
-            ERROR("error reading magic page addresses");
-            goto out;
-        }
-
-        /* These comms pages need to be zeroed at the start of day */
-        if ( xc_clear_domain_page(xc_handle, dom, magic_pfns[0]) ||
-             xc_clear_domain_page(xc_handle, dom, magic_pfns[1]) ||
-             xc_clear_domain_page(xc_handle, dom, magic_pfns[2]) )
-        {
-            ERROR("error zeroing magic pages");
-            goto out;
-        }
-
-        if ( (frc = xc_set_hvm_param(xc_handle, dom,
-                                     HVM_PARAM_IOREQ_PFN, magic_pfns[0]))
-             || (frc = xc_set_hvm_param(xc_handle, dom,
-                                        HVM_PARAM_BUFIOREQ_PFN, magic_pfns[1]))
-             || (frc = xc_set_hvm_param(xc_handle, dom,
-                                        HVM_PARAM_STORE_PFN, magic_pfns[2]))
-             || (frc = xc_set_hvm_param(xc_handle, dom,
-                                        HVM_PARAM_PAE_ENABLED, pae))
-             || (frc = xc_set_hvm_param(xc_handle, dom,
-                                        HVM_PARAM_STORE_EVTCHN,
-                                        store_evtchn)) )
-        {
-            ERROR("error setting HVM params: %i", frc);
-            goto out;
-        }
-        *store_mfn = magic_pfns[2];
-
-        /* Read HVM context */
-        if ( read_exact(io_fd, &rec_len, sizeof(uint32_t)) )
-        {
-            ERROR("error read hvm context size!\n");
-            goto out;
-        }
-
-        hvm_buf = malloc(rec_len);
-        if ( hvm_buf == NULL )
-        {
-            ERROR("memory alloc for hvm context buffer failed");
-            errno = ENOMEM;
-            goto out;
-        }
-
-        if ( read_exact(io_fd, hvm_buf, rec_len) )
-        {
-            ERROR("error loading the HVM context");
-            goto out;
-        }
-
-        frc = xc_domain_hvm_setcontext(xc_handle, dom, hvm_buf, rec_len);
-        if ( frc )
-        {
-            ERROR("error setting the HVM context");
-            goto out;
-        }
-
-        /* HVM success! */
-        rc = 0;
-        goto out;
-    }
-
-    /* Non-HVM guests only from here on */
-
     if ( !completed ) {
         int flags = 0;
 
@@ -1407,6 +1550,7 @@ int xc_domain_restore(int xc_handle, int
             goto finish;
         }
         memset(&tmptail, 0, sizeof(tmptail));
+        tmptail.ishvm = hvm;
         if ( buffer_tail(&tmptail, io_fd, max_vcpu_id, vcpumap,
                          ext_vcpucontext) < 0 ) {
             ERROR ("error buffering image tail, finishing");
@@ -1418,6 +1562,8 @@ int xc_domain_restore(int xc_handle, int
     goto loadpages;
 
   finish:
+    if ( hvm )
+        goto finish_hvm;
+
     if ( (pt_levels == 3) && !pae_extended_cr3 )
     {
@@ -1589,15 +1735,15 @@ int xc_domain_restore(int xc_handle, int
     {
         int nr_frees = 0;
 
-        for ( i = 0; i < tailbuf.pfncount; i++ )
-        {
-            unsigned long pfn = tailbuf.pfntab[i];
+        for ( i = 0; i < tailbuf.u.pv.pfncount; i++ )
+        {
+            unsigned long pfn = tailbuf.u.pv.pfntab[i];
 
             if ( p2m[pfn] != INVALID_P2M_ENTRY )
             {
                 /* pfn is not in physmap now, but was at some point during
                    the save/migration process - need to free it */
-                tailbuf.pfntab[nr_frees++] = p2m[pfn];
+                tailbuf.u.pv.pfntab[nr_frees++] = p2m[pfn];
                 p2m[pfn]  = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
             }
         }
@@ -1609,7 +1755,7 @@ int xc_domain_restore(int xc_handle, int
             .extent_order = 0,
             .domid        = dom
         };
-        set_xen_guest_handle(reservation.extent_start, tailbuf.pfntab);
+        set_xen_guest_handle(reservation.extent_start, tailbuf.u.pv.pfntab);
 
         if ( (frc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
                                  &reservation)) != nr_frees )
@@ -1618,7 +1764,7 @@ int xc_domain_restore(int xc_handle, int
             goto out;
         }
         else
-            DPRINTF("Decreased reservation by %d pages\n", tailbuf.pfncount);
+            DPRINTF("Decreased reservation by %d pages\n", tailbuf.u.pv.pfncount);
     }
 }
 
@@ -1628,7 +1774,7 @@ int xc_domain_restore(int xc_handle, int
         return 1;
     }
 
-    vcpup = tailbuf.vcpubuf;
+    vcpup = tailbuf.u.pv.vcpubuf;
     for ( i = 0; i <= max_vcpu_id; i++ )
     {
         if ( !(vcpumap & (1ULL << i)) )
@@ -1755,7 +1901,7 @@ int xc_domain_restore(int xc_handle, int
         }
     }
 
-    memcpy(shared_info_page, tailbuf.shared_info_page, PAGE_SIZE);
+    memcpy(shared_info_page, tailbuf.u.pv.shared_info_page, PAGE_SIZE);
 
     DPRINTF("Completed checkpoint load\n");
 
@@ -1812,6 +1958,51 @@ int xc_domain_restore(int xc_handle, int
     DPRINTF("Domain ready to be built.\n");
 
     rc = 0;
+    goto out;
+
+ finish_hvm:
+    /* Dump the QEMU state to a state file for QEMU to load */
+    if ( dump_qemu(dom, &tailbuf.u.hvm) ) {
+        ERROR("Error dumping QEMU state to file");
+        goto out;
+    }
+
+    /* These comms pages need to be zeroed at the start of day */
+    if ( xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[0]) ||
+         xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[1]) ||
+         xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[2]) )
+    {
+        ERROR("error zeroing magic pages");
+        goto out;
+    }
+
+    if ( (frc = xc_set_hvm_param(xc_handle, dom,
+                                 HVM_PARAM_IOREQ_PFN, tailbuf.u.hvm.magicpfns[0]))
+         || (frc = xc_set_hvm_param(xc_handle, dom,
+                                    HVM_PARAM_BUFIOREQ_PFN, tailbuf.u.hvm.magicpfns[1]))
+         || (frc = xc_set_hvm_param(xc_handle, dom,
+                                    HVM_PARAM_STORE_PFN, tailbuf.u.hvm.magicpfns[2]))
+         || (frc = xc_set_hvm_param(xc_handle, dom,
+                                    HVM_PARAM_PAE_ENABLED, pae))
+         || (frc = xc_set_hvm_param(xc_handle, dom,
+                                    HVM_PARAM_STORE_EVTCHN,
+                                    store_evtchn)) )
+    {
+        ERROR("error setting HVM params: %i", frc);
+        goto out;
+    }
+    *store_mfn = tailbuf.u.hvm.magicpfns[2];
+
+    frc = xc_domain_hvm_setcontext(xc_handle, dom, tailbuf.u.hvm.hvmbuf,
+                                   tailbuf.u.hvm.reclen);
+    if ( frc )
+    {
+        ERROR("error setting the HVM context");
+        goto out;
+    }
+
+    /* HVM success! */
+    rc = 0;
 
  out:
     if ( (rc != 0) && (dom != 0) )
@@ -1819,7 +2010,7 @@ int xc_domain_restore(int xc_handle, int
     free(mmu);
     free(p2m);
     free(pfn_type);
-    free(hvm_buf);
+    tailbuf_free(&tailbuf);
 
     /* discard cache for save file */
     discard_file_cache(io_fd, 1 /*flush*/);

diff -r 01c9cb566f61 -r f6091947ffed tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py	Mon Nov 09 19:17:22 2009 +0000
+++ b/tools/python/xen/xend/XendCheckpoint.py	Mon Nov 09 19:19:27 2009 +0000
@@ -323,26 +323,6 @@ def restore(xd, fd, dominfo = None, paus
     if not is_hvm and handler.console_mfn is None:
         raise XendError('Could not read console MFN')
 
-    # get qemu state and create a tmp file for dm restore
-    # Even PV guests may have QEMU stat, but its not currently
-    # used so only bother with HVM currently.
-    if is_hvm:
-        qemu_signature = read_exact(fd, len(QEMU_SIGNATURE),
-                                    "invalid device model signature read")
-        if qemu_signature != QEMU_SIGNATURE:
-            raise XendError("not a valid device model state: found '%s'" %
-                            qemu_signature)
-        qemu_fd = os.open("/var/lib/xen/qemu-save.%d" % dominfo.getDomid(),
-                          os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
-        while True:
-            buf = os.read(fd, dm_batch)
-            if len(buf):
-                write_exact(qemu_fd, buf,
-                            "could not write dm state to tmp file")
-            else:
-                break
-        os.close(qemu_fd)
-
    restore_image.setCpuid()

    # xc_restore will wait for source to close connection
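
Editor's note on the control flow: with this change the restore loop buffers each checkpoint's tail (PV or HVM alike) into a temporary tailbuf_t before discarding the previous one, so a connection that dies mid-checkpoint still leaves a complete tail to apply. The following is a rough paraphrase of the loop in xc_domain_restore(), not a literal excerpt; the page-loading step is elided:

    /* Paraphrased restore loop: always keep the last complete tail. */
    memset(&tailbuf, 0, sizeof(tailbuf));
    tailbuf.ishvm = hvm;            /* selects the pv/hvm arm of the union */

    for ( ;; )
    {
        /* ... load page batches until the end-of-checkpoint marker ... */

        memset(&tmptail, 0, sizeof(tmptail));
        tmptail.ishvm = hvm;
        if ( buffer_tail(&tmptail, io_fd, max_vcpu_id, vcpumap,
                         ext_vcpucontext) < 0 )
            break;                  /* truncated tail: apply the last good one */

        tailbuf_free(&tailbuf);     /* new complete tail supersedes the old */
        memcpy(&tailbuf, &tmptail, sizeof(tailbuf));
        /* loop back to loadpages for the next checkpoint */
    }

    /* PV tails are applied at the `finish' label, HVM tails at `finish_hvm'
     * (dump_qemu, xc_set_hvm_param, xc_domain_hvm_setcontext). */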
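
For reference, the HVM image tail that buffer_tail_hvm() consumes has the following on-the-wire layout. This is reconstructed from the sequence of read_exact() calls in the hunk above; no such struct appears in the source:

    /* HVM tail layout, as read by buffer_tail_hvm():
     *
     *   uint64_t magicpfns[3];        -- ioreq, bufioreq and xenstore PFNs
     *   uint32_t reclen;              -- length of the HVM context record
     *   uint8_t  hvmctx[reclen];      -- fed to xc_domain_hvm_setcontext()
     *   char     sig[21];             -- "QemuDeviceModelRecord" or
     *                                    "RemusDeviceModelState"
     *   -- if sig is "RemusDeviceModelState":
     *   uint32_t qemulen;             -- device-model record length
     *   uint8_t  qemustate[qemulen];  -- QEMU save data, starts with "QEVM"
     *   -- if sig is "QemuDeviceModelRecord": QEMU save data runs to EOF
     */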
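
The two signatures exist because the legacy live-migration sender emits the device-model record unframed, so a receiver can only read it to EOF; Remus must delimit each record so that further checkpoints can follow on the same stream. A minimal sketch of the sender-side framing that buffer_qemu() expects follows. The helper name send_remus_qemu_record() and its error handling are hypothetical illustrations, not part of this patch:

    #include <stdint.h>
    #include <unistd.h>

    /* Hypothetical sender-side counterpart to buffer_qemu(): emit the
     * 21-byte Remus signature, a uint32_t length prefix, then the QEMU
     * save data ("QEVM..."). Assumes each write() completes in one call;
     * a real sender would loop on partial writes. */
    static int send_remus_qemu_record(int fd, const uint8_t *state,
                                      uint32_t len)
    {
        /* 21 chars, no NUL terminator -- matches the receiver's
         * `unsigned char qemusig[21]' */
        static const char sig[21] = "RemusDeviceModelState";

        if ( write(fd, sig, sizeof(sig)) != (ssize_t)sizeof(sig) )
            return -1;
        if ( write(fd, &len, sizeof(len)) != (ssize_t)sizeof(len) )
            return -1;
        if ( write(fd, state, len) != (ssize_t)len )
            return -1;
        return 0;
    }

(The raw length prefix assumes sender and receiver share byte order, as the rest of the save format already does.)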