[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] [xen-unstable] Implement clean return from save/restore failure (so that original domain can continue execution).
# HG changeset patch # User kfraser@xxxxxxxxxxxxxxxxxxxxx # Date 1169478932 0 # Node ID 207523704fb15ae92b1852bb7e1f0e739ed01fb3 # Parent baa9b76ea3e1de27dbe46ba9b3fb117e09637518 Implement clean return from save/restore failure (so that original domain can continue execution). Signed-off-by: Andrei Petrov <andrei.petrov@xxxxxxxxxxxxx> --- tools/libxc/xc_resume.c | 156 +++++++++++++++++++++++++++++--- tools/libxc/xg_save_restore.h | 9 - tools/python/xen/lowlevel/xc/xc.c | 4 tools/python/xen/xend/XendCheckpoint.py | 24 ++++ tools/python/xen/xend/XendDomain.py | 1 tools/python/xen/xend/XendDomainInfo.py | 29 +++++ 6 files changed, 200 insertions(+), 23 deletions(-) diff -r baa9b76ea3e1 -r 207523704fb1 tools/libxc/xc_resume.c --- a/tools/libxc/xc_resume.c Mon Jan 22 14:13:26 2007 +0000 +++ b/tools/libxc/xc_resume.c Mon Jan 22 15:15:32 2007 +0000 @@ -1,5 +1,6 @@ #include "xc_private.h" - +#include "xg_private.h" +#include "xg_save_restore.h" #if defined(__i386__) || defined(__x86_64__) static int modify_returncode(int xc_handle, uint32_t domid) @@ -22,19 +23,7 @@ static int modify_returncode(int xc_hand } #endif - -/* - * Resume execution of a domain after suspend shutdown. - * This can happen in one of two ways: - * 1. Resume with special return code. - * 2. Reset guest environment so it believes it is resumed in a new - * domain context. - * (2) should be used only for guests which cannot handle the special - * new return code. (1) is always safe (but slower). - * - * XXX Only (2) is implemented below. We need to use (1) by default! 
- */ -int xc_domain_resume(int xc_handle, uint32_t domid) +static int xc_domain_resume_cooperative(int xc_handle, uint32_t domid) { DECLARE_DOMCTL; int rc; @@ -50,3 +39,142 @@ int xc_domain_resume(int xc_handle, uint domctl.domain = domid; return do_domctl(xc_handle, &domctl); } + +static int xc_domain_resume_any(int xc_handle, uint32_t domid) +{ + DECLARE_DOMCTL; + int i, rc = -1; + + /* + * (x86 only) Rewrite store_mfn and console_mfn back to MFN (from PFN). + */ +#if defined(__i386__) || defined(__x86_64__) + xc_dominfo_t info; + unsigned long mfn, max_pfn = 0; + vcpu_guest_context_t ctxt; + start_info_t *start_info; + shared_info_t *shinfo = NULL; + xen_pfn_t *p2m_frame_list_list = NULL; + xen_pfn_t *p2m_frame_list = NULL; + xen_pfn_t *p2m = NULL; + + if ( xc_domain_getinfo(xc_handle, domid, 1, &info) != 1 ) + { + PERROR("Could not get domain info"); + goto out; + } + + /* Map the shared info frame */ + shinfo = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, + PROT_READ, info.shared_info_frame); + if ( shinfo == NULL ) + { + ERROR("Couldn't map shared info"); + goto out; + } + + max_pfn = shinfo->arch.max_pfn; + + p2m_frame_list_list = + xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, PROT_READ, + shinfo->arch.pfn_to_mfn_frame_list_list); + if ( p2m_frame_list_list == NULL ) + { + ERROR("Couldn't map p2m_frame_list_list"); + goto out; + } + + p2m_frame_list = xc_map_foreign_batch(xc_handle, domid, PROT_READ, + p2m_frame_list_list, + P2M_FLL_ENTRIES); + if ( p2m_frame_list == NULL ) + { + ERROR("Couldn't map p2m_frame_list"); + goto out; + } + + /* Map all the frames of the pfn->mfn table. For migrate to succeed, + the guest must not change which frames are used for this purpose. + (its not clear why it would want to change them, and we'll be OK + from a safety POV anyhow. 
*/ + p2m = xc_map_foreign_batch(xc_handle, domid, PROT_READ, + p2m_frame_list, + P2M_FL_ENTRIES); + if ( p2m == NULL ) + { + ERROR("Couldn't map p2m table"); + goto out; + } + + if ( lock_pages(&ctxt, sizeof(ctxt)) ) + { + ERROR("Unable to lock ctxt"); + goto out; + } + + if ( xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt) ) + { + ERROR("Could not get vcpu context"); + goto out; + } + + mfn = ctxt.user_regs.edx; + + start_info = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, + PROT_READ | PROT_WRITE, mfn); + if ( start_info == NULL ) + { + ERROR("Couldn't map start_info"); + goto out; + } + + start_info->store_mfn = p2m[start_info->store_mfn]; + start_info->console.domU.mfn = p2m[start_info->console.domU.mfn]; + + munmap(start_info, PAGE_SIZE); +#endif /* defined(__i386__) || defined(__x86_64__) */ + + /* Reset all secondary CPU states. */ + for ( i = 1; i <= info.max_vcpu_id; i++ ) + xc_vcpu_setcontext(xc_handle, domid, i, NULL); + + /* Ready to resume domain execution now. */ + domctl.cmd = XEN_DOMCTL_resumedomain; + domctl.domain = domid; + rc = do_domctl(xc_handle, &domctl); + +#if defined(__i386__) || defined(__x86_64__) + out: + unlock_pages((void *)&ctxt, sizeof ctxt); + if (p2m) + munmap(p2m, P2M_FL_ENTRIES*PAGE_SIZE); + if (p2m_frame_list) + munmap(p2m_frame_list, P2M_FLL_ENTRIES*PAGE_SIZE); + if (p2m_frame_list_list) + munmap(p2m_frame_list_list, PAGE_SIZE); + if (shinfo) + munmap(shinfo, PAGE_SIZE); +#endif + + return rc; +} + +/* + * Resume execution of a domain after suspend shutdown. + * This can happen in one of two ways: + * 1. Resume with special return code. + * 2. Reset guest environment so it believes it is resumed in a new + * domain context. + * (2) should be used only for guests which cannot handle the special + * new return code. (1) is always safe (but slower). + */ +int xc_domain_resume(int xc_handle, uint32_t domid) +{ + /* + * XXX: Implement a way to select between options (1) and (2). 
+ * Or expose the options as two different methods to Python. + */ + return (0 + ? xc_domain_resume_cooperative(xc_handle, domid) + : xc_domain_resume_any(xc_handle, domid)); +} diff -r baa9b76ea3e1 -r 207523704fb1 tools/libxc/xg_save_restore.h --- a/tools/libxc/xg_save_restore.h Mon Jan 22 14:13:26 2007 +0000 +++ b/tools/libxc/xg_save_restore.h Mon Jan 22 15:15:32 2007 +0000 @@ -34,11 +34,10 @@ ** ** Returns 1 on success, 0 on failure. */ -static int get_platform_info(int xc_handle, uint32_t dom, - /* OUT */ unsigned long *max_mfn, - /* OUT */ unsigned long *hvirt_start, - /* OUT */ unsigned int *pt_levels) - +static inline int get_platform_info(int xc_handle, uint32_t dom, + /* OUT */ unsigned long *max_mfn, + /* OUT */ unsigned long *hvirt_start, + /* OUT */ unsigned int *pt_levels) { xen_capabilities_info_t xen_caps = ""; xen_platform_parameters_t xen_params; diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/lowlevel/xc/xc.c --- a/tools/python/xen/lowlevel/xc/xc.c Mon Jan 22 14:13:26 2007 +0000 +++ b/tools/python/xen/lowlevel/xc/xc.c Mon Jan 22 15:15:32 2007 +0000 @@ -1064,9 +1064,9 @@ static PyMethodDef pyxc_methods[] = { "Destroy a domain.\n" " dom [int]: Identifier of domain to be destroyed.\n\n" "Returns: [int] 0 on success; -1 on error.\n" }, - + { "domain_resume", - (PyCFunction)pyxc_domain_resume, + (PyCFunction)pyxc_domain_resume, METH_VARARGS, "\n" "Resume execution of a suspended domain.\n" " dom [int]: Identifier of domain to be resumed.\n\n" diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/xend/XendCheckpoint.py --- a/tools/python/xen/xend/XendCheckpoint.py Mon Jan 22 14:13:26 2007 +0000 +++ b/tools/python/xen/xend/XendCheckpoint.py Mon Jan 22 15:15:32 2007 +0000 @@ -122,6 +122,8 @@ def save(fd, dominfo, network, live, dst os.remove("/tmp/xen.qemu-dm.%d" % dominfo.getDomid()) dominfo.destroyDomain() + dominfo.testDeviceComplete() + try: dominfo.setName(domain_name) except VmError: @@ -134,11 +136,31 @@ def save(fd, dominfo, network, live, 
dst except Exception, exn: log.exception("Save failed on domain %s (%s).", domain_name, dominfo.getDomid()) + + dominfo._releaseDevices() + dominfo.testDeviceComplete() + dominfo.testvifsComplete() + log.debug("XendCheckpoint.save: devices released") + + dominfo._resetChannels() + + dominfo._removeDom('control/shutdown') + dominfo._removeDom('device-misc/vif/nextDeviceID') + + dominfo._createChannels() + dominfo._introduceDomain() + dominfo._storeDomDetails() + + dominfo._createDevices() + log.debug("XendCheckpoint.save: devices created") + + dominfo.resumeDomain() + log.debug("XendCheckpoint.save: resumeDomain") + try: dominfo.setName(domain_name) except: log.exception("Failed to reset the migrating domain's name") - raise Exception, exn def restore(xd, fd, dominfo = None, paused = False): diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/xend/XendDomain.py --- a/tools/python/xen/xend/XendDomain.py Mon Jan 22 14:13:26 2007 +0000 +++ b/tools/python/xen/xend/XendDomain.py Mon Jan 22 15:15:32 2007 +0000 @@ -1166,7 +1166,6 @@ class XendDomain: sock.send("receive\n") sock.recv(80) XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst) - dominfo.testDeviceComplete() sock.close() def domain_save(self, domid, dst): diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Mon Jan 22 14:13:26 2007 +0000 +++ b/tools/python/xen/xend/XendDomainInfo.py Mon Jan 22 15:15:32 2007 +0000 @@ -1580,6 +1580,16 @@ class XendDomainInfo: log.exception("Exception in alloc_unbound(%d)", self.domid) raise + def _resetChannels(self): + """Reset all event channels in the domain. 
+ """ + try: + return xc.evtchn_reset(dom=self.domid) + except: + log.exception("Exception in evtcnh_reset(%d)", self.domid) + raise + + # # Bootloader configuration # @@ -1727,6 +1737,25 @@ class XendDomainInfo: test = 0 diff = time.time() - start for i in self.getDeviceController('vbd').deviceIDs(): + test = 1 + log.info("Dev %s still active, looping...", i) + time.sleep(0.1) + + if test == 0: + break + if diff >= MIGRATE_TIMEOUT: + log.info("Dev still active but hit max loop timeout") + break + + def testvifsComplete(self): + """ In case vifs are released and then created for the same + domain, we need to wait the device shut down. + """ + start = time.time() + while True: + test = 0 + diff = time.time() - start + for i in self.getDeviceController('vif').deviceIDs(): test = 1 log.info("Dev %s still active, looping...", i) time.sleep(0.1) _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |