[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] Implement clean return from save/restore failure (so that original domain can continue execution).



# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Date 1169478932 0
# Node ID 207523704fb15ae92b1852bb7e1f0e739ed01fb3
# Parent  baa9b76ea3e1de27dbe46ba9b3fb117e09637518
Implement clean return from save/restore failure (so that original
domain can continue execution).
Signed-off-by: Andrei Petrov <andrei.petrov@xxxxxxxxxxxxx>
---
 tools/libxc/xc_resume.c                 |  156 +++++++++++++++++++++++++++++---
 tools/libxc/xg_save_restore.h           |    9 -
 tools/python/xen/lowlevel/xc/xc.c       |    4 
 tools/python/xen/xend/XendCheckpoint.py |   24 ++++
 tools/python/xen/xend/XendDomain.py     |    1 
 tools/python/xen/xend/XendDomainInfo.py |   29 +++++
 6 files changed, 200 insertions(+), 23 deletions(-)

diff -r baa9b76ea3e1 -r 207523704fb1 tools/libxc/xc_resume.c
--- a/tools/libxc/xc_resume.c   Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/libxc/xc_resume.c   Mon Jan 22 15:15:32 2007 +0000
@@ -1,5 +1,6 @@
 #include "xc_private.h"
-
+#include "xg_private.h"
+#include "xg_save_restore.h"
 
 #if defined(__i386__) || defined(__x86_64__)
 static int modify_returncode(int xc_handle, uint32_t domid)
@@ -22,19 +23,7 @@ static int modify_returncode(int xc_hand
 }
 #endif
 
-
-/*
- * Resume execution of a domain after suspend shutdown.
- * This can happen in one of two ways:
- *  1. Resume with special return code.
- *  2. Reset guest environment so it believes it is resumed in a new
- *     domain context.
- * (2) should be used only for guests which cannot handle the special
- * new return code. (1) is always safe (but slower).
- * 
- * XXX Only (2) is implemented below. We need to use (1) by default!
- */
-int xc_domain_resume(int xc_handle, uint32_t domid)
+static int xc_domain_resume_cooperative(int xc_handle, uint32_t domid)
 {
     DECLARE_DOMCTL;
     int rc;
@@ -50,3 +39,142 @@ int xc_domain_resume(int xc_handle, uint
     domctl.domain = domid;
     return do_domctl(xc_handle, &domctl);
 }
+
+static int xc_domain_resume_any(int xc_handle, uint32_t domid)
+{
+    DECLARE_DOMCTL;
+    int i, rc = -1;
+
+    /*
+     * (x86 only) Rewrite store_mfn and console_mfn back to MFN (from PFN).
+     */
+#if defined(__i386__) || defined(__x86_64__)
+    xc_dominfo_t info;
+    unsigned long mfn, max_pfn = 0;
+    vcpu_guest_context_t ctxt;
+    start_info_t *start_info;
+    shared_info_t *shinfo = NULL;
+    xen_pfn_t *p2m_frame_list_list = NULL;
+    xen_pfn_t *p2m_frame_list = NULL;
+    xen_pfn_t *p2m = NULL;
+
+    if ( xc_domain_getinfo(xc_handle, domid, 1, &info) != 1 )
+    {
+        PERROR("Could not get domain info");
+        goto out;
+    }
+
+    /* Map the shared info frame */
+    shinfo = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
+                                  PROT_READ, info.shared_info_frame);
+    if ( shinfo == NULL )
+    {
+        ERROR("Couldn't map shared info");
+        goto out;
+    }
+
+    max_pfn = shinfo->arch.max_pfn;
+
+    p2m_frame_list_list =
+        xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, PROT_READ,
+                             shinfo->arch.pfn_to_mfn_frame_list_list);
+    if ( p2m_frame_list_list == NULL )
+    {
+        ERROR("Couldn't map p2m_frame_list_list");
+        goto out;
+    }
+
+    p2m_frame_list = xc_map_foreign_batch(xc_handle, domid, PROT_READ,
+                                          p2m_frame_list_list,
+                                          P2M_FLL_ENTRIES);
+    if ( p2m_frame_list == NULL )
+    {
+        ERROR("Couldn't map p2m_frame_list");
+        goto out;
+    }
+
+    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
+       the guest must not change which frames are used for this purpose.
+       (It's not clear why it would want to change them, and we'll be OK
+       from a safety POV anyhow.) */
+    p2m = xc_map_foreign_batch(xc_handle, domid, PROT_READ,
+                               p2m_frame_list,
+                               P2M_FL_ENTRIES);
+    if ( p2m == NULL )
+    {
+        ERROR("Couldn't map p2m table");
+        goto out;
+    }
+
+    if ( lock_pages(&ctxt, sizeof(ctxt)) )
+    {
+        ERROR("Unable to lock ctxt");
+        goto out;
+    }
+
+    if ( xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt) )
+    {
+        ERROR("Could not get vcpu context");
+        goto out;
+    }
+
+    mfn = ctxt.user_regs.edx;
+
+    start_info = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
+                                      PROT_READ | PROT_WRITE, mfn);
+    if ( start_info == NULL )
+    {
+        ERROR("Couldn't map start_info");
+        goto out;
+    }
+
+    start_info->store_mfn        = p2m[start_info->store_mfn];
+    start_info->console.domU.mfn = p2m[start_info->console.domU.mfn];
+
+    munmap(start_info, PAGE_SIZE);
+#endif /* defined(__i386__) || defined(__x86_64__) */
+
+    /* Reset all secondary CPU states. */
+    for ( i = 1; i <= info.max_vcpu_id; i++ )
+        xc_vcpu_setcontext(xc_handle, domid, i, NULL);
+
+    /* Ready to resume domain execution now. */
+    domctl.cmd = XEN_DOMCTL_resumedomain;
+    domctl.domain = domid;
+    rc = do_domctl(xc_handle, &domctl);
+
+#if defined(__i386__) || defined(__x86_64__)
+ out:
+    unlock_pages((void *)&ctxt, sizeof ctxt);
+    if (p2m)
+        munmap(p2m, P2M_FL_ENTRIES*PAGE_SIZE);
+    if (p2m_frame_list)
+        munmap(p2m_frame_list, P2M_FLL_ENTRIES*PAGE_SIZE);
+    if (p2m_frame_list_list)
+        munmap(p2m_frame_list_list, PAGE_SIZE);
+    if (shinfo)
+        munmap(shinfo, PAGE_SIZE);
+#endif
+
+    return rc;
+}
+
+/*
+ * Resume execution of a domain after suspend shutdown.
+ * This can happen in one of two ways:
+ *  1. Resume with special return code.
+ *  2. Reset guest environment so it believes it is resumed in a new
+ *     domain context.
+ * (2) should be used only for guests which cannot handle the special
+ * new return code. (1) is always safe (but slower).
+ */
+int xc_domain_resume(int xc_handle, uint32_t domid)
+{
+    /*
+     * XXX: Implement a way to select between options (1) and (2).
+     * Or expose the options as two different methods to Python.
+     */
+    return (0
+            ? xc_domain_resume_cooperative(xc_handle, domid)
+            : xc_domain_resume_any(xc_handle, domid));
+}
diff -r baa9b76ea3e1 -r 207523704fb1 tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h     Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/libxc/xg_save_restore.h     Mon Jan 22 15:15:32 2007 +0000
@@ -34,11 +34,10 @@
 **
 ** Returns 1 on success, 0 on failure.
 */
-static int get_platform_info(int xc_handle, uint32_t dom,
-                             /* OUT */ unsigned long *max_mfn,
-                             /* OUT */ unsigned long *hvirt_start,
-                             /* OUT */ unsigned int *pt_levels)
-
+static inline int get_platform_info(int xc_handle, uint32_t dom,
+                                    /* OUT */ unsigned long *max_mfn,
+                                    /* OUT */ unsigned long *hvirt_start,
+                                    /* OUT */ unsigned int *pt_levels)
 {
     xen_capabilities_info_t xen_caps = "";
     xen_platform_parameters_t xen_params;
diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/python/xen/lowlevel/xc/xc.c Mon Jan 22 15:15:32 2007 +0000
@@ -1064,9 +1064,9 @@ static PyMethodDef pyxc_methods[] = {
       "Destroy a domain.\n"
       " dom [int]:    Identifier of domain to be destroyed.\n\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
-    
+
     { "domain_resume", 
-      (PyCFunction)pyxc_domain_resume, 
+      (PyCFunction)pyxc_domain_resume,
       METH_VARARGS, "\n"
       "Resume execution of a suspended domain.\n"
       " dom [int]: Identifier of domain to be resumed.\n\n"
diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py   Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/python/xen/xend/XendCheckpoint.py   Mon Jan 22 15:15:32 2007 +0000
@@ -122,6 +122,8 @@ def save(fd, dominfo, network, live, dst
             os.remove("/tmp/xen.qemu-dm.%d" % dominfo.getDomid())
 
         dominfo.destroyDomain()
+        dominfo.testDeviceComplete()
+
         try:
             dominfo.setName(domain_name)
         except VmError:
@@ -134,11 +136,31 @@ def save(fd, dominfo, network, live, dst
     except Exception, exn:
         log.exception("Save failed on domain %s (%s).", domain_name,
                       dominfo.getDomid())
+
+        dominfo._releaseDevices()
+        dominfo.testDeviceComplete()
+        dominfo.testvifsComplete()
+        log.debug("XendCheckpoint.save: devices released")
+
+        dominfo._resetChannels()
+
+        dominfo._removeDom('control/shutdown')
+        dominfo._removeDom('device-misc/vif/nextDeviceID')
+
+        dominfo._createChannels()
+        dominfo._introduceDomain()
+        dominfo._storeDomDetails()
+
+        dominfo._createDevices()
+        log.debug("XendCheckpoint.save: devices created")
+
+        dominfo.resumeDomain()
+        log.debug("XendCheckpoint.save: resumeDomain")
+
         try:
             dominfo.setName(domain_name)
         except:
             log.exception("Failed to reset the migrating domain's name")
-        raise Exception, exn
 
 
 def restore(xd, fd, dominfo = None, paused = False):
diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py       Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/python/xen/xend/XendDomain.py       Mon Jan 22 15:15:32 2007 +0000
@@ -1166,7 +1166,6 @@ class XendDomain:
         sock.send("receive\n")
         sock.recv(80)
         XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst)
-        dominfo.testDeviceComplete()
         sock.close()
 
     def domain_save(self, domid, dst):
diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/python/xen/xend/XendDomainInfo.py   Mon Jan 22 15:15:32 2007 +0000
@@ -1580,6 +1580,16 @@ class XendDomainInfo:
             log.exception("Exception in alloc_unbound(%d)", self.domid)
             raise
 
+    def _resetChannels(self):
+        """Reset all event channels in the domain.
+        """
+        try:
+            return xc.evtchn_reset(dom=self.domid)
+        except:
+            log.exception("Exception in evtcnh_reset(%d)", self.domid)
+            raise
+
+
     #
     # Bootloader configuration
     #
@@ -1727,6 +1737,25 @@ class XendDomainInfo:
             test = 0
             diff = time.time() - start
             for i in self.getDeviceController('vbd').deviceIDs():
+                test = 1
+                log.info("Dev %s still active, looping...", i)
+                time.sleep(0.1)
+                
+            if test == 0:
+                break
+            if diff >= MIGRATE_TIMEOUT:
+                log.info("Dev still active but hit max loop timeout")
+                break
+
+    def testvifsComplete(self):
+        """ In case vifs are released and then created for the same
+        domain, we need to wait for the devices to shut down.
+        """
+        start = time.time()
+        while True:
+            test = 0
+            diff = time.time() - start
+            for i in self.getDeviceController('vif').deviceIDs():
                 test = 1
                 log.info("Dev %s still active, looping...", i)
                 time.sleep(0.1)

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.