[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [RFC Patch v3 21/22] setup and control colo-agent for secondary vm



This patch adds the machinery required for protecting a secondary vm's
network device state. This patch implements the interfaces required by
the checkpoint abstract device layer. A note about the implementation:
   a) setup() and teardown() are called for each vif attached to the
      secondary vm.
      During setup(), the hotplug script is called to setup COLO agent for
      given vif. The script does the follow things:
      i)  redirect vif egress traffic to the FORWARD device
      ii) redirect FORWARD device egress traffic to vif

      During teardown(), the hotplug scripts are called again for each
      vif. The scripts does the follow things:
      i)  remove the vif->FORWARD traffic redirection
      ii) remove the FORWARD->vif traffic redirection

   b) Nothing should be done for secondary vm's network device.

Signed-off-by: Wen Congyang <wency@xxxxxxxxxxxxxx>
---
 tools/libxl/libxl_colo_nic.c     |  21 ++++++
 tools/libxl/libxl_colo_restore.c | 152 +++++++++++++++++++++++++++++++++------
 tools/libxl/libxl_internal.h     |   2 +
 3 files changed, 155 insertions(+), 20 deletions(-)

diff --git a/tools/libxl/libxl_colo_nic.c b/tools/libxl/libxl_colo_nic.c
index 56bccad..9955ef1 100644
--- a/tools/libxl/libxl_colo_nic.c
+++ b/tools/libxl/libxl_colo_nic.c
@@ -286,3 +286,24 @@ const libxl__checkpoint_device_instance_ops 
colo_save_device_nic = {
     .setup = colo_nic_save_setup,
     .teardown = colo_nic_save_teardown,
 };
+
+/* ======== secondary ======== */
+static void colo_nic_restore_setup(libxl__checkpoint_device *dev)
+{
+    libxl__colo_restore_state *crs = CONTAINER_OF(dev->cds, *crs, cds);
+
+    colo_nic_setup(dev, secondary, crs->colo_agent_script);
+}
+
+static void colo_nic_restore_teardown(libxl__checkpoint_device *dev)
+{
+    libxl__colo_restore_state *crs = CONTAINER_OF(dev->cds, *crs, cds);
+
+    colo_nic_teardown(dev, secondary, crs->colo_agent_script);
+}
+
+const libxl__checkpoint_device_instance_ops colo_restore_device_nic = {
+    .kind = LIBXL__DEVICE_KIND_CHECKPOINT_NIC,
+    .setup = colo_nic_restore_setup,
+    .teardown = colo_nic_restore_teardown,
+};
diff --git a/tools/libxl/libxl_colo_restore.c b/tools/libxl/libxl_colo_restore.c
index 3fd587b..c9b57a4 100644
--- a/tools/libxl/libxl_colo_restore.c
+++ b/tools/libxl/libxl_colo_restore.c
@@ -40,6 +40,9 @@ struct libxl__colo_restore_checkpoint_state {
     libxl__logdirty_switch lds;
     libxl__colo_restore_state *crs;
     int status;
+    /* used for teardown */
+    int teardown_devices;
+    int saved_rc;
 
     void (*callback)(libxl__egc *,
                      libxl__colo_restore_checkpoint_state *,
@@ -58,6 +61,13 @@ static void libxl__colo_restore_domain_resume_callback(void 
*data);
 static void libxl__colo_restore_domain_checkpoint_callback(void *data);
 static void libxl__colo_restore_domain_suspend_callback(void *data);
 
+extern const libxl__checkpoint_device_instance_ops colo_restore_device_nic;
+
+static const libxl__checkpoint_device_instance_ops *colo_restore_ops[] = {
+    &colo_restore_device_nic,
+    NULL,
+};
+
 /* ===================== colo: common functions ===================== */
 static void colo_enable_logdirty(libxl__colo_restore_state *crs, libxl__egc 
*egc)
 {
@@ -138,6 +148,28 @@ static void colo_resume_vm(libxl__egc *egc,
     return;
 }
 
+static int init_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+    /* init device subkind-specific state in the libxl ctx */
+    int rc;
+    STATE_AO_GC(cds->ao);
+
+    rc = init_subkind_colo_nic(cds);
+    if (rc) goto out;
+
+    rc = 0;
+out:
+    return rc;
+}
+
+static void cleanup_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+    /* cleanup device subkind-specific state in the libxl ctx */
+    STATE_AO_GC(cds->ao);
+
+    cleanup_subkind_colo_nic(cds);
+}
+
 
 /* ================ colo: setup restore environment ================ */
 static void libxl__colo_domain_create_cb(libxl__egc *egc,
@@ -266,6 +298,9 @@ static void libxl__colo_domain_create_cb(libxl__egc *egc,
 
 
 /* ================ colo: teardown restore environment ================ */
+static void colo_restore_teardown_done(libxl__egc *egc,
+                                       libxl__checkpoint_devices_state *cds,
+                                       int rc);
 static void do_failover_done(libxl__egc *egc,
                              libxl__colo_restore_checkpoint_state* crcs,
                              int rc);
@@ -312,11 +347,38 @@ void libxl__colo_restore_teardown(libxl__egc *egc,
     EGC_GC;
 
     if (!dirty_bitmap)
-        goto do_failover;
+        goto teardown_devices;
 
     xc_hypercall_buffer_free_pages(CTX->xch, dirty_bitmap, NRPAGES(bsize));
 
-do_failover:
+teardown_devices:
+    crcs->saved_rc = rc;
+    if (!crcs->teardown_devices) {
+        colo_restore_teardown_done(egc, &crs->cds, 0);
+        return;
+    }
+
+    crs->cds.callback = colo_restore_teardown_done;
+    libxl__checkpoint_devices_teardown(egc, &crs->cds);
+}
+
+static void colo_restore_teardown_done(libxl__egc *egc,
+                                       libxl__checkpoint_devices_state *cds,
+                                       int rc)
+{
+    libxl__colo_restore_state *crs = CONTAINER_OF(cds, *crs, cds);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+
+    EGC_GC;
+
+    if (rc)
+        LOG(ERROR, "COLO: failed to teardown device after setup failed"
+            " for guest with domid %u, rc %d", cds->domid, rc);
+
+    cleanup_device_subkind(cds);
+
+    rc = crcs->saved_rc;
     if (!rc) {
         crcs->callback = do_failover_done;
         do_failover(egc, crs);
@@ -405,6 +467,11 @@ static void colo_reenable_logdirty(libxl__egc *egc,
 static void colo_reenable_logdirty_done(libxl__egc *egc,
                                         libxl__logdirty_switch *lds,
                                         int rc);
+static void colo_setup_checkpoint_devices(libxl__egc *egc,
+                                          libxl__colo_restore_state *crs);
+static void colo_restore_setup_cds_done(libxl__egc *egc,
+                                        libxl__checkpoint_devices_state *cds,
+                                        int rc);
 
 static void libxl__colo_restore_domain_resume_callback(void *data)
 {
@@ -516,7 +583,6 @@ static void colo_write_svm_resumed(libxl__egc *egc,
     dc->copywhat = crcs->copywhat[2];
     dc->writewhat = "colo stream";
     dc->callback = colo_common_send_data_done;
-    /* TODO: configure network */
     crcs->callback = NULL;
 
     rc = libxl__datacopier_start(dc);
@@ -539,12 +605,9 @@ static void colo_enable_logdirty_done(libxl__egc *egc,
                                       int rc)
 {
     libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
-    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
 
     /* Convenience aliases */
     libxl__colo_restore_state *const crs = crcs->crs;
-    libxl__save_helper_state *const shs = &dcs->shs;
-    const uint32_t domid = crs->domid;
 
     STATE_AO_GC(crs->ao);
 
@@ -558,19 +621,7 @@ static void colo_enable_logdirty_done(libxl__egc *egc,
         return;
     }
 
-    /* We have enabled secondary vm's logdirty, so we can unpause it now */
-    rc = libxl__domain_unpause(gc, domid);
-    if (rc) {
-        LOG(ERROR, "cannot unpause secondary vm");
-        goto out;
-    }
-
-    colo_write_svm_resumed(egc, crcs);
-
-    return;
-
-out:
-    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+    colo_setup_checkpoint_devices(egc, crs);
 }
 
 static void colo_reenable_logdirty(libxl__egc *egc,
@@ -609,7 +660,6 @@ static void colo_reenable_logdirty_done(libxl__egc *egc,
 
     /* Convenience aliases */
     libxl__save_helper_state *const shs = &dcs->shs;
-    const uint32_t domid = crcs->crs->domid;
 
     STATE_AO_GC(crcs->crs->ao);
 
@@ -618,6 +668,68 @@ static void colo_reenable_logdirty_done(libxl__egc *egc,
         goto out;
     }
 
+    colo_setup_checkpoint_devices(egc, crcs->crs);
+
+    return;
+
+out:
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+}
+
+/*
+ * We cannot setup checkpoint devices in libxl__colo_restore_setup(),
+ * because the guest is not ready.
+ */
+static void colo_setup_checkpoint_devices(libxl__egc *egc,
+                                          libxl__colo_restore_state *crs)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+    /* Convenience aliases */
+    libxl__checkpoint_devices_state *cds = &crs->cds;
+    libxl__save_helper_state *const shs = &dcs->shs;
+
+    STATE_AO_GC(crs->ao);
+
+    crcs->teardown_devices = 1;
+
+    cds->device_kind_flags = (1 << LIBXL__DEVICE_KIND_CHECKPOINT_NIC);
+    cds->callback = colo_restore_setup_cds_done;
+    cds->ao = ao;
+    cds->domid = crs->domid;
+    cds->ops = colo_restore_ops;
+
+    if (init_device_subkind(cds))
+        goto out;
+
+    libxl__checkpoint_devices_setup(egc, cds);
+    return;
+
+out:
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+}
+
+static void colo_restore_setup_cds_done(libxl__egc *egc,
+                                        libxl__checkpoint_devices_state *cds,
+                                        int rc)
+{
+    libxl__colo_restore_state *crs = CONTAINER_OF(cds, *crs, cds);
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+    /* Convenience aliases */
+    libxl__save_helper_state *const shs = &dcs->shs;
+    const uint32_t domid = crs->domid;
+
+    STATE_AO_GC(cds->ao);
+
+    if (rc) {
+        LOG(ERROR, "COLO: failed to setup device for guest with domid %u",
+            cds->domid);
+        goto out;
+    }
+
     /* We have enabled secondary vm's logdirty, so we can unpause it now */
     rc = libxl__domain_unpause(gc, domid);
     if (rc) {
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index b1a7208..00dfa1e 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -3078,10 +3078,12 @@ struct libxl__colo_restore_state {
     int pae;
     int superpages;
     libxl__colo_callback *callback;
+    char *colo_agent_script;
 
     /* private, colo restore checkpoint state */
     libxl__domain_create_cb *saved_cb;
     void *crcs;
+    libxl__checkpoint_devices_state cds;
 };
 
 struct libxl__domain_create_state {
-- 
1.9.3


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.