[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH RFC 07/20] migration: defer precopy policy to libxl



Hi,

I would like to encourage this patch - as I have use for it outside
of your postcopy work.

Some things people will comment on:
You've used 'unsigned' without the int keyword, which people don't like.
Also on line 324, your missing space between 'if (' and 'ctx->save.policy_decision'.

Also, I'm not a fan of your CONSULT_POLICY macro, which you've defined at
a odd point in your function, and I think could be done more elegantly.
Worst of all ... its a macro - which I think should generally be avoided unless there is little choice. I'm sure you could write a helper function to replace this.

Cheers,

-jenny

On 27/03/17 10:06, Joshua Otto wrote:
The precopy phase of the xc_domain_save() live migration algorithm has
historically been implemented to run until either a) (almost) no pages
are dirty or b) some fixed, hard-coded maximum number of precopy
iterations has been exceeded.  This policy and its implementation are
less than ideal for a few reasons:
- the logic of the policy is intertwined with the control flow of the
   mechanism of the precopy stage
- it can't take into account facts external to the immediate
   migration context, such as interactive user input or the passage of
   wall-clock time
- it does not permit the user to change their mind, over time, about
   what to do at the end of the precopy (they get an unconditional
   transition into the stop-and-copy phase of the migration)

To permit users to implement arbitrary higher-level policies governing
when the live migration precopy phase should end, and what should be
done next:
- add a precopy_policy() callback to the xc_domain_save() user-supplied
   callbacks
- during the precopy phase of live migrations, consult this policy after
   each batch of pages transmitted and take the dictated action, which
   may be to a) abort the migration entirely, b) continue with the
   precopy, or c) proceed to the stop-and-copy phase.
- provide an implementation of the old policy as such a callback in
   libxl and plumb it through the IPC machinery to libxc, effectively
   maintaing the old policy for now

Signed-off-by: Joshua Otto <jtotto@xxxxxxxxxxxx>
---
  tools/libxc/include/xenguest.h     |  23 ++++-
  tools/libxc/xc_nomigrate.c         |   3 +-
  tools/libxc/xc_sr_common.h         |   7 +-
  tools/libxc/xc_sr_save.c           | 194 ++++++++++++++++++++++++++-----------
  tools/libxl/libxl_dom_save.c       |  20 ++++
  tools/libxl/libxl_save_callout.c   |   3 +-
  tools/libxl/libxl_save_helper.c    |   7 +-
  tools/libxl/libxl_save_msgs_gen.pl |   4 +-
  8 files changed, 189 insertions(+), 72 deletions(-)

diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
index aa8cc8b..30ffb6f 100644
--- a/tools/libxc/include/xenguest.h
+++ b/tools/libxc/include/xenguest.h
@@ -39,6 +39,14 @@
   */
  struct xenevtchn_handle;
+/* For save's precopy_policy(). */
+struct precopy_stats
+{
+    unsigned iteration;
+    unsigned total_written;
+    long dirty_count; /* -1 if unknown */
+};
+
  /* callbacks provided by xc_domain_save */
  struct save_callbacks {
      /* Called after expiration of checkpoint interval,
@@ -46,6 +54,17 @@ struct save_callbacks {
       */
      int (*suspend)(void* data);
+ /* Called after every batch of page data sent during the precopy phase of a
+     * live migration to ask the caller what to do next based on the current
+     * state of the precopy migration.
+     */
+#define XGS_POLICY_ABORT          (-1) /* Abandon the migration entirely and
+                                        * tidy up. */
+#define XGS_POLICY_CONTINUE_PRECOPY 0  /* Remain in the precopy phase. */
+#define XGS_POLICY_STOP_AND_COPY    1  /* Immediately suspend and transmit the
+                                        * remaining dirty pages. */
+    int (*precopy_policy)(struct precopy_stats stats, void *data);
+
      /* Called after the guest's dirty pages have been
       *  copied into an output buffer.
       * Callback function resumes the guest & the device model,
@@ -100,8 +119,8 @@ typedef enum {
   *        doesn't use checkpointing
   * @return 0 on success, -1 on failure
   */
-int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t 
max_iters,
-                   uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
+int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
+                   uint32_t flags /* XCFLAGS_xxx */,
                     struct save_callbacks* callbacks, int hvm,
                     xc_migration_stream_t stream_type, int recv_fd);
diff --git a/tools/libxc/xc_nomigrate.c b/tools/libxc/xc_nomigrate.c
index 15c838f..2af64e4 100644
--- a/tools/libxc/xc_nomigrate.c
+++ b/tools/libxc/xc_nomigrate.c
@@ -20,8 +20,7 @@
  #include <xenctrl.h>
  #include <xenguest.h>
-int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iters,
-                   uint32_t max_factor, uint32_t flags,
+int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t flags,
                     struct save_callbacks* callbacks, int hvm,
                     xc_migration_stream_t stream_type, int recv_fd)
  {
diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
index b1aa88e..a9160bd 100644
--- a/tools/libxc/xc_sr_common.h
+++ b/tools/libxc/xc_sr_common.h
@@ -198,12 +198,11 @@ struct xc_sr_context
              /* Further debugging information in the stream. */
              bool debug;
- /* Parameters for tweaking live migration. */
-            unsigned max_iterations;
-            unsigned dirty_threshold;
-
              unsigned long p2m_size;
+ struct precopy_stats stats;
+            int policy_decision;
+
              xen_pfn_t *batch_pfns;
              unsigned nr_batch_pfns;
              unsigned long *deferred_pages;
diff --git a/tools/libxc/xc_sr_save.c b/tools/libxc/xc_sr_save.c
index 797aec5..eb95334 100644
--- a/tools/libxc/xc_sr_save.c
+++ b/tools/libxc/xc_sr_save.c
@@ -271,13 +271,29 @@ static int write_batch(struct xc_sr_context *ctx)
  }
/*
+ * Test if the batch is full.
+ */
+static bool batch_full(struct xc_sr_context *ctx)
+{
+    return ctx->save.nr_batch_pfns == MAX_BATCH_SIZE;
+}
+
+/*
+ * Test if the batch is empty.
+ */
+static bool batch_empty(struct xc_sr_context *ctx)
+{
+    return ctx->save.nr_batch_pfns == 0;
+}
+
+/*
   * Flush a batch of pfns into the stream.
   */
  static int flush_batch(struct xc_sr_context *ctx)
  {
      int rc = 0;
- if ( ctx->save.nr_batch_pfns == 0 )
+    if ( batch_empty(ctx) )
          return rc;
rc = write_batch(ctx);
@@ -293,19 +309,12 @@ static int flush_batch(struct xc_sr_context *ctx)
  }
/*
- * Add a single pfn to the batch, flushing the batch if full.
+ * Add a single pfn to the batch.
   */
-static int add_to_batch(struct xc_sr_context *ctx, xen_pfn_t pfn)
+static void add_to_batch(struct xc_sr_context *ctx, xen_pfn_t pfn)
  {
-    int rc = 0;
-
-    if ( ctx->save.nr_batch_pfns == MAX_BATCH_SIZE )
-        rc = flush_batch(ctx);
-
-    if ( rc == 0 )
-        ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn;
-
-    return rc;
+    assert(ctx->save.nr_batch_pfns < MAX_BATCH_SIZE);
+    ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn;
  }
/*
@@ -352,10 +361,15 @@ static int suspend_domain(struct xc_sr_context *ctx)
   * Send a subset of pages in the guests p2m, according to the dirty bitmap.
   * Used for each subsequent iteration of the live migration loop.
   *
+ * During the precopy stage of a live migration, test the user-supplied
+ * policy function after each batch of pages and cut off the operation
+ * early if indicated.  Unless aborting, the dirty pages remaining in this 
round
+ * are transferred into the deferred_pages bitmap.
+ *
   * Bitmap is bounded by p2m_size.
   */
  static int send_dirty_pages(struct xc_sr_context *ctx,
-                            unsigned long entries)
+                            unsigned long entries, bool precopy)
  {
      xc_interface *xch = ctx->xch;
      xen_pfn_t p;
@@ -364,31 +378,57 @@ static int send_dirty_pages(struct xc_sr_context *ctx,
      DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
                                      &ctx->save.dirty_bitmap_hbuf);
- for ( p = 0, written = 0; p < ctx->save.p2m_size; ++p )
+    int (*precopy_policy)(struct precopy_stats, void *) =
+        ctx->save.callbacks->precopy_policy;
+    void *data = ctx->save.callbacks->data;
+
+    assert(batch_empty(ctx));
+    for ( p = 0, written = 0; p < ctx->save.p2m_size; )
      {
-        if ( !test_bit(p, dirty_bitmap) )
-            continue;
+        if ( ctx->save.live && precopy )
+        {
+            ctx->save.policy_decision = precopy_policy(ctx->save.stats, data);
+            if ( ctx->save.policy_decision == XGS_POLICY_ABORT )
+            {
+                return -1;
+            }
+            else if ( ctx->save.policy_decision != XGS_POLICY_CONTINUE_PRECOPY 
)
+            {
+                /* Any outstanding dirty pages are now deferred until the next
+                 * phase of the migration. */
+                bitmap_or(ctx->save.deferred_pages, dirty_bitmap,
+                          ctx->save.p2m_size);
+                if ( entries > written )
+                    ctx->save.nr_deferred_pages += entries - written;
+
+                goto done;
+            }
+        }
- rc = add_to_batch(ctx, p);
+        for ( ; p < ctx->save.p2m_size && !batch_full(ctx); ++p )
+        {
+            if ( test_and_clear_bit(p, dirty_bitmap) )
+            {
+                add_to_batch(ctx, p);
+                ++written;
+                ++ctx->save.stats.total_written;
+            }
+        }
+
+        rc = flush_batch(ctx);
          if ( rc )
              return rc;
- /* Update progress every 4MB worth of memory sent. */
-        if ( (written & ((1U << (22 - 12)) - 1)) == 0 )
-            xc_report_progress_step(xch, written, entries);
-
-        ++written;
+        /* Update progress after every batch (4MB) worth of memory sent. */
+        xc_report_progress_step(xch, written, entries);
      }
- rc = flush_batch(ctx);
-    if ( rc )
-        return rc;
-
      if ( written > entries )
          DPRINTF("Bitmap contained more entries than expected...");
xc_report_progress_step(xch, entries, entries); + done:
      return ctx->save.ops.check_vm_state(ctx);
  }
@@ -396,14 +436,14 @@ static int send_dirty_pages(struct xc_sr_context *ctx,
   * Send all pages in the guests p2m.  Used as the first iteration of the live
   * migration loop, and for a non-live save.
   */
-static int send_all_pages(struct xc_sr_context *ctx)
+static int send_all_pages(struct xc_sr_context *ctx, bool precopy)
  {
      DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
                                      &ctx->save.dirty_bitmap_hbuf);
bitmap_set(dirty_bitmap, ctx->save.p2m_size); - return send_dirty_pages(ctx, ctx->save.p2m_size);
+    return send_dirty_pages(ctx, ctx->save.p2m_size, precopy);
  }
static int enable_logdirty(struct xc_sr_context *ctx)
@@ -446,8 +486,7 @@ static int update_progress_string(struct xc_sr_context *ctx,
      xc_interface *xch = ctx->xch;
      char *new_str = NULL;
- if ( asprintf(&new_str, "Frames iteration %u of %u",
-                  iter, ctx->save.max_iterations) == -1 )
+    if ( asprintf(&new_str, "Frames iteration %u", iter) == -1 )
      {
          PERROR("Unable to allocate new progress string");
          return -1;
@@ -468,20 +507,47 @@ static int send_memory_live(struct xc_sr_context *ctx)
      xc_interface *xch = ctx->xch;
      xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size };
      char *progress_str = NULL;
-    unsigned x;
      int rc;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+                                    &ctx->save.dirty_bitmap_hbuf);
+
+    int (*precopy_policy)(struct precopy_stats, void *) =
+        ctx->save.callbacks->precopy_policy;
+    void *data = ctx->save.callbacks->data;
+
      rc = update_progress_string(ctx, &progress_str, 0);
      if ( rc )
          goto out;
- rc = send_all_pages(ctx);
+#define CONSULT_POLICY                                                        \
+    do {                                                                      \
+        if ( ctx->save.policy_decision == XGS_POLICY_ABORT )                  \
+        {                                                                     \
+            rc = -1;                                                          \
+            goto out;                                                         \
+        }                                                                     \
+        else if ( ctx->save.policy_decision != XGS_POLICY_CONTINUE_PRECOPY )  \
+        {                                                                     \
+            rc = 0;                                                           \
+            goto out;                                                         \
+        }                                                                     \
+    } while (0)
+
+    ctx->save.stats = (struct precopy_stats)
+        {
+            .iteration     = 0,
+            .total_written = 0,
+            .dirty_count   = -1
+        };
+    rc = send_all_pages(ctx, /* precopy */ true);
      if ( rc )
          goto out;
- for ( x = 1;
-          ((x < ctx->save.max_iterations) &&
-           (stats.dirty_count > ctx->save.dirty_threshold)); ++x )
+    /* send_all_pages() has updated the stats */
+    CONSULT_POLICY;
+
+    for ( ctx->save.stats.iteration = 1; ; ++ctx->save.stats.iteration )
      {
          if ( xc_shadow_control(
                   xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
@@ -493,18 +559,42 @@ static int send_memory_live(struct xc_sr_context *ctx)
              goto out;
          }
- if ( stats.dirty_count == 0 )
-            break;
+        /* Check the new dirty_count against the policy. */
+        ctx->save.stats.dirty_count = stats.dirty_count;
+        ctx->save.policy_decision = precopy_policy(ctx->save.stats, data);
+        if ( ctx->save.policy_decision == XGS_POLICY_ABORT )
+        {
+            rc = -1;
+            goto out;
+        }
+        else if (ctx->save.policy_decision != XGS_POLICY_CONTINUE_PRECOPY )
+        {
+            bitmap_or(ctx->save.deferred_pages, dirty_bitmap,
+                      ctx->save.p2m_size);
+            ctx->save.nr_deferred_pages += stats.dirty_count;
+            rc = 0;
+            goto out;
+        }
+
+        /* After this point we won't know how many pages are really dirty until
+         * the next iteration. */
+        ctx->save.stats.dirty_count = -1;
- rc = update_progress_string(ctx, &progress_str, x);
+        rc = update_progress_string(ctx, &progress_str,
+                                    ctx->save.stats.iteration);
          if ( rc )
              goto out;
- rc = send_dirty_pages(ctx, stats.dirty_count);
+        rc = send_dirty_pages(ctx, stats.dirty_count, /* precopy */ true);
          if ( rc )
              goto out;
+
+        /* send_dirty_pages() has updated the stats */
+        CONSULT_POLICY;
      }
+#undef CONSULT_POLICY
+
   out:
      xc_set_progress_prefix(xch, NULL);
      free(progress_str);
@@ -595,7 +685,7 @@ static int suspend_and_send_dirty(struct xc_sr_context *ctx)
      if ( ctx->save.live )
      {
          rc = update_progress_string(ctx, &progress_str,
-                                    ctx->save.max_iterations);
+                                    ctx->save.stats.iteration);
          if ( rc )
              goto out;
      }
@@ -614,7 +704,8 @@ static int suspend_and_send_dirty(struct xc_sr_context *ctx)
          }
      }
- rc = send_dirty_pages(ctx, stats.dirty_count + ctx->save.nr_deferred_pages);
+    rc = send_dirty_pages(ctx, stats.dirty_count + ctx->save.nr_deferred_pages,
+                          /* precopy */ false);
      if ( rc )
          goto out;
@@ -645,7 +736,7 @@ static int verify_frames(struct xc_sr_context *ctx)
          goto out;
xc_set_progress_prefix(xch, "Frames verify");
-    rc = send_all_pages(ctx);
+    rc = send_all_pages(ctx, /* precopy */ false);
      if ( rc )
          goto out;
@@ -719,7 +810,7 @@ static int send_domain_memory_nonlive(struct xc_sr_context *ctx) xc_set_progress_prefix(xch, "Frames"); - rc = send_all_pages(ctx);
+    rc = send_all_pages(ctx, /* precopy */ false);
      if ( rc )
          goto err;
@@ -910,8 +1001,7 @@ static int save(struct xc_sr_context *ctx, uint16_t guest_type)
  };
int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
-                   uint32_t max_iters, uint32_t max_factor, uint32_t flags,
-                   struct save_callbacks* callbacks, int hvm,
+                   uint32_t flags, struct save_callbacks* callbacks, int hvm,
                     xc_migration_stream_t stream_type, int recv_fd)
  {
      struct xc_sr_context ctx =
@@ -932,25 +1022,17 @@ int xc_domain_save(xc_interface *xch, int io_fd, 
uint32_t dom,
             stream_type == XC_MIG_STREAM_REMUS ||
             stream_type == XC_MIG_STREAM_COLO);
- /*
-     * TODO: Find some time to better tweak the live migration algorithm.
-     *
-     * These parameters are better than the legacy algorithm especially for
-     * busy guests.
-     */
-    ctx.save.max_iterations = 5;
-    ctx.save.dirty_threshold = 50;
-
      /* Sanity checks for callbacks. */
      if ( hvm )
          assert(callbacks->switch_qemu_logdirty);
+    if ( ctx.save.live )
+        assert(callbacks->precopy_policy);
      if ( ctx.save.checkpointed )
          assert(callbacks->checkpoint && callbacks->aftercopy);
      if ( ctx.save.checkpointed == XC_MIG_STREAM_COLO )
          assert(callbacks->wait_checkpoint);
- DPRINTF("fd %d, dom %u, max_iters %u, max_factor %u, flags %u, hvm %d",
-            io_fd, dom, max_iters, max_factor, flags, hvm);
+    DPRINTF("fd %d, dom %u, flags %u, hvm %d", io_fd, dom, flags, hvm);
if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
      {
diff --git a/tools/libxl/libxl_dom_save.c b/tools/libxl/libxl_dom_save.c
index 77fe30e..6d28cce 100644
--- a/tools/libxl/libxl_dom_save.c
+++ b/tools/libxl/libxl_dom_save.c
@@ -328,6 +328,25 @@ int 
libxl__save_emulator_xenstore_data(libxl__domain_save_state *dss,
      return rc;
  }
+/*
+ * This is the live migration precopy policy - it's called periodically during
+ * the precopy phase of live migrations, and is responsible for deciding when
+ * the precopy phase should terminate and what should be done next.
+ *
+ * The policy implemented here behaves identically to the policy previously
+ * hard-coded into xc_domain_save() - it proceeds to the stop-and-copy phase of
+ * the live migration when there are either fewer than 50 dirty pages, or more
+ * than 5 precopy rounds have completed.
+ */
+static int libxl__save_live_migration_simple_precopy_policy(
+    struct precopy_stats stats, void *user)
+{
+    return ((stats.dirty_count >= 0 && stats.dirty_count < 50) ||
+            stats.iteration >= 5)
+        ? XGS_POLICY_STOP_AND_COPY
+        : XGS_POLICY_CONTINUE_PRECOPY;
+}
+
  /*----- main code for saving, in order of execution -----*/
void libxl__domain_save(libxl__egc *egc, libxl__domain_save_state *dss)
@@ -401,6 +420,7 @@ void libxl__domain_save(libxl__egc *egc, 
libxl__domain_save_state *dss)
      if (dss->checkpointed_stream == LIBXL_CHECKPOINTED_STREAM_NONE)
          callbacks->suspend = libxl__domain_suspend_callback;
+ callbacks->precopy_policy = libxl__save_live_migration_simple_precopy_policy;
      callbacks->switch_qemu_logdirty = 
libxl__domain_suspend_common_switch_qemu_logdirty;
dss->sws.ao = dss->ao;
diff --git a/tools/libxl/libxl_save_callout.c b/tools/libxl/libxl_save_callout.c
index 46b892c..026b572 100644
--- a/tools/libxl/libxl_save_callout.c
+++ b/tools/libxl/libxl_save_callout.c
@@ -89,8 +89,7 @@ void libxl__xc_domain_save(libxl__egc *egc, 
libxl__domain_save_state *dss,
          libxl__srm_callout_enumcallbacks_save(&shs->callbacks.save.a);
const unsigned long argnums[] = {
-        dss->domid, 0, 0, dss->xcflags, dss->hvm,
-        cbflags, dss->checkpointed_stream,
+        dss->domid, dss->xcflags, dss->hvm, cbflags, dss->checkpointed_stream,
      };
shs->ao = ao;
diff --git a/tools/libxl/libxl_save_helper.c b/tools/libxl/libxl_save_helper.c
index d3def6b..0241a6b 100644
--- a/tools/libxl/libxl_save_helper.c
+++ b/tools/libxl/libxl_save_helper.c
@@ -251,8 +251,6 @@ int main(int argc, char **argv)
          io_fd =                             atoi(NEXTARG);
          recv_fd =                           atoi(NEXTARG);
          uint32_t dom =                      strtoul(NEXTARG,0,10);
-        uint32_t max_iters =                strtoul(NEXTARG,0,10);
-        uint32_t max_factor =               strtoul(NEXTARG,0,10);
          uint32_t flags =                    strtoul(NEXTARG,0,10);
          int hvm =                           atoi(NEXTARG);
          unsigned cbflags =                  strtoul(NEXTARG,0,10);
@@ -264,9 +262,8 @@ int main(int argc, char **argv)
          startup("save");
          setup_signals(save_signal_handler);
- r = xc_domain_save(xch, io_fd, dom, max_iters, max_factor, flags,
-                           &helper_save_callbacks, hvm, stream_type,
-                           recv_fd);
+        r = xc_domain_save(xch, io_fd, dom, flags, &helper_save_callbacks, hvm,
+                           stream_type, recv_fd);
          complete(r);
} else if (!strcmp(mode,"--restore-domain")) {
diff --git a/tools/libxl/libxl_save_msgs_gen.pl 
b/tools/libxl/libxl_save_msgs_gen.pl
index 27845bb..50c97b4 100755
--- a/tools/libxl/libxl_save_msgs_gen.pl
+++ b/tools/libxl/libxl_save_msgs_gen.pl
@@ -33,6 +33,7 @@ our @msgs = (
                                                'xen_pfn_t', 'console_gfn'] ],
      [  9, 'srW',    "complete",              [qw(int retval
                                                   int errnoval)] ],
+    [ 10, 'scxW',   "precopy_policy", ['struct precopy_stats', 'stats'] ]
  );
#----------------------------------------
@@ -141,7 +142,8 @@ static void bytes_put(unsigned char *const buf, int *len,
END -foreach my $simpletype (qw(int uint16_t uint32_t unsigned), 'unsigned long', 'xen_pfn_t') {
+foreach my $simpletype (qw(int uint16_t uint32_t unsigned),
+                        'unsigned long', 'xen_pfn_t', 'struct precopy_stats') {
      my $typeid = typeid($simpletype);
      $out_body{'callout'} .= <<END;
  static int ${typeid}_get(const unsigned char **msg,


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.