
[PATCH v2 2/2] live migration: use superpages for physmap population on restore when possible



Implement a heuristic for x86 HVM guests which tries to use superpages while
populating the guest physmap on live migration restore. This should improve
memory access performance for these guests.
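
For illustration, here is a minimal standalone sketch of the order-selection
heuristic (it mirrors x86_hvm_get_physmap_order() introduced below; the
ORDER_* constants and the p2m_size/pvh parameters are stand-ins for the
corresponding context fields used in the patch):

  #include <stdbool.h>
  #include <stdint.h>

  #define ORDER_4K    0                 /* 4KiB pages             */
  #define ORDER_2MB   9                 /* 2MiB superpages (2^9)  */
  #define ORDER_1GB  18                 /* 1GiB superpages (2^18) */

  /* Pick a population order for 'pfn', never crossing the end of the p2m. */
  static unsigned int physmap_order(uint64_t pfn, uint64_t p2m_size, bool pvh)
  {
      unsigned int order;

      if ( pfn >= p2m_size )
          return ORDER_4K;

      if ( (pfn >> ORDER_1GB) == 3 )
          order = ORDER_2MB;            /* fourth GB holds the low MMIO hole */
      else if ( (pfn >> ORDER_1GB) == 0 && !pvh )
          order = (pfn > 0x1ff) ? ORDER_2MB : ORDER_4K; /* HVM: first 2MB as 4K */
      else
          order = ORDER_1GB;

      /* Shrink the order if the superpage would spill past the p2m size. */
      if ( order == ORDER_1GB && (pfn | ((1UL << ORDER_1GB) - 1)) >= p2m_size )
          order = ORDER_2MB;
      if ( order == ORDER_2MB && (pfn | ((1UL << ORDER_2MB) - 1)) >= p2m_size )
          order = ORDER_4K;

      return order;
  }

For example, assuming a guest with at least 1GB of memory, a PVH guest gets
its first GB populated with a single 1GB allocation (pfn 0 yields order 18),
while an HVM guest gets the first 2MB populated with 4K pages (pfn 0 yields
order 0) and pfn 0x200 onwards with 2MB superpages.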

Signed-off-by: Andrei Semenov <andrei.semenov@xxxxxxxx>
---
 tools/include/xen-tools/libs.h           |  4 ++
 tools/libs/guest/xg_private.h            |  3 +
 tools/libs/guest/xg_sr_common.h          | 18 ++++-
 tools/libs/guest/xg_sr_restore.c         | 60 +++++++---------
 tools/libs/guest/xg_sr_restore_x86_hvm.c | 88 +++++++++++++++++++++++-
 tools/libs/guest/xg_sr_restore_x86_pv.c  | 22 +++++-
 6 files changed, 154 insertions(+), 41 deletions(-)

diff --git a/tools/include/xen-tools/libs.h b/tools/include/xen-tools/libs.h
index a16e0c3807..bdd903eb7b 100644
--- a/tools/include/xen-tools/libs.h
+++ b/tools/include/xen-tools/libs.h
@@ -63,4 +63,8 @@
 #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
 #endif
 
+#ifndef ROUNDDOWN
+#define ROUNDDOWN(_x,_w) ((unsigned long)(_x) & (-1UL << (_w)))
+#endif
+
 #endif /* __XEN_TOOLS_LIBS__ */
diff --git a/tools/libs/guest/xg_private.h b/tools/libs/guest/xg_private.h
index 09e24f1227..dcf63b5188 100644
--- a/tools/libs/guest/xg_private.h
+++ b/tools/libs/guest/xg_private.h
@@ -134,6 +134,9 @@ typedef uint64_t x86_pgentry_t;
 #define PAGE_SIZE_X86           (1UL << PAGE_SHIFT_X86)
 #define PAGE_MASK_X86           (~(PAGE_SIZE_X86-1))
 
+#define S_PAGE_1GB_ORDER        18
+#define S_PAGE_2MB_ORDER         9
+
 #define NRPAGES(x) (ROUNDUP(x, PAGE_SHIFT) >> PAGE_SHIFT)
 
 static inline xen_pfn_t xc_pfn_to_mfn(xen_pfn_t pfn, xen_pfn_t *p2m,
diff --git a/tools/libs/guest/xg_sr_common.h b/tools/libs/guest/xg_sr_common.h
index 941e24d7b7..96365e05a8 100644
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -137,7 +137,8 @@ struct xc_sr_restore_ops
     bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
 
     /* Set the GFN of a PFN. */
-    void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn);
+    void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn,
+                    unsigned int order);
 
     /* Set the type of a PFN. */
     void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn,
@@ -175,6 +176,17 @@ struct xc_sr_restore_ops
 #define BROKEN_CHANNEL 2
     int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);
 
+    /**
+     * Guest physmap population order is based on a heuristic which is family
+     * dependent.  The x86 HVM heuristic is interested in observing the whole
+     * first record in order to guess how the physmap should be populated.
+     */
+    void (*guess_physmap)(struct xc_sr_context *ctx, unsigned int count,
+                          const xen_pfn_t *pfns, const uint32_t *types);
+
+    /* Get the physmap population order for a given PFN. */
+    int (*get_physmap_order)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
+
     /**
      * Perform any actions required after the static data has arrived.  Called
      * when the STATIC_DATA_COMPLETE record has been recieved/inferred.
@@ -404,6 +416,10 @@ struct xc_sr_context
                     {
                         /* HVM context blob. */
                         struct xc_sr_blob context;
+
+                        /* Set guest type (based on the first record) */
+                        bool set_guest_type;
+                        bool pvh_guest;
                     } restore;
                 };
             } hvm;
diff --git a/tools/libs/guest/xg_sr_restore.c b/tools/libs/guest/xg_sr_restore.c
index 074b56d263..af864bd5ea 100644
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -86,18 +86,21 @@ static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
  * avoid realloc()ing too excessively, the size increased to the nearest power
  * of two large enough to contain the required pfn.
  */
-static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn,
+                             unsigned int order)
 {
     xc_interface *xch = ctx->xch;
+    xen_pfn_t start_pfn = ROUNDDOWN(pfn, order),
+        end_pfn = (ROUNDUP(pfn + 1, order) - 1);
 
-    if ( pfn > ctx->restore.max_populated_pfn )
+    if ( end_pfn > ctx->restore.max_populated_pfn )
     {
         xen_pfn_t new_max;
         size_t old_sz, new_sz;
         unsigned long *p;
 
         /* Round up to the nearest power of two larger than pfn, less 1. */
-        new_max = pfn;
+        new_max = end_pfn;
         new_max |= new_max >> 1;
         new_max |= new_max >> 2;
         new_max |= new_max >> 4;
@@ -123,8 +126,11 @@ static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
         ctx->restore.max_populated_pfn = new_max;
     }
 
-    assert(!test_bit(pfn, ctx->restore.populated_pfns));
-    set_bit(pfn, ctx->restore.populated_pfns);
+    for ( pfn = start_pfn; pfn <= end_pfn; ++pfn )
+    {
+        assert(!test_bit(pfn, ctx->restore.populated_pfns));
+        set_bit(pfn, ctx->restore.populated_pfns);
+    }
 
     return 0;
 }
@@ -138,60 +144,40 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
                   const xen_pfn_t *original_pfns, const uint32_t *types)
 {
     xc_interface *xch = ctx->xch;
-    xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
-        *pfns = malloc(count * sizeof(*pfns));
-    unsigned int i, nr_pfns = 0;
+    xen_pfn_t mfn, pfn;
+    unsigned int i, order;
     int rc = -1;
 
-    if ( !mfns || !pfns )
-    {
-        ERROR("Failed to allocate %zu bytes for populating the physmap",
-              2 * count * sizeof(*mfns));
-        goto err;
-    }
+    /* Feed this record to the family dependent heuristic to guess the physmap. */
+    ctx->restore.ops.guess_physmap(ctx, count, original_pfns, types);
 
     for ( i = 0; i < count; ++i )
     {
         if ( (!types || page_type_to_populate(types[i])) &&
              !pfn_is_populated(ctx, original_pfns[i]) )
         {
-            rc = pfn_set_populated(ctx, original_pfns[i]);
+            order = ctx->restore.ops.get_physmap_order(ctx, original_pfns[i]);
+            rc = pfn_set_populated(ctx, original_pfns[i], order);
             if ( rc )
                 goto err;
-            pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
-            ++nr_pfns;
-        }
-    }
-
-    if ( nr_pfns )
-    {
-        rc = xc_domain_populate_physmap_exact(
-            xch, ctx->domid, nr_pfns, 0, 0, mfns);
-        if ( rc )
-        {
-            PERROR("Failed to populate physmap");
-            goto err;
-        }
 
-        for ( i = 0; i < nr_pfns; ++i )
-        {
-            if ( mfns[i] == INVALID_MFN )
+            pfn = mfn = ROUNDDOWN(original_pfns[i], order);
+            rc = xc_domain_populate_physmap_exact(xch, ctx->domid, 1, order, 0,
+                                                  &mfn);
+            if ( rc || (mfn == INVALID_MFN) )
             {
-                ERROR("Populate physmap failed for pfn %u", i);
+                ERROR("Failed to populate physmap for pfn %lu (%u)", pfn, 
order);
                 rc = -1;
                 goto err;
             }
 
-            ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
+            ctx->restore.ops.set_gfn(ctx, pfn, mfn, order);
         }
     }
 
     rc = 0;
 
  err:
-    free(pfns);
-    free(mfns);
-
     return rc;
 }
 
diff --git a/tools/libs/guest/xg_sr_restore_x86_hvm.c b/tools/libs/guest/xg_sr_restore_x86_hvm.c
index d6ea6f3012..2e525443ab 100644
--- a/tools/libs/guest/xg_sr_restore_x86_hvm.c
+++ b/tools/libs/guest/xg_sr_restore_x86_hvm.c
@@ -110,7 +110,7 @@ static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx,
 
 /* restore_ops function. */
 static void x86_hvm_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
-                            xen_pfn_t gfn)
+                            xen_pfn_t gfn, unsigned int order)
 {
     /* no op */
 }
@@ -161,6 +161,8 @@ static int x86_hvm_setup(struct xc_sr_context *ctx)
     }
 #endif
 
+    ctx->x86.hvm.restore.set_guest_type = true;
+
     return 0;
 }
 
@@ -192,6 +194,88 @@ static int x86_hvm_process_record(struct xc_sr_context *ctx,
     }
 }
 
+/*
+ * We consider that the PVH guest physmap starts from 0 and contiguously
+ * covers the physical memory space for the first GB of memory.  An HVM guest
+ * will have I/O holes in the first 2MB of memory space (at least for VGA).
+ * Therefore we should observe the very first record (which comes in physmap
+ * order) to find out how we should map this first GB.
+ * To map the rest of the memory space in both cases (PVH or HVM) we will use
+ * the maximum available order (up to 1GB), except for the fourth GB, which
+ * holds the low MMIO hole (at least for the LAPIC MMIO window and for
+ * potential passed-through or emulated PCI device BARs).
+ */
+static void x86_hvm_guess_physmap(struct xc_sr_context *ctx, unsigned int count,
+                                  const xen_pfn_t *pfns, const uint32_t *types)
+{
+    xen_pfn_t prev;
+    unsigned int i;
+
+
+    if ( !ctx->x86.hvm.restore.set_guest_type )
+        return;
+
+    for ( i = 0, prev = INVALID_PFN; i < count; ++i )
+    {
+        if ( !types || page_type_to_populate(types[i]) )
+        {
+            if ( prev == INVALID_PFN )
+            {
+                if ( pfns[i] != 0 )
+                    break;
+            }
+            else
+            {
+                if ( pfns[i] != (prev + 1) )
+                    break;
+            }
+            prev = pfns[i];
+        }
+    }
+
+    ctx->x86.hvm.restore.pvh_guest = (i == count);
+    ctx->x86.hvm.restore.set_guest_type = false;
+}
+
+/*
+ * Get the population order for the given PFN, clamped to the p2m size.
+ */
+static int x86_hvm_get_physmap_order(const struct xc_sr_context *ctx,
+                                      xen_pfn_t pfn)
+{
+    int order;
+
+    if ( pfn >= ctx->restore.p2m_size )
+        return 0;
+
+    switch ( pfn >> S_PAGE_1GB_ORDER )
+    {
+    case 3:
+        /* The fourth GB of memory is mapped with 2MB superpages. */
+        order = S_PAGE_2MB_ORDER;
+        break;
+    case 0:
+        if ( !ctx->x86.hvm.restore.pvh_guest )
+        {
+            /* The first 2MB are mapped as 4K pages for HVM guests. */
+            order = (pfn > 0x1ff) ? S_PAGE_2MB_ORDER : 0;
+            break;
+        }
+    default:
+        order = S_PAGE_1GB_ORDER;
+    }
+
+    if ( ((ROUNDUP(pfn + 1, S_PAGE_1GB_ORDER) - 1) >= ctx->restore.p2m_size) &&
+         order == S_PAGE_1GB_ORDER )
+        order = S_PAGE_2MB_ORDER;
+
+    if ( ((ROUNDUP(pfn + 1, S_PAGE_2MB_ORDER) - 1) >= ctx->restore.p2m_size) &&
+         order == S_PAGE_2MB_ORDER )
+        order = 0;
+
+    return order;
+}
+
 /*
  * restore_ops function.  Sets extra hvm parameters and seeds the grant table.
  */
@@ -258,6 +342,8 @@ struct xc_sr_restore_ops restore_ops_x86_hvm =
     .localise_page   = x86_hvm_localise_page,
     .setup           = x86_hvm_setup,
     .process_record  = x86_hvm_process_record,
+    .guess_physmap   = x86_hvm_guess_physmap,
+    .get_physmap_order = x86_hvm_get_physmap_order,
     .static_data_complete = x86_static_data_complete,
     .stream_complete = x86_hvm_stream_complete,
     .cleanup         = x86_hvm_cleanup,
diff --git a/tools/libs/guest/xg_sr_restore_x86_pv.c b/tools/libs/guest/xg_sr_restore_x86_pv.c
index dc50b0f5a8..f8545f941a 100644
--- a/tools/libs/guest/xg_sr_restore_x86_pv.c
+++ b/tools/libs/guest/xg_sr_restore_x86_pv.c
@@ -59,7 +59,7 @@ static int expand_p2m(struct xc_sr_context *ctx, unsigned long max_pfn)
     ctx->x86.pv.max_pfn = max_pfn;
     for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i )
     {
-        ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN);
+        ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN, 0);
         ctx->restore.ops.set_page_type(ctx, i, 0);
     }
 
@@ -947,9 +947,10 @@ static void x86_pv_set_page_type(struct xc_sr_context *ctx, xen_pfn_t pfn,
 
 /* restore_ops function. */
 static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
-                           xen_pfn_t mfn)
+                           xen_pfn_t mfn, unsigned int order)
 {
     assert(pfn <= ctx->x86.pv.max_pfn);
+    assert(!order);
 
     if ( ctx->x86.pv.width == sizeof(uint64_t) )
         /* 64 bit guest.  Need to expand INVALID_MFN for 32 bit toolstacks. */
@@ -1113,6 +1114,21 @@ static int x86_pv_process_record(struct xc_sr_context *ctx,
     }
 }
 
+/*
+ * There's no reliable heuristic which can predict the PV guest physmap.
+ * Therefore order 0 is always used.
+ */
+static void x86_pv_guess_physmap(struct xc_sr_context *ctx, unsigned int count,
+                                 const xen_pfn_t *pfns, const uint32_t *types)
+{
+}
+
+static int x86_pv_get_physmap_order(const struct xc_sr_context *ctx,
+                                    xen_pfn_t pfn)
+{
+    return 0;
+}
+
 /*
  * restore_ops function.  Update the vcpu context in Xen, pin the pagetables,
  * rewrite the p2m and seed the grant table.
@@ -1194,6 +1210,8 @@ struct xc_sr_restore_ops restore_ops_x86_pv =
     .localise_page   = x86_pv_localise_page,
     .setup           = x86_pv_setup,
     .process_record  = x86_pv_process_record,
+    .guess_physmap   = x86_pv_guess_physmap,
+    .get_physmap_order = x86_pv_get_physmap_order,
     .static_data_complete = x86_static_data_complete,
     .stream_complete = x86_pv_stream_complete,
     .cleanup         = x86_pv_cleanup,
-- 
2.34.1



Andrei Semenov | Vates XCP-ng Developer

XCP-ng & Xen Orchestra - Vates solutions
w: vates.fr | xcp-ng.org | xen-orchestra.com



 

