Re: [PATCH v2 2/2] live migration: use superpages for physmap population on restore when possible
On 06.09.2022 11:54, Andrei Semenov wrote:
> Implement a heuristic for x86 HVM guests which tries to use superpages while
> populating the guest physmap on live migration. This should improve memory
> access performance for these guests.
>
> Signed-off-by: Andrei Semenov <andrei.semenov@xxxxxxxx>
Olaf - I recall you've done some similar work before. Do you have any
thoughts here, perhaps going as far as merging your and Andrei's work?
Jan
> ---
>  tools/include/xen-tools/libs.h           |  4 ++
>  tools/libs/guest/xg_private.h            |  3 +
>  tools/libs/guest/xg_sr_common.h          | 18 ++++-
>  tools/libs/guest/xg_sr_restore.c         | 60 +++++++---------
>  tools/libs/guest/xg_sr_restore_x86_hvm.c | 88 +++++++++++++++++++++++-
>  tools/libs/guest/xg_sr_restore_x86_pv.c  | 22 +++++-
>  6 files changed, 154 insertions(+), 41 deletions(-)
>
> diff --git a/tools/include/xen-tools/libs.h b/tools/include/xen-tools/libs.h
> index a16e0c3807..bdd903eb7b 100644
> --- a/tools/include/xen-tools/libs.h
> +++ b/tools/include/xen-tools/libs.h
> @@ -63,4 +63,8 @@
> #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
> #endif
>
> +#ifndef ROUNDDOWN
> +#define ROUNDDOWN(_x,_w) ((unsigned long)(_x) & (-1UL << (_w)))
> +#endif
> +
> #endif /* __XEN_TOOLS_LIBS__ */
> diff --git a/tools/libs/guest/xg_private.h b/tools/libs/guest/xg_private.h
> index 09e24f1227..dcf63b5188 100644
> --- a/tools/libs/guest/xg_private.h
> +++ b/tools/libs/guest/xg_private.h
> @@ -134,6 +134,9 @@ typedef uint64_t x86_pgentry_t;
> #define PAGE_SIZE_X86 (1UL << PAGE_SHIFT_X86)
> #define PAGE_MASK_X86 (~(PAGE_SIZE_X86-1))
>
> +#define S_PAGE_1GB_ORDER 18
> +#define S_PAGE_2MB_ORDER 9
> +
> #define NRPAGES(x) (ROUNDUP(x, PAGE_SHIFT) >> PAGE_SHIFT)
>
> static inline xen_pfn_t xc_pfn_to_mfn(xen_pfn_t pfn, xen_pfn_t *p2m,
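(Aside, not part of the patch: the two new constants are extent orders counted
in 4KiB pages, i.e. 2^9 pages = 2MiB and 2^18 pages = 1GiB, and the
ROUNDDOWN/ROUNDUP macros from the libs.h hunk operate on such orders. A
minimal standalone illustration of the arithmetic, with an arbitrarily chosen
PFN:

  #include <assert.h>
  #include <stdio.h>

  #define ROUNDUP(_x,_w)   (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
  #define ROUNDDOWN(_x,_w) ((unsigned long)(_x) & (-1UL << (_w)))

  #define S_PAGE_2MB_ORDER 9   /* 2^9  * 4KiB = 2MiB */
  #define S_PAGE_1GB_ORDER 18  /* 2^18 * 4KiB = 1GiB */

  int main(void)
  {
      unsigned long pfn = 0x40123;  /* some PFN in the guest's second GiB */

      /* First and last PFN of the 1GiB extent containing this PFN. */
      unsigned long start = ROUNDDOWN(pfn, S_PAGE_1GB_ORDER);
      unsigned long end   = ROUNDUP(pfn + 1, S_PAGE_1GB_ORDER) - 1;

      assert(start == 0x40000 && end == 0x7ffff);
      printf("pfn %#lx -> 1GiB extent [%#lx, %#lx]\n", pfn, start, end);
      return 0;
  }

End of aside.)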
> diff --git a/tools/libs/guest/xg_sr_common.h b/tools/libs/guest/xg_sr_common.h
> index 941e24d7b7..96365e05a8 100644
> --- a/tools/libs/guest/xg_sr_common.h
> +++ b/tools/libs/guest/xg_sr_common.h
> @@ -137,7 +137,8 @@ struct xc_sr_restore_ops
> bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
>
> /* Set the GFN of a PFN. */
> - void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn);
> + void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn,
> + unsigned int order);
>
> /* Set the type of a PFN. */
> void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn,
> @@ -175,6 +176,17 @@ struct xc_sr_restore_ops
> #define BROKEN_CHANNEL 2
> int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);
>
> + /**
> + * The guest physmap population order is based on a family-dependent
> + * heuristic. The x86 HVM heuristic observes the whole (first) record in
> + * order to guess how the physmap should be populated.
> + */
> + void (*guess_physmap)(struct xc_sr_context *ctx, unsigned int count,
> + const xen_pfn_t *pfns, const uint32_t *types);
> +
> + /* Get the physmap population order for a given PFN */
> + int (*get_physmap_order)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
> +
> /**
> * Perform any actions required after the static data has arrived.
> Called
> * when the STATIC_DATA_COMPLETE record has been recieved/inferred.
> @@ -404,6 +416,10 @@ struct xc_sr_context
> {
> /* HVM context blob. */
> struct xc_sr_blob context;
> +
> + /* Set guest type (based on the first record) */
> + bool set_guest_type;
> + bool pvh_guest;
> } restore;
> };
> } hvm;
> diff --git a/tools/libs/guest/xg_sr_restore.c b/tools/libs/guest/xg_sr_restore.c
> index 074b56d263..af864bd5ea 100644
> --- a/tools/libs/guest/xg_sr_restore.c
> +++ b/tools/libs/guest/xg_sr_restore.c
> @@ -86,18 +86,21 @@ static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
> * avoid realloc()ing too excessively, the size increased to the nearest power
> * of two large enough to contain the required pfn.
> */
> -static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
> +static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn,
> + unsigned int order)
> {
> xc_interface *xch = ctx->xch;
> + xen_pfn_t start_pfn = ROUNDDOWN(pfn, order),
> + end_pfn = (ROUNDUP(pfn + 1, order) - 1);
>
> - if ( pfn > ctx->restore.max_populated_pfn )
> + if ( end_pfn > ctx->restore.max_populated_pfn )
> {
> xen_pfn_t new_max;
> size_t old_sz, new_sz;
> unsigned long *p;
>
> /* Round up to the nearest power of two larger than pfn, less 1. */
> - new_max = pfn;
> + new_max = end_pfn;
> new_max |= new_max >> 1;
> new_max |= new_max >> 2;
> new_max |= new_max >> 4;
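(For readers puzzled by the bit-smearing just above, which the patch keeps and
merely feeds end_pfn instead of pfn: it rounds the value up to the next power
of two minus one, so the populated-PFN bitmap grows geometrically. The hunk
only shows the first few shifts of the full sequence. A small self-contained
illustration with an arbitrary value:

  #include <assert.h>
  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      uint64_t new_max = 0x4abcd;   /* arbitrary example end_pfn */

      /* Propagate the highest set bit into every lower position. */
      new_max |= new_max >> 1;
      new_max |= new_max >> 2;
      new_max |= new_max >> 4;
      new_max |= new_max >> 8;
      new_max |= new_max >> 16;
      new_max |= new_max >> 32;

      assert(new_max == 0x7ffff);   /* next power of two (0x80000) minus 1 */
      printf("new max_populated_pfn: %#" PRIx64 "\n", new_max);
      return 0;
  }

End of aside.)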
> @@ -123,8 +126,11 @@ static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
> ctx->restore.max_populated_pfn = new_max;
> }
>
> - assert(!test_bit(pfn, ctx->restore.populated_pfns));
> - set_bit(pfn, ctx->restore.populated_pfns);
> + for ( pfn = start_pfn; pfn <= end_pfn; ++pfn )
> + {
> + assert(!test_bit(pfn, ctx->restore.populated_pfns));
> + set_bit(pfn, ctx->restore.populated_pfns);
> + }
>
> return 0;
> }
> @@ -138,60 +144,40 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
> const xen_pfn_t *original_pfns, const uint32_t *types)
> {
> xc_interface *xch = ctx->xch;
> - xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
> - *pfns = malloc(count * sizeof(*pfns));
> - unsigned int i, nr_pfns = 0;
> + xen_pfn_t mfn, pfn;
> + unsigned int i, order;
> int rc = -1;
>
> - if ( !mfns || !pfns )
> - {
> - ERROR("Failed to allocate %zu bytes for populating the physmap",
> - 2 * count * sizeof(*mfns));
> - goto err;
> - }
> + /* Feed this record to the family-dependent heuristic to guess the physmap */
> + ctx->restore.ops.guess_physmap(ctx, count, original_pfns, types);
>
> for ( i = 0; i < count; ++i )
> {
> if ( (!types || page_type_to_populate(types[i])) &&
> !pfn_is_populated(ctx, original_pfns[i]) )
> {
> - rc = pfn_set_populated(ctx, original_pfns[i]);
> + order = ctx->restore.ops.get_physmap_order(ctx, original_pfns[i]);
> + rc = pfn_set_populated(ctx, original_pfns[i], order);
> if ( rc )
> goto err;
> - pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
> - ++nr_pfns;
> - }
> - }
> -
> - if ( nr_pfns )
> - {
> - rc = xc_domain_populate_physmap_exact(
> - xch, ctx->domid, nr_pfns, 0, 0, mfns);
> - if ( rc )
> - {
> - PERROR("Failed to populate physmap");
> - goto err;
> - }
>
> - for ( i = 0; i < nr_pfns; ++i )
> - {
> - if ( mfns[i] == INVALID_MFN )
> + pfn = mfn = ROUNDDOWN(original_pfns[i], order);
> + rc = xc_domain_populate_physmap_exact(xch, ctx->domid, 1, order, 0,
> + &mfn);
> + if ( rc || (mfn == INVALID_MFN) )
> {
> - ERROR("Populate physmap failed for pfn %u", i);
> + ERROR("Failed to populate physmap for pfn %lu (%u)", pfn, order);
> rc = -1;
> goto err;
> }
>
> - ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
> + ctx->restore.ops.set_gfn(ctx, pfn, mfn, order);
> }
> }
>
> rc = 0;
>
> err:
> - free(pfns);
> - free(mfns);
> -
> return rc;
> }
>
> diff --git a/tools/libs/guest/xg_sr_restore_x86_hvm.c b/tools/libs/guest/xg_sr_restore_x86_hvm.c
> index d6ea6f3012..2e525443ab 100644
> --- a/tools/libs/guest/xg_sr_restore_x86_hvm.c
> +++ b/tools/libs/guest/xg_sr_restore_x86_hvm.c
> @@ -110,7 +110,7 @@ static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx,
>
> /* restore_ops function. */
> static void x86_hvm_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
> - xen_pfn_t gfn)
> + xen_pfn_t gfn, unsigned int order)
> {
> /* no op */
> }
> @@ -161,6 +161,8 @@ static int x86_hvm_setup(struct xc_sr_context *ctx)
> }
> #endif
>
> + ctx->x86.hvm.restore.set_guest_type = true;
> +
> return 0;
> }
>
> @@ -192,6 +194,88 @@ static int x86_hvm_process_record(struct xc_sr_context *ctx,
> }
> }
>
> +/*
> + * We consider that the PVH guest physmap starts from 0 and contiguously
> + * covers the physical memory space for the first GB of memory. An HVM guest
> + * will have I/O holes in the first 2MB of memory space (at least for VGA).
> + * Therefore we should observe the very first record (which comes in physmap
> + * order) to find out how we should map this first GB.
> + * To map the rest of the memory space in both cases (PVH or HVM) we will use
> + * the maximum available order (up to 1GB), except for the fourth GB, which
> + * holds the low MMIO hole (at least for the LAPIC MMIO window and for
> + * potential passed-through or emulated PCI device BARs).
> + */
> +static void x86_hvm_guess_physmap(struct xc_sr_context *ctx, unsigned int count,
> + const xen_pfn_t *pfns, const uint32_t *types)
> +{
> + xen_pfn_t prev;
> + unsigned int i;
> +
> +
> + if ( !ctx->x86.hvm.restore.set_guest_type )
> + return;
> +
> + for ( i = 0, prev = INVALID_PFN; i < count; ++i )
> + {
> + if ( !types || page_type_to_populate(types[i]) )
> + {
> + if ( prev == INVALID_PFN )
> + {
> + if ( pfns[i] != 0 )
> + break;
> + }
> + else
> + {
> + if ( pfns[i] != (prev + 1) )
> + break;
> + }
> + prev = pfns[i];
> + }
> + }
> +
> + ctx->x86.hvm.restore.pvh_guest = (i == count) ? true : false;
> + ctx->x86.hvm.restore.set_guest_type = false;
> +}
> +
> +/*
> + * Return the physmap population order to use for the extent containing the
> + * given PFN. The order is lowered near the end of the physmap so that the
> + * extent does not run past p2m_size.
> + */
> +static int x86_hvm_get_physmap_order(const struct xc_sr_context *ctx,
> + xen_pfn_t pfn)
> +{
> + int order;
> +
> + if ( pfn >= ctx->restore.p2m_size )
> + return 0;
> +
> + switch (pfn >> S_PAGE_1GB_ORDER)
> + {
> + case 3:
> + /* The fourth GB of memory is mapped with 2MB superpages */
> + order = S_PAGE_2MB_ORDER;
> + break;
> + case 0:
> + if (!ctx->x86.hvm.restore.pvh_guest)
> + {
> + /* First 2MB are mapped as 4K for HVM guest */
> + order = (pfn > 0x1ff) ? S_PAGE_2MB_ORDER : 0;
> + break;
> + }
> + default:
> + order = S_PAGE_1GB_ORDER;
> + }
> +
> + if ( ((ROUNDUP(pfn + 1, S_PAGE_1GB_ORDER) - 1) >= ctx->restore.p2m_size) &&
> + order == S_PAGE_1GB_ORDER )
> + order = S_PAGE_2MB_ORDER;
> +
> + if ( ((ROUNDUP(pfn + 1, S_PAGE_2MB_ORDER) - 1) >= ctx->restore.p2m_size) &&
> + order == S_PAGE_2MB_ORDER )
> + order = 0;
> +
> + return order;
> +}
> +
> /*
> * restore_ops function. Sets extra hvm parameters and seeds the grant
> table.
> */
> @@ -258,6 +342,8 @@ struct xc_sr_restore_ops restore_ops_x86_hvm =
> .localise_page = x86_hvm_localise_page,
> .setup = x86_hvm_setup,
> .process_record = x86_hvm_process_record,
> + .guess_physmap = x86_hvm_guess_physmap,
> + .get_physmap_order = x86_hvm_get_physmap_order,
> .static_data_complete = x86_static_data_complete,
> .stream_complete = x86_hvm_stream_complete,
> .cleanup = x86_hvm_cleanup,
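(To make the resulting layout easier to see, below is a condensed, standalone
restatement of the order selection above. This is an illustrative sketch, not
the patch's code; the guest size and sample PFNs are made up. It mirrors the
switch plus the end-of-physmap demotion for a classic, non-PVH HVM guest:

  #include <stdbool.h>
  #include <stdio.h>

  #define PAGE_4K_ORDER    0
  #define S_PAGE_2MB_ORDER 9
  #define S_PAGE_1GB_ORDER 18

  static unsigned int order_for(unsigned long pfn, unsigned long p2m_size,
                                bool pvh_guest)
  {
      unsigned int order = S_PAGE_1GB_ORDER;

      if ( pfn >= p2m_size )
          return PAGE_4K_ORDER;

      if ( (pfn >> S_PAGE_1GB_ORDER) == 3 )
          order = S_PAGE_2MB_ORDER;                 /* fourth GiB: low MMIO hole */
      else if ( (pfn >> S_PAGE_1GB_ORDER) == 0 && !pvh_guest )
          order = (pfn > 0x1ff) ? S_PAGE_2MB_ORDER  /* first GiB of an HVM guest */
                                : PAGE_4K_ORDER;    /* first 2MiB kept at 4KiB   */

      /* Demote the order whenever the extent would run past the physmap end. */
      while ( order && (pfn | ((1UL << order) - 1)) >= p2m_size )
          order = (order == S_PAGE_1GB_ORDER) ? S_PAGE_2MB_ORDER : PAGE_4K_ORDER;

      return order;
  }

  int main(void)
  {
      /* Hypothetical guest with 4GiB + 4MiB of RAM; p2m_size is in 4KiB pages. */
      const unsigned long p2m_size = 0x100400;
      const unsigned long samples[] = { 0x100, 0x200, 0x40000, 0xc0000, 0x100000 };

      for ( unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); ++i )
          printf("pfn %#lx -> order %u\n",
                 samples[i], order_for(samples[i], p2m_size, false));

      return 0;
  }

With these made-up numbers the first two PFNs land in the 4KiB/2MiB region of
the first GiB, 0x40000 gets a full 1GiB extent, 0xc0000 in the fourth GiB gets
2MiB, and 0x100000 is demoted from 1GiB to 2MiB because only 4MiB of physmap
remain. End of aside.)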
> diff --git a/tools/libs/guest/xg_sr_restore_x86_pv.c b/tools/libs/guest/xg_sr_restore_x86_pv.c
> index dc50b0f5a8..f8545f941a 100644
> --- a/tools/libs/guest/xg_sr_restore_x86_pv.c
> +++ b/tools/libs/guest/xg_sr_restore_x86_pv.c
> @@ -59,7 +59,7 @@ static int expand_p2m(struct xc_sr_context *ctx, unsigned long max_pfn)
> ctx->x86.pv.max_pfn = max_pfn;
> for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i )
> {
> - ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN);
> + ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN, 0);
> ctx->restore.ops.set_page_type(ctx, i, 0);
> }
>
> @@ -947,9 +947,10 @@ static void x86_pv_set_page_type(struct xc_sr_context *ctx, xen_pfn_t pfn,
>
> /* restore_ops function. */
> static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
> - xen_pfn_t mfn)
> + xen_pfn_t mfn, unsigned int order)
> {
> assert(pfn <= ctx->x86.pv.max_pfn);
> + assert(!order);
>
> if ( ctx->x86.pv.width == sizeof(uint64_t) )
> /* 64 bit guest. Need to expand INVALID_MFN for 32 bit toolstacks. */
> @@ -1113,6 +1114,21 @@ static int x86_pv_process_record(struct xc_sr_context *ctx,
> }
> }
>
> +/*
> + * There's no reliable heuristic which can predict the PV guest physmap.
> + * Therefore order 0 will always be used.
> + */
> +static void x86_pv_guess_physmap(struct xc_sr_context *ctx, unsigned int count,
> + const xen_pfn_t *pfns, const uint32_t *types)
> +{
> +}
> +
> +static int x86_pv_get_physmap_order(const struct xc_sr_context *ctx,
> + xen_pfn_t pfn)
> +{
> + return 0;
> +}
> +
> /*
> * restore_ops function. Update the vcpu context in Xen, pin the pagetables,
> * rewrite the p2m and seed the grant table.
> @@ -1194,6 +1210,8 @@ struct xc_sr_restore_ops restore_ops_x86_pv =
> .localise_page = x86_pv_localise_page,
> .setup = x86_pv_setup,
> .process_record = x86_pv_process_record,
> + .guess_physmap = x86_pv_guess_physmap,
> + .get_physmap_order = x86_pv_get_physmap_order,
> .static_data_complete = x86_static_data_complete,
> .stream_complete = x86_pv_stream_complete,
> .cleanup = x86_pv_cleanup,