Re: [PATCH v2 2/2] live migration: use superpages for physmap population on restore when possible


  • To: Andrei Semenov <andrei.semenov@xxxxxxxx>, Olaf Hering <olaf@xxxxxxxxx>
  • From: Jan Beulich <jbeulich@xxxxxxxx>
  • Date: Wed, 7 Sep 2022 09:40:41 +0200
  • Cc: Wei Liu <wl@xxxxxxx>, Anthony PERARD <anthony.perard@xxxxxxxxxx>, Juergen Gross <jgross@xxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxxx
  • Delivery-date: Wed, 07 Sep 2022 07:40:49 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On 06.09.2022 11:54, Andrei Semenov wrote:
> Implement a heuristic for x86 HVM guests which tries to use superpages while
> populating the guest physmap on live migration restore. This should improve
> memory access performance for these guests.
> 
> Signed-off-by: Andrei Semenov <andrei.semenov@xxxxxxxx>

Olaf - I recall you've done some similar work before. Do you have any
thoughts here, perhaps going as far as merging your and Andrei's work?

Jan

> ---
>  tools/include/xen-tools/libs.h           |  4 ++
>  tools/libs/guest/xg_private.h            |  3 +
>  tools/libs/guest/xg_sr_common.h          | 18 ++++-
>  tools/libs/guest/xg_sr_restore.c         | 60 +++++++---------
>  tools/libs/guest/xg_sr_restore_x86_hvm.c | 88 +++++++++++++++++++++++-
>  tools/libs/guest/xg_sr_restore_x86_pv.c  | 22 +++++-
>  6 files changed, 154 insertions(+), 41 deletions(-)
> 
> diff --git a/tools/include/xen-tools/libs.h b/tools/include/xen-tools/libs.h
> index a16e0c3807..bdd903eb7b 100644
> --- a/tools/include/xen-tools/libs.h
> +++ b/tools/include/xen-tools/libs.h
> @@ -63,4 +63,8 @@
>  #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
>  #endif
>  
> +#ifndef ROUNDDOWN
> +#define ROUNDDOWN(_x,_w) ((unsigned long)(_x) & (-1UL << (_w)))
> +#endif
> +
>  #endif       /* __XEN_TOOLS_LIBS__ */
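
For reference, both macros take the boundary as an order (a shift width in
bits), not as a byte size. A minimal standalone check of the arithmetic
(sample values chosen for illustration; not part of the patch):

    #include <assert.h>

    #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
    #define ROUNDDOWN(_x,_w) ((unsigned long)(_x) & (-1UL << (_w)))

    int main(void)
    {
        /* Order 9 == 512 4k pages == one 2MB superpage worth of PFNs. */
        assert(ROUNDDOWN(0x3ff, 9) == 0x200); /* down to the 2MB frame start */
        assert(ROUNDUP(0x201, 9) == 0x400);   /* up to the next 2MB boundary */
        assert(ROUNDDOWN(0x200, 9) == 0x200); /* already aligned: unchanged */
        return 0;
    }
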
> diff --git a/tools/libs/guest/xg_private.h b/tools/libs/guest/xg_private.h
> index 09e24f1227..dcf63b5188 100644
> --- a/tools/libs/guest/xg_private.h
> +++ b/tools/libs/guest/xg_private.h
> @@ -134,6 +134,9 @@ typedef uint64_t x86_pgentry_t;
>  #define PAGE_SIZE_X86           (1UL << PAGE_SHIFT_X86)
>  #define PAGE_MASK_X86           (~(PAGE_SIZE_X86-1))
>  
> +#define S_PAGE_1GB_ORDER        18
> +#define S_PAGE_2MB_ORDER         9
> +
>  #define NRPAGES(x) (ROUNDUP(x, PAGE_SHIFT) >> PAGE_SHIFT)
>  
>  static inline xen_pfn_t xc_pfn_to_mfn(xen_pfn_t pfn, xen_pfn_t *p2m,
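
The new constants are orders in units of 4k pages. A quick sanity check of
the sizes they correspond to (a sketch, reusing PAGE_SHIFT_X86 == 12 from
this header):

    #include <assert.h>

    #define PAGE_SHIFT_X86   12
    #define S_PAGE_1GB_ORDER 18
    #define S_PAGE_2MB_ORDER  9

    int main(void)
    {
        /* 2^9 4k pages == 2MB, 2^18 4k pages == 1GB. */
        assert((1UL << (S_PAGE_2MB_ORDER + PAGE_SHIFT_X86)) == (2UL << 20));
        assert((1UL << (S_PAGE_1GB_ORDER + PAGE_SHIFT_X86)) == (1UL << 30));
        return 0;
    }
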
> diff --git a/tools/libs/guest/xg_sr_common.h b/tools/libs/guest/xg_sr_common.h
> index 941e24d7b7..96365e05a8 100644
> --- a/tools/libs/guest/xg_sr_common.h
> +++ b/tools/libs/guest/xg_sr_common.h
> @@ -137,7 +137,8 @@ struct xc_sr_restore_ops
>      bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
>  
>      /* Set the GFN of a PFN. */
> -    void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn);
> +    void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn,
> +                    unsigned int order);
>  
>      /* Set the type of a PFN. */
>      void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn,
> @@ -175,6 +176,17 @@ struct xc_sr_restore_ops
>  #define BROKEN_CHANNEL 2
>      int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);
>  
> +    /**
> +     * Guest physmap population order is based on a heuristic which is
> +     * family dependent. The x86 HVM heuristic is interested in observing
> +     * the whole (first) record in order to guess how the physmap should
> +     * be populated.
> +     */
> +    void (*guess_physmap)(struct xc_sr_context *ctx, unsigned int count,
> +                          const xen_pfn_t *pfns, const uint32_t *types);
> +
> +    /* Get the physmap population order for a given PFN. */
> +    int (*get_physmap_order)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
> +
>      /**
>       * Perform any actions required after the static data has arrived.  Called
>       * when the STATIC_DATA_COMPLETE record has been received/inferred.
> @@ -404,6 +416,10 @@ struct xc_sr_context
>                      {
>                          /* HVM context blob. */
>                          struct xc_sr_blob context;
> +
> +                        /* Set guest type (based on the first record) */
> +                        bool set_guest_type;
> +                        bool pvh_guest;
>                      } restore;
>                  };
>              } hvm;
> diff --git a/tools/libs/guest/xg_sr_restore.c b/tools/libs/guest/xg_sr_restore.c
> index 074b56d263..af864bd5ea 100644
> --- a/tools/libs/guest/xg_sr_restore.c
> +++ b/tools/libs/guest/xg_sr_restore.c
> @@ -86,18 +86,21 @@ static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
>   * avoid realloc()ing too excessively, the size increased to the nearest power
>   * of two large enough to contain the required pfn.
>   */
> -static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
> +static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn,
> +                             unsigned int order)
>  {
>      xc_interface *xch = ctx->xch;
> +    xen_pfn_t start_pfn = ROUNDDOWN(pfn, order),
> +        end_pfn = (ROUNDUP(pfn + 1, order) - 1);
>  
> -    if ( pfn > ctx->restore.max_populated_pfn )
> +    if ( end_pfn > ctx->restore.max_populated_pfn )
>      {
>          xen_pfn_t new_max;
>          size_t old_sz, new_sz;
>          unsigned long *p;
>  
>          /* Round up to the nearest power of two larger than pfn, less 1. */
> -        new_max = pfn;
> +        new_max = end_pfn;
>          new_max |= new_max >> 1;
>          new_max |= new_max >> 2;
>          new_max |= new_max >> 4;
> @@ -123,8 +126,11 @@ static int pfn_set_populated(struct xc_sr_context *ctx, 
> xen_pfn_t pfn)
>          ctx->restore.max_populated_pfn = new_max;
>      }
>  
> -    assert(!test_bit(pfn, ctx->restore.populated_pfns));
> -    set_bit(pfn, ctx->restore.populated_pfns);
> +    for ( pfn = start_pfn; pfn <= end_pfn; ++pfn )
> +    {
> +        assert(!test_bit(pfn, ctx->restore.populated_pfns));
> +        set_bit(pfn, ctx->restore.populated_pfns);
> +    }
>  
>      return 0;
>  }
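
So a single call now marks every PFN of the superpage containing the passed
pfn. A toy model of the bitmap walk (mark_populated and the flat byte array
are hypothetical stand-ins for the real bitmap helpers):

    #include <assert.h>

    static unsigned char populated[0x800];

    /* Mark the whole 2^order-aligned frame containing pfn, as the loop
     * above does on ctx->restore.populated_pfns. */
    static void mark_populated(unsigned long pfn, unsigned int order)
    {
        unsigned long start = pfn & (-1UL << order);
        unsigned long end = start | ((1UL << order) - 1);

        for ( unsigned long p = start; p <= end; ++p )
            populated[p] = 1;
    }

    int main(void)
    {
        mark_populated(0x345, 9); /* any pfn inside the 2MB frame */
        assert(populated[0x200] && populated[0x3ff]);   /* whole frame marked */
        assert(!populated[0x1ff] && !populated[0x400]); /* neighbours untouched */
        return 0;
    }
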
> @@ -138,60 +144,40 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
>                    const xen_pfn_t *original_pfns, const uint32_t *types)
>  {
>      xc_interface *xch = ctx->xch;
> -    xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
> -        *pfns = malloc(count * sizeof(*pfns));
> -    unsigned int i, nr_pfns = 0;
> +    xen_pfn_t mfn, pfn;
> +    unsigned int i, order;
>      int rc = -1;
>  
> -    if ( !mfns || !pfns )
> -    {
> -        ERROR("Failed to allocate %zu bytes for populating the physmap",
> -              2 * count * sizeof(*mfns));
> -        goto err;
> -    }
> +    /* Feed this record to the family-dependent heuristic to guess the physmap. */
> +    ctx->restore.ops.guess_physmap(ctx, count, original_pfns, types);
>  
>      for ( i = 0; i < count; ++i )
>      {
>          if ( (!types || page_type_to_populate(types[i])) &&
>               !pfn_is_populated(ctx, original_pfns[i]) )
>          {
> -            rc = pfn_set_populated(ctx, original_pfns[i]);
> +            order = ctx->restore.ops.get_physmap_order(ctx, original_pfns[i]);
> +            rc = pfn_set_populated(ctx, original_pfns[i], order);
>              if ( rc )
>                  goto err;
> -            pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
> -            ++nr_pfns;
> -        }
> -    }
> -
> -    if ( nr_pfns )
> -    {
> -        rc = xc_domain_populate_physmap_exact(
> -            xch, ctx->domid, nr_pfns, 0, 0, mfns);
> -        if ( rc )
> -        {
> -            PERROR("Failed to populate physmap");
> -            goto err;
> -        }
>  
> -        for ( i = 0; i < nr_pfns; ++i )
> -        {
> -            if ( mfns[i] == INVALID_MFN )
> +            pfn = mfn = ROUNDDOWN(original_pfns[i], order);
> +            rc = xc_domain_populate_physmap_exact(xch, ctx->domid, 1, order, 0,
> +                                                  &mfn);
> +            if ( rc || (mfn == INVALID_MFN) )
>              {
> -                ERROR("Populate physmap failed for pfn %u", i);
> +                ERROR("Failed to populate physmap for pfn %lu (%u)", pfn, 
> order);
>                  rc = -1;
>                  goto err;
>              }
>  
> -            ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
> +            ctx->restore.ops.set_gfn(ctx, pfn, mfn, order);
>          }
>      }
>  
>      rc = 0;
>  
>   err:
> -    free(pfns);
> -    free(mfns);
> -
>      return rc;
>  }
>  
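
Net effect on populate granularity: the old code made one batched call with
an order-0 extent per PFN, while the new loop makes one call per superpage,
each covering 2^order PFNs. A toy count of those calls for a contiguous,
aligned run (populate_calls is hypothetical, for illustration only):

    #include <assert.h>

    /* Calls needed for nr_pfns contiguous PFNs at a fixed order;
     * pfn_is_populated() makes the loop skip the remainder of an
     * already-populated superpage. */
    static unsigned long populate_calls(unsigned long nr_pfns, unsigned int order)
    {
        return (nr_pfns + (1UL << order) - 1) >> order;
    }

    int main(void)
    {
        /* One GB worth of 4k PFNs. */
        assert(populate_calls(1UL << 18, 0) == (1UL << 18)); /* 262144 calls */
        assert(populate_calls(1UL << 18, 9) == 512);         /* 2MB superpages */
        assert(populate_calls(1UL << 18, 18) == 1);          /* one 1GB superpage */
        return 0;
    }
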
> diff --git a/tools/libs/guest/xg_sr_restore_x86_hvm.c b/tools/libs/guest/xg_sr_restore_x86_hvm.c
> index d6ea6f3012..2e525443ab 100644
> --- a/tools/libs/guest/xg_sr_restore_x86_hvm.c
> +++ b/tools/libs/guest/xg_sr_restore_x86_hvm.c
> @@ -110,7 +110,7 @@ static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx,
>  
>  /* restore_ops function. */
>  static void x86_hvm_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
> -                            xen_pfn_t gfn)
> +                            xen_pfn_t gfn, unsigned int order)
>  {
>      /* no op */
>  }
> @@ -161,6 +161,8 @@ static int x86_hvm_setup(struct xc_sr_context *ctx)
>      }
>  #endif
>  
> +    ctx->x86.hvm.restore.set_guest_type = true;
> +
>      return 0;
>  }
>  
> @@ -192,6 +194,88 @@ static int x86_hvm_process_record(struct xc_sr_context *ctx,
>      }
>  }
>  
> +/*
> + * We consider that a PVH guest physmap starts at 0 and contiguously covers
> + * the physical memory space for the first GB of memory.  An HVM guest will
> + * have I/O holes in the first 2MB of memory space (at least for VGA).
> + * Therefore we should observe the very first record (which comes in physmap
> + * order) to find out how we should map this first GB.
> + * To map the rest of the memory space in both cases (PVH or HVM) we will use
> + * the maximum available order (up to 1GB), except for the fourth GB, which
> + * holds the low MMIO hole (at least for the LAPIC MMIO window and for the
> + * BARs of potentially passed-through or emulated PCI devices).
> + */
> +static void x86_hvm_guess_physmap(struct xc_sr_context *ctx, unsigned int count,
> +                         const xen_pfn_t *pfns, const uint32_t *types)
> +{
> +    xen_pfn_t prev;
> +    unsigned int i;
> +
> +    if ( !ctx->x86.hvm.restore.set_guest_type )
> +        return;
> +
> +    for ( i = 0, prev = INVALID_PFN; i < count; ++i )
> +    {
> +        if ( !types || page_type_to_populate(types[i]) )
> +        {
> +            if ( prev == INVALID_PFN )
> +            {
> +                if (pfns[i] != 0)
> +                    break;
> +            }
> +            else
> +            {
> +                if ( pfns[i] != (prev + 1) )
> +                    break;
> +            }
> +            prev = pfns[i];
> +        }
> +    }
> +
> +    ctx->x86.hvm.restore.pvh_guest = (i == count);
> +    ctx->x86.hvm.restore.set_guest_type = false;
> +}
> +
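
In other words, the scan classifies the first record as PVH exactly when
its to-be-populated PFNs form one contiguous run starting at 0. The same
check as a self-contained sketch (looks_like_pvh is a hypothetical
rendering, with the page-type filtering elided):

    #include <assert.h>
    #include <stdbool.h>

    #define INVALID_PFN (~0UL)

    static bool looks_like_pvh(const unsigned long *pfns, unsigned int count)
    {
        unsigned long prev = INVALID_PFN;
        unsigned int i;

        for ( i = 0; i < count; ++i )
        {
            /* The first entry must be 0; every later one must directly
             * follow its predecessor. */
            if ( prev == INVALID_PFN ? pfns[i] != 0 : pfns[i] != prev + 1 )
                break;
            prev = pfns[i];
        }

        return i == count;
    }

    int main(void)
    {
        const unsigned long pvh[] = { 0, 1, 2, 3 };
        const unsigned long hvm[] = { 0, 1, /* hole, e.g. VGA */ 0xc0, 0xc1 };

        assert(looks_like_pvh(pvh, 4));
        assert(!looks_like_pvh(hvm, 4));
        return 0;
    }
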
> +/*
> + * Get the physmap population order for a given PFN.
> + */
> +static int x86_hvm_get_physmap_order(const struct xc_sr_context *ctx,
> +                                      xen_pfn_t pfn)
> +{
> +    int order;
> +
> +    if ( pfn >= ctx->restore.p2m_size )
> +        return 0;
> +
> +    switch (pfn >> S_PAGE_1GB_ORDER)
> +    {
> +    case 3:
> +        /* The fourth GB of memory is mapped with 2MB superpages */
> +        order = S_PAGE_2MB_ORDER;
> +        break;
> +    case 0:
> +        if (!ctx->x86.hvm.restore.pvh_guest)
> +        {
> +            /* The first 2MB are mapped as 4K for HVM guests */
> +            order = (pfn > 0x1ff) ? S_PAGE_2MB_ORDER : 0;
> +            break;
> +        }
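> +        /* Fall through: for a PVH guest the rest of the first GB uses 1GB order. */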
> +    default:
> +        order = S_PAGE_1GB_ORDER;
> +    }
> +
> +    if ( ((ROUNDUP(pfn + 1, S_PAGE_1GB_ORDER) - 1) >= ctx->restore.p2m_size) &&
> +         order == S_PAGE_1GB_ORDER )
> +        order = S_PAGE_2MB_ORDER;
> +
> +    if ( ((ROUNDUP(pfn + 1, S_PAGE_2MB_ORDER) - 1) >= ctx->restore.p2m_size) &&
> +         order == S_PAGE_2MB_ORDER )
> +        order = 0;
> +
> +    return order;
> +}
> +
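
Worked examples of the orders this yields, for a non-PVH (HVM) guest with a
hypothetical p2m_size of 0x100000 (4GB of RAM); order_for is a condensed
rendering of the function above:

    #include <assert.h>

    #define S_PAGE_1GB_ORDER 18
    #define S_PAGE_2MB_ORDER  9

    static int order_for(unsigned long pfn, unsigned long p2m_size)
    {
        int order;

        if ( pfn >= p2m_size )
            return 0;

        if ( (pfn >> S_PAGE_1GB_ORDER) == 3 )       /* fourth GB: MMIO hole */
            order = S_PAGE_2MB_ORDER;
        else if ( pfn < (1UL << S_PAGE_1GB_ORDER) ) /* first GB, HVM guest */
            order = (pfn > 0x1ff) ? S_PAGE_2MB_ORDER : 0;
        else
            order = S_PAGE_1GB_ORDER;

        /* Degrade the order where a superpage would overrun the p2m. */
        if ( order == S_PAGE_1GB_ORDER &&
             (pfn | ((1UL << S_PAGE_1GB_ORDER) - 1)) >= p2m_size )
            order = S_PAGE_2MB_ORDER;
        if ( order == S_PAGE_2MB_ORDER &&
             (pfn | ((1UL << S_PAGE_2MB_ORDER) - 1)) >= p2m_size )
            order = 0;

        return order;
    }

    int main(void)
    {
        const unsigned long p2m = 0x100000; /* 4GB in 4k pages */

        assert(order_for(0x100, p2m) == 0);                  /* first 2MB: 4k */
        assert(order_for(0x200, p2m) == S_PAGE_2MB_ORDER);   /* rest of GB 1 */
        assert(order_for(0x40000, p2m) == S_PAGE_1GB_ORDER); /* GB 2 */
        assert(order_for(0xc0000, p2m) == S_PAGE_2MB_ORDER); /* GB 4 */
        return 0;
    }
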
>  /*
>   * restore_ops function.  Sets extra hvm parameters and seeds the grant table.
>   */
> @@ -258,6 +342,8 @@ struct xc_sr_restore_ops restore_ops_x86_hvm =
>      .localise_page   = x86_hvm_localise_page,
>      .setup           = x86_hvm_setup,
>      .process_record  = x86_hvm_process_record,
> +    .guess_physmap   = x86_hvm_guess_physmap,
> +    .get_physmap_order = x86_hvm_get_physmap_order,
>      .static_data_complete = x86_static_data_complete,
>      .stream_complete = x86_hvm_stream_complete,
>      .cleanup         = x86_hvm_cleanup,
> diff --git a/tools/libs/guest/xg_sr_restore_x86_pv.c b/tools/libs/guest/xg_sr_restore_x86_pv.c
> index dc50b0f5a8..f8545f941a 100644
> --- a/tools/libs/guest/xg_sr_restore_x86_pv.c
> +++ b/tools/libs/guest/xg_sr_restore_x86_pv.c
> @@ -59,7 +59,7 @@ static int expand_p2m(struct xc_sr_context *ctx, unsigned long max_pfn)
>      ctx->x86.pv.max_pfn = max_pfn;
>      for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i )
>      {
> -        ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN);
> +        ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN, 0);
>          ctx->restore.ops.set_page_type(ctx, i, 0);
>      }
>  
> @@ -947,9 +947,10 @@ static void x86_pv_set_page_type(struct xc_sr_context *ctx, xen_pfn_t pfn,
>  
>  /* restore_ops function. */
>  static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
> -                           xen_pfn_t mfn)
> +                           xen_pfn_t mfn, unsigned int order)
>  {
>      assert(pfn <= ctx->x86.pv.max_pfn);
> +    assert(!order);
>  
>      if ( ctx->x86.pv.width == sizeof(uint64_t) )
>          /* 64 bit guest.  Need to expand INVALID_MFN for 32 bit toolstacks. */
> @@ -1113,6 +1114,21 @@ static int x86_pv_process_record(struct xc_sr_context *ctx,
>      }
>  }
>  
> +/*
> + * There's no reliable heuristic which can predict the PV guest physmap.
> + * Therefore order 0 will always be used.
> + */
> +static void x86_pv_guess_physmap(struct xc_sr_context *ctx, unsigned int count,
> +                                 const xen_pfn_t *pfns, const uint32_t *types)
> +{
> +}
> +
> +static int x86_pv_get_physmap_order(const struct xc_sr_context *ctx,
> +                                    xen_pfn_t pfn)
> +{
> +    return 0;
> +}
> +
>  /*
>   * restore_ops function.  Update the vcpu context in Xen, pin the pagetables,
>   * rewrite the p2m and seed the grant table.
> @@ -1194,6 +1210,8 @@ struct xc_sr_restore_ops restore_ops_x86_pv =
>      .localise_page   = x86_pv_localise_page,
>      .setup           = x86_pv_setup,
>      .process_record  = x86_pv_process_record,
> +    .guess_physmap   = x86_pv_guess_physmap,
> +    .get_physmap_order = x86_pv_get_physmap_order,
>      .static_data_complete = x86_static_data_complete,
>      .stream_complete = x86_pv_stream_complete,
>      .cleanup         = x86_pv_cleanup,




 

