
Re: [Xen-devel] [PATCH v10 1/3] xen/mem_sharing: VM forking



On Tue, Feb 25, 2020 at 11:17:55AM -0800, Tamas K Lengyel wrote:
> VM forking is the process of creating a domain with an empty memory space
> and a parent domain specified from which to populate the memory when
> necessary. For the new domain to be functional the VM state is copied over
> as part of the fork operation (HVM params, hap allocation, etc).
> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@xxxxxxxxx>
> ---
> v10: setup vcpu_info pages for vCPUs in the fork if the parent has them
>      setup pages for special HVM PFNs if the parent has them
>      minor adjustments based on Roger's comments
> ---
>  xen/arch/x86/domain.c             |  11 ++
>  xen/arch/x86/hvm/hvm.c            |   4 +-
>  xen/arch/x86/mm/hap/hap.c         |   3 +-
>  xen/arch/x86/mm/mem_sharing.c     | 287 ++++++++++++++++++++++++++++++
>  xen/arch/x86/mm/p2m.c             |   9 +-
>  xen/common/domain.c               |   3 +
>  xen/include/asm-x86/hap.h         |   1 +
>  xen/include/asm-x86/hvm/hvm.h     |   2 +
>  xen/include/asm-x86/mem_sharing.h |  17 ++
>  xen/include/public/memory.h       |   5 +
>  xen/include/xen/sched.h           |   5 +
>  11 files changed, 342 insertions(+), 5 deletions(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index fe63c23676..1ab0ca0942 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -2203,6 +2203,17 @@ int domain_relinquish_resources(struct domain *d)
>              ret = relinquish_shared_pages(d);
>              if ( ret )
>                  return ret;
> +
> +            /*
> +             * If the domain is forked, decrement the parent's pause count
> +             * and release the domain.
> +             */
> +            if ( d->parent )
> +            {
> +                domain_unpause(d->parent);
> +                put_domain(d->parent);
> +                d->parent = NULL;
> +            }
>          }
>  #endif
>  
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index a339b36a0d..c284f3cf5f 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1915,7 +1915,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
>      }
>  #endif
>  
> -    /* Spurious fault? PoD and log-dirty also take this path. */
> +    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
>      if ( p2m_is_ram(p2mt) )
>      {
>          rc = 1;
> @@ -4429,7 +4429,7 @@ static int hvm_allow_get_param(struct domain *d,
>      return rc;
>  }
>  
> -static int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
> +int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
>  {
>      int rc;
>  
> diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> index 3d93f3451c..c7c7ff6e99 100644
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -321,8 +321,7 @@ static void hap_free_p2m_page(struct domain *d, struct page_info *pg)
>  }
>  
>  /* Return the size of the pool, rounded up to the nearest MB */
> -static unsigned int
> -hap_get_allocation(struct domain *d)
> +unsigned int hap_get_allocation(struct domain *d)
>  {
>      unsigned int pg = d->arch.paging.hap.total_pages
>          + d->arch.paging.hap.p2m_pages;
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index 3835bc928f..8ee37e6943 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -22,6 +22,7 @@
>  
>  #include <xen/types.h>
>  #include <xen/domain_page.h>
> +#include <xen/event.h>
>  #include <xen/spinlock.h>
>  #include <xen/rwlock.h>
>  #include <xen/mm.h>
> @@ -36,6 +37,8 @@
>  #include <asm/altp2m.h>
>  #include <asm/atomic.h>
>  #include <asm/event.h>
> +#include <asm/hap.h>
> +#include <asm/hvm/hvm.h>
>  #include <xsm/xsm.h>
>  
>  #include "mm-locks.h"
> @@ -1444,6 +1447,263 @@ static inline int mem_sharing_control(struct domain *d, bool enable)
>      return 0;
>  }
>  
> +/*
> + * Forking a page only gets called when the VM faults due to no entry being
> + * in the EPT for the access. Depending on the type of access we either
> + * populate the physmap with a shared entry for read-only access or
> + * fork the page if its a write access.
> + *
> + * The client p2m is already locked so we only need to lock
> + * the parent's here.
> + */
> +int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
> +{
> +    int rc = -ENOENT;
> +    shr_handle_t handle;
> +    struct domain *parent = d->parent;
> +    struct p2m_domain *p2m;
> +    unsigned long gfn_l = gfn_x(gfn);
> +    mfn_t mfn, new_mfn;
> +    p2m_type_t p2mt;
> +    struct page_info *page;
> +
> +    if ( !mem_sharing_is_fork(d) )
> +        return -ENOENT;
> +
> +    if ( !unsharing )
> +    {
> +        /* For read-only accesses we just add a shared entry to the physmap */
> +        while ( parent )
> +        {
> +            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
> +                break;
> +
> +            parent = parent->parent;
> +        }
> +
> +        if ( !rc )
> +        {
> +            /* The client's p2m is already locked */
> +            struct p2m_domain *pp2m = p2m_get_hostp2m(parent);
> +
> +            p2m_lock(pp2m);
> +            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
> +            p2m_unlock(pp2m);
> +
> +            if ( !rc )
> +                return 0;
> +        }
> +    }
> +
> +    /*
> +     * If it's a write access (ie. unsharing) or if adding a shared entry to
> +     * the physmap failed we'll fork the page directly.
> +     */
> +    p2m = p2m_get_hostp2m(d);
> +    parent = d->parent;
> +
> +    while ( parent )
> +    {
> +        mfn = get_gfn_query(parent, gfn_l, &p2mt);
> +
> +        /*
> +         * We can't fork grant memory from the parent, only regular ram.
> +         */
> +        if ( mfn_valid(mfn) && p2m_is_ram(p2mt) )
> +            break;
> +
> +        put_gfn(parent, gfn_l);
> +        parent = parent->parent;
> +    }
> +
> +    if ( !parent )
> +        return -ENOENT;
> +
> +    if ( !(page = alloc_domheap_page(d, 0)) )
> +    {
> +        put_gfn(parent, gfn_l);
> +        return -ENOMEM;
> +    }
> +
> +    new_mfn = page_to_mfn(page);
> +    copy_domain_page(new_mfn, mfn);
> +    set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
> +
> +    put_gfn(parent, gfn_l);
> +
> +    return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> +                          p2m->default_access, -1);
> +}
> +
> +static int bring_up_vcpus(struct domain *cd, struct domain *d)
> +{
> +    unsigned int i;
> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> +    int ret = -EINVAL;
> +
> +    if ( d->max_vcpus != cd->max_vcpus )
> +        return ret;
> +
> +    if ( (ret = cpupool_move_domain(cd, d->cpupool)) )
> +        return ret;

You can join both ifs into a single one, since both return ret.
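
Something along these lines (untested sketch; ret is already initialised to
-EINVAL, so the early-out value is preserved by the short-circuit):

    if ( d->max_vcpus != cd->max_vcpus ||
         (ret = cpupool_move_domain(cd, d->cpupool)) )
        return ret;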

> +
> +    for ( i = 0; i < cd->max_vcpus; i++ )
> +    {
> +        mfn_t vcpu_info_mfn;
> +
> +        if ( !d->vcpu[i] || cd->vcpu[i] )
> +            continue;
> +
> +        if ( !vcpu_create(cd, i) )
> +            return -EINVAL;
> +
> +        /*
> +         * Map in a page for the vcpu_info if the guest uses one to the exact
> +         * same spot.
> +         */
> +        vcpu_info_mfn = d->vcpu[i]->vcpu_info_mfn;
> +        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> +        {
> +            struct page_info *page;
> +            mfn_t new_mfn;
> +            gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
> +            unsigned long gfn_l = gfn_x(gfn);
> +
> +            if ( !(page = alloc_domheap_page(cd, 0)) )
> +                return -ENOMEM;
> +
> +            new_mfn = page_to_mfn(page);
> +            set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
> +
> +            if ( !(ret = p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K,
> +                                        p2m_ram_rw, p2m->default_access, -1)) )
> +                return ret;
> +
> +            if ( !(ret = map_vcpu_info(cd->vcpu[i], gfn_l,
> +                                       d->vcpu[i]->vcpu_info_offset)) )
> +                return ret;

I think you also need to copy the contents from the parent into those
vcpu_info areas, or else you might discard pending event channels
contained in the evtchn_* fields (and the masked channels, if any).

The runstate area should be handled in a similar way AFAICT (albeit
there's no need to copy the parent's data in that case), see
VCPUOP_register_runstate_memory_area.
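
For the copy itself, maybe something as simple as the below (untested
sketch), done once the new page has been installed and mapped, so the child
starts from the parent's current vcpu_info contents rather than a zeroed
page:

    copy_domain_page(new_mfn, vcpu_info_mfn);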

> +        }
> +    }
> +
> +    domain_update_node_affinity(cd);
> +    return 0;
> +}
> +
> +static int fork_hap_allocation(struct domain *cd, struct domain *d)
> +{
> +    int rc;
> +    bool preempted;
> +    unsigned long mb = hap_get_allocation(d);
> +
> +    if ( mb == hap_get_allocation(cd) )
> +        return 0;
> +
> +    paging_lock(cd);
> +    rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
> +    paging_unlock(cd);
> +
> +    return preempted ? -ERESTART : rc;
> +}
> +
> +static void fork_tsc(struct domain *cd, struct domain *d)
> +{
> +    uint32_t tsc_mode;
> +    uint32_t gtsc_khz;
> +    uint32_t incarnation;
> +    uint64_t elapsed_nsec;
> +
> +    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
> +    /* Don't bump incarnation on set */
> +    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation - 1);
> +}
> +
> +static int populate_special_pages(struct domain *cd)
> +{
> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> +    static const unsigned int params[] =
> +    {
> +        HVM_PARAM_STORE_PFN,
> +        HVM_PARAM_IOREQ_PFN,
> +        HVM_PARAM_BUFIOREQ_PFN,
> +        HVM_PARAM_CONSOLE_PFN
> +    };
> +    unsigned int i;
> +
> +    for ( i=0; i<4; i++ )

Nit: can you please add some spaces around the operators?
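
I.e.:

    for ( i = 0; i < 4; i++ )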

> +    {
> +        uint64_t value = 0;
> +        mfn_t new_mfn;
> +        struct page_info *page;
> +
> +        if ( hvm_get_param(cd, params[i], &value) || !value )
> +            continue;
> +
> +        if ( !(page = alloc_domheap_page(cd, 0)) )
> +            return -ENOMEM;
> +
> +        new_mfn = page_to_mfn(page);
> +        set_gpfn_from_mfn(mfn_x(new_mfn), value);
> +
> +        return p2m->set_entry(p2m, _gfn(value), new_mfn, PAGE_ORDER_4K,
> +                              p2m_ram_rw, p2m->default_access, -1);

I think you also need to copy the contents from the parent page here.
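
Something like the below might do (untested sketch, mirroring what
mem_sharing_fork_page() does above; p2mt would need declaring):

    mfn_t parent_mfn = get_gfn_query(cd->parent, value, &p2mt);

    if ( mfn_valid(parent_mfn) )
        copy_domain_page(new_mfn, parent_mfn);

    put_gfn(cd->parent, value);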

> +    }
> +
> +    return 0;
> +}
> +
> +static int fork(struct domain *d, struct domain *cd)
> +{
> +    int rc = -EBUSY;
> +
> +    if ( !cd->controller_pause_count )
> +        return rc;
> +
> +    /*
> +     * We only want to get and pause the parent once, not each time this
> +     * operation is restarted due to preemption.
> +     */
> +    if ( !cd->parent_paused )
> +    {
> +        if ( !get_domain(d) )
> +        {
> +            ASSERT_UNREACHABLE();
> +            return -EBUSY;
> +        }
> +
> +        domain_pause(d);
> +        cd->parent_paused = true;
> +        cd->max_pages = d->max_pages;
> +        cd->max_vcpus = d->max_vcpus;
> +    }
> +
> +    /* this is preemptible so it's the first to get done */
> +    if ( (rc = fork_hap_allocation(cd, d)) )
> +        goto done;
> +
> +    if ( (rc = bring_up_vcpus(cd, d)) )
> +        goto done;
> +
> +    if ( (rc = hvm_copy_context_and_params(cd, d)) )
> +        goto done;
> +
> +    if ( (rc = populate_special_pages(cd)) )
> +        goto done;
> +
> +    fork_tsc(cd, d);

I think you need to copy the contents of the shared info page from the
parent into the child, or else you are discarding any pending event
channels. You should also map the shared info page into the same gfn
as in the parent.
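
For the copy part, maybe something as simple as (untested; virt_to_mfn() may
need the usual _mfn() wrapping depending on what this file uses):

    copy_domain_page(virt_to_mfn(cd->shared_info),
                     virt_to_mfn(d->shared_info));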

Thanks, Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel
