
Re: [Xen-devel] [PATCH v12 1/3] xen/mem_sharing: VM forking



Sorry it has taken me a while to get to this.

On Mon, Mar 23, 2020 at 10:04:35AM -0700, Tamas K Lengyel wrote:
> VM forking is the process of creating a domain with an empty memory space and a
> parent domain specified from which to populate the memory when necessary. For
> the new domain to be functional the VM state is copied over as part of the fork
> operation (HVM params, hap allocation, etc).
> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@xxxxxxxxx>
> ---
> v12: Minor style adjustments Jan pointed out
>      Convert mem_sharing_is_fork to inline function
> v11: Fully copy vcpu_info pages
>      Setup vcpu_runstate for forks
>      Added TODO note for PV timers
>      Copy shared_info page
>      Add copy_settings function, to be shared with fork_reset in the next patch
> ---
>  xen/arch/x86/domain.c             |  11 +
>  xen/arch/x86/hvm/hvm.c            |   4 +-
>  xen/arch/x86/mm/hap/hap.c         |   3 +-
>  xen/arch/x86/mm/mem_sharing.c     | 368 ++++++++++++++++++++++++++++++
>  xen/arch/x86/mm/p2m.c             |   9 +-
>  xen/common/domain.c               |   3 +
>  xen/include/asm-x86/hap.h         |   1 +
>  xen/include/asm-x86/hvm/hvm.h     |   2 +
>  xen/include/asm-x86/mem_sharing.h |  18 ++
>  xen/include/public/memory.h       |   5 +
>  xen/include/xen/sched.h           |   5 +
>  11 files changed, 424 insertions(+), 5 deletions(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index caf2ecad7e..11d3c2216e 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -2202,6 +2202,17 @@ int domain_relinquish_resources(struct domain *d)
>              ret = relinquish_shared_pages(d);
>              if ( ret )
>                  return ret;
> +
> +            /*
> +             * If the domain is forked, decrement the parent's pause count
> +             * and release the domain.
> +             */
> +            if ( mem_sharing_is_fork(d) )
> +            {
> +                domain_unpause(d->parent);
> +                put_domain(d->parent);
> +                d->parent = NULL;
> +            }
>          }
>  #endif
>  
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index a3d115b650..304b3d1562 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1917,7 +1917,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
>      }
>  #endif
>  
> -    /* Spurious fault? PoD and log-dirty also take this path. */
> +    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
>      if ( p2m_is_ram(p2mt) )
>      {
>          rc = 1;
> @@ -4377,7 +4377,7 @@ static int hvm_allow_get_param(struct domain *d,
>      return rc;
>  }
>  
> -static int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
> +int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
>  {
>      int rc;
>  
> diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> index a6d5e39b02..814d0c3253 100644
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -321,8 +321,7 @@ static void hap_free_p2m_page(struct domain *d, struct page_info *pg)
>  }
>  
>  /* Return the size of the pool, rounded up to the nearest MB */
> -static unsigned int
> -hap_get_allocation(struct domain *d)
> +unsigned int hap_get_allocation(struct domain *d)
>  {
>      unsigned int pg = d->arch.paging.hap.total_pages
>          + d->arch.paging.hap.p2m_pages;
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index 3835bc928f..23deeddff2 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -22,6 +22,7 @@
>  
>  #include <xen/types.h>
>  #include <xen/domain_page.h>
> +#include <xen/event.h>
>  #include <xen/spinlock.h>
>  #include <xen/rwlock.h>
>  #include <xen/mm.h>
> @@ -36,6 +37,8 @@
>  #include <asm/altp2m.h>
>  #include <asm/atomic.h>
>  #include <asm/event.h>
> +#include <asm/hap.h>
> +#include <asm/hvm/hvm.h>
>  #include <xsm/xsm.h>
>  
>  #include "mm-locks.h"
> @@ -1444,6 +1447,334 @@ static inline int mem_sharing_control(struct domain *d, bool enable)
>      return 0;
>  }
>  
> +/*
> + * Forking a page only gets called when the VM faults due to no entry being
> + * in the EPT for the access. Depending on the type of access we either
> + * populate the physmap with a shared entry for read-only access or
> + * fork the page if its a write access.
> + *
> + * The client p2m is already locked so we only need to lock
> + * the parent's here.
> + */
> +int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
> +{
> +    int rc = -ENOENT;
> +    shr_handle_t handle;
> +    struct domain *parent = d->parent;
> +    struct p2m_domain *p2m;
> +    unsigned long gfn_l = gfn_x(gfn);
> +    mfn_t mfn, new_mfn;
> +    p2m_type_t p2mt;
> +    struct page_info *page;
> +
> +    if ( !mem_sharing_is_fork(d) )
> +        return -ENOENT;
> +
> +    if ( !unsharing )
> +    {
> +        /* For read-only accesses we just add a shared entry to the physmap */
> +        while ( parent )
> +        {
> +            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
> +                break;
> +
> +            parent = parent->parent;
> +        }
> +
> +        if ( !rc )
> +        {
> +            /* The client's p2m is already locked */
> +            struct p2m_domain *pp2m = p2m_get_hostp2m(parent);

Nit: I think you could just use the existing p2m local variable.
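
Ie. something along these lines (untested, just to illustrate):

p2m = p2m_get_hostp2m(parent);
p2m_lock(p2m);
rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
p2m_unlock(p2m);

The local is only assigned further down (to p2m_get_hostp2m(d)), so
reusing it here should be safe.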

> +            p2m_lock(pp2m);
> +            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
> +            p2m_unlock(pp2m);
> +
> +            if ( !rc )
> +                return 0;
> +        }
> +    }
> +
> +    /*
> +     * If it's a write access (ie. unsharing) or if adding a shared entry to
> +     * the physmap failed we'll fork the page directly.
> +     */
> +    p2m = p2m_get_hostp2m(d);
> +    parent = d->parent;
> +
> +    while ( parent )
> +    {
> +        mfn = get_gfn_query(parent, gfn_l, &p2mt);
> +
> +        /*
> +         * We can't fork grant memory from the parent, only regular ram.
> +         */

Nit: single line comments should use /* ... */ (here and below).
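
Ie:

/* We can't fork grant memory from the parent, only regular ram. */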

> +        if ( mfn_valid(mfn) && p2m_is_ram(p2mt) )
> +            break;
> +
> +        put_gfn(parent, gfn_l);
> +        parent = parent->parent;
> +    }
> +
> +    if ( !parent )
> +        return -ENOENT;
> +
> +    if ( !(page = alloc_domheap_page(d, 0)) )
> +    {
> +        put_gfn(parent, gfn_l);
> +        return -ENOMEM;
> +    }
> +
> +    new_mfn = page_to_mfn(page);
> +    copy_domain_page(new_mfn, mfn);
> +    set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
> +
> +    put_gfn(parent, gfn_l);
> +
> +    return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> +                          p2m->default_access, -1);
> +}
> +
> +static int bring_up_vcpus(struct domain *cd, struct domain *d)
> +{
> +    unsigned int i;
> +    int ret = -EINVAL;

Nit: you can get rid of ret...

> +
> +    if ( d->max_vcpus != cd->max_vcpus ||
> +        (ret = cpupool_move_domain(cd, d->cpupool)) )
> +        return ret;

...and just return -EINVAL here. Seeing as it's not used anywhere
else.
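
Ie. (untested):

if ( d->max_vcpus != cd->max_vcpus ||
     cpupool_move_domain(cd, d->cpupool) )
    return -EINVAL;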

> +
> +    for ( i = 0; i < cd->max_vcpus; i++ )
> +    {
> +        if ( !d->vcpu[i] || cd->vcpu[i] )
> +            continue;
> +
> +        if ( !vcpu_create(cd, i) )
> +            return -EINVAL;
> +    }
> +
> +    domain_update_node_affinity(cd);
> +    return 0;
> +}
> +
> +static int copy_vcpu_settings(struct domain *cd, struct domain *d)
> +{
> +    unsigned int i;
> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> +    int ret = -EINVAL;
> +
> +    for ( i = 0; i < cd->max_vcpus; i++ )
> +    {
> +        const struct vcpu *d_vcpu = d->vcpu[i];
> +        struct vcpu *cd_vcpu = cd->vcpu[i];
> +        struct vcpu_runstate_info runstate;
> +        mfn_t vcpu_info_mfn;
> +
> +        if ( !d_vcpu || !cd_vcpu )
> +            continue;
> +
> +        /*
> +         * Copy & map in the vcpu_info page if the guest uses one
> +         */
> +        vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> +        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> +        {
> +            mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> +
> +            /*
> +             * Allocate & map the page for it if it hasn't been already
> +             */
> +            if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) )
> +            {
> +                gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
> +                unsigned long gfn_l = gfn_x(gfn);
> +                struct page_info *page;
> +
> +                if ( !(page = alloc_domheap_page(cd, 0)) )
> +                    return -ENOMEM;
> +
> +                new_vcpu_info_mfn = page_to_mfn(page);
> +                set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l);
> +
> +                ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn, PAGE_ORDER_4K,
> +                                     p2m_ram_rw, p2m->default_access, -1);
> +                if ( ret )
> +                    return ret;
> +
> +                ret = map_vcpu_info(cd_vcpu, gfn_l,
> +                                    d_vcpu->vcpu_info_offset);
> +                if ( ret )
> +                    return ret;
> +            }
> +
> +            copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
> +        }
> +
> +        /*
> +         * Setup the vCPU runstate area
> +         */
> +        if ( guest_handle_is_null(runstate_guest(cd_vcpu)) )

Maybe I'm confused, but isn't this check the other way around? If the
parent runstate is not null you should copy it to the fork, ie:

if ( !guest_handle_is_null(runstate_guest(d_vcpu)) )
{
    ...

> +        {
> +            runstate_guest(cd_vcpu) = runstate_guest(d_vcpu);
> +            vcpu_runstate_get(cd_vcpu, &runstate);
> +            __copy_to_guest(runstate_guest(cd_vcpu), &runstate, 1);

You should check the return code I think.
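
Something like this maybe (untested; a non-zero return from
__copy_to_guest means the copy was incomplete):

if ( __copy_to_guest(runstate_guest(cd_vcpu), &runstate, 1) )
    return -EFAULT;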

> +        }
> +
> +        /*
> +         * TODO: to support VMs with PV interfaces copy additional
> +         * settings here, such as PV timers.
> +         */
> +    }
> +
> +    return 0;
> +}
> +
> +static int fork_hap_allocation(struct domain *cd, struct domain *d)
> +{
> +    int rc;
> +    bool preempted;
> +    unsigned long mb = hap_get_allocation(d);
> +
> +    if ( mb == hap_get_allocation(cd) )
> +        return 0;
> +
> +    paging_lock(cd);
> +    rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
> +    paging_unlock(cd);
> +
> +    return preempted ? -ERESTART : rc;
> +}
> +
> +static void copy_tsc(struct domain *cd, struct domain *d)
> +{
> +    uint32_t tsc_mode;
> +    uint32_t gtsc_khz;
> +    uint32_t incarnation;
> +    uint64_t elapsed_nsec;
> +
> +    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
> +    /* Don't bump incarnation on set */
> +    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation - 1);
> +}
> +
> +static int copy_special_pages(struct domain *cd, struct domain *d)
> +{
> +    mfn_t new_mfn, old_mfn;
> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> +    static const unsigned int params[] =
> +    {
> +        HVM_PARAM_STORE_PFN,
> +        HVM_PARAM_IOREQ_PFN,
> +        HVM_PARAM_BUFIOREQ_PFN,
> +        HVM_PARAM_CONSOLE_PFN
> +    };
> +    unsigned int i;
> +    int rc;
> +
> +    for ( i = 0; i < 4; i++ )

Please use ARRAY_SIZE instead of hard coding 4.
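
Ie:

for ( i = 0; i < ARRAY_SIZE(params); i++ )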

> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> index 9f51370327..1ed7d13084 100644
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
>  
>      mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
>  
> +    /* Check if we need to fork the page */
> +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> +         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
> +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> +
> +    /* Check if we need to unshare the page */
>      if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
>      {
>          ASSERT(p2m_is_hostp2m(p2m));
> @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
>              return page;
>  
>          /* Error path: not a suitable GFN at all */
> -        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
> +        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
> +             !mem_sharing_is_fork(p2m->domain) )
>              return NULL;
>      }
>  
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index b4eb476a9c..62aed53a16 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
>  
>      v->vcpu_info = new_info;
>      v->vcpu_info_mfn = page_to_mfn(page);
> +#ifdef CONFIG_MEM_SHARING
> +    v->vcpu_info_offset = offset;

There's no need to introduce this field, you can just use v->vcpu_info
& ~PAGE_MASK AFAICT.
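
Ie. the map_vcpu_info call in copy_vcpu_settings could then be
(untested; the cast is needed because vcpu_info is a pointer):

ret = map_vcpu_info(cd_vcpu, gfn_l,
                    (unsigned long)d_vcpu->vcpu_info & ~PAGE_MASK);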

Thanks, Roger.