
Re: [Xen-devel] [PATCH v12 1/3] xen/mem_sharing: VM forking



Sorry it has taken me a while to get to this.

On Mon, Mar 23, 2020 at 10:04:35AM -0700, Tamas K Lengyel wrote:
> VM forking is the process of creating a domain with an empty memory space and a
> parent domain specified from which to populate the memory when necessary. For
> the new domain to be functional the VM state is copied over as part of the fork
> operation (HVM params, hap allocation, etc).
> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@xxxxxxxxx>
> ---
> v12: Minor style adjustments Jan pointed out
>      Convert mem_sharing_is_fork to inline function
> v11: Fully copy vcpu_info pages
>      Setup vcpu_runstate for forks
>      Added TODO note for PV timers
>      Copy shared_info page
>      Add copy_settings function, to be shared with fork_reset in the next patch
> ---
>  xen/arch/x86/domain.c             |  11 +
>  xen/arch/x86/hvm/hvm.c            |   4 +-
>  xen/arch/x86/mm/hap/hap.c         |   3 +-
>  xen/arch/x86/mm/mem_sharing.c     | 368 ++++++++++++++++++++++++++++++
>  xen/arch/x86/mm/p2m.c             |   9 +-
>  xen/common/domain.c               |   3 +
>  xen/include/asm-x86/hap.h         |   1 +
>  xen/include/asm-x86/hvm/hvm.h     |   2 +
>  xen/include/asm-x86/mem_sharing.h |  18 ++
>  xen/include/public/memory.h       |   5 +
>  xen/include/xen/sched.h           |   5 +
>  11 files changed, 424 insertions(+), 5 deletions(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index caf2ecad7e..11d3c2216e 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -2202,6 +2202,17 @@ int domain_relinquish_resources(struct domain *d)
>              ret = relinquish_shared_pages(d);
>              if ( ret )
>                  return ret;
> +
> +            /*
> +             * If the domain is forked, decrement the parent's pause count
> +             * and release the domain.
> +             */
> +            if ( mem_sharing_is_fork(d) )
> +            {
> +                domain_unpause(d->parent);
> +                put_domain(d->parent);
> +                d->parent = NULL;
> +            }
>          }
>  #endif
>  
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index a3d115b650..304b3d1562 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1917,7 +1917,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
>      }
>  #endif
>  
> -    /* Spurious fault? PoD and log-dirty also take this path. */
> +    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
>      if ( p2m_is_ram(p2mt) )
>      {
>          rc = 1;
> @@ -4377,7 +4377,7 @@ static int hvm_allow_get_param(struct domain *d,
>      return rc;
>  }
>  
> -static int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
> +int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
>  {
>      int rc;
>  
> diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> index a6d5e39b02..814d0c3253 100644
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -321,8 +321,7 @@ static void hap_free_p2m_page(struct domain *d, struct page_info *pg)
>  }
>  
>  /* Return the size of the pool, rounded up to the nearest MB */
> -static unsigned int
> -hap_get_allocation(struct domain *d)
> +unsigned int hap_get_allocation(struct domain *d)
>  {
>      unsigned int pg = d->arch.paging.hap.total_pages
>          + d->arch.paging.hap.p2m_pages;
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index 3835bc928f..23deeddff2 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -22,6 +22,7 @@
>  
>  #include <xen/types.h>
>  #include <xen/domain_page.h>
> +#include <xen/event.h>
>  #include <xen/spinlock.h>
>  #include <xen/rwlock.h>
>  #include <xen/mm.h>
> @@ -36,6 +37,8 @@
>  #include <asm/altp2m.h>
>  #include <asm/atomic.h>
>  #include <asm/event.h>
> +#include <asm/hap.h>
> +#include <asm/hvm/hvm.h>
>  #include <xsm/xsm.h>
>  
>  #include "mm-locks.h"
> @@ -1444,6 +1447,334 @@ static inline int mem_sharing_control(struct domain *d, bool enable)
>      return 0;
>  }
>  
> +/*
> + * Forking a page only gets called when the VM faults due to no entry being
> + * in the EPT for the access. Depending on the type of access we either
> + * populate the physmap with a shared entry for read-only access or
> + * fork the page if its a write access.
> + *
> + * The client p2m is already locked so we only need to lock
> + * the parent's here.
> + */
> +int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
> +{
> +    int rc = -ENOENT;
> +    shr_handle_t handle;
> +    struct domain *parent = d->parent;
> +    struct p2m_domain *p2m;
> +    unsigned long gfn_l = gfn_x(gfn);
> +    mfn_t mfn, new_mfn;
> +    p2m_type_t p2mt;
> +    struct page_info *page;
> +
> +    if ( !mem_sharing_is_fork(d) )
> +        return -ENOENT;
> +
> +    if ( !unsharing )
> +    {
> +        /* For read-only accesses we just add a shared entry to the physmap */
> +        while ( parent )
> +        {
> +            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
> +                break;
> +
> +            parent = parent->parent;
> +        }
> +
> +        if ( !rc )
> +        {
> +            /* The client's p2m is already locked */
> +            struct p2m_domain *pp2m = p2m_get_hostp2m(parent);

Nit: I think you could just use the existing p2m local variable.
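
Ie. something along these lines (untested, just to illustrate):

p2m = p2m_get_hostp2m(parent);
p2m_lock(p2m);
rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
p2m_unlock(p2m);

The local is only assigned further down (to p2m_get_hostp2m(d)), so
reusing it here should be safe.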

> +            p2m_lock(pp2m);
> +            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
> +            p2m_unlock(pp2m);
> +
> +            if ( !rc )
> +                return 0;
> +        }
> +    }
> +
> +    /*
> +     * If it's a write access (ie. unsharing) or if adding a shared entry to
> +     * the physmap failed we'll fork the page directly.
> +     */
> +    p2m = p2m_get_hostp2m(d);
> +    parent = d->parent;
> +
> +    while ( parent )
> +    {
> +        mfn = get_gfn_query(parent, gfn_l, &p2mt);
> +
> +        /*
> +         * We can't fork grant memory from the parent, only regular ram.
> +         */

Nit: single line comments should use /* ... */ (here and below).
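
Ie:

/* We can't fork grant memory from the parent, only regular ram. */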

> +        if ( mfn_valid(mfn) && p2m_is_ram(p2mt) )
> +            break;
> +
> +        put_gfn(parent, gfn_l);
> +        parent = parent->parent;
> +    }
> +
> +    if ( !parent )
> +        return -ENOENT;
> +
> +    if ( !(page = alloc_domheap_page(d, 0)) )
> +    {
> +        put_gfn(parent, gfn_l);
> +        return -ENOMEM;
> +    }
> +
> +    new_mfn = page_to_mfn(page);
> +    copy_domain_page(new_mfn, mfn);
> +    set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
> +
> +    put_gfn(parent, gfn_l);
> +
> +    return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> +                          p2m->default_access, -1);
> +}
> +
> +static int bring_up_vcpus(struct domain *cd, struct domain *d)
> +{
> +    unsigned int i;
> +    int ret = -EINVAL;

Nit: you can get rid of ret...

> +
> +    if ( d->max_vcpus != cd->max_vcpus ||
> +        (ret = cpupool_move_domain(cd, d->cpupool)) )
> +        return ret;

...and just return -EINVAL here. Seeing as it's not used anywhere
else.
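
Ie. (untested):

if ( d->max_vcpus != cd->max_vcpus ||
     cpupool_move_domain(cd, d->cpupool) )
    return -EINVAL;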

> +
> +    for ( i = 0; i < cd->max_vcpus; i++ )
> +    {
> +        if ( !d->vcpu[i] || cd->vcpu[i] )
> +            continue;
> +
> +        if ( !vcpu_create(cd, i) )
> +            return -EINVAL;
> +    }
> +
> +    domain_update_node_affinity(cd);
> +    return 0;
> +}
> +
> +static int copy_vcpu_settings(struct domain *cd, struct domain *d)
> +{
> +    unsigned int i;
> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> +    int ret = -EINVAL;
> +
> +    for ( i = 0; i < cd->max_vcpus; i++ )
> +    {
> +        const struct vcpu *d_vcpu = d->vcpu[i];
> +        struct vcpu *cd_vcpu = cd->vcpu[i];
> +        struct vcpu_runstate_info runstate;
> +        mfn_t vcpu_info_mfn;
> +
> +        if ( !d_vcpu || !cd_vcpu )
> +            continue;
> +
> +        /*
> +         * Copy & map in the vcpu_info page if the guest uses one
> +         */
> +        vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> +        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> +        {
> +            mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> +
> +            /*
> +             * Allocate & map the page for it if it hasn't been already
> +             */
> +            if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) )
> +            {
> +                gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
> +                unsigned long gfn_l = gfn_x(gfn);
> +                struct page_info *page;
> +
> +                if ( !(page = alloc_domheap_page(cd, 0)) )
> +                    return -ENOMEM;
> +
> +                new_vcpu_info_mfn = page_to_mfn(page);
> +                set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l);
> +
> +                ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn, PAGE_ORDER_4K,
> +                                     p2m_ram_rw, p2m->default_access, -1);
> +                if ( ret )
> +                    return ret;
> +
> +                ret = map_vcpu_info(cd_vcpu, gfn_l,
> +                                    d_vcpu->vcpu_info_offset);
> +                if ( ret )
> +                    return ret;
> +            }
> +
> +            copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
> +        }
> +
> +        /*
> +         * Setup the vCPU runstate area
> +         */
> +        if ( guest_handle_is_null(runstate_guest(cd_vcpu)) )

Maybe I'm confused, but isn't this check the other way around? If the
parent runstate is not null you should copy it to the fork, ie:

if ( !guest_handle_is_null(runstate_guest(d_vcpu)) )
{
    ...

> +        {
> +            runstate_guest(cd_vcpu) = runstate_guest(d_vcpu);
> +            vcpu_runstate_get(cd_vcpu, &runstate);
> +            __copy_to_guest(runstate_guest(cd_vcpu), &runstate, 1);

You should check the return code I think.
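
Something like this maybe (untested; a non-zero return from
__copy_to_guest means the copy was incomplete):

if ( __copy_to_guest(runstate_guest(cd_vcpu), &runstate, 1) )
    return -EFAULT;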

> +        }
> +
> +        /*
> +         * TODO: to support VMs with PV interfaces copy additional
> +         * settings here, such as PV timers.
> +         */
> +    }
> +
> +    return 0;
> +}
> +
> +static int fork_hap_allocation(struct domain *cd, struct domain *d)
> +{
> +    int rc;
> +    bool preempted;
> +    unsigned long mb = hap_get_allocation(d);
> +
> +    if ( mb == hap_get_allocation(cd) )
> +        return 0;
> +
> +    paging_lock(cd);
> +    rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
> +    paging_unlock(cd);
> +
> +    return preempted ? -ERESTART : rc;
> +}
> +
> +static void copy_tsc(struct domain *cd, struct domain *d)
> +{
> +    uint32_t tsc_mode;
> +    uint32_t gtsc_khz;
> +    uint32_t incarnation;
> +    uint64_t elapsed_nsec;
> +
> +    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
> +    /* Don't bump incarnation on set */
> +    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation - 1);
> +}
> +
> +static int copy_special_pages(struct domain *cd, struct domain *d)
> +{
> +    mfn_t new_mfn, old_mfn;
> +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> +    static const unsigned int params[] =
> +    {
> +        HVM_PARAM_STORE_PFN,
> +        HVM_PARAM_IOREQ_PFN,
> +        HVM_PARAM_BUFIOREQ_PFN,
> +        HVM_PARAM_CONSOLE_PFN
> +    };
> +    unsigned int i;
> +    int rc;
> +
> +    for ( i = 0; i < 4; i++ )

Please use ARRAY_SIZE instead of hard coding 4.
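
Ie:

for ( i = 0; i < ARRAY_SIZE(params); i++ )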

> diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> index 9f51370327..1ed7d13084 100644
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
>  
>      mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
>  
> +    /* Check if we need to fork the page */
> +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> +         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
> +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> +
> +    /* Check if we need to unshare the page */
>      if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
>      {
>          ASSERT(p2m_is_hostp2m(p2m));
> @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
>              return page;
>  
>          /* Error path: not a suitable GFN at all */
> -        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
> +        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
> +             !mem_sharing_is_fork(p2m->domain) )
>              return NULL;
>      }
>  
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index b4eb476a9c..62aed53a16 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
>  
>      v->vcpu_info = new_info;
>      v->vcpu_info_mfn = page_to_mfn(page);
> +#ifdef CONFIG_MEM_SHARING
> +    v->vcpu_info_offset = offset;

There's no need to introduce this field, you can just use v->vcpu_info
& ~PAGE_MASK AFAICT.
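
Ie. the map_vcpu_info call in copy_vcpu_settings could then be
(untested; the cast is needed because vcpu_info is a pointer):

ret = map_vcpu_info(cd_vcpu, gfn_l,
                    (unsigned long)d_vcpu->vcpu_info & ~PAGE_MASK);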

Thanks, Roger.