Xen project Mailing List

[PATCH v2 8/8] common: convert vCPU info area registration

To: "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>

Date: Mon, 23 Jan 2023 15:57:22 +0100

Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=suse.com; dmarc=pass action=none header.from=suse.com; dkim=pass header.d=suse.com; arc=none

Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=4vRXvWQLyGKAmG7Bji/j/VfcIcDZDAV0kqzYj4NHDKM=; b=ibdqDd717LlSPHfSaxIXXgWfWJbHocf82gZgDEC1yaMy5UN7+JWVSsSSGsy/y+PPgqKsYk90DXY2LbwFXDRx+l4TXxCQ2woVaLtOxEiYNAtuw5ZhpMtQgvaueKjdTiqanVEF0URlYgpd5AVY2ERjP/0+YD0Vt//0W/RC8K3q3ak7hdffckr0nFkt9JQkosWEjuyehnMuAuwqumiMvKqg6w+YYC/4im522C7+Ldmi8I5K2M4qpjCyZy8YwOb2RXo0MJr3W5iWiV6X2xa2xDRFtmavkYCSSt0DHY0jrOtABmKiLD4lRzXCrGRdEIB9E1JxCA2GMxxb/hDsV3M9wc/Wcg==

Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=BBL5oriJJl9QVmA217MpxakpE+Qh9lBxqdfPRykcVGBQ+kYV6ur31uGRg5vTuFE3oJ9AgKcRdwWMa3ONVbInZp4+KLB762IQ4wMQnnH2vqlFvNnoKFFbgU6UjfgcPIQAjPSAnivogA7rgstKujrpRh1lJzK9lo58PESuP+SdZWo61wF4wfF4YG5qVD5yhzub6wDCzwkhos+cQrGTofQSMHFK5d2mw4Hxfpe+cfuOJR1BHWXiMqgrEU++CyYN/FIOqRDCtOw8ykhMb4F4+3upfuFnRytGlJvwUcf1n8N0y9SlRtO20sZ3S+l9mZbCV9bhURHTTlery2EeSZmdqYM13g==

Authentication-results: dkim=none (message not signed) header.d=none;dmarc=none action=none header.from=suse.com;

Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, George Dunlap <george.dunlap@xxxxxxxxxx>, Julien Grall <julien@xxxxxxx>, Stefano Stabellini <sstabellini@xxxxxxxxxx>, Wei Liu <wl@xxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, Tamas K Lengyel <tamas@xxxxxxxxxxxxx>

Delivery-date: Mon, 23 Jan 2023 14:57:31 +0000

List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

Switch to using map_guest_area(). Noteworthy differences from map_vcpu_info(): - remote vCPU-s are paused rather than checked for being down (which in principle can change right after the check), - the domain lock is taken for a much smaller region, - the error code for an attempt to re-register the area is now -EBUSY, - we could in principle permit de-registration when no area was previously registered (which would permit "probing", if necessary for anything). Note that this eliminates a bug in copy_vcpu_settings(): The function did allocate a new page regardless of the GFN already having a mapping, thus in particular breaking the case of two vCPU-s having their info areas on the same page. Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> --- RFC: I'm not really certain whether the preliminary check (ahead of calling map_guest_area()) is worthwhile to have. --- v2: Re-base over changes earlier in the series. Properly enforce no re- registration. Avoid several casts by introducing local variables. --- a/xen/arch/x86/include/asm/shared.h +++ b/xen/arch/x86/include/asm/shared.h @@ -26,17 +26,20 @@ static inline void arch_set_##field(stru #define GET_SET_VCPU(type, field) \ static inline type arch_get_##field(const struct vcpu *v) \ { \ + const vcpu_info_t *vi = v->vcpu_info_area.map; \ + \ return !has_32bit_shinfo(v->domain) ? \ - v->vcpu_info->native.arch.field : \ - v->vcpu_info->compat.arch.field; \ + vi->native.arch.field : vi->compat.arch.field; \ } \ static inline void arch_set_##field(struct vcpu *v, \ type val) \ { \ + vcpu_info_t *vi = v->vcpu_info_area.map; \ + \ if ( !has_32bit_shinfo(v->domain) ) \ - v->vcpu_info->native.arch.field = val; \ + vi->native.arch.field = val; \ else \ - v->vcpu_info->compat.arch.field = val; \ + vi->compat.arch.field = val; \ } #else @@ -57,12 +60,16 @@ static inline void arch_set_##field(stru #define GET_SET_VCPU(type, field) \ static inline type arch_get_##field(const struct vcpu *v) \ { \ - return v->vcpu_info->arch.field; \ + const vcpu_info_t *vi = v->vcpu_info_area.map; \ + \ + return vi->arch.field; \ } \ static inline void arch_set_##field(struct vcpu *v, \ type val) \ { \ - v->vcpu_info->arch.field = val; \ + vcpu_info_t *vi = v->vcpu_info_area.map; \ + \ + vi->arch.field = val; \ } #endif --- a/xen/arch/x86/mm/mem_sharing.c +++ b/xen/arch/x86/mm/mem_sharing.c @@ -1758,53 +1758,24 @@ static int copy_vpmu(struct vcpu *d_vcpu static int copy_vcpu_settings(struct domain *cd, const struct domain *d) { unsigned int i; - struct p2m_domain *p2m = p2m_get_hostp2m(cd); int ret = -EINVAL; for ( i = 0; i < cd->max_vcpus; i++ ) { struct vcpu *d_vcpu = d->vcpu[i]; struct vcpu *cd_vcpu = cd->vcpu[i]; - mfn_t vcpu_info_mfn; if ( !d_vcpu || !cd_vcpu ) continue; - /* Copy & map in the vcpu_info page if the guest uses one */ - vcpu_info_mfn = d_vcpu->vcpu_info_mfn; - if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) ) - { - mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn; - - /* Allocate & map the page for it if it hasn't been already */ - if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) ) - { - gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn); - unsigned long gfn_l = gfn_x(gfn); - struct page_info *page; - - if ( !(page = alloc_domheap_page(cd, 0)) ) - return -ENOMEM; - - new_vcpu_info_mfn = page_to_mfn(page); - set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l); - - ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn, - PAGE_ORDER_4K, p2m_ram_rw, - p2m->default_access, -1); - if ( ret ) - return ret; - - ret = map_vcpu_info(cd_vcpu, gfn_l, - PAGE_OFFSET(d_vcpu->vcpu_info)); - if ( ret ) - return ret; - } - - copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn); - } - - /* Same for the (physically registered) runstate and time info areas. */ + /* + * Copy and map the vcpu_info page and the (physically registered) + * runstate and time info areas. + */ + ret = copy_guest_area(&cd_vcpu->vcpu_info_area, + &d_vcpu->vcpu_info_area, cd_vcpu, d); + if ( ret ) + return ret; ret = copy_guest_area(&cd_vcpu->runstate_guest_area, &d_vcpu->runstate_guest_area, cd_vcpu, d); if ( ret ) --- a/xen/arch/x86/pv/shim.c +++ b/xen/arch/x86/pv/shim.c @@ -395,7 +395,7 @@ int pv_shim_shutdown(uint8_t reason) for_each_vcpu ( d, v ) { /* Unmap guest vcpu_info page and runstate/time areas. */ - unmap_vcpu_info(v); + unmap_guest_area(v, &v->vcpu_info_area); unmap_guest_area(v, &v->runstate_guest_area); unmap_guest_area(v, &v->arch.time_guest_area); --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -1438,7 +1438,7 @@ static void __update_vcpu_system_time(st struct vcpu_time_info *u = &vcpu_info(v, time), _u; const struct domain *d = v->domain; - if ( v->vcpu_info == NULL ) + if ( !v->vcpu_info_area.map ) return; collect_time_info(v, &_u); --- a/xen/arch/x86/x86_64/asm-offsets.c +++ b/xen/arch/x86/x86_64/asm-offsets.c @@ -53,7 +53,7 @@ void __dummy__(void) OFFSET(VCPU_processor, struct vcpu, processor); OFFSET(VCPU_domain, struct vcpu, domain); - OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info); + OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map); OFFSET(VCPU_trap_bounce, struct vcpu, arch.pv.trap_bounce); OFFSET(VCPU_thread_flags, struct vcpu, arch.flags); OFFSET(VCPU_event_addr, struct vcpu, arch.pv.event_callback_eip); --- a/xen/arch/x86/x86_64/traps.c +++ b/xen/arch/x86/x86_64/traps.c @@ -97,7 +97,7 @@ static void _show_registers( if ( context == CTXT_hypervisor ) printk(" %pS", _p(regs->rip)); printk("\nRFLAGS: %016lx ", regs->rflags); - if ( (context == CTXT_pv_guest) && v && v->vcpu_info ) + if ( (context == CTXT_pv_guest) && v && v->vcpu_info_area.map ) printk("EM: %d ", !!vcpu_info(v, evtchn_upcall_mask)); printk("CONTEXT: %s", context_names[context]); if ( v && !is_idle_vcpu(v) ) --- a/xen/common/compat/domain.c +++ b/xen/common/compat/domain.c @@ -49,7 +49,7 @@ int compat_common_vcpu_op(int cmd, struc { case VCPUOP_initialise: { - if ( v->vcpu_info == &dummy_vcpu_info ) + if ( v->vcpu_info_area.map == &dummy_vcpu_info ) return -EINVAL; #ifdef CONFIG_HVM --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -127,10 +127,10 @@ static void vcpu_info_reset(struct vcpu { struct domain *d = v->domain; - v->vcpu_info = ((v->vcpu_id < XEN_LEGACY_MAX_VCPUS) - ? (vcpu_info_t *)&shared_info(d, vcpu_info[v->vcpu_id]) - : &dummy_vcpu_info); - v->vcpu_info_mfn = INVALID_MFN; + v->vcpu_info_area.map = + ((v->vcpu_id < XEN_LEGACY_MAX_VCPUS) + ? (vcpu_info_t *)&shared_info(d, vcpu_info[v->vcpu_id]) + : &dummy_vcpu_info); } static void vmtrace_free_buffer(struct vcpu *v) @@ -964,7 +964,7 @@ int domain_kill(struct domain *d) return -ERESTART; for_each_vcpu ( d, v ) { - unmap_vcpu_info(v); + unmap_guest_area(v, &v->vcpu_info_area); unmap_guest_area(v, &v->runstate_guest_area); } d->is_dying = DOMDYING_dead; @@ -1419,7 +1419,7 @@ int domain_soft_reset(struct domain *d, for_each_vcpu ( d, v ) { set_xen_guest_handle(runstate_guest(v), NULL); - unmap_vcpu_info(v); + unmap_guest_area(v, &v->vcpu_info_area); unmap_guest_area(v, &v->runstate_guest_area); } @@ -1467,111 +1467,6 @@ int vcpu_reset(struct vcpu *v) return rc; } -/* - * Map a guest page in and point the vcpu_info pointer at it. This - * makes sure that the vcpu_info is always pointing at a valid piece - * of memory, and it sets a pending event to make sure that a pending - * event doesn't get missed. - */ -int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned int offset) -{ - struct domain *d = v->domain; - void *mapping; - vcpu_info_t *new_info; - struct page_info *page; - unsigned int align; - - if ( offset > (PAGE_SIZE - sizeof(*new_info)) ) - return -ENXIO; - -#ifdef CONFIG_COMPAT - BUILD_BUG_ON(sizeof(*new_info) != sizeof(new_info->compat)); - if ( has_32bit_shinfo(d) ) - align = alignof(new_info->compat); - else -#endif - align = alignof(*new_info); - if ( offset & (align - 1) ) - return -ENXIO; - - if ( !mfn_eq(v->vcpu_info_mfn, INVALID_MFN) ) - return -EINVAL; - - /* Run this command on yourself or on other offline VCPUS. */ - if ( (v != current) && !(v->pause_flags & VPF_down) ) - return -EINVAL; - - page = get_page_from_gfn(d, gfn, NULL, P2M_UNSHARE); - if ( !page ) - return -EINVAL; - - if ( !get_page_type(page, PGT_writable_page) ) - { - put_page(page); - return -EINVAL; - } - - mapping = __map_domain_page_global(page); - if ( mapping == NULL ) - { - put_page_and_type(page); - return -ENOMEM; - } - - new_info = (vcpu_info_t *)(mapping + offset); - - if ( v->vcpu_info == &dummy_vcpu_info ) - { - memset(new_info, 0, sizeof(*new_info)); -#ifdef XEN_HAVE_PV_UPCALL_MASK - __vcpu_info(v, new_info, evtchn_upcall_mask) = 1; -#endif - } - else - { - memcpy(new_info, v->vcpu_info, sizeof(*new_info)); - } - - v->vcpu_info = new_info; - v->vcpu_info_mfn = page_to_mfn(page); - - /* Set new vcpu_info pointer /before/ setting pending flags. */ - smp_wmb(); - - /* - * Mark everything as being pending just to make sure nothing gets - * lost. The domain will get a spurious event, but it can cope. - */ -#ifdef CONFIG_COMPAT - if ( !has_32bit_shinfo(d) ) - write_atomic(&new_info->native.evtchn_pending_sel, ~0); - else -#endif - write_atomic(&vcpu_info(v, evtchn_pending_sel), ~0); - vcpu_mark_events_pending(v); - - return 0; -} - -/* - * Unmap the vcpu info page if the guest decided to place it somewhere - * else. This is used from domain_kill() and domain_soft_reset(). - */ -void unmap_vcpu_info(struct vcpu *v) -{ - mfn_t mfn = v->vcpu_info_mfn; - - if ( mfn_eq(mfn, INVALID_MFN) ) - return; - - unmap_domain_page_global((void *) - ((unsigned long)v->vcpu_info & PAGE_MASK)); - - vcpu_info_reset(v); /* NB: Clobbers v->vcpu_info_mfn */ - - put_page_and_type(mfn_to_page(mfn)); -} - int map_guest_area(struct vcpu *v, paddr_t gaddr, unsigned int size, struct guest_area *area, void (*populate)(void *dst, struct vcpu *v)) @@ -1633,14 +1528,44 @@ int map_guest_area(struct vcpu *v, paddr domain_lock(d); - if ( map ) - populate(map, v); + /* No re-registration of the vCPU info area. */ + if ( area != &v->vcpu_info_area || !area->pg ) + { + if ( map ) + populate(map, v); - SWAP(area->pg, pg); - SWAP(area->map, map); + SWAP(area->pg, pg); + SWAP(area->map, map); + } + else + rc = -EBUSY; domain_unlock(d); + /* Set pending flags /after/ new vcpu_info pointer was set. */ + if ( area == &v->vcpu_info_area && !rc ) + { + /* + * Mark everything as being pending just to make sure nothing gets + * lost. The domain will get a spurious event, but it can cope. + */ +#ifdef CONFIG_COMPAT + if ( !has_32bit_shinfo(d) ) + { + vcpu_info_t *info = area->map; + + /* For VCPUOP_register_vcpu_info handling in common_vcpu_op(). */ + BUILD_BUG_ON(sizeof(*info) != sizeof(info->compat)); + write_atomic(&info->native.evtchn_pending_sel, ~0); + } + else +#endif + write_atomic(&vcpu_info(v, evtchn_pending_sel), ~0); + vcpu_mark_events_pending(v); + + force_update_vcpu_system_time(v); + } + if ( v != current ) vcpu_unpause(v); @@ -1670,7 +1595,10 @@ void unmap_guest_area(struct vcpu *v, st domain_lock(d); map = area->map; - area->map = NULL; + if ( area == &v->vcpu_info_area ) + vcpu_info_reset(v); + else + area->map = NULL; pg = area->pg; area->pg = NULL; domain_unlock(d); @@ -1803,6 +1731,27 @@ bool update_runstate_area(struct vcpu *v return rc; } +/* + * This makes sure that the vcpu_info is always pointing at a valid piece of + * memory, and it sets a pending event to make sure that a pending event + * doesn't get missed. + */ +static void cf_check +vcpu_info_populate(void *map, struct vcpu *v) +{ + vcpu_info_t *info = map; + + if ( v->vcpu_info_area.map == &dummy_vcpu_info ) + { + memset(info, 0, sizeof(*info)); +#ifdef XEN_HAVE_PV_UPCALL_MASK + __vcpu_info(v, info, evtchn_upcall_mask) = 1; +#endif + } + else + memcpy(info, v->vcpu_info_area.map, sizeof(*info)); +} + static void cf_check runstate_area_populate(void *map, struct vcpu *v) { @@ -1832,7 +1781,7 @@ long common_vcpu_op(int cmd, struct vcpu switch ( cmd ) { case VCPUOP_initialise: - if ( v->vcpu_info == &dummy_vcpu_info ) + if ( v->vcpu_info_area.map == &dummy_vcpu_info ) return -EINVAL; rc = arch_initialise_vcpu(v, arg); @@ -1956,16 +1905,29 @@ long common_vcpu_op(int cmd, struct vcpu case VCPUOP_register_vcpu_info: { struct vcpu_register_vcpu_info info; + paddr_t gaddr; rc = -EFAULT; if ( copy_from_guest(&info, arg, 1) ) break; - domain_lock(d); - rc = map_vcpu_info(v, info.mfn, info.offset); - domain_unlock(d); + rc = -EINVAL; + gaddr = gfn_to_gaddr(_gfn(info.mfn)) + info.offset; + if ( !~gaddr || + gfn_x(gaddr_to_gfn(gaddr)) != info.mfn ) + break; - force_update_vcpu_system_time(v); + /* Preliminary check only; see map_guest_area(). */ + rc = -EBUSY; + if ( v->vcpu_info_area.pg ) + break; + + /* See the BUILD_BUG_ON() in vcpu_info_populate(). */ + rc = map_guest_area(v, gaddr, sizeof(vcpu_info_t), + &v->vcpu_info_area, vcpu_info_populate); + if ( rc == -ERESTART ) + rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih", + cmd, vcpuid, arg); break; } --- a/xen/include/xen/domain.h +++ b/xen/include/xen/domain.h @@ -79,9 +79,6 @@ void cf_check free_pirq_struct(void *); int arch_vcpu_create(struct vcpu *v); void arch_vcpu_destroy(struct vcpu *v); -int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned int offset); -void unmap_vcpu_info(struct vcpu *v); - int map_guest_area(struct vcpu *v, paddr_t gaddr, unsigned int size, struct guest_area *area, void (*populate)(void *dst, struct vcpu *v)); --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -175,7 +175,7 @@ struct vcpu int processor; - vcpu_info_t *vcpu_info; + struct guest_area vcpu_info_area; struct domain *domain; @@ -288,9 +288,6 @@ struct vcpu struct waitqueue_vcpu *waitqueue_vcpu; - /* Guest-specified relocation of vcpu_info. */ - mfn_t vcpu_info_mfn; - struct evtchn_fifo_vcpu *evtchn_fifo; /* vPCI per-vCPU area, used to store data for long running operations. */ --- a/xen/include/xen/shared.h +++ b/xen/include/xen/shared.h @@ -44,6 +44,7 @@ typedef struct vcpu_info vcpu_info_t; extern vcpu_info_t dummy_vcpu_info; #define shared_info(d, field) __shared_info(d, (d)->shared_info, field) -#define vcpu_info(v, field) __vcpu_info(v, (v)->vcpu_info, field) +#define vcpu_info(v, field) \ + __vcpu_info(v, (vcpu_info_t *)(v)->vcpu_info_area.map, field) #endif /* __XEN_SHARED_H__ */

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.