Xen project Mailing List

[PATCH RFC 10/10] common: convert vCPU info area registration

To: "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>

Date: Wed, 19 Oct 2022 09:45:52 +0200

Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=suse.com; dmarc=pass action=none header.from=suse.com; dkim=pass header.d=suse.com; arc=none

Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=DgBhR75TKXzhjJAKk8igA43OaQRf/s9CHiXdfaBkBfE=; b=iz25YbwMEI8ZEwfiXIX1isd7+QjHbDI+Qenlwqd9kfuEWxhyFlnFnfkNjwJk+zJC7NwCMvfM2rE2b3n3/g5pC+T6E6H3s46UAWZF6dHOKxRy0gLJy/Z2DTSBKebYZf20WNYpq4kNVFtTMO+5Uf+7tnu5g0q0eyIYCt32M2CVo2RMYKGFu3vaXrdd4uEmUdDRR0WGMGuNR5JWpRcJbZo65/gc1VQDWdSXc/GYk4UAlLwKRP0LNzBjSoWFohyn2ycG0G0MEyg1+a021jYjFSsBLE45BCftmEilv+yUh2qVszJ0je4jXgWmaCgL3MBmaCIaOO7SLlq66cbWLzs/JtAH/Q==

Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=HmShmaqQykFqvmtCcfCQ1GOW4Os2WvVSex46DrA1W/IA2XONfWPJZk5ErWkmBSOyIQcFanUMApAoyfk0ZDxh8FzhVIO7CA6rfV2hDgbW+t1/zAhL8XLnTdcvwJRX0QgKHiQzkYP/8jNhm8wJEPoHxHZxJtkRwKhRkQeJkKAvdwL93aLXZ+zVYLRrFAeBjjXDLh5i0TD4nJI6gX3Flbs6aBKlg+mFcQmIDgFhXtFxqKvZE7wZJiC6JQZyY63TPGbLH9TW06OCdQ2q+38G75SjDet/Px5X+ONIiwoh9mGmaUXwD3yjz2HWvpy7uucLO/pTsvbHSUWGGgYXVNAUe/MaLw==

Authentication-results: dkim=none (message not signed) header.d=none;dmarc=none action=none header.from=suse.com;

Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, George Dunlap <george.dunlap@xxxxxxxxxx>, Julien Grall <julien@xxxxxxx>, Stefano Stabellini <sstabellini@xxxxxxxxxx>, Wei Liu <wl@xxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, Tamas K Lengyel <tamas@xxxxxxxxxxxxx>

Delivery-date: Wed, 19 Oct 2022 07:46:00 +0000

List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

Switch to using map_guest_area(). Noteworthy differences from map_vcpu_info(): - remote vCPU-s are paused rather than checked for being down (which in principle can change right after the check), - the domain lock is taken for a much smaller region, - areas could now be registered more than once, if we want this, - as a corner case registering the area at GFN 0 offset 0 is no longer possible (this is considered an invalid de-registration request). Note that this eliminates a bug in copy_vcpu_settings(): The function did allocate a new page regardless of the GFN already having a mapping, thus in particular breaking the case of two vCPU-s having their info areas on the same page. Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> --- RFC: While the "no re-registration" check is retained, it is now racy. If we really think it needs retaining _and_ properly enforcing, then locking will be needed, but I don't think we can hold the domain lock around a call to map_guest_area(), first and foremost because of its use of vcpu_pause(). RFC: Is the GFN 0 offset 0 behavioral change deemed acceptable? RFC: To have just a single instance of casts to vcpu_info_t *, introducing a macro (or inline function if header dependencies permit) might be nice. However, the only sensible identifier for it would imo be vcpu_info(). Yet we already have a macro of that name. With some trickery it might be possible to overload the macro (making the "field" argument optional), but this may not be desirable for other reasons (it could e.g. be deemed confusing). --- a/xen/arch/x86/include/asm/shared.h +++ b/xen/arch/x86/include/asm/shared.h @@ -27,16 +27,16 @@ static inline void arch_set_##field(stru static inline type arch_get_##field(const struct vcpu *v) \ { \ return !has_32bit_shinfo(v->domain) ? \ - v->vcpu_info->native.arch.field : \ - v->vcpu_info->compat.arch.field; \ + ((const vcpu_info_t *)v->vcpu_info_area.map)->native.arch.field : \ + ((const vcpu_info_t *)v->vcpu_info_area.map)->compat.arch.field; \ } \ static inline void arch_set_##field(struct vcpu *v, \ type val) \ { \ if ( !has_32bit_shinfo(v->domain) ) \ - v->vcpu_info->native.arch.field = val; \ + ((vcpu_info_t *)v->vcpu_info_area.map)->native.arch.field = val; \ else \ - v->vcpu_info->compat.arch.field = val; \ + ((vcpu_info_t *)v->vcpu_info_area.map)->compat.arch.field = val; \ } #else @@ -57,12 +57,12 @@ static inline void arch_set_##field(stru #define GET_SET_VCPU(type, field) \ static inline type arch_get_##field(const struct vcpu *v) \ { \ - return v->vcpu_info->arch.field; \ + return ((const vcpu_info_t *)v->vcpu_info_area.map)->arch.field; \ } \ static inline void arch_set_##field(struct vcpu *v, \ type val) \ { \ - v->vcpu_info->arch.field = val; \ + ((vcpu_info_t *)v->vcpu_info_area.map)->arch.field = val; \ } #endif --- a/xen/arch/x86/mm/mem_sharing.c +++ b/xen/arch/x86/mm/mem_sharing.c @@ -1758,53 +1758,24 @@ static int copy_vpmu(struct vcpu *d_vcpu static int copy_vcpu_settings(struct domain *cd, const struct domain *d) { unsigned int i; - struct p2m_domain *p2m = p2m_get_hostp2m(cd); int ret = -EINVAL; for ( i = 0; i < cd->max_vcpus; i++ ) { struct vcpu *d_vcpu = d->vcpu[i]; struct vcpu *cd_vcpu = cd->vcpu[i]; - mfn_t vcpu_info_mfn; if ( !d_vcpu || !cd_vcpu ) continue; - /* Copy & map in the vcpu_info page if the guest uses one */ - vcpu_info_mfn = d_vcpu->vcpu_info_mfn; - if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) ) - { - mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn; - - /* Allocate & map the page for it if it hasn't been already */ - if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) ) - { - gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn); - unsigned long gfn_l = gfn_x(gfn); - struct page_info *page; - - if ( !(page = alloc_domheap_page(cd, 0)) ) - return -ENOMEM; - - new_vcpu_info_mfn = page_to_mfn(page); - set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l); - - ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn, - PAGE_ORDER_4K, p2m_ram_rw, - p2m->default_access, -1); - if ( ret ) - return ret; - - ret = map_vcpu_info(cd_vcpu, gfn_l, - PAGE_OFFSET(d_vcpu->vcpu_info)); - if ( ret ) - return ret; - } - - copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn); - } - - /* Same for the (physically registered) runstate and time info areas. */ + /* + * Copy and map the vcpu_info page and the (physically registered) + * runstate and time info areas. + */ + ret = copy_guest_area(&cd_vcpu->vcpu_info_area, + &d_vcpu->vcpu_info_area, cd_vcpu, d); + if ( ret ) + return ret; ret = copy_guest_area(&cd_vcpu->runstate_guest_area, &d_vcpu->runstate_guest_area, cd_vcpu, d); if ( ret ) --- a/xen/arch/x86/pv/shim.c +++ b/xen/arch/x86/pv/shim.c @@ -395,7 +395,7 @@ int pv_shim_shutdown(uint8_t reason) for_each_vcpu ( d, v ) { /* Unmap guest vcpu_info page and runstate/time areas. */ - unmap_vcpu_info(v); + unmap_guest_area(v, &v->vcpu_info_area); unmap_guest_area(v, &v->runstate_guest_area); unmap_guest_area(v, &v->arch.time_guest_area); --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -1438,7 +1438,7 @@ static void __update_vcpu_system_time(st struct vcpu_time_info *u = &vcpu_info(v, time), _u; const struct domain *d = v->domain; - if ( v->vcpu_info == NULL ) + if ( !v->vcpu_info_area.map ) return; collect_time_info(v, &_u); --- a/xen/arch/x86/x86_64/asm-offsets.c +++ b/xen/arch/x86/x86_64/asm-offsets.c @@ -53,7 +53,7 @@ void __dummy__(void) OFFSET(VCPU_processor, struct vcpu, processor); OFFSET(VCPU_domain, struct vcpu, domain); - OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info); + OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map); OFFSET(VCPU_trap_bounce, struct vcpu, arch.pv.trap_bounce); OFFSET(VCPU_thread_flags, struct vcpu, arch.flags); OFFSET(VCPU_event_addr, struct vcpu, arch.pv.event_callback_eip); --- a/xen/arch/x86/x86_64/traps.c +++ b/xen/arch/x86/x86_64/traps.c @@ -97,7 +97,7 @@ static void _show_registers( if ( context == CTXT_hypervisor ) printk(" %pS", _p(regs->rip)); printk("\nRFLAGS: %016lx ", regs->rflags); - if ( (context == CTXT_pv_guest) && v && v->vcpu_info ) + if ( (context == CTXT_pv_guest) && v && v->vcpu_info_area.map ) printk("EM: %d ", !!vcpu_info(v, evtchn_upcall_mask)); printk("CONTEXT: %s", context_names[context]); if ( v && !is_idle_vcpu(v) ) --- a/xen/common/compat/domain.c +++ b/xen/common/compat/domain.c @@ -49,7 +49,7 @@ int compat_common_vcpu_op(int cmd, struc { case VCPUOP_initialise: { - if ( v->vcpu_info == &dummy_vcpu_info ) + if ( v->vcpu_info_area.map == &dummy_vcpu_info ) return -EINVAL; #ifdef CONFIG_HVM --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -127,10 +127,10 @@ static void vcpu_info_reset(struct vcpu { struct domain *d = v->domain; - v->vcpu_info = ((v->vcpu_id < XEN_LEGACY_MAX_VCPUS) - ? (vcpu_info_t *)&shared_info(d, vcpu_info[v->vcpu_id]) - : &dummy_vcpu_info); - v->vcpu_info_mfn = INVALID_MFN; + v->vcpu_info_area.map = + ((v->vcpu_id < XEN_LEGACY_MAX_VCPUS) + ? (vcpu_info_t *)&shared_info(d, vcpu_info[v->vcpu_id]) + : &dummy_vcpu_info); } static void vmtrace_free_buffer(struct vcpu *v) @@ -951,7 +951,7 @@ int domain_kill(struct domain *d) return -ERESTART; for_each_vcpu ( d, v ) { - unmap_vcpu_info(v); + unmap_guest_area(v, &v->vcpu_info_area); unmap_guest_area(v, &v->runstate_guest_area); } d->is_dying = DOMDYING_dead; @@ -1406,7 +1406,7 @@ int domain_soft_reset(struct domain *d, for_each_vcpu ( d, v ) { set_xen_guest_handle(runstate_guest(v), NULL); - unmap_vcpu_info(v); + unmap_guest_area(v, &v->vcpu_info_area); unmap_guest_area(v, &v->runstate_guest_area); } @@ -1454,111 +1454,6 @@ int vcpu_reset(struct vcpu *v) return rc; } -/* - * Map a guest page in and point the vcpu_info pointer at it. This - * makes sure that the vcpu_info is always pointing at a valid piece - * of memory, and it sets a pending event to make sure that a pending - * event doesn't get missed. - */ -int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned int offset) -{ - struct domain *d = v->domain; - void *mapping; - vcpu_info_t *new_info; - struct page_info *page; - unsigned int align; - - if ( offset > (PAGE_SIZE - sizeof(*new_info)) ) - return -ENXIO; - -#ifdef CONFIG_COMPAT - BUILD_BUG_ON(sizeof(*new_info) != sizeof(new_info->compat)); - if ( has_32bit_shinfo(d) ) - align = alignof(new_info->compat); - else -#endif - align = alignof(*new_info); - if ( offset & (align - 1) ) - return -ENXIO; - - if ( !mfn_eq(v->vcpu_info_mfn, INVALID_MFN) ) - return -EINVAL; - - /* Run this command on yourself or on other offline VCPUS. */ - if ( (v != current) && !(v->pause_flags & VPF_down) ) - return -EINVAL; - - page = get_page_from_gfn(d, gfn, NULL, P2M_UNSHARE); - if ( !page ) - return -EINVAL; - - if ( !get_page_type(page, PGT_writable_page) ) - { - put_page(page); - return -EINVAL; - } - - mapping = __map_domain_page_global(page); - if ( mapping == NULL ) - { - put_page_and_type(page); - return -ENOMEM; - } - - new_info = (vcpu_info_t *)(mapping + offset); - - if ( v->vcpu_info == &dummy_vcpu_info ) - { - memset(new_info, 0, sizeof(*new_info)); -#ifdef XEN_HAVE_PV_UPCALL_MASK - __vcpu_info(v, new_info, evtchn_upcall_mask) = 1; -#endif - } - else - { - memcpy(new_info, v->vcpu_info, sizeof(*new_info)); - } - - v->vcpu_info = new_info; - v->vcpu_info_mfn = page_to_mfn(page); - - /* Set new vcpu_info pointer /before/ setting pending flags. */ - smp_wmb(); - - /* - * Mark everything as being pending just to make sure nothing gets - * lost. The domain will get a spurious event, but it can cope. - */ -#ifdef CONFIG_COMPAT - if ( !has_32bit_shinfo(d) ) - write_atomic(&new_info->native.evtchn_pending_sel, ~0); - else -#endif - write_atomic(&vcpu_info(v, evtchn_pending_sel), ~0); - vcpu_mark_events_pending(v); - - return 0; -} - -/* - * Unmap the vcpu info page if the guest decided to place it somewhere - * else. This is used from domain_kill() and domain_soft_reset(). - */ -void unmap_vcpu_info(struct vcpu *v) -{ - mfn_t mfn = v->vcpu_info_mfn; - - if ( mfn_eq(mfn, INVALID_MFN) ) - return; - - unmap_domain_page_global((void *) - ((unsigned long)v->vcpu_info & PAGE_MASK)); - - vcpu_info_reset(v); /* NB: Clobbers v->vcpu_info_mfn */ - - put_page_and_type(mfn_to_page(mfn)); -} - int map_guest_area(struct vcpu *v, paddr_t gaddr, unsigned int size, struct guest_area *area, void (*populate)(void *dst, struct vcpu *v)) @@ -1628,6 +1523,30 @@ int map_guest_area(struct vcpu *v, paddr domain_unlock(currd); + /* Set pending flags /after/ new vcpu_info pointer was set. */ + if ( area == &v->vcpu_info_area ) + { + /* + * Mark everything as being pending just to make sure nothing gets + * lost. The domain will get a spurious event, but it can cope. + */ +#ifdef CONFIG_COMPAT + if ( !has_32bit_shinfo(currd) ) + { + vcpu_info_t *info = area->map; + + /* For VCPUOP_register_vcpu_info handling in common_vcpu_op(). */ + BUILD_BUG_ON(sizeof(*info) != sizeof(info->compat)); + write_atomic(&info->native.evtchn_pending_sel, ~0); + } + else +#endif + write_atomic(&vcpu_info(v, evtchn_pending_sel), ~0); + vcpu_mark_events_pending(v); + + force_update_vcpu_system_time(v); + } + if ( v != current ) vcpu_unpause(v); @@ -1654,7 +1573,10 @@ void unmap_guest_area(struct vcpu *v, st domain_lock(d); map = area->map; - area->map = NULL; + if ( area == &v->vcpu_info_area ) + vcpu_info_reset(v); + else + area->map = NULL; pg = area->pg; area->pg = NULL; domain_unlock(d); @@ -1789,6 +1711,27 @@ bool update_runstate_area(struct vcpu *v return rc; } +/* + * This makes sure that the vcpu_info is always pointing at a valid piece of + * memory, and it sets a pending event to make sure that a pending event + * doesn't get missed. + */ +static void cf_check +vcpu_info_populate(void *map, struct vcpu *v) +{ + vcpu_info_t *info = map; + + if ( v->vcpu_info_area.map == &dummy_vcpu_info ) + { + memset(info, 0, sizeof(*info)); +#ifdef XEN_HAVE_PV_UPCALL_MASK + __vcpu_info(v, info, evtchn_upcall_mask) = 1; +#endif + } + else + memcpy(info, v->vcpu_info_area.map, sizeof(*info)); +} + static void cf_check runstate_area_populate(void *map, struct vcpu *v) { @@ -1818,7 +1761,7 @@ long common_vcpu_op(int cmd, struct vcpu switch ( cmd ) { case VCPUOP_initialise: - if ( v->vcpu_info == &dummy_vcpu_info ) + if ( v->vcpu_info_area.map == &dummy_vcpu_info ) return -EINVAL; rc = arch_initialise_vcpu(v, arg); @@ -1942,16 +1885,30 @@ long common_vcpu_op(int cmd, struct vcpu case VCPUOP_register_vcpu_info: { struct vcpu_register_vcpu_info info; + paddr_t gaddr; rc = -EFAULT; if ( copy_from_guest(&info, arg, 1) ) break; - domain_lock(d); - rc = map_vcpu_info(v, info.mfn, info.offset); - domain_unlock(d); + rc = -EINVAL; + gaddr = gfn_to_gaddr(_gfn(info.mfn)) + info.offset; + if ( !gaddr || + gfn_x(gaddr_to_gfn(gaddr)) != info.mfn || + /* + * Technically re-registration is okay, but it wasn't allowed + * originally. By trying to win a race, a guest might be able + * to bypass this restriction. + */ + v->vcpu_info_area.pg ) + break; - force_update_vcpu_system_time(v); + /* See the BUILD_BUG_ON() in vcpu_info_populate(). */ + rc = map_guest_area(v, gaddr, sizeof(vcpu_info_t), + &v->vcpu_info_area, vcpu_info_populate); + if ( rc == -ERESTART ) + rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih", + cmd, vcpuid, arg); break; } --- a/xen/include/xen/domain.h +++ b/xen/include/xen/domain.h @@ -79,9 +79,6 @@ void cf_check free_pirq_struct(void *); int arch_vcpu_create(struct vcpu *v); void arch_vcpu_destroy(struct vcpu *v); -int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned int offset); -void unmap_vcpu_info(struct vcpu *v); - int map_guest_area(struct vcpu *v, paddr_t gaddr, unsigned int size, struct guest_area *area, void (*populate)(void *dst, struct vcpu *v)); --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -175,7 +175,7 @@ struct vcpu int processor; - vcpu_info_t *vcpu_info; + struct guest_area vcpu_info_area; struct domain *domain; @@ -288,9 +288,6 @@ struct vcpu struct waitqueue_vcpu *waitqueue_vcpu; - /* Guest-specified relocation of vcpu_info. */ - mfn_t vcpu_info_mfn; - struct evtchn_fifo_vcpu *evtchn_fifo; /* vPCI per-vCPU area, used to store data for long running operations. */ --- a/xen/include/xen/shared.h +++ b/xen/include/xen/shared.h @@ -44,6 +44,7 @@ typedef struct vcpu_info vcpu_info_t; extern vcpu_info_t dummy_vcpu_info; #define shared_info(d, field) __shared_info(d, (d)->shared_info, field) -#define vcpu_info(v, field) __vcpu_info(v, (v)->vcpu_info, field) +#define vcpu_info(v, field) \ + __vcpu_info(v, (vcpu_info_t *)(v)->vcpu_info_area.map, field) #endif /* __XEN_SHARED_H__ */

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.