Re: [PATCH v3 8/8] common: convert vCPU info area registration
On Wed, May 03, 2023 at 05:58:30PM +0200, Jan Beulich wrote:
> Switch to using map_guest_area(). Noteworthy differences from
> map_vcpu_info():
> - remote vCPU-s are paused rather than checked for being down (which in
>   principle can change right after the check),
> - the domain lock is taken for a much smaller region,
> - the error code for an attempt to re-register the area is now -EBUSY,
> - we could in principle permit de-registration when no area was
>   previously registered (which would permit "probing", if necessary for
>   anything).
>
> Note that this eliminates a bug in copy_vcpu_settings(): The function
> did allocate a new page regardless of the GFN already having a mapping,
> thus in particular breaking the case of two vCPU-s having their info
> areas on the same page.
>
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>

Some minor comments below:

Acked-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>

> ---
> RFC: I'm not really certain whether the preliminary check (ahead of
>      calling map_guest_area()) is worthwhile to have.
> ---
> v2: Re-base over changes earlier in the series. Properly enforce no re-
>     registration. Avoid several casts by introducing local variables.
>
> --- a/xen/arch/x86/include/asm/shared.h
> +++ b/xen/arch/x86/include/asm/shared.h
> @@ -26,17 +26,20 @@ static inline void arch_set_##field(stru
>  #define GET_SET_VCPU(type, field) \
>  static inline type arch_get_##field(const struct vcpu *v) \
>  { \
> +    const vcpu_info_t *vi = v->vcpu_info_area.map; \
> + \
>      return !has_32bit_shinfo(v->domain) ? \
> -           v->vcpu_info->native.arch.field : \
> -           v->vcpu_info->compat.arch.field; \
> +           vi->native.arch.field : vi->compat.arch.field; \
>  } \
>  static inline void arch_set_##field(struct vcpu *v, \
>                                      type val) \
>  { \
> +    vcpu_info_t *vi = v->vcpu_info_area.map; \
> + \
>      if ( !has_32bit_shinfo(v->domain) ) \
> -        v->vcpu_info->native.arch.field = val; \
> +        vi->native.arch.field = val; \
>      else \
> -        v->vcpu_info->compat.arch.field = val; \
> +        vi->compat.arch.field = val; \
>  }
>
>  #else
> @@ -57,12 +60,16 @@ static inline void arch_set_##field(stru
>  #define GET_SET_VCPU(type, field) \
>  static inline type arch_get_##field(const struct vcpu *v) \
>  { \
> -    return v->vcpu_info->arch.field; \
> +    const vcpu_info_t *vi = v->vcpu_info_area.map; \
> + \
> +    return vi->arch.field; \
>  } \
>  static inline void arch_set_##field(struct vcpu *v, \
>                                      type val) \
>  { \
> -    v->vcpu_info->arch.field = val; \
> +    vcpu_info_t *vi = v->vcpu_info_area.map; \
> + \
> +    vi->arch.field = val; \
>  }
>
>  #endif
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -1749,53 +1749,24 @@ static int copy_vpmu(struct vcpu *d_vcpu
>  static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
>  {
>      unsigned int i;
> -    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
>      int ret = -EINVAL;
>
>      for ( i = 0; i < cd->max_vcpus; i++ )
>      {
>          struct vcpu *d_vcpu = d->vcpu[i];
>          struct vcpu *cd_vcpu = cd->vcpu[i];
> -        mfn_t vcpu_info_mfn;
>
>          if ( !d_vcpu || !cd_vcpu )
>              continue;
>
> -        /* Copy & map in the vcpu_info page if the guest uses one */
> -        vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> -        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> -        {
> -            mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> -
> -            /* Allocate & map the page for it if it hasn't been already */
> -            if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) )
> -            {
> -                gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
> -                unsigned long gfn_l = gfn_x(gfn);
> -                struct page_info *page;
> -
> -                if ( !(page = alloc_domheap_page(cd, 0)) )
> -                    return -ENOMEM;
> -
> -                new_vcpu_info_mfn = page_to_mfn(page);
> -                set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l);
> -
> -                ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn,
> -                                     PAGE_ORDER_4K, p2m_ram_rw,
> -                                     p2m->default_access, -1);
> -                if ( ret )
> -                    return ret;
> -
> -                ret = map_vcpu_info(cd_vcpu, gfn_l,
> -                                    PAGE_OFFSET(d_vcpu->vcpu_info));
> -                if ( ret )
> -                    return ret;
> -            }
> -
> -            copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
> -        }
> -
> -        /* Same for the (physically registered) runstate and time info
> -           areas. */
> +        /*
> +         * Copy and map the vcpu_info page and the (physically registered)
> +         * runstate and time info areas.
> +         */
> +        ret = copy_guest_area(&cd_vcpu->vcpu_info_area,
> +                              &d_vcpu->vcpu_info_area, cd_vcpu, d);
> +        if ( ret )
> +            return ret;
>          ret = copy_guest_area(&cd_vcpu->runstate_guest_area,
>                                &d_vcpu->runstate_guest_area, cd_vcpu, d);
>          if ( ret )
> --- a/xen/arch/x86/pv/shim.c
> +++ b/xen/arch/x86/pv/shim.c
> @@ -383,7 +383,7 @@ int pv_shim_shutdown(uint8_t reason)
>      for_each_vcpu ( d, v )
>      {
>          /* Unmap guest vcpu_info page and runstate/time areas. */
> -        unmap_vcpu_info(v);
> +        unmap_guest_area(v, &v->vcpu_info_area);
>          unmap_guest_area(v, &v->runstate_guest_area);
>          unmap_guest_area(v, &v->arch.time_guest_area);
>
> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -1547,7 +1547,7 @@ static void __update_vcpu_system_time(st
>      struct vcpu_time_info *u = &vcpu_info(v, time), _u;
>      const struct domain *d = v->domain;
>
> -    if ( v->vcpu_info == NULL )
> +    if ( !v->vcpu_info_area.map )
>          return;
>
>      collect_time_info(v, &_u);
> --- a/xen/arch/x86/x86_64/asm-offsets.c
> +++ b/xen/arch/x86/x86_64/asm-offsets.c
> @@ -53,7 +53,7 @@ void __dummy__(void)
>
>      OFFSET(VCPU_processor, struct vcpu, processor);
>      OFFSET(VCPU_domain, struct vcpu, domain);
> -    OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info);
> +    OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map);
>      OFFSET(VCPU_trap_bounce, struct vcpu, arch.pv.trap_bounce);
>      OFFSET(VCPU_thread_flags, struct vcpu, arch.flags);
>      OFFSET(VCPU_event_addr, struct vcpu, arch.pv.event_callback_eip);
> --- a/xen/arch/x86/x86_64/traps.c
> +++ b/xen/arch/x86/x86_64/traps.c
> @@ -96,7 +96,7 @@ static void _show_registers(
>      if ( context == CTXT_hypervisor )
>          printk(" %pS", _p(regs->rip));
>      printk("\nRFLAGS: %016lx ", regs->rflags);
> -    if ( (context == CTXT_pv_guest) && v && v->vcpu_info )
> +    if ( (context == CTXT_pv_guest) && v && v->vcpu_info_area.map )
>          printk("EM: %d ", !!vcpu_info(v, evtchn_upcall_mask));
>      printk("CONTEXT: %s", context_names[context]);
>      if ( v && !is_idle_vcpu(v) )
> --- a/xen/common/compat/domain.c
> +++ b/xen/common/compat/domain.c
> @@ -49,7 +49,7 @@ int compat_common_vcpu_op(int cmd, struc
>      {
>      case VCPUOP_initialise:
>      {
> -        if ( v->vcpu_info == &dummy_vcpu_info )
> +        if ( v->vcpu_info_area.map == &dummy_vcpu_info )
>              return -EINVAL;
>
>  #ifdef CONFIG_HVM
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -127,10 +127,10 @@ static void vcpu_info_reset(struct vcpu
>  {
>      struct domain *d = v->domain;

d could likely be made const?

>
> -    v->vcpu_info = ((v->vcpu_id < XEN_LEGACY_MAX_VCPUS)
> -                    ? (vcpu_info_t *)&shared_info(d, vcpu_info[v->vcpu_id])
> -                    : &dummy_vcpu_info);
> -    v->vcpu_info_mfn = INVALID_MFN;
> +    v->vcpu_info_area.map =
> +        ((v->vcpu_id < XEN_LEGACY_MAX_VCPUS)
> +         ? (vcpu_info_t *)&shared_info(d, vcpu_info[v->vcpu_id])
> +         : &dummy_vcpu_info);
>  }
>
>  static void vmtrace_free_buffer(struct vcpu *v)
> @@ -964,7 +964,7 @@ int domain_kill(struct domain *d)
>              return -ERESTART;
>          for_each_vcpu ( d, v )
>          {
> -            unmap_vcpu_info(v);
> +            unmap_guest_area(v, &v->vcpu_info_area);
>              unmap_guest_area(v, &v->runstate_guest_area);
>          }
>          d->is_dying = DOMDYING_dead;
> @@ -1419,7 +1419,7 @@ int domain_soft_reset(struct domain *d,
>      for_each_vcpu ( d, v )
>      {
>          set_xen_guest_handle(runstate_guest(v), NULL);
> -        unmap_vcpu_info(v);
> +        unmap_guest_area(v, &v->vcpu_info_area);
>          unmap_guest_area(v, &v->runstate_guest_area);
>      }
>
> @@ -1467,111 +1467,6 @@ int vcpu_reset(struct vcpu *v)
>      return rc;
>  }
>
> -/*
> - * Map a guest page in and point the vcpu_info pointer at it. This
> - * makes sure that the vcpu_info is always pointing at a valid piece
> - * of memory, and it sets a pending event to make sure that a pending
> - * event doesn't get missed.
> - */
> -int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned int offset)
> -{
> -    struct domain *d = v->domain;
> -    void *mapping;
> -    vcpu_info_t *new_info;
> -    struct page_info *page;
> -    unsigned int align;
> -
> -    if ( offset > (PAGE_SIZE - sizeof(*new_info)) )
> -        return -ENXIO;
> -
> -#ifdef CONFIG_COMPAT
> -    BUILD_BUG_ON(sizeof(*new_info) != sizeof(new_info->compat));
> -    if ( has_32bit_shinfo(d) )
> -        align = alignof(new_info->compat);
> -    else
> -#endif
> -        align = alignof(*new_info);
> -    if ( offset & (align - 1) )
> -        return -ENXIO;
> -
> -    if ( !mfn_eq(v->vcpu_info_mfn, INVALID_MFN) )
> -        return -EINVAL;
> -
> -    /* Run this command on yourself or on other offline VCPUS. */
> -    if ( (v != current) && !(v->pause_flags & VPF_down) )
> -        return -EINVAL;
> -
> -    page = get_page_from_gfn(d, gfn, NULL, P2M_UNSHARE);
> -    if ( !page )
> -        return -EINVAL;
> -
> -    if ( !get_page_type(page, PGT_writable_page) )
> -    {
> -        put_page(page);
> -        return -EINVAL;
> -    }
> -
> -    mapping = __map_domain_page_global(page);
> -    if ( mapping == NULL )
> -    {
> -        put_page_and_type(page);
> -        return -ENOMEM;
> -    }
> -
> -    new_info = (vcpu_info_t *)(mapping + offset);
> -
> -    if ( v->vcpu_info == &dummy_vcpu_info )
> -    {
> -        memset(new_info, 0, sizeof(*new_info));
> -#ifdef XEN_HAVE_PV_UPCALL_MASK
> -        __vcpu_info(v, new_info, evtchn_upcall_mask) = 1;
> -#endif
> -    }
> -    else
> -    {
> -        memcpy(new_info, v->vcpu_info, sizeof(*new_info));
> -    }
> -
> -    v->vcpu_info = new_info;
> -    v->vcpu_info_mfn = page_to_mfn(page);
> -
> -    /* Set new vcpu_info pointer /before/ setting pending flags. */
> -    smp_wmb();
> -
> -    /*
> -     * Mark everything as being pending just to make sure nothing gets
> -     * lost. The domain will get a spurious event, but it can cope.
> -     */
> -#ifdef CONFIG_COMPAT
> -    if ( !has_32bit_shinfo(d) )
> -        write_atomic(&new_info->native.evtchn_pending_sel, ~0);
> -    else
> -#endif
> -        write_atomic(&vcpu_info(v, evtchn_pending_sel), ~0);
> -    vcpu_mark_events_pending(v);
> -
> -    return 0;
> -}
> -
> -/*
> - * Unmap the vcpu info page if the guest decided to place it somewhere
> - * else. This is used from domain_kill() and domain_soft_reset().
> - */
> -void unmap_vcpu_info(struct vcpu *v)
> -{
> -    mfn_t mfn = v->vcpu_info_mfn;
> -
> -    if ( mfn_eq(mfn, INVALID_MFN) )
> -        return;
> -
> -    unmap_domain_page_global((void *)
> -                             ((unsigned long)v->vcpu_info & PAGE_MASK));
> -
> -    vcpu_info_reset(v); /* NB: Clobbers v->vcpu_info_mfn */
> -
> -    put_page_and_type(mfn_to_page(mfn));
> -}
> -
>  int map_guest_area(struct vcpu *v, paddr_t gaddr, unsigned int size,
>                     struct guest_area *area,
>                     void (*populate)(void *dst, struct vcpu *v))
> @@ -1633,14 +1528,44 @@ int map_guest_area(struct vcpu *v, paddr
>
>      domain_lock(d);
>
> -    if ( map )
> -        populate(map, v);
> +    /* No re-registration of the vCPU info area. */
> +    if ( area != &v->vcpu_info_area || !area->pg )

It would be nice if this check could be done earlier, to avoid having
to fetch and map the page just to discard it.  That would however
require taking the domain lock earlier.
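Just as a rough sketch of what I mean (hypothetical, not part of the patch;
it assumes the domain lock is taken before the page is looked up and mapped
so that area->pg is stable, and that no cleanup is needed yet at that point):

    domain_lock(d);

    /* Refuse re-registration of the vCPU info area before doing any work. */
    if ( area == &v->vcpu_info_area && area->pg )
    {
        domain_unlock(d);
        rc = -EBUSY;
        goto out;               /* hypothetical error path */
    }

    /* ... only then look up and map the target page ... */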
> +    {
> +        if ( map )
> +            populate(map, v);
>
> -    SWAP(area->pg, pg);
> -    SWAP(area->map, map);
> +        SWAP(area->pg, pg);
> +        SWAP(area->map, map);
> +    }
> +    else
> +        rc = -EBUSY;
>
>      domain_unlock(d);
>
> +    /* Set pending flags /after/ new vcpu_info pointer was set. */
> +    if ( area == &v->vcpu_info_area && !rc )
> +    {
> +        /*
> +         * Mark everything as being pending just to make sure nothing gets
> +         * lost. The domain will get a spurious event, but it can cope.
> +         */
> +#ifdef CONFIG_COMPAT
> +        if ( !has_32bit_shinfo(d) )
> +        {
> +            vcpu_info_t *info = area->map;
> +
> +            /* For VCPUOP_register_vcpu_info handling in common_vcpu_op(). */
> +            BUILD_BUG_ON(sizeof(*info) != sizeof(info->compat));
> +            write_atomic(&info->native.evtchn_pending_sel, ~0);
> +        }
> +        else
> +#endif
> +            write_atomic(&vcpu_info(v, evtchn_pending_sel), ~0);

Can't the setting of evtchn_pending_sel be done in vcpu_info_populate()?
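To make the question concrete, a minimal sketch (hypothetical, not part of
the patch) of what the tail of vcpu_info_populate() could look like.  It
would have to write through the passed-in pointer rather than
vcpu_info(v, ...), since the new mapping isn't installed yet when the
populate hook runs, and vcpu_mark_events_pending() plus
force_update_vcpu_system_time() would still need to happen after the swap:

    /*
     * Hypothetical addition at the end of vcpu_info_populate();
     * 'info' is the vcpu_info_t * derived from the 'map' argument,
     * as in the hunk quoted further down.
     */
#ifdef CONFIG_COMPAT
    if ( !has_32bit_shinfo(v->domain) )
        write_atomic(&info->native.evtchn_pending_sel, ~0);
    else
        write_atomic(&info->compat.evtchn_pending_sel, ~0);
#else
    /* Without CONFIG_COMPAT, vcpu_info_t is plain struct vcpu_info. */
    write_atomic(&info->evtchn_pending_sel, ~0);
#endif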
> +        vcpu_mark_events_pending(v);
> +
> +        force_update_vcpu_system_time(v);
> +    }
> +
>      if ( v != current )
>          vcpu_unpause(v);
>
> @@ -1670,7 +1595,10 @@ void unmap_guest_area(struct vcpu *v, st
>
>      domain_lock(d);
>      map = area->map;
> -    area->map = NULL;
> +    if ( area == &v->vcpu_info_area )
> +        vcpu_info_reset(v);
> +    else
> +        area->map = NULL;
>      pg = area->pg;
>      area->pg = NULL;
>      domain_unlock(d);
> @@ -1801,6 +1729,27 @@ bool update_runstate_area(struct vcpu *v
>      return rc;
>  }
>
> +/*
> + * This makes sure that the vcpu_info is always pointing at a valid piece of
> + * memory, and it sets a pending event to make sure that a pending event
> + * doesn't get missed.
> + */
> +static void cf_check
> +vcpu_info_populate(void *map, struct vcpu *v)
> +{
> +    vcpu_info_t *info = map;
> +
> +    if ( v->vcpu_info_area.map == &dummy_vcpu_info )
> +    {
> +        memset(info, 0, sizeof(*info));
> +#ifdef XEN_HAVE_PV_UPCALL_MASK
> +        __vcpu_info(v, info, evtchn_upcall_mask) = 1;
> +#endif

I'm not sure about the point of those guards: this will always be 1, as
we always build the hypervisor with the headers in xen/public?  Is it
to make backports easier?

Thanks, Roger.