[Xen-changelog] Fix context switching race which could cause vcpu_pause()



# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 027812e4a63cde88a0cc03a3a83d40325f4e34f8
# Parent  26c03c17c418ba106ebda01502713da2fc9d28c6
Fix a context-switching race which could cause vcpu_pause()
to fail to do its job properly. Fixing this requires moving
the clearing of VCPUF_running so that it is protected by the
scheduler lock.

Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
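
To make the locking change concrete, here is a minimal, self-contained
model of the ordering this patch establishes. It is a sketch only: the
toy_* names are invented for the illustration, a pthread mutex stands in
for schedule_data[cpu].schedule_lock, and C11 atomics stand in for the
VCPUF_running bit operations.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct toy_vcpu {
    atomic_bool running;          /* stands in for _VCPUF_running      */
    bool        state_saved;      /* "context committed to memory"     */
};

static pthread_mutex_t toy_schedule_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Scheduler side: with this patch, clearing the outgoing VCPU's running
 * flag, setting the incoming one's, and switching the old context out
 * all happen while the per-CPU scheduler lock is held.
 */
static void toy_deschedule(struct toy_vcpu *prev, struct toy_vcpu *next)
{
    pthread_mutex_lock(&toy_schedule_lock);

    atomic_store(&prev->running, false);   /* previously done after unlock */
    atomic_store(&next->running, true);

    /* context_switch(): stop using prev's state and save it (or leave it
     * recoverable by a lazy flush) before the lock is dropped. */
    prev->state_saved = true;

    pthread_mutex_unlock(&toy_schedule_lock);

    /* context_switch_finalise(next) runs here, with interrupts enabled. */
}

/*
 * Pauser side, the rough shape of what vcpu_pause() relies on: spin until
 * the target is no longer marked running.  The real code can then also
 * force a lazy state flush on the target's CPU.
 */
static void toy_wait_descheduled(struct toy_vcpu *v)
{
    while (atomic_load(&v->running))
        ;   /* cpu_relax() in the real code */
}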

diff -r 26c03c17c418 -r 027812e4a63c xen/arch/ia64/xenmisc.c
--- a/xen/arch/ia64/xenmisc.c   Tue Aug 16 17:02:49 2005
+++ b/xen/arch/ia64/xenmisc.c   Tue Aug 16 18:02:24 2005
@@ -280,7 +280,6 @@
 
 unsigned long context_switch_count = 0;
 
-// context_switch
 void context_switch(struct vcpu *prev, struct vcpu *next)
 {
 //printk("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n");
@@ -290,22 +289,14 @@
 //if (prev->domain->domain_id == 0 && next->domain->domain_id == 1) cs01foo();
 //printk("@@sw %d->%d\n",prev->domain->domain_id,next->domain->domain_id);
 #ifdef CONFIG_VTI
-       unsigned long psr;
-       /* Interrupt is enabled after next task is chosen.
-        * So we have to disable it for stack switch.
-        */
-       local_irq_save(psr);
        vtm_domain_out(prev);
-       /* Housekeeping for prev domain */
-#endif // CONFIG_VTI
-
+#endif
        context_switch_count++;
        switch_to(prev,next,prev);
 #ifdef CONFIG_VTI
-       /* Post-setup for new domain */
         vtm_domain_in(current);
-       local_irq_restore(psr);
-#endif // CONFIG_VTI
+#endif
+
 // leave this debug for now: it acts as a heartbeat when more than
 // one domain is active
 {
@@ -315,25 +306,27 @@
 if (!cnt[id]--) { printk("%x",id); cnt[id] = 500000; }
 if (!i--) { printk("+",id); i = 1000000; }
 }
-       clear_bit(_VCPUF_running, &prev->vcpu_flags);
-       //if (!is_idle_task(next->domain) )
-               //send_guest_virq(next, VIRQ_TIMER);
+
 #ifdef CONFIG_VTI
        if (VMX_DOMAIN(current))
                vmx_load_all_rr(current);
-       return;
-#else // CONFIG_VTI
+#else
        if (!is_idle_task(current->domain)) {
                load_region_regs(current);
                if (vcpu_timer_expired(current)) vcpu_pend_timer(current);
        }
        if (vcpu_timer_expired(current)) vcpu_pend_timer(current);
-#endif // CONFIG_VTI
+#endif
+}
+
+void context_switch_finalise(struct vcpu *next)
+{
+       /* nothing to do */
 }
 
 void continue_running(struct vcpu *same)
 {
-    /* nothing to do */
+       /* nothing to do */
 }
 
 void panic_domain(struct pt_regs *regs, const char *fmt, ...)
diff -r 26c03c17c418 -r 027812e4a63c xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Tue Aug 16 17:02:49 2005
+++ b/xen/arch/x86/domain.c     Tue Aug 16 18:02:24 2005
@@ -48,6 +48,8 @@
 
 struct percpu_ctxt {
     struct vcpu *curr_vcpu;
+    unsigned int context_not_finalised;
+    unsigned int dirty_segment_mask;
 } __cacheline_aligned;
 static struct percpu_ctxt percpu_ctxt[NR_CPUS];
 
@@ -541,51 +543,59 @@
     __r; })
 
 #if CONFIG_VMX
-#define load_msrs(_p, _n)     if (vmx_switch_on) vmx_load_msrs((_p), (_n))
+#define load_msrs(n)     if (vmx_switch_on) vmx_load_msrs(n)
 #else
-#define load_msrs(_p, _n)     ((void)0)
+#define load_msrs(n)     ((void)0)
 #endif 
 
-static void load_segments(struct vcpu *p, struct vcpu *n)
-{
-    struct vcpu_guest_context *pctxt = &p->arch.guest_context;
+/*
+ * save_segments() writes a mask of segments which are dirty (non-zero),
+ * allowing load_segments() to avoid some expensive segment loads and
+ * MSR writes.
+ */
+#define DIRTY_DS           0x01
+#define DIRTY_ES           0x02
+#define DIRTY_FS           0x04
+#define DIRTY_GS           0x08
+#define DIRTY_FS_BASE      0x10
+#define DIRTY_GS_BASE_USER 0x20
+
+static void load_segments(struct vcpu *n)
+{
     struct vcpu_guest_context *nctxt = &n->arch.guest_context;
     int all_segs_okay = 1;
+    unsigned int dirty_segment_mask, cpu = smp_processor_id();
+
+    /* Load and clear the dirty segment mask. */
+    dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
+    percpu_ctxt[cpu].dirty_segment_mask = 0;
 
     /* Either selector != 0 ==> reload. */
-    if ( unlikely(pctxt->user_regs.ds | nctxt->user_regs.ds) )
+    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
         all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
 
     /* Either selector != 0 ==> reload. */
-    if ( unlikely(pctxt->user_regs.es | nctxt->user_regs.es) )
+    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
         all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
 
     /*
      * Either selector != 0 ==> reload.
      * Also reload to reset FS_BASE if it was non-zero.
      */
-    if ( unlikely(pctxt->user_regs.fs |
-                  pctxt->fs_base |
+    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                   nctxt->user_regs.fs) )
-    {
         all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
-        if ( pctxt->user_regs.fs ) /* != 0 selector kills fs_base */
-            pctxt->fs_base = 0;
-    }
 
     /*
      * Either selector != 0 ==> reload.
      * Also reload to reset GS_BASE if it was non-zero.
      */
-    if ( unlikely(pctxt->user_regs.gs |
-                  pctxt->gs_base_user |
+    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                   nctxt->user_regs.gs) )
     {
         /* Reset GS_BASE with user %gs? */
-        if ( pctxt->user_regs.gs || !nctxt->gs_base_user )
+        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
             all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
-        if ( pctxt->user_regs.gs ) /* != 0 selector kills gs_base_user */
-            pctxt->gs_base_user = 0;
     }
 
     /* This can only be non-zero if selector is NULL. */
@@ -650,7 +660,9 @@
 
 static void save_segments(struct vcpu *v)
 {
-    struct cpu_user_regs *regs = &v->arch.guest_context.user_regs;
+    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
+    struct cpu_user_regs      *regs = &ctxt->user_regs;
+    unsigned int dirty_segment_mask = 0;
 
     if ( VMX_DOMAIN(v) )
         rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs);
@@ -659,18 +671,34 @@
     __asm__ __volatile__ ( "movl %%es,%0" : "=m" (regs->es) );
     __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (regs->fs) );
     __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (regs->gs) );
-}
-
-static void clear_segments(void)
-{
-    __asm__ __volatile__ (
-        " movl %0,%%ds; "
-        " movl %0,%%es; "
-        " movl %0,%%fs; "
-        " movl %0,%%gs; "
-        ""safe_swapgs"  "
-        " movl %0,%%gs"
-        : : "r" (0) );
+
+    if ( regs->ds )
+        dirty_segment_mask |= DIRTY_DS;
+
+    if ( regs->es )
+        dirty_segment_mask |= DIRTY_ES;
+
+    if ( regs->fs )
+    {
+        dirty_segment_mask |= DIRTY_FS;
+        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
+    }
+    else if ( ctxt->fs_base )
+    {
+        dirty_segment_mask |= DIRTY_FS_BASE;
+    }
+
+    if ( regs->gs )
+    {
+        dirty_segment_mask |= DIRTY_GS;
+        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
+    }
+    else if ( ctxt->gs_base_user )
+    {
+        dirty_segment_mask |= DIRTY_GS_BASE_USER;
+    }
+
+    percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
 }
 
 long do_switch_to_user(void)
@@ -706,10 +734,9 @@
 
 #elif defined(__i386__)
 
-#define load_segments(_p, _n) ((void)0)
-#define load_msrs(_p, _n)     ((void)0)
-#define save_segments(_p)     ((void)0)
-#define clear_segments()      ((void)0)
+#define load_segments(n) ((void)0)
+#define load_msrs(n)     ((void)0)
+#define save_segments(p) ((void)0)
 
 static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
 {
@@ -726,9 +753,9 @@
 static void __context_switch(void)
 {
     struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
-    unsigned int         cpu = smp_processor_id();
-    struct vcpu  *p = percpu_ctxt[cpu].curr_vcpu;
-    struct vcpu  *n = current;
+    unsigned int          cpu = smp_processor_id();
+    struct vcpu          *p = percpu_ctxt[cpu].curr_vcpu;
+    struct vcpu          *n = current;
 
     if ( !is_idle_task(p->domain) )
     {
@@ -786,23 +813,27 @@
 
 void context_switch(struct vcpu *prev, struct vcpu *next)
 {
-    struct vcpu *realprev;
-
-    local_irq_disable();
+    unsigned int cpu = smp_processor_id();
 
     set_current(next);
 
-    if ( ((realprev = percpu_ctxt[smp_processor_id()].curr_vcpu) == next) || 
-         is_idle_task(next->domain) )
-    {
-        local_irq_enable();
-    }
-    else
+    if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) )
     {
         __context_switch();
-
-        local_irq_enable();
-        
+        percpu_ctxt[cpu].context_not_finalised = 1;
+    }
+}
+
+void context_switch_finalise(struct vcpu *next)
+{
+    unsigned int cpu = smp_processor_id();
+
+    if ( percpu_ctxt[cpu].context_not_finalised )
+    {
+        percpu_ctxt[cpu].context_not_finalised = 0;
+
+        BUG_ON(percpu_ctxt[cpu].curr_vcpu != next);
+
         if ( VMX_DOMAIN(next) )
         {
             vmx_restore_msrs(next);
@@ -810,18 +841,10 @@
         else
         {
             load_LDT(next);
-            load_segments(realprev, next);
-            load_msrs(realprev, next);
-        }
-    }
-
-    /*
-     * We do this late on because it doesn't need to be protected by the
-     * schedule_lock, and because we want this to be the very last use of
-     * 'prev' (after this point, a dying domain's info structure may be freed
-     * without warning). 
-     */
-    clear_bit(_VCPUF_running, &prev->vcpu_flags);
+            load_segments(next);
+            load_msrs(next);
+        }
+    }
 
     schedule_tail(next);
     BUG();
@@ -835,12 +858,19 @@
 
 int __sync_lazy_execstate(void)
 {
-    if ( percpu_ctxt[smp_processor_id()].curr_vcpu == current )
-        return 0;
-    __context_switch();
-    load_LDT(current);
-    clear_segments();
-    return 1;
+    unsigned long flags;
+    int switch_required;
+
+    local_irq_save(flags);
+
+    switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);
+
+    if ( switch_required )
+        __context_switch();
+
+    local_irq_restore(flags);
+
+    return switch_required;
 }
 
 void sync_lazy_execstate_cpu(unsigned int cpu)
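
The dirty-segment-mask change above is an instance of a general pattern:
record at save time which lazily-maintained registers hold non-default
values, and at load time skip the expensive reload unless either the
outgoing context dirtied the register or the incoming context needs a
non-zero value. A standalone sketch of the pattern follows; the DIRTY_A/
DIRTY_B bits, reg_file and toy_* helpers are invented for the
illustration and are not the Xen definitions.

#include <stdint.h>
#include <stdio.h>

#define DIRTY_A 0x1
#define DIRTY_B 0x2

struct reg_file { uint16_t a, b; };     /* stand-ins for %ds, %es, ... */

static unsigned int toy_dirty_mask;     /* per-CPU in the real code    */

static void toy_save(const struct reg_file *outgoing)
{
    unsigned int mask = 0;

    if (outgoing->a) mask |= DIRTY_A;   /* non-zero value left behind  */
    if (outgoing->b) mask |= DIRTY_B;

    toy_dirty_mask = mask;
}

static void toy_load(const struct reg_file *incoming)
{
    unsigned int mask = toy_dirty_mask;

    toy_dirty_mask = 0;

    /* Reload only when the old context dirtied the register or the new
     * context needs a non-zero value; otherwise skip the costly write. */
    if ((mask & DIRTY_A) | incoming->a)
        printf("reload A = %#x\n", (unsigned)incoming->a);
    if ((mask & DIRTY_B) | incoming->b)
        printf("reload B = %#x\n", (unsigned)incoming->b);
}

int main(void)
{
    struct reg_file ctx1 = { .a = 0x23, .b = 0 };
    struct reg_file ctx2 = { 0, 0 };

    toy_save(&ctx1);    /* ctx1 left A dirty                   */
    toy_load(&ctx2);    /* A is reloaded (reset); B is skipped */
    return 0;
}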
diff -r 26c03c17c418 -r 027812e4a63c xen/arch/x86/vmx.c
--- a/xen/arch/x86/vmx.c        Tue Aug 16 17:02:49 2005
+++ b/xen/arch/x86/vmx.c        Tue Aug 16 18:02:24 2005
@@ -65,7 +65,7 @@
  * are not modified once set for generic domains, we don't save them, 
  * but simply reset them to the values set at percpu_traps_init().
  */
-void vmx_load_msrs(struct vcpu *p, struct vcpu *n)
+void vmx_load_msrs(struct vcpu *n)
 {
     struct msr_state *host_state;
     host_state = &percpu_msr[smp_processor_id()];
diff -r 26c03c17c418 -r 027812e4a63c xen/common/schedule.c
--- a/xen/common/schedule.c     Tue Aug 16 17:02:49 2005
+++ b/xen/common/schedule.c     Tue Aug 16 18:02:24 2005
@@ -474,13 +474,14 @@
 
     set_ac_timer(&schedule_data[cpu].s_timer, now + r_time);
 
-    /* Must be protected by the schedule_lock! */
+    if ( unlikely(prev == next) )
+    {
+        spin_unlock_irq(&schedule_data[cpu].schedule_lock);
+        return continue_running(prev);
+    }
+
+    clear_bit(_VCPUF_running, &prev->vcpu_flags);
     set_bit(_VCPUF_running, &next->vcpu_flags);
-
-    spin_unlock_irq(&schedule_data[cpu].schedule_lock);
-
-    if ( unlikely(prev == next) )
-        return continue_running(prev);
 
     perfc_incrc(sched_ctx);
 
@@ -517,6 +518,10 @@
              next->domain->domain_id, next->vcpu_id);
 
     context_switch(prev, next);
+
+    spin_unlock_irq(&schedule_data[cpu].schedule_lock);
+
+    context_switch_finalise(next);
 }
 
 /* No locking needed -- pointer comparison is safe :-) */
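
Read as a whole, the tail of __enter_scheduler() after this change looks
roughly as follows (condensed from the hunk above; performance counters,
tracing and debug output are omitted):

    /* Still under schedule_data[cpu].schedule_lock, with IRQs disabled. */
    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&schedule_data[cpu].schedule_lock);
        return continue_running(prev);
    }

    /* The running-flag handover now happens inside the locked region ... */
    clear_bit(_VCPUF_running, &prev->vcpu_flags);
    set_bit(_VCPUF_running, &next->vcpu_flags);

    /* ... as does the architectural switch away from prev ... */
    context_switch(prev, next);

    /* ... and only afterwards are the lock and interrupts released ... */
    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    /* ... so that any remaining state (x86 segments, MSRs) can be loaded
     * with interrupts enabled.  The ia64 implementation is a no-op. */
    context_switch_finalise(next);

The window in which prev is no longer flagged as running but its context
has not yet been switched out is therefore covered entirely by the
scheduler lock, with interrupts disabled, as the new comment in
xen/include/xen/sched.h below describes.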
diff -r 26c03c17c418 -r 027812e4a63c xen/include/asm-x86/vmx_vmcs.h
--- a/xen/include/asm-x86/vmx_vmcs.h    Tue Aug 16 17:02:49 2005
+++ b/xen/include/asm-x86/vmx_vmcs.h    Tue Aug 16 18:02:24 2005
@@ -28,10 +28,10 @@
 extern void stop_vmx(void);
 
 #if defined (__x86_64__)
-extern void vmx_load_msrs(struct vcpu *p, struct vcpu *n);
+extern void vmx_load_msrs(struct vcpu *n);
 void vmx_restore_msrs(struct vcpu *d);
 #else
-#define vmx_load_msrs(_p, _n)      ((void)0)
+#define vmx_load_msrs(_n)          ((void)0)
 #define vmx_restore_msrs(_v)       ((void)0)
 #endif
 
diff -r 26c03c17c418 -r 027812e4a63c xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Tue Aug 16 17:02:49 2005
+++ b/xen/include/xen/sched.h   Tue Aug 16 18:02:24 2005
@@ -258,12 +258,32 @@
 extern void sync_lazy_execstate_all(void);
 extern int __sync_lazy_execstate(void);
 
-/* Called by the scheduler to switch to another vcpu. */
+/*
+ * Called by the scheduler to switch to another VCPU. On entry, although
+ * VCPUF_running is no longer asserted for @prev, its context is still running
+ * on the local CPU and is not committed to memory. The local scheduler lock
+ * is therefore still held, and interrupts are disabled, because the local CPU
+ * is in an inconsistent state.
+ * 
+ * The callee must ensure that the local CPU is no longer running in @prev's
+ * context, and that the context is saved to memory, before returning.
+ * Alternatively, if implementing lazy context switching, it suffices to ensure
+ * that invoking __sync_lazy_execstate() will switch and commit @prev's state.
+ */
 extern void context_switch(
     struct vcpu *prev, 
     struct vcpu *next);
 
-/* Called by the scheduler to continue running the current vcpu. */
+/*
+ * On some architectures (notably x86) it is not possible to entirely load
+ * @next's context with interrupts disabled. These may implement a function to
+ * finalise loading the new context after interrupts are re-enabled. This
+ * function is not given @prev and is not permitted to access it.
+ */
+extern void context_switch_finalise(
+    struct vcpu *next);
+
+/* Called by the scheduler to continue running the current VCPU. */
 extern void continue_running(
     struct vcpu *same);
 
