Re: [Xen-devel] [PATCH v3] x86: use VMLOAD for PV context switch
On 18/09/18 13:28, Jan Beulich wrote:
> Besides the mentioned oddity with measured performance, I've also
> noticed a significant difference (of at least 150 clocks) between
> measuring immediately around the calls to svm_load_segs() and measuring
> immediately inside the function.

This is a little concerning.  It either means that there is a bug with your
timing, or (along the same line as why I think the prefetch makes a
difference), the mapping of svm_load_segs() is reliably TLB-cold in the
context switch path.
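To illustrate the distinction, here is a minimal user-space sketch of the two
measurement placements (purely illustrative; this is not the instrumentation
behind the numbers above, and the helper names are made up).  The caller-side
number additionally captures the call/return and any instruction TLB or cache
miss taken on entry to a cold function, which an in-function measurement
cannot see:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    static inline uint64_t tsc(void)
    {
        uint32_t lo, hi;

        asm volatile ( "rdtsc" : "=a" (lo), "=d" (hi) );
        return ((uint64_t)hi << 32) | lo;
    }

    static uint64_t inner;

    /* Stand-in for svm_load_segs(); noinline so the call itself is real. */
    static void __attribute__((noinline)) callee(void)
    {
        uint64_t t = tsc();

        asm volatile ( "" ::: "memory" );  /* placeholder for the real body */
        inner = tsc() - t;
    }

    int main(void)
    {
        uint64_t t = tsc(), outer;

        callee();
        outer = tsc() - t;

        /* outer - inner is the overhead invisible to in-function timing. */
        printf("outer %" PRIu64 ", inner %" PRIu64 ", delta %" PRIu64 "\n",
               outer, inner, outer - inner);

        return 0;
    }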
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -52,6 +52,7 @@
>  #include <asm/hvm/hvm.h>
>  #include <asm/hvm/nestedhvm.h>
>  #include <asm/hvm/support.h>
> +#include <asm/hvm/svm/svm.h>
>  #include <asm/hvm/viridian.h>
>  #include <asm/debugreg.h>
>  #include <asm/msr.h>
> @@ -1281,11 +1282,35 @@ static void load_segments(struct vcpu *n
>      struct cpu_user_regs *uregs = &n->arch.user_regs;
>      int all_segs_okay = 1;
>      unsigned int dirty_segment_mask, cpu = smp_processor_id();
> +    bool fs_gs_done = false;
>
>      /* Load and clear the dirty segment mask. */
>      dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
>      per_cpu(dirty_segment_mask, cpu) = 0;
>
> +#ifdef CONFIG_HVM
> +    if ( !is_pv_32bit_vcpu(n) && !cpu_has_fsgsbase && cpu_has_svm &&
> +         !((uregs->fs | uregs->gs) & ~3) &&
> +         /*
> +          * The remaining part is just for optimization: If only shadow GS
> +          * needs loading, there's nothing to be gained here.

VMLOAD also loads LDT, and LLDT is fully serialising, so an even heavier perf
hit than wrmsr.

> +          */
> +         (n->arch.pv.fs_base | n->arch.pv.gs_base_user) )
> +    {
> +        fs_gs_done = n->arch.flags & TF_kernel_mode
> +            ? svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
> +                            uregs->fs, n->arch.pv.fs_base,
> +                            uregs->gs, n->arch.pv.gs_base_kernel,
> +                            n->arch.pv.gs_base_user)
> +            : svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
> +                            uregs->fs, n->arch.pv.fs_base,
> +                            uregs->gs, n->arch.pv.gs_base_user,
> +                            n->arch.pv.gs_base_kernel);

This looks like a confusing way of writing:

    {
        unsigned long gsb = n->arch.flags & TF_kernel_mode
            ? n->arch.pv.gs_base_kernel : n->arch.pv.gs_base_user;
        unsigned long gss = n->arch.flags & TF_kernel_mode
            ? n->arch.pv.gs_base_user : n->arch.pv.gs_base_kernel;

        fs_gs_done = svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
                                   uregs->fs, n->arch.pv.fs_base,
                                   uregs->gs, gsb, gss);
    }

AFAICT?

> +    }
> +#endif
> +    if ( !fs_gs_done )
> +        load_LDT(n);
> +
>      /* Either selector != 0 ==> reload. */
>      if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) )
>      {
> @@ -1301,7 +1326,7 @@ static void load_segments(struct vcpu *n
>      }
>
>      /* Either selector != 0 ==> reload. */
> -    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) )
> +    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) &&
> +         !fs_gs_done )
>      {
>          all_segs_okay &= loadsegment(fs, uregs->fs);
>          /* non-nul selector updates fs_base */
> @@ -1310,7 +1335,7 @@ static void load_segments(struct vcpu *n
>      }
>
>      /* Either selector != 0 ==> reload. */
> -    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) )
> +    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) &&
> +          !fs_gs_done )

One too many spaces.

>      {
>          all_segs_okay &= loadsegment(gs, uregs->gs);
>          /* non-nul selector updates gs_base_user */
> @@ -1318,7 +1343,7 @@ static void load_segments(struct vcpu *n
>              dirty_segment_mask &= ~DIRTY_GS_BASE;
>      }
>
> -    if ( !is_pv_32bit_vcpu(n) )
> +    if ( !fs_gs_done && !is_pv_32bit_vcpu(n) )
>      {
>          /* This can only be non-zero if selector is NULL. */
>          if ( n->arch.pv.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) )
> @@ -1653,6 +1678,12 @@ static void __context_switch(void)
>
>      write_ptbase(n);
>
> +#if defined(CONFIG_PV) && defined(CONFIG_HVM)

From a comments in code point of view, this is the most useful location to
have something along the lines of:

/* Prefetch the VMCB if we expect to use it later in the context switch */

> +    if ( is_pv_domain(nd) && !is_pv_32bit_domain(nd) && !is_idle_domain(nd) &&
> +         !cpu_has_fsgsbase && cpu_has_svm )
> +        svm_load_segs(0, 0, 0, 0, 0, 0, 0);
> +#endif
> +
>      if ( need_full_gdt(nd) &&
>           ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
>      {
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -1630,6 +1646,66 @@ static void svm_init_erratum_383(const s
>      }
>  }
>
> +#ifdef CONFIG_PV
> +bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
> +                   unsigned int fs_sel, unsigned long fs_base,
> +                   unsigned int gs_sel, unsigned long gs_base,
> +                   unsigned long gs_shadow)
> +{
> +    unsigned int cpu = smp_processor_id();
> +    struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
> +
> +    if ( unlikely(!vmcb) )
> +        return false;

When can this error path ever be taken?

> +
> +    if ( !ldt_base )
> +    {
> +        /*
> +         * The actual structure field used here was arbitrarily chosen.
> +         * Empirically it doesn't seem to matter much which element is used,
> +         * and a clear explanation of the otherwise poor performance has not
> +         * been found/provided so far.
> +         */
> +        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );

prefetchw(), which already exists and is used.

~Andrew

> +        return true;
> +    }
> +
> +    if ( likely(!ldt_ents) )
> +        memset(&vmcb->ldtr, 0, sizeof(vmcb->ldtr));
> +    else
> +    {
> +        /* Keep GDT in sync. */
> +        struct desc_struct *desc = this_cpu(gdt_table) + LDT_ENTRY -
> +                                   FIRST_RESERVED_GDT_ENTRY;
> +
> +        _set_tssldt_desc(desc, ldt_base, ldt_ents * 8 - 1, SYS_DESC_ldt);
> +
> +        vmcb->ldtr.sel = LDT_ENTRY << 3;
> +        vmcb->ldtr.attr = SYS_DESC_ldt | (_SEGMENT_P >> 8);
> +        vmcb->ldtr.limit = ldt_ents * 8 - 1;
> +        vmcb->ldtr.base = ldt_base;
> +    }
> +
> +    ASSERT(!(fs_sel & ~3));
> +    vmcb->fs.sel = fs_sel;
> +    vmcb->fs.attr = 0;
> +    vmcb->fs.limit = 0;
> +    vmcb->fs.base = fs_base;
> +
> +    ASSERT(!(gs_sel & ~3));
> +    vmcb->gs.sel = gs_sel;
> +    vmcb->gs.attr = 0;
> +    vmcb->gs.limit = 0;
> +    vmcb->gs.base = gs_base;
> +
> +    vmcb->kerngsbase = gs_shadow;
> +
> +    svm_vmload_pa(per_cpu(host_vmcb, cpu));
> +
> +    return true;
> +}
> +#endif
> +
>  static int _svm_cpu_up(bool bsp)
>  {
>      uint64_t msr_content;
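On the prefetchw() remark above: presumably the suggestion amounts to
something like the following for the !ldt_base path (a sketch only; it
assumes Xen's prefetchw() helper takes a pointer, as the Linux-style helper
does).  The question of which VMCB field to prefetch is unchanged; only the
open-coded asm goes away:

    if ( !ldt_base )
    {
        /* Same arbitrarily chosen field, but via the existing helper. */
        prefetchw(&vmcb->ldtr);
        return true;
    }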
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel