
[Xen-changelog] [xen master] x86: use VMLOAD for PV context switch



commit bacb35445d4a8e0359027aafc407e84202cfe4a9
Author:     Jan Beulich <jbeulich@xxxxxxxx>
AuthorDate: Fri Oct 5 16:24:05 2018 +0200
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Fri Oct 5 16:24:05 2018 +0200

    x86: use VMLOAD for PV context switch
    
    Having noticed that VMLOAD alone is about as fast as a single one of
    the involved WRMSRs, I thought it might be a reasonable idea to also
    use it for PV. Measurements, however, have shown that an actual
    improvement can be achieved only with an early prefetch of the VMCB
    (thanks to Andrew for suggesting to try this), which I have to admit I
    can't really explain. This way, on my Fam15 box, a context switch
    takes over 100 clocks less on average (the measured values vary
    heavily in all cases, though).
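    
    For reference, a minimal sketch (not part of the patch itself)
    contrasting the two approaches; the function names and parameters
    below are illustrative, while wrmsrl(), the MSR_* constants, paddr_t
    and svm_vmload_pa() are existing Xen definitions:
    
        /* Per-MSR path: one WRMSR per base, plus the shadow GS base. */
        static void load_bases_wrmsr(unsigned long fs_base,
                                     unsigned long gs_base,
                                     unsigned long gs_shadow)
        {
            wrmsrl(MSR_FS_BASE, fs_base);
            wrmsrl(MSR_GS_BASE, gs_base);
            wrmsrl(MSR_SHADOW_GS_BASE, gs_shadow);
        }
    
        /*
         * VMLOAD path: fill the per-CPU host VMCB once, then pull FS, GS,
         * TR, LDTR, the shadow GS base and the SYSCALL/SYSENTER MSR state
         * in with a single instruction.
         */
        static void load_bases_vmload(paddr_t host_vmcb_pa)
        {
            svm_vmload_pa(host_vmcb_pa);
        }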
    
    This is intentionally not using a new hvm_funcs hook: for one, this is
    all about PV, and something similar can hardly be done for VMX.
    Furthermore, the indirect-to-direct call patching that is meant to be
    applied to most hvm_funcs hooks would be ugly to make work with
    functions having more than 6 parameters.
    
    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    Acked-by: Brian Woods <brian.woods@xxxxxxx>
    Acked-by: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
    Reviewed-by: Wei Liu <wei.liu2@xxxxxxxxxx>
    Reviewed-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
 xen/arch/x86/domain.c             | 40 +++++++++++++++++---
 xen/arch/x86/hvm/svm/svm.c        | 78 +++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/hvm/svm/svm.h |  9 +++++
 3 files changed, 121 insertions(+), 6 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index d67a0478f6..9371efc8c7 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -52,6 +52,7 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/support.h>
+#include <asm/hvm/svm/svm.h>
 #include <asm/hvm/viridian.h>
 #include <asm/debugreg.h>
 #include <asm/msr.h>
@@ -1281,11 +1282,34 @@ static void load_segments(struct vcpu *n)
     struct cpu_user_regs *uregs = &n->arch.user_regs;
     int all_segs_okay = 1;
     unsigned int dirty_segment_mask, cpu = smp_processor_id();
+    bool fs_gs_done = false;
 
     /* Load and clear the dirty segment mask. */
     dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
     per_cpu(dirty_segment_mask, cpu) = 0;
 
+#ifdef CONFIG_HVM
+    if ( !is_pv_32bit_vcpu(n) && !cpu_has_fsgsbase && cpu_has_svm &&
+         !((uregs->fs | uregs->gs) & ~3) &&
+         /*
+          * The remaining part is just for optimization: If only shadow GS
+          * needs loading, there's nothing to be gained here.
+          */
+         (n->arch.pv.fs_base | n->arch.pv.gs_base_user | n->arch.pv.ldt_ents) )
+    {
+        unsigned long gsb = n->arch.flags & TF_kernel_mode
+            ? n->arch.pv.gs_base_kernel : n->arch.pv.gs_base_user;
+        unsigned long gss = n->arch.flags & TF_kernel_mode
+            ? n->arch.pv.gs_base_user : n->arch.pv.gs_base_kernel;
+
+        fs_gs_done = svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
+                                   uregs->fs, n->arch.pv.fs_base,
+                                   uregs->gs, gsb, gss);
+    }
+#endif
+    if ( !fs_gs_done )
+        load_LDT(n);
+
     /* Either selector != 0 ==> reload. */
     if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) )
     {
@@ -1301,7 +1325,7 @@ static void load_segments(struct vcpu *n)
     }
 
     /* Either selector != 0 ==> reload. */
-    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) )
+    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) && !fs_gs_done )
     {
         all_segs_okay &= loadsegment(fs, uregs->fs);
         /* non-nul selector updates fs_base */
@@ -1310,7 +1334,7 @@ static void load_segments(struct vcpu *n)
     }
 
     /* Either selector != 0 ==> reload. */
-    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) )
+    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) && !fs_gs_done )
     {
         all_segs_okay &= loadsegment(gs, uregs->gs);
         /* non-nul selector updates gs_base_user */
@@ -1318,7 +1342,7 @@ static void load_segments(struct vcpu *n)
             dirty_segment_mask &= ~DIRTY_GS_BASE;
     }
 
-    if ( !is_pv_32bit_vcpu(n) )
+    if ( !fs_gs_done && !is_pv_32bit_vcpu(n) )
     {
         /* This can only be non-zero if selector is NULL. */
         if ( n->arch.pv.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) )
@@ -1653,6 +1677,13 @@ static void __context_switch(void)
 
     write_ptbase(n);
 
+#if defined(CONFIG_PV) && defined(CONFIG_HVM)
+    /* Prefetch the VMCB if we expect to use it later in the context switch */
+    if ( is_pv_domain(nd) && !is_pv_32bit_domain(nd) && !is_idle_domain(nd) &&
+         !cpu_has_fsgsbase && cpu_has_svm )
+        svm_load_segs(0, 0, 0, 0, 0, 0, 0);
+#endif
+
     if ( need_full_gdt(nd) &&
          ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
     {
@@ -1714,10 +1745,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
         local_irq_enable();
 
         if ( is_pv_domain(nextd) )
-        {
-            load_LDT(next);
             load_segments(next);
-        }
 
         ctxt_switch_levelling(next);
 
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index ef8f271168..c98cfc2c13 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -78,6 +78,9 @@ static struct hvm_function_table svm_function_table;
  */
 static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, hsa);
 static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, host_vmcb);
+#ifdef CONFIG_PV
+static DEFINE_PER_CPU(struct vmcb_struct *, host_vmcb_va);
+#endif
 
 static bool_t amd_erratum383_found __read_mostly;
 
@@ -1567,6 +1570,14 @@ static void svm_cpu_dead(unsigned int cpu)
         *this_hsa = 0;
     }
 
+#ifdef CONFIG_PV
+    if ( per_cpu(host_vmcb_va, cpu) )
+    {
+        unmap_domain_page_global(per_cpu(host_vmcb_va, cpu));
+        per_cpu(host_vmcb_va, cpu) = NULL;
+    }
+#endif
+
     if ( *this_vmcb )
     {
         free_domheap_page(maddr_to_page(*this_vmcb));
@@ -1601,6 +1612,11 @@ static int svm_cpu_up_prepare(unsigned int cpu)
         if ( !pg )
             goto err;
 
+#ifdef CONFIG_PV
+        if ( !cpu_has_fsgsbase )
+            per_cpu(host_vmcb_va, cpu) = __map_domain_page_global(pg);
+#endif
+
         clear_domain_page(page_to_mfn(pg));
         *this_vmcb = page_to_maddr(pg);
     }
@@ -1630,6 +1646,66 @@ static void svm_init_erratum_383(const struct cpuinfo_x86 *c)
     }
 }
 
+#ifdef CONFIG_PV
+bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
+                   unsigned int fs_sel, unsigned long fs_base,
+                   unsigned int gs_sel, unsigned long gs_base,
+                   unsigned long gs_shadow)
+{
+    unsigned int cpu = smp_processor_id();
+    struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
+
+    if ( unlikely(!vmcb) )
+        return false;
+
+    if ( !ldt_base )
+    {
+        /*
+         * The actual structure field used here was arbitrarily chosen.
+         * Empirically it doesn't seem to matter much which element is used,
+         * and a clear explanation of the otherwise poor performance has not
+         * been found/provided so far.
+         */
+        prefetchw(&vmcb->ldtr);
+        return true;
+    }
+
+    if ( likely(!ldt_ents) )
+        memset(&vmcb->ldtr, 0, sizeof(vmcb->ldtr));
+    else
+    {
+        /* Keep GDT in sync. */
+        struct desc_struct *desc = this_cpu(gdt_table) + LDT_ENTRY -
+                                   FIRST_RESERVED_GDT_ENTRY;
+
+        _set_tssldt_desc(desc, ldt_base, ldt_ents * 8 - 1, SYS_DESC_ldt);
+
+        vmcb->ldtr.sel = LDT_ENTRY << 3;
+        vmcb->ldtr.attr = SYS_DESC_ldt | (_SEGMENT_P >> 8);
+        vmcb->ldtr.limit = ldt_ents * 8 - 1;
+        vmcb->ldtr.base = ldt_base;
+    }
+
+    ASSERT(!(fs_sel & ~3));
+    vmcb->fs.sel = fs_sel;
+    vmcb->fs.attr = 0;
+    vmcb->fs.limit = 0;
+    vmcb->fs.base = fs_base;
+
+    ASSERT(!(gs_sel & ~3));
+    vmcb->gs.sel = gs_sel;
+    vmcb->gs.attr = 0;
+    vmcb->gs.limit = 0;
+    vmcb->gs.base = gs_base;
+
+    vmcb->kerngsbase = gs_shadow;
+
+    svm_vmload_pa(per_cpu(host_vmcb, cpu));
+
+    return true;
+}
+#endif
+
 static int _svm_cpu_up(bool bsp)
 {
     uint64_t msr_content;
@@ -1662,6 +1738,8 @@ static int _svm_cpu_up(bool bsp)
     /* Initialize OSVW bits to be used by guests */
     svm_host_osvw_init();
 
+    svm_vmsave_pa(per_cpu(host_vmcb, cpu));
+
     return 0;
 }
 
diff --git a/xen/include/asm-x86/hvm/svm/svm.h b/xen/include/asm-x86/hvm/svm/svm.h
index 8166046a6d..49dca39e0b 100644
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -53,6 +53,15 @@ unsigned long *svm_msrbit(unsigned long *msr_bitmap, uint32_t msr);
 void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len);
 void svm_update_guest_cr(struct vcpu *, unsigned int cr, unsigned int flags);
 
+/*
+ * PV context switch helper. Calls with zero ldt_base request a prefetch of
+ * the VMCB area to be loaded from, instead of an actual load of state.
+ */
+bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
+                   unsigned int fs_sel, unsigned long fs_base,
+                   unsigned int gs_sel, unsigned long gs_base,
+                   unsigned long gs_shadow);
+
 extern u32 svm_feature_flags;
 
 #define SVM_FEATURE_NPT            0 /* Nested page table support */
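
For convenience, the two call patterns described by the header comment
above, as they appear in the domain.c hunks of this patch (surrounding
conditions elided):

    /* __context_switch(): prefetch-only request, all arguments zero. */
    svm_load_segs(0, 0, 0, 0, 0, 0, 0);

    /* load_segments(): the actual state load, finished by VMLOAD. */
    fs_gs_done = svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
                               uregs->fs, n->arch.pv.fs_base,
                               uregs->gs, gsb, gss);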
--
generated by git-patchbot for /home/xen/git/xen.git#master

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog

 

