[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v3 7/7] xen/x86: use PCID feature



Avoid flushing the complete TLB when switching %cr3 for mitigation of
Meltdown by using the PCID feature if available.

We are using 4 PCID values for a 64 bit pv domain subject to XPTI and
2 values for the non-XPTI case:

- guest active and in kernel mode
- guest active and in user mode
- hypervisor active and guest in user mode (XPTI only)
- hypervisor active and guest in kernel mode (XPTI only)

We use PCID only if PCID _and_ INVPCID are supported. With PCID in use
we disable global pages in cr4. A command line parameter controls in
which cases PCID is being used.

As the non-XPTI case has been shown not to perform better with PCID, at
least on some machines, the default is to use PCID only for domains
subject to XPTI.

Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
---
V3:
- support PCID for non-XPTI case, too
- add command line parameter for controlling usage of PCID
- check PCID active by using cr4.pcide (Jan Beulich)
---
 docs/misc/xen-command-line.markdown | 12 +++++++++
 xen/arch/x86/debug.c                |  3 ++-
 xen/arch/x86/domain_page.c          |  2 +-
 xen/arch/x86/domctl.c               |  4 +++
 xen/arch/x86/flushtlb.c             | 49 ++++++++++++++++++++++++++++------
 xen/arch/x86/mm.c                   | 34 +++++++++++++++++++++---
 xen/arch/x86/pv/dom0_build.c        |  1 +
 xen/arch/x86/pv/domain.c            | 52 +++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/domain.h        | 14 +++++++---
 xen/include/asm-x86/pv/domain.h     |  2 ++
 xen/include/asm-x86/x86-defns.h     |  1 +
 11 files changed, 158 insertions(+), 16 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 8fc7b2ff3b..4ecf471ea9 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1450,6 +1450,18 @@ All numbers specified must be hexadecimal ones.
 
 This option can be specified more than once (up to 8 times at present).
 
+### pcid (x86)
+> `= off | all | xpti | noxpti`
+
+> Default: `xpti`
+
+> Can be modified at runtime
+
+Control usage of the PCID feature for 64-bit pv-domains, provided the
+processor supports both PCID and INVPCID. PCID can be used for no domain
+at all (`off`), for all of them (`all`), only for those subject to XPTI
+(`xpti`), or only for those not subject to XPTI (`noxpti`).
+
 ### ple\_gap
 > `= <integer>`
 
diff --git a/xen/arch/x86/debug.c b/xen/arch/x86/debug.c
index 9159f32db4..c8079569c4 100644
--- a/xen/arch/x86/debug.c
+++ b/xen/arch/x86/debug.c
@@ -97,7 +97,8 @@ dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val)
     l3_pgentry_t l3e, *l3t;
     l2_pgentry_t l2e, *l2t;
     l1_pgentry_t l1e, *l1t;
-    unsigned long cr3 = (pgd3val ? pgd3val : dp->vcpu[0]->arch.cr3);
+    unsigned long cr3 = (pgd3val ? pgd3val
+                                 : (dp->vcpu[0]->arch.cr3 & ~X86_CR3_NOFLUSH));
     mfn_t mfn = maddr_to_mfn(cr3);
 
     DBGP2("vaddr:%lx domid:%d cr3:%lx pgd3:%lx\n", vaddr, dp->domain_id, 
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index b5780f201f..8073ae5282 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -51,7 +51,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
         if ( (v = idle_vcpu[smp_processor_id()]) == current )
             sync_local_execstate();
         /* We must now be running on the idle page table. */
-        ASSERT(read_cr3() == __pa(idle_pg_table));
+        ASSERT((read_cr3() & ~X86_CR3_PCIDMASK) == __pa(idle_pg_table));
     }
 
     return v;
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 0704f398c7..a7c8772fa6 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -613,7 +613,11 @@ long arch_do_domctl(
             ret = -EINVAL;
 
         if ( ret == 0 )
+        {
             xpti_domain_init(d);
+            pcid_domain_init(d);
+        }
+
         break;
 
     case XEN_DOMCTL_get_address_size:
diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
index d4b8acc837..092ef86314 100644
--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -102,7 +102,19 @@ void write_cr3_cr4(unsigned long cr3, unsigned long cr4)
     t = pre_flush();
 
     if ( read_cr4() & X86_CR4_PGE )
+        /*
+         * X86_CR4_PGE set means PCID being inactive.
+         * We have to purge the TLB via flipping cr4.pge.
+         */
         write_cr4(cr4 & ~X86_CR4_PGE);
+    else if ( cpu_has_invpcid )
+        /*
+         * If we are using PCID purge the TLB via INVPCID as loading cr3
+         * will affect the current PCID only.
+         * If INVPCID is not supported we don't use PCIDs so loading cr3
+         * will purge the TLB (we are in the "global pages off" branch).
+         */
+        invpcid_flush_all();
 
     asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
 
@@ -131,14 +143,35 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
     {
         if ( order == 0 )
         {
-            /*
-             * We don't INVLPG multi-page regions because the 2M/4M/1G
-             * region may not have been mapped with a superpage. Also there
-             * are various errata surrounding INVLPG usage on superpages, and
-             * a full flush is in any case not *that* expensive.
-             */
-            asm volatile ( "invlpg %0"
-                           : : "m" (*(const char *)(va)) : "memory" );
+            if ( read_cr4() & X86_CR4_PCIDE )
+            {
+                unsigned long addr = (unsigned long)va;
+
+                /*
+                 * Flush the addresses for all potential address spaces.
+                 * We can't check the current domain for being subject to
+                 * XPTI as current might be the idle vcpu while we still have
+                 * some XPTI domain TLB entries.
+                 * Using invpcid is okay here, as with PCID enabled we always
+                 * have global pages disabled.
+                 */
+                invpcid_flush_one(PCID_PV_PRIV, addr);
+                invpcid_flush_one(PCID_PV_USER, addr);
+                if ( !cpu_has_no_xpti )
+                {
+                    invpcid_flush_one(PCID_PV_PRIV | PCID_PV_XEN, addr);
+                    invpcid_flush_one(PCID_PV_USER | PCID_PV_XEN, addr);
+                }
+            }
+            else
+                /*
+                 * We don't INVLPG multi-page regions because the 2M/4M/1G
+                 * region may not have been mapped with a superpage. Also there
+                 * are various errata surrounding INVLPG usage on superpages,
+                 * and a full flush is in any case not *that* expensive.
+                 */
+                asm volatile ( "invlpg %0"
+                               : : "m" (*(const char *)(va)) : "memory" );
         }
         else
             do_tlb_flush();
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 29071bb257..242425c075 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -497,12 +497,38 @@ void free_shared_domheap_page(struct page_info *page)
     free_domheap_page(page);
 }
 
+/*
+ * Return the PCID bits to OR into a cr3 value for v: X86_CR3_NOFLUSH, plus
+ * PCID_PV_XEN if is_xen, plus PCID_PV_PRIV or PCID_PV_USER depending on
+ * TF_kernel_mode in v->arch.flags.
+ * X86_CR3_NOFLUSH is not readable in cr3. Anyone using v->arch.cr3 to address
+ * the root page table should mask off X86_CR3_NOFLUSH and X86_CR3_PCIDMASK.
+ */
+static unsigned long get_pcid_bits(const struct vcpu *v, bool is_xen)
+{
+    return X86_CR3_NOFLUSH | (is_xen ? PCID_PV_XEN : 0) |
+           ((v->arch.flags & TF_kernel_mode) ? PCID_PV_PRIV : PCID_PV_USER);
+}
+
 void make_cr3(struct vcpu *v, mfn_t mfn)
 {
+    struct domain *d = v->domain;
+
     v->arch.cr3 = mfn_x(mfn) << PAGE_SHIFT;
-    if ( v == current && this_cpu(root_pgt) && is_pv_vcpu(v) &&
-         !is_pv_32bit_vcpu(v) )
-        get_cpu_info()->root_pgt_changed = true;
+    if ( is_pv_domain(d) )
+    {
+        if ( d->arch.pv_domain.xpti && v == current )
+        {
+            struct cpu_info *cpu_info = get_cpu_info();
+
+            cpu_info->root_pgt_changed = true;
+            if ( d->arch.pv_domain.pcid )
+                cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) |
+                                   get_pcid_bits(v, false);
+        }
+        if ( d->arch.pv_domain.pcid )
+            v->arch.cr3 |= get_pcid_bits(v, d->arch.pv_domain.xpti);
+    }
 }
 
 void write_ptbase(struct vcpu *v)
@@ -517,6 +543,8 @@ void write_ptbase(struct vcpu *v)
     {
         cpu_info->root_pgt_changed = true;
         cpu_info->pv_cr3 = __pa(this_cpu(root_pgt));
+        if ( new_cr4 & X86_CR4_PCIDE )
+            cpu_info->pv_cr3 |= get_pcid_bits(v, false);
         write_cr3_cr4(v->arch.cr3, new_cr4);
     }
     else
diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
index 77186c19bd..2af0094e95 100644
--- a/xen/arch/x86/pv/dom0_build.c
+++ b/xen/arch/x86/pv/dom0_build.c
@@ -709,6 +709,7 @@ int __init dom0_construct_pv(struct domain *d,
     }
 
     xpti_domain_init(d);
+    pcid_domain_init(d);
 
     d->arch.paging.mode = 0;
 
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index 266117e804..46c050aeeb 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -95,6 +95,58 @@ void xpti_domain_init(struct domain *d)
     }
 }
 
+static __read_mostly enum {
+    PCID_OFF,     /* "off": don't use PCID for any domain. */
+    PCID_ALL,     /* "all": use PCID for all 64-bit pv-domains. */
+    PCID_XPTI,    /* "xpti": use PCID only for domains with XPTI active. */
+    PCID_NOXPTI   /* "noxpti": use PCID only for domains without XPTI. */
+} opt_pcid = PCID_XPTI; /* Current setting of the "pcid" option. */
+
+/*
+ * Handler for the "pcid" option. Not __init: it is also invoked at runtime.
+ */
+static int parse_pcid(const char *s)
+{
+    if ( !strcmp(s, "off") )
+        opt_pcid = PCID_OFF;
+    else if ( !strcmp(s, "all") )
+        opt_pcid = PCID_ALL;
+    else if ( !strcmp(s, "xpti") )
+        opt_pcid = PCID_XPTI;
+    else if ( !strcmp(s, "noxpti") )
+        opt_pcid = PCID_NOXPTI;
+    else
+        return -EINVAL;
+    return 0;
+}
+custom_runtime_param("pcid", parse_pcid);
+/* Set d->arch.pv_domain.pcid according to the current "pcid" option value. */
+void pcid_domain_init(struct domain *d)
+{
+    if ( !is_pv_domain(d) || is_pv_32bit_domain(d) ||
+         !cpu_has_invpcid || !cpu_has_pcid )
+        return; /* Only 64-bit pv-domains on PCID + INVPCID capable CPUs. */
+
+    switch ( opt_pcid )
+    {
+    case PCID_OFF:
+        d->arch.pv_domain.pcid = false;
+        break;
+    case PCID_ALL:
+        d->arch.pv_domain.pcid = true;
+        break;
+    case PCID_XPTI:     /* Follow the domain's XPTI setting. */
+        d->arch.pv_domain.pcid = d->arch.pv_domain.xpti;
+        break;
+    case PCID_NOXPTI:   /* Inverse of the domain's XPTI setting. */
+        d->arch.pv_domain.pcid = !d->arch.pv_domain.xpti;
+        break;
+    default:            /* Leaves the pcid flag untouched. */
+        ASSERT_UNREACHABLE();
+        break;
+    }
+}
+
 static void noreturn continue_nonidle_domain(struct vcpu *v)
 {
     check_wakeup_from_wait();
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index ef9e9639fd..7751f9225b 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -255,6 +255,8 @@ struct pv_domain
 
     /* XPTI active? */
     bool xpti;
+    /* Use PCID feature? */
+    bool pcid;
 
     /* map_domain_page() mapping cache. */
     struct mapcache_domain mapcache;
@@ -262,6 +264,11 @@ struct pv_domain
     struct cpuidmasks *cpuidmasks;
 };
 
+/* PCID values for the address spaces of 64-bit pv domains: */
+#define PCID_PV_PRIV      0x0000    /* Used for other domains, too. */
+#define PCID_PV_USER      0x0001
+#define PCID_PV_XEN       0x0002    /* To be ORed to above values. */
+
 struct monitor_write_data {
     struct {
         unsigned int msr : 1;
@@ -619,14 +626,15 @@ unsigned long pv_guest_cr4_fixup(const struct vcpu *, unsigned long guest_cr4);
       | (mmu_cr4_features                                   \
          & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP |      \
             X86_CR4_SMAP | X86_CR4_OSXSAVE |                \
-            X86_CR4_FSGSBASE))                              \
-      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0))         \
+            X86_CR4_FSGSBASE | X86_CR4_PCIDE))              \
+      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)          \
+      | ((v)->domain->arch.pv_domain.pcid ? X86_CR4_PCIDE : 0)) \
      & ~(X86_CR4_DE |                                       \
          ((v)->domain->arch.pv_domain.xpti ? X86_CR4_PGE : 0)))
 #define real_cr4_to_pv_guest_cr4(c)                         \
     ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD |      \
              X86_CR4_OSXSAVE | X86_CR4_SMEP |               \
-             X86_CR4_FSGSBASE | X86_CR4_SMAP))
+             X86_CR4_FSGSBASE | X86_CR4_SMAP | X86_CR4_PCIDE))
 
 #define domain_max_vcpus(d) (is_hvm_domain(d) ? HVM_MAX_VCPUS : MAX_VIRT_CPUS)
 
diff --git a/xen/include/asm-x86/pv/domain.h b/xen/include/asm-x86/pv/domain.h
index 2213a8fb3d..8fd822ca83 100644
--- a/xen/include/asm-x86/pv/domain.h
+++ b/xen/include/asm-x86/pv/domain.h
@@ -29,6 +29,7 @@ void pv_domain_destroy(struct domain *d);
 int pv_domain_initialise(struct domain *d);
 void xpti_init(void);
 void xpti_domain_init(struct domain *d);
+void pcid_domain_init(struct domain *d);
 
 #else  /* !CONFIG_PV */
 
@@ -40,6 +41,7 @@ static inline void pv_domain_destroy(struct domain *d) {}
 static inline int pv_domain_initialise(struct domain *d) { return -EOPNOTSUPP; }
 static inline void xpti_init(void) {}
 static inline void xpti_domain_init(struct domain *d) {}
+static inline void pcid_domain_init(struct domain *d) {}
 #endif /* CONFIG_PV */
 
 void paravirt_ctxt_switch_from(struct vcpu *v);
diff --git a/xen/include/asm-x86/x86-defns.h b/xen/include/asm-x86/x86-defns.h
index ff8d66be3c..e323d3c01f 100644
--- a/xen/include/asm-x86/x86-defns.h
+++ b/xen/include/asm-x86/x86-defns.h
@@ -46,6 +46,7 @@
  * Intel CPU flags in CR3
  */
 #define X86_CR3_NOFLUSH (_AC(1, ULL) << 63)
+#define X86_CR3_PCIDMASK _AC(0x0fff, ULL) /* Mask for PCID */
 
 /*
  * Intel CPU features in CR4
-- 
2.13.6


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.