[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH v4 7/7] xen/x86: use PCID feature
Avoid flushing the complete TLB when switching %cr3 for mitigation of Meltdown by using the PCID feature if available. We are using 4 PCID values for a 64 bit pv domain subject to XPTI and 2 values for the non-XPTI case: - guest active and in kernel mode - guest active and in user mode - hypervisor active and guest in user mode (XPTI only) - hypervisor active and guest in kernel mode (XPTI only) We use PCID only if PCID _and_ INVPCID are supported. With PCID in use we disable global pages in cr4. A command line parameter controls in which cases PCID is being used. As the non-XPTI case has shown not to perform better with PCID at least on some machines the default is to use PCID only for domains subject to XPTI. With PCID enabled we always disable global pages. This avoids having to either flush the complete TLB or do a cycle through all PCID values when invalidating a single global page. pv_guest_cr4_to_real_cr4() is switched from a macro to a real function now as it has become more complex. Signed-off-by: Juergen Gross <jgross@xxxxxxxx> --- V4: - add cr3 mask for page table address and use that in dbg_pv_va2mfn() (Jan Beulich) - use invpcid_flush_all_nonglobals() instead of invpcid_flush_all() (Jan Beulich) - use PCIDs 0/1 when running in Xen or without XPTI, 2/3 with XPTI in guest (Jan Beulich) - ASSERT cr4.pge and cr4.pcide are never active at the same time (Jan Beulich) - make pv_guest_cr4_to_real_cr4() a real function V3: - support PCID for non-XPTI case, too - add command line parameter for controlling usage of PCID - check PCID active by using cr4.pcide (Jan Beulich) --- docs/misc/xen-command-line.markdown | 12 ++++++ xen/arch/x86/debug.c | 2 +- xen/arch/x86/domain_page.c | 2 +- xen/arch/x86/domctl.c | 4 ++ xen/arch/x86/flushtlb.c | 44 +++++++++++++++++++-- xen/arch/x86/mm.c | 24 +++++++++++- xen/arch/x86/pv/dom0_build.c | 1 + xen/arch/x86/pv/domain.c | 76 ++++++++++++++++++++++++++++++++++++- xen/include/asm-x86/domain.h | 15 +++----- xen/include/asm-x86/processor.h | 3 ++ xen/include/asm-x86/pv/domain.h | 20 ++++++++++ xen/include/asm-x86/x86-defns.h | 4 +- 12 files changed, 188 insertions(+), 19 deletions(-) diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown index 5f6ae654ad..db87fd326d 100644 --- a/docs/misc/xen-command-line.markdown +++ b/docs/misc/xen-command-line.markdown @@ -1452,6 +1452,18 @@ All numbers specified must be hexadecimal ones. This option can be specified more than once (up to 8 times at present). +### pcid (x86) +> `= <boolean> | xpti | noxpti` + +> Default: `xpti` + +> Can be modified at runtime + +If available, control usage of the PCID feature of the processor for +64-bit pv-domains. PCID can be used either for no domain at all (`false`), +for all of them (`true`), only for those subject to XPTI (`xpti`) or for +those not subject to XPTI (`noxpti`). + ### ple\_gap > `= <integer>` diff --git a/xen/arch/x86/debug.c b/xen/arch/x86/debug.c index 9159f32db4..0d46f2f45a 100644 --- a/xen/arch/x86/debug.c +++ b/xen/arch/x86/debug.c @@ -97,7 +97,7 @@ dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val) l3_pgentry_t l3e, *l3t; l2_pgentry_t l2e, *l2t; l1_pgentry_t l1e, *l1t; - unsigned long cr3 = (pgd3val ? pgd3val : dp->vcpu[0]->arch.cr3); + unsigned long cr3 = pgd3val ?: (dp->vcpu[0]->arch.cr3 & X86_CR3_ADDR_MASK); mfn_t mfn = maddr_to_mfn(cr3); DBGP2("vaddr:%lx domid:%d cr3:%lx pgd3:%lx\n", vaddr, dp->domain_id, diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c index b5780f201f..bf5bf45188 100644 --- a/xen/arch/x86/domain_page.c +++ b/xen/arch/x86/domain_page.c @@ -51,7 +51,7 @@ static inline struct vcpu *mapcache_current_vcpu(void) if ( (v = idle_vcpu[smp_processor_id()]) == current ) sync_local_execstate(); /* We must now be running on the idle page table. */ - ASSERT(read_cr3() == __pa(idle_pg_table)); + ASSERT((read_cr3() & ~X86_CR3_PCID_MASK) == __pa(idle_pg_table)); } return v; diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c index 0704f398c7..a7c8772fa6 100644 --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -613,7 +613,11 @@ long arch_do_domctl( ret = -EINVAL; if ( ret == 0 ) + { xpti_domain_init(d); + pcid_domain_init(d); + } + break; case XEN_DOMCTL_get_address_size: diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c index 5cb0ad97b8..96f09239f0 100644 --- a/xen/arch/x86/flushtlb.c +++ b/xen/arch/x86/flushtlb.c @@ -12,6 +12,7 @@ #include <asm/flushtlb.h> #include <asm/invpcid.h> #include <asm/page.h> +#include <asm/pv/domain.h> /* Debug builds: Wrap frequently to stress-test the wrap logic. */ #ifdef NDEBUG @@ -102,7 +103,21 @@ void write_cr3_cr4(unsigned long cr3, unsigned long cr4) t = pre_flush(); if ( read_cr4() & X86_CR4_PGE ) + /* + * X86_CR4_PGE set means PCID being inactive. + * We have to purge the TLB via flipping cr4.pge. + */ write_cr4(cr4 & ~X86_CR4_PGE); + else if ( cpu_has_invpcid ) + /* + * If we are using PCID purge the TLB via INVPCID as loading cr3 + * will affect the current PCID only. + * If INVPCID is not supported we don't use PCIDs so loading cr3 + * will purge the TLB (we are in the "global pages off" branch). + * invpcid_flush_all_nonglobals() seems to be faster than + * invpcid_flush_all(). + */ + invpcid_flush_all_nonglobals(); asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" ); @@ -134,11 +149,32 @@ unsigned int flush_area_local(const void *va, unsigned int flags) /* * We don't INVLPG multi-page regions because the 2M/4M/1G * region may not have been mapped with a superpage. Also there - * are various errata surrounding INVLPG usage on superpages, and - * a full flush is in any case not *that* expensive. + * are various errata surrounding INVLPG usage on superpages, + * and a full flush is in any case not *that* expensive. */ - asm volatile ( "invlpg %0" - : : "m" (*(const char *)(va)) : "memory" ); + if ( read_cr4() & X86_CR4_PCIDE ) + { + unsigned long addr = (unsigned long)va; + + /* + * Flush the addresses for all potential address spaces. + * We can't check the current domain for being subject to + * XPTI as current might be the idle vcpu while we still have + * some XPTI domain TLB entries. + * Using invpcid is okay here, as with PCID enabled we always + * have global pages disabled. + */ + invpcid_flush_one(PCID_PV_PRIV, addr); + invpcid_flush_one(PCID_PV_USER, addr); + if ( !cpu_has_no_xpti ) + { + invpcid_flush_one(PCID_PV_PRIV | PCID_PV_XPTI, addr); + invpcid_flush_one(PCID_PV_USER | PCID_PV_XPTI, addr); + } + } + else + asm volatile ( "invlpg %0" + : : "m" (*(const char *)(va)) : "memory" ); } else do_tlb_flush(); diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 856eb9e67f..1566e5b3aa 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -126,6 +126,7 @@ #include <asm/processor.h> #include <asm/hvm/grant_table.h> +#include <asm/pv/domain.h> #include <asm/pv/grant_table.h> #include <asm/pv/mm.h> @@ -499,7 +500,26 @@ void free_shared_domheap_page(struct page_info *page) void make_cr3(struct vcpu *v, mfn_t mfn) { + struct domain *d = v->domain; + v->arch.cr3 = mfn_x(mfn) << PAGE_SHIFT; + if ( is_pv_domain(d) && d->arch.pv_domain.pcid ) + v->arch.cr3 |= get_pcid_bits(v, false); +} + +unsigned long pv_guest_cr4_to_real_cr4(struct vcpu *v) +{ + struct domain *d = v->domain; + unsigned long cr4; + + cr4 = v->arch.pv_vcpu.ctrlreg[4] & ~X86_CR4_DE; + cr4 |= mmu_cr4_features & (X86_CR4_PSE | X86_CR4_SMEP | X86_CR4_SMAP | + X86_CR4_OSXSAVE | X86_CR4_FSGSBASE); + cr4 |= (d->arch.pv_domain.xpti || d->arch.pv_domain.pcid) ? 0 : X86_CR4_PGE; + cr4 |= d->arch.pv_domain.pcid ? X86_CR4_PCIDE : 0; + cr4 |= d->arch.vtsc ? X86_CR4_TSD : 0; + + return cr4; } void write_ptbase(struct vcpu *v) @@ -509,12 +529,14 @@ void write_ptbase(struct vcpu *v) new_cr4 = (is_pv_vcpu(v) && !is_idle_vcpu(v)) ? pv_guest_cr4_to_real_cr4(v) - : ((read_cr4() & ~X86_CR4_TSD) | X86_CR4_PGE); + : ((read_cr4() & ~(X86_CR4_PCIDE | X86_CR4_TSD)) | X86_CR4_PGE); if ( is_pv_vcpu(v) && v->domain->arch.pv_domain.xpti ) { cpu_info->root_pgt_changed = true; cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)); + if ( new_cr4 & X86_CR4_PCIDE ) + cpu_info->pv_cr3 |= get_pcid_bits(v, true); write_cr3_cr4(v->arch.cr3, new_cr4); } else diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c index 77186c19bd..2af0094e95 100644 --- a/xen/arch/x86/pv/dom0_build.c +++ b/xen/arch/x86/pv/dom0_build.c @@ -709,6 +709,7 @@ int __init dom0_construct_pv(struct domain *d, } xpti_domain_init(d); + pcid_domain_init(d); d->arch.paging.mode = 0; diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c index e6bb2bac76..838525f6e7 100644 --- a/xen/arch/x86/pv/domain.c +++ b/xen/arch/x86/pv/domain.c @@ -94,6 +94,70 @@ void xpti_domain_init(struct domain *d) } } +static __read_mostly enum { + PCID_OFF, + PCID_ALL, + PCID_XPTI, + PCID_NOXPTI +} opt_pcid = PCID_XPTI; + +static __init int parse_pcid(const char *s) +{ + int rc = 0; + + switch ( parse_bool(s, NULL) ) + { + case 0: + opt_pcid = PCID_OFF; + break; + case 1: + opt_pcid = PCID_ALL; + break; + default: + switch ( parse_boolean("xpti", s, NULL) ) + { + case 0: + opt_pcid = PCID_NOXPTI; + break; + case 1: + opt_pcid = PCID_XPTI; + break; + default: + rc = -EINVAL; + break; + } + break; + } + + return rc; +} +custom_runtime_param("pcid", parse_pcid); + +void pcid_domain_init(struct domain *d) +{ + if ( !is_pv_domain(d) || is_pv_32bit_domain(d) || + !cpu_has_invpcid || !cpu_has_pcid ) + return; + + switch ( opt_pcid ) + { + case PCID_OFF: + break; + case PCID_ALL: + d->arch.pv_domain.pcid = true; + break; + case PCID_XPTI: + d->arch.pv_domain.pcid = d->arch.pv_domain.xpti; + break; + case PCID_NOXPTI: + d->arch.pv_domain.pcid = !d->arch.pv_domain.xpti; + break; + default: + ASSERT_UNREACHABLE(); + break; + } +} + static void noreturn continue_nonidle_domain(struct vcpu *v) { check_wakeup_from_wait(); @@ -298,11 +362,21 @@ int pv_domain_initialise(struct domain *d) static void _toggle_guest_pt(struct vcpu *v, bool force_cr3) { + struct domain *d = v->domain; + ASSERT(!in_irq()); v->arch.flags ^= TF_kernel_mode; update_cr3(v); - get_cpu_info()->root_pgt_changed = true; + if ( d->arch.pv_domain.xpti ) + { + struct cpu_info *cpu_info = get_cpu_info(); + + cpu_info->root_pgt_changed = true; + cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) | + (d->arch.pv_domain.pcid + ? get_pcid_bits(v, true) : 0); + } /* * There's no need to load CR3 here when it is going to be loaded on the diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index b7894dc8c8..9a624d0e5f 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -255,6 +255,8 @@ struct pv_domain /* XPTI active? */ bool xpti; + /* Use PCID feature? */ + bool pcid; /* map_domain_page() mapping cache. */ struct mapcache_domain mapcache; @@ -615,19 +617,12 @@ void vcpu_show_registers(const struct vcpu *); unsigned long pv_guest_cr4_fixup(const struct vcpu *, unsigned long guest_cr4); /* Convert between guest-visible and real CR4 values. */ -#define pv_guest_cr4_to_real_cr4(v) \ - (((v)->arch.pv_vcpu.ctrlreg[4] \ - | (mmu_cr4_features \ - & (X86_CR4_PSE | X86_CR4_SMEP | \ - X86_CR4_SMAP | X86_CR4_OSXSAVE | \ - X86_CR4_FSGSBASE)) \ - | ((v)->domain->arch.pv_domain.xpti ? 0 : X86_CR4_PGE) \ - | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)) \ - & ~X86_CR4_DE) +unsigned long pv_guest_cr4_to_real_cr4(struct vcpu *v); + #define real_cr4_to_pv_guest_cr4(c) \ ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD | \ X86_CR4_OSXSAVE | X86_CR4_SMEP | \ - X86_CR4_FSGSBASE | X86_CR4_SMAP)) + X86_CR4_FSGSBASE | X86_CR4_SMAP | X86_CR4_PCIDE)) #define domain_max_vcpus(d) (is_hvm_domain(d) ? HVM_MAX_VCPUS : MAX_VIRT_CPUS) diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h index db9988ab33..3067a8c58f 100644 --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -290,6 +290,9 @@ static inline unsigned long read_cr4(void) static inline void write_cr4(unsigned long val) { + /* No global pages in case of PCIDs enabled! */ + ASSERT(!(val & X86_CR4_PGE) || !(val & X86_CR4_PCIDE)); + get_cpu_info()->cr4 = val; asm volatile ( "mov %0,%%cr4" : : "r" (val) ); } diff --git a/xen/include/asm-x86/pv/domain.h b/xen/include/asm-x86/pv/domain.h index 911e5dc07f..3c8c8f4ccc 100644 --- a/xen/include/asm-x86/pv/domain.h +++ b/xen/include/asm-x86/pv/domain.h @@ -21,6 +21,24 @@ #ifndef __X86_PV_DOMAIN_H__ #define __X86_PV_DOMAIN_H__ +/* PCID values for the address spaces of 64-bit pv domains: */ +#define PCID_PV_PRIV 0x0000 /* Used for other domains, too. */ +#define PCID_PV_USER 0x0001 +#define PCID_PV_XPTI 0x0002 /* To be ORed to above values. */ + +/* + * Return additional PCID specific cr3 bits. + * + * Note that X86_CR3_NOFLUSH will not be readable in cr3. Anyone consuming + * v->arch.cr3 should mask away X86_CR3_NOFLUSH and X86_CR3_PCIDMASK in case + * the value is used to address the root page table. + */ +static inline unsigned long get_pcid_bits(const struct vcpu *v, bool is_xpti) +{ + return X86_CR3_NOFLUSH | (is_xpti ? PCID_PV_XPTI : 0) | + ((v->arch.flags & TF_kernel_mode) ? PCID_PV_PRIV : PCID_PV_USER); +} + #ifdef CONFIG_PV void pv_vcpu_destroy(struct vcpu *v); @@ -29,6 +47,7 @@ void pv_domain_destroy(struct domain *d); int pv_domain_initialise(struct domain *d); void xpti_init(void); void xpti_domain_init(struct domain *d); +void pcid_domain_init(struct domain *d); #else /* !CONFIG_PV */ @@ -40,6 +59,7 @@ static inline void pv_domain_destroy(struct domain *d) {} static inline int pv_domain_initialise(struct domain *d) { return -EOPNOTSUPP; } static inline void xpti_init(void) {} static inline void xpti_domain_init(struct domain *d) {} +static inline void pcid_domain_init(struct domain *d) {} #endif /* CONFIG_PV */ diff --git a/xen/include/asm-x86/x86-defns.h b/xen/include/asm-x86/x86-defns.h index ff8d66be3c..a80325f86d 100644 --- a/xen/include/asm-x86/x86-defns.h +++ b/xen/include/asm-x86/x86-defns.h @@ -45,7 +45,9 @@ /* * Intel CPU flags in CR3 */ -#define X86_CR3_NOFLUSH (_AC(1, ULL) << 63) +#define X86_CR3_NOFLUSH (_AC(1, ULL) << 63) +#define X86_CR3_ADDR_MASK (PAGE_MASK & ~X86_CR3_NOFLUSH) +#define X86_CR3_PCID_MASK _AC(0x0fff, ULL) /* Mask for PCID */ /* * Intel CPU features in CR4 -- 2.13.6 _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |