[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v5 4/7] xen/x86: use invpcid for flushing the TLB



On 06/04/18 08:52, Juergen Gross wrote:
> If possible use the INVPCID instruction for flushing the TLB instead of
> toggling cr4.pge for that purpose.
>
> While at it remove the dependency on cr4.pge being required for mtrr
> loading, as this will be required later anyway.
>
> Add a command line option "invpcid" for controlling the use of
> INVPCID (default to true).
>
> Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
> ---
> V5:
> - use pre_flush() as an initializer in do_tlb_flush() (Jan Beulich)
> - introduce boolean use_invpcid instead of clearing X86_FEATURE_INVPCID
>   (Jan Beulich)
>
> V4:
> - option "invpcid" instead of "noinvpcid" (Jan Beulich)
>
> V3:
> - new patch
> ---
>  docs/misc/xen-command-line.markdown | 10 ++++++++++
>  xen/arch/x86/cpu/mtrr/generic.c     | 37 
> ++++++++++++++++++++++++++-----------
>  xen/arch/x86/flushtlb.c             | 29 +++++++++++++++++++----------
>  xen/arch/x86/setup.c                |  8 ++++++++
>  xen/include/asm-x86/invpcid.h       |  2 ++
>  5 files changed, 65 insertions(+), 21 deletions(-)
>
> diff --git a/docs/misc/xen-command-line.markdown 
> b/docs/misc/xen-command-line.markdown
> index 79be9a6ba5..5f6ae654ad 100644
> --- a/docs/misc/xen-command-line.markdown
> +++ b/docs/misc/xen-command-line.markdown
> @@ -1380,6 +1380,16 @@ Because responsibility for APIC setup is shared 
> between Xen and the
>  domain 0 kernel this option is automatically propagated to the domain
>  0 command line.
>  
> +### invpcid (x86)
> +> `= <boolean>`
> +
> +> Default: `true`
> +
> +Control using the INVPCID instruction for flushing TLB entries.
> +This should only be used in case of known issues on the current platform
> +with that instruction. Disabling INVPCID will normally result in a slightly
> +degraded performance.

How about:

By default, Xen will use the INVPCID instruction for TLB management if
it is available.  This option can be used to cause Xen to fall back to
older mechanisms, which are generally slower.

?

We are not aware of any systems with INVPCID problems, but "for testing
purposes" is also a valid reason to use this option.


> +
>  ### noirqbalance
>  > `= <boolean>`
>  
> diff --git a/xen/arch/x86/cpu/mtrr/generic.c b/xen/arch/x86/cpu/mtrr/generic.c
> index e9c0e5e059..705855e753 100644
> --- a/xen/arch/x86/cpu/mtrr/generic.c
> +++ b/xen/arch/x86/cpu/mtrr/generic.c
> @@ -5,6 +5,7 @@
>  #include <xen/mm.h>
>  #include <xen/stdbool.h>
>  #include <asm/flushtlb.h>
> +#include <asm/invpcid.h>
>  #include <asm/io.h>
>  #include <asm/mtrr.h>
>  #include <asm/msr.h>
> @@ -400,8 +401,10 @@ static DEFINE_SPINLOCK(set_atomicity_lock);
>   * has been called.
>   */
>  
> -static void prepare_set(void)
> +static bool prepare_set(void)
>  {
> +     unsigned long cr4;
> +
>       /*  Note that this is not ideal, since the cache is only 
> flushed/disabled
>          for this CPU while the MTRRs are changed, but changing this requires
>          more invasive changes to the way the kernel boots  */
> @@ -412,18 +415,24 @@ static void prepare_set(void)
>       write_cr0(read_cr0() | X86_CR0_CD);
>       wbinvd();
>  
> -     /*  TLB flushing here relies on Xen always using CR4.PGE. */
> -     BUILD_BUG_ON(!(XEN_MINIMAL_CR4 & X86_CR4_PGE));
> -     write_cr4(read_cr4() & ~X86_CR4_PGE);
> +     cr4 = read_cr4();
> +     if (cr4 & X86_CR4_PGE)
> +             write_cr4(cr4 & ~X86_CR4_PGE);
> +     else if (use_invpcid)
> +             invpcid_flush_all();
> +     else
> +             asm volatile( "mov %0, %%cr3" : : "r" (read_cr3()) : "memory" );

We could really do with gaining a static inline wrapper for write_cr3(),
because at the very least, the memory clobber going missing would be a
BadThing(tm).  Sadly that name is already taken in Xen, and behaves
differently to (all?) other OS's example.

There are currently only 3 callers of write_cr3().  How about a prereq
patch renaming write_cr3() to switch_cr3() (as switch_cr3() logically
implies the TLB clock activities), and a new thin write_cr3() wrapper?

>  
>       /*  Save MTRR state */
>       rdmsrl(MSR_MTRRdefType, deftype);
>  
>       /*  Disable MTRRs, and set the default type to uncached  */
>       mtrr_wrmsr(MSR_MTRRdefType, deftype & ~0xcff);
> +
> +     return cr4 & X86_CR4_PGE;
>  }
>  
> -static void post_set(void)
> +static void post_set(bool pge)
>  {
>       /* Intel (P6) standard MTRRs */
>       mtrr_wrmsr(MSR_MTRRdefType, deftype);
> @@ -432,7 +441,12 @@ static void post_set(void)
>       write_cr0(read_cr0() & ~X86_CR0_CD);
>  
>       /*  Reenable CR4.PGE (also flushes the TLB) */
> -     write_cr4(read_cr4() | X86_CR4_PGE);
> +     if (pge)
> +             write_cr4(read_cr4() | X86_CR4_PGE);
> +     else if (use_invpcid)
> +             invpcid_flush_all();
> +     else
> +             asm volatile( "mov %0, %%cr3" : : "r" (read_cr3()) : "memory" );
>  
>       spin_unlock(&set_atomicity_lock);
>  }
> @@ -441,14 +455,15 @@ static void generic_set_all(void)
>  {
>       unsigned long mask, count;
>       unsigned long flags;
> +     bool pge;
>  
>       local_irq_save(flags);
> -     prepare_set();
> +     pge = prepare_set();
>  
>       /* Actually set the state */
>       mask = set_mtrr_state();
>  
> -     post_set();
> +     post_set(pge);
>       local_irq_restore(flags);
>  
>       /*  Use the atomic bitops to update the global mask  */
> @@ -457,7 +472,6 @@ static void generic_set_all(void)
>                       set_bit(count, &smp_changes_mask);
>               mask >>= 1;
>       }
> -     
>  }
>  
>  static void generic_set_mtrr(unsigned int reg, unsigned long base,
> @@ -474,11 +488,12 @@ static void generic_set_mtrr(unsigned int reg, unsigned 
> long base,
>  {
>       unsigned long flags;
>       struct mtrr_var_range *vr;
> +     bool pge;
>  
>       vr = &mtrr_state.var_ranges[reg];
>  
>       local_irq_save(flags);
> -     prepare_set();
> +     pge = prepare_set();
>  
>       if (size == 0) {
>               /* The invalid bit is kept in the mask, so we simply clear the
> @@ -499,7 +514,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned 
> long base,
>               mtrr_wrmsr(MSR_IA32_MTRR_PHYSMASK(reg), vr->mask);
>       }
>  
> -     post_set();
> +     post_set(pge);
>       local_irq_restore(flags);
>  }
>  
> diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
> index 38cedf3b22..f793b70696 100644
> --- a/xen/arch/x86/flushtlb.c
> +++ b/xen/arch/x86/flushtlb.c
> @@ -10,6 +10,7 @@
>  #include <xen/sched.h>
>  #include <xen/softirq.h>
>  #include <asm/flushtlb.h>
> +#include <asm/invpcid.h>
>  #include <asm/page.h>
>  
>  /* Debug builds: Wrap frequently to stress-test the wrap logic. */
> @@ -71,6 +72,23 @@ static void post_flush(u32 t)
>      this_cpu(tlbflush_time) = t;
>  }
>  
> +static void do_tlb_flush(void)
> +{
> +    u32 t = pre_flush();
> +
> +    if ( use_invpcid )
> +        invpcid_flush_all();
> +    else
> +    {
> +        unsigned long cr4 = read_cr4();
> +
> +        write_cr4(cr4 ^ X86_CR4_PGE);
> +        write_cr4(cr4);
> +    }
> +
> +    post_flush(t);
> +}
> +
>  void write_cr3(unsigned long cr3)
>  {
>      unsigned long flags, cr4;
> @@ -118,16 +136,7 @@ unsigned int flush_area_local(const void *va, unsigned 
> int flags)
>                             : : "m" (*(const char *)(va)) : "memory" );
>          }
>          else
> -        {
> -            u32 t = pre_flush();
> -            unsigned long cr4 = read_cr4();
> -
> -            write_cr4(cr4 & ~X86_CR4_PGE);
> -            barrier();
> -            write_cr4(cr4);
> -
> -            post_flush(t);
> -        }
> +            do_tlb_flush();
>      }
>  
>      if ( flags & FLUSH_CACHE )
> diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
> index e83d3b4f07..0a27402e4d 100644
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -63,6 +63,11 @@ boolean_param("nosmp", opt_nosmp);
>  static unsigned int __initdata max_cpus;
>  integer_param("maxcpus", max_cpus);
>  
> +/* opt_invpcid: If false, don't use INVPCID instruction even if available. */
> +static bool __initdata opt_invpcid = true;
> +boolean_param("invpcid", opt_invpcid);
> +bool use_invpcid;

__read_mostly.

~Andrew

> +
>  unsigned long __read_mostly cr4_pv32_mask;
>  
>  /* **** Linux config option: propagated to domain0. */
> @@ -1549,6 +1554,9 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>      if ( cpu_has_fsgsbase )
>          set_in_cr4(X86_CR4_FSGSBASE);
>  
> +    if ( opt_invpcid && cpu_has_invpcid )
> +        use_invpcid = true;
> +
>      init_speculation_mitigations();
>  
>      init_idle_domain();
> diff --git a/xen/include/asm-x86/invpcid.h b/xen/include/asm-x86/invpcid.h
> index b46624a865..edd8b68706 100644
> --- a/xen/include/asm-x86/invpcid.h
> +++ b/xen/include/asm-x86/invpcid.h
> @@ -3,6 +3,8 @@
>  
>  #include <xen/types.h>
>  
> +extern bool use_invpcid;
> +
>  #define INVPCID_TYPE_INDIV_ADDR      0
>  #define INVPCID_TYPE_SINGLE_CTXT     1
>  #define INVPCID_TYPE_ALL_INCL_GLOBAL 2


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.