[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v3 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode


  • To: "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>
  • From: Jan Beulich <jbeulich@xxxxxxxx>
  • Date: Wed, 25 Jan 2023 16:27:06 +0100
  • Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=suse.com; dmarc=pass action=none header.from=suse.com; dkim=pass header.d=suse.com; arc=none
  • Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=uPkRfwew94fNX+zunfbWzw+IG6xaC8yyVufkSJVeAIQ=; b=V5FJQnfdXphepJF44Tn1IE7c8gG5SWAnNgWCqfha7qxuTffv2Jf33tbZ0aFqiSjwVbPRonNpBQeaIqckfqL9eb7u/ElCojlm4eoxSvSJEk6kCgDYAm/LDAI2+hIBL5JhAjqLWPmHJaQwUnzjUzb0IUEs24zMeDrylZsSCiBgqtKoc1gHjzrPohZYli0FrWz8ET5LnIZfXStB/5TXFim3fD6Wx/Mt9e2lqtLvi6dDUd+irNhuo6W8zbro8UNU4msBrAWUU3knZOveqdl3tyvnNUzPaHSiVMRxEuOPbVrvp4uBkIgkmCOCEX9szlCyfCUqlpvuLITQQbMdjklDpfDvgg==
  • Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=EmrsoaMcuaqEvysV4AVdeOJZhpbqg4/Wm77DCxX8k7qFrUudr3j0eWi8H7H6nodf+kvODhQsoF4JlM9U4ZkkV6kqw7iKraffDW+xBzk56jzF5z1JXPkzLUK29aaavnyJZ2JKimhq98e2KGhwz3RcHXDDDqJOqAytpFp2sVzF2x8tp0cpSWiWX+dkoolUIJFpuuoGRPzzcHfC5U1X+3ig0zUjtrnXMevGy2zsGIoF1o9MVfWJHFTg3hr9RXVYGJRGk9NAtZ18pSPS1u7JvqxMwOy0dp9+N1O5agWgJIBFEw7V79xVbsLjC4JMfjq2cKKtCo1GRUPiGVk8kh/w+S/iFA==
  • Authentication-results: dkim=none (message not signed) header.d=none;dmarc=none action=none header.from=suse.com;
  • Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Wei Liu <wl@xxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>
  • Delivery-date: Wed, 25 Jan 2023 15:27:24 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

Since both kernel and user mode run in ring 3, they run in the same
"predictor mode". While the kernel could take care of this itself, doing
so would be yet another item distinguishing PV from native. Additionally
we're in a much better position to issue the barrier command, and we can
save a #GP (for privileged instruction emulation) this way.

To allow to recover performance, introduce a new VM assist allowing the
guest kernel to suppress this barrier. Make availability of the assist
dependent upon the command line control, such that kernels have a way to
know whether their request actually took any effect.

Note that because of its use in PV64_VM_ASSIST_MASK, the declaration of
opt_ibpb_mode_switch can't live in asm/spec_ctrl.h.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
Is the placement of the clearing of opt_ibpb_ctxt_switch correct in
parse_spec_ctrl()? Shouldn't it live ahead of the "disable_common"
label, as being about guest protection, not Xen's?

Adding setting of the variable to the "pv" sub-case in parse_spec_ctrl()
didn't seem quite right to me, considering that we default it to the
opposite of opt_ibpb_entry_pv.
---
v3: Leverage exit-IBPB. Introduce separate command line control.
v2: Leverage entry-IBPB. Add VM assist. Re-base.

--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -2315,8 +2315,8 @@ By default SSBD will be mitigated at run
 ### spec-ctrl (x86)
 > `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>,
 >              {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>,
->              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd,
->              eager-fpu,l1d-flush,branch-harden,srb-lock,
+>              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ibpb-mode-switch,
+>              ssbd,psfd,eager-fpu,l1d-flush,branch-harden,srb-lock,
 >              unpriv-mmio}=<bool> ]`
 
 Controls for speculative execution sidechannel mitigations.  By default, Xen
@@ -2398,7 +2398,10 @@ default.
 
 On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=`
 option can be used to force (the default) or prevent Xen from issuing branch
-prediction barriers on vcpu context switches.
+prediction barriers on vcpu context switches.  On such hardware the
+`ibpb-mode-switch` option can be used to control whether, by default, Xen
+would issue branch prediction barriers when 64-bit PV guests switch from
+user to kernel mode.  If enabled, guest kernels can op out of this behavior.
 
 On all hardware, the `eager-fpu=` option can be used to force or prevent Xen
 from using fully eager FPU context switches.  This is currently implemented as
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -742,6 +742,8 @@ static inline void pv_inject_sw_interrup
     pv_inject_event(&event);
 }
 
+extern int8_t opt_ibpb_mode_switch;
+
 #define PV32_VM_ASSIST_MASK ((1UL << VMASST_TYPE_4gb_segments)        | \
                              (1UL << VMASST_TYPE_4gb_segments_notify) | \
                              (1UL << VMASST_TYPE_writable_pagetables) | \
@@ -753,7 +755,9 @@ static inline void pv_inject_sw_interrup
  * but we can't make such requests fail all of the sudden.
  */
 #define PV64_VM_ASSIST_MASK (PV32_VM_ASSIST_MASK                      | \
-                             (1UL << VMASST_TYPE_m2p_strict))
+                             (1UL << VMASST_TYPE_m2p_strict)          | \
+                             ((opt_ibpb_mode_switch + 0UL) <<           \
+                              VMASST_TYPE_mode_switch_no_ibpb))
 #define HVM_VM_ASSIST_MASK  (1UL << VMASST_TYPE_runstate_update_flag)
 
 #define arch_vm_assist_valid_mask(d) \
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -455,6 +455,7 @@ static void _toggle_guest_pt(struct vcpu
 void toggle_guest_mode(struct vcpu *v)
 {
     const struct domain *d = v->domain;
+    struct cpu_info *cpu_info = get_cpu_info();
     unsigned long gs_base;
 
     ASSERT(!is_pv_32bit_vcpu(v));
@@ -467,15 +468,21 @@ void toggle_guest_mode(struct vcpu *v)
     if ( v->arch.flags & TF_kernel_mode )
         v->arch.pv.gs_base_kernel = gs_base;
     else
+    {
         v->arch.pv.gs_base_user = gs_base;
+
+        if ( opt_ibpb_mode_switch &&
+             !(d->arch.spec_ctrl_flags & SCF_entry_ibpb) &&
+             !VM_ASSIST(d, mode_switch_no_ibpb) )
+            cpu_info->spec_ctrl_flags |= SCF_exit_ibpb;
+    }
+
     asm volatile ( "swapgs" );
 
     _toggle_guest_pt(v);
 
     if ( d->arch.pv.xpti )
     {
-        struct cpu_info *cpu_info = get_cpu_info();
-
         cpu_info->root_pgt_changed = true;
         cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) |
                            (d->arch.pv.pcid ? get_pcid_bits(v, true) : 0);
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -60,6 +60,7 @@ bool __ro_after_init opt_ssbd;
 int8_t __initdata opt_psfd = -1;
 
 int8_t __ro_after_init opt_ibpb_ctxt_switch = -1;
+int8_t __ro_after_init opt_ibpb_mode_switch = -1;
 int8_t __read_mostly opt_eager_fpu = -1;
 int8_t __read_mostly opt_l1d_flush = -1;
 static bool __initdata opt_branch_harden = true;
@@ -111,6 +112,8 @@ static int __init cf_check parse_spec_ct
             if ( opt_pv_l1tf_domu < 0 )
                 opt_pv_l1tf_domu = 0;
 
+            opt_ibpb_mode_switch = 0;
+
             if ( opt_tsx == -1 )
                 opt_tsx = -3;
 
@@ -271,6 +274,8 @@ static int __init cf_check parse_spec_ct
         /* Misc settings. */
         else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 )
             opt_ibpb_ctxt_switch = val;
+        else if ( (val = parse_boolean("ibpb-mode-switch", s, ss)) >= 0 )
+            opt_ibpb_mode_switch = val;
         else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 )
             opt_eager_fpu = val;
         else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
@@ -527,7 +532,7 @@ static void __init print_details(enum in
 
 #endif
 #ifdef CONFIG_PV
-    printk("  Support for PV VMs:%s%s%s%s%s%s\n",
+    printk("  Support for PV VMs:%s%s%s%s%s%s%s\n",
            (boot_cpu_has(X86_FEATURE_SC_MSR_PV) ||
             boot_cpu_has(X86_FEATURE_SC_RSB_PV) ||
             boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ||
@@ -536,7 +541,8 @@ static void __init print_details(enum in
            boot_cpu_has(X86_FEATURE_SC_RSB_PV)       ? " RSB"           : "",
            opt_eager_fpu                             ? " EAGER_FPU"     : "",
            opt_md_clear_pv                           ? " MD_CLEAR"      : "",
-           boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV)   ? " IBPB-entry"    : "");
+           boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV)   ? " IBPB-entry"    : "",
+           opt_ibpb_mode_switch                      ? " IBPB-mode-switch" : 
"");
 
     printk("  XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n",
            opt_xpti_hwdom ? "enabled" : "disabled",
@@ -804,7 +810,8 @@ static void __init ibpb_calculations(voi
     /* Check we have hardware IBPB support before using it... */
     if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) )
     {
-        opt_ibpb_entry_hvm = opt_ibpb_entry_pv = opt_ibpb_ctxt_switch = 0;
+        opt_ibpb_entry_hvm = opt_ibpb_entry_pv = 0;
+        opt_ibpb_mode_switch = opt_ibpb_ctxt_switch = 0;
         opt_ibpb_entry_dom0 = false;
         return;
     }
@@ -859,6 +866,18 @@ static void __init ibpb_calculations(voi
         setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_PV);
         setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_HVM);
     }
+
+#ifdef CONFIG_PV
+    /*
+     * If we're using IBPB-on-entry to protect against PV guests, then
+     * there's no need to also issue IBPB on a guest user->kernel switch.
+     */
+    if ( opt_ibpb_mode_switch == -1 )
+        opt_ibpb_mode_switch = !opt_ibpb_entry_pv ||
+                               (!opt_ibpb_entry_dom0 && !opt_dom0_pvh);
+    if ( opt_ibpb_mode_switch )
+        setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_PV);
+#endif
 }
 
 /* Calculate whether this CPU is vulnerable to L1TF. */
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -554,6 +554,16 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
  */
 #define VMASST_TYPE_m2p_strict           32
 
+/*
+ * x86-64 guests: Suppress IBPB on guest-user to guest-kernel mode switch.
+ *
+ * By default (on affected and capable hardware) as a safety measure Xen,
+ * to cover for the fact that guest-kernel and guest-user modes are both
+ * running in ring 3 (and hence share prediction context), would issue a
+ * barrier for user->kernel mode switches of PV guests.
+ */
+#define VMASST_TYPE_mode_switch_no_ibpb  33
+
 #if __XEN_INTERFACE_VERSION__ < 0x00040600
 #define MAX_VMASST_TYPE                  3
 #endif




 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.