[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] x86/spec-ctrl: Scrub stale segment registers on leaky hardware



Intel Core/Xeon CPUs have two registers per architectural segment register, to
allow for sufficient speculation to cover a typical context switch (one write
to each segment).  Unfortunately, these CPUs speculate over a faulting
descriptor load, and for a period of time, operate with the stale segment.

This in practice allows one vcpu to access the previous vcpu's segment
registers.  These data don't contain secrets, so no CVE has been assigned, but
it is potentially useful information for mounting a different attack.

On context switch, write each data segment register twice to scrub any state
from the previous vcpu.  Other properties of %cs/%ss loads prevent those
segments from being leaky.

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
CC: Jan Beulich <JBeulich@xxxxxxxx>
CC: Wei Liu <wl@xxxxxxx>
CC: Roger Pau Monné <roger.pau@xxxxxxxxxx>

RFC for now.  I've demonstrated leakage when I posion in Xen and recover in an
HVM guest.  However, I've failed to demonstrate leakage between two HVM
guests, despite knowing that it is possible.  There may be other mitigating
factors in our context switch path which I've yet to spot.
---
 docs/misc/xen-command-line.pandoc |  8 +++-
 xen/arch/x86/domain.c             | 23 ++++++++++
 xen/arch/x86/hvm/vmx/vmx.c        | 28 ++++++++++++
 xen/arch/x86/spec_ctrl.c          | 92 ++++++++++++++++++++++++++++++++++++++-
 xen/include/asm-x86/spec_ctrl.h   |  1 +
 5 files changed, 150 insertions(+), 2 deletions(-)

diff --git a/docs/misc/xen-command-line.pandoc 
b/docs/misc/xen-command-line.pandoc
index 7c72e31032..1c003495a5 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -1914,7 +1914,7 @@ By default SSBD will be mitigated at runtime (i.e 
`ssbd=runtime`).
 ### spec-ctrl (x86)
 > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>,
 >              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu,
->              l1d-flush,l1tf-barrier}=<bool> ]`
+>              l1d-flush,stale-seg-clear,l1tf-barrier}=<bool> ]`
 
 Controls for speculative execution sidechannel mitigations.  By default, Xen
 will pick the most appropriate mitigations based on compiled in support,
@@ -1986,6 +1986,12 @@ Irrespective of Xen's setting, the feature is 
virtualised for HVM guests to
 use.  By default, Xen will enable this mitigation on hardware believed to be
 vulnerable to L1TF.
 
+On all hardware, the `stale-seg-clear=` option can be used to force or prevent
+Xen from clearing the microarchitectural segment register copies on context
+switch.  By default, Xen will choose to use stale segment clearing on affected
+hardware.  The clearing logic is tuned to microarchitectural details of the
+affected CPUs.
+
 On hardware vulnerable to L1TF, the `l1tf-barrier=` option can be used to force
 or prevent Xen from protecting evaluations inside the hypervisor with a barrier
 instruction to not load potentially secret information into L1 cache.  By
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 612afb683f..39c3258893 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1328,6 +1328,29 @@ static void load_segments(struct vcpu *n)
     dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
     per_cpu(dirty_segment_mask, cpu) = 0;
 
+    /*
+     * CPUs which suffer from stale segment register leakage have two copies
+     * of each segment register [Correct at the time of writing - Aug 2019].
+     *
+     * We must write to both of them to scrub state from the previous vcpu.
+     * However, two writes in quick succession stall the pipeline, as only one
+     * write per segment can be speculatively outstanding.
+     *
+     * Split the two sets of writes to each register to maximise the chance
+     * that these writes have retired before the second set starts, thus
+     * reducing the chance of stalling.
+     */
+    if ( opt_stale_seg_clear )
+    {
+        asm volatile ( "mov %[sel], %%ds;"
+                       "mov %[sel], %%es;"
+                       "mov %[sel], %%fs;"
+                       "mov %[sel], %%gs;"
+                       :: [sel] "r" (0) );
+        /* Force a reload of all segments to be the second scrubbing write. */
+        dirty_segment_mask = ~0;
+    }
+
 #ifdef CONFIG_HVM
     if ( cpu_has_svm && !is_pv_32bit_vcpu(n) &&
          !(read_cr4() & X86_CR4_FSGSBASE) && !((uregs->fs | uregs->gs) & ~3) )
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 0060310d74..bf544fc2ca 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -57,6 +57,7 @@
 #include <asm/event.h>
 #include <asm/mce.h>
 #include <asm/monitor.h>
+#include <asm/spec_ctrl.h>
 #include <public/arch-x86/cpuid.h>
 
 static bool_t __initdata opt_force_ept;
@@ -872,11 +873,38 @@ static void vmx_ctxt_switch_from(struct vcpu *v)
 
 static void vmx_ctxt_switch_to(struct vcpu *v)
 {
+    /*
+     * CPUs which suffer from stale segment register leakage have two copies
+     * of each segment register [Correct at the time of writing - Aug 2019].
+     *
+     * We must write to both of them to scrub state from the previous vcpu.
+     * However, two writes in quick succession stall the pipeline, as only one
+     * write per segment can be speculatively outstanding.
+     *
+     * Split the two sets of writes to each register to maximise the chance
+     * that these writes have retired before the second set starts, thus
+     * reducing the chance of stalling.
+     */
+    if ( opt_stale_seg_clear )
+        asm volatile ( "mov %[sel], %%ds;"
+                       "mov %[sel], %%es;"
+                       "mov %[sel], %%fs;"
+                       "mov %[sel], %%gs;"
+                       :: [sel] "r" (0) );
+
     vmx_restore_guest_msrs(v);
     vmx_restore_dr(v);
 
     if ( v->domain->arch.hvm.pi_ops.flags & PI_CSW_TO )
         vmx_pi_switch_to(v);
+
+    /* Should be last in the function.  See above. */
+    if ( opt_stale_seg_clear )
+        asm volatile ( "mov %[sel], %%ds;"
+                       "mov %[sel], %%es;"
+                       "mov %[sel], %%fs;"
+                       "mov %[sel], %%gs;"
+                       :: [sel] "r" (0) );
 }
 
 
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 468a847598..33a0a0f089 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -66,6 +66,9 @@ static unsigned int __initdata l1d_maxphysaddr;
 static bool __initdata cpu_has_bug_msbds_only; /* => minimal HT impact. */
 static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS 
combination. */
 
+static bool __initdata cpu_has_bug_stale_seg;
+int8_t opt_stale_seg_clear = -1;
+
 static int __init parse_spec_ctrl(const char *s)
 {
     const char *ss;
@@ -111,6 +114,7 @@ static int __init parse_spec_ctrl(const char *s)
             opt_ibpb = false;
             opt_ssbd = false;
             opt_l1d_flush = 0;
+            opt_stale_seg_clear = 0;
         }
         else if ( val > 0 )
             rc = -EINVAL;
@@ -175,6 +179,8 @@ static int __init parse_spec_ctrl(const char *s)
             opt_eager_fpu = val;
         else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
             opt_l1d_flush = val;
+        else if ( (val = parse_boolean("stale-seg-clear", s, ss)) >= 0 )
+            opt_stale_seg_clear = val;
         else if ( (val = parse_boolean("l1tf-barrier", s, ss)) >= 0 )
             opt_l1tf_barrier = val;
         else
@@ -337,7 +343,7 @@ static void __init print_details(enum ind_thunk thunk, 
uint64_t caps)
                "\n");
 
     /* Settings for Xen's protection, irrespective of guests. */
-    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s%s\n",
+    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s%s%s\n",
            thunk == THUNK_NONE      ? "N/A" :
            thunk == THUNK_RETPOLINE ? "RETPOLINE" :
            thunk == THUNK_LFENCE    ? "LFENCE" :
@@ -349,6 +355,7 @@ static void __init print_details(enum ind_thunk thunk, 
uint64_t caps)
            opt_ibpb                                  ? " IBPB"  : "",
            opt_l1d_flush                             ? " L1D_FLUSH" : "",
            opt_md_clear_pv || opt_md_clear_hvm       ? " VERW"  : "",
+           opt_stale_seg_clear                       ? " SEG-CLEAR" : "",
            opt_l1tf_barrier                          ? " L1TF_BARRIER" : "");
 
     /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
@@ -864,6 +871,83 @@ static __init void mds_calculations(uint64_t caps)
     }
 }
 
+/* Calculate whether this CPU leaks segment registers between contexts. */
+static void __init stale_segment_calculations(void)
+{
+    /*
+     * Assume all unrecognised processors are ok.  This is only known to
+     * affect Intel Family 6 processors.
+     */
+    if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+         boot_cpu_data.x86 != 6 )
+        return;
+
+    switch ( boot_cpu_data.x86_model )
+    {
+        /*
+         * Core processors since at least Nehalem are vulnerable.
+         */
+    case 0x1e: /* Nehalem */
+    case 0x1f: /* Auburndale / Havendale */
+    case 0x1a: /* Nehalem EP */
+    case 0x2e: /* Nehalem EX */
+    case 0x25: /* Westmere */
+    case 0x2c: /* Westmere EP */
+    case 0x2f: /* Westmere EX */
+    case 0x2a: /* SandyBridge */
+    case 0x2d: /* SandyBridge EP/EX */
+    case 0x3a: /* IvyBridge */
+    case 0x3e: /* IvyBridge EP/EX */
+    case 0x3c: /* Haswell */
+    case 0x3f: /* Haswell EX/EP */
+    case 0x45: /* Haswell D */
+    case 0x46: /* Haswell H */
+    case 0x3d: /* Broadwell */
+    case 0x47: /* Broadwell H */
+    case 0x4f: /* Broadwell EP/EX */
+    case 0x56: /* Broadwell D */
+    case 0x4e: /* Skylake M */
+    case 0x55: /* Skylake X */
+    case 0x5e: /* Skylake D */
+    case 0x66: /* Cannonlake */
+    case 0x67: /* Cannonlake? */
+    case 0x8e: /* Kabylake M */
+    case 0x9e: /* Kabylake D */
+        cpu_has_bug_stale_seg = true;
+        break;
+
+        /*
+         * Atom processors are not vulnerable.
+         */
+    case 0x1c: /* Pineview */
+    case 0x26: /* Lincroft */
+    case 0x27: /* Penwell */
+    case 0x35: /* Cloverview */
+    case 0x36: /* Cedarview */
+    case 0x37: /* Baytrail / Valleyview (Silvermont) */
+    case 0x4d: /* Avaton / Rangely (Silvermont) */
+    case 0x4c: /* Cherrytrail / Brasswell */
+    case 0x4a: /* Merrifield */
+    case 0x5a: /* Moorefield */
+    case 0x5c: /* Goldmont */
+    case 0x5f: /* Denverton */
+    case 0x7a: /* Gemini Lake */
+        break;
+
+        /*
+         * Knights processors are not vulnerable.
+         */
+    case 0x57: /* Knights Landing */
+    case 0x85: /* Knights Mill */
+        break;
+
+    default:
+        printk("Unrecognised CPU model %#x - assuming vulnerable to 
StaleSeg\n",
+               boot_cpu_data.x86_model);
+        break;
+    }
+}
+
 void __init init_speculation_mitigations(void)
 {
     enum ind_thunk thunk = THUNK_DEFAULT;
@@ -1098,6 +1182,12 @@ void __init init_speculation_mitigations(void)
             "enabled.  Mitigations will not be fully effective.  Please\n"
             "choose an explicit smt=<bool> setting.  See XSA-297.\n");
 
+    stale_segment_calculations();
+
+    /* Scrub segment registers by default on leaky hardware. */
+    if ( opt_stale_seg_clear == -1 )
+        opt_stale_seg_clear = cpu_has_bug_stale_seg;
+
     print_details(thunk, caps);
 
     /*
diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
index 1339ddd7ef..49b755e4ed 100644
--- a/xen/include/asm-x86/spec_ctrl.h
+++ b/xen/include/asm-x86/spec_ctrl.h
@@ -37,6 +37,7 @@ extern bool opt_ibpb;
 extern bool opt_ssbd;
 extern int8_t opt_eager_fpu;
 extern int8_t opt_l1d_flush;
+extern int8_t opt_stale_seg_clear;
 extern int8_t opt_l1tf_barrier;
 
 extern bool bsp_delay_spec_ctrl;
-- 
2.11.0


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.