[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen staging-4.7] x86/spec-ctrl: Calculate safe PTE addresses for L1TF mitigations



commit 61c4360284826caba58b9ea5d547fa0fa7e28795
Author:     Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
AuthorDate: Wed Jul 25 12:10:19 2018 +0000
Commit:     Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
CommitDate: Tue Aug 14 17:28:09 2018 +0100

    x86/spec-ctrl: Calculate safe PTE addresses for L1TF mitigations
    
    Safe PTE addresses for L1TF mitigations are ones which are within the L1D
    address width (may be wider than reported in CPUID), and above the highest
    cacheable RAM/NVDIMM/BAR/etc.
    
    All logic here is best-effort heuristics, which should in practice be fine for
    most hardware.  Future work will see about disentangling the SRAT handling
    further, as well as having L0 pass this information down to lower levels when
    virtualised.
    
    This is part of XSA-273 / CVE-2018-3620.
    
    Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    (cherry picked from commit b03a57c9383b32181e60add6b6de12b473652aa4)
---
 xen/arch/x86/setup.c            |  12 ++++
 xen/arch/x86/spec_ctrl.c        | 153 ++++++++++++++++++++++++++++++++++++++++
 xen/arch/x86/srat.c             |   8 ++-
 xen/common/efi/boot.c           |  12 ++++
 xen/include/asm-x86/spec_ctrl.h |   7 ++
 5 files changed, 190 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 717e85006b..0072de75f9 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -819,6 +819,18 @@ void __init noreturn __start_xen(unsigned long mbi_p)
     /* Sanitise the raw E820 map to produce a final clean version. */
     max_page = raw_max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr);
 
+    if ( !efi_enabled )
+    {
+        /*
+         * Supplement the heuristics in l1tf_calculations() by assuming that
+         * anything referenced in the E820 may be cacheable.
+         */
+        l1tf_safe_maddr =
+            max(l1tf_safe_maddr,
+                ROUNDUP(e820_raw[e820_raw_nr - 1].addr +
+                        e820_raw[e820_raw_nr - 1].size, PAGE_SIZE));
+    }
+
     /* Create a temporary copy of the E820 map. */
     memcpy(&boot_e820, &e820, sizeof(e820));
 
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 6b59da675f..0883610373 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -50,6 +50,10 @@ bool_t __initdata bsp_delay_spec_ctrl;
 uint8_t __read_mostly default_xen_spec_ctrl;
 uint8_t __read_mostly default_spec_ctrl_flags;
 
+paddr_t __read_mostly l1tf_addr_mask, __read_mostly l1tf_safe_maddr;
+static bool_t __initdata cpu_has_bug_l1tf;
+static unsigned int __initdata l1d_maxphysaddr;
+
 static int __init parse_bti(const char *s)
 {
     const char *ss;
@@ -404,6 +408,153 @@ static bool_t __init should_use_eager_fpu(void)
     }
 }
 
+/* Calculate L1TF vulnerability, plus l1tf_addr_mask and l1tf_safe_maddr. */
+static __init void l1tf_calculations(uint64_t caps)
+{
+    bool_t hit_default = 0;
+
+    l1d_maxphysaddr = paddr_bits;
+
+    /* L1TF is only known to affect Intel Family 6 processors at this time. */
+    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+         boot_cpu_data.x86 == 6 )
+    {
+        switch ( boot_cpu_data.x86_model )
+        {
+            /*
+             * Core processors since at least Penryn are vulnerable.
+             */
+        case 0x17: /* Penryn */
+        case 0x1d: /* Dunnington */
+            cpu_has_bug_l1tf = 1;
+            break;
+
+        case 0x1f: /* Auburndale / Havendale */
+        case 0x1e: /* Nehalem */
+        case 0x1a: /* Nehalem EP */
+        case 0x2e: /* Nehalem EX */
+        case 0x25: /* Westmere */
+        case 0x2c: /* Westmere EP */
+        case 0x2f: /* Westmere EX */
+            cpu_has_bug_l1tf = 1;
+            l1d_maxphysaddr = 44;
+            break;
+
+        case 0x2a: /* SandyBridge */
+        case 0x2d: /* SandyBridge EP/EX */
+        case 0x3a: /* IvyBridge */
+        case 0x3e: /* IvyBridge EP/EX */
+        case 0x3c: /* Haswell */
+        case 0x3f: /* Haswell EX/EP */
+        case 0x45: /* Haswell D */
+        case 0x46: /* Haswell H */
+        case 0x3d: /* Broadwell */
+        case 0x47: /* Broadwell H */
+        case 0x4f: /* Broadwell EP/EX */
+        case 0x56: /* Broadwell D */
+        case 0x4e: /* Skylake M */
+        case 0x55: /* Skylake X */
+        case 0x5e: /* Skylake D */
+        case 0x66: /* Cannonlake */
+        case 0x67: /* Cannonlake? */
+        case 0x8e: /* Kabylake M */
+        case 0x9e: /* Kabylake D */
+            cpu_has_bug_l1tf = 1;
+            l1d_maxphysaddr = 46;
+            break;
+
+            /*
+             * Atom processors are not vulnerable.
+             */
+        case 0x1c: /* Pineview */
+        case 0x26: /* Lincroft */
+        case 0x27: /* Penwell */
+        case 0x35: /* Cloverview */
+        case 0x36: /* Cedarview */
+        case 0x37: /* Baytrail / Valleyview (Silvermont) */
+        case 0x4d: /* Avoton / Rangely (Silvermont) */
+        case 0x4c: /* Cherrytrail / Braswell */
+        case 0x4a: /* Merrifield */
+        case 0x5a: /* Moorefield */
+        case 0x5c: /* Goldmont */
+        case 0x5f: /* Denverton */
+        case 0x7a: /* Gemini Lake */
+            break;
+
+            /*
+             * Knights processors are not vulnerable.
+             */
+        case 0x57: /* Knights Landing */
+        case 0x85: /* Knights Mill */
+            break;
+
+        default:
+            /* Defer printk() until we've accounted for RDCL_NO. */
+            hit_default = 1;
+            cpu_has_bug_l1tf = 1;
+            break;
+        }
+    }
+
+    /* Any processor advertising RDCL_NO should not be vulnerable to L1TF. */
+    if ( caps & ARCH_CAPABILITIES_RDCL_NO )
+        cpu_has_bug_l1tf = 0;
+
+    if ( cpu_has_bug_l1tf && hit_default )
+        printk("Unrecognised CPU model %#x - assuming vulnerable to L1TF\n",
+               boot_cpu_data.x86_model);
+
+    /*
+     * L1TF safe address heuristics.  These apply to the real hardware we are
+     * running on, and are best-effort-only if Xen is virtualised.
+     *
+     * The address mask which the L1D cache uses, which might be wider than
+     * the CPUID-reported maxphysaddr.
+     */
+    l1tf_addr_mask = ((1ul << l1d_maxphysaddr) - 1) & PAGE_MASK;
+
+    /*
+     * To be safe, l1tf_safe_maddr must be above the highest cacheable entity
+     * in system physical address space.  However, to preserve space for
+     * paged-out metadata, it should be as low as possible above the highest
+     * cacheable address, so as to require fewer high-order bits being set.
+     *
+     * These heuristics are based on some guesswork to improve the likelihood
+     * of safety in the common case, including Linux's L1TF mitigation of
+     * inverting all address bits in a non-present PTE.
+     *
+     * - If L1D is wider than CPUID (Nehalem and later mobile/desktop/low end
+     *   server), setting any address bit beyond CPUID maxphysaddr guarantees
+     *   to make the PTE safe.  This case doesn't require all the high-order
+     *   bits being set, and doesn't require any other source of information
+     *   for safety.
+     *
+     * - If L1D is the same as CPUID (Pre-Nehalem, or high end server), we
+     *   must sacrifice high order bits from the real address space for
+     *   safety.  Therefore, make a blind guess that there is nothing
+     *   cacheable in the top quarter of physical address space.
+     *
+     *   It is exceedingly unlikely for machines to be populated with this
+     *   much RAM (likely 512G on pre-Nehalem, 16T on Nehalem/Westmere, 64T on
+     *   Sandybridge and later) due to the sheer volume of DIMMs this would
+     *   actually take.
+     *
+     *   However, it is possible to find machines this large, so the "top
+     *   quarter" guess is supplemented to push the limit higher if references
+     *   to cacheable mappings (E820/SRAT/EFI/etc) are found above the top
+     *   quarter boundary.
+     *
+     *   Finally, this top quarter guess gives us a good chance of being safe
+     *   when running virtualised (and the CPUID maxphysaddr hasn't been
+     *   levelled for heterogeneous migration safety), where the safety
+     *   consideration is still in terms of host details, but all E820/etc
+     *   information is in terms of guest physical layout.
+     */
+    l1tf_safe_maddr = max(l1tf_safe_maddr, ((l1d_maxphysaddr > paddr_bits)
+                                            ? (1ul << paddr_bits)
+                                            : (3ul << (paddr_bits - 2))));
+}
+
 int8_t __read_mostly opt_xpti = -1;
 
 static __init void xpti_init_default(uint64_t caps)
@@ -617,6 +768,8 @@ void __init init_speculation_mitigations(void)
     else
         setup_clear_cpu_cap(X86_FEATURE_NO_XPTI);
 
+    l1tf_calculations(caps);
+
     print_details(thunk, caps);
 
     /*
diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
index 72ad27dd86..42396cc92a 100644
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -20,6 +20,7 @@
 #include <xen/pfn.h>
 #include <asm/e820.h>
 #include <asm/page.h>
+#include <asm/spec_ctrl.h>
 
 static struct acpi_table_slit *__read_mostly acpi_slit;
 
@@ -288,6 +289,11 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
        if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
                return;
 
+       start = ma->base_address;
+       end = start + ma->length;
+       /* Supplement the heuristics in l1tf_calculations(). */
+       l1tf_safe_maddr = max(l1tf_safe_maddr, ROUNDUP(end, PAGE_SIZE));
+
        if (num_node_memblks >= NR_NODE_MEMBLKS)
        {
                dprintk(XENLOG_WARNING,
@@ -296,8 +302,6 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
                return;
        }
 
-       start = ma->base_address;
-       end = start + ma->length;
        pxm = ma->proximity_domain;
        if (srat_rev < 2)
                pxm &= 0xff;
diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c
index 125c9ce64a..d83635ee4e 100644
--- a/xen/common/efi/boot.c
+++ b/xen/common/efi/boot.c
@@ -1153,6 +1153,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
 
 #ifndef CONFIG_ARM /* TODO - runtime service support */
 
+#include <asm/spec_ctrl.h>
+
 static bool_t __initdata efi_rs_enable = 1;
 static bool_t __initdata efi_map_uc;
 
@@ -1254,6 +1256,16 @@ void __init efi_init_memory(void)
                desc->PhysicalStart, desc->PhysicalStart + len - 1,
                desc->Type, desc->Attribute);
 
+        if ( (desc->Attribute & (EFI_MEMORY_WB | EFI_MEMORY_WT)) ||
+             (efi_bs_revision >= EFI_REVISION(2, 5) &&
+              (desc->Attribute & EFI_MEMORY_WP)) )
+        {
+            /* Supplement the heuristics in l1tf_calculations(). */
+            l1tf_safe_maddr =
+                max(l1tf_safe_maddr,
+                    ROUNDUP(desc->PhysicalStart + len, PAGE_SIZE));
+        }
+
         if ( !efi_rs_enable ||
              (!(desc->Attribute & EFI_MEMORY_RUNTIME) &&
               (!map_bs ||
diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
index 830db3ee9c..64a242bda1 100644
--- a/xen/include/asm-x86/spec_ctrl.h
+++ b/xen/include/asm-x86/spec_ctrl.h
@@ -38,6 +38,13 @@ extern int8_t opt_xpti;
 #define OPT_XPTI_DOM0  0x01
 #define OPT_XPTI_DOMU  0x02
 
+/*
+ * The L1D address mask, which might be wider than reported in CPUID, and the
+ * system physical address above which there are believed to be no cacheable
+ * memory regions, thus unable to leak data via the L1TF vulnerability.
+ */
+extern paddr_t l1tf_addr_mask, l1tf_safe_maddr;
+
 static inline void init_shadow_spec_ctrl_state(void)
 {
     struct cpu_info *info = get_cpu_info();
--
generated by git-patchbot for /home/xen/git/xen.git#staging-4.7

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.