
[Xen-changelog] [xen master] x86: move syscall trampolines off the stack



commit 7a66ac8d1633297be79fefbf920f0e02c71041d3
Author:     Jan Beulich <jbeulich@xxxxxxxx>
AuthorDate: Fri May 22 10:45:43 2015 +0200
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Fri May 22 10:45:43 2015 +0200

    x86: move syscall trampolines off the stack
    
    This is needed as stacks are going to become non-executable. Use
    separate stub pages (shared among suitable CPUs on the same node)
    instead.
    
    Stub areas (currently 128 bytes each) are split into two parts - a
    fixed-use part (the syscall trampolines) and dynamically usable space,
    which subsequent changes will use to hold dynamically generated code
    during instruction emulation.
    
    While sharing physical pages among certain CPUs on the same node, for
    now the virtual mappings get established in distinct pages for each
    CPU. This isn't a strict requirement, but simplifies VA space
    management for this initial implementation: Sharing VA space would
    require additional tracking of which areas are currently in use. If
    the VA and/or TLB overhead turned out to be a problem, such extra code
    could easily be added.
    
    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    Reviewed-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
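
    For reference, the stub layout arithmetic described above is small
    enough to sketch in full (a sketch only, assuming the usual 4kB pages
    and 64-byte cache lines; the real constants are introduced in config.h
    and processor.h below):

        /* Sketch: stub layout for PAGE_SIZE == 4096, L1_CACHE_SHIFT == 6. */
        #define STUB_BUF_SIZE  128                    /* bytes per CPU */
        #define STUBS_PER_PAGE (4096 / STUB_BUF_SIZE) /* 32 CPUs share a page */

        /* Offset of a CPU's buffer inside the shared physical page: */
        #define STUB_BUF_CPU_OFFS(cpu) \
            (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)

    Half of each buffer is claimed by the syscall trampolines written in
    traps.c (the ASSERT there checks offset <= STUB_BUF_SIZE / 2); the rest
    is the dynamically usable space mentioned above.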
---
 xen/arch/x86/setup.c               |    4 ++
 xen/arch/x86/smpboot.c             |   71 +++++++++++++++++++++++++-
 xen/arch/x86/x86_64/compat/entry.S |   15 +++++-
 xen/arch/x86/x86_64/entry.S        |   24 ++++-----
 xen/arch/x86/x86_64/traps.c        |   98 +++++++++++++++++++----------------
 xen/arch/x86/xen.lds.S             |    3 +
 xen/include/asm-x86/config.h       |    4 ++
 xen/include/asm-x86/page.h         |    8 ++--
 xen/include/asm-x86/processor.h    |   14 +++++-
 xen/include/asm-x86/x86_64/page.h  |    1 +
 10 files changed, 177 insertions(+), 65 deletions(-)

diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 2cc9185..e808ea9 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1271,6 +1271,10 @@ void __init noreturn __start_xen(unsigned long mbi_p)
 
     init_idle_domain();
 
+    this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
+                                           &this_cpu(stubs).mfn);
+    BUG_ON(!this_cpu(stubs.addr));
+
     trap_init();
 
     rcu_init();
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 116c8f8..2289284 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -25,6 +25,7 @@
 #include <xen/kernel.h>
 #include <xen/mm.h>
 #include <xen/domain.h>
+#include <xen/domain_page.h>
 #include <xen/sched.h>
 #include <xen/sched-if.h>
 #include <xen/irq.h>
@@ -603,6 +604,43 @@ static int do_boot_cpu(int apicid, int cpu)
     return rc;
 }
 
+#define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
+
+unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
+{
+    unsigned long stub_va;
+    struct page_info *pg;
+
+    BUILD_BUG_ON(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1));
+
+    if ( *mfn )
+        pg = mfn_to_page(*mfn);
+    else
+    {
+        nodeid_t node = cpu_to_node(cpu);
+        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
+
+        pg = alloc_domheap_page(NULL, memflags);
+        if ( !pg )
+            return 0;
+
+        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
+    }
+
+    stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
+    if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
+                          PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
+    {
+        if ( !*mfn )
+            free_domheap_page(pg);
+        stub_va = 0;
+    }
+    else if ( !*mfn )
+        *mfn = page_to_mfn(pg);
+
+    return stub_va;
+}
+
 void cpu_exit_clear(unsigned int cpu)
 {
     cpu_uninit(cpu);
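
alloc_stub_page() doubles as allocator and mapper: with *mfn == 0 it
allocates a fresh, 0xcc-poisoned domheap page and hands its MFN back; with
*mfn pre-loaded from another CPU it maps that existing page at the new
CPU's own VA. A usage sketch (cpu/buddy are hypothetical variables; the
real caller is cpu_smpboot_alloc() below):

    /* Sketch of both calling conventions. */
    unsigned long mfn = 0;                 /* 0 => allocate a fresh page */
    unsigned long va = alloc_stub_page(cpu, &mfn);  /* fills in mfn */

    mfn = per_cpu(stubs.mfn, buddy);       /* non-zero => map buddy's page */
    va = alloc_stub_page(cpu, &mfn);       /* at this CPU's own VA */

Either call returns 0 on failure.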
@@ -616,6 +654,23 @@ static void cpu_smpboot_free(unsigned int cpu)
     free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
     free_cpumask_var(per_cpu(cpu_core_mask, cpu));
 
+    if ( per_cpu(stubs.addr, cpu) )
+    {
+        unsigned long mfn = per_cpu(stubs.mfn, cpu);
+        unsigned char *stub_page = map_domain_page(mfn);
+        unsigned int i;
+
+        memset(stub_page + STUB_BUF_CPU_OFFS(cpu), 0xcc, STUB_BUF_SIZE);
+        for ( i = 0; i < STUBS_PER_PAGE; ++i )
+            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc )
+                break;
+        unmap_domain_page(stub_page);
+        destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
+                             (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
+        if ( i == STUBS_PER_PAGE )
+            free_domheap_page(mfn_to_page(mfn));
+    }
+
     order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
     free_xenheap_pages(per_cpu(gdt_table, cpu), order);
 
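The teardown above uses the 0xcc fill as a liveness marker: a stub in use
never starts with 0xcc (int3), so once every STUB_BUF_SIZE-aligned slot
begins with the poison byte the page has no users left and can go back to
the heap. Condensed into a predicate (a sketch, not code from the patch):

    /* Sketch: true once no CPU's stub in this page is still live. */
    static bool stub_page_unused(const unsigned char *stub_page)
    {
        unsigned int i;

        for ( i = 0; i < STUBS_PER_PAGE; ++i )
            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc )
                return false;
        return true;
    }
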
@@ -635,9 +690,10 @@ static void cpu_smpboot_free(unsigned int cpu)
 
 static int cpu_smpboot_alloc(unsigned int cpu)
 {
-    unsigned int order, memflags = 0;
+    unsigned int i, order, memflags = 0;
     nodeid_t node = cpu_to_node(cpu);
     struct desc_struct *gdt;
+    unsigned long stub_page;
 
     if ( node != NUMA_NO_NODE )
         memflags = MEMF_node(node);
@@ -667,6 +723,19 @@ static int cpu_smpboot_alloc(unsigned int cpu)
         goto oom;
     memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
 
+    for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
+          i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
+        if ( cpu_online(i) && cpu_to_node(i) == node )
+        {
+            per_cpu(stubs.mfn, cpu) = per_cpu(stubs.mfn, i);
+            break;
+        }
+    BUG_ON(i == cpu);
+    stub_page = alloc_stub_page(cpu, &per_cpu(stubs.mfn, cpu));
+    if ( !stub_page )
+        goto oom;
+    per_cpu(stubs.addr, cpu) = stub_page + STUB_BUF_CPU_OFFS(cpu);
+
     if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
          zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) )
         return 0;
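
The search loop above encodes the sharing rule: two CPUs may use one stub
page only if their IDs fall into the same naturally aligned
STUBS_PER_PAGE-sized block and they sit on the same NUMA node, keeping the
page node-local. As a standalone predicate (hypothetical helper, for
illustration only):

    /* Sketch: may CPUs a and b share one stub page? */
    static bool same_stub_group(unsigned int a, unsigned int b)
    {
        return (a & ~(STUBS_PER_PAGE - 1)) == (b & ~(STUBS_PER_PAGE - 1)) &&
               cpu_to_node(a) == cpu_to_node(b);
    }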
diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
index 5b0af61..46f340b 100644
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -219,7 +219,20 @@ ENTRY(compat_post_handle_exception)
         movb  $0,TRAPBOUNCE_flags(%rdx)
         jmp   compat_test_all_events
 
-ENTRY(compat_syscall)
+/* See lstar_enter for entry register state. */
+ENTRY(cstar_enter)
+        sti
+        movq  8(%rsp),%rax /* Restore %rax. */
+        movq  $FLAT_KERNEL_SS,8(%rsp)
+        pushq %r11
+        pushq $FLAT_USER_CS32
+        pushq %rcx
+        pushq $0
+        SAVE_VOLATILE TRAP_syscall
+        GET_CURRENT(%rbx)
+        movq  VCPU_domain(%rbx),%rcx
+        cmpb  $0,DOMAIN_is_32bit_pv(%rcx)
+        je    switch_to_kernel
         cmpb  $0,VCPU_syscall32_disables_events(%rbx)
         movzwl VCPU_syscall32_sel(%rbx),%esi
         movq  VCPU_syscall32_addr(%rbx),%rax
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 7e63e64..4a37642 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -13,9 +13,8 @@
 #include <public/xen.h>
 #include <irq_vectors.h>
 
-        ALIGN
 /* %rbx: struct vcpu */
-switch_to_kernel:
+ENTRY(switch_to_kernel)
         leaq  VCPU_trap_bounce(%rbx),%rdx
         /* TB_eip = (32-bit syscall && syscall32_addr) ?
          *          syscall32_addr : syscall_addr */
@@ -113,23 +112,22 @@ restore_all_xen:
  * When entering SYSCALL from user mode:
  *  Vector directly to the registered arch.syscall_addr.
  *
- * Initial work is done by per-CPU stack trampolines. At this point %rsp
- * has been initialised to point at the correct Xen stack, and %rsp, %rflags
- * and %cs have been saved. All other registers are still to be saved onto
- * the stack, starting with %rip, and an appropriate %ss must be saved into
- * the space left by the trampoline.
+ * Initial work is done by per-CPU trampolines. At this point %rsp has been
+ * initialised to point at the correct Xen stack, %rsp has been saved, and
+ * %rax needs to be restored from the %ss save slot. All other registers are
+ * still to be saved onto the stack, starting with RFLAGS, and an appropriate
+ * %ss must be saved into the space left by the trampoline.
  */
-ENTRY(syscall_enter)
+ENTRY(lstar_enter)
         sti
-        movl  $FLAT_KERNEL_SS,24(%rsp)
+        movq  8(%rsp),%rax /* Restore %rax. */
+        movq  $FLAT_KERNEL_SS,8(%rsp)
+        pushq %r11
+        pushq $FLAT_KERNEL_CS64
         pushq %rcx
         pushq $0
-        movq  24(%rsp),%r11 /* Re-load user RFLAGS into %r11 before saving */
         SAVE_VOLATILE TRAP_syscall
         GET_CURRENT(%rbx)
-        movq  VCPU_domain(%rbx),%rcx
-        testb $1,DOMAIN_is_32bit_pv(%rcx)
-        jnz   compat_syscall
         testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
         jz    switch_to_kernel
 
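Combining the generated trampoline (see write_stub_trampoline() below) with
the lstar_enter/cstar_enter prologue, the Xen stack ends up holding a
conventional exception-style frame. Relative to stack_bottom (a sketch
derived from the code above):

    /* Stack after the 'pushq $0' in lstar_enter/cstar_enter:
     *   stack_bottom -  8: %ss      FLAT_KERNEL_SS (overwrites the %rax slot)
     *   stack_bottom - 16: %rsp     user stack pointer, pushed by the stub
     *   stack_bottom - 24: %rflags  from %r11, where SYSCALL stashed it
     *   stack_bottom - 32: %cs      FLAT_KERNEL_CS64 or FLAT_USER_CS32
     *   stack_bottom - 40: %rip     from %rcx, where SYSCALL stashed it
     *   stack_bottom - 48: error code (0)
     */
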
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index fc35aba..117a133 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -337,70 +337,78 @@ unsigned long do_iret(void)
     return 0;
 }
 
-static int write_stack_trampoline(
-    char *stack, char *stack_bottom, uint16_t cs_seg)
+static unsigned int write_stub_trampoline(
+    unsigned char *stub, unsigned long stub_va,
+    unsigned long stack_bottom, unsigned long target_va)
 {
-    /* movq %rsp, saversp(%rip) */
-    stack[0] = 0x48;
-    stack[1] = 0x89;
-    stack[2] = 0x25;
-    *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;
-
-    /* leaq saversp(%rip), %rsp */
-    stack[7] = 0x48;
-    stack[8] = 0x8d;
-    stack[9] = 0x25;
-    *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;
-
-    /* pushq %r11 */
-    stack[14] = 0x41;
-    stack[15] = 0x53;
-
-    /* pushq $<cs_seg> */
-    stack[16] = 0x68;
-    *(u32 *)&stack[17] = cs_seg;
-
-    /* movq $syscall_enter,%r11 */
-    stack[21] = 0x49;
-    stack[22] = 0xbb;
-    *(void **)&stack[23] = (void *)syscall_enter;
-
-    /* jmpq *%r11 */
-    stack[31] = 0x41;
-    stack[32] = 0xff;
-    stack[33] = 0xe3;
-
-    return 34;
+    /* movabsq %rax, stack_bottom - 8 */
+    stub[0] = 0x48;
+    stub[1] = 0xa3;
+    *(uint64_t *)&stub[2] = stack_bottom - 8;
+
+    /* movq %rsp, %rax */
+    stub[10] = 0x48;
+    stub[11] = 0x89;
+    stub[12] = 0xe0;
+
+    /* movabsq $stack_bottom - 8, %rsp */
+    stub[13] = 0x48;
+    stub[14] = 0xbc;
+    *(uint64_t *)&stub[15] = stack_bottom - 8;
+
+    /* pushq %rax */
+    stub[23] = 0x50;
+
+    /* jmp target_va */
+    stub[24] = 0xe9;
+    *(int32_t *)&stub[25] = target_va - (stub_va + 29);
+
+    /* Round up to a multiple of 16 bytes. */
+    return 32;
 }
 
+DEFINE_PER_CPU(struct stubs, stubs);
+void lstar_enter(void);
+void cstar_enter(void);
+
 void __devinit subarch_percpu_traps_init(void)
 {
-    char *stack_bottom, *stack;
-
-    stack_bottom = (char *)get_stack_bottom();
-    stack        = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));
+    unsigned long stack_bottom = get_stack_bottom();
+    unsigned long stub_va = this_cpu(stubs.addr);
+    unsigned char *stub_page;
+    unsigned int offset;
 
     /* IST_MAX IST pages + 1 syscall page + 1 guard page + primary stack. */
     BUILD_BUG_ON((IST_MAX + 2) * PAGE_SIZE + PRIMARY_STACK_SIZE > STACK_SIZE);
 
-    /* Trampoline for SYSCALL entry from long mode. */
-    stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */
-    wrmsrl(MSR_LSTAR, (unsigned long)stack);
-    stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64);
+    stub_page = map_domain_page(this_cpu(stubs.mfn));
+
+    /* Trampoline for SYSCALL entry from 64-bit mode. */
+    wrmsrl(MSR_LSTAR, stub_va);
+    offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+                                   stub_va, stack_bottom,
+                                   (unsigned long)lstar_enter);
+    stub_va += offset;
 
     if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
          boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR )
     {
         /* SYSENTER entry. */
-        wrmsrl(MSR_IA32_SYSENTER_ESP, (unsigned long)stack_bottom);
+        wrmsrl(MSR_IA32_SYSENTER_ESP, stack_bottom);
         wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry);
         wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0);
     }
 
     /* Trampoline for SYSCALL entry from compatibility mode. */
-    stack = (char *)L1_CACHE_ALIGN((unsigned long)stack);
-    wrmsrl(MSR_CSTAR, (unsigned long)stack);
-    stack += write_stack_trampoline(stack, stack_bottom, FLAT_USER_CS32);
+    wrmsrl(MSR_CSTAR, stub_va);
+    offset += write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+                                    stub_va, stack_bottom,
+                                    (unsigned long)cstar_enter);
+
+    /* Don't consume more than half of the stub space here. */
+    ASSERT(offset <= STUB_BUF_SIZE / 2);
+
+    unmap_domain_page(stub_page);
 
     /* Common SYSCALL parameters. */
     wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);
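
For readers decoding the emitted bytes by hand, the generated stub
disassembles as follows (annotated sketch; SYSCALL does not save %rsp
anywhere, hence the detour through %rax):

    /* Disassembly of the trampoline emitted by write_stub_trampoline():
     *   48 a3 <imm64>  movabs %rax, stack_bottom-8   ; free up %rax
     *   48 89 e0       mov    %rsp, %rax             ; grab user %rsp
     *   48 bc <imm64>  movabs $stack_bottom-8, %rsp  ; switch to Xen stack
     *   50             push   %rax                   ; save user %rsp
     *   e9 <rel32>     jmp    lstar_enter/cstar_enter
     */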
diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
index 4699a04..5fc6c9f 100644
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -217,4 +217,7 @@ SECTIONS
   .comment 0 : { *(.comment) }
 }
 
+ASSERT(__image_base__ > XEN_VIRT_START ||
+       _end <= XEN_VIRT_END - NR_CPUS * PAGE_SIZE,
+       "Xen image overlaps stubs area")
 ASSERT(kexec_reloc_size - kexec_reloc <= PAGE_SIZE, "kexec_reloc is too large")
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index 7700c77..3e9be83 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -94,6 +94,10 @@
 /* Primary stack is restricted to 8kB by guard pages. */
 #define PRIMARY_STACK_SIZE 8192
 
+/* Total size of syscall and emulation stubs. */
+#define STUB_BUF_SHIFT (L1_CACHE_SHIFT > 7 ? L1_CACHE_SHIFT : 7)
+#define STUB_BUF_SIZE  (1 << STUB_BUF_SHIFT)
+
 /* Return value for zero-size _xmalloc(), distinguished from NULL. */
 #define ZERO_BLOCK_PTR ((void *)0xBAD0BAD0BAD0BAD0UL)
 
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index 27c2ae7..ef1e86f 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -323,10 +323,10 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
 #define _PAGE_GNTTAB   0
 #endif
 
-#define __PAGE_HYPERVISOR \
-    (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-#define __PAGE_HYPERVISOR_NOCACHE \
-    (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED)
+#define __PAGE_HYPERVISOR_RX      (_PAGE_PRESENT | _PAGE_ACCESSED)
+#define __PAGE_HYPERVISOR         (__PAGE_HYPERVISOR_RX | \
+                                   _PAGE_DIRTY | _PAGE_RW)
+#define __PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR | _PAGE_PCD)
 
 #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
 
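Expanded, the refactored attribute sets are bit-for-bit what they were
before, with the new RX variant simply omitting the write-related bits:

    /* Resulting attribute sets (unchanged bits for the pre-existing names):
     * __PAGE_HYPERVISOR_RX      = _PAGE_PRESENT | _PAGE_ACCESSED
     * __PAGE_HYPERVISOR         = __PAGE_HYPERVISOR_RX | _PAGE_DIRTY | _PAGE_RW
     * __PAGE_HYPERVISOR_NOCACHE = __PAGE_HYPERVISOR | _PAGE_PCD
     */
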
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index a9b4e06..fb2c2fc 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -532,12 +532,24 @@ void trap_nop(void);
 void enable_nmis(void);
 void do_reserved_trap(struct cpu_user_regs *regs);
 
-void syscall_enter(void);
 void sysenter_entry(void);
 void sysenter_eflags_saved(void);
 void compat_hypercall(void);
 void int80_direct_trap(void);
 
+#define STUBS_PER_PAGE (PAGE_SIZE / STUB_BUF_SIZE)
+
+struct stubs {
+    union {
+        void(*func)(void);
+        unsigned long addr;
+    };
+    unsigned long mfn;
+};
+
+DECLARE_PER_CPU(struct stubs, stubs);
+unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn);
+
 extern int hypercall(void);
 
 int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
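
The anonymous union in struct stubs provides two views of the same slot:
addr for the mapping arithmetic in smpboot.c and traps.c above, and func so
a stub can eventually be entered as code, as the instruction-emulation work
mentioned in the description will want. A usage sketch (the func call is
hypothetical here, shown only to illustrate the union):

    /* Sketch: both views of the per-CPU stub. */
    wrmsrl(MSR_LSTAR, this_cpu(stubs.addr)); /* as a plain address (as above) */
    this_cpu(stubs).func();                  /* as a callable entry point */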
diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h
index 1d54587..fd2f700 100644
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -148,6 +148,7 @@ typedef l4_pgentry_t root_pgentry_t;
 #define _PAGE_GUEST_KERNEL (1U<<12)
 
 #define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RX      (__PAGE_HYPERVISOR_RX      | _PAGE_GLOBAL)
 #define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
 
 #endif /* __X86_64_PAGE_H__ */
--
generated by git-patchbot for /home/xen/git/xen.git#master
