x86: move syscall trampolines off the stack

This is needed as stacks are going to become non-executable. Use
separate stub pages (shared among suitable CPUs on the same node)
instead.

Signed-off-by: Jan Beulich

--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1270,6 +1270,10 @@ void __init noreturn __start_xen(unsigne
 
     init_idle_domain();
 
+    this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
+                                           &this_cpu(stubs).mfn);
+    BUG_ON(!this_cpu(stubs.addr));
+
     trap_init();
 
     rcu_init();

--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -603,6 +604,42 @@ static int do_boot_cpu(int apicid, int c
     return rc;
 }
 
+unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
+{
+    unsigned long stub_va;
+    struct page_info *pg;
+
+    if ( *mfn )
+        pg = mfn_to_page(*mfn);
+    else
+    {
+        nodeid_t node = cpu_to_node(cpu);
+        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
+
+        pg = alloc_domheap_page(NULL, memflags);
+        if ( !pg )
+            return 0;
+
+        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
+    }
+
+    stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
+    if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
+                          PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
+    {
+        if ( !*mfn )
+            free_domheap_page(pg);
+        stub_va = 0;
+    }
+    else
+    {
+        stub_va += (cpu & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE;
+        *mfn = page_to_mfn(pg);
+    }
+
+    return stub_va;
+}
+
 void cpu_exit_clear(unsigned int cpu)
 {
     cpu_uninit(cpu);
@@ -616,6 +653,24 @@ static void cpu_smpboot_free(unsigned in
     free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
     free_cpumask_var(per_cpu(cpu_core_mask, cpu));
 
+    if ( per_cpu(stubs.addr, cpu) )
+    {
+        unsigned long mfn = per_cpu(stubs.mfn, cpu);
+        unsigned char *stub_page = map_domain_page(mfn);
+        unsigned int i;
+
+        memset(stub_page + (cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE,
+               0xcc, STUB_BUF_SIZE);
+        for ( i = 0; i < STUBS_PER_PAGE; ++i )
+            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc)
+                break;
+        unmap_domain_page(stub_page);
+        destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
+                             (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
+        if ( i == STUBS_PER_PAGE )
+            free_domheap_page(mfn_to_page(mfn));
+    }
+
     order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
     free_xenheap_pages(per_cpu(gdt_table, cpu), order);
 
@@ -635,9 +690,10 @@ static void cpu_smpboot_free(unsigned in
 
 static int cpu_smpboot_alloc(unsigned int cpu)
 {
-    unsigned int order, memflags = 0;
+    unsigned int i, order, memflags = 0;
     nodeid_t node = cpu_to_node(cpu);
     struct desc_struct *gdt;
+    unsigned long stub_page;
 
     if ( node != NUMA_NO_NODE )
         memflags = MEMF_node(node);
@@ -667,6 +723,20 @@ static int cpu_smpboot_alloc(unsigned in
         goto oom;
     memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
 
+    for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
+          i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
+        if ( cpu_online(i) && cpu_to_node(i) == node )
+        {
+            per_cpu(stubs.mfn, cpu) = per_cpu(stubs.mfn, i);
+            break;
+        }
+    BUG_ON(i == cpu);
+    stub_page = alloc_stub_page(cpu, &per_cpu(stubs.mfn, cpu));
+    if ( !stub_page )
+        goto oom;
+    per_cpu(stubs.addr, cpu) = stub_page +
+                               (cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE;
+
     if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
          zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) )
         return 0;

--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -219,7 +219,19 @@ ENTRY(compat_post_handle_exception)
         movb  $0,TRAPBOUNCE_flags(%rdx)
         jmp   compat_test_all_events
 
-ENTRY(compat_syscall)
+ENTRY(cstar_enter)
+        sti
+        movq  8(%rsp),%rax /* Restore %rax. */
+        movq  $FLAT_KERNEL_SS,8(%rsp)
+        pushq %r11
+        pushq $FLAT_USER_CS32
+        pushq %rcx
+        pushq $0
+        SAVE_VOLATILE TRAP_syscall
+        GET_CURRENT(%rbx)
+        movq  VCPU_domain(%rbx),%rcx
+        cmpb  $0,DOMAIN_is_32bit_pv(%rcx)
+        je    switch_to_kernel
         cmpb  $0,VCPU_syscall32_disables_events(%rbx)
         movzwl VCPU_syscall32_sel(%rbx),%esi
         movq  VCPU_syscall32_addr(%rbx),%rax

--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -13,9 +13,8 @@
 #include
 #include
 #include
 
-        ALIGN
 /* %rbx: struct vcpu */
-switch_to_kernel:
+ENTRY(switch_to_kernel)
         leaq  VCPU_trap_bounce(%rbx),%rdx
         /* TB_eip = (32-bit syscall && syscall32_addr) ?
          *          syscall32_addr : syscall_addr */
@@ -114,22 +113,21 @@ restore_all_xen:
  * Vector directly to the registered arch.syscall_addr.
  *
  * Initial work is done by per-CPU stack trampolines. At this point %rsp
- * has been initialised to point at the correct Xen stack, and %rsp, %rflags
- * and %cs have been saved. All other registers are still to be saved onto
- * the stack, starting with %rip, and an appropriate %ss must be saved into
- * the space left by the trampoline.
+ * has been initialised to point at the correct Xen stack, %rsp has been
+ * saved, and %rax needs to be restored from the %ss save slot. All other
+ * registers are still to be saved onto the stack, starting with RFLAGS, and
+ * an appropriate %ss must be saved into the space left by the trampoline.
  */
-ENTRY(syscall_enter)
+ENTRY(lstar_enter)
         sti
-        movl  $FLAT_KERNEL_SS,24(%rsp)
+        movq  8(%rsp),%rax /* Restore %rax. */
+        movq  $FLAT_KERNEL_SS,8(%rsp)
+        pushq %r11
+        pushq $FLAT_KERNEL_CS64
         pushq %rcx
         pushq $0
-        movq  24(%rsp),%r11 /* Re-load user RFLAGS into %r11 before saving */
         SAVE_VOLATILE TRAP_syscall
         GET_CURRENT(%rbx)
-        movq  VCPU_domain(%rbx),%rcx
-        testb $1,DOMAIN_is_32bit_pv(%rcx)
-        jnz   compat_syscall
         testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
         jz    switch_to_kernel
 

--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -337,70 +337,78 @@ unsigned long do_iret(void)
     return 0;
 }
 
-static int write_stack_trampoline(
-    char *stack, char *stack_bottom, uint16_t cs_seg)
+static unsigned int write_stub_trampoline(
+    unsigned char *stub, unsigned long stub_va,
+    unsigned long stack_bottom, unsigned long target_va)
 {
-    /* movq %rsp, saversp(%rip) */
-    stack[0] = 0x48;
-    stack[1] = 0x89;
-    stack[2] = 0x25;
-    *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;
-
-    /* leaq saversp(%rip), %rsp */
-    stack[7] = 0x48;
-    stack[8] = 0x8d;
-    stack[9] = 0x25;
-    *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;
-
-    /* pushq %r11 */
-    stack[14] = 0x41;
-    stack[15] = 0x53;
-
-    /* pushq $<cs_seg> */
-    stack[16] = 0x68;
-    *(u32 *)&stack[17] = cs_seg;
-
-    /* movq $syscall_enter,%r11 */
-    stack[21] = 0x49;
-    stack[22] = 0xbb;
-    *(void **)&stack[23] = (void *)syscall_enter;
-
-    /* jmpq *%r11 */
-    stack[31] = 0x41;
-    stack[32] = 0xff;
-    stack[33] = 0xe3;
+    /* movabsq %rax, stack_bottom - 8 */
+    stub[0] = 0x48;
+    stub[1] = 0xa3;
+    *(uint64_t *)&stub[2] = stack_bottom - 8;
+
+    /* movq %rsp, %rax */
+    stub[10] = 0x48;
+    stub[11] = 0x89;
+    stub[12] = 0xe0;
+
+    /* movabsq $stack_bottom - 8, %rsp */
+    stub[13] = 0x48;
+    stub[14] = 0xbc;
+    *(uint64_t *)&stub[15] = stack_bottom - 8;
+
+    /* pushq %rax */
+    stub[23] = 0x50;
+
+    /* jmp target_va */
+    stub[24] = 0xe9;
+    *(int32_t *)&stub[25] = target_va - (stub_va + 29);
 
-    return 34;
+    /* Round up to a multiple of 16 bytes. */
+    return 32;
 }
 
+DEFINE_PER_CPU(struct stubs, stubs);
+void lstar_enter(void);
+void cstar_enter(void);
+
 void __devinit subarch_percpu_traps_init(void)
 {
-    char *stack_bottom, *stack;
-
-    stack_bottom = (char *)get_stack_bottom();
-    stack = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));
+    unsigned long stack_bottom = get_stack_bottom();
+    unsigned long stub_va = this_cpu(stubs.addr);
+    unsigned char *stub_page;
+    unsigned int offset;
 
     /* IST_MAX IST pages + 1 syscall page + 1 guard page + primary stack. */
     BUILD_BUG_ON((IST_MAX + 2) * PAGE_SIZE + PRIMARY_STACK_SIZE > STACK_SIZE);
 
-    /* Trampoline for SYSCALL entry from long mode. */
-    stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */
-    wrmsrl(MSR_LSTAR, (unsigned long)stack);
-    stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64);
+    stub_page = map_domain_page(this_cpu(stubs.mfn));
+
+    /* Trampoline for SYSCALL entry from 64-bit mode. */
+    wrmsrl(MSR_LSTAR, stub_va);
+    offset = write_stub_trampoline(stub_page + (stub_va & (PAGE_SIZE - 1)),
+                                   stub_va, stack_bottom,
+                                   (unsigned long)lstar_enter);
+    stub_va += offset;
 
     if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
          boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR )
     {
         /* SYSENTER entry. */
-        wrmsrl(MSR_IA32_SYSENTER_ESP, (unsigned long)stack_bottom);
+        wrmsrl(MSR_IA32_SYSENTER_ESP, stack_bottom);
         wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry);
         wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0);
     }
 
     /* Trampoline for SYSCALL entry from compatibility mode. */
-    stack = (char *)L1_CACHE_ALIGN((unsigned long)stack);
-    wrmsrl(MSR_CSTAR, (unsigned long)stack);
-    stack += write_stack_trampoline(stack, stack_bottom, FLAT_USER_CS32);
+    wrmsrl(MSR_CSTAR, stub_va);
+    offset += write_stub_trampoline(stub_page + (stub_va & (PAGE_SIZE - 1)),
+                                    stub_va, stack_bottom,
+                                    (unsigned long)cstar_enter);
+
+    /* Don't consume more than half of the stub space here. */
+    ASSERT(offset <= STUB_BUF_SIZE / 2);
+
+    unmap_domain_page(stub_page);
 
     /* Common SYSCALL parameters. */
     wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);

--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -217,4 +217,7 @@ SECTIONS
   .comment 0 : { *(.comment) }
 }
 
+ASSERT(__image_base__ > XEN_VIRT_START ||
+       _end <= XEN_VIRT_END - NR_CPUS * PAGE_SIZE,
+       "Xen image overlaps stubs area")
 ASSERT(kexec_reloc_size - kexec_reloc <= PAGE_SIZE, "kexec_reloc is too large")

--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -94,6 +94,10 @@
 /* Primary stack is restricted to 8kB by guard pages. */
 #define PRIMARY_STACK_SIZE 8192
 
+/* Total size of syscall and emulation stubs. */
+#define STUB_BUF_SHIFT max(L1_CACHE_SHIFT, 7)
+#define STUB_BUF_SIZE (1 << STUB_BUF_SHIFT)
+
 /* Return value for zero-size _xmalloc(), distinguished from NULL. */
 #define ZERO_BLOCK_PTR ((void *)0xBAD0BAD0BAD0BAD0UL)
 

--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -320,10 +320,10 @@ void paging_init(void);
 #define _PAGE_GNTTAB   0
 #endif
 
-#define __PAGE_HYPERVISOR \
-    (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-#define __PAGE_HYPERVISOR_NOCACHE \
-    (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED)
+#define __PAGE_HYPERVISOR_RX      (_PAGE_PRESENT | _PAGE_ACCESSED)
+#define __PAGE_HYPERVISOR         (__PAGE_HYPERVISOR_RX | \
+                                   _PAGE_DIRTY | _PAGE_RW)
+#define __PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR | _PAGE_PCD)
 
 #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
 

--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -532,12 +532,24 @@ void trap_nop(void);
 void enable_nmis(void);
 void do_reserved_trap(struct cpu_user_regs *regs);
 
-void syscall_enter(void);
 void sysenter_entry(void);
 void sysenter_eflags_saved(void);
 void compat_hypercall(void);
 void int80_direct_trap(void);
 
+#define STUBS_PER_PAGE (PAGE_SIZE / STUB_BUF_SIZE)
+
+struct stubs {
+    union {
+        void(*func)(void);
+        unsigned long addr;
+    };
+    unsigned long mfn;
+};
+
+DECLARE_PER_CPU(struct stubs, stubs);
+unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn);
+
 extern int hypercall(void);
 int cpuid_hypervisor_leaves(
     uint32_t idx, uint32_t sub_idx,

--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -148,6 +148,7 @@ typedef l4_pgentry_t root_pgentry_t;
 #define _PAGE_GUEST_KERNEL (1U<<12)
 
 #define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RX      (__PAGE_HYPERVISOR_RX | _PAGE_GLOBAL)
 #define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
 
 #endif /* __X86_64_PAGE_H__ */
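
For illustration only (not part of the patch): the bytes emitted by
write_stub_trampoline() above should decode to roughly the sequence below,
where "stack_bottom" and "target" stand for the function's arguments and
target is lstar_enter or cstar_enter:

    movabsq %rax, stack_bottom-8    /* 48 a3 <imm64>: park %rax in what will
                                       become the %ss save slot */
    movq    %rsp, %rax              /* 48 89 e0: remember the guest %rsp */
    movabsq $stack_bottom-8, %rsp   /* 48 bc <imm64>: switch onto the per-CPU
                                       Xen stack */
    pushq   %rax                    /* 50: save the guest %rsp */
    jmp     target                  /* e9 <rel32>, rel32 = target_va -
                                       (stub_va + 29) */

This is consistent with the updated comment in entry.S: on reaching
lstar_enter/cstar_enter, %rsp already points at the Xen stack, the guest
%rsp has been saved, and %rax still needs to be restored from the %ss save
slot, which the entry points then overwrite with FLAT_KERNEL_SS.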