
Re: [Xen-devel] [PATCH 1/4] x86: move syscall trampolines off the stack



On 18/05/15 13:46, Jan Beulich wrote:
> --- a/xen/arch/x86/smpboot.c
> +++ b/xen/arch/x86/smpboot.c
> @@ -25,6 +25,7 @@
>  #include <xen/kernel.h>
>  #include <xen/mm.h>
>  #include <xen/domain.h>
> +#include <xen/domain_page.h>
>  #include <xen/sched.h>
>  #include <xen/sched-if.h>
>  #include <xen/irq.h>
> @@ -603,6 +604,42 @@ static int do_boot_cpu(int apicid, int c
>      return rc;
>  }
>  
> +unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
> +{
> +    unsigned long stub_va;
> +    struct page_info *pg;
> +
> +    if ( *mfn )
> +        pg = mfn_to_page(*mfn);
> +    else
> +    {
> +        nodeid_t node = cpu_to_node(cpu);
> +        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
> +
> +        pg = alloc_domheap_page(NULL, memflags);
> +        if ( !pg )
> +            return 0;
> +
> +        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));

Seems like a memset_page(pg, int val) helper might be a nice piece of cleanup.
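
Something like (just a sketch, and the name is only a suggestion):

static inline void memset_page(struct page_info *pg, int val)
{
    /* Map, fill and unmap in one go, as the open-coded line above does. */
    unmap_domain_page(memset(__map_domain_page(pg), val, PAGE_SIZE));
}

which would reduce the line above to memset_page(pg, 0xcc).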

> +    }
> +
> +    stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
> +    if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
> +                          PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
> +    {
> +        if ( !*mfn )
> +            free_domheap_page(pg);
> +        stub_va = 0;
> +    }
> +    else
> +    {
> +        stub_va += (cpu & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE;

"(cpu & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE" comes up very frequently
through this patch.

I think the logic would be easier to read given:

#define STUB_OFFSET(cpu) (((cpu) & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
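
which would reduce the quoted line above to simply

    stub_va += STUB_OFFSET(cpu);

and similarly for the other occurrences.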

> +        *mfn = page_to_mfn(pg);
> +    }
> +
> +    return stub_va;
> +}
> +
>  void cpu_exit_clear(unsigned int cpu)
>  {
>      cpu_uninit(cpu);
> @@ -616,6 +653,24 @@ static void cpu_smpboot_free(unsigned in
>      free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
>      free_cpumask_var(per_cpu(cpu_core_mask, cpu));
>  
> +    if ( per_cpu(stubs.addr, cpu) )
> +    {
> +        unsigned long mfn = per_cpu(stubs.mfn, cpu);
> +        unsigned char *stub_page = map_domain_page(mfn);
> +        unsigned int i;
> +
> +        memset(stub_page + (cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE,
> +               0xcc, STUB_BUF_SIZE);
> +        for ( i = 0; i < STUBS_PER_PAGE; ++i )
> +            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc)
> +                break;

There is a race condition between allocation and free, as the stub itself
is only written by the cpu coming up (in write_stub_trampoline()).  Just
finding 0xcc here doesn't mean that a different cpu isn't in the process
of coming up and intending to use this stub page.

I suspect the race can be fixed by having the alloc side clobber the
first instruction of the slot, perhaps with ud2?
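
i.e. something like this on the success path in alloc_stub_page() (only a
sketch, mirroring the offset calculation used in the free path):

    unsigned char *p = __map_domain_page(pg);

    /* ud2 (0f 0b) at the start of this cpu's slot, so the 0xcc scan in
     * cpu_smpboot_free() sees the slot as in use even before the cpu has
     * written its trampoline. */
    p[(cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE] = 0x0f;
    p[(cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE + 1] = 0x0b;

    unmap_domain_page(p);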

(Also, a style nit: missing space before the closing parenthesis at the
end of the if.)

> +        unmap_domain_page(stub_page);
> +        destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
> +                             (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
> +        if ( i == STUBS_PER_PAGE )
> +            free_domheap_page(mfn_to_page(mfn));
> +    }
> +
>      order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
>      free_xenheap_pages(per_cpu(gdt_table, cpu), order);
>  
> --- a/xen/arch/x86/x86_64/compat/entry.S
> +++ b/xen/arch/x86/x86_64/compat/entry.S
> @@ -219,7 +219,19 @@ ENTRY(compat_post_handle_exception)
>          movb  $0,TRAPBOUNCE_flags(%rdx)
>          jmp   compat_test_all_events
>  

I would be tempted to leave a comment here referring to 'lstar_enter'
for stack/register details.
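
Even just

    /* See lstar_enter in entry.S for the stack/register state at this point. */

would do.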

> -ENTRY(compat_syscall)
> +ENTRY(cstar_enter)
> +        sti
> +        movq  8(%rsp),%rax /* Restore %rax. */
> +        movq  $FLAT_KERNEL_SS,8(%rsp)
> +        pushq %r11
> +        pushq $FLAT_USER_CS32
> +        pushq %rcx
> +        pushq $0
> +        SAVE_VOLATILE TRAP_syscall
> +        GET_CURRENT(%rbx)
> +        movq  VCPU_domain(%rbx),%rcx
> +        cmpb  $0,DOMAIN_is_32bit_pv(%rcx)
> +        je    switch_to_kernel
>          cmpb  $0,VCPU_syscall32_disables_events(%rbx)
>          movzwl VCPU_syscall32_sel(%rbx),%esi
>          movq  VCPU_syscall32_addr(%rbx),%rax
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -13,9 +13,8 @@
>  #include <public/xen.h>
>  #include <irq_vectors.h>
>  
> -        ALIGN
>  /* %rbx: struct vcpu */
> -switch_to_kernel:
> +ENTRY(switch_to_kernel)
>          leaq  VCPU_trap_bounce(%rbx),%rdx
>          /* TB_eip = (32-bit syscall && syscall32_addr) ?
>           *          syscall32_addr : syscall_addr */
> @@ -114,22 +113,21 @@ restore_all_xen:
>   *  Vector directly to the registered arch.syscall_addr.
>   *
>   * Initial work is done by per-CPU stack trampolines. At this point %rsp

The trampolines are no longer on the stack, so I think this line needs
editing as well.
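
Perhaps something along the lines of "Initial work is done by per-CPU
trampolines in the stub pages" ?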

> - * has been initialised to point at the correct Xen stack, and %rsp, %rflags
> - * and %cs have been saved. All other registers are still to be saved onto
> - * the stack, starting with %rip, and an appropriate %ss must be saved into
> - * the space left by the trampoline.
> + * has been initialised to point at the correct Xen stack, %rsp has been
> + * saved, and %rax needs to be restored from the %ss save slot. All other
> + * registers are still to be saved onto the stack, starting with RFLAGS, and
> + * an appropriate %ss must be saved into the space left by the trampoline.
>   */
> -ENTRY(syscall_enter)
> +ENTRY(lstar_enter)
>          sti
> -        movl  $FLAT_KERNEL_SS,24(%rsp)
> +        movq  8(%rsp),%rax /* Restore %rax. */
> +        movq  $FLAT_KERNEL_SS,8(%rsp)
> +        pushq %r11
> +        pushq $FLAT_KERNEL_CS64
>          pushq %rcx
>          pushq $0
> -        movq  24(%rsp),%r11 /* Re-load user RFLAGS into %r11 before saving */
>          SAVE_VOLATILE TRAP_syscall
>          GET_CURRENT(%rbx)
> -        movq  VCPU_domain(%rbx),%rcx
> -        testb $1,DOMAIN_is_32bit_pv(%rcx)
> -        jnz   compat_syscall
>          testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
>          jz    switch_to_kernel
>  
> --- a/xen/arch/x86/x86_64/traps.c
> +++ b/xen/arch/x86/x86_64/traps.c
>
> +DEFINE_PER_CPU(struct stubs, stubs);
> +void lstar_enter(void);
> +void cstar_enter(void);
> +
>  void __devinit subarch_percpu_traps_init(void)
>  {
> -    char *stack_bottom, *stack;
> -
> -    stack_bottom = (char *)get_stack_bottom();
> -    stack        = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));
> +    unsigned long stack_bottom = get_stack_bottom();
> +    unsigned long stub_va = this_cpu(stubs.addr);
> +    unsigned char *stub_page;
> +    unsigned int offset;
>  
>      /* IST_MAX IST pages + 1 syscall page + 1 guard page + primary stack. */
>      BUILD_BUG_ON((IST_MAX + 2) * PAGE_SIZE + PRIMARY_STACK_SIZE > STACK_SIZE);
>  
> -    /* Trampoline for SYSCALL entry from long mode. */
> -    stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */
> -    wrmsrl(MSR_LSTAR, (unsigned long)stack);
> -    stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64);
> +    stub_page = map_domain_page(this_cpu(stubs.mfn));
> +
> +    /* Trampoline for SYSCALL entry from 64-bit mode. */
> +    wrmsrl(MSR_LSTAR, stub_va);
> +    offset = write_stub_trampoline(stub_page + (stub_va & (PAGE_SIZE - 1)),

I would personally opt for "stub_va & ~PAGE_MASK"
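
i.e.

    offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
                                   stub_va, stack_bottom,
                                   (unsigned long)lstar_enter);

(and similarly for the CSTAR one below).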

> +                                   stub_va, stack_bottom,
> +                                   (unsigned long)lstar_enter);
> +    stub_va += offset;
>  
>      if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
>           boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR )
>      {
>          /* SYSENTER entry. */
> -        wrmsrl(MSR_IA32_SYSENTER_ESP, (unsigned long)stack_bottom);
> +        wrmsrl(MSR_IA32_SYSENTER_ESP, stack_bottom);
>          wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry);
>          wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0);
>      }
>  
>      /* Trampoline for SYSCALL entry from compatibility mode. */
> -    stack = (char *)L1_CACHE_ALIGN((unsigned long)stack);
> -    wrmsrl(MSR_CSTAR, (unsigned long)stack);
> -    stack += write_stack_trampoline(stack, stack_bottom, FLAT_USER_CS32);
> +    wrmsrl(MSR_CSTAR, stub_va);
> +    offset += write_stub_trampoline(stub_page + (stub_va & (PAGE_SIZE - 1)),
> +                                    stub_va, stack_bottom,
> +                                    (unsigned long)cstar_enter);
> +
> +    /* Don't consume more than half of the stub space here. */
> +    ASSERT(offset <= STUB_BUF_SIZE / 2);
> +
> +    unmap_domain_page(stub_page);
>  
>      /* Common SYSCALL parameters. */
>      wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);

As for the rest of the patch, I have checked the asm changes carefully
and can't see any issues.

~Andrew



 

