[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH RFC 41/44] x86/smp: Switch to using the percpu stacks
This is very easy for the APs. __high_start() is modified to switch stacks before entering C. The BSP however is more complicated, and needs to stay on cpu0_stack[] until setup is complete. The end of __start_xen() is modified to copy the top-of-stack data to the percpu stack immediately before jumping there. The VMCS Host and SYSENTER stacks are suitably adjusted, and become construction-time constant. The stack_start and stack_base[] array are removed completely, as well as the memguard_guard_stack() infrastructure. The STACK_ORDER xenheap allocations are no longer needed, and higher CPUs on large machines are finally numa-local. Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> --- xen/arch/x86/boot/x86_64.S | 15 ++++++++------- xen/arch/x86/efi/efi-boot.h | 8 ++++---- xen/arch/x86/hvm/vmx/vmcs.c | 21 ++++++++++----------- xen/arch/x86/mm.c | 15 --------------- xen/arch/x86/setup.c | 29 +++++++++++++++++++---------- xen/arch/x86/smpboot.c | 18 ------------------ xen/arch/x86/tboot.c | 29 +---------------------------- xen/arch/x86/traps.c | 10 ++-------- xen/include/asm-arm/mm.h | 1 - xen/include/asm-x86/mm.h | 3 --- xen/include/xen/smp.h | 2 -- 11 files changed, 44 insertions(+), 107 deletions(-) diff --git a/xen/arch/x86/boot/x86_64.S b/xen/arch/x86/boot/x86_64.S index b1f0457..ed4c805 100644 --- a/xen/arch/x86/boot/x86_64.S +++ b/xen/arch/x86/boot/x86_64.S @@ -15,21 +15,25 @@ ENTRY(__high_start) mov $XEN_MINIMAL_CR4,%rcx mov %rcx,%cr4 - /* Set up %cr3 (differs between BSP and APs). */ + /* Set up %cr3 and %rsp (differs between BSP and APs). */ test %ebx, %ebx jz .Lbsp_setup /* APs switch onto percpu_idle_pt[], as provided by do_boot_cpu(). */ mov ap_cr3(%rip), %rax mov %rax, %cr3 + + /* APs move straight onto the PERCPU stack. */ + movabs $STACK_SIZE - CPUINFO_sizeof + PERCPU_STACK_MAPPING, %rsp + jmp .Ldone .Lbsp_setup: /* The BSP stays on the idle_pg_table[] during early boot. 
*/ -.Ldone: - mov stack_start(%rip),%rsp - or $(STACK_SIZE-CPUINFO_sizeof),%rsp + /* The BSP starts on cpu0_stack. */ + lea STACK_SIZE - CPUINFO_sizeof + cpu0_stack(%rip), %rsp +.Ldone: /* Reset EFLAGS (subsumes CLI and CLD). */ pushq $0 @@ -61,9 +65,6 @@ GLOBAL(gdt_descr) .word LAST_RESERVED_GDT_BYTE .quad boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE -GLOBAL(stack_start) - .quad cpu0_stack - .section .data.page_aligned, "aw", @progbits .align PAGE_SIZE, 0 GLOBAL(boot_cpu_gdt_table) diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h index d30f688..8af661b 100644 --- a/xen/arch/x86/efi/efi-boot.h +++ b/xen/arch/x86/efi/efi-boot.h @@ -251,15 +251,15 @@ static void __init noreturn efi_arch_post_exit_boot(void) #endif "movabs $__start_xen, %[rip]\n\t" "lgdt gdt_descr(%%rip)\n\t" - "mov stack_start(%%rip), %%rsp\n\t" + "lea %c[stkoff] + cpu0_stack(%%rip), %%rsp\n\t" "mov %[ds], %%ss\n\t" "mov %[ds], %%ds\n\t" "mov %[ds], %%es\n\t" "mov %[ds], %%fs\n\t" "mov %[ds], %%gs\n\t" - "movl %[cs], 8(%%rsp)\n\t" - "mov %[rip], (%%rsp)\n\t" - "lretq %[stkoff]-16" + "push %[cs]\n\t" + "push %[rip]\n\t" + "lretq" : [rip] "=&r" (efer/* any dead 64-bit variable */), [cr4] "+&r" (cr4) : [cr3] "r" (idle_pg_table), diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index 795210f..483f72d 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -804,15 +804,6 @@ static void vmx_set_host_env(struct vcpu *v) __vmwrite(HOST_TR_BASE, (unsigned long)&per_cpu(init_tss, cpu)); - __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom()); - - /* - * Skip end of cpu_user_regs when entering the hypervisor because the - * CPU does not save context onto the stack. SS,RSP,CS,RIP,RFLAGS,etc - * all get saved into the VMCS instead. 
- */ - __vmwrite(HOST_RSP, - (unsigned long)&get_cpu_info()->guest_cpu_user_regs.error_code); } void vmx_clear_msr_intercept(struct vcpu *v, unsigned int msr, @@ -1148,13 +1139,21 @@ static int construct_vmcs(struct vcpu *v) __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); __vmwrite(HOST_CR4, mmu_cr4_features); - /* Host CS:RIP. */ + /* Host code/stack. */ __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS); __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler); + __vmwrite(HOST_RSP, /* VMExit doesn't push an exception frame. */ + (PERCPU_STACK_MAPPING + STACK_SIZE - + sizeof(struct cpu_info) + + offsetof(struct cpu_info, guest_cpu_user_regs.error_code))); - /* Host SYSENTER CS:RIP. */ + /* Host SYSENTER code/stack. */ __vmwrite(HOST_SYSENTER_CS, __HYPERVISOR_CS); __vmwrite(HOST_SYSENTER_EIP, (unsigned long)sysenter_entry); + __vmwrite(HOST_SYSENTER_ESP, + (PERCPU_STACK_MAPPING + STACK_SIZE - + sizeof(struct cpu_info) + + offsetof(struct cpu_info, guest_cpu_user_regs.es))); /* MSR intercepts. 
*/ __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0); diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 933bd67..cb54921 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -5281,21 +5281,6 @@ void memguard_unguard_range(void *p, unsigned long l) #endif -void memguard_guard_stack(void *p) -{ - BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE); - p = (void *)((unsigned long)p + STACK_SIZE - - PRIMARY_STACK_SIZE - PAGE_SIZE); - memguard_guard_range(p, PAGE_SIZE); -} - -void memguard_unguard_stack(void *p) -{ - p = (void *)((unsigned long)p + STACK_SIZE - - PRIMARY_STACK_SIZE - PAGE_SIZE); - memguard_unguard_range(p, PAGE_SIZE); -} - void arch_dump_shared_mem_info(void) { printk("Shared frames %u -- Saved frames %u\n", diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index d624b95..c0f7289 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -651,8 +651,6 @@ static void noinline init_done(void) /* Reinitalise all state referring to the old virtual address of the stack. */ static void __init noreturn reinit_bsp_stack(void) { - unsigned long *stack = (void*)(get_stack_bottom() & ~(STACK_SIZE - 1)); - /* Sanity check that IST settings weren't set up before this point. */ ASSERT(MASK_EXTR(idt_tables[0][TRAP_nmi].a, 7UL << 32) == 0); @@ -664,9 +662,6 @@ static void __init noreturn reinit_bsp_stack(void) /* Update SYSCALL trampolines */ percpu_traps_init(); - stack_base[0] = stack; - memguard_guard_stack(stack); - reset_stack_and_jump(init_done); } @@ -1744,11 +1739,25 @@ void __init noreturn __start_xen(unsigned long mbi_p) setup_io_bitmap(dom0); - /* Jump to the 1:1 virtual mappings of cpu0_stack. */ - asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" :: - [stk] "g" (__va(__pa(get_stack_bottom()))), - [fn] "i" (reinit_bsp_stack) : "memory"); - unreachable(); + /* + * Switch from cpu0_stack to the percpu stack, copying the non-GPR + * cpu_info data into place beforehand. 
+ */ + { + const struct cpu_info *src = get_cpu_info(); + struct cpu_info *dst = _p(PERCPU_STACK_MAPPING + STACK_SIZE - + sizeof(*dst)); + + dst->processor_id = src->processor_id; + dst->current_vcpu = src->current_vcpu; + dst->per_cpu_offset = src->per_cpu_offset; + dst->cr4 = src->cr4; + + asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" :: + [stk] "g" (&dst->guest_cpu_user_regs.es), + [fn] "i" (reinit_bsp_stack) : "memory"); + unreachable(); + } } void arch_get_xen_caps(xen_capabilities_info_t *info) diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c index f785d5f..77ee883 100644 --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -91,8 +91,6 @@ static enum cpu_state { } cpu_state; #define set_cpu_state(state) do { smp_mb(); cpu_state = (state); } while (0) -void *stack_base[NR_CPUS]; - void initialize_cpu_data(unsigned int cpu) { cpu_data[cpu] = boot_cpu_data; @@ -386,7 +384,6 @@ void start_secondary(void *unused) /* Used to pass percpu_idle_pt to the booting AP. */ paddr_t ap_cr3; -extern void *stack_start; static int wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) { @@ -529,7 +526,6 @@ static int do_boot_cpu(int apicid, int cpu) cpu, apicid, start_eip); ap_cr3 = per_cpu(percpu_idle_pt, cpu); - stack_start = stack_base[cpu]; /* This grunge runs the startup process for the targeted processor. 
*/ @@ -1002,13 +998,6 @@ static void cpu_smpboot_free(unsigned int cpu) free_xenheap_page(idt_tables[cpu]); idt_tables[cpu] = NULL; - if ( stack_base[cpu] != NULL ) - { - memguard_unguard_stack(stack_base[cpu]); - free_xenheap_pages(stack_base[cpu], STACK_ORDER); - stack_base[cpu] = NULL; - } - if ( per_cpu(percpu_idle_pt, cpu) ) { free_domheap_page(maddr_to_page(per_cpu(percpu_idle_pt, cpu))); @@ -1030,11 +1019,6 @@ static int cpu_smpboot_alloc(unsigned int cpu) if ( node != NUMA_NO_NODE ) memflags = MEMF_node(node); - stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags); - if ( stack_base[cpu] == NULL ) - goto out; - memguard_guard_stack(stack_base[cpu]); - order = get_order_from_pages(NR_RESERVED_GDT_PAGES); per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags); if ( gdt == NULL ) @@ -1148,8 +1132,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) boot_cpu_physical_apicid = get_apic_id(); x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; - stack_base[0] = stack_start; - set_nr_sockets(); socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets); diff --git a/xen/arch/x86/tboot.c b/xen/arch/x86/tboot.c index 59d7c47..c283b91 100644 --- a/xen/arch/x86/tboot.c +++ b/xen/arch/x86/tboot.c @@ -243,29 +243,6 @@ static void tboot_gen_domain_integrity(const uint8_t key[TB_KEY_SIZE], memset(&ctx, 0, sizeof(ctx)); } -/* - * For stack overflow detection in debug build, a guard page is set up. - * This fn is used to detect whether a page is in the guarded pages for - * the above reason. 
- */ -static int mfn_in_guarded_stack(unsigned long mfn) -{ - void *p; - int i; - - for ( i = 0; i < nr_cpu_ids; i++ ) - { - if ( !stack_base[i] ) - continue; - p = (void *)((unsigned long)stack_base[i] + STACK_SIZE - - PRIMARY_STACK_SIZE - PAGE_SIZE); - if ( mfn == virt_to_mfn(p) ) - return -1; - } - - return 0; -} - static void tboot_gen_xenheap_integrity(const uint8_t key[TB_KEY_SIZE], vmac_t *mac) { @@ -290,12 +267,8 @@ static void tboot_gen_xenheap_integrity(const uint8_t key[TB_KEY_SIZE], if ( is_page_in_use(page) && is_xen_heap_page(page) ) { - void *pg; - - if ( mfn_in_guarded_stack(mfn) ) - continue; /* skip guard stack, see memguard_guard_stack() in mm.c */ + void *pg = mfn_to_virt(mfn); - pg = mfn_to_virt(mfn); vmac_update((uint8_t *)pg, PAGE_SIZE, &ctx); } } diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index eeabb4a..493f8f3 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -356,9 +356,6 @@ unsigned long get_stack_trace_bottom(unsigned long sp) return ROUNDUP(sp, PAGE_SIZE) - offsetof(struct cpu_user_regs, es) - sizeof(unsigned long); -#ifndef MEMORY_GUARD - case 3 ... 5: -#endif case 6 ... 7: return ROUNDUP(sp, STACK_SIZE) - sizeof(struct cpu_info) - sizeof(unsigned long); @@ -375,9 +372,6 @@ unsigned long get_stack_dump_bottom(unsigned long sp) case 0 ... 2: return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long); -#ifndef MEMORY_GUARD - case 3 ... 5: -#endif case 6 ... 
7: return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long); @@ -518,9 +512,9 @@ void show_stack_overflow(unsigned int cpu, const struct cpu_user_regs *regs) unsigned long esp_top, esp_bottom; #endif - if ( _p(curr_stack_base) != stack_base[cpu] ) + if ( curr_stack_base != PERCPU_STACK_MAPPING ) printk("Current stack base %p differs from expected %p\n", - _p(curr_stack_base), stack_base[cpu]); + _p(curr_stack_base), _p(PERCPU_STACK_MAPPING)); #ifdef MEMORY_GUARD esp_bottom = (esp | (STACK_SIZE - 1)) + 1; diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h index 4d5563b..86b8fcb 100644 --- a/xen/include/asm-arm/mm.h +++ b/xen/include/asm-arm/mm.h @@ -362,7 +362,6 @@ unsigned long domain_get_maximum_gpfn(struct domain *d); extern struct domain *dom_xen, *dom_io, *dom_cow; -#define memguard_guard_stack(_p) ((void)0) #define memguard_guard_range(_p,_l) ((void)0) #define memguard_unguard_range(_p,_l) ((void)0) diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 22c2809..2c1ed1d 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -521,9 +521,6 @@ void memguard_unguard_range(void *p, unsigned long l); #define memguard_unguard_range(_p,_l) ((void)0) #endif -void memguard_guard_stack(void *p); -void memguard_unguard_stack(void *p); - struct mmio_ro_emulate_ctxt { unsigned long cr2; unsigned int seg, bdf; diff --git a/xen/include/xen/smp.h b/xen/include/xen/smp.h index c55f57f..d30f369 100644 --- a/xen/include/xen/smp.h +++ b/xen/include/xen/smp.h @@ -69,8 +69,6 @@ void smp_send_call_function_mask(const cpumask_t *mask); int alloc_cpu_id(void); -extern void *stack_base[NR_CPUS]; - void initialize_cpu_data(unsigned int cpu); #endif /* __XEN_SMP_H__ */ -- 2.1.4 _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |