[Xen-changelog] [xen-unstable] Provide cpu hotplug support to Xen. Note this hotplug

# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Date 1184172490 -3600
# Node ID c3929e540632ec30c19d85a2884f1e49081b8410
# Parent e00547dcda097c10e4c1390f0e2873deee741c0c
Provide cpu hotplug support to Xen.

Note this hotplug support is specific to PM, rather than run-time
single-CPU hotplug, which can be a separate task. See the embedded
comment:

/*
 * XXX: One important thing missed here is to migrate vcpus
 * from dead cpu to other online ones and then put whole
 * system into a stop state. It assures a safe environment
 * for a cpu hotplug/remove at normal running state.
 *
 * However for xen PM case, at this point:
 *     -> All other domains should be notified with PM event,
 *        and then in following states:
 *          * Suspend state, or
 *          * Paused state, which is a force step to all
 *            domains if they do nothing to suspend
 *     -> All vcpus of dom0 (except vcpu0) have already been
 *        hot removed
 * with the net effect that all other cpus only have idle vcpu
 * running. In this special case, we can avoid vcpu migration
 * then and system can be considered in a stop state.
 *
 * So current cpu hotplug is a special version for PM specific
 * usage, and need more effort later for full cpu hotplug.
 * (ktian1)
 */

Signed-off-by: Kevin Tian <kevin.tian@xxxxxxxxx>
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
---
 xen/arch/x86/cpu/common.c    |    8 -
 xen/arch/x86/domain.c        |   29 ++-
 xen/arch/x86/i8259.c         |    1 
 xen/arch/x86/io_apic.c       |    3 
 xen/arch/x86/irq.c           |   62 +++++-------
 xen/arch/x86/smp.c           |   10 -
 xen/arch/x86/smpboot.c       |  218 +++++++++++++++++++++++++++++--------------
 xen/include/asm-x86/config.h |    3 
 xen/include/asm-x86/smp.h    |   13 ++
 xen/include/asm-x86/system.h |    2 
 10 files changed, 213 insertions(+), 136 deletions(-)

diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/cpu/common.c
--- a/xen/arch/x86/cpu/common.c	Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/cpu/common.c	Wed Jul 11 17:48:10 2007 +0100
@@ -600,9 +600,5 @@ void __cpuinit cpu_uninit(void)
 {
     int cpu = raw_smp_processor_id();
     cpu_clear(cpu, cpu_initialized);
-
-    /* lazy TLB state */
-    per_cpu(cpu_tlbstate, cpu).state = 0;
-    per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-#endif
+}
+#endif
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c	Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/domain.c	Wed Jul 11 17:48:10 2007 +0100
@@ -81,24 +81,23 @@ static void default_idle(void)
 /* We don't actually take CPU down, just spin without interrupts. */
 static inline void play_dead(void)
 {
-    /* This must be done before dead CPU ack */
-    cpu_exit_clear();
-    wbinvd();
-    mb();
-    /* Ack it */
-    __get_cpu_var(cpu_state) = CPU_DEAD;
-
-    /*
-     * With physical CPU hotplug, we should halt the cpu
-     */
-    local_irq_disable();
-    while (1)
-        halt();
+    __cpu_disable();
+    /* This must be done before dead CPU ack */
+    cpu_exit_clear();
+    wbinvd();
+    mb();
+    /* Ack it */
+    __get_cpu_var(cpu_state) = CPU_DEAD;
+
+    /* With physical CPU hotplug, we should halt the cpu. */
+    local_irq_disable();
+    for ( ; ; )
+        halt();
 }
 #else
 static inline void play_dead(void)
 {
-    BUG();
+    BUG();
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
@@ -106,6 +105,8 @@ void idle_loop(void)
 {
     for ( ; ; )
     {
+        if (cpu_is_offline(smp_processor_id()))
+            play_dead();
         page_scrub_schedule_work();
         default_idle();
         do_softirq();
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/i8259.c
--- a/xen/arch/x86/i8259.c	Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/i8259.c	Wed Jul 11 17:48:10 2007 +0100
@@ -396,6 +396,7 @@ void __init init_IRQ(void)
         irq_desc[i].action = NULL;
         irq_desc[i].depth = 1;
         spin_lock_init(&irq_desc[i].lock);
+        cpus_setall(irq_desc[i].affinity);
         set_intr_gate(i, interrupt[i]);
     }
 
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/io_apic.c
--- a/xen/arch/x86/io_apic.c	Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/io_apic.c	Wed Jul 11 17:48:10 2007 +0100
@@ -34,9 +34,6 @@
 #include <asm/desc.h>
 #include <mach_apic.h>
 #include <io_ports.h>
-
-#define set_irq_info(irq, mask) ((void)0)
-#define set_native_irq_info(irq, mask) ((void)0)
 
 /* Different to Linux: our implementation can be simpler. */
 #define make_8259A_irq(irq) (io_apic_irqs &= ~(1<<(irq)))
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c	Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/irq.c	Wed Jul 11 17:48:10 2007 +0100
@@ -656,42 +656,34 @@ __initcall(setup_dump_irqs);
 __initcall(setup_dump_irqs);
 
 #ifdef CONFIG_HOTPLUG_CPU
-#include <mach_apic.h>
+#include <asm/mach-generic/mach_apic.h>
+#include <xen/delay.h>
 
 void fixup_irqs(cpumask_t map)
 {
-    unsigned int irq;
-    static int warned;
-
-    for (irq = 0; irq < NR_IRQS; irq++) {
-        cpumask_t mask;
-        if (irq == 2)
-            continue;
-
-        cpus_and(mask, irq_desc[irq].affinity, map);
-        if (any_online_cpu(mask) == NR_CPUS) {
-            printk("Breaking affinity for irq %i\n", irq);
-            mask = map;
-        }
-        if (irq_desc[irq].chip->set_affinity)
-            irq_desc[irq].chip->set_affinity(irq, mask);
-        else if (irq_desc[irq].action && !(warned++))
-            printk("Cannot set affinity for irq %i\n", irq);
-    }
-
-#if 0
-    barrier();
-    /* Ingo Molnar says: "after the IO-APIC masks have been redirected
-       [note the nop - the interrupt-enable boundary on x86 is two
-       instructions from sti] - to flush out pending hardirqs and
-       IPIs. After this point nothing is supposed to reach this CPU." */
-    __asm__ __volatile__("sti; nop; cli");
-    barrier();
-#else
-    /* That doesn't seem sufficient.  Give it 1ms. */
+    unsigned int irq;
+    static int warned;
+
+    for ( irq = 0; irq < NR_IRQS; irq++ )
+    {
+        cpumask_t mask;
+        if ( irq == 2 )
+            continue;
+
+        cpus_and(mask, irq_desc[irq].affinity, map);
+        if ( any_online_cpu(mask) == NR_CPUS )
+        {
+            printk("Breaking affinity for irq %i\n", irq);
+            mask = map;
+        }
+        if ( irq_desc[irq].handler->set_affinity )
+            irq_desc[irq].handler->set_affinity(irq, mask);
+        else if ( irq_desc[irq].action && !(warned++) )
+            printk("Cannot set affinity for irq %i\n", irq);
+    }
+
     local_irq_enable();
     mdelay(1);
     local_irq_disable();
+}
 #endif
-}
-#endif
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/smp.c
--- a/xen/arch/x86/smp.c	Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/smp.c	Wed Jul 11 17:48:10 2007 +0100
@@ -256,16 +256,6 @@ static DEFINE_SPINLOCK(call_lock);
 static DEFINE_SPINLOCK(call_lock);
 static struct call_data_struct *call_data;
 
-void lock_ipi_call_lock(void)
-{
-    spin_lock_irq(&call_lock);
-}
-
-void unlock_ipi_call_lock(void)
-{
-    spin_unlock_irq(&call_lock);
-}
-
 int smp_call_function(
     void (*func) (void *info),
     void *info,
diff -r e00547dcda09 -r c3929e540632 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c	Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/arch/x86/smpboot.c	Wed Jul 11 17:48:10 2007 +0100
@@ -110,6 +110,11 @@ EXPORT_SYMBOL(x86_cpu_to_apicid);
 EXPORT_SYMBOL(x86_cpu_to_apicid);
 
 static void map_cpu_to_logical_apicid(void);
+/* State of each CPU. */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
+static void *stack_base[NR_CPUS] __cacheline_aligned;
+spinlock_t cpu_add_remove_lock;
 
 /*
  * The bootstrap kernel entry code has set these up. Save them for
@@ -396,9 +401,11 @@ void __devinit smp_callin(void)
     /*
      * Synchronize the TSC with the BP
      */
-    if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
+    if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) {
         synchronize_tsc_ap();
-    calibrate_tsc_ap();
+        /* No sync for same reason as above */
+        calibrate_tsc_ap();
+    }
 }
 
 static int cpucount, booting_cpu;
@@ -464,8 +471,12 @@ static void construct_percpu_idt(unsigne
 {
     unsigned char idt_load[10];
 
-    idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
-    memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES*sizeof(idt_entry_t));
+    /* If IDT table exists since last hotplug, reuse it */
+    if (!idt_tables[cpu]) {
+        idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
+        memcpy(idt_tables[cpu], idt_table,
+               IDT_ENTRIES*sizeof(idt_entry_t));
+    }
 
     *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1;
     *(unsigned long *)(&idt_load[2]) = (unsigned long)idt_tables[cpu];
@@ -488,7 +499,7 @@ void __devinit start_secondary(void *unu
 
     set_processor_id(cpu);
     set_current(idle_vcpu[cpu]);
-    this_cpu(curr_vcpu) = idle_vcpu[cpu];
+    this_cpu(curr_vcpu) = idle_vcpu[cpu];
 
     percpu_traps_init();
 
@@ -516,23 +527,13 @@ void __devinit start_secondary(void *unu
 
     set_cpu_sibling_map(raw_smp_processor_id());
     wmb();
-    /*
-     * We need to hold call_lock, so there is no inconsistency
-     * between the time smp_call_function() determines number of
-     * IPI receipients, and the time when the determination is made
-     * for which cpus receive the IPI. Holding this
-     * lock helps us to not include this cpu in a currently in progress
-     * smp_call_function().
-     */
-    /*lock_ipi_call_lock();*/
     cpu_set(smp_processor_id(), cpu_online_map);
-    /*unlock_ipi_call_lock();*/
-    /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/
+    per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+
+    init_percpu_time();
 
     /* We can take interrupts now: we're officially "up". */
     local_irq_enable();
-
-    init_percpu_time();
 
     wmb();
     startup_cpu_idle_loop();
@@ -794,6 +795,22 @@ static inline int alloc_cpu_id(void)
     return cpu;
 }
 
+static struct vcpu *prepare_idle_vcpu(unsigned int cpu)
+{
+    if (idle_vcpu[cpu])
+        return idle_vcpu[cpu];
+
+    return alloc_idle_vcpu(cpu);
+}
+
+static void *prepare_idle_stack(unsigned int cpu)
+{
+    if (!stack_base[cpu])
+        stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER);
+
+    return stack_base[cpu];
+}
+
 static int __devinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -811,7 +828,7 @@ static int __devinit do_boot_cpu(int api
 
     booting_cpu = cpu;
 
-    v = alloc_idle_vcpu(cpu);
+    v = prepare_idle_vcpu(cpu);
     BUG_ON(v == NULL);
 
     /* start_eip had better be page-aligned! */
@@ -820,7 +837,7 @@ static int __devinit do_boot_cpu(int api
     /* So we see what's up */
     printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
 
-    stack_start.esp = alloc_xenheap_pages(STACK_ORDER);
+    stack_start.esp = prepare_idle_stack(cpu);
 
     /* Debug build: detect stack overflow by setting up a guard page. */
     memguard_guard_stack(stack_start.esp);
@@ -898,6 +915,12 @@ static int __devinit do_boot_cpu(int api
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+static void idle_task_exit(void)
+{
+    /* Give up lazy state borrowed by this idle vcpu */
+    __sync_lazy_execstate();
+}
+
 void cpu_exit_clear(void)
 {
     int cpu = raw_smp_processor_id();
@@ -906,7 +929,6 @@ void cpu_exit_clear(void)
 
     cpucount --;
     cpu_uninit();
-    irq_ctx_exit(cpu);
 
     cpu_clear(cpu, cpu_callout_map);
     cpu_clear(cpu, cpu_callin_map);
@@ -915,26 +937,9 @@ void cpu_exit_clear(void)
     unmap_cpu_to_logical_apicid(cpu);
 }
 
-struct warm_boot_cpu_info {
-    struct completion *complete;
-    int apicid;
-    int cpu;
-};
-
-static void __cpuinit do_warm_boot_cpu(void *p)
-{
-    struct warm_boot_cpu_info *info = p;
-    do_boot_cpu(info->apicid, info->cpu);
-    complete(info->complete);
-}
-
 static int __cpuinit __smp_prepare_cpu(int cpu)
 {
-    DECLARE_COMPLETION(done);
-    struct warm_boot_cpu_info info;
-    struct work_struct task;
     int apicid, ret;
-    struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
     apicid = x86_cpu_to_apicid[cpu];
     if (apicid == BAD_APICID) {
@@ -942,34 +947,12 @@ static int __cpuinit __smp_prepare_cpu(i
         goto exit;
     }
 
-    /*
-     * the CPU isn't initialized at boot time, allocate gdt table here.
-     * cpu_init will initialize it
-     */
-    if (!cpu_gdt_descr->address) {
-        cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
-        if (!cpu_gdt_descr->address)
-            printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
-        ret = -ENOMEM;
-        goto exit;
-    }
-
-    info.complete = &done;
-    info.apicid = apicid;
-    info.cpu = cpu;
-    INIT_WORK(&task, do_warm_boot_cpu, &info);
-
     tsc_sync_disabled = 1;
 
-    /* init low mem mapping */
-    clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
-                    KERNEL_PGD_PTRS);
-    flush_tlb_all();
-    schedule_work(&task);
-    wait_for_completion(&done);
+    do_boot_cpu(apicid, cpu);
 
     tsc_sync_disabled = 0;
-    zap_low_mappings();
+    ret = 0;
 exit:
     return ret;
 }
@@ -1002,6 +985,8 @@ static void __init smp_boot_cpus(unsigne
 
     boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
     x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+
+    stack_base[0] = stack_start.esp;
 
     /*current_thread_info()->cpu = 0;*/
     /*smp_tune_scheduling();*/
@@ -1173,7 +1158,8 @@ void __devinit smp_prepare_boot_cpu(void
     cpu_set(smp_processor_id(), cpu_callout_map);
     cpu_set(smp_processor_id(), cpu_present_map);
     cpu_set(smp_processor_id(), cpu_possible_map);
-    /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/
+    per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+    spin_lock_init(&cpu_add_remove_lock);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1196,11 +1182,12 @@ remove_siblinginfo(int cpu)
         cpu_clear(cpu, cpu_sibling_map[sibling]);
     cpus_clear(cpu_sibling_map[cpu]);
     cpus_clear(cpu_core_map[cpu]);
-    c[cpu].phys_proc_id = 0;
-    c[cpu].cpu_core_id = 0;
+    phys_proc_id[cpu] = BAD_APICID;
+    cpu_core_id[cpu] = BAD_APICID;
     cpu_clear(cpu, cpu_sibling_setup_map);
 }
 
+extern void fixup_irqs(cpumask_t map);
 int __cpu_disable(void)
 {
     cpumask_t map = cpu_online_map;
@@ -1217,12 +1204,15 @@ int __cpu_disable(void)
 
     if (cpu == 0)
         return -EBUSY;
 
+    local_irq_disable();
     clear_local_APIC();
     /* Allow any queued timer interrupts to get serviced */
     local_irq_enable();
     mdelay(1);
    local_irq_disable();
+
+    time_suspend();
 
     remove_siblinginfo(cpu);
     cpu_clear(cpu, map);
@@ -1241,13 +1231,89 @@ void __cpu_die(unsigned int cpu)
         /* They ack this in play_dead by setting CPU_DEAD */
         if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
             printk ("CPU %d is now offline\n", cpu);
-            if (1 == num_online_cpus())
-                alternatives_smp_switch(0);
             return;
         }
-        msleep(100);
+        mdelay(100);
+        mb();
+        process_pending_timers();
     }
     printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+/*
+ * XXX: One important thing missed here is to migrate vcpus
+ * from dead cpu to other online ones and then put whole
+ * system into a stop state. It assures a safe environment
+ * for a cpu hotplug/remove at normal running state.
+ *
+ * However for xen PM case, at this point:
+ *     -> All other domains should be notified with PM event,
+ *        and then in following states:
+ *          * Suspend state, or
+ *          * Paused state, which is a force step to all
+ *            domains if they do nothing to suspend
+ *     -> All vcpus of dom0 (except vcpu0) have already been
+ *        hot removed
+ * with the net effect that all other cpus only have idle vcpu
+ * running. In this special case, we can avoid vcpu migration
+ * then and system can be considered in a stop state.
+ *
+ * So current cpu hotplug is a special version for PM specific
+ * usage, and need more effort later for full cpu hotplug.
+ * (ktian1)
+ */
+int cpu_down(unsigned int cpu)
+{
+    int err = 0;
+    cpumask_t mask;
+
+    spin_lock(&cpu_add_remove_lock);
+    if (num_online_cpus() == 1) {
+        err = -EBUSY;
+        goto out;
+    }
+
+    if (!cpu_online(cpu)) {
+        err = -EINVAL;
+        goto out;
+    }
+
+    printk("Prepare to bring CPU%d down...\n", cpu);
+    /* Send notification to remote idle vcpu */
+    cpus_clear(mask);
+    cpu_set(cpu, mask);
+    per_cpu(cpu_state, cpu) = CPU_DYING;
+    smp_send_event_check_mask(mask);
+
+    __cpu_die(cpu);
+
+    if (cpu_online(cpu)) {
+        printk("Bad state (DEAD, but in online map) on CPU%d\n", cpu);
+        err = -EBUSY;
+    }
+out:
+    spin_unlock(&cpu_add_remove_lock);
+    return err;
+}
+
+int cpu_up(unsigned int cpu)
+{
+    int err = 0;
+
+    spin_lock(&cpu_add_remove_lock);
+    if (cpu_online(cpu)) {
+        printk("Bring up a online cpu. Bogus!\n");
+        err = -EBUSY;
+        goto out;
+    }
+
+    err = __cpu_up(cpu);
+    if (err < 0)
+        goto out;
+
+out:
+    spin_unlock(&cpu_add_remove_lock);
+    return err;
 }
 
 /* From kernel/power/main.c */
@@ -1308,6 +1374,22 @@ void __cpu_die(unsigned int cpu)
 
 int __devinit __cpu_up(unsigned int cpu)
 {
+#ifdef CONFIG_HOTPLUG_CPU
+    int ret=0;
+
+    /*
+     * We do warm boot only on cpus that had booted earlier
+     * Otherwise cold boot is all handled from smp_boot_cpus().
+     * cpu_callin_map is set during AP kickstart process. Its reset
+     * when a cpu is taken offline from cpu_exit_clear().
+     */
+    if (!cpu_isset(cpu, cpu_callin_map))
+        ret = __smp_prepare_cpu(cpu);
+
+    if (ret)
+        return -EIO;
+#endif
+
     /* In case one didn't come up */
     if (!cpu_isset(cpu, cpu_callin_map)) {
         printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
diff -r e00547dcda09 -r c3929e540632 xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h	Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/include/asm-x86/config.h	Wed Jul 11 17:48:10 2007 +0100
@@ -39,6 +39,9 @@
 #define CONFIG_ACPI_SRAT 1
 
 #define CONFIG_VGA 1
+
+#define CONFIG_HOTPLUG 1
+#define CONFIG_HOTPLUG_CPU 1
 
 #define HZ 100
 
diff -r e00547dcda09 -r c3929e540632 xen/include/asm-x86/smp.h
--- a/xen/include/asm-x86/smp.h	Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/include/asm-x86/smp.h	Wed Jul 11 17:48:10 2007 +0100
@@ -50,9 +50,22 @@ extern u8 x86_cpu_to_apicid[];
 extern u8 x86_cpu_to_apicid[];
 
 #define cpu_physical_id(cpu)	x86_cpu_to_apicid[cpu]
+
+/* State of each CPU. */
+#define CPU_ONLINE	0x0002	/* CPU is up */
+#define CPU_DYING	0x0003	/* CPU is requested to die */
+#define CPU_DEAD	0x0004	/* CPU is dead */
+DECLARE_PER_CPU(int, cpu_state);
+
 #ifdef CONFIG_HOTPLUG_CPU
+#define cpu_is_offline(cpu) unlikely(per_cpu(cpu_state,cpu) == CPU_DYING)
+extern int cpu_down(unsigned int cpu);
+extern int cpu_up(unsigned int cpu);
 extern void cpu_exit_clear(void);
 extern void cpu_uninit(void);
+extern void disable_nonboot_cpus(void);
+extern void enable_nonboot_cpus(void);
+#else
+static inline int cpu_is_offline(int cpu) {return 0;}
 #endif
 
 /*
diff -r e00547dcda09 -r c3929e540632 xen/include/asm-x86/system.h
--- a/xen/include/asm-x86/system.h	Wed Jul 11 17:28:09 2007 +0100
+++ b/xen/include/asm-x86/system.h	Wed Jul 11 17:48:10 2007 +0100
@@ -313,6 +313,8 @@ static always_inline unsigned long long
 #define __sti() __asm__ __volatile__("sti": : :"memory")
 /* used in the idle loop; sti takes one instruction cycle to complete */
 #define safe_halt() __asm__ __volatile__("sti; hlt": : :"memory")
+/* used when interrupts are already enabled or to shutdown the processor */
+#define halt() __asm__ __volatile__("hlt": : :"memory")
 
 /* For spinlocks etc */
 #if defined(__i386__)
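
As an illustration of how a PM suspend/resume path could drive the cpu_down()/cpu_up()
entry points added above, here is a sketch of the disable_nonboot_cpus()/enable_nonboot_cpus()
pair declared in asm-x86/smp.h. This is not part of the changeset (the patch only notes the
real versions come "From kernel/power/main.c"); the frozen_cpus mask and the error handling
are assumptions for the example only.

/* Sketch only -- not taken from the changeset above. */
static cpumask_t frozen_cpus;

void disable_nonboot_cpus(void)
{
    unsigned int cpu;

    cpus_clear(frozen_cpus);
    for_each_online_cpu ( cpu )
    {
        if ( cpu == 0 )
            continue;                   /* never offline the boot CPU */
        if ( cpu_down(cpu) == 0 )
            cpu_set(cpu, frozen_cpus);  /* remember which CPUs we took down */
        else
            printk("Failed to take CPU%d offline\n", cpu);
    }
}

void enable_nonboot_cpus(void)
{
    unsigned int cpu;

    /* On resume, bring back every CPU that was offlined for the suspend. */
    for_each_cpu_mask ( cpu, frozen_cpus )
        if ( cpu_up(cpu) != 0 )
            printk("Failed to bring CPU%d back online\n", cpu);
    cpus_clear(frozen_cpus);
}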