[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] stack overflow during pv-guest restore




When secondary cpus are initialized during an i386 pv-guest restore
(due to save/restore or live migration), and the guest has
a load that generates a fair number of interrupts (e.g., parallel kernel make),
a stack overflow can occur because cpu_initialize_context() has
a 2800 byte structure it declares on its stack.  linux-i386 has 4K stacks, by 
default.
Using 2800 bytes out of 4K by a single function in a call chain isn't nice;
add the beginning of interrupt handling at just the right point, before the
switch to the interrupt stack is made, and the scale tips.

Simple fix: malloc & free structure as needed.

Would fail save/restore testing of an i386 guest running a parallel kernel
make after
50-100 save/restores; with the fix, haven't seen it fail after 650
save/restores.

Note: this is a basic port of this function in Jeremy Fitzhardinge's Xen 
implementation of pv-ops
      in upstream Linux, part of 15/32 patch, Xen SMP guest support.

Original patch done on rhel5; did a simple diff & merge to xen-unstable's 
version
of smpboot.c to generate the attached patch, so it cleanly applies; but
haven't built/run/tested the xen-unstable version.

Signed-off-by: Donald Dutile <ddutile@xxxxxxxxxx>


--- linux-2.6.18-xen.hg/drivers/xen/core/smpboot.c.orig 2008-01-31 
13:45:10.000000000 -0500
+++ linux-2.6.18-xen.hg/drivers/xen/core/smpboot.c      2008-01-31 
13:56:07.000000000 -0500
@@ -180,9 +180,9 @@
        cpu_idle();
 }
 
-static void __cpuinit cpu_initialize_context(unsigned int cpu)
+static int cpu_initialize_context(unsigned int cpu)
 {
-       vcpu_guest_context_t ctxt;
+       vcpu_guest_context_t *ctxt;
        struct task_struct *idle = idle_task(cpu);
 #ifdef __x86_64__
        struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu];
@@ -191,58 +191,65 @@
 #endif
 
        if (cpu_test_and_set(cpu, cpu_initialized_map))
-               return;
+               return 0;
 
-       memset(&ctxt, 0, sizeof(ctxt));
+       ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
+       if (ctxt == NULL)
+               return -ENOMEM;
 
-       ctxt.flags = VGCF_IN_KERNEL;
-       ctxt.user_regs.ds = __USER_DS;
-       ctxt.user_regs.es = __USER_DS;
-       ctxt.user_regs.fs = 0;
-       ctxt.user_regs.gs = 0;
-       ctxt.user_regs.ss = __KERNEL_DS;
-       ctxt.user_regs.eip = (unsigned long)cpu_bringup_and_idle;
-       ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
+       memset(ctxt, 0, sizeof(*ctxt));
 
-       memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
+       ctxt->flags = VGCF_IN_KERNEL;
+       ctxt->user_regs.ds = __USER_DS;
+       ctxt->user_regs.es = __USER_DS;
+       ctxt->user_regs.fs = 0;
+       ctxt->user_regs.gs = 0;
+       ctxt->user_regs.ss = __KERNEL_DS;
+       ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+       ctxt->user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
 
-       smp_trap_init(ctxt.trap_ctxt);
+       memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
 
-       ctxt.ldt_ents = 0;
+       smp_trap_init(ctxt->trap_ctxt);
 
-       ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
-       ctxt.gdt_ents      = gdt_descr->size / 8;
+       ctxt->ldt_ents = 0;
+
+       ctxt->gdt_frames[0] = virt_to_mfn(gdt_descr->address);
+       ctxt->gdt_ents      = gdt_descr->size / 8;
 
 #ifdef __i386__
-       ctxt.user_regs.cs = __KERNEL_CS;
-       ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
+       ctxt->user_regs.cs = __KERNEL_CS;
+       ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
 
-       ctxt.kernel_ss = __KERNEL_DS;
-       ctxt.kernel_sp = idle->thread.esp0;
+       ctxt->kernel_ss = __KERNEL_DS;
+       ctxt->kernel_sp = idle->thread.esp0;
 
-       ctxt.event_callback_cs     = __KERNEL_CS;
-       ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
-       ctxt.failsafe_callback_cs  = __KERNEL_CS;
-       ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
+       ctxt->event_callback_cs     = __KERNEL_CS;
+       ctxt->event_callback_eip    = (unsigned long)hypervisor_callback;
+       ctxt->failsafe_callback_cs  = __KERNEL_CS;
+       ctxt->failsafe_callback_eip = (unsigned long)failsafe_callback;
 
-       ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+       ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
 #else /* __x86_64__ */
-       ctxt.user_regs.cs = __KERNEL_CS;
-       ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
+       ctxt->user_regs.cs = __KERNEL_CS;
+       ctxt->user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
 
-       ctxt.kernel_ss = __KERNEL_DS;
-       ctxt.kernel_sp = idle->thread.rsp0;
+       ctxt->kernel_ss = __KERNEL_DS;
+       ctxt->kernel_sp = idle->thread.rsp0;
 
-       ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
-       ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
-       ctxt.syscall_callback_eip  = (unsigned long)system_call;
+       ctxt->event_callback_eip    = (unsigned long)hypervisor_callback;
+       ctxt->failsafe_callback_eip = (unsigned long)failsafe_callback;
+       ctxt->syscall_callback_eip  = (unsigned long)system_call;
 
-       ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
+       ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
 
-       ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
+       ctxt->gs_base_kernel = (unsigned long)(cpu_pda(cpu));
 #endif
 
-       BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
+       BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt));
+
+       kfree(ctxt);
+       return 0;
 }
 
 void __init smp_prepare_cpus(unsigned int max_cpus)
@@ -400,7 +407,9 @@
        if (rc)
                return rc;
 
-       cpu_initialize_context(cpu);
+       rc = cpu_initialize_context(cpu);
+       if (rc)
+               return rc;
 
        if (num_online_cpus() == 1)
                alternatives_smp_switch(1);
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.