[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] Hypervisor crash(!) on xl cpupool-numa-split



Juergen,

as promised, here is some more debug data. This is from c/s 22858 with Stephan's debug patch (attached). We get the following dump when the hypervisor crashes; note that the first lock is different from the second and subsequent ones:

(XEN) sched_credit.c, 572: prv: ffff831836df2970 &prv->lock: ffff831836df2970 prv->weight: 256 sdom->active_vcpu_count: 3 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 prv->weight: 768 sdom->active_vcpu_count: 4 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 prv->weight: 1024 sdom->active_vcpu_count: 5 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 prv->weight: 1280 sdom->active_vcpu_count: 6 sdom->weight: 256

....

Hope that gives you an idea. I attach the whole log for your reference.

Regards,
Andre

--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Welcome to Linux 2.6.32.27-pvops (hvc0)

dosorca login: root
Password: 
Linux 2.6.32.27-pvops.
Last login: Fri Jan 28 00:15:40 +0100 2011 on hvc0.
You have mail.
root@dosorca:~# sync
root@dosorca:~# cd /data/images/
root@dosorca:/data/images# sh numasplit.sh 
Removing CPUs from Pool 0
Rewriting config file
Creating new pool
Using config file "cpupool.test"
cpupool name:   Pool-node1
scheduler:      credit
number of cpus: 1
Populating new pool
Removing CPUs from Pool 0
Rewriting config file
Creating new pool
Using config file "cpupool.test"
cpupool name:   Pool-node2
scheduler:      credit
number of cpus: 1
Populating new pool
Removing CPUs from Pool 0
Rewriting config file
Creating new pool
Using config file "cpupool.test"
cpupool name:   Pool-node3
scheduler:      credit
number of cpus: 1
Populating new pool
Removing CPUs from Pool 0
Rewriting config file
Creating new pool
Using config file "cpupool.test"
cpupool name:   Pool-node4
scheduler:      credit
number of cpus: 1
Populating new pool
Removing CPUs from Pool 0
Rewriting config file
Creating new pool
Using config file "cpupool.test"
cpupool name:   Pool-node5
scheduler:      credit
number of cpus: 1
Populating new pool
Removing CPUs from Pool 0
Rewriting config file
Creating new pool
Using config file "cpupool.test"
cpupool name:   Pool-node6
scheduler:      credit
number of cpus: 1
Populating new pool
Removing CPUs from Pool 0
Rewriting config file
Creating new pool
Using config file "cpupool.test"
cpupool name:   Pool-node7
scheduler:      credit
number of cpus: 1
Populating new pool
root@dosorca:/data/images# sh numasplit.sh revert
Destroying Pool 1
adding freed CPUs to pool 0
Destroying Pool 2
adding freed CPUs to pool 0
Destroying Pool 3
adding freed CPUs to pool 0
Destroying Pool 4
adding freed CPUs to pool 0
Destroying Pool 5
adding freed CPUs to pool 0
Destroying Pool 6
adding freed CPUs to pool 0
Destroying Pool 7
adding freed CPUs to pool 0
root@dosorca:/data/images# sh numasplit.sh
Removing CPUs from Pool 0
Rewriting config file
Creating new pool
Using config file "cpupool.test"
cpupool name:   Pool-node1
scheduler:      credit
number of cpus: 1
Populating new pool
Removing CPUs from Pool 0
Rewriting config file
Creating new pool
Using config file "cpupool.test"
cpupool name:   Pool-node2
scheduler:      credit
number of cpus: 1
Populating new pool
Removing CPUs from Pool 0
(XEN) sched_credit.c, 572: prv: ffff831836df2970 &prv->lock: ffff831836df2970 
prv->weight: 256 sdom->active_vcpu_count: 3 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 768 sdom->active_vcpu_count: 4 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 1024 sdom->active_vcpu_count: 5 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 1280 sdom->active_vcpu_count: 6 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 1536 sdom->active_vcpu_count: 7 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 1792 sdom->active_vcpu_count: 8 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 2048 sdom->active_vcpu_count: 9 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 2304 sdom->active_vcpu_count: 10 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 2560 sdom->active_vcpu_count: 11 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 2816 sdom->active_vcpu_count: 12 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 3072 sdom->active_vcpu_count: 13 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 3328 sdom->active_vcpu_count: 14 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 3584 sdom->active_vcpu_count: 15 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 3840 sdom->active_vcpu_count: 16 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 4096 sdom->active_vcpu_count: 17 sdom->weight: 256
(XEN) sched_credit.c, 572: prv: ffff830437ffa5e0 &prv->lock: ffff830437ffa5e0 
prv->weight: 4352 sdom->active_vcpu_count: 18 sdom->weight: 256
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 0 on processor: 33 with state 
0 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 1 on processor: 35 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 2 on processor: 20 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 3 on processor: 26 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 4 on processor: 37 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 5 on processor: 36 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 6 on processor: 2 with state 2 
violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 7 on processor: 24 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 8 on processor: 28 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 9 on processor: 40 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 10 on processor: 4 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 11 on processor: 44 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 12 on processor: 36 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 13 on processor: 29 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 14 on processor: 3 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 15 on processor: 13 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 16 on processor: 21 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 17 on processor: 1 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 18 on processor: 20 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 19 on processor: 28 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 20 on processor: 39 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 21 on processor: 34 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 22 on processor: 41 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 23 on processor: 0 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 24 on processor: 2 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 25 on processor: 22 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 26 on processor: 42 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 27 on processor: 43 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 28 on processor: 30 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 29 on processor: 27 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 30 on processor: 23 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 31 on processor: 32 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 32 on processor: 25 with state 
0 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 33 on processor: 46 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 34 on processor: 38 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 35 on processor: 4 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 36 on processor: 45 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 37 on processor: 34 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 38 on processor: 5 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 39 on processor: 1 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 40 on processor: 30 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 41 on processor: 28 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 42 on processor: 31 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 43 on processor: 0 with state 
1 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 44 on processor: 47 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 45 on processor: 29 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 46 on processor: 44 with state 
2 violates invariant!
(XEN) BUG in sched_credit.c,1008: Domain 0 VCPU: 47 on processor: 20 with state 
2 violates invariant!
(XEN) Xen BUG at sched_credit.c:1013
(XEN) ----[ Xen-4.1.0-rc3-pre  x86_64  debug=y  Not tainted ]----
(XEN) CPU:    0
(XEN) RIP:    e008:[<ffff82c4801182f3>] csched_acct+0x197/0x51d
(XEN) RFLAGS: 0000000000010087   CONTEXT: hypervisor
(XEN) rax: 0000000000000012   rbx: ffff830434321ec0   rcx: 0000000000000000
(XEN) rdx: 0000000000001200   rsi: 0000000000000012   rdi: 0000000000000100
(XEN) rbp: ffff82c480297e10   rsp: ffff82c480297d70   r8:  0000000000000100
(XEN) r9:  ffff82c480214a20   r10: 00000000fffffffc   r11: 0000000000000001
(XEN) r12: ffff830434322000   r13: ffff82c48011815c   r14: ffff83043399f018
(XEN) r15: ffff83043399f010   cr0: 000000008005003b   cr4: 00000000000006f0
(XEN) cr3: 0000000621001000   cr2: 00007f3818efa000
(XEN) ds: 002b   es: 002b   fs: 0000   gs: 0000   ss: e010   cs: e008
(XEN) Xen stack trace from rsp=ffff82c480297d70:
(XEN)    ffff830400000002 ffff82c480297e38 fffffed480118b9e 00000000000010ff
(XEN)    ffff830437ffa5e0 ffff830437ffa5e8 ffff82c4802d3ec0 ffff830437ffa5e0
(XEN)    0000000000000282 ffff830437ffa5e8 ffff830434321ec0 00002a309695b272
(XEN)    0000110000001100 0000000000000000 ffff82c400000000 ffff82c4802d3f80
(XEN)    ffff830437ffa5e0 ffff82c48011815c ffff83043399f018 ffff83043399f010
(XEN)    ffff82c480297e40 ffff82c480126144 0000000000000002 ffff830437ffa600
(XEN)    ffff82c4802d3f80 0000001de513cb60 ffff82c480297e90 ffff82c480126469
(XEN)    ffff82c48024b020 ffff82c4802d3f80 ffff83043399f010 0000000000000000
(XEN)    0000000000000000 ffff82c4802b0880 ffff82c480297f18 ffffffffffffffff
(XEN)    ffff82c480297ed0 ffff82c480123537 ffff8300c7e340f8 ffff82c480297f18
(XEN)    ffff82c48024b020 ffff82c480297f18 0000001de5129a7f ffff82c4802d3ec0
(XEN)    ffff82c480297ee0 ffff82c4801235b2 ffff82c480297f10 ffff82c4801565f5
(XEN)    0000000000000000 ffff8300c7cd6000 0000000000000000 ffff8300c7e34000
(XEN)    ffff82c480297d48 0000000000000000 0000000000000000 0000000000000000
(XEN)    ffffffff81a69060 ffff8817a8535f10 ffff8817a8535fd8 0000000000000246
(XEN)    ffff8817a8535e80 ffff880000000001 0000000000000000 0000000000000000
(XEN)    ffffffff810093aa 000000193592cbd4 00000000deadbeef 00000000deadbeef
(XEN)    0000010000000000 ffffffff810093aa 000000000000e033 0000000000000246
(XEN)    ffff8817a8535ef8 000000000000e02b 0000000000000000 0000000000000000
(XEN)    0000000000000000 0000000000000000 0000000000000000 ffff8300c7cd6000
(XEN) Xen call trace:
(XEN)    [<ffff82c4801182f3>] csched_acct+0x197/0x51d
(XEN)    [<ffff82c480126144>] execute_timer+0x4e/0x6c
(XEN)    [<ffff82c480126469>] timer_softirq_action+0xf2/0x245
(XEN)    [<ffff82c480123537>] __do_softirq+0x88/0x99
(XEN)    [<ffff82c4801235b2>] do_softirq+0x6a/0x7a
(XEN)    [<ffff82c4801565f5>] idle_loop+0x6a/0x6f
(XEN)    
(XEN) 
(XEN) ****************************************
(XEN) Panic on CPU 0:
(XEN) Xen BUG at sched_credit.c:1013
(XEN) ****************************************
(XEN) 
(XEN) Reboot in five seconds...
(XEN) Resetting with ACPI MEMORY or I/O RESET_REG.
diff -r 9a6458e0c3f5 xen/common/cpupool.c
--- a/xen/common/cpupool.c      Tue Feb 01 19:26:36 2011 +0000
+++ b/xen/common/cpupool.c      Thu Feb 03 18:51:40 2011 +0100
@@ -30,6 +30,7 @@
 static int cpupool_moving_cpu = -1;
 static struct cpupool *cpupool_cpu_moving = NULL;
 static cpumask_t cpupool_locked_cpus = CPU_MASK_NONE;
+static int cpupool_debug_move_continue = 0;
 
 static DEFINE_SPINLOCK(cpupool_lock);
 
@@ -226,6 +227,8 @@
                     cpupool_id, cpu, ret);
 
     spin_lock(&cpupool_lock);
+       BUG_ON(!cpupool_debug_move_continue); // Continuation still flagged?
+       BUG_ON(cpu != *((volatile int*)&cpupool_moving_cpu));
     ret = cpu_disable_scheduler(cpu);
     cpu_set(cpu, cpupool_free_cpus);
     if ( !ret )
@@ -236,6 +239,7 @@
         cpupool_put(cpupool_cpu_moving);
         cpupool_cpu_moving = NULL;
     }
+       cpupool_debug_move_continue = 0; // Continuation done.
     spin_unlock(&cpupool_lock);
     return ret;
 }
@@ -300,6 +304,8 @@
     atomic_inc(&c->refcnt);
     cpupool_cpu_moving = c;
     cpu_clear(cpu, c->cpu_valid);
+       BUG_ON(cpupool_debug_move_continue); // Only one outstanding 
continuation!
+       cpupool_debug_move_continue = 1;
     spin_unlock(&cpupool_lock);
 
     work_cpu = smp_processor_id();
@@ -309,6 +315,7 @@
         if ( work_cpu == cpu )
             work_cpu = next_cpu(cpu, cpupool0->cpu_valid);
     }
+       // SD NOTE:  Why not keep the protection through cpupool_lock until 
here?
     return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c);
 
 out:
diff -r 9a6458e0c3f5 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c Tue Feb 01 19:26:36 2011 +0000
+++ b/xen/common/sched_credit.c Thu Feb 03 18:51:40 2011 +0100
@@ -567,6 +567,14 @@
         list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
         /* Make weight per-vcpu */
         prv->weight += sdom->weight;
+        if (prv->weight < sdom->active_vcpu_count * sdom->weight) {
+            printk("%s, %i: Dom: %i VCPU: %i prv: %p &prv->lock: %p 
prv->weight: %i "\
+                   "sdom->active_vcpu_count: %i sdom->weight: %i\n",
+                   __FILE__, __LINE__, sdom->dom->domain_id, 
svc->vcpu->vcpu_id,
+                   (void*) prv, &(prv->lock), prv->weight,
+                   sdom->active_vcpu_count, sdom->weight);
+        }
+        //BUG_ON(prv->weight < sdom->active_vcpu_count * sdom->weight);
         if ( list_empty(&sdom->active_sdom_elem) )
         {
             list_add(&sdom->active_sdom_elem, &prv->active_sdom);
@@ -591,6 +599,14 @@
     sdom->active_vcpu_count--;
     list_del_init(&svc->active_vcpu_elem);
     prv->weight -= sdom->weight;
+    if (prv->weight < sdom->active_vcpu_count * sdom->weight) {
+         printk("%s, %i: Dom: %i VCPU: %i prv: %p &prv->lock: %p prv->weight: 
%i "\
+                "sdom->active_vcpu_count: %i sdom->weight: %i\n",
+                __FILE__, __LINE__, sdom->dom->domain_id, svc->vcpu->vcpu_id,
+                (void*) prv, &(prv->lock), prv->weight,
+                sdom->active_vcpu_count, sdom->weight);
+    }
+    //BUG_ON(prv->weight < sdom->active_vcpu_count * sdom->weight);
     if ( list_empty(&sdom->active_vcpu) )
     {
         list_del_init(&sdom->active_sdom_elem);
@@ -987,6 +1003,17 @@
         BUG_ON( is_idle_domain(sdom->dom) );
         BUG_ON( sdom->active_vcpu_count == 0 );
         BUG_ON( sdom->weight == 0 );
+        if ( (sdom->weight * sdom->active_vcpu_count) > weight_left ) {
+            struct domain *d = sdom->dom;
+            struct vcpu   *v;
+            for_each_vcpu ( d, v ) {
+                printk("BUG in %s,%i: Domain %i VCPU: %i on processor: %i with 
"\
+                       "state %i violates invariant!\n",
+                       __FILE__,__LINE__, d->domain_id, v->vcpu_id, 
v->processor,
+                       v->runstate.state);
+            }
+        }
+
         BUG_ON( (sdom->weight * sdom->active_vcpu_count) > weight_left );
 
         weight_left -= ( sdom->weight * sdom->active_vcpu_count );
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.