
[Xen-devel] [PATCH] xen: sched/Credit2: fix bug when moving CPUs between two Credit2 cpupools



Whether or not a CPU is assigned to a runqueue (and, if so, to which
one) within a Credit2 scheduler instance must be tracked on both a
per-CPU and a per-scheduler-instance basis.

In fact, when we move a CPU between cpupools, we first set up its per-cpu
data in the new pool, and only then clean up its per-cpu data from the old
pool. Since Credit2 currently has no per-scheduler, per-cpu data (the
cpu-to-runqueue map is stored on a per-cpu basis only), the cleanup done
on behalf of the old pool can clobber the per-cpu data just set up for
the new pool, leading to crashes like these:

https://www.mail-archive.com/xen-devel@xxxxxxxxxxxxxxxxxxxx/msg23306.html
https://www.mail-archive.com/xen-devel@xxxxxxxxxxxxxxxxxxxx/msg23350.html

Basically, when csched2_deinit_pdata() is called for CPU 13, to fully
remove the CPU from Pool-0, per_cpu(runq_map, 13) already contains the
id of the runqueue to which the CPU has been assigned in the scheduler
of Pool-1, which means the wrong runqueue is manipulated in Pool-0's
scheduler. Furthermore, at the end of that call, the same runq_map is
set to -1, which is what makes the BUG_ON in csched2_schedule(), on
CPU 13, trigger.
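
To make the broken ordering concrete, here is a toy, standalone C model
of it (not Xen code: the plain array stands in for the per-cpu variable,
and the two toy_* functions only echo what the init_pdata/deinit_pdata
paths do to the map):

#include <stdio.h>

/* One map shared by every "scheduler instance", as before this patch. */
static int runq_map[16];

static void toy_init_pdata(const char *pool, int cpu, int rqi)
{
    runq_map[cpu] = rqi;    /* the new pool records its runqueue id */
    printf("%s: cpu %d -> runq %d\n", pool, cpu, rqi);
}

static void toy_deinit_pdata(const char *pool, int cpu)
{
    /* The old pool reads the map, which now holds the NEW pool's id... */
    printf("%s: removing cpu %d from runq %d (wrong queue!)\n",
           pool, cpu, runq_map[cpu]);
    runq_map[cpu] = -1;     /* ...and then clobbers it with -1. */
}

int main(void)
{
    toy_init_pdata("Pool-0", 13, 0);  /* CPU 13 starts out in Pool-0 */

    /* Moving CPU 13: the new pool is set up before the old is torn down. */
    toy_init_pdata("Pool-1", 13, 1);
    toy_deinit_pdata("Pool-0", 13);

    /* Pool-1 now finds runq_map[13] == -1: the BUG_ON in csched2_schedule(). */
    printf("Pool-1: cpu 13 has runq %d\n", runq_map[13]);
    return 0;
}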

So, instead of reverting a2c4e5ab59d "xen: credit2: make the cpu to
runqueue map per-cpu" (we don't want to go back to having the huge
array in struct csched2_private), add a per-cpu, scheduler-specific
data structure, like the one Credit1 already has. For now, it only
contains one field: the id of the runqueue the CPU is assigned to.
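
Condensed, the fix in the patch below boils down to routing the map
through the per-CPU sched_priv pointer, so that each scheduler instance
hands out (and tears down) its own copy:

struct csched2_pcpu {
    int runq_id;    /* -1 while the CPU is not assigned to any runqueue */
};

static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu)
{
    return per_cpu(schedule_data, cpu).sched_priv;
}

static inline int c2r(unsigned int cpu)
{
    return csched2_pcpu(cpu)->runq_id;
}

This way, Pool-0's csched2_deinit_pdata() operates on the csched2_pcpu
that Pool-0's scheduler allocated, and can no longer touch the one just
set up for Pool-1.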

Signed-off-by: Dario Faggioli <dfaggioli@xxxxxxxx>
---
Cc: George Dunlap <george.dunlap@xxxxxxxxxx>
Cc: Steven Haigh <netwiz@xxxxxxxxx>
Cc: Juergen Gross <jgross@xxxxxxxx>
Cc: Jan Beulich <JBeulich@xxxxxxxx>
---
This is a bugfix, so it should be considered for backporting to stable
releases that contain commit a2c4e5ab59d. Should the backport turn out
to be non-trivial, I can help with it.
---
 xen/common/sched_credit2.c |  107 ++++++++++++++++++++++++++++++--------------
 1 file changed, 73 insertions(+), 34 deletions(-)

diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index 743848121f..2b16bcea21 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -508,11 +508,10 @@ struct csched2_private {
 
 /*
  * Physical CPU
- *
- * The only per-pCPU information we need to maintain is of which runqueue
- * each CPU is part of.
  */
-static DEFINE_PER_CPU(int, runq_map);
+struct csched2_pcpu {
+    int runq_id;
+};
 
 /*
  * Virtual CPU
@@ -571,6 +570,11 @@ static inline struct csched2_private *csched2_priv(const struct scheduler *ops)
     return ops->sched_data;
 }
 
+static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu)
+{
+    return per_cpu(schedule_data, cpu).sched_priv;
+}
+
 static inline struct csched2_vcpu *csched2_vcpu(const struct vcpu *v)
 {
     return v->sched_priv;
@@ -584,7 +588,7 @@ static inline struct csched2_dom *csched2_dom(const struct domain *d)
 /* CPU to runq_id macro */
 static inline int c2r(unsigned int cpu)
 {
-    return per_cpu(runq_map, cpu);
+    return csched2_pcpu(cpu)->runq_id;
 }
 
 /* CPU to runqueue struct macro */
@@ -3778,31 +3782,45 @@ csched2_dump(const struct scheduler *ops)
 #undef cpustr
 }
 
+static void *
+csched2_alloc_pdata(const struct scheduler *ops, int cpu)
+{
+    struct csched2_pcpu *spc;
+
+    spc = xzalloc(struct csched2_pcpu);
+    if ( spc == NULL )
+        return ERR_PTR(-ENOMEM);
+
+    /* Not in any runqueue yet */
+    spc->runq_id = -1;
+
+    return spc;
+}
+
 /* Returns the ID of the runqueue the cpu is assigned to. */
 static unsigned
-init_pdata(struct csched2_private *prv, unsigned int cpu)
+init_pdata(struct csched2_private *prv, struct csched2_pcpu *spc,
+           unsigned int cpu)
 {
-    unsigned rqi;
     struct csched2_runqueue_data *rqd;
 
     ASSERT(rw_is_write_locked(&prv->lock));
     ASSERT(!cpumask_test_cpu(cpu, &prv->initialized));
+    /* CPU data needs to be allocated, but still uninitialized. */
+    ASSERT(spc && spc->runq_id == -1);
 
     /* Figure out which runqueue to put it in */
-    rqi = cpu_to_runqueue(prv, cpu);
+    spc->runq_id = cpu_to_runqueue(prv, cpu);
 
-    rqd = prv->rqd + rqi;
+    rqd = prv->rqd + spc->runq_id;
 
-    printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, rqi);
-    if ( ! cpumask_test_cpu(rqi, &prv->active_queues) )
+    printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, spc->runq_id);
+    if ( ! cpumask_test_cpu(spc->runq_id, &prv->active_queues) )
     {
         printk(XENLOG_INFO " First cpu on runqueue, activating\n");
-        activate_runqueue(prv, rqi);
+        activate_runqueue(prv, spc->runq_id);
     }
     
-    /* Set the runqueue map */
-    per_cpu(runq_map, cpu) = rqi;
-    
     __cpumask_set_cpu(cpu, &rqd->idle);
     __cpumask_set_cpu(cpu, &rqd->active);
     __cpumask_set_cpu(cpu, &prv->initialized);
@@ -3811,7 +3829,7 @@ init_pdata(struct csched2_private *prv, unsigned int cpu)
     if ( cpumask_weight(&rqd->active) == 1 )
         rqd->pick_bias = cpu;
 
-    return rqi;
+    return spc->runq_id;
 }
 
 static void
@@ -3822,16 +3840,10 @@ csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
     unsigned long flags;
     unsigned rqi;
 
-    /*
-     * pdata contains what alloc_pdata returned. But since we don't (need to)
-     * implement alloc_pdata, either that's NULL, or something is very wrong!
-     */
-    ASSERT(!pdata);
-
     write_lock_irqsave(&prv->lock, flags);
     old_lock = pcpu_schedule_lock(cpu);
 
-    rqi = init_pdata(prv, cpu);
+    rqi = init_pdata(prv, pdata, cpu);
     /* Move the scheduler lock to the new runq lock. */
     per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock;
 
@@ -3849,7 +3861,7 @@ csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
     struct csched2_vcpu *svc = vdata;
     unsigned rqi;
 
-    ASSERT(!pdata && svc && is_idle_vcpu(svc->vcpu));
+    ASSERT(pdata && svc && is_idle_vcpu(svc->vcpu));
 
     /*
      * We own one runqueue lock already (from schedule_cpu_switch()). This
@@ -3864,7 +3876,7 @@ csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
 
     idle_vcpu[cpu]->sched_priv = vdata;
 
-    rqi = init_pdata(prv, cpu);
+    rqi = init_pdata(prv, pdata, cpu);
 
     /*
      * Now that we know what runqueue we'll go in, double check what's said
@@ -3875,7 +3887,7 @@ csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
     ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &prv->rqd[rqi].lock);
 
     per_cpu(scheduler, cpu) = new_ops;
-    per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */
+    per_cpu(schedule_data, cpu).sched_priv = pdata;
 
     /*
      * (Re?)route the lock to the per pCPU lock as /last/ thing. In fact,
@@ -3894,7 +3906,7 @@ csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
     unsigned long flags;
     struct csched2_private *prv = csched2_priv(ops);
     struct csched2_runqueue_data *rqd;
-    int rqi;
+    struct csched2_pcpu *spc = pcpu;
 
     write_lock_irqsave(&prv->lock, flags);
 
@@ -3902,17 +3914,24 @@ csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
-    /*
-     * alloc_pdata is not implemented, so pcpu must be NULL. On the other
-     * hand, init_pdata must have been called for this pCPU.
-     */
-    ASSERT(!pcpu && cpumask_test_cpu(cpu, &prv->initialized));
+    /*
+     * Scheduler specific data for this pCPU must still be there and be
+     * valid. In fact, if we are here:
+     *  1. alloc_pdata must have been called for this cpu, and free_pdata
+     *     must not have been called on it before us,
+     *  2. init_pdata must have been called on this cpu, and deinit_pdata
+     *     (us!) must not have been called on it already.
+     */
+    ASSERT(spc && spc->runq_id != -1);
+    ASSERT(cpumask_test_cpu(cpu, &prv->initialized));
     
     /* Find the old runqueue and remove this cpu from it */
-    rqi = per_cpu(runq_map, cpu);
-
-    rqd = prv->rqd + rqi;
+    rqd = prv->rqd + spc->runq_id;
 
     /* No need to save IRQs here, they're already disabled */
     spin_lock(&rqd->lock);
 
-    printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, rqi);
+    printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, 
spc->runq_id);
 
     __cpumask_clear_cpu(cpu, &rqd->idle);
     __cpumask_clear_cpu(cpu, &rqd->smt_idle);
@@ -3921,12 +3940,12 @@ csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
     if ( cpumask_empty(&rqd->active) )
     {
         printk(XENLOG_INFO " No cpus left on runqueue, disabling\n");
-        deactivate_runqueue(prv, rqi);
+        deactivate_runqueue(prv, spc->runq_id);
     }
     else if ( rqd->pick_bias == cpu )
         rqd->pick_bias = cpumask_first(&rqd->active);
 
-    per_cpu(runq_map, cpu) = -1;
+    spc->runq_id = -1;
 
     spin_unlock(&rqd->lock);
 
@@ -3937,6 +3956,24 @@ csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
     return;
 }
 
+static void
+csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+    struct csched2_pcpu *spc = pcpu;
+
+    /*
+     * pcpu either points to a valid struct csched2_pcpu, or is NULL (if
+     * CPU bringup failed, and we're being called from CPU_UP_CANCELED).
+     * xfree() does not really mind, but we want to be sure that either
+     * init_pdata has never been called, or deinit_pdata has been called
+     * already.
+     */
+    ASSERT(!pcpu || spc->runq_id == -1);
+    ASSERT(!cpumask_test_cpu(cpu, &csched2_priv(ops)->initialized));
+
+    xfree(pcpu);
+}
+
 static int __init
 csched2_global_init(void)
 {
@@ -4061,8 +4098,10 @@ static const struct scheduler sched_credit2_def = {
     .deinit         = csched2_deinit,
     .alloc_vdata    = csched2_alloc_vdata,
     .free_vdata     = csched2_free_vdata,
+    .alloc_pdata    = csched2_alloc_pdata,
     .init_pdata     = csched2_init_pdata,
     .deinit_pdata   = csched2_deinit_pdata,
+    .free_pdata     = csched2_free_pdata,
     .switch_sched   = csched2_switch_sched,
     .alloc_domdata  = csched2_alloc_domdata,
     .free_domdata   = csched2_free_domdata,
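
For reference, the invariants the new ASSERTs encode follow from the
order in which the common scheduler code is expected to drive the pdata
hooks; a sketch of that order (not a real call site, error handling
omitted):

/* cpu joins a Credit2 pool... */
void *pd = ops->alloc_pdata(ops, cpu);  /* spc->runq_id == -1            */
ops->init_pdata(ops, pd, cpu);          /* runq_id = assigned runqueue   */
/* ...the CPU schedules in this pool, then leaves it... */
ops->deinit_pdata(ops, pd, cpu);        /* runq_id set back to -1        */
ops->free_pdata(ops, pd, cpu);          /* xfree(pd); pd may be NULL on  */
                                        /* the CPU_UP_CANCELED path      */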

