
[Xen-changelog] [xen-unstable] [XEN] Initial support for multi-core and multi-threaded CPU scheduling.



# HG changeset patch
# User Emmanuel Ackaouy <ack@xxxxxxxxxxxxx>
# Node ID bb6cd7ba259b7552e2f46f986c1580350af10517
# Parent  32e4952c063866165d9cab913f67b57f8e78aded
[XEN] Initial support for multi-core and multi-threaded CPU scheduling.
In multi-core and multi-threaded systems, not all idling "CPUs" are
equal: when idle execution vehicles are available, it is better to
spread VCPUs across distinct sockets and cores before packing them
onto sibling cores and hyperthreads.
Signed-off-by: Emmanuel Ackaouy <ack@xxxxxxxxxxxxx>
---
 xen/common/sched_credit.c  |  210 ++++++++++++++++++++++++++++++++++++++++-----
 xen/common/sched_sedf.c    |    9 +
 xen/common/schedule.c      |    4 
 xen/include/xen/sched-if.h |    1 
 4 files changed, 202 insertions(+), 22 deletions(-)
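
The heart of the change is the pair csched_idler_compare() / csched_cpu_pick()
added to sched_credit.c below. The following standalone sketch reproduces the
same comparison and pruning logic outside of Xen so it can be compiled and run
on its own: the CPU topology and the idler set are modelled with plain 64-bit
masks instead of cpumask_t, and the 2-socket x 2-core x 2-thread layout is
invented purely for illustration, so read it as an approximation of the
patched code rather than an excerpt from it.

/*
 * Standalone sketch of the topology-aware placement heuristic that
 * this patch adds to sched_credit.c.  The topology and idler set are
 * plain 64-bit masks rather than Xen's cpumask_t, and the 2-socket x
 * 2-core x 2-thread layout is made up for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define NR_CPUS 8
#define MASK(c) (UINT64_C(1) << (c))

/* Hypothetical topology, using Xen's naming: cpu_sibling_map[c] is the
 * set of threads sharing c's core, cpu_core_map[c] the set of CPUs
 * sharing c's socket. */
static const uint64_t cpu_sibling_map[NR_CPUS] = {
    0x03, 0x03, 0x0c, 0x0c, 0x30, 0x30, 0xc0, 0xc0
};
static const uint64_t cpu_core_map[NR_CPUS] = {
    0x0f, 0x0f, 0x0f, 0x0f, 0xf0, 0xf0, 0xf0, 0xf0
};

static int weight(uint64_t m)            /* stands in for cpus_weight() */
{
    int n = 0;
    for ( ; m != 0; m >>= 1 )
        n += (int)(m & 1);
    return n;
}

static int first(uint64_t m)             /* first_cpu(); m must be non-zero */
{
    int c = 0;
    while ( !(m & 1) ) { m >>= 1; c++; }
    return c;
}

/*
 * Mirrors csched_idler_compare(): drop both candidates from the idler
 * set, then compare how many of the remaining idlers each candidate
 * has in its core (same-socket case) or in its socket (different-socket
 * case).  Positive means "one" is the better home for an extra VCPU.
 */
static int idler_compare(uint64_t idlers, int one, int two)
{
    idlers &= ~(MASK(one) | MASK(two));

    if ( cpu_core_map[two] & MASK(one) )
        return weight(idlers & cpu_sibling_map[one]) -
               weight(idlers & cpu_sibling_map[two]);

    return weight(idlers & cpu_core_map[one]) -
           weight(idlers & cpu_core_map[two]);
}

/*
 * Mirrors the pruning loop in csched_cpu_pick(): keep the best idler
 * seen so far and, whenever a candidate loses, discard its whole core
 * or socket so each group is examined at most once.
 */
static int pick_idler(uint64_t idlers, int cpu)
{
    uint64_t cpus = idlers & ~MASK(cpu);

    while ( cpus != 0 )
    {
        int nxt = first(cpus);

        if ( idler_compare(idlers, cpu, nxt) < 0 )
        {
            cpu = nxt;
            cpus &= ~MASK(nxt);
        }
        else if ( cpu_core_map[nxt] & MASK(cpu) )
            cpus &= ~cpu_sibling_map[nxt];  /* same socket: skip nxt's core */
        else
            cpus &= ~cpu_core_map[nxt];     /* other socket: skip it wholesale */
    }

    return cpu;
}

int main(void)
{
    /* CPU 0 is busy, CPUs 1-7 idle.  Starting from CPU 1 (CPU 0's
     * hyperthread), the heuristic should prefer the fully idle second
     * socket over sharing CPU 0's core or socket. */
    uint64_t idlers = 0xfe;

    printf("best idler starting from CPU 1: %d\n", pick_idler(idlers, 1));
    return 0;
}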

diff -r 32e4952c0638 -r bb6cd7ba259b xen/common/sched_credit.c
--- a/xen/common/sched_credit.c Mon Nov 06 16:55:56 2006 +0000
+++ b/xen/common/sched_credit.c Mon Nov 06 17:32:00 2006 +0000
@@ -115,6 +115,10 @@
     _MACRO(steal_peer_idle)                 \
     _MACRO(steal_peer_running)              \
     _MACRO(steal_peer_pinned)               \
+    _MACRO(steal_peer_migrating)            \
+    _MACRO(steal_peer_best_idler)           \
+    _MACRO(steal_loner_candidate)           \
+    _MACRO(steal_loner_signal)              \
     _MACRO(dom_init)                        \
     _MACRO(dom_destroy)                     \
     _MACRO(vcpu_init)                       \
@@ -370,8 +374,42 @@ __csched_vcpu_check(struct vcpu *vc)
 #define CSCHED_VCPU_CHECK(_vc)
 #endif
 
+/*
+ * Indicates which of two given idle CPUs is the more efficient place
+ * to run an additional VCPU.
+ *
+ * Returns:
+ *  0:           both are equally good.
+ *  negative:    "one" is less efficient than "two".
+ *  positive:    "one" is more efficient than "two".
+ */
+static int
+csched_idler_compare(int one, int two)
+{
+    cpumask_t idlers;
+    cpumask_t one_idlers;
+    cpumask_t two_idlers;
+
+    idlers = csched_priv.idlers;
+    cpu_clear(one, idlers);
+    cpu_clear(two, idlers);
+
+    if ( cpu_isset(one, cpu_core_map[two]) )
+    {
+        cpus_and(one_idlers, idlers, cpu_sibling_map[one]);
+        cpus_and(two_idlers, idlers, cpu_sibling_map[two]);
+    }
+    else
+    {
+        cpus_and(one_idlers, idlers, cpu_core_map[one]);
+        cpus_and(two_idlers, idlers, cpu_core_map[two]);
+    }
+
+    return cpus_weight(one_idlers) - cpus_weight(two_idlers);
+}
+
 static inline int
-__csched_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
+__csched_queued_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
 {
     /*
      * Don't pick up work that's in the peer's scheduling tail. Also only pick
@@ -386,6 +424,32 @@ __csched_vcpu_is_stealable(int local_cpu
     if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
     {
         CSCHED_STAT_CRANK(steal_peer_pinned);
+        return 0;
+    }
+
+    return 1;
+}
+
+static inline int
+__csched_running_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
+{
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
+    {
+        CSCHED_STAT_CRANK(steal_peer_pinned);
+        return 0;
+    }
+
+    if ( test_bit(_VCPUF_migrating, &vc->vcpu_flags) )
+    {
+        CSCHED_STAT_CRANK(steal_peer_migrating);
+        return 0;
+    }
+
+    if ( csched_idler_compare(local_cpu, vc->processor) <= 0 )
+    {
+        CSCHED_STAT_CRANK(steal_peer_best_idler);
         return 0;
     }
 
@@ -652,6 +716,64 @@ csched_dom_destroy(struct domain *dom)
     xfree(sdom);
 }
 
+static int
+csched_cpu_pick(struct vcpu *vc)
+{
+    cpumask_t cpus;
+    int cpu, nxt;
+
+    /*
+     * Pick from online CPUs in VCPU's affinity mask, giving a
+     * preference to its current processor if it's in there.
+     */
+    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+    ASSERT( !cpus_empty(cpus) );
+    cpu = cpu_isset(vc->processor, cpus) ? vc->processor : first_cpu(cpus);
+
+    /*
+     * Try to find an idle processor within the above constraints.
+     */
+    cpus_and(cpus, cpus, csched_priv.idlers);
+    if ( !cpus_empty(cpus) )
+    {
+        cpu = cpu_isset(cpu, cpus) ? cpu : first_cpu(cpus);
+        cpu_clear(cpu, cpus);
+
+        /*
+         * In multi-core and multi-threaded CPUs, not all idle execution
+         * vehicles are equal!
+         *
+         * We give preference to the idle execution vehicle with the most
+         * idling neighbours in its grouping. This distributes work across
+         * distinct sockets and cores first and guarantees we don't do
+         * something stupid like run two VCPUs on co-hyperthreads while
+         * there are idle cores or sockets.
+         */
+        while ( !cpus_empty(cpus) )
+        {
+            nxt = first_cpu(cpus);
+
+            if ( csched_idler_compare(cpu, nxt) < 0 )
+            {
+                cpu = nxt;
+                cpu_clear(nxt, cpus);
+            }
+            else if ( cpu_isset(cpu, cpu_core_map[nxt]) )
+            {
+                cpus_andnot(cpus, cpus, cpu_sibling_map[nxt]);
+            }
+            else
+            {
+                cpus_andnot(cpus, cpus, cpu_core_map[nxt]);
+            }
+
+            ASSERT( !cpu_isset(nxt, cpus) );
+        }
+    }
+
+    return cpu;
+}
+
 /*
  * This is a O(n) optimized sort of the runq.
  *
@@ -939,7 +1061,7 @@ csched_runq_steal(struct csched_pcpu *sp
         vc = speer->vcpu;
         BUG_ON( is_idle_vcpu(vc) );
 
-        if ( __csched_vcpu_is_stealable(cpu, vc) )
+        if ( __csched_queued_vcpu_is_stealable(cpu, vc) )
         {
             /* We got a candidate. Grab it! */
             __runq_remove(speer);
@@ -959,6 +1081,7 @@ csched_load_balance(int cpu, struct csch
     struct csched_pcpu *spc;
     struct vcpu *peer_vcpu;
     cpumask_t workers;
+    cpumask_t loners;
     int peer_cpu;
 
     if ( snext->pri == CSCHED_PRI_IDLE )
@@ -971,6 +1094,7 @@ csched_load_balance(int cpu, struct csch
     /*
      * Peek at non-idling CPUs in the system
      */
+    cpus_clear(loners);
     cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
     cpu_clear(cpu, workers);
 
@@ -999,13 +1123,12 @@ csched_load_balance(int cpu, struct csch
             continue;
         }
 
+        peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
         spc = CSCHED_PCPU(peer_cpu);
-        peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
 
         if ( unlikely(spc == NULL) )
         {
             CSCHED_STAT_CRANK(steal_peer_down);
-            speer = NULL;
         }
         else if ( unlikely(is_idle_vcpu(peer_vcpu)) )
         {
@@ -1014,26 +1137,72 @@ csched_load_balance(int cpu, struct csch
              * pick up work from it itself.
              */
             CSCHED_STAT_CRANK(steal_peer_idle);
-            speer = NULL;
+        }
+        else if ( is_idle_vcpu(__runq_elem(spc->runq.next)->vcpu) )
+        {
+            if ( snext->pri == CSCHED_PRI_IDLE &&
+                 __csched_running_vcpu_is_stealable(cpu, peer_vcpu) )
+            {
+                CSCHED_STAT_CRANK(steal_loner_candidate);
+                cpu_set(peer_cpu, loners);
+            }
         }
         else
         {
-            /* Try to steal work from an online non-idle CPU. */
+            /* Try to steal work from a remote CPU's runq. */
             speer = csched_runq_steal(spc, cpu, snext->pri);
+            if ( speer != NULL )
+            {
+                spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
+                CSCHED_STAT_CRANK(vcpu_migrate);
+                speer->stats.migrate++;
+                return speer;
+            }
         }
 
         spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
-
-        /* Got one? */
-        if ( speer )
-        {
-            CSCHED_STAT_CRANK(vcpu_migrate);
-            speer->stats.migrate++;
-            return speer;
-        }
-    }
-
-    /* Failed to find more important work */
+    }
+
+    /*
+     * If we failed to find any remotely queued VCPUs to move here,
+     * see if it would be more efficient to move any of the running
+     * remote VCPUs over here.
+     */
+    while ( !cpus_empty(loners) )
+    {
+        /* For each CPU of interest, starting with our neighbour... */
+        peer_cpu = next_cpu(peer_cpu, loners);
+        if ( peer_cpu == NR_CPUS )
+            peer_cpu = first_cpu(loners);
+
+        cpu_clear(peer_cpu, loners);
+
+        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
+        {
+            CSCHED_STAT_CRANK(steal_trylock_failed);
+            continue;
+        }
+
+        peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
+        spc = CSCHED_PCPU(peer_cpu);
+
+        if ( !is_idle_vcpu(peer_vcpu) &&
+             is_idle_vcpu(__runq_elem(spc->runq.next)->vcpu) &&
+             __csched_running_vcpu_is_stealable(cpu, peer_vcpu) )
+        {
+            set_bit(_VCPUF_migrating, &peer_vcpu->vcpu_flags);
+            spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
+
+            CSCHED_STAT_CRANK(steal_loner_signal);
+            cpu_raise_softirq(peer_cpu, SCHEDULE_SOFTIRQ);
+        }
+        else
+        {
+            spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
+        }
+    }
+
+    /* Failed to find more important work elsewhere... */
     __runq_remove(snext);
     return snext;
 }
@@ -1139,9 +1308,11 @@ csched_dump_pcpu(int cpu)
     spc = CSCHED_PCPU(cpu);
     runq = &spc->runq;
 
-    printk(" tick=%lu, sort=%d\n",
+    printk(" tick=%lu, sort=%d, sibling=0x%lx, core=0x%lx\n",
             per_cpu(schedule_data, cpu).tick,
-            spc->runq_sort_last);
+            spc->runq_sort_last,
+            cpu_sibling_map[cpu].bits[0],
+            cpu_core_map[cpu].bits[0]);
 
     /* current VCPU */
     svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
@@ -1247,6 +1418,7 @@ struct scheduler sched_credit_def = {
 
     .adjust         = csched_dom_cntl,
 
+    .pick_cpu       = csched_cpu_pick,
     .tick           = csched_tick,
     .do_schedule    = csched_schedule,
 
diff -r 32e4952c0638 -r bb6cd7ba259b xen/common/sched_sedf.c
--- a/xen/common/sched_sedf.c   Mon Nov 06 16:55:56 2006 +0000
+++ b/xen/common/sched_sedf.c   Mon Nov 06 17:32:00 2006 +0000
@@ -409,6 +409,14 @@ static void sedf_destroy_domain(struct d
 static void sedf_destroy_domain(struct domain *d)
 {
     xfree(d->sched_priv);
+}
+
+static int sedf_pick_cpu(struct vcpu *v)
+{
+    cpumask_t online_affinity;
+
+    cpus_and(online_affinity, v->cpu_affinity, cpu_online_map);
+    return first_cpu(online_affinity);
 }
 
 /*
@@ -1436,6 +1444,7 @@ struct scheduler sched_sedf_def = {
     .destroy_vcpu   = sedf_destroy_vcpu,
 
     .do_schedule    = sedf_do_schedule,
+    .pick_cpu       = sedf_pick_cpu,
     .dump_cpu_state = sedf_dump_cpu_state,
     .sleep          = sedf_sleep,
     .wake           = sedf_wake,
diff -r 32e4952c0638 -r bb6cd7ba259b xen/common/schedule.c
--- a/xen/common/schedule.c     Mon Nov 06 16:55:56 2006 +0000
+++ b/xen/common/schedule.c     Mon Nov 06 17:32:00 2006 +0000
@@ -203,7 +203,6 @@ void vcpu_wake(struct vcpu *v)
 
 static void vcpu_migrate(struct vcpu *v)
 {
-    cpumask_t online_affinity;
     unsigned long flags;
     int old_cpu;
 
@@ -218,8 +217,7 @@ static void vcpu_migrate(struct vcpu *v)
 
     /* Switch to new CPU, then unlock old CPU. */
     old_cpu = v->processor;
-    cpus_and(online_affinity, v->cpu_affinity, cpu_online_map);
-    v->processor = first_cpu(online_affinity);
+    v->processor = SCHED_OP(pick_cpu, v);
     spin_unlock_irqrestore(
         &per_cpu(schedule_data, old_cpu).schedule_lock, flags);
 
diff -r 32e4952c0638 -r bb6cd7ba259b xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h        Mon Nov 06 16:55:56 2006 +0000
+++ b/xen/include/xen/sched-if.h        Mon Nov 06 17:32:00 2006 +0000
@@ -74,6 +74,7 @@ struct scheduler {
 
     struct task_slice (*do_schedule) (s_time_t);
 
+    int          (*pick_cpu)       (struct vcpu *);
     int          (*adjust)         (struct domain *,
                                     struct xen_domctl_scheduler_op *);
     void         (*dump_settings)  (void);
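
The schedule.c and sched-if.h hunks above replace the hard-coded "first online
CPU in the affinity mask" placement in vcpu_migrate() with a per-scheduler
pick_cpu hook: sedf keeps the old behaviour in sedf_pick_cpu(), while credit
supplies the topology-aware csched_cpu_pick(). Below is a minimal,
self-contained sketch of that dispatch; struct vcpu, struct scheduler and the
SCHED_OP-style call are simplified stand-ins, and the "topology-aware" policy
here is a toy that merely prefers a hard-wired CPU, not the real credit logic.

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-ins for Xen's struct vcpu and struct scheduler. */
struct vcpu {
    int      processor;     /* CPU the VCPU last ran on */
    uint64_t cpu_affinity;  /* bitmask of CPUs it may run on */
};

struct scheduler {
    const char *name;
    int (*pick_cpu)(struct vcpu *v);
};

static const uint64_t cpu_online_map = 0x0f;   /* CPUs 0-3 online (example) */

static int first_cpu(uint64_t mask)
{
    int cpu = 0;
    if ( mask == 0 )
        return -1;
    while ( !(mask & 1) ) { mask >>= 1; cpu++; }
    return cpu;
}

/* sedf-style policy: first online CPU in the VCPU's affinity mask. */
static int simple_pick_cpu(struct vcpu *v)
{
    return first_cpu(v->cpu_affinity & cpu_online_map);
}

/* Toy "credit-style" policy: pretend CPU 2 sits on the emptiest socket,
 * just to show a different placement decision.  The real credit policy
 * is csched_cpu_pick() in the patch above. */
static int topology_aware_pick_cpu(struct vcpu *v)
{
    uint64_t ok = v->cpu_affinity & cpu_online_map;
    return (ok & (UINT64_C(1) << 2)) ? 2 : first_cpu(ok);
}

/* Stand-in for the SCHED_OP(pick_cpu, v) call now made by vcpu_migrate(). */
static void migrate(const struct scheduler *ops, struct vcpu *v)
{
    v->processor = ops->pick_cpu(v);
    printf("%s placed the VCPU on CPU %d\n", ops->name, v->processor);
}

int main(void)
{
    struct vcpu v = { .processor = 0, .cpu_affinity = 0x0e };
    const struct scheduler sedf_like   = { "sedf-like",   simple_pick_cpu };
    const struct scheduler credit_like = { "credit-like", topology_aware_pick_cpu };

    migrate(&sedf_like, &v);
    migrate(&credit_like, &v);
    return 0;
}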

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog