
[Xen-changelog] [xen master] xen: credit1: increase efficiency and scalability of load balancing.



commit 341450eaf7534220e67529eb62213edbddb84cae
Author:     Dario Faggioli <dario.faggioli@xxxxxxxxxx>
AuthorDate: Fri Apr 7 18:57:07 2017 +0200
Commit:     George Dunlap <george.dunlap@xxxxxxxxxx>
CommitDate: Fri Apr 7 18:17:39 2017 +0100

    xen: credit1: increase efficiency and scalability of load balancing.
    
    During load balancing, we check the non-idle pCPUs to
    see if they have runnable but not running vCPUs that
    can be stolen by, and set to run on, currently idle pCPUs.
    
    If a pCPU has only one running (or runnable) vCPU,
    though, we don't want to steal it from there, and
    it's therefore pointless bothering with it
    (especially considering that bothering means trying
    to take its runqueue lock!).
    
    On large systems, when load is only slightly higher
    than the number of pCPUs (i.e., there are just a few
    more active vCPUs than the number of the pCPUs), this
    may mean that:
     - we go through all the pCPUs,
     - for each one, we (try to) take its runqueue lock,
     - we figure out there's actually nothing to be stolen!
    
    To mitigate this, we introduce a counter for the number
    of runnable vCPUs on each pCPU. In fact, unless there
    are at least 2 runnable vCPUs --typically, one running,
    and the others in the runqueue-- it does not make sense
    to try stealing anything.
    
    Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>
    Reviewed-by: George Dunlap <george.dunlap@xxxxxxxxxx>
---
 xen/common/sched_credit.c | 97 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 87 insertions(+), 10 deletions(-)

diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index 59b87f7..a0ad167 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -172,6 +172,7 @@ struct csched_pcpu {
     struct timer ticker;
     unsigned int tick;
     unsigned int idle_bias;
+    unsigned int nr_runnable;
 };
 
 /*
@@ -262,9 +263,26 @@ static inline bool_t is_runq_idle(unsigned int cpu)
 }
 
 static inline void
+inc_nr_runnable(unsigned int cpu)
+{
+    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
+    CSCHED_PCPU(cpu)->nr_runnable++;
+
+}
+
+static inline void
+dec_nr_runnable(unsigned int cpu)
+{
+    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
+    ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 1);
+    CSCHED_PCPU(cpu)->nr_runnable--;
+}
+
+static inline void
 __runq_insert(struct csched_vcpu *svc)
 {
-    const struct list_head * const runq = RUNQ(svc->vcpu->processor);
+    unsigned int cpu = svc->vcpu->processor;
+    const struct list_head * const runq = RUNQ(cpu);
     struct list_head *iter;
 
     BUG_ON( __vcpu_on_runq(svc) );
@@ -292,12 +310,25 @@ __runq_insert(struct csched_vcpu *svc)
 }
 
 static inline void
+runq_insert(struct csched_vcpu *svc)
+{
+    __runq_insert(svc);
+    inc_nr_runnable(svc->vcpu->processor);
+}
+
+static inline void
 __runq_remove(struct csched_vcpu *svc)
 {
     BUG_ON( !__vcpu_on_runq(svc) );
     list_del_init(&svc->runq_elem);
 }
 
+static inline void
+runq_remove(struct csched_vcpu *svc)
+{
+    dec_nr_runnable(svc->vcpu->processor);
+    __runq_remove(svc);
+}
 
 #define for_each_csched_balance_step(step) \
     for ( (step) = 0; (step) <= CSCHED_BALANCE_HARD_AFFINITY; (step)++ )
@@ -601,6 +632,7 @@ init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu)
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu)));
     cpumask_set_cpu(cpu, prv->idlers);
+    spc->nr_runnable = 0;
 }
 
 static void
@@ -1052,7 +1084,7 @@ csched_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
     lock = vcpu_schedule_lock_irq(vc);
 
     if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running )
-        __runq_insert(svc);
+        runq_insert(svc);
 
     vcpu_schedule_unlock_irq(lock, vc);
 
@@ -1117,7 +1149,7 @@ csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
         cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
     }
     else if ( __vcpu_on_runq(svc) )
-        __runq_remove(svc);
+        runq_remove(svc);
 }
 
 static void
@@ -1177,7 +1209,7 @@ csched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
     }
 
     /* Put the VCPU on the runq and tickle CPUs */
-    __runq_insert(svc);
+    runq_insert(svc);
     __runq_tickle(svc);
 }
 
@@ -1679,8 +1711,14 @@ csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step)
             SCHED_VCPU_STAT_CRANK(speer, migrate_q);
             SCHED_STAT_CRANK(migrate_queued);
             WARN_ON(vc->is_urgent);
-            __runq_remove(speer);
+            runq_remove(speer);
             vc->processor = cpu;
+            /*
+             * speer will start executing directly on cpu, without having to
+             * go through runq_insert(). So we must update the runnable count
+             * for cpu here.
+             */
+            inc_nr_runnable(cpu);
             return speer;
         }
     }
@@ -1736,7 +1774,7 @@ csched_load_balance(struct csched_private *prv, int cpu,
         peer_node = node;
         do
         {
-            /* Find out what the !idle are in this node */
+            /* Select the pCPUs in this node that have work we can steal. */
             cpumask_andnot(&workers, online, prv->idlers);
             cpumask_and(&workers, &workers, &node_to_cpumask(peer_node));
             __cpumask_clear_cpu(cpu, &workers);
@@ -1746,6 +1784,40 @@ csched_load_balance(struct csched_private *prv, int cpu,
                 goto next_node;
             do
             {
+                spinlock_t *lock;
+
+                /*
+                 * If there is only one runnable vCPU on peer_cpu, it means
+                 * there's no one to be stolen in its runqueue, so skip it.
+                 *
+                 * Checking this without holding the lock is racy... But that's
+                 * the whole point of this optimization!
+                 *
+                 * In more details:
+                 * - if we race with dec_nr_runnable(), we may try to take the
+                 *   lock and call csched_runq_steal() for no reason. This is
+                 *   not a functional issue, and should be infrequent enough.
+                 *   And we can avoid that by re-checking nr_runnable after
+                 *   having grabbed the lock, if we want;
+                 * - if we race with inc_nr_runnable(), we skip a pCPU that may
+                 *   have runnable vCPUs in its runqueue, but that's not a
+                 *   problem because:
+                 *   + if racing with csched_vcpu_insert() or csched_vcpu_wake(),
+                 *     __runq_tickle() will be called afterwards, so the vCPU
+                 *     won't get stuck in the runqueue for too long;
+                 *   + if racing with csched_runq_steal(), it may be that a
+                 *     vCPU that we could have picked up, stays in a runqueue
+                 *     until someone else tries to steal it again. But this is
+                 *     no worse than what can happen already (without this
+                 *     optimization), if the pCPU would schedule right after we
+                 *     have taken the lock, and hence block on it.
+                 */
+                if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 )
+                {
+                    TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ 0);
+                    goto next_cpu;
+                }
+
                 /*
                  * Get ahold of the scheduler lock for this peer CPU.
                  *
@@ -1753,14 +1825,13 @@ csched_load_balance(struct csched_private *prv, int cpu,
                  * could cause a deadlock if the peer CPU is also load
                  * balancing and trying to lock this CPU.
                  */
-                spinlock_t *lock = pcpu_schedule_trylock(peer_cpu);
+                lock = pcpu_schedule_trylock(peer_cpu);
                 SCHED_STAT_CRANK(steal_trylock);
                 if ( !lock )
                 {
                     SCHED_STAT_CRANK(steal_trylock_failed);
                     TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0);
-                    peer_cpu = cpumask_cycle(peer_cpu, &workers);
-                    continue;
+                    goto next_cpu;
                 }
 
                 TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1);
@@ -1777,6 +1848,7 @@ csched_load_balance(struct csched_private *prv, int cpu,
                     return speer;
                 }
 
+ next_cpu:
                 peer_cpu = cpumask_cycle(peer_cpu, &workers);
 
             } while( peer_cpu != cpumask_first(&workers) );
@@ -1907,7 +1979,11 @@ csched_schedule(
     if ( vcpu_runnable(current) )
         __runq_insert(scurr);
     else
+    {
         BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
+        /* Current has blocked. Update the runnable counter for this cpu. */
+        dec_nr_runnable(cpu);
+    }
 
     snext = __runq_elem(runq->next);
     ret.migrated = 0;
@@ -2024,7 +2100,8 @@ csched_dump_pcpu(const struct scheduler *ops, int cpu)
     runq = &spc->runq;
 
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
-    printk("CPU[%02d] sort=%d, sibling=%s, ", cpu, spc->runq_sort_last, cpustr);
+    printk("CPU[%02d] nr_run=%d, sort=%d, sibling=%s, ",
+           cpu, spc->nr_runnable, spc->runq_sort_last, cpustr);
     cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
     printk("core=%s\n", cpustr);
 
--
generated by git-patchbot for /home/xen/git/xen.git#master
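
For readers who want the idea in isolation, below is a minimal, standalone C
sketch of the optimization the commit describes: keep a per-pCPU count of
runnable vCPUs, and let the load balancer skip any peer whose count is at most
1 before paying for its runqueue lock. All names here (pcpu, pcpus,
worth_stealing_from) are illustrative only; the real code keeps nr_runnable in
struct csched_pcpu and updates it under the runqueue lock, as shown in the
diff above.

/*
 * Illustrative sketch (not Xen code): a per-CPU runnable counter and the
 * cheap, lock-free check a balancer can do before trying a peer's lock.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

struct pcpu {
    unsigned int nr_runnable;   /* running vCPU + vCPUs waiting in the runq */
};

static struct pcpu pcpus[NR_CPUS];

/*
 * A peer is only worth locking if it has something to steal, i.e. at least
 * one runnable vCPU besides the one it is currently running.
 */
static bool worth_stealing_from(unsigned int peer)
{
    return pcpus[peer].nr_runnable > 1;
}

int main(void)
{
    pcpus[0].nr_runnable = 1;   /* just its running vCPU: skip it */
    pcpus[1].nr_runnable = 3;   /* one running, two queued: worth a try */

    for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++)
        printf("cpu%u: %s\n", cpu,
               worth_stealing_from(cpu) ? "try the lock and steal"
                                        : "skip, nothing to steal");
    return 0;
}

As the comment added to csched_load_balance() explains, the real check is
deliberately performed without holding the peer's lock: a stale read costs at
most one useless trylock, or one missed steal that a later balancing pass can
pick up again.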

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog

 

