
Re: [Xen-devel] [PATCH v3 6/7] xen: credit1: increase efficiency and scalability of load balancing.



On Fri, Apr 7, 2017 at 5:57 PM, Dario Faggioli
<dario.faggioli@xxxxxxxxxx> wrote:
> During load balancing, we check the non-idle pCPUs to
> see if they have runnable but not running vCPUs that
> can be stolen by, and set to run on, currently idle pCPUs.
>
> If a pCPU has only one running (or runnable) vCPU,
> though, we don't want to steal it from there, and
> it's therefore pointless bothering with it
> (especially considering that bothering means trying
> to take its runqueue lock!).
>
> On large systems, when load is only slightly higher
> than the number of pCPUs (i.e., there are just a few
> more active vCPUs than the number of the pCPUs), this
> may mean that:
>  - we go through all the pCPUs,
>  - for each one, we (try to) take its runqueue lock,
>  - we figure out there's actually nothing to be stolen!
>
> To mitigate this, we introduce a counter for the number
> of runnable vCPUs on each pCPU. In fact, unless there
> are at least 2 runnable vCPUs --typically, one running,
> and the others in the runqueue-- it does not make sense
> to try stealing anything.
>
> Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>

Reviewed-by: George Dunlap <george.dunlap@xxxxxxxxxx>
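
For context, here is a minimal, self-contained C sketch of the idea (it is
not the Xen code itself, and it uses a pthread mutex in place of Xen's
per-CPU schedule lock): the counter is only updated while holding the
owner's lock, and the balancer peeks at it without the lock so that it only
pays for the trylock and the runqueue walk when there is plausibly something
to steal. Names such as struct pcpu and try_steal_from() are invented for
illustration; in the patch the real pieces are nr_runnable,
inc_nr_runnable()/dec_nr_runnable() and the check added to
csched_load_balance().

#include <stdbool.h>
#include <pthread.h>

struct pcpu {
    pthread_mutex_t lock;      /* stand-in for the per-pCPU runqueue lock */
    unsigned int nr_runnable;  /* running vCPU plus vCPUs waiting in the runq */
};

/* Both are called with pc->lock held, on runnable <-> blocked transitions. */
static inline void inc_nr_runnable(struct pcpu *pc) { pc->nr_runnable++; }
static inline void dec_nr_runnable(struct pcpu *pc) { pc->nr_runnable--; }

/*
 * Balancer side: return true if we actually tried to steal from peer.
 * The unlocked read of nr_runnable can race with the updates above, but
 * the worst case is a wasted trylock or a skipped steal attempt, both of
 * which are benign.
 */
static bool try_steal_from(struct pcpu *peer)
{
    if ( peer->nr_runnable <= 1 )     /* nothing beyond the running vCPU */
        return false;

    if ( pthread_mutex_trylock(&peer->lock) != 0 )
        return false;                 /* peer is busy: don't spin on its lock */

    /* ... walk peer's runqueue and pick a suitable vCPU to migrate ... */

    pthread_mutex_unlock(&peer->lock);
    return true;
}

That matches the reasoning in the comment the patch adds below: racing with
dec_nr_runnable() costs at most a pointless trylock, and racing with
inc_nr_runnable() at most delays a steal that __runq_tickle() or a later
balancing pass will catch anyway.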

> ---
> Cc: George Dunlap <george.dunlap@xxxxxxxxxx>
> Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
> Cc: Anshul Makkar <anshul.makkar@xxxxxxxxxx>
> ---
> Changes from v2:
> * do as much of the nr_runnable accounting as possible inside the runqueue
>   remove and runqueue insert helpers. Add comments for the instances that
>   must live outside of them.
>
> Changes from v1:
> * don't count the idle vCPU as runnable. This is just cosmetic and not at all
>   a logic or functional change wrt v1;
> * don't count inside of __runq_remove() or __runq_insert(), but provide
>   specific counting functions, and call them when appropriate. This is
>   necessary to avoid a spurious overloaded state being reported basically
>   *all* the time that a pCPU goes through the scheduler (due to the fact that
>   the scheduler calls __runq_insert() on the current vCPU);
> * get rid of the overloaded cpumask and only use the counter. I actually did
>   like the cpumask solution better, but for this purpose, it was overkill.
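
As a side note on the accounting rule the two changelog entries above are
circling around, here is a hypothetical sketch (illustrative names only, not
the patch's code): nr_runnable tracks runnable <-> not-runnable transitions,
so the scheduler re-queueing a still-runnable current must leave the counter
alone, and only an actual block decrements it. In the patch this shows up as
csched_schedule() keeping the bare __runq_insert() while calling
dec_nr_runnable() in the "current has blocked" branch.

#include <stdbool.h>

/* Illustrative types, not Xen's. */
struct pcpu_sk { unsigned int nr_runnable; };
struct vcpu_sk { bool runnable; bool queued; };

/* Waking is a not-runnable -> runnable transition, so queue it AND count
 * it (what the patch's runq_insert() wrapper does). */
static void wake(struct pcpu_sk *pc, struct vcpu_sk *v)
{
    v->runnable = true;
    v->queued = true;
    pc->nr_runnable++;
}

/* Scheduler tail: a still-runnable current is re-queued without touching
 * the counter (the bare __runq_insert()); a blocked current is the point
 * where the counter drops. */
static void schedule_tail(struct pcpu_sk *pc, struct vcpu_sk *curr)
{
    if ( curr->runnable )
        curr->queued = true;    /* already accounted for when it woke up */
    else
        pc->nr_runnable--;      /* current has blocked */
}
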
> ---
>  xen/common/sched_credit.c |   97 ++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 87 insertions(+), 10 deletions(-)
>
> diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
> index 59b87f7..a0ad167 100644
> --- a/xen/common/sched_credit.c
> +++ b/xen/common/sched_credit.c
> @@ -172,6 +172,7 @@ struct csched_pcpu {
>      struct timer ticker;
>      unsigned int tick;
>      unsigned int idle_bias;
> +    unsigned int nr_runnable;
>  };
>
>  /*
> @@ -262,9 +263,26 @@ static inline bool_t is_runq_idle(unsigned int cpu)
>  }
>
>  static inline void
> +inc_nr_runnable(unsigned int cpu)
> +{
> +    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
> +    CSCHED_PCPU(cpu)->nr_runnable++;
> +}
> +
> +static inline void
> +dec_nr_runnable(unsigned int cpu)
> +{
> +    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
> +    ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 1);
> +    CSCHED_PCPU(cpu)->nr_runnable--;
> +}
> +
> +static inline void
>  __runq_insert(struct csched_vcpu *svc)
>  {
> -    const struct list_head * const runq = RUNQ(svc->vcpu->processor);
> +    unsigned int cpu = svc->vcpu->processor;
> +    const struct list_head * const runq = RUNQ(cpu);
>      struct list_head *iter;
>
>      BUG_ON( __vcpu_on_runq(svc) );
> @@ -292,12 +310,25 @@ __runq_insert(struct csched_vcpu *svc)
>  }
>
>  static inline void
> +runq_insert(struct csched_vcpu *svc)
> +{
> +    __runq_insert(svc);
> +    inc_nr_runnable(svc->vcpu->processor);
> +}
> +
> +static inline void
>  __runq_remove(struct csched_vcpu *svc)
>  {
>      BUG_ON( !__vcpu_on_runq(svc) );
>      list_del_init(&svc->runq_elem);
>  }
>
> +static inline void
> +runq_remove(struct csched_vcpu *svc)
> +{
> +    dec_nr_runnable(svc->vcpu->processor);
> +    __runq_remove(svc);
> +}
>
>  #define for_each_csched_balance_step(step) \
>      for ( (step) = 0; (step) <= CSCHED_BALANCE_HARD_AFFINITY; (step)++ )
> @@ -601,6 +632,7 @@ init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu)
>      /* Start off idling... */
>      BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu)));
>      cpumask_set_cpu(cpu, prv->idlers);
> +    spc->nr_runnable = 0;
>  }
>
>  static void
> @@ -1052,7 +1084,7 @@ csched_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
>      lock = vcpu_schedule_lock_irq(vc);
>
>      if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running )
> -        __runq_insert(svc);
> +        runq_insert(svc);
>
>      vcpu_schedule_unlock_irq(lock, vc);
>
> @@ -1117,7 +1149,7 @@ csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
>          cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
>      }
>      else if ( __vcpu_on_runq(svc) )
> -        __runq_remove(svc);
> +        runq_remove(svc);
>  }
>
>  static void
> @@ -1177,7 +1209,7 @@ csched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
>      }
>
>      /* Put the VCPU on the runq and tickle CPUs */
> -    __runq_insert(svc);
> +    runq_insert(svc);
>      __runq_tickle(svc);
>  }
>
> @@ -1679,8 +1711,14 @@ csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step)
>              SCHED_VCPU_STAT_CRANK(speer, migrate_q);
>              SCHED_STAT_CRANK(migrate_queued);
>              WARN_ON(vc->is_urgent);
> -            __runq_remove(speer);
> +            runq_remove(speer);
>              vc->processor = cpu;
> +            /*
> +             * speer will start executing directly on cpu, without having to
> +             * go through runq_insert(). So we must update the runnable count
> +             * for cpu here.
> +             */
> +            inc_nr_runnable(cpu);
>              return speer;
>          }
>      }
> @@ -1736,7 +1774,7 @@ csched_load_balance(struct csched_private *prv, int cpu,
>          peer_node = node;
>          do
>          {
> -            /* Find out what the !idle are in this node */
> +            /* Select the pCPUs in this node that have work we can steal. */
>              cpumask_andnot(&workers, online, prv->idlers);
>              cpumask_and(&workers, &workers, &node_to_cpumask(peer_node));
>              __cpumask_clear_cpu(cpu, &workers);
> @@ -1746,6 +1784,40 @@ csched_load_balance(struct csched_private *prv, int cpu,
>                  goto next_node;
>              do
>              {
> +                spinlock_t *lock;
> +
> +                /*
> +                 * If there is only one runnable vCPU on peer_cpu, it means
> +                 * there's no one to be stolen in its runqueue, so skip it.
> +                 *
> +                 * Checking this without holding the lock is racy... But that's
> +                 * the whole point of this optimization!
> +                 *
> +                 * In more detail:
> +                 * - if we race with dec_nr_runnable(), we may try to take the
> +                 *   lock and call csched_runq_steal() for no reason. This is
> +                 *   not a functional issue, and should be infrequent enough.
> +                 *   And we can avoid that by re-checking nr_runnable after
> +                 *   having grabbed the lock, if we want;
> +                 * - if we race with inc_nr_runnable(), we skip a pCPU that may
> +                 *   have runnable vCPUs in its runqueue, but that's not a
> +                 *   problem because:
> +                 *   + if racing with csched_vcpu_insert() or csched_vcpu_wake(),
> +                 *     __runq_tickle() will be called afterwards, so the vCPU
> +                 *     won't get stuck in the runqueue for too long;
> +                 *   + if racing with csched_runq_steal(), it may be that a
> +                 *     vCPU that we could have picked up stays in a runqueue
> +                 *     until someone else tries to steal it again. But this is
> +                 *     no worse than what can happen already (without this
> +                 *     optimization), if the pCPU would schedule right after we
> +                 *     have taken the lock, and hence block on it.
> +                 */
> +                if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 )
> +                {
> +                    TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0);
> +                    goto next_cpu;
> +                }
> +
>                  /*
>                   * Get ahold of the scheduler lock for this peer CPU.
>                   *
> @@ -1753,14 +1825,13 @@ csched_load_balance(struct csched_private *prv, int cpu,
>                   * could cause a deadlock if the peer CPU is also load
>                   * balancing and trying to lock this CPU.
>                   */
> -                spinlock_t *lock = pcpu_schedule_trylock(peer_cpu);
> +                lock = pcpu_schedule_trylock(peer_cpu);
>                  SCHED_STAT_CRANK(steal_trylock);
>                  if ( !lock )
>                  {
>                      SCHED_STAT_CRANK(steal_trylock_failed);
>                      TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0);
> -                    peer_cpu = cpumask_cycle(peer_cpu, &workers);
> -                    continue;
> +                    goto next_cpu;
>                  }
>
>                  TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1);
> @@ -1777,6 +1848,7 @@ csched_load_balance(struct csched_private *prv, int cpu,
>                      return speer;
>                  }
>
> + next_cpu:
>                  peer_cpu = cpumask_cycle(peer_cpu, &workers);
>
>              } while( peer_cpu != cpumask_first(&workers) );
> @@ -1907,7 +1979,11 @@ csched_schedule(
>      if ( vcpu_runnable(current) )
>          __runq_insert(scurr);
>      else
> +    {
>          BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
> +        /* Current has blocked. Update the runnable counter for this cpu. */
> +        dec_nr_runnable(cpu);
> +    }
>
>      snext = __runq_elem(runq->next);
>      ret.migrated = 0;
> @@ -2024,7 +2100,8 @@ csched_dump_pcpu(const struct scheduler *ops, int cpu)
>      runq = &spc->runq;
>
>      cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
> -    printk("CPU[%02d] sort=%d, sibling=%s, ", cpu, spc->runq_sort_last, cpustr);
> +    printk("CPU[%02d] nr_run=%d, sort=%d, sibling=%s, ",
> +           cpu, spc->nr_runnable, spc->runq_sort_last, cpustr);
>      cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
>      printk("core=%s\n", cpustr);
>
>
>

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 

