
Re: [Xen-devel] [PATCH 7/7] xen: credit2: try to avoid tickling cpus subject to ratelimiting



On 06/16/2017 03:14 PM, Dario Faggioli wrote:
> With context switching ratelimiting enabled, the following
> pattern is quite common in a scheduling trace:
> 
>      0.000845622 |||||||||||.x||| d32768v12 csched2:runq_insert d0v13, position 0
>      0.000845831 |||||||||||.x||| d32768v12 csched2:runq_tickle_new d0v13, processor = 12, credit = 10135529
>      0.000846546 |||||||||||.x||| d32768v12 csched2:burn_credits d2v7, credit = 2619231, delta = 255937
>  [1] 0.000846739 |||||||||||.x||| d32768v12 csched2:runq_tickle cpu 12
>      [...]
>  [2] 0.000850597 ||||||||||||x||| d32768v12 csched2:schedule cpu 12, rq# 1, busy, SMT busy, tickled
>      0.000850760 ||||||||||||x||| d32768v12 csched2:burn_credits d2v7, credit = 2614028, delta = 5203
>  [3] 0.000851022 ||||||||||||x||| d32768v12 csched2:ratelimit triggered
>  [4] 0.000851614 ||||||||||||x||| d32768v12 runstate_continue d2v7 running->running
> 
> Basically, what happens is that runq_tickle() realizes
> d0v13 should preempt d2v7, running on cpu 12, as it
> has higher credits (10135529 vs. 2619231). It therefore
> tickles cpu 12 [1], which, in turn, schedules [2].
> 
> But --surprise surprise-- d2v7 has run for less than the
> ratelimit interval [3], and hence it is _not_ preempted,
> and continues to run. This indeed looks fine. Actually,
> this is what ratelimiting is there for. Note, however,
> that:
>  1) we interrupted cpu 12 for nothing;
>  2) what if, say on cpu 8, there is a vcpu that has:
>     + less credit than d0v13 (so d0v13 can well
>       preempt it),
>     + more credit than d2v7 (that's why it was not
>       selected to be preempted),
>     + run for more than the ratelimiting interval
>       (so it can really be scheduled out)?
> 
> With this patch, if we are in case 2), we realize
> that tickling cpu 12 would be pointless, keep looking,
> and eventually find and tickle cpu 8.
> 
> Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>

Well spotted:

Reviewed-by: George Dunlap <george.dunlap@xxxxxxxxxx>

> ---
> Cc: George Dunlap <george.dunlap@xxxxxxxxxxxxx>
> Cc: Anshul Makkar <anshul.makkar@xxxxxxxxxx>
> ---
>  xen/common/sched_credit2.c |   30 ++++++++++++++++++++++++++----
>  1 file changed, 26 insertions(+), 4 deletions(-)
> 
> diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
> index bbda790..c45bc03 100644
> --- a/xen/common/sched_credit2.c
> +++ b/xen/common/sched_credit2.c
> @@ -160,6 +160,8 @@
>  #define CSCHED2_MIGRATE_RESIST       ((opt_migrate_resist)*MICROSECS(1))
>  /* How much to "compensate" a vcpu for L2 migration. */
>  #define CSCHED2_MIGRATE_COMPENSATION MICROSECS(50)
> +/* How tolerant we should be when peeking at runtime of vcpus on other cpus */
> +#define CSCHED2_RATELIMIT_TICKLE_TOLERANCE MICROSECS(50)
>  /* Reset: Value below which credit will be reset. */
>  #define CSCHED2_CREDIT_RESET         0
>  /* Max timer: Maximum time a guest can be run for. */
> @@ -1167,6 +1169,23 @@ tickle_cpu(unsigned int cpu, struct csched2_runqueue_data *rqd)
>  }
>  
>  /*
> + * What we want to know is whether svc, which we assume to be running on some
> + * pcpu, can be interrupted and preempted (which, so far, basically means
> + * whether or not it has already run for more than the ratelimit, to which we
> + * apply some tolerance).
> + */
> +static inline bool is_preemptable(const struct csched2_vcpu *svc,
> +                                    s_time_t now, s_time_t ratelimit)
> +{
> +    if ( ratelimit <= CSCHED2_RATELIMIT_TICKLE_TOLERANCE )
> +        return true;
> +
> +    ASSERT(svc->vcpu->is_running);
> +    return now - svc->vcpu->runstate.state_entry_time >
> +           ratelimit - CSCHED2_RATELIMIT_TICKLE_TOLERANCE;
> +}
> +
> +/*
>   * Score to preempt the target cpu.  Return a negative number if the
>   * credit isn't high enough; if it is, favor a preemption on cpu in
>   * this order:
> @@ -1180,10 +1199,12 @@ tickle_cpu(unsigned int cpu, struct csched2_runqueue_data *rqd)
>   *
>   * Within the same class, the highest difference of credit.
>   */
> -static s_time_t tickle_score(struct csched2_runqueue_data *rqd, s_time_t now,
> +static s_time_t tickle_score(const struct scheduler *ops, s_time_t now,
>                               struct csched2_vcpu *new, unsigned int cpu)
>  {
> +    struct csched2_runqueue_data *rqd = c2rqd(ops, cpu);
>      struct csched2_vcpu * cur = csched2_vcpu(curr_on_cpu(cpu));
> +    struct csched2_private *prv = csched2_priv(ops);
>      s_time_t score;
>  
>      /*
> @@ -1191,7 +1212,8 @@ static s_time_t tickle_score(struct csched2_runqueue_data *rqd, s_time_t now,
>       * in rqd->idle). However, some of them may be running their idle vcpu,
>       * if taking care of tasklets. In that case, we want to leave it alone.
>       */
> -    if ( unlikely(is_idle_vcpu(cur->vcpu)) )
> +    if ( unlikely(is_idle_vcpu(cur->vcpu) ||
> +         !is_preemptable(cur, now, MICROSECS(prv->ratelimit_us))) )
>          return -1;
>  
>      burn_credits(rqd, cur, now);
> @@ -1348,7 +1370,7 @@ runq_tickle(const struct scheduler *ops, struct csched2_vcpu *new, s_time_t now)
>      cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu));
>      if ( __cpumask_test_and_clear_cpu(cpu, &mask) )
>      {
> -        s_time_t score = tickle_score(rqd, now, new, cpu);
> +        s_time_t score = tickle_score(ops, now, new, cpu);
>  
>          if ( score > max )
>          {
> @@ -1371,7 +1393,7 @@ runq_tickle(const struct scheduler *ops, struct csched2_vcpu *new, s_time_t now)
>          /* Already looked at this one above */
>          ASSERT(i != cpu);
>  
> -        score = tickle_score(rqd, now, new, i);
> +        score = tickle_score(ops, now, new, i);
>  
>          if ( score > max )
>          {
> 
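To see the tolerance check in isolation, here is a minimal standalone sketch
of the same idea. It is not the Xen code: the simplified types, the
TICKLE_TOLERANCE and is_preemptable_sketch names, the 1000us ratelimit value
(assuming the default sched_ratelimit_us) and the main() driver are all
illustrative. The point it demonstrates is that a vcpu only counts as worth
tickling once it has run past (ratelimit - tolerance), so a tickle sent now
is unlikely to be refused by the ratelimit check in schedule().

/* Standalone sketch of the ratelimit tolerance check (illustrative names). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef int64_t s_time_t;                       /* time in ns, as in Xen */

#define MICROSECS(us)    ((s_time_t)(us) * 1000LL)
#define TICKLE_TOLERANCE MICROSECS(50)          /* stand-in for CSCHED2_RATELIMIT_TICKLE_TOLERANCE */

/*
 * A running vcpu is treated as preemptable once it has run for more than
 * (ratelimit - tolerance); tickling its pcpu then is unlikely to be a
 * wasted interrupt.
 */
static bool is_preemptable_sketch(s_time_t run_time, s_time_t ratelimit)
{
    if ( ratelimit <= TICKLE_TOLERANCE )
        return true;                            /* ratelimiting effectively off */
    return run_time > ratelimit - TICKLE_TOLERANCE;
}

int main(void)
{
    s_time_t ratelimit = MICROSECS(1000);       /* assumed default: sched_ratelimit_us = 1000 */

    /* A vcpu that has run well under the ratelimit (as d2v7 above): not worth tickling. */
    printf("run 256us -> preemptable: %d\n",
           (int)is_preemptable_sketch(MICROSECS(256), ratelimit));
    /* A vcpu already past (ratelimit - tolerance): tickling it can really preempt it. */
    printf("run 980us -> preemptable: %d\n",
           (int)is_preemptable_sketch(MICROSECS(980), ratelimit));
    return 0;
}

Built with gcc, this prints 0 for the first case and 1 for the second, which
is exactly the distinction the patch uses to skip cpus like cpu 12 in the
trace above and keep looking for one like cpu 8.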

