Xen project Mailing List

[Xen-devel] [PATCH v2] sched: credit2: respect per-vcpu hard affinity

From: "Justin T. Weaver" <jtweaver@xxxxxxxxxx>

Date: Sun, 8 Feb 2015 17:45:50 -1000

Cc: george.dunlap@xxxxxxxxxxxxx, dario.faggioli@xxxxxxxxxx, "Justin T. Weaver" <jtweaver@xxxxxxxxxx>, henric@xxxxxxxxxx

Delivery-date: Mon, 09 Feb 2015 03:42:17 +0000

List-id: Xen developer discussion <xen-devel.lists.xen.org>

From: "Justin T. Weaver" <jtweaver@xxxxxxxxxx>

Make credit2 respect per-vcpu hard affinity by making sure that vcpus only run on the pcpu(s) they are allowed to run on based on their hard affinity cpu masks.

Signed-off-by: Justin T. Weaver <jtweaver@xxxxxxxxxx>
---
Changes in v2:
* Added dynamically allocated cpu masks to avoid putting them on the stack; replaced temp masks from v1 throughout
* Added helper function for code suggested in v1 review and called it in two locations in function choose_cpu
* Removed v1 change to comment in the beginning of choose_cpu
* Replaced two instances of cpumask_and/cpumask_empty with cpumask_intersects
* Removed v1 re-work of code in function migrate; only change in migrate in v2 is the assignment of a valid pcpu from the destination run queue to vc->processor
* In function csched2_vcpu_migrate: removed change from v1 that called function migrate even if cur and dest run queues were the same in order to get a runq_tickle call; added processor assignment to new_cpu to fix the real underlying issue which was the vcpu not getting a call to sched_move_irqs
* Removed the looping added in v1 in function balance_load; may be added back later because it would help to have balance_load be more aware of hard affinity, but adding it does not affect credit2's current inability to respect hard affinity.
* Removed coding style fix in function balance_load * Improved comment in function runq_candidate --- xen/common/sched_credit2.c | 122 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 108 insertions(+), 14 deletions(-) diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c index cf53770..de8fb5a 100644 --- a/xen/common/sched_credit2.c +++ b/xen/common/sched_credit2.c @@ -194,6 +194,12 @@ int opt_overload_balance_tolerance=-3; integer_param("credit2_balance_over", opt_overload_balance_tolerance); /* + * Use this to avoid having too many cpumask_t structs on the stack + */ +static cpumask_t **cpumask = NULL; +#define csched2_cpumask cpumask[smp_processor_id()] + +/* * Per-runqueue data */ struct csched2_runqueue_data { @@ -268,6 +274,23 @@ struct csched2_dom { uint16_t nr_vcpus; }; +/* + * When a hard affinity change occurs, we may not be able to check some or + * all of the other run queues for a valid new processor for the given vcpu. + * Return svc's current pcpu if valid, otherwise return a safe pcpu. + */ +static int get_safe_pcpu(struct csched2_vcpu *svc) +{ + cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity, &svc->rqd->active); + if ( unlikely(cpumask_empty(csched2_cpumask)) ) + cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity, + cpupool_online_cpumask(svc->vcpu->domain->cpupool)); + + if ( cpumask_test_cpu(svc->vcpu->processor, csched2_cpumask) ) + return svc->vcpu->processor; + else + return cpumask_any(csched2_cpumask); +} /* * Time-to-credit, credit-to-time. @@ -501,8 +524,9 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu * goto tickle; } - /* Get a mask of idle, but not tickled */ + /* Get a mask of idle, but not tickled, that new is allowed to run on. 
*/ cpumask_andnot(&mask, &rqd->idle, &rqd->tickled); + cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity); /* If it's not empty, choose one */ i = cpumask_cycle(cpu, &mask); @@ -513,9 +537,11 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu * } /* Otherwise, look for the non-idle cpu with the lowest credit, - * skipping cpus which have been tickled but not scheduled yet */ + * skipping cpus which have been tickled but not scheduled yet, + * that new is allowed to run on. */ cpumask_andnot(&mask, &rqd->active, &rqd->idle); cpumask_andnot(&mask, &mask, &rqd->tickled); + cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity); for_each_cpu(i, &mask) { @@ -1063,9 +1089,8 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc) d2printk("%pv -\n", svc->vcpu); clear_bit(__CSFLAG_runq_migrate_request, &svc->flags); } - /* Leave it where it is for now. When we actually pay attention - * to affinity we'll have to figure something out... */ - return vc->processor; + + return get_safe_pcpu(svc); } /* First check to see if we're here because someone else suggested a place @@ -1081,13 +1106,17 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc) else { d2printk("%pv +\n", svc->vcpu); - new_cpu = cpumask_cycle(vc->processor, &svc->migrate_rqd->active); - goto out_up; + cpumask_and(csched2_cpumask, vc->cpu_hard_affinity, + &svc->migrate_rqd->active); + if ( !cpumask_empty(csched2_cpumask) ) + { + new_cpu = cpumask_any(csched2_cpumask); + goto out_up; + } + /* Fall-through to normal cpu pick */ } } - /* FIXME: Pay attention to cpu affinity */ - min_avgload = MAX_LOAD; /* Find the runqueue with the lowest instantaneous load */ @@ -1099,17 +1128,24 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc) rqd = prv->rqd + i; /* If checking a different runqueue, grab the lock, - * read the avg, and then release the lock. + * check hard affinity, read the avg, and then release the lock. 
* * If on our own runqueue, don't grab or release the lock; * but subtract our own load from the runqueue load to simulate * impartiality */ if ( rqd == svc->rqd ) { + if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) ) + continue; rqd_avgload = rqd->b_avgload - svc->avgload; } else if ( spin_trylock(&rqd->lock) ) { + if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) ) + { + spin_unlock(&rqd->lock); + continue; + } rqd_avgload = rqd->b_avgload; spin_unlock(&rqd->lock); } @@ -1123,12 +1159,16 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc) } } - /* We didn't find anyone (most likely because of spinlock contention); leave it where it is */ if ( min_rqi == -1 ) - new_cpu = vc->processor; + { + /* No runqs found (most likely because of spinlock contention). */ + new_cpu = get_safe_pcpu(svc); + } else { - new_cpu = cpumask_cycle(vc->processor, &prv->rqd[min_rqi].active); + cpumask_and(csched2_cpumask, vc->cpu_hard_affinity, + &prv->rqd[min_rqi].active); + new_cpu = cpumask_any(csched2_cpumask); BUG_ON(new_cpu >= nr_cpu_ids); } @@ -1207,7 +1247,12 @@ static void migrate(const struct scheduler *ops, on_runq=1; } __runq_deassign(svc); - svc->vcpu->processor = cpumask_any(&trqd->active); + + cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity, + &trqd->active); + svc->vcpu->processor = cpumask_any(csched2_cpumask); + BUG_ON(svc->vcpu->processor >= nr_cpu_ids); + __runq_assign(svc, trqd); if ( on_runq ) { @@ -1330,6 +1375,12 @@ retry: if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) ) continue; + /* Skip if it can't run on the destination runq. 
*/ + cpumask_and(csched2_cpumask, push_svc->vcpu->cpu_hard_affinity, + &st.orqd->active); + if ( cpumask_empty(csched2_cpumask) ) + continue; + list_for_each( pull_iter, &st.orqd->svc ) { struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct csched2_vcpu, rqd_elem); @@ -1343,6 +1394,12 @@ retry: if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) ) continue; + /* Skip if it can't run on the destination runq. */ + cpumask_and(csched2_cpumask, pull_svc->vcpu->cpu_hard_affinity, + &st.lrqd->active); + if ( cpumask_empty(csched2_cpumask) ) + continue; + consider(&st, push_svc, pull_svc); } @@ -1360,6 +1417,12 @@ retry: if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) ) continue; + /* Skip if it can't run on the destination runq. */ + cpumask_and(csched2_cpumask, pull_svc->vcpu->cpu_hard_affinity, + &st.lrqd->active); + if ( cpumask_empty(csched2_cpumask) ) + continue; + /* Consider pull only */ consider(&st, NULL, pull_svc); } @@ -1396,6 +1459,15 @@ csched2_vcpu_migrate( /* Check if new_cpu is valid */ BUG_ON(!cpumask_test_cpu(new_cpu, &CSCHED2_PRIV(ops)->initialized)); + BUG_ON(!cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity)); + + /* + * Assign new_cpu to vc->processor here to get a call to sched_move_irqs + * in schedule.c in case there was a hard affinity change within the same + * run queue. vc will not be able to run in certain situations without + * this call. + */ + vc->processor = new_cpu; trqd = RQD(ops, new_cpu); @@ -1610,6 +1682,10 @@ runq_candidate(struct csched2_runqueue_data *rqd, { struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, runq_elem); + /* Only consider vcpus that are allowed to run on this processor. */ + if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) ) + continue; + /* If this is on a different processor, don't pull it unless * its credit is at least CSCHED2_MIGRATE_RESIST higher. 
*/ if ( svc->vcpu->processor != cpu @@ -1992,6 +2068,13 @@ csched2_alloc_pdata(const struct scheduler *ops, int cpu) printk("%s: cpu %d not online yet, deferring initializatgion\n", __func__, cpu); + /* + * For each new pcpu, allocate a cpumask_t for use throughout the + * scheduler to avoid putting any cpumask_t structs on the stack. + */ + if ( !zalloc_cpumask_var(&cpumask[cpu]) ) + return NULL; + return (void *)1; } @@ -2040,6 +2123,8 @@ csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) spin_unlock_irqrestore(&prv->lock, flags); + free_cpumask_var(cpumask[cpu]); + return; } @@ -2127,16 +2212,25 @@ csched2_init(struct scheduler *ops) prv->load_window_shift = opt_load_window_shift; + cpumask = xzalloc_bytes(nr_cpu_ids * sizeof(cpumask_t *)); + if ( cpumask == NULL ) + return -ENOMEM; + return 0; } static void csched2_deinit(const struct scheduler *ops) { + int i; struct csched2_private *prv; prv = CSCHED2_PRIV(ops); xfree(prv); + + for ( i = 0; i < nr_cpu_ids; i++ ) + free_cpumask_var(cpumask[i]); + xfree(cpumask); } -- 1.7.10.4 _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.