[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen staging] sched: fix dom0less boot with the null scheduler



commit d13dfb02aafaba376b24ff0dc64e19ba1c360803
Author:     Dario Faggioli <dfaggioli@xxxxxxxx>
AuthorDate: Tue Nov 12 17:03:49 2019 +0000
Commit:     George Dunlap <george.dunlap@xxxxxxxxxx>
CommitDate: Tue Nov 12 17:03:49 2019 +0000

    sched: fix dom0less boot with the null scheduler
    
    In a dom0less configuration, if the null scheduler is used, the system
    may fail to boot, because the loop in null_unit_wake() never exits.
    
    Bisection showed that this behavior occurs since commit d545f1d6 ("xen:
    sched: deal with vCPUs being or becoming online or offline") but the
    real problem is that, in this case, pick_res() always returns the same
    CPU.
    
    Fix this by only dealing with the simple case, i.e., when the vCPU that
    is coming online can be assigned to a sched. resource right away, in
    null_unit_wake().
    
    If it can't, just add it to the waitqueue, and we will deal with it in
    null_schedule(), being careful about not racing with vcpu_wake().
    
    Reported-by: Stefano Stabellini <sstabellini@xxxxxxxxxx>
    Signed-off-by: Dario Faggioli <dfaggioli@xxxxxxxx>
    Tested-by: Stefano Stabellini <sstabellini@xxxxxxxxxx>
    Release-acked-by: Juergen Gross <jgross@xxxxxxxx>
    Reviewed-by: George Dunlap <george.dunlap@xxxxxxxxxx>
---
 xen/common/sched_null.c | 113 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 75 insertions(+), 38 deletions(-)

diff --git a/xen/common/sched_null.c b/xen/common/sched_null.c
index 2525464a7c..da3fe29f21 100644
--- a/xen/common/sched_null.c
+++ b/xen/common/sched_null.c
@@ -568,50 +568,52 @@ static void null_unit_wake(const struct scheduler *ops,
     else
         SCHED_STAT_CRANK(unit_wake_not_runnable);
 
+    if ( likely(per_cpu(npc, cpu).unit == unit) )
+    {
+        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+        return;
+    }
+
     /*
      * If a unit is neither on a pCPU nor in the waitqueue, it means it was
-     * offline, and that it is now coming back being online.
+     * offline, and that it is now coming back being online. If we're lucky,
+     * and its previous resource is free (and affinities match), we can just
+     * assign the unit to it (we own the proper lock already) and be done.
      */
-    if ( unlikely(per_cpu(npc, cpu).unit != unit && 
list_empty(&nvc->waitq_elem)) )
+    if ( per_cpu(npc, cpu).unit == NULL &&
+         unit_check_affinity(unit, cpu, BALANCE_HARD_AFFINITY) )
     {
-        spin_lock(&prv->waitq_lock);
-        list_add_tail(&nvc->waitq_elem, &prv->waitq);
-        spin_unlock(&prv->waitq_lock);
-
-        cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
-                    cpupool_domain_master_cpumask(unit->domain));
-
-        if ( !cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) )
+        if ( !has_soft_affinity(unit) ||
+             unit_check_affinity(unit, cpu, BALANCE_SOFT_AFFINITY) )
         {
-            dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n",
-                    unit->domain->domain_id, unit->unit_id);
+            unit_assign(prv, unit, cpu);
+            cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
             return;
         }
+    }
 
-        /*
-         * Now we would want to assign the unit to cpu, but we can't, because
-         * we don't have the lock. So, let's do the following:
-         * - try to remove cpu from the list of free cpus, to avoid races with
-         *   other onlining, inserting or migrating operations;
-         * - tickle the cpu, which will pickup work from the waitqueue, and
-         *   assign it to itself;
-         * - if we're racing already, and if there still are free cpus, try
-         *   again.
-         */
-        while ( cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) )
-        {
-            unsigned int new_cpu = pick_res(prv, unit)->master_cpu;
+    /*
+     * If the resource is not free (or affinities do not match) we need
+     * to assign unit to some other one, but we can't do it here, as:
+     * - we don't own  the proper lock,
+     * - we can't change v->processor under vcpu_wake()'s feet.
+     * So we add it to the waitqueue, and tickle all the free CPUs (if any)
+     * on which unit can run. The first one that schedules will pick it up.
+     */
+    spin_lock(&prv->waitq_lock);
+    list_add_tail(&nvc->waitq_elem, &prv->waitq);
+    spin_unlock(&prv->waitq_lock);
 
-            if ( test_and_clear_bit(new_cpu, &prv->cpus_free) )
-            {
-                cpu_raise_softirq(new_cpu, SCHEDULE_SOFTIRQ);
-                return;
-            }
-        }
-    }
+    cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
+                cpupool_domain_master_cpumask(unit->domain));
+    cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+                &prv->cpus_free);
 
-    /* Note that we get here only for units assigned to a pCPU */
-    cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
+    if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
+        dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n",
+                unit->domain->domain_id, unit->unit_id);
+    else
+        cpumask_raise_softirq(cpumask_scratch_cpu(cpu), SCHEDULE_SOFTIRQ);
 }
 
 static void null_unit_sleep(const struct scheduler *ops,
@@ -827,6 +829,8 @@ static void null_schedule(const struct scheduler *ops, struct sched_unit *prev,
      */
     if ( unlikely(prev->next_task == NULL) )
     {
+        bool unit_found;
+
         spin_lock(&prv->waitq_lock);
 
         if ( list_empty(&prv->waitq) )
@@ -839,6 +843,7 @@ static void null_schedule(const struct scheduler *ops, struct sched_unit *prev,
          * it only in cases where a pcpu has no unit associated (e.g., as
          * said above, the cpu has just joined a cpupool).
          */
+        unit_found = false;
         for_each_affinity_balance_step( bs )
         {
             list_for_each_entry( wvc, &prv->waitq, waitq_elem )
@@ -849,13 +854,45 @@ static void null_schedule(const struct scheduler *ops, struct sched_unit *prev,
 
                 if ( unit_check_affinity(wvc->unit, sched_cpu, bs) )
                 {
-                    unit_assign(prv, wvc->unit, sched_cpu);
-                    list_del_init(&wvc->waitq_elem);
-                    prev->next_task = wvc->unit;
-                    goto unlock;
+                    spinlock_t *lock;
+
+                    unit_found = true;
+
+                    /*
+                     * If the unit in the waitqueue has just come up online,
+                     * we risk racing with vcpu_wake(). To avoid this, sync
+                     * on the spinlock that vcpu_wake() holds, but only with
+                     * trylock, to avoid deadlock).
+                     */
+                    lock = pcpu_schedule_trylock(sched_unit_master(wvc->unit));
+
+                    /*
+                     * We know the vcpu's lock is not this resource's lock. In
+                     * fact, if it were, since this cpu is free, vcpu_wake()
+                     * would have assigned the unit to here directly.
+                     */
+                    ASSERT(lock != get_sched_res(sched_cpu)->schedule_lock);
+
+                    if ( lock ) {
+                        unit_assign(prv, wvc->unit, sched_cpu);
+                        list_del_init(&wvc->waitq_elem);
+                        prev->next_task = wvc->unit;
+                        spin_unlock(lock);
+                        goto unlock;
+                    }
                 }
             }
         }
+        /*
+         * If we did find a unit with suitable affinity in the waitqueue, but
+         * we could not pick it up (due to lock contention), and hence we are
+         * still free, plan for another try. In fact, we don't want such unit
+         * to be stuck in the waitqueue, when there are free cpus where it
+         * could run.
+         */
+        if ( unlikely( unit_found && prev->next_task == NULL &&
+                       !list_empty(&prv->waitq)) )
+            cpu_raise_softirq(cur_cpu, SCHEDULE_SOFTIRQ);
  unlock:
         spin_unlock(&prv->waitq_lock);
 
--
generated by git-patchbot for /home/xen/git/xen.git#staging

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.