[xen staging-4.13] sched: fix cpu offlining with core scheduling



commit 680356ed13b62b790b2f8903e76f3941f71ac0ad
Author:     Juergen Gross <jgross@xxxxxxxx>
AuthorDate: Thu Apr 9 09:11:42 2020 +0200
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Thu Apr 9 09:11:42 2020 +0200

    sched: fix cpu offlining with core scheduling
    
    Offlining a cpu with core scheduling active can result in a hanging
    system. The reason is that the scheduling resource and unit of the
    cpus to be removed need to be split in order to remove the cpu from
    its cpupool and move it to the idle scheduler. If one of the involved
    cpus happens to receive a sched slave event (because a vcpu formerly
    running on that cpu is being woken up again), that cpu can enter
    sched_wait_rendezvous_in() while its scheduling resource is just
    about to be split. It might then wait forever for the other sibling
    to join, which will never happen because the resource has already
    been modified.
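
    The hang boils down to a cpu spinning on a rendezvous counter that can
    no longer reach zero. The following stand-alone sketch (plain pthreads,
    not Xen code; all names are illustrative) shows the same wait pattern
    and the kind of re-check that avoids the endless wait:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int rendezvous_in_cnt = 2;    /* two siblings expected to join */
    static void *cur_resource = &lock;   /* stands in for the sched resource */

    static void *sibling(void *arg)
    {
        void *seen;

        (void)arg;
        pthread_mutex_lock(&lock);
        seen = cur_resource;
        rendezvous_in_cnt--;

        while ( rendezvous_in_cnt )
        {
            /* Drop and re-take the lock while waiting, as the real code does. */
            pthread_mutex_unlock(&lock);
            pthread_mutex_lock(&lock);

            /* The re-check: give up if the resource changed under our feet. */
            if ( seen != cur_resource )
            {
                rendezvous_in_cnt = 0;
                pthread_mutex_unlock(&lock);
                puts("resource switched, abandoning rendezvous");
                return NULL;
            }
        }

        pthread_mutex_unlock(&lock);
        puts("rendezvous complete");
        return NULL;
    }

    int main(void)
    {
        static int other_resource;
        pthread_t t;

        pthread_create(&t, NULL, sibling, NULL);

        /* Wait until the sibling has joined the rendezvous... */
        for ( ;; )
        {
            pthread_mutex_lock(&lock);
            if ( rendezvous_in_cnt == 1 )
                break;
            pthread_mutex_unlock(&lock);
        }
        /* ...then switch the resource instead of joining, like the cpu
         * removal path does. */
        cur_resource = &other_resource;
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
    }

    Without the re-check, the sibling above would spin forever on the
    counter, which is the hang described here; with it, the stale wait is
    abandoned cleanly.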
    
    This can easily be avoided by:
    - resetting the rendezvous counters of the idle unit that is kept
    - checking for a new scheduling resource in sched_wait_rendezvous_in()
      after reacquiring the scheduling lock, and in that case resetting the
      counters without scheduling another vcpu
    - moving scheduling resource modification (in schedule_cpu_rm()) and
      retrieval (in schedule(); sched_slave() is fine already, other users
      are not critical) into locked regions
    
    Reported-by: Igor Druzhinin <igor.druzhinin@xxxxxxxxxx>
    Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
    Reviewed-by: Dario Faggioli <dfaggioli@xxxxxxxx>
    master commit: b6f5334aeaca133ad47ade06f4d22baf5984b55d
    master date: 2020-03-26 12:23:59 +0100
---
 xen/common/schedule.c | 39 ++++++++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 27fde772be..820bd5050c 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -2112,6 +2112,10 @@ void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
     rcu_read_unlock(&sched_res_rculock);
 }
 
+/*
+ * Switch to a new context or keep the current one running.
+ * On x86 it won't return, so it needs to drop the still held sched_res_rculock.
+ */
 static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
                                  bool reset_idle_unit, s_time_t now)
 {
@@ -2221,6 +2225,9 @@ static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
  * zero do_schedule() is called and the rendezvous counter for leaving
  * context_switch() is set. All other members will wait until the counter is
  * becoming zero, dropping the schedule lock in between.
+ * Either returns the new unit to run, or NULL if no context switch is
+ * required or (on Arm) has already been performed. If NULL is returned
+ * sched_res_rculock has been dropped.
  */
 static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
                                                    spinlock_t **lock, int cpu,
@@ -2228,7 +2235,8 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
 {
     struct sched_unit *next;
     struct vcpu *v;
-    unsigned int gran = get_sched_res(cpu)->granularity;
+    struct sched_resource *sr = get_sched_res(cpu);
+    unsigned int gran = sr->granularity;
 
     if ( !--prev->rendezvous_in_cnt )
     {
@@ -2295,6 +2303,21 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
             atomic_set(&prev->next_task->rendezvous_out_cnt, 0);
             prev->rendezvous_in_cnt = 0;
         }
+
+        /*
+         * Check whether the scheduling resource has been switched. This
+         * happens when we have been moved away from our cpupool and the
+         * cpus are now handled by the idle scheduler.
+         */
+        if ( unlikely(sr != get_sched_res(cpu)) )
+        {
+            ASSERT(is_idle_unit(prev));
+            atomic_set(&prev->next_task->rendezvous_out_cnt, 0);
+            prev->rendezvous_in_cnt = 0;
+            pcpu_schedule_unlock_irq(*lock, cpu);
+            rcu_read_unlock(&sched_res_rculock);
+            return NULL;
+        }
     }
 
     return prev->next_task;
@@ -2380,11 +2403,11 @@ static void schedule(void)
 
     rcu_read_lock(&sched_res_rculock);
 
+    lock = pcpu_schedule_lock_irq(cpu);
+
     sr = get_sched_res(cpu);
     gran = sr->granularity;
 
-    lock = pcpu_schedule_lock_irq(cpu);
-
     if ( prev->rendezvous_in_cnt )
     {
         /*
@@ -2965,7 +2988,10 @@ int schedule_cpu_rm(unsigned int cpu)
         per_cpu(sched_res_idx, cpu_iter) = 0;
         if ( cpu_iter == cpu )
         {
-            idle_vcpu[cpu_iter]->sched_unit->priv = NULL;
+            unit = idle_vcpu[cpu_iter]->sched_unit;
+            unit->priv = NULL;
+            atomic_set(&unit->next_task->rendezvous_out_cnt, 0);
+            unit->rendezvous_in_cnt = 0;
         }
         else
         {
@@ -2996,6 +3022,8 @@ int schedule_cpu_rm(unsigned int cpu)
     }
     sr->scheduler = &sched_idle_ops;
     sr->sched_priv = NULL;
+    sr->granularity = 1;
+    sr->cpupool = NULL;
 
     smp_mb();
     sr->schedule_lock = &sched_free_cpu_lock;
@@ -3008,9 +3036,6 @@ int schedule_cpu_rm(unsigned int cpu)
     sched_free_udata(old_ops, vpriv_old);
     sched_free_pdata(old_ops, ppriv_old, cpu);
 
-    sr->granularity = 1;
-    sr->cpupool = NULL;
-
 out:
     rcu_read_unlock(&sched_res_rculock);
     xfree(sr_new);
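
For callers, the comment added to sched_wait_rendezvous_in() above implies
the following discipline. This is only a hedged sketch of the expected
call-site shape (the exact trailing arguments and surrounding code are
assumptions, not the literal schedule()/sched_slave() code):

    next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
    if ( next == NULL )
    {
        /*
         * Nothing to switch to: per the contract above, the schedule
         * lock and sched_res_rculock have already been dropped by the
         * callee, so do not unlock or touch the old resource again.
         */
        return;
    }
    /* Otherwise the lock is still held and 'next' is the unit to run. */
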
--
generated by git-patchbot for /home/xen/git/xen.git#staging-4.13