
[Xen-devel] [PATCH 3/5] xen: RCU/x86/ARM: discount CPUs that were idle when grace period started.



Xen is a tickless (micro-)kernel. This means that, when a CPU
becomes idle, we stop all the activity on it, including any
periodic tick or timer.

When we imported RCU from Linux, Linux (x86) was a ticking
kernel, i.e., there was a periodic timer tick always running,
even on totally idle CPUs. This was bad from a power efficiency
perspective, but it's what made it possible to monitor the
quiescent states of all the CPUs, and hence tell when an RCU
grace period ends.

In Xen, that is impossible, and this is particularly problematic
on idle (or lightly loaded) systems, as CPUs that are idle may never
have the chance to tell RCU about their quiescence, and grace periods
could extend indefinitely!

This has led, on x86, to long (and unpredictable) delays between
the queueing of RCU callbacks and their invocation. On ARM, we
actually see infinite grace periods (e.g., complete_domain_destroy()
may never actually be invoked on an idle system). See here:

 https://lists.xenproject.org/archives/html/xen-devel/2017-01/msg02454.html

The first step for fixing this situation is for RCU to record,
at the beginning of a grace period, which CPUs are already idle.
In fact, being idle, they can't be in the middle of any read-side
critical section, so we don't have to wait for them before declaring
a grace period finished.
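
To illustrate the idea, here is a minimal sketch (not the exact code;
the real implementation is in the rcupdate.c hunk below): idle CPUs
record themselves in a dedicated cpumask, and when a new grace period
starts we simply leave them out of the set of CPUs whose quiescence
we wait for.

    /* Sketch only; the names match the patch below. */
    static cpumask_t idle_cpumask;          /* CPUs currently idle */

    void rcu_idle_enter(unsigned int cpu)
    {
        cpumask_set_cpu(cpu, &idle_cpumask);
    }

    void rcu_idle_exit(unsigned int cpu)
    {
        cpumask_clear_cpu(cpu, &idle_cpumask);
    }

    /* At grace period start: wait only on online, non-idle CPUs. */
    cpumask_andnot(&rcp->cpumask, &cpu_online_map, &idle_cpumask);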

This is tracked in a cpumask, in a way that is very similar to how
Linux was achieving the same on s390 (which indeed was tickless
already, even back then), and to what it started doing for x86 from
2.6.21 onward (see commit 79bf2bb3, "tick-management: dyntick /
highres functionality").

While there, also adopt the memory barrier introduced by Linux
commit c3f59023 ("Fix RCU race in access of nohz_cpu_mask").
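
Just to make the ordering explicit (again only a sketch, mirroring
the rcupdate.c hunk below): the grace period counter is bumped first,
and only after a full barrier is the idle mask sampled, since reading
idle_cpumask before incrementing rcp->cur could let a tickless idle
CPU be included in rcp->cpumask and be waited for unnecessarily.

    rcp->cur++;       /* start the new grace period... */
    smp_mb();         /* ...and only then sample the idle mask */
    cpumask_andnot(&rcp->cpumask, &cpu_online_map, &rcp->idle_cpumask);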

Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>
---
Cc: Stefano Stabellini <sstabellini@xxxxxxxxxx>
Cc: Julien Grall <julien.grall@xxxxxxx>
Cc: Jan Beulich <jbeulich@xxxxxxxx>
Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
 xen/arch/arm/domain.c         |    2 ++
 xen/arch/x86/acpi/cpu_idle.c  |   25 +++++++++++++++++--------
 xen/arch/x86/cpu/mwait-idle.c |    9 ++++++++-
 xen/arch/x86/domain.c         |    8 +++++++-
 xen/common/rcupdate.c         |   28 ++++++++++++++++++++++++++--
 xen/include/xen/rcupdate.h    |    3 +++
 6 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index fce29cb..666b7ef 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -50,8 +50,10 @@ static void do_idle(void)
     local_irq_disable();
     if ( cpu_is_haltable(cpu) )
     {
+        rcu_idle_enter(cpu);
         dsb(sy);
         wfi();
+        rcu_idle_exit(cpu);
     }
     local_irq_enable();
 
diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index 482b8a7..04c52e8 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -418,14 +418,16 @@ static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
     mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
 }
 
-static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
+static void acpi_idle_do_entry(unsigned int cpu, struct acpi_processor_cx *cx)
 {
+    rcu_idle_enter(cpu);
+
     switch ( cx->entry_method )
     {
     case ACPI_CSTATE_EM_FFH:
         /* Call into architectural FFH based C-state */
         acpi_processor_ffh_cstate_enter(cx);
-        return;
+        break;
     case ACPI_CSTATE_EM_SYSIO:
         /* IO port based C-state */
         inb(cx->address);
@@ -433,12 +435,14 @@ static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
            because chipsets cannot guarantee that STPCLK# signal
            gets asserted in time to freeze execution properly. */
         inl(pmtmr_ioport);
-        return;
+        break;
     case ACPI_CSTATE_EM_HALT:
         safe_halt();
         local_irq_disable();
-        return;
+        break;
     }
+
+    rcu_idle_exit(cpu);
 }
 
 static int acpi_idle_bm_check(void)
@@ -540,7 +544,8 @@ void update_idle_stats(struct acpi_processor_power *power,
 
 static void acpi_processor_idle(void)
 {
-    struct acpi_processor_power *power = processor_powers[smp_processor_id()];
+    unsigned int cpu = smp_processor_id();
+    struct acpi_processor_power *power = processor_powers[cpu];
     struct acpi_processor_cx *cx = NULL;
     int next_state;
     uint64_t t1, t2 = 0;
@@ -563,7 +568,11 @@ static void acpi_processor_idle(void)
         if ( pm_idle_save )
             pm_idle_save();
         else
+        {
+            rcu_idle_enter(cpu);
             safe_halt();
+            rcu_idle_exit(cpu);
+        }
         return;
     }
 
@@ -579,7 +588,7 @@ static void acpi_processor_idle(void)
      */
     local_irq_disable();
 
-    if ( !cpu_is_haltable(smp_processor_id()) )
+    if ( !cpu_is_haltable(cpu) )
     {
         local_irq_enable();
         sched_tick_resume();
@@ -610,7 +619,7 @@ static void acpi_processor_idle(void)
             update_last_cx_stat(power, cx, t1);
 
             /* Invoke C2 */
-            acpi_idle_do_entry(cx);
+            acpi_idle_do_entry(cpu, cx);
             /* Get end time (ticks) */
             t2 = cpuidle_get_tick();
             trace_exit_reason(irq_traced);
@@ -672,7 +681,7 @@ static void acpi_processor_idle(void)
         }
 
         /* Invoke C3 */
-        acpi_idle_do_entry(cx);
+        acpi_idle_do_entry(cpu, cx);
 
         if ( (cx->type == ACPI_STATE_C3) &&
              power->flags.bm_check && power->flags.bm_control )
diff --git a/xen/arch/x86/cpu/mwait-idle.c b/xen/arch/x86/cpu/mwait-idle.c
index 762dff1..ae9e92b 100644
--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -735,8 +735,11 @@ static void mwait_idle(void)
        if (!cx) {
                if (pm_idle_save)
                        pm_idle_save();
-               else
+               else {
+                       rcu_idle_enter(cpu);
                        safe_halt();
+                       rcu_idle_exit(cpu);
+               }
                return;
        }
 
@@ -756,6 +759,8 @@ static void mwait_idle(void)
                return;
        }
 
+       rcu_idle_enter(cpu);
+
        eax = cx->address;
        cstate = ((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
 
@@ -787,6 +792,8 @@ static void mwait_idle(void)
                irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
 
        /* Now back in C0. */
+       rcu_idle_exit(cpu);
+
        update_idle_stats(power, cx, before, after);
        local_irq_enable();
 
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index dd8bf13..a6c0f66 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -73,9 +73,15 @@ void (*dead_idle) (void) __read_mostly = default_dead_idle;
 
 static void default_idle(void)
 {
+    unsigned int cpu = smp_processor_id();
+
     local_irq_disable();
-    if ( cpu_is_haltable(smp_processor_id()) )
+    if ( cpu_is_haltable(cpu) )
+    {
+        rcu_idle_enter(cpu);
         safe_halt();
+        rcu_idle_exit(cpu);
+    }
     else
         local_irq_enable();
 }
diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c
index 8cc5a82..f0fdc87 100644
--- a/xen/common/rcupdate.c
+++ b/xen/common/rcupdate.c
@@ -52,7 +52,8 @@ static struct rcu_ctrlblk {
     int  next_pending;  /* Is the next batch already waiting?         */
 
     spinlock_t  lock __cacheline_aligned;
-    cpumask_t   cpumask; /* CPUs that need to switch in order    */
+    cpumask_t   cpumask; /* CPUs that need to switch in order ... */
+    cpumask_t   idle_cpumask; /* ... unless they are already idle */
     /* for current batch to proceed.        */
 } __cacheline_aligned rcu_ctrlblk = {
     .cur = -300,
@@ -248,7 +249,14 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
         smp_wmb();
         rcp->cur++;
 
-        cpumask_copy(&rcp->cpumask, &cpu_online_map);
+        /*
+         * Accessing idle_cpumask before incrementing rcp->cur needs a
+         * barrier. Otherwise it can cause tickless idle CPUs to be
+         * included in rcp->cpumask, which will extend grace periods
+         * unnecessarily.
+         */
+        smp_mb();
+        cpumask_andnot(&rcp->cpumask, &cpu_online_map, &rcp->idle_cpumask);
     }
 }
 
@@ -474,7 +482,23 @@ static struct notifier_block cpu_nfb = {
 void __init rcu_init(void)
 {
     void *cpu = (void *)(long)smp_processor_id();
+
+    cpumask_setall(&rcu_ctrlblk.idle_cpumask);
     cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
     register_cpu_notifier(&cpu_nfb);
     open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
+
+/*
+ * The CPU is becoming idle, so no more read side critical
+ * sections, and one more step toward grace period.
+ */
+void rcu_idle_enter(unsigned int cpu)
+{
+    cpumask_set_cpu(cpu, &rcu_ctrlblk.idle_cpumask);
+}
+
+void rcu_idle_exit(unsigned int cpu)
+{
+    cpumask_clear_cpu(cpu, &rcu_ctrlblk.idle_cpumask);
+}
diff --git a/xen/include/xen/rcupdate.h b/xen/include/xen/rcupdate.h
index 557a7b1..561ac43 100644
--- a/xen/include/xen/rcupdate.h
+++ b/xen/include/xen/rcupdate.h
@@ -146,4 +146,7 @@ void call_rcu(struct rcu_head *head,
 
 int rcu_barrier(void);
 
+void rcu_idle_enter(unsigned int cpu);
+void rcu_idle_exit(unsigned int cpu);
+
 #endif /* __XEN_RCUPDATE_H */

