[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] x86/cpuidle: split the max_cstate variable


  • To: "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>
  • From: Jan Beulich <jbeulich@xxxxxxxx>
  • Date: Wed, 8 Apr 2026 13:34:43 +0200
  • Authentication-results: eu.smtp.expurgate.cloud; dkim=pass header.s=google header.d=suse.com header.i="@suse.com" header.h="Content-Transfer-Encoding:Autocrypt:Subject:From:Cc:To:Content-Language:User-Agent:MIME-Version:Date:Message-ID"
  • Autocrypt: addr=jbeulich@xxxxxxxx; keydata= xsDiBFk3nEQRBADAEaSw6zC/EJkiwGPXbWtPxl2xCdSoeepS07jW8UgcHNurfHvUzogEq5xk hu507c3BarVjyWCJOylMNR98Yd8VqD9UfmX0Hb8/BrA+Hl6/DB/eqGptrf4BSRwcZQM32aZK 7Pj2XbGWIUrZrd70x1eAP9QE3P79Y2oLrsCgbZJfEwCgvz9JjGmQqQkRiTVzlZVCJYcyGGsD /0tbFCzD2h20ahe8rC1gbb3K3qk+LpBtvjBu1RY9drYk0NymiGbJWZgab6t1jM7sk2vuf0Py O9Hf9XBmK0uE9IgMaiCpc32XV9oASz6UJebwkX+zF2jG5I1BfnO9g7KlotcA/v5ClMjgo6Gl MDY4HxoSRu3i1cqqSDtVlt+AOVBJBACrZcnHAUSuCXBPy0jOlBhxPqRWv6ND4c9PH1xjQ3NP nxJuMBS8rnNg22uyfAgmBKNLpLgAGVRMZGaGoJObGf72s6TeIqKJo/LtggAS9qAUiuKVnygo 3wjfkS9A3DRO+SpU7JqWdsveeIQyeyEJ/8PTowmSQLakF+3fote9ybzd880fSmFuIEJldWxp Y2ggPGpiZXVsaWNoQHN1c2UuY29tPsJgBBMRAgAgBQJZN5xEAhsDBgsJCAcDAgQVAggDBBYC AwECHgECF4AACgkQoDSui/t3IH4J+wCfQ5jHdEjCRHj23O/5ttg9r9OIruwAn3103WUITZee e7Sbg12UgcQ5lv7SzsFNBFk3nEQQCACCuTjCjFOUdi5Nm244F+78kLghRcin/awv+IrTcIWF hUpSs1Y91iQQ7KItirz5uwCPlwejSJDQJLIS+QtJHaXDXeV6NI0Uef1hP20+y8qydDiVkv6l IreXjTb7DvksRgJNvCkWtYnlS3mYvQ9NzS9PhyALWbXnH6sIJd2O9lKS1Mrfq+y0IXCP10eS FFGg+Av3IQeFatkJAyju0PPthyTqxSI4lZYuJVPknzgaeuJv/2NccrPvmeDg6Coe7ZIeQ8Yj t0ARxu2xytAkkLCel1Lz1WLmwLstV30g80nkgZf/wr+/BXJW/oIvRlonUkxv+IbBM3dX2OV8 AmRv1ySWPTP7AAMFB/9PQK/VtlNUJvg8GXj9ootzrteGfVZVVT4XBJkfwBcpC/XcPzldjv+3 HYudvpdNK3lLujXeA5fLOH+Z/G9WBc5pFVSMocI71I8bT8lIAzreg0WvkWg5V2WZsUMlnDL9 mpwIGFhlbM3gfDMs7MPMu8YQRFVdUvtSpaAs8OFfGQ0ia3LGZcjA6Ik2+xcqscEJzNH+qh8V m5jjp28yZgaqTaRbg3M/+MTbMpicpZuqF4rnB0AQD12/3BNWDR6bmh+EkYSMcEIpQmBM51qM EKYTQGybRCjpnKHGOxG0rfFY1085mBDZCH5Kx0cl0HVJuQKC+dV2ZY5AqjcKwAxpE75MLFkr wkkEGBECAAkFAlk3nEQCGwwACgkQoDSui/t3IH7nnwCfcJWUDUFKdCsBH/E5d+0ZnMQi+G0A nAuWpQkjM1ASeQwSHEeAWPgskBQL
  • Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, Teddy Astie <teddy.astie@xxxxxxxxxx>, Marek Marczykowski <marmarek@xxxxxxxxxxxxxxxxxxxxxx>
  • Delivery-date: Wed, 08 Apr 2026 11:34:47 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

The admin can control the upper bound wanted not only via command line
option, but also via XEN_SYSCTL_pm_op_set_max_cstate. While decisions how
to set up the system are okay this way as long as we deem the command line
option a strict upper bound, what to do during S3 resume should not be
based on that potentially varying value. Decisions there need to use
solely the strict upper bound we may have enforced ourselves (or which was
forced onto us via command line option).

Rather than altering pit_broadcast_is_available(), drop the function
altogether. It's pretty odd for acpi/cpu_idle.c to call into time.c, just
for that to call into acpi/cpu_idle.c again.

Fixes: 8d24303023ec ("x86: don't write_tsc() non-zero values on CPUs updating 
only the lower 32 bits")
Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
cpuidle_disable_deep_cstate(), when called from handle_rtc_once(), is
still somewhat of a problem: Boot time and resume time runs of
_disable_pit_irq() may still behave differently because of that.

If we wanted the sysctl to possibly move the maximum C-state beyond what
was given on the command line, things would get yet more complicated, as
we'd then need to re-configure the driver that's in use.

I wonder how useful the ACPI_STATE_C<n> #define-s really are. Plain 1 is
used in e.g. probe_c3_errata(), and the plain 7 doesn't even have a
respective constant (nor would that be suitable, as that's not really an
ACPI state).

--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -119,7 +119,7 @@ bool lapic_timer_init(void)
         lapic_timer_off = hpet_broadcast_enter;
         lapic_timer_on = hpet_broadcast_exit;
     }
-    else if ( pit_broadcast_is_available() )
+    else if ( cpuidle_usable_deep_cstate() )
     {
         lapic_timer_off = pit_broadcast_enter;
         lapic_timer_on = pit_broadcast_exit;
@@ -131,12 +131,15 @@ bool lapic_timer_init(void)
 }
 
 void (*__read_mostly pm_idle_save)(void);
-unsigned int max_cstate __read_mostly = UINT_MAX;
+
+unsigned int max_usable_cstate __read_mostly = UINT_MAX;
+unsigned int max_allowed_cstate __read_mostly = UINT_MAX;
 unsigned int max_csubstate __read_mostly = UINT_MAX;
 
 static int __init cf_check parse_cstate(const char *s)
 {
-    max_cstate = simple_strtoul(s, &s, 0);
+    max_allowed_cstate = simple_strtoul(s, &s, 0);
+    max_usable_cstate = max_allowed_cstate;
     if ( *s == ',' )
         max_csubstate = simple_strtoul(s + 1, NULL, 0);
     return 0;
@@ -413,10 +416,11 @@ static void cf_check dump_cx(unsigned ch
     unsigned int cpu;
 
     printk("'%c' pressed -> printing ACPI Cx structures\n", key);
-    if ( max_cstate < UINT_MAX )
+    if ( max_cstate() < UINT_MAX )
     {
-        printk("max state: C%u\n", max_cstate);
-        if ( max_csubstate < UINT_MAX )
+        printk("max state: C%u\n", max_cstate());
+        if ( max_allowed_cstate <= max_usable_cstate &&
+             max_csubstate < UINT_MAX )
             printk("max sub-state: %u\n", max_csubstate);
         else
             printk("max sub-state: unlimited\n");
@@ -690,18 +694,18 @@ static void cf_check acpi_processor_idle
     u32 exp = 0, pred = 0;
     u32 irq_traced[4] = { 0 };
 
-    if ( max_cstate > 0 && power &&
+    if ( max_cstate() > 0 && power &&
          (next_state = cpuidle_current_governor->select(power)) > 0 )
     {
         unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
-                                                         : max_cstate;
+                                                         : max_cstate();
 
         do {
             cx = &power->states[next_state];
         } while ( (cx->type > max_state ||
                    cx->entry_method == ACPI_CSTATE_EM_NONE ||
                    (cx->entry_method == ACPI_CSTATE_EM_FFH &&
-                    cx->type == max_cstate &&
+                    cx->type == max_allowed_cstate &&
                     (cx->address & MWAIT_SUBSTATE_MASK) > max_csubstate)) &&
                   --next_state );
         if ( next_state )
@@ -1448,7 +1452,7 @@ static void amd_cpuidle_init(struct acpi
 
     for ( i = 0; i < nr; ++i )
     {
-        if ( cx[i].type > max_cstate )
+        if ( cx[i].type > max_cstate() )
             break;
         power->states[i + 1] = cx[i];
         power->states[i + 1].idx = i + 1;
@@ -1611,21 +1615,22 @@ int pmstat_reset_cx_stat(unsigned int cp
 
 void cpuidle_disable_deep_cstate(void)
 {
-    if ( max_cstate > ACPI_STATE_C1 )
+    if ( max_usable_cstate > ACPI_STATE_C1 )
     {
         if ( local_apic_timer_c2_ok )
-            max_cstate = ACPI_STATE_C2;
+            max_usable_cstate = ACPI_STATE_C2;
         else
-            max_cstate = ACPI_STATE_C1;
+            max_usable_cstate = ACPI_STATE_C1;
     }
 
     hpet_disable_legacy_broadcast();
 }
 
-bool cpuidle_using_deep_cstate(void)
+bool cpuidle_usable_deep_cstate(void)
 {
-    return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? ACPI_STATE_C2
-                                                               : 
ACPI_STATE_C1);
+    return xen_cpuidle &&
+           max_usable_cstate > (local_apic_timer_c2_ok ? ACPI_STATE_C2
+                                                       : ACPI_STATE_C1);
 }
 
 static int cf_check cpu_callback(
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -384,12 +384,12 @@ static void probe_c3_errata(const struct
     };
 
     /* Serialized by the AP bringup code. */
-    if ( max_cstate > 1 && (c->apicid & (c->x86_num_siblings - 1)) &&
+    if ( max_usable_cstate > 1 && (c->apicid & (c->x86_num_siblings - 1)) &&
          x86_match_cpu(models) )
     {
         printk(XENLOG_WARNING
               "Disabling C-states C3 and C6 due to CPU errata\n");
-        max_cstate = 1;
+        max_usable_cstate = 1;
     }
 }
 
--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -1045,15 +1045,16 @@ static void cf_check mwait_idle(void)
        u64 before, after;
        u32 exp = 0, pred = 0, irq_traced[4] = { 0 };
 
-       if (max_cstate > 0 && power &&
+       if (max_cstate() > 0 && power &&
            (next_state = cpuidle_current_governor->select(power)) > 0) {
                unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
-                                                                : max_cstate;
+                                                                : max_cstate();
 
                do {
                        cx = &power->states[next_state];
-               } while ((cx->type > max_state || (cx->type == max_cstate &&
-                         MWAIT_HINT2SUBSTATE(cx->address) > max_csubstate)) &&
+               } while ((cx->type > max_state ||
+                          (cx->type == max_allowed_cstate &&
+                          MWAIT_HINT2SUBSTATE(cx->address) > max_csubstate)) &&
                         --next_state);
                if (!next_state)
                        cx = NULL;
@@ -1458,7 +1459,7 @@ static void __init sklh_idle_state_table
        u64 msr;
 
        /* if PC10 disabled via cmdline max_cstate=7 or shallower */
-       if (max_cstate <= 7)
+       if (max_cstate() <= 7)
                return;
 
        /* if PC10 not present in CPUID.MWAIT.EDX */
@@ -1623,7 +1624,7 @@ static int __init mwait_idle_probe(void)
            !mwait_substates)
                return -ENODEV;
 
-       if (!max_cstate || !opt_mwait_idle) {
+       if (!max_cstate() || !opt_mwait_idle) {
                pr_debug(PREFIX "disabled\n");
                return -EPERM;
        }
@@ -1714,8 +1715,8 @@ static int cf_check mwait_idle_cpu_init(
                hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
                state = MWAIT_HINT2CSTATE(hint) + 1;
 
-               if (state > max_cstate) {
-                       printk(PREFIX "max C-state %u reached\n", max_cstate);
+               if (state > max_cstate()) {
+                       printk(PREFIX "max C-state %u reached\n", max_cstate());
                        break;
                }
 
--- a/xen/arch/x86/include/asm/time.h
+++ b/xen/arch/x86/include/asm/time.h
@@ -31,7 +31,6 @@ int cpu_frequency_change(u64 freq);
 
 void cf_check pit_broadcast_enter(void);
 void cf_check pit_broadcast_exit(void);
-int pit_broadcast_is_available(void);
 
 uint64_t cf_check acpi_pm_tick_to_ns(uint64_t ticks);
 
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -517,7 +517,7 @@ static int64_t __init cf_check init_hpet
     bool disable_hpet = false;
 
     if ( hpet_address && strcmp(opt_clocksource, pts->id) &&
-         cpuidle_using_deep_cstate() )
+         cpuidle_usable_deep_cstate() )
     {
         if ( pci_conf_read16(PCI_SBDF(0, 0, 0x1f, 0),
                              PCI_VENDOR_ID) == PCI_VENDOR_ID_INTEL )
@@ -2655,7 +2655,7 @@ static int _disable_pit_irq(bool init)
      * XXX dom0 may rely on RTC interrupt delivery, so only enable
      * hpet_broadcast if FSB mode available or if force_hpet_broadcast.
      */
-    if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_XEN_ARAT) )
+    if ( cpuidle_usable_deep_cstate() && !boot_cpu_has(X86_FEATURE_XEN_ARAT) )
     {
         init ? hpet_broadcast_init() : hpet_broadcast_resume();
         if ( !hpet_broadcast_is_available() )
@@ -2707,11 +2707,6 @@ void cf_check pit_broadcast_exit(void)
         reprogram_timer(this_cpu(timer_deadline));
 }
 
-int pit_broadcast_is_available(void)
-{
-    return cpuidle_using_deep_cstate();
-}
-
 void send_timer_event(struct vcpu *v)
 {
     send_guest_vcpu_virq(v, VIRQ_TIMER);
@@ -2999,7 +2994,7 @@ static void cf_check dump_softtsc(unsign
     else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
     {
         printk("TSC has constant rate, ");
-        if ( max_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
+        if ( max_usable_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
             printk("no deep Cstates, passed warp test, deemed reliable, ");
         else
             printk("deep Cstates possible, so not reliable, ");
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -142,30 +142,33 @@ int acpi_gsi_to_irq (u32 gsi, unsigned i
 
 #ifdef CONFIG_ACPI_CSTATE
 /*
- * max_cstate sets the highest legal C-state.
- * max_cstate = 0: C0 okay, but not C1
- * max_cstate = 1: C1 okay, but not C2
- * max_cstate = 2: C2 okay, but not C3 etc.
-
- * max_csubstate sets the highest legal C-state sub-state. Only applies to the
- * highest legal C-state.
- * max_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E
- * max_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but not C2
- * max_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but not C3
- * max_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but not C3
+ * max_{allowed,usable}_cstate sets the highest allowed / usable C-state, where
+ * "allowed" is command line / sysctl based.
+ * max_*_cstate = 0: C0 okay, but not C1
+ * max_*_cstate = 1: C1 okay, but not C2
+ * max_*_cstate = 2: C2 okay, but not C3 etc.
+ *
+ * max_csubstate sets the highest allowed C-state sub-state. Only applies to
+ * the highest allowed C-state.
+ * max_allowed_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E
+ * max_allowed_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but not 
C2
+ * max_allowed_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but not 
C3
+ * max_allowed_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but not 
C3
  */
 
-extern unsigned int max_cstate;
+extern unsigned int max_usable_cstate;
+extern unsigned int max_allowed_cstate;
 extern unsigned int max_csubstate;
 
+#define max_cstate() min(max_usable_cstate, max_allowed_cstate)
+
 static inline unsigned int acpi_get_cstate_limit(void)
 {
-       return max_cstate;
+       return max_allowed_cstate;
 }
 static inline void acpi_set_cstate_limit(unsigned int new_limit)
 {
-       max_cstate = new_limit;
-       return;
+       max_allowed_cstate = new_limit;
 }
 
 static inline unsigned int acpi_get_csubstate_limit(void)
--- a/xen/include/xen/cpuidle.h
+++ b/xen/include/xen/cpuidle.h
@@ -89,7 +89,7 @@ struct cpuidle_governor
 extern int8_t xen_cpuidle;
 extern struct cpuidle_governor *cpuidle_current_governor;
 
-bool cpuidle_using_deep_cstate(void);
+bool cpuidle_usable_deep_cstate(void);
 void cpuidle_disable_deep_cstate(void);
 
 #define CPUIDLE_DRIVER_STATE_START  1



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.