[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] x86/cpuidle: split the max_cstate variable


  • To: Jan Beulich <jbeulich@xxxxxxxx>
  • From: Marek Marczykowski <marmarek@xxxxxxxxxxxxxxxxxxxxxx>
  • Date: Wed, 8 Apr 2026 23:11:07 +0200
  • Authentication-results: eu.smtp.expurgate.cloud; dkim=pass header.s=fm2 header.d=invisiblethingslab.com header.i="@invisiblethingslab.com" header.h="Cc:Content-Type:Date:From:In-Reply-To:Message-ID:MIME-Version:References:Subject:To"; dkim=pass header.s=fm2 header.d=messagingengine.com header.i="@messagingengine.com" header.h="Cc:Content-Type:Date:Feedback-ID:From:In-Reply-To:Message-ID:MIME-Version:References:Subject:To:X-ME-Proxy:X-ME-Sender"
  • Cc: "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, Teddy Astie <teddy.astie@xxxxxxxxxx>
  • Delivery-date: Wed, 08 Apr 2026 21:11:23 +0000
  • Feedback-id: i1568416f:Fastmail
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On Wed, Apr 08, 2026 at 01:34:43PM +0200, Jan Beulich wrote:
> The admin can control the upper bound wanted not only via command line
> option, but also via XEN_SYSCTL_pm_op_set_max_cstate. While decisions how
> to set up the system are okay this way as long as we deem the command line
> option a strict upper bound, what to do during S3 resume should not be
> based on that potentially varying value. Decisions there need to use
> solely the strict upper bound we may have enforced ourselves (or which was
> forced onto us via command line option).
> 
> Rather than altering pit_broadcast_is_available(), drop the function
> altogether. It's pretty odd for acpi/cpu_idle.c to call into time.c, just
> for that to call into acpi/cpu_idle.c again.
> 
> Fixes: 8d24303023ec ("x86: don't write_tsc() non-zero values on CPUs updating 
> only the lower 32 bits")
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>

What should be observable effect, in absence of max_cstate option and
"xenpm set-max-cstate" calls? The "slow after S3" issue still happens. I
tested this on top of the two other patches:
- [PATCH] x86/HPET: channel handling in hpet_broadcast_resume()
- [PATCH] x86/cpu-policy: set up host policy earlier


> ---
> cpuidle_disable_deep_cstate(), when called from handle_rtc_once(), is
> still somewhat of a problem: Boot time and resume time runs of
> _disable_pit_irq() may still behave differently because of that.

In the above test, both on boot and resume I got:
(XEN) [    9.916522] _disable_pit_irq:2649: using_pit: 0, cpu_has_apic: 1
(XEN) [    9.921198] _disable_pit_irq:2659: cpuidle_usable_deep_cstate: 1, 
boot_cpu_has(X86_FEATURE_XEN_ARAT): 1


> If we wanted the sysctl to possibly move the maximum C-state beyond what
> was given on the command line, things would get yet more complicated, as
> we'd then need to re-configure the driver that's in use.
> 
> I wonder how useful the ACPI_STATE_C<n> #define-s really are. Plain 1 is
> used in e.g. probe_c3_errata(), and the plain 7 doesn't even have a
> respective constant (nor would that be suitable, as that's not really an
> ACPI state).
> 
> --- a/xen/arch/x86/acpi/cpu_idle.c
> +++ b/xen/arch/x86/acpi/cpu_idle.c
> @@ -119,7 +119,7 @@ bool lapic_timer_init(void)
>          lapic_timer_off = hpet_broadcast_enter;
>          lapic_timer_on = hpet_broadcast_exit;
>      }
> -    else if ( pit_broadcast_is_available() )
> +    else if ( cpuidle_usable_deep_cstate() )
>      {
>          lapic_timer_off = pit_broadcast_enter;
>          lapic_timer_on = pit_broadcast_exit;
> @@ -131,12 +131,15 @@ bool lapic_timer_init(void)
>  }
>  
>  void (*__read_mostly pm_idle_save)(void);
> -unsigned int max_cstate __read_mostly = UINT_MAX;
> +
> +unsigned int max_usable_cstate __read_mostly = UINT_MAX;
> +unsigned int max_allowed_cstate __read_mostly = UINT_MAX;
>  unsigned int max_csubstate __read_mostly = UINT_MAX;
>  
>  static int __init cf_check parse_cstate(const char *s)
>  {
> -    max_cstate = simple_strtoul(s, &s, 0);
> +    max_allowed_cstate = simple_strtoul(s, &s, 0);
> +    max_usable_cstate = max_allowed_cstate;
>      if ( *s == ',' )
>          max_csubstate = simple_strtoul(s + 1, NULL, 0);
>      return 0;
> @@ -413,10 +416,11 @@ static void cf_check dump_cx(unsigned ch
>      unsigned int cpu;
>  
>      printk("'%c' pressed -> printing ACPI Cx structures\n", key);
> -    if ( max_cstate < UINT_MAX )
> +    if ( max_cstate() < UINT_MAX )
>      {
> -        printk("max state: C%u\n", max_cstate);
> -        if ( max_csubstate < UINT_MAX )
> +        printk("max state: C%u\n", max_cstate());
> +        if ( max_allowed_cstate <= max_usable_cstate &&
> +             max_csubstate < UINT_MAX )
>              printk("max sub-state: %u\n", max_csubstate);
>          else
>              printk("max sub-state: unlimited\n");
> @@ -690,18 +694,18 @@ static void cf_check acpi_processor_idle
>      u32 exp = 0, pred = 0;
>      u32 irq_traced[4] = { 0 };
>  
> -    if ( max_cstate > 0 && power &&
> +    if ( max_cstate() > 0 && power &&
>           (next_state = cpuidle_current_governor->select(power)) > 0 )
>      {
>          unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
> -                                                         : max_cstate;
> +                                                         : max_cstate();
>  
>          do {
>              cx = &power->states[next_state];
>          } while ( (cx->type > max_state ||
>                     cx->entry_method == ACPI_CSTATE_EM_NONE ||
>                     (cx->entry_method == ACPI_CSTATE_EM_FFH &&
> -                    cx->type == max_cstate &&
> +                    cx->type == max_allowed_cstate &&
>                      (cx->address & MWAIT_SUBSTATE_MASK) > max_csubstate)) &&
>                    --next_state );
>          if ( next_state )
> @@ -1448,7 +1452,7 @@ static void amd_cpuidle_init(struct acpi
>  
>      for ( i = 0; i < nr; ++i )
>      {
> -        if ( cx[i].type > max_cstate )
> +        if ( cx[i].type > max_cstate() )
>              break;
>          power->states[i + 1] = cx[i];
>          power->states[i + 1].idx = i + 1;
> @@ -1611,21 +1615,22 @@ int pmstat_reset_cx_stat(unsigned int cp
>  
>  void cpuidle_disable_deep_cstate(void)
>  {
> -    if ( max_cstate > ACPI_STATE_C1 )
> +    if ( max_usable_cstate > ACPI_STATE_C1 )
>      {
>          if ( local_apic_timer_c2_ok )
> -            max_cstate = ACPI_STATE_C2;
> +            max_usable_cstate = ACPI_STATE_C2;
>          else
> -            max_cstate = ACPI_STATE_C1;
> +            max_usable_cstate = ACPI_STATE_C1;
>      }
>  
>      hpet_disable_legacy_broadcast();
>  }
>  
> -bool cpuidle_using_deep_cstate(void)
> +bool cpuidle_usable_deep_cstate(void)
>  {
> -    return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? 
> ACPI_STATE_C2
> -                                                               : 
> ACPI_STATE_C1);
> +    return xen_cpuidle &&
> +           max_usable_cstate > (local_apic_timer_c2_ok ? ACPI_STATE_C2
> +                                                       : ACPI_STATE_C1);
>  }
>  
>  static int cf_check cpu_callback(
> --- a/xen/arch/x86/cpu/intel.c
> +++ b/xen/arch/x86/cpu/intel.c
> @@ -384,12 +384,12 @@ static void probe_c3_errata(const struct
>      };
>  
>      /* Serialized by the AP bringup code. */
> -    if ( max_cstate > 1 && (c->apicid & (c->x86_num_siblings - 1)) &&
> +    if ( max_usable_cstate > 1 && (c->apicid & (c->x86_num_siblings - 1)) &&
>           x86_match_cpu(models) )
>      {
>          printk(XENLOG_WARNING
>              "Disabling C-states C3 and C6 due to CPU errata\n");
> -        max_cstate = 1;
> +        max_usable_cstate = 1;
>      }
>  }
>  
> --- a/xen/arch/x86/cpu/mwait-idle.c
> +++ b/xen/arch/x86/cpu/mwait-idle.c
> @@ -1045,15 +1045,16 @@ static void cf_check mwait_idle(void)
>       u64 before, after;
>       u32 exp = 0, pred = 0, irq_traced[4] = { 0 };
>  
> -     if (max_cstate > 0 && power &&
> +     if (max_cstate() > 0 && power &&
>           (next_state = cpuidle_current_governor->select(power)) > 0) {
>               unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
> -                                                              : max_cstate;
> +                                                              : max_cstate();
>  
>               do {
>                       cx = &power->states[next_state];
> -             } while ((cx->type > max_state || (cx->type == max_cstate &&
> -                       MWAIT_HINT2SUBSTATE(cx->address) > max_csubstate)) &&
> +             } while ((cx->type > max_state ||
> +                          (cx->type == max_allowed_cstate &&
> +                        MWAIT_HINT2SUBSTATE(cx->address) > max_csubstate)) &&
>                        --next_state);
>               if (!next_state)
>                       cx = NULL;
> @@ -1458,7 +1459,7 @@ static void __init sklh_idle_state_table
>       u64 msr;
>  
>       /* if PC10 disabled via cmdline max_cstate=7 or shallower */
> -     if (max_cstate <= 7)
> +     if (max_cstate() <= 7)
>               return;
>  
>       /* if PC10 not present in CPUID.MWAIT.EDX */
> @@ -1623,7 +1624,7 @@ static int __init mwait_idle_probe(void)
>           !mwait_substates)
>               return -ENODEV;
>  
> -     if (!max_cstate || !opt_mwait_idle) {
> +     if (!max_cstate() || !opt_mwait_idle) {
>               pr_debug(PREFIX "disabled\n");
>               return -EPERM;
>       }
> @@ -1714,8 +1715,8 @@ static int cf_check mwait_idle_cpu_init(
>               hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
>               state = MWAIT_HINT2CSTATE(hint) + 1;
>  
> -             if (state > max_cstate) {
> -                     printk(PREFIX "max C-state %u reached\n", max_cstate);
> +             if (state > max_cstate()) {
> +                     printk(PREFIX "max C-state %u reached\n", max_cstate());
>                       break;
>               }
>  
> --- a/xen/arch/x86/include/asm/time.h
> +++ b/xen/arch/x86/include/asm/time.h
> @@ -31,7 +31,6 @@ int cpu_frequency_change(u64 freq);
>  
>  void cf_check pit_broadcast_enter(void);
>  void cf_check pit_broadcast_exit(void);
> -int pit_broadcast_is_available(void);
>  
>  uint64_t cf_check acpi_pm_tick_to_ns(uint64_t ticks);
>  
> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -517,7 +517,7 @@ static int64_t __init cf_check init_hpet
>      bool disable_hpet = false;
>  
>      if ( hpet_address && strcmp(opt_clocksource, pts->id) &&
> -         cpuidle_using_deep_cstate() )
> +         cpuidle_usable_deep_cstate() )
>      {
>          if ( pci_conf_read16(PCI_SBDF(0, 0, 0x1f, 0),
>                               PCI_VENDOR_ID) == PCI_VENDOR_ID_INTEL )
> @@ -2655,7 +2655,7 @@ static int _disable_pit_irq(bool init)
>       * XXX dom0 may rely on RTC interrupt delivery, so only enable
>       * hpet_broadcast if FSB mode available or if force_hpet_broadcast.
>       */
> -    if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_XEN_ARAT) )
> +    if ( cpuidle_usable_deep_cstate() && !boot_cpu_has(X86_FEATURE_XEN_ARAT) 
> )
>      {
>          init ? hpet_broadcast_init() : hpet_broadcast_resume();
>          if ( !hpet_broadcast_is_available() )
> @@ -2707,11 +2707,6 @@ void cf_check pit_broadcast_exit(void)
>          reprogram_timer(this_cpu(timer_deadline));
>  }
>  
> -int pit_broadcast_is_available(void)
> -{
> -    return cpuidle_using_deep_cstate();
> -}
> -
>  void send_timer_event(struct vcpu *v)
>  {
>      send_guest_vcpu_virq(v, VIRQ_TIMER);
> @@ -2999,7 +2994,7 @@ static void cf_check dump_softtsc(unsign
>      else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
>      {
>          printk("TSC has constant rate, ");
> -        if ( max_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
> +        if ( max_usable_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
>              printk("no deep Cstates, passed warp test, deemed reliable, ");
>          else
>              printk("deep Cstates possible, so not reliable, ");
> --- a/xen/include/xen/acpi.h
> +++ b/xen/include/xen/acpi.h
> @@ -142,30 +142,33 @@ int acpi_gsi_to_irq (u32 gsi, unsigned i
>  
>  #ifdef       CONFIG_ACPI_CSTATE
>  /*
> - * max_cstate sets the highest legal C-state.
> - * max_cstate = 0: C0 okay, but not C1
> - * max_cstate = 1: C1 okay, but not C2
> - * max_cstate = 2: C2 okay, but not C3 etc.
> -
> - * max_csubstate sets the highest legal C-state sub-state. Only applies to 
> the
> - * highest legal C-state.
> - * max_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E
> - * max_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but not C2
> - * max_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but not C3
> - * max_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but not C3
> + * max_{allowed,usable}_cstate sets the highest allowed / usable C-state, 
> where
> + * "allowed" is command line / sysctl based.
> + * max_*_cstate = 0: C0 okay, but not C1
> + * max_*_cstate = 1: C1 okay, but not C2
> + * max_*_cstate = 2: C2 okay, but not C3 etc.
> + *
> + * max_csubstate sets the highest allowed C-state sub-state. Only applies to
> + * the highest allowed C-state.
> + * max_allowed_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E
> + * max_allowed_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but 
> not C2
> + * max_allowed_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but 
> not C3
> + * max_allowed_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but 
> not C3
>   */
>  
> -extern unsigned int max_cstate;
> +extern unsigned int max_usable_cstate;
> +extern unsigned int max_allowed_cstate;
>  extern unsigned int max_csubstate;
>  
> +#define max_cstate() min(max_usable_cstate, max_allowed_cstate)
> +
>  static inline unsigned int acpi_get_cstate_limit(void)
>  {
> -     return max_cstate;
> +     return max_allowed_cstate;
>  }
>  static inline void acpi_set_cstate_limit(unsigned int new_limit)
>  {
> -     max_cstate = new_limit;
> -     return;
> +     max_allowed_cstate = new_limit;
>  }
>  
>  static inline unsigned int acpi_get_csubstate_limit(void)
> --- a/xen/include/xen/cpuidle.h
> +++ b/xen/include/xen/cpuidle.h
> @@ -89,7 +89,7 @@ struct cpuidle_governor
>  extern int8_t xen_cpuidle;
>  extern struct cpuidle_governor *cpuidle_current_governor;
>  
> -bool cpuidle_using_deep_cstate(void);
> +bool cpuidle_usable_deep_cstate(void);
>  void cpuidle_disable_deep_cstate(void);
>  
>  #define CPUIDLE_DRIVER_STATE_START  1

-- 
Best Regards,
Marek Marczykowski-Górecki
Invisible Things Lab

Attachment: signature.asc
Description: PGP signature


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.