|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [PATCH] x86/cpuidle: split the max_cstate variable
On Wed, Apr 08, 2026 at 01:34:43PM +0200, Jan Beulich wrote:
> The admin can control the upper bound wanted not only via command line
> option, but also via XEN_SYSCTL_pm_op_set_max_cstate. While decisions how
> to set up the system are okay this way as long as we deem the command line
> option a strict upper bound, what to do during S3 resume should not be
> based on that potentially varying value. Decisions there need to use
> solely the strict upper bound we may have enforced ourselves (or which was
> forced onto us via command line option).
>
> Rather than altering pit_broadcast_is_available(), drop the function
> altogether. It's pretty odd for acpi/cpu_idle.c to call into time.c, just
> for that to call into acpi/cpu_idle.c again.
>
> Fixes: 8d24303023ec ("x86: don't write_tsc() non-zero values on CPUs updating
> only the lower 32 bits")
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
What should be observable effect, in absence of max_cstate option and
"xenpm set-max-cstate" calls? The "slow after S3" issue still happens. I
tested this on top of the two other patches:
- [PATCH] x86/HPET: channel handling in hpet_broadcast_resume()
- [PATCH] x86/cpu-policy: set up host policy earlier
> ---
> cpuidle_disable_deep_cstate(), when called from handle_rtc_once(), is
> still somewhat of a problem: Boot time and resume time runs of
> _disable_pit_irq() may still behave differently because of that.
In the above test, both on boot and resume I got:
(XEN) [ 9.916522] _disable_pit_irq:2649: using_pit: 0, cpu_has_apic: 1
(XEN) [ 9.921198] _disable_pit_irq:2659: cpuidle_usable_deep_cstate: 1,
boot_cpu_has(X86_FEATURE_XEN_ARAT): 1
> If we wanted the sysctl to possibly move the maximum C-state beyond what
> was given on the command line, things would get yet more complicated, as
> we'd then need to re-configure the driver that's in use.
>
> I wonder how useful the ACPI_STATE_C<n> #define-s really are. Plain 1 is
> used in e.g. probe_c3_errata(), and the plain 7 doesn't even have a
> respective constant (nor would that be suitable, as that's not really an
> ACPI state).
>
> --- a/xen/arch/x86/acpi/cpu_idle.c
> +++ b/xen/arch/x86/acpi/cpu_idle.c
> @@ -119,7 +119,7 @@ bool lapic_timer_init(void)
> lapic_timer_off = hpet_broadcast_enter;
> lapic_timer_on = hpet_broadcast_exit;
> }
> - else if ( pit_broadcast_is_available() )
> + else if ( cpuidle_usable_deep_cstate() )
> {
> lapic_timer_off = pit_broadcast_enter;
> lapic_timer_on = pit_broadcast_exit;
> @@ -131,12 +131,15 @@ bool lapic_timer_init(void)
> }
>
> void (*__read_mostly pm_idle_save)(void);
> -unsigned int max_cstate __read_mostly = UINT_MAX;
> +
> +unsigned int max_usable_cstate __read_mostly = UINT_MAX;
> +unsigned int max_allowed_cstate __read_mostly = UINT_MAX;
> unsigned int max_csubstate __read_mostly = UINT_MAX;
>
> static int __init cf_check parse_cstate(const char *s)
> {
> - max_cstate = simple_strtoul(s, &s, 0);
> + max_allowed_cstate = simple_strtoul(s, &s, 0);
> + max_usable_cstate = max_allowed_cstate;
> if ( *s == ',' )
> max_csubstate = simple_strtoul(s + 1, NULL, 0);
> return 0;
> @@ -413,10 +416,11 @@ static void cf_check dump_cx(unsigned ch
> unsigned int cpu;
>
> printk("'%c' pressed -> printing ACPI Cx structures\n", key);
> - if ( max_cstate < UINT_MAX )
> + if ( max_cstate() < UINT_MAX )
> {
> - printk("max state: C%u\n", max_cstate);
> - if ( max_csubstate < UINT_MAX )
> + printk("max state: C%u\n", max_cstate());
> + if ( max_allowed_cstate <= max_usable_cstate &&
> + max_csubstate < UINT_MAX )
> printk("max sub-state: %u\n", max_csubstate);
> else
> printk("max sub-state: unlimited\n");
> @@ -690,18 +694,18 @@ static void cf_check acpi_processor_idle
> u32 exp = 0, pred = 0;
> u32 irq_traced[4] = { 0 };
>
> - if ( max_cstate > 0 && power &&
> + if ( max_cstate() > 0 && power &&
> (next_state = cpuidle_current_governor->select(power)) > 0 )
> {
> unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
> - : max_cstate;
> + : max_cstate();
>
> do {
> cx = &power->states[next_state];
> } while ( (cx->type > max_state ||
> cx->entry_method == ACPI_CSTATE_EM_NONE ||
> (cx->entry_method == ACPI_CSTATE_EM_FFH &&
> - cx->type == max_cstate &&
> + cx->type == max_allowed_cstate &&
> (cx->address & MWAIT_SUBSTATE_MASK) > max_csubstate)) &&
> --next_state );
> if ( next_state )
> @@ -1448,7 +1452,7 @@ static void amd_cpuidle_init(struct acpi
>
> for ( i = 0; i < nr; ++i )
> {
> - if ( cx[i].type > max_cstate )
> + if ( cx[i].type > max_cstate() )
> break;
> power->states[i + 1] = cx[i];
> power->states[i + 1].idx = i + 1;
> @@ -1611,21 +1615,22 @@ int pmstat_reset_cx_stat(unsigned int cp
>
> void cpuidle_disable_deep_cstate(void)
> {
> - if ( max_cstate > ACPI_STATE_C1 )
> + if ( max_usable_cstate > ACPI_STATE_C1 )
> {
> if ( local_apic_timer_c2_ok )
> - max_cstate = ACPI_STATE_C2;
> + max_usable_cstate = ACPI_STATE_C2;
> else
> - max_cstate = ACPI_STATE_C1;
> + max_usable_cstate = ACPI_STATE_C1;
> }
>
> hpet_disable_legacy_broadcast();
> }
>
> -bool cpuidle_using_deep_cstate(void)
> +bool cpuidle_usable_deep_cstate(void)
> {
> - return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ?
> ACPI_STATE_C2
> - :
> ACPI_STATE_C1);
> + return xen_cpuidle &&
> + max_usable_cstate > (local_apic_timer_c2_ok ? ACPI_STATE_C2
> + : ACPI_STATE_C1);
> }
>
> static int cf_check cpu_callback(
> --- a/xen/arch/x86/cpu/intel.c
> +++ b/xen/arch/x86/cpu/intel.c
> @@ -384,12 +384,12 @@ static void probe_c3_errata(const struct
> };
>
> /* Serialized by the AP bringup code. */
> - if ( max_cstate > 1 && (c->apicid & (c->x86_num_siblings - 1)) &&
> + if ( max_usable_cstate > 1 && (c->apicid & (c->x86_num_siblings - 1)) &&
> x86_match_cpu(models) )
> {
> printk(XENLOG_WARNING
> "Disabling C-states C3 and C6 due to CPU errata\n");
> - max_cstate = 1;
> + max_usable_cstate = 1;
> }
> }
>
> --- a/xen/arch/x86/cpu/mwait-idle.c
> +++ b/xen/arch/x86/cpu/mwait-idle.c
> @@ -1045,15 +1045,16 @@ static void cf_check mwait_idle(void)
> u64 before, after;
> u32 exp = 0, pred = 0, irq_traced[4] = { 0 };
>
> - if (max_cstate > 0 && power &&
> + if (max_cstate() > 0 && power &&
> (next_state = cpuidle_current_governor->select(power)) > 0) {
> unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
> - : max_cstate;
> + : max_cstate();
>
> do {
> cx = &power->states[next_state];
> - } while ((cx->type > max_state || (cx->type == max_cstate &&
> - MWAIT_HINT2SUBSTATE(cx->address) > max_csubstate)) &&
> + } while ((cx->type > max_state ||
> + (cx->type == max_allowed_cstate &&
> + MWAIT_HINT2SUBSTATE(cx->address) > max_csubstate)) &&
> --next_state);
> if (!next_state)
> cx = NULL;
> @@ -1458,7 +1459,7 @@ static void __init sklh_idle_state_table
> u64 msr;
>
> /* if PC10 disabled via cmdline max_cstate=7 or shallower */
> - if (max_cstate <= 7)
> + if (max_cstate() <= 7)
> return;
>
> /* if PC10 not present in CPUID.MWAIT.EDX */
> @@ -1623,7 +1624,7 @@ static int __init mwait_idle_probe(void)
> !mwait_substates)
> return -ENODEV;
>
> - if (!max_cstate || !opt_mwait_idle) {
> + if (!max_cstate() || !opt_mwait_idle) {
> pr_debug(PREFIX "disabled\n");
> return -EPERM;
> }
> @@ -1714,8 +1715,8 @@ static int cf_check mwait_idle_cpu_init(
> hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
> state = MWAIT_HINT2CSTATE(hint) + 1;
>
> - if (state > max_cstate) {
> - printk(PREFIX "max C-state %u reached\n", max_cstate);
> + if (state > max_cstate()) {
> + printk(PREFIX "max C-state %u reached\n", max_cstate());
> break;
> }
>
> --- a/xen/arch/x86/include/asm/time.h
> +++ b/xen/arch/x86/include/asm/time.h
> @@ -31,7 +31,6 @@ int cpu_frequency_change(u64 freq);
>
> void cf_check pit_broadcast_enter(void);
> void cf_check pit_broadcast_exit(void);
> -int pit_broadcast_is_available(void);
>
> uint64_t cf_check acpi_pm_tick_to_ns(uint64_t ticks);
>
> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -517,7 +517,7 @@ static int64_t __init cf_check init_hpet
> bool disable_hpet = false;
>
> if ( hpet_address && strcmp(opt_clocksource, pts->id) &&
> - cpuidle_using_deep_cstate() )
> + cpuidle_usable_deep_cstate() )
> {
> if ( pci_conf_read16(PCI_SBDF(0, 0, 0x1f, 0),
> PCI_VENDOR_ID) == PCI_VENDOR_ID_INTEL )
> @@ -2655,7 +2655,7 @@ static int _disable_pit_irq(bool init)
> * XXX dom0 may rely on RTC interrupt delivery, so only enable
> * hpet_broadcast if FSB mode available or if force_hpet_broadcast.
> */
> - if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_XEN_ARAT) )
> + if ( cpuidle_usable_deep_cstate() && !boot_cpu_has(X86_FEATURE_XEN_ARAT)
> )
> {
> init ? hpet_broadcast_init() : hpet_broadcast_resume();
> if ( !hpet_broadcast_is_available() )
> @@ -2707,11 +2707,6 @@ void cf_check pit_broadcast_exit(void)
> reprogram_timer(this_cpu(timer_deadline));
> }
>
> -int pit_broadcast_is_available(void)
> -{
> - return cpuidle_using_deep_cstate();
> -}
> -
> void send_timer_event(struct vcpu *v)
> {
> send_guest_vcpu_virq(v, VIRQ_TIMER);
> @@ -2999,7 +2994,7 @@ static void cf_check dump_softtsc(unsign
> else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
> {
> printk("TSC has constant rate, ");
> - if ( max_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
> + if ( max_usable_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
> printk("no deep Cstates, passed warp test, deemed reliable, ");
> else
> printk("deep Cstates possible, so not reliable, ");
> --- a/xen/include/xen/acpi.h
> +++ b/xen/include/xen/acpi.h
> @@ -142,30 +142,33 @@ int acpi_gsi_to_irq (u32 gsi, unsigned i
>
> #ifdef CONFIG_ACPI_CSTATE
> /*
> - * max_cstate sets the highest legal C-state.
> - * max_cstate = 0: C0 okay, but not C1
> - * max_cstate = 1: C1 okay, but not C2
> - * max_cstate = 2: C2 okay, but not C3 etc.
> -
> - * max_csubstate sets the highest legal C-state sub-state. Only applies to
> the
> - * highest legal C-state.
> - * max_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E
> - * max_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but not C2
> - * max_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but not C3
> - * max_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but not C3
> + * max_{allowed,usable}_cstate sets the highest allowed / usable C-state,
> where
> + * "allowed" is command line / sysctl based.
> + * max_*_cstate = 0: C0 okay, but not C1
> + * max_*_cstate = 1: C1 okay, but not C2
> + * max_*_cstate = 2: C2 okay, but not C3 etc.
> + *
> + * max_csubstate sets the highest allowed C-state sub-state. Only applies to
> + * the highest allowed C-state.
> + * max_allowed_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E
> + * max_allowed_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but
> not C2
> + * max_allowed_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but
> not C3
> + * max_allowed_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but
> not C3
> */
>
> -extern unsigned int max_cstate;
> +extern unsigned int max_usable_cstate;
> +extern unsigned int max_allowed_cstate;
> extern unsigned int max_csubstate;
>
> +#define max_cstate() min(max_usable_cstate, max_allowed_cstate)
> +
> static inline unsigned int acpi_get_cstate_limit(void)
> {
> - return max_cstate;
> + return max_allowed_cstate;
> }
> static inline void acpi_set_cstate_limit(unsigned int new_limit)
> {
> - max_cstate = new_limit;
> - return;
> + max_allowed_cstate = new_limit;
> }
>
> static inline unsigned int acpi_get_csubstate_limit(void)
> --- a/xen/include/xen/cpuidle.h
> +++ b/xen/include/xen/cpuidle.h
> @@ -89,7 +89,7 @@ struct cpuidle_governor
> extern int8_t xen_cpuidle;
> extern struct cpuidle_governor *cpuidle_current_governor;
>
> -bool cpuidle_using_deep_cstate(void);
> +bool cpuidle_usable_deep_cstate(void);
> void cpuidle_disable_deep_cstate(void);
>
> #define CPUIDLE_DRIVER_STATE_START 1
--
Best Regards,
Marek Marczykowski-Górecki
Invisible Things Lab
Attachment:
signature.asc
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |