|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [PATCH v2] amd: disable C6 after 1000 days on Zen2
On 30/06/2023 2:18 pm, Roger Pau Monne wrote:
> diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
> index 0eaef82e5145..bdf45f8387e8 100644
> --- a/xen/arch/x86/cpu/amd.c
> +++ b/xen/arch/x86/cpu/amd.c
> @@ -51,6 +51,8 @@ bool __read_mostly amd_acpi_c1e_quirk;
> bool __ro_after_init amd_legacy_ssbd;
> bool __initdata amd_virt_spec_ctrl;
>
> +static bool __read_mostly c6_disabled;
> +
> static inline int rdmsr_amd_safe(unsigned int msr, unsigned int *lo,
> unsigned int *hi)
> {
> @@ -905,6 +907,31 @@ void __init detect_zen2_null_seg_behaviour(void)
>
> }
>
> +static void cf_check disable_c6(void *arg)
> +{
> + uint64_t val;
> +
> + if (!c6_disabled) {
> + printk(XENLOG_WARNING
> + "Disabling C6 after 1000 days apparent uptime due to AMD errata 1474\n");
> + c6_disabled = true;
> + smp_call_function(disable_c6, NULL, 0);
> + }
> +
> + /* Update the MSR to disable C6, done on all threads. */
> + if (rdmsr_safe(MSR_AMD_CSTATE_CFG, val)) {
> +error:
> + printk_once(XENLOG_ERR
> + "Unable to disable C6 on AMD system affected by errata 1474\n"
> + "Reboot recommended, otherwise system might hang\n");
> + return;
> + }
> + val &= ~(CSTATE_CFG_CCR0_CC6EN | CSTATE_CFG_CCR1_CC6EN |
> + CSTATE_CFG_CCR2_CC6EN);
> + if (wrmsr_safe(MSR_AMD_CSTATE_CFG, val))
> + goto error;
These MSRs don't fault, and we already excluded hypervisors previously,
so the safe() doesn't really help. The only way you'd spot a failure is
by reading back and noticing that the update didn't take effect.
Independently, this really really makes me want to dust off my
msr_{set,clear}_bits() helpers to reduce the boilerplate required for
logic like this.
> +}
> +
> static void cf_check init_amd(struct cpuinfo_x86 *c)
> {
> u32 l, h;
> @@ -1171,6 +1198,9 @@ static void cf_check init_amd(struct cpuinfo_x86 *c)
> if ((smp_processor_id() == 1) && !cpu_has(c, X86_FEATURE_ITSC))
> disable_c1_ramping();
>
> + if (c6_disabled)
> + disable_c6(NULL);
> +
> check_syscfg_dram_mod_en();
>
> amd_log_freq(c);
> @@ -1180,3 +1210,43 @@ const struct cpu_dev amd_cpu_dev = {
> .c_early_init = early_init_amd,
> .c_init = init_amd,
> };
> +
> +static int __init cf_check c6_errata_check(void)
> +{
> + /*
> + * Errata #1474: A Core May Hang After About 1044 Days
> + * Set up a timer to disable C6 after 1000 days uptime.
> + */
> + s_time_t delta;
> +
> + /*
> + * Apply to all Zen2 models. According to AMD revision guides at least
> + * Rome, Castle Peak, Renoir, Lucienne and Matisse are affected.
You probably want to replicate the spectral chicken comment about how
identifying Zen2 is actually very hard.
That said, I don't see anything which limits this logic to Fam17h.
> + */
> + if (cpu_has_hypervisor || !boot_cpu_has(X86_FEATURE_AMD_STIBP))
> + return 0;
> +
> + /*
> + * Deduct current TSC value, this would be relevant if kexec'ed for
> + * example. Might not be accurate, but worst case we end up disabling
> + * C6 before strictly required, which would still be safe.
> + *
> + * NB: all affected models (Zen2) have invariant TSC and TSC adjust
> + * MSR, so early_time_init() will have already cleared any TSC offset.
> + */
> + delta = DAYS(1000) - tsc_ticks2ns(rdtsc());
> + if (delta > 0) {
> + static struct timer errata_c6;
> +
> + init_timer(&errata_c6, disable_c6, NULL, 0);
> + set_timer(&errata_c6, NOW() + delta);
> + } else
> + disable_c6(NULL);
I doubt the smp_call_function() is going to be happy at presmp time.
Do we really care if we're already beyond the timeout?
> +
> + return 0;
> +}
> +/*
> + * Must be executed after early_time_init() for tsc_ticks2ns() to have been
> + * calibrated. That prevents us doing the check in init_amd().
> + */
> +presmp_initcall(c6_errata_check);
> diff --git a/xen/arch/x86/include/asm/msr-index.h
> b/xen/arch/x86/include/asm/msr-index.h
> index 2749e433d2a7..5df090fba791 100644
> --- a/xen/arch/x86/include/asm/msr-index.h
> +++ b/xen/arch/x86/include/asm/msr-index.h
> @@ -211,6 +211,11 @@
>
> #define MSR_VIRT_SPEC_CTRL 0xc001011f /* Layout matches
> MSR_SPEC_CTRL */
>
> +#define MSR_AMD_CSTATE_CFG 0xc0010296
> +#define CSTATE_CFG_CCR0_CC6EN (_AC(1, ULL) << 6)
> +#define CSTATE_CFG_CCR1_CC6EN (_AC(1, ULL) << 14)
> +#define CSTATE_CFG_CCR2_CC6EN (_AC(1, ULL) << 22)
While MSR_AMD_CSTATE_CFG is liable to stay stable, the CC6EN bits are
uarch specific and not applicable to others.
I'd suggest keeping them local to c6_errata_check(), which probably
ought to gain a zen2 somewhere in the name.
They definitely can't say in a global header with names that don't tie
them to Zen2 specifically.
~Andrew
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |