[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] amd: disable C6 after 1000 days on Fam17h models 30-3fh


  • To: xen-devel@xxxxxxxxxxxxxxxxxxxx
  • From: Roger Pau Monne <roger.pau@xxxxxxxxxx>
  • Date: Mon, 5 Jun 2023 17:10:36 +0200
  • Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=citrix.com; dmarc=pass action=none header.from=citrix.com; dkim=pass header.d=citrix.com; arc=none
  • Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=JONrvYVMHp1sBRPD1gxzmVo7lye+rsEHxAYvDe4PyeM=; b=WBXJFXXtJ1jPbtYB1qT9q8ZFZJvV1Swu8MtQEwpw6f4/N2A/7+kBPQvlxfvnr0ZxjAvxxKUHWJBnRIUPv6fuo1jh9OnKHFDpctU6dG/Hy774Kt8ZWTmigzbLQMzr8V03kX75k+U1SPaKw+JJF/M1DCKL/3Eqet5s1Jq9eoEV5AwM0UCjL2l9ithGS2/oExM9zaM9dLH6CVOIsssrzmtdYdpGqLSVl+xWyrO+pyiW8H7n5TbDz/zKAHuKa+Rpqq6AwpLXnG6m3Gs7SzaObGORDsunNS1LeJYuKYtT8iGGlPu3LVpDzk4kdZgetbcvT7+T6FbciLIqM+dWcqZknQg8XQ==
  • Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=JBRlWh/haWJbHKLzYQCcxaTqL4A6Xs9n4JaWktWhgRJqDZQrdy7QEjdkis4BktwlnX44pjutUtspNuAoZdQRpmCvfOBkLnzaBFsc5BNxVUqgXehs/kWqXfGWbGB4R4CQmy1tHVxSuajRkdFryp+3JmexPlaWmsVWRWYxxgeCZXUZewOAU3nay50emAAB11VaXaI2qpES9qJmx1fKVYl82F1twlqbc0SU5URUkGGxx8eK7kOOUu6Q+21UDy1rMgLmHz1Wk2d4Q7hpbuJIirrYKpqOxxfKxZvte8luDwmVlZAOQHbQdWfAE7MtF/YOf/3OCyAvwjIL2n8+Z2Ip/tE5xg==
  • Authentication-results: dkim=none (message not signed) header.d=none;dmarc=none action=none header.from=citrix.com;
  • Cc: Roger Pau Monne <roger.pau@xxxxxxxxxx>, Jan Beulich <jbeulich@xxxxxxxx>, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Wei Liu <wl@xxxxxxx>, George Dunlap <george.dunlap@xxxxxxxxxx>, Julien Grall <julien@xxxxxxx>, Stefano Stabellini <sstabellini@xxxxxxxxxx>
  • Delivery-date: Mon, 05 Jun 2023 15:11:25 +0000
  • Ironport-data: A9a23:Rapl6K4K6bgNucL6LS4OzQxRtPbGchMFZxGqfqrLsTDasY5as4F+v mUfXGrSPv+MajT1fdt+bYWz9UgPvJCAmNVkQABpqCozHi5G8cbLO4+Ufxz6V8+wwm8vb2o8t plDNYOQRCwQZiWBzvt4GuG59RGQ7YnRGvynTraCYnsrLeNdYH9JoQp5nOIkiZJfj9G8Agec0 fv/uMSaM1K+s9JOGjt8B5mr9lU35JwehBtC5gZlPa4T4AeH/5UoJMl3yZ+ZfiOQrrZ8RoZWd 86bpJml82XQ+QsaC9/Nut4XpWVTH9Y+lSDX4pZnc/DKbipq/0Te4Y5iXBYoUm9Fii3hojxE4 I4lWapc6+seFvakdOw1C3G0GszlVEFM0OevzXOX6aR/w6BaGpdFLjoH4EweZOUlFuhL7W5m1 vIxJWAcZyq4qcWd3K6Hcepjot8sI5y+VG8fkikIITDxK98DGMiGb4CUoNhS0XE3m9xEGuvYa 4wBcz1zYR/cYhpJfFAKFJY5m+TujX76G9FagAvN+exrvC6OnEooiOSF3Nn9I7RmQe1PmUmVv CTe9nnRCRAGLt2PjzGC9xpAg8eWxHugCdlDTOHQGvhCx2aQ2E83CgUtBXSU8PihtVymZ95UN BlBksYphe1onKCxdfHtUhv9rHOasxo0X9tLD/Z8+AyL0rDT4QuSGi4DVDEpQN4sudIyRDcq/ kSUhN6vDjtq2JWKTVqN+7HSqim9UQAXMGsDaCksXQYDpd75r+kblQnTR9xuFKq0iNzdGjzqx T2O6i8kiN0uYdUj0qy6+RXLhmyqr52QFgotvFyPAiSi8x9zY5Oja8qw81/H4P1cLYGfCF6co HwDnMvY5+cLZX2QqBGwrCw2NOnBz5643Pf03jaDw7FJG+yRxkOe
  • Ironport-hdrordr: A9a23:gm4HwqOQcXy+JsBcT6r255DYdb4zR+YMi2TDiHoddfUFSKalfp 6V98jztSWatN/eYgBDpTnmAtj5fZq8z+8N3WB1B9uftWbdyQ+Vxe1ZjbcKoAeQZhEWiNQtsp uIGpIWYLOQMbETt7eB3ODSKadE/DDoytHKuQ+IpE0dNj2CJpsQmDuQTW2gYzxLbTgDIaB8OI uX58JBqTblUXMLbv6jDn1Ac/nfq8bNnJfGZwdDIxI88gGBgR6h9ba/SnGjr1wjegIK5Y1n3X nOkgT/6Knmm/anyiXE32uWw4VKlMDnwt5jAtXJrsQOMD3jhiuheYwkcbyfuzIepv2p9T8R4Z HxiiZlG/42x2Laf2mzrxeo8w780Aw243un7VODm3PsreHwWTp/UqN69MtkWyqcz3BlkMB30a pN0W7cn51LDSnYlCC4w9TTTRllmme9vHJnu+8OiH50V5cYddZq3Poi1XIQNK1FMDPx6YghHu UrJMbA5MxOeVffVHzdtnkH+q3fYl0DWjO9BmQSsM2c1DZb2FpjyVED+cAZlnAcsLogVpht/Y 3/Q+tVvYALavVTQbN2Be8HT8fyIHfKWwjwPGWbJkmiPL0bOkjKt4X87NwOla2XkaQzvdoPca n6IRNlXTZYQTOsNSTO5uwHzvn1ehTyYdy3ofsupaSQudXHNcnW2GO4ORUTevCb0osi6/3gKo qO0a1tcoDexBPVaOB0Ni3FKuxvwColIbkok+d+fW6yieT2DaCvntDnUZ/oVcrQ+HAfKynCPk c=
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

As specified on Errata 1474:

"A core will fail to exit CC6 after about 1044 days after the last
system reset. The time of failure may vary depending on the spread
spectrum and REFCLK frequency."

Detect when running on AMD Fam17h models 30h-3fh and set up a timer to
prevent entering C6 after 1000 days have elapsed.  Take into account
the TSC value at boot in order to account for any time elapsed before
Xen has been booted.

Print a message once C6 is disabled in order to let the user know.

Signed-off-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
---
I think the only 30-3fh model is 31h (Rome/Castle Peak), but I've
coded the check so as to allow the whole range.
---
 xen/arch/x86/acpi/cpu_idle.c   |  3 ++-
 xen/arch/x86/cpu/amd.c         | 42 ++++++++++++++++++++++++++++++++++
 xen/arch/x86/include/asm/amd.h |  2 ++
 xen/include/xen/time.h         |  1 +
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index 427c8c89c5c4..452cba3fb953 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -50,6 +50,7 @@
 #include <public/platform.h>
 #include <public/sysctl.h>
 #include <acpi/cpufreq/cpufreq.h>
+#include <asm/amd.h>
 #include <asm/apic.h>
 #include <asm/cpuidle.h>
 #include <asm/mwait.h>
@@ -643,7 +644,7 @@ bool errata_c6_workaround(void)
                       x86_match_cpu(isr_errata));
     }
 
-    return (fix_needed && cpu_has_pending_apic_eoi());
+    return (fix_needed && cpu_has_pending_apic_eoi()) || amd_disable_c6;
 }
 
 void update_last_cx_stat(struct acpi_processor_power *power,
diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 0d3143031b5b..728fa61a54bb 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -50,6 +50,7 @@ boolean_param("allow_unsafe", opt_allow_unsafe);
 bool __read_mostly amd_acpi_c1e_quirk;
 bool __ro_after_init amd_legacy_ssbd;
 bool __initdata amd_virt_spec_ctrl;
+bool __read_mostly amd_disable_c6;
 
 static inline int rdmsr_amd_safe(unsigned int msr, unsigned int *lo,
                                 unsigned int *hi)
@@ -1189,3 +1190,44 @@ const struct cpu_dev amd_cpu_dev = {
        .c_early_init   = early_init_amd,
        .c_init         = init_amd,
 };
+
+static void cf_check disable_c6(void *arg)
+{
+       printk(XENLOG_WARNING
+              "Disabling C6 after 1000 days uptime due to AMD errata 1474\n");
+       amd_disable_c6 = true; /* Consulted by errata_c6_workaround(). */
+}
+
+static int __init cf_check amd_c6_errata(void)
+{
+       /*
+        * Errata #1474: A Core May Hang After About 1044 Days
+        * Set up a timer to disable C6 after 1000 days uptime.
+        */
+       s_time_t delta;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+           boot_cpu_data.x86 != 0x17 ||
+           (boot_cpu_data.x86_model & 0xf0) != 0x30)
+               return 0;
+
+       /*
+        * Deduct current TSC value, this would be relevant if
+        * kexec'ed for example.
+        */
+       delta = DAYS(1000) - tsc_ticks2ns(rdtsc());
+       if (delta > 0) {
+               static struct timer errata_c6;
+
+               init_timer(&errata_c6, disable_c6, NULL, 0);
+               set_timer(&errata_c6, NOW() + delta);
+       } else
+               disable_c6(NULL);
+
+       return 0;
+}
+/*
+ * Must be executed after early_time_init() for tsc_ticks2ns() to have been
+ * calibrated.  That prevents us doing the check in init_amd().
+ */
+presmp_initcall(amd_c6_errata);
diff --git a/xen/arch/x86/include/asm/amd.h b/xen/arch/x86/include/asm/amd.h
index 09ee52dc1c09..c54bc6a8903f 100644
--- a/xen/arch/x86/include/asm/amd.h
+++ b/xen/arch/x86/include/asm/amd.h
@@ -157,4 +157,6 @@ bool amd_setup_legacy_ssbd(void);
 void amd_set_legacy_ssbd(bool enable);
 void amd_set_cpuid_user_dis(bool enable);
 
+extern bool amd_disable_c6;
+
 #endif /* __AMD_H__ */
diff --git a/xen/include/xen/time.h b/xen/include/xen/time.h
index b7427460dd13..99a91579438e 100644
--- a/xen/include/xen/time.h
+++ b/xen/include/xen/time.h
@@ -53,6 +53,7 @@ struct tm wallclock_time(uint64_t *ns);
 
 #define SYSTEM_TIME_HZ  1000000000ULL
 #define NOW()           ((s_time_t)get_s_time())
+#define DAYS(_d)        ((s_time_t)((_d)  * 86400000000000ULL))
 #define SECONDS(_s)     ((s_time_t)((_s)  * 1000000000ULL))
 #define MILLISECS(_ms)  ((s_time_t)((_ms) * 1000000ULL))
 #define MICROSECS(_us)  ((s_time_t)((_us) * 1000ULL))
-- 
2.40.0




 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.