[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 5/5] x86/mwait-idle: adjust the SKX C6 parameters if PC6 is disabled


  • To: "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>
  • From: Jan Beulich <jbeulich@xxxxxxxx>
  • Date: Mon, 6 Sep 2021 15:02:12 +0200
  • Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=suse.com; dmarc=pass action=none header.from=suse.com; dkim=pass header.d=suse.com; arc=none
  • Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version; bh=q25vTfg9n5hflZMUVunRhlsx6PeE8kCcVjZSgImDym4=; b=hYJTTisj4WdVmIQ17WL2uPySfg1kK7K6LElXWrUDjAsXNoXYCNlNO7yYjFKgzEwN3FKA3boWAXwq2nJ6cVdwi6PNtjPKKKMThaxVSt0gyB6RgDmGalP1zpzIU14K5FnO3pIhArPr70TaT1NgCslFBXl3tPZtodqO57L2V/lvJgSJ03omu7gvOav990u28gRUN51F2H2qZkkz11SvfcDMj14cV2yRBt7IijKYcuarT2ZilM8eJO4nByXvs0ajsuI4XySd1Kt9DSnWX4DzdobNZaD6+LnB7HlFT8vqEs1/loegrkDjc5eCIj5LswxMes3b/kP6otqvQ3b6cJUMTcyPUA==
  • Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=Wpk2a+RIrLwrozxDrlSrKSr+lrUEz7Qyzi0XBq5rCfDv1GoM4sHuR4rZFTI3lklQ5wW2FPvQLAJc71WboulW5/qvV94Vb61eHbWOkKEP6u4S/y182aEEumftrSyFyIXUVsGrd+Z6eQTV7eMUQA/8kcPdnDo3CVX4HF+EPRqaMpTWef9lYO+KLFlJQeaTWZYxsMsGtfsqbFljLGmhSW/cLG9dmrqWxtIHKtg6ZDKhLQG1YyKUiUxq5jxPdepKaeYlZpQNsUxUireO6QGkmSUky2RN0ZLilno7H7un/V3sqFTWDS2TIpL2qvB4dFR+AiDPTRO2CosbI1TkhBO9rX1X3A==
  • Authentication-results: citrix.com; dkim=none (message not signed) header.d=none;citrix.com; dmarc=none action=none header.from=suse.com;
  • Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Wei Liu <wl@xxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>
  • Delivery-date: Mon, 06 Sep 2021 13:02:25 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

From: Chen Yu <yu.c.chen@xxxxxxxxx>

Because cpuidle assumes worst-case C-state parameters, PC6 parameters
are used for describing C6, which is worst-case for requesting CC6.
When PC6 is enabled, this is appropriate. But if PC6 is disabled
in the BIOS, the exit latency and target residency should be adjusted
accordingly.

Exit latency:
Previously the C6 exit latency was measured as the PC6 exit latency.
With PC6 disabled, the C6 exit latency should be the one of CC6.

Target residency:
With PC6 disabled, the idle duration within [CC6, PC6) would make the
idle governor choose C1E over C6. This would cause low energy-efficiency.
We should lower the bar to request C6 when PC6 is disabled.

To fill this gap, check if PC6 is disabled in the BIOS in the
MSR_PKG_CST_CONFIG_CONTROL(0xe2) register. If so, use the CC6 exit latency
for C6 and set target_residency to 3 times the new exit latency. [This
is consistent with how intel_idle driver uses _CST to calculate the
target_residency.] As a result, the OS would be more likely to choose C6
over C1E when PC6 is disabled, which is reasonable, because if C6 is
enabled, it implies that the user cares about energy, so choosing C6 more
frequently makes sense.

The new CC6 exit latency of 92us was measured with wult[1] on SKX via NIC
wakeup as the 99.99th percentile. CLX and CPX have the same CPU model
number as SKX and therefore share its table, but their CC6 exit latencies
(96us and 89us respectively) are close to the SKX one, so reuse the SKX
value for them.

There is a concern that it might be better to use a more generic approach
instead of optimizing every platform. However, if the required code
complexity and different PC6 bit interpretation on different platforms
are taken into account, tuning the code per platform seems to be an
acceptable tradeoff.

Link: https://intel.github.io/wult/ # [1]
Suggested-by: Len Brown <len.brown@xxxxxxxxx>
Signed-off-by: Chen Yu <yu.c.chen@xxxxxxxxx>
Reviewed-by: Artem Bityutskiy <artem.bityutskiy@xxxxxxxxxxxxxxx>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>
[Linux commit: 64233338499126c5c31e07165735ab5441c7e45a]

Pull in Linux's MSR_PKG_CST_CONFIG_CONTROL. Alongside the dropping of
"const" from skx_cstates[] add __read_mostly, and extend that to other
similar non-const tables.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>

--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -484,7 +484,7 @@ static const struct cpuidle_state bdw_cs
        {}
 };
 
-static struct cpuidle_state skl_cstates[] = {
+static struct cpuidle_state __read_mostly skl_cstates[] = {
        {
                .name = "C1-SKL",
                .flags = MWAIT2flg(0x00),
@@ -536,7 +536,7 @@ static struct cpuidle_state skl_cstates[
        {}
 };
 
-static const struct cpuidle_state skx_cstates[] = {
+static struct cpuidle_state __read_mostly skx_cstates[] = {
        {
                .name = "C1-SKX",
                .flags = MWAIT2flg(0x00),
@@ -674,7 +674,7 @@ static const struct cpuidle_state knl_cs
        {}
 };
 
-static struct cpuidle_state bxt_cstates[] = {
+static struct cpuidle_state __read_mostly bxt_cstates[] = {
        {
                .name = "C1-BXT",
                .flags = MWAIT2flg(0x00),
@@ -870,9 +870,9 @@ static void auto_demotion_disable(void *
 {
        u64 msr_bits;
 
-       rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
+       rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits);
        msr_bits &= ~(icpu->auto_demotion_disable_flags);
-       wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
+       wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits);
 }
 
 static void byt_auto_demotion_disable(void *dummy)
@@ -1141,7 +1141,7 @@ static void __init sklh_idle_state_table
        if ((mwait_substates & (MWAIT_CSTATE_MASK << 28)) == 0)
                return;
 
-       rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr);
+       rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);
 
        /* PC10 is not enabled in PKG C-state limit */
        if ((msr & 0xF) != 8)
@@ -1161,6 +1161,36 @@ static void __init sklh_idle_state_table
 }
 
 /*
+ * skx_idle_state_table_update - Adjust the Sky Lake/Cascade Lake
+ * idle states table.
+ */
+static void __init skx_idle_state_table_update(void)
+{
+       unsigned long long msr;
+
+       rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);
+
+       /*
+        * 000b: C0/C1 (no package C-state support)
+        * 001b: C2
+        * 010b: C6 (non-retention)
+        * 011b: C6 (retention)
+        * 111b: No Package C state limits.
+        */
+       if ((msr & 0x7) < 2) {
+               /*
+                * Uses the CC6 + PC0 latency and 3 times of
+                * latency for target_residency if the PC6
+                * is disabled in BIOS. This is consistent
+                * with how intel_idle driver uses _CST
+                * to set the target_residency.
+                */
+               skx_cstates[2].exit_latency = 92;
+               skx_cstates[2].target_residency = 276;
+       }
+}
+
+/*
  * mwait_idle_state_table_update()
  *
  * Update the default state_table for this CPU-id
@@ -1178,6 +1208,9 @@ static void __init mwait_idle_state_tabl
        case 0x5e: /* SKL-H */
                sklh_idle_state_table_update();
                break;
+       case 0x55: /* SKL-X */
+               skx_idle_state_table_update();
+               break;
        }
 }
 
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -45,6 +45,13 @@
 #define MSR_CORE_CAPABILITIES               0x000000cf
 #define  CORE_CAPS_SPLITLOCK_DETECT         (_AC(1, ULL) <<  5)
 
+#define MSR_PKG_CST_CONFIG_CONTROL          0x000000e2
+#define  NHM_C3_AUTO_DEMOTE                 (_AC(1, ULL) << 25)
+#define  NHM_C1_AUTO_DEMOTE                 (_AC(1, ULL) << 26)
+#define  ATM_LNC_C6_AUTO_DEMOTE             (_AC(1, ULL) << 25)
+#define  SNB_C3_AUTO_UNDEMOTE               (_AC(1, ULL) << 27)
+#define  SNB_C1_AUTO_UNDEMOTE               (_AC(1, ULL) << 28)
+
 #define MSR_ARCH_CAPABILITIES               0x0000010a
 #define  ARCH_CAPS_RDCL_NO                  (_AC(1, ULL) <<  0)
 #define  ARCH_CAPS_IBRS_ALL                 (_AC(1, ULL) <<  1)
@@ -175,11 +182,6 @@
 #define MSR_IA32_A_PERFCTR0            0x000004c1
 #define MSR_FSB_FREQ                   0x000000cd
 
-#define MSR_NHM_SNB_PKG_CST_CFG_CTL    0x000000e2
-#define NHM_C3_AUTO_DEMOTE             (1UL << 25)
-#define NHM_C1_AUTO_DEMOTE             (1UL << 26)
-#define ATM_LNC_C6_AUTO_DEMOTE         (1UL << 25)
-
 #define MSR_MTRRcap                    0x000000fe
 #define MTRRcap_VCNT                   0x000000ff
 




 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.