mwait-idle: fine-tune IVT residency targets Ivy Town processors have slightly different properties than Ivy Bridge processors, particularly as socket count grows. Here we add dedicated tables covering 1-2 socket, 3-4 socket, and > 4 socket IVT configurations. This reduces the frequency of deep transitions on those systems, which can impact throughput. Signed-off-by: Len Brown Socket count determination needs to be done differently for us: The relevant code runs in a pre-SMP initcall, and hence can't use cpu_to_socket() (as only the boot CPU's cpu_data[].phys_proc_id got set up by that time). As a replacement the patch introduces apicid_to_socket(), thus estimating the socket count based on the boot CPU's internal topology information (if all CPUs in a system are the same, which ought to be the common case, this estimate will be precise). Note that apicid_to_socket() handles Intel topology determination only for now, as the mwait-idle driver currently only supports Intel CPUs. Signed-off-by: Jan Beulich --- This requires the patch at http://lists.xenproject.org/archives/html/xen-devel/2014-06/msg03335.html as a prerequisite. 
--- a/xen/arch/x86/cpu/mwait-idle.c +++ b/xen/arch/x86/cpu/mwait-idle.c @@ -261,6 +261,90 @@ static const struct cpuidle_state ivb_cs {} }; +static const struct cpuidle_state ivt_cstates[] = { + { + .name = "C1-IVT", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + }, + { + .name = "C1E-IVT", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 80, + }, + { + .name = "C3-IVT", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 59, + .target_residency = 156, + }, + { + .name = "C6-IVT", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 82, + .target_residency = 300, + }, + {} +}; + +static const struct cpuidle_state ivt_cstates_4s[] = { + { + .name = "C1-IVT-4S", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + }, + { + .name = "C1E-IVT-4S", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 250, + }, + { + .name = "C3-IVT-4S", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 59, + .target_residency = 300, + }, + { + .name = "C6-IVT-4S", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 84, + .target_residency = 400, + }, + {} +}; + +static const struct cpuidle_state ivt_cstates_8s[] = { + { + .name = "C1-IVT-8S", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + }, + { + .name = "C1E-IVT-8S", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 500, + }, + { + .name = "C3-IVT-8S", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 59, + .target_residency = 600, + }, + { + .name = "C6-IVT-8S", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 88, + .target_residency = 700, + }, + {} +}; + static const struct cpuidle_state hsw_cstates[] = { { .name = "C1-HSW", @@ -494,6 +578,11 @@ static const struct idle_cpu idle_cpu_iv .disable_promotion_to_c1e = 1, }; +static const struct idle_cpu idle_cpu_ivt 
= { + .state_table = ivt_cstates, + .disable_promotion_to_c1e = 1, +}; + static const struct idle_cpu idle_cpu_hsw = { .state_table = hsw_cstates, .disable_promotion_to_c1e = 1, @@ -524,7 +613,7 @@ static struct intel_idle_id { ICPU(0x36, atom), ICPU(0x37, byt), ICPU(0x3a, ivb), - ICPU(0x3e, ivb), + ICPU(0x3e, ivt), ICPU(0x3c, hsw), ICPU(0x3f, hsw), ICPU(0x45, hsw), @@ -533,6 +622,37 @@ static struct intel_idle_id { {} }; +/* + * mwait_idle_state_table_update() + * + * Update the default state_table for this CPU-id + * + * Currently used to access tuned IVT multi-socket targets + * Assumption: num_sockets == (max_package_num + 1) + */ +static void __init mwait_idle_state_table_update(void) +{ + /* IVT uses a different table for 1-2, 3-4, and > 4 sockets */ + if (boot_cpu_data.x86_model == 0x3e) { /* IVT */ + unsigned int cpu, max_apicid = boot_cpu_physical_apicid; + + for_each_present_cpu(cpu) + if (max_apicid < x86_cpu_to_apicid[cpu]) + max_apicid = x86_cpu_to_apicid[cpu]; + switch (apicid_to_socket(max_apicid)) { + case 0: case 1: + /* 1 and 2 socket systems use default ivt_cstates */ + break; + case 2: case 3: + cpuidle_state_table = ivt_cstates_4s; + break; + default: + cpuidle_state_table = ivt_cstates_8s; + break; + } + } +} + static int __init mwait_idle_probe(void) { unsigned int eax, ebx, ecx; @@ -578,6 +698,9 @@ static int __init mwait_idle_probe(void) pr_debug(PREFIX "lapic_timer_reliable_states %#x\n", lapic_timer_reliable_states); + + mwait_idle_state_table_update(); + return 0; } --- a/xen/arch/x86/cpu/common.c +++ b/xen/arch/x86/cpu/common.c @@ -152,6 +152,11 @@ static void __cpuinit get_cpu_vendor(str this_cpu = &default_cpu; } +static inline u32 _phys_pkg_id(u32 cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + /* * cpuid returns the value latched in the HW at reset, not the APIC ID * register's value. 
For any box whose BIOS changes APIC IDs, like @@ -161,7 +166,7 @@ static void __cpuinit get_cpu_vendor(str */ static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) { - return hard_smp_processor_id() >> index_msb; + return _phys_pkg_id(hard_smp_processor_id(), index_msb); } /* Do minimum CPU detection early. @@ -471,6 +476,43 @@ void __cpuinit detect_ht(struct cpuinfo_ } #endif +unsigned int __init apicid_to_socket(unsigned int apicid) +{ + unsigned int dummy; + + if (boot_cpu_has(X86_FEATURE_XTOPOLOGY)) { + unsigned int eax, ecx, sub_index = 1, core_plus_mask_width; + + cpuid_count(0xb, SMT_LEVEL, &eax, &dummy, &dummy, &dummy); + core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + do { + cpuid_count(0xb, sub_index, &eax, &dummy, &ecx, + &dummy); + + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + core_plus_mask_width = + BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + + return _phys_pkg_id(apicid, core_plus_mask_width); + } + + if (boot_cpu_has(X86_FEATURE_HT) && + !boot_cpu_has(X86_FEATURE_CMP_LEGACY)) { + unsigned int num_siblings = (cpuid_ebx(1) & 0xff0000) >> 16; + + if (num_siblings) + return _phys_pkg_id(apicid, + get_count_order(num_siblings)); + } + + return apicid; +} + void __cpuinit print_cpu_info(unsigned int cpu) { const struct cpuinfo_x86 *c = cpu_data + cpu; --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -220,6 +220,8 @@ static always_inline void detect_ht(stru #define cpu_to_core(_cpu) (cpu_data[_cpu].cpu_core_id) #define cpu_to_socket(_cpu) (cpu_data[_cpu].phys_proc_id) +unsigned int apicid_to_socket(unsigned int); + /* * Generic CPUID function * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx