Xen project Mailing List

[Xen-devel] [PATCH v4 07/11] x86/intel_pstate: the main body of the intel_pstate driver

To: jbeulich@xxxxxxxx, andrew.cooper3@xxxxxxxxxx, xen-devel@xxxxxxxxxxxxx

Date: Thu, 25 Jun 2015 19:16:40 +0800

Delivery-date: Thu, 25 Jun 2015 11:17:31 +0000

List-id: Xen developer discussion <xen-devel.lists.xen.org>

The intel_pstate driver is ported following its kernel code logic (commit: 93f0822d). In order to port the Linux source file with minimal modifications, some of the variable types are kept intact (e.g. "int current_pstate", would otherwise be changed to "unsigned int"). In the kernel, a user can adjust the limits via sysfs (limits.min_sysfs_pct/max_sysfs_pct). In Xen, the policy->limits.min_perf_pct/max_perf_pct act as the transit station. A user interacts with it via xenpm. The new xen/include/asm-x86/cpufreq.h header file is added. v4 changes: 1) changed the indentation to be a "Tab" (same as Linux intel_pstate), instead of 4 "+$"; 2) added a new header file, xen/include/asm-x86/cpufreq.h. Signed-off-by: Wei Wang <wei.w.wang@xxxxxxxxx> --- xen/arch/x86/acpi/cpufreq/Makefile | 1 + xen/arch/x86/acpi/cpufreq/intel_pstate.c | 870 +++++++++++++++++++++++++++++++ xen/include/asm-x86/cpufreq.h | 34 ++ xen/include/asm-x86/msr-index.h | 3 + 4 files changed, 908 insertions(+) create mode 100644 xen/arch/x86/acpi/cpufreq/intel_pstate.c create mode 100644 xen/include/asm-x86/cpufreq.h diff --git a/xen/arch/x86/acpi/cpufreq/Makefile b/xen/arch/x86/acpi/cpufreq/Makefile index f75da9b..99fa9f4 100644 --- a/xen/arch/x86/acpi/cpufreq/Makefile +++ b/xen/arch/x86/acpi/cpufreq/Makefile @@ -1,2 +1,3 @@ obj-y += cpufreq.o +obj-y += intel_pstate.o obj-y += powernow.o diff --git a/xen/arch/x86/acpi/cpufreq/intel_pstate.c b/xen/arch/x86/acpi/cpufreq/intel_pstate.c new file mode 100644 index 0000000..19c74cc --- /dev/null +++ b/xen/arch/x86/acpi/cpufreq/intel_pstate.c @@ -0,0 +1,870 @@ +#include <xen/kernel.h> +#include <xen/types.h> +#include <xen/init.h> +#include <xen/bitmap.h> +#include <xen/cpumask.h> +#include <xen/timer.h> +#include <asm/msr.h> +#include <asm/msr-index.h> +#include <asm/processor.h> +#include <asm/div64.h> +#include <asm/cpufreq.h> +#include <acpi/cpufreq/cpufreq.h> + +#define BYT_RATIOS 0x66a +#define BYT_VIDS 0x66b +#define BYT_TURBO_RATIOS 0x66c +#define 
BYT_TURBO_VIDS 0x66d + +#define FRAC_BITS 8 +#define int_tofp(X) ((int64_t)(X) << FRAC_BITS) +#define fp_toint(X) ((X) >> FRAC_BITS) + +static inline int32_t mul_fp(int32_t x, int32_t y) +{ + return ((int64_t)x * (int64_t)y) >> FRAC_BITS; +} + +static inline int32_t div_fp(int32_t x, int32_t y) +{ + return div_s64((int64_t)x << FRAC_BITS, y); +} + +static inline int ceiling_fp(int32_t x) +{ + int mask, ret; + + ret = fp_toint(x); + mask = (1 << FRAC_BITS) - 1; + if (x & mask) + ret += 1; + return ret; +} + +struct sample { + int32_t core_pct_busy; + u64 aperf; + u64 mperf; + int freq; + s_time_t time; +}; + +struct pstate_data { + int current_pstate; + int min_pstate; + int max_pstate; + int scaling; + int turbo_pstate; +}; + +struct vid_data { + int min; + int max; + int turbo; + int32_t ratio; +}; + +struct _pid { + int setpoint; + int32_t integral; + int32_t p_gain; + int32_t i_gain; + int32_t d_gain; + int deadband; + int32_t last_err; +}; + +struct cpudata { + int cpu; + + struct timer timer; + + struct pstate_data pstate; + struct vid_data vid; + struct _pid pid; + + s_time_t last_sample_time; + u64 prev_aperf; + u64 prev_mperf; + struct sample sample; +}; + +static struct cpudata **all_cpu_data; + +struct pstate_adjust_policy { + int sample_rate_ms; + int deadband; + int setpoint; + int p_gain_pct; + int d_gain_pct; + int i_gain_pct; +}; + +struct pstate_funcs { + int (*get_max)(void); + int (*get_min)(void); + int (*get_turbo)(void); + int (*get_scaling)(void); + void (*set)(struct perf_limits *, struct cpudata *, int pstate); + void (*get_vid)(struct cpudata *); +}; + +struct cpu_defaults { + struct pstate_adjust_policy pid_policy; + struct pstate_funcs funcs; +}; + +static struct pstate_adjust_policy pid_params; +static struct pstate_funcs pstate_funcs; + +static inline void pid_reset(struct _pid *pid, int setpoint, int busy, + int deadband, int integral) { + pid->setpoint = setpoint; + pid->deadband = deadband; + pid->integral = int_tofp(integral); + 
pid->last_err = int_tofp(setpoint) - int_tofp(busy); +} + +static inline void pid_p_gain_set(struct _pid *pid, int percent) +{ + pid->p_gain = div_fp(int_tofp(percent), int_tofp(100)); +} + +static inline void pid_i_gain_set(struct _pid *pid, int percent) +{ + pid->i_gain = div_fp(int_tofp(percent), int_tofp(100)); +} + +static inline void pid_d_gain_set(struct _pid *pid, int percent) +{ + pid->d_gain = div_fp(int_tofp(percent), int_tofp(100)); +} + +static signed int pid_calc(struct _pid *pid, int32_t busy) +{ + signed int result; + int32_t pterm, dterm, fp_error; + int32_t integral_limit; + + fp_error = int_tofp(pid->setpoint) - busy; + + if (ABS(fp_error) <= int_tofp(pid->deadband)) + return 0; + + pterm = mul_fp(pid->p_gain, fp_error); + + pid->integral += fp_error; + + /* + * We limit the integral here so that it will never + * get higher than 30. This prevents it from becoming + * too large an input over long periods of time and allows + * it to get factored out sooner. + * The value of 30 was chosen through experimentation. 
+ */ + integral_limit = int_tofp(30); + if (pid->integral > integral_limit) + pid->integral = integral_limit; + if (pid->integral < -integral_limit) + pid->integral = -integral_limit; + + dterm = mul_fp(pid->d_gain, fp_error - pid->last_err); + pid->last_err = fp_error; + + result = pterm + mul_fp(pid->integral, pid->i_gain) + dterm; + result = result + (1 << (FRAC_BITS-1)); + return (signed int)fp_toint(result); +} + +static inline void intel_pstate_busy_pid_reset(struct cpudata *cpu) +{ + pid_p_gain_set(&cpu->pid, pid_params.p_gain_pct); + pid_d_gain_set(&cpu->pid, pid_params.d_gain_pct); + pid_i_gain_set(&cpu->pid, pid_params.i_gain_pct); + + pid_reset(&cpu->pid, pid_params.setpoint, 100, pid_params.deadband, 0); +} + +static inline void intel_pstate_reset_all_pid(void) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (all_cpu_data[cpu]) + intel_pstate_busy_pid_reset(all_cpu_data[cpu]); + } +} + +static inline void update_turbo_state(struct cpufreq_policy *policy) +{ + u64 misc_en; + struct cpudata *cpu; + + cpu = all_cpu_data[policy->cpu]; + rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); + policy->limits.turbo_disabled = + (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE || + cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); +} + +#define BYT_TURBO_CONTROL_BIT 32 +#define BYT_MIN_PSTATE(val) (((value) >> 8) & 0x7f) +#define BYT_MAX_PSTATE(val) (((value) >> 16) & 0x7f) +#define BYT_TURBO_PSTATE(value) ((value) & 0x7f) +static int byt_get_min_pstate(void) +{ + u64 value; + + rdmsrl(BYT_RATIOS, value); + return BYT_MIN_PSTATE(val); +} + +static int byt_get_max_pstate(void) +{ + u64 value; + + rdmsrl(BYT_RATIOS, value); + return BYT_MAX_PSTATE(val); +} + +static int byt_get_turbo_pstate(void) +{ + u64 value; + + rdmsrl(BYT_TURBO_RATIOS, value); + return BYT_TURBO_PSTATE(value); +} + +static void byt_set_pstate(struct perf_limits *limits, + struct cpudata *cpudata, int pstate) +{ + u64 val; + int32_t vid_fp; + u32 vid; + + val = pstate << 8; + if (limits->no_turbo 
&& !limits->turbo_disabled) + val |= (u64)1 << BYT_TURBO_CONTROL_BIT; + + vid_fp = cpudata->vid.min + mul_fp( + int_tofp(pstate - cpudata->pstate.min_pstate), + cpudata->vid.ratio); + + vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max); + vid = ceiling_fp(vid_fp); + + if (pstate > cpudata->pstate.max_pstate) + vid = cpudata->vid.turbo; + + val |= vid; + + wrmsrl(MSR_IA32_PERF_CTL, val); +} + +#define BYT_BCLK_FREQS 5 +#define TO_FREQ_TABLE_IDX_MASK 0x7 +static const int byt_freq_table[BYT_BCLK_FREQS] = { 833, 1000, 1333, 1167, 800}; + +static int byt_get_scaling(void) +{ + u64 value; + int i; + + rdmsrl(MSR_FSB_FREQ, value); + i = value & TO_FREQ_TABLE_IDX_MASK; + + BUG_ON(i > BYT_BCLK_FREQS); + + return byt_freq_table[i] * 100; +} + +static void byt_get_vid(struct cpudata *cpudata) +{ + u64 value; + + rdmsrl(BYT_VIDS, value); + cpudata->vid.min = int_tofp(BYT_MIN_PSTATE(val)); + cpudata->vid.max = int_tofp(BYT_MAX_PSTATE(val)); + cpudata->vid.ratio = div_fp( + cpudata->vid.max - cpudata->vid.min, + int_tofp(cpudata->pstate.max_pstate - + cpudata->pstate.min_pstate)); + + rdmsrl(BYT_TURBO_VIDS, value); + cpudata->vid.turbo = BYT_TURBO_PSTATE(value); +} + +#define SCALING_FACTOR 100000 +#define CORE_TURBO_CONTROL_BIT 32 +#define CORE_MIN_PSTATE(val) (((value) >> 40) & 0xff) +#define CORE_MAX_PSTATE(val) (((value) >> 8) & 0xff) +#define CORE_TURBO_PSTATE(value) ((value) & 0xff) +static int core_get_min_pstate(void) +{ + u64 value; + + rdmsrl(MSR_INTEL_PLATFORM_INFO, value); + return CORE_MIN_PSTATE(val); +} + +static int core_get_max_pstate(void) +{ + u64 value; + + rdmsrl(MSR_INTEL_PLATFORM_INFO, value); + return CORE_MAX_PSTATE(val); +} + +static int core_get_turbo_pstate(void) +{ + u64 value; + int nont, ret; + + rdmsrl(MSR_NHM_TURBO_RATIO_LIMIT, value); + nont = core_get_max_pstate(); + ret = CORE_TURBO_PSTATE(value); + if (ret <= nont) + ret = nont; + return ret; +} + +static inline int core_get_scaling(void) +{ + return SCALING_FACTOR; +} + 
+static void core_set_pstate(struct perf_limits *limits, + struct cpudata *cpudata, int pstate) +{ + u64 val; + + val = pstate << 8; + if (limits->no_turbo && !limits->turbo_disabled) + val |= (u64)1 << CORE_TURBO_CONTROL_BIT; + + wrmsrl(MSR_IA32_PERF_CTL, val); +} + +static const struct cpu_defaults core_params = { + .pid_policy = { + .sample_rate_ms = 10, + .deadband = 0, + .setpoint = 97, + .p_gain_pct = 20, + .d_gain_pct = 0, + .i_gain_pct = 0, + }, + .funcs = { + .get_max = core_get_max_pstate, + .get_min = core_get_min_pstate, + .get_turbo = core_get_turbo_pstate, + .get_scaling = core_get_scaling, + .set = core_set_pstate, + }, +}; + +static const struct cpu_defaults byt_params = { + .pid_policy = { + .sample_rate_ms = 10, + .deadband = 0, + .setpoint = 97, + .p_gain_pct = 14, + .d_gain_pct = 0, + .i_gain_pct = 4, + }, + .funcs = { + .get_max = byt_get_max_pstate, + .get_min = byt_get_min_pstate, + .get_turbo = byt_get_turbo_pstate, + .set = byt_set_pstate, + .get_scaling = byt_get_scaling, + .get_vid = byt_get_vid, + }, +}; + +static void intel_pstate_get_min_max(struct perf_limits *limits, + struct cpudata *cpu, int *min, int *max) +{ + int max_perf = cpu->pstate.turbo_pstate; + int max_perf_adj; + int min_perf; + + if (limits->no_turbo || limits->turbo_disabled) + max_perf = cpu->pstate.max_pstate; + + /* performance can be limited by user through xenpm */ + max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits->max_perf)); + *max = clamp_t(int, max_perf_adj, + cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); + + min_perf = fp_toint(mul_fp(int_tofp(max_perf), limits->min_perf)); + *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); +} + +static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate) +{ + int max_perf, min_perf; + struct cpufreq_policy *policy; + struct perf_limits *limits; + + policy = per_cpu(cpufreq_cpu_policy, cpu->cpu); + limits = &policy->limits; + + update_turbo_state(policy); + + if (limits->turbo_disabled) 
+ policy->turbo = CPUFREQ_TURBO_UNSUPPORTED; + else if (limits->no_turbo) + policy->turbo = CPUFREQ_TURBO_DISABLED; + else + policy->turbo = CPUFREQ_TURBO_ENABLED; + + intel_pstate_get_min_max(limits, cpu, &min_perf, &max_perf); + + pstate = clamp_t(int, pstate, min_perf, max_perf); + + if (pstate == cpu->pstate.current_pstate) + return; + + cpu->pstate.current_pstate = pstate; + policy->cur = pstate * SCALING_FACTOR; + + pstate_funcs.set(limits, cpu, pstate); +} + +static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) +{ + cpu->pstate.min_pstate = pstate_funcs.get_min(); + cpu->pstate.max_pstate = pstate_funcs.get_max(); + cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); + cpu->pstate.scaling = pstate_funcs.get_scaling(); + + if (pstate_funcs.get_vid) + pstate_funcs.get_vid(cpu); + intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate); +} + +static inline void intel_pstate_calc_busy(struct cpudata *cpu) +{ + struct sample *sample = &cpu->sample; + int64_t core_pct; + + core_pct = int_tofp(sample->aperf) * int_tofp(100); + core_pct = div64_u64(core_pct, int_tofp(sample->mperf)); + + sample->freq = fp_toint( + mul_fp(int_tofp( + cpu->pstate.max_pstate * cpu->pstate.scaling / 100), + core_pct)); + + sample->core_pct_busy = (int32_t)core_pct; +} + +static inline void intel_pstate_sample(struct cpudata *cpu) +{ + u64 aperf, mperf; + unsigned long flags; + + local_irq_save(flags); + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + local_irq_restore(flags); + + cpu->last_sample_time = cpu->sample.time; + cpu->sample.time = get_s_time(); + cpu->sample.aperf = aperf; + cpu->sample.mperf = mperf; + cpu->sample.aperf -= cpu->prev_aperf; + cpu->sample.mperf -= cpu->prev_mperf; + + intel_pstate_calc_busy(cpu); + + cpu->prev_aperf = aperf; + cpu->prev_mperf = mperf; +} + +static inline void intel_pstate_set_sample_time(struct cpudata *cpu) +{ + set_timer(&cpu->timer, NOW() + MILLISECS(pid_params.sample_rate_ms)); +} + +static inline int32_t 
intel_pstate_get_scaled_busy(struct cpudata *cpu) +{ + int32_t core_busy, max_pstate, current_pstate, sample_ratio; + u32 duration_us; + u32 sample_time_us; + + /* + * core_busy is the ratio of actual performance to max + * max_pstate is the max non turbo pstate available + * current_pstate was the pstate that was requested during + * the last sample period. + * + * We normalize core_busy, which was our actual percent + * performance to what we requested during the last sample + * period. The result will be a percentage of busy at a + * specified pstate. + */ + core_busy = cpu->sample.core_pct_busy; + max_pstate = int_tofp(cpu->pstate.max_pstate); + current_pstate = int_tofp(cpu->pstate.current_pstate); + core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate)); + + /* + * Since we have a deferred timer, it will not fire unless + * we are in C0. So, determine if the actual elapsed time + * is significantly greater (3x) than our sample interval. If it + * is, then we were idle for a long enough period of time + * to adjust our busyness. 
+ */ + sample_time_us = pid_params.sample_rate_ms * 1000ULL; + duration_us = (u32)((s_time_t)(cpu->sample.time - cpu->last_sample_time) + / 1000); + if (duration_us > sample_time_us * 3) { + sample_ratio = div_fp(int_tofp(sample_time_us), + int_tofp(duration_us)); + core_busy = mul_fp(core_busy, sample_ratio); + } + + return core_busy; +} + +static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) +{ + int32_t busy_scaled; + struct _pid *pid; + signed int ctl; + + pid = &cpu->pid; + busy_scaled = intel_pstate_get_scaled_busy(cpu); + + ctl = pid_calc(pid, busy_scaled); + + /* Negative values of ctl increase the pstate and vice versa */ + intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl); +} + +static void intel_pstate_timer_func(void *data) +{ + struct cpudata *cpu = (struct cpudata *) data; + + intel_pstate_sample(cpu); + + intel_pstate_adjust_busy_pstate(cpu); + + intel_pstate_set_sample_time(cpu); +} + +#define ICPU(model, policy) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\ + &policy##_params } + +static const struct x86_cpu_id intel_pstate_cpu_ids[] __initconst = { + ICPU(0x2a, core), + ICPU(0x2d, core), + ICPU(0x37, byt), + ICPU(0x3a, core), + ICPU(0x3c, core), + ICPU(0x3d, core), + ICPU(0x3e, core), + ICPU(0x3f, core), + ICPU(0x45, core), + ICPU(0x46, core), + ICPU(0x47, core), + ICPU(0x4c, byt), + ICPU(0x4e, core), + ICPU(0x4f, core), + ICPU(0x56, core), + {} +}; + +static int intel_pstate_init_cpu(unsigned int cpunum) +{ + struct cpudata *cpu; + s_time_t expires; + + if (!all_cpu_data[cpunum]) + all_cpu_data[cpunum] = xzalloc(struct cpudata); + if (!all_cpu_data[cpunum]) + return -ENOMEM; + + cpu = all_cpu_data[cpunum]; + + cpu->cpu = cpunum; + intel_pstate_get_cpu_pstates(cpu); + + init_timer(&cpu->timer, intel_pstate_timer_func, cpu, cpunum); + expires = NOW() + MILLISECS(10); + + intel_pstate_busy_pid_reset(cpu); + intel_pstate_sample(cpu); + + set_timer(&cpu->timer, expires); + + return 0; +} + +static int 
intel_pstate_set_policy(struct cpufreq_policy *policy) +{ + struct perf_limits *limits = &policy->limits; + uint32_t cur_gov = policy->internal_gov->cur_gov; + + if (!policy->cpuinfo.max_freq) + return -ENODEV; + + switch (cur_gov) { + case INTERNAL_GOV_PERFORMANCE: + limits->no_turbo = 0; + limits->max_perf_pct = 100; + limits->max_perf = int_tofp(1); + limits->min_perf_pct = 100; + limits->min_perf = int_tofp(1); + break; + case INTERNAL_GOV_POWERSAVE: + limits->min_perf = + div_fp(int_tofp(limits->min_policy_pct), + int_tofp(100)); + limits->max_perf = limits->min_perf; + limits->min_perf_pct = limits->min_policy_pct; + limits->max_perf_pct = limits->min_perf_pct; + break; + case INTERNAL_GOV_USERSPACE: + limits->max_perf = + div_fp(int_tofp(limits->max_perf_pct), + int_tofp(100)); + limits->min_perf = limits->max_perf; + limits->min_perf_pct = limits->max_perf_pct; + break; + case INTERNAL_GOV_ONDEMAND: + default: + limits->min_perf = + div_fp(int_tofp(limits->min_perf_pct), int_tofp(100)); + limits->max_perf = + div_fp(int_tofp(limits->max_perf_pct), int_tofp(100)); + break; + } + + return 0; +} + +static int intel_pstate_verify_policy(struct cpufreq_policy *policy) +{ + uint32_t cur_gov = policy->internal_gov->cur_gov; + + cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + + switch(cur_gov) { + case INTERNAL_GOV_PERFORMANCE: + case INTERNAL_GOV_POWERSAVE: + case INTERNAL_GOV_USERSPACE: + case INTERNAL_GOV_ONDEMAND: + return 0; + default: + return -EINVAL; + } +} + +static void intel_pstate_internal_gov_release(struct internal_governor *gov) +{ + xfree(gov->avail_gov); + xfree(gov); +} + +static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) +{ + int cpu_num = policy->cpu; + struct cpudata *cpu = all_cpu_data[cpu_num]; + + kill_timer(&all_cpu_data[cpu_num]->timer); + + intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate); + + intel_pstate_internal_gov_release(policy->internal_gov); + + return 0; +} + +static 
int intel_pstate_turbo_update(int cpuid, struct cpufreq_policy *policy) +{ + struct cpudata *cpu = all_cpu_data[policy->cpu]; + struct perf_limits *limits = &policy->limits; + + update_turbo_state(policy); + if (limits->turbo_disabled) { + printk("Turbo disabled by BIOS or not supported on CPU\n"); + return -EINVAL; + } + limits->no_turbo = policy->turbo == CPUFREQ_TURBO_ENABLED ? 0 : 1; + + if (limits->no_turbo) + policy->cpuinfo.max_freq = + cpu->pstate.max_pstate * cpu->pstate.scaling; + else + policy->cpuinfo.max_freq = + cpu->pstate.turbo_pstate * cpu->pstate.scaling; + + policy->max = clamp_t(unsigned int, policy->max, + policy->cpuinfo.min_freq, policy->cpuinfo.max_freq); + + return 0; +} + +static int get_turbo_pct(struct cpudata *cpu) +{ + int total, no_turbo, turbo_pct; + uint32_t turbo_fp; + + total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; + no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1; + turbo_fp = div_fp(int_tofp(no_turbo), int_tofp(total)); + turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100))); + return turbo_pct; +} + +#define INTEL_PSTATE_GOV_NUM 4 +static struct internal_governor* intel_pstate_internal_gov_init(void) +{ + unsigned int i = 0; + struct internal_governor *gov; + char *avail_gov; + + gov = xzalloc(struct internal_governor); + if (!gov) + return NULL; + avail_gov = xzalloc_array(char, + INTEL_PSTATE_GOV_NUM * CPUFREQ_NAME_LEN); + if (!avail_gov) + return NULL; + + gov->avail_gov = avail_gov; + + i += scnprintf(&avail_gov[0], CPUFREQ_NAME_LEN, "%s ", "performance"); + i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "powersave"); + i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "userspace"); + i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "ondemand"); + avail_gov[i-1] = '\0'; + gov->gov_num = INTEL_PSTATE_GOV_NUM; + gov->cur_gov = INTERNAL_GOV_ONDEMAND; + return gov; +} + +static int intel_pstate_cpu_setup(struct cpufreq_policy *policy) +{ + struct cpudata *cpu; + struct 
perf_limits *limits = &policy->limits; + int rc; + + rc = intel_pstate_init_cpu(policy->cpu); + if (rc) + return rc; + + policy->internal_gov = intel_pstate_internal_gov_init(); + if (!policy->internal_gov) + return -ENOMEM; + + cpu = all_cpu_data[policy->cpu]; + policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; + policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; + + /* cpuinfo and default policy values */ + policy->cpuinfo.min_freq = + cpu->pstate.min_pstate * cpu->pstate.scaling; + policy->cpuinfo.max_freq = + cpu->pstate.turbo_pstate * cpu->pstate.scaling; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + cpumask_set_cpu(policy->cpu, policy->cpus); + + limits->no_turbo = 0; + limits->turbo_disabled = 0; + limits->turbo_pct = get_turbo_pct(cpu); + limits->min_policy_pct = + (policy->min * 100) / policy->cpuinfo.max_freq; + limits->min_policy_pct = + clamp_t(uint32_t, limits->min_policy_pct, 0, 100); + limits->max_policy_pct = + (policy->max * 100) / policy->cpuinfo.max_freq; + limits->max_policy_pct = + clamp_t(uint32_t, limits->max_policy_pct, 0, 100); + limits->max_perf_pct = limits->max_policy_pct; + limits->min_perf_pct = limits->min_policy_pct; + + return 0; +} + +static struct cpufreq_driver intel_pstate_driver = { + .verify = intel_pstate_verify_policy, + .setpolicy = intel_pstate_set_policy, + .init = intel_pstate_cpu_setup, + .exit = intel_pstate_cpu_exit, + .update = intel_pstate_turbo_update, + .name = "intel_pstate", +}; + +static int intel_pstate_msrs_not_valid(void) +{ + if (!pstate_funcs.get_max() || + !pstate_funcs.get_min() || + !pstate_funcs.get_turbo()) + return -ENODEV; + + return 0; +} + +static void __init copy_pid_params(struct pstate_adjust_policy *policy) +{ + pid_params.sample_rate_ms = policy->sample_rate_ms; + pid_params.p_gain_pct = policy->p_gain_pct; + pid_params.i_gain_pct = policy->i_gain_pct; + pid_params.d_gain_pct = policy->d_gain_pct; + pid_params.deadband = policy->deadband; + pid_params.setpoint 
= policy->setpoint; +} + +static void __init copy_cpu_funcs(struct pstate_funcs *funcs) +{ + pstate_funcs.get_max = funcs->get_max; + pstate_funcs.get_min = funcs->get_min; + pstate_funcs.get_turbo = funcs->get_turbo; + pstate_funcs.get_scaling = funcs->get_scaling; + pstate_funcs.set = funcs->set; + pstate_funcs.get_vid = funcs->get_vid; +} + +int __init intel_pstate_init(void) +{ + int cpu, rc = 0; + const struct x86_cpu_id *id; + struct cpu_defaults *cpu_info; + + id = x86_match_cpu(intel_pstate_cpu_ids); + if (!id) + return -ENODEV; + + cpu_info = (struct cpu_defaults *)id->driver_data; + + copy_pid_params(&cpu_info->pid_policy); + copy_cpu_funcs(&cpu_info->funcs); + + if (intel_pstate_msrs_not_valid()) + return -ENODEV; + + all_cpu_data = xzalloc_array(struct cpudata *, NR_CPUS); + if (!all_cpu_data) + return -ENOMEM; + + rc = cpufreq_register_driver(&intel_pstate_driver); + if (rc) + goto out; + + return rc; +out: + for_each_online_cpu(cpu) { + if (all_cpu_data[cpu]) { + kill_timer(&all_cpu_data[cpu]->timer); + xfree(all_cpu_data[cpu]); + } + } + xfree(all_cpu_data); + return -ENODEV; +} diff --git a/xen/include/asm-x86/cpufreq.h b/xen/include/asm-x86/cpufreq.h new file mode 100644 index 0000000..94410f8 --- /dev/null +++ b/xen/include/asm-x86/cpufreq.h @@ -0,0 +1,34 @@ +#ifndef _ASM_X86_CPUFREQ_H +#define _ASM_X86_CPUFREQ_H + +/* + * Copyright (C) 2015 Wei Wang <wei.w.wang@xxxxxxxxx> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +extern int intel_pstate_init(void); + +/* + * Maximum transition latency is in nanoseconds - if it's unknown, + * CPUFREQ_ETERNAL shall be used. + */ +#define CPUFREQ_ETERNAL (-1) + +#endif /* _ASM_X86_CPUFREQ_H */ diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h index 83f2f70..57945d9 100644 --- a/xen/include/asm-x86/msr-index.h +++ b/xen/include/asm-x86/msr-index.h @@ -52,6 +52,8 @@ #define MSR_IA32_MCG_STATUS 0x0000017a #define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad + #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 @@ -319,6 +321,7 @@ #define MSR_IA32_MISC_ENABLE_MONITOR_ENABLE (1<<18) #define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1<<22) #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1<<23) +#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL<<38) #define MSR_IA32_TSC_DEADLINE 0x000006E0 #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 -- 1.9.1 _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.