Xen project Mailing List

Re: [Xen-devel] [PATCH v6 3/6] x86/intel_pstate: the main body of the intel_pstate driver

From: "Jan Beulich" <JBeulich@xxxxxxxx>

Date: Fri, 20 Nov 2015 07:09:30 -0700

Cc: andrew.cooper3@xxxxxxxxxx, wei.liu2@xxxxxxxxxx, xen-devel@xxxxxxxxxxxxx

Delivery-date: Fri, 20 Nov 2015 14:09:42 +0000

List-id: Xen developer discussion <xen-devel.lists.xen.org>

>>> On 28.10.15 at 04:21, <wei.w.wang@xxxxxxxxx> wrote: > --- /dev/null > +++ b/xen/arch/x86/acpi/cpufreq/intel_pstate.c > @@ -0,0 +1,882 @@ > +#include <xen/kernel.h> > +#include <xen/types.h> > +#include <xen/init.h> > +#include <xen/bitmap.h> > +#include <xen/cpumask.h> > +#include <xen/timer.h> > +#include <asm/msr.h> > +#include <asm/msr-index.h> > +#include <asm/processor.h> > +#include <asm/div64.h> > +#include <asm/cpufreq.h> > +#include <acpi/cpufreq/cpufreq.h> > + > +#define BYT_RATIOS 0x66a > +#define BYT_VIDS 0x66b > +#define BYT_TURBO_RATIOS 0x66c > +#define BYT_TURBO_VIDS 0x66d Considering these are MSR names, the acronym MSR should appear in them. > +#define FRAC_BITS 8 > +#define int_tofp(X) ((uint64_t)(X) << FRAC_BITS) > +#define fp_toint(X) ((X) >> FRAC_BITS) > + > +static inline uint32_t mul_fp(uint32_t x, uint32_t y) > +{ > + return ((uint64_t)x * (uint64_t)y) >> FRAC_BITS; Casting just one side of the * would suffice. Also please don't open-code fp_to_int() ... > +static inline uint32_t div_fp(uint32_t x, uint32_t y) > +{ > + return div_s64((uint64_t)x << FRAC_BITS, y); ... or int_to_fp() (for this one it would be a different thing if you removed the - apparently pointless - cast from the macro). > +static inline uint32_t ceiling_fp(uint32_t x) > +{ > + uint32_t mask, ret; > + > + ret = fp_toint(x); > + mask = (1 << FRAC_BITS) - 1; > + if ( x & mask ) > + ret += 1; > + return ret; > +} Blank line before final return statement please. Code further down suggests there may be a sign in these fixed point numbers, yet here you converted _everything_ to uintNN_t. Did you go too far? What sense does e.g. div_s64() on a uint64_t make? > +struct cpudata { > + int cpu; unsigned int (but I wonder if this is really needed, see the comment at the allocation site) > +static signed int pid_calc(struct _pid *pid, uint32_t busy) > +{ > + signed int result; > + int32_t pterm, dterm, fp_error; > + int32_t integral_limit; > + > + fp_error = int_tofp(pid->setpoint) - busy; > + if ( ABS(fp_error) <= int_tofp(pid->deadband) ) int_tofp() currently returning an unsigned value - why ABS()? > +static inline void intel_pstate_reset_all_pid(void) > +{ > + uint32_t cpu; unsigned int > +static inline void update_turbo_state(struct cpufreq_policy *policy) > +{ > + uint64_t misc_en; > + struct cpudata *cpu; > + > + cpu = all_cpu_data[policy->cpu]; > + rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); > + policy->limits.turbo_disabled = > + (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE || > + cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); At least the & needs parenthesization. > +static void byt_set_pstate(struct perf_limits *limits, > + struct cpudata *cpudata, uint32_t pstate) > +{ > + uint64_t val; > + uint32_t vid_fp; > + uint32_t vid; > + > + val = pstate << 8; Didn't we agree not to have any unnamed literals? > + if ( limits->no_turbo && !limits->turbo_disabled ) > + val |= (uint64_t)1 << BYT_TURBO_CONTROL_BIT; Perhaps the sense of that bit isn't what one would imply at first glance, but the combination of conditions in the if() looks certainly odd. Please confirm this is really intended to be this way (which should also include pointing out what meaning the bit being on has). > + vid_fp = cpudata->vid.min + mul_fp( > + int_tofp(pstate - cpudata->pstate.min_pstate), > + cpudata->vid.ratio); > + > + vid_fp = clamp(vid_fp, cpudata->vid.min, cpudata->vid.max); > + vid = ceiling_fp(vid_fp); > + > + if ( pstate > cpudata->pstate.max_pstate ) > + vid = cpudata->vid.turbo; The prior calculation of vid (and vid_fp) is completely pointless in this case. Also - what if limits->no_turbo or limits->turbo_disabled? > + val |= vid; > + > + wrmsrl(MSR_IA32_PERF_CTL, val); > +} > + > +#define BYT_BCLK_FREQS 5 > +#define TO_FREQ_TABLE_IDX_MASK 0x7 > +static uint32_t byt_get_scaling(void) Missing blank line. > +{ > + const uint32_t byt_freq_table[BYT_BCLK_FREQS] = static > + {833, 1000, 1333, 1167, 800}; > + uint64_t value; > + int i; > + > + rdmsrl(MSR_FSB_FREQ, value); > + i = value & TO_FREQ_TABLE_IDX_MASK; With this calculation i is hardly a signed quantity. > +#define CORE_MIN_PSTATE(val) (((value) >> 40) & 0xff) > +#define CORE_MAX_PSTATE(val) (((value) >> 8) & 0xff) > +#define CORE_TURBO_PSTATE(value) ((value) & 0xff) > +static uint32_t core_get_min_pstate(void) > +{ > + uint64_t value; > + > + rdmsrl(MSR_INTEL_PLATFORM_INFO, value); > + return CORE_MIN_PSTATE(val); The fact that (presumably) don't get a build error here is because the mix of variable names here happens to invert the mix of names in the macros ahead of the function. > +static void core_set_pstate(struct perf_limits *limits, > + struct cpudata *cpudata, uint32_t pstate) > +{ > + uint64_t val; > + > + val = pstate << 8; > + if ( limits->no_turbo && !limits->turbo_disabled ) > + val |= (uint64_t)1 << CORE_TURBO_CONTROL_BIT; > + > + wrmsrl(MSR_IA32_PERF_CTL, val); > +} > + > +static __initconst struct cpu_defaults core_params = { static const struct cpu_defaults __initconst core_params = { > +static inline void intel_pstate_calc_busy(struct cpudata *cpu) > +{ > + struct sample *sample = &cpu->sample; > + uint64_t core_pct; > + > + core_pct = int_tofp(sample->aperf) * int_tofp(100); > + core_pct = div64_u64(core_pct, int_tofp(sample->mperf)); > + > + sample->freq = fp_toint(mul_fp(int_tofp(cpu->pstate.max_pstate * > + cpu->pstate.scaling / 100), core_pct)); > + > + sample->core_pct_busy = (int32_t)core_pct; Pointless cast. > +static inline void intel_pstate_sample(struct cpudata *cpu) > +{ > + uint64_t aperf, mperf; > + unsigned long flags; > + > + local_irq_save(flags); > + rdmsrl(MSR_IA32_APERF, aperf); > + rdmsrl(MSR_IA32_MPERF, mperf); > + local_irq_restore(flags); > + > + cpu->last_sample_time = cpu->sample.time; > + cpu->sample.time = get_s_time(); > + cpu->sample.aperf = aperf; > + cpu->sample.mperf = mperf; > + cpu->sample.aperf -= cpu->prev_aperf; > + cpu->sample.mperf -= cpu->prev_mperf; Can these two please be done in a single statement each? > + intel_pstate_calc_busy(cpu); > + > + cpu->prev_aperf = aperf; > + cpu->prev_mperf = mperf; And these two be moved up (intel_pstate_calc_busy() doesn't appear to be using the updated fields)? > +static void intel_pstate_timer_func(void *data) > +{ > + struct cpudata *cpu = (struct cpudata *) data; Pointless cast. > +static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) > +{ > + int cpu_num = policy->cpu; unsigned int > + struct cpudata *cpu = all_cpu_data[cpu_num]; > + > + kill_timer(&all_cpu_data[cpu_num]->timer); > + > + intel_pstate_set_pstate(policy, cpu, cpu->pstate.min_pstate); Why min and not max? Or whatever the CPU was in when the governor initialized the CPU? > +static int intel_pstate_turbo_update(int cpuid, struct cpufreq_policy > *policy) > +{ > + struct cpudata *cpu = all_cpu_data[policy->cpu]; > + struct perf_limits *limits = &policy->limits; > + > + update_turbo_state(policy); > + if ( limits->turbo_disabled ) > + { > + printk("Turbo disabled by BIOS or not supported on CPU\n"); > + return -EINVAL; > + } I'm afraid this message may get printed more often than you'd want (it should be printed at most once during any session). > + limits->no_turbo = policy->turbo == CPUFREQ_TURBO_ENABLED ? 0 : 1; limits->no_turbo = policy->turbo != CPUFREQ_TURBO_ENABLED; > +#define INTEL_PSTATE_GOV_NUM 4 > +static struct internal_governor* intel_pstate_internal_gov_init(void) > +{ > + unsigned int i = 0; > + struct internal_governor *gov; > + char *avail_gov; > + > + gov = xzalloc(struct internal_governor); > + if ( !gov ) > + return NULL; > + avail_gov = xzalloc_array(char, > + INTEL_PSTATE_GOV_NUM * CPUFREQ_NAME_LEN); > + if ( !avail_gov ) > + return NULL; > + > + gov->avail_gov = avail_gov; > + > + i += scnprintf(&avail_gov[0], CPUFREQ_NAME_LEN, "%s ", "performance"); i = ... (and the initializer dropped above). > + i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "powersave"); > + i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "userspace"); > + i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "ondemand"); > + avail_gov[i-1] = '\0'; Coding style. But overall I wonder whether this couldn't be done in a less convoluted way (e.g. a single strlcpy(), or a strlcpy() followed by three strlcat()s). In no case is the indirection via %s warranted. > + gov->gov_num = INTEL_PSTATE_GOV_NUM; > + gov->cur_gov = INTERNAL_GOV_ONDEMAND; Shouldn't this depend on the "cpufreq=" option setting (if any)? > +static int intel_pstate_cpu_setup(struct cpufreq_policy *policy) > +{ > + struct cpudata *cpu; > + struct perf_limits *limits = &policy->limits; > + int rc; > + > + rc = intel_pstate_init_cpu(policy->cpu); Please use reasonably consistent naming (compare the name of the function we're in and that of the called function). > + if ( rc ) > + return rc; > + > + policy->internal_gov = intel_pstate_internal_gov_init(); > + if ( !policy->internal_gov ) > + return -ENOMEM; > + > + cpu = all_cpu_data[policy->cpu]; > + policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; > + policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; DYM .max_pstate here? > + /* cpuinfo and default policy values */ > + policy->cpuinfo.min_freq = cpu->pstate.min_pstate * > + cpu->pstate.scaling; > + policy->cpuinfo.max_freq = cpu->pstate.turbo_pstate * > + cpu->pstate.scaling; Same here? Or if intentional, then I think a comment is warranted, and policy->turbo would then also seem to need initializing to 1 (that actually might eliminate the need for a separate comment). > + limits->min_policy_pct = clamp_t(uint32_t, > + limits->min_policy_pct, 0, 100); > + limits->max_policy_pct = (policy->max * 100) / > + policy->cpuinfo.max_freq; > + limits->max_policy_pct = clamp_t(uint32_t, > + limits->max_policy_pct, 0, 100); These appear to be the only two uses of clamp_t(), which can be easily made just clamp(). Hence I don't see the need for introducing clamp_t() in the first place. > +static void __init copy_pid_params(struct pstate_adjust_policy *policy) const > +{ > + pid_params.sample_rate_ms = policy->sample_rate_ms; > + pid_params.p_gain_pct = policy->p_gain_pct; > + pid_params.i_gain_pct = policy->i_gain_pct; > + pid_params.d_gain_pct = policy->d_gain_pct; > + pid_params.deadband = policy->deadband; > + pid_params.setpoint = policy->setpoint; Why not just pid_params = *policy; ? > +static void __init copy_cpu_funcs(struct pstate_funcs *funcs) const > +{ > + pstate_funcs.get_max = funcs->get_max; > + pstate_funcs.get_min = funcs->get_min; > + pstate_funcs.get_turbo = funcs->get_turbo; > + pstate_funcs.get_scaling = funcs->get_scaling; > + pstate_funcs.set = funcs->set; > + pstate_funcs.get_vid = funcs->get_vid; And again pstate_funcs = *funcs; ? > +int __init intel_pstate_init(void) > +{ > + int cpu, rc = 0; unsigned int cpu; int rc; > + const struct x86_cpu_id *id; > + struct cpu_defaults *cpu_info; > + static bool_t load; __initdata (and I'm sure I said so before). > + boolean_param("intel_pstate", load); > + > + if ( !load ) > + return -ENODEV; > + > + id = x86_match_cpu(intel_pstate_cpu_ids); > + if ( !id ) > + return -ENODEV; > + > + cpu_info = (struct cpu_defaults *)id->driver_data; This casts away const-ness, which is why casts should be avoided wherever possible (and the more when they're pointless like this one). > + copy_pid_params(&cpu_info->pid_policy); > + copy_cpu_funcs(&cpu_info->funcs); > + > + if ( intel_pstate_msrs_not_valid() ) > + return -ENODEV; This reads as if the function returned a boolean. Either make it so (and the perhaps invert the sense and drop the "not" from the name) or use the function's return value (in which the function should perhaps be named ..._validate or some such). > + all_cpu_data = xzalloc_array(struct cpudata *, NR_CPUS); Why NR_CPUS? Can't this be a per-CPU item anyway? > + if ( !all_cpu_data ) > + return -ENOMEM; > + > + rc = cpufreq_register_driver(&intel_pstate_driver); > + if ( rc ) > + goto out; > + > + return rc; Looks like this could be "return 0", albeit ... > +out: ... just a single path leading here (and the comment below taken into account) just doing the cleanup in the if() body above would seem more reasonable. Otherwise - labels indented by at least one space please. > + for_each_online_cpu(cpu) > + { > + if ( all_cpu_data[cpu] ) > + { > + kill_timer(&all_cpu_data[cpu]->timer); > + xfree(all_cpu_data[cpu]); > + } > + } Why is all of this needed? If cpufreq_register_driver() (or one of its descendants) fails, it should clean up after itself. > --- /dev/null > +++ b/xen/include/asm-x86/cpufreq.h > @@ -0,0 +1,31 @@ > +#ifndef _ASM_X86_CPUFREQ_H > +#define _ASM_X86_CPUFREQ_H > + > +/* > + * Copyright (C) 2015, Intel Corporation. > + * Wei Wang <wei.w.wang@xxxxxxxxx> > + * > + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; If not, see <http://www.gnu.org/licenses/>. > + * > + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > + */ > + > +/* > + * Maximum transition latency is in nanoseconds - if it's unknown, > + * CPUFREQ_ETERNAL shall be used. > + */ > +#define CPUFREQ_ETERNAL (-1) So what is x86-specific about this #define? I.e. why can't this go into an existing header? Jan _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.