[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v6 3/6] x86/intel_pstate: the main body of the intel_pstate driver



>>> On 28.10.15 at 04:21, <wei.w.wang@xxxxxxxxx> wrote:
> --- /dev/null
> +++ b/xen/arch/x86/acpi/cpufreq/intel_pstate.c
> @@ -0,0 +1,882 @@
> +#include <xen/kernel.h>
> +#include <xen/types.h>
> +#include <xen/init.h>
> +#include <xen/bitmap.h>
> +#include <xen/cpumask.h>
> +#include <xen/timer.h>
> +#include <asm/msr.h>
> +#include <asm/msr-index.h>
> +#include <asm/processor.h>
> +#include <asm/div64.h>
> +#include <asm/cpufreq.h>
> +#include <acpi/cpufreq/cpufreq.h>
> +
> +#define BYT_RATIOS       0x66a
> +#define BYT_VIDS         0x66b
> +#define BYT_TURBO_RATIOS 0x66c
> +#define BYT_TURBO_VIDS   0x66d

Considering these are MSR names, the acronym MSR should appear in
them.

> +#define FRAC_BITS 8
> +#define int_tofp(X) ((uint64_t)(X) << FRAC_BITS)
> +#define fp_toint(X) ((X) >> FRAC_BITS)
> +
> +static inline uint32_t mul_fp(uint32_t x, uint32_t y)
> +{
> +    return ((uint64_t)x * (uint64_t)y) >> FRAC_BITS;

Casting just one side of the * would suffice. Also please don't
open-code fp_to_int() ...

> +static inline uint32_t div_fp(uint32_t x, uint32_t y)
> +{
> +    return div_s64((uint64_t)x << FRAC_BITS, y);

... or int_to_fp() (for this one it would be a different thing if you
removed the - apparently pointless - cast from the macro).

> +static inline uint32_t ceiling_fp(uint32_t x)
> +{
> +    uint32_t mask, ret;
> +
> +    ret = fp_toint(x);
> +    mask = (1 << FRAC_BITS) - 1;
> +    if ( x & mask )
> +        ret += 1;
> +    return ret;
> +}

Blank line before final return statement please.

Code further down suggests there may be a sign in these fixed
point numbers, yet here you converted _everything_ to uintNN_t.
Did you go too far? What sense does e.g. div_s64() on a uint64_t
make?

> +struct cpudata {
> +    int cpu;

unsigned int (but I wonder if this is really needed, see the comment
at the allocation site)

> +static signed int pid_calc(struct _pid *pid, uint32_t busy)
> +{
> +    signed int result;
> +    int32_t pterm, dterm, fp_error;
> +    int32_t integral_limit;
> +
> +    fp_error = int_tofp(pid->setpoint) - busy;
> +    if ( ABS(fp_error) <= int_tofp(pid->deadband) )

int_tofp() currently returning an unsigned value - why ABS()?

> +static inline void intel_pstate_reset_all_pid(void)
> +{
> +    uint32_t cpu;

unsigned int

> +static inline void update_turbo_state(struct cpufreq_policy *policy)
> +{
> +    uint64_t misc_en;
> +    struct cpudata *cpu;
> +
> +    cpu = all_cpu_data[policy->cpu];
> +    rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
> +    policy->limits.turbo_disabled =
> +        (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ||
> +            cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);

At least the & needs parenthesization.

> +static void byt_set_pstate(struct perf_limits *limits,
> +                struct cpudata *cpudata, uint32_t pstate)
> +{
> +    uint64_t val;
> +    uint32_t vid_fp;
> +    uint32_t vid;
> +
> +    val = pstate << 8;

Didn't we agree not to have any unnamed literals?

> +    if ( limits->no_turbo && !limits->turbo_disabled )
> +        val |= (uint64_t)1 << BYT_TURBO_CONTROL_BIT;

Perhaps the sense of that bit isn't what one would imply at first
glance, but the combination of conditions in the if() looks certainly
odd. Please confirm this is really intended to be this way (which
should also include pointing out what meaning the bit being on
has).

> +    vid_fp = cpudata->vid.min + mul_fp(
> +        int_tofp(pstate - cpudata->pstate.min_pstate),
> +        cpudata->vid.ratio);
> +
> +    vid_fp = clamp(vid_fp, cpudata->vid.min, cpudata->vid.max);
> +    vid = ceiling_fp(vid_fp);
> +
> +    if ( pstate > cpudata->pstate.max_pstate )
> +        vid = cpudata->vid.turbo;

The prior calculation of vid (and vid_fp) is completely pointless in
this case. Also - what if limits->no_turbo or limits->turbo_disabled?

> +    val |= vid;
> +
> +    wrmsrl(MSR_IA32_PERF_CTL, val);
> +}
> +
> +#define BYT_BCLK_FREQS 5
> +#define TO_FREQ_TABLE_IDX_MASK 0x7
> +static uint32_t byt_get_scaling(void)

Missing blank line.

> +{
> +    const uint32_t byt_freq_table[BYT_BCLK_FREQS] =

static

> +                   {833, 1000, 1333, 1167, 800};
> +    uint64_t value;
> +    int i;
> +
> +    rdmsrl(MSR_FSB_FREQ, value);
> +    i = value & TO_FREQ_TABLE_IDX_MASK;

With this calculation i is hardly a signed quantity.

> +#define CORE_MIN_PSTATE(val) (((value) >> 40) & 0xff)
> +#define CORE_MAX_PSTATE(val) (((value) >> 8) & 0xff)
> +#define CORE_TURBO_PSTATE(value) ((value) & 0xff)
> +static uint32_t core_get_min_pstate(void)
> +{
> +    uint64_t value;
> +
> +    rdmsrl(MSR_INTEL_PLATFORM_INFO, value);
> +    return CORE_MIN_PSTATE(val);

The fact that (presumably) don't get a build error here is because
the mix of variable names here happens to invert the mix of names
in the macros ahead of the function.

> +static void core_set_pstate(struct perf_limits *limits,
> +                            struct cpudata *cpudata, uint32_t pstate)
> +{
> +    uint64_t val;
> +
> +    val = pstate << 8;
> +    if ( limits->no_turbo && !limits->turbo_disabled )
> +        val |= (uint64_t)1 << CORE_TURBO_CONTROL_BIT;
> +
> +    wrmsrl(MSR_IA32_PERF_CTL, val);
> +}
> +
> +static __initconst struct cpu_defaults core_params = {

static const struct cpu_defaults __initconst core_params = {

> +static inline void intel_pstate_calc_busy(struct cpudata *cpu)
> +{
> +    struct sample *sample = &cpu->sample;
> +    uint64_t core_pct;
> +
> +    core_pct = int_tofp(sample->aperf) * int_tofp(100);
> +    core_pct = div64_u64(core_pct, int_tofp(sample->mperf));
> +
> +    sample->freq = fp_toint(mul_fp(int_tofp(cpu->pstate.max_pstate *
> +                                   cpu->pstate.scaling / 100), core_pct));
> +
> +    sample->core_pct_busy = (int32_t)core_pct;

Pointless cast.

> +static inline void intel_pstate_sample(struct cpudata *cpu)
> +{
> +    uint64_t aperf, mperf;
> +    unsigned long flags;
> +
> +    local_irq_save(flags);
> +    rdmsrl(MSR_IA32_APERF, aperf);
> +    rdmsrl(MSR_IA32_MPERF, mperf);
> +    local_irq_restore(flags);
> +
> +    cpu->last_sample_time = cpu->sample.time;
> +    cpu->sample.time = get_s_time();
> +    cpu->sample.aperf = aperf;
> +    cpu->sample.mperf = mperf;
> +    cpu->sample.aperf -= cpu->prev_aperf;
> +    cpu->sample.mperf -= cpu->prev_mperf;

Can these two please be done in a single statement each?

> +    intel_pstate_calc_busy(cpu);
> +
> +    cpu->prev_aperf = aperf;
> +    cpu->prev_mperf = mperf;

And these two be moved up (intel_pstate_calc_busy() doesn't
appear to be using the updated fields)?

> +static void intel_pstate_timer_func(void *data)
> +{
> +    struct cpudata *cpu = (struct cpudata *) data;

Pointless cast.

> +static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
> +{
> +    int cpu_num = policy->cpu;

unsigned int

> +    struct cpudata *cpu = all_cpu_data[cpu_num];
> +
> +    kill_timer(&all_cpu_data[cpu_num]->timer);
> +
> +    intel_pstate_set_pstate(policy, cpu, cpu->pstate.min_pstate);

Why min and not max? Or whatever the CPU was in when the
governor initialized the CPU?

> +static int intel_pstate_turbo_update(int cpuid, struct cpufreq_policy 
> *policy)
> +{
> +    struct cpudata *cpu = all_cpu_data[policy->cpu];
> +    struct perf_limits *limits = &policy->limits;
> +
> +    update_turbo_state(policy);
> +    if ( limits->turbo_disabled )
> +    {
> +        printk("Turbo disabled by BIOS or not supported on CPU\n");
> +        return -EINVAL;
> +    }

I'm afraid this message may get printed more often than you'd
want (it should be printed at most once during any session).

> +    limits->no_turbo = policy->turbo == CPUFREQ_TURBO_ENABLED ? 0 : 1;

limits->no_turbo = policy->turbo != CPUFREQ_TURBO_ENABLED;

> +#define INTEL_PSTATE_GOV_NUM 4
> +static struct internal_governor* intel_pstate_internal_gov_init(void)
> +{
> +    unsigned int i = 0;
> +    struct internal_governor *gov;
> +    char *avail_gov;
> +
> +    gov = xzalloc(struct internal_governor);
> +    if ( !gov )
> +        return NULL;
> +    avail_gov = xzalloc_array(char,
> +            INTEL_PSTATE_GOV_NUM * CPUFREQ_NAME_LEN);
> +    if ( !avail_gov )
> +        return NULL;
> +
> +    gov->avail_gov = avail_gov;
> +
> +    i += scnprintf(&avail_gov[0], CPUFREQ_NAME_LEN, "%s ", "performance");

i = ... (and the initializer dropped above).

> +    i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "powersave");
> +    i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "userspace");
> +    i += scnprintf(&avail_gov[i], CPUFREQ_NAME_LEN, "%s ", "ondemand");
> +    avail_gov[i-1] = '\0';

Coding style.

But overall I wonder whether this couldn't be done in a less
convoluted way (e.g. a single strlcpy(), or a strlcpy() followed by
three strlcat()s). In no case is the indirection via %s warranted.

> +    gov->gov_num = INTEL_PSTATE_GOV_NUM;
> +    gov->cur_gov = INTERNAL_GOV_ONDEMAND;

Shouldn't this depend on the "cpufreq=" option setting (if any)?

> +static int intel_pstate_cpu_setup(struct cpufreq_policy *policy)
> +{
> +    struct cpudata *cpu;
> +    struct perf_limits *limits = &policy->limits;
> +    int rc;
> +
> +    rc = intel_pstate_init_cpu(policy->cpu);

Please use reasonably consistent naming (compare the name of the
function we're in and that of the called function).

> +    if ( rc )
> +        return rc;
> +
> +    policy->internal_gov = intel_pstate_internal_gov_init();
> +    if ( !policy->internal_gov )
> +        return -ENOMEM;
> +
> +    cpu = all_cpu_data[policy->cpu];
> +    policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
> +    policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;

DYM .max_pstate here?

> +    /* cpuinfo and default policy values */
> +    policy->cpuinfo.min_freq = cpu->pstate.min_pstate *
> +                               cpu->pstate.scaling;
> +    policy->cpuinfo.max_freq = cpu->pstate.turbo_pstate *
> +                               cpu->pstate.scaling;

Same here? Or if intentional, then I think a comment is warranted,
and policy->turbo would then also seem to need initializing to 1
(that actually might eliminate the need for a separate comment).

> +    limits->min_policy_pct = clamp_t(uint32_t,
> +                                     limits->min_policy_pct, 0, 100);
> +    limits->max_policy_pct = (policy->max * 100) /
> +                             policy->cpuinfo.max_freq;
> +    limits->max_policy_pct = clamp_t(uint32_t,
> +                                     limits->max_policy_pct, 0, 100);

These appear to be the only two uses of clamp_t(), which can be
easily made just clamp(). Hence I don't see the need for introducing
clamp_t() in the first place.

> +static void __init copy_pid_params(struct pstate_adjust_policy *policy)

const

> +{
> +    pid_params.sample_rate_ms = policy->sample_rate_ms;
> +    pid_params.p_gain_pct = policy->p_gain_pct;
> +    pid_params.i_gain_pct = policy->i_gain_pct;
> +    pid_params.d_gain_pct = policy->d_gain_pct;
> +    pid_params.deadband = policy->deadband;
> +    pid_params.setpoint = policy->setpoint;

Why not just

    pid_params = *policy;

?

> +static void __init copy_cpu_funcs(struct pstate_funcs *funcs)

const

> +{
> +    pstate_funcs.get_max   = funcs->get_max;
> +    pstate_funcs.get_min   = funcs->get_min;
> +    pstate_funcs.get_turbo = funcs->get_turbo;
> +    pstate_funcs.get_scaling = funcs->get_scaling;
> +    pstate_funcs.set       = funcs->set;
> +    pstate_funcs.get_vid   = funcs->get_vid;

And again

    pstate_funcs = *funcs;

?

> +int __init intel_pstate_init(void)
> +{
> +    int cpu, rc = 0;

unsigned int cpu;
int rc;

> +    const struct x86_cpu_id *id;
> +    struct cpu_defaults *cpu_info;
> +    static bool_t load;

__initdata (and I'm sure I said so before).

> +    boolean_param("intel_pstate", load);
> +
> +    if ( !load )
> +        return -ENODEV;
> +
> +    id = x86_match_cpu(intel_pstate_cpu_ids);
> +    if ( !id )
> +        return -ENODEV;
> +
> +    cpu_info = (struct cpu_defaults *)id->driver_data;

This casts away const-ness, which is why casts should be avoided
wherever possible (and the more when they're pointless like this one).

> +    copy_pid_params(&cpu_info->pid_policy);
> +    copy_cpu_funcs(&cpu_info->funcs);
> +
> +    if ( intel_pstate_msrs_not_valid() )
> +        return -ENODEV;

This reads as if the function returned a boolean. Either make it so
(and the perhaps invert the sense and drop the "not" from the
name) or use the function's return value (in which the function
should perhaps be named ..._validate or some such).

> +    all_cpu_data = xzalloc_array(struct cpudata *, NR_CPUS);

Why NR_CPUS? Can't this be a per-CPU item anyway?

> +    if ( !all_cpu_data )
> +        return -ENOMEM;
> +
> +    rc = cpufreq_register_driver(&intel_pstate_driver);
> +    if ( rc )
> +        goto out;
> +
> +    return rc;

Looks like this could be "return 0", albeit ...

> +out:

... just a single path leading here (and the comment below taken
into account) just doing the cleanup in the if() body above would
seem more reasonable. Otherwise - labels indented by at least one
space please.

> +    for_each_online_cpu(cpu)
> +    {
> +        if ( all_cpu_data[cpu] )
> +        {
> +            kill_timer(&all_cpu_data[cpu]->timer);
> +            xfree(all_cpu_data[cpu]);
> +        }
> +    }

Why is all of this needed? If cpufreq_register_driver() (or one of its
descendants) fails, it should clean up after itself.

> --- /dev/null
> +++ b/xen/include/asm-x86/cpufreq.h
> @@ -0,0 +1,31 @@
> +#ifndef _ASM_X86_CPUFREQ_H
> +#define _ASM_X86_CPUFREQ_H
> +
> +/*
> + *  Copyright (C) 2015, Intel Corporation.
> + *  Wei Wang <wei.w.wang@xxxxxxxxx>
> + *
> + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License, or
> + *  (at your option) any later version.
> + *
> + *  This program is distributed in the hope that it will be useful,
> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + *  GNU General Public License for more details.
> + *
> + *  You should have received a copy of the GNU General Public License
> + *  along with this program; If not, see <http://www.gnu.org/licenses/>.
> + *
> + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> + */
> +
> +/*
> + * Maximum transition latency is in nanoseconds - if it's unknown,
> + * CPUFREQ_ETERNAL shall be used.
> + */
> +#define CPUFREQ_ETERNAL        (-1)

So what is x86-specific about this #define? I.e. why can't this go
into an existing header?

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.