[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v6 06/15] cpufreq: Add Hardware P-State (HWP) driver



On Mon, Jul 24, 2023 at 12:15 PM Jan Beulich <jbeulich@xxxxxxxx> wrote:
>
> On 24.07.2023 14:58, Jason Andryuk wrote:
> > --- /dev/null
> > +++ b/xen/arch/x86/acpi/cpufreq/hwp.c
> > @@ -0,0 +1,521 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * hwp.c cpufreq driver to run Intel Hardware P-States (HWP)
> > + *
> > + * Copyright (C) 2021 Jason Andryuk <jandryuk@xxxxxxxxx>
> > + */
> > +
> > +#include <xen/cpumask.h>
> > +#include <xen/init.h>
> > +#include <xen/param.h>
> > +#include <xen/xmalloc.h>
> > +#include <asm/msr.h>
> > +#include <acpi/cpufreq/cpufreq.h>
> > +
> > +static bool __ro_after_init feature_hwp_notification;
> > +static bool __ro_after_init feature_hwp_activity_window;
> > +
> > +static bool __ro_after_init feature_hdc;
> > +
> > +static bool __ro_after_init opt_cpufreq_hdc = true;
> > +
> > +union hwp_request
> > +{
> > +    struct
> > +    {
> > +        unsigned int min_perf:8;
> > +        unsigned int max_perf:8;
> > +        unsigned int desired:8;
> > +        unsigned int energy_perf:8;
> > +        unsigned int activity_window:10;
> > +        bool package_control:1;
> > +        unsigned int :16;
> > +        bool activity_window_valid:1;
> > +        bool energy_perf_valid:1;
> > +        bool desired_valid:1;
> > +        bool max_perf_valid:1;
> > +        bool min_perf_valid:1;
> > +    };
> > +    uint64_t raw;
> > +};
> > +
> > +struct hwp_drv_data
> > +{
> > +    union
> > +    {
> > +        uint64_t hwp_caps;
> > +        struct
> > +        {
> > +            unsigned int highest:8;
> > +            unsigned int guaranteed:8;
> > +            unsigned int most_efficient:8;
> > +            unsigned int lowest:8;
> > +            unsigned int :32;
> > +        } hw;
> > +    };
> > +    union hwp_request curr_req;
> > +    int ret;
> > +    uint16_t activity_window;
> > +    uint8_t minimum;
> > +    uint8_t maximum;
> > +    uint8_t desired;
> > +    uint8_t energy_perf;
> > +};
> > +static DEFINE_PER_CPU_READ_MOSTLY(struct hwp_drv_data *, hwp_drv_data);
> > +
> > +#define hwp_err(cpu, fmt, ...) \
> > +    printk(XENLOG_ERR "HWP: CPU%u error: " fmt, cpu, ##__VA_ARGS__)
> > +#define hwp_info(fmt, ...)    printk(XENLOG_INFO "HWP: " fmt, 
> > ##__VA_ARGS__)
>
> I'm not convinced ", ##__VA_ARGS__" is a good construct to use. I notice
> we have a few instances (mainly in code inherited from elsewhere), but I
> think it ought to either be plain C99 style (without the ##) or purely
> the gcc extension form (not using __VA_ARGS__).

Using plain __VA_ARGS__ doesn't work for the cases without arguments:
arch/x86/acpi/cpufreq/hwp.c:78:53: error: expected expression before ‘)’ token
   78 |         printk(XENLOG_DEBUG "HWP: " fmt, __VA_ARGS__);  \
      |                                                     ^
arch/x86/acpi/cpufreq/hwp.c:201:9: note: in expansion of macro ‘hwp_verbose’
  201 |         hwp_verbose("disabled: No energy/performance
preference available");
      |         ^~~~~~~~~~~

I can use "__VA_OPT__(,) __VA_ARGS__" though.

> > +#define hwp_verbose(fmt, ...)                             \
> > +({                                                        \
> > +    if ( cpufreq_verbose )                                \
> > +        printk(XENLOG_DEBUG "HWP: " fmt, ##__VA_ARGS__);  \
>
> (One more here then.)
>
> > +static bool hwp_handle_option(const char *s, const char *end)
>
> __init?

Yes, thanks.

> > +static void hwp_get_cpu_speeds(struct cpufreq_policy *policy)
> > +{
> > +    uint32_t base_khz, max_khz, bus_khz, edx;
> > +
> > +    cpuid(0x16, &base_khz, &max_khz, &bus_khz, &edx);
> > +
> > +    policy->cpuinfo.perf_freq = base_khz * 1000;
> > +    policy->cpuinfo.min_freq = base_khz * 1000;
> > +    policy->cpuinfo.max_freq = max_khz * 1000;
> > +    policy->min = base_khz * 1000;
> > +    policy->max = max_khz * 1000;
>
> Earlier on I asked what about cases where the CPUID output yields
> some zero values (as I know can happen). Iirc you said this doesn't
> affect functionality, but wouldn't it make sense to have a comment
> to this effect here (proving the cases were thought through).

Sure.

> > +static void cf_check hwp_init_msrs(void *info)
> > +{
> > +    struct cpufreq_policy *policy = info;
> > +    struct hwp_drv_data *data = this_cpu(hwp_drv_data);
> > +    uint64_t val;
> > +
> > +    /*
> > +     * Package level MSR, but we don't have a good idea of packages here, 
> > so
> > +     * just do it everytime.
> > +     */
> > +    if ( rdmsr_safe(MSR_PM_ENABLE, val) )
> > +    {
> > +        hwp_err(policy->cpu, "rdmsr_safe(MSR_PM_ENABLE)\n");
> > +        data->curr_req.raw = -1;
> > +        return;
> > +    }
> > +
> > +    /* Ensure we don't generate interrupts */
> > +    if ( feature_hwp_notification )
> > +        wrmsr_safe(MSR_HWP_INTERRUPT, 0);
> > +
> > +    if ( !(val & PM_ENABLE_HWP_ENABLE) )
> > +    {
> > +        val |= PM_ENABLE_HWP_ENABLE;
> > +        if ( wrmsr_safe(MSR_PM_ENABLE, val) )
> > +        {
> > +            hwp_err(policy->cpu, "wrmsr_safe(MSR_PM_ENABLE, %lx)\n", val);
> > +            data->curr_req.raw = -1;
> > +            return;
> > +        }
> > +    }
> > +
> > +    if ( rdmsr_safe(MSR_HWP_CAPABILITIES, data->hwp_caps) )
> > +    {
> > +        hwp_err(policy->cpu, "rdmsr_safe(MSR_HWP_CAPABILITIES)\n");
> > +        goto error;
> > +    }
> > +
> > +    if ( rdmsr_safe(MSR_HWP_REQUEST, data->curr_req.raw) )
> > +    {
> > +        hwp_err(policy->cpu, "rdmsr_safe(MSR_HWP_REQUEST)\n");
> > +        goto error;
> > +    }
> > +
> > +    /*
> > +     * Check for APERF/MPERF support in hardware
> > +     * also check for boost/turbo support
> > +     */
> > +    intel_feature_detect(policy);
>
> Nit: The comment could do with adding at least a comma or semicolon.

Will change to "Check for turbo support."  APERF/MPERF was removed
from intel_feature_detect() in commit 4dd16c44152f ("x86/cpufreq:
Rework APERF/MPERF handling").

> > +    if ( feature_hdc &&
> > +         (!hdc_set_pkg_hdc_ctl(policy->cpu, opt_cpufreq_hdc) ||
> > +          !hdc_set_pm_ctl1(policy->cpu, opt_cpufreq_hdc)) )
> > +    {
> > +            hwp_err(policy->cpu, "Disabling HDC support\n");
> > +            feature_hdc = false;
>
> Nit: Too deep indentation.

Yes, thanks.

> > +static int cf_check hwp_cpufreq_cpu_init(struct cpufreq_policy *policy)
> > +{
> > +    static union hwp_request initial_req;
> > +    unsigned int cpu = policy->cpu;
> > +    struct hwp_drv_data *data;
> > +    bool first_run = false;
> > +
> > +    data = xzalloc(struct hwp_drv_data);
> > +    if ( !data )
> > +        return -ENOMEM;
> > +
> > +    policy->governor = &cpufreq_gov_hwp;
> > +
> > +    per_cpu(hwp_drv_data, cpu) = data;
> > +
> > +    on_selected_cpus(cpumask_of(cpu), hwp_init_msrs, policy, 1);
> > +
> > +    if ( data->curr_req.raw == -1 )
> > +    {
> > +        hwp_err(cpu, "Could not initialize HWP properly\n");
> > +        per_cpu(hwp_drv_data, cpu) = NULL;
> > +        xfree(data);
> > +        return -ENODEV;
> > +    }
> > +
> > +    data->minimum = data->curr_req.min_perf;
> > +    data->maximum = data->curr_req.max_perf;
> > +    data->desired = data->curr_req.desired;
> > +    data->energy_perf = data->curr_req.energy_perf;
> > +    data->activity_window = data->curr_req.activity_window;
> > +
> > +    if ( initial_req.raw == 0 )
> > +    {
> > +        hwp_verbose("CPU%u: HWP_CAPABILITIES: %016lx\n", cpu, 
> > data->hwp_caps);
> > +        initial_req = data->curr_req;
> > +        first_run = true;
> > +    }
>
> What part of data->curr_req is guaranteed to be non-0 (for the condition
> around ...
>
> > +    if ( first_run || data->curr_req.raw != initial_req.raw )
> > +        hwp_verbose("CPU%u: rdmsr HWP_REQUEST %016lx\n", cpu,
> > +                    data->curr_req.raw);
>
> ... this logging to be effective)?

Hmmm.  I think you are correct that there is no guarantee that
data->curr_req will be non-zero.  i.e. a BIOS could zero the whole
register.  In practice, I see 0x8000ff01 - energy_perf = 0x80, maximum
= 0xff and minimum = 0x01.

Would you prefer that I make first_run a static variable to track if
initial_req has been populated?

> > +static void cf_check hwp_set_misc_turbo(void *info)
> > +{
> > +    const struct cpufreq_policy *policy = info;
> > +    struct hwp_drv_data *data = per_cpu(hwp_drv_data, policy->cpu);
> > +    uint64_t msr;
> > +
> > +    data->ret = 0;
> > +
> > +    if ( rdmsr_safe(MSR_IA32_MISC_ENABLE, msr) )
> > +    {
> > +        hwp_verbose("CPU%u: error rdmsr_safe(MSR_IA32_MISC_ENABLE)\n",
> > +                    policy->cpu);
> > +        data->ret = -EINVAL;
> > +
> > +        return;
> > +    }
> > +
> > +    if ( policy->turbo == CPUFREQ_TURBO_ENABLED )
> > +        msr &= ~MSR_IA32_MISC_ENABLE_TURBO_DISENGAGE;
> > +    else
> > +        msr |= MSR_IA32_MISC_ENABLE_TURBO_DISENGAGE;
> > +
> > +    if ( wrmsr_safe(MSR_IA32_MISC_ENABLE, msr) )
> > +    {
> > +        hwp_verbose("CPU%u: error wrmsr_safe(MSR_IA32_MISC_ENABLE): 
> > %016lx\n",
> > +                    policy->cpu, msr);
> > +        data->ret = -EINVAL;
> > +    }
> > +}
>
> Neither of the two -EINVAL really indicate an invalid argument that was
> passed. Maybe EACCES (or less desirably EFAULT)?

Ok, I'll use EACCES.

> > @@ -89,15 +96,45 @@ static int __init cf_check setup_cpufreq_option(const 
> > char *str)
> >          return 0;
> >      }
> >
> > -    if ( choice > 0 || !cmdline_strcmp(str, "xen") )
> > +    do
> >      {
> > -        xen_processor_pmbits |= XEN_PROCESSOR_PM_PX;
> > -        cpufreq_controller = FREQCTL_xen;
> > -        if ( *arg && *(arg + 1) )
> > -            return cpufreq_cmdline_parse(arg + 1);
> > -    }
> > +        const char *end = strchr(str, ';');
> > +
> > +        if ( end == NULL )
> > +            end = strchr(str, '\0');
> > +
> > +        arg = strpbrk(str, ",:");
> > +        if ( !arg || arg > end )
> > +            arg = strchr(str, '\0');
> > +
> > +        if ( cpufreq_xen_cnt == ARRAY_SIZE(cpufreq_xen_opts) )
> > +            return -E2BIG;
> > +
> > +        if ( choice > 0 || !cmdline_strcmp(str, "xen") )
> > +        {
> > +            xen_processor_pmbits |= XEN_PROCESSOR_PM_PX;
> > +            cpufreq_controller = FREQCTL_xen;
> > +            cpufreq_xen_opts[cpufreq_xen_cnt++] = CPUFREQ_xen;
> > +            ret = 0;
> > +            if ( arg[0] && arg[1] )
> > +                ret = cpufreq_cmdline_parse(arg + 1, end);
> > +        }
> > +        else if ( choice < 0 && !cmdline_strcmp(str, "hwp") )
> > +        {
> > +            xen_processor_pmbits |= XEN_PROCESSOR_PM_PX;
> > +            cpufreq_controller = FREQCTL_xen;
> > +            cpufreq_xen_opts[cpufreq_xen_cnt++] = CPUFREQ_hwp;
> > +            ret = 0;
> > +            if ( arg[0] && arg[1] )
> > +                ret = hwp_cmdline_parse(arg + 1, end);
> > +        }
> > +        else
> > +            ret = -EINVAL;
> > +
> > +        str = end ? ++end : end;
> > +    } while ( choice < 0 && ret == 0 && *str );
>
> According to the earlier of the two lines, str may be NULL and hence
> cannot be dereferenced without first checking to be non-NULL. Earlier
> in the loop though you ensure end cannot be NULL. In that case,
> however, you point end at the nul character, so the increment above
> would point end at the next following one. So my guess is that you
> meant
>
>         str = *end ? ++end : end;

Yes, you are correct.  Thanks for catching it.

Thanks,
Jason



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.