[Xen-changelog] [xen staging] xen/sched: move schedulers and cpupool coding to dedicated directory
commit 6cb4b01c033b7abc3e7175501330dfb01fb09da5 Author: Juergen Gross <jgross@xxxxxxxx> AuthorDate: Wed Jan 22 15:06:43 2020 +0100 Commit: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> CommitDate: Wed Jan 22 17:37:11 2020 +0000 xen/sched: move schedulers and cpupool coding to dedicated directory Move sched*c and cpupool.c to a new directory common/sched. Signed-off-by: Juergen Gross <jgross@xxxxxxxx> Reviewed-by: Dario Faggioli <dfaggioli@xxxxxxxx> --- MAINTAINERS | 8 +- xen/common/Kconfig | 66 +- xen/common/Makefile | 8 +- xen/common/compat/schedule.c | 55 - xen/common/cpupool.c | 979 ---------- xen/common/sched/Kconfig | 65 + xen/common/sched/Makefile | 7 + xen/common/sched/arinc653.c | 739 ++++++++ xen/common/sched/compat.c | 55 + xen/common/sched/core.c | 3144 ++++++++++++++++++++++++++++++++ xen/common/sched/cpupool.c | 979 ++++++++++ xen/common/sched/credit.c | 2284 +++++++++++++++++++++++ xen/common/sched/credit2.c | 4122 ++++++++++++++++++++++++++++++++++++++++++ xen/common/sched/null.c | 1034 +++++++++++ xen/common/sched/rt.c | 1571 ++++++++++++++++ xen/common/sched_arinc653.c | 739 -------- xen/common/sched_credit.c | 2284 ----------------------- xen/common/sched_credit2.c | 4122 ------------------------------------------ xen/common/sched_null.c | 1034 ----------- xen/common/sched_rt.c | 1571 ---------------- xen/common/schedule.c | 3144 -------------------------------- 21 files changed, 14006 insertions(+), 14004 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index a91080cde5..dadcfb63d8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -174,7 +174,7 @@ M: Josh Whitehead <josh.whitehead@xxxxxxxxxxxxxxx> M: Stewart Hildebrand <stewart.hildebrand@xxxxxxxxxxxxxxx> S: Supported L: xen-devel@xxxxxxxxxxxxxxx -F: xen/common/sched_arinc653.c +F: xen/common/sched/arinc653.c F: tools/libxc/xc_arinc653.c ARM (W/ VIRTUALISATION EXTENSIONS) ARCHITECTURE @@ -218,7 +218,7 @@ CPU POOLS M: Juergen Gross <jgross@xxxxxxxx> M: Dario Faggioli <dfaggioli@xxxxxxxx> S: Supported -F: xen/common/cpupool.c +F: xen/common/sched/cpupool.c DEVICE TREE M: Stefano Stabellini <sstabellini@xxxxxxxxxx> @@ -384,13 +384,13 @@ RTDS SCHEDULER M: Dario Faggioli <dfaggioli@xxxxxxxx> M: Meng Xu <mengxu@xxxxxxxxxxxxx> S: Supported -F: xen/common/sched_rt.c +F: xen/common/sched/rt.c SCHEDULING M: George Dunlap <george.dunlap@xxxxxxxxxxxxx> M: Dario Faggioli <dfaggioli@xxxxxxxx> S: Supported -F: xen/common/sched* +F: xen/common/sched/ SEABIOS UPSTREAM M: Wei Liu <wl@xxxxxxx> diff --git a/xen/common/Kconfig b/xen/common/Kconfig index b3d161d057..9d6d09eb37 100644 --- a/xen/common/Kconfig +++ b/xen/common/Kconfig @@ -275,71 +275,7 @@ config ARGO If unsure, say N. -menu "Schedulers" - visible if EXPERT = "y" - -config SCHED_CREDIT - bool "Credit scheduler support" - default y - ---help--- - The traditional credit scheduler is a general purpose scheduler. - -config SCHED_CREDIT2 - bool "Credit2 scheduler support" - default y - ---help--- - The credit2 scheduler is a general purpose scheduler that is - optimized for lower latency and higher VM density. - -config SCHED_RTDS - bool "RTDS scheduler support (EXPERIMENTAL)" - default y - ---help--- - The RTDS scheduler is a soft and firm real-time scheduler for - multicore, targeted for embedded, automotive, graphics and gaming - in the cloud, and general low-latency workloads. 
- -config SCHED_ARINC653 - bool "ARINC653 scheduler support (EXPERIMENTAL)" - default DEBUG - ---help--- - The ARINC653 scheduler is a hard real-time scheduler for single - cores, targeted for avionics, drones, and medical devices. - -config SCHED_NULL - bool "Null scheduler support (EXPERIMENTAL)" - default y - ---help--- - The null scheduler is a static, zero overhead scheduler, - for when there always are less vCPUs than pCPUs, typically - in embedded or HPC scenarios. - -choice - prompt "Default Scheduler?" - default SCHED_CREDIT2_DEFAULT - - config SCHED_CREDIT_DEFAULT - bool "Credit Scheduler" if SCHED_CREDIT - config SCHED_CREDIT2_DEFAULT - bool "Credit2 Scheduler" if SCHED_CREDIT2 - config SCHED_RTDS_DEFAULT - bool "RT Scheduler" if SCHED_RTDS - config SCHED_ARINC653_DEFAULT - bool "ARINC653 Scheduler" if SCHED_ARINC653 - config SCHED_NULL_DEFAULT - bool "Null Scheduler" if SCHED_NULL -endchoice - -config SCHED_DEFAULT - string - default "credit" if SCHED_CREDIT_DEFAULT - default "credit2" if SCHED_CREDIT2_DEFAULT - default "rtds" if SCHED_RTDS_DEFAULT - default "arinc653" if SCHED_ARINC653_DEFAULT - default "null" if SCHED_NULL_DEFAULT - default "credit2" - -endmenu +source "common/sched/Kconfig" config CRYPTO bool diff --git a/xen/common/Makefile b/xen/common/Makefile index 62b34e69e9..2abb8250b0 100644 --- a/xen/common/Makefile +++ b/xen/common/Makefile @@ -3,7 +3,6 @@ obj-y += bitmap.o obj-y += bsearch.o obj-$(CONFIG_CORE_PARKING) += core_parking.o obj-y += cpu.o -obj-y += cpupool.o obj-$(CONFIG_DEBUG_TRACE) += debugtrace.o obj-$(CONFIG_HAS_DEVICE_TREE) += device_tree.o obj-y += domctl.o @@ -38,12 +37,6 @@ obj-y += radix-tree.o obj-y += rbtree.o obj-y += rcupdate.o obj-y += rwlock.o -obj-$(CONFIG_SCHED_ARINC653) += sched_arinc653.o -obj-$(CONFIG_SCHED_CREDIT) += sched_credit.o -obj-$(CONFIG_SCHED_CREDIT2) += sched_credit2.o -obj-$(CONFIG_SCHED_RTDS) += sched_rt.o -obj-$(CONFIG_SCHED_NULL) += sched_null.o -obj-y += schedule.o obj-y += shutdown.o obj-y += softirq.o obj-y += sort.o @@ -74,6 +67,7 @@ obj-$(CONFIG_COMPAT) += $(addprefix compat/,domain.o kernel.o memory.o multicall extra-y := symbols-dummy.o subdir-$(CONFIG_COVERAGE) += coverage +subdir-y += sched subdir-$(CONFIG_UBSAN) += ubsan subdir-$(CONFIG_NEEDS_LIBELF) += libelf diff --git a/xen/common/compat/schedule.c b/xen/common/compat/schedule.c deleted file mode 100644 index 8b6e6f107d..0000000000 --- a/xen/common/compat/schedule.c +++ /dev/null @@ -1,55 +0,0 @@ -/**************************************************************************** - * schedule.c - * - */ - -#include <compat/sched.h> - -#define COMPAT -#define ret_t int - -#define do_sched_op compat_sched_op - -#define xen_sched_pin_override sched_pin_override -CHECK_sched_pin_override; -#undef xen_sched_pin_override - -#define xen_sched_shutdown sched_shutdown -CHECK_sched_shutdown; -#undef xen_sched_shutdown - -#define xen_sched_remote_shutdown sched_remote_shutdown -CHECK_sched_remote_shutdown; -#undef xen_sched_remote_shutdown - -static int compat_poll(struct compat_sched_poll *compat) -{ - struct sched_poll native; - -#define XLAT_sched_poll_HNDL_ports(_d_, _s_) \ - guest_from_compat_handle((_d_)->ports, (_s_)->ports) - XLAT_sched_poll(&native, compat); -#undef XLAT_sched_poll_HNDL_ports - - return do_poll(&native); -} - -#define do_poll compat_poll -#define sched_poll compat_sched_poll - -#include "../schedule.c" - -int compat_set_timer_op(u32 lo, s32 hi) -{ - return do_set_timer_op(((s64)hi << 32) | lo); -} - -/* - * Local variables: - * mode: C - * 
c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c deleted file mode 100644 index d66b541a94..0000000000 --- a/xen/common/cpupool.c +++ /dev/null @@ -1,979 +0,0 @@ -/****************************************************************************** - * cpupool.c - * - * Generic cpupool-handling functions. - * - * Cpupools are a feature to have configurable scheduling domains. Each - * cpupool runs an own scheduler on a dedicated set of physical cpus. - * A domain is bound to one cpupool at any time, but it can be moved to - * another cpupool. - * - * (C) 2009, Juergen Gross, Fujitsu Technology Solutions - */ - -#include <xen/lib.h> -#include <xen/init.h> -#include <xen/cpumask.h> -#include <xen/percpu.h> -#include <xen/sched.h> -#include <xen/sched-if.h> -#include <xen/warning.h> -#include <xen/keyhandler.h> -#include <xen/cpu.h> - -#define for_each_cpupool(ptr) \ - for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next)) - -struct cpupool *cpupool0; /* Initial cpupool with Dom0 */ -cpumask_t cpupool_free_cpus; /* cpus not in any cpupool */ - -static struct cpupool *cpupool_list; /* linked list, sorted by poolid */ - -static int cpupool_moving_cpu = -1; -static struct cpupool *cpupool_cpu_moving = NULL; -static cpumask_t cpupool_locked_cpus; - -static DEFINE_SPINLOCK(cpupool_lock); - -static enum sched_gran __read_mostly opt_sched_granularity = SCHED_GRAN_cpu; -static unsigned int __read_mostly sched_granularity = 1; - -#ifdef CONFIG_HAS_SCHED_GRANULARITY -static int __init sched_select_granularity(const char *str) -{ - if ( strcmp("cpu", str) == 0 ) - opt_sched_granularity = SCHED_GRAN_cpu; - else if ( strcmp("core", str) == 0 ) - opt_sched_granularity = SCHED_GRAN_core; - else if ( strcmp("socket", str) == 0 ) - opt_sched_granularity = SCHED_GRAN_socket; - else - return -EINVAL; - - return 0; -} -custom_param("sched-gran", sched_select_granularity); -#endif - -static unsigned int __init cpupool_check_granularity(void) -{ - unsigned int cpu; - unsigned int siblings, gran = 0; - - if ( opt_sched_granularity == SCHED_GRAN_cpu ) - return 1; - - for_each_online_cpu ( cpu ) - { - siblings = cpumask_weight(sched_get_opt_cpumask(opt_sched_granularity, - cpu)); - if ( gran == 0 ) - gran = siblings; - else if ( gran != siblings ) - return 0; - } - - sched_disable_smt_switching = true; - - return gran; -} - -/* Setup data for selected scheduler granularity. */ -static void __init cpupool_gran_init(void) -{ - unsigned int gran = 0; - const char *fallback = NULL; - - while ( gran == 0 ) - { - gran = cpupool_check_granularity(); - - if ( gran == 0 ) - { - switch ( opt_sched_granularity ) - { - case SCHED_GRAN_core: - opt_sched_granularity = SCHED_GRAN_cpu; - fallback = "Asymmetric cpu configuration.\n" - "Falling back to sched-gran=cpu.\n"; - break; - case SCHED_GRAN_socket: - opt_sched_granularity = SCHED_GRAN_core; - fallback = "Asymmetric cpu configuration.\n" - "Falling back to sched-gran=core.\n"; - break; - default: - ASSERT_UNREACHABLE(); - break; - } - } - } - - if ( fallback ) - warning_add(fallback); - - sched_granularity = gran; -} - -unsigned int cpupool_get_granularity(const struct cpupool *c) -{ - return c ? 
sched_granularity : 1; -} - -static void free_cpupool_struct(struct cpupool *c) -{ - if ( c ) - { - free_cpumask_var(c->res_valid); - free_cpumask_var(c->cpu_valid); - } - xfree(c); -} - -static struct cpupool *alloc_cpupool_struct(void) -{ - struct cpupool *c = xzalloc(struct cpupool); - - if ( !c ) - return NULL; - - if ( !zalloc_cpumask_var(&c->cpu_valid) || - !zalloc_cpumask_var(&c->res_valid) ) - { - free_cpupool_struct(c); - c = NULL; - } - - return c; -} - -/* - * find a cpupool by it's id. to be called with cpupool lock held - * if exact is not specified, the first cpupool with an id larger or equal to - * the searched id is returned - * returns NULL if not found. - */ -static struct cpupool *__cpupool_find_by_id(int id, int exact) -{ - struct cpupool **q; - - ASSERT(spin_is_locked(&cpupool_lock)); - - for_each_cpupool(q) - if ( (*q)->cpupool_id >= id ) - break; - - return (!exact || (*q == NULL) || ((*q)->cpupool_id == id)) ? *q : NULL; -} - -static struct cpupool *cpupool_find_by_id(int poolid) -{ - return __cpupool_find_by_id(poolid, 1); -} - -static struct cpupool *__cpupool_get_by_id(int poolid, int exact) -{ - struct cpupool *c; - spin_lock(&cpupool_lock); - c = __cpupool_find_by_id(poolid, exact); - if ( c != NULL ) - atomic_inc(&c->refcnt); - spin_unlock(&cpupool_lock); - return c; -} - -struct cpupool *cpupool_get_by_id(int poolid) -{ - return __cpupool_get_by_id(poolid, 1); -} - -static struct cpupool *cpupool_get_next_by_id(int poolid) -{ - return __cpupool_get_by_id(poolid, 0); -} - -void cpupool_put(struct cpupool *pool) -{ - if ( !atomic_dec_and_test(&pool->refcnt) ) - return; - scheduler_free(pool->sched); - free_cpupool_struct(pool); -} - -/* - * create a new cpupool with specified poolid and scheduler - * returns pointer to new cpupool structure if okay, NULL else - * possible failures: - * - no memory - * - poolid already used - * - unknown scheduler - */ -static struct cpupool *cpupool_create( - int poolid, unsigned int sched_id, int *perr) -{ - struct cpupool *c; - struct cpupool **q; - int last = 0; - - *perr = -ENOMEM; - if ( (c = alloc_cpupool_struct()) == NULL ) - return NULL; - - /* One reference for caller, one reference for cpupool_destroy(). */ - atomic_set(&c->refcnt, 2); - - debugtrace_printk("cpupool_create(pool=%d,sched=%u)\n", poolid, sched_id); - - spin_lock(&cpupool_lock); - - for_each_cpupool(q) - { - last = (*q)->cpupool_id; - if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) ) - break; - } - if ( *q != NULL ) - { - if ( (*q)->cpupool_id == poolid ) - { - *perr = -EEXIST; - goto err; - } - c->next = *q; - } - - c->cpupool_id = (poolid == CPUPOOLID_NONE) ? 
(last + 1) : poolid; - if ( poolid == 0 ) - { - c->sched = scheduler_get_default(); - } - else - { - c->sched = scheduler_alloc(sched_id, perr); - if ( c->sched == NULL ) - goto err; - } - c->gran = opt_sched_granularity; - - *q = c; - - spin_unlock(&cpupool_lock); - - debugtrace_printk("Created cpupool %d with scheduler %s (%s)\n", - c->cpupool_id, c->sched->name, c->sched->opt_name); - - *perr = 0; - return c; - - err: - spin_unlock(&cpupool_lock); - free_cpupool_struct(c); - return NULL; -} -/* - * destroys the given cpupool - * returns 0 on success, 1 else - * possible failures: - * - pool still in use - * - cpus still assigned to pool - * - pool not in list - */ -static int cpupool_destroy(struct cpupool *c) -{ - struct cpupool **q; - - spin_lock(&cpupool_lock); - for_each_cpupool(q) - if ( *q == c ) - break; - if ( *q != c ) - { - spin_unlock(&cpupool_lock); - return -ENOENT; - } - if ( (c->n_dom != 0) || cpumask_weight(c->cpu_valid) ) - { - spin_unlock(&cpupool_lock); - return -EBUSY; - } - *q = c->next; - spin_unlock(&cpupool_lock); - - cpupool_put(c); - - debugtrace_printk("cpupool_destroy(pool=%d)\n", c->cpupool_id); - return 0; -} - -/* - * Move domain to another cpupool - */ -static int cpupool_move_domain_locked(struct domain *d, struct cpupool *c) -{ - int ret; - - if ( unlikely(d->cpupool == c) ) - return 0; - - d->cpupool->n_dom--; - ret = sched_move_domain(d, c); - if ( ret ) - d->cpupool->n_dom++; - else - c->n_dom++; - - return ret; -} -int cpupool_move_domain(struct domain *d, struct cpupool *c) -{ - int ret; - - spin_lock(&cpupool_lock); - - ret = cpupool_move_domain_locked(d, c); - - spin_unlock(&cpupool_lock); - - return ret; -} - -/* - * assign a specific cpu to a cpupool - * cpupool_lock must be held - */ -static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) -{ - int ret; - struct domain *d; - const cpumask_t *cpus; - - cpus = sched_get_opt_cpumask(c->gran, cpu); - - if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) ) - return -EADDRNOTAVAIL; - ret = schedule_cpu_add(cpumask_first(cpus), c); - if ( ret ) - return ret; - - rcu_read_lock(&sched_res_rculock); - - cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); - if (cpupool_moving_cpu == cpu) - { - cpupool_moving_cpu = -1; - cpupool_put(cpupool_cpu_moving); - cpupool_cpu_moving = NULL; - } - cpumask_or(c->cpu_valid, c->cpu_valid, cpus); - cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask); - - rcu_read_unlock(&sched_res_rculock); - - rcu_read_lock(&domlist_read_lock); - for_each_domain_in_cpupool(d, c) - { - domain_update_node_affinity(d); - } - rcu_read_unlock(&domlist_read_lock); - - return 0; -} - -static int cpupool_unassign_cpu_finish(struct cpupool *c) -{ - int cpu = cpupool_moving_cpu; - const cpumask_t *cpus; - struct domain *d; - int ret; - - if ( c != cpupool_cpu_moving ) - return -EADDRNOTAVAIL; - - /* - * We need this for scanning the domain list, both in - * cpu_disable_scheduler(), and at the bottom of this function. - */ - rcu_read_lock(&domlist_read_lock); - ret = cpu_disable_scheduler(cpu); - - rcu_read_lock(&sched_res_rculock); - cpus = get_sched_res(cpu)->cpus; - cpumask_or(&cpupool_free_cpus, &cpupool_free_cpus, cpus); - - /* - * cpu_disable_scheduler() returning an error doesn't require resetting - * cpupool_free_cpus' cpu bit. All error cases should be of temporary - * nature and tools will retry the operation. 
Even if the number of - * retries may be limited, the in-between state can easily be repaired - * by adding the cpu to the cpupool again. - */ - if ( !ret ) - { - ret = schedule_cpu_rm(cpu); - if ( ret ) - cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); - else - { - cpupool_moving_cpu = -1; - cpupool_put(cpupool_cpu_moving); - cpupool_cpu_moving = NULL; - } - } - rcu_read_unlock(&sched_res_rculock); - - for_each_domain_in_cpupool(d, c) - { - domain_update_node_affinity(d); - } - rcu_read_unlock(&domlist_read_lock); - - return ret; -} - -static int cpupool_unassign_cpu_start(struct cpupool *c, unsigned int cpu) -{ - int ret; - struct domain *d; - const cpumask_t *cpus; - - spin_lock(&cpupool_lock); - ret = -EADDRNOTAVAIL; - if ( ((cpupool_moving_cpu != -1) || !cpumask_test_cpu(cpu, c->cpu_valid)) - && (cpu != cpupool_moving_cpu) ) - goto out; - - ret = 0; - rcu_read_lock(&sched_res_rculock); - cpus = get_sched_res(cpu)->cpus; - - if ( (c->n_dom > 0) && - (cpumask_weight(c->cpu_valid) == cpumask_weight(cpus)) && - (cpu != cpupool_moving_cpu) ) - { - rcu_read_lock(&domlist_read_lock); - for_each_domain_in_cpupool(d, c) - { - if ( !d->is_dying && system_state == SYS_STATE_active ) - { - ret = -EBUSY; - break; - } - ret = cpupool_move_domain_locked(d, cpupool0); - if ( ret ) - break; - } - rcu_read_unlock(&domlist_read_lock); - if ( ret ) - goto out; - } - cpupool_moving_cpu = cpu; - atomic_inc(&c->refcnt); - cpupool_cpu_moving = c; - cpumask_andnot(c->cpu_valid, c->cpu_valid, cpus); - cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask); - - rcu_read_unlock(&domlist_read_lock); -out: - spin_unlock(&cpupool_lock); - - return ret; -} - -static long cpupool_unassign_cpu_helper(void *info) -{ - struct cpupool *c = info; - long ret; - - debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n", - cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu); - spin_lock(&cpupool_lock); - - ret = cpupool_unassign_cpu_finish(c); - - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret); - - return ret; -} - -/* - * unassign a specific cpu from a cpupool - * we must be sure not to run on the cpu to be unassigned! to achieve this - * the main functionality is performed via continue_hypercall_on_cpu on a - * specific cpu. - * if the cpu to be removed is the last one of the cpupool no active domain - * must be bound to the cpupool. dying domains are moved to cpupool0 as they - * might be zombies. 
- * possible failures: - * - last cpu and still active domains in cpupool - * - cpu just being unplugged - */ -static int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu) -{ - int work_cpu; - int ret; - unsigned int master_cpu; - - debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n", - c->cpupool_id, cpu); - - master_cpu = sched_get_resource_cpu(cpu); - ret = cpupool_unassign_cpu_start(c, master_cpu); - if ( ret ) - { - debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n", - c->cpupool_id, cpu, ret); - return ret; - } - - work_cpu = sched_get_resource_cpu(smp_processor_id()); - if ( work_cpu == master_cpu ) - { - work_cpu = cpumask_first(cpupool0->cpu_valid); - if ( work_cpu == master_cpu ) - work_cpu = cpumask_last(cpupool0->cpu_valid); - } - return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c); -} - -/* - * add a new domain to a cpupool - * possible failures: - * - pool does not exist - * - no cpu assigned to pool - */ -int cpupool_add_domain(struct domain *d, int poolid) -{ - struct cpupool *c; - int rc; - int n_dom = 0; - - if ( poolid == CPUPOOLID_NONE ) - return 0; - spin_lock(&cpupool_lock); - c = cpupool_find_by_id(poolid); - if ( c == NULL ) - rc = -ESRCH; - else if ( !cpumask_weight(c->cpu_valid) ) - rc = -ENODEV; - else - { - c->n_dom++; - n_dom = c->n_dom; - d->cpupool = c; - rc = 0; - } - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_add_domain(dom=%d,pool=%d) n_dom %d rc %d\n", - d->domain_id, poolid, n_dom, rc); - return rc; -} - -/* - * remove a domain from a cpupool - */ -void cpupool_rm_domain(struct domain *d) -{ - int cpupool_id; - int n_dom; - - if ( d->cpupool == NULL ) - return; - spin_lock(&cpupool_lock); - cpupool_id = d->cpupool->cpupool_id; - d->cpupool->n_dom--; - n_dom = d->cpupool->n_dom; - d->cpupool = NULL; - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n", - d->domain_id, cpupool_id, n_dom); - return; -} - -/* - * Called to add a cpu to a pool. CPUs being hot-plugged are added to pool0, - * as they must have been in there when unplugged. - */ -static int cpupool_cpu_add(unsigned int cpu) -{ - int ret = 0; - const cpumask_t *cpus; - - spin_lock(&cpupool_lock); - cpumask_clear_cpu(cpu, &cpupool_locked_cpus); - cpumask_set_cpu(cpu, &cpupool_free_cpus); - - /* - * If we are not resuming, we are hot-plugging cpu, and in which case - * we add it to pool0, as it certainly was there when hot-unplagged - * (or unplugging would have failed) and that is the default behavior - * anyway. - */ - rcu_read_lock(&sched_res_rculock); - get_sched_res(cpu)->cpupool = NULL; - - cpus = sched_get_opt_cpumask(cpupool0->gran, cpu); - if ( cpumask_subset(cpus, &cpupool_free_cpus) ) - ret = cpupool_assign_cpu_locked(cpupool0, cpu); - - rcu_read_unlock(&sched_res_rculock); - - spin_unlock(&cpupool_lock); - - return ret; -} - -/* - * This function is called in stop_machine context, so we can be sure no - * non-idle vcpu is active on the system. - */ -static void cpupool_cpu_remove(unsigned int cpu) -{ - int ret; - - ASSERT(is_idle_vcpu(current)); - - if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) ) - { - ret = cpupool_unassign_cpu_finish(cpupool0); - BUG_ON(ret); - } - cpumask_clear_cpu(cpu, &cpupool_free_cpus); -} - -/* - * Called before a CPU is being removed from the system. - * Removing a CPU is allowed for free CPUs or CPUs in Pool-0 (those are moved - * to free cpus actually before removing them). - * The CPU is locked, to forbid adding it again to another cpupool. 
- */ -static int cpupool_cpu_remove_prologue(unsigned int cpu) -{ - int ret = 0; - cpumask_t *cpus; - unsigned int master_cpu; - - spin_lock(&cpupool_lock); - - rcu_read_lock(&sched_res_rculock); - cpus = get_sched_res(cpu)->cpus; - master_cpu = sched_get_resource_cpu(cpu); - if ( cpumask_intersects(cpus, &cpupool_locked_cpus) ) - ret = -EBUSY; - else - cpumask_set_cpu(cpu, &cpupool_locked_cpus); - rcu_read_unlock(&sched_res_rculock); - - spin_unlock(&cpupool_lock); - - if ( ret ) - return ret; - - if ( cpumask_test_cpu(master_cpu, cpupool0->cpu_valid) ) - { - /* Cpupool0 is populated only after all cpus are up. */ - ASSERT(system_state == SYS_STATE_active); - - ret = cpupool_unassign_cpu_start(cpupool0, master_cpu); - } - else if ( !cpumask_test_cpu(master_cpu, &cpupool_free_cpus) ) - ret = -ENODEV; - - return ret; -} - -/* - * Called during resume for all cpus which didn't come up again. The cpu must - * be removed from the cpupool it is assigned to. In case a cpupool will be - * left without cpu we move all domains of that cpupool to cpupool0. - * As we are called with all domains still frozen there is no need to take the - * cpupool lock here. - */ -static void cpupool_cpu_remove_forced(unsigned int cpu) -{ - struct cpupool **c; - int ret; - unsigned int master_cpu = sched_get_resource_cpu(cpu); - - for_each_cpupool ( c ) - { - if ( cpumask_test_cpu(master_cpu, (*c)->cpu_valid) ) - { - ret = cpupool_unassign_cpu_start(*c, master_cpu); - BUG_ON(ret); - ret = cpupool_unassign_cpu_finish(*c); - BUG_ON(ret); - } - } - - cpumask_clear_cpu(cpu, &cpupool_free_cpus); - - rcu_read_lock(&sched_res_rculock); - sched_rm_cpu(cpu); - rcu_read_unlock(&sched_res_rculock); -} - -/* - * do cpupool related sysctl operations - */ -int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op) -{ - int ret; - struct cpupool *c; - - switch ( op->op ) - { - - case XEN_SYSCTL_CPUPOOL_OP_CREATE: - { - int poolid; - - poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ? 
- CPUPOOLID_NONE: op->cpupool_id; - c = cpupool_create(poolid, op->sched_id, &ret); - if ( c != NULL ) - { - op->cpupool_id = c->cpupool_id; - cpupool_put(c); - } - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_DESTROY: - { - c = cpupool_get_by_id(op->cpupool_id); - ret = -ENOENT; - if ( c == NULL ) - break; - ret = cpupool_destroy(c); - cpupool_put(c); - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_INFO: - { - c = cpupool_get_next_by_id(op->cpupool_id); - ret = -ENOENT; - if ( c == NULL ) - break; - op->cpupool_id = c->cpupool_id; - op->sched_id = c->sched->sched_id; - op->n_dom = c->n_dom; - ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid); - cpupool_put(c); - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_ADDCPU: - { - unsigned cpu; - const cpumask_t *cpus; - - cpu = op->cpu; - debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d)\n", - op->cpupool_id, cpu); - - spin_lock(&cpupool_lock); - - c = cpupool_find_by_id(op->cpupool_id); - ret = -ENOENT; - if ( c == NULL ) - goto addcpu_out; - if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) - { - for_each_cpu ( cpu, &cpupool_free_cpus ) - { - cpus = sched_get_opt_cpumask(c->gran, cpu); - if ( cpumask_subset(cpus, &cpupool_free_cpus) ) - break; - } - ret = -ENODEV; - if ( cpu >= nr_cpu_ids ) - goto addcpu_out; - } - ret = -EINVAL; - if ( cpu >= nr_cpu_ids ) - goto addcpu_out; - ret = -ENODEV; - cpus = sched_get_opt_cpumask(c->gran, cpu); - if ( !cpumask_subset(cpus, &cpupool_free_cpus) || - cpumask_intersects(cpus, &cpupool_locked_cpus) ) - goto addcpu_out; - ret = cpupool_assign_cpu_locked(c, cpu); - - addcpu_out: - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n", - op->cpupool_id, cpu, ret); - - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_RMCPU: - { - unsigned cpu; - - c = cpupool_get_by_id(op->cpupool_id); - ret = -ENOENT; - if ( c == NULL ) - break; - cpu = op->cpu; - if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) - cpu = cpumask_last(c->cpu_valid); - ret = (cpu < nr_cpu_ids) ? cpupool_unassign_cpu(c, cpu) : -EINVAL; - cpupool_put(c); - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN: - { - struct domain *d; - - ret = rcu_lock_remote_domain_by_id(op->domid, &d); - if ( ret ) - break; - if ( d->cpupool == NULL ) - { - ret = -EINVAL; - rcu_unlock_domain(d); - break; - } - if ( op->cpupool_id == d->cpupool->cpupool_id ) - { - ret = 0; - rcu_unlock_domain(d); - break; - } - debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d\n", - d->domain_id, op->cpupool_id); - ret = -ENOENT; - spin_lock(&cpupool_lock); - - c = cpupool_find_by_id(op->cpupool_id); - if ( (c != NULL) && cpumask_weight(c->cpu_valid) ) - ret = cpupool_move_domain_locked(d, c); - - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d ret %d\n", - d->domain_id, op->cpupool_id, ret); - rcu_unlock_domain(d); - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_FREEINFO: - { - ret = cpumask_to_xenctl_bitmap( - &op->cpumap, &cpupool_free_cpus); - } - break; - - default: - ret = -ENOSYS; - break; - } - - return ret; -} - -void dump_runq(unsigned char key) -{ - unsigned long flags; - s_time_t now = NOW(); - struct cpupool **c; - - spin_lock(&cpupool_lock); - local_irq_save(flags); - - printk("sched_smt_power_savings: %s\n", - sched_smt_power_savings? 
"enabled":"disabled"); - printk("NOW=%"PRI_stime"\n", now); - - printk("Online Cpus: %*pbl\n", CPUMASK_PR(&cpu_online_map)); - if ( !cpumask_empty(&cpupool_free_cpus) ) - { - printk("Free Cpus: %*pbl\n", CPUMASK_PR(&cpupool_free_cpus)); - schedule_dump(NULL); - } - - for_each_cpupool(c) - { - printk("Cpupool %d:\n", (*c)->cpupool_id); - printk("Cpus: %*pbl\n", CPUMASK_PR((*c)->cpu_valid)); - schedule_dump(*c); - } - - local_irq_restore(flags); - spin_unlock(&cpupool_lock); -} - -static int cpu_callback( - struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int rc = 0; - - switch ( action ) - { - case CPU_DOWN_FAILED: - case CPU_ONLINE: - if ( system_state <= SYS_STATE_active ) - rc = cpupool_cpu_add(cpu); - break; - case CPU_DOWN_PREPARE: - /* Suspend/Resume don't change assignments of cpus to cpupools. */ - if ( system_state <= SYS_STATE_active ) - rc = cpupool_cpu_remove_prologue(cpu); - break; - case CPU_DYING: - /* Suspend/Resume don't change assignments of cpus to cpupools. */ - if ( system_state <= SYS_STATE_active ) - cpupool_cpu_remove(cpu); - break; - case CPU_RESUME_FAILED: - cpupool_cpu_remove_forced(cpu); - break; - default: - break; - } - - return !rc ? NOTIFY_DONE : notifier_from_errno(rc); -} - -static struct notifier_block cpu_nfb = { - .notifier_call = cpu_callback -}; - -static int __init cpupool_init(void) -{ - unsigned int cpu; - int err; - - cpupool_gran_init(); - - cpupool0 = cpupool_create(0, 0, &err); - BUG_ON(cpupool0 == NULL); - cpupool_put(cpupool0); - register_cpu_notifier(&cpu_nfb); - - spin_lock(&cpupool_lock); - - cpumask_copy(&cpupool_free_cpus, &cpu_online_map); - - for_each_cpu ( cpu, &cpupool_free_cpus ) - cpupool_assign_cpu_locked(cpupool0, cpu); - - spin_unlock(&cpupool_lock); - - return 0; -} -__initcall(cpupool_init); - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/common/sched/Kconfig b/xen/common/sched/Kconfig new file mode 100644 index 0000000000..883ac87cab --- /dev/null +++ b/xen/common/sched/Kconfig @@ -0,0 +1,65 @@ +menu "Schedulers" + visible if EXPERT = "y" + +config SCHED_CREDIT + bool "Credit scheduler support" + default y + ---help--- + The traditional credit scheduler is a general purpose scheduler. + +config SCHED_CREDIT2 + bool "Credit2 scheduler support" + default y + ---help--- + The credit2 scheduler is a general purpose scheduler that is + optimized for lower latency and higher VM density. + +config SCHED_RTDS + bool "RTDS scheduler support (EXPERIMENTAL)" + default y + ---help--- + The RTDS scheduler is a soft and firm real-time scheduler for + multicore, targeted for embedded, automotive, graphics and gaming + in the cloud, and general low-latency workloads. + +config SCHED_ARINC653 + bool "ARINC653 scheduler support (EXPERIMENTAL)" + default DEBUG + ---help--- + The ARINC653 scheduler is a hard real-time scheduler for single + cores, targeted for avionics, drones, and medical devices. + +config SCHED_NULL + bool "Null scheduler support (EXPERIMENTAL)" + default y + ---help--- + The null scheduler is a static, zero overhead scheduler, + for when there always are less vCPUs than pCPUs, typically + in embedded or HPC scenarios. + +choice + prompt "Default Scheduler?" 
+ default SCHED_CREDIT2_DEFAULT + + config SCHED_CREDIT_DEFAULT + bool "Credit Scheduler" if SCHED_CREDIT + config SCHED_CREDIT2_DEFAULT + bool "Credit2 Scheduler" if SCHED_CREDIT2 + config SCHED_RTDS_DEFAULT + bool "RT Scheduler" if SCHED_RTDS + config SCHED_ARINC653_DEFAULT + bool "ARINC653 Scheduler" if SCHED_ARINC653 + config SCHED_NULL_DEFAULT + bool "Null Scheduler" if SCHED_NULL +endchoice + +config SCHED_DEFAULT + string + default "credit" if SCHED_CREDIT_DEFAULT + default "credit2" if SCHED_CREDIT2_DEFAULT + default "rtds" if SCHED_RTDS_DEFAULT + default "arinc653" if SCHED_ARINC653_DEFAULT + default "null" if SCHED_NULL_DEFAULT + default "credit2" + +endmenu diff --git a/xen/common/sched/Makefile b/xen/common/sched/Makefile new file mode 100644 index 0000000000..3537f2a68d --- /dev/null +++ b/xen/common/sched/Makefile @@ -0,0 +1,7 @@ +obj-y += cpupool.o +obj-$(CONFIG_SCHED_ARINC653) += arinc653.o +obj-$(CONFIG_SCHED_CREDIT) += credit.o +obj-$(CONFIG_SCHED_CREDIT2) += credit2.o +obj-$(CONFIG_SCHED_RTDS) += rt.o +obj-$(CONFIG_SCHED_NULL) += null.o +obj-y += core.o diff --git a/xen/common/sched/arinc653.c b/xen/common/sched/arinc653.c new file mode 100644 index 0000000000..565575c326 --- /dev/null +++ b/xen/common/sched/arinc653.c @@ -0,0 +1,739 @@ +/****************************************************************************** + * sched_arinc653.c + * + * An ARINC653-compatible scheduling algorithm for use in Xen. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2010, DornerWorks, Ltd. <DornerWorks.com> + */ + +#include <xen/lib.h> +#include <xen/sched.h> +#include <xen/sched-if.h> +#include <xen/timer.h> +#include <xen/softirq.h> +#include <xen/time.h> +#include <xen/errno.h> +#include <xen/list.h> +#include <xen/guest_access.h> +#include <public/sysctl.h> + +/************************************************************************** + * Private Macros * + **************************************************************************/ + +/** + * Default timeslice for domain 0. 
+ */ +#define DEFAULT_TIMESLICE MILLISECS(10) + +/** + * Retrieve the idle UNIT for a given physical CPU + */ +#define IDLETASK(cpu) (sched_idle_unit(cpu)) + +/** + * Return a pointer to the ARINC 653-specific scheduler data information + * associated with the given UNIT (unit) + */ +#define AUNIT(unit) ((arinc653_unit_t *)(unit)->priv) + +/** + * Return the global scheduler private data given the scheduler ops pointer + */ +#define SCHED_PRIV(s) ((a653sched_priv_t *)((s)->sched_data)) + +/************************************************************************** + * Private Type Definitions * + **************************************************************************/ + +/** + * The arinc653_unit_t structure holds ARINC 653-scheduler-specific + * information for all non-idle UNITs + */ +typedef struct arinc653_unit_s +{ + /* unit points to Xen's struct sched_unit so we can get to it from an + * arinc653_unit_t pointer. */ + struct sched_unit * unit; + /* awake holds whether the UNIT has been woken with vcpu_wake() */ + bool_t awake; + /* list holds the linked list information for the list this UNIT + * is stored in */ + struct list_head list; +} arinc653_unit_t; + +/** + * The sched_entry_t structure holds a single entry of the + * ARINC 653 schedule. + */ +typedef struct sched_entry_s +{ + /* dom_handle holds the handle ("UUID") for the domain that this + * schedule entry refers to. */ + xen_domain_handle_t dom_handle; + /* unit_id holds the UNIT number for the UNIT that this schedule + * entry refers to. */ + int unit_id; + /* runtime holds the number of nanoseconds that the UNIT for this + * schedule entry should be allowed to run per major frame. */ + s_time_t runtime; + /* unit holds a pointer to the Xen sched_unit structure */ + struct sched_unit * unit; +} sched_entry_t; + +/** + * This structure defines data that is global to an instance of the scheduler + */ +typedef struct a653sched_priv_s +{ + /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ + spinlock_t lock; + + /** + * This array holds the active ARINC 653 schedule. + * + * When the system tries to start a new UNIT, this schedule is scanned + * to look for a matching (handle, UNIT #) pair. If both the handle (UUID) + * and UNIT number match, then the UNIT is allowed to run. Its run time + * (per major frame) is given in the third entry of the schedule. + */ + sched_entry_t schedule[ARINC653_MAX_DOMAINS_PER_SCHEDULE]; + + /** + * This variable holds the number of entries that are valid in + * the arinc653_schedule table. + * + * This is not necessarily the same as the number of domains in the + * schedule. A domain could be listed multiple times within the schedule, + * or a domain with multiple UNITs could have a different + * schedule entry for each UNIT. + */ + unsigned int num_schedule_entries; + + /** + * the major frame time for the ARINC 653 schedule. + */ + s_time_t major_frame; + + /** + * the time that the next major frame starts + */ + s_time_t next_major_frame; + + /** + * pointers to all Xen UNIT structures for iterating through + */ + struct list_head unit_list; +} a653sched_priv_t; + +/************************************************************************** + * Helper functions * + **************************************************************************/ + +/** + * This function compares two domain handles. 
+ * + * @param h1 Pointer to handle 1 + * @param h2 Pointer to handle 2 + * + * @return <ul> + * <li> <0: handle 1 is less than handle 2 + * <li> 0: handle 1 is equal to handle 2 + * <li> >0: handle 1 is greater than handle 2 + * </ul> + */ +static int dom_handle_cmp(const xen_domain_handle_t h1, + const xen_domain_handle_t h2) +{ + return memcmp(h1, h2, sizeof(xen_domain_handle_t)); +} + +/** + * This function searches the unit list to find a UNIT that matches + * the domain handle and UNIT ID specified. + * + * @param ops Pointer to this instance of the scheduler structure + * @param handle Pointer to handler + * @param unit_id UNIT ID + * + * @return <ul> + * <li> Pointer to the matching UNIT if one is found + * <li> NULL otherwise + * </ul> + */ +static struct sched_unit *find_unit( + const struct scheduler *ops, + xen_domain_handle_t handle, + int unit_id) +{ + arinc653_unit_t *aunit; + + /* loop through the unit_list looking for the specified UNIT */ + list_for_each_entry ( aunit, &SCHED_PRIV(ops)->unit_list, list ) + if ( (dom_handle_cmp(aunit->unit->domain->handle, handle) == 0) + && (unit_id == aunit->unit->unit_id) ) + return aunit->unit; + + return NULL; +} + +/** + * This function updates the pointer to the Xen UNIT structure for each entry + * in the ARINC 653 schedule. + * + * @param ops Pointer to this instance of the scheduler structure + * @return <None> + */ +static void update_schedule_units(const struct scheduler *ops) +{ + unsigned int i, n_entries = SCHED_PRIV(ops)->num_schedule_entries; + + for ( i = 0; i < n_entries; i++ ) + SCHED_PRIV(ops)->schedule[i].unit = + find_unit(ops, + SCHED_PRIV(ops)->schedule[i].dom_handle, + SCHED_PRIV(ops)->schedule[i].unit_id); +} + +/** + * This function is called by the adjust_global scheduler hook to put + * in place a new ARINC653 schedule. + * + * @param ops Pointer to this instance of the scheduler structure + * + * @return <ul> + * <li> 0 = success + * <li> !0 = error + * </ul> + */ +static int +arinc653_sched_set( + const struct scheduler *ops, + struct xen_sysctl_arinc653_schedule *schedule) +{ + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + s_time_t total_runtime = 0; + unsigned int i; + unsigned long flags; + int rc = -EINVAL; + + spin_lock_irqsave(&sched_priv->lock, flags); + + /* Check for valid major frame and number of schedule entries. */ + if ( (schedule->major_frame <= 0) + || (schedule->num_sched_entries < 1) + || (schedule->num_sched_entries > ARINC653_MAX_DOMAINS_PER_SCHEDULE) ) + goto fail; + + for ( i = 0; i < schedule->num_sched_entries; i++ ) + { + /* Check for a valid run time. */ + if ( schedule->sched_entries[i].runtime <= 0 ) + goto fail; + + /* Add this entry's run time to total run time. */ + total_runtime += schedule->sched_entries[i].runtime; + } + + /* + * Error if the major frame is not large enough to run all entries as + * indicated by comparing the total run time to the major frame length. + */ + if ( total_runtime > schedule->major_frame ) + goto fail; + + /* Copy the new schedule into place. 
*/ + sched_priv->num_schedule_entries = schedule->num_sched_entries; + sched_priv->major_frame = schedule->major_frame; + for ( i = 0; i < schedule->num_sched_entries; i++ ) + { + memcpy(sched_priv->schedule[i].dom_handle, + schedule->sched_entries[i].dom_handle, + sizeof(sched_priv->schedule[i].dom_handle)); + sched_priv->schedule[i].unit_id = + schedule->sched_entries[i].vcpu_id; + sched_priv->schedule[i].runtime = + schedule->sched_entries[i].runtime; + } + update_schedule_units(ops); + + /* + * The newly-installed schedule takes effect immediately. We do not even + * wait for the current major frame to expire. + * + * Signal a new major frame to begin. The next major frame is set up by + * the do_schedule callback function when it is next invoked. + */ + sched_priv->next_major_frame = NOW(); + + rc = 0; + + fail: + spin_unlock_irqrestore(&sched_priv->lock, flags); + return rc; +} + +/** + * This function is called by the adjust_global scheduler hook to read the + * current ARINC 653 schedule + * + * @param ops Pointer to this instance of the scheduler structure + * @return <ul> + * <li> 0 = success + * <li> !0 = error + * </ul> + */ +static int +arinc653_sched_get( + const struct scheduler *ops, + struct xen_sysctl_arinc653_schedule *schedule) +{ + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + unsigned int i; + unsigned long flags; + + spin_lock_irqsave(&sched_priv->lock, flags); + + schedule->num_sched_entries = sched_priv->num_schedule_entries; + schedule->major_frame = sched_priv->major_frame; + for ( i = 0; i < sched_priv->num_schedule_entries; i++ ) + { + memcpy(schedule->sched_entries[i].dom_handle, + sched_priv->schedule[i].dom_handle, + sizeof(sched_priv->schedule[i].dom_handle)); + schedule->sched_entries[i].vcpu_id = sched_priv->schedule[i].unit_id; + schedule->sched_entries[i].runtime = sched_priv->schedule[i].runtime; + } + + spin_unlock_irqrestore(&sched_priv->lock, flags); + + return 0; +} + +/************************************************************************** + * Scheduler callback functions * + **************************************************************************/ + +/** + * This function performs initialization for an instance of the scheduler. + * + * @param ops Pointer to this instance of the scheduler structure + * + * @return <ul> + * <li> 0 = success + * <li> !0 = error + * </ul> + */ +static int +a653sched_init(struct scheduler *ops) +{ + a653sched_priv_t *prv; + + prv = xzalloc(a653sched_priv_t); + if ( prv == NULL ) + return -ENOMEM; + + ops->sched_data = prv; + + prv->next_major_frame = 0; + spin_lock_init(&prv->lock); + INIT_LIST_HEAD(&prv->unit_list); + + return 0; +} + +/** + * This function performs deinitialization for an instance of the scheduler + * + * @param ops Pointer to this instance of the scheduler structure + */ +static void +a653sched_deinit(struct scheduler *ops) +{ + xfree(SCHED_PRIV(ops)); + ops->sched_data = NULL; +} + +/** + * This function allocates scheduler-specific data for a UNIT + * + * @param ops Pointer to this instance of the scheduler structure + * @param unit Pointer to struct sched_unit + * + * @return Pointer to the allocated data + */ +static void * +a653sched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, + void *dd) +{ + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + arinc653_unit_t *svc; + unsigned int entry; + unsigned long flags; + + /* + * Allocate memory for the ARINC 653-specific scheduler data information + * associated with the given UNIT (unit). 
+ */ + svc = xmalloc(arinc653_unit_t); + if ( svc == NULL ) + return NULL; + + spin_lock_irqsave(&sched_priv->lock, flags); + + /* + * Add every one of dom0's units to the schedule, as long as there are + * slots available. + */ + if ( unit->domain->domain_id == 0 ) + { + entry = sched_priv->num_schedule_entries; + + if ( entry < ARINC653_MAX_DOMAINS_PER_SCHEDULE ) + { + sched_priv->schedule[entry].dom_handle[0] = '\0'; + sched_priv->schedule[entry].unit_id = unit->unit_id; + sched_priv->schedule[entry].runtime = DEFAULT_TIMESLICE; + sched_priv->schedule[entry].unit = unit; + + sched_priv->major_frame += DEFAULT_TIMESLICE; + ++sched_priv->num_schedule_entries; + } + } + + /* + * Initialize our ARINC 653 scheduler-specific information for the UNIT. + * The UNIT starts "asleep." When Xen is ready for the UNIT to run, it + * will call the vcpu_wake scheduler callback function and our scheduler + * will mark the UNIT awake. + */ + svc->unit = unit; + svc->awake = 0; + if ( !is_idle_unit(unit) ) + list_add(&svc->list, &SCHED_PRIV(ops)->unit_list); + update_schedule_units(ops); + + spin_unlock_irqrestore(&sched_priv->lock, flags); + + return svc; +} + +/** + * This function frees scheduler-specific UNIT data + * + * @param ops Pointer to this instance of the scheduler structure + */ +static void +a653sched_free_udata(const struct scheduler *ops, void *priv) +{ + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + arinc653_unit_t *av = priv; + unsigned long flags; + + if (av == NULL) + return; + + spin_lock_irqsave(&sched_priv->lock, flags); + + if ( !is_idle_unit(av->unit) ) + list_del(&av->list); + + xfree(av); + update_schedule_units(ops); + + spin_unlock_irqrestore(&sched_priv->lock, flags); +} + +/** + * Xen scheduler callback function to sleep a UNIT + * + * @param ops Pointer to this instance of the scheduler structure + * @param unit Pointer to struct sched_unit + */ +static void +a653sched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) +{ + if ( AUNIT(unit) != NULL ) + AUNIT(unit)->awake = 0; + + /* + * If the UNIT being put to sleep is the same one that is currently + * running, raise a softirq to invoke the scheduler to switch domains. + */ + if ( get_sched_res(sched_unit_master(unit))->curr == unit ) + cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); +} + +/** + * Xen scheduler callback function to wake up a UNIT + * + * @param ops Pointer to this instance of the scheduler structure + * @param unit Pointer to struct sched_unit + */ +static void +a653sched_unit_wake(const struct scheduler *ops, struct sched_unit *unit) +{ + if ( AUNIT(unit) != NULL ) + AUNIT(unit)->awake = 1; + + cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); +} + +/** + * Xen scheduler callback function to select a UNIT to run. + * This is the main scheduler routine. 
+ * + * @param ops Pointer to this instance of the scheduler structure + * @param now Current time + */ +static void +a653sched_do_schedule( + const struct scheduler *ops, + struct sched_unit *prev, + s_time_t now, + bool tasklet_work_scheduled) +{ + struct sched_unit *new_task = NULL; + static unsigned int sched_index = 0; + static s_time_t next_switch_time; + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + const unsigned int cpu = sched_get_resource_cpu(smp_processor_id()); + unsigned long flags; + + spin_lock_irqsave(&sched_priv->lock, flags); + + if ( sched_priv->num_schedule_entries < 1 ) + sched_priv->next_major_frame = now + DEFAULT_TIMESLICE; + else if ( now >= sched_priv->next_major_frame ) + { + /* time to enter a new major frame + * the first time this function is called, this will be true */ + /* start with the first domain in the schedule */ + sched_index = 0; + sched_priv->next_major_frame = now + sched_priv->major_frame; + next_switch_time = now + sched_priv->schedule[0].runtime; + } + else + { + while ( (now >= next_switch_time) + && (sched_index < sched_priv->num_schedule_entries) ) + { + /* time to switch to the next domain in this major frame */ + sched_index++; + next_switch_time += sched_priv->schedule[sched_index].runtime; + } + } + + /* + * If we exhausted the domains in the schedule and still have time left + * in the major frame then switch next at the next major frame. + */ + if ( sched_index >= sched_priv->num_schedule_entries ) + next_switch_time = sched_priv->next_major_frame; + + /* + * If there are more domains to run in the current major frame, set + * new_task equal to the address of next domain's sched_unit structure. + * Otherwise, set new_task equal to the address of the idle task's + * sched_unit structure. + */ + new_task = (sched_index < sched_priv->num_schedule_entries) + ? sched_priv->schedule[sched_index].unit + : IDLETASK(cpu); + + /* Check to see if the new task can be run (awake & runnable). */ + if ( !((new_task != NULL) + && (AUNIT(new_task) != NULL) + && AUNIT(new_task)->awake + && unit_runnable_state(new_task)) ) + new_task = IDLETASK(cpu); + BUG_ON(new_task == NULL); + + /* + * Check to make sure we did not miss a major frame. + * This is a good test for robust partitioning. + */ + BUG_ON(now >= sched_priv->next_major_frame); + + spin_unlock_irqrestore(&sched_priv->lock, flags); + + /* Tasklet work (which runs in idle UNIT context) overrides all else. */ + if ( tasklet_work_scheduled ) + new_task = IDLETASK(cpu); + + /* Running this task would result in a migration */ + if ( !is_idle_unit(new_task) + && (sched_unit_master(new_task) != cpu) ) + new_task = IDLETASK(cpu); + + /* + * Return the amount of time the next domain has to run and the address + * of the selected task's UNIT structure. + */ + prev->next_time = next_switch_time - now; + prev->next_task = new_task; + new_task->migrated = false; + + BUG_ON(prev->next_time <= 0); +} + +/** + * Xen scheduler callback function to select a resource for the UNIT to run on + * + * @param ops Pointer to this instance of the scheduler structure + * @param unit Pointer to struct sched_unit + * + * @return Scheduler resource to run on + */ +static struct sched_resource * +a653sched_pick_resource(const struct scheduler *ops, + const struct sched_unit *unit) +{ + cpumask_t *online; + unsigned int cpu; + + /* + * If present, prefer unit's current processor, else + * just find the first valid unit. 
+ */ + online = cpupool_domain_master_cpumask(unit->domain); + + cpu = cpumask_first(online); + + if ( cpumask_test_cpu(sched_unit_master(unit), online) + || (cpu >= nr_cpu_ids) ) + cpu = sched_unit_master(unit); + + return get_sched_res(cpu); +} + +/** + * Xen scheduler callback to change the scheduler of a cpu + * + * @param new_ops Pointer to this instance of the scheduler structure + * @param cpu The cpu that is changing scheduler + * @param pdata scheduler specific PCPU data (we don't have any) + * @param vdata scheduler specific UNIT data of the idle unit + */ +static spinlock_t * +a653_switch_sched(struct scheduler *new_ops, unsigned int cpu, + void *pdata, void *vdata) +{ + struct sched_resource *sr = get_sched_res(cpu); + arinc653_unit_t *svc = vdata; + + ASSERT(!pdata && svc && is_idle_unit(svc->unit)); + + sched_idle_unit(cpu)->priv = vdata; + + return &sr->_lock; +} + +/** + * Xen scheduler callback function to perform a global (not domain-specific) + * adjustment. It is used by the ARINC 653 scheduler to put in place a new + * ARINC 653 schedule or to retrieve the schedule currently in place. + * + * @param ops Pointer to this instance of the scheduler structure + * @param sc Pointer to the scheduler operation specified by Domain 0 + */ +static int +a653sched_adjust_global(const struct scheduler *ops, + struct xen_sysctl_scheduler_op *sc) +{ + struct xen_sysctl_arinc653_schedule local_sched; + int rc = -EINVAL; + + switch ( sc->cmd ) + { + case XEN_SYSCTL_SCHEDOP_putinfo: + if ( copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1) ) + { + rc = -EFAULT; + break; + } + + rc = arinc653_sched_set(ops, &local_sched); + break; + case XEN_SYSCTL_SCHEDOP_getinfo: + memset(&local_sched, -1, sizeof(local_sched)); + rc = arinc653_sched_get(ops, &local_sched); + if ( rc ) + break; + + if ( copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1) ) + rc = -EFAULT; + break; + } + + return rc; +} + +/** + * This structure defines our scheduler for Xen. + * The entries tell Xen where to find our scheduler-specific + * callback functions. + * The symbol must be visible to the rest of Xen at link time. 
+ */ +static const struct scheduler sched_arinc653_def = { + .name = "ARINC 653 Scheduler", + .opt_name = "arinc653", + .sched_id = XEN_SCHEDULER_ARINC653, + .sched_data = NULL, + + .init = a653sched_init, + .deinit = a653sched_deinit, + + .free_udata = a653sched_free_udata, + .alloc_udata = a653sched_alloc_udata, + + .insert_unit = NULL, + .remove_unit = NULL, + + .sleep = a653sched_unit_sleep, + .wake = a653sched_unit_wake, + .yield = NULL, + .context_saved = NULL, + + .do_schedule = a653sched_do_schedule, + + .pick_resource = a653sched_pick_resource, + + .switch_sched = a653_switch_sched, + + .adjust = NULL, + .adjust_global = a653sched_adjust_global, + + .dump_settings = NULL, + .dump_cpu_state = NULL, +}; + +REGISTER_SCHEDULER(sched_arinc653_def); + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/common/sched/compat.c b/xen/common/sched/compat.c new file mode 100644 index 0000000000..040b4caca2 --- /dev/null +++ b/xen/common/sched/compat.c @@ -0,0 +1,55 @@ +/**************************************************************************** + * schedule.c + * + */ + +#include <compat/sched.h> + +#define COMPAT +#define ret_t int + +#define do_sched_op compat_sched_op + +#define xen_sched_pin_override sched_pin_override +CHECK_sched_pin_override; +#undef xen_sched_pin_override + +#define xen_sched_shutdown sched_shutdown +CHECK_sched_shutdown; +#undef xen_sched_shutdown + +#define xen_sched_remote_shutdown sched_remote_shutdown +CHECK_sched_remote_shutdown; +#undef xen_sched_remote_shutdown + +static int compat_poll(struct compat_sched_poll *compat) +{ + struct sched_poll native; + +#define XLAT_sched_poll_HNDL_ports(_d_, _s_) \ + guest_from_compat_handle((_d_)->ports, (_s_)->ports) + XLAT_sched_poll(&native, compat); +#undef XLAT_sched_poll_HNDL_ports + + return do_poll(&native); +} + +#define do_poll compat_poll +#define sched_poll compat_sched_poll + +#include "core.c" + +int compat_set_timer_op(u32 lo, s32 hi) +{ + return do_set_timer_op(((s64)hi << 32) | lo); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c new file mode 100644 index 0000000000..4d8eb4c617 --- /dev/null +++ b/xen/common/sched/core.c @@ -0,0 +1,3144 @@ +/**************************************************************************** + * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge + * (C) 2002-2003 University of Cambridge + * (C) 2004 - Mark Williamson - Intel Research Cambridge + **************************************************************************** + * + * File: common/schedule.c + * Author: Rolf Neugebauer & Keir Fraser + * Updated for generic API by Mark Williamson + * + * Description: Generic CPU scheduling code + * implements support functionality for the Xen scheduler API. 
+ * + */ + +#ifndef COMPAT +#include <xen/init.h> +#include <xen/lib.h> +#include <xen/sched.h> +#include <xen/domain.h> +#include <xen/delay.h> +#include <xen/event.h> +#include <xen/time.h> +#include <xen/timer.h> +#include <xen/perfc.h> +#include <xen/sched-if.h> +#include <xen/softirq.h> +#include <xen/trace.h> +#include <xen/mm.h> +#include <xen/err.h> +#include <xen/guest_access.h> +#include <xen/hypercall.h> +#include <xen/multicall.h> +#include <xen/cpu.h> +#include <xen/preempt.h> +#include <xen/event.h> +#include <public/sched.h> +#include <xsm/xsm.h> +#include <xen/err.h> + +#ifdef CONFIG_XEN_GUEST +#include <asm/guest.h> +#else +#define pv_shim false +#endif + +/* opt_sched: scheduler - default to configured value */ +static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT; +string_param("sched", opt_sched); + +/* if sched_smt_power_savings is set, + * scheduler will give preferrence to partially idle package compared to + * the full idle package, when picking pCPU to schedule vCPU. + */ +bool_t sched_smt_power_savings = 0; +boolean_param("sched_smt_power_savings", sched_smt_power_savings); + +/* Default scheduling rate limit: 1ms + * The behavior when sched_ratelimit_us is greater than sched_credit_tslice_ms is undefined + * */ +int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; +integer_param("sched_ratelimit_us", sched_ratelimit_us); + +/* Number of vcpus per struct sched_unit. */ +bool __read_mostly sched_disable_smt_switching; +cpumask_t sched_res_mask; + +/* Common lock for free cpus. */ +static DEFINE_SPINLOCK(sched_free_cpu_lock); + +/* Various timer handlers. */ +static void s_timer_fn(void *unused); +static void vcpu_periodic_timer_fn(void *data); +static void vcpu_singleshot_timer_fn(void *data); +static void poll_timer_fn(void *data); + +/* This is global for now so that private implementations can reach it */ +DEFINE_PER_CPU_READ_MOSTLY(struct sched_resource *, sched_res); +static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, sched_res_idx); +DEFINE_RCU_READ_LOCK(sched_res_rculock); + +/* Scratch space for cpumasks. */ +DEFINE_PER_CPU(cpumask_t, cpumask_scratch); + +/* How many urgent vcpus. */ +DEFINE_PER_CPU(atomic_t, sched_urgent_count); + +extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[]; +#define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array) +#define schedulers __start_schedulers_array + +static struct scheduler __read_mostly ops; + +static bool scheduler_active; + +static void sched_set_affinity( + struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft); + +static struct sched_resource * +sched_idle_res_pick(const struct scheduler *ops, const struct sched_unit *unit) +{ + return unit->res; +} + +static void * +sched_idle_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, + void *dd) +{ + /* Any non-NULL pointer is fine here. 
*/ + return ZERO_BLOCK_PTR; +} + +static void +sched_idle_free_udata(const struct scheduler *ops, void *priv) +{ +} + +static void sched_idle_schedule( + const struct scheduler *ops, struct sched_unit *unit, s_time_t now, + bool tasklet_work_scheduled) +{ + const unsigned int cpu = smp_processor_id(); + + unit->next_time = -1; + unit->next_task = sched_idle_unit(cpu); +} + +static struct scheduler sched_idle_ops = { + .name = "Idle Scheduler", + .opt_name = "idle", + .sched_data = NULL, + + .pick_resource = sched_idle_res_pick, + .do_schedule = sched_idle_schedule, + + .alloc_udata = sched_idle_alloc_udata, + .free_udata = sched_idle_free_udata, +}; + +static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit, + unsigned int cpu) +{ + unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu); + const struct domain *d = unit->domain; + + return (idx < d->max_vcpus) ? d->vcpu[idx] : NULL; +} + +static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit, + unsigned int cpu) +{ + struct vcpu *v = unit2vcpu_cpu(unit, cpu); + + return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu]; +} + +static inline struct scheduler *dom_scheduler(const struct domain *d) +{ + if ( likely(d->cpupool != NULL) ) + return d->cpupool->sched; + + /* + * If d->cpupool is NULL, this is the idle domain. This is special + * because the idle domain does not really belong to any cpupool, and, + * hence, does not really have a scheduler. + * + * This is (should be!) only called like this for allocating the idle + * vCPUs for the first time, during boot, in which case what we want + * is the default scheduler that has been, choosen at boot. + */ + ASSERT(is_idle_domain(d)); + return &ops; +} + +static inline struct scheduler *unit_scheduler(const struct sched_unit *unit) +{ + struct domain *d = unit->domain; + + if ( likely(d->cpupool != NULL) ) + return d->cpupool->sched; + + /* + * If d->cpupool is NULL, this is a unit of the idle domain. And this + * case is special because the idle domain does not really belong to + * a cpupool and, hence, doesn't really have a scheduler). In fact, its + * units (may) run on pCPUs which are in different pools, with different + * schedulers. + * + * What we want, in this case, is the scheduler of the pCPU where this + * particular idle unit is running. And, since unit->res never changes + * for idle units, it is safe to use it, with no locks, to figure that out. 
+ */ + + ASSERT(is_idle_domain(d)); + return unit->res->scheduler; +} + +static inline struct scheduler *vcpu_scheduler(const struct vcpu *v) +{ + return unit_scheduler(v->sched_unit); +} +#define VCPU2ONLINE(_v) cpupool_domain_master_cpumask((_v)->domain) + +static inline void trace_runstate_change(struct vcpu *v, int new_state) +{ + struct { uint32_t vcpu:16, domain:16; } d; + uint32_t event; + + if ( likely(!tb_init_done) ) + return; + + d.vcpu = v->vcpu_id; + d.domain = v->domain->domain_id; + + event = TRC_SCHED_RUNSTATE_CHANGE; + event |= ( v->runstate.state & 0x3 ) << 8; + event |= ( new_state & 0x3 ) << 4; + + __trace_var(event, 1/*tsc*/, sizeof(d), &d); +} + +static inline void trace_continue_running(struct vcpu *v) +{ + struct { uint32_t vcpu:16, domain:16; } d; + + if ( likely(!tb_init_done) ) + return; + + d.vcpu = v->vcpu_id; + d.domain = v->domain->domain_id; + + __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d); +} + +static inline void vcpu_urgent_count_update(struct vcpu *v) +{ + if ( is_idle_vcpu(v) ) + return; + + if ( unlikely(v->is_urgent) ) + { + if ( !(v->pause_flags & VPF_blocked) || + !test_bit(v->vcpu_id, v->domain->poll_mask) ) + { + v->is_urgent = 0; + atomic_dec(&per_cpu(sched_urgent_count, v->processor)); + } + } + else + { + if ( unlikely(v->pause_flags & VPF_blocked) && + unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) ) + { + v->is_urgent = 1; + atomic_inc(&per_cpu(sched_urgent_count, v->processor)); + } + } +} + +static inline void vcpu_runstate_change( + struct vcpu *v, int new_state, s_time_t new_entry_time) +{ + s_time_t delta; + struct sched_unit *unit = v->sched_unit; + + ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock)); + if ( v->runstate.state == new_state ) + return; + + vcpu_urgent_count_update(v); + + trace_runstate_change(v, new_state); + + if ( !is_idle_vcpu(v) ) + { + unit->runstate_cnt[v->runstate.state]--; + unit->runstate_cnt[new_state]++; + } + + delta = new_entry_time - v->runstate.state_entry_time; + if ( delta > 0 ) + { + v->runstate.time[v->runstate.state] += delta; + v->runstate.state_entry_time = new_entry_time; + } + + v->runstate.state = new_state; +} + +void sched_guest_idle(void (*idle) (void), unsigned int cpu) +{ + /* + * Another vcpu of the unit is active in guest context while this one is + * idle. In case of a scheduling event we don't want to have high latencies + * due to a cpu needing to wake up from deep C state for joining the + * rendezvous, so avoid those deep C states by incrementing the urgent + * count of the cpu. + */ + atomic_inc(&per_cpu(sched_urgent_count, cpu)); + idle(); + atomic_dec(&per_cpu(sched_urgent_count, cpu)); +} + +void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate) +{ + spinlock_t *lock; + s_time_t delta; + + rcu_read_lock(&sched_res_rculock); + + lock = likely(v == current) ? NULL : unit_schedule_lock_irq(v->sched_unit); + memcpy(runstate, &v->runstate, sizeof(*runstate)); + delta = NOW() - runstate->state_entry_time; + if ( delta > 0 ) + runstate->time[runstate->state] += delta; + + if ( unlikely(lock != NULL) ) + unit_schedule_unlock_irq(lock, v->sched_unit); + + rcu_read_unlock(&sched_res_rculock); +} + +uint64_t get_cpu_idle_time(unsigned int cpu) +{ + struct vcpu_runstate_info state = { 0 }; + struct vcpu *v = idle_vcpu[cpu]; + + if ( cpu_online(cpu) && v ) + vcpu_runstate_get(v, &state); + + return state.time[RUNSTATE_running]; +} + +/* + * If locks are different, take the one with the lower address first. 
+ * This avoids dead- or live-locks when this code is running on both + * cpus at the same time. + */ +static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2, + unsigned long *flags) +{ + if ( lock1 == lock2 ) + { + spin_lock_irqsave(lock1, *flags); + } + else if ( lock1 < lock2 ) + { + spin_lock_irqsave(lock1, *flags); + spin_lock(lock2); + } + else + { + spin_lock_irqsave(lock2, *flags); + spin_lock(lock1); + } +} + +static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2, + unsigned long flags) +{ + if ( lock1 != lock2 ) + spin_unlock(lock2); + spin_unlock_irqrestore(lock1, flags); +} + +static void sched_free_unit_mem(struct sched_unit *unit) +{ + struct sched_unit *prev_unit; + struct domain *d = unit->domain; + + if ( d->sched_unit_list == unit ) + d->sched_unit_list = unit->next_in_list; + else + { + for_each_sched_unit ( d, prev_unit ) + { + if ( prev_unit->next_in_list == unit ) + { + prev_unit->next_in_list = unit->next_in_list; + break; + } + } + } + + free_cpumask_var(unit->cpu_hard_affinity); + free_cpumask_var(unit->cpu_hard_affinity_saved); + free_cpumask_var(unit->cpu_soft_affinity); + + xfree(unit); +} + +static void sched_free_unit(struct sched_unit *unit, struct vcpu *v) +{ + struct vcpu *vunit; + unsigned int cnt = 0; + + /* Don't count to be released vcpu, might be not in vcpu list yet. */ + for_each_sched_unit_vcpu ( unit, vunit ) + if ( vunit != v ) + cnt++; + + v->sched_unit = NULL; + unit->runstate_cnt[v->runstate.state]--; + + if ( unit->vcpu_list == v ) + unit->vcpu_list = v->next_in_list; + + if ( !cnt ) + sched_free_unit_mem(unit); +} + +static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v) +{ + v->sched_unit = unit; + + /* All but idle vcpus are allocated with sequential vcpu_id. */ + if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id ) + { + unit->vcpu_list = v; + /* + * unit_id is always the same as lowest vcpu_id of unit. + * This is used for stopping for_each_sched_unit_vcpu() loop and in + * order to support cpupools with different granularities. 
+ */ + unit->unit_id = v->vcpu_id; + } + unit->runstate_cnt[v->runstate.state]++; +} + +static struct sched_unit *sched_alloc_unit_mem(void) +{ + struct sched_unit *unit; + + unit = xzalloc(struct sched_unit); + if ( !unit ) + return NULL; + + if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) || + !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) || + !zalloc_cpumask_var(&unit->cpu_soft_affinity) ) + { + sched_free_unit_mem(unit); + unit = NULL; + } + + return unit; +} + +static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d) +{ + struct sched_unit **prev_unit; + + unit->domain = d; + + for ( prev_unit = &d->sched_unit_list; *prev_unit; + prev_unit = &(*prev_unit)->next_in_list ) + if ( (*prev_unit)->next_in_list && + (*prev_unit)->next_in_list->unit_id > unit->unit_id ) + break; + + unit->next_in_list = *prev_unit; + *prev_unit = unit; +} + +static struct sched_unit *sched_alloc_unit(struct vcpu *v) +{ + struct sched_unit *unit; + struct domain *d = v->domain; + unsigned int gran = cpupool_get_granularity(d->cpupool); + + for_each_sched_unit ( d, unit ) + if ( unit->unit_id / gran == v->vcpu_id / gran ) + break; + + if ( unit ) + { + sched_unit_add_vcpu(unit, v); + return unit; + } + + if ( (unit = sched_alloc_unit_mem()) == NULL ) + return NULL; + + sched_unit_add_vcpu(unit, v); + sched_domain_insert_unit(unit, d); + + return unit; +} + +static unsigned int sched_select_initial_cpu(const struct vcpu *v) +{ + const struct domain *d = v->domain; + nodeid_t node; + spinlock_t *lock; + unsigned long flags; + unsigned int cpu_ret, cpu = smp_processor_id(); + cpumask_t *cpus = cpumask_scratch_cpu(cpu); + + lock = pcpu_schedule_lock_irqsave(cpu, &flags); + cpumask_clear(cpus); + for_each_node_mask ( node, d->node_affinity ) + cpumask_or(cpus, cpus, &node_to_cpumask(node)); + cpumask_and(cpus, cpus, d->cpupool->cpu_valid); + if ( cpumask_empty(cpus) ) + cpumask_copy(cpus, d->cpupool->cpu_valid); + + if ( v->vcpu_id == 0 ) + cpu_ret = cpumask_first(cpus); + else + { + /* We can rely on previous vcpu being available. */ + ASSERT(!is_idle_domain(d)); + + cpu_ret = cpumask_cycle(d->vcpu[v->vcpu_id - 1]->processor, cpus); + } + + pcpu_schedule_unlock_irqrestore(lock, flags, cpu); + + return cpu_ret; +} + +int sched_init_vcpu(struct vcpu *v) +{ + struct domain *d = v->domain; + struct sched_unit *unit; + unsigned int processor; + + if ( (unit = sched_alloc_unit(v)) == NULL ) + return 1; + + if ( is_idle_domain(d) ) + processor = v->vcpu_id; + else + processor = sched_select_initial_cpu(v); + + /* Initialise the per-vcpu timers. */ + spin_lock_init(&v->periodic_timer_lock); + init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor); + init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor); + init_timer(&v->poll_timer, poll_timer_fn, v, processor); + + /* If this is not the first vcpu of the unit we are done. */ + if ( unit->priv != NULL ) + { + v->processor = processor; + return 0; + } + + rcu_read_lock(&sched_res_rculock); + + /* The first vcpu of an unit can be set via sched_set_res(). */ + sched_set_res(unit, get_sched_res(processor)); + + unit->priv = sched_alloc_udata(dom_scheduler(d), unit, d->sched_priv); + if ( unit->priv == NULL ) + { + sched_free_unit(unit, v); + rcu_read_unlock(&sched_res_rculock); + return 1; + } + + /* + * Initialize affinity settings. The idler, and potentially + * domain-0 VCPUs, are pinned onto their respective physical CPUs. 
+ */ + if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) ) + sched_set_affinity(unit, cpumask_of(processor), &cpumask_all); + else + sched_set_affinity(unit, &cpumask_all, &cpumask_all); + + /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */ + if ( is_idle_domain(d) ) + { + get_sched_res(v->processor)->curr = unit; + get_sched_res(v->processor)->sched_unit_idle = unit; + v->is_running = 1; + unit->is_running = true; + unit->state_entry_time = NOW(); + } + else + { + sched_insert_unit(dom_scheduler(d), unit); + } + + rcu_read_unlock(&sched_res_rculock); + + return 0; +} + +static void vcpu_move_irqs(struct vcpu *v) +{ + arch_move_irqs(v); + evtchn_move_pirqs(v); +} + +static void sched_move_irqs(const struct sched_unit *unit) +{ + struct vcpu *v; + + for_each_sched_unit_vcpu ( unit, v ) + vcpu_move_irqs(v); +} + +int sched_move_domain(struct domain *d, struct cpupool *c) +{ + struct vcpu *v; + struct sched_unit *unit; + unsigned int new_p, unit_idx; + void **unit_priv; + void *domdata; + void *unitdata; + struct scheduler *old_ops; + void *old_domdata; + unsigned int gran = cpupool_get_granularity(c); + int ret = 0; + + for_each_vcpu ( d, v ) + { + if ( v->affinity_broken ) + return -EBUSY; + } + + rcu_read_lock(&sched_res_rculock); + + domdata = sched_alloc_domdata(c->sched, d); + if ( IS_ERR(domdata) ) + { + ret = PTR_ERR(domdata); + goto out; + } + + unit_priv = xzalloc_array(void *, DIV_ROUND_UP(d->max_vcpus, gran)); + if ( unit_priv == NULL ) + { + sched_free_domdata(c->sched, domdata); + ret = -ENOMEM; + goto out; + } + + unit_idx = 0; + for_each_sched_unit ( d, unit ) + { + unit_priv[unit_idx] = sched_alloc_udata(c->sched, unit, domdata); + if ( unit_priv[unit_idx] == NULL ) + { + for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ ) + sched_free_udata(c->sched, unit_priv[unit_idx]); + xfree(unit_priv); + sched_free_domdata(c->sched, domdata); + ret = -ENOMEM; + goto out; + } + unit_idx++; + } + + domain_pause(d); + + old_ops = dom_scheduler(d); + old_domdata = d->sched_priv; + + for_each_sched_unit ( d, unit ) + { + sched_remove_unit(old_ops, unit); + } + + d->cpupool = c; + d->sched_priv = domdata; + + new_p = cpumask_first(c->cpu_valid); + unit_idx = 0; + for_each_sched_unit ( d, unit ) + { + spinlock_t *lock; + unsigned int unit_p = new_p; + + unitdata = unit->priv; + + for_each_sched_unit_vcpu ( unit, v ) + { + migrate_timer(&v->periodic_timer, new_p); + migrate_timer(&v->singleshot_timer, new_p); + migrate_timer(&v->poll_timer, new_p); + new_p = cpumask_cycle(new_p, c->cpu_valid); + } + + lock = unit_schedule_lock_irq(unit); + + sched_set_affinity(unit, &cpumask_all, &cpumask_all); + + sched_set_res(unit, get_sched_res(unit_p)); + /* + * With v->processor modified we must not + * - make any further changes assuming we hold the scheduler lock, + * - use unit_schedule_unlock_irq(). 
+ */ + spin_unlock_irq(lock); + + unit->priv = unit_priv[unit_idx]; + if ( !d->is_dying ) + sched_move_irqs(unit); + + sched_insert_unit(c->sched, unit); + + sched_free_udata(old_ops, unitdata); + + unit_idx++; + } + + domain_update_node_affinity(d); + + domain_unpause(d); + + sched_free_domdata(old_ops, old_domdata); + + xfree(unit_priv); + +out: + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +void sched_destroy_vcpu(struct vcpu *v) +{ + struct sched_unit *unit = v->sched_unit; + + kill_timer(&v->periodic_timer); + kill_timer(&v->singleshot_timer); + kill_timer(&v->poll_timer); + if ( test_and_clear_bool(v->is_urgent) ) + atomic_dec(&per_cpu(sched_urgent_count, v->processor)); + /* + * Vcpus are being destroyed top-down. So being the first vcpu of an unit + * is the same as being the only one. + */ + if ( unit->vcpu_list == v ) + { + rcu_read_lock(&sched_res_rculock); + + sched_remove_unit(vcpu_scheduler(v), unit); + sched_free_udata(vcpu_scheduler(v), unit->priv); + sched_free_unit(unit, v); + + rcu_read_unlock(&sched_res_rculock); + } +} + +int sched_init_domain(struct domain *d, int poolid) +{ + void *sdom; + int ret; + + ASSERT(d->cpupool == NULL); + ASSERT(d->domain_id < DOMID_FIRST_RESERVED); + + if ( (ret = cpupool_add_domain(d, poolid)) ) + return ret; + + SCHED_STAT_CRANK(dom_init); + TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id); + + rcu_read_lock(&sched_res_rculock); + + sdom = sched_alloc_domdata(dom_scheduler(d), d); + + rcu_read_unlock(&sched_res_rculock); + + if ( IS_ERR(sdom) ) + return PTR_ERR(sdom); + + d->sched_priv = sdom; + + return 0; +} + +void sched_destroy_domain(struct domain *d) +{ + ASSERT(d->domain_id < DOMID_FIRST_RESERVED); + + if ( d->cpupool ) + { + SCHED_STAT_CRANK(dom_destroy); + TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id); + + rcu_read_lock(&sched_res_rculock); + + sched_free_domdata(dom_scheduler(d), d->sched_priv); + d->sched_priv = NULL; + + rcu_read_unlock(&sched_res_rculock); + + cpupool_rm_domain(d); + } +} + +static void vcpu_sleep_nosync_locked(struct vcpu *v) +{ + struct sched_unit *unit = v->sched_unit; + + ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock)); + + if ( likely(!vcpu_runnable(v)) ) + { + if ( v->runstate.state == RUNSTATE_runnable ) + vcpu_runstate_change(v, RUNSTATE_offline, NOW()); + + /* Only put unit to sleep in case all vcpus are not runnable. 
*/ + if ( likely(!unit_runnable(unit)) ) + sched_sleep(unit_scheduler(unit), unit); + else if ( unit_running(unit) > 1 && v->is_running && + !v->force_context_switch ) + { + v->force_context_switch = true; + cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ); + } + } +} + +void vcpu_sleep_nosync(struct vcpu *v) +{ + unsigned long flags; + spinlock_t *lock; + + TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); + + rcu_read_lock(&sched_res_rculock); + + lock = unit_schedule_lock_irqsave(v->sched_unit, &flags); + + vcpu_sleep_nosync_locked(v); + + unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit); + + rcu_read_unlock(&sched_res_rculock); +} + +void vcpu_sleep_sync(struct vcpu *v) +{ + vcpu_sleep_nosync(v); + + while ( !vcpu_runnable(v) && v->is_running ) + cpu_relax(); + + sync_vcpu_execstate(v); +} + +void vcpu_wake(struct vcpu *v) +{ + unsigned long flags; + spinlock_t *lock; + struct sched_unit *unit = v->sched_unit; + + TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id); + + rcu_read_lock(&sched_res_rculock); + + lock = unit_schedule_lock_irqsave(unit, &flags); + + if ( likely(vcpu_runnable(v)) ) + { + if ( v->runstate.state >= RUNSTATE_blocked ) + vcpu_runstate_change(v, RUNSTATE_runnable, NOW()); + /* + * Call sched_wake() unconditionally, even if unit is running already. + * We might have not been de-scheduled after vcpu_sleep_nosync_locked() + * and are now to be woken up again. + */ + sched_wake(unit_scheduler(unit), unit); + if ( unit->is_running && !v->is_running && !v->force_context_switch ) _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/xen-changelog