
[Xen-changelog] [xen master] xen/sched: move schedulers and cpupool coding to dedicated directory



commit 6cb4b01c033b7abc3e7175501330dfb01fb09da5
Author:     Juergen Gross <jgross@xxxxxxxx>
AuthorDate: Wed Jan 22 15:06:43 2020 +0100
Commit:     Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
CommitDate: Wed Jan 22 17:37:11 2020 +0000

    xen/sched: move schedulers and cpupool coding to dedicated directory
    
    Move sched*c and cpupool.c to a new directory common/sched.
    
    Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
    Reviewed-by: Dario Faggioli <dfaggioli@xxxxxxxx>
---
 MAINTAINERS                  |    8 +-
 xen/common/Kconfig           |   66 +-
 xen/common/Makefile          |    8 +-
 xen/common/compat/schedule.c |   55 -
 xen/common/cpupool.c         |  979 ----------
 xen/common/sched/Kconfig     |   65 +
 xen/common/sched/Makefile    |    7 +
 xen/common/sched/arinc653.c  |  739 ++++++++
 xen/common/sched/compat.c    |   55 +
 xen/common/sched/core.c      | 3144 ++++++++++++++++++++++++++++++++
 xen/common/sched/cpupool.c   |  979 ++++++++++
 xen/common/sched/credit.c    | 2284 +++++++++++++++++++++++
 xen/common/sched/credit2.c   | 4122 ++++++++++++++++++++++++++++++++++++++++++
 xen/common/sched/null.c      | 1034 +++++++++++
 xen/common/sched/rt.c        | 1571 ++++++++++++++++
 xen/common/sched_arinc653.c  |  739 --------
 xen/common/sched_credit.c    | 2284 -----------------------
 xen/common/sched_credit2.c   | 4122 ------------------------------------------
 xen/common/sched_null.c      | 1034 -----------
 xen/common/sched_rt.c        | 1571 ----------------
 xen/common/schedule.c        | 3144 --------------------------------
 21 files changed, 14006 insertions(+), 14004 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index a91080cde5..dadcfb63d8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -174,7 +174,7 @@ M:  Josh Whitehead <josh.whitehead@xxxxxxxxxxxxxxx>
 M:     Stewart Hildebrand <stewart.hildebrand@xxxxxxxxxxxxxxx>
 S:     Supported
 L:     xen-devel@xxxxxxxxxxxxxxx
-F:     xen/common/sched_arinc653.c
+F:     xen/common/sched/arinc653.c
 F:     tools/libxc/xc_arinc653.c
 
 ARM (W/ VIRTUALISATION EXTENSIONS) ARCHITECTURE
@@ -218,7 +218,7 @@ CPU POOLS
 M:     Juergen Gross <jgross@xxxxxxxx>
 M:     Dario Faggioli <dfaggioli@xxxxxxxx>
 S:     Supported
-F:     xen/common/cpupool.c
+F:     xen/common/sched/cpupool.c
 
 DEVICE TREE
 M:     Stefano Stabellini <sstabellini@xxxxxxxxxx>
@@ -384,13 +384,13 @@ RTDS SCHEDULER
 M:     Dario Faggioli <dfaggioli@xxxxxxxx>
 M:     Meng Xu <mengxu@xxxxxxxxxxxxx>
 S:     Supported
-F:     xen/common/sched_rt.c
+F:     xen/common/sched/rt.c
 
 SCHEDULING
 M:     George Dunlap <george.dunlap@xxxxxxxxxxxxx>
 M:     Dario Faggioli <dfaggioli@xxxxxxxx>
 S:     Supported
-F:     xen/common/sched*
+F:     xen/common/sched/
 
 SEABIOS UPSTREAM
 M:     Wei Liu <wl@xxxxxxx>
diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index b3d161d057..9d6d09eb37 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -275,71 +275,7 @@ config ARGO
 
          If unsure, say N.
 
-menu "Schedulers"
-       visible if EXPERT = "y"
-
-config SCHED_CREDIT
-       bool "Credit scheduler support"
-       default y
-       ---help---
-         The traditional credit scheduler is a general purpose scheduler.
-
-config SCHED_CREDIT2
-       bool "Credit2 scheduler support"
-       default y
-       ---help---
-         The credit2 scheduler is a general purpose scheduler that is
-         optimized for lower latency and higher VM density.
-
-config SCHED_RTDS
-       bool "RTDS scheduler support (EXPERIMENTAL)"
-       default y
-       ---help---
-         The RTDS scheduler is a soft and firm real-time scheduler for
-         multicore, targeted for embedded, automotive, graphics and gaming
-         in the cloud, and general low-latency workloads.
-
-config SCHED_ARINC653
-       bool "ARINC653 scheduler support (EXPERIMENTAL)"
-       default DEBUG
-       ---help---
-         The ARINC653 scheduler is a hard real-time scheduler for single
-         cores, targeted for avionics, drones, and medical devices.
-
-config SCHED_NULL
-       bool "Null scheduler support (EXPERIMENTAL)"
-       default y
-       ---help---
-         The null scheduler is a static, zero overhead scheduler,
-         for when there always are less vCPUs than pCPUs, typically
-         in embedded or HPC scenarios.
-
-choice
-       prompt "Default Scheduler?"
-       default SCHED_CREDIT2_DEFAULT
-
-       config SCHED_CREDIT_DEFAULT
-               bool "Credit Scheduler" if SCHED_CREDIT
-       config SCHED_CREDIT2_DEFAULT
-               bool "Credit2 Scheduler" if SCHED_CREDIT2
-       config SCHED_RTDS_DEFAULT
-               bool "RT Scheduler" if SCHED_RTDS
-       config SCHED_ARINC653_DEFAULT
-               bool "ARINC653 Scheduler" if SCHED_ARINC653
-       config SCHED_NULL_DEFAULT
-               bool "Null Scheduler" if SCHED_NULL
-endchoice
-
-config SCHED_DEFAULT
-       string
-       default "credit" if SCHED_CREDIT_DEFAULT
-       default "credit2" if SCHED_CREDIT2_DEFAULT
-       default "rtds" if SCHED_RTDS_DEFAULT
-       default "arinc653" if SCHED_ARINC653_DEFAULT
-       default "null" if SCHED_NULL_DEFAULT
-       default "credit2"
-
-endmenu
+source "common/sched/Kconfig"
 
 config CRYPTO
        bool
diff --git a/xen/common/Makefile b/xen/common/Makefile
index 62b34e69e9..2abb8250b0 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -3,7 +3,6 @@ obj-y += bitmap.o
 obj-y += bsearch.o
 obj-$(CONFIG_CORE_PARKING) += core_parking.o
 obj-y += cpu.o
-obj-y += cpupool.o
 obj-$(CONFIG_DEBUG_TRACE) += debugtrace.o
 obj-$(CONFIG_HAS_DEVICE_TREE) += device_tree.o
 obj-y += domctl.o
@@ -38,12 +37,6 @@ obj-y += radix-tree.o
 obj-y += rbtree.o
 obj-y += rcupdate.o
 obj-y += rwlock.o
-obj-$(CONFIG_SCHED_ARINC653) += sched_arinc653.o
-obj-$(CONFIG_SCHED_CREDIT) += sched_credit.o
-obj-$(CONFIG_SCHED_CREDIT2) += sched_credit2.o
-obj-$(CONFIG_SCHED_RTDS) += sched_rt.o
-obj-$(CONFIG_SCHED_NULL) += sched_null.o
-obj-y += schedule.o
 obj-y += shutdown.o
 obj-y += softirq.o
 obj-y += sort.o
@@ -74,6 +67,7 @@ obj-$(CONFIG_COMPAT) += $(addprefix compat/,domain.o kernel.o memory.o multicall
 extra-y := symbols-dummy.o
 
 subdir-$(CONFIG_COVERAGE) += coverage
+subdir-y += sched
 subdir-$(CONFIG_UBSAN) += ubsan
 
 subdir-$(CONFIG_NEEDS_LIBELF) += libelf
diff --git a/xen/common/compat/schedule.c b/xen/common/compat/schedule.c
deleted file mode 100644
index 8b6e6f107d..0000000000
--- a/xen/common/compat/schedule.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/****************************************************************************
- * schedule.c
- *
- */
-
-#include <compat/sched.h>
-
-#define COMPAT
-#define ret_t int
-
-#define do_sched_op compat_sched_op
-
-#define xen_sched_pin_override sched_pin_override
-CHECK_sched_pin_override;
-#undef xen_sched_pin_override
-
-#define xen_sched_shutdown sched_shutdown
-CHECK_sched_shutdown;
-#undef xen_sched_shutdown
-
-#define xen_sched_remote_shutdown sched_remote_shutdown
-CHECK_sched_remote_shutdown;
-#undef xen_sched_remote_shutdown
-
-static int compat_poll(struct compat_sched_poll *compat)
-{
-    struct sched_poll native;
-
-#define XLAT_sched_poll_HNDL_ports(_d_, _s_) \
-    guest_from_compat_handle((_d_)->ports, (_s_)->ports)
-    XLAT_sched_poll(&native, compat);
-#undef XLAT_sched_poll_HNDL_ports
-
-    return do_poll(&native);
-}
-
-#define do_poll compat_poll
-#define sched_poll compat_sched_poll
-
-#include "../schedule.c"
-
-int compat_set_timer_op(u32 lo, s32 hi)
-{
-    return do_set_timer_op(((s64)hi << 32) | lo);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
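[Note, not part of the commit: the compat wrapper deleted here, and re-added as xen/common/sched/compat.c per the diffstat above, follows the usual translate-then-forward idiom: convert the 32-bit guest layout into the native structure, call the native handler, and reuse the native source by re-including it with the entry points renamed. A minimal standalone sketch of that idiom follows; the types and names (native_poll, compat_poll32, do_poll_native) are hypothetical, and the 64-bit widening simply mirrors compat_set_timer_op above.]

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for a native hypercall argument and its 32-bit
 * compat layout; illustrative only. */
struct native_poll   { uint64_t timeout; uint32_t nr_ports; };
struct compat_poll32 { uint32_t timeout_lo; uint32_t timeout_hi; uint32_t nr_ports; };

static int do_poll_native(const struct native_poll *p)
{
    printf("timeout=%llu nr_ports=%u\n",
           (unsigned long long)p->timeout, p->nr_ports);
    return 0;
}

/* Compat entry point: translate the 32-bit layout, then forward. */
static int compat_poll32_handler(const struct compat_poll32 *cmp)
{
    struct native_poll native = {
        /* widen the split 32-bit timeout into the native 64-bit field */
        .timeout  = ((uint64_t)cmp->timeout_hi << 32) | cmp->timeout_lo,
        .nr_ports = cmp->nr_ports,
    };

    return do_poll_native(&native);
}

int main(void)
{
    struct compat_poll32 cmp = { .timeout_lo = 1000, .timeout_hi = 0, .nr_ports = 2 };

    return compat_poll32_handler(&cmp);
}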
diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c
deleted file mode 100644
index d66b541a94..0000000000
--- a/xen/common/cpupool.c
+++ /dev/null
@@ -1,979 +0,0 @@
-/******************************************************************************
- * cpupool.c
- * 
- * Generic cpupool-handling functions.
- *
- * Cpupools are a feature to have configurable scheduling domains. Each
- * cpupool runs an own scheduler on a dedicated set of physical cpus.
- * A domain is bound to one cpupool at any time, but it can be moved to
- * another cpupool.
- *
- * (C) 2009, Juergen Gross, Fujitsu Technology Solutions
- */
-
-#include <xen/lib.h>
-#include <xen/init.h>
-#include <xen/cpumask.h>
-#include <xen/percpu.h>
-#include <xen/sched.h>
-#include <xen/sched-if.h>
-#include <xen/warning.h>
-#include <xen/keyhandler.h>
-#include <xen/cpu.h>
-
-#define for_each_cpupool(ptr)    \
-    for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next))
-
-struct cpupool *cpupool0;                /* Initial cpupool with Dom0 */
-cpumask_t cpupool_free_cpus;             /* cpus not in any cpupool */
-
-static struct cpupool *cpupool_list;     /* linked list, sorted by poolid */
-
-static int cpupool_moving_cpu = -1;
-static struct cpupool *cpupool_cpu_moving = NULL;
-static cpumask_t cpupool_locked_cpus;
-
-static DEFINE_SPINLOCK(cpupool_lock);
-
-static enum sched_gran __read_mostly opt_sched_granularity = SCHED_GRAN_cpu;
-static unsigned int __read_mostly sched_granularity = 1;
-
-#ifdef CONFIG_HAS_SCHED_GRANULARITY
-static int __init sched_select_granularity(const char *str)
-{
-    if ( strcmp("cpu", str) == 0 )
-        opt_sched_granularity = SCHED_GRAN_cpu;
-    else if ( strcmp("core", str) == 0 )
-        opt_sched_granularity = SCHED_GRAN_core;
-    else if ( strcmp("socket", str) == 0 )
-        opt_sched_granularity = SCHED_GRAN_socket;
-    else
-        return -EINVAL;
-
-    return 0;
-}
-custom_param("sched-gran", sched_select_granularity);
-#endif
-
-static unsigned int __init cpupool_check_granularity(void)
-{
-    unsigned int cpu;
-    unsigned int siblings, gran = 0;
-
-    if ( opt_sched_granularity == SCHED_GRAN_cpu )
-        return 1;
-
-    for_each_online_cpu ( cpu )
-    {
-        siblings = cpumask_weight(sched_get_opt_cpumask(opt_sched_granularity,
-                                                        cpu));
-        if ( gran == 0 )
-            gran = siblings;
-        else if ( gran != siblings )
-            return 0;
-    }
-
-    sched_disable_smt_switching = true;
-
-    return gran;
-}
-
-/* Setup data for selected scheduler granularity. */
-static void __init cpupool_gran_init(void)
-{
-    unsigned int gran = 0;
-    const char *fallback = NULL;
-
-    while ( gran == 0 )
-    {
-        gran = cpupool_check_granularity();
-
-        if ( gran == 0 )
-        {
-            switch ( opt_sched_granularity )
-            {
-            case SCHED_GRAN_core:
-                opt_sched_granularity = SCHED_GRAN_cpu;
-                fallback = "Asymmetric cpu configuration.\n"
-                           "Falling back to sched-gran=cpu.\n";
-                break;
-            case SCHED_GRAN_socket:
-                opt_sched_granularity = SCHED_GRAN_core;
-                fallback = "Asymmetric cpu configuration.\n"
-                           "Falling back to sched-gran=core.\n";
-                break;
-            default:
-                ASSERT_UNREACHABLE();
-                break;
-            }
-        }
-    }
-
-    if ( fallback )
-        warning_add(fallback);
-
-    sched_granularity = gran;
-}
-
-unsigned int cpupool_get_granularity(const struct cpupool *c)
-{
-    return c ? sched_granularity : 1;
-}
-
-static void free_cpupool_struct(struct cpupool *c)
-{
-    if ( c )
-    {
-        free_cpumask_var(c->res_valid);
-        free_cpumask_var(c->cpu_valid);
-    }
-    xfree(c);
-}
-
-static struct cpupool *alloc_cpupool_struct(void)
-{
-    struct cpupool *c = xzalloc(struct cpupool);
-
-    if ( !c )
-        return NULL;
-
-    if ( !zalloc_cpumask_var(&c->cpu_valid) ||
-         !zalloc_cpumask_var(&c->res_valid) )
-    {
-        free_cpupool_struct(c);
-        c = NULL;
-    }
-
-    return c;
-}
-
-/*
- * find a cpupool by it's id. to be called with cpupool lock held
- * if exact is not specified, the first cpupool with an id larger or equal to
- * the searched id is returned
- * returns NULL if not found.
- */
-static struct cpupool *__cpupool_find_by_id(int id, int exact)
-{
-    struct cpupool **q;
-
-    ASSERT(spin_is_locked(&cpupool_lock));
-
-    for_each_cpupool(q)
-        if ( (*q)->cpupool_id >= id )
-            break;
-
-    return (!exact || (*q == NULL) || ((*q)->cpupool_id == id)) ? *q : NULL;
-}
-
-static struct cpupool *cpupool_find_by_id(int poolid)
-{
-    return __cpupool_find_by_id(poolid, 1);
-}
-
-static struct cpupool *__cpupool_get_by_id(int poolid, int exact)
-{
-    struct cpupool *c;
-    spin_lock(&cpupool_lock);
-    c = __cpupool_find_by_id(poolid, exact);
-    if ( c != NULL )
-        atomic_inc(&c->refcnt);
-    spin_unlock(&cpupool_lock);
-    return c;
-}
-
-struct cpupool *cpupool_get_by_id(int poolid)
-{
-    return __cpupool_get_by_id(poolid, 1);
-}
-
-static struct cpupool *cpupool_get_next_by_id(int poolid)
-{
-    return __cpupool_get_by_id(poolid, 0);
-}
-
-void cpupool_put(struct cpupool *pool)
-{
-    if ( !atomic_dec_and_test(&pool->refcnt) )
-        return;
-    scheduler_free(pool->sched);
-    free_cpupool_struct(pool);
-}
-
-/*
- * create a new cpupool with specified poolid and scheduler
- * returns pointer to new cpupool structure if okay, NULL else
- * possible failures:
- * - no memory
- * - poolid already used
- * - unknown scheduler
- */
-static struct cpupool *cpupool_create(
-    int poolid, unsigned int sched_id, int *perr)
-{
-    struct cpupool *c;
-    struct cpupool **q;
-    int last = 0;
-
-    *perr = -ENOMEM;
-    if ( (c = alloc_cpupool_struct()) == NULL )
-        return NULL;
-
-    /* One reference for caller, one reference for cpupool_destroy(). */
-    atomic_set(&c->refcnt, 2);
-
-    debugtrace_printk("cpupool_create(pool=%d,sched=%u)\n", poolid, sched_id);
-
-    spin_lock(&cpupool_lock);
-
-    for_each_cpupool(q)
-    {
-        last = (*q)->cpupool_id;
-        if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) )
-            break;
-    }
-    if ( *q != NULL )
-    {
-        if ( (*q)->cpupool_id == poolid )
-        {
-            *perr = -EEXIST;
-            goto err;
-        }
-        c->next = *q;
-    }
-
-    c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid;
-    if ( poolid == 0 )
-    {
-        c->sched = scheduler_get_default();
-    }
-    else
-    {
-        c->sched = scheduler_alloc(sched_id, perr);
-        if ( c->sched == NULL )
-            goto err;
-    }
-    c->gran = opt_sched_granularity;
-
-    *q = c;
-
-    spin_unlock(&cpupool_lock);
-
-    debugtrace_printk("Created cpupool %d with scheduler %s (%s)\n",
-                      c->cpupool_id, c->sched->name, c->sched->opt_name);
-
-    *perr = 0;
-    return c;
-
- err:
-    spin_unlock(&cpupool_lock);
-    free_cpupool_struct(c);
-    return NULL;
-}
-/*
- * destroys the given cpupool
- * returns 0 on success, 1 else
- * possible failures:
- * - pool still in use
- * - cpus still assigned to pool
- * - pool not in list
- */
-static int cpupool_destroy(struct cpupool *c)
-{
-    struct cpupool **q;
-
-    spin_lock(&cpupool_lock);
-    for_each_cpupool(q)
-        if ( *q == c )
-            break;
-    if ( *q != c )
-    {
-        spin_unlock(&cpupool_lock);
-        return -ENOENT;
-    }
-    if ( (c->n_dom != 0) || cpumask_weight(c->cpu_valid) )
-    {
-        spin_unlock(&cpupool_lock);
-        return -EBUSY;
-    }
-    *q = c->next;
-    spin_unlock(&cpupool_lock);
-
-    cpupool_put(c);
-
-    debugtrace_printk("cpupool_destroy(pool=%d)\n", c->cpupool_id);
-    return 0;
-}
-
-/*
- * Move domain to another cpupool
- */
-static int cpupool_move_domain_locked(struct domain *d, struct cpupool *c)
-{
-    int ret;
-
-    if ( unlikely(d->cpupool == c) )
-        return 0;
-
-    d->cpupool->n_dom--;
-    ret = sched_move_domain(d, c);
-    if ( ret )
-        d->cpupool->n_dom++;
-    else
-        c->n_dom++;
-
-    return ret;
-}
-int cpupool_move_domain(struct domain *d, struct cpupool *c)
-{
-    int ret;
-
-    spin_lock(&cpupool_lock);
-
-    ret = cpupool_move_domain_locked(d, c);
-
-    spin_unlock(&cpupool_lock);
-
-    return ret;
-}
-
-/*
- * assign a specific cpu to a cpupool
- * cpupool_lock must be held
- */
-static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
-{
-    int ret;
-    struct domain *d;
-    const cpumask_t *cpus;
-
-    cpus = sched_get_opt_cpumask(c->gran, cpu);
-
-    if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) )
-        return -EADDRNOTAVAIL;
-    ret = schedule_cpu_add(cpumask_first(cpus), c);
-    if ( ret )
-        return ret;
-
-    rcu_read_lock(&sched_res_rculock);
-
-    cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
-    if (cpupool_moving_cpu == cpu)
-    {
-        cpupool_moving_cpu = -1;
-        cpupool_put(cpupool_cpu_moving);
-        cpupool_cpu_moving = NULL;
-    }
-    cpumask_or(c->cpu_valid, c->cpu_valid, cpus);
-    cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask);
-
-    rcu_read_unlock(&sched_res_rculock);
-
-    rcu_read_lock(&domlist_read_lock);
-    for_each_domain_in_cpupool(d, c)
-    {
-        domain_update_node_affinity(d);
-    }
-    rcu_read_unlock(&domlist_read_lock);
-
-    return 0;
-}
-
-static int cpupool_unassign_cpu_finish(struct cpupool *c)
-{
-    int cpu = cpupool_moving_cpu;
-    const cpumask_t *cpus;
-    struct domain *d;
-    int ret;
-
-    if ( c != cpupool_cpu_moving )
-        return -EADDRNOTAVAIL;
-
-    /*
-     * We need this for scanning the domain list, both in
-     * cpu_disable_scheduler(), and at the bottom of this function.
-     */
-    rcu_read_lock(&domlist_read_lock);
-    ret = cpu_disable_scheduler(cpu);
-
-    rcu_read_lock(&sched_res_rculock);
-    cpus = get_sched_res(cpu)->cpus;
-    cpumask_or(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
-
-    /*
-     * cpu_disable_scheduler() returning an error doesn't require resetting
-     * cpupool_free_cpus' cpu bit. All error cases should be of temporary
-     * nature and tools will retry the operation. Even if the number of
-     * retries may be limited, the in-between state can easily be repaired
-     * by adding the cpu to the cpupool again.
-     */
-    if ( !ret )
-    {
-        ret = schedule_cpu_rm(cpu);
-        if ( ret )
-            cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
-        else
-        {
-            cpupool_moving_cpu = -1;
-            cpupool_put(cpupool_cpu_moving);
-            cpupool_cpu_moving = NULL;
-        }
-    }
-    rcu_read_unlock(&sched_res_rculock);
-
-    for_each_domain_in_cpupool(d, c)
-    {
-        domain_update_node_affinity(d);
-    }
-    rcu_read_unlock(&domlist_read_lock);
-
-    return ret;
-}
-
-static int cpupool_unassign_cpu_start(struct cpupool *c, unsigned int cpu)
-{
-    int ret;
-    struct domain *d;
-    const cpumask_t *cpus;
-
-    spin_lock(&cpupool_lock);
-    ret = -EADDRNOTAVAIL;
-    if ( ((cpupool_moving_cpu != -1) || !cpumask_test_cpu(cpu, c->cpu_valid))
-         && (cpu != cpupool_moving_cpu) )
-        goto out;
-
-    ret = 0;
-    rcu_read_lock(&sched_res_rculock);
-    cpus = get_sched_res(cpu)->cpus;
-
-    if ( (c->n_dom > 0) &&
-         (cpumask_weight(c->cpu_valid) == cpumask_weight(cpus)) &&
-         (cpu != cpupool_moving_cpu) )
-    {
-        rcu_read_lock(&domlist_read_lock);
-        for_each_domain_in_cpupool(d, c)
-        {
-            if ( !d->is_dying && system_state == SYS_STATE_active )
-            {
-                ret = -EBUSY;
-                break;
-            }
-            ret = cpupool_move_domain_locked(d, cpupool0);
-            if ( ret )
-                break;
-        }
-        rcu_read_unlock(&domlist_read_lock);
-        if ( ret )
-            goto out;
-    }
-    cpupool_moving_cpu = cpu;
-    atomic_inc(&c->refcnt);
-    cpupool_cpu_moving = c;
-    cpumask_andnot(c->cpu_valid, c->cpu_valid, cpus);
-    cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask);
-
-    rcu_read_unlock(&domlist_read_lock);
-out:
-    spin_unlock(&cpupool_lock);
-
-    return ret;
-}
-
-static long cpupool_unassign_cpu_helper(void *info)
-{
-    struct cpupool *c = info;
-    long ret;
-
-    debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
-                      cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu);
-    spin_lock(&cpupool_lock);
-
-    ret = cpupool_unassign_cpu_finish(c);
-
-    spin_unlock(&cpupool_lock);
-    debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret);
-
-    return ret;
-}
-
-/*
- * unassign a specific cpu from a cpupool
- * we must be sure not to run on the cpu to be unassigned! to achieve this
- * the main functionality is performed via continue_hypercall_on_cpu on a
- * specific cpu.
- * if the cpu to be removed is the last one of the cpupool no active domain
- * must be bound to the cpupool. dying domains are moved to cpupool0 as they
- * might be zombies.
- * possible failures:
- * - last cpu and still active domains in cpupool
- * - cpu just being unplugged
- */
-static int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu)
-{
-    int work_cpu;
-    int ret;
-    unsigned int master_cpu;
-
-    debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
-                      c->cpupool_id, cpu);
-
-    master_cpu = sched_get_resource_cpu(cpu);
-    ret = cpupool_unassign_cpu_start(c, master_cpu);
-    if ( ret )
-    {
-        debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n",
-                          c->cpupool_id, cpu, ret);
-        return ret;
-    }
-
-    work_cpu = sched_get_resource_cpu(smp_processor_id());
-    if ( work_cpu == master_cpu )
-    {
-        work_cpu = cpumask_first(cpupool0->cpu_valid);
-        if ( work_cpu == master_cpu )
-            work_cpu = cpumask_last(cpupool0->cpu_valid);
-    }
-    return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c);
-}
-
-/*
- * add a new domain to a cpupool
- * possible failures:
- * - pool does not exist
- * - no cpu assigned to pool
- */
-int cpupool_add_domain(struct domain *d, int poolid)
-{
-    struct cpupool *c;
-    int rc;
-    int n_dom = 0;
-
-    if ( poolid == CPUPOOLID_NONE )
-        return 0;
-    spin_lock(&cpupool_lock);
-    c = cpupool_find_by_id(poolid);
-    if ( c == NULL )
-        rc = -ESRCH;
-    else if ( !cpumask_weight(c->cpu_valid) )
-        rc = -ENODEV;
-    else
-    {
-        c->n_dom++;
-        n_dom = c->n_dom;
-        d->cpupool = c;
-        rc = 0;
-    }
-    spin_unlock(&cpupool_lock);
-    debugtrace_printk("cpupool_add_domain(dom=%d,pool=%d) n_dom %d rc %d\n",
-                      d->domain_id, poolid, n_dom, rc);
-    return rc;
-}
-
-/*
- * remove a domain from a cpupool
- */
-void cpupool_rm_domain(struct domain *d)
-{
-    int cpupool_id;
-    int n_dom;
-
-    if ( d->cpupool == NULL )
-        return;
-    spin_lock(&cpupool_lock);
-    cpupool_id = d->cpupool->cpupool_id;
-    d->cpupool->n_dom--;
-    n_dom = d->cpupool->n_dom;
-    d->cpupool = NULL;
-    spin_unlock(&cpupool_lock);
-    debugtrace_printk("cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n",
-                      d->domain_id, cpupool_id, n_dom);
-    return;
-}
-
-/*
- * Called to add a cpu to a pool. CPUs being hot-plugged are added to pool0,
- * as they must have been in there when unplugged.
- */
-static int cpupool_cpu_add(unsigned int cpu)
-{
-    int ret = 0;
-    const cpumask_t *cpus;
-
-    spin_lock(&cpupool_lock);
-    cpumask_clear_cpu(cpu, &cpupool_locked_cpus);
-    cpumask_set_cpu(cpu, &cpupool_free_cpus);
-
-    /*
-     * If we are not resuming, we are hot-plugging cpu, and in which case
-     * we add it to pool0, as it certainly was there when hot-unplagged
-     * (or unplugging would have failed) and that is the default behavior
-     * anyway.
-     */
-    rcu_read_lock(&sched_res_rculock);
-    get_sched_res(cpu)->cpupool = NULL;
-
-    cpus = sched_get_opt_cpumask(cpupool0->gran, cpu);
-    if ( cpumask_subset(cpus, &cpupool_free_cpus) )
-        ret = cpupool_assign_cpu_locked(cpupool0, cpu);
-
-    rcu_read_unlock(&sched_res_rculock);
-
-    spin_unlock(&cpupool_lock);
-
-    return ret;
-}
-
-/*
- * This function is called in stop_machine context, so we can be sure no
- * non-idle vcpu is active on the system.
- */
-static void cpupool_cpu_remove(unsigned int cpu)
-{
-    int ret;
-
-    ASSERT(is_idle_vcpu(current));
-
-    if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) )
-    {
-        ret = cpupool_unassign_cpu_finish(cpupool0);
-        BUG_ON(ret);
-    }
-    cpumask_clear_cpu(cpu, &cpupool_free_cpus);
-}
-
-/*
- * Called before a CPU is being removed from the system.
- * Removing a CPU is allowed for free CPUs or CPUs in Pool-0 (those are moved
- * to free cpus actually before removing them).
- * The CPU is locked, to forbid adding it again to another cpupool.
- */
-static int cpupool_cpu_remove_prologue(unsigned int cpu)
-{
-    int ret = 0;
-    cpumask_t *cpus;
-    unsigned int master_cpu;
-
-    spin_lock(&cpupool_lock);
-
-    rcu_read_lock(&sched_res_rculock);
-    cpus = get_sched_res(cpu)->cpus;
-    master_cpu = sched_get_resource_cpu(cpu);
-    if ( cpumask_intersects(cpus, &cpupool_locked_cpus) )
-        ret = -EBUSY;
-    else
-        cpumask_set_cpu(cpu, &cpupool_locked_cpus);
-    rcu_read_unlock(&sched_res_rculock);
-
-    spin_unlock(&cpupool_lock);
-
-    if ( ret )
-        return  ret;
-
-    if ( cpumask_test_cpu(master_cpu, cpupool0->cpu_valid) )
-    {
-        /* Cpupool0 is populated only after all cpus are up. */
-        ASSERT(system_state == SYS_STATE_active);
-
-        ret = cpupool_unassign_cpu_start(cpupool0, master_cpu);
-    }
-    else if ( !cpumask_test_cpu(master_cpu, &cpupool_free_cpus) )
-        ret = -ENODEV;
-
-    return ret;
-}
-
-/*
- * Called during resume for all cpus which didn't come up again. The cpu must
- * be removed from the cpupool it is assigned to. In case a cpupool will be
- * left without cpu we move all domains of that cpupool to cpupool0.
- * As we are called with all domains still frozen there is no need to take the
- * cpupool lock here.
- */
-static void cpupool_cpu_remove_forced(unsigned int cpu)
-{
-    struct cpupool **c;
-    int ret;
-    unsigned int master_cpu = sched_get_resource_cpu(cpu);
-
-    for_each_cpupool ( c )
-    {
-        if ( cpumask_test_cpu(master_cpu, (*c)->cpu_valid) )
-        {
-            ret = cpupool_unassign_cpu_start(*c, master_cpu);
-            BUG_ON(ret);
-            ret = cpupool_unassign_cpu_finish(*c);
-            BUG_ON(ret);
-        }
-    }
-
-    cpumask_clear_cpu(cpu, &cpupool_free_cpus);
-
-    rcu_read_lock(&sched_res_rculock);
-    sched_rm_cpu(cpu);
-    rcu_read_unlock(&sched_res_rculock);
-}
-
-/*
- * do cpupool related sysctl operations
- */
-int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op)
-{
-    int ret;
-    struct cpupool *c;
-
-    switch ( op->op )
-    {
-
-    case XEN_SYSCTL_CPUPOOL_OP_CREATE:
-    {
-        int poolid;
-
-        poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ?
-            CPUPOOLID_NONE: op->cpupool_id;
-        c = cpupool_create(poolid, op->sched_id, &ret);
-        if ( c != NULL )
-        {
-            op->cpupool_id = c->cpupool_id;
-            cpupool_put(c);
-        }
-    }
-    break;
-
-    case XEN_SYSCTL_CPUPOOL_OP_DESTROY:
-    {
-        c = cpupool_get_by_id(op->cpupool_id);
-        ret = -ENOENT;
-        if ( c == NULL )
-            break;
-        ret = cpupool_destroy(c);
-        cpupool_put(c);
-    }
-    break;
-
-    case XEN_SYSCTL_CPUPOOL_OP_INFO:
-    {
-        c = cpupool_get_next_by_id(op->cpupool_id);
-        ret = -ENOENT;
-        if ( c == NULL )
-            break;
-        op->cpupool_id = c->cpupool_id;
-        op->sched_id = c->sched->sched_id;
-        op->n_dom = c->n_dom;
-        ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid);
-        cpupool_put(c);
-    }
-    break;
-
-    case XEN_SYSCTL_CPUPOOL_OP_ADDCPU:
-    {
-        unsigned cpu;
-        const cpumask_t *cpus;
-
-        cpu = op->cpu;
-        debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d)\n",
-                          op->cpupool_id, cpu);
-
-        spin_lock(&cpupool_lock);
-
-        c = cpupool_find_by_id(op->cpupool_id);
-        ret = -ENOENT;
-        if ( c == NULL )
-            goto addcpu_out;
-        if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY )
-        {
-            for_each_cpu ( cpu, &cpupool_free_cpus )
-            {
-                cpus = sched_get_opt_cpumask(c->gran, cpu);
-                if ( cpumask_subset(cpus, &cpupool_free_cpus) )
-                    break;
-            }
-            ret = -ENODEV;
-            if ( cpu >= nr_cpu_ids )
-                goto addcpu_out;
-        }
-        ret = -EINVAL;
-        if ( cpu >= nr_cpu_ids )
-            goto addcpu_out;
-        ret = -ENODEV;
-        cpus = sched_get_opt_cpumask(c->gran, cpu);
-        if ( !cpumask_subset(cpus, &cpupool_free_cpus) ||
-             cpumask_intersects(cpus, &cpupool_locked_cpus) )
-            goto addcpu_out;
-        ret = cpupool_assign_cpu_locked(c, cpu);
-
-    addcpu_out:
-        spin_unlock(&cpupool_lock);
-        debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n",
-                          op->cpupool_id, cpu, ret);
-
-    }
-    break;
-
-    case XEN_SYSCTL_CPUPOOL_OP_RMCPU:
-    {
-        unsigned cpu;
-
-        c = cpupool_get_by_id(op->cpupool_id);
-        ret = -ENOENT;
-        if ( c == NULL )
-            break;
-        cpu = op->cpu;
-        if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY )
-            cpu = cpumask_last(c->cpu_valid);
-        ret = (cpu < nr_cpu_ids) ? cpupool_unassign_cpu(c, cpu) : -EINVAL;
-        cpupool_put(c);
-    }
-    break;
-
-    case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN:
-    {
-        struct domain *d;
-
-        ret = rcu_lock_remote_domain_by_id(op->domid, &d);
-        if ( ret )
-            break;
-        if ( d->cpupool == NULL )
-        {
-            ret = -EINVAL;
-            rcu_unlock_domain(d);
-            break;
-        }
-        if ( op->cpupool_id == d->cpupool->cpupool_id )
-        {
-            ret = 0;
-            rcu_unlock_domain(d);
-            break;
-        }
-        debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d\n",
-                          d->domain_id, op->cpupool_id);
-        ret = -ENOENT;
-        spin_lock(&cpupool_lock);
-
-        c = cpupool_find_by_id(op->cpupool_id);
-        if ( (c != NULL) && cpumask_weight(c->cpu_valid) )
-            ret = cpupool_move_domain_locked(d, c);
-
-        spin_unlock(&cpupool_lock);
-        debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d ret %d\n",
-                          d->domain_id, op->cpupool_id, ret);
-        rcu_unlock_domain(d);
-    }
-    break;
-
-    case XEN_SYSCTL_CPUPOOL_OP_FREEINFO:
-    {
-        ret = cpumask_to_xenctl_bitmap(
-            &op->cpumap, &cpupool_free_cpus);
-    }
-    break;
-
-    default:
-        ret = -ENOSYS;
-        break;
-    }
-
-    return ret;
-}
-
-void dump_runq(unsigned char key)
-{
-    unsigned long    flags;
-    s_time_t         now = NOW();
-    struct cpupool **c;
-
-    spin_lock(&cpupool_lock);
-    local_irq_save(flags);
-
-    printk("sched_smt_power_savings: %s\n",
-            sched_smt_power_savings? "enabled":"disabled");
-    printk("NOW=%"PRI_stime"\n", now);
-
-    printk("Online Cpus: %*pbl\n", CPUMASK_PR(&cpu_online_map));
-    if ( !cpumask_empty(&cpupool_free_cpus) )
-    {
-        printk("Free Cpus: %*pbl\n", CPUMASK_PR(&cpupool_free_cpus));
-        schedule_dump(NULL);
-    }
-
-    for_each_cpupool(c)
-    {
-        printk("Cpupool %d:\n", (*c)->cpupool_id);
-        printk("Cpus: %*pbl\n", CPUMASK_PR((*c)->cpu_valid));
-        schedule_dump(*c);
-    }
-
-    local_irq_restore(flags);
-    spin_unlock(&cpupool_lock);
-}
-
-static int cpu_callback(
-    struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-    unsigned int cpu = (unsigned long)hcpu;
-    int rc = 0;
-
-    switch ( action )
-    {
-    case CPU_DOWN_FAILED:
-    case CPU_ONLINE:
-        if ( system_state <= SYS_STATE_active )
-            rc = cpupool_cpu_add(cpu);
-        break;
-    case CPU_DOWN_PREPARE:
-        /* Suspend/Resume don't change assignments of cpus to cpupools. */
-        if ( system_state <= SYS_STATE_active )
-            rc = cpupool_cpu_remove_prologue(cpu);
-        break;
-    case CPU_DYING:
-        /* Suspend/Resume don't change assignments of cpus to cpupools. */
-        if ( system_state <= SYS_STATE_active )
-            cpupool_cpu_remove(cpu);
-        break;
-    case CPU_RESUME_FAILED:
-        cpupool_cpu_remove_forced(cpu);
-        break;
-    default:
-        break;
-    }
-
-    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
-}
-
-static struct notifier_block cpu_nfb = {
-    .notifier_call = cpu_callback
-};
-
-static int __init cpupool_init(void)
-{
-    unsigned int cpu;
-    int err;
-
-    cpupool_gran_init();
-
-    cpupool0 = cpupool_create(0, 0, &err);
-    BUG_ON(cpupool0 == NULL);
-    cpupool_put(cpupool0);
-    register_cpu_notifier(&cpu_nfb);
-
-    spin_lock(&cpupool_lock);
-
-    cpumask_copy(&cpupool_free_cpus, &cpu_online_map);
-
-    for_each_cpu ( cpu, &cpupool_free_cpus )
-        cpupool_assign_cpu_locked(cpupool0, cpu);
-
-    spin_unlock(&cpupool_lock);
-
-    return 0;
-}
-__initcall(cpupool_init);
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
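[Note, not part of the commit: the cpupool code removed above is re-added verbatim as xen/common/sched/cpupool.c (see the diffstat). It is driven through the XEN_SYSCTL_CPUPOOL_OP_* cases of cpupool_do_sysctl(). A rough, in-hypervisor sketch of the calling sequence that handler expects is shown below; in practice the toolstack reaches it through the sysctl hypercall (e.g. xl cpupool-create), the usual xen/ includes are omitted, and the helper name is invented for illustration.]

/*
 * Illustrative sketch only: create a pool (letting Xen pick the id), give it
 * one free cpu, then move a domain into it, all via cpupool_do_sysctl()
 * as defined above.
 */
static int example_populate_pool(unsigned int sched_id, unsigned int cpu,
                                 domid_t domid)
{
    struct xen_sysctl_cpupool_op op = {};
    int ret;

    op.op = XEN_SYSCTL_CPUPOOL_OP_CREATE;
    op.cpupool_id = XEN_SYSCTL_CPUPOOL_PAR_ANY;  /* let Xen choose the pool id */
    op.sched_id = sched_id;                      /* e.g. XEN_SCHEDULER_ARINC653 */
    ret = cpupool_do_sysctl(&op);
    if ( ret )
        return ret;

    /* op.cpupool_id now holds the new pool's id; assign a free cpu to it. */
    op.op = XEN_SYSCTL_CPUPOOL_OP_ADDCPU;
    op.cpu = cpu;
    ret = cpupool_do_sysctl(&op);
    if ( ret )
        return ret;

    /* Finally, move an existing domain into the new pool. */
    op.op = XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN;
    op.domid = domid;
    return cpupool_do_sysctl(&op);
}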
diff --git a/xen/common/sched/Kconfig b/xen/common/sched/Kconfig
new file mode 100644
index 0000000000..883ac87cab
--- /dev/null
+++ b/xen/common/sched/Kconfig
@@ -0,0 +1,65 @@
+menu "Schedulers"
+       visible if EXPERT = "y"
+
+config SCHED_CREDIT
+       bool "Credit scheduler support"
+       default y
+       ---help---
+         The traditional credit scheduler is a general purpose scheduler.
+
+config SCHED_CREDIT2
+       bool "Credit2 scheduler support"
+       default y
+       ---help---
+         The credit2 scheduler is a general purpose scheduler that is
+         optimized for lower latency and higher VM density.
+
+config SCHED_RTDS
+       bool "RTDS scheduler support (EXPERIMENTAL)"
+       default y
+       ---help---
+         The RTDS scheduler is a soft and firm real-time scheduler for
+         multicore, targeted for embedded, automotive, graphics and gaming
+         in the cloud, and general low-latency workloads.
+
+config SCHED_ARINC653
+       bool "ARINC653 scheduler support (EXPERIMENTAL)"
+       default DEBUG
+       ---help---
+         The ARINC653 scheduler is a hard real-time scheduler for single
+         cores, targeted for avionics, drones, and medical devices.
+
+config SCHED_NULL
+       bool "Null scheduler support (EXPERIMENTAL)"
+       default y
+       ---help---
+         The null scheduler is a static, zero overhead scheduler,
+         for when there always are less vCPUs than pCPUs, typically
+         in embedded or HPC scenarios.
+
+choice
+       prompt "Default Scheduler?"
+       default SCHED_CREDIT2_DEFAULT
+
+       config SCHED_CREDIT_DEFAULT
+               bool "Credit Scheduler" if SCHED_CREDIT
+       config SCHED_CREDIT2_DEFAULT
+               bool "Credit2 Scheduler" if SCHED_CREDIT2
+       config SCHED_RTDS_DEFAULT
+               bool "RT Scheduler" if SCHED_RTDS
+       config SCHED_ARINC653_DEFAULT
+               bool "ARINC653 Scheduler" if SCHED_ARINC653
+       config SCHED_NULL_DEFAULT
+               bool "Null Scheduler" if SCHED_NULL
+endchoice
+
+config SCHED_DEFAULT
+       string
+       default "credit" if SCHED_CREDIT_DEFAULT
+       default "credit2" if SCHED_CREDIT2_DEFAULT
+       default "rtds" if SCHED_RTDS_DEFAULT
+       default "arinc653" if SCHED_ARINC653_DEFAULT
+       default "null" if SCHED_NULL_DEFAULT
+       default "credit2"
+
+endmenu
diff --git a/xen/common/sched/Makefile b/xen/common/sched/Makefile
new file mode 100644
index 0000000000..3537f2a68d
--- /dev/null
+++ b/xen/common/sched/Makefile
@@ -0,0 +1,7 @@
+obj-y += cpupool.o
+obj-$(CONFIG_SCHED_ARINC653) += arinc653.o
+obj-$(CONFIG_SCHED_CREDIT) += credit.o
+obj-$(CONFIG_SCHED_CREDIT2) += credit2.o
+obj-$(CONFIG_SCHED_RTDS) += rt.o
+obj-$(CONFIG_SCHED_NULL) += null.o
+obj-y += core.o
diff --git a/xen/common/sched/arinc653.c b/xen/common/sched/arinc653.c
new file mode 100644
index 0000000000..565575c326
--- /dev/null
+++ b/xen/common/sched/arinc653.c
@@ -0,0 +1,739 @@
+/******************************************************************************
+ * sched_arinc653.c
+ *
+ * An ARINC653-compatible scheduling algorithm for use in Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2010, DornerWorks, Ltd. <DornerWorks.com>
+ */
+
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/timer.h>
+#include <xen/softirq.h>
+#include <xen/time.h>
+#include <xen/errno.h>
+#include <xen/list.h>
+#include <xen/guest_access.h>
+#include <public/sysctl.h>
+
+/**************************************************************************
+ * Private Macros                                                         *
+ **************************************************************************/
+
+/**
+ * Default timeslice for domain 0.
+ */
+#define DEFAULT_TIMESLICE MILLISECS(10)
+
+/**
+ * Retrieve the idle UNIT for a given physical CPU
+ */
+#define IDLETASK(cpu)  (sched_idle_unit(cpu))
+
+/**
+ * Return a pointer to the ARINC 653-specific scheduler data information
+ * associated with the given UNIT (unit)
+ */
+#define AUNIT(unit) ((arinc653_unit_t *)(unit)->priv)
+
+/**
+ * Return the global scheduler private data given the scheduler ops pointer
+ */
+#define SCHED_PRIV(s) ((a653sched_priv_t *)((s)->sched_data))
+
+/**************************************************************************
+ * Private Type Definitions                                               *
+ **************************************************************************/
+
+/**
+ * The arinc653_unit_t structure holds ARINC 653-scheduler-specific
+ * information for all non-idle UNITs
+ */
+typedef struct arinc653_unit_s
+{
+    /* unit points to Xen's struct sched_unit so we can get to it from an
+     * arinc653_unit_t pointer. */
+    struct sched_unit * unit;
+    /* awake holds whether the UNIT has been woken with vcpu_wake() */
+    bool_t              awake;
+    /* list holds the linked list information for the list this UNIT
+     * is stored in */
+    struct list_head    list;
+} arinc653_unit_t;
+
+/**
+ * The sched_entry_t structure holds a single entry of the
+ * ARINC 653 schedule.
+ */
+typedef struct sched_entry_s
+{
+    /* dom_handle holds the handle ("UUID") for the domain that this
+     * schedule entry refers to. */
+    xen_domain_handle_t dom_handle;
+    /* unit_id holds the UNIT number for the UNIT that this schedule
+     * entry refers to. */
+    int                 unit_id;
+    /* runtime holds the number of nanoseconds that the UNIT for this
+     * schedule entry should be allowed to run per major frame. */
+    s_time_t            runtime;
+    /* unit holds a pointer to the Xen sched_unit structure */
+    struct sched_unit * unit;
+} sched_entry_t;
+
+/**
+ * This structure defines data that is global to an instance of the scheduler
+ */
+typedef struct a653sched_priv_s
+{
+    /* lock for the whole pluggable scheduler, nests inside cpupool_lock */
+    spinlock_t lock;
+
+    /**
+     * This array holds the active ARINC 653 schedule.
+     *
+     * When the system tries to start a new UNIT, this schedule is scanned
+     * to look for a matching (handle, UNIT #) pair. If both the handle (UUID)
+     * and UNIT number match, then the UNIT is allowed to run. Its run time
+     * (per major frame) is given in the third entry of the schedule.
+     */
+    sched_entry_t schedule[ARINC653_MAX_DOMAINS_PER_SCHEDULE];
+
+    /**
+     * This variable holds the number of entries that are valid in
+     * the arinc653_schedule table.
+     *
+     * This is not necessarily the same as the number of domains in the
+     * schedule. A domain could be listed multiple times within the schedule,
+     * or a domain with multiple UNITs could have a different
+     * schedule entry for each UNIT.
+     */
+    unsigned int num_schedule_entries;
+
+    /**
+     * the major frame time for the ARINC 653 schedule.
+     */
+    s_time_t major_frame;
+
+    /**
+     * the time that the next major frame starts
+     */
+    s_time_t next_major_frame;
+
+    /**
+     * pointers to all Xen UNIT structures for iterating through
+     */
+    struct list_head unit_list;
+} a653sched_priv_t;
+
+/**************************************************************************
+ * Helper functions                                                       *
+ **************************************************************************/
+
+/**
+ * This function compares two domain handles.
+ *
+ * @param h1        Pointer to handle 1
+ * @param h2        Pointer to handle 2
+ *
+ * @return          <ul>
+ *                  <li> <0:  handle 1 is less than handle 2
+ *                  <li>  0:  handle 1 is equal to handle 2
+ *                  <li> >0:  handle 1 is greater than handle 2
+ *                  </ul>
+ */
+static int dom_handle_cmp(const xen_domain_handle_t h1,
+                          const xen_domain_handle_t h2)
+{
+    return memcmp(h1, h2, sizeof(xen_domain_handle_t));
+}
+
+/**
+ * This function searches the unit list to find a UNIT that matches
+ * the domain handle and UNIT ID specified.
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ * @param handle    Pointer to handler
+ * @param unit_id   UNIT ID
+ *
+ * @return          <ul>
+ *                  <li> Pointer to the matching UNIT if one is found
+ *                  <li> NULL otherwise
+ *                  </ul>
+ */
+static struct sched_unit *find_unit(
+    const struct scheduler *ops,
+    xen_domain_handle_t handle,
+    int unit_id)
+{
+    arinc653_unit_t *aunit;
+
+    /* loop through the unit_list looking for the specified UNIT */
+    list_for_each_entry ( aunit, &SCHED_PRIV(ops)->unit_list, list )
+        if ( (dom_handle_cmp(aunit->unit->domain->handle, handle) == 0)
+             && (unit_id == aunit->unit->unit_id) )
+            return aunit->unit;
+
+    return NULL;
+}
+
+/**
+ * This function updates the pointer to the Xen UNIT structure for each entry
+ * in the ARINC 653 schedule.
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ * @return          <None>
+ */
+static void update_schedule_units(const struct scheduler *ops)
+{
+    unsigned int i, n_entries = SCHED_PRIV(ops)->num_schedule_entries;
+
+    for ( i = 0; i < n_entries; i++ )
+        SCHED_PRIV(ops)->schedule[i].unit =
+            find_unit(ops,
+                      SCHED_PRIV(ops)->schedule[i].dom_handle,
+                      SCHED_PRIV(ops)->schedule[i].unit_id);
+}
+
+/**
+ * This function is called by the adjust_global scheduler hook to put
+ * in place a new ARINC653 schedule.
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ *
+ * @return          <ul>
+ *                  <li> 0 = success
+ *                  <li> !0 = error
+ *                  </ul>
+ */
+static int
+arinc653_sched_set(
+    const struct scheduler *ops,
+    struct xen_sysctl_arinc653_schedule *schedule)
+{
+    a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+    s_time_t total_runtime = 0;
+    unsigned int i;
+    unsigned long flags;
+    int rc = -EINVAL;
+
+    spin_lock_irqsave(&sched_priv->lock, flags);
+
+    /* Check for valid major frame and number of schedule entries. */
+    if ( (schedule->major_frame <= 0)
+         || (schedule->num_sched_entries < 1)
+         || (schedule->num_sched_entries > ARINC653_MAX_DOMAINS_PER_SCHEDULE) )
+        goto fail;
+
+    for ( i = 0; i < schedule->num_sched_entries; i++ )
+    {
+        /* Check for a valid run time. */
+        if ( schedule->sched_entries[i].runtime <= 0 )
+            goto fail;
+
+        /* Add this entry's run time to total run time. */
+        total_runtime += schedule->sched_entries[i].runtime;
+    }
+
+    /*
+     * Error if the major frame is not large enough to run all entries as
+     * indicated by comparing the total run time to the major frame length.
+     */
+    if ( total_runtime > schedule->major_frame )
+        goto fail;
+
+    /* Copy the new schedule into place. */
+    sched_priv->num_schedule_entries = schedule->num_sched_entries;
+    sched_priv->major_frame = schedule->major_frame;
+    for ( i = 0; i < schedule->num_sched_entries; i++ )
+    {
+        memcpy(sched_priv->schedule[i].dom_handle,
+               schedule->sched_entries[i].dom_handle,
+               sizeof(sched_priv->schedule[i].dom_handle));
+        sched_priv->schedule[i].unit_id =
+            schedule->sched_entries[i].vcpu_id;
+        sched_priv->schedule[i].runtime =
+            schedule->sched_entries[i].runtime;
+    }
+    update_schedule_units(ops);
+
+    /*
+     * The newly-installed schedule takes effect immediately. We do not even
+     * wait for the current major frame to expire.
+     *
+     * Signal a new major frame to begin. The next major frame is set up by
+     * the do_schedule callback function when it is next invoked.
+     */
+    sched_priv->next_major_frame = NOW();
+
+    rc = 0;
+
+ fail:
+    spin_unlock_irqrestore(&sched_priv->lock, flags);
+    return rc;
+}
+
+/**
+ * This function is called by the adjust_global scheduler hook to read the
+ * current ARINC 653 schedule
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ * @return          <ul>
+ *                  <li> 0 = success
+ *                  <li> !0 = error
+ *                  </ul>
+ */
+static int
+arinc653_sched_get(
+    const struct scheduler *ops,
+    struct xen_sysctl_arinc653_schedule *schedule)
+{
+    a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+    unsigned int i;
+    unsigned long flags;
+
+    spin_lock_irqsave(&sched_priv->lock, flags);
+
+    schedule->num_sched_entries = sched_priv->num_schedule_entries;
+    schedule->major_frame = sched_priv->major_frame;
+    for ( i = 0; i < sched_priv->num_schedule_entries; i++ )
+    {
+        memcpy(schedule->sched_entries[i].dom_handle,
+               sched_priv->schedule[i].dom_handle,
+               sizeof(sched_priv->schedule[i].dom_handle));
+        schedule->sched_entries[i].vcpu_id = sched_priv->schedule[i].unit_id;
+        schedule->sched_entries[i].runtime = sched_priv->schedule[i].runtime;
+    }
+
+    spin_unlock_irqrestore(&sched_priv->lock, flags);
+
+    return 0;
+}
+
+/**************************************************************************
+ * Scheduler callback functions                                           *
+ **************************************************************************/
+
+/**
+ * This function performs initialization for an instance of the scheduler.
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ *
+ * @return          <ul>
+ *                  <li> 0 = success
+ *                  <li> !0 = error
+ *                  </ul>
+ */
+static int
+a653sched_init(struct scheduler *ops)
+{
+    a653sched_priv_t *prv;
+
+    prv = xzalloc(a653sched_priv_t);
+    if ( prv == NULL )
+        return -ENOMEM;
+
+    ops->sched_data = prv;
+
+    prv->next_major_frame = 0;
+    spin_lock_init(&prv->lock);
+    INIT_LIST_HEAD(&prv->unit_list);
+
+    return 0;
+}
+
+/**
+ * This function performs deinitialization for an instance of the scheduler
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ */
+static void
+a653sched_deinit(struct scheduler *ops)
+{
+    xfree(SCHED_PRIV(ops));
+    ops->sched_data = NULL;
+}
+
+/**
+ * This function allocates scheduler-specific data for a UNIT
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ * @param unit      Pointer to struct sched_unit
+ *
+ * @return          Pointer to the allocated data
+ */
+static void *
+a653sched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
+                      void *dd)
+{
+    a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+    arinc653_unit_t *svc;
+    unsigned int entry;
+    unsigned long flags;
+
+    /*
+     * Allocate memory for the ARINC 653-specific scheduler data information
+     * associated with the given UNIT (unit).
+     */
+    svc = xmalloc(arinc653_unit_t);
+    if ( svc == NULL )
+        return NULL;
+
+    spin_lock_irqsave(&sched_priv->lock, flags);
+
+    /*
+     * Add every one of dom0's units to the schedule, as long as there are
+     * slots available.
+     */
+    if ( unit->domain->domain_id == 0 )
+    {
+        entry = sched_priv->num_schedule_entries;
+
+        if ( entry < ARINC653_MAX_DOMAINS_PER_SCHEDULE )
+        {
+            sched_priv->schedule[entry].dom_handle[0] = '\0';
+            sched_priv->schedule[entry].unit_id = unit->unit_id;
+            sched_priv->schedule[entry].runtime = DEFAULT_TIMESLICE;
+            sched_priv->schedule[entry].unit = unit;
+
+            sched_priv->major_frame += DEFAULT_TIMESLICE;
+            ++sched_priv->num_schedule_entries;
+        }
+    }
+
+    /*
+     * Initialize our ARINC 653 scheduler-specific information for the UNIT.
+     * The UNIT starts "asleep." When Xen is ready for the UNIT to run, it
+     * will call the vcpu_wake scheduler callback function and our scheduler
+     * will mark the UNIT awake.
+     */
+    svc->unit = unit;
+    svc->awake = 0;
+    if ( !is_idle_unit(unit) )
+        list_add(&svc->list, &SCHED_PRIV(ops)->unit_list);
+    update_schedule_units(ops);
+
+    spin_unlock_irqrestore(&sched_priv->lock, flags);
+
+    return svc;
+}
+
+/**
+ * This function frees scheduler-specific UNIT data
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ */
+static void
+a653sched_free_udata(const struct scheduler *ops, void *priv)
+{
+    a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+    arinc653_unit_t *av = priv;
+    unsigned long flags;
+
+    if (av == NULL)
+        return;
+
+    spin_lock_irqsave(&sched_priv->lock, flags);
+
+    if ( !is_idle_unit(av->unit) )
+        list_del(&av->list);
+
+    xfree(av);
+    update_schedule_units(ops);
+
+    spin_unlock_irqrestore(&sched_priv->lock, flags);
+}
+
+/**
+ * Xen scheduler callback function to sleep a UNIT
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ * @param unit      Pointer to struct sched_unit
+ */
+static void
+a653sched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
+{
+    if ( AUNIT(unit) != NULL )
+        AUNIT(unit)->awake = 0;
+
+    /*
+     * If the UNIT being put to sleep is the same one that is currently
+     * running, raise a softirq to invoke the scheduler to switch domains.
+     */
+    if ( get_sched_res(sched_unit_master(unit))->curr == unit )
+        cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
+}
+
+/**
+ * Xen scheduler callback function to wake up a UNIT
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ * @param unit      Pointer to struct sched_unit
+ */
+static void
+a653sched_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
+{
+    if ( AUNIT(unit) != NULL )
+        AUNIT(unit)->awake = 1;
+
+    cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
+}
+
+/**
+ * Xen scheduler callback function to select a UNIT to run.
+ * This is the main scheduler routine.
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ * @param now       Current time
+ */
+static void
+a653sched_do_schedule(
+    const struct scheduler *ops,
+    struct sched_unit *prev,
+    s_time_t now,
+    bool tasklet_work_scheduled)
+{
+    struct sched_unit *new_task = NULL;
+    static unsigned int sched_index = 0;
+    static s_time_t next_switch_time;
+    a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+    const unsigned int cpu = sched_get_resource_cpu(smp_processor_id());
+    unsigned long flags;
+
+    spin_lock_irqsave(&sched_priv->lock, flags);
+
+    if ( sched_priv->num_schedule_entries < 1 )
+        sched_priv->next_major_frame = now + DEFAULT_TIMESLICE;
+    else if ( now >= sched_priv->next_major_frame )
+    {
+        /* time to enter a new major frame
+         * the first time this function is called, this will be true */
+        /* start with the first domain in the schedule */
+        sched_index = 0;
+        sched_priv->next_major_frame = now + sched_priv->major_frame;
+        next_switch_time = now + sched_priv->schedule[0].runtime;
+    }
+    else
+    {
+        while ( (now >= next_switch_time)
+                && (sched_index < sched_priv->num_schedule_entries) )
+        {
+            /* time to switch to the next domain in this major frame */
+            sched_index++;
+            next_switch_time += sched_priv->schedule[sched_index].runtime;
+        }
+    }
+
+    /*
+     * If we exhausted the domains in the schedule and still have time left
+     * in the major frame then switch next at the next major frame.
+     */
+    if ( sched_index >= sched_priv->num_schedule_entries )
+        next_switch_time = sched_priv->next_major_frame;
+
+    /*
+     * If there are more domains to run in the current major frame, set
+     * new_task equal to the address of next domain's sched_unit structure.
+     * Otherwise, set new_task equal to the address of the idle task's
+     * sched_unit structure.
+     */
+    new_task = (sched_index < sched_priv->num_schedule_entries)
+        ? sched_priv->schedule[sched_index].unit
+        : IDLETASK(cpu);
+
+    /* Check to see if the new task can be run (awake & runnable). */
+    if ( !((new_task != NULL)
+           && (AUNIT(new_task) != NULL)
+           && AUNIT(new_task)->awake
+           && unit_runnable_state(new_task)) )
+        new_task = IDLETASK(cpu);
+    BUG_ON(new_task == NULL);
+
+    /*
+     * Check to make sure we did not miss a major frame.
+     * This is a good test for robust partitioning.
+     */
+    BUG_ON(now >= sched_priv->next_major_frame);
+
+    spin_unlock_irqrestore(&sched_priv->lock, flags);
+
+    /* Tasklet work (which runs in idle UNIT context) overrides all else. */
+    if ( tasklet_work_scheduled )
+        new_task = IDLETASK(cpu);
+
+    /* Running this task would result in a migration */
+    if ( !is_idle_unit(new_task)
+         && (sched_unit_master(new_task) != cpu) )
+        new_task = IDLETASK(cpu);
+
+    /*
+     * Return the amount of time the next domain has to run and the address
+     * of the selected task's UNIT structure.
+     */
+    prev->next_time = next_switch_time - now;
+    prev->next_task = new_task;
+    new_task->migrated = false;
+
+    BUG_ON(prev->next_time <= 0);
+}
+
+/**
+ * Xen scheduler callback function to select a resource for the UNIT to run on
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ * @param unit      Pointer to struct sched_unit
+ *
+ * @return          Scheduler resource to run on
+ */
+static struct sched_resource *
+a653sched_pick_resource(const struct scheduler *ops,
+                        const struct sched_unit *unit)
+{
+    cpumask_t *online;
+    unsigned int cpu;
+
+    /*
+     * If present, prefer the unit's current processor; otherwise
+     * just pick the first valid pCPU.
+     */
+    online = cpupool_domain_master_cpumask(unit->domain);
+
+    cpu = cpumask_first(online);
+
+    if ( cpumask_test_cpu(sched_unit_master(unit), online)
+         || (cpu >= nr_cpu_ids) )
+        cpu = sched_unit_master(unit);
+
+    return get_sched_res(cpu);
+}
+
+/**
+ * Xen scheduler callback to change the scheduler of a cpu
+ *
+ * @param new_ops   Pointer to this instance of the scheduler structure
+ * @param cpu       The cpu that is changing scheduler
+ * @param pdata     scheduler specific PCPU data (we don't have any)
+ * @param vdata     scheduler specific UNIT data of the idle unit
+ */
+static spinlock_t *
+a653_switch_sched(struct scheduler *new_ops, unsigned int cpu,
+                  void *pdata, void *vdata)
+{
+    struct sched_resource *sr = get_sched_res(cpu);
+    arinc653_unit_t *svc = vdata;
+
+    ASSERT(!pdata && svc && is_idle_unit(svc->unit));
+
+    sched_idle_unit(cpu)->priv = vdata;
+
+    return &sr->_lock;
+}
+
+/**
+ * Xen scheduler callback function to perform a global (not domain-specific)
+ * adjustment. It is used by the ARINC 653 scheduler to put in place a new
+ * ARINC 653 schedule or to retrieve the schedule currently in place.
+ *
+ * @param ops       Pointer to this instance of the scheduler structure
+ * @param sc        Pointer to the scheduler operation specified by Domain 0
+ */
+static int
+a653sched_adjust_global(const struct scheduler *ops,
+                        struct xen_sysctl_scheduler_op *sc)
+{
+    struct xen_sysctl_arinc653_schedule local_sched;
+    int rc = -EINVAL;
+
+    switch ( sc->cmd )
+    {
+    case XEN_SYSCTL_SCHEDOP_putinfo:
+        if ( copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1) )
+        {
+            rc = -EFAULT;
+            break;
+        }
+
+        rc = arinc653_sched_set(ops, &local_sched);
+        break;
+    case XEN_SYSCTL_SCHEDOP_getinfo:
+        memset(&local_sched, -1, sizeof(local_sched));
+        rc = arinc653_sched_get(ops, &local_sched);
+        if ( rc )
+            break;
+
+        if ( copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1) )
+            rc = -EFAULT;
+        break;
+    }
+
+    return rc;
+}
+
+/**
+ * This structure defines our scheduler for Xen.
+ * The entries tell Xen where to find our scheduler-specific
+ * callback functions.
+ * The symbol must be visible to the rest of Xen at link time.
+ */
+static const struct scheduler sched_arinc653_def = {
+    .name           = "ARINC 653 Scheduler",
+    .opt_name       = "arinc653",
+    .sched_id       = XEN_SCHEDULER_ARINC653,
+    .sched_data     = NULL,
+
+    .init           = a653sched_init,
+    .deinit         = a653sched_deinit,
+
+    .free_udata     = a653sched_free_udata,
+    .alloc_udata    = a653sched_alloc_udata,
+
+    .insert_unit    = NULL,
+    .remove_unit    = NULL,
+
+    .sleep          = a653sched_unit_sleep,
+    .wake           = a653sched_unit_wake,
+    .yield          = NULL,
+    .context_saved  = NULL,
+
+    .do_schedule    = a653sched_do_schedule,
+
+    .pick_resource  = a653sched_pick_resource,
+
+    .switch_sched   = a653_switch_sched,
+
+    .adjust         = NULL,
+    .adjust_global  = a653sched_adjust_global,
+
+    .dump_settings  = NULL,
+    .dump_cpu_state = NULL,
+};
+
+REGISTER_SCHEDULER(sched_arinc653_def);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
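
a653sched_do_schedule() above walks a static cyclic schedule: each major frame is
carved into minor frames, one per schedule entry, and the entry to run is the first
one whose cumulative runtime exceeds the time elapsed in the current major frame;
once the entries are exhausted, the idle unit runs until the next major frame. The
following user-space sketch of that frame arithmetic is illustrative only (entry_t,
pick_entry and the sample runtimes are not part of the Xen code):

#include <stdio.h>
#include <stdint.h>

typedef struct {
    const char *name;      /* partition/domain name */
    int64_t runtime;       /* minor frame length */
} entry_t;

/*
 * Return the index of the schedule entry to run at offset 'now' into the
 * major frame; an index >= n means "run the idle task".  The offset of the
 * next switch is stored in *next_switch.
 */
static unsigned int pick_entry(const entry_t *sched, unsigned int n,
                               int64_t major_frame, int64_t now,
                               int64_t *next_switch)
{
    unsigned int i;
    int64_t t = 0;

    for ( i = 0; i < n; i++ )
    {
        t += sched[i].runtime;
        if ( now < t )
        {
            *next_switch = t;
            return i;
        }
    }

    /* Ran out of entries: idle until the next major frame starts. */
    *next_switch = major_frame;
    return n;
}

int main(void)
{
    const entry_t sched[] = { { "dom1", 10 }, { "dom2", 5 }, { "dom3", 15 } };
    int64_t next;
    unsigned int idx = pick_entry(sched, 3, 40, 12, &next);

    /* Prints: run entry 1 until t=15 */
    printf("run entry %u until t=%lld\n", idx, (long long)next);
    return 0;
}

With runtimes {10, 5, 15} and now = 12, the sketch selects entry 1 and schedules the
next switch at t = 15, mirroring the sched_index/next_switch_time bookkeeping above.
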
diff --git a/xen/common/sched/compat.c b/xen/common/sched/compat.c
new file mode 100644
index 0000000000..040b4caca2
--- /dev/null
+++ b/xen/common/sched/compat.c
@@ -0,0 +1,55 @@
+/****************************************************************************
+ * schedule.c
+ *
+ */
+
+#include <compat/sched.h>
+
+#define COMPAT
+#define ret_t int
+
+#define do_sched_op compat_sched_op
+
+#define xen_sched_pin_override sched_pin_override
+CHECK_sched_pin_override;
+#undef xen_sched_pin_override
+
+#define xen_sched_shutdown sched_shutdown
+CHECK_sched_shutdown;
+#undef xen_sched_shutdown
+
+#define xen_sched_remote_shutdown sched_remote_shutdown
+CHECK_sched_remote_shutdown;
+#undef xen_sched_remote_shutdown
+
+static int compat_poll(struct compat_sched_poll *compat)
+{
+    struct sched_poll native;
+
+#define XLAT_sched_poll_HNDL_ports(_d_, _s_) \
+    guest_from_compat_handle((_d_)->ports, (_s_)->ports)
+    XLAT_sched_poll(&native, compat);
+#undef XLAT_sched_poll_HNDL_ports
+
+    return do_poll(&native);
+}
+
+#define do_poll compat_poll
+#define sched_poll compat_sched_poll
+
+#include "core.c"
+
+int compat_set_timer_op(u32 lo, s32 hi)
+{
+    return do_set_timer_op(((s64)hi << 32) | lo);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
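
compat.c provides the 32-bit (compat) hypercall entry points by redefining
do_sched_op, do_poll and sched_poll and then re-including core.c, so the compat
variants share the native implementation. The only new logic is
compat_set_timer_op(), which reassembles a 64-bit deadline from the two 32-bit
halves passed by a 32-bit guest. A standalone sketch of that recombination, with
an illustrative name (combine_timer_op is not a Xen symbol):

#include <stdio.h>
#include <stdint.h>

/* Recombine the split 64-bit deadline: ((s64)hi << 32) | lo. */
static int64_t combine_timer_op(uint32_t lo, int32_t hi)
{
    return ((int64_t)hi << 32) | lo;
}

int main(void)
{
    int64_t deadline = 0x123456789abcdef0LL;
    uint32_t lo = (uint32_t)deadline;           /* low 32 bits:  0x9abcdef0 */
    int32_t  hi = (int32_t)(deadline >> 32);    /* high 32 bits: 0x12345678 */

    /* Prints: recombined: 0x123456789abcdef0 */
    printf("recombined: %#llx\n", (unsigned long long)combine_timer_op(lo, hi));
    return 0;
}

Keeping the low half unsigned matters: if lo were sign-extended, a set bit 31 would
corrupt the upper half of the reassembled value.
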
diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
new file mode 100644
index 0000000000..4d8eb4c617
--- /dev/null
+++ b/xen/common/sched/core.c
@@ -0,0 +1,3144 @@
+/****************************************************************************
+ * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
+ * (C) 2002-2003 University of Cambridge
+ * (C) 2004      - Mark Williamson - Intel Research Cambridge
+ ****************************************************************************
+ *
+ *        File: common/schedule.c
+ *      Author: Rolf Neugebauer & Keir Fraser
+ *              Updated for generic API by Mark Williamson
+ *
+ * Description: Generic CPU scheduling code
+ *              implements support functionality for the Xen scheduler API.
+ *
+ */
+
+#ifndef COMPAT
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/timer.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <xen/trace.h>
+#include <xen/mm.h>
+#include <xen/err.h>
+#include <xen/guest_access.h>
+#include <xen/hypercall.h>
+#include <xen/multicall.h>
+#include <xen/cpu.h>
+#include <xen/preempt.h>
+#include <xen/event.h>
+#include <public/sched.h>
+#include <xsm/xsm.h>
+#include <xen/err.h>
+
+#ifdef CONFIG_XEN_GUEST
+#include <asm/guest.h>
+#else
+#define pv_shim false
+#endif
+
+/* opt_sched: scheduler - default to configured value */
+static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT;
+string_param("sched", opt_sched);
+
+/* If sched_smt_power_savings is set, the scheduler will give preference
+ * to a partially idle package over a fully idle package when picking a
+ * pCPU on which to schedule a vCPU.
+ */
+bool_t sched_smt_power_savings = 0;
+boolean_param("sched_smt_power_savings", sched_smt_power_savings);
+
+/* Default scheduling rate limit: 1ms
+ * The behavior when sched_ratelimit_us is greater than sched_credit_tslice_ms
+ * is undefined.
+ */
+int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
+integer_param("sched_ratelimit_us", sched_ratelimit_us);
+
+/* Number of vcpus per struct sched_unit. */
+bool __read_mostly sched_disable_smt_switching;
+cpumask_t sched_res_mask;
+
+/* Common lock for free cpus. */
+static DEFINE_SPINLOCK(sched_free_cpu_lock);
+
+/* Various timer handlers. */
+static void s_timer_fn(void *unused);
+static void vcpu_periodic_timer_fn(void *data);
+static void vcpu_singleshot_timer_fn(void *data);
+static void poll_timer_fn(void *data);
+
+/* This is global for now so that private implementations can reach it */
+DEFINE_PER_CPU_READ_MOSTLY(struct sched_resource *, sched_res);
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, sched_res_idx);
+DEFINE_RCU_READ_LOCK(sched_res_rculock);
+
+/* Scratch space for cpumasks. */
+DEFINE_PER_CPU(cpumask_t, cpumask_scratch);
+
+/* How many urgent vcpus. */
+DEFINE_PER_CPU(atomic_t, sched_urgent_count);
+
+extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[];
+#define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array)
+#define schedulers __start_schedulers_array
+
+static struct scheduler __read_mostly ops;
+
+static bool scheduler_active;
+
+static void sched_set_affinity(
+    struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft);
+
+static struct sched_resource *
+sched_idle_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
+{
+    return unit->res;
+}
+
+static void *
+sched_idle_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
+                       void *dd)
+{
+    /* Any non-NULL pointer is fine here. */
+    return ZERO_BLOCK_PTR;
+}
+
+static void
+sched_idle_free_udata(const struct scheduler *ops, void *priv)
+{
+}
+
+static void sched_idle_schedule(
+    const struct scheduler *ops, struct sched_unit *unit, s_time_t now,
+    bool tasklet_work_scheduled)
+{
+    const unsigned int cpu = smp_processor_id();
+
+    unit->next_time = -1;
+    unit->next_task = sched_idle_unit(cpu);
+}
+
+static struct scheduler sched_idle_ops = {
+    .name           = "Idle Scheduler",
+    .opt_name       = "idle",
+    .sched_data     = NULL,
+
+    .pick_resource  = sched_idle_res_pick,
+    .do_schedule    = sched_idle_schedule,
+
+    .alloc_udata    = sched_idle_alloc_udata,
+    .free_udata     = sched_idle_free_udata,
+};
+
+static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit,
+                                         unsigned int cpu)
+{
+    unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu);
+    const struct domain *d = unit->domain;
+
+    return (idx < d->max_vcpus) ? d->vcpu[idx] : NULL;
+}
+
+static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit,
+                                               unsigned int cpu)
+{
+    struct vcpu *v = unit2vcpu_cpu(unit, cpu);
+
+    return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu];
+}
+
+static inline struct scheduler *dom_scheduler(const struct domain *d)
+{
+    if ( likely(d->cpupool != NULL) )
+        return d->cpupool->sched;
+
+    /*
+     * If d->cpupool is NULL, this is the idle domain. This is special
+     * because the idle domain does not really belong to any cpupool, and,
+     * hence, does not really have a scheduler.
+     *
+     * This is (should be!) only called like this for allocating the idle
+     * vCPUs for the first time, during boot, in which case what we want
+     * is the default scheduler that has been chosen at boot.
+     */
+    ASSERT(is_idle_domain(d));
+    return &ops;
+}
+
+static inline struct scheduler *unit_scheduler(const struct sched_unit *unit)
+{
+    struct domain *d = unit->domain;
+
+    if ( likely(d->cpupool != NULL) )
+        return d->cpupool->sched;
+
+    /*
+     * If d->cpupool is NULL, this is a unit of the idle domain. And this
+     * case is special because the idle domain does not really belong to
+     * a cpupool and, hence, doesn't really have a scheduler. In fact, its
+     * units (may) run on pCPUs which are in different pools, with different
+     * schedulers.
+     *
+     * What we want, in this case, is the scheduler of the pCPU where this
+     * particular idle unit is running. And, since unit->res never changes
+     * for idle units, it is safe to use it, with no locks, to figure that out.
+     */
+
+    ASSERT(is_idle_domain(d));
+    return unit->res->scheduler;
+}
+
+static inline struct scheduler *vcpu_scheduler(const struct vcpu *v)
+{
+    return unit_scheduler(v->sched_unit);
+}
+#define VCPU2ONLINE(_v) cpupool_domain_master_cpumask((_v)->domain)
+
+static inline void trace_runstate_change(struct vcpu *v, int new_state)
+{
+    struct { uint32_t vcpu:16, domain:16; } d;
+    uint32_t event;
+
+    if ( likely(!tb_init_done) )
+        return;
+
+    d.vcpu = v->vcpu_id;
+    d.domain = v->domain->domain_id;
+
+    event = TRC_SCHED_RUNSTATE_CHANGE;
+    event |= ( v->runstate.state & 0x3 ) << 8;
+    event |= ( new_state & 0x3 ) << 4;
+
+    __trace_var(event, 1/*tsc*/, sizeof(d), &d);
+}
+
+static inline void trace_continue_running(struct vcpu *v)
+{
+    struct { uint32_t vcpu:16, domain:16; } d;
+
+    if ( likely(!tb_init_done) )
+        return;
+
+    d.vcpu = v->vcpu_id;
+    d.domain = v->domain->domain_id;
+
+    __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d);
+}
+
+static inline void vcpu_urgent_count_update(struct vcpu *v)
+{
+    if ( is_idle_vcpu(v) )
+        return;
+
+    if ( unlikely(v->is_urgent) )
+    {
+        if ( !(v->pause_flags & VPF_blocked) ||
+             !test_bit(v->vcpu_id, v->domain->poll_mask) )
+        {
+            v->is_urgent = 0;
+            atomic_dec(&per_cpu(sched_urgent_count, v->processor));
+        }
+    }
+    else
+    {
+        if ( unlikely(v->pause_flags & VPF_blocked) &&
+             unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) )
+        {
+            v->is_urgent = 1;
+            atomic_inc(&per_cpu(sched_urgent_count, v->processor));
+        }
+    }
+}
+
+static inline void vcpu_runstate_change(
+    struct vcpu *v, int new_state, s_time_t new_entry_time)
+{
+    s_time_t delta;
+    struct sched_unit *unit = v->sched_unit;
+
+    ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
+    if ( v->runstate.state == new_state )
+        return;
+
+    vcpu_urgent_count_update(v);
+
+    trace_runstate_change(v, new_state);
+
+    if ( !is_idle_vcpu(v) )
+    {
+        unit->runstate_cnt[v->runstate.state]--;
+        unit->runstate_cnt[new_state]++;
+    }
+
+    delta = new_entry_time - v->runstate.state_entry_time;
+    if ( delta > 0 )
+    {
+        v->runstate.time[v->runstate.state] += delta;
+        v->runstate.state_entry_time = new_entry_time;
+    }
+
+    v->runstate.state = new_state;
+}
+
+void sched_guest_idle(void (*idle) (void), unsigned int cpu)
+{
+    /*
+     * Another vcpu of the unit is active in guest context while this one is
+     * idle. In case of a scheduling event we don't want to have high latencies
+     * due to a cpu needing to wake up from deep C state for joining the
+     * rendezvous, so avoid those deep C states by incrementing the urgent
+     * count of the cpu.
+     */
+    atomic_inc(&per_cpu(sched_urgent_count, cpu));
+    idle();
+    atomic_dec(&per_cpu(sched_urgent_count, cpu));
+}
+
+void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
+{
+    spinlock_t *lock;
+    s_time_t delta;
+
+    rcu_read_lock(&sched_res_rculock);
+
+    lock = likely(v == current) ? NULL : unit_schedule_lock_irq(v->sched_unit);
+    memcpy(runstate, &v->runstate, sizeof(*runstate));
+    delta = NOW() - runstate->state_entry_time;
+    if ( delta > 0 )
+        runstate->time[runstate->state] += delta;
+
+    if ( unlikely(lock != NULL) )
+        unit_schedule_unlock_irq(lock, v->sched_unit);
+
+    rcu_read_unlock(&sched_res_rculock);
+}
+
+uint64_t get_cpu_idle_time(unsigned int cpu)
+{
+    struct vcpu_runstate_info state = { 0 };
+    struct vcpu *v = idle_vcpu[cpu];
+
+    if ( cpu_online(cpu) && v )
+        vcpu_runstate_get(v, &state);
+
+    return state.time[RUNSTATE_running];
+}
+
+/*
+ * If locks are different, take the one with the lower address first.
+ * This avoids dead- or live-locks when this code is running on both
+ * cpus at the same time.
+ */
+static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
+                                   unsigned long *flags)
+{
+    if ( lock1 == lock2 )
+    {
+        spin_lock_irqsave(lock1, *flags);
+    }
+    else if ( lock1 < lock2 )
+    {
+        spin_lock_irqsave(lock1, *flags);
+        spin_lock(lock2);
+    }
+    else
+    {
+        spin_lock_irqsave(lock2, *flags);
+        spin_lock(lock1);
+    }
+}
+
+static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
+                                     unsigned long flags)
+{
+    if ( lock1 != lock2 )
+        spin_unlock(lock2);
+    spin_unlock_irqrestore(lock1, flags);
+}
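
sched_spin_lock_double() avoids ABBA deadlocks by always taking the lock with the
lower address first, so two CPUs locking the same pair concurrently agree on the
acquisition order. The same pattern in a standalone form, using POSIX mutexes purely
for illustration (Xen of course uses its own spinlocks and the IRQ-saving variants
above):

#include <pthread.h>

/* Acquire two mutexes in a global order (by address) to avoid ABBA deadlock. */
static void lock_double(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if ( a == b )
        pthread_mutex_lock(a);
    else if ( a < b )
    {
        pthread_mutex_lock(a);
        pthread_mutex_lock(b);
    }
    else
    {
        pthread_mutex_lock(b);
        pthread_mutex_lock(a);
    }
}

/* Release both; the second unlock is skipped if both arguments are the same lock. */
static void unlock_double(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if ( a != b )
        pthread_mutex_unlock(b);
    pthread_mutex_unlock(a);
}

int main(void)
{
    pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

    /* Whichever order callers pass the locks in, acquisition order is fixed. */
    lock_double(&m1, &m2);
    unlock_double(&m1, &m2);
    return 0;
}

Only the acquisition order has to be globally consistent; the release order does not
matter, which is why the unlock helper simply mirrors its arguments.
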
+
+static void sched_free_unit_mem(struct sched_unit *unit)
+{
+    struct sched_unit *prev_unit;
+    struct domain *d = unit->domain;
+
+    if ( d->sched_unit_list == unit )
+        d->sched_unit_list = unit->next_in_list;
+    else
+    {
+        for_each_sched_unit ( d, prev_unit )
+        {
+            if ( prev_unit->next_in_list == unit )
+            {
+                prev_unit->next_in_list = unit->next_in_list;
+                break;
+            }
+        }
+    }
+
+    free_cpumask_var(unit->cpu_hard_affinity);
+    free_cpumask_var(unit->cpu_hard_affinity_saved);
+    free_cpumask_var(unit->cpu_soft_affinity);
+
+    xfree(unit);
+}
+
+static void sched_free_unit(struct sched_unit *unit, struct vcpu *v)
+{
+    struct vcpu *vunit;
+    unsigned int cnt = 0;
+
+    /* Don't count the vcpu being released; it might not be in the vcpu list yet. */
+    for_each_sched_unit_vcpu ( unit, vunit )
+        if ( vunit != v )
+            cnt++;
+
+    v->sched_unit = NULL;
+    unit->runstate_cnt[v->runstate.state]--;
+
+    if ( unit->vcpu_list == v )
+        unit->vcpu_list = v->next_in_list;
+
+    if ( !cnt )
+        sched_free_unit_mem(unit);
+}
+
+static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
+{
+    v->sched_unit = unit;
+
+    /* All but idle vcpus are allocated with sequential vcpu_id. */
+    if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id )
+    {
+        unit->vcpu_list = v;
+        /*
+         * unit_id is always the same as the lowest vcpu_id of the unit.
+         * This is used to stop the for_each_sched_unit_vcpu() loop and to
+         * support cpupools with different granularities.
+         */
+        unit->unit_id = v->vcpu_id;
+    }
+    unit->runstate_cnt[v->runstate.state]++;
+}
+
+static struct sched_unit *sched_alloc_unit_mem(void)
+{
+    struct sched_unit *unit;
+
+    unit = xzalloc(struct sched_unit);
+    if ( !unit )
+        return NULL;
+
+    if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) ||
+         !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) ||
+         !zalloc_cpumask_var(&unit->cpu_soft_affinity) )
+    {
+        sched_free_unit_mem(unit);
+        unit = NULL;
+    }
+
+    return unit;
+}
+
+static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d)
+{
+    struct sched_unit **prev_unit;
+
+    unit->domain = d;
+
+    for ( prev_unit = &d->sched_unit_list; *prev_unit;
+          prev_unit = &(*prev_unit)->next_in_list )
+        if ( (*prev_unit)->next_in_list &&
+             (*prev_unit)->next_in_list->unit_id > unit->unit_id )
+            break;
+
+    unit->next_in_list = *prev_unit;
+    *prev_unit = unit;
+}
+
+static struct sched_unit *sched_alloc_unit(struct vcpu *v)
+{
+    struct sched_unit *unit;
+    struct domain *d = v->domain;
+    unsigned int gran = cpupool_get_granularity(d->cpupool);
+
+    for_each_sched_unit ( d, unit )
+        if ( unit->unit_id / gran == v->vcpu_id / gran )
+            break;
+
+    if ( unit )
+    {
+        sched_unit_add_vcpu(unit, v);
+        return unit;
+    }
+
+    if ( (unit = sched_alloc_unit_mem()) == NULL )
+        return NULL;
+
+    sched_unit_add_vcpu(unit, v);
+    sched_domain_insert_unit(unit, d);
+
+    return unit;
+}
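
sched_alloc_unit() groups vcpus into scheduling units according to the cpupool's
granularity: two vcpus share a unit exactly when their vcpu_ids fall into the same
gran-sized block, and sched_unit_add_vcpu() makes the unit id the lowest vcpu_id in
that block. A tiny sketch of that mapping (unit_id_of is illustrative, not a Xen
helper):

#include <stdio.h>

/* Unit id of a vcpu for a given granularity: the lowest vcpu_id in its group. */
static unsigned int unit_id_of(unsigned int vcpu_id, unsigned int gran)
{
    return (vcpu_id / gran) * gran;
}

int main(void)
{
    unsigned int v;

    /* With gran = 2: vcpus 0,1 -> unit 0; 2,3 -> unit 2; 4 -> unit 4. */
    for ( v = 0; v < 5; v++ )
        printf("vcpu %u -> unit %u\n", v, unit_id_of(v, 2));

    return 0;
}

With gran = 1 every vcpu is its own unit; larger granularities (e.g. per-core with
SMT siblings) group several vcpus into one unit that is scheduled as a whole.
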
+
+static unsigned int sched_select_initial_cpu(const struct vcpu *v)
+{
+    const struct domain *d = v->domain;
+    nodeid_t node;
+    spinlock_t *lock;
+    unsigned long flags;
+    unsigned int cpu_ret, cpu = smp_processor_id();
+    cpumask_t *cpus = cpumask_scratch_cpu(cpu);
+
+    lock = pcpu_schedule_lock_irqsave(cpu, &flags);
+    cpumask_clear(cpus);
+    for_each_node_mask ( node, d->node_affinity )
+        cpumask_or(cpus, cpus, &node_to_cpumask(node));
+    cpumask_and(cpus, cpus, d->cpupool->cpu_valid);
+    if ( cpumask_empty(cpus) )
+        cpumask_copy(cpus, d->cpupool->cpu_valid);
+
+    if ( v->vcpu_id == 0 )
+        cpu_ret = cpumask_first(cpus);
+    else
+    {
+        /* We can rely on previous vcpu being available. */
+        ASSERT(!is_idle_domain(d));
+
+        cpu_ret = cpumask_cycle(d->vcpu[v->vcpu_id - 1]->processor, cpus);
+    }
+
+    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
+
+    return cpu_ret;
+}
+
+int sched_init_vcpu(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    struct sched_unit *unit;
+    unsigned int processor;
+
+    if ( (unit = sched_alloc_unit(v)) == NULL )
+        return 1;
+
+    if ( is_idle_domain(d) )
+        processor = v->vcpu_id;
+    else
+        processor = sched_select_initial_cpu(v);
+
+    /* Initialise the per-vcpu timers. */
+    spin_lock_init(&v->periodic_timer_lock);
+    init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor);
+    init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor);
+    init_timer(&v->poll_timer, poll_timer_fn, v, processor);
+
+    /* If this is not the first vcpu of the unit we are done. */
+    if ( unit->priv != NULL )
+    {
+        v->processor = processor;
+        return 0;
+    }
+
+    rcu_read_lock(&sched_res_rculock);
+
+    /* The first vcpu of a unit can be set via sched_set_res(). */
+    sched_set_res(unit, get_sched_res(processor));
+
+    unit->priv = sched_alloc_udata(dom_scheduler(d), unit, d->sched_priv);
+    if ( unit->priv == NULL )
+    {
+        sched_free_unit(unit, v);
+        rcu_read_unlock(&sched_res_rculock);
+        return 1;
+    }
+
+    /*
+     * Initialize affinity settings. The idler, and potentially
+     * domain-0 VCPUs, are pinned onto their respective physical CPUs.
+     */
+    if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) )
+        sched_set_affinity(unit, cpumask_of(processor), &cpumask_all);
+    else
+        sched_set_affinity(unit, &cpumask_all, &cpumask_all);
+
+    /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */
+    if ( is_idle_domain(d) )
+    {
+        get_sched_res(v->processor)->curr = unit;
+        get_sched_res(v->processor)->sched_unit_idle = unit;
+        v->is_running = 1;
+        unit->is_running = true;
+        unit->state_entry_time = NOW();
+    }
+    else
+    {
+        sched_insert_unit(dom_scheduler(d), unit);
+    }
+
+    rcu_read_unlock(&sched_res_rculock);
+
+    return 0;
+}
+
+static void vcpu_move_irqs(struct vcpu *v)
+{
+    arch_move_irqs(v);
+    evtchn_move_pirqs(v);
+}
+
+static void sched_move_irqs(const struct sched_unit *unit)
+{
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        vcpu_move_irqs(v);
+}
+
+int sched_move_domain(struct domain *d, struct cpupool *c)
+{
+    struct vcpu *v;
+    struct sched_unit *unit;
+    unsigned int new_p, unit_idx;
+    void **unit_priv;
+    void *domdata;
+    void *unitdata;
+    struct scheduler *old_ops;
+    void *old_domdata;
+    unsigned int gran = cpupool_get_granularity(c);
+    int ret = 0;
+
+    for_each_vcpu ( d, v )
+    {
+        if ( v->affinity_broken )
+            return -EBUSY;
+    }
+
+    rcu_read_lock(&sched_res_rculock);
+
+    domdata = sched_alloc_domdata(c->sched, d);
+    if ( IS_ERR(domdata) )
+    {
+        ret = PTR_ERR(domdata);
+        goto out;
+    }
+
+    unit_priv = xzalloc_array(void *, DIV_ROUND_UP(d->max_vcpus, gran));
+    if ( unit_priv == NULL )
+    {
+        sched_free_domdata(c->sched, domdata);
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    unit_idx = 0;
+    for_each_sched_unit ( d, unit )
+    {
+        unit_priv[unit_idx] = sched_alloc_udata(c->sched, unit, domdata);
+        if ( unit_priv[unit_idx] == NULL )
+        {
+            for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ )
+                sched_free_udata(c->sched, unit_priv[unit_idx]);
+            xfree(unit_priv);
+            sched_free_domdata(c->sched, domdata);
+            ret = -ENOMEM;
+            goto out;
+        }
+        unit_idx++;
+    }
+
+    domain_pause(d);
+
+    old_ops = dom_scheduler(d);
+    old_domdata = d->sched_priv;
+
+    for_each_sched_unit ( d, unit )
+    {
+        sched_remove_unit(old_ops, unit);
+    }
+
+    d->cpupool = c;
+    d->sched_priv = domdata;
+
+    new_p = cpumask_first(c->cpu_valid);
+    unit_idx = 0;
+    for_each_sched_unit ( d, unit )
+    {
+        spinlock_t *lock;
+        unsigned int unit_p = new_p;
+
+        unitdata = unit->priv;
+
+        for_each_sched_unit_vcpu ( unit, v )
+        {
+            migrate_timer(&v->periodic_timer, new_p);
+            migrate_timer(&v->singleshot_timer, new_p);
+            migrate_timer(&v->poll_timer, new_p);
+            new_p = cpumask_cycle(new_p, c->cpu_valid);
+        }
+
+        lock = unit_schedule_lock_irq(unit);
+
+        sched_set_affinity(unit, &cpumask_all, &cpumask_all);
+
+        sched_set_res(unit, get_sched_res(unit_p));
+        /*
+         * With v->processor modified we must not
+         * - make any further changes assuming we hold the scheduler lock,
+         * - use unit_schedule_unlock_irq().
+         */
+        spin_unlock_irq(lock);
+
+        unit->priv = unit_priv[unit_idx];
+        if ( !d->is_dying )
+            sched_move_irqs(unit);
+
+        sched_insert_unit(c->sched, unit);
+
+        sched_free_udata(old_ops, unitdata);
+
+        unit_idx++;
+    }
+
+    domain_update_node_affinity(d);
+
+    domain_unpause(d);
+
+    sched_free_domdata(old_ops, old_domdata);
+
+    xfree(unit_priv);
+
+out:
+    rcu_read_unlock(&sched_res_rculock);
+
+    return ret;
+}
+
+void sched_destroy_vcpu(struct vcpu *v)
+{
+    struct sched_unit *unit = v->sched_unit;
+
+    kill_timer(&v->periodic_timer);
+    kill_timer(&v->singleshot_timer);
+    kill_timer(&v->poll_timer);
+    if ( test_and_clear_bool(v->is_urgent) )
+        atomic_dec(&per_cpu(sched_urgent_count, v->processor));
+    /*
+     * Vcpus are being destroyed top-down, so being the first vcpu of a unit
+     * is the same as being the only one.
+     */
+    if ( unit->vcpu_list == v )
+    {
+        rcu_read_lock(&sched_res_rculock);
+
+        sched_remove_unit(vcpu_scheduler(v), unit);
+        sched_free_udata(vcpu_scheduler(v), unit->priv);
+        sched_free_unit(unit, v);
+
+        rcu_read_unlock(&sched_res_rculock);
+    }
+}
+
+int sched_init_domain(struct domain *d, int poolid)
+{
+    void *sdom;
+    int ret;
+
+    ASSERT(d->cpupool == NULL);
+    ASSERT(d->domain_id < DOMID_FIRST_RESERVED);
+
+    if ( (ret = cpupool_add_domain(d, poolid)) )
+        return ret;
+
+    SCHED_STAT_CRANK(dom_init);
+    TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id);
+
+    rcu_read_lock(&sched_res_rculock);
+
+    sdom = sched_alloc_domdata(dom_scheduler(d), d);
+
+    rcu_read_unlock(&sched_res_rculock);
+
+    if ( IS_ERR(sdom) )
+        return PTR_ERR(sdom);
+
+    d->sched_priv = sdom;
+
+    return 0;
+}
+
+void sched_destroy_domain(struct domain *d)
+{
+    ASSERT(d->domain_id < DOMID_FIRST_RESERVED);
+
+    if ( d->cpupool )
+    {
+        SCHED_STAT_CRANK(dom_destroy);
+        TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id);
+
+        rcu_read_lock(&sched_res_rculock);
+
+        sched_free_domdata(dom_scheduler(d), d->sched_priv);
+        d->sched_priv = NULL;
+
+        rcu_read_unlock(&sched_res_rculock);
+
+        cpupool_rm_domain(d);
+    }
+}
+
+static void vcpu_sleep_nosync_locked(struct vcpu *v)
+{
+    struct sched_unit *unit = v->sched_unit;
+
+    ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
+
+    if ( likely(!vcpu_runnable(v)) )
+    {
+        if ( v->runstate.state == RUNSTATE_runnable )
+            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+
+        /* Only put the unit to sleep if none of its vcpus are runnable. */
+        if ( likely(!unit_runnable(unit)) )
+            sched_sleep(unit_scheduler(unit), unit);
+        else if ( unit_running(unit) > 1 && v->is_running &&
+                  !v->force_context_switch )
+        {
+            v->force_context_switch = true;
+            cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
+        }
+    }
+}
+
+void vcpu_sleep_nosync(struct vcpu *v)
+{
+    unsigned long flags;
+    spinlock_t *lock;
+
+    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
+
+    rcu_read_lock(&sched_res_rculock);
+
+    lock = unit_schedule_lock_irqsave(v->sched_unit, &flags);
+
+    vcpu_sleep_nosync_locked(v);
+
+    unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit);
+
+    rcu_read_unlock(&sched_res_rculock);
+}
+
+void vcpu_sleep_sync(struct vcpu *v)
+{
+    vcpu_sleep_nosync(v);
+
+    while ( !vcpu_runnable(v) && v->is_running )
+        cpu_relax();
+
+    sync_vcpu_execstate(v);
+}
+
+void vcpu_wake(struct vcpu *v)
+{
+    unsigned long flags;
+    spinlock_t *lock;
+    struct sched_unit *unit = v->sched_unit;
+
+    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
+
+    rcu_read_lock(&sched_res_rculock);
+
+    lock = unit_schedule_lock_irqsave(unit, &flags);
+
+    if ( likely(vcpu_runnable(v)) )
+    {
+        if ( v->runstate.state >= RUNSTATE_blocked )
+            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
+        /*
+         * Call sched_wake() unconditionally, even if unit is running already.
+         * We might not have been de-scheduled after vcpu_sleep_nosync_locked()
+         * and are now to be woken up again.
+         */
+        sched_wake(unit_scheduler(unit), unit);
+        if ( unit->is_running && !v->is_running && !v->force_context_switch )

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.