[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [Patch 1/6] xen: cpupool support - hypervisor support of cpupools


  • To: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
  • From: Juergen Gross <juergen.gross@xxxxxxxxxxxxxx>
  • Date: Fri, 17 Apr 2009 11:53:56 +0200
  • Delivery-date: Fri, 17 Apr 2009 02:56:26 -0700
  • Domainkey-signature: s=s1536a; d=ts.fujitsu.com; c=nofws; q=dns; h=X-SBRSScore:X-IronPort-AV:Received:X-IronPort-AV: Received:Received:Message-ID:Date:From:Organization: User-Agent:MIME-Version:To:Subject:X-Enigmail-Version: Content-Type; b=t3WwjMSh2l9diu2v0OapHIDGATMF07rlFYtYYb8loojF6KqkAy1QJbnW X4xamnSSkuWN0eAl8nI7xSmBRaxDfZJ4qOvV++sg8eh24n37hPGBsNO5W LxJkC91XwgF6W4i6bPo9EEtSXZsPzW5a+kZykL8dQTIkOdkBsNU6LhHyf Vs1NyzOkH90m8HgtEMMkF1qHNiI1DxEw+oHnKGg2RfjYN/eytIXecKGUx xQpyPNlgyVblAg4f9CStHxQjfTwJC;
  • List-id: Xen developer discussion <xen-devel.lists.xensource.com>

Signed-off-by: juergen.gross@xxxxxxxxxxxxxx

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 636 47950
Fujitsu Technology Solutions               e-mail: juergen.gross@xxxxxxxxxxxxxx
Otto-Hahn-Ring 6                        Internet: ts.fujitsu.com
D-81739 Muenchen                 Company details: ts.fujitsu.com/imprint.html
diff -r 655dc3bc1d8e xen/arch/x86/acpi/cpu_idle.c
--- a/xen/arch/x86/acpi/cpu_idle.c      Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/arch/x86/acpi/cpu_idle.c      Thu Apr 16 15:04:13 2009 +0200
@@ -198,7 +198,7 @@ static void acpi_processor_idle(void)
 
     cpufreq_dbs_timer_suspend();
 
-    sched_tick_suspend();
+    sched_tick_suspend(smp_processor_id());
     /*
      * sched_tick_suspend may raise TIMER_SOFTIRQ by __stop_timer,
      * which will break the later assumption of no sofirq pending,
@@ -216,7 +216,7 @@ static void acpi_processor_idle(void)
     if ( softirq_pending(smp_processor_id()) )
     {
         local_irq_enable();
-        sched_tick_resume();
+        sched_tick_resume(smp_processor_id());
         cpufreq_dbs_timer_resume();
         return;
     }
@@ -237,7 +237,7 @@ static void acpi_processor_idle(void)
             pm_idle_save();
         else
             acpi_safe_halt();
-        sched_tick_resume();
+        sched_tick_resume(smp_processor_id());
         cpufreq_dbs_timer_resume();
         return;
     }
@@ -345,7 +345,7 @@ static void acpi_processor_idle(void)
 
     default:
         local_irq_enable();
-        sched_tick_resume();
+        sched_tick_resume(smp_processor_id());
         cpufreq_dbs_timer_resume();
         return;
     }
@@ -357,7 +357,7 @@ static void acpi_processor_idle(void)
         cx->time += sleep_ticks;
     }
 
-    sched_tick_resume();
+    sched_tick_resume(smp_processor_id());
     cpufreq_dbs_timer_resume();
 
     if ( cpuidle_current_governor->reflect )
diff -r 655dc3bc1d8e xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/arch/x86/domain.c     Thu Apr 09 11:58:17 2009 +0200
@@ -1412,7 +1412,13 @@ struct migrate_info {
     void (*saved_schedule_tail)(struct vcpu *);
     cpumask_t saved_affinity;
     unsigned int nest;
+    int borrowed;
 };
+
+long continue_hypercall_on_cpu_dummy(void *data)
+{
+    return 0;
+}
 
 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
 {
@@ -1420,8 +1426,16 @@ static void continue_hypercall_on_cpu_he
     struct migrate_info *info = v->arch.continue_info;
     cpumask_t mask = info->saved_affinity;
     void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
+    int cpu = -1;
 
     regs->eax = info->func(info->data);
+
+    if ( (info->nest == 0) && info->borrowed &&
+         ((cpu = cpupool_return_cpu(v->domain->cpupool)) >= 0) )
+    {
+        continue_hypercall_on_cpu(cpu, continue_hypercall_on_cpu_dummy,
+            info->data);
+    }
 
     if ( info->nest-- == 0 )
     {
@@ -1440,27 +1454,32 @@ int continue_hypercall_on_cpu(int cpu, l
     struct migrate_info *info;
     cpumask_t mask = cpumask_of_cpu(cpu);
     int rc;
+    int borrowed = 0;
 
     if ( cpu == smp_processor_id() )
         return func(data);
 
+    borrowed = cpupool_borrow_cpu(v->domain->cpupool, cpu);
+
     info = v->arch.continue_info;
     if ( info == NULL )
     {
         info = xmalloc(struct migrate_info);
+        rc = -ENOMEM;
         if ( info == NULL )
-            return -ENOMEM;
+            goto out;
 
         rc = vcpu_lock_affinity(v, &mask);
         if ( rc )
         {
             xfree(info);
-            return rc;
+            goto out;
         }
 
         info->saved_schedule_tail = v->arch.schedule_tail;
         info->saved_affinity = mask;
         info->nest = 0;
+        info->borrowed = 0;
 
         v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
         v->arch.continue_info = info;
@@ -1470,16 +1489,22 @@ int continue_hypercall_on_cpu(int cpu, l
         BUG_ON(info->nest != 0);
         rc = vcpu_locked_change_affinity(v, &mask);
         if ( rc )
-            return rc;
+            goto out;
         info->nest++;
     }
 
+    info->borrowed += borrowed;
     info->func = func;
     info->data = data;
 
     /* Dummy return value will be overwritten by new schedule_tail. */
     BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
     return 0;
+
+out:
+    if ( borrowed )
+        cpupool_return_cpu(v->domain->cpupool);
+    return rc;
 }
 
 #define next_arg(fmt, args) ({                                              \
diff -r 655dc3bc1d8e xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/arch/x86/domain_build.c       Thu Apr 09 11:58:46 2009 +0200
@@ -9,6 +9,7 @@
 #include <xen/lib.h>
 #include <xen/ctype.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/smp.h>
 #include <xen/delay.h>
 #include <xen/event.h>
@@ -706,13 +707,13 @@ int __init construct_dom0(
         shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
 
     if ( opt_dom0_max_vcpus == 0 )
-        opt_dom0_max_vcpus = num_online_cpus();
+        opt_dom0_max_vcpus = num_cpupool_cpus(cpupool0);
     if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS )
         opt_dom0_max_vcpus = MAX_VIRT_CPUS;
     printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
 
     for ( i = 1; i < opt_dom0_max_vcpus; i++ )
-        (void)alloc_vcpu(d, i, i % num_online_cpus());
+        (void)alloc_vcpu(d, i, i % num_cpupool_cpus(cpupool0));
 
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(d) )
diff -r 655dc3bc1d8e xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/arch/x86/mm.c Thu Apr 09 12:00:02 2009 +0200
@@ -212,7 +212,7 @@ void __init arch_init_memory(void)
      * Any Xen-heap pages that we will allow to be mapped will have
      * their domain field set to dom_xen.
      */
-    dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
+    dom_xen = domain_create(DOMID_XEN, CPUPOOLID_NONE, DOMCRF_dummy, 0);
     BUG_ON(dom_xen == NULL);
 
     /*
@@ -220,7 +220,7 @@ void __init arch_init_memory(void)
      * This domain owns I/O pages that are within the range of the page_info
      * array. Mappings occur at the priv of the caller.
      */
-    dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
+    dom_io = domain_create(DOMID_IO, CPUPOOLID_NONE, DOMCRF_dummy, 0);
     BUG_ON(dom_io == NULL);
 
     /* First 1MB of RAM is historically marked as I/O. */
diff -r 655dc3bc1d8e xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/arch/x86/setup.c      Thu Apr 16 08:20:11 2009 +0200
@@ -2,6 +2,7 @@
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/domain.h>
 #include <xen/serial.h>
 #include <xen/softirq.h>
@@ -232,7 +233,7 @@ static void __init init_idle_domain(void
     /* Domain creation requires that scheduler structures are initialised. */
     scheduler_init();
 
-    idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
+    idle_domain = domain_create(IDLE_DOMAIN_ID, CPUPOOLID_NONE, 0, 0);
     if ( (idle_domain == NULL) || (alloc_vcpu(idle_domain, 0, 0) == NULL) )
         BUG();
 
@@ -995,8 +996,12 @@ void __init __start_xen(unsigned long mb
     if ( !tboot_protect_mem_regions() )
         panic("Could not protect TXT memory regions\n");
 
+    /* Create initial cpupool 0. */
+    cpupool0 = cpupool_create(0, NULL);
+    if ( (cpupool0 == NULL) || cpupool0_cpu_assign(cpupool0) )
+        panic("Error creating cpupool 0\n");
     /* Create initial domain 0. */
-    dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF);
+    dom0 = domain_create(0, 0, DOMCRF_s3_integrity, DOM0_SSIDREF);
     if ( (dom0 == NULL) || (alloc_vcpu(dom0, 0, 0) == NULL) )
         panic("Error creating domain 0\n");
 
diff -r 655dc3bc1d8e xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/arch/x86/smpboot.c    Thu Apr 09 12:04:14 2009 +0200
@@ -1265,7 +1265,7 @@ int __cpu_disable(void)
        /* It's now safe to remove this processor from the online map */
        cpu_clear(cpu, cpu_online_map);
 
-       cpu_disable_scheduler();
+       cpu_disable_scheduler(cpu, 0);
 
        return 0;
 }
@@ -1299,7 +1299,7 @@ int cpu_down(unsigned int cpu)
        int err = 0;
 
        spin_lock(&cpu_add_remove_lock);
-       if (num_online_cpus() == 1) {
+       if (cpupool_cpu_remove(cpu)) {
                err = -EBUSY;
                goto out;
        }
@@ -1451,6 +1451,7 @@ int __devinit __cpu_up(unsigned int cpu)
                process_pending_timers();
        }
 
+       cpupool_cpu_add(cpu);
        cpufreq_add_cpu(cpu);
        return 0;
 }
diff -r 655dc3bc1d8e xen/common/Makefile
--- a/xen/common/Makefile       Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/common/Makefile       Thu Apr 09 12:04:41 2009 +0200
@@ -1,4 +1,5 @@ obj-y += bitmap.o
 obj-y += bitmap.o
+obj-y += cpupool.o
 obj-y += domctl.o
 obj-y += domain.o
 obj-y += event_channel.o
diff -r 655dc3bc1d8e xen/common/domain.c
--- a/xen/common/domain.c       Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/common/domain.c       Thu Apr 09 13:45:33 2009 +0200
@@ -187,7 +187,7 @@ struct vcpu *alloc_idle_vcpu(unsigned in
         return v;
 
     d = (vcpu_id == 0) ?
-        domain_create(IDLE_DOMAIN_ID, 0, 0) :
+        domain_create(IDLE_DOMAIN_ID, CPUPOOLID_NONE, 0, 0) :
         idle_vcpu[cpu_id - vcpu_id]->domain;
     BUG_ON(d == NULL);
 
@@ -198,7 +198,7 @@ struct vcpu *alloc_idle_vcpu(unsigned in
 }
 
 struct domain *domain_create(
-    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
+    domid_t domid, int poolid, unsigned int domcr_flags, ssidref_t ssidref)
 {
     struct domain *d, **pd;
     enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2,
@@ -259,6 +259,9 @@ struct domain *domain_create(
     d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex);
     d->irq_caps   = rangeset_new(d, "Interrupts", 0);
     if ( (d->iomem_caps == NULL) || (d->irq_caps == NULL) )
+        goto fail;
+
+    if ( cpupool_add_domain(d, poolid) != 0 )
         goto fail;
 
     if ( sched_init_domain(d) != 0 )
@@ -564,6 +567,8 @@ static void complete_domain_destroy(stru
 
     sched_destroy_domain(d);
 
+    cpupool_rm_domain(d);
+
     /* Free page used by xen oprofile buffer. */
     free_xenoprof_pages(d);
 
diff -r 655dc3bc1d8e xen/common/domctl.c
--- a/xen/common/domctl.c       Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/common/domctl.c       Thu Apr 16 08:20:11 2009 +0200
@@ -11,6 +11,7 @@
 #include <xen/lib.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/domain.h>
 #include <xen/event.h>
 #include <xen/domain_page.h>
@@ -138,15 +139,18 @@ void getdomaininfo(struct domain *d, str
     info->max_pages         = d->max_pages;
     info->shared_info_frame = mfn_to_gmfn(d, __pa(d->shared_info)>>PAGE_SHIFT);
 
+    info->cpupool = d->cpupool ? d->cpupool->cpupool_id : CPUPOOLID_NONE;
+
     memcpy(info->handle, d->handle, sizeof(xen_domain_handle_t));
 }
 
-static unsigned int default_vcpu0_location(void)
+static unsigned int default_vcpu0_location(struct domain *dom)
 {
     struct domain *d;
     struct vcpu   *v;
     unsigned int   i, cpu, nr_cpus, *cnt;
     cpumask_t      cpu_exclude_map;
+    cpumask_t      online;
 
     /* Do an initial CPU placement. Pick the least-populated CPU. */
     nr_cpus = last_cpu(cpu_possible_map) + 1;
@@ -171,7 +175,8 @@ static unsigned int default_vcpu0_locati
     if ( cpus_weight(cpu_sibling_map[0]) > 1 )
         cpu = next_cpu(cpu, cpu_sibling_map[0]);
     cpu_exclude_map = cpu_sibling_map[0];
-    for_each_online_cpu ( i )
+    online = (dom->cpupool == NULL) ? cpu_online_map : dom->cpupool->cpu_valid;
+    for_each_cpu_mask(i, online)
     {
         if ( cpu_isset(i, cpu_exclude_map) )
             continue;
@@ -366,12 +371,13 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         domid_t        dom;
         static domid_t rover = 0;
         unsigned int domcr_flags;
+        int            pool = 0;
 
         ret = -EINVAL;
         if ( supervisor_mode_kernel ||
              (op->u.createdomain.flags &
              ~(XEN_DOMCTL_CDF_hvm_guest | XEN_DOMCTL_CDF_hap |
-               XEN_DOMCTL_CDF_s3_integrity)) )
+               XEN_DOMCTL_CDF_s3_integrity | XEN_DOMCTL_CDF_pool)) )
             break;
 
         dom = op->domain;
@@ -405,9 +411,11 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
             domcr_flags |= DOMCRF_hap;
         if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_s3_integrity )
             domcr_flags |= DOMCRF_s3_integrity;
+        if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_pool )
+            pool = op->u.createdomain.cpupool;
 
         ret = -ENOMEM;
-        d = domain_create(dom, domcr_flags, op->u.createdomain.ssidref);
+        d = domain_create(dom, pool, domcr_flags, op->u.createdomain.ssidref);
         if ( d == NULL )
             break;
 
@@ -426,6 +434,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
     {
         struct domain *d;
         unsigned int i, max = op->u.max_vcpus.max, cpu;
+        cpumask_t online;
 
         ret = -ESRCH;
         if ( (d = rcu_lock_domain_by_id(op->domain)) == NULL )
@@ -455,14 +464,15 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
             goto maxvcpu_out;
 
         ret = -ENOMEM;
+        online = (d->cpupool == NULL) ? cpu_online_map : d->cpupool->cpu_valid;
         for ( i = 0; i < max; i++ )
         {
             if ( d->vcpu[i] != NULL )
                 continue;
 
             cpu = (i == 0) ?
-                default_vcpu0_location() :
-                cycle_cpu(d->vcpu[i-1]->processor, cpu_online_map);
+                default_vcpu0_location(d) :
+                cycle_cpu(d->vcpu[i-1]->processor, online);
 
             if ( alloc_vcpu(d, i, cpu) == NULL )
                 goto maxvcpu_out;
@@ -890,6 +900,14 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
     }
     break;
 
+    case XEN_DOMCTL_cpupool_op:
+    {
+        ret = cpupool_do_domctl(op);
+        if ( (ret == 0) && copy_to_guest(u_domctl, op, 1) )
+            ret = -EFAULT;
+    }
+    break;
+
     default:
         ret = arch_do_domctl(op, u_domctl);
         break;
diff -r 655dc3bc1d8e xen/common/sched_credit.c
--- a/xen/common/sched_credit.c Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/common/sched_credit.c Thu Apr 16 09:41:15 2009 +0200
@@ -69,11 +69,15 @@
 /*
  * Useful macros
  */
+#define CSCHED_PRIV(_ops)   \
+    ((struct csched_private *)((_ops)->sched_data))
 #define CSCHED_PCPU(_c)     \
     ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
 #define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
 #define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
 #define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
+#define CSCHED_CPUONLINE(_pool)    \
+    (((_pool) == NULL) ? cpupool_free_cpus : (_pool)->cpu_valid)
 
 
 /*
@@ -157,10 +161,12 @@ struct csched_private {
     struct timer  master_ticker;
     unsigned int master;
     cpumask_t idlers;
+    cpumask_t cpus;
     uint32_t weight;
     uint32_t credit;
     int credit_balance;
     uint32_t runq_sort;
+    int ticker_active;
 };
 
 
@@ -168,8 +174,10 @@ struct csched_private {
  * Global variables
  */
 static struct csched_private csched_priv;
+static struct csched_private *csched_priv0 = NULL;
 
 static void csched_tick(void *_cpu);
+static void csched_acct(void *dummy);
 
 static inline int
 __vcpu_on_runq(struct csched_vcpu *svc)
@@ -214,6 +222,7 @@ __runq_tickle(unsigned int cpu, struct c
 {
     struct csched_vcpu * const cur =
         CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
     cpumask_t mask;
 
     ASSERT(cur);
@@ -240,14 +249,14 @@ __runq_tickle(unsigned int cpu, struct c
      */
     if ( cur->pri > CSCHED_PRI_IDLE )
     {
-        if ( cpus_empty(csched_priv.idlers) )
+        if ( cpus_empty(prv->idlers) )
         {
             CSCHED_STAT_CRANK(tickle_idlers_none);
         }
         else
         {
             CSCHED_STAT_CRANK(tickle_idlers_some);
-            cpus_or(mask, mask, csched_priv.idlers);
+            cpus_or(mask, mask, prv->idlers);
             cpus_and(mask, mask, new->vcpu->cpu_affinity);
         }
     }
@@ -257,38 +266,78 @@ __runq_tickle(unsigned int cpu, struct c
         cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
 }
 
-static int
-csched_pcpu_init(int cpu)
+static void
+csched_free_pdata(struct scheduler *ops, void *pcpu, int cpu)
+{
+    struct csched_private *prv = CSCHED_PRIV(ops);
+    struct csched_pcpu *spc = pcpu;
+    unsigned long flags;
+
+    if ( spc == NULL )
+        return;
+
+    spin_lock_irqsave(&prv->lock, flags);
+
+    prv->credit -= CSCHED_CREDITS_PER_ACCT;
+    prv->ncpus--;
+    cpu_clear(cpu, prv->idlers);
+    cpu_clear(cpu, prv->cpus);
+    if ( (prv->master == cpu) && (prv->ncpus > 0) )
+    {
+        prv->master = first_cpu(prv->cpus);
+        migrate_timer(&prv->master_ticker, prv->master);
+    }
+    kill_timer(&spc->ticker);
+    if ( prv->ncpus == 0 )
+        kill_timer(&prv->master_ticker);
+
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    xfree(spc);
+}
+
+static void *
+csched_alloc_pdata(struct scheduler *ops, int cpu)
 {
     struct csched_pcpu *spc;
+    struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
     /* Allocate per-PCPU info */
     spc = xmalloc(struct csched_pcpu);
     if ( spc == NULL )
-        return -1;
-
-    spin_lock_irqsave(&csched_priv.lock, flags);
+        return NULL;
+
+    spin_lock_irqsave(&prv->lock, flags);
 
     /* Initialize/update system-wide config */
-    csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
-    if ( csched_priv.ncpus <= cpu )
-        csched_priv.ncpus = cpu + 1;
-    if ( csched_priv.master >= csched_priv.ncpus )
-        csched_priv.master = cpu;
+    prv->credit += CSCHED_CREDITS_PER_ACCT;
+    prv->ncpus++;
+    cpu_set(cpu, prv->cpus);
+    if ( (prv->ncpus == 1) && (prv != csched_priv0) )
+    {
+        prv->master = cpu;
+        init_timer( &prv->master_ticker, csched_acct, prv, cpu);
+        prv->ticker_active = 2;
+    }
 
     init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
+
+    if ( prv == csched_priv0 )
+        prv->master = first_cpu(prv->cpus);
+
     INIT_LIST_HEAD(&spc->runq);
-    spc->runq_sort_last = csched_priv.runq_sort;
-    per_cpu(schedule_data, cpu).sched_priv = spc;
+    spc->runq_sort_last = prv->runq_sort;
+    if ( per_cpu(schedule_data, cpu).sched_priv == NULL )
+        per_cpu(schedule_data, cpu).sched_priv = spc;
 
     /* Start off idling... */
     BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
-    cpu_set(cpu, csched_priv.idlers);
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
-
-    return 0;
+    cpu_set(cpu, prv->idlers);
+
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    return spc;
 }
 
 #ifndef NDEBUG
@@ -361,17 +410,19 @@ __csched_vcpu_is_migrateable(struct vcpu
 }
 
 static int
-csched_cpu_pick(struct vcpu *vc)
+csched_cpu_pick(struct scheduler *ops, struct vcpu *vc)
 {
     cpumask_t cpus;
     cpumask_t idlers;
+    cpumask_t online;
     int cpu;
 
     /*
      * Pick from online CPUs in VCPU's affinity mask, giving a
      * preference to its current processor if it's in there.
      */
-    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+    online = CSCHED_CPUONLINE(vc->domain->cpupool);
+    cpus_and(cpus, online, vc->cpu_affinity);
     cpu = cpu_isset(vc->processor, cpus)
             ? vc->processor
             : cycle_cpu(vc->processor, cpus);
@@ -389,7 +440,7 @@ csched_cpu_pick(struct vcpu *vc)
      * like run two VCPUs on co-hyperthreads while there are idle cores
      * or sockets.
      */
-    idlers = csched_priv.idlers;
+    idlers = CSCHED_PRIV(ops)->idlers;
     cpu_set(cpu, idlers);
     cpus_and(cpus, cpus, idlers);
     cpu_clear(cpu, cpus);
@@ -433,12 +484,12 @@ csched_cpu_pick(struct vcpu *vc)
 }
 
 static inline void
-__csched_vcpu_acct_start(struct csched_vcpu *svc)
+__csched_vcpu_acct_start(struct csched_private *prv, struct csched_vcpu *svc)
 {
     struct csched_dom * const sdom = svc->sdom;
     unsigned long flags;
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
+    spin_lock_irqsave(&(prv->lock), flags);
 
     if ( list_empty(&svc->active_vcpu_elem) )
     {
@@ -449,16 +500,17 @@ __csched_vcpu_acct_start(struct csched_v
         list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
         if ( list_empty(&sdom->active_sdom_elem) )
         {
-            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
-            csched_priv.weight += sdom->weight;
-        }
-    }
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+            list_add(&sdom->active_sdom_elem, &(prv->active_sdom));
+            prv->weight += sdom->weight;
+        }
+    }
+
+    spin_unlock_irqrestore(&(prv->lock), flags);
 }
 
 static inline void
-__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
+__csched_vcpu_acct_stop_locked(struct csched_private *prv,
+    struct csched_vcpu *svc)
 {
     struct csched_dom * const sdom = svc->sdom;
 
@@ -471,16 +523,17 @@ __csched_vcpu_acct_stop_locked(struct cs
     list_del_init(&svc->active_vcpu_elem);
     if ( list_empty(&sdom->active_vcpu) )
     {
-        BUG_ON( csched_priv.weight < sdom->weight );
+        BUG_ON( prv->weight < sdom->weight );
         list_del_init(&sdom->active_sdom_elem);
-        csched_priv.weight -= sdom->weight;
+        prv->weight -= sdom->weight;
     }
 }
 
 static void
-csched_vcpu_acct(unsigned int cpu)
+csched_vcpu_acct(struct csched_private *prv, unsigned int cpu)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(current);
+    struct scheduler *ops = per_cpu(scheduler, cpu);
 
     ASSERT( current->processor == cpu );
     ASSERT( svc->sdom != NULL );
@@ -508,9 +561,9 @@ csched_vcpu_acct(unsigned int cpu)
      */
     if ( list_empty(&svc->active_vcpu_elem) )
     {
-        __csched_vcpu_acct_start(svc);
-    }
-    else if ( csched_cpu_pick(current) != cpu )
+        __csched_vcpu_acct_start(prv, svc);
+    }
+    else if ( csched_cpu_pick(ops, current) != cpu )
     {
         CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
         CSCHED_STAT_CRANK(migrate_running);
@@ -519,34 +572,54 @@ csched_vcpu_acct(unsigned int cpu)
     }
 }
 
-static int
-csched_vcpu_init(struct vcpu *vc)
-{
-    struct domain * const dom = vc->domain;
-    struct csched_dom *sdom = CSCHED_DOM(dom);
+static void *
+csched_alloc_vdata(struct scheduler *ops, struct vcpu *vc)
+{
     struct csched_vcpu *svc;
-
-    CSCHED_STAT_CRANK(vcpu_init);
 
     /* Allocate per-VCPU info */
     svc = xmalloc(struct csched_vcpu);
     if ( svc == NULL )
-        return -1;
+        return NULL;
 
     INIT_LIST_HEAD(&svc->runq_elem);
     INIT_LIST_HEAD(&svc->active_vcpu_elem);
-    svc->sdom = sdom;
+    svc->sdom = CSCHED_DOM(vc->domain);
     svc->vcpu = vc;
     atomic_set(&svc->credit, 0);
     svc->flags = 0U;
-    svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
+    svc->pri = is_idle_domain(vc->domain) ?
+        CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
     CSCHED_VCPU_STATS_RESET(svc);
+    return svc;
+}
+
+static void
+csched_vcpu_insert(struct scheduler *ops, struct vcpu *vc)
+{
+    struct csched_vcpu *svc = vc->sched_priv;
+
+    if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running )
+        __runq_insert(vc->processor, svc);
+}
+
+static int
+csched_vcpu_init(struct scheduler *ops, struct vcpu *vc)
+{
+    struct csched_vcpu *svc;
+
+    CSCHED_STAT_CRANK(vcpu_init);
+
+    svc = csched_alloc_vdata(ops, vc);
+    if ( svc == NULL )
+        return -1;
+
     vc->sched_priv = svc;
 
     /* Allocate per-PCPU info */
     if ( unlikely(!CSCHED_PCPU(vc->processor)) )
     {
-        if ( csched_pcpu_init(vc->processor) != 0 )
+        if ( csched_alloc_pdata(ops, vc->processor) == NULL )
             return -1;
     }
 
@@ -555,29 +628,41 @@ csched_vcpu_init(struct vcpu *vc)
 }
 
 static void
-csched_vcpu_destroy(struct vcpu *vc)
+csched_free_vdata(struct scheduler *ops, void *priv)
+{
+    struct csched_private *prv = CSCHED_PRIV(ops);
+    struct csched_vcpu *svc = priv;
+    unsigned long flags;
+
+    if ( __vcpu_on_runq(svc) )
+        __runq_remove(svc);
+
+    spin_lock_irqsave(&(prv->lock), flags);
+
+    if ( !list_empty(&svc->active_vcpu_elem) )
+        __csched_vcpu_acct_stop_locked(prv, svc);
+
+    spin_unlock_irqrestore(&(prv->lock), flags);
+
+    xfree(svc);
+}
+
+static void
+csched_vcpu_destroy(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     struct csched_dom * const sdom = svc->sdom;
-    unsigned long flags;
 
     CSCHED_STAT_CRANK(vcpu_destroy);
 
     BUG_ON( sdom == NULL );
     BUG_ON( !list_empty(&svc->runq_elem) );
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
-
-    if ( !list_empty(&svc->active_vcpu_elem) )
-        __csched_vcpu_acct_stop_locked(svc);
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
-
-    xfree(svc);
+    csched_free_vdata(ops, svc);
 }
 
 static void
-csched_vcpu_sleep(struct vcpu *vc)
+csched_vcpu_sleep(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
 
@@ -592,7 +677,7 @@ csched_vcpu_sleep(struct vcpu *vc)
 }
 
 static void
-csched_vcpu_wake(struct vcpu *vc)
+csched_vcpu_wake(struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
     const unsigned int cpu = vc->processor;
@@ -648,10 +733,11 @@ csched_vcpu_wake(struct vcpu *vc)
 
 static int
 csched_dom_cntl(
-    struct domain *d,
+    struct scheduler *ops, struct domain *d,
     struct xen_domctl_scheduler_op *op)
 {
     struct csched_dom * const sdom = CSCHED_DOM(d);
+    struct csched_private *prv = CSCHED_PRIV(ops);
     unsigned long flags;
 
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
@@ -663,14 +749,14 @@ csched_dom_cntl(
     {
         ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
 
-        spin_lock_irqsave(&csched_priv.lock, flags);
+        spin_lock_irqsave(&(prv->lock), flags);
 
         if ( op->u.credit.weight != 0 )
         {
             if ( !list_empty(&sdom->active_sdom_elem) )
             {
-                csched_priv.weight -= sdom->weight;
-                csched_priv.weight += op->u.credit.weight;
+                prv->weight -= sdom->weight;
+                prv->weight += op->u.credit.weight;
             }
             sdom->weight = op->u.credit.weight;
         }
@@ -678,14 +764,14 @@ csched_dom_cntl(
         if ( op->u.credit.cap != (uint16_t)~0U )
             sdom->cap = op->u.credit.cap;
 
-        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        spin_unlock_irqrestore(&(prv->lock), flags);
     }
 
     return 0;
 }
 
 static int
-csched_dom_init(struct domain *dom)
+csched_dom_init(struct scheduler *ops, struct domain *dom)
 {
     struct csched_dom *sdom;
 
@@ -711,7 +797,7 @@ csched_dom_init(struct domain *dom)
 }
 
 static void
-csched_dom_destroy(struct domain *dom)
+csched_dom_destroy(struct scheduler *ops, struct domain *dom)
 {
     CSCHED_STAT_CRANK(dom_destroy);
     xfree(CSCHED_DOM(dom));
@@ -725,7 +811,7 @@ csched_dom_destroy(struct domain *dom)
  * remember the last UNDER to make the move up operation O(1).
  */
 static void
-csched_runq_sort(unsigned int cpu)
+csched_runq_sort(struct csched_private *prv, unsigned int cpu)
 {
     struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
     struct list_head *runq, *elem, *next, *last_under;
@@ -733,7 +819,7 @@ csched_runq_sort(unsigned int cpu)
     unsigned long flags;
     int sort_epoch;
 
-    sort_epoch = csched_priv.runq_sort;
+    sort_epoch = prv->runq_sort;
     if ( sort_epoch == spc->runq_sort_last )
         return;
 
@@ -768,8 +854,9 @@ csched_runq_sort(unsigned int cpu)
 }
 
 static void
-csched_acct(void* dummy)
-{
+csched_acct(void *dummy)
+{
+    struct csched_private *prv = dummy;
     unsigned long flags;
     struct list_head *iter_vcpu, *next_vcpu;
     struct list_head *iter_sdom, *next_sdom;
@@ -786,22 +873,22 @@ csched_acct(void* dummy)
     int credit;
 
 
-    spin_lock_irqsave(&csched_priv.lock, flags);
-
-    weight_total = csched_priv.weight;
-    credit_total = csched_priv.credit;
+    spin_lock_irqsave(&(prv->lock), flags);
+
+    weight_total = prv->weight;
+    credit_total = prv->credit;
 
     /* Converge balance towards 0 when it drops negative */
-    if ( csched_priv.credit_balance < 0 )
-    {
-        credit_total -= csched_priv.credit_balance;
+    if ( prv->credit_balance < 0 )
+    {
+        credit_total -= prv->credit_balance;
         CSCHED_STAT_CRANK(acct_balance);
     }
 
     if ( unlikely(weight_total == 0) )
     {
-        csched_priv.credit_balance = 0;
-        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        prv->credit_balance = 0;
+        spin_unlock_irqrestore(&(prv->lock), flags);
         CSCHED_STAT_CRANK(acct_no_work);
         goto out;
     }
@@ -813,7 +900,7 @@ csched_acct(void* dummy)
     credit_xtra = 0;
     credit_cap = 0U;
 
-    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
+    list_for_each_safe( iter_sdom, next_sdom, &(prv->active_sdom) )
     {
         sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
 
@@ -833,9 +920,9 @@ csched_acct(void* dummy)
          * only when the system-wide credit balance is negative.
          */
         credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
-        if ( csched_priv.credit_balance < 0 )
-        {
-            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
+        if ( prv->credit_balance < 0 )
+        {
+            credit_peak += ( ( -prv->credit_balance * sdom->weight) +
                              (weight_total - 1)
                            ) / weight_total;
         }
@@ -877,7 +964,7 @@ csched_acct(void* dummy)
                  */
                 CSCHED_STAT_CRANK(acct_reorder);
                 list_del(&sdom->active_sdom_elem);
-                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+                list_add(&sdom->active_sdom_elem, &(prv->active_sdom));
             }
 
             credit_fair = credit_peak;
@@ -943,7 +1030,7 @@ csched_acct(void* dummy)
                 /* Upper bound on credits means VCPU stops earning */
                 if ( credit > CSCHED_CREDITS_PER_TSLICE )
                 {
-                    __csched_vcpu_acct_stop_locked(svc);
+                    __csched_vcpu_acct_stop_locked(prv, svc);
                     credit = 0;
                     atomic_set(&svc->credit, credit);
                 }
@@ -955,15 +1042,15 @@ csched_acct(void* dummy)
         }
     }
 
-    csched_priv.credit_balance = credit_balance;
-
-    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    prv->credit_balance = credit_balance;
+
+    spin_unlock_irqrestore(&(prv->lock), flags);
 
     /* Inform each CPU that its runq needs to be sorted */
-    csched_priv.runq_sort++;
+    prv->runq_sort++;
 
 out:
-    set_timer( &csched_priv.master_ticker, NOW() +
+    set_timer( &(prv->master_ticker), NOW() +
             MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
 }
 
@@ -972,6 +1059,7 @@ csched_tick(void *_cpu)
 {
     unsigned int cpu = (unsigned long)_cpu;
     struct csched_pcpu *spc = CSCHED_PCPU(cpu);
+    struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
 
     spc->tick++;
 
@@ -979,7 +1067,7 @@ csched_tick(void *_cpu)
      * Accounting for running VCPU
      */
     if ( !is_idle_vcpu(current) )
-        csched_vcpu_acct(cpu);
+        csched_vcpu_acct(prv, cpu);
 
     /*
      * Check if runq needs to be sorted
@@ -988,7 +1076,7 @@ csched_tick(void *_cpu)
      * modified priorities. This is a special O(n) sort and runs at most
      * once per accounting period (currently 30 milliseconds).
      */
-    csched_runq_sort(cpu);
+    csched_runq_sort(prv, cpu);
 
     set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
 }
@@ -1040,10 +1128,12 @@ csched_runq_steal(int peer_cpu, int cpu,
 }
 
 static struct csched_vcpu *
-csched_load_balance(int cpu, struct csched_vcpu *snext)
+csched_load_balance(struct csched_private *prv, int cpu,
+    struct csched_vcpu *snext)
 {
     struct csched_vcpu *speer;
     cpumask_t workers;
+    cpumask_t online;
     int peer_cpu;
 
     BUG_ON( cpu != snext->vcpu->processor );
@@ -1063,7 +1153,8 @@ csched_load_balance(int cpu, struct csch
      * Peek at non-idling CPUs in the system, starting with our
      * immediate neighbour.
      */
-    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
+    online = CSCHED_CPUONLINE(per_cpu(cpupool, cpu));
+    cpus_andnot(workers, online, prv->idlers);
     cpu_clear(cpu, workers);
     peer_cpu = cpu;
 
@@ -1105,16 +1196,39 @@ csched_load_balance(int cpu, struct csch
  * fast for the common case.
  */
 static struct task_slice
-csched_schedule(s_time_t now)
+csched_schedule(struct scheduler *ops, s_time_t now)
 {
     const int cpu = smp_processor_id();
     struct list_head * const runq = RUNQ(cpu);
     struct csched_vcpu * const scurr = CSCHED_VCPU(current);
+    struct csched_private *prv = CSCHED_PRIV(ops);
     struct csched_vcpu *snext;
     struct task_slice ret;
 
     CSCHED_STAT_CRANK(schedule);
     CSCHED_VCPU_CHECK(current);
+
+    if ( unlikely(!cpu_isset(cpu, CSCHED_CPUONLINE(per_cpu(cpupool, cpu)))) )
+    {
+        struct list_head * iter;
+
+        snext = scurr;
+        if (is_idle_vcpu(current))
+            goto out;
+
+        if ( vcpu_runnable(current) )
+            __runq_insert(cpu, scurr);
+
+        list_for_each(iter, runq)
+        {
+            snext = __runq_elem(iter);
+            if ( snext->pri == CSCHED_PRI_IDLE )
+                break;
+        }
+        BUG_ON( snext->pri != CSCHED_PRI_IDLE );
+        __runq_remove(snext);
+        goto out;
+    }
 
     /*
      * Select next runnable local VCPU (ie top of local runq)
@@ -1137,20 +1251,21 @@ csched_schedule(s_time_t now)
     if ( snext->pri > CSCHED_PRI_TS_OVER )
         __runq_remove(snext);
     else
-        snext = csched_load_balance(cpu, snext);
-
+        snext = csched_load_balance(prv, cpu, snext);
+
+out:
     /*
      * Update idlers mask if necessary. When we're idling, other CPUs
      * will tickle us when they get extra work.
      */
     if ( snext->pri == CSCHED_PRI_IDLE )
     {
-        if ( !cpu_isset(cpu, csched_priv.idlers) )
-            cpu_set(cpu, csched_priv.idlers);
-    }
-    else if ( cpu_isset(cpu, csched_priv.idlers) )
-    {
-        cpu_clear(cpu, csched_priv.idlers);
+        if ( !cpu_isset(cpu, prv->idlers) )
+            cpu_set(cpu, prv->idlers);
+    }
+    else if ( cpu_isset(cpu, prv->idlers) )
+    {
+        cpu_clear(cpu, prv->idlers);
     }
 
     /*
@@ -1194,7 +1309,7 @@ csched_dump_vcpu(struct csched_vcpu *svc
 }
 
 static void
-csched_dump_pcpu(int cpu)
+csched_dump_pcpu(struct scheduler *ops, int cpu)
 {
     struct list_head *runq, *iter;
     struct csched_pcpu *spc;
@@ -1231,9 +1346,10 @@ csched_dump_pcpu(int cpu)
 }
 
 static void
-csched_dump(void)
+csched_dump(struct scheduler *ops)
 {
     struct list_head *iter_sdom, *iter_svc;
+    struct csched_private *prv = CSCHED_PRIV(ops);
     int loop;
     char idlers_buf[100];
 
@@ -1250,12 +1366,12 @@ csched_dump(void)
            "\tticks per tslice   = %d\n"
            "\tticks per acct     = %d\n"
            "\tmigration delay    = %uus\n",
-           csched_priv.ncpus,
-           csched_priv.master,
-           csched_priv.credit,
-           csched_priv.credit_balance,
-           csched_priv.weight,
-           csched_priv.runq_sort,
+           prv->ncpus,
+           prv->master,
+           prv->credit,
+           prv->credit_balance,
+           prv->weight,
+           prv->runq_sort,
            CSCHED_DEFAULT_WEIGHT,
            CSCHED_MSECS_PER_TICK,
            CSCHED_CREDITS_PER_TICK,
@@ -1263,12 +1379,12 @@ csched_dump(void)
            CSCHED_TICKS_PER_ACCT,
            vcpu_migration_delay);
 
-    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
+    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers);
     printk("idlers: %s\n", idlers_buf);
 
     printk("active vcpus:\n");
     loop = 0;
-    list_for_each( iter_sdom, &csched_priv.active_sdom )
+    list_for_each( iter_sdom, &(prv->active_sdom) )
     {
         struct csched_dom *sdom;
         sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
@@ -1284,18 +1400,29 @@ csched_dump(void)
     }
 }
 
-static void
-csched_init(void)
-{
-    spin_lock_init(&csched_priv.lock);
-    INIT_LIST_HEAD(&csched_priv.active_sdom);
-    csched_priv.ncpus = 0;
-    csched_priv.master = UINT_MAX;
-    cpus_clear(csched_priv.idlers);
-    csched_priv.weight = 0U;
-    csched_priv.credit = 0U;
-    csched_priv.credit_balance = 0;
-    csched_priv.runq_sort = 0U;
+static int
+csched_init(struct scheduler *ops)
+{
+    struct csched_private *prv;
+
+    prv = xmalloc(struct csched_private);
+    if ( prv == NULL )
+        return 1;
+    if (csched_priv0 == NULL)
+        csched_priv0 = prv;
+    ops->sched_data = prv;
+    spin_lock_init(&(prv->lock));
+    INIT_LIST_HEAD(&(prv->active_sdom));
+    prv->ncpus = 0;
+    prv->master = UINT_MAX;
+    cpus_clear(prv->idlers);
+    prv->weight = 0U;
+    prv->credit = 0U;
+    prv->credit_balance = 0;
+    prv->runq_sort = 0U;
+    prv->ticker_active = (csched_priv0 == prv) ? 0 : 1;
+
+    return 0;
 }
 
 /* Tickers cannot be kicked until SMP subsystem is alive. */
@@ -1305,8 +1432,10 @@ static __init int csched_start_tickers(v
     unsigned int cpu;
 
     /* Is the credit scheduler initialised? */
-    if ( csched_priv.ncpus == 0 )
+    if ( (csched_priv0 == NULL) || (csched_priv0->ncpus == 0) )
         return 0;
+
+    csched_priv0->ticker_active = 1;
 
     for_each_online_cpu ( cpu )
     {
@@ -1314,45 +1443,70 @@ static __init int csched_start_tickers(v
         set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
     }
 
-    init_timer( &csched_priv.master_ticker, csched_acct, NULL,
-                    csched_priv.master);
-
-    set_timer( &csched_priv.master_ticker, NOW() +
+    init_timer( &(csched_priv0->master_ticker), csched_acct, csched_priv0,
+                    csched_priv0->master);
+
+    set_timer( &(csched_priv0->master_ticker), NOW() +
             MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
 
     return 0;
 }
 __initcall(csched_start_tickers);
 
-static void csched_tick_suspend(void)
+static void
+csched_deinit(struct scheduler *ops)
+{
+    struct csched_private *prv;
+
+    prv = CSCHED_PRIV(ops);
+    if ( prv != NULL )
+        xfree(prv);
+}
+
+static void csched_tick_suspend(struct scheduler *ops, unsigned int cpu)
 {
     struct csched_pcpu *spc;
 
-    spc = CSCHED_PCPU(smp_processor_id());
+    spc = CSCHED_PCPU(cpu);
 
     stop_timer(&spc->ticker);
 }
 
-static void csched_tick_resume(void)
+static void csched_tick_resume(struct scheduler *ops, unsigned int cpu)
 {
     struct csched_pcpu *spc;
     uint64_t now = NOW();
-
-    spc = CSCHED_PCPU(smp_processor_id());
+    struct csched_private *prv;
+
+    prv = CSCHED_PRIV(ops);
+    if ( !prv->ticker_active )
+        return;
+
+    spc = CSCHED_PCPU(cpu);
 
     set_timer(&spc->ticker, now + MILLISECS(CSCHED_MSECS_PER_TICK)
             - now % MILLISECS(CSCHED_MSECS_PER_TICK) );
+
+    if ( (prv->ticker_active == 2) && (prv->master == cpu) )
+    {
+        set_timer( &prv->master_ticker, now +
+            MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT -
+            now % (MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT));
+        prv->ticker_active = 1;
+    }
 }
 
 struct scheduler sched_credit_def = {
     .name           = "SMP Credit Scheduler",
     .opt_name       = "credit",
     .sched_id       = XEN_SCHEDULER_CREDIT,
+    .sched_data     = &csched_priv,
 
     .init_domain    = csched_dom_init,
     .destroy_domain = csched_dom_destroy,
 
     .init_vcpu      = csched_vcpu_init,
+    .insert_vcpu    = csched_vcpu_insert,
     .destroy_vcpu   = csched_vcpu_destroy,
 
     .sleep          = csched_vcpu_sleep,
@@ -1366,6 +1520,11 @@ struct scheduler sched_credit_def = {
     .dump_cpu_state = csched_dump_pcpu,
     .dump_settings  = csched_dump,
     .init           = csched_init,
+    .deinit         = csched_deinit,
+    .alloc_vdata    = csched_alloc_vdata,
+    .free_vdata     = csched_free_vdata,
+    .alloc_pdata    = csched_alloc_pdata,
+    .free_pdata     = csched_free_pdata,
 
     .tick_suspend   = csched_tick_suspend,
     .tick_resume    = csched_tick_resume,
diff -r 655dc3bc1d8e xen/common/sched_sedf.c
--- a/xen/common/sched_sedf.c   Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/common/sched_sedf.c   Thu Apr 09 14:54:22 2009 +0200
@@ -20,6 +20,9 @@
         if ( (_f) <= SEDFLEVEL )                \
             printk(_a );                        \
     } while ( 0 )
+
+#define SEDF_CPUONLINE(_pool)                                             \
+    (((_pool) == NULL) ? cpupool_free_cpus : (_pool)->cpu_valid)
 
 #ifndef NDEBUG
 #define SEDF_STATS
@@ -132,7 +135,7 @@ struct sedf_cpu_info {
 #define sedf_runnable(edom)  (!(EDOM_INFO(edom)->status & SEDF_ASLEEP))
 
 
-static void sedf_dump_cpu_state(int i);
+static void sedf_dump_cpu_state(struct scheduler *ops, int i);
 
 static inline int extraq_on(struct vcpu *d, int i)
 {
@@ -329,30 +332,17 @@ static inline void __add_to_runqueue_sor
 }
 
 
-static int sedf_init_vcpu(struct vcpu *v)
+static void *sedf_alloc_vdata(struct scheduler *ops, struct vcpu *v)
 {
     struct sedf_vcpu_info *inf;
 
-    if ( (v->sched_priv = xmalloc(struct sedf_vcpu_info)) == NULL )
-        return -1;
-    memset(v->sched_priv, 0, sizeof(struct sedf_vcpu_info));
-
-    inf = EDOM_INFO(v);
+    inf = xmalloc(struct sedf_vcpu_info);
+    if ( inf == NULL )
+        return NULL;
+
+    memset(inf, 0, sizeof(struct sedf_vcpu_info));
     inf->vcpu = v;
- 
-    /* Allocate per-CPU context if this is the first domain to be added. */
-    if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) )
-    {
-        per_cpu(schedule_data, v->processor).sched_priv = 
-            xmalloc(struct sedf_cpu_info);
-        BUG_ON(per_cpu(schedule_data, v->processor).sched_priv == NULL);
-        memset(CPU_INFO(v->processor), 0, sizeof(*CPU_INFO(v->processor)));
-        INIT_LIST_HEAD(WAITQ(v->processor));
-        INIT_LIST_HEAD(RUNQ(v->processor));
-        INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_PEN_Q));
-        INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_UTIL_Q));
-    }
-       
+
     /* Every VCPU gets an equal share of extratime by default. */
     inf->deadl_abs   = 0;
     inf->latency     = 0;
@@ -383,19 +373,69 @@ static int sedf_init_vcpu(struct vcpu *v
     }
     else
     {
-        EDOM_INFO(v)->deadl_abs = 0;
-        EDOM_INFO(v)->status &= ~SEDF_ASLEEP;
-    }
-
+        inf->deadl_abs = 0;
+        inf->status &= ~SEDF_ASLEEP;
+    }
+
+    return inf;
+}
+
+static void *
+sedf_alloc_pdata(struct scheduler *ops, int cpu)
+{
+    struct sedf_cpu_info *spc;
+
+    spc = xmalloc(struct sedf_cpu_info);
+    BUG_ON(spc == NULL);
+    memset(spc, 0, sizeof(*spc));
+    INIT_LIST_HEAD(&spc->waitq);
+    INIT_LIST_HEAD(&spc->runnableq);
+    INIT_LIST_HEAD(&spc->extraq[EXTRA_PEN_Q]);
+    INIT_LIST_HEAD(&spc->extraq[EXTRA_UTIL_Q]);
+
+    return (void *)spc;
+}
+
+static void
+sedf_free_pdata(struct scheduler *ops, void *spc, int cpu)
+{
+    if ( spc == NULL )
+        return;
+
+    xfree(spc);
+}
+
+static int sedf_init_vcpu(struct scheduler *ops, struct vcpu *v)
+{
+    struct sedf_vcpu_info *inf;
+
+    /* Allocate per-CPU context if this is the first domain to be added. */
+    if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) )
+    {
+        per_cpu(schedule_data, v->processor).sched_priv = 
+            sedf_alloc_pdata(ops, v->processor);
+    }
+       
+    inf = sedf_alloc_vdata(ops, v);
+    if ( inf == NULL )
+        return -1;
+
+    v->sched_priv = inf;
+ 
     return 0;
 }
 
-static void sedf_destroy_vcpu(struct vcpu *v)
-{
-    xfree(v->sched_priv);
-}
-
-static int sedf_init_domain(struct domain *d)
+static void sedf_free_vdata(struct scheduler *ops, void *priv)
+{
+    xfree(priv);
+}
+
+static void sedf_destroy_vcpu(struct scheduler *ops, struct vcpu *v)
+{
+    sedf_free_vdata(ops, v->sched_priv);
+}
+
+static int sedf_init_domain(struct scheduler *ops, struct domain *d)
 {
     d->sched_priv = xmalloc(struct sedf_dom_info);
     if ( d->sched_priv == NULL )
@@ -406,16 +446,18 @@ static int sedf_init_domain(struct domai
     return 0;
 }
 
-static void sedf_destroy_domain(struct domain *d)
+static void sedf_destroy_domain(struct scheduler *ops, struct domain *d)
 {
     xfree(d->sched_priv);
 }
 
-static int sedf_pick_cpu(struct vcpu *v)
+static int sedf_pick_cpu(struct scheduler *ops, struct vcpu *v)
 {
     cpumask_t online_affinity;
-
-    cpus_and(online_affinity, v->cpu_affinity, cpu_online_map);
+    cpumask_t online;
+
+    online = SEDF_CPUONLINE(v->domain->cpupool);
+    cpus_and(online_affinity, v->cpu_affinity, online);
     return first_cpu(online_affinity);
 }
 
@@ -751,7 +793,7 @@ static struct task_slice sedf_do_extra_s
    -timeslice for the current period used up
    -domain on waitqueue has started it's period
    -and various others ;) in general: determine which domain to run next*/
-static struct task_slice sedf_do_schedule(s_time_t now)
+static struct task_slice sedf_do_schedule(struct scheduler *ops, s_time_t now)
 {
     int                   cpu      = smp_processor_id();
     struct list_head     *runq     = RUNQ(cpu);
@@ -786,6 +828,13 @@ static struct task_slice sedf_do_schedul
     }
  check_waitq:
     update_queues(now, runq, waitq);
+
+    if ( unlikely(!cpu_isset(cpu, SEDF_CPUONLINE(per_cpu(cpupool, cpu)))) )
+    {
+        ret.task = IDLETASK(cpu);
+        ret.time = SECONDS(1);
+        goto sched_done;
+    }
  
     /*now simply pick the first domain from the runqueue, which has the
       earliest deadline, because the list is sorted*/
@@ -848,7 +897,7 @@ static struct task_slice sedf_do_schedul
 }
 
 
-static void sedf_sleep(struct vcpu *d)
+static void sedf_sleep(struct scheduler *ops, struct vcpu *d)
 {
     PRINT(2,"sedf_sleep was called, domain-id %i.%i\n",
           d->domain->domain_id, d->vcpu_id);
@@ -1067,7 +1116,7 @@ static inline int should_switch(struct v
     return 1;
 }
 
-static void sedf_wake(struct vcpu *d)
+static void sedf_wake(struct scheduler *ops, struct vcpu *d)
 {
     s_time_t              now = NOW();
     struct sedf_vcpu_info* inf = EDOM_INFO(d);
@@ -1220,8 +1269,8 @@ static void sedf_dump_domain(struct vcpu
 }
 
 
-/* dumps all domains on hte specified cpu */
-static void sedf_dump_cpu_state(int i)
+/* dumps all domains on the specified cpu */
+static void sedf_dump_cpu_state(struct scheduler *ops, int i)
 {
     struct list_head      *list, *queue, *tmp;
     struct sedf_vcpu_info *d_inf;
@@ -1294,7 +1343,7 @@ static void sedf_dump_cpu_state(int i)
 
 
 /* Adjusts periods and slices of the domains accordingly to their weights. */
-static int sedf_adjust_weights(struct xen_domctl_scheduler_op *cmd)
+static int sedf_adjust_weights(struct cpupool *c, struct xen_domctl_scheduler_op *cmd)
 {
     struct vcpu *p;
     struct domain      *d;
@@ -1315,6 +1364,8 @@ static int sedf_adjust_weights(struct xe
     rcu_read_lock(&domlist_read_lock);
     for_each_domain( d )
     {
+        if ( c != d->cpupool )
+           continue;
         for_each_vcpu( d, p )
         {
             if ( EDOM_INFO(p)->weight )
@@ -1366,7 +1417,7 @@ static int sedf_adjust_weights(struct xe
 
 
 /* set or fetch domain scheduling parameters */
-static int sedf_adjust(struct domain *p, struct xen_domctl_scheduler_op *op)
+static int sedf_adjust(struct scheduler *ops, struct domain *p, struct xen_domctl_scheduler_op *op)
 {
     struct vcpu *v;
     int rc;
@@ -1425,7 +1476,7 @@ static int sedf_adjust(struct domain *p,
             }
         }
 
-        rc = sedf_adjust_weights(op);
+        rc = sedf_adjust_weights(p->cpupool, op);
         if ( rc )
             return rc;
 
@@ -1463,6 +1514,11 @@ struct scheduler sched_sedf_def = {
 
     .init_vcpu      = sedf_init_vcpu,
     .destroy_vcpu   = sedf_destroy_vcpu,
+
+    .alloc_vdata    = sedf_alloc_vdata,
+    .free_vdata     = sedf_free_vdata,
+    .alloc_pdata    = sedf_alloc_pdata,
+    .free_pdata     = sedf_free_pdata,
 
     .do_schedule    = sedf_do_schedule,
     .pick_cpu       = sedf_pick_cpu,
diff -r 655dc3bc1d8e xen/common/schedule.c
--- a/xen/common/schedule.c     Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/common/schedule.c     Thu Apr 16 09:18:40 2009 +0200
@@ -55,6 +55,7 @@ static void poll_timer_fn(void *data);
 
 /* This is global for now so that private implementations can reach it */
 DEFINE_PER_CPU(struct schedule_data, schedule_data);
+DEFINE_PER_CPU(struct scheduler *, scheduler);
 
 extern struct scheduler sched_sedf_def;
 extern struct scheduler sched_credit_def;
@@ -66,9 +67,15 @@ static struct scheduler *schedulers[] = 
 
 static struct scheduler ops;
 
-#define SCHED_OP(fn, ...)                                 \
-         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
-          : (typeof(ops.fn(__VA_ARGS__)))0 )
+#define SCHED_OP(opsptr, fn, ...)                                          \
+         (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ )  \
+          : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
+
+#define DOM2OP(_d)    (((_d)->cpupool == NULL) ? &ops : &((_d)->cpupool->sched))
+#define VCPU2OP(_v)   (DOM2OP((_v)->domain))
+#define VCPU2ONLINE(_v)                                                    \
+         (((_v)->domain->cpupool == NULL) ? cpu_online_map                 \
+         : (_v)->domain->cpupool->cpu_valid)
 
 static inline void trace_runstate_change(struct vcpu *v, int new_state)
 {
@@ -182,7 +189,13 @@ int sched_init_vcpu(struct vcpu *v, unsi
 
     TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
 
-    return SCHED_OP(init_vcpu, v);
+    if ( SCHED_OP(DOM2OP(d), init_vcpu, v) != 0 )
+        return 1;
+
+    if ( is_idle_domain(d) )
+        per_cpu(schedule_data, v->processor).sched_idlevpriv = v->sched_priv;
+
+    return 0;
 }
 
 void sched_destroy_vcpu(struct vcpu *v)
@@ -190,17 +203,47 @@ void sched_destroy_vcpu(struct vcpu *v)
     kill_timer(&v->periodic_timer);
     kill_timer(&v->singleshot_timer);
     kill_timer(&v->poll_timer);
-    SCHED_OP(destroy_vcpu, v);
+    SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
+}
+
+void sched_move_domain(struct domain *d, struct cpupool *c)
+{
+    struct vcpu *v;
+    unsigned int new_p;
+
+    domain_pause(d);
+
+    new_p = first_cpu(c->cpu_valid);
+    for_each_vcpu ( d, v )
+    {
+        migrate_timer(&v->periodic_timer, new_p);
+        migrate_timer(&v->singleshot_timer, new_p);
+        migrate_timer(&v->poll_timer, new_p);
+
+        SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
+
+        cpus_setall(v->cpu_affinity);
+        v->processor = new_p;
+        SCHED_OP(&(c->sched), init_vcpu, v);
+
+        new_p = next_cpu(new_p, c->cpu_valid);
+        if ( new_p == NR_CPUS )
+            new_p = first_cpu(c->cpu_valid);
+    }
+
+    d->cpupool = c;
+
+    domain_unpause(d);
 }
 
 int sched_init_domain(struct domain *d)
 {
-    return SCHED_OP(init_domain, d);
+    return SCHED_OP(DOM2OP(d), init_domain, d);
 }
 
 void sched_destroy_domain(struct domain *d)
 {
-    SCHED_OP(destroy_domain, d);
+    SCHED_OP(DOM2OP(d), destroy_domain, d);
 }
 
 void vcpu_sleep_nosync(struct vcpu *v)
@@ -214,7 +257,7 @@ void vcpu_sleep_nosync(struct vcpu *v)
         if ( v->runstate.state == RUNSTATE_runnable )
             vcpu_runstate_change(v, RUNSTATE_offline, NOW());
 
-        SCHED_OP(sleep, v);
+        SCHED_OP(VCPU2OP(v), sleep, v);
     }
 
     vcpu_schedule_unlock_irqrestore(v, flags);
@@ -242,7 +285,7 @@ void vcpu_wake(struct vcpu *v)
     {
         if ( v->runstate.state >= RUNSTATE_blocked )
             vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
-        SCHED_OP(wake, v);
+        SCHED_OP(VCPU2OP(v), wake, v);
     }
     else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
     {
@@ -297,7 +340,7 @@ static void vcpu_migrate(struct vcpu *v)
 
     /* Switch to new CPU, then unlock old CPU. */
     old_cpu = v->processor;
-    v->processor = SCHED_OP(pick_cpu, v);
+    v->processor = SCHED_OP(VCPU2OP(v), pick_cpu, v);
     spin_unlock_irqrestore(
         &per_cpu(schedule_data, old_cpu).schedule_lock, flags);
 
@@ -326,22 +369,32 @@ void vcpu_force_reschedule(struct vcpu *
 }
 
 /*
- * This function is used by cpu_hotplug code from stop_machine context.
- * Hence we can avoid needing to take the 
+ * This function is used by cpu_hotplug code from stop_machine context
+ * and from cpupools to switch schedulers on a cpu.
  */
-void cpu_disable_scheduler(void)
+int cpu_disable_scheduler(unsigned int cpu, int lock)
 {
     struct domain *d;
     struct vcpu *v;
-    unsigned int cpu = smp_processor_id();
+    struct cpupool *c;
+    int    ret = 0;
+
+    c = per_cpu(cpupool, cpu);
+    if ( c == NULL )
+        return ret;
 
     for_each_domain ( d )
     {
+        if ( (d->cpupool != c) || c->pool_paused )
+            continue;
+
         for_each_vcpu ( d, v )
         {
             if ( is_idle_vcpu(v) )
                 continue;
 
+            if ( lock != 0 )
+                vcpu_schedule_lock_irq(v);
             if ( (cpus_weight(v->cpu_affinity) == 1) &&
                  cpu_isset(cpu, v->cpu_affinity) )
             {
@@ -351,29 +404,49 @@ void cpu_disable_scheduler(void)
             }
 
             /*
-             * Migrate single-shot timers to CPU0. A new cpu will automatically
-             * be chosen when the timer is next re-set.
+             * Migrate single-shot timers to other cpu of same pool. A new cpu
+             * will automatically be chosen when the timer is next re-set.
              */
             if ( v->singleshot_timer.cpu == cpu )
-                migrate_timer(&v->singleshot_timer, 0);
+            {
+                int cpu_mig;
+
+                cpu_mig = first_cpu(c->cpu_valid);
+                if (cpu_mig == cpu)
+                    cpu_mig = next_cpu(cpu_mig, c->cpu_valid);
+                migrate_timer(&v->singleshot_timer, cpu_mig);
+            }
 
             if ( v->processor == cpu )
             {
                 set_bit(_VPF_migrating, &v->pause_flags);
+                if ( lock != 0 )
+                    vcpu_schedule_unlock_irq(v);
                 vcpu_sleep_nosync(v);
                 vcpu_migrate(v);
             }
+            else if ( lock != 0 )
+                vcpu_schedule_unlock_irq(v);
+            /*
+             * A vcpu active in the hypervisor will not be migratable.
+             * The caller should try again after releasing and reacquiring
+             * all locks.
+             */
+            if ( v->processor == cpu )
+                ret = -EAGAIN;
         }
     }
+    return ret;
 }
 
 static int __vcpu_set_affinity(
     struct vcpu *v, cpumask_t *affinity,
     bool_t old_lock_status, bool_t new_lock_status)
 {
-    cpumask_t online_affinity, old_affinity;
-
-    cpus_and(online_affinity, *affinity, cpu_online_map);
+    cpumask_t online, online_affinity, old_affinity;
+
+    online = VCPU2ONLINE(v);
+    cpus_and(online_affinity, *affinity, online);
     if ( cpus_empty(online_affinity) )
         return -EINVAL;
 
@@ -424,12 +497,13 @@ int vcpu_locked_change_affinity(struct v
 
 void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity)
 {
-    cpumask_t online_affinity;
+    cpumask_t online, online_affinity;
 
     /* Do not fail if no CPU in old affinity mask is online. */
-    cpus_and(online_affinity, *affinity, cpu_online_map);
+    online = VCPU2ONLINE(v);
+    cpus_and(online_affinity, *affinity, online);
     if ( cpus_empty(online_affinity) )
-        *affinity = cpu_online_map;
+        *affinity = VCPU2ONLINE(v);
 
     if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 )
         BUG();
@@ -721,7 +795,7 @@ long sched_adjust(struct domain *d, stru
     struct vcpu *v;
     long ret;
     
-    if ( (op->sched_id != ops.sched_id) ||
+    if ( (op->sched_id != DOM2OP(d)->sched_id) ||
          ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
           (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
         return -EINVAL;
@@ -748,7 +822,7 @@ long sched_adjust(struct domain *d, stru
     if ( d == current->domain )
         vcpu_schedule_lock_irq(current);
 
-    if ( (ret = SCHED_OP(adjust, d, op)) == 0 )
+    if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 )
         TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
 
     if ( d == current->domain )
@@ -796,6 +870,7 @@ static void schedule(void)
 {
     struct vcpu          *prev = current, *next = NULL;
     s_time_t              now = NOW();
+    struct scheduler     *sched = this_cpu(scheduler);
     struct schedule_data *sd;
     struct task_slice     next_slice;
 
@@ -811,7 +886,7 @@ static void schedule(void)
     stop_timer(&sd->s_timer);
     
     /* get policy-specific decision on scheduling... */
-    next_slice = ops.do_schedule(now);
+    next_slice = sched->do_schedule(sched, now);
 
     next = next_slice.task;
 
@@ -911,18 +986,25 @@ static void poll_timer_fn(void *data)
         vcpu_unblock(v);
 }
 
+/* Get scheduler by id */
+struct scheduler *scheduler_get_by_id(unsigned int id)
+{
+    int i;
+
+    for ( i = 0; schedulers[i] != NULL; i++ )
+    {
+        if ( schedulers[i]->sched_id == id )
+            return schedulers[i];
+    }
+    return NULL;
+}
+
 /* Initialise the data structures. */
 void __init scheduler_init(void)
 {
     int i;
 
     open_softirq(SCHEDULE_SOFTIRQ, schedule);
-
-    for_each_cpu ( i )
-    {
-        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
-        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
-    }
 
     for ( i = 0; schedulers[i] != NULL; i++ )
     {
@@ -934,43 +1016,121 @@ void __init scheduler_init(void)
     if ( schedulers[i] == NULL )
         printk("Could not find scheduler: %s\n", opt_sched);
 
+    for_each_cpu ( i )
+    {
+        per_cpu(scheduler, i) = &ops;
+        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
+        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
+    }
+
     printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
-    SCHED_OP(init);
-}
-
-void dump_runq(unsigned char key)
-{
-    s_time_t      now = NOW();
-    int           i;
+    if ( SCHED_OP(&ops, init) )
+        panic("scheduler returned error on init\n");
+}
+
+/* switch scheduler on cpu */
+void schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
+{
     unsigned long flags;
-
-    local_irq_save(flags);
-
-    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
-    SCHED_OP(dump_settings);
-    printk("sched_smt_power_savings: %s\n",
-            sched_smt_power_savings? "enabled":"disabled");
-    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
-
-    for_each_online_cpu ( i )
+    struct vcpu *v;
+    void *vpriv = NULL;
+    void *ppriv;
+    void *ppriv_old;
+    struct scheduler *old_ops;
+    struct scheduler *new_ops;
+
+    old_ops = per_cpu(scheduler, cpu);
+    new_ops = (c == NULL) ? &ops : &(c->sched);
+    v = per_cpu(schedule_data, cpu).idle;
+    ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
+    if ( c != NULL )
+        vpriv = SCHED_OP(new_ops, alloc_vdata, v);
+
+    spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+
+    if ( c == NULL )
+    {
+        vpriv = v->sched_priv;
+        v->sched_priv = per_cpu(schedule_data, cpu).sched_idlevpriv;
+    }
+    else
+    {
+        v->sched_priv = vpriv;
+        vpriv = NULL;
+    }
+    SCHED_OP(old_ops, tick_suspend, cpu);
+    per_cpu(scheduler, cpu) = new_ops;
+    ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
+    per_cpu(schedule_data, cpu).sched_priv = ppriv;
+    SCHED_OP(new_ops, tick_resume, cpu);
+    SCHED_OP(new_ops, insert_vcpu, v);
+
+    spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+
+    if ( vpriv != NULL )
+        SCHED_OP(old_ops, free_vdata, vpriv);
+    SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
+}
+
+/* init scheduler global data */
+int schedule_init_global(char *name, struct scheduler *sched)
+{
+    int i;
+    struct scheduler *data;
+
+    data = &ops;
+    for ( i = 0; (schedulers[i] != NULL) && (name != NULL) ; i++ )
+    {
+        if ( strcmp(schedulers[i]->opt_name, name) == 0 )
+        {
+            data = schedulers[i];
+            break;
+        }
+    }
+    memcpy(sched, data, sizeof(*sched));
+    return SCHED_OP(sched, init);
+}
+
+/* deinitialize scheduler global data */
+void schedule_deinit_global(struct scheduler *sched)
+{
+    SCHED_OP(sched, deinit);
+}
+
+void schedule_dump(struct cpupool *c)
+{
+    int               i;
+    struct scheduler *sched;
+    cpumask_t         cpus;
+
+    sched = (c == NULL) ? &ops : &(c->sched);
+    cpus = (c == NULL) ? cpupool_free_cpus : c->cpu_valid;
+    printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
+    SCHED_OP(sched, dump_settings);
+
+    for_each_cpu_mask (i, cpus)
     {
         spin_lock(&per_cpu(schedule_data, i).schedule_lock);
         printk("CPU[%02d] ", i);
-        SCHED_OP(dump_cpu_state, i);
+        SCHED_OP(sched, dump_cpu_state, i);
         spin_unlock(&per_cpu(schedule_data, i).schedule_lock);
     }
-
-    local_irq_restore(flags);
-}
-
-void sched_tick_suspend(void)
-{
-    SCHED_OP(tick_suspend);
-}
-
-void sched_tick_resume(void)
-{
-    SCHED_OP(tick_resume);
+}
+
+void sched_tick_suspend(unsigned int cpu)
+{
+    struct scheduler *sched;
+
+    sched = per_cpu(scheduler, cpu);
+    SCHED_OP(sched, tick_suspend, cpu);
+}
+
+void sched_tick_resume(unsigned int cpu)
+{
+    struct scheduler *sched;
+
+    sched = per_cpu(scheduler, cpu);
+    SCHED_OP(sched, tick_resume, cpu);
 }
 
 #ifdef CONFIG_COMPAT
diff -r 655dc3bc1d8e xen/include/public/domctl.h
--- a/xen/include/public/domctl.h       Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/include/public/domctl.h       Thu Apr 09 11:47:18 2009 +0200
@@ -59,7 +59,11 @@ struct xen_domctl_createdomain {
  /* Should domain memory integrity be verifed by tboot during Sx? */
 #define _XEN_DOMCTL_CDF_s3_integrity  2
 #define XEN_DOMCTL_CDF_s3_integrity   (1U<<_XEN_DOMCTL_CDF_s3_integrity)
+ /* cpupool is specified (0 otherwise) */
+#define _XEN_DOMCTL_CDF_pool          3
+#define XEN_DOMCTL_CDF_pool           (1U<<_XEN_DOMCTL_CDF_pool)
     uint32_t flags;
+    uint32_t cpupool;
 };
 typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
@@ -109,6 +113,7 @@ struct xen_domctl_getdomaininfo {
     uint32_t max_vcpu_id;        /* Maximum VCPUID in use by this domain. */
     uint32_t ssidref;
     xen_domain_handle_t handle;
+    uint32_t cpupool;
 };
 typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t);
@@ -645,6 +650,30 @@ typedef struct xen_domctl_hvmcontext_par
     XEN_GUEST_HANDLE_64(uint8) buffer;  /* OUT: buffer to write record into */
 } xen_domctl_hvmcontext_partial_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t);
+
+/*
+ * Move domain to specified cpupool.
+ */
+#define XEN_DOMCTL_cpupool_op  56
+#define XEN_DOMCTL_CPUPOOL_OP_CREATE                1  /* C */
+#define XEN_DOMCTL_CPUPOOL_OP_DESTROY               2  /* D */
+#define XEN_DOMCTL_CPUPOOL_OP_INFO                  3  /* I */
+#define XEN_DOMCTL_CPUPOOL_OP_ADDCPU                4  /* A */
+#define XEN_DOMCTL_CPUPOOL_OP_RMCPU                 5  /* R */
+#define XEN_DOMCTL_CPUPOOL_OP_MOVEDOMAIN            6  /* M */
+#define XEN_DOMCTL_CPUPOOL_OP_FREEINFO              7  /* F */
+#define XEN_DOMCTL_CPUPOOL_PAR_ANY     0xFFFFFFFF
+struct xen_domctl_cpupool_op {
+    uint32_t op;          /* IN */
+    uint32_t cpupool_id;  /* IN: CDIARM OUT: CI */
+    uint32_t sched_id;    /* IN: C      OUT: I  */
+    uint32_t domid;       /* IN: M              */
+    uint32_t cpu;         /* IN: AR             */
+    uint32_t n_dom;       /*            OUT: I  */
+    struct xenctl_cpumap cpumap; /*     OUT: IF */
+};
+typedef struct xen_domctl_cpupool_op xen_domctl_cpupool_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_cpupool_op_t);
 
 
 struct xen_domctl {
@@ -688,6 +717,7 @@ struct xen_domctl {
         struct xen_domctl_set_target        set_target;
         struct xen_domctl_subscribe         subscribe;
         struct xen_domctl_debug_op          debug_op;
+        struct xen_domctl_cpupool_op        cpupool_op;
 #if defined(__i386__) || defined(__x86_64__)
         struct xen_domctl_cpuid             cpuid;
 #endif
diff -r 655dc3bc1d8e xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h        Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/include/xen/sched-if.h        Thu Apr 16 09:16:18 2009 +0200
@@ -10,15 +10,24 @@
 
 #include <xen/percpu.h>
 
+/* A global pointer to the initial cpupool (POOL0). */
+extern struct cpupool *cpupool0;
+
+/* cpus currently in no cpupool */
+extern cpumask_t cpupool_free_cpus;
+
 struct schedule_data {
     spinlock_t          schedule_lock;  /* spinlock protecting curr        */
     struct vcpu        *curr;           /* current task                    */
     struct vcpu        *idle;           /* idle task for this cpu          */
     void               *sched_priv;
+    void               *sched_idlevpriv; /* scheduler data of idle vcpu     */
     struct timer        s_timer;        /* scheduling timer                */
 } __cacheline_aligned;
 
 DECLARE_PER_CPU(struct schedule_data, schedule_data);
+DECLARE_PER_CPU(struct scheduler *, scheduler);
+DECLARE_PER_CPU(struct cpupool *, cpupool);
 
 static inline void vcpu_schedule_lock(struct vcpu *v)
 {
@@ -58,28 +67,50 @@ struct scheduler {
     char *name;             /* full name for this scheduler      */
     char *opt_name;         /* option name for this scheduler    */
     unsigned int sched_id;  /* ID for this scheduler             */
+    void *sched_data;       /* global data pointer               */
 
-    void         (*init)           (void);
+    int          (*init)           (struct scheduler *);
+    void         (*deinit)         (struct scheduler *);
 
-    int          (*init_domain)    (struct domain *);
-    void         (*destroy_domain) (struct domain *);
+    void         (*free_vdata)     (struct scheduler *, void *);
+    void *       (*alloc_vdata)    (struct scheduler *, struct vcpu *);
+    void         (*free_pdata)     (struct scheduler *, void *, int);
+    void *       (*alloc_pdata)    (struct scheduler *, int);
 
-    int          (*init_vcpu)      (struct vcpu *);
-    void         (*destroy_vcpu)   (struct vcpu *);
+    int          (*init_domain)    (struct scheduler *, struct domain *);
+    void         (*destroy_domain) (struct scheduler *, struct domain *);
 
-    void         (*sleep)          (struct vcpu *);
-    void         (*wake)           (struct vcpu *);
+    int          (*init_vcpu)      (struct scheduler *, struct vcpu *);
+    void         (*insert_vcpu)    (struct scheduler *, struct vcpu *);
+    void         (*destroy_vcpu)   (struct scheduler *, struct vcpu *);
 
-    struct task_slice (*do_schedule) (s_time_t);
+    void         (*sleep)          (struct scheduler *, struct vcpu *);
+    void         (*wake)           (struct scheduler *, struct vcpu *);
 
-    int          (*pick_cpu)       (struct vcpu *);
-    int          (*adjust)         (struct domain *,
+    struct task_slice (*do_schedule) (struct scheduler *, s_time_t);
+
+    int          (*pick_cpu)       (struct scheduler *, struct vcpu *);
+    int          (*adjust)         (struct scheduler *, struct domain *,
                                     struct xen_domctl_scheduler_op *);
-    void         (*dump_settings)  (void);
-    void         (*dump_cpu_state) (int);
+    void         (*dump_settings)  (struct scheduler *);
+    void         (*dump_cpu_state) (struct scheduler *, int);
 
-    void         (*tick_suspend)    (void);
-    void         (*tick_resume)     (void);
+    void         (*tick_suspend)   (struct scheduler *, unsigned int);
+    void         (*tick_resume)    (struct scheduler *, unsigned int);
 };
 
+struct cpupool
+{
+    int              cpupool_id;
+    cpumask_t        cpu_valid;      /* all cpus assigned to pool */
+    cpumask_t        cpus_borrowed;  /* cpus borrowed or lent */
+    struct cpupool   *next;
+    unsigned int     n_dom;
+    int              cpu_in_transit; /* used for adding/removing cpus */
+    bool_t           pool_paused;
+    struct scheduler sched;
+};
+
+struct scheduler *scheduler_get_by_id(unsigned int id);
+
 #endif /* __XEN_SCHED_IF_H__ */
diff -r 655dc3bc1d8e xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Thu Apr 16 11:54:06 2009 +0100
+++ b/xen/include/xen/sched.h   Thu Apr 16 09:14:00 2009 +0200
@@ -182,6 +182,7 @@ struct domain
 
     /* Scheduling. */
     void            *sched_priv;    /* scheduler-specific data */
+    struct cpupool  *cpupool;
 
     struct domain   *next_in_list;
     struct domain   *next_in_hashbucket;
@@ -341,7 +342,7 @@ static inline struct domain *get_current
 }
 
 struct domain *domain_create(
-    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
+    domid_t domid, int poolid, unsigned int domcr_flags, ssidref_t ssidref);
  /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
 #define _DOMCRF_hvm           0
 #define DOMCRF_hvm            (1U<<_DOMCRF_hvm)
@@ -426,10 +427,11 @@ void sched_destroy_vcpu(struct vcpu *v);
 void sched_destroy_vcpu(struct vcpu *v);
 int  sched_init_domain(struct domain *d);
 void sched_destroy_domain(struct domain *d);
+void sched_move_domain(struct domain *d, struct cpupool *c);
 long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *);
 int  sched_id(void);
-void sched_tick_suspend(void);
-void sched_tick_resume(void);
+void sched_tick_suspend(unsigned int cpu);
+void sched_tick_resume(unsigned int cpu);
 void vcpu_wake(struct vcpu *d);
 void vcpu_sleep_nosync(struct vcpu *d);
 void vcpu_sleep_sync(struct vcpu *d);
@@ -533,8 +535,13 @@ void domain_unpause_by_systemcontroller(
 void domain_unpause_by_systemcontroller(struct domain *d);
 void cpu_init(void);
 
+struct scheduler;
+
+int schedule_init_global(char *name, struct scheduler *sched);
+void schedule_deinit_global(struct scheduler *sched);
+void schedule_cpu_switch(unsigned int cpu, struct cpupool *c);
 void vcpu_force_reschedule(struct vcpu *v);
-void cpu_disable_scheduler(void);
+int cpu_disable_scheduler(unsigned int cpu, int lock);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
 int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
 int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity);
@@ -560,6 +567,21 @@ extern enum cpufreq_controller {
 extern enum cpufreq_controller {
     FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen
 } cpufreq_controller;
+
+#define CPUPOOLID_NONE    -1
+
+struct cpupool *cpupool_create(int poolid, char *sched);
+int cpupool_destroy(struct cpupool *c);
+int cpupool0_cpu_assign(struct cpupool *c);
+int cpupool_assign_ncpu(struct cpupool *c, int ncpu);
+void cpupool_cpu_add(unsigned int cpu);
+int cpupool_cpu_remove(unsigned int cpu);
+int cpupool_borrow_cpu(struct cpupool *c, unsigned int cpu);
+int cpupool_return_cpu(struct cpupool *c);
+int cpupool_add_domain(struct domain *d, int poolid);
+void cpupool_rm_domain(struct domain *d);
+int cpupool_do_domctl(struct xen_domctl *op);
+#define num_cpupool_cpus(c) (cpus_weight((c)->cpu_valid))
 
 #endif /* __SCHED_H__ */
 
diff -r 655dc3bc1d8e xen/common/cpupool.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/common/cpupool.c      Fri Apr 17 11:01:51 2009 +0200
@@ -0,0 +1,698 @@
+/******************************************************************************
+ * cpupool.c
+ * 
+ * Generic cpupool-handling functions.
+ *
+ * (C) 2009, Juergen Gross, Fujitsu Technology Solutions
+ */
+
+#include <xen/lib.h>
+#include <xen/init.h>
+#include <xen/cpumask.h>
+#include <xen/percpu.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+
+#define PRINTD(args...)    ((void)0)  /* change to printk(args) for debug output */
+
+#define for_each_cpupool(ptr)    \
+    for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next))
+
+struct cpupool *cpupool0;
+cpumask_t cpupool_free_cpus;
+cpumask_t cpupool_free_cpus_borrowed;
+
+static struct cpupool *cpupool_list;     /* linked list, sorted by poolid */
+
+static int cpupool0_max_cpus;
+integer_param("pool0_max_cpus", cpupool0_max_cpus);
+
+static DEFINE_SPINLOCK(cpupool_lock);
+
+DEFINE_PER_CPU(struct cpupool *, cpupool);
+
+static struct cpupool *alloc_cpupool_struct(void)
+{
+    return xmalloc(struct cpupool);
+}
+
+static void free_cpupool_struct(struct cpupool *c)
+{
+    xfree(c);
+}
+
+/*
+ * find a cpupool by its id. to be called with cpupool lock held,
+ * returns NULL if not found.
+ */
+static struct cpupool *cpupool_find_by_id(int id, int exact)
+{
+    struct cpupool **q;
+
+    for_each_cpupool(q)
+    {
+        if ( (*q)->cpupool_id == id )
+            return *q;
+        if ( (*q)->cpupool_id > id )
+            break;
+    }
+    return exact ? NULL : *q;
+}
+
+/*
+ * create a new cpupool with specified poolid
+ * returns pointer to new cpupool structure if okay, NULL else
+ * possible failures:
+ * - no memory
+ * - poolid already used
+ * - unknown scheduler
+ */
+struct cpupool *cpupool_create(int poolid, char *sched)
+{
+    struct cpupool *c;
+    struct cpupool **q;
+    int last = 0;
+
+    if ( (c = alloc_cpupool_struct()) == NULL )
+        return NULL;
+    memset(c, 0, sizeof(*c));
+
+    PRINTD("cpupool_create(%d,%s)\n", poolid, sched);
+    spin_lock(&cpupool_lock);
+    for_each_cpupool(q)
+    {
+        last = (*q)->cpupool_id;
+        if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) )
+            break;
+    }
+    if ( *q != NULL )
+    {
+        if ( (*q)->cpupool_id == poolid )
+        {
+            spin_unlock(&cpupool_lock);
+            free_cpupool_struct(c);
+            return NULL;
+        }
+        c->next = *q;
+    }
+    *q = c;
+    c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid;
+    c->cpu_in_transit = -1;
+    if ( schedule_init_global(sched, &(c->sched)) )
+    {
+        spin_unlock(&cpupool_lock);
+        cpupool_destroy(c);
+        return NULL;
+    }
+    spin_unlock(&cpupool_lock);
+
+    printk("Created cpupool %d with scheduler %s (%s)\n", c->cpupool_id,
+        c->sched.name, c->sched.opt_name);
+
+    return c;
+}
+
+/*
+ * destroys the given cpupool
+ * returns 0 on success, 1 else
+ * possible failures:
+ * - pool still in use
+ * - cpus still assigned to pool
+ * - pool not in list
+ */
+int cpupool_destroy(struct cpupool *c)
+{
+    struct cpupool **q;
+
+    spin_lock(&cpupool_lock);
+    for_each_cpupool(q)
+        if ( *q == c )
+            break;
+    if ( (*q != c) || (c->n_dom != 0) || cpus_weight(c->cpu_valid) )
+    {
+        spin_unlock(&cpupool_lock);
+        return 1;
+    }
+    *q = c->next;
+    spin_unlock(&cpupool_lock);
+    PRINTD("cpupool_destroy(%d)\n", c->cpupool_id);
+    schedule_deinit_global(&(c->sched));
+    free_cpupool_struct(c);
+    return 0;
+}
+
+/*
+ * assign a specific cpu to a cpupool
+ */
+static void cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
+{
+    PRINTD("cpupool_assign_cpu(%d,%d)\n", c->cpupool_id, cpu);
+    per_cpu(cpupool, cpu) = c;
+    schedule_cpu_switch(cpu, c);
+    cpu_clear(cpu, cpupool_free_cpus);
+    cpu_set(cpu, c->cpu_valid);
+    PRINTD("cpupool_assign_cpu(%d,%d) ready\n", c->cpupool_id, cpu);
+}
+
+/*
+ * assign free physical cpus to a cpupool
+ * cpus assigned are unused cpus with lowest possible ids
+ * returns the number of cpus assigned
+ */
+int cpupool_assign_ncpu(struct cpupool *c, int ncpu)
+{
+    int i;
+    int n;
+
+    n = 0;
+    spin_lock(&cpupool_lock);
+    for_each_cpu_mask(i, cpupool_free_cpus)
+    {
+        cpupool_assign_cpu_locked(c, i);
+        n++;
+        if ( n == ncpu )
+            break;
+    }
+    spin_unlock(&cpupool_lock);
+    PRINTD("cpupool_assign_ncpu(%d,%d) rc %d\n", c->cpupool_id, ncpu, n);
+    return n;
+}
+
+static void cpupool_unassign_cpu_locked_1(struct cpupool *c, unsigned int cpu)
+{
+    PRINTD("cpupool_unassign_cpu(%d,%d)\n", c->cpupool_id, cpu);
+    c->cpu_in_transit = cpu;
+}
+
+static int cpupool_unassign_cpu_locked_2(struct cpupool *c)
+{
+    uint64_t to = NOW() + MILLISECS(100);
+    int cpu = c->cpu_in_transit;
+    int ret;
+
+    cpu_clear(cpu, c->cpu_valid);
+    while ( ((ret = cpu_disable_scheduler(cpu, 1)) != 0) && (NOW() < to) );
+    if ( ret )
+    {
+        cpu_set(cpu, c->cpu_valid);
+        c->cpu_in_transit = -1;
+    }
+    else
+    {
+        c->cpu_in_transit = -1;
+        cpu_set(cpu, cpupool_free_cpus);
+        schedule_cpu_switch(cpu, NULL);
+        per_cpu(cpupool, cpu) = NULL;
+    }
+    PRINTD("cpupool_unassign_cpu(%d,%d) ret %d\n", c->cpupool_id, cpu, ret);
+    return ret;
+}
+
+static long cpupool_unassign_cpu_helper(void *info)
+{
+    struct cpupool *c = (struct cpupool *)info;
+    long ret;
+
+    ret = cpupool_unassign_cpu_locked_2(c);
+    spin_unlock(&cpupool_lock);
+    return ret;
+}
+
+static int cpupool_unassign_cpu_locked(struct cpupool *c, unsigned int cpu)
+{
+    cpupool_unassign_cpu_locked_1(c, cpu);
+    return cpupool_unassign_cpu_locked_2(c);
+}
+
+/*
+ * unassign a specific cpu from a cpupool
+ * possible failures:
+ * - last cpu and still domains in cpupool
+ */
+int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu)
+{
+    int work_cpu;
+
+    spin_lock(&cpupool_lock);
+    if ( !cpu_isset(cpu, c->cpu_valid) )
+    {
+        spin_unlock(&cpupool_lock);
+        return 0;
+    }
+    if ( (c->n_dom > 0) && (cpus_weight(c->cpu_valid) == 1) )
+    {
+        spin_unlock(&cpupool_lock);
+        return -EBUSY;
+    }
+    cpupool_unassign_cpu_locked_1(c, cpu);
+    work_cpu = smp_processor_id();
+    if ( work_cpu == cpu )
+    {
+        work_cpu = first_cpu(cpupool0->cpu_valid);
+        if ( work_cpu == cpu )
+            work_cpu = next_cpu(cpu, cpupool0->cpu_valid);
+    }
+    return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c);
+}
+
+/*
+ * borrow cpu from another cpupool
+ * cpu might be free or already in the correct pool
+ * if cpu is taken from other pool, all domains in this pool will be paused
+ * rc == 0 if not borrowed, 1 if borrowed
+ */
+int cpupool_borrow_cpu(struct cpupool *c, unsigned int cpu)
+{
+    struct cpupool **q;
+    struct domain *d;
+
+    if ( cpu_isset(cpu, c->cpu_valid) )
+        return 0;
+
+    spin_lock(&cpupool_lock);
+
+    if ( cpu_isset(cpu, cpupool_free_cpus) )
+    {
+        cpupool_assign_cpu_locked(c, cpu);
+        cpu_set(cpu, c->cpus_borrowed);
+        cpu_set(cpu, cpupool_free_cpus_borrowed);
+        spin_unlock(&cpupool_lock);
+        return 1;
+    }
+
+    for_each_cpupool(q)
+    {
+        if ( cpu_isset(cpu, (*q)->cpu_valid) )
+            break;
+    }
+    BUG_ON(*q == NULL);
+    if ( (*q)->pool_paused++ == 0 )
+    {
+        for_each_domain(d)
+        {
+            if ( d->cpupool == *q )
+                domain_pause(d);
+        }
+    }
+    /* unassigning cpu can't fail as all domains in pool should be paused */
+    cpupool_unassign_cpu_locked(*q, cpu);
+    cpupool_assign_cpu_locked(c, cpu);
+    cpu_set(cpu, c->cpus_borrowed);
+    cpu_set(cpu, (*q)->cpus_borrowed);
+
+    spin_unlock(&cpupool_lock);
+    return 1;
+}
+
+/*
+ * return cpu after borrowing it before
+ * a cpu borrowed via cpupool_borrow_cpu before is returned to its former
+ * pool
+ * returns a cpu to continue on, -1 if all okay
+ */
+int cpupool_return_cpu(struct cpupool *c)
+{
+    int cpu = -1;
+    cpumask_t mask;
+    struct cpupool **q;
+    struct domain *d;
+
+    spin_lock(&cpupool_lock);
+    if ( cpus_weight(c->cpus_borrowed) == 0 )
+        goto out;
+
+    if ( cpu_isset(smp_processor_id(), c->cpus_borrowed) )
+    {
+        cpus_andnot(mask, c->cpu_valid, c->cpus_borrowed);
+        cpu = first_cpu(mask);
+        BUG_ON(cpu == NR_CPUS);
+        goto out;
+    }
+
+    for_each_cpu_mask(cpu, c->cpus_borrowed)
+    {
+        BUG_ON(!cpu_isset(cpu, c->cpu_valid));
+        if ( cpu_isset(cpu, cpupool_free_cpus_borrowed) )
+        {
+            cpu_clear(cpu, cpupool_free_cpus_borrowed);
+            cpu_clear(cpu, c->cpus_borrowed);
+            if ( !cpupool_unassign_cpu_locked(c, cpu) )
+                continue;
+            /* could not move all vcpus, try again */
+            cpu_set(cpu, cpupool_free_cpus_borrowed);
+            cpu_set(cpu, c->cpus_borrowed);
+            goto out;
+        }
+        for_each_cpupool(q)
+        {
+            if ( (*q != c) && cpu_isset(cpu, (*q)->cpus_borrowed) )
+                break;
+        }
+        BUG_ON(*q == NULL);
+        BUG_ON(!(*q)->pool_paused);
+        cpu_clear(cpu, (*q)->cpus_borrowed);
+        cpu_clear(cpu, c->cpus_borrowed);
+        if ( cpupool_unassign_cpu_locked(c, cpu) )
+        {
+            cpu_set(cpu, (*q)->cpus_borrowed);
+            cpu_set(cpu, c->cpus_borrowed);
+            goto out;
+        }
+        cpupool_assign_cpu_locked(*q, cpu);
+        if ( (*q)->pool_paused == 1 )
+        {
+            for_each_domain(d)
+            {
+                if ( d->cpupool == *q )
+                    domain_unpause(d);
+            }
+        }
+        (*q)->pool_paused--;
+    }
+    cpu = -1;
+
+out:
+    spin_unlock(&cpupool_lock);
+    return cpu;
+}
+
+/*
+ * assign cpus to the default cpupool
+ * default are all cpus, fewer cpus may be specified as boot parameter
+ * possible failures:
+ * - no cpu assigned
+ */
+int __init cpupool0_cpu_assign(struct cpupool *c)
+{
+    if ( (cpupool0_max_cpus == 0) || (cpupool0_max_cpus > num_online_cpus()) )
+        cpupool0_max_cpus = num_online_cpus();
+    if ( !cpupool_assign_ncpu(cpupool0, cpupool0_max_cpus) )
+        return 1;
+    return 0;
+}
+
+/*
+ * add a new domain to a cpupool
+ * possible failures:
+ * - pool does not exist
+ * - pool is paused
+ * - no cpu assigned to pool
+ */
+int cpupool_add_domain(struct domain *d, int poolid)
+{
+    struct cpupool *c;
+    int rc = 1;
+
+    if ( poolid == CPUPOOLID_NONE )
+        return 0;
+    spin_lock(&cpupool_lock);
+    c = cpupool_find_by_id(poolid, 1);
+    if ( (c != NULL) && !c->pool_paused && cpus_weight(c->cpu_valid) )
+    {
+        c->n_dom++;
+        d->cpupool = c;
+        PRINTD("cpupool_add_domain(%d,%d) n_dom %d\n", d->domain_id, poolid,
+            c->n_dom);
+        rc = 0;
+    }
+    spin_unlock(&cpupool_lock);
+    return rc;
+}
+
+/*
+ * remove a domain from a cpupool
+ */
+void cpupool_rm_domain(struct domain *d)
+{
+    if ( d->cpupool == NULL )
+        return;
+    spin_lock(&cpupool_lock);
+    d->cpupool->n_dom--;
+    PRINTD("cpupool_rm_domain(%d,%d) n_dom %d\n", d->domain_id,
+        d->cpupool->cpupool_id, d->cpupool->n_dom);
+    d->cpupool = NULL;
+    spin_unlock(&cpupool_lock);
+    return;
+}
+
+/*
+ * called to add a new cpu to pool admin
+ * we add a hotplugged cpu to the cpupool0 to be able to add it to dom0
+ */
+void cpupool_cpu_add(unsigned int cpu)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+    if ( cpupool0 == NULL )
+        return;
+    spin_lock(&cpupool_lock);
+    cpu_set(cpu, cpupool_free_cpus);
+    cpupool_assign_cpu_locked(cpupool0, cpu);
+    spin_unlock(&cpupool_lock);
+#endif
+    return;
+}
+
+/* called to remove a cpu from pool admin
+ * possible failures:
+ * - cpu is last one in a pool with domains in it
+ * - pool is paused
+ */
+int cpupool_cpu_remove(unsigned int cpu)
+{
+    int rc = 0;
+#ifdef CONFIG_HOTPLUG_CPU
+    struct cpupool **q;
+
+    spin_lock(&cpupool_lock);
+    if ( cpu_isset(cpu, cpupool_free_cpus) )
+    {
+        cpu_clear(cpu, cpupool_free_cpus);
+        goto out;
+    }
+    for_each_cpupool(q)
+        if ( cpu_isset(cpu, (*q)->cpu_valid) )
+            break;
+    if ( *q == NULL )
+        goto out;
+    if ( (((*q)->n_dom == 0) || (cpus_weight((*q)->cpu_valid) > 1)) &&
+         !(*q)->pool_paused )
+    {
+        cpu_clear(cpu, (*q)->cpu_valid);
+        schedule_cpu_switch(cpu, NULL);
+        per_cpu(cpupool, cpu) = NULL;
+    }
+    else
+        rc = 1;
+out:
+    spin_unlock(&cpupool_lock);
+#endif
+    return rc;
+}
+
+/*
+ * do cpupool related domctl operations
+ */
+int cpupool_do_domctl(struct xen_domctl *op)
+{
+    int ret;
+    struct cpupool *c;
+
+    switch ( op->u.cpupool_op.op )
+    {
+
+    case XEN_DOMCTL_CPUPOOL_OP_CREATE:
+    {
+        int poolid;
+        struct scheduler *sched;
+
+        poolid = (op->u.cpupool_op.cpupool_id == XEN_DOMCTL_CPUPOOL_PAR_ANY) ?
+            CPUPOOLID_NONE: op->u.cpupool_op.cpupool_id;
+        sched = scheduler_get_by_id(op->u.cpupool_op.sched_id);
+        ret = -ENOENT;
+        if ( sched == NULL )
+            break;
+        ret = 0;
+        c = cpupool_create(poolid, sched->opt_name);
+        if ( c == NULL )
+            ret = -EINVAL;
+        else
+            op->u.cpupool_op.cpupool_id = c->cpupool_id;
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_DESTROY:
+    {
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->u.cpupool_op.cpupool_id, 1);
+        spin_unlock(&cpupool_lock);
+       ret = -ENOENT;
+       if ( c == NULL )
+            break;
+        ret = (cpupool_destroy(c) != 0) ? -EBUSY : 0;
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_INFO:
+    {
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->u.cpupool_op.cpupool_id, 0);
+        spin_unlock(&cpupool_lock);
+       ret = -ENOENT;
+       if ( c == NULL )
+            break;
+        op->u.cpupool_op.cpupool_id = c->cpupool_id;
+        op->u.cpupool_op.sched_id = c->sched.sched_id;
+        op->u.cpupool_op.n_dom = c->n_dom;
+       cpumask_to_xenctl_cpumap(&(op->u.cpupool_op.cpumap), &(c->cpu_valid));
+        ret = 0;
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_ADDCPU:
+    {
+        unsigned cpu;
+
+        cpu = op->u.cpupool_op.cpu;
+        spin_lock(&cpupool_lock);
+        if ( cpu == XEN_DOMCTL_CPUPOOL_PAR_ANY )
+            cpu = first_cpu(cpupool_free_cpus);
+        ret = -EINVAL;
+        if ( cpu >= NR_CPUS )
+            goto addcpu_out;
+        ret = -EBUSY;
+        if ( !cpu_isset(cpu, cpupool_free_cpus) )
+            goto addcpu_out;
+        c = cpupool_find_by_id(op->u.cpupool_op.cpupool_id, 0);
+       ret = -ENOENT;
+        if ( c == NULL )
+            goto addcpu_out;
+        cpupool_assign_cpu_locked(c, cpu);
+        ret = 0;
+addcpu_out:
+        spin_unlock(&cpupool_lock);
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_RMCPU:
+    {
+        unsigned cpu;
+
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->u.cpupool_op.cpupool_id, 0);
+        spin_unlock(&cpupool_lock);
+       ret = -ENOENT;
+       if ( c == NULL )
+            break;
+        cpu = op->u.cpupool_op.cpu;
+        if ( cpu == XEN_DOMCTL_CPUPOOL_PAR_ANY )
+            cpu = last_cpu(c->cpu_valid);
+        ret = -EINVAL;
+        if ( cpu >= NR_CPUS )
+            break;
+        /* caution: cpupool_unassign_cpu uses continue_hypercall_on_cpu and
+         * will continue after the local return
+         */
+        ret = cpupool_unassign_cpu(c, cpu);
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_MOVEDOMAIN:
+    {
+        struct domain *d;
+
+        ret = -EINVAL;
+        if ( op->u.cpupool_op.domid == 0 )
+            break;
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(op->u.cpupool_op.domid);
+        if ( d == NULL )
+            break;
+        if ( d->cpupool == NULL )
+        {
+            ret = -EINVAL;
+            rcu_unlock_domain(d);
+            break;
+        }
+        ret = -ENOENT;
+        spin_lock(&cpupool_lock);
+        c = cpupool_find_by_id(op->u.cpupool_op.cpupool_id, 1);
+        if ( (c != NULL) && cpus_weight(c->cpu_valid) && !c->pool_paused )
+        {
+            PRINTD("cpupool move_domain(%d)->%d\n", d->domain_id,
+                c->cpupool_id);
+            d->cpupool->n_dom--;
+            PRINTD("cpupool move_domain(%d), %d.n_dom=%d\n", d->domain_id,
+                d->cpupool->cpupool_id, d->cpupool->n_dom);
+            sched_move_domain(d, c);
+            c->n_dom++;
+            PRINTD("cpupool move_domain(%d), %d.n_dom=%d\n", d->domain_id,
+                c->cpupool_id, c->n_dom);
+            PRINTD("cpupool move_domain(%d)->%d ready\n", d->domain_id,
+                c->cpupool_id);
+            ret = 0;
+        }
+        spin_unlock(&cpupool_lock);
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_CPUPOOL_OP_FREEINFO:
+    {
+        cpumask_to_xenctl_cpumap(&(op->u.cpupool_op.cpumap),
+            &cpupool_free_cpus);
+        ret = 0;
+    }
+    break;
+
+    default:
+        ret = -ENOSYS;
+
+    }
+
+    return ret;
+}
+
+void schedule_dump(struct cpupool *c);
+
+void dump_runq(unsigned char key)
+{
+    unsigned long    flags;
+    s_time_t         now = NOW();
+    struct cpupool **c;
+
+    spin_lock(&cpupool_lock);
+    local_irq_save(flags);
+
+    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
+
+    printk("Idle cpupool:\n");
+    schedule_dump(NULL);
+
+    for_each_cpupool(c)
+    {
+        printk("Cpupool %d:\n", (*c)->cpupool_id);
+        schedule_dump(*c);
+    }
+
+    local_irq_restore(flags);
+    spin_unlock(&cpupool_lock);
+}
+
+static int __init cpupool_init(void)
+{
+    cpupool_free_cpus = cpu_online_map;
+    cpus_clear(cpupool_free_cpus_borrowed);
+    cpupool_list = NULL;
+    return 0;
+}
+__initcall(cpupool_init);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.