Xen project Mailing List

[Xen-devel] [Patch 1/6] Cpupools: hypervisor part

To: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>

From: Juergen Gross <juergen.gross@xxxxxxxxxxxxxx>

Date: Wed, 21 Apr 2010 13:16:25 +0200

Delivery-date: Wed, 21 Apr 2010 04:19:09 -0700

Domainkey-signature: s=s1536a; d=ts.fujitsu.com; c=nofws; q=dns; h=X-SBRSScore:X-IronPort-AV:Received:X-IronPort-AV: Received:Received:Message-ID:Date:From:Organization: User-Agent:MIME-Version:To:Subject:X-Enigmail-Version: Content-Type; b=cMsF4YyN8YN1ZsmO5bQdERbOyQt/SVNDulEvK/jTYYB+MJKpQLK6yzks 9P1wrRnPUDiUnvC07b4xK4h0h5YZzTmQg6Xi7YD7+ZlOvQyxuv/KxlP8C a8vnsvITH7Bi1kEHV8T2FLfPSB6KiVqg8e4SHpbyCIIk04TGpJmErQKaP SfKwzKn6OR2HeuYlXfPHaVKWTq7E06JUqALoEHOM0xsrjAEVmsOkG9CC6 LC1Lw0MGNMLjqV5XXgy6RQTcxO42w;

List-id: Xen developer discussion <xen-devel.lists.xensource.com>

-- Juergen Gross Principal Developer Operating Systems TSP ES&S SWE OS6 Telephone: +49 (0) 89 3222 2967 Fujitsu Technology Solutions e-mail: juergen.gross@xxxxxxxxxxxxxx Domagkstr. 28 Internet: ts.fujitsu.com D-80807 Muenchen Company details: ts.fujitsu.com/imprint.html

Signed-off-by: juergen.gross@xxxxxxxxxxxxxx diff -r dbf0fd95180f xen/arch/x86/domain_build.c --- a/xen/arch/x86/domain_build.c Tue Apr 20 14:32:53 2010 +0100 +++ b/xen/arch/x86/domain_build.c Wed Apr 21 13:08:37 2010 +0200 @@ -9,6 +9,7 @@ #include <xen/lib.h> #include <xen/ctype.h> #include <xen/sched.h> +#include <xen/sched-if.h> #include <xen/smp.h> #include <xen/delay.h> #include <xen/event.h> @@ -84,7 +85,7 @@ struct vcpu *__init alloc_dom0_vcpu0(voi struct vcpu *__init alloc_dom0_vcpu0(void) { if ( opt_dom0_max_vcpus == 0 ) - opt_dom0_max_vcpus = num_online_cpus(); + opt_dom0_max_vcpus = num_cpupool_cpus(cpupool0); if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS ) opt_dom0_max_vcpus = MAX_VIRT_CPUS; @@ -277,7 +278,7 @@ int __init construct_dom0( unsigned long _initrd_start, unsigned long initrd_len, char *cmdline) { - int i, rc, compatible, compat32, order, machine; + int i, cpu, rc, compatible, compat32, order, machine; struct cpu_user_regs *regs; unsigned long pfn, mfn; unsigned long nr_pages; @@ -776,8 +777,12 @@ int __init construct_dom0( printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus); + cpu = first_cpu(cpupool0->cpu_valid); for ( i = 1; i < opt_dom0_max_vcpus; i++ ) - (void)alloc_vcpu(d, i, i % num_online_cpus()); + { + cpu = cycle_cpu(cpu, cpupool0->cpu_valid); + (void)alloc_vcpu(d, i, cpu); + } /* Set up CR3 value for write_ptbase */ if ( paging_mode_enabled(d) ) diff -r dbf0fd95180f xen/arch/x86/setup.c --- a/xen/arch/x86/setup.c Tue Apr 20 14:32:53 2010 +0100 +++ b/xen/arch/x86/setup.c Wed Apr 21 13:08:37 2010 +0200 @@ -2,6 +2,7 @@ #include <xen/init.h> #include <xen/lib.h> #include <xen/sched.h> +#include <xen/sched-if.h> #include <xen/domain.h> #include <xen/serial.h> #include <xen/softirq.h> @@ -1093,6 +1094,11 @@ void __init __start_xen(unsigned long mb if ( !tboot_protect_mem_regions() ) panic("Could not protect TXT memory regions\n"); + /* Create initial cpupool 0. 
*/ + cpupool0 = cpupool_create(0, NULL); + if ( (cpupool0 == NULL) || cpupool0_cpu_assign(cpupool0) ) + panic("Error creating cpupool 0\n"); + /* Create initial domain 0. */ dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF); if ( (dom0 == NULL) || (alloc_dom0_vcpu0() == NULL) ) diff -r dbf0fd95180f xen/arch/x86/smpboot.c --- a/xen/arch/x86/smpboot.c Tue Apr 20 14:32:53 2010 +0100 +++ b/xen/arch/x86/smpboot.c Wed Apr 21 13:08:37 2010 +0200 @@ -39,6 +39,7 @@ #include <xen/mm.h> #include <xen/domain.h> #include <xen/sched.h> +#include <xen/sched-if.h> #include <xen/irq.h> #include <xen/delay.h> #include <xen/softirq.h> @@ -1296,10 +1297,11 @@ int __cpu_disable(void) remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ + cpu_clear(cpu, cpupool0->cpu_valid); cpu_clear(cpu, cpu_online_map); fixup_irqs(); - cpu_disable_scheduler(); + cpu_disable_scheduler(cpu, 0); return 0; } @@ -1331,15 +1333,16 @@ int cpu_down(unsigned int cpu) int cpu_down(unsigned int cpu) { int err = 0; + int pool_rm = 0; /* spin_trylock() avoids deadlock with stop_machine_run(). 
*/ if (!spin_trylock(&cpu_add_remove_lock)) return -EBUSY; - - if (num_online_cpus() == 1) { - err = -EBUSY; + err = cpupool_cpu_remove(cpu); + if (err) { goto out; } + pool_rm = 1; /* Can not offline BSP */ if (cpu == 0) { @@ -1370,6 +1373,8 @@ out: out: if (!err) send_guest_global_virq(dom0, VIRQ_PCPU_STATE); + else if (pool_rm) + cpupool_cpu_add(cpu); spin_unlock(&cpu_add_remove_lock); return err; } @@ -1559,6 +1564,7 @@ int __devinit __cpu_up(unsigned int cpu) process_pending_softirqs(); } + cpupool_cpu_add(cpu); cpufreq_add_cpu(cpu); return 0; } diff -r dbf0fd95180f xen/common/Makefile --- a/xen/common/Makefile Tue Apr 20 14:32:53 2010 +0100 +++ b/xen/common/Makefile Wed Apr 21 13:08:37 2010 +0200 @@ -1,5 +1,6 @@ obj-y += bitmap.o obj-y += bitmap.o obj-y += cpu.o +obj-y += cpupool.o obj-y += domctl.o obj-y += domain.o obj-y += event_channel.o diff -r dbf0fd95180f xen/common/cpupool.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/common/cpupool.c Wed Apr 21 13:08:37 2010 +0200 @@ -0,0 +1,604 @@ +/****************************************************************************** + * cpupool.c + * + * Generic cpupool-handling functions. + * + * Cpupools are a feature to have configurable scheduling domains. Each + * cpupool runs an own scheduler on a dedicated set of physical cpus. + * A domain is bound to one cpupool at any time, but it can be moved to + * another cpupool. 
+ * + * (C) 2009, Juergen Gross, Fujitsu Technology Solutions + */ + +#include <xen/lib.h> +#include <xen/init.h> +#include <xen/cpumask.h> +#include <xen/percpu.h> +#include <xen/sched.h> +#include <xen/sched-if.h> + +#define for_each_cpupool(ptr) \ + for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next)) + +struct cpupool *cpupool0; /* Initial cpupool with Dom0 */ +cpumask_t cpupool_free_cpus; /* cpus not in any cpupool */ + +static struct cpupool *cpupool_list; /* linked list, sorted by poolid */ + +static int cpupool0_max_cpus; +integer_param("pool0_max_cpus", cpupool0_max_cpus); + +static int cpupool_moving_cpu = -1; +static struct cpupool *cpupool_cpu_moving = NULL; +static cpumask_t cpupool_locked_cpus = CPU_MASK_NONE; + +/* cpupool lock: be careful, this lock is sometimes released on a different cpu + * than the one it was obtained on! + */ +static DEFINE_SPINLOCK(cpupool_lock); + +DEFINE_PER_CPU(struct cpupool *, cpupool); + +static struct cpupool *alloc_cpupool_struct(void) +{ + return xmalloc(struct cpupool); +} + +static void free_cpupool_struct(struct cpupool *c) +{ + xfree(c); +} + +/* + * find a cpupool by its id. to be called with cpupool lock held + * if exact is not specified, the first cpupool with an id larger than or equal to + * the searched id is returned + * returns NULL if not found. + */ +static struct cpupool *cpupool_find_by_id(int id, int exact) +{ + struct cpupool **q; + + for_each_cpupool(q) + { + if ( (*q)->cpupool_id == id ) + return *q; + if ( (*q)->cpupool_id > id ) + break; + } + return exact ? 
NULL : *q; +} + +/* + * create a new cpupool with specified poolid and scheduler + * returns pointer to new cpupool structure if okay, NULL else + * possible failures: + * - no memory + * - poolid already used + * - unknown scheduler + */ +struct cpupool *cpupool_create(int poolid, char *sched) +{ + struct cpupool *c; + struct cpupool **q; + int last = 0; + + if ( (c = alloc_cpupool_struct()) == NULL ) + return NULL; + memset(c, 0, sizeof(*c)); + + printk(XENLOG_DEBUG "cpupool_create(pool=%d,sched=%s)\n", poolid, sched); + spin_lock(&cpupool_lock); + for_each_cpupool(q) + { + last = (*q)->cpupool_id; + if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) ) + break; + } + if ( *q != NULL ) + { + if ( (*q)->cpupool_id == poolid ) + { + spin_unlock(&cpupool_lock); + free_cpupool_struct(c); + return NULL; + } + c->next = *q; + } + *q = c; + c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid; + if ( schedule_init_global(sched, &(c->sched)) ) + { + spin_unlock(&cpupool_lock); + cpupool_destroy(c); + return NULL; + } + spin_unlock(&cpupool_lock); + + printk("Created cpupool %d with scheduler %s (%s)\n", c->cpupool_id, + c->sched.name, c->sched.opt_name); + + return c; +} +/* + * destroys the given cpupool + * returns 0 on success, 1 else + * possible failures: + * - pool still in use + * - cpus still assigned to pool + * - pool not in list + */ +int cpupool_destroy(struct cpupool *c) +{ + struct cpupool **q; + + spin_lock(&cpupool_lock); + for_each_cpupool(q) + if ( *q == c ) + break; + if ( (*q != c) || (c->n_dom != 0) || cpus_weight(c->cpu_valid) ) + { + spin_unlock(&cpupool_lock); + return 1; + } + *q = c->next; + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool_destroy(pool=%d)\n", c->cpupool_id); + schedule_deinit_global(&(c->sched)); + free_cpupool_struct(c); + return 0; +} + +/* + * assign a specific cpu to a cpupool + * cpupool_lock must be held + */ +static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) +{ + if ( 
(cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) ) + return -EBUSY; + per_cpu(cpupool, cpu) = c; + schedule_cpu_switch(cpu, c); + cpu_clear(cpu, cpupool_free_cpus); + if (cpupool_moving_cpu == cpu) + { + cpupool_moving_cpu = -1; + cpupool_cpu_moving = NULL; + } + cpu_set(cpu, c->cpu_valid); + return 0; +} + +/* + * assign free physical cpus to a cpupool + * cpus assigned are unused cpus with lowest possible ids + * returns the number of cpus assigned + */ +int cpupool_assign_ncpu(struct cpupool *c, int ncpu) +{ + int i; + int n; + + n = 0; + spin_lock(&cpupool_lock); + for_each_cpu_mask(i, cpupool_free_cpus) + { + if ( cpupool_assign_cpu_locked(c, i) == 0 ) + n++; + if ( n == ncpu ) + break; + } + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool_assign_ncpu(pool=%d,ncpu=%d) rc %d\n", + c->cpupool_id, ncpu, n); + return n; +} + +static long cpupool_unassign_cpu_helper(void *info) +{ + struct cpupool *c = (struct cpupool *)info; + int cpu = cpupool_moving_cpu; + long ret; + int cpupool_id = c->cpupool_id; + + ret = cpu_disable_scheduler(cpu, 1); + cpu_set(cpu, cpupool_free_cpus); + if ( !ret ) + { + schedule_cpu_switch(cpu, NULL); + per_cpu(cpupool, cpu) = NULL; + cpupool_moving_cpu = -1; + cpupool_cpu_moving = NULL; + } + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d) ret %ld\n", + cpupool_id, cpu, ret); + return ret; +} + +/* + * unassign a specific cpu from a cpupool + * we must be sure not to run on the cpu to be unassigned! to achieve this + * the main functionality is performed via continue_hypercall_on_cpu on a + * specific cpu. + * if the cpu to be removed is the last one of the cpupool no active domain + * must be bound to the cpupool. dying domains are moved to cpupool0 as they + * might be zombies. 
+ * possible failures: + * - last cpu and still active domains in cpupool + * - cpu just being unplugged + */ +int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu) +{ + int work_cpu; + int ret; + struct domain *d; + int cpupool_id = c->cpupool_id; + + printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d)\n", + cpupool_id, cpu); + spin_lock(&cpupool_lock); + ret = -EBUSY; + if ( (cpupool_moving_cpu != -1) && (cpu != cpupool_moving_cpu) ) + goto out; + if ( cpu_isset(cpu, cpupool_locked_cpus) ) + goto out; + + ret = 0; + if ( !cpu_isset(cpu, c->cpu_valid) && (cpu != cpupool_moving_cpu) ) + goto out; + + if ( (c->n_dom > 0) && (cpus_weight(c->cpu_valid) == 1) && + (cpu != cpupool_moving_cpu) ) + { + for_each_domain(d) + { + if ( d->cpupool != c ) + continue; + if ( !d->is_dying ) + { + ret = -EBUSY; + break; + } + c->n_dom--; + ret = sched_move_domain(d, cpupool0); + if ( ret ) + { + c->n_dom++; + break; + } + cpupool0->n_dom++; + } + if ( ret ) + goto out; + } + cpupool_moving_cpu = cpu; + cpupool_cpu_moving = c; + cpu_clear(cpu, c->cpu_valid); + work_cpu = smp_processor_id(); + if ( work_cpu == cpu ) + { + work_cpu = first_cpu(cpupool0->cpu_valid); + if ( work_cpu == cpu ) + work_cpu = next_cpu(cpu, cpupool0->cpu_valid); + } + return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c); + +out: + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n", + cpupool_id, cpu, ret); + return ret; +} + +/* + * assign cpus to the default cpupool + * default are all cpus, less cpus may be specified as boot parameter + * possible failures: + * - no cpu assigned + */ +int __init cpupool0_cpu_assign(struct cpupool *c) +{ + if ( (cpupool0_max_cpus == 0) || (cpupool0_max_cpus > num_online_cpus()) ) + cpupool0_max_cpus = num_online_cpus(); + if ( !cpupool_assign_ncpu(cpupool0, cpupool0_max_cpus) ) + return 1; + return 0; +} + +/* + * add a new domain to a cpupool + * possible failures: + * - pool does not exist 
+ * - no cpu assigned to pool + */ +int cpupool_add_domain(struct domain *d, int poolid) +{ + struct cpupool *c; + int rc = 1; + int n_dom; + + if ( poolid == CPUPOOLID_NONE ) + return 0; + spin_lock(&cpupool_lock); + c = cpupool_find_by_id(poolid, 1); + if ( (c != NULL) && cpus_weight(c->cpu_valid) ) + { + c->n_dom++; + n_dom = c->n_dom; + d->cpupool = c; + rc = 0; + } + spin_unlock(&cpupool_lock); + if (!rc) + printk(XENLOG_DEBUG "cpupool_add_domain(dom=%d,pool=%d) n_dom %d\n", + d->domain_id, poolid, n_dom); + return rc; +} + +/* + * remove a domain from a cpupool + */ +void cpupool_rm_domain(struct domain *d) +{ + int cpupool_id; + int n_dom; + + if ( d->cpupool == NULL ) + return; + spin_lock(&cpupool_lock); + cpupool_id = d->cpupool->cpupool_id; + d->cpupool->n_dom--; + n_dom = d->cpupool->n_dom; + d->cpupool = NULL; + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n", + d->domain_id, cpupool_id, n_dom); + return; +} + +/* + * called to add a new cpu to pool admin + * we add a hotplugged cpu to the cpupool0 to be able to add it to dom0 + */ +void cpupool_cpu_add(unsigned int cpu) +{ + if ( cpupool0 == NULL ) + return; + spin_lock(&cpupool_lock); + cpu_clear(cpu, cpupool_locked_cpus); + cpu_set(cpu, cpupool_free_cpus); + cpupool_assign_cpu_locked(cpupool0, cpu); + spin_unlock(&cpupool_lock); + return; +} + +/* + * called to remove a cpu from pool admin + * the cpu to be removed is locked to avoid removing it from dom0 + * returns failure if not in pool0 + */ +int cpupool_cpu_remove(unsigned int cpu) +{ + int ret = 0; + + spin_lock(&cpupool_lock); + if ( !cpu_isset(cpu, cpupool0->cpu_valid)) + ret = -EBUSY; + else + cpu_set(cpu, cpupool_locked_cpus); + spin_unlock(&cpupool_lock); + + return ret; +} + +/* + * do cpupool related domctl operations + */ +int cpupool_do_domctl(struct xen_domctl_cpupool_op *op) +{ + int ret; + struct cpupool *c; + + switch ( op->op ) + { + + case XEN_DOMCTL_CPUPOOL_OP_CREATE: + { + int 
poolid; + const struct scheduler *sched; + + poolid = (op->cpupool_id == XEN_DOMCTL_CPUPOOL_PAR_ANY) ? + CPUPOOLID_NONE: op->cpupool_id; + sched = scheduler_get_by_id(op->sched_id); + ret = -ENOENT; + if ( sched == NULL ) + break; + ret = 0; + c = cpupool_create(poolid, sched->opt_name); + if ( c == NULL ) + ret = -EINVAL; + else + op->cpupool_id = c->cpupool_id; + } + break; + + case XEN_DOMCTL_CPUPOOL_OP_DESTROY: + { + spin_lock(&cpupool_lock); + c = cpupool_find_by_id(op->cpupool_id, 1); + spin_unlock(&cpupool_lock); + ret = -ENOENT; + if ( c == NULL ) + break; + ret = (cpupool_destroy(c) != 0) ? -EBUSY : 0; + } + break; + + case XEN_DOMCTL_CPUPOOL_OP_INFO: + { + spin_lock(&cpupool_lock); + c = cpupool_find_by_id(op->cpupool_id, 0); + spin_unlock(&cpupool_lock); + ret = -ENOENT; + if ( c == NULL ) + break; + op->cpupool_id = c->cpupool_id; + op->sched_id = c->sched.sched_id; + op->n_dom = c->n_dom; + cpumask_to_xenctl_cpumap(&(op->cpumap), &(c->cpu_valid)); + ret = 0; + } + break; + + case XEN_DOMCTL_CPUPOOL_OP_ADDCPU: + { + unsigned cpu; + + cpu = op->cpu; + printk(XENLOG_DEBUG "cpupool_assign_cpu(pool=%d,cpu=%d)\n", + op->cpupool_id, cpu); + spin_lock(&cpupool_lock); + if ( cpu == XEN_DOMCTL_CPUPOOL_PAR_ANY ) + cpu = first_cpu(cpupool_free_cpus); + ret = -EINVAL; + if ( cpu >= NR_CPUS ) + goto addcpu_out; + ret = -EBUSY; + if ( !cpu_isset(cpu, cpupool_free_cpus) ) + goto addcpu_out; + c = cpupool_find_by_id(op->cpupool_id, 0); + ret = -ENOENT; + if ( c == NULL ) + goto addcpu_out; + ret = cpupool_assign_cpu_locked(c, cpu); +addcpu_out: + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n", + op->cpupool_id, cpu, ret); + } + break; + + case XEN_DOMCTL_CPUPOOL_OP_RMCPU: + { + unsigned cpu; + + spin_lock(&cpupool_lock); + c = cpupool_find_by_id(op->cpupool_id, 0); + spin_unlock(&cpupool_lock); + ret = -ENOENT; + if ( c == NULL ) + break; + cpu = op->cpu; + if ( cpu == XEN_DOMCTL_CPUPOOL_PAR_ANY ) + cpu = 
last_cpu(c->cpu_valid); + ret = -EINVAL; + if ( cpu >= NR_CPUS ) + break; + /* caution: cpupool_unassign_cpu uses continue_hypercall_on_cpu and + * will continue after the local return + */ + ret = cpupool_unassign_cpu(c, cpu); + } + break; + + case XEN_DOMCTL_CPUPOOL_OP_MOVEDOMAIN: + { + struct domain *d; + + ret = -EINVAL; + if ( op->domid == 0 ) + break; + ret = -ESRCH; + d = rcu_lock_domain_by_id(op->domid); + if ( d == NULL ) + break; + if ( d->cpupool == NULL ) + { + ret = -EINVAL; + rcu_unlock_domain(d); + break; + } + printk(XENLOG_DEBUG "cpupool move_domain(dom=%d)->pool=%d\n", + d->domain_id, op->cpupool_id); + ret = -ENOENT; + spin_lock(&cpupool_lock); + c = cpupool_find_by_id(op->cpupool_id, 1); + if ( (c != NULL) && cpus_weight(c->cpu_valid) ) + { + d->cpupool->n_dom--; + ret = sched_move_domain(d, c); + if ( ret ) + d->cpupool->n_dom++; + else + c->n_dom++; + } + spin_unlock(&cpupool_lock); + printk(XENLOG_DEBUG "cpupool move_domain(dom=%d)->pool=%d ret %d\n", + d->domain_id, op->cpupool_id, ret); + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_CPUPOOL_OP_FREEINFO: + { + cpumask_to_xenctl_cpumap(&(op->cpumap), + &cpupool_free_cpus); + ret = 0; + } + break; + + default: + ret = -ENOSYS; + + } + + return ret; +} + +void schedule_dump(struct cpupool *c); + +void dump_runq(unsigned char key) +{ + unsigned long flags; + s_time_t now = NOW(); + struct cpupool **c; + + spin_lock(&cpupool_lock); + local_irq_save(flags); + + printk("sched_smt_power_savings: %s\n", + sched_smt_power_savings? 
"enabled":"disabled"); + printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now); + + printk("Idle cpupool:\n"); + schedule_dump(NULL); + + for_each_cpupool(c) + { + printk("Cpupool %d:\n", (*c)->cpupool_id); + schedule_dump(*c); + } + + local_irq_restore(flags); + spin_unlock(&cpupool_lock); +} + +static int __init cpupool_init(void) +{ + cpupool_free_cpus = cpu_online_map; + cpupool_list = NULL; + return 0; +} +__initcall(cpupool_init); + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r dbf0fd95180f xen/common/domain.c --- a/xen/common/domain.c Tue Apr 20 14:32:53 2010 +0100 +++ b/xen/common/domain.c Wed Apr 21 13:08:37 2010 +0200 @@ -218,6 +218,7 @@ struct domain *domain_create( enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2, INIT_gnttab = 1u<<3, INIT_arch = 1u<<4 }; int init_status = 0; + int poolid = CPUPOOLID_NONE; if ( (d = alloc_domain_struct()) == NULL ) return NULL; @@ -282,6 +283,8 @@ struct domain *domain_create( if ( grant_table_create(d) != 0 ) goto fail; init_status |= INIT_gnttab; + + poolid = 0; } if ( arch_domain_create(d, domcr_flags) != 0 ) @@ -291,6 +294,9 @@ struct domain *domain_create( d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex); d->irq_caps = rangeset_new(d, "Interrupts", 0); if ( (d->iomem_caps == NULL) || (d->irq_caps == NULL) ) + goto fail; + + if ( cpupool_add_domain(d, poolid) != 0 ) goto fail; if ( sched_init_domain(d) != 0 ) @@ -600,6 +606,8 @@ static void complete_domain_destroy(stru arch_domain_destroy(d); rangeset_domain_destroy(d); + + cpupool_rm_domain(d); sched_destroy_domain(d); diff -r dbf0fd95180f xen/common/domctl.c --- a/xen/common/domctl.c Tue Apr 20 14:32:53 2010 +0100 +++ b/xen/common/domctl.c Wed Apr 21 13:08:37 2010 +0200 @@ -11,6 +11,7 @@ #include <xen/lib.h> #include <xen/mm.h> #include <xen/sched.h> +#include <xen/sched-if.h> #include <xen/domain.h> #include <xen/event.h> 
#include <xen/domain_page.h> @@ -140,10 +141,12 @@ void getdomaininfo(struct domain *d, str info->shared_info_frame = mfn_to_gmfn(d, __pa(d->shared_info)>>PAGE_SHIFT); BUG_ON(SHARED_M2P(info->shared_info_frame)); + info->cpupool = d->cpupool ? d->cpupool->cpupool_id : CPUPOOLID_NONE; + memcpy(info->handle, d->handle, sizeof(xen_domain_handle_t)); } -static unsigned int default_vcpu0_location(void) +static unsigned int default_vcpu0_location(cpumask_t *online) { struct domain *d; struct vcpu *v; @@ -173,7 +176,7 @@ static unsigned int default_vcpu0_locati if ( cpus_weight(per_cpu(cpu_sibling_map, 0)) > 1 ) cpu = next_cpu(cpu, per_cpu(cpu_sibling_map, 0)); cpu_exclude_map = per_cpu(cpu_sibling_map, 0); - for_each_online_cpu ( i ) + for_each_cpu_mask(i, *online) { if ( cpu_isset(i, cpu_exclude_map) ) continue; @@ -450,6 +453,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc { struct domain *d; unsigned int i, max = op->u.max_vcpus.max, cpu; + cpumask_t *online; ret = -ESRCH; if ( (d = rcu_lock_domain_by_id(op->domain)) == NULL ) @@ -498,6 +502,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc goto maxvcpu_out; ret = -ENOMEM; + online = (d->cpupool == NULL) ? &cpu_online_map : &d->cpupool->cpu_valid; if ( max > d->max_vcpus ) { struct vcpu **vcpus; @@ -521,8 +526,8 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc continue; cpu = (i == 0) ? 
- default_vcpu0_location() : - cycle_cpu(d->vcpu[i-1]->processor, cpu_online_map); + default_vcpu0_location(online) : + cycle_cpu(d->vcpu[i-1]->processor, *online); if ( alloc_vcpu(d, i, cpu) == NULL ) goto maxvcpu_out; @@ -961,6 +966,14 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc } break; + case XEN_DOMCTL_cpupool_op: + { + ret = cpupool_do_domctl(&op->u.cpupool_op); + if ( (ret == 0) && copy_to_guest(u_domctl, op, 1) ) + ret = -EFAULT; + } + break; + default: ret = arch_do_domctl(op, u_domctl); break; diff -r dbf0fd95180f xen/common/sched_credit.c --- a/xen/common/sched_credit.c Tue Apr 20 14:32:53 2010 +0100 +++ b/xen/common/sched_credit.c Wed Apr 21 13:08:37 2010 +0200 @@ -70,11 +70,15 @@ /* * Useful macros */ +#define CSCHED_PRIV(_ops) \ + ((struct csched_private *)((_ops)->sched_data)) #define CSCHED_PCPU(_c) \ ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv) #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv) #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) #define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq)) +#define CSCHED_CPUONLINE(_pool) \ + (((_pool) == NULL) ? 
&cpupool_free_cpus : &(_pool)->cpu_valid) /* @@ -160,19 +164,22 @@ struct csched_private { struct timer master_ticker; unsigned int master; cpumask_t idlers; + cpumask_t cpus; uint32_t weight; uint32_t credit; int credit_balance; uint32_t runq_sort; + int ticker_active; }; /* * Global variables */ -static struct csched_private csched_priv; +static struct csched_private *csched_priv0 = NULL; static void csched_tick(void *_cpu); +static void csched_acct(void *dummy); static inline int __vcpu_on_runq(struct csched_vcpu *svc) @@ -238,6 +245,7 @@ __runq_tickle(unsigned int cpu, struct c { struct csched_vcpu * const cur = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr); + struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu)); cpumask_t mask; ASSERT(cur); @@ -264,7 +272,7 @@ __runq_tickle(unsigned int cpu, struct c */ if ( cur->pri > CSCHED_PRI_IDLE ) { - if ( cpus_empty(csched_priv.idlers) ) + if ( cpus_empty(prv->idlers) ) { CSCHED_STAT_CRANK(tickle_idlers_none); } @@ -272,7 +280,7 @@ __runq_tickle(unsigned int cpu, struct c { cpumask_t idle_mask; - cpus_and(idle_mask, csched_priv.idlers, new->vcpu->cpu_affinity); + cpus_and(idle_mask, prv->idlers, new->vcpu->cpu_affinity); if ( !cpus_empty(idle_mask) ) { CSCHED_STAT_CRANK(tickle_idlers_some); @@ -294,40 +302,80 @@ __runq_tickle(unsigned int cpu, struct c cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ); } -static int -csched_pcpu_init(int cpu) +static void +csched_free_pdata(struct scheduler *ops, void *pcpu, int cpu) +{ + struct csched_private *prv = CSCHED_PRIV(ops); + struct csched_pcpu *spc = pcpu; + unsigned long flags; + + if ( spc == NULL ) + return; + + spin_lock_irqsave(&prv->lock, flags); + + prv->credit -= CSCHED_CREDITS_PER_ACCT; + prv->ncpus--; + cpu_clear(cpu, prv->idlers); + cpu_clear(cpu, prv->cpus); + if ( (prv->master == cpu) && (prv->ncpus > 0) ) + { + prv->master = first_cpu(prv->cpus); + migrate_timer(&prv->master_ticker, prv->master); + } + kill_timer(&spc->ticker); + if ( prv->ncpus == 0 ) 
+ kill_timer(&prv->master_ticker); + + spin_unlock_irqrestore(&prv->lock, flags); + + xfree(spc); +} + +static void * +csched_alloc_pdata(struct scheduler *ops, int cpu) { struct csched_pcpu *spc; + struct csched_private *prv = CSCHED_PRIV(ops); unsigned long flags; /* Allocate per-PCPU info */ spc = xmalloc(struct csched_pcpu); if ( spc == NULL ) - return -1; + return NULL; memset(spc, 0, sizeof(*spc)); - spin_lock_irqsave(&csched_priv.lock, flags); + spin_lock_irqsave(&prv->lock, flags); /* Initialize/update system-wide config */ - csched_priv.credit += CSCHED_CREDITS_PER_ACCT; - if ( csched_priv.ncpus <= cpu ) - csched_priv.ncpus = cpu + 1; - if ( csched_priv.master >= csched_priv.ncpus ) - csched_priv.master = cpu; + prv->credit += CSCHED_CREDITS_PER_ACCT; + prv->ncpus++; + cpu_set(cpu, prv->cpus); + if ( (prv->ncpus == 1) && (prv != csched_priv0) ) + { + prv->master = cpu; + init_timer( &prv->master_ticker, csched_acct, prv, cpu); + prv->ticker_active = 2; + } init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu); + + if ( prv == csched_priv0 ) + prv->master = first_cpu(prv->cpus); + INIT_LIST_HEAD(&spc->runq); - spc->runq_sort_last = csched_priv.runq_sort; + spc->runq_sort_last = prv->runq_sort; spc->idle_bias = NR_CPUS - 1; - per_cpu(schedule_data, cpu).sched_priv = spc; + if ( per_cpu(schedule_data, cpu).sched_priv == NULL ) + per_cpu(schedule_data, cpu).sched_priv = spc; /* Start off idling... 
*/ BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr)); - cpu_set(cpu, csched_priv.idlers); + cpu_set(cpu, prv->idlers); - spin_unlock_irqrestore(&csched_priv.lock, flags); + spin_unlock_irqrestore(&prv->lock, flags); - return 0; + return spc; } #ifndef NDEBUG @@ -400,17 +448,19 @@ __csched_vcpu_is_migrateable(struct vcpu } static int -_csched_cpu_pick(struct vcpu *vc, bool_t commit) +_csched_cpu_pick(struct scheduler *ops, struct vcpu *vc, bool_t commit) { cpumask_t cpus; cpumask_t idlers; + cpumask_t *online; int cpu; /* * Pick from online CPUs in VCPU's affinity mask, giving a * preference to its current processor if it's in there. */ - cpus_and(cpus, cpu_online_map, vc->cpu_affinity); + online = CSCHED_CPUONLINE(vc->domain->cpupool); + cpus_and(cpus, *online, vc->cpu_affinity); cpu = cpu_isset(vc->processor, cpus) ? vc->processor : cycle_cpu(vc->processor, cpus); @@ -428,7 +478,7 @@ _csched_cpu_pick(struct vcpu *vc, bool_t * like run two VCPUs on co-hyperthreads while there are idle cores * or sockets. 
*/ - cpus_and(idlers, cpu_online_map, csched_priv.idlers); + cpus_and(idlers, cpu_online_map, CSCHED_PRIV(ops)->idlers); cpu_set(cpu, idlers); cpus_and(cpus, cpus, idlers); cpu_clear(cpu, cpus); @@ -474,18 +524,18 @@ _csched_cpu_pick(struct vcpu *vc, bool_t } static int -csched_cpu_pick(struct vcpu *vc) +csched_cpu_pick(struct scheduler *ops, struct vcpu *vc) { - return _csched_cpu_pick(vc, 1); + return _csched_cpu_pick(ops, vc, 1); } static inline void -__csched_vcpu_acct_start(struct csched_vcpu *svc) +__csched_vcpu_acct_start(struct csched_private *prv, struct csched_vcpu *svc) { struct csched_dom * const sdom = svc->sdom; unsigned long flags; - spin_lock_irqsave(&csched_priv.lock, flags); + spin_lock_irqsave(&prv->lock, flags); if ( list_empty(&svc->active_vcpu_elem) ) { @@ -496,16 +546,17 @@ __csched_vcpu_acct_start(struct csched_v list_add(&svc->active_vcpu_elem, &sdom->active_vcpu); if ( list_empty(&sdom->active_sdom_elem) ) { - list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom); - csched_priv.weight += sdom->weight; + list_add(&sdom->active_sdom_elem, &prv->active_sdom); + prv->weight += sdom->weight; } } - spin_unlock_irqrestore(&csched_priv.lock, flags); + spin_unlock_irqrestore(&prv->lock, flags); } static inline void -__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc) +__csched_vcpu_acct_stop_locked(struct csched_private *prv, + struct csched_vcpu *svc) { struct csched_dom * const sdom = svc->sdom; @@ -518,16 +569,17 @@ __csched_vcpu_acct_stop_locked(struct cs list_del_init(&svc->active_vcpu_elem); if ( list_empty(&sdom->active_vcpu) ) { - BUG_ON( csched_priv.weight < sdom->weight ); + BUG_ON( prv->weight < sdom->weight ); list_del_init(&sdom->active_sdom_elem); - csched_priv.weight -= sdom->weight; + prv->weight -= sdom->weight; } } static void -csched_vcpu_acct(unsigned int cpu) +csched_vcpu_acct(struct csched_private *prv, unsigned int cpu) { struct csched_vcpu * const svc = CSCHED_VCPU(current); + struct scheduler *ops = 
per_cpu(scheduler, cpu); ASSERT( current->processor == cpu ); ASSERT( svc->sdom != NULL ); @@ -556,9 +608,9 @@ csched_vcpu_acct(unsigned int cpu) */ if ( list_empty(&svc->active_vcpu_elem) ) { - __csched_vcpu_acct_start(svc); + __csched_vcpu_acct_start(prv, svc); } - else if ( _csched_cpu_pick(current, 0) != cpu ) + else if ( _csched_cpu_pick(ops, current, 0) != cpu ) { CSCHED_VCPU_STAT_CRANK(svc, migrate_r); CSCHED_STAT_CRANK(migrate_running); @@ -567,66 +619,75 @@ csched_vcpu_acct(unsigned int cpu) } } -static int -csched_vcpu_init(struct vcpu *vc) +static void * +csched_alloc_vdata(struct scheduler *ops, struct vcpu *vc, void *dd) { - struct domain * const dom = vc->domain; - struct csched_dom *sdom = CSCHED_DOM(dom); struct csched_vcpu *svc; - - CSCHED_STAT_CRANK(vcpu_init); /* Allocate per-VCPU info */ svc = xmalloc(struct csched_vcpu); if ( svc == NULL ) - return -1; + return NULL; memset(svc, 0, sizeof(*svc)); INIT_LIST_HEAD(&svc->runq_elem); INIT_LIST_HEAD(&svc->active_vcpu_elem); - svc->sdom = sdom; + svc->sdom = dd; svc->vcpu = vc; atomic_set(&svc->credit, 0); svc->flags = 0U; - svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER; + svc->pri = is_idle_domain(vc->domain) ? 
+ CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER; CSCHED_VCPU_STATS_RESET(svc); - vc->sched_priv = svc; - - /* Allocate per-PCPU info */ - if ( unlikely(!CSCHED_PCPU(vc->processor)) ) - { - if ( csched_pcpu_init(vc->processor) != 0 ) - return -1; - } - - CSCHED_VCPU_CHECK(vc); - return 0; + CSCHED_STAT_CRANK(vcpu_init); + return svc; } static void -csched_vcpu_destroy(struct vcpu *vc) +csched_vcpu_insert(struct scheduler *ops, struct vcpu *vc) +{ + struct csched_vcpu *svc = vc->sched_priv; + + if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running ) + __runq_insert(vc->processor, svc); +} + +static void +csched_free_vdata(struct scheduler *ops, void *priv) +{ + struct csched_private *prv = CSCHED_PRIV(ops); + struct csched_vcpu *svc = priv; + unsigned long flags; + + if ( __vcpu_on_runq(svc) ) + __runq_remove(svc); + + spin_lock_irqsave(&(prv->lock), flags); + + if ( !list_empty(&svc->active_vcpu_elem) ) + __csched_vcpu_acct_stop_locked(prv, svc); + + spin_unlock_irqrestore(&(prv->lock), flags); + + xfree(svc); +} + +static void +csched_vcpu_destroy(struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); struct csched_dom * const sdom = svc->sdom; - unsigned long flags; CSCHED_STAT_CRANK(vcpu_destroy); BUG_ON( sdom == NULL ); BUG_ON( !list_empty(&svc->runq_elem) ); - spin_lock_irqsave(&csched_priv.lock, flags); - - if ( !list_empty(&svc->active_vcpu_elem) ) - __csched_vcpu_acct_stop_locked(svc); - - spin_unlock_irqrestore(&csched_priv.lock, flags); - - xfree(svc); + csched_free_vdata(ops, svc); } static void -csched_vcpu_sleep(struct vcpu *vc) +csched_vcpu_sleep(struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); @@ -641,7 +702,7 @@ csched_vcpu_sleep(struct vcpu *vc) } static void -csched_vcpu_wake(struct vcpu *vc) +csched_vcpu_wake(struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); const unsigned int cpu = vc->processor; @@ -697,10 +758,12 @@ 
csched_vcpu_wake(struct vcpu *vc) static int csched_dom_cntl( + struct scheduler *ops, struct domain *d, struct xen_domctl_scheduler_op *op) { struct csched_dom * const sdom = CSCHED_DOM(d); + struct csched_private *prv = CSCHED_PRIV(ops); unsigned long flags; if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo ) @@ -712,14 +775,14 @@ csched_dom_cntl( { ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo); - spin_lock_irqsave(&csched_priv.lock, flags); + spin_lock_irqsave(&prv->lock, flags); if ( op->u.credit.weight != 0 ) { if ( !list_empty(&sdom->active_sdom_elem) ) { - csched_priv.weight -= sdom->weight; - csched_priv.weight += op->u.credit.weight; + prv->weight -= sdom->weight; + prv->weight += op->u.credit.weight; } sdom->weight = op->u.credit.weight; } @@ -727,25 +790,20 @@ csched_dom_cntl( if ( op->u.credit.cap != (uint16_t)~0U ) sdom->cap = op->u.credit.cap; - spin_unlock_irqrestore(&csched_priv.lock, flags); + spin_unlock_irqrestore(&prv->lock, flags); } return 0; } -static int -csched_dom_init(struct domain *dom) +static void * +csched_alloc_domdata(struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom; - CSCHED_STAT_CRANK(dom_init); - - if ( is_idle_domain(dom) ) - return 0; - sdom = xmalloc(struct csched_dom); if ( sdom == NULL ) - return -ENOMEM; + return NULL; memset(sdom, 0, sizeof(*sdom)); /* Initialize credit and weight */ @@ -755,16 +813,40 @@ csched_dom_init(struct domain *dom) sdom->dom = dom; sdom->weight = CSCHED_DEFAULT_WEIGHT; sdom->cap = 0U; + + return (void *)sdom; +} + +static int +csched_dom_init(struct scheduler *ops, struct domain *dom) +{ + struct csched_dom *sdom; + + CSCHED_STAT_CRANK(dom_init); + + if ( is_idle_domain(dom) ) + return 0; + + sdom = csched_alloc_domdata(ops, dom); + if ( sdom == NULL ) + return -ENOMEM; + dom->sched_priv = sdom; return 0; } static void -csched_dom_destroy(struct domain *dom) +csched_free_domdata(struct scheduler *ops, void *data) +{ + xfree(data); +} + +static void +csched_dom_destroy(struct scheduler 
*ops, struct domain *dom) { CSCHED_STAT_CRANK(dom_destroy); - xfree(CSCHED_DOM(dom)); + csched_free_domdata(ops, CSCHED_DOM(dom)); } /* @@ -775,7 +857,7 @@ csched_dom_destroy(struct domain *dom) * remember the last UNDER to make the move up operation O(1). */ static void -csched_runq_sort(unsigned int cpu) +csched_runq_sort(struct csched_private *prv, unsigned int cpu) { struct csched_pcpu * const spc = CSCHED_PCPU(cpu); struct list_head *runq, *elem, *next, *last_under; @@ -783,7 +865,7 @@ csched_runq_sort(unsigned int cpu) unsigned long flags; int sort_epoch; - sort_epoch = csched_priv.runq_sort; + sort_epoch = prv->runq_sort; if ( sort_epoch == spc->runq_sort_last ) return; @@ -820,6 +902,7 @@ static void static void csched_acct(void* dummy) { + struct csched_private *prv = dummy; unsigned long flags; struct list_head *iter_vcpu, *next_vcpu; struct list_head *iter_sdom, *next_sdom; @@ -836,22 +919,22 @@ csched_acct(void* dummy) int credit; - spin_lock_irqsave(&csched_priv.lock, flags); + spin_lock_irqsave(&prv->lock, flags); - weight_total = csched_priv.weight; - credit_total = csched_priv.credit; + weight_total = prv->weight; + credit_total = prv->credit; /* Converge balance towards 0 when it drops negative */ - if ( csched_priv.credit_balance < 0 ) + if ( prv->credit_balance < 0 ) { - credit_total -= csched_priv.credit_balance; + credit_total -= prv->credit_balance; CSCHED_STAT_CRANK(acct_balance); } if ( unlikely(weight_total == 0) ) { - csched_priv.credit_balance = 0; - spin_unlock_irqrestore(&csched_priv.lock, flags); + prv->credit_balance = 0; + spin_unlock_irqrestore(&prv->lock, flags); CSCHED_STAT_CRANK(acct_no_work); goto out; } @@ -863,7 +946,7 @@ csched_acct(void* dummy) credit_xtra = 0; credit_cap = 0U; - list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom ) + list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom ) { sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); @@ -883,9 +966,9 @@ csched_acct(void* dummy) * 
only when the system-wide credit balance is negative. */ credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT; - if ( csched_priv.credit_balance < 0 ) + if ( prv->credit_balance < 0 ) { - credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) + + credit_peak += ( ( -prv->credit_balance * sdom->weight) + (weight_total - 1) ) / weight_total; } @@ -927,7 +1010,7 @@ csched_acct(void* dummy) */ CSCHED_STAT_CRANK(acct_reorder); list_del(&sdom->active_sdom_elem); - list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom); + list_add(&sdom->active_sdom_elem, &prv->active_sdom); } credit_fair = credit_peak; @@ -993,7 +1076,7 @@ csched_acct(void* dummy) /* Upper bound on credits means VCPU stops earning */ if ( credit > CSCHED_CREDITS_PER_TSLICE ) { - __csched_vcpu_acct_stop_locked(svc); + __csched_vcpu_acct_stop_locked(prv, svc); credit = 0; atomic_set(&svc->credit, credit); } @@ -1005,15 +1088,15 @@ csched_acct(void* dummy) } } - csched_priv.credit_balance = credit_balance; + prv->credit_balance = credit_balance; - spin_unlock_irqrestore(&csched_priv.lock, flags); + spin_unlock_irqrestore(&prv->lock, flags); /* Inform each CPU that its runq needs to be sorted */ - csched_priv.runq_sort++; + prv->runq_sort++; out: - set_timer( &csched_priv.master_ticker, NOW() + + set_timer( &prv->master_ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT ); } @@ -1022,6 +1105,7 @@ csched_tick(void *_cpu) { unsigned int cpu = (unsigned long)_cpu; struct csched_pcpu *spc = CSCHED_PCPU(cpu); + struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu)); spc->tick++; @@ -1029,7 +1113,7 @@ csched_tick(void *_cpu) * Accounting for running VCPU */ if ( !is_idle_vcpu(current) ) - csched_vcpu_acct(cpu); + csched_vcpu_acct(prv, cpu); /* * Check if runq needs to be sorted @@ -1038,7 +1122,7 @@ csched_tick(void *_cpu) * modified priorities. This is a special O(n) sort and runs at most * once per accounting period (currently 30 milliseconds). 
*/ - csched_runq_sort(cpu); + csched_runq_sort(prv, cpu); set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK)); } @@ -1091,16 +1175,19 @@ csched_runq_steal(int peer_cpu, int cpu, } static struct csched_vcpu * -csched_load_balance(int cpu, struct csched_vcpu *snext) +csched_load_balance(struct csched_private *prv, int cpu, + struct csched_vcpu *snext) { struct csched_vcpu *speer; cpumask_t workers; + cpumask_t *online; int peer_cpu; BUG_ON( cpu != snext->vcpu->processor ); + online = CSCHED_CPUONLINE(per_cpu(cpupool, cpu)); /* If this CPU is going offline we shouldn't steal work. */ - if ( unlikely(!cpu_online(cpu)) ) + if ( unlikely(!cpu_isset(cpu, *online)) ) goto out; if ( snext->pri == CSCHED_PRI_IDLE ) @@ -1114,7 +1201,7 @@ csched_load_balance(int cpu, struct csch * Peek at non-idling CPUs in the system, starting with our * immediate neighbour. */ - cpus_andnot(workers, cpu_online_map, csched_priv.idlers); + cpus_andnot(workers, *online, prv->idlers); cpu_clear(cpu, workers); peer_cpu = cpu; @@ -1156,11 +1243,12 @@ csched_load_balance(int cpu, struct csch * fast for the common case. */ static struct task_slice -csched_schedule(s_time_t now) +csched_schedule(struct scheduler *ops, s_time_t now) { const int cpu = smp_processor_id(); struct list_head * const runq = RUNQ(cpu); struct csched_vcpu * const scurr = CSCHED_VCPU(current); + struct csched_private *prv = CSCHED_PRIV(ops); struct csched_vcpu *snext; struct task_slice ret; @@ -1207,7 +1295,7 @@ csched_schedule(s_time_t now) if ( snext->pri > CSCHED_PRI_TS_OVER ) __runq_remove(snext); else - snext = csched_load_balance(cpu, snext); + snext = csched_load_balance(prv, cpu, snext); /* * Update idlers mask if necessary. 
When we're idling, other CPUs @@ -1215,12 +1303,12 @@ csched_schedule(s_time_t now) */ if ( snext->pri == CSCHED_PRI_IDLE ) { - if ( !cpu_isset(cpu, csched_priv.idlers) ) - cpu_set(cpu, csched_priv.idlers); + if ( !cpu_isset(cpu, prv->idlers) ) + cpu_set(cpu, prv->idlers); } - else if ( cpu_isset(cpu, csched_priv.idlers) ) + else if ( cpu_isset(cpu, prv->idlers) ) { - cpu_clear(cpu, csched_priv.idlers); + cpu_clear(cpu, prv->idlers); } if ( !is_idle_vcpu(snext->vcpu) ) @@ -1267,7 +1355,7 @@ csched_dump_vcpu(struct csched_vcpu *svc } static void -csched_dump_pcpu(int cpu) +csched_dump_pcpu(struct scheduler *ops, int cpu) { struct list_head *runq, *iter; struct csched_pcpu *spc; @@ -1305,9 +1393,10 @@ csched_dump_pcpu(int cpu) } static void -csched_dump(void) +csched_dump(struct scheduler *ops) { struct list_head *iter_sdom, *iter_svc; + struct csched_private *prv = CSCHED_PRIV(ops); int loop; #define idlers_buf keyhandler_scratch @@ -1324,12 +1413,12 @@ csched_dump(void) "\tticks per tslice = %d\n" "\tticks per acct = %d\n" "\tmigration delay = %uus\n", - csched_priv.ncpus, - csched_priv.master, - csched_priv.credit, - csched_priv.credit_balance, - csched_priv.weight, - csched_priv.runq_sort, + prv->ncpus, + prv->master, + prv->credit, + prv->credit_balance, + prv->weight, + prv->runq_sort, CSCHED_DEFAULT_WEIGHT, CSCHED_MSECS_PER_TICK, CSCHED_CREDITS_PER_MSEC, @@ -1337,12 +1426,12 @@ csched_dump(void) CSCHED_TICKS_PER_ACCT, vcpu_migration_delay); - cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers); + cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers); printk("idlers: %s\n", idlers_buf); printk("active vcpus:\n"); loop = 0; - list_for_each( iter_sdom, &csched_priv.active_sdom ) + list_for_each( iter_sdom, &prv->active_sdom ) { struct csched_dom *sdom; sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); @@ -1359,18 +1448,30 @@ csched_dump(void) #undef idlers_buf } -static void -csched_init(void) +static int 
+csched_init(struct scheduler *ops, int pool0) { - spin_lock_init(&csched_priv.lock); - INIT_LIST_HEAD(&csched_priv.active_sdom); - csched_priv.ncpus = 0; - csched_priv.master = UINT_MAX; - cpus_clear(csched_priv.idlers); - csched_priv.weight = 0U; - csched_priv.credit = 0U; - csched_priv.credit_balance = 0; - csched_priv.runq_sort = 0U; + struct csched_private *prv; + + prv = xmalloc(struct csched_private); + if ( prv == NULL ) + return 1; + memset(prv, 0, sizeof(*prv)); + if ( pool0 ) + csched_priv0 = prv; + ops->sched_data = prv; + spin_lock_init(&prv->lock); + INIT_LIST_HEAD(&prv->active_sdom); + prv->ncpus = 0; + prv->master = UINT_MAX; + cpus_clear(prv->idlers); + prv->weight = 0U; + prv->credit = 0U; + prv->credit_balance = 0; + prv->runq_sort = 0U; + prv->ticker_active = (csched_priv0 == prv) ? 0 : 1; + + return 0; } /* Tickers cannot be kicked until SMP subsystem is alive. */ @@ -1380,8 +1481,10 @@ static __init int csched_start_tickers(v unsigned int cpu; /* Is the credit scheduler initialised? 
*/ - if ( csched_priv.ncpus == 0 ) + if ( (csched_priv0 == NULL) || (csched_priv0->ncpus == 0) ) return 0; + + csched_priv0->ticker_active = 1; for_each_online_cpu ( cpu ) { @@ -1389,45 +1492,72 @@ static __init int csched_start_tickers(v set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK)); } - init_timer( &csched_priv.master_ticker, csched_acct, NULL, - csched_priv.master); + init_timer( &csched_priv0->master_ticker, csched_acct, csched_priv0, + csched_priv0->master); - set_timer( &csched_priv.master_ticker, NOW() + + set_timer( &csched_priv0->master_ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT ); return 0; } __initcall(csched_start_tickers); -static void csched_tick_suspend(void) +static void +csched_deinit(struct scheduler *ops) +{ + struct csched_private *prv; + + prv = CSCHED_PRIV(ops); + if ( prv != NULL ) + xfree(prv); +} + +static void csched_tick_suspend(struct scheduler *ops, unsigned int cpu) { struct csched_pcpu *spc; - spc = CSCHED_PCPU(smp_processor_id()); + spc = CSCHED_PCPU(cpu); stop_timer(&spc->ticker); } -static void csched_tick_resume(void) +static void csched_tick_resume(struct scheduler *ops, unsigned int cpu) { struct csched_pcpu *spc; uint64_t now = NOW(); + struct csched_private *prv; - spc = CSCHED_PCPU(smp_processor_id()); + prv = CSCHED_PRIV(ops); + if ( !prv->ticker_active ) + return; + + + spc = CSCHED_PCPU(cpu); set_timer(&spc->ticker, now + MILLISECS(CSCHED_MSECS_PER_TICK) - now % MILLISECS(CSCHED_MSECS_PER_TICK) ); + + if ( (prv->ticker_active == 2) && (prv->master == cpu) ) + { + set_timer( &prv->master_ticker, now + + MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT - + now % MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT); + prv->ticker_active = 1; + } } -const struct scheduler sched_credit_def = { +static struct csched_private _csched_priv; + +struct scheduler sched_credit_def = { .name = "SMP Credit Scheduler", .opt_name = "credit", .sched_id = XEN_SCHEDULER_CREDIT, + 
.sched_data = &_csched_priv, .init_domain = csched_dom_init, .destroy_domain = csched_dom_destroy, - .init_vcpu = csched_vcpu_init, + .insert_vcpu = csched_vcpu_insert, .destroy_vcpu = csched_vcpu_destroy, .sleep = csched_vcpu_sleep, @@ -1441,6 +1571,13 @@ const struct scheduler sched_credit_def .dump_cpu_state = csched_dump_pcpu, .dump_settings = csched_dump, .init = csched_init, + .deinit = csched_deinit, + .alloc_vdata = csched_alloc_vdata, + .free_vdata = csched_free_vdata, + .alloc_pdata = csched_alloc_pdata, + .free_pdata = csched_free_pdata, + .alloc_domdata = csched_alloc_domdata, + .free_domdata = csched_free_domdata, .tick_suspend = csched_tick_suspend, .tick_resume = csched_tick_resume, diff -r dbf0fd95180f xen/common/sched_credit2.c --- a/xen/common/sched_credit2.c Tue Apr 20 14:32:53 2010 +0100 +++ b/xen/common/sched_credit2.c Wed Apr 21 13:08:37 2010 +0200 @@ -149,12 +149,16 @@ /* * Useful macros */ +#define CSCHED_PRIV(_ops) \ + ((struct csched_private *)((_ops)->sched_data)) #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv) #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) +#define CSCHED_CPUONLINE(_pool) \ + (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid) /* CPU to runq_id macro */ -#define c2r(_cpu) (csched_priv.runq_map[(_cpu)]) +#define c2r(_ops, _cpu) (CSCHED_PRIV(_ops)->runq_map[(_cpu)]) /* CPU to runqueue struct macro */ -#define RQD(_cpu) (&csched_priv.rqd[c2r(_cpu)]) +#define RQD(_ops, _cpu) (&CSCHED_PRIV(_ops)->rqd[c2r(_ops, _cpu)]) /* * Per-runqueue data @@ -212,11 +216,6 @@ struct csched_dom { uint16_t nr_vcpus; }; - -/* - * Global variables - */ -static struct csched_private csched_priv; /* * Time-to-credit, credit-to-time. 
@@ -284,15 +283,15 @@ __runq_insert(struct list_head *runq, st } static void -runq_insert(unsigned int cpu, struct csched_vcpu *svc) +runq_insert(struct scheduler *ops, unsigned int cpu, struct csched_vcpu *svc) { - struct list_head * runq = &RQD(cpu)->runq; + struct list_head * runq = &RQD(ops, cpu)->runq; int pos = 0; ASSERT( spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock) ); BUG_ON( __vcpu_on_runq(svc) ); - BUG_ON( c2r(cpu) != c2r(svc->vcpu->processor) ); + BUG_ON( c2r(ops, cpu) != c2r(ops, svc->vcpu->processor) ); pos = __runq_insert(runq, svc); @@ -324,11 +323,12 @@ void burn_credits(struct csched_runqueue /* Check to see if the item on the runqueue is higher priority than what's * currently running; if so, wake up the processor */ static /*inline*/ void -runq_tickle(unsigned int cpu, struct csched_vcpu *new, s_time_t now) +runq_tickle(struct scheduler *ops, unsigned int cpu, struct csched_vcpu *new, s_time_t now) { int i, ipid=-1; s_time_t lowest=(1<<30); - struct csched_runqueue_data *rqd = RQD(cpu); + struct csched_runqueue_data *rqd = RQD(ops, cpu); + cpumask_t *online; d2printk("rqt d%dv%d cd%dv%d\n", new->vcpu->domain->domain_id, @@ -336,13 +336,14 @@ runq_tickle(unsigned int cpu, struct csc current->domain->domain_id, current->vcpu_id); + online = CSCHED_CPUONLINE(per_cpu(cpupool, cpu)); /* Find the cpu in this queue group that has the lowest credits */ for ( i=rqd->cpu_min ; i < rqd->cpu_max ; i++ ) { struct csched_vcpu * cur; /* Skip cpus that aren't online */ - if ( !cpu_online(i) ) + if ( !cpu_isset(i, *online) ) continue; cur = CSCHED_VCPU(per_cpu(schedule_data, i).curr); @@ -396,11 +397,11 @@ runq_tickle(unsigned int cpu, struct csc /* * Credit-related code */ -static void reset_credit(int cpu, s_time_t now) +static void reset_credit(struct scheduler *ops, int cpu, s_time_t now) { struct list_head *iter; - list_for_each( iter, &RQD(cpu)->svc ) + list_for_each( iter, &RQD(ops, cpu)->svc ) { struct csched_vcpu * svc = list_entry(iter, 
struct csched_vcpu, rqd_elem); @@ -521,64 +522,100 @@ __csched_vcpu_check(struct vcpu *vc) #define CSCHED_VCPU_CHECK(_vc) #endif -static int -csched_vcpu_init(struct vcpu *vc) +static void * +csched_alloc_vdata(struct scheduler *ops, struct vcpu *vc, void *dd) { - struct domain * const dom = vc->domain; - struct csched_dom *sdom = CSCHED_DOM(dom); struct csched_vcpu *svc; - - printk("%s: Initializing d%dv%d\n", - __func__, dom->domain_id, vc->vcpu_id); /* Allocate per-VCPU info */ svc = xmalloc(struct csched_vcpu); if ( svc == NULL ) - return -1; + return NULL; + memset(svc, 0, sizeof(*svc)); INIT_LIST_HEAD(&svc->rqd_elem); INIT_LIST_HEAD(&svc->sdom_elem); INIT_LIST_HEAD(&svc->runq_elem); - svc->sdom = sdom; + svc->sdom = dd; svc->vcpu = vc; svc->flags = 0U; - vc->sched_priv = svc; if ( ! is_idle_vcpu(vc) ) { - BUG_ON( sdom == NULL ); + BUG_ON( svc->sdom == NULL ); svc->credit = CSCHED_CREDIT_INIT; - svc->weight = sdom->weight; + svc->weight = svc->sdom->weight; + } + else + { + BUG_ON( svc->sdom != NULL ); + svc->credit = CSCHED_IDLE_CREDIT; + svc->weight = 0; + } + return svc; +} + +static void +csched_vcpu_insert(struct scheduler *ops, struct vcpu *vc) +{ + struct csched_vcpu *svc = vc->sched_priv; + struct domain * const dom = vc->domain; + struct csched_dom *sdom = CSCHED_DOM(dom); + + printk("%s: Inserting d%dv%d\n", + __func__, dom->domain_id, vc->vcpu_id); + + if ( ! is_idle_vcpu(vc) ) + { /* FIXME: Do we need the private lock here? 
*/ - list_add_tail(&svc->sdom_elem, &sdom->vcpu); + list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu); /* Add vcpu to runqueue of initial processor */ /* FIXME: Abstract for multiple runqueues */ vcpu_schedule_lock_irq(vc); - list_add_tail(&svc->rqd_elem, &RQD(vc->processor)->svc); - update_max_weight(RQD(vc->processor), svc->weight, 0); + list_add_tail(&svc->rqd_elem, &RQD(ops, vc->processor)->svc); + update_max_weight(RQD(ops, vc->processor), svc->weight, 0); vcpu_schedule_unlock_irq(vc); sdom->nr_vcpus++; } - else - { - BUG_ON( sdom != NULL ); - svc->credit = CSCHED_IDLE_CREDIT; - svc->weight = 0; - } CSCHED_VCPU_CHECK(vc); - return 0; } static void -csched_vcpu_destroy(struct vcpu *vc) +csched_free_vdata(struct scheduler *ops, void *priv) +{ + struct csched_vcpu *svc = priv; + struct vcpu *vc = svc->vcpu; + + if ( ! is_idle_vcpu(vc) ) + { + /* Remove from runqueue */ + vcpu_schedule_lock_irq(vc); + + list_del_init(&svc->rqd_elem); + update_max_weight(RQD(ops, vc->processor), 0, svc->weight); + + vcpu_schedule_unlock_irq(vc); + + /* Remove from sdom list. Don't need a lock for this, as it's called + * syncronously when nothing else can happen. */ + list_del_init(&svc->sdom_elem); + + svc->sdom->nr_vcpus--; + } + + xfree(svc); +} + +static void +csched_vcpu_destroy(struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); struct csched_dom * const sdom = svc->sdom; @@ -586,25 +623,11 @@ csched_vcpu_destroy(struct vcpu *vc) BUG_ON( sdom == NULL ); BUG_ON( !list_empty(&svc->runq_elem) ); - /* Remove from runqueue */ - vcpu_schedule_lock_irq(vc); - - list_del_init(&svc->rqd_elem); - update_max_weight(RQD(vc->processor), 0, svc->weight); - - vcpu_schedule_unlock_irq(vc); - - /* Remove from sdom list. Don't need a lock for this, as it's called - * syncronously when nothing else can happen. 
*/ - list_del_init(&svc->sdom_elem); - - sdom->nr_vcpus--; - - xfree(svc); + csched_free_vdata(ops, svc); } static void -csched_vcpu_sleep(struct vcpu *vc) +csched_vcpu_sleep(struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); @@ -617,7 +640,7 @@ csched_vcpu_sleep(struct vcpu *vc) } static void -csched_vcpu_wake(struct vcpu *vc) +csched_vcpu_wake(struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); const unsigned int cpu = vc->processor; @@ -654,8 +677,8 @@ csched_vcpu_wake(struct vcpu *vc) now = NOW(); /* Put the VCPU on the runq */ - runq_insert(cpu, svc); - runq_tickle(cpu, svc, now); + runq_insert(ops, cpu, svc); + runq_tickle(ops, cpu, svc, now); out: d2printk("w-\n"); @@ -663,7 +686,7 @@ out: } static void -csched_context_saved(struct vcpu *vc) +csched_context_saved(struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); @@ -688,15 +711,15 @@ csched_context_saved(struct vcpu *vc) BUG_ON(__vcpu_on_runq(svc)); - runq_insert(cpu, svc); - runq_tickle(cpu, svc, NOW()); + runq_insert(ops, cpu, svc); + runq_tickle(ops, cpu, svc, NOW()); } vcpu_schedule_unlock_irq(vc); } static int -csched_cpu_pick(struct vcpu *vc) +csched_cpu_pick(struct scheduler *ops, struct vcpu *vc) { /* FIXME: Chose a schedule group based on load */ /* FIXME: Migrate the vcpu to the new runqueue list, updating @@ -706,10 +729,12 @@ csched_cpu_pick(struct vcpu *vc) static int csched_dom_cntl( + struct scheduler *ops, struct domain *d, struct xen_domctl_scheduler_op *op) { struct csched_dom * const sdom = CSCHED_DOM(d); + struct csched_private *prv = CSCHED_PRIV(ops); unsigned long flags; if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo ) @@ -727,7 +752,7 @@ csched_dom_cntl( /* Must hold csched_priv lock to update sdom, runq lock to * update csvcs. 
*/ - spin_lock_irqsave(&csched_priv.lock, flags); + spin_lock_irqsave(&prv->lock, flags); old_weight = sdom->weight; @@ -744,32 +769,28 @@ csched_dom_cntl( vcpu_schedule_lock_irq(svc->vcpu); svc->weight = sdom->weight; - update_max_weight(RQD(svc->vcpu->processor), svc->weight, old_weight); + update_max_weight(RQD(ops, svc->vcpu->processor), svc->weight, old_weight); vcpu_schedule_unlock_irq(svc->vcpu); } - spin_unlock_irqrestore(&csched_priv.lock, flags); + spin_unlock_irqrestore(&prv->lock, flags); } } return 0; } -static int -csched_dom_init(struct domain *dom) +static void * +csched_alloc_domdata(struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom; int flags; - printk("%s: Initializing domain %d\n", __func__, dom->domain_id); - - if ( is_idle_domain(dom) ) - return 0; - sdom = xmalloc(struct csched_dom); if ( sdom == NULL ) - return -ENOMEM; + return NULL; + memset(sdom, 0, sizeof(*sdom)); /* Initialize credit and weight */ INIT_LIST_HEAD(&sdom->vcpu); @@ -778,40 +799,65 @@ csched_dom_init(struct domain *dom) sdom->weight = CSCHED_DEFAULT_WEIGHT; sdom->nr_vcpus = 0; + spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags); + + list_add_tail(&sdom->sdom_elem, &CSCHED_PRIV(ops)->sdom); + + spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags); + + return (void *)sdom; +} + +static int +csched_dom_init(struct scheduler *ops, struct domain *dom) +{ + struct csched_dom *sdom; + + printk("%s: Initializing domain %d\n", __func__, dom->domain_id); + + if ( is_idle_domain(dom) ) + return 0; + + sdom = csched_alloc_domdata(ops, dom); + if ( sdom == NULL ) + return -ENOMEM; + dom->sched_priv = sdom; - - spin_lock_irqsave(&csched_priv.lock, flags); - - list_add_tail(&sdom->sdom_elem, &csched_priv.sdom); - - spin_unlock_irqrestore(&csched_priv.lock, flags); return 0; } static void -csched_dom_destroy(struct domain *dom) +csched_free_domdata(struct scheduler *ops, void *data) +{ + int flags; + struct csched_dom *sdom = data; + + 
spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags); + + list_del_init(&sdom->sdom_elem); + + spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags); + + xfree(data); +} + +static void +csched_dom_destroy(struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom = CSCHED_DOM(dom); - int flags; BUG_ON(!list_empty(&sdom->vcpu)); - spin_lock_irqsave(&csched_priv.lock, flags); - - list_del_init(&sdom->sdom_elem); - - spin_unlock_irqrestore(&csched_priv.lock, flags); - - xfree(CSCHED_DOM(dom)); + csched_free_domdata(ops, CSCHED_DOM(dom)); } /* How long should we let this vcpu run for? */ static s_time_t -csched_runtime(int cpu, struct csched_vcpu *snext) +csched_runtime(struct scheduler *ops, int cpu, struct csched_vcpu *snext) { s_time_t time = CSCHED_MAX_TIMER; - struct csched_runqueue_data *rqd = RQD(cpu); + struct csched_runqueue_data *rqd = RQD(ops, cpu); struct list_head *runq = &rqd->runq; if ( is_idle_vcpu(snext->vcpu) ) @@ -851,10 +897,10 @@ void __dump_execstate(void *unused); * fast for the common case. */ static struct task_slice -csched_schedule(s_time_t now) +csched_schedule(struct scheduler *ops, s_time_t now) { const int cpu = smp_processor_id(); - struct csched_runqueue_data *rqd = RQD(cpu); + struct csched_runqueue_data *rqd = RQD(ops, cpu); struct list_head * const runq = &rqd->runq; struct csched_vcpu * const scurr = CSCHED_VCPU(current); struct csched_vcpu *snext = NULL; @@ -927,7 +973,7 @@ csched_schedule(s_time_t now) } if ( !is_idle_vcpu(snext->vcpu) && snext->credit <= CSCHED_CREDIT_RESET ) - reset_credit(cpu, now); + reset_credit(ops, cpu, now); #if 0 /* @@ -955,7 +1001,7 @@ csched_schedule(s_time_t now) /* * Return task to run next... 
*/ - ret.time = csched_runtime(cpu, snext); + ret.time = csched_runtime(ops, cpu, snext); ret.task = snext->vcpu; CSCHED_VCPU_CHECK(ret.task); @@ -977,7 +1023,7 @@ csched_dump_vcpu(struct csched_vcpu *svc } static void -csched_dump_pcpu(int cpu) +csched_dump_pcpu(struct scheduler *ops, int cpu) { struct list_head *runq, *iter; struct csched_vcpu *svc; @@ -986,7 +1032,7 @@ csched_dump_pcpu(int cpu) /* FIXME: Do locking properly for access to runqueue structures */ - runq = &RQD(cpu)->runq; + runq = &RQD(ops, cpu)->runq; cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_map,cpu)); printk(" sibling=%s, ", cpustr); @@ -1014,22 +1060,23 @@ csched_dump_pcpu(int cpu) } static void -csched_dump(void) +csched_dump(struct scheduler *ops) { struct list_head *iter_sdom, *iter_svc; + struct csched_private *prv = CSCHED_PRIV(ops); int loop; printk("info:\n" "\tncpus = %u\n" "\tdefault-weight = %d\n", - csched_priv.ncpus, + prv->ncpus, CSCHED_DEFAULT_WEIGHT); /* FIXME: Locking! */ printk("active vcpus:\n"); loop = 0; - list_for_each( iter_sdom, &csched_priv.sdom ) + list_for_each( iter_sdom, &prv->sdom ) { struct csched_dom *sdom; sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem); @@ -1046,42 +1093,51 @@ csched_dump(void) } static void -make_runq_map(void) +make_runq_map(struct scheduler *ops) { int cpu, cpu_count=0; + struct csched_private *prv = CSCHED_PRIV(ops); /* FIXME: Read pcpu layout and do this properly */ for_each_possible_cpu( cpu ) { - csched_priv.runq_map[cpu] = 0; + prv->runq_map[cpu] = 0; cpu_count++; } - csched_priv.runq_count = 1; + prv->runq_count = 1; /* Move to the init code...? 
*/ - csched_priv.rqd[0].cpu_min = 0; - csched_priv.rqd[0].cpu_max = cpu_count; + prv->rqd[0].cpu_min = 0; + prv->rqd[0].cpu_max = cpu_count; } -static void -csched_init(void) +static int +csched_init(struct scheduler *ops, int pool0) { int i; + struct csched_private *prv; printk("Initializing Credit2 scheduler\n" \ " WARNING: This is experimental software in development.\n" \ " Use at your own risk.\n"); - spin_lock_init(&csched_priv.lock); - INIT_LIST_HEAD(&csched_priv.sdom); + prv = xmalloc(struct csched_private); + if ( prv == NULL ) + return 1; + memset(prv, 0, sizeof(*prv)); + /* Publish prv before make_runq_map(): it and all CSCHED_PRIV(ops) users read ops->sched_data. */ + ops->sched_data = prv; - csched_priv.ncpus = 0; + spin_lock_init(&prv->lock); + INIT_LIST_HEAD(&prv->sdom); - make_runq_map(); + prv->ncpus = 0; - for ( i=0; i<csched_priv.runq_count ; i++ ) + make_runq_map(ops); + + for ( i=0; i<prv->runq_count ; i++ ) { - struct csched_runqueue_data *rqd = csched_priv.rqd + i; + struct csched_runqueue_data *rqd = prv->rqd + i; rqd->max_weight = 1; rqd->id = i; @@ -1096,24 +1152,40 @@ csched_init(void) spinlock_t *lock; /* Point the per-cpu schedule lock to the runq_id lock */ - runq_id = csched_priv.runq_map[i]; + runq_id = prv->runq_map[i]; lock = &per_cpu(schedule_data, runq_id)._lock; per_cpu(schedule_data, i).schedule_lock = lock; - csched_priv.ncpus++; + prv->ncpus++; } + + return 0; } + +static void +csched_deinit(struct scheduler *ops) +{ + struct csched_private *prv; + + prv = CSCHED_PRIV(ops); + if ( prv != NULL ) + xfree(prv); +} + + +static struct csched_private _csched_priv; struct scheduler sched_credit2_def = { .name = "SMP Credit Scheduler rev2", .opt_name = "credit2", .sched_id = XEN_SCHEDULER_CREDIT2, + .sched_data = &_csched_priv, .init_domain = csched_dom_init, .destroy_domain = csched_dom_destroy, - .init_vcpu = csched_vcpu_init, + .insert_vcpu = csched_vcpu_insert, .destroy_vcpu = csched_vcpu_destroy, .sleep = csched_vcpu_sleep, @@ -1128,4 +1200,9 @@ 
struct scheduler sched_credit2_def = { .dump_cpu_state = csched_dump_pcpu, .dump_settings = csched_dump, 
.init = csched_init, + .deinit = csched_deinit, + .alloc_vdata = csched_alloc_vdata, + .free_vdata = csched_free_vdata, + .alloc_domdata = csched_alloc_domdata, + .free_domdata = csched_free_domdata, }; diff -r dbf0fd95180f xen/common/sched_sedf.c --- a/xen/common/sched_sedf.c Tue Apr 20 14:32:53 2010 +0100 +++ b/xen/common/sched_sedf.c Wed Apr 21 13:08:37 2010 +0200 @@ -20,6 +20,9 @@ if ( (_f) <= SEDFLEVEL ) \ printk(_a ); \ } while ( 0 ) + +#define SEDF_CPUONLINE(_pool) \ + (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid) #ifndef NDEBUG #define SEDF_STATS @@ -132,7 +135,7 @@ struct sedf_cpu_info { #define sedf_runnable(edom) (!(EDOM_INFO(edom)->status & SEDF_ASLEEP)) -static void sedf_dump_cpu_state(int i); +static void sedf_dump_cpu_state(struct scheduler *ops, int i); static inline int extraq_on(struct vcpu *d, int i) { @@ -329,30 +332,17 @@ static inline void __add_to_runqueue_sor } -static int sedf_init_vcpu(struct vcpu *v) +static void *sedf_alloc_vdata(struct scheduler *ops, struct vcpu *v, void *dd) { struct sedf_vcpu_info *inf; - if ( (v->sched_priv = xmalloc(struct sedf_vcpu_info)) == NULL ) - return -1; - memset(v->sched_priv, 0, sizeof(struct sedf_vcpu_info)); + inf = xmalloc(struct sedf_vcpu_info); + if ( inf == NULL ) + return NULL; - inf = EDOM_INFO(v); + memset(inf, 0, sizeof(struct sedf_vcpu_info)); inf->vcpu = v; - - /* Allocate per-CPU context if this is the first domain to be added. */ - if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) ) - { - per_cpu(schedule_data, v->processor).sched_priv = - xmalloc(struct sedf_cpu_info); - BUG_ON(per_cpu(schedule_data, v->processor).sched_priv == NULL); - memset(CPU_INFO(v->processor), 0, sizeof(*CPU_INFO(v->processor))); - INIT_LIST_HEAD(WAITQ(v->processor)); - INIT_LIST_HEAD(RUNQ(v->processor)); - INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_PEN_Q)); - INIT_LIST_HEAD(EXTRAQ(v->processor,EXTRA_UTIL_Q)); - } - + /* Every VCPU gets an equal share of extratime by default. 
*/ inf->deadl_abs = 0; inf->latency = 0; @@ -383,39 +373,88 @@ static int sedf_init_vcpu(struct vcpu *v } else { - EDOM_INFO(v)->deadl_abs = 0; - EDOM_INFO(v)->status &= ~SEDF_ASLEEP; + inf->deadl_abs = 0; + inf->status &= ~SEDF_ASLEEP; } + + return inf; +} + +static void * +sedf_alloc_pdata(struct scheduler *ops, int cpu) +{ + struct sedf_cpu_info *spc; + + spc = xmalloc(struct sedf_cpu_info); + BUG_ON(spc == NULL); + memset(spc, 0, sizeof(*spc)); + INIT_LIST_HEAD(&spc->waitq); + INIT_LIST_HEAD(&spc->runnableq); + INIT_LIST_HEAD(&spc->extraq[EXTRA_PEN_Q]); + INIT_LIST_HEAD(&spc->extraq[EXTRA_UTIL_Q]); + + return (void *)spc; +} + +static void +sedf_free_pdata(struct scheduler *ops, void *spc, int cpu) +{ + if ( spc == NULL ) + return; + + xfree(spc); +} + +static void sedf_free_vdata(struct scheduler *ops, void *priv) +{ + xfree(priv); +} + +static void sedf_destroy_vcpu(struct scheduler *ops, struct vcpu *v) +{ + sedf_free_vdata(ops, v->sched_priv); +} + +static void * +sedf_alloc_domdata(struct scheduler *ops, struct domain *d) +{ + void *mem; + + mem = xmalloc(struct sedf_dom_info); + if ( mem == NULL ) + return NULL; + + memset(mem, 0, sizeof(struct sedf_dom_info)); + + return mem; +} + +static int sedf_init_domain(struct scheduler *ops, struct domain *d) +{ + d->sched_priv = sedf_alloc_domdata(ops, d); + if ( d->sched_priv == NULL ) + return -ENOMEM; return 0; } -static void sedf_destroy_vcpu(struct vcpu *v) +static void sedf_free_domdata(struct scheduler *ops, void *data) { - xfree(v->sched_priv); + xfree(data); } -static int sedf_init_domain(struct domain *d) +static void sedf_destroy_domain(struct scheduler *ops, struct domain *d) { - d->sched_priv = xmalloc(struct sedf_dom_info); - if ( d->sched_priv == NULL ) - return -ENOMEM; - - memset(d->sched_priv, 0, sizeof(struct sedf_dom_info)); - - return 0; + sedf_free_domdata(ops, d->sched_priv); } -static void sedf_destroy_domain(struct domain *d) -{ - xfree(d->sched_priv); -} - -static int 
sedf_pick_cpu(struct vcpu *v) +static int sedf_pick_cpu(struct scheduler *ops, struct vcpu *v) { cpumask_t online_affinity; + cpumask_t *online; - cpus_and(online_affinity, v->cpu_affinity, cpu_online_map); + online = SEDF_CPUONLINE(v->domain->cpupool); + cpus_and(online_affinity, v->cpu_affinity, *online); return first_cpu(online_affinity); } @@ -751,7 +790,7 @@ static struct task_slice sedf_do_extra_s -timeslice for the current period used up -domain on waitqueue has started it's period -and various others ;) in general: determine which domain to run next*/ -static struct task_slice sedf_do_schedule(s_time_t now) +static struct task_slice sedf_do_schedule(struct scheduler *ops, s_time_t now) { int cpu = smp_processor_id(); struct list_head *runq = RUNQ(cpu); @@ -786,6 +825,13 @@ static struct task_slice sedf_do_schedul } check_waitq: update_queues(now, runq, waitq); + + if ( unlikely(!cpu_isset(cpu, *SEDF_CPUONLINE(per_cpu(cpupool, cpu)))) ) + { + ret.task = IDLETASK(cpu); + ret.time = SECONDS(1); + goto sched_done; + } /*now simply pick the first domain from the runqueue, which has the earliest deadline, because the list is sorted*/ @@ -824,6 +870,7 @@ static struct task_slice sedf_do_schedul extraq, cpu); } + sched_done: /*TODO: Do something USEFUL when this happens and find out, why it still can happen!!!*/ if ( ret.time < 0) @@ -841,7 +888,7 @@ static struct task_slice sedf_do_schedul } -static void sedf_sleep(struct vcpu *d) +static void sedf_sleep(struct scheduler *ops, struct vcpu *d) { PRINT(2,"sedf_sleep was called, domain-id %i.%i\n", d->domain->domain_id, d->vcpu_id); @@ -1060,7 +1107,7 @@ static inline int should_switch(struct v return 1; } -static void sedf_wake(struct vcpu *d) +static void sedf_wake(struct scheduler *ops, struct vcpu *d) { s_time_t now = NOW(); struct sedf_vcpu_info* inf = EDOM_INFO(d); @@ -1213,8 +1260,8 @@ static void sedf_dump_domain(struct vcpu } -/* dumps all domains on hte specified cpu */ -static void sedf_dump_cpu_state(int 
i) +/* dumps all domains on the specified cpu */ +static void sedf_dump_cpu_state(struct scheduler *ops, int i) { struct list_head *list, *queue, *tmp; struct sedf_vcpu_info *d_inf; @@ -1287,7 +1334,7 @@ static void sedf_dump_cpu_state(int i) /* Adjusts periods and slices of the domains accordingly to their weights. */ -static int sedf_adjust_weights(struct xen_domctl_scheduler_op *cmd) +static int sedf_adjust_weights(struct cpupool *c, struct xen_domctl_scheduler_op *cmd) { struct vcpu *p; struct domain *d; @@ -1308,6 +1355,8 @@ static int sedf_adjust_weights(struct xe rcu_read_lock(&domlist_read_lock); for_each_domain( d ) { + if ( c != d->cpupool ) + continue; for_each_vcpu( d, p ) { if ( EDOM_INFO(p)->weight ) @@ -1359,7 +1408,7 @@ static int sedf_adjust_weights(struct xe /* set or fetch domain scheduling parameters */ -static int sedf_adjust(struct domain *p, struct xen_domctl_scheduler_op *op) +static int sedf_adjust(struct scheduler *ops, struct domain *p, struct xen_domctl_scheduler_op *op) { struct vcpu *v; int rc; @@ -1368,9 +1417,6 @@ static int sedf_adjust(struct domain *p, "new slice %"PRIu64"\nlatency %"PRIu64" extra:%s\n", p->domain_id, op->u.sedf.period, op->u.sedf.slice, op->u.sedf.latency, (op->u.sedf.extratime)?"yes":"no"); - - if ( !p->vcpu ) - return -EINVAL; if ( op->cmd == XEN_DOMCTL_SCHEDOP_putinfo ) { @@ -1421,7 +1467,7 @@ static int sedf_adjust(struct domain *p, } } - rc = sedf_adjust_weights(op); + rc = sedf_adjust_weights(p->cpupool, op); if ( rc ) return rc; @@ -1449,7 +1495,7 @@ static int sedf_adjust(struct domain *p, return 0; } -const struct scheduler sched_sedf_def = { +struct scheduler sched_sedf_def = { .name = "Simple EDF Scheduler", .opt_name = "sedf", .sched_id = XEN_SCHEDULER_SEDF, @@ -1457,8 +1503,14 @@ const struct scheduler sched_sedf_def = .init_domain = sedf_init_domain, .destroy_domain = sedf_destroy_domain, - .init_vcpu = sedf_init_vcpu, .destroy_vcpu = sedf_destroy_vcpu, + + .alloc_vdata = sedf_alloc_vdata, + 
.free_vdata = sedf_free_vdata, + .alloc_pdata = sedf_alloc_pdata, + .free_pdata = sedf_free_pdata, + .alloc_domdata = sedf_alloc_domdata, + .free_domdata = sedf_free_domdata, .do_schedule = sedf_do_schedule, .pick_cpu = sedf_pick_cpu, diff -r dbf0fd95180f xen/common/schedule.c --- a/xen/common/schedule.c Tue Apr 20 14:32:53 2010 +0100 +++ b/xen/common/schedule.c Wed Apr 21 13:08:37 2010 +0200 @@ -53,11 +53,12 @@ static void poll_timer_fn(void *data); /* This is global for now so that private implementations can reach it */ DEFINE_PER_CPU(struct schedule_data, schedule_data); +DEFINE_PER_CPU(struct scheduler *, scheduler); extern const struct scheduler sched_sedf_def; extern const struct scheduler sched_credit_def; extern const struct scheduler sched_credit2_def; -static const struct scheduler *__initdata schedulers[] = { +static const struct scheduler *schedulers[] = { &sched_sedf_def, &sched_credit_def, &sched_credit2_def, @@ -66,9 +67,15 @@ static const struct scheduler *__initdat static struct scheduler __read_mostly ops; -#define SCHED_OP(fn, ...) \ - (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ ) \ - : (typeof(ops.fn(__VA_ARGS__)))0 ) +#define SCHED_OP(opsptr, fn, ...) \ + (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ ) \ + : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 ) + +#define DOM2OP(_d) (((_d)->cpupool == NULL) ? &ops : &((_d)->cpupool->sched)) +#define VCPU2OP(_v) (DOM2OP((_v)->domain)) +#define VCPU2ONLINE(_v) \ + (((_v)->domain->cpupool == NULL) ? 
&cpu_online_map \ + : &(_v)->domain->cpupool->cpu_valid) static inline void trace_runstate_change(struct vcpu *v, int new_state) { @@ -209,7 +216,86 @@ int sched_init_vcpu(struct vcpu *v, unsi TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id); - return SCHED_OP(init_vcpu, v); + if ( unlikely(per_cpu(schedule_data, v->processor).sched_priv == NULL) ) + { + per_cpu(schedule_data, v->processor).sched_priv = + SCHED_OP(DOM2OP(d), alloc_pdata, processor); + if ( per_cpu(schedule_data, v->processor).sched_priv == NULL ) + return 1; + } + + v->sched_priv = SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv); + if ( v->sched_priv == NULL ) + return 1; + + if ( is_idle_domain(d) ) + per_cpu(schedule_data, v->processor).sched_idlevpriv = v->sched_priv; + + return 0; +} + +int sched_move_domain(struct domain *d, struct cpupool *c) +{ + struct vcpu *v; + unsigned int new_p; + void **vcpu_priv; + void *domdata; + + domdata = SCHED_OP(&(c->sched), alloc_domdata, d); + if ( domdata == NULL ) + return -ENOMEM; + + vcpu_priv = xmalloc_array(void *, d->max_vcpus); + if ( vcpu_priv == NULL ) + { + SCHED_OP(&(c->sched), free_domdata, domdata); + return -ENOMEM; + } + + memset(vcpu_priv, 0, d->max_vcpus * sizeof(void *)); + for_each_vcpu ( d, v ) + { + vcpu_priv[v->vcpu_id] = SCHED_OP(&(c->sched), alloc_vdata, v, domdata); + if ( vcpu_priv[v->vcpu_id] == NULL ) + { + for_each_vcpu ( d, v ) + { + if ( vcpu_priv[v->vcpu_id] != NULL ) + xfree(vcpu_priv[v->vcpu_id]); + } + xfree(vcpu_priv); + SCHED_OP(&(c->sched), free_domdata, domdata); + return -ENOMEM; + } + } + + domain_pause(d); + + new_p = first_cpu(c->cpu_valid); + for_each_vcpu ( d, v ) + { + migrate_timer(&v->periodic_timer, new_p); + migrate_timer(&v->singleshot_timer, new_p); + migrate_timer(&v->poll_timer, new_p); + + SCHED_OP(VCPU2OP(v), destroy_vcpu, v); + + cpus_setall(v->cpu_affinity); + v->processor = new_p; + v->sched_priv = vcpu_priv[v->vcpu_id]; + + new_p = cycle_cpu(new_p, c->cpu_valid); + } + + d->cpupool = 
c; + SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv); + d->sched_priv = domdata; + + domain_unpause(d); + + xfree(vcpu_priv); + + return 0; } void sched_destroy_vcpu(struct vcpu *v) @@ -219,17 +305,17 @@ void sched_destroy_vcpu(struct vcpu *v) kill_timer(&v->poll_timer); if ( test_and_clear_bool(v->is_urgent) ) atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count); - SCHED_OP(destroy_vcpu, v); + SCHED_OP(VCPU2OP(v), destroy_vcpu, v); } int sched_init_domain(struct domain *d) { - return SCHED_OP(init_domain, d); + return SCHED_OP(DOM2OP(d), init_domain, d); } void sched_destroy_domain(struct domain *d) { - SCHED_OP(destroy_domain, d); + SCHED_OP(DOM2OP(d), destroy_domain, d); } void vcpu_sleep_nosync(struct vcpu *v) @@ -243,7 +329,7 @@ void vcpu_sleep_nosync(struct vcpu *v) if ( v->runstate.state == RUNSTATE_runnable ) vcpu_runstate_change(v, RUNSTATE_offline, NOW()); - SCHED_OP(sleep, v); + SCHED_OP(VCPU2OP(v), sleep, v); } vcpu_schedule_unlock_irqrestore(v, flags); @@ -271,7 +357,7 @@ void vcpu_wake(struct vcpu *v) { if ( v->runstate.state >= RUNSTATE_blocked ) vcpu_runstate_change(v, RUNSTATE_runnable, NOW()); - SCHED_OP(wake, v); + SCHED_OP(VCPU2OP(v), wake, v); } else if ( !test_bit(_VPF_blocked, &v->pause_flags) ) { @@ -326,7 +412,7 @@ static void vcpu_migrate(struct vcpu *v) /* Select new CPU. */ old_cpu = v->processor; - new_cpu = SCHED_OP(pick_cpu, v); + new_cpu = SCHED_OP(VCPU2OP(v), pick_cpu, v); /* * Transfer urgency status to new CPU before switching CPUs, as once @@ -369,19 +455,29 @@ void vcpu_force_reschedule(struct vcpu * } /* - * This function is used by cpu_hotplug code from stop_machine context. - * Hence we can avoid needing to take certain locks. + * This function is used by cpu_hotplug code from stop_machine context + * and from cpupools to switch schedulers on a cpu. 
*/ -void cpu_disable_scheduler(void) +int cpu_disable_scheduler(unsigned int cpu, int lock) { struct domain *d; struct vcpu *v; - unsigned int cpu = smp_processor_id(); + struct cpupool *c; + int ret = 0; + + c = per_cpu(cpupool, cpu); + if ( c == NULL ) + return ret; for_each_domain ( d ) { + if ( d->cpupool != c ) + continue; + for_each_vcpu ( d, v ) { + if ( lock != 0 ) + vcpu_schedule_lock_irq(v); if ( (cpus_weight(v->cpu_affinity) == 1) && cpu_isset(cpu, v->cpu_affinity) ) { @@ -395,26 +491,46 @@ void cpu_disable_scheduler(void) * be chosen when the timer is next re-set. */ if ( v->singleshot_timer.cpu == cpu ) - migrate_timer(&v->singleshot_timer, 0); + { + int cpu_mig; + + cpu_mig = first_cpu(c->cpu_valid); + if (cpu_mig == cpu) + cpu_mig = next_cpu(cpu_mig, c->cpu_valid); + migrate_timer(&v->singleshot_timer, cpu_mig); + } if ( v->processor == cpu ) { set_bit(_VPF_migrating, &v->pause_flags); + if ( lock != 0 ) + vcpu_schedule_unlock_irq(v); vcpu_sleep_nosync(v); vcpu_migrate(v); } + else if ( lock != 0 ) + vcpu_schedule_unlock_irq(v); + /* + * A vcpu active in the hypervisor will not be migratable. + * The caller should try again after releasing and reacquiring + * all locks. 
+ */ + if ( v->processor == cpu ) + ret = -EAGAIN; } } + return ret; } int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity) { cpumask_t online_affinity, old_affinity; + cpumask_t *online; if ( v->domain->is_pinned ) return -EINVAL; - - cpus_and(online_affinity, *affinity, cpu_online_map); + online = VCPU2ONLINE(v); + cpus_and(online_affinity, *affinity, *online); if ( cpus_empty(online_affinity) ) return -EINVAL; @@ -723,7 +839,7 @@ long sched_adjust(struct domain *d, stru struct vcpu *v; long ret; - if ( (op->sched_id != ops.sched_id) || + if ( (op->sched_id != DOM2OP(d)->sched_id) || ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) && (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) ) return -EINVAL; @@ -750,7 +866,7 @@ long sched_adjust(struct domain *d, stru if ( d == current->domain ) vcpu_schedule_lock_irq(current); - if ( (ret = SCHED_OP(adjust, d, op)) == 0 ) + if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 ) TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id); if ( d == current->domain ) @@ -797,6 +913,7 @@ static void schedule(void) { struct vcpu *prev = current, *next = NULL; s_time_t now = NOW(); + struct scheduler *sched = this_cpu(scheduler); struct schedule_data *sd; struct task_slice next_slice; @@ -812,7 +929,7 @@ static void schedule(void) stop_timer(&sd->s_timer); /* get policy-specific decision on scheduling... */ - next_slice = ops.do_schedule(now); + next_slice = sched->do_schedule(sched, now); next = next_slice.task; @@ -871,6 +988,10 @@ static void schedule(void) update_vcpu_system_time(next); vcpu_periodic_timer_work(next); + TRACE_4D(TRC_SCHED_SWITCH, + prev->domain->domain_id, prev->vcpu_id, + next->domain->domain_id, next->vcpu_id); + context_switch(prev, next); } @@ -884,7 +1005,7 @@ void context_saved(struct vcpu *prev) /* Check for migration request /after/ clearing running flag. 
*/ smp_mb(); - SCHED_OP(context_saved, prev); + SCHED_OP(VCPU2OP(prev), context_saved, prev); if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) ) vcpu_migrate(prev); @@ -920,20 +1041,25 @@ static void poll_timer_fn(void *data) vcpu_unblock(v); } +/* Get scheduler by id */ +const struct scheduler *scheduler_get_by_id(unsigned int id) +{ + int i; + + for ( i = 0; schedulers[i] != NULL; i++ ) + { + if ( schedulers[i]->sched_id == id ) + return schedulers[i]; + } + return NULL; +} + /* Initialise the data structures. */ void __init scheduler_init(void) { int i; open_softirq(SCHEDULE_SOFTIRQ, schedule); - - for_each_possible_cpu ( i ) - { - spin_lock_init(&per_cpu(schedule_data, i)._lock); - per_cpu(schedule_data, i).schedule_lock - = &per_cpu(schedule_data, i)._lock; - init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i); - } for ( i = 0; schedulers[i] != NULL; i++ ) { @@ -948,43 +1074,125 @@ void __init scheduler_init(void) ops = *schedulers[0]; } + for_each_possible_cpu ( i ) + { + per_cpu(scheduler, i) = &ops; + spin_lock_init(&per_cpu(schedule_data, i)._lock); + per_cpu(schedule_data, i).schedule_lock + = &per_cpu(schedule_data, i)._lock; + init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i); + } + printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name); - SCHED_OP(init); + if ( SCHED_OP(&ops, init, 1) ) + panic("scheduler returned error on init\n"); } -void dump_runq(unsigned char key) +/* switch scheduler on cpu */ +void schedule_cpu_switch(unsigned int cpu, struct cpupool *c) { - s_time_t now = NOW(); - int i; unsigned long flags; + struct vcpu *v; + void *vpriv = NULL; + void *ppriv; + void *ppriv_old; + struct scheduler *old_ops; + struct scheduler *new_ops; - local_irq_save(flags); + old_ops = per_cpu(scheduler, cpu); + new_ops = (c == NULL) ? 
&ops : &(c->sched); + v = per_cpu(schedule_data, cpu).idle; + ppriv = SCHED_OP(new_ops, alloc_pdata, cpu); + if ( c != NULL ) + vpriv = SCHED_OP(new_ops, alloc_vdata, v, v->domain->sched_priv); - printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name); - SCHED_OP(dump_settings); - printk("sched_smt_power_savings: %s\n", - sched_smt_power_savings? "enabled":"disabled"); - printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now); + spin_lock_irqsave(per_cpu(schedule_data, cpu).schedule_lock, flags); - for_each_online_cpu ( i ) + if ( c == NULL ) + { + vpriv = v->sched_priv; + v->sched_priv = per_cpu(schedule_data, cpu).sched_idlevpriv; + } + else + { + v->sched_priv = vpriv; + vpriv = NULL; + } + SCHED_OP(old_ops, tick_suspend, cpu); + per_cpu(scheduler, cpu) = new_ops; + ppriv_old = per_cpu(schedule_data, cpu).sched_priv; + per_cpu(schedule_data, cpu).sched_priv = ppriv; + SCHED_OP(new_ops, tick_resume, cpu); + SCHED_OP(new_ops, insert_vcpu, v); + + spin_unlock_irqrestore(per_cpu(schedule_data, cpu).schedule_lock, flags); + + if ( vpriv != NULL ) + SCHED_OP(old_ops, free_vdata, vpriv); + SCHED_OP(old_ops, free_pdata, ppriv_old, cpu); +} + +/* init scheduler global data */ +int schedule_init_global(char *name, struct scheduler *sched) +{ + int i; + const struct scheduler *data; + + data = &ops; + for ( i = 0; (schedulers[i] != NULL) && (name != NULL) ; i++ ) + { + if ( strcmp(schedulers[i]->opt_name, name) == 0 ) + { + data = schedulers[i]; + break; + } + } + memcpy(sched, data, sizeof(*sched)); + return SCHED_OP(sched, init, 0); +} + +/* deinitialize scheduler global data */ +void schedule_deinit_global(struct scheduler *sched) +{ + SCHED_OP(sched, deinit); +} + +void schedule_dump(struct cpupool *c) +{ + int i; + struct scheduler *sched; + cpumask_t *cpus; + + sched = (c == NULL) ? &ops : &(c->sched); + cpus = (c == NULL) ? 
&cpupool_free_cpus : &c->cpu_valid; + printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name); + SCHED_OP(sched, dump_settings); + + for_each_cpu_mask (i, *cpus) { spin_lock(per_cpu(schedule_data, i).schedule_lock); printk("CPU[%02d] ", i); - SCHED_OP(dump_cpu_state, i); + SCHED_OP(sched, dump_cpu_state, i); spin_unlock(per_cpu(schedule_data, i).schedule_lock); } - - local_irq_restore(flags); } void sched_tick_suspend(void) { - SCHED_OP(tick_suspend); + struct scheduler *sched; + unsigned int cpu = smp_processor_id(); + + sched = per_cpu(scheduler, cpu); + SCHED_OP(sched, tick_suspend, cpu); } void sched_tick_resume(void) { - SCHED_OP(tick_resume); + struct scheduler *sched; + unsigned int cpu = smp_processor_id(); + + sched = per_cpu(scheduler, cpu); + SCHED_OP(sched, tick_resume, cpu); } #ifdef CONFIG_COMPAT diff -r dbf0fd95180f xen/include/public/domctl.h --- a/xen/include/public/domctl.h Tue Apr 20 14:32:53 2010 +0100 +++ b/xen/include/public/domctl.h Wed Apr 21 13:08:37 2010 +0200 @@ -35,7 +35,7 @@ #include "xen.h" #include "grant_table.h" -#define XEN_DOMCTL_INTERFACE_VERSION 0x00000006 +#define XEN_DOMCTL_INTERFACE_VERSION 0x00000007 struct xenctl_cpumap { XEN_GUEST_HANDLE_64(uint8) bitmap; @@ -60,10 +60,10 @@ struct xen_domctl_createdomain { /* Should domain memory integrity be verifed by tboot during Sx? */ #define _XEN_DOMCTL_CDF_s3_integrity 2 #define XEN_DOMCTL_CDF_s3_integrity (1U<<_XEN_DOMCTL_CDF_s3_integrity) - uint32_t flags; /* Disable out-of-sync shadow page tables? */ #define _XEN_DOMCTL_CDF_oos_off 3 #define XEN_DOMCTL_CDF_oos_off (1U<<_XEN_DOMCTL_CDF_oos_off) + uint32_t flags; }; typedef struct xen_domctl_createdomain xen_domctl_createdomain_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t); @@ -106,6 +106,7 @@ struct xen_domctl_getdomaininfo { uint32_t max_vcpu_id; /* Maximum VCPUID in use by this domain. 
*/ uint32_t ssidref; xen_domain_handle_t handle; + uint32_t cpupool; }; typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t); @@ -785,6 +786,30 @@ typedef struct xen_domctl_mem_sharing_op typedef struct xen_domctl_mem_sharing_op xen_domctl_mem_sharing_op_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_sharing_op_t); +/* + * cpupool operations + */ +/* XEN_DOMCTL_cpupool_op */ +#define XEN_DOMCTL_CPUPOOL_OP_CREATE 1 /* C */ +#define XEN_DOMCTL_CPUPOOL_OP_DESTROY 2 /* D */ +#define XEN_DOMCTL_CPUPOOL_OP_INFO 3 /* I */ +#define XEN_DOMCTL_CPUPOOL_OP_ADDCPU 4 /* A */ +#define XEN_DOMCTL_CPUPOOL_OP_RMCPU 5 /* R */ +#define XEN_DOMCTL_CPUPOOL_OP_MOVEDOMAIN 6 /* M */ +#define XEN_DOMCTL_CPUPOOL_OP_FREEINFO 7 /* F */ +#define XEN_DOMCTL_CPUPOOL_PAR_ANY 0xFFFFFFFF +struct xen_domctl_cpupool_op { + uint32_t op; /* IN */ + uint32_t cpupool_id; /* IN: CDIARM OUT: CI */ + uint32_t sched_id; /* IN: C OUT: I */ + uint32_t domid; /* IN: M */ + uint32_t cpu; /* IN: AR */ + uint32_t n_dom; /* OUT: I */ + struct xenctl_cpumap cpumap; /* OUT: IF */ +}; +typedef struct xen_domctl_cpupool_op xen_domctl_cpupool_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_cpupool_op_t); + struct xen_domctl { uint32_t cmd; @@ -846,6 +871,7 @@ struct xen_domctl { #define XEN_DOMCTL_gettscinfo 59 #define XEN_DOMCTL_settscinfo 60 #define XEN_DOMCTL_getpageframeinfo3 61 +#define XEN_DOMCTL_cpupool_op 62 #define XEN_DOMCTL_gdbsx_guestmemio 1000 #define XEN_DOMCTL_gdbsx_pausevcpu 1001 #define XEN_DOMCTL_gdbsx_unpausevcpu 1002 @@ -894,6 +920,7 @@ struct xen_domctl { struct xen_domctl_debug_op debug_op; struct xen_domctl_mem_event_op mem_event_op; struct xen_domctl_mem_sharing_op mem_sharing_op; + struct xen_domctl_cpupool_op cpupool_op; #if defined(__i386__) || defined(__x86_64__) struct xen_domctl_cpuid cpuid; #endif diff -r dbf0fd95180f xen/include/xen/sched-if.h --- a/xen/include/xen/sched-if.h Tue Apr 20 14:32:53 2010 +0100 +++ 
b/xen/include/xen/sched-if.h Wed Apr 21 13:08:37 2010 +0200 @@ -9,6 +9,12 @@ #define __XEN_SCHED_IF_H__ #include <xen/percpu.h> + +/* A global pointer to the initial cpupool (POOL0). */ +extern struct cpupool *cpupool0; + +/* cpus currently in no cpupool */ +extern cpumask_t cpupool_free_cpus; /* * In order to allow a scheduler to remap the lock->cpu mapping, @@ -26,11 +32,14 @@ struct schedule_data { struct vcpu *curr; /* current task */ struct vcpu *idle; /* idle task for this cpu */ void *sched_priv; + void *sched_idlevpriv; /* default scheduler vcpu data */ struct timer s_timer; /* scheduling timer */ atomic_t urgent_count; /* how many urgent vcpus */ } __cacheline_aligned; DECLARE_PER_CPU(struct schedule_data, schedule_data); +DECLARE_PER_CPU(struct scheduler *, scheduler); +DECLARE_PER_CPU(struct cpupool *, cpupool); static inline void vcpu_schedule_lock(struct vcpu *v) { @@ -78,29 +87,50 @@ struct scheduler { char *name; /* full name for this scheduler */ char *opt_name; /* option name for this scheduler */ unsigned int sched_id; /* ID for this scheduler */ + void *sched_data; /* global data pointer */ - void (*init) (void); + int (*init) (struct scheduler *, int); + void (*deinit) (struct scheduler *); - int (*init_domain) (struct domain *); - void (*destroy_domain) (struct domain *); + void (*free_vdata) (struct scheduler *, void *); + void * (*alloc_vdata) (struct scheduler *, struct vcpu *, + void *); + void (*free_pdata) (struct scheduler *, void *, int); + void * (*alloc_pdata) (struct scheduler *, int); + void (*free_domdata) (struct scheduler *, void *); + void * (*alloc_domdata) (struct scheduler *, struct domain *); - int (*init_vcpu) (struct vcpu *); - void (*destroy_vcpu) (struct vcpu *); + int (*init_domain) (struct scheduler *, struct domain *); + void (*destroy_domain) (struct scheduler *, struct domain *); - void (*sleep) (struct vcpu *); - void (*wake) (struct vcpu *); - void (*context_saved) (struct vcpu *); + void (*insert_vcpu) (struct 
scheduler *, struct vcpu *); + void (*destroy_vcpu) (struct scheduler *, struct vcpu *); - struct task_slice (*do_schedule) (s_time_t); + void (*sleep) (struct scheduler *, struct vcpu *); + void (*wake) (struct scheduler *, struct vcpu *); + void (*context_saved) (struct scheduler *, struct vcpu *); - int (*pick_cpu) (struct vcpu *); - int (*adjust) (struct domain *, + struct task_slice (*do_schedule) (struct scheduler *, s_time_t); + + int (*pick_cpu) (struct scheduler *, struct vcpu *); + int (*adjust) (struct scheduler *, struct domain *, struct xen_domctl_scheduler_op *); - void (*dump_settings) (void); - void (*dump_cpu_state) (int); + void (*dump_settings) (struct scheduler *); + void (*dump_cpu_state) (struct scheduler *, int); - void (*tick_suspend) (void); - void (*tick_resume) (void); + void (*tick_suspend) (struct scheduler *, unsigned int); + void (*tick_resume) (struct scheduler *, unsigned int); }; +struct cpupool +{ + int cpupool_id; + cpumask_t cpu_valid; /* all cpus assigned to pool */ + struct cpupool *next; + unsigned int n_dom; + struct scheduler sched; +}; + +const struct scheduler *scheduler_get_by_id(unsigned int id); + #endif /* __XEN_SCHED_IF_H__ */ diff -r dbf0fd95180f xen/include/xen/sched.h --- a/xen/include/xen/sched.h Tue Apr 20 14:32:53 2010 +0100 +++ b/xen/include/xen/sched.h Wed Apr 21 13:08:37 2010 +0200 @@ -213,6 +213,7 @@ struct domain /* Scheduling. 
*/ void *sched_priv; /* scheduler-specific data */ + struct cpupool *cpupool; struct domain *next_in_list; struct domain *next_in_hashbucket; @@ -465,6 +466,7 @@ void sched_destroy_vcpu(struct vcpu *v); void sched_destroy_vcpu(struct vcpu *v); int sched_init_domain(struct domain *d); void sched_destroy_domain(struct domain *d); +int sched_move_domain(struct domain *d, struct cpupool *c); long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *); int sched_id(void); void sched_tick_suspend(void); @@ -575,8 +577,13 @@ void domain_unpause_by_systemcontroller( void domain_unpause_by_systemcontroller(struct domain *d); void cpu_init(void); +struct scheduler; + +int schedule_init_global(char *name, struct scheduler *sched); +void schedule_deinit_global(struct scheduler *sched); +void schedule_cpu_switch(unsigned int cpu, struct cpupool *c); void vcpu_force_reschedule(struct vcpu *v); -void cpu_disable_scheduler(void); +int cpu_disable_scheduler(unsigned int cpu, int lock); int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity); void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate); @@ -607,6 +614,19 @@ extern enum cpufreq_controller { FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen } cpufreq_controller; +#define CPUPOOLID_NONE -1 + +struct cpupool *cpupool_create(int poolid, char *sched); +int cpupool_destroy(struct cpupool *c); +int cpupool0_cpu_assign(struct cpupool *c); +int cpupool_assign_ncpu(struct cpupool *c, int ncpu); +void cpupool_cpu_add(unsigned int cpu); +int cpupool_cpu_remove(unsigned int cpu); +int cpupool_add_domain(struct domain *d, int poolid); +void cpupool_rm_domain(struct domain *d); +int cpupool_do_domctl(struct xen_domctl_cpupool_op *op); +#define num_cpupool_cpus(c) (cpus_weight((c)->cpu_valid)) + #endif /* __SCHED_H__ */ /*

_______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.