[Xen-devel] [PATCH RFC v1 1/4] rt: Add rt scheduler to hypervisor
This is the core rt scheduler patch. It adds the new real-time scheduler to
the hypervisor, as a non-default scheduler.

This scheduler follows the preemptive Global EDF theory from the real-time
field. Each VCPU can have a dedicated period and budget. While scheduled, a
VCPU burns its budget. A VCPU has its budget replenished at the beginning of
each of its periods, and discards its unused budget at the end of each of
its periods. If a VCPU runs out of budget in a period, it has to wait until
the next period. How a VCPU's budget is burned depends on the server
mechanism implemented for each VCPU.

Server mechanism: a VCPU is implemented as a deferrable server. When a VCPU
has a task running on it, its budget is continuously burned; when a VCPU has
no task but still has budget left, its budget is preserved.

Priority scheme: preemptive Global Earliest Deadline First (gEDF). At any
scheduling point, the VCPU with the earliest deadline has the highest
priority.

Queue scheme: a global runqueue for each CPU pool. The runqueue holds all
runnable VCPUs. VCPUs in the runqueue are divided into two parts: with and
without remaining budget. Within each part, VCPUs are sorted by EDF
priority.

Scheduling quantum: 1 ms; budget accounting is done at microsecond
granularity.

Note: cpumask and cpupool are supported.

This is still in the development phase.
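To make the budget/deadline bookkeeping above concrete, here is a minimal
user-space sketch of the per-period replenishment and the gEDF comparison.
It is only an illustration, not the hypervisor code below: all names
(demo_vcpu, replenish, edf_higher_prio) are invented for this sketch, and
times are assumed to be in microseconds.

/* Minimal sketch of per-period replenishment and EDF comparison.
 * Illustration only; not the patch's code. Times in microseconds. */
#include <stdio.h>
#include <stdint.h>

struct demo_vcpu {
    int64_t period;       /* replenishment period */
    int64_t budget;       /* full budget per period */
    int64_t cur_budget;   /* budget left in the current period */
    int64_t cur_deadline; /* end of the current period */
};

/* Jump the deadline forward past 'now' by whole periods and refill the
 * budget, mirroring "replenish at the start of each period, discard
 * whatever was left over". */
static void replenish(struct demo_vcpu *v, int64_t now)
{
    if ( now < v->cur_deadline )
        return; /* still inside the current period */
    int64_t missed = (now - v->cur_deadline) / v->period + 1;
    v->cur_deadline += missed * v->period;
    v->cur_budget = v->budget;
}

/* gEDF priority: the earlier current deadline wins. */
static int edf_higher_prio(const struct demo_vcpu *a,
                           const struct demo_vcpu *b)
{
    return a->cur_deadline <= b->cur_deadline;
}

int main(void)
{
    struct demo_vcpu a = { 10000, 4000, 0, 10000 };
    struct demo_vcpu b = { 20000, 8000, 8000, 15000 };

    replenish(&a, 37500);  /* now = 37.5ms: deadline jumps to 40ms */
    printf("a: deadline=%lld budget=%lld\n",
           (long long)a.cur_deadline, (long long)a.cur_budget);
    printf("a before b? %d\n", edf_higher_prio(&a, &b));
    return 0;
}

Running this prints the new deadline (40000) and the refilled budget for
vcpu a: a scheduling point past the old deadline advances the deadline by
whole periods, which is the same computation the patch performs in
burn_budgets() and rt_vcpu_wake().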
Signed-off-by: Sisu Xi <xisisu@xxxxxxxxx>
Signed-off-by: Meng Xu <mengxu@xxxxxxxxxxxxx>
---
 xen/common/Makefile         |    1 +
 xen/common/sched_rt.c       |  984 +++++++++++++++++++++++++++++++++++++++++++
 xen/common/schedule.c       |    1 +
 xen/include/public/domctl.h |   19 +
 xen/include/xen/sched-if.h  |    2 +-
 5 files changed, 1006 insertions(+), 1 deletion(-)
 create mode 100644 xen/common/sched_rt.c

diff --git a/xen/common/Makefile b/xen/common/Makefile
index 3683ae3..5a23aa4 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -26,6 +26,7 @@ obj-y += sched_credit.o
 obj-y += sched_credit2.o
 obj-y += sched_sedf.o
 obj-y += sched_arinc653.o
+obj-y += sched_rt.o
 obj-y += schedule.o
 obj-y += shutdown.o
 obj-y += softirq.o
diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
new file mode 100644
index 0000000..41543a2
--- /dev/null
+++ b/xen/common/sched_rt.c
@@ -0,0 +1,984 @@
+/******************************************************************************
+ * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen
+ * EDF scheduling is one of the most popular real-time scheduling algorithms
+ * used in the embedded field.
+ *
+ * by Sisu Xi, 2013, Washington University in Saint Louis
+ * and Meng Xu, 2014, University of Pennsylvania
+ *
+ * based on the code of the credit scheduler
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <asm/atomic.h>
+#include <xen/errno.h>
+#include <xen/trace.h>
+#include <xen/cpu.h>
+#include <xen/keyhandler.h>
+#include <xen/guest_access.h>
+
+/*
+ * TODO:
+ *
+ * Migration compensation and resistance, like credit2, to make better use
+ * of the cache;
+ * Lock holder problem: use yield?
+ * Self-switch problem: VCPUs of the same domain may preempt each other;
+ */
+
+/*
+ * Design:
+ *
+ * This scheduler follows the preemptive Global EDF theory from the
+ * real-time field.
+ * Each VCPU can have a dedicated period and budget.
+ * While scheduled, a VCPU burns its budget.
+ * A VCPU has its budget replenished at the beginning of each of its periods;
+ * the VCPU discards its unused budget at the end of each of its periods.
+ * If a VCPU runs out of budget in a period, it has to wait until the next
+ * period.
+ * How a VCPU's budget is burned depends on the server mechanism implemented
+ * for each VCPU.
+ *
+ * Server mechanism: a VCPU is implemented as a deferrable server.
+ * When a VCPU has a task running on it, its budget is continuously burned;
+ * when a VCPU has no task but still has budget left, its budget is
+ * preserved.
+ *
+ * Priority scheme: preemptive Global Earliest Deadline First (gEDF).
+ * At any scheduling point, the VCPU with the earliest deadline has the
+ * highest priority.
+ *
+ * Queue scheme: a global runqueue for each CPU pool.
+ * The runqueue holds all runnable VCPUs.
+ * VCPUs in the runqueue are divided into two parts: with and without
+ * remaining budget. Within each part, VCPUs are sorted by EDF priority.
+ *
+ * Scheduling quantum: 1 ms; budget is accounted at microsecond granularity.
+ *
+ * Note: cpumask and cpupool are supported.
+ */
+
+/*
+ * Locking:
+ * Just like credit2, a global system lock is used to protect the RunQ.
+ * The global lock is referenced by schedule_data.schedule_lock from all
+ * physical cpus.
+ *
+ * The lock is already grabbed when calling the wake/sleep/schedule functions
+ * in schedule.c.
+ *
+ * The functions that involve the RunQ and need to grab the lock are:
+ * dump, vcpu_insert, vcpu_remove, context_saved
+ */
+
+
+/*
+ * Default parameters in ms
+ */
+#define RT_DEFAULT_PERIOD 10
+#define RT_DEFAULT_BUDGET 4
+
+/*
+ * Useful macros
+ */
+#define RT_PRIV(_ops)   ((struct rt_private *)((_ops)->sched_data))
+#define RT_VCPU(_vcpu)  ((struct rt_vcpu *)(_vcpu)->sched_priv)
+#define RT_DOM(_dom)    ((struct rt_dom *)(_dom)->sched_priv)
+#define RUNQ(_ops)      (&RT_PRIV(_ops)->runq)
+
+/*
+ * Flags
+ */
+#define __RT_scheduled        1
+#define RT_scheduled          (1<<__RT_scheduled)
+#define __RT_delayed_runq_add 2
+#define RT_delayed_runq_add   (1<<__RT_delayed_runq_add)
+
+/*
+ * Used to print out debug information
+ */
+#define printtime() ( printk("%d : %3ld.%3ld : %-19s ", smp_processor_id(), NOW()/MILLISECS(1), NOW()%MILLISECS(1)/1000, __func__) )
+
+/*
+ * System-wide private data, including a global RunQueue
+ * The global lock is referenced by schedule_data.schedule_lock from all
+ * physical cpus. It can be grabbed via vcpu_schedule_lock_irq().
+ */
+struct rt_private {
+    spinlock_t lock;        /* the global coarse-grained lock */
+    struct list_head sdom;  /* list of available domains, used for dump */
+    struct list_head runq;  /* ordered list of runnable VCPUs */
+    cpumask_t cpus;         /* cpumask_t of available physical cpus */
+    cpumask_t tickled;      /* another cpu in the queue already tickled this one */
+};
+
+/*
+ * Virtual CPU
+ */
+struct rt_vcpu {
+    struct list_head runq_elem; /* on the runqueue list */
+    struct list_head sdom_elem; /* on the domain VCPU list */
+
+    /* Up-pointers */
+    struct rt_dom *sdom;
+    struct vcpu *vcpu;
+
+    /* VCPU parameters, in milliseconds */
+    s_time_t period;
+    s_time_t budget;
+
+    /* VCPU current information */
+    long cur_budget;         /* current budget in microseconds */
+    s_time_t last_start;     /* last start time, used to calculate budget */
+    s_time_t cur_deadline;   /* current deadline, used to do EDF */
+    unsigned flags;          /* mark __RT_scheduled, etc. */
+};
+
+/*
+ * Domain
+ */
+struct rt_dom {
+    struct list_head vcpu;      /* link its VCPUs */
+    struct list_head sdom_elem; /* link list on rt_priv */
+    struct domain *dom;         /* pointer to upper domain */
+};
+
+/*
+ * RunQueue helper functions
+ */
+static int
+__vcpu_on_runq(struct rt_vcpu *svc)
+{
+    return !list_empty(&svc->runq_elem);
+}
+
+static struct rt_vcpu *
+__runq_elem(struct list_head *elem)
+{
+    return list_entry(elem, struct rt_vcpu, runq_elem);
+}
+
+static inline void
+__runq_remove(struct rt_vcpu *svc)
+{
+    if ( __vcpu_on_runq(svc) )
+        list_del_init(&svc->runq_elem);
+}
+
+/*
+ * Insert a vcpu in the RunQ based on the vcpu's deadline:
+ * vcpus with shorter deadlines are inserted first.
+ * EDF schedule policy: a vcpu with a smaller deadline has higher priority;
+ * when vcpus have the same deadline, insert the current one at the head of
+ * these vcpus.
+ */
+static void
+__runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
+{
+    struct list_head *runq = RUNQ(ops);
+    struct list_head *iter;
+
+    ASSERT( spin_is_locked(per_cpu(schedule_data, svc->vcpu->processor).schedule_lock) );
+
+    if ( __vcpu_on_runq(svc) )
+        return;
+
+    list_for_each(iter, runq) {
+        struct rt_vcpu * iter_svc = __runq_elem(iter);
+
+        if ( svc->cur_budget > 0 ) { /* svc still has budget */
+            if ( iter_svc->cur_budget == 0 ||
+                 svc->cur_deadline <= iter_svc->cur_deadline )
+                break;
+        } else { /* svc has no budget */
+            if ( iter_svc->cur_budget == 0 &&
+                 svc->cur_deadline <= iter_svc->cur_deadline )
+                break;
+        }
+    }
+
+    list_add_tail(&svc->runq_elem, iter);
+}
+
+
+/*
+ * Debug-related code: dump vcpu/cpu information
+ */
+static void
+rt_dump_vcpu(struct rt_vcpu *svc)
+{
+    if ( svc == NULL ) {
+        printk("NULL!\n");
+        return;
+    }
+#define cpustr keyhandler_scratch
+    cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity);
+    printk("[%5d.%-2d] cpu %d, (%"PRId64", %"PRId64"), cur_b=%"PRId64" cur_d=%"PRId64" last_start=%"PRId64" onR=%d runnable=%d cpu_hard_affinity=%s ",
+           svc->vcpu->domain->domain_id,
+           svc->vcpu->vcpu_id,
+           svc->vcpu->processor,
+           svc->period,
+           svc->budget,
+           svc->cur_budget,
+           svc->cur_deadline,
+           svc->last_start,
+           __vcpu_on_runq(svc),
+           vcpu_runnable(svc->vcpu),
+           cpustr);
+    memset(cpustr, 0, sizeof(char)*1024);
+    cpumask_scnprintf(cpustr, sizeof(cpustr), cpupool_scheduler_cpumask(svc->vcpu->domain->cpupool));
+    printk("cpupool=%s\n", cpustr);
+#undef cpustr
+}
+
+static void
+rt_dump_pcpu(const struct scheduler *ops, int cpu)
+{
+    struct rt_vcpu *svc = RT_VCPU(curr_on_cpu(cpu));
+
+    printtime();
+    rt_dump_vcpu(svc);
+}
+
+/*
+ * Should not need the lock here; only showing stuff.
+ */
+static void
+rt_dump(const struct scheduler *ops)
+{
+    struct list_head *iter_sdom, *iter_svc, *runq, *iter;
+    struct rt_private *prv = RT_PRIV(ops);
+    struct rt_vcpu *svc;
+    int cpu = 0;
+    int loop = 0;
+
+    printtime();
+    printk("Priority Scheme: EDF\n");
+
+    printk("PCPU info: \n");
+    for_each_cpu(cpu, &prv->cpus) {
+        rt_dump_pcpu(ops, cpu);
+    }
+
+    printk("Global RunQueue info: \n");
+    loop = 0;
+    runq = RUNQ(ops);
+    list_for_each( iter, runq ) {
+        svc = __runq_elem(iter);
+        printk("\t%3d: ", ++loop);
+        rt_dump_vcpu(svc);
+    }
+
+    printk("Domain info: \n");
+    loop = 0;
+    list_for_each( iter_sdom, &prv->sdom ) {
+        struct rt_dom *sdom;
+        sdom = list_entry(iter_sdom, struct rt_dom, sdom_elem);
+        printk("\tdomain: %d\n", sdom->dom->domain_id);
+
+        list_for_each( iter_svc, &sdom->vcpu ) {
+            svc = list_entry(iter_svc, struct rt_vcpu, sdom_elem);
+            printk("\t\t%3d: ", ++loop);
+            rt_dump_vcpu(svc);
+        }
+    }
+
+    printk("\n");
+}
+
+/*
+ * Init/Free related code
+ */
+static int
+rt_init(struct scheduler *ops)
+{
+    struct rt_private *prv;
+
+    prv = xzalloc(struct rt_private);
+    if ( prv == NULL ) {
+        printk("xzalloc failed at rt_private\n");
+        return -ENOMEM;
+    }
+
+    ops->sched_data = prv;
+    spin_lock_init(&prv->lock);
+    INIT_LIST_HEAD(&prv->sdom);
+    INIT_LIST_HEAD(&prv->runq);
+    cpumask_clear(&prv->tickled);
+    cpumask_clear(&prv->cpus);
+
+    printtime();
+    printk("\n");
+
+    return 0;
+}
+
+static void
+rt_deinit(const struct scheduler *ops)
+{
+    struct rt_private *prv;
+
+    printtime();
+    printk("\n");
+
+    prv = RT_PRIV(ops);
+    if ( prv )
+        xfree(prv);
+}
+
+/*
+ * Point the per_cpu spinlock to the global system lock;
+ * all cpus have the same global system lock.
+ */
+static void *
+rt_alloc_pdata(const struct scheduler *ops, int cpu)
+{
+    struct rt_private *prv = RT_PRIV(ops);
+
+    cpumask_set_cpu(cpu, &prv->cpus);
+
+    per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
+
+    printtime();
+    printk("%s total cpus: %d\n", __FUNCTION__, cpumask_weight(&prv->cpus));
+    return (void *)1;
+}
+
+static void
+rt_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+    struct rt_private * prv = RT_PRIV(ops);
+    cpumask_clear_cpu(cpu, &prv->cpus);
+    printtime();
+    printk("%s cpu=%d\n", __FUNCTION__, cpu);
+}
+
+static void *
+rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
+{
+    unsigned long flags;
+    struct rt_dom *sdom;
+    struct rt_private * prv = RT_PRIV(ops);
+
+    printtime();
+    printk("dom=%d\n", dom->domain_id);
+
+    sdom = xzalloc(struct rt_dom);
+    if ( sdom == NULL ) {
+        printk("%s, xzalloc failed\n", __func__);
+        return NULL;
+    }
+
+    INIT_LIST_HEAD(&sdom->vcpu);
+    INIT_LIST_HEAD(&sdom->sdom_elem);
+    sdom->dom = dom;
+
+    /* spinlock here to insert the dom */
+    spin_lock_irqsave(&prv->lock, flags);
+    list_add_tail(&sdom->sdom_elem, &(prv->sdom));
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    return (void *)sdom;
+}
+
+static void
+rt_free_domdata(const struct scheduler *ops, void *data)
+{
+    unsigned long flags;
+    struct rt_dom *sdom = data;
+    struct rt_private * prv = RT_PRIV(ops);
+
+    printtime();
+    printk("dom=%d\n", sdom->dom->domain_id);
+
+    spin_lock_irqsave(&prv->lock, flags);
+    list_del_init(&sdom->sdom_elem);
+    spin_unlock_irqrestore(&prv->lock, flags);
+    xfree(data);
+}
+
+static int
+rt_dom_init(const struct scheduler *ops, struct domain *dom)
+{
+    struct rt_dom *sdom;
+
+    printtime();
+    printk("dom=%d\n", dom->domain_id);
+
+    /* IDLE Domain does not link on rt_private */
+    if ( is_idle_domain(dom) ) { return 0; }
+
+    sdom = rt_alloc_domdata(ops, dom);
+    if ( sdom == NULL ) {
+        printk("%s, failed\n", __func__);
+        return -ENOMEM;
+    }
+    dom->sched_priv = sdom;
+
+    return 0;
+}
+
+static void
+rt_dom_destroy(const struct scheduler *ops, struct domain *dom)
+{
+    printtime();
+    printk("dom=%d\n", dom->domain_id);
+
+    rt_free_domdata(ops, RT_DOM(dom));
+}
+
+static void *
+rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
+{
+    struct rt_vcpu *svc;
+    s_time_t now = NOW();
+    long count;
+
+    /* Allocate per-VCPU info */
+    svc = xzalloc(struct rt_vcpu);
+    if ( svc == NULL ) {
+        printk("%s, xzalloc failed\n", __func__);
+        return NULL;
+    }
+
+    INIT_LIST_HEAD(&svc->runq_elem);
+    INIT_LIST_HEAD(&svc->sdom_elem);
+    svc->flags = 0U;
+    svc->sdom = dd;
+    svc->vcpu = vc;
+    svc->last_start = 0;    /* init last_start to 0 */
+
+    svc->period = RT_DEFAULT_PERIOD;
+    if ( !is_idle_vcpu(vc) && vc->domain->domain_id != 0 ) {
+        svc->budget = RT_DEFAULT_BUDGET;
+    } else {
+        svc->budget = RT_DEFAULT_PERIOD; /* give vcpus of dom0 100% utilization */
+    }
+
+    count = (now/MILLISECS(svc->period)) + 1;
+    /* sync all VCPUs' start time to 0 */
+    svc->cur_deadline += count*MILLISECS(svc->period);
+
+    svc->cur_budget = svc->budget*1000; /* budget is accounted in microseconds */
+
+    /* Debug only: dump new vcpu's info */
+    printtime();
+    rt_dump_vcpu(svc);
+
+    return svc;
+}
+
+static void
+rt_free_vdata(const struct scheduler *ops, void *priv)
+{
+    struct rt_vcpu *svc = priv;
+
+    /* Debug only: dump freed vcpu's info */
+    printtime();
+    rt_dump_vcpu(svc);
+    xfree(svc);
+}
+
+static void
+rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu *svc = RT_VCPU(vc);
+
+    /* Debug only: dump info of vcpu to insert */
+    printtime();
+    rt_dump_vcpu(svc);
+
+    /* IDLE VCPU not allowed on RunQ */
+    if ( is_idle_vcpu(vc) )
+        return;
+
+    list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);   /* add to dom vcpu list */
+}
+
+static void
+rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu * const svc = RT_VCPU(vc);
+    struct rt_dom * const sdom = svc->sdom;
+
+    printtime();
+    rt_dump_vcpu(svc);
+
+    BUG_ON( sdom == NULL );
+    BUG_ON( __vcpu_on_runq(svc) );
+
+    if ( !is_idle_vcpu(vc) ) {
+        list_del_init(&svc->sdom_elem);
+    }
+}
+
+/*
+ * Pick a valid CPU for the vcpu vc.
+ * The valid CPUs of a vcpu are the intersection of the vcpu's affinity
+ * and the available cpus.
+ */
+static int
+rt_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
+{
+    cpumask_t cpus;
+    cpumask_t *online;
+    int cpu;
+    struct rt_private * prv = RT_PRIV(ops);
+
+    online = cpupool_scheduler_cpumask(vc->domain->cpupool);
+    cpumask_and(&cpus, &prv->cpus, online);
+    cpumask_and(&cpus, &cpus, vc->cpu_hard_affinity);
+
+    cpu = cpumask_test_cpu(vc->processor, &cpus)
+            ? vc->processor
+            : cpumask_cycle(vc->processor, &cpus);
+    ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
+
+    return cpu;
+}
+
+/*
+ * Implemented as a deferrable server.
+ * Different server mechanisms have different implementations.
+ * Burn budget at microsecond level.
+ */
+static void
+burn_budgets(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now) {
+    s_time_t delta;
+    unsigned int consume;
+    long count = 0;
+
+    /* first time called for this svc, update last_start */
+    if ( svc->last_start == 0 ) {
+        svc->last_start = now;
+        return;
+    }
+
+    /* don't burn budget for the idle VCPU */
+    if ( is_idle_vcpu(svc->vcpu) ) {
+        return;
+    }
+
+    /* don't burn budget for Domain-0, RT-Xen use only */
+    if ( svc->sdom->dom->domain_id == 0 ) {
+        return;
+    }
+
+    /* update deadline info */
+    delta = now - svc->cur_deadline;
+    if ( delta >= 0 ) {
+        count = ( delta/MILLISECS(svc->period) ) + 1;
+        svc->cur_deadline += count * MILLISECS(svc->period);
+        svc->cur_budget = svc->budget * 1000;
+        return;
+    }
+
+    delta = now - svc->last_start;
+    if ( delta < 0 ) {
+        printk("%s, delta = %ld for ", __func__, delta);
+        rt_dump_vcpu(svc);
+        svc->last_start = now;  /* update last_start */
+        svc->cur_budget = 0;
+        return;
+    }
+
+    if ( svc->cur_budget == 0 ) return;
+
+    /* burn at microsecond level */
+    consume = ( delta/MICROSECS(1) );
+    if ( delta%MICROSECS(1) > MICROSECS(1)/2 ) consume++;
+
+    svc->cur_budget -= consume;
+    if ( svc->cur_budget < 0 ) svc->cur_budget = 0;
+}
+
+/*
+ * The RunQ is sorted. Pick the first vcpu within the cpumask; if there is
+ * none, return NULL.
+ * The lock is grabbed before calling this function.
+ */
+static struct rt_vcpu *
+__runq_pick(const struct scheduler *ops, cpumask_t mask)
+{
+    struct list_head *runq = RUNQ(ops);
+    struct list_head *iter;
+    struct rt_vcpu *svc = NULL;
+    struct rt_vcpu *iter_svc = NULL;
+    cpumask_t cpu_common;
+    cpumask_t *online;
+    struct rt_private * prv = RT_PRIV(ops);
+
+    list_for_each(iter, runq) {
+        iter_svc = __runq_elem(iter);
+
+        /* mask is the intersection of cpu_hard_affinity, cpupool and prv->cpus */
+        online = cpupool_scheduler_cpumask(iter_svc->vcpu->domain->cpupool);
+        cpumask_and(&cpu_common, online, &prv->cpus);
+        cpumask_and(&cpu_common, &cpu_common, iter_svc->vcpu->cpu_hard_affinity);
+        cpumask_and(&cpu_common, &mask, &cpu_common);
+        if ( cpumask_empty(&cpu_common) )
+            continue;
+
+        if ( iter_svc->cur_budget <= 0 )
+            continue;
+
+        svc = iter_svc;
+        break;
+    }
+
+    return svc;
+}
+
+/*
+ * Update vcpus' budgets and keep the runq sorted by re-inserting each
+ * modified vcpu.
+ * The lock is grabbed before calling this function.
+ */
+static void
+__repl_update(const struct scheduler *ops, s_time_t now)
+{
+    struct list_head *runq = RUNQ(ops);
+    struct list_head *iter;
+    struct list_head *tmp;
+    struct rt_vcpu *svc = NULL;
+
+    s_time_t diff;
+    long count;
+
+    list_for_each_safe(iter, tmp, runq) {
+        svc = __runq_elem(iter);
+
+        diff = now - svc->cur_deadline;
+        if ( diff > 0 ) {
+            count = (diff/MILLISECS(svc->period)) + 1;
+            svc->cur_deadline += count * MILLISECS(svc->period);
+            svc->cur_budget = svc->budget * 1000;
+            __runq_remove(svc);
+            __runq_insert(ops, svc);
+        }
+    }
+}
+
+/*
+ * The scheduling function for the rt scheduler.
+ * The lock is already grabbed in schedule.c; no need to lock here.
+ */
+static struct task_slice
+rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
+{
+    const int cpu = smp_processor_id();
+    struct rt_private * prv = RT_PRIV(ops);
+    struct rt_vcpu * const scurr = RT_VCPU(current);
+    struct rt_vcpu * snext = NULL;
+    struct task_slice ret;
+
+    /* clear the tickled bit now that we've been scheduled */
+    if ( cpumask_test_cpu(cpu, &prv->tickled) )
+        cpumask_clear_cpu(cpu, &prv->tickled);
+
+    /* burn_budgets() returns early for the IDLE VCPU */
+    burn_budgets(ops, scurr, now);
+
+    __repl_update(ops, now);
+
+    if ( tasklet_work_scheduled ) {
+        snext = RT_VCPU(idle_vcpu[cpu]);
+    } else {
+        cpumask_t cur_cpu;
+        cpumask_clear(&cur_cpu);
+        cpumask_set_cpu(cpu, &cur_cpu);
+        snext = __runq_pick(ops, cur_cpu);
+        if ( snext == NULL )
+            snext = RT_VCPU(idle_vcpu[cpu]);
+
+        /* if scurr has higher priority and budget, still pick scurr */
+        if ( !is_idle_vcpu(current) &&
+             vcpu_runnable(current) &&
+             scurr->cur_budget > 0 &&
+             ( is_idle_vcpu(snext->vcpu) ||
+               scurr->cur_deadline <= snext->cur_deadline ) ) {
+            snext = scurr;
+        }
+    }
+
+    if ( snext != scurr &&
+         !is_idle_vcpu(current) &&
+         vcpu_runnable(current) ) {
+        set_bit(__RT_delayed_runq_add, &scurr->flags);
+    }
+
+    snext->last_start = now;
+    ret.migrated = 0;
+    if ( !is_idle_vcpu(snext->vcpu) ) {
+        if ( snext != scurr ) {
+            __runq_remove(snext);
+            set_bit(__RT_scheduled, &snext->flags);
+        }
+        if ( snext->vcpu->processor != cpu ) {
+            snext->vcpu->processor = cpu;
+            ret.migrated = 1;
+        }
+    }
+
+    ret.time = MILLISECS(1);
+    ret.task = snext->vcpu;
+
+    return ret;
+}
+
+/*
+ * Remove the VCPU from the RunQ.
+ * The lock is already grabbed in schedule.c; no need to lock here.
+ */
+static void
+rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu * const svc = RT_VCPU(vc);
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( curr_on_cpu(vc->processor) == vc ) {
+        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
+        return;
+    }
+
+    if ( __vcpu_on_runq(svc) ) {
+        __runq_remove(svc);
+    }
+
+    clear_bit(__RT_delayed_runq_add, &svc->flags);
+}
+
+/*
+ * Pick a vcpu on a cpu to kick out to make room for the running candidate.
+ * Called by wake() and context_saved().
+ * We have a running candidate here; the kick logic is:
+ * among all the cpus that are within the cpu affinity:
+ * 1) if new's previous cpu is idle, kick it. This could benefit the cache.
+ * 2) if there is any idle pcpu, kick it.
+ * 3) now all pcpus are busy: among all the running vcpus, pick the
+ *    lowest-priority one; if snext has higher priority, kick it.
+ *
+ * TODO:
+ * 1) what if these two vcpus belong to the same domain?
+ *    Replacing a vcpu belonging to the same domain introduces more overhead.
+ *
+ * The lock is grabbed before calling this function.
+ */
+static void
+runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
+{
+    struct rt_private * prv = RT_PRIV(ops);
+    struct rt_vcpu * scheduled = NULL;  /* lowest-priority scheduled vcpu */
+    struct rt_vcpu * iter_svc;
+    struct vcpu * iter_vc;
+    int cpu = 0;
+    cpumask_t not_tickled;              /* not-tickled cpus */
+    cpumask_t *online;
+
+    if ( new == NULL || is_idle_vcpu(new->vcpu) ) return;
+
+    online = cpupool_scheduler_cpumask(new->vcpu->domain->cpupool);
+    cpumask_and(&not_tickled, online, &prv->cpus);
+    cpumask_and(&not_tickled, &not_tickled, new->vcpu->cpu_hard_affinity);
+    cpumask_andnot(&not_tickled, &not_tickled, &prv->tickled);
+
+    /* 1) if new's previous cpu is idle, kick it for the cache benefit */
+    if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) ) {
+        cpumask_set_cpu(new->vcpu->processor, &prv->tickled);
+        cpu_raise_softirq(new->vcpu->processor, SCHEDULE_SOFTIRQ);
+        return;
+    }
+
+    /* 2) if there is any idle pcpu, kick it */
+    /* The same loop also finds the one with the lowest priority */
+    for_each_cpu(cpu, &not_tickled) {
+        iter_vc = curr_on_cpu(cpu);
+        if ( is_idle_vcpu(iter_vc) ) {
+            cpumask_set_cpu(cpu, &prv->tickled);
+            cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+            return;
+        }
+        iter_svc = RT_VCPU(iter_vc);
+        if ( scheduled == NULL ||
+             iter_svc->cur_deadline > scheduled->cur_deadline ) {
+            scheduled = iter_svc;
+        }
+    }
+
+    /* 3) the candidate has higher priority: kick out the lowest-priority vcpu */
+    if ( scheduled != NULL && new->cur_deadline < scheduled->cur_deadline ) {
+        cpumask_set_cpu(scheduled->vcpu->processor, &prv->tickled);
+        cpu_raise_softirq(scheduled->vcpu->processor, SCHEDULE_SOFTIRQ);
+    }
+    return;
+}
+
+/*
+ * Should always wake up a runnable vcpu and put it back on the RunQ.
+ * Check the priority to decide whether to raise an interrupt.
+ * The lock is already grabbed in schedule.c; no need to lock here.
+ * TODO: what if these two vcpus belong to the same domain?
+ */
+static void
+rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu * const svc = RT_VCPU(vc);
+    s_time_t diff;
+    s_time_t now = NOW();
+    long count = 0;
+    struct rt_private * prv = RT_PRIV(ops);
+    struct rt_vcpu * snext = NULL;      /* highest priority on RunQ */
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( unlikely(curr_on_cpu(vc->processor) == vc) ) return;
+
+    /* already on the RunQ: nothing to do */
+    if ( unlikely(__vcpu_on_runq(svc)) ) return;
+
+    /* If the context hasn't been saved yet, set a flag so it is added back later */
+    if ( unlikely(test_bit(__RT_scheduled, &svc->flags)) ) {
+        set_bit(__RT_delayed_runq_add, &svc->flags);
+        return;
+    }
+
+    /* update deadline info */
+    diff = now - svc->cur_deadline;
+    if ( diff >= 0 ) {
+        count = ( diff/MILLISECS(svc->period) ) + 1;
+        svc->cur_deadline += count * MILLISECS(svc->period);
+        svc->cur_budget = svc->budget * 1000;
+    }
+
+    __runq_insert(ops, svc);
+    __repl_update(ops, now);
+    snext = __runq_pick(ops, prv->cpus);    /* pick snext from ALL valid cpus */
+    runq_tickle(ops, snext);
+
+    return;
+}
+
+/*
+ * scurr has finished its context switch; insert it back on the RunQ,
+ * then pick the highest-priority vcpu from the runq to run.
+ */
+static void
+rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu * svc = RT_VCPU(vc);
+    struct rt_vcpu * snext = NULL;
+    struct rt_private * prv = RT_PRIV(ops);
+    spinlock_t *lock;
+
+    clear_bit(__RT_scheduled, &svc->flags);
+    if ( is_idle_vcpu(vc) ) return;
+
+    lock = vcpu_schedule_lock_irq(vc);
+    if ( test_and_clear_bit(__RT_delayed_runq_add, &svc->flags) &&
+         likely(vcpu_runnable(vc)) ) {
+        __runq_insert(ops, svc);
+        __repl_update(ops, NOW());
+        snext = __runq_pick(ops, prv->cpus);    /* pick snext from ALL cpus */
+        runq_tickle(ops, snext);
+    }
+    vcpu_schedule_unlock_irq(lock, vc);
+}
+
+/*
+ * Set/get the per-VCPU parameters of a domain
+ */
+static int
+rt_dom_cntl(const struct scheduler *ops, struct domain *d, struct xen_domctl_scheduler_op *op)
+{
+    xen_domctl_sched_rt_params_t local_sched;
+    struct rt_dom * const sdom = RT_DOM(d);
+    struct list_head *iter;
+    int vcpu_index = 0;
+    int rc = -EINVAL;
+
+    switch ( op->cmd )
+    {
+    case XEN_DOMCTL_SCHEDOP_getinfo:
+        /* for debug use: whenever adjusting Dom0's parameters, do a global dump */
+        if ( d->domain_id == 0 ) {
+            rt_dump(ops);
+        }
+
+        local_sched.num_vcpus = 0;
+        list_for_each( iter, &sdom->vcpu ) {
+            struct rt_vcpu * svc = list_entry(iter, struct rt_vcpu, sdom_elem);
+
+            ASSERT(vcpu_index < XEN_LEGACY_MAX_VCPUS);
+            local_sched.vcpus[vcpu_index].budget = svc->budget;
+            local_sched.vcpus[vcpu_index].period = svc->period;
+
+            vcpu_index++;
+        }
+        local_sched.num_vcpus = vcpu_index;
+        copy_to_guest(op->u.rt.schedule, &local_sched, 1);
+        rc = 0;
+        break;
+    case XEN_DOMCTL_SCHEDOP_putinfo:
+        copy_from_guest(&local_sched, op->u.rt.schedule, 1);
+        list_for_each( iter, &sdom->vcpu ) {
+            struct rt_vcpu * svc = list_entry(iter, struct rt_vcpu, sdom_elem);
+
+            if ( local_sched.vcpu_index == svc->vcpu->vcpu_id ) {
+                /* adjust per-VCPU parameters */
+                vcpu_index = local_sched.vcpu_index;
+
+                if ( vcpu_index < 0 || vcpu_index >= XEN_LEGACY_MAX_VCPUS ) {
+                    /* out-of-range index: don't use it to read vcpus[] */
+                    printk("XEN_DOMCTL_SCHEDOP_putinfo: invalid vcpu_index=%d\n",
+                           vcpu_index);
+                    break;
+                }
+
+                printk("XEN_DOMCTL_SCHEDOP_putinfo: vcpu_index=%d, period=%"PRId64", budget=%"PRId64"\n",
+                       vcpu_index, local_sched.vcpus[vcpu_index].period,
+                       local_sched.vcpus[vcpu_index].budget);
+
+                svc->period = local_sched.vcpus[vcpu_index].period;
+                svc->budget = local_sched.vcpus[vcpu_index].budget;
+
+                break;
+            }
+        }
+        rc = 0;
+        break;
+    }
+
+    return rc;
+}
+
+static struct rt_private _rt_priv;
+
+const struct scheduler sched_rt_def = {
+    .name           = "SMP RT Scheduler",
+    .opt_name       = "rt",
+    .sched_id       = XEN_SCHEDULER_RT,
+    .sched_data     = &_rt_priv,
+
+    .dump_cpu_state = rt_dump_pcpu,
+    .dump_settings  = rt_dump,
+    .init           = rt_init,
+    .deinit         = rt_deinit,
+    .alloc_pdata    = rt_alloc_pdata,
+    .free_pdata     = rt_free_pdata,
+    .alloc_domdata  = rt_alloc_domdata,
+    .free_domdata   = rt_free_domdata,
+    .init_domain    = rt_dom_init,
+    .destroy_domain = rt_dom_destroy,
+    .alloc_vdata    = rt_alloc_vdata,
+    .free_vdata     = rt_free_vdata,
+    .insert_vcpu    = rt_vcpu_insert,
+    .remove_vcpu    = rt_vcpu_remove,
+
+    .adjust         = rt_dom_cntl,
+
+    .pick_cpu       = rt_cpu_pick,
+    .do_schedule    = rt_schedule,
+    .sleep          = rt_vcpu_sleep,
+    .wake           = rt_vcpu_wake,
+    .context_saved  = rt_context_saved,
+
+    .yield          = NULL,
+    .migrate        = NULL,
+};
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index e9eb0bc..2d13966 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -68,6 +68,7 @@ static const struct scheduler *schedulers[] = {
     &sched_sedf_def,
     &sched_credit_def,
     &sched_credit2_def,
+    &sched_rt_def,
     &sched_arinc653_def,
 };
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 5b11bbf..d1a8201 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -339,6 +339,20 @@ struct xen_domctl_max_vcpus {
 typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
 
+/*
+ * This structure is used to pass rt scheduler parameters from a
+ * privileged domain to Xen.
+ */
+struct xen_domctl_sched_rt_params {
+    struct {
+        signed long period;  /* s_time_t type */
+        signed long budget;
+    } vcpus[XEN_LEGACY_MAX_VCPUS];
+    uint16_t num_vcpus;
+    uint16_t vcpu_index;
+};
+typedef struct xen_domctl_sched_rt_params xen_domctl_sched_rt_params_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_sched_rt_params_t);
 
 /* XEN_DOMCTL_scheduler_op */
 /* Scheduler types. */
@@ -346,6 +360,8 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
 #define XEN_SCHEDULER_CREDIT   5
 #define XEN_SCHEDULER_CREDIT2  6
 #define XEN_SCHEDULER_ARINC653 7
+#define XEN_SCHEDULER_RT       8
+
 /* Set or get info? */
 #define XEN_DOMCTL_SCHEDOP_putinfo 0
 #define XEN_DOMCTL_SCHEDOP_getinfo 1
@@ -367,6 +383,9 @@ struct xen_domctl_scheduler_op {
         struct xen_domctl_sched_credit2 {
             uint16_t weight;
         } credit2;
+        struct xen_domctl_sched_rt {
+            XEN_GUEST_HANDLE_64(xen_domctl_sched_rt_params_t) schedule;
+        } rt;
     } u;
 };
 typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
index 4164dff..e452d32 100644
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -169,7 +169,7 @@ extern const struct scheduler sched_sedf_def;
 extern const struct scheduler sched_credit_def;
 extern const struct scheduler sched_credit2_def;
 extern const struct scheduler sched_arinc653_def;
-
+extern const struct scheduler sched_rt_def;
 
 struct cpupool
 {
-- 
1.7.9.5
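For illustration, the new domctl interface above can be exercised roughly as
follows. This is a hypothetical user-space fragment, not part of this patch:
it only shows how a toolstack might fill xen_domctl_sched_rt_params before a
XEN_DOMCTL_SCHEDOP_putinfo request (the actual hypercall plumbing is outside
this patch). The structure is mirrored locally so the fragment is
self-contained, and XEN_LEGACY_MAX_VCPUS is assumed to be 32 for the sketch.

/* Hypothetical illustration only: mirrors the public structure added above. */
#include <stdint.h>
#include <stdio.h>

#define XEN_LEGACY_MAX_VCPUS 32            /* assumed value for this sketch */

struct xen_domctl_sched_rt_params {
    struct {
        int64_t period;                    /* 'signed long' (s_time_t) above */
        int64_t budget;
    } vcpus[XEN_LEGACY_MAX_VCPUS];
    uint16_t num_vcpus;
    uint16_t vcpu_index;
};

int main(void)
{
    static struct xen_domctl_sched_rt_params p; /* zero-initialized */

    /* Request a 10ms period and 4ms budget (40% utilization) for VCPU 1. */
    p.vcpu_index = 1;
    p.vcpus[p.vcpu_index].period = 10;
    p.vcpus[p.vcpu_index].budget = 4;

    /* A real toolstack would pass &p via the XEN_DOMCTL_scheduler_op
     * hypercall with cmd = XEN_DOMCTL_SCHEDOP_putinfo; here we just print. */
    printf("vcpu %u: period=%lld ms, budget=%lld ms\n",
           (unsigned)p.vcpu_index,
           (long long)p.vcpus[p.vcpu_index].period,
           (long long)p.vcpus[p.vcpu_index].budget);
    return 0;
}

On the hypervisor side, rt_dom_cntl() walks the domain's VCPU list, matches
vcpu_index against each vcpu_id, and copies the requested period and budget
into the matching rt_vcpu.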