[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH V2 1/1] Improved RTDS scheduler

To: Meng Xu <xumengpanda@xxxxxxxxx>
From: Tianyang Chen <tiche@xxxxxxxxxxxxxx>
Date: Tue, 19 Jan 2016 11:35:06 -0500
Cc: Dario Faggioli <dario.faggioli@xxxxxxxxxx>, George Dunlap <george.dunlap@xxxxxxxxxx>, Meng Xu <mengxu@xxxxxxxxxxxxx>, Dagaen Golomb <dgolomb@xxxxxxxxxxxxxx>, "xen-devel@xxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxx>
Delivery-date: Tue, 19 Jan 2016 16:35:52 +0000
List-id: Xen developer discussion <xen-devel.lists.xen.org>



On 1/15/2016 10:33 AM, Meng Xu wrote:

On Wed, Jan 6, 2016 at 2:57 AM, Tianyang Chen <tiche@xxxxxxxxxxxxxx> wrote:




On 12/31/2015 8:44 AM, Meng Xu wrote:


On Thu, Dec 31, 2015 at 6:20 PM, Tianyang Chen <tiche@xxxxxxxxxxxxxx> wrote:



Budget replenishment is now handled by a dedicated timer which is
triggered at the most imminent release time of all runnable vcpus.

Changes since V1:
      None

Signed-off-by: Tianyang Chen <tiche@xxxxxxxxxxxxxx>
Signed-off-by: Meng Xu <mengxu@xxxxxxxxxxxxx>
Signed-off-by: Dagaen Golomb <dgolomb@xxxxxxxxxxxxxx>
---
   xen/common/sched_rt.c |  159 
+++++++++++++++++++++++++++++--------------------
   1 file changed, 95 insertions(+), 64 deletions(-)

diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index 4372486..d522272 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -16,6 +16,7 @@
   #include <xen/delay.h>
   #include <xen/event.h>
   #include <xen/time.h>
+#include <xen/timer.h>
   #include <xen/perfc.h>
   #include <xen/sched-if.h>
   #include <xen/softirq.h>
@@ -147,6 +148,16 @@ static unsigned int nr_rt_ops;
    * Global lock is referenced by schedule_data.schedule_lock from all
    * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
    */
+
+/* dedicated timer for replenishment */
+static struct timer repl_timer;
+
+/* controls when to first start the timer*/




missing a space between * and / at the end of this line.


+static int timer_started;
+
+/* handler for the replenishment timer */
+static void repl_handler(void *data);
+
   struct rt_private {
       spinlock_t lock;            /* the global coarse grand lock */
       struct list_head sdom;      /* list of availalbe domains, used for dump 
*/
@@ -426,6 +437,7 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu 
*svc)
   static int
   rt_init(struct scheduler *ops)
   {
+    const int cpu = smp_processor_id();




Did the "cpu" be used? If not, please remove it here.


       struct rt_private *prv = xzalloc(struct rt_private);

       printk("Initializing RTDS scheduler\n"
@@ -454,6 +466,8 @@ rt_init(struct scheduler *ops)

       ops->sched_data = prv;

+    init_timer(&repl_timer, repl_handler, ops, 0);
+
       return 0;

    no_mem:
@@ -473,6 +487,9 @@ rt_deinit(const struct scheduler *ops)
           xfree(_cpumask_scratch);
           _cpumask_scratch = NULL;
       }
+
+    kill_timer(&repl_timer);
+
       xfree(prv);
   }

@@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu 
*vc)

       /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */
       list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
+
+    if(!timer_started)




space should be added..



Thanks, Meng. I will fix that.


+    {
+        /* the first vcpu starts the timer for the first time*/
+        timer_started = 1;
+        set_timer(&repl_timer,svc->cur_deadline);
+    }
   }

   /*
@@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t 
*mask)
   }

   /*
- * Update vcpu's budget and
- * sort runq by insert the modifed vcpu back to runq
- * lock is grabbed before calling this function
- */
-static void
-__repl_update(const struct scheduler *ops, s_time_t now)
-{
-    struct list_head *runq = rt_runq(ops);
-    struct list_head *depletedq = rt_depletedq(ops);
-    struct list_head *iter;
-    struct list_head *tmp;
-    struct rt_vcpu *svc = NULL;
-
-    list_for_each_safe(iter, tmp, runq)
-    {
-        svc = __q_elem(iter);
-        if ( now < svc->cur_deadline )
-            break;
-
-        rt_update_deadline(now, svc);
-        /* reinsert the vcpu if its deadline is updated */
-        __q_remove(svc);
-        __runq_insert(ops, svc);
-    }
-
-    list_for_each_safe(iter, tmp, depletedq)
-    {
-        svc = __q_elem(iter);
-        if ( now >= svc->cur_deadline )
-        {
-            rt_update_deadline(now, svc);
-            __q_remove(svc); /* remove from depleted queue */
-            __runq_insert(ops, svc); /* add to runq */
-        }
-    }
-}
-
-/*
    * schedule function for rt scheduler.
    * The lock is already grabbed in schedule.c, no need to lock here
    */
@@ -848,7 +834,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, 
bool_t tasklet_work_sched
       /* burn_budget would return for IDLE VCPU */
       burn_budget(ops, scurr, now);

-    __repl_update(ops, now);

       if ( tasklet_work_scheduled )
       {
@@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, 
bool_t tasklet_work_sched
           }
       }

-    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
+    ret.time = snext->budget; /* invoke the scheduler next time */
       ret.task = snext->vcpu;

       /* TRACE */
@@ -1033,10 +1018,6 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu 
*vc)
   {
       struct rt_vcpu * const svc = rt_vcpu(vc);
       s_time_t now = NOW();
-    struct rt_private *prv = rt_priv(ops);
-    struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
-    struct rt_dom *sdom = NULL;
-    cpumask_t *online;

       BUG_ON( is_idle_vcpu(vc) );

@@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu 
*vc)
       /* insert svc to runq/depletedq because svc is not in queue now */
       __runq_insert(ops, svc);

-    __repl_update(ops, now);
-
-    ASSERT(!list_empty(&prv->sdom));
-    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
-    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
-    snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */
-
-    runq_tickle(ops, snext);
+    runq_tickle(ops, svc);

       return;
   }
@@ -1094,10 +1068,6 @@ static void
   rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
   {
       struct rt_vcpu *svc = rt_vcpu(vc);
-    struct rt_vcpu *snext = NULL;
-    struct rt_dom *sdom = NULL;
-    struct rt_private *prv = rt_priv(ops);
-    cpumask_t *online;
       spinlock_t *lock = vcpu_schedule_lock_irq(vc);

       clear_bit(__RTDS_scheduled, &svc->flags);
@@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops, struct 
vcpu *vc)
       if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
            likely(vcpu_runnable(vc)) )
       {
+        /* only insert the pre-empted vcpu back*/
           __runq_insert(ops, svc);
-        __repl_update(ops, NOW());
-
-        ASSERT(!list_empty(&prv->sdom));
-        sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
-        online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
-        snext = __runq_pick(ops, online); /* pick snext from ALL cpus */
-
-        runq_tickle(ops, snext);
       }
   out:
       vcpu_schedule_unlock_irq(lock, vc);
@@ -1167,6 +1130,74 @@ rt_dom_cntl(
       return rc;
   }

+static void repl_handler(void *data){
+    unsigned long flags;
+    s_time_t now = NOW();
+    s_time_t min_repl = LONG_MAX; /* max time used in comparison*/
+    struct scheduler *ops = data;
+    struct rt_private *prv = rt_priv(ops);
+    struct list_head *runq = rt_runq(ops);
+    struct list_head *depletedq = rt_depletedq(ops);
+    struct list_head *iter;
+    struct list_head *tmp;
+    struct rt_vcpu *svc = NULL;
+
+    spin_lock_irqsave(&prv->lock,flags);
+
+    stop_timer(&repl_timer);




Move the stop_timer before spin_lock_irqsave() since no more than one
core will run this handler in the current design.



Right. We want the critical section as short as possible.




+
+    list_for_each_safe(iter, tmp, runq)
+    {
+        svc = __q_elem(iter);
+        if ( now < svc->cur_deadline )
+            break;
+        rt_update_deadline(now, svc);
+        /* scan the runq to find the min release time
+         * this happens when vcpus on runq miss deadline
+         */
+        if( min_repl> svc->cur_deadline )
+        {
+            min_repl = svc->cur_deadline;
+        }



The logic here is incorrect. The comment you put is correct, but the
code didn't reflect the comment.

When the timer's handler is called, now should be equal to min_repl,
shouldn't it?
If now = min_repl, this if statement will never be called because the
loop stops if ( now < svc->cur_deadline ).


Yes the handler will be invoked when it was programmed to but the value of 
min_repl is set to be LONG_MAX initially. It doesn't preserve values. 
Therefore, the IF statement is feasible.



Hmm, let's have a look  at the logic...

the min_repl. will only be possibly updated to svc->cur_deadline when
now >= svc->cur_deadline, right?

In other words, min_repl will  only be possibly updated to a time
point before now. Shouldn't the min_repl be the earliest time *in the
future* that the handler should be triggered? If not, what is the
definition/meaning of min_repl here?

At the end of the function, you did reprogram the timer to min_repl.
        /* reprogram the timer using the most imminent release time*/
        set_timer(&repl_timer, min_repl);

So I think the logic is incorrect here...

You should first update the budget and deadline of each VCPUs in runq,
depletedq and running VCPUs, and (then) find the earliest time in the
future when the handler should be triggered.

Yeah. Although the next release time was actually found after a vcpu wasreplenished. If the runq still has vcpus pending but not missing adeadline, the break statement on the first iteration will not check therelease time.

Since the runq is sorted, it's easier to just pick the next one afterboth runq and depletedq are updated.

Only after you have replenished the vcpus on runq and depletedq,
should you pick the earliest deadline of the front vcpus at the runq
and depletedq as the min_repl.

Please correct me  if I missed anything. :-)


+        /* reinsert the vcpu if its deadline is updated */
+        __q_remove(svc);
+        __runq_insert(ops, svc);
+    }
+
+    list_for_each_safe(iter, tmp, depletedq)
+    {
+        svc = __q_elem(iter);
+        if ( now >= svc->cur_deadline )
+        {
+            rt_update_deadline(now, svc);
+
+            /* scan the delp_q to find the minimal release time */
+            if(min_repl > svc->cur_deadline)
+            {
+                min_repl = svc->cur_deadline;
+            }




Please see the above comment. It may have the same issue here.

In addition, the current running VCPUs on cores also need to replenish
their budgets at the beginning of their next periods. Those running
VCPUs are not in runq or depletedq in the current RTDS scheduler. So
this patch won't replenish the current running VCPUs' budget until
they are put back to runq or depletedq, which is incorrect. Think
about the following  scenario: a VCPU with very small period  has a
task always running on it to keep this vcpu always runnable; This VCPU
has its period equal to its budget. Suppose this VCPU is the only VCPU
on a 4 core machine. This VCPU should keep running on one core and
never be put back to runq. In the current code, this VCPU won't have
its budget replenished. Right? :-)

You can choose one of the following three approaches to fix it:
a) add the running VCPUs into the runq
b) use another queue to keep track of the current running VCPUs
c) just scan each core (in the cpupool) to get the current running VCPU on it.
I personally prefer c) but I'm looking forward to others' opinion. :-)

I experimented a little bit with adding a runningq to keep track of allrunning vcpus. A vcpu will be inserted when it's picked as snext inrt_schedule(). It will be removed from runningq after the context switchis finished in context_saved(). Also, a currently running vcpu could bemarked as RUNSTATE_offline in sleep() and added back to runq in wake().Does this sound reasonable if we were to use a q to keep track ofrunning vcpus?

+            __q_remove(svc);
+            __runq_insert(ops, svc);
+            runq_tickle(ops, svc);
+        }
+    }
+
+    /* if timer was triggered but none of the vcpus
+     * need to be refilled, set the timer to be the
+     * default period + now
+     */




Hmm, when could this happen? If it could happen, please specify the situation.
Otherwise, it may be better to print a warning (or cause crash?) when
you set the timer to be default period + now?

I'm not quite sure which is better, warning or crash?


+    if(min_repl == LONG_MAX)
+    {

+        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
+    }




Hmm,

when could this happen? The only situation in my mind is that there is
no vcpus in the cpupool. But if that is the case, shouldn't the timer
never be triggered? Then this handler will never be called.



This is actually still a question for me. When I was testing the code in a 
virtual machine, it happens when the system boots up. I am guessing it's 
related to the same problem in nested-virtualization (just guessing). When I 
tested it on a real machine, it locks up and won't even respond to any 
key-board events.




First, we care about the real machine scenario. It should always run
well on real machine. The scheduler design didn't consider the
nest-virt. situation, so it is ok for now to have performance
degradation or issues in nest-virt. situation, IMHO.

Second, the reason why it locks up is probably because the timer of
the replenish handler is not correctly set up as I pointed out above.
So try to fix the above issue first and see if this still occurs.

Right, sometimes there is no vcpus on q at all because when the timer isprogrammed to trigger, a running vcpu hasn't been added back to the qyet. In this situation, correct me if I'm wrong, the repl_handler iscalled before the context switch is finished. However, scanning thecurrently running vcpus can fix this.


After some debugging, it appeared that the timer was only triggered once and in 
the timer handler, none of the vcpus was replenished.



Because you set the timer to a past time above.


After the the system boots up, I have not seen any debug prints indicating that 
the timer is not programmed based on the release time. Besides looking for why, 
I think we should set some maximum interval for the timer to do the 
replenishment.



Well, this is not correct. Replenishment should happen at some
specific time, i.e., the start of a new period of any VCPU; It should
not be an arbitrary value. Otherwise, the VCPU budget and deadline
won't be updated correctly...

+    else
+    {
+        /* reprogram the timer using the most imminent release time*/
+        set_timer(&repl_timer, min_repl);
+    }

+    spin_unlock_irqrestore(&prv->lock,flags);




Same here, need to move spin_unlock_ before the if statement.

+}
+
   static struct rt_private _rt_priv;

   const struct scheduler sched_rtds_def = {
--
1.7.9.5


Best,

Meng



-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

Thanks,
Tianyang Chen

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel

Follow-Ups:
- Re: [Xen-devel] [PATCH V2 1/1] Improved RTDS scheduler
  - From: Meng Xu

References:
- Re: [Xen-devel] [PATCH V2 1/1] Improved RTDS scheduler
  - From: Tianyang Chen
- Re: [Xen-devel] [PATCH V2 1/1] Improved RTDS scheduler
  - From: Meng Xu

Prev by Date: Re: [Xen-devel] Error booting Xen
Next by Date: Re: [Xen-devel] [PATCH v3 2/3] XENVER_build_id: Provide ld-embedded build-ids (v8)
Previous by thread: Re: [Xen-devel] [PATCH V2 1/1] Improved RTDS scheduler
Next by thread: Re: [Xen-devel] [PATCH V2 1/1] Improved RTDS scheduler
Index(es):
- Date
- Thread

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.