commit 0f8ef8f23718cc24b0bc958979aa789be4ed89d5
Author: Dario Faggioli <dfaggioli@xxxxxxxx>
Date:   Tue Mar 2 19:03:05 2021 +0000

    Debug patch for suspect scheduler issues.
    
    Signed-off-by: Dario Faggioli <dfaggioli@xxxxxxxx>

diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c
index b7f4e2bea8..8ce705bd48 100644
--- a/tools/xentrace/xenalyze.c
+++ b/tools/xentrace/xenalyze.c
@@ -7440,6 +7440,17 @@ void sched_process(struct pcpu_info *p)
         /* TRC_SCHED_VERBOSE */
         switch(ri->event)
         {
+        case TRC_SCHED_MAX_INTRV:
+            if(opt.dump_all) {
+                struct {
+                    unsigned int domid, vcpuid;
+                    unsigned int interv, time, last;
+                } *r = (typeof(r))ri->d;
+
+                printf(" %s sched_max_interv %u usecs, at %u usecs with d%uv%u (last: %u usecs)\n",
+                       ri->dump_header, r->interv, r->time, r->domid, r->vcpuid, r->last);
+            }
+            break;
         case TRC_SCHED_DOM_ADD:
             if(opt.dump_all) {
                 struct {
@@ -7904,6 +7915,18 @@ void sched_process(struct pcpu_info *p)
                        ri->dump_header, r->domid, r->vcpuid);
             }
             break;
+        case TRC_SCHED_CLASS_EVT(CSCHED2, 24):
+            if(opt.dump_all) {
+                struct {
+                    unsigned int domid, vcpuid;
+                    unsigned int limits, now, exec;
+                } *r = (typeof(r))ri->d;
+
+                printf(" %s csched2:limit_credit_loss[#%u] d%uv%u, at %u, exec'd %u usecs!\n",
+                       ri->dump_header, r->limits, r->domid, r->vcpuid, r->now, r->exec);
+            }
+            break;
+
         /* RTDS (TRC_RTDS_xxx) */
         case TRC_SCHED_CLASS_EVT(RTDS, 1): /* TICKLE           */
             if(opt.dump_all) {
diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index ce7c56147b..29aa99db86 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -57,6 +57,7 @@
 #define TRC_CSCHED2_SCHEDULE         TRC_SCHED_CLASS_EVT(CSCHED2, 21)
 #define TRC_CSCHED2_RATELIMIT        TRC_SCHED_CLASS_EVT(CSCHED2, 22)
 #define TRC_CSCHED2_RUNQ_CAND_CHECK  TRC_SCHED_CLASS_EVT(CSCHED2, 23)
+#define TRC_CSCHED2_LIMIT_CREDITS    TRC_SCHED_CLASS_EVT(CSCHED2, 24)
 
 /*
  * TODO:
@@ -775,6 +776,11 @@ static int get_fallback_cpu(struct csched2_unit *svc)
     return cpumask_any(cpumask_scratch_cpu(sched_unit_master(unit)));
 }
 
+static DEFINE_PER_CPU(unsigned int, limit_credits);
+static DEFINE_PER_CPU(s_time_t, limit_credits_time);
+static DEFINE_PER_CPU(s_time_t, limit_credits_exec);
+static DEFINE_PER_CPU(struct sched_unit *, limit_credits_unit);
+
 /*
  * Time-to-credit, credit-to-time.
  *
@@ -792,7 +798,17 @@ static void t2c_update(struct csched2_runqueue_data *rqd, s_time_t time,
     /* Getting to lower credit than CSCHED2_CREDIT_MIN makes no sense. */
     val = svc->credit - val;
     if ( unlikely(val < CSCHED2_CREDIT_MIN) )
+    {
+        this_cpu(limit_credits)++;
+        this_cpu(limit_credits_time) = NOW();
+        this_cpu(limit_credits_exec) = time;
+        this_cpu(limit_credits_unit) = svc->unit;
+        TRACE_5D(TRC_CSCHED2_LIMIT_CREDITS, svc->unit->domain->domain_id,
+                 svc->unit->unit_id, this_cpu(limit_credits),
+                 (uint32_t)(this_cpu(limit_credits_time)/MICROSECS(1)),
+                 (uint32_t)(this_cpu(limit_credits_exec)/MICROSECS(1)));
         svc->credit = CSCHED2_CREDIT_MIN;
+    }
     else
         svc->credit = val;
 }
@@ -3661,6 +3677,12 @@ dump_pcpu(const struct scheduler *ops, int cpu)
            cpu, c2r(cpu),
            CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)),
            CPUMASK_PR(per_cpu(cpu_core_mask, cpu)));
+    if ( per_cpu(limit_credits_unit, cpu) != NULL ) {
+        printk("\tCredit limited: #%u, last at %"PRI_stime" as d%uv%u exec'd %"PRI_stime"\n",
+               per_cpu(limit_credits, cpu), per_cpu(limit_credits_time, cpu),
+               per_cpu(limit_credits_unit, cpu)->domain->domain_id,
+               per_cpu(limit_credits_unit, cpu)->unit_id, per_cpu(limit_credits_exec, cpu));
+    }
 
     /* current UNIT (nothing to say if that's the idle unit) */
     svc = csched2_unit(curr_on_cpu(cpu));
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 6b1ae7bf8c..4a950f3b57 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -2385,6 +2385,12 @@ static void sched_slave(void)
                          is_idle_unit(next) && !is_idle_unit(prev), now);
 }
 
+static DEFINE_PER_CPU(s_time_t, last_sched_time);
+static DEFINE_PER_CPU(s_time_t, last_sched_interval);
+static DEFINE_PER_CPU(s_time_t, max_sched_interval);
+static DEFINE_PER_CPU(s_time_t, max_sched_time);
+static DEFINE_PER_CPU(struct vcpu *, max_sched_interv_vprev);
+
 /*
  * The main function
  * - deschedule the current domain (scheduler independent).
@@ -2399,6 +2405,7 @@ static void schedule(void)
     spinlock_t           *lock;
     int cpu = smp_processor_id();
     unsigned int          gran;
+    s_time_t              sched_interval;
 
     ASSERT_NOT_IN_ATOMIC();
 
@@ -2408,6 +2415,21 @@ static void schedule(void)
 
     lock = pcpu_schedule_lock_irq(cpu);
 
+    now = NOW();
+
+    sched_interval = this_cpu(last_sched_interval) = now - this_cpu(last_sched_time);
+    if ( sched_interval > this_cpu(max_sched_interval) )
+    {
+        this_cpu(max_sched_interval) = sched_interval;
+        this_cpu(max_sched_interv_vprev) = vprev;
+	this_cpu(max_sched_time) = now;
+        TRACE_5D(TRC_SCHED_MAX_INTRV, vprev->domain->domain_id, vprev->vcpu_id,
+                 (uint32_t)(this_cpu(max_sched_interval) / MICROSECS(1)),
+		 (uint32_t)(this_cpu(max_sched_time) / MICROSECS(1)),
+                 (uint32_t)(this_cpu(last_sched_interval) / MICROSECS(1)));
+    }
+    this_cpu(last_sched_time) = now;
+
     sr = get_sched_res(cpu);
     gran = sr->granularity;
 
@@ -2427,8 +2449,6 @@ static void schedule(void)
 
     stop_timer(&sr->s_timer);
 
-    now = NOW();
-
     if ( gran > 1 )
     {
         cpumask_t mask;
@@ -3085,7 +3105,7 @@ void scheduler_free(struct scheduler *sched)
 
 void schedule_dump(struct cpupool *c)
 {
-    unsigned int      i;
+    unsigned int      i,j;
     struct scheduler *sched;
     cpumask_t        *cpus;
 
@@ -3106,11 +3126,30 @@ void schedule_dump(struct cpupool *c)
         cpus = &cpupool_free_cpus;
     }
 
-    if ( sched->dump_cpu_state != NULL )
-    {
-        printk("CPUs info:\n");
-        for_each_cpu (i, cpus)
-            sched_dump_cpu_state(sched, i);
+    printk("CPUs info:\n");
+    for_each_cpu (i, cpus) {
+        struct sched_resource *sr = get_sched_res(i);
+        unsigned long flags;
+        spinlock_t *lock;
+
+        lock = pcpu_schedule_lock_irqsave(i, &flags);
+
+        printk("CPU[%02d] current=%pv, curr=%pv, prev=%pv\n", i,
+               get_cpu_current(i), sr->curr ? sr->curr->vcpu_list : NULL,
+               sr->prev ? sr->prev->vcpu_list : NULL);
+        printk("\tlast schedule: %"PRI_stime", last_interval=%"PRI_stime", "
+               "max_interval=%"PRI_stime" at %"PRI_stime" (after running %pv)\n",
+               per_cpu(last_sched_time, i), per_cpu(last_sched_interval, i),
+               per_cpu(max_sched_interval, i), per_cpu(max_sched_time, i),
+               per_cpu(max_sched_interv_vprev, i));
+               per_cpu(max_sched_interval, i) = 0;
+        for_each_cpu (j, sr->cpus)
+            if ( i != j )
+                printk("CPU[%02d] current=%pv\n", j, get_cpu_current(j));
+
+        pcpu_schedule_unlock_irqrestore(lock, flags, i);
+
+        sched_dump_cpu_state(sched, i);
     }
 
     rcu_read_unlock(&sched_res_rculock);
diff --git a/xen/include/public/trace.h b/xen/include/public/trace.h
index d5fa4aea8d..5b3faf0fd5 100644
--- a/xen/include/public/trace.h
+++ b/xen/include/public/trace.h
@@ -117,6 +117,7 @@
 #define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED_VERBOSE + 15)
 #define TRC_SCHED_SHUTDOWN_CODE  (TRC_SCHED_VERBOSE + 16)
 #define TRC_SCHED_SWITCH_INFCONT (TRC_SCHED_VERBOSE + 17)
+#define TRC_SCHED_MAX_INTRV      (TRC_SCHED_VERBOSE + 18)
 
 #define TRC_DOM0_DOM_ADD         (TRC_DOM0_DOMOPS + 1)
 #define TRC_DOM0_DOM_REM         (TRC_DOM0_DOMOPS + 2)