[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] xenmon: fix domain numbering


  • To: xen-devel <xen-devel@xxxxxxxxxxxxxxxxxxx>
  • From: Rob Gardner <rob.gardner@xxxxxx>
  • Date: Thu, 06 Jul 2006 12:44:07 -0600
  • Delivery-date: Thu, 06 Jul 2006 11:44:37 -0700
  • List-id: Xen developer discussion <xen-devel.lists.xensource.com>

This patch fixes the failure of xenbaked/xenmon to handle large
domain IDs. Xen domain IDs increase monotonically as domains
are created; the IDs are not (often) recycled. Xenbaked was using the domain
IDs as indices into arrays of data, and this scheme blows up as soon as
a domain ID exceeds the array size. The code in xenbaked and xenmon has
been changed to isolate domain IDs from array indices, so every lookup is
indirect. Users should not notice any difference in behavior.

Rob Gardner


# HG changeset patch
# User rob.gardner@xxxxxx
# Node ID 67f658b84025efeb2a570d2937ebc8f5e35056f7
# Parent  462d6e4cb29a620685f7c382a2372edcc99e2e4a

Domain ID/index mapping in xenmon.

This patch fixes the failure of xenbaked/xenmon to handle large
domain IDs. Xen domain IDs increase monotonically as domains
are created; the IDs are not (often) recycled. Xenbaked was using the domain
IDs as indices into arrays of data, and this scheme blows up as soon as
a domain ID exceeds the array size. The code in xenbaked and xenmon has
been changed to isolate domain IDs from array indices, so every lookup is
indirect. Users should not notice any difference in behavior.

diff -r 462d6e4cb29a -r 67f658b84025 tools/xenmon/xenmon.py
--- a/tools/xenmon/xenmon.py    Wed Jul  5 13:27:27 2006
+++ b/tools/xenmon/xenmon.py    Thu Jul  6 18:08:37 2006
@@ -36,10 +36,10 @@
# constants
NSAMPLES = 100
NDOMAINS = 32
-IDLE_DOMAIN = 31 # idle domain's ID
+IDLE_DOMAIN = -1 # idle domain's ID

# the struct strings for qos_info
-ST_DOM_INFO = "6Q4i32s"
+ST_DOM_INFO = "6Q3i2H32s"
ST_QDATA = "%dQ" % (6*NDOMAINS + 4)

# size of mmaped file
@@ -297,6 +297,7 @@
            samples = []
            doms = []
            dom_in_use = []
+            domain_id = []

            # read in data
            for i in range(0, NSAMPLES):
@@ -311,9 +312,13 @@
                doms.append(dom)
# (last_update_time, start_time, runnable_start_time, blocked_start_time,
#         ns_since_boot, ns_oncpu_since_boot, runnable_at_last_update,
-#         runnable, in_use, domid, name) = dom
+#         runnable, in_use, domid, junk, name) = dom
#        dom_in_use.append(in_use)
                dom_in_use.append(dom[8])
+                domid = dom[9]
+                if domid == 32767 :
+                    domid = IDLE_DOMAIN
+                domain_id.append(domid)
                idx += len
#            print "dom_in_use(cpu=%d): " % cpuidx, dom_in_use

@@ -366,16 +371,16 @@
            if not dom_in_use[dom]:
                continue

-            if h1[dom][0][1] > 0 or dom == NDOMAINS - 1:
+            if h1[dom][0][1] > 0 or domain_id[dom] == IDLE_DOMAIN:
                # display gotten
                row += 1
                col = 2
-                display_domain_id(stdscr, row, col, dom)
+                display_domain_id(stdscr, row, col, domain_id[dom])
                col += 4
                display(stdscr, row, col, "%s" % time_scale(h2[dom][0][0]))
                col += 12
                display(stdscr, row, col, "%3.2f%%" % h2[dom][0][1])
-                if dom != NDOMAINS - 1:
+                if dom != IDLE_DOMAIN:
                    cpu_10sec_usage += h2[dom][0][1]
                col += 12
display(stdscr, row, col, "%s/ex" % time_scale(h2[dom][0][2]))
@@ -388,14 +393,14 @@
                col += 18
                display(stdscr, row, col, "Gotten")

-                if dom != NDOMAINS - 1:
+                if dom != IDLE_DOMAIN:
                    cpu_1sec_usage = cpu_1sec_usage + h1[dom][0][1]
# display allocated
                if options.allocated:
                    row += 1
                    col = 2
-                    display_domain_id(stdscr, row, col, dom)
+                    display_domain_id(stdscr, row, col, domain_id[dom])
                    col += 28
display(stdscr, row, col, "%s/ex" % time_scale(h2[dom][1]))
                    col += 42
@@ -407,7 +412,7 @@
                if options.blocked:
                    row += 1
                    col = 2
-                    display_domain_id(stdscr, row, col, dom)
+                    display_domain_id(stdscr, row, col, domain_id[dom])
                    col += 4
display(stdscr, row, col, "%s" % time_scale(h2[dom][2][0]))
                    col += 12
@@ -427,7 +432,7 @@
                if options.waited:
                    row += 1
                    col = 2
-                    display_domain_id(stdscr, row, col, dom)
+                    display_domain_id(stdscr, row, col, domain_id[dom])
                    col += 4
display(stdscr, row, col, "%s" % time_scale(h2[dom][3][0]))
                    col += 12
@@ -447,7 +452,7 @@
                if options.excount:
                    row += 1
                    col = 2
-                    display_domain_id(stdscr, row, col, dom)
+                    display_domain_id(stdscr, row, col, domain_id[dom])
col += 28
                    display(stdscr, row, col, "%d/s" % h2[dom][4])
@@ -460,7 +465,7 @@
                if options.iocount:
                    row += 1
                    col = 2
-                    display_domain_id(stdscr, row, col, dom)
+                    display_domain_id(stdscr, row, col, domain_id[dom])
                    col += 4
                    display(stdscr, row, col, "%d/s" % h2[dom][5][0])
                    col += 24
@@ -544,6 +549,9 @@
            self.file.write(self.delay_data)
    self.file.write(str)

+    def rename(self, name):
+        self.filename = name
+
    def flush(self):
        if  self.opened:
            self.file.flush()
@@ -567,10 +575,7 @@
    curr = last = time.time()
    outfiles = {}
    for dom in range(0, NDOMAINS):
-        if dom == IDLE_DOMAIN:
-            outfiles[dom] = Delayed("%s-idle.log" % options.prefix, 'w')
-        else:
-            outfiles[dom] = Delayed("%s-dom%d.log" % (options.prefix, dom), 'w')
+        outfiles[dom] = Delayed("%s-dom%d.log" % (options.prefix, dom), 'w')
        outfiles[dom].delayed_write("# passed cpu dom cpu(tot) cpu(%) cpu/ex allocated/ex blocked(tot) blocked(%) blocked/io waited(tot) waited(%) waited/ex ex/s io(tot) io/ex\n")

    while options.duration == 0 or interval < (options.duration * 1000):
@@ -582,6 +587,7 @@
            samples = []
            doms = []
            dom_in_use = []
+            domain_id = []

            for i in range(0, NSAMPLES):
                len = struct.calcsize(ST_QDATA)
@@ -595,8 +601,16 @@
#                doms.append(dom)
# (last_update_time, start_time, runnable_start_time, blocked_start_time,
#         ns_since_boot, ns_oncpu_since_boot, runnable_at_last_update,
-#         runnable, in_use, domid, name) = dom
+#         runnable, in_use, domid, junk, name) = dom
                dom_in_use.append(dom[8])
+                domid = dom[9]
+                if domid == 32767:
+                    domid = IDLE_DOMAIN
+                domain_id.append(domid)
+                if domid == IDLE_DOMAIN:
+                    outfiles[i].rename("%s-idle.log" % options.prefix)
+                else:
+                    outfiles[i].rename("%s-dom%d.log" % (options.prefix, domid))
                idx += len

            len = struct.calcsize("4i")
@@ -617,9 +631,9 @@
            for dom in range(0, NDOMAINS):
                if not dom_in_use[dom]:
                    continue
-                if h1[dom][0][1] > 0 or dom == NDOMAINS - 1:
+                if h1[dom][0][1] > 0 or dom == IDLE_DOMAIN:
outfiles[dom].write("%.3f %d %d %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n" %
-                                     (interval, cpuidx, dom,
+                                     (interval, cpuidx, domain_id[dom],
h1[dom][0][0], h1[dom][0][1], h1[dom][0][2],
                                     h1[dom][1],
h1[dom][2][0], h1[dom][2][1], h1[dom][2][2],
diff -r 462d6e4cb29a -r 67f658b84025 tools/xenmon/xenbaked.c
--- a/tools/xenmon/xenbaked.c    Wed Jul  5 13:27:27 2006
+++ b/tools/xenmon/xenbaked.c    Thu Jul  6 18:08:37 2006
@@ -95,6 +95,8 @@
_new_qos_data *new_qos;
_new_qos_data **cpu_qos_data;

+int global_cpu;
+uint64_t global_now;

// array of currently running domains, indexed by cpu
int *running = NULL;
@@ -678,7 +680,7 @@
};


-const char *argp_program_version     = "xenbaked v1.3";
+const char *argp_program_version     = "xenbaked v1.4";
const char *argp_program_bug_address = "<rob.gardner@xxxxxx>";


@@ -715,16 +717,117 @@
    return ret;
}

+void qos_init_domain(int domid, int idx)
+{
+  int i;
+
+  memset(&new_qos->domain_info[idx], 0, sizeof(_domain_info));
+  new_qos->domain_info[idx].last_update_time = global_now;
+  //  runnable_start_time[idx] = 0;
+  new_qos->domain_info[idx].runnable_start_time = 0; // invalidate
+  new_qos->domain_info[idx].in_use = 1;
+  new_qos->domain_info[idx].blocked_start_time = 0;
+  new_qos->domain_info[idx].id = domid;
+  if (domid == IDLE_DOMAIN_ID)
+    sprintf(new_qos->domain_info[idx].name, "Idle Task%d", global_cpu);
+  else
+    sprintf(new_qos->domain_info[idx].name, "Domain#%d", domid);
+  
+  for (i=0; i<NSAMPLES; i++) {
+    new_qos->qdata[i].ns_gotten[idx] = 0;
+    new_qos->qdata[i].ns_allocated[idx] = 0;
+    new_qos->qdata[i].ns_waiting[idx] = 0;
+    new_qos->qdata[i].ns_blocked[idx] = 0;
+    new_qos->qdata[i].switchin_count[idx] = 0;
+    new_qos->qdata[i].io_count[idx] = 0;
+  }
+}
+
+void global_init_domain(int domid, int idx)
+{
+  int cpu;
+  _new_qos_data *saved_qos;
+  
+  saved_qos = new_qos;
+  
+  for (cpu=0; cpu<NCPU; cpu++) {
+    new_qos = cpu_qos_data[cpu];
+    qos_init_domain(domid, idx);
+  }
+  new_qos = saved_qos;
+}
+
+
+// give index of this domain in the qos data array
+int indexof(int domid)
+{
+  int idx;
+  xc_dominfo_t dominfo[NDOMAINS];
+  int xc_handle, ndomains;
+  extern void qos_kill_thread(int domid);
+  
+  if (domid < 0 || domid >= NDOMAINS) { // shouldn't happen
+    if (domid != IDLE_DOMAIN_ID) {
+      printf("bad domain id: %d\r\n", domid);
+      return 0;
+    }
+  }
+
+  for (idx=0; idx<NDOMAINS; idx++)
+    if ( (new_qos->domain_info[idx].id == domid) && new_qos->domain_info[idx].in_use)
+      return idx;
+
+  // not found, make a new entry
+  for (idx=0; idx<NDOMAINS; idx++)
+    if (new_qos->domain_info[idx].in_use == 0) {
+      global_init_domain(domid, idx);
+      return idx;
+    }
+
+  // call domaininfo hypercall to try and garbage collect unused entries
+  xc_handle = xc_interface_open();
+  ndomains = xc_domain_getinfo(xc_handle, 0, NDOMAINS, dominfo);
+  xc_interface_close(xc_handle);
+
+  // for each domain in our data, look for it in the system dominfo structure
+  // and purge the domain's data from our state if it does not exist in the
+  // dominfo structure
+  for (idx=0; idx<NDOMAINS; idx++) {
+    int domid = new_qos->domain_info[idx].id;
+    int jdx;
+    
+    for (jdx=0; jdx<ndomains; jdx++) {
+      if (dominfo[jdx].domid == domid)
+    break;
+    }
+    if (jdx == ndomains)        // we didn't find domid in the dominfo struct
+      if (domid != IDLE_DOMAIN_ID) // exception for idle domain, which is not
+                               // contained in dominfo
+    qos_kill_thread(domid);    // purge our stale data
+  }
+  
+  // look again for a free slot
+  for (idx=0; idx<NDOMAINS; idx++)
+    if (new_qos->domain_info[idx].in_use == 0) {
+      global_init_domain(domid, idx);
+      return idx;
+    }
+
+  // still no space found, so bail
+  fprintf(stderr, "out of space in domain table, increase NDOMAINS\r\n");
+  exit(2);
+}
+
int domain_runnable(int domid)
{
-    return new_qos->domain_info[ID(domid)].runnable;
+    return new_qos->domain_info[indexof(domid)].runnable;
}


void update_blocked_time(int domid, uint64_t now)
{
    uint64_t t_blocked;
-    int id = ID(domid);
+    int id = indexof(domid);

    if (new_qos->domain_info[id].blocked_start_time != 0) {
        if (now >= new_qos->domain_info[id].blocked_start_time)
@@ -734,7 +837,7 @@
new_qos->qdata[new_qos->next_datapoint].ns_blocked[id] += t_blocked;
    }

-    if (domain_runnable(id))
+    if (domain_runnable(domid))
        new_qos->domain_info[id].blocked_start_time = 0;
    else
        new_qos->domain_info[id].blocked_start_time = now;
@@ -773,7 +876,7 @@
    uint64_t last_update_time, start;
    int64_t time_since_update, run_time = 0;

-    id = ID(domid);
+    id = indexof(domid);

    n = new_qos->next_datapoint;
    last_update_time = new_qos->domain_info[id].last_update_time;
@@ -851,7 +954,7 @@

    for (i=0; i<NDOMAINS; i++)
        if (new_qos->domain_info[i].in_use)
-            qos_update_thread(cpu, i, now);
+            qos_update_thread(cpu, new_qos->domain_info[i].id, now);
}


@@ -866,69 +969,37 @@
}


-void qos_init_domain(int cpu, int domid, uint64_t now)
-{
-    int i, id;
-
-    id = ID(domid);
-
-    if (new_qos->domain_info[id].in_use)
-        return;
-
-
-    memset(&new_qos->domain_info[id], 0, sizeof(_domain_info));
-    new_qos->domain_info[id].last_update_time = now;
-    //  runnable_start_time[id] = 0;
-    new_qos->domain_info[id].runnable_start_time = 0; // invalidate
-    new_qos->domain_info[id].in_use = 1;
-    new_qos->domain_info[id].blocked_start_time = 0;
-    new_qos->domain_info[id].id = id;
-    if (domid == IDLE_DOMAIN_ID)
-        sprintf(new_qos->domain_info[id].name, "Idle Task%d", cpu);
-    else
-        sprintf(new_qos->domain_info[id].name, "Domain#%d", domid);
-
-    for (i=0; i<NSAMPLES; i++) {
-        new_qos->qdata[i].ns_gotten[id] = 0;
-        new_qos->qdata[i].ns_allocated[id] = 0;
-        new_qos->qdata[i].ns_waiting[id] = 0;
-        new_qos->qdata[i].ns_blocked[id] = 0;
-        new_qos->qdata[i].switchin_count[id] = 0;
-        new_qos->qdata[i].io_count[id] = 0;
-    }
-}
-

// called when a new thread gets the cpu
void qos_switch_in(int cpu, int domid, uint64_t now, unsigned long ns_alloc, unsigned long ns_waited)
{
-    int id = ID(domid);
-
-    new_qos->domain_info[id].runnable = 1;
+    int idx = indexof(domid);
+
+    new_qos->domain_info[idx].runnable = 1;
    update_blocked_time(domid, now);
-    new_qos->domain_info[id].blocked_start_time = 0; // invalidate
-    new_qos->domain_info[id].runnable_start_time = 0; // invalidate
-    //runnable_start_time[id] = 0;
-
-    new_qos->domain_info[id].start_time = now;
-    new_qos->qdata[new_qos->next_datapoint].switchin_count[id]++;
-    new_qos->qdata[new_qos->next_datapoint].ns_allocated[id] += ns_alloc;
-    new_qos->qdata[new_qos->next_datapoint].ns_waiting[id] += ns_waited;
+    new_qos->domain_info[idx].blocked_start_time = 0; // invalidate
+    new_qos->domain_info[idx].runnable_start_time = 0; // invalidate
+    //runnable_start_time[idx] = 0;
+
+    new_qos->domain_info[idx].start_time = now;
+    new_qos->qdata[new_qos->next_datapoint].switchin_count[idx]++;
+    new_qos->qdata[new_qos->next_datapoint].ns_allocated[idx] += ns_alloc;
+    new_qos->qdata[new_qos->next_datapoint].ns_waiting[idx] += ns_waited;
    qos_update_thread_stats(cpu, domid, now);
-    set_current(cpu, id);
+    set_current(cpu, domid);

    // count up page flips for dom0 execution
-    if (id == 0)
+    if (domid == 0)
      dom0_flips = 0;
}

// called when the current thread is taken off the cpu
void qos_switch_out(int cpu, int domid, uint64_t now, unsigned long gotten)
{
-    int id = ID(domid);
+    int idx = indexof(domid);
    int n;

-    if (!is_current(id, cpu)) {
+    if (!is_current(domid, cpu)) {
// printf("switching out domain %d but it is not current. gotten=%ld\r\n", id, gotten);
    }

@@ -943,18 +1014,18 @@

    n = new_qos->next_datapoint;
#if 0
-    new_qos->qdata[n].ns_gotten[id] += gotten;
+    new_qos->qdata[n].ns_gotten[idx] += gotten;
    if (gotten > new_qos->qdata[n].ns_passed)
      printf("inconsistency #257, diff = %lld\n",
        gotten - new_qos->qdata[n].ns_passed );
#endif
-    new_qos->domain_info[id].ns_oncpu_since_boot += gotten;
-    new_qos->domain_info[id].runnable_start_time = now;
+    new_qos->domain_info[idx].ns_oncpu_since_boot += gotten;
+    new_qos->domain_info[idx].runnable_start_time = now;
    //  runnable_start_time[id] = now;
-    qos_update_thread_stats(cpu, id, now);
+    qos_update_thread_stats(cpu, domid, now);

    // process dom0 page flips
-    if (id == 0)
+    if (domid == 0)
      if (dom0_flips == 0)
    new_qos->qdata[n].flip_free_periods++;
}
@@ -963,23 +1034,30 @@
// when thread is already asleep
void qos_state_sleeping(int cpu, int domid, uint64_t now)
{
-    int id = ID(domid);
-
-    if (!domain_runnable(id))    // double call?
+    int idx;
+
+    if (!domain_runnable(domid))    // double call?
        return;

-    new_qos->domain_info[id].runnable = 0;
-    new_qos->domain_info[id].blocked_start_time = now;
-    new_qos->domain_info[id].runnable_start_time = 0; // invalidate
-    //  runnable_start_time[id] = 0; // invalidate
+    idx = indexof(domid);
+    new_qos->domain_info[idx].runnable = 0;
+    new_qos->domain_info[idx].blocked_start_time = now;
+    new_qos->domain_info[idx].runnable_start_time = 0; // invalidate
+    //  runnable_start_time[idx] = 0; // invalidate
    qos_update_thread_stats(cpu, domid, now);
}



+// domain died, presume it's dead on all cpu's, not just mostly dead
void qos_kill_thread(int domid)
{
-    new_qos->domain_info[ID(domid)].in_use = 0;
+  int cpu;
+ + for (cpu=0; cpu<NCPU; cpu++) {
+    cpu_qos_data[cpu]->domain_info[indexof(domid)].in_use = 0;
+  }
+ }


@@ -987,30 +1065,33 @@
// when thread is already runnable
void qos_state_runnable(int cpu, int domid, uint64_t now)
{
-    int id = ID(domid);
+   int idx;
+
    qos_update_thread_stats(cpu, domid, now);

-    if (domain_runnable(id))    // double call?
+    if (domain_runnable(domid))    // double call?
        return;
-    new_qos->domain_info[id].runnable = 1;
+
+    idx = indexof(domid);
+    new_qos->domain_info[idx].runnable = 1;
    update_blocked_time(domid, now);

-    new_qos->domain_info[id].blocked_start_time = 0; /* invalidate */
-    new_qos->domain_info[id].runnable_start_time = now;
+    new_qos->domain_info[idx].blocked_start_time = 0; /* invalidate */
+    new_qos->domain_info[idx].runnable_start_time = now;
    //  runnable_start_time[id] = now;
}


void qos_count_packets(domid_t domid, uint64_t now)
{
-  int i, id = ID(domid);
+  int i, idx = indexof(domid);
  _new_qos_data *cpu_data;

  for (i=0; i<NCPU; i++) {
    cpu_data = cpu_qos_data[i];
-    if (cpu_data->domain_info[id].in_use) {
-      cpu_data->qdata[cpu_data->next_datapoint].io_count[id]++;
+    if (cpu_data->domain_info[idx].in_use) {
+      cpu_data->qdata[cpu_data->next_datapoint].io_count[idx]++;
    }
  }

@@ -1019,30 +1100,18 @@
}


-int domain_ok(int cpu, int domid, uint64_t now)
-{
-    if (domid == IDLE_DOMAIN_ID)
-        domid = NDOMAINS-1;
-    if (domid < 0 || domid >= NDOMAINS) {
-        printf("bad domain id: %d\r\n", domid);
-        return 0;
-    }
-    if (new_qos->domain_info[domid].in_use == 0)
-        qos_init_domain(cpu, domid, now);
-    return 1;
-}
-
-
void process_record(int cpu, struct t_rec *r)
{
  uint64_t now;

-
  new_qos = cpu_qos_data[cpu];

  rec_count++;

  now = ((double)r->cycles) / (opts.cpu_freq / 1000.0);
+
+  global_now = now;
+  global_cpu = cpu;

  log_event(r->event);

@@ -1050,46 +1119,38 @@

  case TRC_SCHED_SWITCH_INFPREV:
    // domain data[0] just switched out and received data[1] ns of cpu time
-    if (domain_ok(cpu, r->data[0], now))
-      qos_switch_out(cpu, r->data[0], now, r->data[1]);
+    qos_switch_out(cpu, r->data[0], now, r->data[1]);
    //    printf("ns_gotten %ld\n", r->data[1]);
    break;
case TRC_SCHED_SWITCH_INFNEXT:
    // domain data[0] just switched in and
    // waited data[1] ns, and was allocated data[2] ns of cpu time
-    if (domain_ok(cpu, r->data[0], now))
-      qos_switch_in(cpu, r->data[0], now, r->data[2], r->data[1]);
+    qos_switch_in(cpu, r->data[0], now, r->data[2], r->data[1]);
    break;
case TRC_SCHED_DOM_ADD:
-    if (domain_ok(cpu, r->data[0], now))
-      qos_init_domain(cpu, r->data[0],  now);
+    (void) indexof(r->data[0]);
    break;
case TRC_SCHED_DOM_REM:
-    if (domain_ok(cpu, r->data[0], now))
-      qos_kill_thread(r->data[0]);
+    qos_kill_thread(r->data[0]);
    break;
case TRC_SCHED_SLEEP:
-    if (domain_ok(cpu, r->data[0], now))
-      qos_state_sleeping(cpu, r->data[0], now);
+    qos_state_sleeping(cpu, r->data[0], now);
    break;
case TRC_SCHED_WAKE:
-    if (domain_ok(cpu, r->data[0], now))
-      qos_state_runnable(cpu, r->data[0], now);
+    qos_state_runnable(cpu, r->data[0], now);
    break;
case TRC_SCHED_BLOCK:
-    if (domain_ok(cpu, r->data[0], now))
-      qos_state_sleeping(cpu, r->data[0], now);
+    qos_state_sleeping(cpu, r->data[0], now);
    break;
case TRC_MEM_PAGE_GRANT_TRANSFER:
-    if (domain_ok(cpu, r->data[0], now))
-      qos_count_packets(r->data[0], now);
+    qos_count_packets(r->data[0], now);
    break;
default:

# HG changeset patch
# User rob.gardner@xxxxxx
# Node ID 67f658b84025efeb2a570d2937ebc8f5e35056f7
# Parent  462d6e4cb29a620685f7c382a2372edcc99e2e4a

Domain ID/index mapping in xenmon.

This patch fixes the failure of xenbaked/xenmon to handle large
domain IDs. Xen domain IDs increase monotonically as domains
are created; the IDs are not (often) recycled. Xenbaked was using the domain
IDs as indices into arrays of data, and this scheme blows up as soon as
a domain ID exceeds the array size. The code in xenbaked and xenmon has
been changed to isolate domain IDs from array indices, so every lookup is
indirect. Users should not notice any difference in behavior.

diff -r 462d6e4cb29a -r 67f658b84025 tools/xenmon/xenmon.py
--- a/tools/xenmon/xenmon.py    Wed Jul  5 13:27:27 2006
+++ b/tools/xenmon/xenmon.py    Thu Jul  6 18:08:37 2006
@@ -36,10 +36,10 @@
 # constants
 NSAMPLES = 100
 NDOMAINS = 32
-IDLE_DOMAIN = 31 # idle domain's ID
+IDLE_DOMAIN = -1 # idle domain's ID
 
 # the struct strings for qos_info
-ST_DOM_INFO = "6Q4i32s"
+ST_DOM_INFO = "6Q3i2H32s"
 ST_QDATA = "%dQ" % (6*NDOMAINS + 4)
 
 # size of mmaped file
@@ -297,6 +297,7 @@
             samples = []
             doms = []
             dom_in_use = []
+            domain_id = []
 
             # read in data
             for i in range(0, NSAMPLES):
@@ -311,9 +312,13 @@
                 doms.append(dom)
 #              (last_update_time, start_time, runnable_start_time, 
blocked_start_time,
 #               ns_since_boot, ns_oncpu_since_boot, runnable_at_last_update,
-#               runnable, in_use, domid, name) = dom
+#               runnable, in_use, domid, junk, name) = dom
 #              dom_in_use.append(in_use)
                 dom_in_use.append(dom[8])
+                domid = dom[9]
+                if domid == 32767 :
+                    domid = IDLE_DOMAIN
+                domain_id.append(domid)
                 idx += len
 #            print "dom_in_use(cpu=%d): " % cpuidx, dom_in_use
 
@@ -366,16 +371,16 @@
             if not dom_in_use[dom]:
                 continue
 
-            if h1[dom][0][1] > 0 or dom == NDOMAINS - 1:
+            if h1[dom][0][1] > 0 or domain_id[dom] == IDLE_DOMAIN:
                 # display gotten
                 row += 1 
                 col = 2
-                display_domain_id(stdscr, row, col, dom)
+                display_domain_id(stdscr, row, col, domain_id[dom])
                 col += 4
                 display(stdscr, row, col, "%s" % time_scale(h2[dom][0][0]))
                 col += 12
                 display(stdscr, row, col, "%3.2f%%" % h2[dom][0][1])
-                if dom != NDOMAINS - 1:
+                if dom != IDLE_DOMAIN:
                     cpu_10sec_usage += h2[dom][0][1]
                 col += 12
                 display(stdscr, row, col, "%s/ex" % time_scale(h2[dom][0][2]))
@@ -388,14 +393,14 @@
                 col += 18
                 display(stdscr, row, col, "Gotten")
 
-                if dom != NDOMAINS - 1:
+                if dom != IDLE_DOMAIN:
                     cpu_1sec_usage = cpu_1sec_usage + h1[dom][0][1]
     
                 # display allocated
                 if options.allocated:
                     row += 1
                     col = 2
-                    display_domain_id(stdscr, row, col, dom)
+                    display_domain_id(stdscr, row, col, domain_id[dom])
                     col += 28
                     display(stdscr, row, col, "%s/ex" % time_scale(h2[dom][1]))
                     col += 42
@@ -407,7 +412,7 @@
                 if options.blocked:
                     row += 1
                     col = 2
-                    display_domain_id(stdscr, row, col, dom)
+                    display_domain_id(stdscr, row, col, domain_id[dom])
                     col += 4
                     display(stdscr, row, col, "%s" % time_scale(h2[dom][2][0]))
                     col += 12
@@ -427,7 +432,7 @@
                 if options.waited:
                     row += 1
                     col = 2
-                    display_domain_id(stdscr, row, col, dom)
+                    display_domain_id(stdscr, row, col, domain_id[dom])
                     col += 4
                     display(stdscr, row, col, "%s" % time_scale(h2[dom][3][0]))
                     col += 12
@@ -447,7 +452,7 @@
                 if options.excount:
                     row += 1
                     col = 2
-                    display_domain_id(stdscr, row, col, dom)
+                    display_domain_id(stdscr, row, col, domain_id[dom])
                     
                     col += 28
                     display(stdscr, row, col, "%d/s" % h2[dom][4])
@@ -460,7 +465,7 @@
                 if options.iocount:
                     row += 1
                     col = 2
-                    display_domain_id(stdscr, row, col, dom)
+                    display_domain_id(stdscr, row, col, domain_id[dom])
                     col += 4
                     display(stdscr, row, col, "%d/s" % h2[dom][5][0])
                     col += 24
@@ -544,6 +549,9 @@
             self.file.write(self.delay_data)
        self.file.write(str)
 
+    def rename(self, name):
+        self.filename = name
+
     def flush(self):
         if  self.opened:
             self.file.flush()
@@ -567,10 +575,7 @@
     curr = last = time.time()
     outfiles = {}
     for dom in range(0, NDOMAINS):
-        if dom == IDLE_DOMAIN:
-            outfiles[dom] = Delayed("%s-idle.log" % options.prefix, 'w')
-        else:
-            outfiles[dom] = Delayed("%s-dom%d.log" % (options.prefix, dom), 
'w')
+        outfiles[dom] = Delayed("%s-dom%d.log" % (options.prefix, dom), 'w')
         outfiles[dom].delayed_write("# passed cpu dom cpu(tot) cpu(%) cpu/ex 
allocated/ex blocked(tot) blocked(%) blocked/io waited(tot) waited(%) waited/ex 
ex/s io(tot) io/ex\n")
 
     while options.duration == 0 or interval < (options.duration * 1000):
@@ -582,6 +587,7 @@
             samples = []
             doms = []
             dom_in_use = []
+            domain_id = []
 
             for i in range(0, NSAMPLES):
                 len = struct.calcsize(ST_QDATA)
@@ -595,8 +601,16 @@
 #                doms.append(dom)
 #              (last_update_time, start_time, runnable_start_time, 
blocked_start_time,
 #               ns_since_boot, ns_oncpu_since_boot, runnable_at_last_update,
-#               runnable, in_use, domid, name) = dom
+#               runnable, in_use, domid, junk, name) = dom
                 dom_in_use.append(dom[8])
+                domid = dom[9]
+                if domid == 32767:
+                    domid = IDLE_DOMAIN
+                domain_id.append(domid)
+                if domid == IDLE_DOMAIN:
+                    outfiles[i].rename("%s-idle.log" % options.prefix)
+                else:
+                    outfiles[i].rename("%s-dom%d.log" % (options.prefix, 
domid))
                 idx += len
 
             len = struct.calcsize("4i")
@@ -617,9 +631,9 @@
             for dom in range(0, NDOMAINS):
                 if not dom_in_use[dom]:
                     continue
-                if h1[dom][0][1] > 0 or dom == NDOMAINS - 1:
+                if h1[dom][0][1] > 0 or dom == IDLE_DOMAIN:
                     outfiles[dom].write("%.3f %d %d %.3f %.3f %.3f %.3f %.3f 
%.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n" %
-                                     (interval, cpuidx, dom,
+                                     (interval, cpuidx, domain_id[dom],
                                      h1[dom][0][0], h1[dom][0][1], 
h1[dom][0][2],
                                      h1[dom][1],
                                      h1[dom][2][0], h1[dom][2][1], 
h1[dom][2][2],
diff -r 462d6e4cb29a -r 67f658b84025 tools/xenmon/xenbaked.c
--- a/tools/xenmon/xenbaked.c   Wed Jul  5 13:27:27 2006
+++ b/tools/xenmon/xenbaked.c   Thu Jul  6 18:08:37 2006
@@ -95,6 +95,8 @@
 _new_qos_data *new_qos;
 _new_qos_data **cpu_qos_data;
 
+int global_cpu;
+uint64_t global_now;
 
 // array of currently running domains, indexed by cpu
 int *running = NULL;
@@ -678,7 +680,7 @@
 };
 
 
-const char *argp_program_version     = "xenbaked v1.3";
+const char *argp_program_version     = "xenbaked v1.4";
 const char *argp_program_bug_address = "<rob.gardner@xxxxxx>";
 
 
@@ -715,16 +717,117 @@
     return ret;
 }
 
+void qos_init_domain(int domid, int idx)
+{
+  int i;
+
+  memset(&new_qos->domain_info[idx], 0, sizeof(_domain_info));
+  new_qos->domain_info[idx].last_update_time = global_now;
+  //  runnable_start_time[idx] = 0;
+  new_qos->domain_info[idx].runnable_start_time = 0; // invalidate
+  new_qos->domain_info[idx].in_use = 1;
+  new_qos->domain_info[idx].blocked_start_time = 0;
+  new_qos->domain_info[idx].id = domid;
+  if (domid == IDLE_DOMAIN_ID)
+    sprintf(new_qos->domain_info[idx].name, "Idle Task%d", global_cpu);
+  else
+    sprintf(new_qos->domain_info[idx].name, "Domain#%d", domid);
+  
+  for (i=0; i<NSAMPLES; i++) {
+    new_qos->qdata[i].ns_gotten[idx] = 0;
+    new_qos->qdata[i].ns_allocated[idx] = 0;
+    new_qos->qdata[i].ns_waiting[idx] = 0;
+    new_qos->qdata[i].ns_blocked[idx] = 0;
+    new_qos->qdata[i].switchin_count[idx] = 0;
+    new_qos->qdata[i].io_count[idx] = 0;
+  }
+}
+
+void global_init_domain(int domid, int idx) 
+{
+  int cpu;
+  _new_qos_data *saved_qos;
+  
+  saved_qos = new_qos;
+  
+  for (cpu=0; cpu<NCPU; cpu++) {
+    new_qos = cpu_qos_data[cpu];
+    qos_init_domain(domid, idx);
+  }
+  new_qos = saved_qos;
+}
+
+
+// give index of this domain in the qos data array
+int indexof(int domid)
+{
+  int idx;
+  xc_dominfo_t dominfo[NDOMAINS];
+  int xc_handle, ndomains;
+  extern void qos_kill_thread(int domid);
+  
+  if (domid < 0 || domid >= NDOMAINS) {        // shouldn't happen
+    if (domid != IDLE_DOMAIN_ID) {
+      printf("bad domain id: %d\r\n", domid);
+      return 0;
+    }
+  }
+
+  for (idx=0; idx<NDOMAINS; idx++)
+    if ( (new_qos->domain_info[idx].id == domid) && 
new_qos->domain_info[idx].in_use)
+      return idx;
+
+  // not found, make a new entry
+  for (idx=0; idx<NDOMAINS; idx++)
+    if (new_qos->domain_info[idx].in_use == 0) {
+      global_init_domain(domid, idx);
+      return idx;
+    }
+
+  // call domaininfo hypercall to try and garbage collect unused entries
+  xc_handle = xc_interface_open();
+  ndomains = xc_domain_getinfo(xc_handle, 0, NDOMAINS, dominfo);
+  xc_interface_close(xc_handle);
+
+  // for each domain in our data, look for it in the system dominfo structure
+  // and purge the domain's data from our state if it does not exist in the
+  // dominfo structure
+  for (idx=0; idx<NDOMAINS; idx++) {
+    int domid = new_qos->domain_info[idx].id;
+    int jdx;
+    
+    for (jdx=0; jdx<ndomains; jdx++) {
+      if (dominfo[jdx].domid == domid)
+       break;
+    }
+    if (jdx == ndomains)        // we didn't find domid in the dominfo struct
+      if (domid != IDLE_DOMAIN_ID) // exception for idle domain, which is not
+                                  // contained in dominfo
+       qos_kill_thread(domid); // purge our stale data
+  }
+  
+  // look again for a free slot
+  for (idx=0; idx<NDOMAINS; idx++)
+    if (new_qos->domain_info[idx].in_use == 0) {
+      global_init_domain(domid, idx);
+      return idx;
+    }
+
+  // still no space found, so bail
+  fprintf(stderr, "out of space in domain table, increase NDOMAINS\r\n");
+  exit(2);
+}
+
 int domain_runnable(int domid)
 {
-    return new_qos->domain_info[ID(domid)].runnable;
+    return new_qos->domain_info[indexof(domid)].runnable;
 }
 
 
 void update_blocked_time(int domid, uint64_t now)
 {
     uint64_t t_blocked;
-    int id = ID(domid);
+    int id = indexof(domid);
 
     if (new_qos->domain_info[id].blocked_start_time != 0) {
         if (now >= new_qos->domain_info[id].blocked_start_time)
@@ -734,7 +837,7 @@
         new_qos->qdata[new_qos->next_datapoint].ns_blocked[id] += t_blocked;
     }
 
-    if (domain_runnable(id))
+    if (domain_runnable(domid))
         new_qos->domain_info[id].blocked_start_time = 0;
     else
         new_qos->domain_info[id].blocked_start_time = now;
@@ -773,7 +876,7 @@
     uint64_t last_update_time, start;
     int64_t time_since_update, run_time = 0;
 
-    id = ID(domid);
+    id = indexof(domid);
 
     n = new_qos->next_datapoint;
     last_update_time = new_qos->domain_info[id].last_update_time;
@@ -851,7 +954,7 @@
 
     for (i=0; i<NDOMAINS; i++)
         if (new_qos->domain_info[i].in_use)
-            qos_update_thread(cpu, i, now);
+            qos_update_thread(cpu, new_qos->domain_info[i].id, now); 
 }
 
 
@@ -866,69 +969,37 @@
 }
 
 
-void qos_init_domain(int cpu, int domid, uint64_t now)
-{
-    int i, id;
-
-    id = ID(domid);
-
-    if (new_qos->domain_info[id].in_use)
-        return;
-
-
-    memset(&new_qos->domain_info[id], 0, sizeof(_domain_info));
-    new_qos->domain_info[id].last_update_time = now;
-    //  runnable_start_time[id] = 0;
-    new_qos->domain_info[id].runnable_start_time = 0; // invalidate
-    new_qos->domain_info[id].in_use = 1;
-    new_qos->domain_info[id].blocked_start_time = 0;
-    new_qos->domain_info[id].id = id;
-    if (domid == IDLE_DOMAIN_ID)
-        sprintf(new_qos->domain_info[id].name, "Idle Task%d", cpu);
-    else
-        sprintf(new_qos->domain_info[id].name, "Domain#%d", domid);
-
-    for (i=0; i<NSAMPLES; i++) {
-        new_qos->qdata[i].ns_gotten[id] = 0;
-        new_qos->qdata[i].ns_allocated[id] = 0;
-        new_qos->qdata[i].ns_waiting[id] = 0;
-        new_qos->qdata[i].ns_blocked[id] = 0;
-        new_qos->qdata[i].switchin_count[id] = 0;
-        new_qos->qdata[i].io_count[id] = 0;
-    }
-}
-
 
 // called when a new thread gets the cpu
 void qos_switch_in(int cpu, int domid, uint64_t now, unsigned long ns_alloc, unsigned long ns_waited)
 {
-    int id = ID(domid);
-
-    new_qos->domain_info[id].runnable = 1;
+    int idx = indexof(domid);
+
+    new_qos->domain_info[idx].runnable = 1;
     update_blocked_time(domid, now);
-    new_qos->domain_info[id].blocked_start_time = 0; // invalidate
-    new_qos->domain_info[id].runnable_start_time = 0; // invalidate
-    //runnable_start_time[id] = 0;
-
-    new_qos->domain_info[id].start_time = now;
-    new_qos->qdata[new_qos->next_datapoint].switchin_count[id]++;
-    new_qos->qdata[new_qos->next_datapoint].ns_allocated[id] += ns_alloc;
-    new_qos->qdata[new_qos->next_datapoint].ns_waiting[id] += ns_waited;
+    new_qos->domain_info[idx].blocked_start_time = 0; // invalidate
+    new_qos->domain_info[idx].runnable_start_time = 0; // invalidate
+    //runnable_start_time[idx] = 0;
+
+    new_qos->domain_info[idx].start_time = now;
+    new_qos->qdata[new_qos->next_datapoint].switchin_count[idx]++;
+    new_qos->qdata[new_qos->next_datapoint].ns_allocated[idx] += ns_alloc;
+    new_qos->qdata[new_qos->next_datapoint].ns_waiting[idx] += ns_waited;
     qos_update_thread_stats(cpu, domid, now);
-    set_current(cpu, id);
+    set_current(cpu, domid);
 
     // count up page flips for dom0 execution
-    if (id == 0)
+    if (domid == 0)
       dom0_flips = 0;
 }
 
 // called when the current thread is taken off the cpu
 void qos_switch_out(int cpu, int domid, uint64_t now, unsigned long gotten)
 {
-    int id = ID(domid);
+    int idx = indexof(domid);
     int n;
 
-    if (!is_current(id, cpu)) {
+    if (!is_current(domid, cpu)) {
         //    printf("switching out domain %d but it is not current. gotten=%ld\r\n", id, gotten);
     }
 
@@ -943,18 +1014,18 @@
 
     n = new_qos->next_datapoint;
 #if 0
-    new_qos->qdata[n].ns_gotten[id] += gotten;
+    new_qos->qdata[n].ns_gotten[idx] += gotten;
     if (gotten > new_qos->qdata[n].ns_passed)
       printf("inconsistency #257, diff = %lld\n",
            gotten - new_qos->qdata[n].ns_passed );
 #endif
-    new_qos->domain_info[id].ns_oncpu_since_boot += gotten;
-    new_qos->domain_info[id].runnable_start_time = now;
+    new_qos->domain_info[idx].ns_oncpu_since_boot += gotten;
+    new_qos->domain_info[idx].runnable_start_time = now;
     //  runnable_start_time[id] = now;
-    qos_update_thread_stats(cpu, id, now);
+    qos_update_thread_stats(cpu, domid, now);
 
     // process dom0 page flips
-    if (id == 0)
+    if (domid == 0)
       if (dom0_flips == 0)
        new_qos->qdata[n].flip_free_periods++;
 }
@@ -963,23 +1034,30 @@
 // when thread is already asleep
 void qos_state_sleeping(int cpu, int domid, uint64_t now) 
 {
-    int id = ID(domid);
-
-    if (!domain_runnable(id))  // double call?
+    int idx;
+
+    if (!domain_runnable(domid))       // double call?
         return;
 
-    new_qos->domain_info[id].runnable = 0;
-    new_qos->domain_info[id].blocked_start_time = now;
-    new_qos->domain_info[id].runnable_start_time = 0; // invalidate
-    //  runnable_start_time[id] = 0; // invalidate
+    idx = indexof(domid);
+    new_qos->domain_info[idx].runnable = 0;
+    new_qos->domain_info[idx].blocked_start_time = now;
+    new_qos->domain_info[idx].runnable_start_time = 0; // invalidate
+    //  runnable_start_time[idx] = 0; // invalidate
     qos_update_thread_stats(cpu, domid, now);
 }
 
 
 
+// domain died, presume it's dead on all cpu's, not just mostly dead
 void qos_kill_thread(int domid)
 {
-    new_qos->domain_info[ID(domid)].in_use = 0;
+  int cpu;
+  
+  for (cpu=0; cpu<NCPU; cpu++) {
+    cpu_qos_data[cpu]->domain_info[indexof(domid)].in_use = 0;
+  }
+  
 }
 
 
@@ -987,30 +1065,33 @@
 // when thread is already runnable
 void qos_state_runnable(int cpu, int domid, uint64_t now)
 {
-    int id = ID(domid);
+   int idx;
+  
 
     qos_update_thread_stats(cpu, domid, now);
 
-    if (domain_runnable(id))   // double call?
+    if (domain_runnable(domid))        // double call?
         return;
-    new_qos->domain_info[id].runnable = 1;
+
+    idx = indexof(domid);
+    new_qos->domain_info[idx].runnable = 1;
     update_blocked_time(domid, now);
 
-    new_qos->domain_info[id].blocked_start_time = 0; /* invalidate */
-    new_qos->domain_info[id].runnable_start_time = now;
+    new_qos->domain_info[idx].blocked_start_time = 0; /* invalidate */
+    new_qos->domain_info[idx].runnable_start_time = now;
     //  runnable_start_time[id] = now;
 }
 
 
 void qos_count_packets(domid_t domid, uint64_t now)
 {
-  int i, id = ID(domid);
+  int i, idx = indexof(domid);
   _new_qos_data *cpu_data;
 
   for (i=0; i<NCPU; i++) {
     cpu_data = cpu_qos_data[i];
-    if (cpu_data->domain_info[id].in_use) {
-      cpu_data->qdata[cpu_data->next_datapoint].io_count[id]++;
+    if (cpu_data->domain_info[idx].in_use) {
+      cpu_data->qdata[cpu_data->next_datapoint].io_count[idx]++;
     }
   }
 
@@ -1019,30 +1100,18 @@
 }
 
 
-int domain_ok(int cpu, int domid, uint64_t now)
-{
-    if (domid == IDLE_DOMAIN_ID)
-        domid = NDOMAINS-1;
-    if (domid < 0 || domid >= NDOMAINS) {
-        printf("bad domain id: %d\r\n", domid);
-        return 0;
-    }
-    if (new_qos->domain_info[domid].in_use == 0)
-        qos_init_domain(cpu, domid, now);
-    return 1;
-}
-
-
 void process_record(int cpu, struct t_rec *r)
 {
   uint64_t now;
 
-
   new_qos = cpu_qos_data[cpu];
 
   rec_count++;
 
   now = ((double)r->cycles) / (opts.cpu_freq / 1000.0);
+
+  global_now = now;
+  global_cpu = cpu;
 
   log_event(r->event);
 
@@ -1050,46 +1119,38 @@
 
   case TRC_SCHED_SWITCH_INFPREV:
     // domain data[0] just switched out and received data[1] ns of cpu time
-    if (domain_ok(cpu, r->data[0], now))
-      qos_switch_out(cpu, r->data[0], now, r->data[1]);
+    qos_switch_out(cpu, r->data[0], now, r->data[1]);
     //    printf("ns_gotten %ld\n", r->data[1]);
     break;
     
   case TRC_SCHED_SWITCH_INFNEXT:
     // domain data[0] just switched in and
     // waited data[1] ns, and was allocated data[2] ns of cpu time
-    if (domain_ok(cpu, r->data[0], now))
-      qos_switch_in(cpu, r->data[0], now, r->data[2], r->data[1]);
+    qos_switch_in(cpu, r->data[0], now, r->data[2], r->data[1]);
     break;
     
   case TRC_SCHED_DOM_ADD:
-    if (domain_ok(cpu, r->data[0], now))
-      qos_init_domain(cpu, r->data[0],  now);
+    (void) indexof(r->data[0]);
     break;
     
   case TRC_SCHED_DOM_REM:
-    if (domain_ok(cpu, r->data[0], now))
-      qos_kill_thread(r->data[0]);
+    qos_kill_thread(r->data[0]);
     break;
     
   case TRC_SCHED_SLEEP:
-    if (domain_ok(cpu, r->data[0], now))
-      qos_state_sleeping(cpu, r->data[0], now);
+    qos_state_sleeping(cpu, r->data[0], now);
     break;
     
   case TRC_SCHED_WAKE:
-    if (domain_ok(cpu, r->data[0], now))
-      qos_state_runnable(cpu, r->data[0], now);
+    qos_state_runnable(cpu, r->data[0], now);
     break;
     
   case TRC_SCHED_BLOCK:
-    if (domain_ok(cpu, r->data[0], now))
-      qos_state_sleeping(cpu, r->data[0], now);
+    qos_state_sleeping(cpu, r->data[0], now);
     break;
     
   case TRC_MEM_PAGE_GRANT_TRANSFER:
-    if (domain_ok(cpu, r->data[0], now))
-      qos_count_packets(r->data[0], now);
+    qos_count_packets(r->data[0], now);
     break;
     
   default:
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.