
[Xen-changelog] First cut of new time interfaces and synchronisation mechanisms.



# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 43564304cf9448ad8978df6d2d0d6721b4615143
# Parent  9697bc63d4039196b15378f3b3fe406c6a445ea2

First cut of new time interfaces and synchronisation mechanisms.
Based on an initial patch from Don Fry at IBM.
Still TODO: 
 1. Testing
 2. NTP synchronisation
 3. Fix wallclock interface a bit
 4. Support for platform timers other than PIT (e.g., HPET, IBM Cyclone)
 5. Scale 64-bit TSC diffs instead of 32-bit, just for sanity
 6. Error-correcting scale factor is still slightly wrong
 7. More testing
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>

diff -r 9697bc63d403 -r 43564304cf94 xen/arch/x86/apic.c
--- a/xen/arch/x86/apic.c       Sun Jul 17 14:16:21 2005
+++ b/xen/arch/x86/apic.c       Mon Jul 18 20:22:11 2005
@@ -723,16 +723,8 @@
 static void __init setup_APIC_timer(unsigned int clocks)
 {
     unsigned long flags;
-    
     local_irq_save(flags);
-
-    /*
-     * Wait for IRQ0's slice:
-     */
-    wait_timer_tick();
-
     __setup_APIC_LVTT(clocks);
-
     local_irq_restore(flags);
 }
 
diff -r 9697bc63d403 -r 43564304cf94 linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile        Sun Jul 17 14:16:21 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile        Mon Jul 18 20:22:11 2005
@@ -19,7 +19,7 @@
 s-obj-y        :=
 
 obj-y                          += cpu/
-obj-y                          += timers/
+#obj-y                         += timers/
 obj-$(CONFIG_ACPI_BOOT)                += acpi/
 #c-obj-$(CONFIG_X86_BIOS_REBOOT)       += reboot.o
 c-obj-$(CONFIG_MCA)            += mca.o
diff -r 9697bc63d403 -r 43564304cf94 xen/common/domain.c
--- a/xen/common/domain.c       Sun Jul 17 14:16:21 2005
+++ b/xen/common/domain.c       Mon Jul 18 20:22:11 2005
@@ -42,8 +42,6 @@
     d->domain_id   = dom_id;
     v->processor  = cpu;
  
-    spin_lock_init(&d->time_lock);
-
     spin_lock_init(&d->big_lock);
 
     spin_lock_init(&d->page_alloc_lock);
diff -r 9697bc63d403 -r 43564304cf94 xen/arch/x86/vmx_intercept.c
--- a/xen/arch/x86/vmx_intercept.c      Sun Jul 17 14:16:21 2005
+++ b/xen/arch/x86/vmx_intercept.c      Mon Jul 18 20:22:11 2005
@@ -24,10 +24,10 @@
 #include <asm/vmx_virpit.h>
 #include <asm/vmx_intercept.h>
 #include <public/io/ioreq.h>
-
 #include <xen/lib.h>
 #include <xen/sched.h>
 #include <asm/current.h>
+#include <io_ports.h>
 
 #ifdef CONFIG_VMX
 
@@ -175,7 +175,7 @@
         p->port_mm)
         return 0;
     
-    if (p->addr == 0x43 &&
+    if (p->addr == PIT_MODE &&
        p->dir == 0 &&                          /* write */
         ((p->u.data >> 4) & 0x3) == 0 &&       /* latch command */
         ((p->u.data >> 6) & 0x3) == (vpit->channel)) {/* right channel */
@@ -183,7 +183,7 @@
        return 1;
     }
 
-    if (p->addr == (0x40 + vpit->channel) &&
+    if (p->addr == (PIT_CH0 + vpit->channel) &&
        p->dir == 1) {  /* read */
         p->u.data = pit_read_io(vpit);
         resume_pit_io(p);
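
The intercept above keys off the i8254 command-byte layout: bits 7:6 select the counter, bits 5:4 give the access mode (00 means "latch count for readback"), bits 3:1 the counter mode, and bit 0 selects BCD/binary counting. A hypothetical helper showing the same decode (name and signature are editorial, not part of the patch):

    /* Mirrors the latch-command test in vmx_io_intercept() above. */
    static int is_latch_cmd_for_channel(unsigned int data, unsigned int channel)
    {
        return (((data >> 4) & 0x3) == 0) &&      /* access mode 00 = latch */
               (((data >> 6) & 0x3) == channel);  /* counter-select bits    */
    }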
diff -r 9697bc63d403 -r 43564304cf94 xen/arch/x86/i8259.c
--- a/xen/arch/x86/i8259.c      Sun Jul 17 14:16:21 2005
+++ b/xen/arch/x86/i8259.c      Mon Jul 18 20:22:11 2005
@@ -19,7 +19,7 @@
 #include <asm/bitops.h>
 #include <xen/delay.h>
 #include <asm/apic.h>
-
+#include <io_ports.h>
 
 /*
  * Common place to define all x86 IRQ vectors
@@ -395,9 +395,9 @@
     /* Set the clock to HZ Hz */
 #define CLOCK_TICK_RATE 1193180 /* crystal freq (Hz) */
 #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
-    outb_p(0x34,0x43);           /* binary, mode 2, LSB/MSB, ch 0 */
-    outb_p(LATCH & 0xff , 0x40); /* LSB */
-    outb(LATCH >> 8 , 0x40);     /* MSB */
+    outb_p(0x34, PIT_MODE);        /* binary, mode 2, LSB/MSB, ch 0 */
+    outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
+    outb(LATCH >> 8, PIT_CH0);     /* MSB */
 
     setup_irq(2, &cascade);
 }
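
LATCH rounds the PIT crystal frequency to the nearest divisor that yields HZ interrupts per second: with HZ = 100, LATCH = (1193180 + 50) / 100 = 11932, for an actual tick rate of about 99.998 Hz. A standalone check of that arithmetic (editorial sketch, not part of the patch):

    #include <stdio.h>
    #define CLOCK_TICK_RATE 1193180  /* PIT crystal frequency (Hz) */
    #define HZ 100
    #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
    int main(void)
    {
        printf("LATCH=%d, actual rate=%.3f Hz\n",
               LATCH, (double)CLOCK_TICK_RATE / LATCH);  /* 11932, 99.998 */
        return 0;
    }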
diff -r 9697bc63d403 -r 43564304cf94 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Sun Jul 17 14:16:21 2005
+++ b/xen/common/page_alloc.c   Mon Jul 18 20:22:11 2005
@@ -351,10 +351,10 @@
 void scrub_heap_pages(void)
 {
     void *p;
-    unsigned long pfn, flags;
+    unsigned long pfn;
+    int cpu = smp_processor_id();
 
     printk("Scrubbing Free RAM: ");
-    watchdog_disable();
 
     for ( pfn = 0; pfn < (bitmap_size * 8); pfn++ )
     {
@@ -362,12 +362,15 @@
         if ( (pfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
             printk(".");
 
+        if ( unlikely(softirq_pending(cpu)) )
+            do_softirq();
+
         /* Quick lock-free check. */
         if ( allocated_in_map(pfn) )
             continue;
-        
-        spin_lock_irqsave(&heap_lock, flags);
-        
+
+        spin_lock_irq(&heap_lock);
+
         /* Re-check page status with lock held. */
         if ( !allocated_in_map(pfn) )
         {
@@ -385,11 +388,10 @@
                 unmap_domain_page(p);
             }
         }
-        
-        spin_unlock_irqrestore(&heap_lock, flags);
-    }
-
-    watchdog_enable();
+
+        spin_unlock_irq(&heap_lock);
+    }
+
     printk("done.\n");
 }
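
The scrub loop above replaces the old watchdog_disable/enable bracketing with periodic softirq servicing, and pairs a cheap lock-free check against the allocation bitmap with an authoritative re-check under heap_lock. Reduced to a sketch (scrub_one_page() and max_pfn are hypothetical stand-ins for the map/clear/unmap sequence and the bitmap bound):

    static void scrub_sketch(unsigned long max_pfn)
    {
        unsigned long pfn;
        int cpu = smp_processor_id();

        for ( pfn = 0; pfn < max_pfn; pfn++ )
        {
            if ( softirq_pending(cpu) )
                do_softirq();              /* keep timers alive mid-scrub */
            if ( allocated_in_map(pfn) )   /* racy but cheap early-out */
                continue;
            spin_lock_irq(&heap_lock);
            if ( !allocated_in_map(pfn) )  /* re-check with the lock held */
                scrub_one_page(pfn);       /* hypothetical helper */
            spin_unlock_irq(&heap_lock);
        }
    }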
 
diff -r 9697bc63d403 -r 43564304cf94 xen/common/ac_timer.c
--- a/xen/common/ac_timer.c     Sun Jul 17 14:16:21 2005
+++ b/xen/common/ac_timer.c     Mon Jul 18 20:22:11 2005
@@ -202,7 +202,7 @@
     do {
         heap = ac_timers[cpu].heap;
         now  = NOW();
-        
+
         while ( (GET_HEAP_SIZE(heap) != 0) &&
                 ((t = heap[1])->expires < (now + TIMER_SLOP)) )
         {
diff -r 9697bc63d403 -r 43564304cf94 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Sun Jul 17 14:16:21 2005
+++ b/xen/arch/x86/smpboot.c    Mon Jul 18 20:22:11 2005
@@ -40,6 +40,7 @@
 #include <xen/sched.h>
 #include <xen/irq.h>
 #include <xen/delay.h>
+#include <xen/softirq.h>
 #include <asm/current.h>
 #include <asm/mc146818rtc.h>
 #include <asm/desc.h>
@@ -406,6 +407,7 @@
         */
        if (cpu_has_tsc && cpu_khz)
                synchronize_tsc_ap();
+       calibrate_tsc_ap();
 }
 
 int cpucount;
@@ -464,6 +466,8 @@
 
        /* We can take interrupts now: we're officially "up". */
        local_irq_enable();
+
+        init_percpu_time();
 
        wmb();
        startup_cpu_idle_loop();
@@ -1149,6 +1153,7 @@
         */
        if (cpu_has_tsc && cpucount && cpu_khz)
                synchronize_tsc_bp();
+       calibrate_tsc_bp();
 }
 
 /* These are wrappers to interface to the new boot process.  Someone
@@ -1167,22 +1172,21 @@
 int __devinit __cpu_up(unsigned int cpu)
 {
        /* This only works at boot for x86.  See "rewrite" above. */
-       if (cpu_isset(cpu, smp_commenced_mask)) {
-               local_irq_enable();
+       if (cpu_isset(cpu, smp_commenced_mask))
                return -ENOSYS;
-       }
 
        /* In case one didn't come up */
-       if (!cpu_isset(cpu, cpu_callin_map)) {
-               local_irq_enable();
+       if (!cpu_isset(cpu, cpu_callin_map))
                return -EIO;
-       }
-
-       local_irq_enable();
+
        /* Unleash the CPU! */
        cpu_set(cpu, smp_commenced_mask);
-       while (!cpu_isset(cpu, cpu_online_map))
+       while (!cpu_isset(cpu, cpu_online_map)) {
                mb();
+               if (softirq_pending(0))
+                       do_softirq();
+       }
+
        return 0;
 }
 
diff -r 9697bc63d403 -r 43564304cf94 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Sun Jul 17 14:16:21 2005
+++ b/xen/include/xen/sched.h   Mon Jul 18 20:22:11 2005
@@ -92,7 +92,6 @@
     domid_t          domain_id;
 
     shared_info_t   *shared_info;     /* shared data area */
-    spinlock_t       time_lock;
 
     spinlock_t       big_lock;
 
diff -r 9697bc63d403 -r 43564304cf94 xen/drivers/char/console.c
--- a/xen/drivers/char/console.c        Sun Jul 17 14:16:21 2005
+++ b/xen/drivers/char/console.c        Mon Jul 18 20:22:11 2005
@@ -635,8 +635,6 @@
 
     debugtrace_bytes = bytes;
 
-    memset(debugtrace_buf, '\0', debugtrace_bytes);
-
     return 0;
 }
 __initcall(debugtrace_init);
diff -r 9697bc63d403 -r 43564304cf94 xen/include/xen/time.h
--- a/xen/include/xen/time.h    Sun Jul 17 14:16:21 2005
+++ b/xen/include/xen/time.h    Mon Jul 18 20:22:11 2005
@@ -30,7 +30,8 @@
 #include <public/xen.h>
 #include <asm/time.h>
 
-extern int init_xen_time();
+extern int init_xen_time(void);
+extern void init_percpu_time(void);
 
 extern unsigned long cpu_khz;
 
diff -r 9697bc63d403 -r 43564304cf94 xen/include/public/xen.h
--- a/xen/include/public/xen.h  Sun Jul 17 14:16:21 2005
+++ b/xen/include/public/xen.h  Mon Jul 18 20:22:11 2005
@@ -329,12 +329,36 @@
 #endif
 } vcpu_info_t;
 
+typedef struct vcpu_time_info {
+    /*
+     * The following values are updated periodically (and not necessarily
+     * atomically!). The guest OS detects this because 'time_version1' is
+     * incremented just before updating these values, and 'time_version2' is
+     * incremented immediately after. See the Xen-specific Linux code for an
+     * example of how to read these values safely (arch/xen/kernel/time.c).
+     */
+    u32 time_version1;
+    u32 time_version2;
+    u64 tsc_timestamp;   /* TSC at last update of time vals.  */
+    u64 system_time;     /* Time, in nanosecs, since boot.    */
+    /*
+     * Current system time:
+     *   system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul
+     * CPU frequency (Hz):
+     *   ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
+     */
+    u32 tsc_to_system_mul;
+    s8  tsc_shift;
+} vcpu_time_info_t;
+
 /*
  * Xen/kernel shared data -- pointer provided in start_info.
  * NB. We expect that this struct is smaller than a page.
  */
 typedef struct shared_info {
     vcpu_info_t vcpu_data[MAX_VIRT_CPUS];
+
+    vcpu_time_info_t vcpu_time[MAX_VIRT_CPUS];
 
     u32 n_vcpu;
 
@@ -373,33 +397,11 @@
     u32 evtchn_mask[32];
 
     /*
-     * Time: The following abstractions are exposed: System Time, Clock Time,
-     * Domain Virtual Time. Domains can access Cycle counter time directly.
+     * Wallclock time: updated only by control software. Guests should base
+     * their gettimeofday() syscall on this wallclock-base value.
      */
-    u64                cpu_freq;        /* CPU frequency (Hz).          */
-
-    /*
-     * The following values are updated periodically (and not necessarily
-     * atomically!). The guest OS detects this because 'time_version1' is
-     * incremented just before updating these values, and 'time_version2' is
-     * incremented immediately after. See the Xen-specific Linux code for an
-     * example of how to read these values safely (arch/xen/kernel/time.c).
-     */
-    u32                time_version1;
-    u32                time_version2;
-    tsc_timestamp_t    tsc_timestamp;   /* TSC at last update of time vals.  */
-    u64                system_time;     /* Time, in nanosecs, since boot.    */
     u32                wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
     u32                wc_usec;         /* Usecs 00:00:00 UTC, Jan 1, 1970.  */
-    u64                domain_time;     /* Domain virtual time, in nanosecs. */
-
-    /*
-     * Timeout values:
-     * Allow a domain to specify a timeout value in system time and 
-     * domain virtual time.
-     */
-    u64                wall_timeout;
-    u64                domain_timeout;
 
     arch_shared_info_t arch;
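
The time_version1/time_version2 pair in vcpu_time_info above implements a lockless reader protocol: Xen bumps version1 before rewriting the record and version2 just after, so a snapshot is consistent only if the two match across the read. A minimal guest-side sketch (guest_system_time() is a hypothetical name; rdtscll() and rmb() are assumed, as in the Linux code later in this patch):

    static u64 guest_system_time(volatile vcpu_time_info_t *t)
    {
        u32 v1, v2, mul, delta;
        u64 tsc, stamp, stime;
        s8  shift;

        do {
            v2 = t->time_version2;
            rmb();
            stamp = t->tsc_timestamp;
            stime = t->system_time;
            mul   = t->tsc_to_system_mul;
            shift = t->tsc_shift;
            rmb();
            v1 = t->time_version1;
        } while ( v1 != v2 );           /* retry if an update raced us */

        rdtscll(tsc);
        delta = (shift < 0) ? (u32)((tsc - stamp) >> -shift)
                            : ((u32)(tsc - stamp) << shift);
        return stime + (((u64)delta * mul) >> 32);   /* ns since boot */
    }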
 
diff -r 9697bc63d403 -r 43564304cf94 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile      Sun Jul 17 14:16:21 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile      Mon Jul 18 20:22:11 2005
@@ -15,7 +15,7 @@
                ptrace.o quirks.o syscall.o bootflag.o
 
 i386-obj-y                     := time.o
-obj-y                          += ../../i386/kernel/timers/
+#obj-y                         += ../../i386/kernel/timers/
 
 s-obj-y        :=
 
diff -r 9697bc63d403 -r 43564304cf94 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Sun Jul 17 14:16:21 2005
+++ b/xen/arch/x86/time.c       Mon Jul 18 20:22:11 2005
@@ -1,16 +1,12 @@
-/****************************************************************************
- * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
- * (C) 2002-2003 University of Cambridge
- ****************************************************************************
- *
- *        File: i386/time.c
- *      Author: Rolf Neugebar & Keir Fraser
- */
-
-/*
- *  linux/arch/i386/kernel/time.c
- *
- *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
+/******************************************************************************
+ * arch/x86/time.c
+ * 
+ * Per-CPU time calibration and management.
+ * 
+ * Copyright (c) 2002-2005, K A Fraser
+ * 
+ * Portions from Linux are:
+ * Copyright (c) 1991, 1992, 1995  Linus Torvalds
  */
 
 #include <xen/config.h>
@@ -31,29 +27,74 @@
 #include <asm/processor.h>
 #include <asm/fixmap.h>
 #include <asm/mc146818rtc.h>
-
-/* GLOBAL */
+#include <asm/div64.h>
+#include <io_ports.h>
+
 unsigned long cpu_khz;  /* CPU clock frequency in kHz. */
 spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
 int timer_ack = 0;
 unsigned long volatile jiffies;
-
-/* PRIVATE */
-static unsigned int    rdtsc_bitshift;  /* Which 32 bits of TSC do we use?   */
-static u64             cpu_freq;        /* CPU frequency (Hz)                */
-static u32             st_scale_f;      /* Cycles -> ns, fractional part     */
-static u32             st_scale_i;      /* Cycles -> ns, integer part        */
-static u32             shifted_tsc_irq; /* CPU0's TSC at last 'time update'  */
-static u64             full_tsc_irq;    /* ...ditto, but all 64 bits         */
-static s_time_t        stime_irq;       /* System time at last 'time update' */
-static unsigned long   wc_sec, wc_usec; /* UTC time at last 'time update'.   */
-static rwlock_t        time_lock = RW_LOCK_UNLOCKED;
+static unsigned long wc_sec, wc_usec; /* UTC time at last 'time update'. */
+
+struct time_scale {
+    int shift;
+    u32 mul_frac;
+};
+
+struct cpu_time {
+    u64 local_tsc_stamp;
+    s_time_t stime_local_stamp;
+    s_time_t stime_master_stamp;
+    struct time_scale tsc_scale;
+    struct ac_timer calibration_timer;
+} __cacheline_aligned;
+
+static struct cpu_time cpu_time[NR_CPUS];
+
+/* Protected by platform_timer_lock. */
+static s_time_t stime_platform_stamp;
+static u64 platform_timer_stamp;
+static struct time_scale platform_timer_scale;
+static spinlock_t platform_timer_lock = SPIN_LOCK_UNLOCKED;
+
+static inline u32 down_shift(u64 time, int shift)
+{
+    if ( shift < 0 )
+        return (u32)(time >> -shift);
+    return (u32)((u32)time << shift);
+}
+
+/*
+ * 32-bit division of integer dividend and integer divisor yielding
+ * 32-bit fractional quotient.
+ */
+static inline u32 div_frac(u32 dividend, u32 divisor)
+{
+    u32 quotient, remainder;
+    ASSERT(dividend < divisor);
+    __asm__ ( 
+        "div %4"
+        : "=a" (quotient), "=d" (remainder)
+        : "0" (0), "1" (dividend), "r" (divisor) );
+    return quotient;
+}
+
+/*
+ * 32-bit multiplication of integer multiplicand and fractional multiplier
+ * yielding 32-bit integer product.
+ */
+static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
+{
+    u32 product_int, product_frac;
+    __asm__ (
+        "mul %3"
+        : "=a" (product_frac), "=d" (product_int)
+        : "0" (multiplicand), "r" (multiplier) );
+    return product_int;
+}
 
 void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
 {
-    write_lock_irq(&time_lock);
-
-#ifdef CONFIG_X86_IO_APIC
     if ( timer_ack ) 
     {
         extern spinlock_t i8259A_lock;
@@ -63,30 +104,9 @@
         inb(0x20);
         spin_unlock(&i8259A_lock);
     }
-#endif
     
-    /*
-     * Updates TSC timestamp (used to interpolate passage of time between
-     * interrupts).
-     */
-    rdtscll(full_tsc_irq);
-    shifted_tsc_irq = (u32)(full_tsc_irq >> rdtsc_bitshift);
-
     /* Update jiffies counter. */
     (*(unsigned long *)&jiffies)++;
-
-    /* Update wall time. */
-    wc_usec += 1000000/HZ;
-    if ( wc_usec >= 1000000 )
-    {
-        wc_usec -= 1000000;
-        wc_sec++;
-    }
-
-    /* Updates system time (nanoseconds since boot). */
-    stime_irq += MILLISECS(1000/HZ);
-
-    write_unlock_irq(&time_lock);
 
     /* Rough hack to allow accurate timers to sort-of-work with no APIC. */
     if ( !cpu_has_apic )
@@ -103,9 +123,9 @@
 #define CALIBRATE_FRAC  20      /* calibrate over 50ms */
 #define CALIBRATE_LATCH ((CLOCK_TICK_RATE+(CALIBRATE_FRAC/2))/CALIBRATE_FRAC)
 
-static unsigned long __init calibrate_tsc(void)
-{
-    u64 start, end, diff;
+static u64 calibrate_boot_tsc(void)
+{
+    u64 start, end;
     unsigned long count;
 
     /* Set the Gate high, disable speaker */
@@ -118,9 +138,9 @@
      * terminal count mode), binary count, load 5 * LATCH count, (LSB and MSB)
      * to begin countdown.
      */
-    outb(0xb0, 0x43);           /* binary, mode 0, LSB/MSB, Ch 2 */
-    outb(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
-    outb(CALIBRATE_LATCH >> 8, 0x42);   /* MSB of count */
+    outb(0xb0, PIT_MODE);           /* binary, mode 0, LSB/MSB, Ch 2 */
+    outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
+    outb(CALIBRATE_LATCH >> 8, PIT_CH2);   /* MSB of count */
 
     rdtscll(start);
     for ( count = 0; (inb(0x61) & 0x20) == 0; count++ )
@@ -131,15 +151,147 @@
     if ( count == 0 )
         return 0;
 
-    diff = end - start;
-
-#if defined(__i386__)
-    /* If quotient doesn't fit in 32 bits then we return error (zero). */
-    if ( diff & ~0xffffffffULL )
-        return 0;
-#endif
-
-    return (unsigned long)diff;
+    return ((end - start) * (u64)CALIBRATE_FRAC);
+}
+
+static void set_time_scale(struct time_scale *ts, u64 ticks_per_sec)
+{
+    u64 tps64 = ticks_per_sec;
+    u32 tps32;
+    int shift = 0;
+
+    while ( tps64 > (MILLISECS(1000)*2) )
+    {
+        tps64 >>= 1;
+        shift--;
+    }
+
+    tps32 = (u32)tps64;
+    while ( tps32 < (u32)MILLISECS(1000) )
+    {
+        tps32 <<= 1;
+        shift++;
+    }
+
+    ts->mul_frac = div_frac(MILLISECS(1000), tps32);
+    ts->shift    = shift;
+}
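
To make the fixed-point plumbing concrete: for a hypothetical 2.4 GHz TSC, set_time_scale() halves 2.4e9 once into the (1e9, 2e9] window (shift = -1) and div_frac() yields mul_frac = (1e9 << 32) / 1.2e9 = 0xd5555555, i.e. 5/6 as a 0.32 fixed-point fraction; one second of TSC ticks then scales back to ~1e9 ns. A portable sketch of the same computation (editorial, using 64-bit arithmetic in place of the x86 div/mul instructions):

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
    {   /* (dividend << 32) / divisor, as the asm helper above */
        return (uint32_t)(((uint64_t)dividend << 32) / divisor);
    }

    static uint32_t mul_frac(uint32_t multiplicand, uint32_t multiplier)
    {   /* (multiplicand * multiplier) >> 32 */
        return (uint32_t)(((uint64_t)multiplicand * multiplier) >> 32);
    }

    int main(void)
    {
        uint64_t tps = 2400000000ULL;     /* hypothetical 2.4 GHz TSC */
        int shift = 0;
        while ( tps > 2000000000ULL ) { tps >>= 1; shift--; }
        while ( tps < 1000000000ULL ) { tps <<= 1; shift++; }
        uint32_t mul = div_frac(1000000000u, (uint32_t)tps);

        uint64_t ticks = 2400000000ULL;   /* one second's worth of ticks */
        uint32_t scaled = (shift < 0) ? (uint32_t)(ticks >> -shift)
                                      : ((uint32_t)ticks << shift);
        /* Prints shift=-1 mul=0xd5555555 ns=999999999 (truncation). */
        printf("shift=%d mul=%#x ns=%u\n", shift, mul, mul_frac(scaled, mul));
        return 0;
    }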
+
+static atomic_t tsc_calibrate_gang = ATOMIC_INIT(0);
+static unsigned int tsc_calibrate_status = 0;
+
+void calibrate_tsc_bp(void)
+{
+    while ( atomic_read(&tsc_calibrate_gang) != (num_booting_cpus() - 1) )
+        mb();
+
+    outb(CALIBRATE_LATCH & 0xff, PIT_CH2);
+    outb(CALIBRATE_LATCH >> 8, PIT_CH2);
+
+    tsc_calibrate_status = 1;
+    wmb();
+
+    while ( (inb(0x61) & 0x20) == 0 )
+        continue;
+
+    tsc_calibrate_status = 2;
+    wmb();
+
+    while ( atomic_read(&tsc_calibrate_gang) != 0 )
+        mb();
+}
+
+void calibrate_tsc_ap(void)
+{
+    u64 t1, t2, ticks_per_sec;
+
+    atomic_inc(&tsc_calibrate_gang);
+
+    while ( tsc_calibrate_status < 1 )
+        mb();
+
+    rdtscll(t1);
+
+    while ( tsc_calibrate_status < 2 )
+        mb();
+
+    rdtscll(t2);
+
+    ticks_per_sec = (t2 - t1) * (u64)CALIBRATE_FRAC;
+    set_time_scale(&cpu_time[smp_processor_id()].tsc_scale, ticks_per_sec);
+
+    atomic_dec(&tsc_calibrate_gang);
+}
+
+/* Protected by platform_timer_lock. */
+static u64 platform_pit_counter;
+static u16 pit_stamp;
+static struct ac_timer pit_overflow_timer;
+
+static u16 pit_read_counter(void)
+{
+    u16 count;
+    ASSERT(spin_is_locked(&platform_timer_lock));
+    outb(0x80, PIT_MODE);
+    count  = inb(PIT_CH2);
+    count |= inb(PIT_CH2) << 8;
+    return count;
+}
+
+static void pit_overflow(void *unused)
+{
+    u16 counter;
+
+    spin_lock(&platform_timer_lock);
+    counter = pit_read_counter();
+    platform_pit_counter += (u16)(pit_stamp - counter);
+    pit_stamp = counter;
+    spin_unlock(&platform_timer_lock);
+
+    set_ac_timer(&pit_overflow_timer, NOW() + MILLISECS(20));
+}
+
+static void init_platform_timer(void)
+{
+    init_ac_timer(&pit_overflow_timer, pit_overflow, NULL, 0);
+    pit_overflow(NULL);
+    platform_timer_stamp = platform_pit_counter;
+    set_time_scale(&platform_timer_scale, CLOCK_TICK_RATE);
+}
+
+static s_time_t __read_platform_stime(u64 platform_time)
+{
+    u64 diff64 = platform_time - platform_timer_stamp;
+    u32 diff   = down_shift(diff64, platform_timer_scale.shift);
+    ASSERT(spin_is_locked(&platform_timer_lock));
+    return (stime_platform_stamp + 
+            (u64)mul_frac(diff, platform_timer_scale.mul_frac));
+}
+
+static s_time_t read_platform_stime(void)
+{
+    u64 counter;
+    s_time_t stime;
+
+    spin_lock(&platform_timer_lock);
+    counter = platform_pit_counter + (u16)(pit_stamp - pit_read_counter());
+    stime   = __read_platform_stime(counter);
+    spin_unlock(&platform_timer_lock);
+
+    return stime;
+}
+
+static void platform_time_calibration(void)
+{
+    u64 counter;
+    s_time_t stamp;
+
+    spin_lock(&platform_timer_lock);
+    counter = platform_pit_counter + (u16)(pit_stamp - pit_read_counter());
+    stamp   = __read_platform_stime(counter);
+    stime_platform_stamp = stamp;
+    platform_timer_stamp = counter;
+    spin_unlock(&platform_timer_lock);
 }
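
pit_overflow() above folds the free-running 16-bit PIT count into the 64-bit platform_pit_counter. The 20ms sampling period leaves headroom: at 1193180 Hz a 16-bit counter wraps every 65536 / 1193180 seconds, about 54.9ms, so at most one wrap can occur between samples and the unsigned 16-bit subtraction recovers the elapsed ticks unambiguously. A standalone check of that arithmetic (editorial, not in the patch):

    #include <stdio.h>
    #include <stdint.h>
    int main(void)
    {
        /* Wrap period of a 16-bit down-counter at the PIT crystal rate. */
        printf("wraps every %.1f ms\n", 65536.0 / 1193180.0 * 1000.0);
        /* (u16)(pit_stamp - counter) spans a wrap of the down-counter: */
        uint16_t stamp = 0x0100, counter = 0xff00;
        printf("elapsed ticks = %u\n",
               (unsigned)(uint16_t)(stamp - counter));  /* 512 */
        return 0;
    }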
 
 
@@ -233,140 +385,214 @@
  * System Time
  ***************************************************************************/
 
-static inline u64 get_time_delta(void)
-{
-    s32      delta_tsc;
-    u32      low;
-    u64      delta, tsc;
-
-    ASSERT(st_scale_f || st_scale_i);
+s_time_t get_s_time(void)
+{
+    struct cpu_time *t = &cpu_time[smp_processor_id()];
+    u64 tsc;
+    u32 delta;
+    s_time_t now;
 
     rdtscll(tsc);
-    low = (u32)(tsc >> rdtsc_bitshift);
-    delta_tsc = (s32)(low - shifted_tsc_irq);
-    if ( unlikely(delta_tsc < 0) ) delta_tsc = 0;
-    delta = ((u64)delta_tsc * st_scale_f);
-    delta >>= 32;
-    delta += ((u64)delta_tsc * st_scale_i);
-
-    return delta;
-}
-
-s_time_t get_s_time(void)
-{
-    s_time_t now;
-    unsigned long flags;
-
-    read_lock_irqsave(&time_lock, flags);
-
-    now = stime_irq + get_time_delta();
-
-    /* Ensure that the returned system time is monotonically increasing. */
-    {
-        static s_time_t prev_now = 0;
-        if ( unlikely(now < prev_now) )
-            now = prev_now;
-        prev_now = now;
-    }
-
-    read_unlock_irqrestore(&time_lock, flags);
-
-    return now; 
+    delta = down_shift(tsc - t->local_tsc_stamp, t->tsc_scale.shift);
+    now = t->stime_local_stamp + (u64)mul_frac(delta, t->tsc_scale.mul_frac);
+
+    return now;
 }
 
 static inline void __update_dom_time(struct vcpu *v)
 {
-    struct domain *d  = v->domain;
-    shared_info_t *si = d->shared_info;
-
-    spin_lock(&d->time_lock);
-
-    si->time_version1++;
+    struct cpu_time       *t = &cpu_time[smp_processor_id()];
+    struct vcpu_time_info *u = &v->domain->shared_info->vcpu_time[v->vcpu_id];
+
+    u->time_version1++;
     wmb();
 
-    si->cpu_freq       = cpu_freq;
-    si->tsc_timestamp  = full_tsc_irq;
-    si->system_time    = stime_irq;
-    si->wc_sec         = wc_sec;
-    si->wc_usec        = wc_usec;
+    u->tsc_timestamp     = t->local_tsc_stamp;
+    u->system_time       = t->stime_local_stamp;
+    u->tsc_to_system_mul = t->tsc_scale.mul_frac;
+    u->tsc_shift         = (s8)t->tsc_scale.shift;
 
     wmb();
-    si->time_version2++;
-
-    spin_unlock(&d->time_lock);
+    u->time_version2++;
+
+    /* Should only do this during do_settime(). */
+    v->domain->shared_info->wc_sec  = wc_sec;
+    v->domain->shared_info->wc_usec = wc_usec;
 }
 
 void update_dom_time(struct vcpu *v)
 {
-    unsigned long flags;
-
-    if ( v->domain->shared_info->tsc_timestamp != full_tsc_irq )
-    {
-        read_lock_irqsave(&time_lock, flags);
+    if ( v->domain->shared_info->vcpu_time[v->vcpu_id].tsc_timestamp != 
+         cpu_time[smp_processor_id()].local_tsc_stamp )
         __update_dom_time(v);
-        read_unlock_irqrestore(&time_lock, flags);
-    }
 }
 
 /* Set clock to <secs,usecs> after 00:00:00 UTC, 1 January, 1970. */
 void do_settime(unsigned long secs, unsigned long usecs, u64 system_time_base)
 {
-    s64 delta;
-    long _usecs = (long)usecs;
-
-    write_lock_irq(&time_lock);
-
-    delta = (s64)(stime_irq - system_time_base);
-
-    _usecs += (long)(delta/1000);
-    while ( _usecs >= 1000000 ) 
-    {
-        _usecs -= 1000000;
-        secs++;
-    }
-
-    wc_sec  = secs;
-    wc_usec = _usecs;
-
-    /* Others will pick up the change at the next tick. */
+    u64 x, base_usecs;
+    u32 y;
+
+    base_usecs = system_time_base;
+    do_div(base_usecs, 1000);
+
+    x = (secs * 1000000ULL) + (u64)usecs + base_usecs;
+    y = do_div(x, 1000000);
+
+    wc_sec  = (unsigned long)x;
+    wc_usec = (unsigned long)y;
+
     __update_dom_time(current);
-    send_guest_virq(current, VIRQ_TIMER);
-
-    write_unlock_irq(&time_lock);
-}
-
+}
+
+static void local_time_calibration(void *unused)
+{
+    unsigned int cpu = smp_processor_id();
+
+    /*
+     * System timestamps, extrapolated from local and master oscillators,
+     * taken during this calibration and the previous calibration.
+     */
+    s_time_t prev_local_stime, curr_local_stime;
+    s_time_t prev_master_stime, curr_master_stime;
+
+    /* TSC timestamps taken during this calibration and prev calibration. */
+    u64 prev_tsc, curr_tsc;
+
+    /*
+     * System time and TSC ticks elapsed during the previous calibration
+     * 'epoch'. Also the accumulated error in the local estimate. All these
+     * values end up down-shifted to fit in 32 bits.
+     */
+    u64 stime_elapsed64, tsc_elapsed64, local_stime_error64;
+    u32 stime_elapsed32, tsc_elapsed32, local_stime_error32;
+
+    /* Calculated TSC shift to ensure 32-bit scale multiplier. */
+    int tsc_shift = 0;
+
+    prev_tsc          = cpu_time[cpu].local_tsc_stamp;
+    prev_local_stime  = cpu_time[cpu].stime_local_stamp;
+    prev_master_stime = cpu_time[cpu].stime_master_stamp;
+
+    /* Disable IRQs to get 'instantaneous' current timestamps. */
+    local_irq_disable();
+    rdtscll(curr_tsc);
+    curr_local_stime  = get_s_time();
+    curr_master_stime = read_platform_stime();
+    local_irq_enable();
+
+#if 0
+    printk("PRE%d: tsc=%lld stime=%lld master=%lld\n",
+           cpu, prev_tsc, prev_local_stime, prev_master_stime);
+    printk("CUR%d: tsc=%lld stime=%lld master=%lld %lld\n",
+           cpu, curr_tsc, curr_local_stime, curr_master_stime,
+           platform_pit_counter);
+#endif
+
+    /* Local time warps forward if it lags behind master time. */
+    if ( curr_local_stime < curr_master_stime )
+        curr_local_stime = curr_master_stime;
+
+    stime_elapsed64 = curr_master_stime - prev_master_stime;
+    tsc_elapsed64   = curr_tsc - prev_tsc;
+
+    /*
+     * Error in the local system time estimate. Clamp to epoch time period, or
+     * we could end up with a negative scale factor (time going backwards!).
+     * This effectively clamps the scale factor to >= 0.
+     */
+    local_stime_error64 = curr_local_stime - curr_master_stime;
+    if ( local_stime_error64 > stime_elapsed64 )
+        local_stime_error64 = stime_elapsed64;
+
+    /*
+     * We require 0 < stime_elapsed < 2^31.
+     * This allows us to binary shift a 32-bit tsc_elapsed such that:
+     * stime_elapsed < tsc_elapsed <= 2*stime_elapsed
+     */
+    while ( ((u32)stime_elapsed64 != stime_elapsed64) ||
+            ((s32)stime_elapsed64 < 0) )
+    {
+        stime_elapsed64     >>= 1;
+        tsc_elapsed64       >>= 1;
+        local_stime_error64 >>= 1;
+    }
+
+    /* stime_elapsed (and hence local_stime_error) now fit in a 32-bit word. */
+    stime_elapsed32     = (u32)stime_elapsed64;
+    local_stime_error32 = (u32)local_stime_error64;
+
+    /* tsc_elapsed <= 2*stime_elapsed */
+    while ( tsc_elapsed64 > (stime_elapsed32 * 2) )
+    {
+        tsc_elapsed64 >>= 1;
+        tsc_shift--;
+    }
+
+    /* Local difference must now fit in 32 bits. */
+    ASSERT((u32)tsc_elapsed64 == tsc_elapsed64);
+    tsc_elapsed32 = (u32)tsc_elapsed64;
+
+    /* tsc_elapsed > stime_elapsed */
+    ASSERT(tsc_elapsed32 != 0);
+    while ( tsc_elapsed32 <= stime_elapsed32 )
+    {
+        tsc_elapsed32 <<= 1;
+        tsc_shift++;
+    }
+
+#if 0
+    printk("---%d: %08x %d\n", cpu, 
+           div_frac(stime_elapsed32 - local_stime_error32, tsc_elapsed32),
+           tsc_shift);
+#endif
+
+    /* Record new timestamp information. */
+    cpu_time[cpu].tsc_scale.mul_frac = 
+        div_frac(stime_elapsed32 - local_stime_error32, tsc_elapsed32);
+    cpu_time[cpu].tsc_scale.shift    = tsc_shift;
+    cpu_time[cpu].local_tsc_stamp    = curr_tsc;
+    cpu_time[cpu].stime_local_stamp  = curr_local_stime;
+    cpu_time[cpu].stime_master_stamp = curr_master_stime;
+
+    set_ac_timer(&cpu_time[cpu].calibration_timer, NOW() + MILLISECS(1000));
+
+    if ( cpu == 0 )
+        platform_time_calibration();
+}
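
local_time_calibration() above refits each CPU's TSC scale once a second against the platform timer: the TSC delta is shifted until it lies in (stime_elapsed, 2*stime_elapsed], and the new multiplier is derived from the master-time delta minus the accumulated local error, so a CPU whose clock has crept ahead is run fractionally slow until it reconverges. A numeric sketch with hypothetical epoch values (not in the patch):

    #include <stdio.h>
    #include <stdint.h>
    int main(void)
    {
        /* Hypothetical epoch: master time advanced 1e9 ns, the TSC
         * advanced 2.4e9 ticks, and the local estimate ended 1 ms ahead. */
        uint64_t stime_elapsed = 1000000000ULL;
        uint64_t tsc_elapsed   = 2400000000ULL;
        uint64_t error         = 1000000ULL;      /* local minus master */
        int shift = 0;
        while ( tsc_elapsed > 2 * stime_elapsed ) { tsc_elapsed >>= 1; shift--; }
        while ( tsc_elapsed <= stime_elapsed )    { tsc_elapsed <<= 1; shift++; }
        uint32_t mul = (uint32_t)(((stime_elapsed - error) << 32) / tsc_elapsed);
        /* mul lands just below the drift-free 5/6 (0xd5555555), so local
         * time runs slightly slow and converges back onto master time. */
        printf("shift=%d mul=%#x\n", shift, mul);
        return 0;
    }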
+
+void init_percpu_time(void)
+{
+    unsigned int cpu = smp_processor_id();
+    unsigned long flags;
+    s_time_t now;
+
+    local_irq_save(flags);
+    rdtscll(cpu_time[cpu].local_tsc_stamp);
+    now = (cpu == 0) ? 0 : read_platform_stime();
+    local_irq_restore(flags);
+
+    cpu_time[cpu].stime_master_stamp = now;
+    cpu_time[cpu].stime_local_stamp  = now;
+
+    init_ac_timer(&cpu_time[cpu].calibration_timer,
+                  local_time_calibration, NULL, cpu);
+    set_ac_timer(&cpu_time[cpu].calibration_timer, NOW() + MILLISECS(1000));
+}
 
 /* Late init function (after all CPUs are booted). */
-int __init init_xen_time()
-{
-    u64      scale;
-    unsigned int cpu_ghz;
-
-    cpu_ghz = (unsigned int)(cpu_freq / 1000000000ULL);
-    for ( rdtsc_bitshift = 0; cpu_ghz != 0; rdtsc_bitshift++, cpu_ghz >>= 1 )
-        continue;
-
-    scale  = 1000000000LL << (32 + rdtsc_bitshift);
-    scale /= cpu_freq;
-    st_scale_f = scale & 0xffffffff;
-    st_scale_i = scale >> 32;
+int __init init_xen_time(void)
+{
+    wc_sec = get_cmos_time();
 
     local_irq_disable();
 
-    /* System time ticks from zero. */
-    rdtscll(full_tsc_irq);
-    stime_irq = (s_time_t)0;
-    shifted_tsc_irq = (u32)(full_tsc_irq >> rdtsc_bitshift);
-
-    /* Wallclock time starts as the initial RTC time. */
-    wc_sec = get_cmos_time();
+    init_percpu_time();
+
+    stime_platform_stamp = 0;
+    init_platform_timer();
 
     local_irq_enable();
-
-    printk("Time init:\n");
-    printk(".... cpu_freq:    %08X:%08X\n", (u32)(cpu_freq>>32),(u32)cpu_freq);
-    printk(".... scale:       %08X:%08X\n", (u32)(scale>>32),(u32)scale);
-    printk(".... Wall Clock:  %lds %ldus\n", wc_sec, wc_usec);
 
     return 0;
 }
@@ -375,15 +601,12 @@
 /* Early init function. */
 void __init early_time_init(void)
 {
-    unsigned long ticks_per_frac = calibrate_tsc();
-
-    if ( !ticks_per_frac )
-        panic("Error calibrating TSC\n");
-
-    cpu_khz = ticks_per_frac / (1000/CALIBRATE_FRAC);
-
-    cpu_freq = (u64)ticks_per_frac * (u64)CALIBRATE_FRAC;
-
+    u64 tmp = calibrate_boot_tsc();
+
+    set_time_scale(&cpu_time[0].tsc_scale, tmp);
+
+    do_div(tmp, 1000);
+    cpu_khz = (unsigned long)tmp;
     printk("Detected %lu.%03lu MHz processor.\n", 
            cpu_khz / 1000, cpu_khz % 1000);
 
diff -r 9697bc63d403 -r 43564304cf94 xen/include/asm-x86/time.h
--- a/xen/include/asm-x86/time.h        Sun Jul 17 14:16:21 2005
+++ b/xen/include/asm-x86/time.h        Mon Jul 18 20:22:11 2005
@@ -4,4 +4,7 @@
 
 extern int timer_ack;
 
+extern void calibrate_tsc_bp(void);
+extern void calibrate_tsc_ap(void);
+
 #endif /* __X86_TIME_H__ */
diff -r 9697bc63d403 -r 43564304cf94 linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c  Sun Jul 17 14:16:21 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c  Mon Jul 18 20:22:11 2005
@@ -104,24 +104,16 @@
 struct timer_opts *cur_timer = &timer_tsc;
 
 /* These are periodically updated in shared_info, and then copied here. */
-u32 shadow_tsc_stamp;
-u64 shadow_system_time;
-static u32 shadow_time_version;
+struct shadow_time_info {
+       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
+       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
+       u32 tsc_to_nsec_mul;
+       u32 tsc_to_usec_mul;
+       int tsc_shift;
+       u32 version;
+};
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
 static struct timeval shadow_tv;
-
-/*
- * We use this to ensure that gettimeofday() is monotonically increasing. We
- * only break this guarantee if the wall clock jumps backwards "a long way".
- */
-static struct timeval last_seen_tv = {0,0};
-
-#ifdef CONFIG_XEN_PRIVILEGED_GUEST
-/* Periodically propagate synchronised time base to the RTC and to Xen. */
-static long last_rtc_update, last_update_to_xen;
-#endif
-
-/* Periodically take synchronised time base from Xen, if we need it. */
-static long last_update_from_xen;   /* UTC seconds when last read Xen clock. */
 
 /* Keep track of last time we did processing/updating of jiffies and xtime. */
 static u64 processed_system_time;   /* System time (ns) at last processing. */
@@ -164,26 +156,147 @@
 #define INDEPENDENT_WALLCLOCK() \
     (independent_wallclock || (xen_start_info.flags & SIF_INITDOMAIN))
 
+int tsc_disable __initdata = 0;
+
+static void delay_tsc(unsigned long loops)
+{
+       unsigned long bclock, now;
+       
+
+	do {
+               rep_nop();
+               rdtscl(now);
+       } while ((now-bclock) < loops);
+}
+
+struct timer_opts timer_tsc = {
+       .name = "tsc",
+       .delay = delay_tsc,
+};
+
+static inline u32 down_shift(u64 time, int shift)
+{
+       if ( shift < 0 )
+               return (u32)(time >> -shift);
+       return (u32)((u32)time << shift);
+}
+
+/*
+ * 32-bit multiplication of integer multiplicand and fractional multiplier
+ * yielding 32-bit integer product.
+ */
+static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
+{
+       u32 product_int, product_frac;
+       __asm__ (
+               "mul %3"
+               : "=a" (product_frac), "=d" (product_int)
+               : "0" (multiplicand), "r" (multiplier) );
+       return product_int;
+}
+
+void init_cpu_khz(void)
+{
+       u64 __cpu_khz = 1000000ULL << 32;
+       struct vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_time[0];
+       do_div(__cpu_khz, info->tsc_to_system_mul);
+       cpu_khz = down_shift(__cpu_khz, -info->tsc_shift);
+       printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n",
+              cpu_khz / 1000, cpu_khz % 1000);
+}
+
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+       u64 now;
+       u32 delta;
+       rdtscll(now);
+       delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift);
+       return mul_frac(delta, shadow->tsc_to_nsec_mul);
+}
+
+static unsigned long get_usec_offset(struct shadow_time_info *shadow)
+{
+       u64 now;
+       u32 delta;
+       rdtscll(now);
+       delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift);
+       return mul_frac(delta, shadow->tsc_to_usec_mul);
+}
+
+static void update_wallclock(void)
+{
+       shared_info_t *s = HYPERVISOR_shared_info;
+       long wtm_nsec;
+       time_t wtm_sec, sec;
+       s64 nsec;
+
+       shadow_tv.tv_sec  = s->wc_sec;
+       shadow_tv.tv_usec = s->wc_usec;
+
+       if (INDEPENDENT_WALLCLOCK())
+               return;
+
+       if ((time_status & STA_UNSYNC) != 0)
+               return;
+
+       /* Adjust shadow for jiffies that haven't updated xtime yet. */
+       shadow_tv.tv_usec -= 
+               (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ);
+       HANDLE_USEC_UNDERFLOW(shadow_tv);
+
+       /* Update our unsynchronised xtime appropriately. */
+       sec = shadow_tv.tv_sec;
+       nsec = shadow_tv.tv_usec * NSEC_PER_USEC;
+
+       __normalize_time(&sec, &nsec);
+       wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
+       wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+
+       set_normalized_timespec(&xtime, sec, nsec);
+       set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+}
+
 /*
  * Reads a consistent set of time-base values from Xen, into a shadow data
  * area. Must be called with the xtime_lock held for writing.
  */
 static void __get_time_values_from_xen(void)
 {
-       shared_info_t *s = HYPERVISOR_shared_info;
+       shared_info_t           *s = HYPERVISOR_shared_info;
+       struct vcpu_time_info   *src;
+       struct shadow_time_info *dst;
+
+       src = &s->vcpu_time[smp_processor_id()];
+       dst = &per_cpu(shadow_time, smp_processor_id());
 
        do {
-               shadow_time_version = s->time_version2;
+               dst->version = src->time_version2;
                rmb();
-               shadow_tv.tv_sec    = s->wc_sec;
-               shadow_tv.tv_usec   = s->wc_usec;
-               shadow_tsc_stamp    = (u32)s->tsc_timestamp;
-               shadow_system_time  = s->system_time;
+               dst->tsc_timestamp     = src->tsc_timestamp;
+               dst->system_timestamp  = src->system_time;
+               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
+               dst->tsc_shift         = src->tsc_shift;
                rmb();
        }
-       while (shadow_time_version != s->time_version1);
-
-       cur_timer->mark_offset();
+       while (dst->version != src->time_version1);
+
+       dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
+
+       if ((shadow_tv.tv_sec != s->wc_sec) ||
+           (shadow_tv.tv_usec != s->wc_usec))
+               update_wallclock();
+}
+
+static inline int time_values_up_to_date(int cpu)
+{
+       struct vcpu_time_info   *src;
+       struct shadow_time_info *dst;
+
+       src = &HYPERVISOR_shared_info->vcpu_time[smp_processor_id()];
+       dst = &per_cpu(shadow_time, smp_processor_id());
+
+       return (dst->version == src->time_version2);
 }
 
 #define TIME_VALUES_UP_TO_DATE \
@@ -229,13 +342,18 @@
        unsigned long max_ntp_tick;
        unsigned long flags;
        s64 nsec;
+       unsigned int cpu;
+       struct shadow_time_info *shadow;
+
+       cpu = get_cpu();
+       shadow = &per_cpu(shadow_time, cpu);
 
        do {
                unsigned long lost;
 
                seq = read_seqbegin(&xtime_lock);
 
-               usec = cur_timer->get_offset();
+               usec = get_usec_offset(shadow);
                lost = jiffies - wall_jiffies;
 
                /*
@@ -256,11 +374,11 @@
                sec = xtime.tv_sec;
                usec += (xtime.tv_nsec / NSEC_PER_USEC);
 
-               nsec = shadow_system_time - processed_system_time;
+               nsec = shadow->system_timestamp - processed_system_time;
                __normalize_time(&sec, &nsec);
                usec += (long)nsec / NSEC_PER_USEC;
 
-               if (unlikely(!TIME_VALUES_UP_TO_DATE)) {
+               if (unlikely(!time_values_up_to_date(cpu))) {
                        /*
                         * We may have blocked for a long time,
                         * rendering our calculations invalid
@@ -275,19 +393,11 @@
                }
        } while (read_seqretry(&xtime_lock, seq));
 
+       put_cpu();
+
        while (usec >= USEC_PER_SEC) {
                usec -= USEC_PER_SEC;
                sec++;
-       }
-
-       /* Ensure that time-of-day is monotonically increasing. */
-       if ((sec < last_seen_tv.tv_sec) ||
-           ((sec == last_seen_tv.tv_sec) && (usec < last_seen_tv.tv_usec))) {
-               sec = last_seen_tv.tv_sec;
-               usec = last_seen_tv.tv_usec;
-       } else {
-               last_seen_tv.tv_sec = sec;
-               last_seen_tv.tv_usec = usec;
        }
 
        tv->tv_sec = sec;
@@ -302,12 +412,17 @@
        long wtm_nsec;
        s64 nsec;
        struct timespec xentime;
+       unsigned int cpu;
+       struct shadow_time_info *shadow;
 
        if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;
 
        if (!INDEPENDENT_WALLCLOCK())
                return 0; /* Silent failure? */
+
+       cpu = get_cpu();
+       shadow = &per_cpu(shadow_time, cpu);
 
        write_seqlock_irq(&xtime_lock);
 
@@ -317,9 +432,8 @@
         * be stale, so we can retry with fresh ones.
         */
  again:
-       nsec = (s64)tv->tv_nsec -
-           ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC);
-       if (unlikely(!TIME_VALUES_UP_TO_DATE)) {
+       nsec = (s64)tv->tv_nsec - (s64)get_nsec_offset(shadow);
+       if (unlikely(!time_values_up_to_date(cpu))) {
                __get_time_values_from_xen();
                goto again;
        }
@@ -335,7 +449,7 @@
         */
        nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
 
-       nsec -= (shadow_system_time - processed_system_time);
+       nsec -= (shadow->system_timestamp - processed_system_time);
 
        __normalize_time(&sec, &nsec);
        wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
@@ -349,23 +463,20 @@
        time_maxerror = NTP_PHASE_LIMIT;
        time_esterror = NTP_PHASE_LIMIT;
 
-       /* Reset all our running time counts. They make no sense now. */
-       last_seen_tv.tv_sec = 0;
-       last_update_from_xen = 0;
-
 #ifdef CONFIG_XEN_PRIVILEGED_GUEST
        if (xen_start_info.flags & SIF_INITDOMAIN) {
                dom0_op_t op;
-               last_rtc_update = last_update_to_xen = 0;
                op.cmd = DOM0_SETTIME;
                op.u.settime.secs        = xentime.tv_sec;
                op.u.settime.usecs       = xentime.tv_nsec / NSEC_PER_USEC;
-               op.u.settime.system_time = shadow_system_time;
+               op.u.settime.system_time = shadow->system_timestamp;
                write_sequnlock_irq(&xtime_lock);
                HYPERVISOR_dom0_op(&op);
        } else
 #endif
                write_sequnlock_irq(&xtime_lock);
+
+       put_cpu();
 
        clock_was_set();
        return 0;
@@ -403,9 +514,30 @@
  */
 unsigned long long monotonic_clock(void)
 {
-       return cur_timer->monotonic_clock();
+       int cpu = get_cpu();
+       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+       s64 off;
+       unsigned long flags;
+       
+       for ( ; ; ) {
+               off = get_nsec_offset(shadow);
+               if (time_values_up_to_date(cpu))
+                       break;
+               write_seqlock_irqsave(&xtime_lock, flags);
+               __get_time_values_from_xen();
+               write_sequnlock_irqrestore(&xtime_lock, flags);
+       }
+
+       put_cpu();
+
+       return shadow->system_timestamp + off;
 }
 EXPORT_SYMBOL(monotonic_clock);
+
+unsigned long long sched_clock(void)
+{
+       return monotonic_clock();
+}
 
 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
 unsigned long profile_pc(struct pt_regs *regs)
@@ -427,27 +559,26 @@
 static inline void do_timer_interrupt(int irq, void *dev_id,
                                        struct pt_regs *regs)
 {
-       time_t wtm_sec, sec;
-       s64 delta, delta_cpu, nsec;
-       long sec_diff, wtm_nsec;
+       s64 delta, delta_cpu;
        int cpu = smp_processor_id();
+       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
 
        do {
                __get_time_values_from_xen();
 
-               delta = delta_cpu = (s64)shadow_system_time +
-                       ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC);
+               delta = delta_cpu = 
+                       shadow->system_timestamp + get_nsec_offset(shadow);
                delta     -= processed_system_time;
                delta_cpu -= per_cpu(processed_system_time, cpu);
        }
-       while (!TIME_VALUES_UP_TO_DATE);
+       while (!time_values_up_to_date(cpu));
 
        if (unlikely(delta < 0) || unlikely(delta_cpu < 0)) {
                printk("Timer ISR/%d: Time went backwards: "
                       "delta=%lld cpu_delta=%lld shadow=%lld "
                       "off=%lld processed=%lld cpu_processed=%lld\n",
-                      cpu, delta, delta_cpu, shadow_system_time,
-                      ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC), 
+                      cpu, delta, delta_cpu, shadow->system_timestamp,
+                      (s64)get_nsec_offset(shadow),
                       processed_system_time,
                       per_cpu(processed_system_time, cpu));
                for (cpu = 0; cpu < num_online_cpus(); cpu++)
@@ -470,76 +601,6 @@
                update_process_times(user_mode(regs));
                profile_tick(CPU_PROFILING, regs);
        }
-
-       if (cpu != 0)
-               return;
-
-       /*
-        * Take synchronised time from Xen once a minute if we're not
-        * synchronised ourselves, and we haven't chosen to keep an independent
-        * time base.
-        */
-       if (!INDEPENDENT_WALLCLOCK() &&
-           ((time_status & STA_UNSYNC) != 0) &&
-           (xtime.tv_sec > (last_update_from_xen + 60))) {
-               /* Adjust shadow for jiffies that haven't updated xtime yet. */
-               shadow_tv.tv_usec -= 
-                       (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ);
-               HANDLE_USEC_UNDERFLOW(shadow_tv);
-
-               /*
-                * Reset our running time counts if they are invalidated by
-                * a warp backwards of more than 500ms.
-                */
-               sec_diff = xtime.tv_sec - shadow_tv.tv_sec;
-               if (unlikely(abs(sec_diff) > 1) ||
-                   unlikely(((sec_diff * USEC_PER_SEC) +
-                             (xtime.tv_nsec / NSEC_PER_USEC) -
-                             shadow_tv.tv_usec) > 500000)) {
-#ifdef CONFIG_XEN_PRIVILEGED_GUEST
-                       last_rtc_update = last_update_to_xen = 0;
-#endif
-                       last_seen_tv.tv_sec = 0;
-               }
-
-               /* Update our unsynchronised xtime appropriately. */
-               sec = shadow_tv.tv_sec;
-               nsec = shadow_tv.tv_usec * NSEC_PER_USEC;
-
-               __normalize_time(&sec, &nsec);
-               wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
-               wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
-               set_normalized_timespec(&xtime, sec, nsec);
-               set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
-               last_update_from_xen = sec;
-       }
-
-#ifdef CONFIG_XEN_PRIVILEGED_GUEST
-       if (!(xen_start_info.flags & SIF_INITDOMAIN))
-               return;
-
-       /* Send synchronised time to Xen approximately every minute. */
-       if (((time_status & STA_UNSYNC) == 0) &&
-           (xtime.tv_sec > (last_update_to_xen + 60))) {
-               dom0_op_t op;
-               struct timeval tv;
-
-               tv.tv_sec   = xtime.tv_sec;
-               tv.tv_usec  = xtime.tv_nsec / NSEC_PER_USEC;
-               tv.tv_usec += (jiffies - wall_jiffies) * (USEC_PER_SEC/HZ);
-               HANDLE_USEC_OVERFLOW(tv);
-
-               op.cmd = DOM0_SETTIME;
-               op.u.settime.secs        = tv.tv_sec;
-               op.u.settime.usecs       = tv.tv_usec;
-               op.u.settime.system_time = shadow_system_time;
-               HYPERVISOR_dom0_op(&op);
-
-               last_update_to_xen = xtime.tv_sec;
-       }
-#endif
 }
 
 /*
@@ -731,12 +792,10 @@
        xtime.tv_nsec = shadow_tv.tv_usec * NSEC_PER_USEC;
        set_normalized_timespec(&wall_to_monotonic,
                -xtime.tv_sec, -xtime.tv_nsec);
-       processed_system_time = shadow_system_time;
+       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
 
-       if (timer_tsc_init.init(NULL) != 0)
-               BUG();
-       printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
+       init_cpu_khz();
 
 #if defined(__x86_64__)
        vxtime.mode = VXTIME_TSC;
@@ -807,21 +866,15 @@
 /* No locking required. We are only CPU running, and interrupts are off. */
 void time_resume(void)
 {
-       if (timer_tsc_init.init(NULL) != 0)
-               BUG();
+       init_cpu_khz();
 
        /* Get timebases for new environment. */ 
        __get_time_values_from_xen();
 
        /* Reset our own concept of passage of system time. */
-       processed_system_time = shadow_system_time;
+       processed_system_time =
+               per_cpu(shadow_time, smp_processor_id()).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
-
-       /* Accept a warp in UTC (wall-clock) time. */
-       last_seen_tv.tv_sec = 0;
-
-       /* Make sure we resync UTC time with Xen on next timer interrupt. */
-       last_update_from_xen = 0;
 }
 
 #ifdef CONFIG_SMP
@@ -832,7 +885,8 @@
 
        do {
                seq = read_seqbegin(&xtime_lock);
-               per_cpu(processed_system_time, cpu) = shadow_system_time;
+               per_cpu(processed_system_time, cpu) = 
+                       per_cpu(shadow_time, cpu).system_timestamp;
        } while (read_seqretry(&xtime_lock, seq));
 
        per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER);
@@ -861,3 +915,13 @@
        return 0;
 }
 __initcall(xen_sysctl_init);
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
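
On the guest side, init_cpu_khz() above inverts the scale factor Xen exports, per the formula in public/xen.h: freq (Hz) = ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift, with 10^6 substituted to land in kHz directly. Plugging in the hypothetical 2.4 GHz example values from the hypervisor side (mul = 0xd5555555, shift = -1) recovers the frequency; an editorial check, not part of the patch:

    #include <stdio.h>
    #include <stdint.h>
    int main(void)
    {
        uint32_t mul = 0xd5555555u;   /* ~(5/6) as a 0.32 fraction */
        int shift = -1;               /* tsc_shift */
        unsigned long long khz = (1000000ULL << 32) / mul;   /* ~1.2e6 */
        khz = (shift < 0) ? (khz << -shift) : (khz >> shift);
        printf("%llu.%03llu MHz\n", khz / 1000, khz % 1000); /* ~2400.000 */
        return 0;
    }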
