Re: [Xen-devel] [PATCH 7/10] linux 2.6.18: time handling
Keir Fraser wrote:
> I think Jeremy Fitzhardinge has an alternative clocksource patch which iirc
> is more in line with how Xen time works (should advertise a GHz frequency
> clocksource, and do scaling of the TSC value according to time-record values
> read from shared_info). Having thought about this some more I think
> clocksource support is worth getting into our tree, but let's look at both
> available patches and decide which is the better basis for further work.
>
> Jeremy: If I'm not mistaken and you do have a patch floating around, could
> you post it?

Yes, there's a Xen clocksource in the pv_ops tree. There's no nicely
separable patch, but the mechanism is pretty simple. I've attached
arch/i386/xen/time.c

	J

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/clocksource.h>

#include <asm/xen/hypercall.h>
#include <asm/arch_hooks.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */

static int __init __permitted_clock_jitter(char *str)
{
	permitted_clock_jitter = simple_strtoul(str, NULL, 0);
	return 1;
}
__setup("permitted_clock_jitter=", __permitted_clock_jitter);

/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
	u64 tsc_timestamp;	/* TSC at last update of time vals.  */
	u64 system_timestamp;	/* Time, in nanosecs, since boot.    */
	u32 tsc_to_nsec_mul;
	int tsc_shift;
	u32 version;
};
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);

/* Keep track of last time we did processing/updating of jiffies and xtime. */
static u64 processed_system_time;	/* System time (ns) at last processing. */
static DEFINE_PER_CPU(u64, processed_system_time);

/* How much CPU time was spent blocked and how much was 'stolen'? */
static DEFINE_PER_CPU(u64, processed_stolen_time);
static DEFINE_PER_CPU(u64, processed_blocked_time);

/* Current runstate of each CPU (updated automatically by the hypervisor). */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* Must be signed, as it's compared with s64 quantities which can be -ve. */
#define NS_PER_TICK (1000000000LL/HZ)

unsigned long xen_cpu_khz(void)
{
	u64 cpu_khz = 1000000ULL << 32;
	const struct vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	do_div(cpu_khz, info->tsc_to_system_mul);
	if (info->tsc_shift < 0)
		cpu_khz <<= -info->tsc_shift;
	else
		cpu_khz >>= info->tsc_shift;

	return cpu_khz;
}

/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
static void get_time_values_from_xen(void)
{
	struct vcpu_time_info   *src;
	struct shadow_time_info *dst;

	src = &read_pda(xen.vcpu)->time;
	dst = &get_cpu_var(shadow_time);

	do {
		dst->version = src->version;
		rmb();
		dst->tsc_timestamp    = src->tsc_timestamp;
		dst->system_timestamp = src->system_time;
		dst->tsc_to_nsec_mul  = src->tsc_to_system_mul;
		dst->tsc_shift        = src->tsc_shift;
		rmb();
	} while ((src->version & 1) | (dst->version ^ src->version));

	put_cpu_var(shadow_time);
}

static inline int time_values_up_to_date(void)
{
	struct vcpu_time_info *src;
	unsigned dstversion;

	src = &read_pda(xen.vcpu)->time;
	dstversion = get_cpu_var(shadow_time).version;
	put_cpu_var(shadow_time);

	rmb();
	return (dstversion == src->version);
}
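/*
 * The two routines above follow Xen's update protocol for the time
 * record, which works like a Linux seqcount: the hypervisor makes
 * version odd before rewriting the record and bumps it to a new even
 * value afterwards.  (src->version & 1) catches an update in flight,
 * and (dst->version ^ src->version) catches a torn read; either
 * condition forces a retry, so shadow_time always ends up holding an
 * internally consistent snapshot.
 */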
/*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	__asm__ (
		"mul  %5       ; "
		"mov  %4,%%eax ; "
		"mov  %%edx,%4 ; "
		"mul  %5       ; "
		"xor  %5,%5    ; "
		"add  %4,%%eax ; "
		"adc  %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}

static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
	u64 now, delta;

	rdtscll(now);
	delta = now - shadow->tsc_timestamp;
	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}
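/*
 * Worked example of the fixed-point conversion above: tsc_to_nsec_mul
 * is a 0.32 fixed-point fraction, so scale_delta() computes
 *
 *	ns = (delta << tsc_shift) * tsc_to_nsec_mul / 2^32
 *
 * (a negative tsc_shift means a right shift).  For a 2GHz TSC, for
 * example, Xen would supply tsc_shift = 0 and tsc_to_nsec_mul =
 * 0x80000000 (i.e. 0.5), since each cycle is half a nanosecond; Xen
 * keeps the multiplier in [0.5, 1.0) and lets tsc_shift absorb the
 * rest of the magnitude.
 */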
static void xen_timer_interrupt_hook(void)
{
	s64 delta, delta_cpu, stolen, blocked;
	u64 sched_time;
	int i, cpu = smp_processor_id();
	unsigned long ticks;
	struct shadow_time_info *shadow = &__get_cpu_var(shadow_time);
	struct vcpu_runstate_info *runstate = &__get_cpu_var(runstate);

	do {
		get_time_values_from_xen();

		/* Obtain a consistent snapshot of elapsed wallclock cycles. */
		delta = delta_cpu =
			shadow->system_timestamp + get_nsec_offset(shadow);

		if (0)
			printk("tsc_timestamp=%llu system_timestamp=%llu "
			       "tsc_to_nsec=%u tsc_shift=%d, version=%u, "
			       "delta=%lld processed_system_time=%lld\n",
			       shadow->tsc_timestamp, shadow->system_timestamp,
			       shadow->tsc_to_nsec_mul, shadow->tsc_shift,
			       shadow->version, delta, processed_system_time);

		delta     -= processed_system_time;
		delta_cpu -= __get_cpu_var(processed_system_time);

		/*
		 * Obtain a consistent snapshot of stolen/blocked cycles. We
		 * can use state_entry_time to detect if we get preempted
		 * here.
		 */
		do {
			sched_time = runstate->state_entry_time;
			barrier();
			stolen = runstate->time[RUNSTATE_runnable] +
				runstate->time[RUNSTATE_offline] -
				__get_cpu_var(processed_stolen_time);
			blocked = runstate->time[RUNSTATE_blocked] -
				__get_cpu_var(processed_blocked_time);
			barrier();
		} while (sched_time != runstate->state_entry_time);
	} while (!time_values_up_to_date());

	if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
	     unlikely(delta_cpu < -(s64)permitted_clock_jitter)) &&
	    printk_ratelimit()) {
		printk("Timer ISR/%d: Time went backwards: "
		       "delta=%lld delta_cpu=%lld shadow=%lld "
		       "off=%lld processed=%lld cpu_processed=%lld\n",
		       cpu, delta, delta_cpu, shadow->system_timestamp,
		       (s64)get_nsec_offset(shadow),
		       processed_system_time,
		       __get_cpu_var(processed_system_time));
		for (i = 0; i < num_online_cpus(); i++)
			printk(" %d: %lld\n", i,
			       per_cpu(processed_system_time, i));
	}

	/* System-wide jiffy work. */
	ticks = 0;
	while (delta > NS_PER_TICK) {
		delta -= NS_PER_TICK;
		processed_system_time += NS_PER_TICK;
		ticks++;
	}
	do_timer(ticks);

	/*
	 * Account stolen ticks.
	 * HACK: Passing NULL to account_steal_time()
	 * ensures that the ticks are accounted as stolen.
	 */
	if ((stolen > 0) && (delta_cpu > 0)) {
		delta_cpu -= stolen;
		if (unlikely(delta_cpu < 0))
			stolen += delta_cpu; /* clamp local-time progress */
		do_div(stolen, NS_PER_TICK);
		__get_cpu_var(processed_stolen_time) += stolen * NS_PER_TICK;
		__get_cpu_var(processed_system_time) += stolen * NS_PER_TICK;
		account_steal_time(NULL, (cputime_t)stolen);
	}

	/*
	 * Account blocked ticks.
	 * HACK: Passing idle_task to account_steal_time()
	 * ensures that the ticks are accounted as idle/wait.
	 */
	if ((blocked > 0) && (delta_cpu > 0)) {
		delta_cpu -= blocked;
		if (unlikely(delta_cpu < 0))
			blocked += delta_cpu; /* clamp local-time progress */
		do_div(blocked, NS_PER_TICK);
		__get_cpu_var(processed_blocked_time) += blocked * NS_PER_TICK;
		__get_cpu_var(processed_system_time) += blocked * NS_PER_TICK;
		account_steal_time(idle_task(cpu), (cputime_t)blocked);
	}

	update_process_times(user_mode_vm(get_irq_regs()));
}

static cycle_t xen_clocksource_read(void)
{
	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
	cycle_t ret;

	get_time_values_from_xen();

	ret = shadow->system_timestamp + get_nsec_offset(shadow);

	put_cpu_var(shadow_time);

	return ret;
}

static void xen_read_wallclock(struct timespec *ts)
{
	const struct shared_info *s = HYPERVISOR_shared_info;
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = s->wc_version;
		rmb();
		now.tv_sec  = s->wc_sec;
		now.tv_nsec = s->wc_nsec;
		rmb();
	} while ((s->wc_version & 1) | (version ^ s->wc_version));

	delta = xen_clocksource_read();	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);
	return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

static struct clocksource xen_clocksource = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_read,
	.mask = ~0,
	.mult = 1<<XEN_SHIFT,	/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
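/*
 * This is the "GHz frequency clocksource" mentioned above:
 * xen_clocksource_read() already returns nanoseconds, and with
 * .mult = 1 << XEN_SHIFT and .shift = XEN_SHIFT the generic
 * timekeeping scaling (cycles * mult) >> shift is an exact no-op,
 * so the clocksource ticks at 1GHz regardless of the underlying
 * TSC frequency.
 */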
static void init_missing_ticks_accounting(int cpu)
{
	struct vcpu_register_runstate_memory_area area;
	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

	memset(runstate, 0, sizeof(*runstate));

	area.addr.v = runstate;
	HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);

	per_cpu(processed_blocked_time, cpu) =
		runstate->time[RUNSTATE_blocked];
	per_cpu(processed_stolen_time, cpu) =
		runstate->time[RUNSTATE_runnable] +
		runstate->time[RUNSTATE_offline];
}

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	/*
	 * Here we are in the timer irq handler. We just have irqs locally
	 * disabled but we don't know if the timer_bh is running on the other
	 * CPU. We need to avoid an SMP race with it. NOTE: we don't need
	 * the irq version of write_lock because as just said we have irq
	 * locally disabled. -arca
	 */
	write_seqlock(&xtime_lock);
	xen_timer_interrupt_hook();
	write_sequnlock(&xtime_lock);

	return IRQ_HANDLED;
}

static void setup_cpu0_timer_irq(void)
{
	printk(KERN_DEBUG "installing Xen timer for CPU 0\n");
	bind_virq_to_irqhandler(VIRQ_TIMER, 0, xen_timer_interrupt,
				SA_INTERRUPT, "timer0", NULL);
}

__init void xen_time_init(void)
{
	get_time_values_from_xen();

	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
	per_cpu(processed_system_time, 0) = processed_system_time;

	init_missing_ticks_accounting(0);

	clocksource_register(&xen_clocksource);

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

	tsc_disable = 0;

	setup_cpu0_timer_irq();
}

/* Convert jiffies to system time. */
static u64 jiffies_to_st(unsigned long j)
{
	unsigned long seq;
	long delta;
	u64 st;

	do {
		seq = read_seqbegin(&xtime_lock);
		delta = j - jiffies;
		if (delta < 1) {
			/* Triggers in some wrap-around cases, but that's
			 * okay: we just end up with a shorter timeout. */
			st = processed_system_time + NS_PER_TICK;
		} else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
			/* Very long timeout means there is no pending timer.
			 * We indicate this to Xen by passing zero timeout. */
			st = 0;
		} else {
			st = processed_system_time + delta * (u64)NS_PER_TICK;
		}
	} while (read_seqretry(&xtime_lock, seq));

	return st;
}

/*
 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
 * These functions are based on implementations from arch/s390/kernel/time.c
 */
void stop_hz_timer(void)
{
	unsigned int cpu = smp_processor_id();
	unsigned long j;

	cpu_set(cpu, nohz_cpu_mask);

	/*
	 * See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs
	 * ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a
	 * value of rcp->cur that matches rdp->quiescbatch and allows us to
	 * stop the hz timer then the cpumasks created for subsequent values
	 * of cur in rcu_start_batch are guaranteed to pick up the updated
	 * nohz_cpu_mask and so will not depend on this cpu.
	 */
	smp_mb();

	/* Leave ourselves in tick mode if rcu or softirq or timer pending. */
	if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
	    (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
		cpu_clear(cpu, nohz_cpu_mask);
		j = jiffies + 1;
	}

	if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
		BUG();
}

void start_hz_timer(void)
{
	cpu_clear(smp_processor_id(), nohz_cpu_mask);
}

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel