From: jbeulich@xxxxxxxxxx
Subject: reduce contention on xtime_lock
Patch-mainline: n/a
References: bnc#569014, bnc#571041, bnc#571769, bnc#572146

Especially on large systems the number of CPUs queueing up on xtime_lock
may become significant, and (as reported in the bugs above) may even
prevent proper operation of the system when Xen is using deep C-states.
There is, however, no need for all CPUs in the system to update global
time - it is sufficient to have a single (at any given point in time)
CPU being responsible for this.

--- sle11sp1-2010-02-02.orig/arch/x86/kernel/time-xen.c	2010-02-08 08:39:21.000000000 +0100
+++ sle11sp1-2010-02-02/arch/x86/kernel/time-xen.c	2010-02-08 08:40:15.000000000 +0100
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include <linux/cpu.h>
 #include
 #include
 #include
@@ -122,6 +123,20 @@ static int __init __permitted_clock_jitt
 }
 __setup("permitted_clock_jitter=", __permitted_clock_jitter);
 
+static volatile unsigned int duty_cpu = NR_CPUS;
+
+static int __cpuinit duty_cpu_callback(struct notifier_block *nb,
+				       unsigned long action, void *hcpu)
+{
+	if ((action & ~CPU_TASKS_FROZEN) == CPU_DOWN_PREPARE)
+		(void)cmpxchg(&duty_cpu, (unsigned long)hcpu, NR_CPUS);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block __cpuinitdata duty_cpu_notifier = {
+	.notifier_call = duty_cpu_callback
+};
+
 /*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
@@ -407,6 +422,7 @@ unsigned long profile_pc(struct pt_regs
 }
 EXPORT_SYMBOL(profile_pc);
 
+#include <linux/kernel_stat.h> //temp
 /*
  * Default timer interrupt handler
  */
@@ -415,6 +431,7 @@ static irqreturn_t timer_interrupt(int i
 	s64 delta, delta_cpu, stolen, blocked;
 	unsigned int i, cpu = smp_processor_id();
 	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+	bool duty = false;
 	struct vcpu_runstate_info runstate;
 
 	/* Keep nmi watchdog up to date */
@@ -427,7 +444,17 @@ static irqreturn_t timer_interrupt(int i
 	 * the irq version of write_lock because as just said we have irq
 	 * locally disabled. -arca
 	 */
-	write_seqlock(&xtime_lock);
+	do {
+		i = duty_cpu;
+		if (i == cpu
+		    || ((i == NR_CPUS
+			 || per_cpu(runstate, i).state != RUNSTATE_running)
+			&& cmpxchg(&duty_cpu, i, cpu) == i)) {
+			duty = true;
+			write_seqlock(&xtime_lock);
+			break;
+		}
+	} while (i != duty_cpu);
 
 	do {
 		get_time_values_from_xen(cpu);
@@ -441,8 +468,7 @@ static irqreturn_t timer_interrupt(int i
 		get_runstate_snapshot(&runstate);
 	} while (!time_values_up_to_date());
 
-	if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
-	     unlikely(delta_cpu < -(s64)permitted_clock_jitter))
+	if (duty && unlikely(delta < -(s64)permitted_clock_jitter)
 	    && printk_ratelimit()) {
 		printk("Timer ISR/%u: Time went backwards: "
 		       "delta=%lld delta_cpu=%lld shadow=%lld "
@@ -454,27 +480,45 @@ static irqreturn_t timer_interrupt(int i
 		for (i = 0; i < num_online_cpus(); i++)
 			printk(" %d: %lld\n", i,
 			       per_cpu(processed_system_time, i));
+	} else if (unlikely(delta_cpu < -(s64)permitted_clock_jitter)
+		   && printk_ratelimit()) {
+		if (!duty)
+			write_seqlock(&xtime_lock);
+		printk("Timer ISR/%u: Time went backwards: "
+		       "delta=-%Lx shadow=%Lx off=%Lx processed=%Lx/%Lx\n",
+		       cpu, -delta_cpu, shadow->system_timestamp,
+		       get_nsec_offset(shadow), processed_system_time,
+		       per_cpu(processed_system_time, cpu));
+		for_each_cpu_and(i, cpu_online_mask, cpumask_of(cpu))
+			printk(" %u: %Lx\n", i,
+			       per_cpu(processed_system_time, i));
+		if (!duty)
+			write_sequnlock(&xtime_lock);
 	}
 
-	/* System-wide jiffy work. */
-	if (delta >= NS_PER_TICK) {
-		do_div(delta, NS_PER_TICK);
-		processed_system_time += delta * NS_PER_TICK;
-		while (delta > HZ) {
-			clobber_induction_variable(delta);
-			do_timer(HZ);
-			delta -= HZ;
+	if (duty) {
+kstat_incr_irqs_this_cpu(0, irq_to_desc(0));//temp
+		/* System-wide jiffy work. */
+		if (delta >= NS_PER_TICK) {
+			do_div(delta, NS_PER_TICK);
+			processed_system_time += delta * NS_PER_TICK;
+			while (delta > HZ) {
+				clobber_induction_variable(delta);
+				do_timer(HZ);
+				delta -= HZ;
+			}
+			do_timer(delta);
 		}
-		do_timer(delta);
-	}
 
-	if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
-		update_wallclock();
-		if (keventd_up())
-			schedule_work(&clock_was_set_work);
-	}
+		if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
+			update_wallclock();
+			if (keventd_up())
+				schedule_work(&clock_was_set_work);
+		}
 
-	write_sequnlock(&xtime_lock);
+		write_sequnlock(&xtime_lock);
+	}
+else percpu_add(mce_exception_count, 1);//temp
 
 	/*
 	 * Account stolen ticks.
@@ -692,6 +736,7 @@ static void __init setup_cpu0_timer_irq(
 {
 	timer_irq = bind_virq_to_irqaction(VIRQ_TIMER, 0, &timer_action);
 	BUG_ON(timer_irq < 0);
+	register_hotcpu_notifier(&duty_cpu_notifier);
 }
 
 void __init time_init(void)
@@ -765,6 +810,7 @@ static void stop_hz_timer(void)
 	unsigned long j;
 	int rc;
 
+	(void)cmpxchg(&duty_cpu, cpu, NR_CPUS);
 	cpumask_set_cpu(cpu, nohz_cpu_mask);
 
 	/* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */
@@ -786,9 +832,16 @@ static void stop_hz_timer(void)
 	}
 
 	singleshot.timeout_abs_ns = jiffies_to_st(j);
-	if (singleshot.timeout_abs_ns)
+	if (singleshot.timeout_abs_ns) {
+#ifdef CONFIG_X86_64
+		u64 local = percpu_read(processed_system_time);
+#else
+		u64 local = get64_local(&per_cpu(processed_system_time, cpu));
+#endif
+		if ((s64)(singleshot.timeout_abs_ns - local) <= 0)
+			singleshot.timeout_abs_ns = local + NS_PER_TICK;
 		singleshot.timeout_abs_ns += NS_PER_TICK/2;
-	else
+	} else
 		singleshot.timeout_abs_ns = per_cpu(processed_system_time,
 						    cpu) + ((u64)NS_PER_TICK << 32);
 	singleshot.flags = 0;
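
For reference, the claim/release protocol implemented above can be modeled
in plain userspace C. This is a minimal standalone sketch, not code from the
tree: a pthread mutex stands in for xtime_lock, C11 atomics for cmpxchg(),
the vCPU runstate check is omitted, and all identifiers (duty_claim,
duty_release, NR_CPUS_SIM, tick_loop) are invented for the example.

/* Build: gcc -O2 -pthread duty-demo.c -o duty-demo */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS_SIM 8			/* stands in for NR_CPUS */

static atomic_uint duty_cpu = NR_CPUS_SIM;	/* NR_CPUS_SIM == "unclaimed" */
static pthread_mutex_t time_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long global_time;		/* global jiffy count stand-in */
static unsigned long long local_ticks[NR_CPUS_SIM]; /* per-CPU time stand-in */

/* Claim duty if we already hold it or if it is unclaimed (the kernel patch
 * additionally lets a running vCPU displace a non-running duty holder). */
static bool duty_claim(unsigned int cpu)
{
	unsigned int holder;

	do {
		holder = atomic_load(&duty_cpu);
		if (holder == cpu)
			return true;
		if (holder == NR_CPUS_SIM &&
		    atomic_compare_exchange_strong(&duty_cpu, &holder, cpu))
			return true;
	} while (holder != atomic_load(&duty_cpu));
	return false;
}

/* Release duty only if we hold it, as the patch does on idle entry
 * (stop_hz_timer) and in the CPU_DOWN_PREPARE notifier. */
static void duty_release(unsigned int cpu)
{
	unsigned int expected = cpu;

	atomic_compare_exchange_strong(&duty_cpu, &expected, NR_CPUS_SIM);
}

static void *tick_loop(void *arg)
{
	unsigned int cpu = (unsigned int)(uintptr_t)arg;

	for (int i = 0; i < 100000; i++) {
		local_ticks[cpu]++;		/* per-CPU time: every CPU */
		if (duty_claim(cpu)) {		/* global time: duty CPU only */
			pthread_mutex_lock(&time_lock);
			global_time++;
			pthread_mutex_unlock(&time_lock);
		}
		if ((i & 0x3ff) == 0)
			duty_release(cpu);	/* pretend to go idle */
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NR_CPUS_SIM];

	for (unsigned int c = 0; c < NR_CPUS_SIM; c++)
		pthread_create(&t[c], NULL, tick_loop, (void *)(uintptr_t)c);
	for (unsigned int c = 0; c < NR_CPUS_SIM; c++)
		pthread_join(t[c], NULL);
	printf("global_time=%llu, duty holder at exit=%u\n",
	       global_time, atomic_load(&duty_cpu));
	return 0;
}

The property mirrored from the patch: the write side of the global time lock
is taken by at most one CPU at any given time, while global time still
advances as long as any CPU is ticking, and the duty hands off when the
holder idles or goes offline.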
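
Separately, the stop_hz_timer() hunk clamps the one-shot timeout so that it
never lies in the past relative to this CPU's processed_system_time, using a
wraparound-safe signed-difference comparison. A minimal illustration of that
arithmetic only, with invented names and example values:

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "a is not after b" for free-running 64-bit timestamps;
 * the same idiom as the patch's
 *   if ((s64)(singleshot.timeout_abs_ns - local) <= 0)
 */
static int time_before_eq64(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) <= 0;
}

int main(void)
{
	const uint64_t ns_per_tick = 1000000;	/* example tick length only */
	uint64_t local = 5000000;		/* this CPU's processed time */
	uint64_t timeout = 4200000;		/* requested wakeup: already past */

	if (time_before_eq64(timeout, local))
		timeout = local + ns_per_tick;	/* never program a past wakeup */
	timeout += ns_per_tick / 2;		/* mid-tick rounding, as in the patch */

	printf("programmed wakeup: %llu ns\n", (unsigned long long)timeout);
	return 0;
}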