Xen project Mailing List

You're right. These aren't interrupts in get_s_time_fixed, but rather a cumulative error in the sequence due to integer rounding. I added logging of the current local_stime to local_time_calibration and noticed that the timestamp between cores is gradually increasing. If the server has been running for weeks, this could be a very large value.

```

@@ -1732,6 +1753,8 @@ static void cf_check local_time_calibration(void)

if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )

{

/* Atomically read cpu_calibration struct and write cpu_time struct. */

+ printk("update stime on time calibrate %d, %lu -> %lu (%lu, %lu)\n", smp_processor_id(), t->stamp.local_stime, c->local_stime,

+ t->last_seen_ns, t->last_seen_tsc);

local_irq_disable();

t->stamp = *c;

local_irq_enable();

```

2 hours of work:

```

(XEN) update stime on time calibrate 0, 8564145820102 -> 8565145861597 (8565145862216, 0)

(XEN) update stime on time calibrate 1, 8564145820129 -> 8565145861609 (8565145863957, 0)

(XEN) update stime on time calibrate 3, 8564145819996 -> 8565145861491 (8565145864800, 0)

(XEN) update stime on time calibrate 2, 8564145820099 -> 8565145861609 (8565145865372, 0)

8565145861609 - 8565145861491 = 115 * 3 (3.00 GHz) = 345 lag

```

6 hours of work:

```

(XEN) update stime on time calibrate 0, 22914730829200 -> 22915730869993 (22915730870665, 0)

(XEN) update stime on time calibrate 1, 22914730829073 -> 22915730869889 (22915730870693, 0)

(XEN) update stime on time calibrate 2, 22914730829052 -> 22915730869841 (22915730872231, 0)

(XEN) update stime on time calibrate 3, 22914730828892 -> 22915730869696 (22915730872096, 0)

22915730869993 - 22915730869696 = 297 * 3 (3.00 GHz) = 891 lag

```

Clarification, according to my xen settings:

```

ucode=scan dom0_mem=53923M,max:53923M dom0_max_vcpus=4-96 dom0_vcpus_pin=0 force-ept=1 ept=no-ad,no-pml hap_1gb=0 hap_2mb=0 altp2m=1 hpet=legacy-replacement smt=1 spec-ctrl=no gnttab_max_frames=512 cpufreq=xen:performance max_cstate=1 sched=credit sched-gran=cpu apicv=0 sched_credit_tslice_ms=5 sched_ratelimit_us=500

```

Processors I tested on:

```

Intel(R) Core(TM) i5-3330 CPU @ 3.00GHz

```

Intel(R) Xeon(R) Gold 5318Y CPU @ 2.10GHz x2 (NUMA)

Flags: fpu de tsc msr pae mce cx8 apic sep mca cmov pat clflush acpi mmx fxsr sse sse2 ss ht syscall nx rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid tsc_known_freq pni pclmulqdq monitor est ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch cpuid_fault ibpb fsgsbase bmi1 avx2 bmi2 erms rtm avx512f avx512dq rdseed adx avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 avx512vbmi avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid fsrm md_clear arch_capabilities

```

Next I moved the code to r3 to speed up playback:

```

#include <stdint.h>

#include <stdio.h>

static __inline__ unsigned long long rdtsc(void)

{

unsigned hi, lo;

__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));

return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );

}

#define MILLISECS(_ms) ((int64_t)((_ms) * 1000000ULL))

struct cpu_time_stamp {

uint64_t local_tsc;

int64_t local_stime;

int64_t master_stime;

};

struct time_scale {

int shift;

uint32_t mul_frac;

};

static inline uint32_t div_frac(uint32_t dividend, uint32_t divisor)

{

uint32_t quotient, remainder;

asm (

"divl %4"

: "=a" (quotient), "=d" (remainder)

: "0" (0), "1" (dividend), "r" (divisor) );

return quotient;

}

void set_time_scale(struct time_scale *ts, uint64_t ticks_per_sec)

{

uint64_t tps64 = ticks_per_sec;

uint32_t tps32;

int shift = 0;

while ( tps64 > (MILLISECS(1000)*2) )

{

tps64 >>= 1;

shift--;

}

tps32 = (uint32_t)tps64;

while ( tps32 <= (uint32_t)MILLISECS(1000) )

{

tps32 <<= 1;

shift++;

}

ts->mul_frac = div_frac(MILLISECS(1000), tps32);

ts->shift = shift;

}

uint64_t scale_delta(uint64_t delta, const struct time_scale *scale)

{

uint64_t product;

if ( scale->shift < 0 )

delta >>= -scale->shift;

else

delta <<= scale->shift;

asm (

"mulq %2 ; shrd $32,%1,%0"

: "=a" (product), "=d" (delta)

: "rm" (delta), "0" ((uint64_t)scale->mul_frac) );

return product;

}

#define _TS_MUL_FRAC_IDENTITY 0x80000000UL

static inline struct time_scale scale_reciprocal(struct time_scale scale)

{

struct time_scale reciprocal;

uint32_t dividend;

dividend = _TS_MUL_FRAC_IDENTITY;

reciprocal.shift = 1 - scale.shift;

while ( dividend >= scale.mul_frac )

{

dividend >>= 1;

reciprocal.shift++;

}

asm (

"divl %4"

: "=a" (reciprocal.mul_frac), "=d" (dividend)

: "0" (0), "1" (dividend), "r" (scale.mul_frac) );

return reciprocal;

}

int64_t get_s_time_fixed(const struct cpu_time_stamp *t, const struct time_scale *ts, uint64_t at_tsc)

{

uint64_t tsc, delta;

if ( at_tsc )

tsc = at_tsc;

else

tsc = rdtsc();

delta = tsc - t->local_tsc;

return t->local_stime + scale_delta(delta, ts);

}

int main() {

struct cpu_time_stamp ct;

struct time_scale ts;

struct time_scale back;

uint64_t start = rdtsc();

set_time_scale(&ts, 3000000000);

back = scale_reciprocal(ts);

ct.local_tsc = start;

ct.local_stime = scale_delta(start, &ts);

while (1) {

uint64_t new_tsc = rdtsc();

ct.local_stime = get_s_time_fixed(&ct, &ts, new_tsc);

ct.local_tsc = new_tsc;

uint64_t tmp_tsc = rdtsc();

printf("%lu %lu\n", tmp_tsc, scale_delta(get_s_time_fixed(&ct, &ts, tmp_tsc), &back));

}

return 0;

}

```

It's enough to run the loop for 10-20 seconds to see the problem. scale_delta rounds the least significant bits during calculation, which causes the error to accumulate, at different rates on different cores, depending on the least significant bits at the time of calibration.

Now let's talk about why dwm reacts this way. When a snapshot is reversed, last_guest_time in hvm_get_guest_time_fixed is set to 0, which doesn't prevent time from flowing backwards. This means that cache_tsc_offset in hvm_get_guest_tsc_fixed may be configured correctly on one physical core, but due to shedding on a physical core with a lagging tsc, the guest may occasionally see a tsc value lower than it saw before the snapshot was reversed. If this happens to the dwm code, it terminates with an error.

A quick solution to this problem might be to save the last_seen_tsc parameter in a snapshot for each core, followed by validation.

The correct solution is to remove the rounding of the least significant bits from the sequence.

Re: [PATCH 1/5] x86/time: deal with negative deltas in get_s_time_fixed()