[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [RFC] draft tsc_mode patch (to replace tsc_native)



This isn't done yet, but contains the core code for
implementing the mechanism I've been proposing for
handling "tsc_mode" (to replace tsc_native/vtsc),
so I thought I'd ask for some feedback hopefully before
reviewers leave for Xen Summit Asia.

The tsc_mode is set in the VM config file as:

#define TSC_MODE_DEFAULT          0
#define TSC_MODE_ALWAYS_EMULATE   1
#define TSC_MODE_NEVER_EMULATE    2
#define TSC_MODE_PVRDTSCP         3

0 = guest rdtsc/p executed natively when monotonicity can be guaranteed
     and emulated otherwise (with frequency scaled if necessary)
1 = guest rdtsc/p always emulated at 1GHz (kernel and user)
2 = guest rdtsc always executed natively (no monotonicity/frequency
     guarantees); guest rdtscp emulated at native frequency if
     unsupported by h/w, else executed natively
3 = same as 2, except xen manages TSC_AUX register so guest can
     determine when a restore/migration has occurred and assumes
     guest obtains/uses pvclock-like mechanism to adjust for
     monotonicity and frequency changes

Tsc_mode must be persistent across save/restore/migration.
In addition, an offset, a tsc_khz, and an "incarnation" counter
are deduced on creation, and then must be persistent across
save/restore/migrate (though some are ignored for some tsc modes).

A key improvement over the previous tsc_native implementation
is that if TSC is "safe", the default tsc_mode does not
use emulation until after the first save/restore/migrate
(mimicking the previous implementation by Intel for HVM).
Since a surprising number of machines are now "TSC safe"
and since most domains never get saved/migrated, the
vast majority of VMs will never suffer the performance
penalty of emulated TSC even though TSC correctness for
applications is still provided.

Note that I haven't removed the tsc_native code yet;
some functionality is still in progress,
and there's still a fair amount of debug code to be
removed.

Thanks for any feedback!
Dan

diff -r 494ad84ad38c tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c   Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/libxc/xc_domain.c   Thu Nov 12 16:48:53 2009 -0700
@@ -483,6 +483,52 @@ int xc_domain_disable_migrate(int xc_han
     domctl.u.disable_migrate.disable = 1;
     return do_domctl(xc_handle, &domctl);
 }
+
+/*
+ * Send a domain's TSC settings to the hypervisor with
+ * XEN_DOMCTL_settscinfo.  The four values are passed through unchanged.
+ * Returns whatever do_domctl() returns.
+ */
+int xc_domain_set_tsc_info(int xc_handle,
+                           uint32_t domid,
+                           uint32_t tsc_mode,
+                           uint64_t elapsed_nsec,
+                           uint32_t gtsc_khz,
+                           uint32_t incarnation)
+{
+    DECLARE_DOMCTL;
+    xen_guest_tsc_info_t *req = &domctl.u.tsc_info.info;
+
+    domctl.cmd    = XEN_DOMCTL_settscinfo;
+    domctl.domain = (domid_t)domid;
+
+    req->tsc_mode     = tsc_mode;
+    req->elapsed_nsec = elapsed_nsec;
+    req->gtsc_khz     = gtsc_khz;
+    req->incarnation  = incarnation;
+
+    return do_domctl(xc_handle, &domctl);
+}
+
+/*
+ * Read a domain's TSC settings with XEN_DOMCTL_gettscinfo.
+ * The hypervisor writes into a locked, zero-initialised local buffer;
+ * when do_domctl() returns 0 its fields are copied to the four output
+ * pointers, otherwise the outputs are left untouched.
+ * Returns lock_pages()'s result if locking fails, else do_domctl()'s.
+ */
+int xc_domain_get_tsc_info(int xc_handle,
+                           uint32_t domid,
+                           uint32_t *tsc_mode,
+                           uint64_t *elapsed_nsec,
+                           uint32_t *gtsc_khz,
+                           uint32_t *incarnation)
+{
+    DECLARE_DOMCTL;
+    xen_guest_tsc_info_t info = { 0 };
+    int rc;
+
+    rc = lock_pages(&info, sizeof(info));
+    if ( rc != 0 )
+        return rc;
+
+    domctl.cmd = XEN_DOMCTL_gettscinfo;
+    domctl.domain = (domid_t)domid;
+    set_xen_guest_handle(domctl.u.tsc_info.out_info, &info);
+
+    rc = do_domctl(xc_handle, &domctl);
+    if ( rc == 0 )
+    {
+        *tsc_mode     = info.tsc_mode;
+        *elapsed_nsec = info.elapsed_nsec;
+        *gtsc_khz     = info.gtsc_khz;
+        *incarnation  = info.incarnation;
+    }
+
+    unlock_pages(&info, sizeof(info));
+    return rc;
+}
+
 
 int xc_domain_memory_increase_reservation(int xc_handle,
                                           uint32_t domid,
diff -r 494ad84ad38c tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c   Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/libxc/xc_domain_restore.c   Thu Nov 12 16:48:53 2009 -0700
@@ -1083,6 +1083,23 @@ static int pagebuf_get_one(pagebuf_t* bu
             ERROR("error reading/restoring tmem extra");
             return -1;
         }
+        return pagebuf_get_one(buf, fd, xch, dom);
+    } else if ( count == -7 ) {
+        uint32_t tsc_mode, khz, incarn;
+        uint64_t nsec;
+        /*
+         * Read the four tsc_info fields in the order the save side wrote
+         * them and hand them to the hypervisor; any short read or domctl
+         * failure aborts the restore.
+         */
+        if ( read_exact(fd, &tsc_mode, sizeof(uint32_t)) ||
+             read_exact(fd, &nsec, sizeof(uint64_t)) ||
+             read_exact(fd, &khz, sizeof(uint32_t)) ||
+             read_exact(fd, &incarn, sizeof(uint32_t)) ||
+             xc_domain_set_tsc_info(xch, dom, tsc_mode, nsec, khz, incarn) ) {
+            ERROR("error reading/restoring tsc info");
+            return -1;
+        }
+        /* no PRIxxx formatting allowed here???
+        DPRINTF("tsc_info read: mode=%"PRIu32",ns=0x%"PRIx64","
+                "khz=%"PRIu32",incarn=%"PRIu32"\n",
+                tsc_mode, nsec, khz, incarn);
+        */
         return pagebuf_get_one(buf, fd, xch, dom);
     } else if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
         ERROR("Max batch size exceeded (%d). Giving up.", count);
diff -r 494ad84ad38c tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c      Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/libxc/xc_domain_save.c      Thu Nov 12 16:48:53 2009 -0700
@@ -1099,6 +1099,28 @@ int xc_domain_save(int xc_handle, int io
         ERROR("Error when writing to state file (tmem)");
         goto out;
     }
+
+                {
+                    int marker = -7;
+                    uint32_t tsc_mode, khz, incarn;
+                    uint64_t nsec;
+                    if ( xc_domain_get_tsc_info(xc_handle, dom, &tsc_mode,
+                                                &nsec, &khz, &incarn) < 0  ||
+                         write_exact(io_fd, &marker, sizeof(marker)) ||
+                         write_exact(io_fd, &tsc_mode, sizeof(tsc_mode)) ||
+                         write_exact(io_fd, &nsec, sizeof(nsec)) ||
+                         write_exact(io_fd, &khz, sizeof(khz)) ||
+                         write_exact(io_fd, &incarn, sizeof(incarn)) )
+                    {
+                        ERROR("Error when writing to state file (tsc)");
+                        goto out;
+                    }
+                    /* no PRIxxx formatting?
+                    DPRINTK("tsc_info written: mode=%"PRIu32",ns=0x%"PRIx64","
+                            "khz=%"PRIu32",incarn=%"PRIu32"\n",
+                            tsc_mode, nsec, khz, incarn);
+                    */
+                 }
 
   copypages:
 #define write_exact(fd, buf, len) write_buffer(last_iter, &ob, (fd), (buf), 
(len))
@@ -1707,6 +1729,7 @@ int xc_domain_save(int xc_handle, int io
             PERROR("Error when writing to state file (2)");
             goto out;
         }
+
     }
 
     /*
diff -r 494ad84ad38c tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/libxc/xenctrl.h     Thu Nov 12 16:48:53 2009 -0700
@@ -627,6 +627,20 @@ int xc_domain_set_time_offset(int xc_han
 int xc_domain_set_time_offset(int xc_handle,
                               uint32_t domid,
                               int32_t time_offset_seconds);
+
+int xc_domain_set_tsc_info(int xc_handle,
+                           uint32_t domid,
+                           uint32_t tsc_mode,
+                           uint64_t elapsed_nsec,
+                           uint32_t gtsc_khz,
+                           uint32_t incarnation);
+
+int xc_domain_get_tsc_info(int xc_handle,
+                           uint32_t domid,
+                           uint32_t *tsc_mode,
+                           uint64_t *elapsed_nsec,
+                           uint32_t *gtsc_khz,
+                           uint32_t *incarnation);
 
 int xc_domain_set_tsc_native(int xc_handle, uint32_t domid, int is_native);
 
diff -r 494ad84ad38c tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/python/xen/lowlevel/xc/xc.c Thu Nov 12 16:48:53 2009 -0700
@@ -1479,6 +1479,20 @@ static PyObject *pyxc_domain_set_tsc_nat
         return NULL;
 
     if (xc_domain_set_tsc_native(self->xc_handle, dom, is_native) != 0)
+        return pyxc_error_to_exception();
+
+    Py_INCREF(zero);
+    return zero;
+}
+
+static PyObject *pyxc_domain_set_tsc_mode(XcObject *self, PyObject *args)
+{
+    uint32_t dom, tsc_mode;
+
+    if (!PyArg_ParseTuple(args, "ii", &dom, &tsc_mode))
+        return NULL;
+
+    if (xc_domain_set_tsc_info(self->xc_handle, dom, tsc_mode, 0, 0, 0) != 0)
         return pyxc_error_to_exception();
 
     Py_INCREF(zero);
@@ -2029,6 +2043,15 @@ static PyMethodDef pyxc_methods[] = {
       " is_native  [int]: 1=native, 0=emulate.\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
+    { "domain_set_tsc_mode",
+      (PyCFunction)pyxc_domain_set_tsc_mode,
+      METH_VARARGS, "\n"
+      "Set a domain's TSC mode\n"
+      " dom        [int]: Domain whose TSC mode is being set.\n"
+      " tsc_mode   [int]: 0=default (monotonic, but native where possible)\n"
+      "                   1=always emulate 2=never emulate 3=pvrdtscp\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
     { "domain_disable_migrate",
       (PyCFunction)pyxc_domain_disable_migrate,
       METH_VARARGS, "\n"
diff -r 494ad84ad38c tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py       Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/python/xen/xend/XendConfig.py       Thu Nov 12 16:48:53 2009 -0700
@@ -164,6 +164,7 @@ XENAPI_PLATFORM_CFG_TYPES = {
     'vnclisten': str,
     'timer_mode': int,
     'tsc_native': int,
+    'tsc_mode': int,
     'vpt_align': int,
     'viridian': int,
     'vncpasswd': str,
@@ -479,6 +480,9 @@ class XendConfig(dict):
 
         if 'tsc_native' not in self['platform']:
             self['platform']['tsc_native'] = 0
+
+        if 'tsc_mode' not in self['platform']:
+            self['platform']['tsc_mode'] = 0
 
         if 'nomigrate' not in self['platform']:
             self['platform']['nomigrate'] = 0
diff -r 494ad84ad38c tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/python/xen/xend/XendDomainInfo.py   Thu Nov 12 16:48:53 2009 -0700
@@ -2467,6 +2467,11 @@ class XendDomainInfo:
         if arch.type == "x86" and tsc_native is not None:
             xc.domain_set_tsc_native(self.domid, int(tsc_native))
 
+        # Set TSC mode of domain
+        tsc_mode = self.info["platform"].get("tsc_mode")
+        if arch.type == "x86" and tsc_mode is not None:
+            xc.domain_set_tsc_native(self.domid, int(tsc_mode))
+
         # Set timer configuration of domain
         timer_mode = self.info["platform"].get("timer_mode")
         if hvm and timer_mode is not None:
diff -r 494ad84ad38c tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py     Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/python/xen/xm/create.py     Thu Nov 12 16:48:53 2009 -0700
@@ -224,6 +224,10 @@ gopts.var('tsc_native', val='TSC_NATIVE'
 gopts.var('tsc_native', val='TSC_NATIVE',
           fn=set_int, default=0,
           use="""TSC mode (0=emulate TSC, 1=native TSC).""")
+
+gopts.var('tsc_mode', val='TSC_MODE',
+          fn=set_int, default=0,
+          use="""TSC mode (0=default, 1=always emulate, 2=never emulate, 
3=pvrdtscp).""")
 
 gopts.var('nomigrate', val='NOMIGRATE',
           fn=set_int, default=0,
@@ -741,6 +745,10 @@ def configure_image(vals):
     if vals.tsc_native is not None:
         config_image.append(['tsc_native', vals.tsc_native])
 
+    # DJM DJM don't think I need this???
+    #if vals.tsc_mode is not None:
+    #    config_image.append(['tsc_mode', vals.tsc_mode])
+
     if vals.nomigrate is not None:
         config_image.append(['nomigrate', vals.nomigrate])
 
@@ -1027,7 +1035,7 @@ def make_config(vals):
                 config.append([n, v])
 
     map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
-                   'restart', 'on_poweroff', 'tsc_native', 'nomigrate',
+                   'restart', 'on_poweroff', 'tsc_native', 'tsc_mode', 
'nomigrate',
                    'on_reboot', 'on_crash', 'vcpus', 'vcpu_avail', 'features',
                    'on_xend_start', 'on_xend_stop', 'target', 'cpuid',
                    'cpuid_check', 'machine_address_size', 
'suppress_spurious_page_faults'])
diff -r 494ad84ad38c tools/python/xen/xm/xenapi_create.py
--- a/tools/python/xen/xm/xenapi_create.py      Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/python/xen/xm/xenapi_create.py      Thu Nov 12 16:48:53 2009 -0700
@@ -1078,6 +1078,7 @@ class sxp2xml:
             'pci_power_mgmt',
             'xen_platform_pci',
             'tsc_native'
+            'tsc_mode'
             'description',
             'nomigrate'
         ]
diff -r 494ad84ad38c xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Mon Nov 09 22:41:23 2009 +0000
+++ b/xen/arch/x86/domain.c     Thu Nov 12 16:48:53 2009 -0700
@@ -520,6 +520,9 @@ int arch_domain_create(struct domain *d,
         d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
     }
 
+    /* initialize default tsc behavior in case tools don't */
+if (d->domain_id) //DJM
+    tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
     spin_lock_init(&d->arch.vtsc_lock);
 
     return 0;
diff -r 494ad84ad38c xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c     Mon Nov 09 22:41:23 2009 +0000
+++ b/xen/arch/x86/domctl.c     Thu Nov 12 16:48:53 2009 -0700
@@ -1111,9 +1111,65 @@ long arch_do_domctl(
             break;
 
         domain_pause(d);
-        d->arch.vtsc = !domctl->u.set_tsc_native.is_native;
-        if ( is_hvm_domain(d) )
-            hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+printk("DJM DJM ignoring set_tsc_native for now\n");
+        //d->arch.vtsc = !domctl->u.set_tsc_native.is_native;
+        //if ( is_hvm_domain(d) )
+            //hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+        domain_unpause(d);
+
+        rcu_unlock_domain(d);
+        ret = 0;
+    }
+    break;
+
+    case XEN_DOMCTL_gettscinfo:
+    {
+        struct domain *d;
+        /*
+         * Zero-fill: the whole struct, including 'pad' and any field
+         * tsc_get_info() does not write, is copied out to the caller
+         * below and must not carry stale hypervisor stack contents.
+         */
+        xen_guest_tsc_info_t info = { 0 };
+
+        /* -ESRCH if the domain id does not resolve */
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(domctl->domain);
+        if ( d == NULL )
+            break;
+
+        domain_pause(d);
+        tsc_get_info(d, &info.tsc_mode,
+                        &info.elapsed_nsec,
+                        &info.gtsc_khz,
+                        &info.incarnation);
+        /* hand exactly one xen_guest_tsc_info_t back through out_info */
+        if ( copy_to_guest(domctl->u.tsc_info.out_info, &info, 1) )
+        {
+            printk("XEN_DOMCTL_gettscinfo: copy failed, out_info=%p\n",
+                   domctl->u.tsc_info.out_info.p);
+            ret = -EFAULT;
+        }
+        else
+            ret = 0;
+        domain_unpause(d);
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_settscinfo:
+    {
+        struct domain *d;
+
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(domctl->domain);
+        if ( d == NULL )
+            break;
+
+        domain_pause(d);
+        tsc_set_info(d, domctl->u.tsc_info.info.tsc_mode,
+                     domctl->u.tsc_info.info.elapsed_nsec,
+                     domctl->u.tsc_info.info.gtsc_khz,
+                     domctl->u.tsc_info.info.incarnation);
         domain_unpause(d);
 
         rcu_unlock_domain(d);
diff -r 494ad84ad38c xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Mon Nov 09 22:41:23 2009 +0000
+++ b/xen/arch/x86/time.c       Thu Nov 12 16:48:53 2009 -0700
@@ -818,6 +818,7 @@ static void __update_vcpu_system_time(st
     struct cpu_time       *t;
     struct vcpu_time_info *u, _u;
     XEN_GUEST_HANDLE(vcpu_time_info_t) user_u;
+    s_time_t vtsc_stamp = 0;
 
     if ( v->vcpu_info == NULL )
         return;
@@ -825,9 +826,15 @@ static void __update_vcpu_system_time(st
     t = &this_cpu(cpu_time);
     u = &vcpu_info(v, time);
 
+    if ( v->domain->arch.vtsc )
+        /* FIXME: need scaling here too? */
+        vtsc_stamp = t->stime_local_stamp - v->domain->arch.vtsc_offset;
+    else if ( v->domain->arch.pvrdtscp )
+        /* FIXME: write tsc_aux here? */;
+
     /* Don't bother unless timestamps have changed or we are forced. */
     if ( !force && (u->tsc_timestamp == (v->domain->arch.vtsc
-                                         ? t->stime_local_stamp
+                                         ? vtsc_stamp
                                          : t->local_tsc_stamp)) )
         return;
 
@@ -835,8 +842,8 @@ static void __update_vcpu_system_time(st
 
     if ( v->domain->arch.vtsc )
     {
-        _u.tsc_timestamp     = t->stime_local_stamp;
-        _u.system_time       = t->stime_local_stamp;
+        _u.tsc_timestamp     = vtsc_stamp;
+        _u.system_time       = vtsc_stamp;
         _u.tsc_to_system_mul = 0x80000000u;
         _u.tsc_shift         = 1;
     }
@@ -1598,8 +1605,126 @@ void pv_soft_rdtsc(struct vcpu *v, struc
 
     spin_unlock(&v->domain->arch.vtsc_lock);
 
+    now -= v->domain->arch.vtsc_offset;
+    if ( v->domain->arch.vtsc_shift != 1 ||
+         v->domain->arch.vtsc_mul_frac != 0x80000000u )
+    {
+/* FIXME
+        struct time_scale scale;
+        scale.shift = v->domain->arch.vtsc_shift;
+        scale.mul_frac = v->domain->arch.vtsc_mul_frac;
+        now = scale_delta(now, &scale);
+*/
+    }
+
     regs->eax = (uint32_t)now;
     regs->edx = (uint32_t)(now >> 32);
+}
+
+/*
+ * Decide whether the host TSC is treated as safe.  Returns 1 when any of
+ * the following holds, otherwise 0:
+ *   - the boot CPU has X86_FEATURE_TSC_RELIABLE;
+ *   - exactly one CPU is online;
+ *   - the boot CPU has X86_FEATURE_CONSTANT_TSC and max_cstate <= 2.
+ */
+int host_tsc_is_safe(void)
+{
+    extern unsigned int max_cstate;
+
+    return ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ||
+             num_online_cpus() == 1 ||
+             (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && max_cstate <= 2) );
+}
+
+/*
+ * Called to collect tsc-related data only for save file or live migrate.
+ *
+ * Stores d's tsc_mode in *tsc_mode and fills the remaining outputs
+ * according to that mode.  All three are zeroed up front, so a field the
+ * current mode does not report (and every field for an unrecognised mode)
+ * reads back as 0 instead of being left uninitialised for the caller to
+ * print or copy out.
+ */
+void tsc_get_info(struct domain *d, uint32_t *tsc_mode, uint64_t *elapsed_nsec,
+                  uint32_t *gtsc_khz, uint32_t *incarnation)
+{
+    *elapsed_nsec = 0;
+    *gtsc_khz = 0;
+    *incarnation = 0;
+
+    switch ( *tsc_mode = d->arch.tsc_mode )
+    {
+    case TSC_MODE_NEVER_EMULATE:
+        /* only the mode itself is reported */
+        break;
+    case TSC_MODE_ALWAYS_EMULATE:
+        *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; /* FIXME scale? */
+        *gtsc_khz = 1000000UL;
+        break;
+    case TSC_MODE_DEFAULT:
+        *incarnation = d->arch.incarnation;
+        *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; /* FIXME scale? */
+        /* native domains report the host rate, emulated ones 1GHz */
+        *gtsc_khz =  ( d->arch.vtsc == 0 ) ?  cpu_khz : 1000000UL;
+        break;
+    case TSC_MODE_PVRDTSCP:
+        *incarnation = d->arch.incarnation;
+        *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; /* FIXME scale? */
+        *gtsc_khz =  d->arch.tsc_khz;
+        break;
+    default:
+        /* unknown mode: outputs stay zeroed */
+        break;
+    }
+printk("DJM tsc_get_info got: dom%u,mode=%d,ns=0x%lx,khz=%d,inc=%d\n",
+d->domain_id,(int)*tsc_mode,(long)*elapsed_nsec,(int)*gtsc_khz,(int)*incarnation);
+}
+
+/*
+ * Called to set tsc-related data only on restore or target of live migrate.
+ *
+ * Records tsc_mode in d->arch.tsc_mode and, depending on the mode, sets
+ * d->arch.vtsc (1 == emulate), the vtsc offset/scale fields, tsc_khz and
+ * the incarnation counter.  For HVM domains rdtsc exiting is then
+ * re-evaluated from the resulting vtsc state.
+ *
+ *  d            - domain being configured
+ *  tsc_mode     - one of the TSC_MODE_* values
+ *  elapsed_nsec - subtracted from get_s_time() to form vtsc_offset
+ *  gtsc_khz     - guest TSC rate in kHz; 0 selects the per-mode fallback
+ *  incarnation  - value from the saved image; 0 on first creation
+ */
+void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
+                  uint32_t gtsc_khz, uint32_t incarnation)
+{
+    struct time_scale scale;
+
+printk("DJM tsc_set_info before: dom%u,mode=%d,ns=0x%lx,khz=%d,inc=%d\n",
+d->domain_id,(int)tsc_mode,(long)elapsed_nsec,(int)gtsc_khz,(int)incarnation);
+    switch ( d->arch.tsc_mode = tsc_mode )
+    {
+    case TSC_MODE_NEVER_EMULATE:
+        d->arch.vtsc = 0;
+        break;
+    case TSC_MODE_ALWAYS_EMULATE:
+        d->arch.vtsc = 1;
+        d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+        /* shift 1 / mul_frac 0x80000000 is the identity (1GHz) scale */
+        d->arch.vtsc_shift = 1;
+        d->arch.vtsc_mul_frac = 0x80000000U;
+        break;
+    case TSC_MODE_DEFAULT:
+        if ( host_tsc_is_safe() && incarnation == 0 )
+        {
+            /* first incarnation on a safe host: run natively */
+            d->arch.vtsc = 0;
+            d->arch.incarnation = 1;
+            d->arch.tsc_khz = gtsc_khz ? gtsc_khz : cpu_khz;
+            /* d->arch.vtsc_shift/mul_frac/offset will not be used */
+        } else if ( gtsc_khz != 0  && gtsc_khz != 1000000 ) {
+printk("DJM tsc_set_info A: khz=%d\n",gtsc_khz);
+            d->arch.vtsc = 1;
+            /* widen first: khz * 1000 overflows 32 bits above ~4.29GHz */
+            set_time_scale(&scale, (uint64_t)gtsc_khz * 1000);
+            d->arch.vtsc_shift = scale.shift;
+            d->arch.vtsc_mul_frac = scale.mul_frac;
+            d->arch.vtsc_offset = get_s_time() - elapsed_nsec; /* FIXME? */
+            /* count this restore/migrate, as the 1GHz branch below does */
+            d->arch.incarnation = incarnation + 1;
+        } else {
+            d->arch.vtsc = 1;
+            d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+            d->arch.vtsc_shift = 1;
+            d->arch.vtsc_mul_frac = 0x80000000U;
+            d->arch.incarnation = incarnation + 1;
+printk("DJM tsc_set_info B: offset=%ld\n",(long)d->arch.vtsc_offset);
+        }
+        break;
+    case TSC_MODE_PVRDTSCP:
+        /* if (hardware supports rdtscp instruction) FIXME */
+        d->arch.pvrdtscp = 1;
+        d->arch.vtsc = 0;
+        if ( gtsc_khz != 0 ) {
+            /* widen first: khz * 1000 overflows 32 bits above ~4.29GHz */
+            set_time_scale(&scale, (uint64_t)gtsc_khz * 1000);
+            d->arch.vtsc_shift = scale.shift;
+            d->arch.vtsc_mul_frac = scale.mul_frac;
+            /* NOTE(review): incarnation/tsc_khz are not updated on this
+             * path -- confirm whether that is intended */
+        } else {
+            d->arch.vtsc = 1;
+            d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+            d->arch.vtsc_shift = 1;
+            d->arch.vtsc_mul_frac = 0x80000000U;
+            d->arch.incarnation = incarnation + 1;
+        }
+        break;
+    }
+    if ( is_hvm_domain(d) )
+        hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+printk("DJM tsc_set_info after: dom%u,vtsc=%d,ofs=0x%lx,sh=%d,mulfrac=0x%08x,inc=%d\n",
+d->domain_id,(int)d->arch.vtsc,(long)d->arch.vtsc_offset,(int)d->arch.vtsc_shift,(int)d->arch.vtsc_mul_frac,(int)d->arch.incarnation);
 }
 
 /* vtsc may incur measurable performance degradation, diagnose with this */
@@ -1607,33 +1732,51 @@ static void dump_softtsc(unsigned char k
 {
     struct domain *d;
     int domcnt = 0;
+    extern unsigned int max_cstate;
 
     tsc_check_reliability();
     if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
         printk("TSC marked as reliable, "
                "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
     else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
-        printk("TSC marked as constant but not reliable, "
-               "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
-    else
+    {
+        printk("TSC has constant rate, ");
+        if (max_cstate <= 2)
+            printk("no deep Cstates possible, so deemed reliable, ");
+        else
+            printk("deep Cstates possible, so not reliable, ");
+        printk("warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
+    } else
         printk("TSC not marked as either constant or reliable, "
                "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
     for_each_domain ( d )
     {
+        if ( d->domain_id == 0 && d->arch.tsc_mode == TSC_MODE_DEFAULT )
+            continue;
+        printk("dom%u%s: mode=%d",d->domain_id,
+                is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
+        if ( d->arch.vtsc_offset )
+            printk(",ofs=0x%"PRIx64"",d->arch.vtsc_offset);
+        if ( d->arch.tsc_khz )
+            printk(",khz=%"PRIu32"",d->arch.tsc_khz);
+        if ( d->arch.incarnation )
+            printk(",inc=%"PRIu32"",d->arch.incarnation);
         if ( !d->arch.vtsc )
+        {
+            printk("\n");
             continue;
+        }
         if ( is_hvm_domain(d) )
-            printk("dom%u (hvm) vtsc count: %"PRIu64" total\n",
-                   d->domain_id, d->arch.vtsc_kerncount);
+            printk(",vtsc count: %"PRIu64" total\n",
+                   d->arch.vtsc_kerncount);
         else
-            printk("dom%u vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
-                   d->domain_id, d->arch.vtsc_kerncount,
-                   d->arch.vtsc_usercount);
+            printk(",vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
+                   d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
         domcnt++;
     }
 
     if ( !domcnt )
-            printk("All domains have native TSC\n");
+            printk("No domains have emulated TSC\n");
 }
 
 static struct keyhandler dump_softtsc_keyhandler = {
diff -r 494ad84ad38c xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Mon Nov 09 22:41:23 2009 +0000
+++ b/xen/include/asm-x86/domain.h      Thu Nov 12 16:48:53 2009 -0700
@@ -299,9 +299,17 @@ struct arch_domain
     struct domain_mca_msrs vmca_msrs;
 
     /* SoftTSC emulation */
-    bool_t vtsc;
-    s_time_t vtsc_last;
+    int tsc_mode;            /* see include/asm-x86/time.h */
+    bool_t vtsc;             /* 1 == enable tsc emulation */
+    bool_t pvrdtscp;         /* set TSC_AUX to incarnation on all vcpus */
+    s_time_t vtsc_last;      /* previous value (to guarantee monotonicity) */
     spinlock_t vtsc_lock;
+    uint64_t vtsc_offset;    /* adjustment for save/restore/migrate */
+    uint32_t tsc_khz;        /* cached khz for certain emulated cases */
+    uint32_t vtsc_shift;     /* cached scaling for certain emulated cases */
+    uint32_t vtsc_mul_frac;  /* cached scaling for certain emulated cases */
+    uint32_t incarnation;    /* incremented every restore or live migrate
+                                (possibly other cases in the future */
     uint64_t vtsc_kerncount; /* for hvm, counts all vtsc */
     uint64_t vtsc_usercount; /* not used for hvm */
 } __cacheline_aligned;
diff -r 494ad84ad38c xen/include/asm-x86/time.h
--- a/xen/include/asm-x86/time.h        Mon Nov 09 22:41:23 2009 +0000
+++ b/xen/include/asm-x86/time.h        Thu Nov 12 16:48:53 2009 -0700
@@ -3,6 +3,24 @@
 #define __X86_TIME_H__
 
 #include <asm/msr.h>
+
+/*
+ *  PV TSC emulation modes:
+ *    0 = guest rdtsc/p executed natively when monotonicity can be guaranteed
+ *         and emulated otherwise (with frequency scaled if necessary)
+ *    1 = guest rdtsc/p always emulated at 1GHz (kernel and user)
+ *    2 = guest rdtsc always executed natively (no monotonicity/frequency
+ *         guarantees); guest rdtscp emulated at native frequency if
+ *         unsupported by h/w, else executed natively
+ *    3 = same as 2, except xen manages TSC_AUX register so guest can
+ *         determine when a restore/migration has occurred and assumes
+ *         guest obtains/uses pvclock-like mechanism to adjust for
+ *         monotonicity and frequency changes
+ */
+#define TSC_MODE_DEFAULT          0
+#define TSC_MODE_ALWAYS_EMULATE   1
+#define TSC_MODE_NEVER_EMULATE    2
+#define TSC_MODE_PVRDTSCP         3
 
 void calibrate_tsc_bp(void);
 void calibrate_tsc_ap(void);
@@ -43,6 +61,13 @@ uint64_t ns_to_acpi_pm_tick(uint64_t ns)
 
 void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs);
 
+void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
+                  uint32_t gtsc_khz, uint32_t incarnation);
+   
+void tsc_get_info(struct domain *d, uint32_t *tsc_mode, uint64_t *elapsed_nsec,
+                  uint32_t *gtsc_khz, uint32_t *incarnation);
+   
+
 void force_update_vcpu_system_time(struct vcpu *v);
 
 #endif /* __X86_TIME_H__ */
diff -r 494ad84ad38c xen/include/public/domctl.h
--- a/xen/include/public/domctl.h       Mon Nov 09 22:41:23 2009 +0000
+++ b/xen/include/public/domctl.h       Thu Nov 12 16:48:53 2009 -0700
@@ -656,6 +656,22 @@ typedef struct xen_domctl_disable_migrat
 } xen_domctl_disable_migrate_t;
 
 
+#define XEN_DOMCTL_gettscinfo    59
+#define XEN_DOMCTL_settscinfo    60
+struct xen_guest_tsc_info {       /* per-domain TSC state carried by get/settscinfo */
+    uint32_t tsc_mode;            /* one of the TSC_MODE_* values (0..3) */
+    uint32_t gtsc_khz;            /* TSC rate in kHz; 1000000 when emulated at 1GHz */
+    uint32_t incarnation;         /* save/restore/migrate counter */
+    uint32_t pad;                 /* not referenced by this patch; presumably alignment padding -- confirm */
+    uint64_t elapsed_nsec;        /* nanoseconds, reported as get_s_time() - vtsc_offset */
+};
+typedef struct xen_guest_tsc_info xen_guest_tsc_info_t;
+DEFINE_XEN_GUEST_HANDLE(xen_guest_tsc_info_t);
+typedef struct xen_domctl_tsc_info {
+    XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT: gettscinfo copies one xen_guest_tsc_info_t here */
+    xen_guest_tsc_info_t info; /* IN: values applied by settscinfo */
+} xen_domctl_tsc_info_t;
+
 #define XEN_DOMCTL_gdbsx_guestmemio     1000 /* guest mem io */
 struct xen_domctl_gdbsx_memio {
     uint64_aligned_t pgd3val;/* optional: init_mm.pgd[3] value */
@@ -707,6 +723,7 @@ struct xen_domctl {
         struct xen_domctl_settimeoffset     settimeoffset;
         struct xen_domctl_set_tsc_native    set_tsc_native;
         struct xen_domctl_disable_migrate   disable_migrate;
+        struct xen_domctl_tsc_info          tsc_info;
         struct xen_domctl_real_mode_area    real_mode_area;
         struct xen_domctl_hvmcontext        hvmcontext;
         struct xen_domctl_hvmcontext_partial hvmcontext_partial;



Attachment: tscmode-draft.patch
Description: Binary data

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.