[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] x86: Control CR0 TS behavior using dev_na_ts_allowed



The device not available trap handler in the mainline Linux kernel
has not been PVABI compliant since 2.6.26, leading to FPU register
corruption under extremely rare circumstances.  While this should and
hopefully will be fixed, the performance of the fix vs. the original
behavior may lead to one or the other being desirable under different
workloads.

Add an option, dev_na_ts_allowed, which on a per dom0/U basis breaks
PVABI by keeping CR0 TS set during the trap device not available.
PVABI compatibility is advertised using CPUID such that the OS can
choose its behavior accordingly.

This option is disabled by default.

Reported-by: Zhu Yanhai <zhu.yanhai@xxxxxxxxx>
Signed-off-by: Sarah Newman <srn@xxxxxxxxx>
---
 docs/man/xl.cfg.pod.5                   |   20 ++++++++++++++++++++
 tools/libxc/xc_dom_x86.c                |    8 ++++++++
 tools/libxc/xenctrl.h                   |    1 +
 tools/libxl/libxl_create.c              |    1 +
 tools/libxl/libxl_types.idl             |    1 +
 tools/libxl/libxl_x86.c                 |    4 ++++
 tools/libxl/xl_cmdimpl.c                |    1 +
 tools/libxl/xl_sxp.c                    |    2 ++
 tools/python/xen/lowlevel/xc/xc.c       |   21 +++++++++++++++++++++
 tools/python/xen/xend/XendConfig.py     |    4 ++++
 tools/python/xen/xend/XendDomainInfo.py |    4 ++++
 tools/python/xen/xm/create.py           |   11 ++++++++++-
 tools/python/xen/xm/xenapi_create.py    |    1 +
 xen/arch/x86/domain_build.c             |    6 ++++++
 xen/arch/x86/domctl.c                   |    6 ++++++
 xen/arch/x86/traps.c                    |   15 ++++++++++++---
 xen/include/asm-x86/domain.h            |    2 ++
 xen/include/public/arch-x86/cpuid.h     |    5 +++++
 xen/include/public/domctl.h             |   11 +++++++++++
 19 files changed, 120 insertions(+), 4 deletions(-)

diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
index a6663b9..4fc3953 100644
--- a/docs/man/xl.cfg.pod.5
+++ b/docs/man/xl.cfg.pod.5
@@ -696,6 +696,26 @@ anyway.
 =item B<pvh=BOOLEAN>
 Selects whether to run this PV guest in an HVM container. Default is 0.
 
+=item B<dev_na_ts_allowed=BOOLEAN>
+
+On the x86 platform, selects whether to break PVABI compatibility by 
+setting the task switch bit in control register 0 before entering 
+the device not available trap.
+
+Linux kernels derived from mainline between v2.6.26 through at least 3.14
+may in very rare circumstances have their FPU registers corrupted without
+this enabled.  
+
+Kernels that have the flag 'eagerfpu' in the flags in /proc/cpuinfo do not
+require this to be enabled, but it may be advisable to enable this anyway
+for performance reasons. On machines with the 'fxsr' flag (all x86 CPUs 
+starting from a Pentium 2) and a kernel version of v3.10 or newer 
+may also manually add 'eagerfpu=on' to their kernel command line 
+in order to avoid this bug.
+
+Leaving this on should not cause issues for any kernels derived from
+mainline.
+
 =back
 
 =head2 Fully-virtualised (HVM) Guest Specific Options
diff --git a/tools/libxc/xc_dom_x86.c b/tools/libxc/xc_dom_x86.c
index e034d62..83ebf7b 100644
--- a/tools/libxc/xc_dom_x86.c
+++ b/tools/libxc/xc_dom_x86.c
@@ -950,6 +950,14 @@ int xc_dom_feature_translated(struct xc_dom_image *dom)
     return elf_xen_feature_get(XENFEAT_auto_translated_physmap, dom->f_active);
 }
 
+int xc_domain_dev_na_ts_allowed(xc_interface *xch, uint32_t domid)
+{
+    DECLARE_DOMCTL;
+    domctl.cmd = XEN_DOMCTL_dev_na_ts_allowed;
+    domctl.domain = (domid_t)domid;
+    domctl.u.dev_na_ts_allowed.allowed = 1;
+    return do_domctl(xch, &domctl);
+}
 /*
  * Local variables:
  * mode: C
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index 13f816b..2947c00 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -1913,6 +1913,7 @@ int xc_cpuid_apply_policy(xc_interface *xch,
 void xc_cpuid_to_str(const unsigned int *regs,
                      char **strs); /* some strs[] may be NULL if ENOMEM */
 int xc_mca_op(xc_interface *xch, struct xen_mc *mc);
+int xc_domain_dev_na_ts_allowed(xc_interface *xch, uint32_t domid);
 #endif
 
 struct xc_px_val {
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index d015cf4..3a09cf6 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -341,6 +341,7 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
         break;
     case LIBXL_DOMAIN_TYPE_PV:
         libxl_defbool_setdefault(&b_info->u.pv.e820_host, false);
+        libxl_defbool_setdefault(&b_info->u.pv.dev_na_ts_allowed, false);
         if (b_info->shadow_memkb == LIBXL_MEMKB_DEFAULT)
             b_info->shadow_memkb = 0;
         if (b_info->u.pv.slack_memkb == LIBXL_MEMKB_DEFAULT)
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 612645c..17b1ba2 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -383,6 +383,7 @@ libxl_domain_build_info = Struct("domain_build_info",[
                                       ("features", string, {'const': True}),
                                       # Use host's E820 for PCI passthrough.
                                       ("e820_host", libxl_defbool),
+                                      ("dev_na_ts_allowed", libxl_defbool),
                                       ])),
                  ("invalid", None),
                  ], keyvar_init_val = "LIBXL_DOMAIN_TYPE_INVALID")),
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index b11d036..6032d99 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -308,6 +308,10 @@ int libxl__arch_domain_create(libxl__gc *gc, 
libxl_domain_config *d_config,
         }
     }
 
+    if (d_config->c_info.type == LIBXL_DOMAIN_TYPE_PV &&
+                libxl_defbool_val(d_config->b_info.u.pv.dev_na_ts_allowed))
+        xc_domain_dev_na_ts_allowed(ctx->xch, domid);
+
     return ret;
 }
 
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index 6b1ebfa..0016512 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -1481,6 +1481,7 @@ skip_vfb:
      * after guest starts is done (with PCI devices passed in). */
     if (c_info->type == LIBXL_DOMAIN_TYPE_PV) {
         xlu_cfg_get_defbool(config, "e820_host", &b_info->u.pv.e820_host, 0);
+        xlu_cfg_get_defbool(config, "dev_na_ts_allowed", 
&b_info->u.pv.dev_na_ts_allowed, 0);
     }
 
     if (!xlu_cfg_get_list (config, "pci", &pcis, 0, 0)) {
diff --git a/tools/libxl/xl_sxp.c b/tools/libxl/xl_sxp.c
index a16a025..a673614 100644
--- a/tools/libxl/xl_sxp.c
+++ b/tools/libxl/xl_sxp.c
@@ -152,6 +152,8 @@ void printf_info_sexp(int domid, libxl_domain_config 
*d_config)
         printf("\t\t\t(ramdisk %s)\n", b_info->u.pv.ramdisk);
         printf("\t\t\t(e820_host %s)\n",
                libxl_defbool_to_string(b_info->u.pv.e820_host));
+        printf("\t\t\t(dev_na_ts_allowed %s)\n",
+               libxl_defbool_to_string(b_info->u.pv.dev_na_ts_allowed));
         printf("\t\t)\n");
         break;
     default:
diff --git a/tools/python/xen/lowlevel/xc/xc.c 
b/tools/python/xen/lowlevel/xc/xc.c
index 737bdac..afeaea4 100644
--- a/tools/python/xen/lowlevel/xc/xc.c
+++ b/tools/python/xen/lowlevel/xc/xc.c
@@ -1779,6 +1779,20 @@ static PyObject *pyxc_domain_disable_migrate(XcObject 
*self, PyObject *args)
     return zero;
 }
 
+static PyObject *pyxc_domain_dev_na_ts_allowed(XcObject *self, PyObject *args)
+{
+    uint32_t dom;
+
+    if (!PyArg_ParseTuple(args, "i", &dom))
+        return NULL;
+
+    if (xc_domain_dev_na_ts_allowed(self->xc_handle, dom) != 0)
+        return pyxc_error_to_exception(self->xc_handle);
+
+    Py_INCREF(zero);
+    return zero;
+}
+
 static PyObject *pyxc_domain_send_trigger(XcObject *self,
                                           PyObject *args,
                                           PyObject *kwds)
@@ -2731,6 +2745,13 @@ static PyMethodDef pyxc_methods[] = {
       " dom        [int]: Domain whose TSC mode is being set.\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
+    { "domain_dev_na_ts_allowed",
+      (PyCFunction)pyxc_domain_dev_na_ts_allowed,
+      METH_VARARGS, "\n"
+      "Marks domain as needing task switching enabled during x86 device na 
trap\n"
+      " dom        [int]: Identifier of domain.\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
     { "domain_send_trigger",
       (PyCFunction)pyxc_domain_send_trigger,
       METH_VARARGS | METH_KEYWORDS, "\n"
diff --git a/tools/python/xen/xend/XendConfig.py 
b/tools/python/xen/xend/XendConfig.py
index 4a226a7..4e5c880 100644
--- a/tools/python/xen/xend/XendConfig.py
+++ b/tools/python/xen/xend/XendConfig.py
@@ -158,6 +158,7 @@ XENAPI_PLATFORM_CFG_TYPES = {
     'monitor_path': str,
     'nographic': int,
     'nomigrate': int,
+    'dev_na_ts_allowed' : int,
     'pae' : int,
     'rtc_timeoffset': int,
     'parallel': str,
@@ -511,6 +512,9 @@ class XendConfig(dict):
         if 'nomigrate' not in self['platform']:
             self['platform']['nomigrate'] = 0
 
+        if 'dev_na_ts_allowed' not in self['platform']:
+            self['platform']['dev_na_ts_allowed'] = 0
+
         if self.is_hvm():
             if 'timer_mode' not in self['platform']:
                 self['platform']['timer_mode'] = 1
diff --git a/tools/python/xen/xend/XendDomainInfo.py 
b/tools/python/xen/xend/XendDomainInfo.py
index 8d4ff5c..2646291 100644
--- a/tools/python/xen/xend/XendDomainInfo.py
+++ b/tools/python/xen/xend/XendDomainInfo.py
@@ -2606,6 +2606,10 @@ class XendDomainInfo:
         if nomigrate is not None and long(nomigrate) != 0:
             xc.domain_disable_migrate(self.domid)
 
+        dev_na_ts_allowed = self.info["platform"].get("dev_na_ts_allowed")
+        if arch.type == "x86" and not hvm and long(dev_na_ts_allowed) != 0:
+            xc.domain_dev_na_ts_allowed(self.domid)
+
         # Optionally enable virtual HPET
         hpet = self.info["platform"].get("hpet")
         if hvm and hpet is not None:
diff --git a/tools/python/xen/xm/create.py b/tools/python/xen/xm/create.py
index 22841aa..451f7bf 100644
--- a/tools/python/xen/xm/create.py
+++ b/tools/python/xen/xm/create.py
@@ -233,6 +233,12 @@ gopts.var('nomigrate', val='NOMIGRATE',
           fn=set_int, default=0,
           use="""migratability (0=migration enabled, 1=migration disabled).""")
 
+gopts.var('dev_na_ts_allowed', val='DEV_NA_TS_ALLOWED',
+          fn=set_int, default=0,
+          use="""Enable task switch during device not available trap
+          (Default is 0).
+          Recommended for x86 Linux kernels derived from mainline from v2.6.26 
on.""")
+
 gopts.var('vpt_align', val='VPT_ALIGN',
           fn=set_int, default=1,
           use="Enable aligning all periodic vpt to reduce timer interrupts.")
@@ -776,6 +782,9 @@ def configure_image(vals):
     if vals.nomigrate is not None:
         config_image.append(['nomigrate', vals.nomigrate])
 
+    if vals.dev_na_ts_allowed is not None:
+        config_image.append(['dev_na_ts_allowed', vals.dev_na_ts_allowed])
+
     return config_image
     
 def configure_disks(config_devs, vals):
@@ -1094,7 +1103,7 @@ def make_config(vals):
                    'on_reboot', 'on_crash', 'features', 'on_xend_start',
                    'on_xend_stop', 'target', 'cpuid', 'cpuid_check',
                    'machine_address_size', 'suppress_spurious_page_faults',
-                   'description'])
+                   'description', 'dev_na_ts_allowed', ])
 
     vcpu_conf()
     if vals.uuid is not None:
diff --git a/tools/python/xen/xm/xenapi_create.py 
b/tools/python/xen/xm/xenapi_create.py
index 346ff20..dd30b41 100644
--- a/tools/python/xen/xm/xenapi_create.py
+++ b/tools/python/xen/xm/xenapi_create.py
@@ -1074,6 +1074,7 @@ class sxp2xml:
             'xen_platform_pci',
             'tsc_mode'
             'description',
+            'dev_na_ts_allowed',
             'nomigrate'
         ]
 
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 84ce392..0e8186f 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -126,6 +126,9 @@ boolean_param("dom0_shadow", opt_dom0_shadow);
 static char __initdata opt_dom0_ioports_disable[200] = "";
 string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
 
+static bool_t __initdata opt_dom0_dev_na_ts_allowed;
+boolean_param("dev_na_ts_allowed", opt_dom0_dev_na_ts_allowed);
+
 /* Allow ring-3 access in long mode as guest cannot use ring 1 ... */
 #define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
 #define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL)
@@ -1061,6 +1064,9 @@ int __init construct_dom0(
         if ( paging_enable(d, PG_SH_enable) == 0 ) 
             paging_update_paging_modes(v);
 
+    if ( opt_dom0_dev_na_ts_allowed )
+        d->arch.pv_domain.dev_na_ts_allowed = 1;
+
     if ( supervisor_mode_kernel )
     {
         v->arch.pv_vcpu.kernel_ss &= ~3;
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 26635ff..0ad1708 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1256,6 +1256,12 @@ long arch_do_domctl(
     }
     break;
 
+    case XEN_DOMCTL_dev_na_ts_allowed:
+    {
+        d->arch.pv_domain.dev_na_ts_allowed = 
domctl->u.dev_na_ts_allowed.allowed;
+    }
+    break;
+
     default:
         ret = iommu_do_domctl(domctl, d, u_domctl);
         break;
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index ed4ae2d..cfe7c8d 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -713,8 +713,12 @@ int cpuid_hypervisor_leaves( uint32_t idx, uint32_t 
sub_idx,
             *ebx = 0x40000200;
         *ecx = 0;          /* Features 1 */
         *edx = 0;          /* Features 2 */
-        if ( is_pv_vcpu(current) )
+        if ( is_pv_vcpu(current) ) {
             *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
+
+            if (current->domain->arch.pv_domain.dev_na_ts_allowed)
+                *ecx |= XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED;
+        }
         break;
 
     case 3:
@@ -3269,8 +3273,13 @@ void do_device_not_available(struct cpu_user_regs *regs)
 
     if ( curr->arch.pv_vcpu.ctrlreg[0] & X86_CR0_TS )
     {
-        do_guest_trap(TRAP_no_device, regs, 0);
-        curr->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
+        if (!current->domain->arch.pv_domain.dev_na_ts_allowed) {
+            do_guest_trap(TRAP_no_device, regs, 0);
+            curr->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
+        } else {
+            stts();
+            do_guest_trap(TRAP_no_device, regs, 0);
+        }
     }
     else
         TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 4ff89f0..75d47c9 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -232,6 +232,8 @@ struct pv_domain
      * unmask the event channel */
     bool_t auto_unmask;
 
+    bool_t dev_na_ts_allowed;
+
     /* map_domain_page() mapping cache. */
     struct mapcache_domain mapcache;
 };
diff --git a/xen/include/public/arch-x86/cpuid.h 
b/xen/include/public/arch-x86/cpuid.h
index d9bd627..3a165ee 100644
--- a/xen/include/public/arch-x86/cpuid.h
+++ b/xen/include/public/arch-x86/cpuid.h
@@ -65,4 +65,9 @@
 #define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
 #define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD  (1u<<0)
 
+/* Will the host not automatically clear CR0.TS after exiting ? */
+#define _XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED 1
+#define XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED \
+    (1u<<_XEN_CPUID_FEAT1_DEV_NA_TS_ALLOWED)
+
 #endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index f22fe2e..95b83fd 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -896,6 +896,13 @@ struct xen_domctl_cacheflush {
 typedef struct xen_domctl_cacheflush xen_domctl_cacheflush_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_cacheflush_t);
 
+#if defined(__i386__) || defined(__x86_64__)
+/* XEN_DOMCTL_dev_na_ts_allowed */
+typedef struct xen_domctl_dev_na_ts_allowed {
+    uint32_t allowed; /* IN: 1: allow task switch during device NA trap */
+} xen_domctl_dev_na_ts_allowed_t;
+#endif
+
 struct xen_domctl {
     uint32_t cmd;
 #define XEN_DOMCTL_createdomain                   1
@@ -966,6 +973,7 @@ struct xen_domctl {
 #define XEN_DOMCTL_getnodeaffinity               69
 #define XEN_DOMCTL_set_max_evtchn                70
 #define XEN_DOMCTL_cacheflush                    71
+#define XEN_DOMCTL_dev_na_ts_allowed             72
 #define XEN_DOMCTL_gdbsx_guestmemio            1000
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
@@ -1025,6 +1033,9 @@ struct xen_domctl {
         struct xen_domctl_gdbsx_memio       gdbsx_guest_memio;
         struct xen_domctl_set_broken_page_p2m set_broken_page_p2m;
         struct xen_domctl_cacheflush        cacheflush;
+#if defined(__i386__) || defined(__x86_64__)
+        struct xen_domctl_dev_na_ts_allowed dev_na_ts_allowed;
+#endif
         struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
         struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
         uint8_t                             pad[128];
-- 
1.7.9.5


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.