[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] x86/vMCE: save/restore MCA capabilities



# HG changeset patch
# User Jan Beulich <jbeulich@xxxxxxxx>
# Date 1330070874 -3600
# Node ID 35d855e520038bf5814f53d105dc4adf58a670f2
# Parent  e80b0bb4470b944a5b52a91c0ec85a1d65d18c55
x86/vMCE: save/restore MCA capabilities

This allows migration to a host with less MCA banks than the source
host had, while without this patch accesses to the excess banks' MSRs
caused #GP-s in the guest after migration (and it depended on the guest
kernel whether this would be fatal).

A fundamental question is whether we should also save/restore MCG_CTL
and MCi_CTL, as the HVM save record would better be defined to the
complete state that needs saving from the beginning (I'm unsure whether
the save/restore logic allows for future extension of an existing
record).

Of course, this change is expected to make migration from new to older
Xen impossible (again I'm unsure what the save/restore logic does with
records it doesn't even know about).

The (trivial) tools side change may seem unrelated, but the code should
have been that way from the beginning to allow the hypervisor to look
at currently unused ext_vcpucontext fields without risking to read
garbage when those fields get a meaning assigned in the future. This
isn't being enforced here - should it be? (Obviously, for backwards
compatibility, the hypervisor must assume these fields to be clear only
when the extended context's size exceeds the old original one.)

A future addition to this change might be to allow configuration of the
number of banks and other MCA capabilities for a guest before it starts
(i.e. to not inherits the values seen on the first host it runs on).

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
Acked-by: Keir Fraser <keir@xxxxxxx>
---


diff -r e80b0bb4470b -r 35d855e52003 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c      Fri Feb 24 09:03:43 2012 +0100
+++ b/tools/libxc/xc_domain_save.c      Fri Feb 24 09:07:54 2012 +0100
@@ -1879,6 +1879,7 @@
 
         domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
         domctl.domain = dom;
+        memset(&domctl.u, 0, sizeof(domctl.u));
         domctl.u.ext_vcpucontext.vcpu = i;
         if ( xc_domctl(xch, &domctl) < 0 )
         {
diff -r e80b0bb4470b -r 35d855e52003 tools/misc/xen-hvmctx.c
--- a/tools/misc/xen-hvmctx.c   Fri Feb 24 09:03:43 2012 +0100
+++ b/tools/misc/xen-hvmctx.c   Fri Feb 24 09:07:54 2012 +0100
@@ -383,6 +383,13 @@
            (unsigned long long) p.apic_assist);           
 }
 
+static void dump_vmce_vcpu(void)
+{
+    HVM_SAVE_TYPE(VMCE_VCPU) p;
+    READ(p);
+    printf("    VMCE_VCPU: caps %" PRIx64 "\n", p.caps);
+}
+
 int main(int argc, char **argv)
 {
     int entry, domid;
@@ -449,6 +456,7 @@
         case HVM_SAVE_CODE(MTRR): dump_mtrr(); break;
         case HVM_SAVE_CODE(VIRIDIAN_DOMAIN): dump_viridian_domain(); break;
         case HVM_SAVE_CODE(VIRIDIAN_VCPU): dump_viridian_vcpu(); break;
+        case HVM_SAVE_CODE(VMCE_VCPU): dump_vmce_vcpu(); break;
         case HVM_SAVE_CODE(END): break;
         default:
             printf(" ** Don't understand type %u: skipping\n",
diff -r e80b0bb4470b -r 35d855e52003 xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h     Fri Feb 24 09:03:43 2012 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce.h     Fri Feb 24 09:07:54 2012 +0100
@@ -3,6 +3,7 @@
 #define _MCE_H
 
 #include <xen/init.h>
+#include <xen/sched.h>
 #include <xen/smp.h>
 #include <asm/types.h>
 #include <asm/traps.h>
@@ -54,8 +55,8 @@
 u64 mce_cap_init(void);
 extern unsigned int firstbank;
 
-int intel_mce_rdmsr(uint32_t msr, uint64_t *val);
-int intel_mce_wrmsr(uint32_t msr, uint64_t val);
+int intel_mce_rdmsr(const struct vcpu *, uint32_t msr, uint64_t *val);
+int intel_mce_wrmsr(struct vcpu *, uint32_t msr, uint64_t val);
 
 struct mcinfo_extended *intel_get_extended_msrs(
     struct mcinfo_global *mig, struct mc_info *mi);
@@ -171,18 +172,20 @@
 
 extern int vmce_init(struct cpuinfo_x86 *c);
 
-static inline int mce_vendor_bank_msr(uint32_t msr)
+static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr)
 {
     if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
-         msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks) )
+         msr >= MSR_IA32_MC0_CTL2 &&
+         msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) )
           return 1;
     return 0;
 }
 
-static inline int mce_bank_msr(uint32_t msr)
+static inline int mce_bank_msr(const struct vcpu *v, uint32_t msr)
 {
-    if ( (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(nr_mce_banks)) ||
-        mce_vendor_bank_msr(msr) )
+    if ( (msr >= MSR_IA32_MC0_CTL &&
+          msr < MSR_IA32_MCx_CTL(v->arch.mcg_cap & MCG_CAP_COUNT)) ||
+         mce_vendor_bank_msr(v, msr) )
         return 1;
     return 0;
 }
diff -r e80b0bb4470b -r 35d855e52003 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Fri Feb 24 09:03:43 2012 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Fri Feb 24 09:07:54 2012 +0100
@@ -1421,11 +1421,12 @@
 }
 
 /* intel specific MCA MSR */
-int intel_mce_wrmsr(uint32_t msr, uint64_t val)
+int intel_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
 {
     int ret = 0;
 
-    if (msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks))
+    if ( msr >= MSR_IA32_MC0_CTL2 &&
+         msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) )
     {
         mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
                  "Guest should not write this MSR!\n");
@@ -1435,11 +1436,12 @@
     return ret;
 }
 
-int intel_mce_rdmsr(uint32_t msr, uint64_t *val)
+int intel_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
 {
     int ret = 0;
 
-    if (msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks))
+    if ( msr >= MSR_IA32_MC0_CTL2 &&
+         msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) )
     {
         mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
                  "Guest should not read this MSR!\n");
diff -r e80b0bb4470b -r 35d855e52003 xen/arch/x86/cpu/mcheck/vmce.c
--- a/xen/arch/x86/cpu/mcheck/vmce.c    Fri Feb 24 09:03:43 2012 +0100
+++ b/xen/arch/x86/cpu/mcheck/vmce.c    Fri Feb 24 09:07:54 2012 +0100
@@ -10,6 +10,7 @@
 #include <xen/delay.h>
 #include <xen/smp.h>
 #include <xen/mm.h>
+#include <xen/hvm/save.h>
 #include <asm/processor.h>
 #include <public/sysctl.h>
 #include <asm/system.h>
@@ -42,7 +43,6 @@
            nr_mce_banks * sizeof(*dom_vmce(d)->mci_ctl));
 
     dom_vmce(d)->mcg_status = 0x0;
-    dom_vmce(d)->mcg_cap = g_mcg_cap;
     dom_vmce(d)->mcg_ctl = ~(uint64_t)0x0;
     dom_vmce(d)->nr_injection = 0;
 
@@ -61,21 +61,41 @@
     dom_vmce(d) = NULL;
 }
 
-static int bank_mce_rdmsr(struct domain *d, uint32_t msr, uint64_t *val)
+void vmce_init_vcpu(struct vcpu *v)
 {
-    int bank, ret = 1;
-    struct domain_mca_msrs *vmce = dom_vmce(d);
+    v->arch.mcg_cap = g_mcg_cap;
+}
+
+int vmce_restore_vcpu(struct vcpu *v, uint64_t caps)
+{
+    if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P )
+    {
+        dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities"
+                " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n",
+                is_hvm_vcpu(v) ? "HVM" : "PV", caps, v->domain->domain_id,
+                v->vcpu_id, g_mcg_cap & ~MCG_CAP_COUNT);
+        return -EPERM;
+    }
+
+    v->arch.mcg_cap = caps;
+    return 0;
+}
+
+static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
+{
+    int ret = 1;
+    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
+    struct domain_mca_msrs *vmce = dom_vmce(v->domain);
     struct bank_entry *entry;
 
-    bank = (msr - MSR_IA32_MC0_CTL) / 4;
-    if ( bank >= nr_mce_banks )
-        return -1;
+    *val = 0;
 
     switch ( msr & (MSR_IA32_MC0_CTL | 3) )
     {
     case MSR_IA32_MC0_CTL:
-        *val = vmce->mci_ctl[bank] &
-            (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
+        if ( bank < nr_mce_banks )
+            *val = vmce->mci_ctl[bank] &
+                   (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
         mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
                    bank, *val);
         break;
@@ -126,7 +146,7 @@
         switch ( boot_cpu_data.x86_vendor )
         {
         case X86_VENDOR_INTEL:
-            ret = intel_mce_rdmsr(msr, val);
+            ret = intel_mce_rdmsr(v, msr, val);
             break;
         default:
             ret = 0;
@@ -145,13 +165,13 @@
  */
 int vmce_rdmsr(uint32_t msr, uint64_t *val)
 {
-    struct domain *d = current->domain;
-    struct domain_mca_msrs *vmce = dom_vmce(d);
+    const struct vcpu *cur = current;
+    struct domain_mca_msrs *vmce = dom_vmce(cur->domain);
     int ret = 1;
 
     *val = 0;
 
-    spin_lock(&dom_vmce(d)->lock);
+    spin_lock(&vmce->lock);
 
     switch ( msr )
     {
@@ -162,39 +182,38 @@
                        "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
         break;
     case MSR_IA32_MCG_CAP:
-        *val = vmce->mcg_cap;
+        *val = cur->arch.mcg_cap;
         mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
                    *val);
         break;
     case MSR_IA32_MCG_CTL:
         /* Always 0 if no CTL support */
-        *val = vmce->mcg_ctl & h_mcg_ctl;
+        if ( cur->arch.mcg_cap & MCG_CTL_P )
+            *val = vmce->mcg_ctl & h_mcg_ctl;
         mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n",
                    *val);
         break;
     default:
-        ret = mce_bank_msr(msr) ? bank_mce_rdmsr(d, msr, val) : 0;
+        ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0;
         break;
     }
 
-    spin_unlock(&dom_vmce(d)->lock);
+    spin_unlock(&vmce->lock);
     return ret;
 }
 
-static int bank_mce_wrmsr(struct domain *d, u32 msr, u64 val)
+static int bank_mce_wrmsr(struct vcpu *v, u32 msr, u64 val)
 {
-    int bank, ret = 1;
-    struct domain_mca_msrs *vmce = dom_vmce(d);
+    int ret = 1;
+    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
+    struct domain_mca_msrs *vmce = dom_vmce(v->domain);
     struct bank_entry *entry = NULL;
 
-    bank = (msr - MSR_IA32_MC0_CTL) / 4;
-    if ( bank >= nr_mce_banks )
-        return -EINVAL;
-
     switch ( msr & (MSR_IA32_MC0_CTL | 3) )
     {
     case MSR_IA32_MC0_CTL:
-        vmce->mci_ctl[bank] = val;
+        if ( bank < nr_mce_banks )
+            vmce->mci_ctl[bank] = val;
         break;
     case MSR_IA32_MC0_STATUS:
         /* Give the first entry of the list, it corresponds to current
@@ -202,9 +221,9 @@
          * the guest, this node will be deleted.
          * Only error bank is written. Non-error banks simply return.
          */
-        if ( !list_empty(&dom_vmce(d)->impact_header) )
+        if ( !list_empty(&vmce->impact_header) )
         {
-            entry = list_entry(dom_vmce(d)->impact_header.next,
+            entry = list_entry(vmce->impact_header.next,
                                struct bank_entry, list);
             if ( entry->bank == bank )
                 entry->mci_status = val;
@@ -228,7 +247,7 @@
         switch ( boot_cpu_data.x86_vendor )
         {
         case X86_VENDOR_INTEL:
-            ret = intel_mce_wrmsr(msr, val);
+            ret = intel_mce_wrmsr(v, msr, val);
             break;
         default:
             ret = 0;
@@ -247,9 +266,9 @@
  */
 int vmce_wrmsr(u32 msr, u64 val)
 {
-    struct domain *d = current->domain;
+    struct vcpu *cur = current;
     struct bank_entry *entry = NULL;
-    struct domain_mca_msrs *vmce = dom_vmce(d);
+    struct domain_mca_msrs *vmce = dom_vmce(cur->domain);
     int ret = 1;
 
     if ( !g_mcg_cap )
@@ -266,7 +285,7 @@
         vmce->mcg_status = val;
         mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
         /* For HVM guest, this is the point for deleting vMCE injection node */
-        if ( d->is_hvm && (vmce->nr_injection > 0) )
+        if ( is_hvm_vcpu(cur) && (vmce->nr_injection > 0) )
         {
             vmce->nr_injection--; /* Should be 0 */
             if ( !list_empty(&vmce->impact_header) )
@@ -293,7 +312,7 @@
         ret = -1;
         break;
     default:
-        ret = mce_bank_msr(msr) ? bank_mce_wrmsr(d, msr, val) : 0;
+        ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
         break;
     }
 
@@ -301,6 +320,46 @@
     return ret;
 }
 
+static int vmce_save_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
+{
+    struct vcpu *v;
+    int err = 0;
+
+    for_each_vcpu( d, v ) {
+        struct hvm_vmce_vcpu ctxt = {
+            .caps = v->arch.mcg_cap
+        };
+
+        err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
+        if ( err )
+            break;
+    }
+
+    return err;
+}
+
+static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
+{
+    unsigned int vcpuid = hvm_load_instance(h);
+    struct vcpu *v;
+    struct hvm_vmce_vcpu ctxt;
+    int err;
+
+    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
+    {
+        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
+                d->domain_id, vcpuid);
+        err = -EINVAL;
+    }
+    else
+        err = hvm_load_entry(VMCE_VCPU, h, &ctxt);
+
+    return err ?: vmce_restore_vcpu(v, ctxt.caps);
+}
+
+HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt,
+                          vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);
+
 int inject_vmce(struct domain *d)
 {
     int cpu = smp_processor_id();
diff -r e80b0bb4470b -r 35d855e52003 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Fri Feb 24 09:03:43 2012 +0100
+++ b/xen/arch/x86/domain.c     Fri Feb 24 09:07:54 2012 +0100
@@ -422,6 +422,8 @@
     if ( (rc = vcpu_init_fpu(v)) != 0 )
         return rc;
 
+    vmce_init_vcpu(v);
+
     if ( is_hvm_domain(d) )
     {
         rc = hvm_vcpu_initialise(v);
diff -r e80b0bb4470b -r 35d855e52003 xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c     Fri Feb 24 09:03:43 2012 +0100
+++ b/xen/arch/x86/domctl.c     Fri Feb 24 09:07:54 2012 +0100
@@ -1029,11 +1029,12 @@
                 evc->syscall32_callback_eip    = 0;
                 evc->syscall32_disables_events = 0;
             }
+            evc->mcg_cap = v->arch.mcg_cap;
         }
         else
         {
             ret = -EINVAL;
-            if ( evc->size != sizeof(*evc) )
+            if ( evc->size < offsetof(typeof(*evc), mcg_cap) )
                 goto ext_vcpucontext_out;
 #ifdef __x86_64__
             if ( !is_hvm_domain(d) )
@@ -1061,6 +1062,10 @@
                  (evc->syscall32_callback_cs & ~3) ||
                  evc->syscall32_callback_eip )
                 goto ext_vcpucontext_out;
+
+            if ( evc->size >= offsetof(typeof(*evc), mcg_cap) +
+                              sizeof(evc->mcg_cap) )
+                ret = vmce_restore_vcpu(v, evc->mcg_cap);
         }
 
         ret = 0;
diff -r e80b0bb4470b -r 35d855e52003 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Fri Feb 24 09:03:43 2012 +0100
+++ b/xen/include/asm-x86/domain.h      Fri Feb 24 09:07:54 2012 +0100
@@ -488,6 +488,8 @@
     /* This variable determines whether nonlazy extended state has been used,
      * and thus should be saved/restored. */
     bool_t nonlazy_xstate_used;
+
+    uint64_t mcg_cap;
     
     struct paging_vcpu paging;
 
diff -r e80b0bb4470b -r 35d855e52003 xen/include/asm-x86/mce.h
--- a/xen/include/asm-x86/mce.h Fri Feb 24 09:03:43 2012 +0100
+++ b/xen/include/asm-x86/mce.h Fri Feb 24 09:07:54 2012 +0100
@@ -16,7 +16,6 @@
 struct domain_mca_msrs
 {
     /* Guest should not change below values after DOM boot up */
-    uint64_t mcg_cap;
     uint64_t mcg_ctl;
     uint64_t mcg_status;
     uint64_t *mci_ctl;
@@ -28,6 +27,8 @@
 /* Guest vMCE MSRs virtualization */
 extern int vmce_init_msr(struct domain *d);
 extern void vmce_destroy_msr(struct domain *d);
+extern void vmce_init_vcpu(struct vcpu *);
+extern int vmce_restore_vcpu(struct vcpu *, uint64_t caps);
 extern int vmce_wrmsr(uint32_t msr, uint64_t val);
 extern int vmce_rdmsr(uint32_t msr, uint64_t *val);
 
diff -r e80b0bb4470b -r 35d855e52003 xen/include/public/arch-x86/hvm/save.h
--- a/xen/include/public/arch-x86/hvm/save.h    Fri Feb 24 09:03:43 2012 +0100
+++ b/xen/include/public/arch-x86/hvm/save.h    Fri Feb 24 09:07:54 2012 +0100
@@ -575,9 +575,15 @@
 
 DECLARE_HVM_SAVE_TYPE(VIRIDIAN_VCPU, 17, struct hvm_viridian_vcpu_context);
 
+struct hvm_vmce_vcpu {
+    uint64_t caps;
+};
+
+DECLARE_HVM_SAVE_TYPE(VMCE_VCPU, 18, struct hvm_vmce_vcpu);
+
 /* 
  * Largest type-code in use
  */
-#define HVM_SAVE_CODE_MAX 17
+#define HVM_SAVE_CODE_MAX 18
 
 #endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */
diff -r e80b0bb4470b -r 35d855e52003 xen/include/public/domctl.h
--- a/xen/include/public/domctl.h       Fri Feb 24 09:03:43 2012 +0100
+++ b/xen/include/public/domctl.h       Fri Feb 24 09:07:54 2012 +0100
@@ -559,7 +559,7 @@
     uint32_t         vcpu;
     /*
      * SET: Size of struct (IN)
-     * GET: Size of struct (OUT)
+     * GET: Size of struct (OUT, up to 128 bytes)
      */
     uint32_t         size;
 #if defined(__i386__) || defined(__x86_64__)
@@ -571,6 +571,7 @@
     uint16_t         sysenter_callback_cs;
     uint8_t          syscall32_disables_events;
     uint8_t          sysenter_disables_events;
+    uint64_aligned_t mcg_cap;
 #endif
 };
 typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.