[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] x86 hvm mce: Support HVM Guest virtual MCA handling.



# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1246372839 -3600
# Node ID 7bbbc57163d58c27f1e3883b20d09c72d04351ab
# Parent  00502df38143d6c26a6db43f9329634cdef76f3e
x86 hvm mce: Support HVM Guest virtual MCA handling.

When an MCE# happens, if the error has been contained/recovered by Xen
and it impacts one guest domain (Dom0, an HVM guest, or a PV guest), we
inject the corresponding vMCE# into the impacted domain. The guest OS
will then carry out its own recovery job if it has an MCA handler.

Signed-off-by: Liping Ke <liping.ke@xxxxxxxxx>
Signed-off-by: Yunhong Jiang <yunhong.jiang@xxxxxxxxx>
---
 xen/arch/x86/cpu/mcheck/mce_intel.c |  157 ++++++++++++++++++++++++++++--------
 xen/arch/x86/cpu/mcheck/mctelem.c   |    4 
 xen/arch/x86/cpu/mcheck/mctelem.h   |    2 
 xen/arch/x86/hvm/hvm.c              |   37 +++++---
 xen/arch/x86/hvm/irq.c              |    7 +
 xen/arch/x86/hvm/vmx/intr.c         |    4 
 xen/arch/x86/x86_64/traps.c         |    9 +-
 xen/include/asm-x86/domain.h        |    1 
 xen/include/asm-x86/hvm/hvm.h       |    4 
 9 files changed, 169 insertions(+), 56 deletions(-)

diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Jun 30 15:40:39 2009 +0100
@@ -10,6 +10,7 @@
 #include <public/sysctl.h>
 #include <asm/system.h>
 #include <asm/msr.h>
+#include <asm/p2m.h>
 #include "mce.h"
 #include "x86_mca.h"
 
@@ -224,7 +225,7 @@ static struct bank_entry* alloc_bank_ent
       for vMCE# MSRs virtualization
 */
 
-static int fill_vmsr_data(int cpu, struct mcinfo_bank *mc_bank, 
+static int fill_vmsr_data(struct mcinfo_bank *mc_bank, 
         uint64_t gstatus) {
     struct domain *d;
     struct bank_entry *entry;
@@ -240,28 +241,89 @@ static int fill_vmsr_data(int cpu, struc
             return 0;
         }
 
+        /* For HVM guest, only when the first vMCE is consumed by the HVM guest
+         * successfully will we generate another node and inject another vMCE#.
+         */
+        if ( (d->is_hvm) && (d->arch.vmca_msrs.nr_injection > 0) )
+        {
+            printk(KERN_DEBUG "MCE: HVM guest has not handled previous"
+                        " vMCE yet!\n");
+            return -1;
+        }
         entry = alloc_bank_entry();
         if (entry == NULL)
-           return -1;
+            return -1;
+
         entry->mci_status = mc_bank->mc_status;
         entry->mci_addr = mc_bank->mc_addr;
         entry->mci_misc = mc_bank->mc_misc;
-        entry->cpu = cpu;
         entry->bank = mc_bank->mc_bank;
 
-       spin_lock(&d->arch.vmca_msrs.lock);
+        spin_lock(&d->arch.vmca_msrs.lock);
         /* New error Node, insert to the tail of the per_dom data */
         list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
         /* Fill MSR global status */
         d->arch.vmca_msrs.mcg_status = gstatus;
         /* New node impact the domain, need another vMCE# injection*/
         d->arch.vmca_msrs.nr_injection++;
-       spin_unlock(&d->arch.vmca_msrs.lock);
-
-        printk(KERN_DEBUG "MCE: Found error @[CPU%d BANK%d "
+        spin_unlock(&d->arch.vmca_msrs.lock);
+
+        printk(KERN_DEBUG "MCE: Found error @[BANK%d "
                 "status %"PRIx64" addr %"PRIx64" domid %d]\n ",
-                entry->cpu, mc_bank->mc_bank,
-                mc_bank->mc_status, mc_bank->mc_addr, mc_bank->mc_domid);
+                mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr,
+                mc_bank->mc_domid);
+    }
+    return 0;
+}
+
+static int inject_mce(struct domain *d)
+{
+    int cpu = smp_processor_id();
+    cpumask_t affinity;
+
+    /* PV guest and HVM guest have different vMCE# injection
+     * methods*/
+
+    if ( !test_and_set_bool(d->vcpu[0]->mce_pending) )
+    {
+        if (d->is_hvm)
+        {
+            printk(KERN_DEBUG "MCE: inject vMCE to HVM DOM %d\n", 
+                        d->domain_id);
+            vcpu_kick(d->vcpu[0]);
+        }
+        /* PV guest including DOM0 */
+        else
+        {
+            printk(KERN_DEBUG "MCE: inject vMCE to PV DOM%d\n", 
+                        d->domain_id);
+            if (guest_has_trap_callback
+                   (d, 0, TRAP_machine_check))
+            {
+                d->vcpu[0]->cpu_affinity_tmp =
+                        d->vcpu[0]->cpu_affinity;
+                cpus_clear(affinity);
+                cpu_set(cpu, affinity);
+                printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu,
+                            d->vcpu[0]->processor);
+                vcpu_set_affinity(d->vcpu[0], &affinity);
+                vcpu_kick(d->vcpu[0]);
+            }
+            else
+            {
+                printk(KERN_DEBUG "MCE: Kill PV guest with No MCE handler\n");
+                domain_crash(d);
+            }
+        }
+    }
+    else {
+        /* new vMCE comes while first one has not been injected yet,
+         * in this case, inject fail. [We can't lose this vMCE for
+         * the mce node's consistency].
+        */
+        printk(KERN_DEBUG "There's a pending vMCE waiting to be injected "
+                    " to this DOM%d!\n", d->domain_id);
+        return -1;
     }
     return 0;
 }
@@ -272,7 +334,7 @@ void intel_UCR_handler(struct mcinfo_ban
              struct mca_handle_result *result)
 {
     struct domain *d;
-    unsigned long mfn;
+    unsigned long mfn, gfn;
     uint32_t status;
 
     printk(KERN_DEBUG "MCE: Enter EWB UCR recovery action\n");
@@ -280,6 +342,7 @@ void intel_UCR_handler(struct mcinfo_ban
     if (bank->mc_addr != 0) {
          mfn = bank->mc_addr >> PAGE_SHIFT;
          if (!offline_page(mfn, 1, &status)) {
+              /* This is free page */
               if (status & PG_OFFLINE_OFFLINED)
                   result->result = MCA_RECOVERED;
               else if (status & PG_OFFLINE_PENDING) {
@@ -289,9 +352,35 @@ void intel_UCR_handler(struct mcinfo_ban
                       result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
                       printk(KERN_DEBUG "MCE: This error page is ownded"
                                   " by DOM %d\n", result->owner);
-                      if (result->owner != 0 && result->owner != DOMID_XEN) {
+                      /* Fill vMCE# injection and vMCE# MSR virtualization
+                       * related data */
+                      bank->mc_domid = result->owner;
+                      if ( result->owner != DOMID_XEN ) {
                           d = get_domain_by_id(result->owner);
-                          domain_crash(d);
+                          gfn =
+                              mfn_to_gmfn(d, ((bank->mc_addr) >> PAGE_SHIFT));
+                          bank->mc_addr =
+                              gfn << PAGE_SHIFT | (bank->mc_addr & PAGE_MASK);
+                          if (fill_vmsr_data(bank, global->mc_gstatus) == -1)
+                          {
+                              printk(KERN_DEBUG "Fill vMCE# data for DOM%d "
+                                      "failed\n", result->owner);
+                              domain_crash(d);
+                              return;
+                          }
+                          /* We will inject vMCE to DOMU*/
+                          if ( inject_mce(d) < 0 )
+                          {
+                              printk(KERN_DEBUG "inject vMCE to DOM%d"
+                                          " failed\n", d->domain_id);
+                              domain_crash(d);
+                              return;
+                          }
+                          /* Impacted domain go on with domain's recovery job
+                           * if the domain has its own MCA handler.
+                           * For xen, it has contained the error and finished
+                           * its own recovery job.
+                           */
                           result->result = MCA_RECOVERED;
                       }
                   }
@@ -309,7 +398,7 @@ struct mca_error_handler intel_recovery_
  * should be committed for dom0 consumption, 0 if it should be
  * dismissed.
  */
-static int mce_action(unsigned int cpu, mctelem_cookie_t mctc)
+static int mce_action(mctelem_cookie_t mctc)
 {
     struct mc_info *local_mi;
     uint32_t i;
@@ -335,9 +424,6 @@ static int mce_action(unsigned int cpu, 
             continue;
         }
         mc_bank = (struct mcinfo_bank*)mic;
-        /* Fill vMCE# injection and vMCE# MSR virtualization related data */
-        if (fill_vmsr_data(cpu, mc_bank, mc_global->mc_gstatus) == -1)
-             break;
 
         /* TODO: Add recovery actions here, such as page-offline, etc */
         memset(&mca_res, 0x0f, sizeof(mca_res));
@@ -386,7 +472,6 @@ static void mce_softirq(void)
 {
     int cpu = smp_processor_id();
     unsigned int workcpu;
-    cpumask_t affinity;
 
     printk(KERN_DEBUG "CPU%d enter softirq\n", cpu);
 
@@ -417,27 +502,13 @@ static void mce_softirq(void)
          * vMCE MSRs virtualization buffer
          */
         for_each_online_cpu(workcpu) {
-           mctelem_process_deferred(workcpu, mce_action);
+            mctelem_process_deferred(workcpu, mce_action);
         }
 
         /* Step2: Send Log to DOM0 through vIRQ */
         if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
             printk(KERN_DEBUG "MCE: send MCE# to DOM0 through virq\n");
             send_guest_global_virq(dom0, VIRQ_MCA);
-        }
-
-        /* Step3: Inject vMCE to impacted DOM. Currently we cares DOM0 only */
-        if (guest_has_trap_callback
-               (dom0, 0, TRAP_machine_check) &&
-                 !test_and_set_bool(dom0->vcpu[0]->mce_pending)) {
-            dom0->vcpu[0]->cpu_affinity_tmp = 
-                    dom0->vcpu[0]->cpu_affinity;
-            cpus_clear(affinity);
-            cpu_set(cpu, affinity);
-            printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu,
-                dom0->vcpu[0]->processor);
-            vcpu_set_affinity(dom0->vcpu[0], &affinity);
-            vcpu_kick(dom0->vcpu[0]);
         }
     }
 
@@ -1057,7 +1128,27 @@ int intel_mce_wrmsr(u32 msr, u64 value)
         break;
     case MSR_IA32_MCG_STATUS:
         d->arch.vmca_msrs.mcg_status = value;
-        gdprintk(XENLOG_DEBUG, "MCE: wrmsr MCG_CTL %"PRIx64"\n", value);
+        gdprintk(XENLOG_DEBUG, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", value);
+        /* For HVM guest, this is the point for deleting vMCE injection node */
+        if ( (d->is_hvm) && (d->arch.vmca_msrs.nr_injection >0) )
+        {
+            d->arch.vmca_msrs.nr_injection--; /* Should be 0 */
+            if (!list_empty(&d->arch.vmca_msrs.impact_header)) {
+                entry = list_entry(d->arch.vmca_msrs.impact_header.next,
+                    struct bank_entry, list);
+                if (entry->mci_status & MCi_STATUS_VAL)
+                    gdprintk(XENLOG_ERR, "MCE: MCi_STATUS MSR should have "
+                                "been cleared before write MCG_STATUS MSR\n");
+
+                gdprintk(XENLOG_DEBUG, "MCE: Delete HVM last injection "
+                                "Node, nr_injection %u\n",
+                                d->arch.vmca_msrs.nr_injection);
+                list_del(&entry->list);
+            }
+            else
+                gdprintk(XENLOG_DEBUG, "MCE: Not found HVM guest"
+                    " last injection Node, something Wrong!\n");
+        }
         break;
     case MSR_IA32_MCG_CAP:
         gdprintk(XENLOG_WARNING, "MCE: MCG_CAP is read-only\n");
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/cpu/mcheck/mctelem.c
--- a/xen/arch/x86/cpu/mcheck/mctelem.c Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/mctelem.c Tue Jun 30 15:40:39 2009 +0100
@@ -153,7 +153,7 @@ void mctelem_defer(mctelem_cookie_t cook
 }
 
 void mctelem_process_deferred(unsigned int cpu,
-                             int (*fn)(unsigned int, mctelem_cookie_t))
+                             int (*fn)(mctelem_cookie_t))
 {
        struct mctelem_ent *tep;
        struct mctelem_ent *head, *prev;
@@ -189,7 +189,7 @@ void mctelem_process_deferred(unsigned i
                prev = tep->mcte_prev;
                tep->mcte_next = tep->mcte_prev = NULL;
 
-               ret = fn(cpu, MCTE2COOKIE(tep));
+               ret = fn(MCTE2COOKIE(tep));
                if (prev != NULL)
                        prev->mcte_next = NULL;
                tep->mcte_prev = tep->mcte_next = NULL;
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/cpu/mcheck/mctelem.h
--- a/xen/arch/x86/cpu/mcheck/mctelem.h Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/mctelem.h Tue Jun 30 15:40:39 2009 +0100
@@ -69,7 +69,7 @@ extern void mctelem_ack(mctelem_class_t,
 extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t);
 extern void mctelem_defer(mctelem_cookie_t);
 extern void mctelem_process_deferred(unsigned int,
-    int (*)(unsigned int, mctelem_cookie_t));
+    int (*)(mctelem_cookie_t));
 int mctelem_has_deferred(unsigned int);
 
 #endif
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/hvm/hvm.c    Tue Jun 30 15:40:39 2009 +0100
@@ -1771,6 +1771,8 @@ void hvm_rdtsc_intercept(struct cpu_user
     regs->edx = (uint32_t)(tsc >> 32);
 }
 
+extern int intel_mce_rdmsr(u32 msr, u32 *lo, u32 *hi);
+extern int intel_mce_wrmsr(u32 msr, u64 value);
 int hvm_msr_read_intercept(struct cpu_user_regs *regs)
 {
     uint32_t ecx = regs->ecx;
@@ -1779,6 +1781,8 @@ int hvm_msr_read_intercept(struct cpu_us
     uint64_t *var_range_base, *fixed_range_base;
     int index, mtrr;
     uint32_t cpuid[4];
+    uint32_t lo, hi;
+    int ret;
 
     var_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.var_ranges;
     fixed_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.fixed_ranges;
@@ -1794,18 +1798,6 @@ int hvm_msr_read_intercept(struct cpu_us
 
     case MSR_IA32_APICBASE:
         msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
-        break;
-
-    case MSR_IA32_MCG_CAP:
-    case MSR_IA32_MCG_STATUS:
-    case MSR_IA32_MC0_STATUS:
-    case MSR_IA32_MC1_STATUS:
-    case MSR_IA32_MC2_STATUS:
-    case MSR_IA32_MC3_STATUS:
-    case MSR_IA32_MC4_STATUS:
-    case MSR_IA32_MC5_STATUS:
-        /* No point in letting the guest see real MCEs */
-        msr_content = 0;
         break;
 
     case MSR_IA32_CR_PAT:
@@ -1858,7 +1850,17 @@ int hvm_msr_read_intercept(struct cpu_us
          break;
 
     default:
-        return hvm_funcs.msr_read_intercept(regs);
+        ret = intel_mce_rdmsr(ecx, &lo, &hi);
+        if ( ret < 0 )
+            goto gp_fault;
+        else if ( ret )
+        {
+            msr_content = ((u64)hi << 32) | lo;
+            break;
+        }
+        /* ret == 0, This is not an MCE MSR, see other MSRs */
+        else if (!ret)
+            return hvm_funcs.msr_read_intercept(regs);
     }
 
     regs->eax = (uint32_t)msr_content;
@@ -1884,6 +1886,7 @@ int hvm_msr_write_intercept(struct cpu_u
     struct vcpu *v = current;
     int index, mtrr;
     uint32_t cpuid[4];
+    int ret;
 
     hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
     mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
@@ -1946,7 +1949,13 @@ int hvm_msr_write_intercept(struct cpu_u
         break;
 
     default:
-        return hvm_funcs.msr_write_intercept(regs);
+        ret = intel_mce_wrmsr(ecx, msr_content);
+        if ( ret < 0 )
+            goto gp_fault;
+        else if ( ret )
+            break;
+        else if (!ret)
+            return hvm_funcs.msr_write_intercept(regs);
     }
 
     return X86EMUL_OKAY;
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/hvm/irq.c
--- a/xen/arch/x86/hvm/irq.c    Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/hvm/irq.c    Tue Jun 30 15:40:39 2009 +0100
@@ -326,6 +326,9 @@ struct hvm_intack hvm_vcpu_has_pending_i
     if ( unlikely(v->nmi_pending) )
         return hvm_intack_nmi;
 
+    if ( unlikely(v->mce_pending) )
+        return hvm_intack_mce;
+
     if ( vlapic_accept_pic_intr(v) && plat->vpic[0].int_output )
         return hvm_intack_pic(0);
 
@@ -345,6 +348,10 @@ struct hvm_intack hvm_vcpu_ack_pending_i
     {
     case hvm_intsrc_nmi:
         if ( !test_and_clear_bool(v->nmi_pending) )
+            intack = hvm_intack_none;
+        break;
+    case hvm_intsrc_mce:
+        if ( !test_and_clear_bool(v->mce_pending) )
             intack = hvm_intack_none;
         break;
     case hvm_intsrc_pic:
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/hvm/vmx/intr.c
--- a/xen/arch/x86/hvm/vmx/intr.c       Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/hvm/vmx/intr.c       Tue Jun 30 15:40:39 2009 +0100
@@ -157,6 +157,10 @@ asmlinkage void vmx_intr_assist(void)
     {
         vmx_inject_nmi();
     }
+    else if ( intack.source == hvm_intsrc_mce )
+    {
+        vmx_inject_hw_exception(TRAP_machine_check, HVM_DELIVER_NO_ERROR_CODE);
+    }
     else
     {
         HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0);
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c       Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/x86_64/traps.c       Tue Jun 30 15:40:39 2009 +0100
@@ -309,12 +309,13 @@ unsigned long do_iret(void)
        && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
         vcpu_set_affinity(v, &v->cpu_affinity_tmp);
 
-   /*Currently, only inject vMCE to DOM0.*/
+   /* inject vMCE to PV_Guest including DOM0. */
     if (v->trap_priority >= VCPU_TRAP_NMI) {
-        printk(KERN_DEBUG "MCE: Return from vMCE# trap!");
-        if (d->domain_id == 0 && v->vcpu_id == 0) {
+        printk(KERN_DEBUG "MCE: Return from vMCE# trap!\n");
+        if ( v->vcpu_id == 0 ) {
             if ( !d->arch.vmca_msrs.nr_injection ) {
-                printk(KERN_WARNING "MCE: Ret from vMCE#, nr_injection is 0\n");
+                printk(KERN_WARNING "MCE: Ret from vMCE#, "
+                       "No injection Node\n");
                 goto end;
             }
 
diff -r 00502df38143 -r 7bbbc57163d5 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/include/asm-x86/domain.h      Tue Jun 30 15:40:39 2009 +0100
@@ -210,7 +210,6 @@ struct p2m_domain;
  * put into impact_header list. */
 struct bank_entry {
     struct list_head list;
-    int32_t cpu;
     uint16_t bank;
     uint64_t mci_status;
     uint64_t mci_addr;
diff -r 00502df38143 -r 7bbbc57163d5 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h     Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/include/asm-x86/hvm/hvm.h     Tue Jun 30 15:40:39 2009 +0100
@@ -31,7 +31,8 @@ enum hvm_intsrc {
     hvm_intsrc_none,
     hvm_intsrc_pic,
     hvm_intsrc_lapic,
-    hvm_intsrc_nmi
+    hvm_intsrc_nmi,
+    hvm_intsrc_mce
 };
 struct hvm_intack {
     uint8_t source; /* enum hvm_intsrc */
@@ -41,6 +42,7 @@ struct hvm_intack {
 #define hvm_intack_pic(vec)   ( (struct hvm_intack) { hvm_intsrc_pic,   vec } )
 #define hvm_intack_lapic(vec) ( (struct hvm_intack) { hvm_intsrc_lapic, vec } )
 #define hvm_intack_nmi        ( (struct hvm_intack) { hvm_intsrc_nmi,   2 } )
+#define hvm_intack_mce        ( (struct hvm_intack) { hvm_intsrc_mce,   18 } )
 enum hvm_intblk {
     hvm_intblk_none,      /* not blocked (deliverable) */
     hvm_intblk_shadow,    /* MOV-SS or STI shadow */

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.