[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] Clean-up on MCA MSR virtualization and vMCE injection



Clean-up on MCA MSR virtualization and vMCE injection

Move all virtual MCE related code into a separate file.
It also tries to do some clean-up on the vMCE code, including:
a) rename some functions like mce_init_msr/mce_rdmsr to
   vmce_init_msr/vmce_rdmsr to make them more straightforward,
b) make the vmca_msrs be a pointer in arch_domain,
    to decrease arch_domain's size
c) extract per-bank MCA MSR access into separate functions
    (bank_mce_wrmsr/bank_mce_rdmsr) to make it a bit cleaner.
d) A new file xen/include/asm-x86/mce.h  is added for vmce related header.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>

diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile  Thu Apr 15 19:11:16 2010 +0100
+++ b/xen/arch/x86/cpu/mcheck/Makefile  Fri Apr 16 18:55:03 2010 +0800
@@ -7,3 +7,4 @@ obj-y += mce_intel.o
 obj-y += mce_intel.o
 obj-y += mce_amd_quirks.o
 obj-y += non-fatal.o
+obj-y += vmce.o
diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c     Thu Apr 15 19:11:16 2010 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce.c     Fri Apr 16 18:55:03 2010 +0800
@@ -31,11 +31,11 @@ unsigned int nr_mce_banks;
 unsigned int nr_mce_banks;

 int mce_broadcast = 0;
-static uint64_t g_mcg_cap;
+uint64_t g_mcg_cap;

 /* Real value in physical CTL MSR */
-static uint64_t h_mcg_ctl = 0UL;
-static uint64_t *h_mci_ctrl;
+uint64_t h_mcg_ctl = 0UL;
+uint64_t *h_mci_ctrl;
 int firstbank;

 static void intpose_init(void);
@@ -752,234 +752,6 @@ u64 mce_cap_init(void)
     return value;
 }

-/* Guest vMCE# MSRs virtualization ops (rdmsr/wrmsr) */
-void mce_init_msr(struct domain *d)
-{
-    d->arch.vmca_msrs.mcg_status = 0x0;
-    d->arch.vmca_msrs.mcg_cap = g_mcg_cap;
-    d->arch.vmca_msrs.mcg_ctl = ~(uint64_t)0x0;
-    d->arch.vmca_msrs.nr_injection = 0;
-    memset(d->arch.vmca_msrs.mci_ctl, ~0,
-           sizeof(d->arch.vmca_msrs.mci_ctl));
-    INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
-    spin_lock_init(&d->arch.vmca_msrs.lock);
-}
-
-int mce_rdmsr(uint32_t msr, uint64_t *val)
-{
-    struct domain *d = current->domain;
-    int ret = 1;
-    unsigned int bank;
-    struct bank_entry *entry = NULL;
-
-    *val = 0;
-    spin_lock(&d->arch.vmca_msrs.lock);
-
-    switch ( msr )
-    {
-    case MSR_IA32_MCG_STATUS:
-        *val = d->arch.vmca_msrs.mcg_status;
-        if (*val)
-            mce_printk(MCE_VERBOSE,
-                "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
-        break;
-    case MSR_IA32_MCG_CAP:
-        *val = d->arch.vmca_msrs.mcg_cap;
-        mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
-            *val);
-        break;
-    case MSR_IA32_MCG_CTL:
-        /* Always 0 if no CTL support */
-        *val = d->arch.vmca_msrs.mcg_ctl & h_mcg_ctl;
-        mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n",
-            *val);
-        break;
-    case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
-        bank = (msr - MSR_IA32_MC0_CTL) / 4;
-        if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
-        {
-            mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
-            ret = 0;
-            break;
-        }
-        switch (msr & (MSR_IA32_MC0_CTL | 3))
-        {
-        case MSR_IA32_MC0_CTL:
-            *val = d->arch.vmca_msrs.mci_ctl[bank] &
-                    (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
-            mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
-                     bank, *val);
-            break;
-        case MSR_IA32_MC0_STATUS:
-            /* Only error bank is read. Non-error banks simply return. */
-            if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
-            {
-                entry = list_entry(d->arch.vmca_msrs.impact_header.next,
-                                   struct bank_entry, list);
-                if (entry->bank == bank) {
-                    *val = entry->mci_status;
-                    mce_printk(MCE_VERBOSE,
-                             "MCE: rd MC%u_STATUS in vMCE# context "
-                             "value 0x%"PRIx64"\n", bank, *val);
-                }
-                else
-                    entry = NULL;
-            }
-            break;
-        case MSR_IA32_MC0_ADDR:
-            if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
-            {
-                entry = list_entry(d->arch.vmca_msrs.impact_header.next,
-                                   struct bank_entry, list);
-                if ( entry->bank == bank )
-                {
-                    *val = entry->mci_addr;
-                    mce_printk(MCE_VERBOSE,
-                             "MCE: rdmsr MC%u_ADDR in vMCE# context "
-                             "0x%"PRIx64"\n", bank, *val);
-                }
-            }
-            break;
-        case MSR_IA32_MC0_MISC:
-            if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
-            {
-                entry = list_entry(d->arch.vmca_msrs.impact_header.next,
-                                   struct bank_entry, list);
-                if ( entry->bank == bank )
-                {
-                    *val = entry->mci_misc;
-                    mce_printk(MCE_VERBOSE,
-                             "MCE: rd MC%u_MISC in vMCE# context "
-                             "0x%"PRIx64"\n", bank, *val);
-                }
-            }
-            break;
-        }
-        break;
-    default:
-        switch ( boot_cpu_data.x86_vendor )
-        {
-        case X86_VENDOR_INTEL:
-            ret = intel_mce_rdmsr(msr, val);
-            break;
-        default:
-            ret = 0;
-            break;
-        }
-        break;
-    }
-
-    spin_unlock(&d->arch.vmca_msrs.lock);
-    return ret;
-}
-
-int mce_wrmsr(u32 msr, u64 val)
-{
-    struct domain *d = current->domain;
-    struct bank_entry *entry = NULL;
-    unsigned int bank;
-    int ret = 1;
-
-    if ( !g_mcg_cap )
-        return 0;
-
-    spin_lock(&d->arch.vmca_msrs.lock);
-
-    switch ( msr )
-    {
-    case MSR_IA32_MCG_CTL:
-        d->arch.vmca_msrs.mcg_ctl = val;
-        break;
-    case MSR_IA32_MCG_STATUS:
-        d->arch.vmca_msrs.mcg_status = val;
-        mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
-        /* For HVM guest, this is the point for deleting vMCE injection node */
-        if ( d->is_hvm && (d->arch.vmca_msrs.nr_injection > 0) )
-        {
-            d->arch.vmca_msrs.nr_injection--; /* Should be 0 */
-            if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
-            {
-                entry = list_entry(d->arch.vmca_msrs.impact_header.next,
-                    struct bank_entry, list);
-                if ( entry->mci_status & MCi_STATUS_VAL )
-                    mce_printk(MCE_QUIET, "MCE: MCi_STATUS MSR should have "
-                                "been cleared before write MCG_STATUS MSR\n");
-
-                mce_printk(MCE_QUIET, "MCE: Delete HVM last injection "
-                                "Node, nr_injection %u\n",
-                                d->arch.vmca_msrs.nr_injection);
-                list_del(&entry->list);
-                xfree(entry);
-            }
-            else
-                mce_printk(MCE_QUIET, "MCE: Not found HVM guest"
-                    " last injection Node, something Wrong!\n");
-        }
-        break;
-    case MSR_IA32_MCG_CAP:
-        mce_printk(MCE_QUIET, "MCE: MCG_CAP is read-only\n");
-        ret = -1;
-        break;
-    case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
-        bank = (msr - MSR_IA32_MC0_CTL) / 4;
-        if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
-        {
-            mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
-            ret = 0;
-            break;
-        }
-        switch ( msr & (MSR_IA32_MC0_CTL | 3) )
-        {
-        case MSR_IA32_MC0_CTL:
-            d->arch.vmca_msrs.mci_ctl[bank] = val;
-            break;
-        case MSR_IA32_MC0_STATUS:
-            /* Give the first entry of the list, it corresponds to current
-             * vMCE# injection. When vMCE# is finished processing by the
-             * the guest, this node will be deleted.
-             * Only error bank is written. Non-error banks simply return.
-             */
-            if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
-            {
-                entry = list_entry(d->arch.vmca_msrs.impact_header.next,
-                                   struct bank_entry, list);
-                if ( entry->bank == bank )
-                    entry->mci_status = val;
-                mce_printk(MCE_VERBOSE,
-                         "MCE: wr MC%u_STATUS %"PRIx64" in vMCE#\n",
-                         bank, val);
-            }
-            else
-                mce_printk(MCE_VERBOSE,
-                         "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val);
-            break;
-        case MSR_IA32_MC0_ADDR:
-            mce_printk(MCE_QUIET, "MCE: MC%u_ADDR is read-only\n", bank);
-            ret = -1;
-            break;
-        case MSR_IA32_MC0_MISC:
-            mce_printk(MCE_QUIET, "MCE: MC%u_MISC is read-only\n", bank);
-            ret = -1;
-            break;
-        }
-        break;
-    default:
-        switch ( boot_cpu_data.x86_vendor )
-        {
-        case X86_VENDOR_INTEL:
-            ret = intel_mce_wrmsr(msr, val);
-            break;
-        default:
-            ret = 0;
-            break;
-        }
-        break;
-    }
-
-    spin_unlock(&d->arch.vmca_msrs.lock);
-    return ret;
-}
-
 static void mcinfo_clear(struct mc_info *mi)
 {
        memset(mi, 0, sizeof(struct mc_info));
@@ -1238,11 +1010,11 @@ int mca_ctl_conflict(struct mcinfo_bank
         return 1;

     /* Will MCE happen in host if If host mcg_ctl is 0? */
-    if ( ~d->arch.vmca_msrs.mcg_ctl & h_mcg_ctl )
+    if ( ~d->arch.vmca_msrs->mcg_ctl & h_mcg_ctl )
         return 1;

     bank_nr = bank->mc_bank;
-    if (~d->arch.vmca_msrs.mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] )
+    if (~d->arch.vmca_msrs->mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] )
         return 1;
     return 0;
 }
diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h     Thu Apr 15 19:11:16 2010 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce.h     Fri Apr 16 18:55:03 2010 +0800
@@ -164,4 +164,32 @@ int x86_mcinfo_add(struct mc_info *mi, v
 int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
 void x86_mcinfo_dump(struct mc_info *mi);

+int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
+        uint64_t gstatus);
+int inject_vmce(struct domain *d);
+int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, struct 
mcinfo_global *global);
+
+extern uint64_t g_mcg_cap;
+/* Real value in physical CTL MSR */
+extern uint64_t h_mcg_ctl;
+extern uint64_t *h_mci_ctrl;
+
+extern unsigned int nr_mce_banks;
+/* 1 if msr is one of the Intel per-bank MCi_CTL2 MSRs (bank 0 included) */
+static inline int mce_vendor_bank_msr(uint32_t msr)
+{
+    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+         msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks) )
+        return 1;
+    return 0;
+}
+/* 1 if msr is a per-bank MCi_{CTL,STATUS,ADDR,MISC} or vendor bank MSR */
+static inline int mce_bank_msr(uint32_t msr)
+{
+    if ( (msr >= MSR_IA32_MC0_CTL &&
+          msr < (MSR_IA32_MC0_CTL + 4 * nr_mce_banks)) ||
+         mce_vendor_bank_msr(msr) )
+        return 1;
+    return 0;
+}
 #endif /* _MCE_H */
diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Thu Apr 15 19:11:16 2010 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Fri Apr 16 18:55:03 2010 +0800
@@ -11,6 +11,7 @@
 #include <asm/system.h>
 #include <asm/msr.h>
 #include <asm/p2m.h>
+#include <asm/mce.h>
 #include "mce.h"
 #include "x86_mca.h"

@@ -199,126 +200,6 @@ intel_get_extended_msrs(struct mc_info *
     return MCA_EXTINFO_GLOBAL;
 }

-/* This node list records errors impacting a domain. when one
- * MCE# happens, one error bank impacts a domain. This error node
- * will be inserted to the tail of the per_dom data for vMCE# MSR
- * virtualization. When one vMCE# injection is finished processing
- * processed by guest, the corresponding node will be deleted.
- * This node list is for GUEST vMCE# MSRS virtualization.
- */
-static struct bank_entry* alloc_bank_entry(void) {
-    struct bank_entry *entry;
-
-    entry = xmalloc(struct bank_entry);
-    if (!entry) {
-        printk(KERN_ERR "MCE: malloc bank_entry failed\n");
-        return NULL;
-    }
-    memset(entry, 0x0, sizeof(entry));
-    INIT_LIST_HEAD(&entry->list);
-    return entry;
-}
-
-/* Fill error bank info for #vMCE injection and GUEST vMCE#
- * MSR virtualization data
- * 1) Log down how many nr_injections of the impacted.
- * 2) Copy MCE# error bank to impacted DOM node list,
-      for vMCE# MSRs virtualization
-*/
-
-static int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
-        uint64_t gstatus) {
-    struct bank_entry *entry;
-
-    /* This error bank impacts one domain, we need to fill domain related
-     * data for vMCE MSRs virtualization and vMCE# injection */
-    if (mc_bank->mc_domid != (uint16_t)~0) {
-        /* For HVM guest, Only when first vMCE is consumed by HVM guest 
successfully,
-         * will we generete another node and inject another vMCE
-         */
-        if ( (d->is_hvm) && (d->arch.vmca_msrs.nr_injection > 0) )
-        {
-            mce_printk(MCE_QUIET, "MCE: HVM guest has not handled previous"
-                        " vMCE yet!\n");
-            return -1;
-        }
-        entry = alloc_bank_entry();
-        if (entry == NULL)
-            return -1;
-
-        entry->mci_status = mc_bank->mc_status;
-        entry->mci_addr = mc_bank->mc_addr;
-        entry->mci_misc = mc_bank->mc_misc;
-        entry->bank = mc_bank->mc_bank;
-
-        spin_lock(&d->arch.vmca_msrs.lock);
-        /* New error Node, insert to the tail of the per_dom data */
-        list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
-        /* Fill MSR global status */
-        d->arch.vmca_msrs.mcg_status = gstatus;
-        /* New node impact the domain, need another vMCE# injection*/
-        d->arch.vmca_msrs.nr_injection++;
-        spin_unlock(&d->arch.vmca_msrs.lock);
-
-        mce_printk(MCE_VERBOSE,"MCE: Found error @[BANK%d "
-                "status %"PRIx64" addr %"PRIx64" domid %d]\n ",
-                mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr,
-                mc_bank->mc_domid);
-    }
-    return 0;
-}
-
-static int inject_mce(struct domain *d)
-{
-    int cpu = smp_processor_id();
-    cpumask_t affinity;
-
-    /* PV guest and HVM guest have different vMCE# injection
-     * methods*/
-
-    if ( !test_and_set_bool(d->vcpu[0]->mce_pending) )
-    {
-        if (d->is_hvm)
-        {
-            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to HVM DOM %d\n",
-                        d->domain_id);
-            vcpu_kick(d->vcpu[0]);
-        }
-        /* PV guest including DOM0 */
-        else
-        {
-            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to PV DOM%d\n",
-                        d->domain_id);
-            if (guest_has_trap_callback
-                   (d, 0, TRAP_machine_check))
-            {
-                d->vcpu[0]->cpu_affinity_tmp =
-                        d->vcpu[0]->cpu_affinity;
-                cpus_clear(affinity);
-                cpu_set(cpu, affinity);
-                mce_printk(MCE_VERBOSE, "MCE: CPU%d set affinity, old %d\n", 
cpu,
-                            d->vcpu[0]->processor);
-                vcpu_set_affinity(d->vcpu[0], &affinity);
-                vcpu_kick(d->vcpu[0]);
-            }
-            else
-            {
-                mce_printk(MCE_VERBOSE, "MCE: Kill PV guest with No MCE 
handler\n");
-                domain_crash(d);
-            }
-        }
-    }
-    else {
-        /* new vMCE comes while first one has not been injected yet,
-         * in this case, inject fail. [We can't lose this vMCE for
-         * the mce node's consistency].
-        */
-        mce_printk(MCE_QUIET, "There's a pending vMCE waiting to be injected "
-                    " to this DOM%d!\n", d->domain_id);
-        return -1;
-    }
-    return 0;
-}

 static void intel_UCR_handler(struct mcinfo_bank *bank,
              struct mcinfo_global *global,
@@ -377,7 +258,7 @@ static void intel_UCR_handler(struct mci
                               return;
                           }
                           /* We will inject vMCE to DOMU*/
-                          if ( inject_mce(d) < 0 )
+                          if ( inject_vmce(d) < 0 )
                           {
                               mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
                                           " failed\n", d->domain_id);
diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/cpu/mcheck/vmce.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/vmce.c    Fri Apr 16 18:55:03 2010 +0800
@@ -0,0 +1,451 @@
+/*
+ * vmce.c - virtual MCE support
+ */
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/irq.h>
+#include <xen/event.h>
+#include <xen/kernel.h>
+#include <xen/delay.h>
+#include <xen/smp.h>
+#include <xen/mm.h>
+#include <asm/processor.h>
+#include <public/sysctl.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+#include <asm/p2m.h>
+#include "mce.h"
+#include "x86_mca.h"
+/* Allocate and initialise d's virtual MCA MSR state: 0 or -ENOMEM */
+int vmce_init_msr(struct domain *d)
+{
+    if ( dom_vmce(d) )
+    {
+        dprintk(XENLOG_G_WARNING, "Domain %d has inited vMCE\n", d->domain_id);
+        return 0;
+    }
+
+    /* Allocate the vmca_msrs, then its per-bank mci_ctl array */
+    dom_vmce(d) = xmalloc(struct domain_mca_msrs);
+    if ( !dom_vmce(d) )
+        return -ENOMEM;
+
+    dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, nr_mce_banks);
+    if ( !dom_vmce(d)->mci_ctl )
+    {
+        xfree(dom_vmce(d));
+        dom_vmce(d) = NULL; /* don't leave a dangling pointer behind */
+        return -ENOMEM;
+    }
+    /* mci_ctl is a pointer, so size the fill by the bank count */
+    memset(d->arch.vmca_msrs->mci_ctl, ~0, nr_mce_banks * sizeof(uint64_t));
+    dom_vmce(d)->mcg_status = 0x0;
+    dom_vmce(d)->mcg_cap = g_mcg_cap;
+    dom_vmce(d)->mcg_ctl = ~(uint64_t)0x0;
+    dom_vmce(d)->nr_injection = 0;
+
+    INIT_LIST_HEAD(&d->arch.vmca_msrs->impact_header);
+    spin_lock_init(&d->arch.vmca_msrs->lock);
+
+    return 0;
+}
+
+/* Read bank MSR msr of d into *val (caller must ensure msr is a bank msr).
+ * Returns 1 when handled here, <0 if d is NULL or bank >= nr_mce_banks. */
+static int bank_mce_rdmsr(struct domain *d, uint32_t msr, uint64_t *val)
+{
+    int bank, ret = 1;
+    struct domain_mca_msrs *vmce;
+    struct bank_entry *entry = NULL;
+
+    if (!d)
+        return -EINVAL;
+    vmce = dom_vmce(d);
+    ASSERT(vmce);
+
+    bank = (msr - MSR_IA32_MC0_CTL) / 4; /* four MSRs per bank */
+    if (bank >= nr_mce_banks)
+        return -1;
+
+    switch (msr & (MSR_IA32_MC0_CTL | 3))
+    {
+    case MSR_IA32_MC0_CTL:
+        *val = vmce->mci_ctl[bank] &
+          (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
+        mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
+          bank, *val);
+        break;
+    case MSR_IA32_MC0_STATUS:
+        /* Only error bank is read. Non-error banks simply return. */
+        if ( !list_empty(&vmce->impact_header) )
+        {
+            entry = list_entry(vmce->impact_header.next,
+              struct bank_entry, list);
+            if (entry->bank == bank) {
+                *val = entry->mci_status;
+                mce_printk(MCE_VERBOSE,
+                  "MCE: rd MC%u_STATUS in vMCE# context "
+                  "value 0x%"PRIx64"\n", bank, *val);
+            }
+            else
+                entry = NULL;
+        }
+        break;
+    case MSR_IA32_MC0_ADDR:
+        if ( !list_empty(&vmce->impact_header) )
+        {
+            entry = list_entry(vmce->impact_header.next,
+              struct bank_entry, list);
+            if ( entry->bank == bank )
+            {
+                *val = entry->mci_addr;
+                mce_printk(MCE_VERBOSE,
+                  "MCE: rdmsr MC%u_ADDR in vMCE# context "
+                  "0x%"PRIx64"\n", bank, *val);
+            }
+        }
+        break;
+    case MSR_IA32_MC0_MISC:
+        if ( !list_empty(&vmce->impact_header) )
+        {
+            entry = list_entry(vmce->impact_header.next,
+              struct bank_entry, list);
+            if ( entry->bank == bank )
+            {
+                *val = entry->mci_misc;
+                mce_printk(MCE_VERBOSE,
+                  "MCE: rd MC%u_MISC in vMCE# context "
+                  "0x%"PRIx64"\n", bank, *val);
+            }
+        }
+        break;
+    default:
+        switch ( boot_cpu_data.x86_vendor )
+        {
+            case X86_VENDOR_INTEL:
+                ret = intel_mce_rdmsr(msr, val);
+                break;
+            default:
+                ret = 0;
+                break;
+        }
+        break;
+    }
+
+    return ret;
+}
+
+/* Read virtual MCA MSR msr of the current domain into *val. Returns:
+ * < 0: Unsupported and will #GP fault to guest
+ * = 0: Not handled, should be handled by other components
+ * > 0: Success
+ */
+int vmce_rdmsr(uint32_t msr, uint64_t *val)
+{
+    struct domain *d = current->domain;
+    struct domain_mca_msrs *vmce;
+    int ret = 1;
+
+    *val = 0;
+
+    vmce = dom_vmce(d);
+    if ( !vmce )
+    {
+        /* XXX more handle here */
+        return 0;
+    }
+
+    spin_lock(&d->arch.vmca_msrs->lock);
+
+    switch ( msr )
+    {
+    case MSR_IA32_MCG_STATUS:
+        *val = vmce->mcg_status;
+        if (*val)
+            mce_printk(MCE_VERBOSE,
+                "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
+        break;
+    case MSR_IA32_MCG_CAP:
+        *val = vmce->mcg_cap;
+        mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
+            *val);
+        break;
+    case MSR_IA32_MCG_CTL:
+        /* Always 0 if no CTL support */
+        *val = vmce->mcg_ctl & h_mcg_ctl;
+        mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n",
+            *val);
+        break;
+    default:
+        if ( mce_bank_msr(msr) )
+            ret = bank_mce_rdmsr(d, msr, val);
+        else
+            ret = 0;
+        break;
+    }
+
+    spin_unlock(&d->arch.vmca_msrs->lock);
+    return ret;
+}
+/* Write val to a per-bank MCA MSR of d. ADDR and MISC writes return -1. */
+int bank_mce_wrmsr(struct domain *d, u32 msr, u64 val)
+{
+    int bank, ret = 1;
+    struct domain_mca_msrs *vmce;
+    struct bank_entry *entry = NULL;
+
+    if (!d)
+        return -EINVAL;
+    vmce = dom_vmce(d);
+    ASSERT(vmce && vmce->mci_ctl);
+
+    bank = (msr - MSR_IA32_MC0_CTL) / 4; /* four MSRs per bank */
+    if (bank >= nr_mce_banks)
+        return -EINVAL;
+
+    switch ( msr & (MSR_IA32_MC0_CTL | 3) )
+    {
+    case MSR_IA32_MC0_CTL:
+        vmce->mci_ctl[bank] = val;
+            break;
+    case MSR_IA32_MC0_STATUS:
+            /* Give the first entry of the list, it corresponds to current
+             * vMCE# injection. When vMCE# is finished processing by the
+             * the guest, this node will be deleted.
+             * Only error bank is written. Non-error banks simply return.
+             */
+            if ( !list_empty(&d->arch.vmca_msrs->impact_header) )
+            {
+                entry = list_entry(d->arch.vmca_msrs->impact_header.next,
+                                   struct bank_entry, list);
+                if ( entry->bank == bank )
+                    entry->mci_status = val;
+                mce_printk(MCE_VERBOSE,
+                         "MCE: wr MC%u_STATUS %"PRIx64" in vMCE#\n",
+                         bank, val);
+            }
+            else
+                mce_printk(MCE_VERBOSE,
+                         "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val);
+            break;
+    case MSR_IA32_MC0_ADDR:
+            mce_printk(MCE_QUIET, "MCE: MC%u_ADDR is read-only\n", bank);
+            ret = -1;
+            break;
+    case MSR_IA32_MC0_MISC:
+            mce_printk(MCE_QUIET, "MCE: MC%u_MISC is read-only\n", bank);
+            ret = -1;
+            break;
+    default:
+        switch ( boot_cpu_data.x86_vendor )
+        {
+        case X86_VENDOR_INTEL:
+            ret = intel_mce_wrmsr(msr, val);
+            break;
+        default:
+            ret = 0;
+            break;
+        }
+        break;
+    }
+
+    return ret;
+}
+
+/* Write val to virtual MCA MSR msr of the current domain. Returns:
+ * < 0: Unsupported and will #GP fault to guest
+ * = 0: Not handled, should be handled by other components
+ * > 0: Success
+ */
+int vmce_wrmsr(u32 msr, u64 val)
+{
+    struct domain *d = current->domain;
+    struct bank_entry *entry = NULL;
+    struct domain_mca_msrs *vmce = dom_vmce(d);
+    int ret = 1;
+
+    /* No MCA support, or no per-domain vMCE state (cf. vmce_rdmsr) */
+    if ( !g_mcg_cap || !vmce )
+        return 0;
+
+    spin_lock(&vmce->lock);
+
+    switch ( msr )
+    {
+    case MSR_IA32_MCG_CTL:
+        vmce->mcg_ctl = val;
+        break;
+    case MSR_IA32_MCG_STATUS:
+        vmce->mcg_status = val;
+        mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
+        /* For HVM guest, this is the point for deleting vMCE injection node */
+        if ( d->is_hvm && (vmce->nr_injection > 0) )
+        {
+            vmce->nr_injection--; /* Should be 0 */
+            if ( !list_empty(&vmce->impact_header) )
+            {
+                entry = list_entry(vmce->impact_header.next,
+                    struct bank_entry, list);
+                if ( entry->mci_status & MCi_STATUS_VAL )
+                    mce_printk(MCE_QUIET, "MCE: MCi_STATUS MSR should have "
+                                "been cleared before write MCG_STATUS MSR\n");
+
+                mce_printk(MCE_QUIET, "MCE: Delete HVM last injection "
+                                "Node, nr_injection %u\n",
+                                vmce->nr_injection);
+                list_del(&entry->list);
+                xfree(entry);
+            }
+            else
+                mce_printk(MCE_QUIET, "MCE: Not found HVM guest"
+                    " last injection Node, something Wrong!\n");
+        }
+        break;
+    case MSR_IA32_MCG_CAP:
+        mce_printk(MCE_QUIET, "MCE: MCG_CAP is read-only\n");
+        ret = -1;
+        break;
+    default:
+        if ( mce_bank_msr(msr) )
+            ret = bank_mce_wrmsr(d, msr, val);
+        else
+            ret = 0;
+        break;
+    }
+
+    spin_unlock(&vmce->lock);
+    return ret;
+}
+/* Inject a vMCE via d's vcpu 0. Returns -1 if one is already pending. */
+int inject_vmce(struct domain *d)
+{
+    int cpu = smp_processor_id();
+    cpumask_t affinity;
+
+    /* PV guest and HVM guest have different vMCE# injection
+     * methods*/
+    if ( !test_and_set_bool(d->vcpu[0]->mce_pending) )
+    {
+        if (d->is_hvm)
+        {
+            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to HVM DOM %d\n",
+                        d->domain_id);
+            vcpu_kick(d->vcpu[0]);
+        }
+        /* PV guest including DOM0 */
+        else
+        {
+            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to PV DOM%d\n",
+                        d->domain_id);
+            if (guest_has_trap_callback
+                   (d, 0, TRAP_machine_check))
+            {
+                d->vcpu[0]->cpu_affinity_tmp =
+                        d->vcpu[0]->cpu_affinity;
+                cpus_clear(affinity);
+                cpu_set(cpu, affinity);
+                mce_printk(MCE_VERBOSE, "MCE: CPU%d set affinity, old %d\n", 
cpu,
+                            d->vcpu[0]->processor);
+                vcpu_set_affinity(d->vcpu[0], &affinity);
+                vcpu_kick(d->vcpu[0]);
+            }
+            else
+            {
+                mce_printk(MCE_VERBOSE, "MCE: Kill PV guest with No MCE 
handler\n");
+                domain_crash(d);
+            }
+        }
+    }
+    else {
+        /* new vMCE comes while first one has not been injected yet,
+         * in this case, inject fail. [We can't lose this vMCE for
+         * the mce node's consistency].
+        */
+        mce_printk(MCE_QUIET, "There's a pending vMCE waiting to be injected "
+                    " to this DOM%d!\n", d->domain_id);
+        return -1;
+    }
+    return 0;
+}
+
+/* This node list records errors impacting a domain. when one
+ * MCE# happens, one error bank impacts a domain. This error node
+ * will be inserted to the tail of the per_dom data for vMCE# MSR
+ * virtualization. When one vMCE# injection is finished processing
+ * processed by guest, the corresponding node will be deleted.
+ * This node list is for GUEST vMCE# MSRS virtualization.
+ */
+static struct bank_entry* alloc_bank_entry(void) {
+    struct bank_entry *entry;
+
+    entry = xmalloc(struct bank_entry);
+    if (!entry) {
+        printk(KERN_ERR "MCE: malloc bank_entry failed\n");
+        return NULL;
+    }
+    memset(entry, 0x0, sizeof(entry));
+    INIT_LIST_HEAD(&entry->list);
+    return entry;
+}
+
+/* Fill error bank info for vMCE# injection and guest vMCE# MSR
+ * virtualization data:
+ * 1) Count the new injection in the impacted domain's nr_injection.
+ * 2) Copy the MCE# error bank to the impacted domain's node list.
+ * Returns 0 on success or when mc_domid is (uint16_t)~0, else -1.
+ */
+
+int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
+        uint64_t gstatus) {
+    struct bank_entry *entry;
+
+    /* This error bank impacts one domain, we need to fill domain related
+     * data for vMCE MSRs virtualization and vMCE# injection */
+    if (mc_bank->mc_domid != (uint16_t)~0) {
+        /* For HVM guest, only when the first vMCE is consumed successfully
+         * by the guest will we generate another node and inject another vMCE
+         */
+        if ( (d->is_hvm) && (d->arch.vmca_msrs->nr_injection > 0) )
+        {
+            mce_printk(MCE_QUIET, "MCE: HVM guest has not handled previous"
+                        " vMCE yet!\n");
+            return -1;
+        }
+        entry = alloc_bank_entry();
+        if (entry == NULL)
+            return -1;
+
+        entry->mci_status = mc_bank->mc_status;
+        entry->mci_addr = mc_bank->mc_addr;
+        entry->mci_misc = mc_bank->mc_misc;
+        entry->bank = mc_bank->mc_bank;
+
+        spin_lock(&d->arch.vmca_msrs->lock);
+        /* New error Node, insert to the tail of the per_dom data */
+        list_add_tail(&entry->list, &d->arch.vmca_msrs->impact_header);
+        /* Fill MSR global status */
+        d->arch.vmca_msrs->mcg_status = gstatus;
+        /* New node impact the domain, need another vMCE# injection*/
+        d->arch.vmca_msrs->nr_injection++;
+        spin_unlock(&d->arch.vmca_msrs->lock);
+
+        mce_printk(MCE_VERBOSE,"MCE: Found error @[BANK%d "
+                "status %"PRIx64" addr %"PRIx64" domid %d]\n ",
+                mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr,
+                mc_bank->mc_domid);
+    }
+    return 0;
+}
+/* Record bank's error for d, then inject a vMCE; <0 if either step fails */
+int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, struct 
mcinfo_global *global)
+{
+    int ret;
+
+    ret = fill_vmsr_data(bank, d, global->mc_gstatus);
+    if (ret < 0)
+        return ret;
+
+    return inject_vmce(d);
+}
+
diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Thu Apr 15 19:11:16 2010 +0100
+++ b/xen/arch/x86/domain.c     Fri Apr 16 18:55:03 2010 +0800
@@ -49,6 +49,7 @@
 #include <asm/msr.h>
 #include <asm/traps.h>
 #include <asm/nmi.h>
+#include <asm/mce.h>
 #include <xen/numa.h>
 #include <xen/iommu.h>
 #ifdef CONFIG_COMPAT
@@ -501,7 +502,7 @@ int arch_domain_create(struct domain *d,
             goto fail;

         /* For Guest vMCE MSRs virtualization */
-        mce_init_msr(d);
+        vmce_init_msr(d);
     }

     if ( is_hvm_domain(d) )
diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Thu Apr 15 19:11:16 2010 +0100
+++ b/xen/arch/x86/hvm/hvm.c    Fri Apr 16 18:55:03 2010 +0800
@@ -47,6 +47,7 @@
 #include <asm/traps.h>
 #include <asm/mc146818rtc.h>
 #include <asm/spinlock.h>
+#include <asm/mce.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/vpt.h>
 #include <asm/hvm/support.h>
@@ -2061,7 +2062,7 @@ int hvm_msr_read_intercept(struct cpu_us
          break;

     default:
-        ret = mce_rdmsr(ecx, &msr_content);
+        ret = vmce_rdmsr(ecx, &msr_content);
         if ( ret < 0 )
             goto gp_fault;
         else if ( ret )
@@ -2160,7 +2161,7 @@ int hvm_msr_write_intercept(struct cpu_u
         break;

     default:
-        ret = mce_wrmsr(ecx, msr_content);
+        ret = vmce_wrmsr(ecx, msr_content);
         if ( ret < 0 )
             goto gp_fault;
         else if ( ret )
diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Thu Apr 15 19:11:16 2010 +0100
+++ b/xen/arch/x86/traps.c      Fri Apr 16 18:55:03 2010 +0800
@@ -65,6 +65,7 @@
 #include <asm/traps.h>
 #include <asm/hvm/vpt.h>
 #include <asm/hypercall.h>
+#include <asm/mce.h>
 #include <public/arch-x86/cpuid.h>

 /*
@@ -2295,7 +2296,7 @@ static int emulate_privileged_op(struct
             if ( wrmsr_hypervisor_regs(regs->ecx, val) )
                 break;

-            rc = mce_wrmsr(regs->ecx, val);
+            rc = vmce_wrmsr(regs->ecx, val);
             if ( rc < 0 )
                 goto fail;
             if ( rc )
@@ -2388,7 +2389,7 @@ static int emulate_privileged_op(struct
                 break;
             }

-            rc = mce_rdmsr(regs->ecx, &val);
+            rc = vmce_rdmsr(regs->ecx, &val);
             if ( rc < 0 )
                 goto fail;
             if ( rc )
@@ -2947,19 +2948,19 @@ void async_exception_cleanup(struct vcpu
         {
             struct domain *d = curr->domain;

-            if ( !d->arch.vmca_msrs.nr_injection )
+            if ( !d->arch.vmca_msrs->nr_injection )
             {
                 printk(XENLOG_WARNING "MCE: ret from vMCE#, "
                        "no injection node\n");
                 goto end;
             }

-            d->arch.vmca_msrs.nr_injection--;
-            if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
+            d->arch.vmca_msrs->nr_injection--;
+            if ( !list_empty(&d->arch.vmca_msrs->impact_header) )
             {
                 struct bank_entry *entry;

-                entry = list_entry(d->arch.vmca_msrs.impact_header.next,
+                entry = list_entry(d->arch.vmca_msrs->impact_header.next,
                                    struct bank_entry, list);
                 gdprintk(XENLOG_DEBUG, "MCE: delete last injection node\n");
                 list_del(&entry->list);
@@ -2968,7 +2969,7 @@ void async_exception_cleanup(struct vcpu
                 printk(XENLOG_ERR "MCE: didn't found last injection node\n");

             /* further injection */
-            if ( d->arch.vmca_msrs.nr_injection > 0 &&
+            if ( d->arch.vmca_msrs->nr_injection > 0 &&
                  guest_has_trap_callback(d, 0, TRAP_machine_check) &&
                  !test_and_set_bool(curr->mce_pending) )
             {
diff -r 7ee8bb40200a -r b4fd50c22d9c xen/common/domain.c
--- a/xen/common/domain.c       Thu Apr 15 19:11:16 2010 +0100
+++ b/xen/common/domain.c       Fri Apr 16 18:55:03 2010 +0800
@@ -616,6 +616,8 @@ static void complete_domain_destroy(stru

     xfree(d->pirq_mask);
     xfree(d->pirq_to_evtchn);
+    xfree(dom_vmce(d)->mci_ctl);
+    xfree(dom_vmce(d));

     xsm_free_security_domain(d);
     free_domain_struct(d);
diff -r 7ee8bb40200a -r b4fd50c22d9c xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Thu Apr 15 19:11:16 2010 +0100
+++ b/xen/include/asm-x86/domain.h      Fri Apr 16 18:55:03 2010 +0800
@@ -6,6 +6,7 @@
 #include <asm/hvm/vcpu.h>
 #include <asm/hvm/domain.h>
 #include <asm/e820.h>
+#include <asm/mce.h>
 #include <public/vcpu.h>

 #define has_32bit_shinfo(d)    ((d)->arch.has_32bit_shinfo)
@@ -214,32 +215,6 @@ typedef xen_domctl_cpuid_t cpuid_input_t
 typedef xen_domctl_cpuid_t cpuid_input_t;

 struct p2m_domain;
-
-/* Define for GUEST MCA handling */
-#define MAX_NR_BANKS 30
-
-/* This entry is for recording bank nodes for the impacted domain,
- * put into impact_header list. */
-struct bank_entry {
-    struct list_head list;
-    uint16_t bank;
-    uint64_t mci_status;
-    uint64_t mci_addr;
-    uint64_t mci_misc;
-};
-
-struct domain_mca_msrs
-{
-    /* Guest should not change below values after DOM boot up */
-    uint64_t mcg_cap;
-    uint64_t mcg_ctl;
-    uint64_t mcg_status;
-    uint64_t mci_ctl[MAX_NR_BANKS];
-    uint16_t nr_injection;
-    struct list_head impact_header;
-    spinlock_t lock;
-};
-
 struct time_scale {
     int shift;
     u32 mul_frac;
@@ -311,7 +286,7 @@ struct arch_domain
     cpuid_input_t cpuids[MAX_CPUID_INPUT];

     /* For Guest vMCA handling */
-    struct domain_mca_msrs vmca_msrs;
+    struct domain_mca_msrs *vmca_msrs;

     /* TSC management (emulation, pv, scaling, stats) */
     int tsc_mode;            /* see include/asm-x86/time.h */
diff -r 7ee8bb40200a -r b4fd50c22d9c xen/include/asm-x86/mce.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/mce.h Fri Apr 16 18:55:03 2010 +0800
@@ -0,0 +1,48 @@
+#ifndef _XEN_X86_MCE_H
+#define _XEN_X86_MCE_H
+
+#include <xen/types.h>
+#include <xen/list.h>
+#include <xen/spinlock.h>
+#include <public/arch-x86/xen-mca.h>
+
+struct domain;
+
+/* Define for GUEST MCA handling */
+#define MAX_NR_BANKS 30
+
+/* This entry is for recording bank nodes for the impacted domain,
+ * put into impact_header list. */
+struct bank_entry {
+    struct list_head list;
+    uint16_t bank;
+    uint64_t mci_status;
+    uint64_t mci_addr;
+    uint64_t mci_misc;
+};
+
+/* Per-domain virtual MCA MSR state, reached through dom_vmce(d). */
+struct domain_mca_msrs
+{
+    /* Guest should not change below values after DOM boot up */
+    uint64_t mcg_cap;
+    uint64_t mcg_ctl;
+    uint64_t mcg_status;
+    /* Per-bank control values; pointer is xfree()d on domain destroy */
+    uint64_t *mci_ctl;
+    /* Count of vMCE# injections still outstanding for the domain */
+    uint16_t nr_injection;
+    /* Queue of struct bank_entry nodes; new nodes go to the tail */
+    struct list_head impact_header;
+    /* Held while queueing a node and updating mcg_status/nr_injection */
+    spinlock_t lock;
+};
+
+/* Accessor for a domain's struct domain_mca_msrs pointer */
+#define dom_vmce(x)   ((x)->arch.vmca_msrs)
+
+/* Guest vMCE MSRs virtualization */
+extern int vmce_init_msr(struct domain *d);
+extern int vmce_wrmsr(uint32_t msr, uint64_t val);
+extern int vmce_rdmsr(uint32_t msr, uint64_t *val);
+#endif /* _XEN_X86_MCE_H */
diff -r 7ee8bb40200a -r b4fd50c22d9c xen/include/asm-x86/traps.h
--- a/xen/include/asm-x86/traps.h       Thu Apr 15 19:11:16 2010 +0100
+++ b/xen/include/asm-x86/traps.h       Fri Apr 16 18:55:03 2010 +0800
@@ -49,9 +49,4 @@ extern int send_guest_trap(struct domain
 extern int send_guest_trap(struct domain *d, uint16_t vcpuid,
                                unsigned int trap_nr);

-/* Guest vMCE MSRs virtualization */
-extern void mce_init_msr(struct domain *d);
-extern int mce_wrmsr(uint32_t msr, uint64_t val);
-extern int mce_rdmsr(uint32_t msr, uint64_t *val);
-
 #endif /* ASM_TRAP_H */



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.