[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] Implement page offline recovery action for AMD


  • To: xen-changelog@xxxxxxxxxxxxxxxxxxx
  • From: Xen patchbot-unstable <patchbot@xxxxxxx>
  • Date: Wed, 10 Oct 2012 22:11:19 +0000
  • Delivery-date: Wed, 10 Oct 2012 22:11:30 +0000
  • List-id: "Change log for Mercurial \(receive only\)" <xen-changelog.lists.xen.org>

# HG changeset patch
# User Christoph Egger <Christoph.Egger@xxxxxxx>
# Date 1349783497 -3600
# Node ID 142e4577f5a9b95832b82f7b6d31fde1697cbe76
# Parent  cb1382bdaad9e11683a0329d18cef60ee85360c1
Implement page offline recovery action for AMD

Signed-off-by: Christoph Egger <Christoph.Egger@xxxxxxx>
Committed-by: Keir Fraser <keir@xxxxxxx>
---


diff -r cb1382bdaad9 -r 142e4577f5a9 xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile  Tue Oct 09 12:46:27 2012 +0100
+++ b/xen/arch/x86/cpu/mcheck/Makefile  Tue Oct 09 12:51:37 2012 +0100
@@ -3,6 +3,7 @@ obj-y += k7.o
 obj-y += amd_k8.o
 obj-y += amd_f10.o
 obj-y += mce_amd.o
+obj-y += mcaction.o
 obj-y += barrier.o
 obj-y += mctelem.o
 obj-y += mce.o
diff -r cb1382bdaad9 -r 142e4577f5a9 xen/arch/x86/cpu/mcheck/amd_f10.c
--- a/xen/arch/x86/cpu/mcheck/amd_f10.c Tue Oct 09 12:46:27 2012 +0100
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c Tue Oct 09 12:51:37 2012 +0100
@@ -44,6 +44,7 @@
 #include "mce_quirks.h"
 #include "x86_mca.h"
 #include "mce_amd.h"
+#include "mcaction.h"
 
 static struct mcinfo_extended *
 amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
@@ -97,6 +98,7 @@ enum mcheck_type amd_f10_mcheck_init(str
 
        x86_mce_callback_register(amd_f10_handler);
        mce_recoverable_register(mc_amd_recoverable_scan);
+       mce_register_addrcheck(mc_amd_addrcheck);
 
        return mcheck_amd_famXX;
 }
diff -r cb1382bdaad9 -r 142e4577f5a9 xen/arch/x86/cpu/mcheck/mcaction.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/mcaction.c        Tue Oct 09 12:51:37 2012 +0100
@@ -0,0 +1,173 @@
+#include <xen/types.h>
+#include <xen/sched.h>
+#include "mcaction.h"
+#include "vmce.h"
+#include "mce.h"
+
+/*
+ * Append a "page offline" recovery record to the telemetry in @mi.
+ *
+ * @bank:   MCA bank number stored in the record.
+ * @mi:     telemetry buffer to extend; NULL means nothing is recorded.
+ * @mfn:    machine frame number stored in the record.
+ * @status: page-offline status word stored in the record.
+ *
+ * Returns the new record, or NULL if @mi is NULL or no space could be
+ * reserved (in which case @mi is flagged MCINFO_FLAGS_UNCOMPLETE).
+ */
+static struct mcinfo_recovery *
+mci_action_add_pageoffline(int bank, struct mc_info *mi,
+                       uint64_t mfn, uint32_t status)
+{
+    struct mcinfo_recovery *rec;
+
+    if (!mi)
+        return NULL;
+
+    rec = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_recovery));
+    if (!rec) {
+        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
+        return NULL;
+    }
+
+    memset(rec, 0, sizeof(struct mcinfo_recovery));
+
+    rec->common.type = MC_TYPE_RECOVERY;
+    rec->common.size = sizeof(*rec);
+    rec->mc_bank = bank;
+    rec->action_types = MC_ACTION_PAGE_OFFLINE;
+    rec->action_info.page_retire.mfn = mfn;
+    rec->action_info.page_retire.status = status;
+    return rec;
+}
+
+/*
+ * Vendor callback that tells whether the address logged for a bank is of
+ * the requested kind (MC_ADDR_PHYSICAL or MC_ADDR_VIRTUAL).  It stays NULL
+ * until mce_register_addrcheck() has been called.
+ */
+mce_check_addr_t mc_check_addr = NULL;
+
+/* Install @cbfunc as the address-type check used by the MCE code. */
+void mce_register_addrcheck(mce_check_addr_t cbfunc)
+{
+    mc_check_addr = cbfunc;
+}
+
+/*
+ * Recovery action for a memory error: take the affected page offline and,
+ * if a domain other than Xen owns it, unmap it from that domain and inject
+ * a vMCE.
+ *
+ * @binfo:  supplies the bank record (mib), the global record (mig), the
+ *          bank number and the telemetry buffer (mi).
+ * @result: set to MCER_RECOVERED or MCER_CONTINUE where the outcome is
+ *          known; left untouched otherwise.
+ * @regs:   not used by this handler.
+ *
+ * If any step of notifying the owning domain fails, that domain is crashed.
+ */
+void
+mc_memerr_dhandler(struct mca_binfo *binfo,
+                   enum mce_result *result,
+                   struct cpu_user_regs *regs)
+{
+    struct mcinfo_bank *bank = binfo->mib;
+    struct mcinfo_global *global = binfo->mig;
+    struct domain *d;
+    unsigned long mfn, gfn;
+    uint32_t status;
+    uint16_t vmce_vcpuid;
+
+    /* mc_check_addr is NULL until a vendor has registered its callback. */
+    if (!mc_check_addr ||
+        !mc_check_addr(bank->mc_status, bank->mc_misc, MC_ADDR_PHYSICAL)) {
+        dprintk(XENLOG_WARNING,
+            "No physical address provided for memory error\n");
+        return;
+    }
+
+    mfn = bank->mc_addr >> PAGE_SHIFT;
+    if (offline_page(mfn, 1, &status))
+    {
+        dprintk(XENLOG_WARNING,
+                "Failed to offline page %lx for MCE error\n", mfn);
+        return;
+    }
+
+    mci_action_add_pageoffline(binfo->bank, binfo->mi, mfn, status);
+
+    /* This is a free page */
+    if (status & PG_OFFLINE_OFFLINED)
+        *result = MCER_RECOVERED;
+    else if (status & PG_OFFLINE_AGAIN)
+        *result = MCER_CONTINUE;
+    else if (status & PG_OFFLINE_PENDING) {
+        /* This page has an owner */
+        if (status & PG_OFFLINE_OWNED) {
+            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
+            mce_printk(MCE_QUIET, "MCE: This error page is ownded"
+              " by DOM %d\n", bank->mc_domid);
+            /* XXX: Cannot handle shared pages yet
+             * (this should identify all domains and gfn mapping to
+             *  the mfn in question) */
+            BUG_ON( bank->mc_domid == DOMID_COW );
+            if ( bank->mc_domid != DOMID_XEN ) {
+                d = get_domain_by_id(bank->mc_domid);
+                ASSERT(d);
+                gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
+
+                if ( !is_vmce_ready(bank, d) )
+                {
+                    printk("DOM%d not ready for vMCE\n", d->domain_id);
+                    goto vmce_failed;
+                }
+
+                if ( unmmap_broken_page(d, _mfn(mfn), gfn) )
+                {
+                    printk("Unmap broken memory %lx for DOM%d failed\n",
+                            mfn, d->domain_id);
+                    goto vmce_failed;
+                }
+
+                /* Replace the frame number with gfn, keep the page offset. */
+                bank->mc_addr = gfn << PAGE_SHIFT |
+                  (bank->mc_addr & (PAGE_SIZE -1 ));
+                if ( fill_vmsr_data(bank, d,
+                      global->mc_gstatus) == -1 )
+                {
+                    mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
+                      "failed\n", bank->mc_domid);
+                    goto vmce_failed;
+                }
+
+                /* Broadcast on Intel; otherwise target the recorded vcpu. */
+                if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+                    vmce_vcpuid = VMCE_INJECT_BROADCAST;
+                else
+                    vmce_vcpuid = global->mc_vcpuid;
+
+                /* We will inject vMCE to DOMU*/
+                if ( inject_vmce(d, vmce_vcpuid) < 0 )
+                {
+                    mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
+                      " failed\n", d->domain_id);
+                    goto vmce_failed;
+                }
+
+                /* Impacted domain go on with domain's recovery job
+                 * if the domain has its own MCA handler.
+                 * For xen, it has contained the error and finished
+                 * its own recovery job.
+                 */
+                *result = MCER_RECOVERED;
+                put_domain(d);
+
+                return;
+vmce_failed:
+                /* Crash the domain before dropping our reference to it. */
+                domain_crash(d);
+                put_domain(d);
+            }
+        }
+    }
+}
diff -r cb1382bdaad9 -r 142e4577f5a9 xen/arch/x86/cpu/mcheck/mcaction.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/mcaction.h        Tue Oct 09 12:51:37 2012 +0100
@@ -0,0 +1,20 @@
+#ifndef _MCHECK_ACTION_H
+#define _MCHECK_ACTION_H
+
+#include <xen/types.h>
+#include "x86_mca.h"
+
+void
+mc_memerr_dhandler(struct mca_binfo *binfo,
+                   enum mce_result *result,
+                   struct cpu_user_regs *regs);
+
+#define MC_ADDR_PHYSICAL  0
+#define MC_ADDR_VIRTUAL   1
+
+typedef int (*mce_check_addr_t)(uint64_t status, uint64_t misc, int addr_type);
+extern void mce_register_addrcheck(mce_check_addr_t);
+
+extern mce_check_addr_t mc_check_addr;
+
+#endif
diff -r cb1382bdaad9 -r 142e4577f5a9 xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c     Tue Oct 09 12:46:27 2012 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce.c     Tue Oct 09 12:51:37 2012 +0100
@@ -24,6 +24,7 @@
 
 #include "mce.h"
 #include "barrier.h"
+#include "mcaction.h"
 #include "util.h"
 #include "vmce.h"
 
@@ -216,7 +217,7 @@ static void mca_init_bank(enum mca_sourc
 
     if ((mib->mc_status & MCi_STATUS_MISCV) &&
         (mib->mc_status & MCi_STATUS_ADDRV) &&
-        ((mib->mc_misc & MCi_MISC_ADDRMOD_MASK) == MCi_MISC_PHYSMOD) && 
+        (mc_check_addr(mib->mc_status, mib->mc_misc, MC_ADDR_PHYSICAL)) &&
         (who == MCA_POLLER || who == MCA_CMCI_HANDLER) &&
         (mfn_valid(paddr_to_pfn(mib->mc_addr))))
     {
diff -r cb1382bdaad9 -r 142e4577f5a9 xen/arch/x86/cpu/mcheck/mce_amd.c
--- a/xen/arch/x86/cpu/mcheck/mce_amd.c Tue Oct 09 12:46:27 2012 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_amd.c Tue Oct 09 12:51:37 2012 +0100
@@ -25,6 +25,7 @@
 #include "mce.h"
 #include "x86_mca.h"
 #include "mce_amd.h"
+#include "mcaction.h"
 
 /* Error Code Types */
 enum mc_ec_type {
@@ -75,3 +76,25 @@ mc_amd_recoverable_scan(uint64_t status)
 
     return ret;
 }
+
+/*
+ * Report whether the address logged for this error is of kind @addrtype
+ * (MC_ADDR_PHYSICAL or MC_ADDR_VIRTUAL), based on the error code type
+ * decoded from @status.  @misc is not consulted.  Returns non-zero on match.
+ */
+int
+mc_amd_addrcheck(uint64_t status, uint64_t misc, int addrtype)
+{
+    uint16_t code = status & (MCi_STATUS_MCA | MCi_STATUS_MSEC);
+    enum mc_ec_type kind = mc_ec2type(code);
+
+    /* Bus and memory errors: value in addr MSR is physical. */
+    if (kind == MC_EC_BUS_TYPE || kind == MC_EC_MEM_TYPE)
+        return (addrtype == MC_ADDR_PHYSICAL);
+    /* TLB errors: value in addr MSR is virtual. */
+    if (kind == MC_EC_TLB_TYPE)
+        return (addrtype == MC_ADDR_VIRTUAL);
+
+    BUG(); /* unreached */
+    return 0;
+}
diff -r cb1382bdaad9 -r 142e4577f5a9 xen/arch/x86/cpu/mcheck/mce_amd.h
--- a/xen/arch/x86/cpu/mcheck/mce_amd.h Tue Oct 09 12:46:27 2012 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_amd.h Tue Oct 09 12:51:37 2012 +0100
@@ -2,5 +2,6 @@
 #define _MCHECK_AMD_H
 
 int mc_amd_recoverable_scan(uint64_t status);
+int mc_amd_addrcheck(uint64_t status, uint64_t misc, int addrtype);
 
 #endif
diff -r cb1382bdaad9 -r 142e4577f5a9 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Oct 09 12:46:27 2012 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Oct 09 12:51:37 2012 +0100
@@ -19,6 +19,7 @@
 #include "barrier.h"
 #include "util.h"
 #include "vmce.h"
+#include "mcaction.h"
 
 DEFINE_PER_CPU(struct mca_banks *, mce_banks_owned);
 DEFINE_PER_CPU(struct mca_banks *, no_cmci_banks);
@@ -257,130 +258,13 @@ static enum intel_mce_type intel_check_m
     return intel_mce_fatal;
 }
 
-struct mcinfo_recovery *mci_add_pageoff_action(int bank, struct mc_info *mi,
-                              uint64_t mfn, uint32_t status)
-{
-    struct mcinfo_recovery *rec;
-
-    if (!mi)
-        return NULL;
-
-    rec = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_recovery));
-    if (!rec)
-    {
-        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
-        return NULL;
-    }
-
-    memset(rec, 0, sizeof(struct mcinfo_recovery));
-
-    rec->mc_bank = bank;
-    rec->action_types = MC_ACTION_PAGE_OFFLINE;
-    rec->action_info.page_retire.mfn = mfn;
-    rec->action_info.page_retire.status = status;
-    return rec;
-}
-
 static void intel_memerr_dhandler(
              struct mca_binfo *binfo,
              enum mce_result *result,
              struct cpu_user_regs *regs)
 {
-    struct mcinfo_bank *bank = binfo->mib;
-    struct mcinfo_global *global = binfo->mig;
-    struct domain *d;
-    unsigned long mfn, gfn;
-    uint32_t status;
-    uint64_t mc_status, mc_misc;
-
     mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
-
-    mc_status = bank->mc_status;
-    mc_misc = bank->mc_misc;
-    if (!(mc_status &  MCi_STATUS_ADDRV) ||
-        !(mc_status & MCi_STATUS_MISCV) ||
-        ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
-    {
-        dprintk(XENLOG_WARNING,
-            "No physical address provided for memory error\n");
-        return;
-    }
-
-    mfn = bank->mc_addr >> PAGE_SHIFT;
-    if (offline_page(mfn, 1, &status))
-    {
-        dprintk(XENLOG_WARNING,
-                "Failed to offline page %lx for MCE error\n", mfn);
-        return;
-    }
-
-    mci_add_pageoff_action(binfo->bank, binfo->mi, mfn, status);
-
-    /* This is free page */
-    if (status & PG_OFFLINE_OFFLINED)
-        *result = MCER_RECOVERED;
-    else if (status & PG_OFFLINE_AGAIN)
-        *result = MCER_CONTINUE;
-    else if (status & PG_OFFLINE_PENDING) {
-        /* This page has owner */
-        if (status & PG_OFFLINE_OWNED) {
-            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
-            mce_printk(MCE_QUIET, "MCE: This error page is ownded"
-              " by DOM %d\n", bank->mc_domid);
-            /* XXX: Cannot handle shared pages yet 
-             * (this should identify all domains and gfn mapping to
-             *  the mfn in question) */
-            BUG_ON( bank->mc_domid == DOMID_COW );
-            if ( bank->mc_domid != DOMID_XEN ) {
-                d = get_domain_by_id(bank->mc_domid);
-                ASSERT(d);
-                gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
-
-                if ( !is_vmce_ready(bank, d) )
-                {
-                    printk("DOM%d not ready for vMCE\n", d->domain_id);
-                    goto vmce_failed;
-                }
-
-                if ( unmmap_broken_page(d, _mfn(mfn), gfn) )
-                {
-                    printk("Unmap broken memory %lx for DOM%d failed\n",
-                            mfn, d->domain_id);
-                    goto vmce_failed;
-                }
-
-                bank->mc_addr =  gfn << PAGE_SHIFT |
-                  (bank->mc_addr & (PAGE_SIZE -1 ));
-                if ( fill_vmsr_data(bank, d,
-                      global->mc_gstatus) == -1 )
-                {
-                    mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
-                      "failed\n", bank->mc_domid);
-                    goto vmce_failed;
-                }
-
-                /* We will inject vMCE to DOMU*/
-                if ( inject_vmce(d, VMCE_INJECT_BROADCAST) < 0 )
-                {
-                    mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
-                      " failed\n", d->domain_id);
-                    goto vmce_failed;
-                }
-                /* Impacted domain go on with domain's recovery job
-                 * if the domain has its own MCA handler.
-                 * For xen, it has contained the error and finished
-                 * its own recovery job.
-                 */
-                *result = MCER_RECOVERED;
-                put_domain(d);
-
-                return;
-vmce_failed:
-                put_domain(d);
-                domain_crash(d);
-            }
-        }
-    }
+    mc_memerr_dhandler(binfo, result, regs);
 }
 
 static int intel_srar_check(uint64_t status)
@@ -388,6 +272,19 @@ static int intel_srar_check(uint64_t sta
     return ( intel_check_mce_type(status) == intel_mce_ucr_srar );
 }
 
+/* Non-zero iff @status/@misc describe an address of kind @addrtype. */
+static int intel_checkaddr(uint64_t status, uint64_t misc, int addrtype)
+{
+    /* Physical needs ADDRV and MISCV set and the MISC mode field == PHYSMOD. */
+    int physical = (status & MCi_STATUS_ADDRV) &&
+                   (status & MCi_STATUS_MISCV) &&
+                   ((misc & MCi_MISC_ADDRMOD_MASK) == MCi_MISC_PHYSMOD);
+
+    /* Anything else is reported as virtual. */
+    int expected = physical ? MC_ADDR_PHYSICAL : MC_ADDR_VIRTUAL;
+    return (addrtype == expected);
+}
+
 static void intel_srar_dhandler(
              struct mca_binfo *binfo,
              enum mce_result *result,
@@ -882,6 +779,7 @@ static void intel_init_mce(void)
     x86_mce_vector_register(intel_machine_check);
     mce_recoverable_register(intel_recoverable_scan);
     mce_need_clearbank_register(intel_need_clearbank_scan);
+    mce_register_addrcheck(intel_checkaddr);
 
     mce_dhandlers = intel_mce_dhandlers;
     mce_dhandler_num = ARRAY_SIZE(intel_mce_dhandlers);

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.