[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] X86 MCE: Add SRAR handler



Jan,

I have updated my former patch slightly, as attached.
You had 2 main concerns about my former patch (listed below). I double-checked 
the Xen MCE code, and my responses are appended:

Concern 1: for an SRAR IFU error, since RIPV=EIPV=0, it may be an async error 
which occurs in the guest but originates from the hypervisor.
[Jinsong]:
    Yes, but EIPV doesn't tell us where the error originates (it is just a 
hint, warning us that the error may be async).
    There is no need to kill Xen outright in the MCE ISR; instead, in the MCE 
softirq we can find out where the error originated and then handle it accordingly:
    * at mce isr:
            /* a total insurance */
            /* if error is async, we delay handle it at mce softirq */
            if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
                return -1;
    * at mce softirq:
            /* detect error location by bank->mc_addr */
            /* handle different page OWNER cases at intel_memerr_dhandler() and 
offline_page() */
            /* who own, who take */
            if (error page owner is guest)
                trigger vmce to guest;
            else
                panic xen;


Concern 2: If a guest accesses the hypervisor part of the GDT or page tables, 
or some other shared data structure owned by the hypervisor (like the M2P 
table), its handler may get utterly confused by being presented an address it 
doesn't own and knows nothing about.
[Jinsong]: in such cases, the page owner would be dom_xen/dom_cow or NULL, but 
not the guest --> the error would be handled by the hypervisor, without 
triggering a vMCE to the guest --> whoever owns the page handles the error.


Thanks,
Jinsong

=============================
X86 MCE: Add SRAR handler

Currently the Intel SDM defines 2 kinds of MCE SRAR errors:
1). Data Load error, error code = 0x134
2). Instruction Fetch error, error code = 0x150
This patch adds a handler for these new SRAR errors.
It is based on the existing MCE infrastructure, adding code to handle SRAR-specific errors.

Signed-off-by: Liu, Jinsong <jinsong.liu@xxxxxxxxx>

diff -r 1515484353c6 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Thu Oct 13 10:09:28 2011 +0200
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Sat Oct 22 01:40:41 2011 +0800
@@ -37,6 +37,14 @@ static int __read_mostly nr_intel_ext_ms
  */
 #define INTEL_SRAO_MEM_SCRUB 0xC0 ... 0xCF
 #define INTEL_SRAO_L3_EWB    0x17A
+
+/* 
+ * Currently Intel SDM define 2 kinds of srar errors:
+ * 1). Data Load error, error code = 0x134
+ * 2). Instruction Fetch error, error code = 0x150
+ */
+#define INTEL_SRAR_DATA_LOAD   0x134
+#define INTEL_SRAR_INSTR_FETCH 0x150
 
 /* Thermal Hanlding */
 #ifdef CONFIG_X86_MCE_THERMAL
@@ -256,7 +264,7 @@ static enum mce_result mce_action(struct
         for ( i = 0; i < handler_num; i++ ) {
             if (handlers[i].owned_error(binfo.mib->mc_status))
             {
-                handlers[i].recovery_handler(&binfo, &bank_result);
+                handlers[i].recovery_handler(&binfo, &bank_result, regs);
                 if (worst_result < bank_result)
                     worst_result = bank_result;
                 break;
@@ -622,7 +630,8 @@ struct mcinfo_recovery *mci_add_pageoff_
 
 static void intel_memerr_dhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
 {
     struct mcinfo_bank *bank = binfo->mib;
     struct mcinfo_global *global = binfo->mig;
@@ -721,6 +730,32 @@ vmce_failed:
     }
 }
 
+static int intel_srar_check(uint64_t status)
+{
+    return ( intel_check_mce_type(status) == intel_mce_ucr_srar );
+}
+
+static void intel_srar_dhandler(
+             struct mca_binfo *binfo,
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
+{
+    uint64_t status = binfo->mib->mc_status;
+
+    /* For unknown srar error code, reset system */
+    *result = MCER_RESET;
+
+    switch ( status & INTEL_MCCOD_MASK )
+    {
+    case INTEL_SRAR_DATA_LOAD:
+    case INTEL_SRAR_INSTR_FETCH:
+        intel_memerr_dhandler(binfo, result, regs);
+        break;
+    default:
+        break;
+    }
+}
+
 static int intel_srao_check(uint64_t status)
 {
     return ( intel_check_mce_type(status) == intel_mce_ucr_srao );
@@ -728,7 +763,8 @@ static int intel_srao_check(uint64_t sta
 
 static void intel_srao_dhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
 {
     uint64_t status = binfo->mib->mc_status;
 
@@ -741,7 +777,7 @@ static void intel_srao_dhandler(
         {
         case INTEL_SRAO_MEM_SCRUB:
         case INTEL_SRAO_L3_EWB:
-            intel_memerr_dhandler(binfo, result);
+            intel_memerr_dhandler(binfo, result, regs);
             break;
         default:
             break;
@@ -756,14 +792,15 @@ static int intel_default_check(uint64_t 
 
 static void intel_default_mce_dhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs * regs)
 {
     uint64_t status = binfo->mib->mc_status;
     enum intel_mce_type type;
 
     type = intel_check_mce_type(status);
 
-    if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
+    if (type == intel_mce_fatal)
         *result = MCER_RESET;
     else
         *result = MCER_CONTINUE;
@@ -771,12 +808,14 @@ static void intel_default_mce_dhandler(
 
 static const struct mca_error_handler intel_mce_dhandlers[] = {
     {intel_srao_check, intel_srao_dhandler},
+    {intel_srar_check, intel_srar_dhandler},
     {intel_default_check, intel_default_mce_dhandler}
 };
 
 static void intel_default_mce_uhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
 {
     uint64_t status = binfo->mib->mc_status;
     enum intel_mce_type type;
@@ -785,8 +824,6 @@ static void intel_default_mce_uhandler(
 
     switch (type)
     {
-    /* Panic if no handler for SRAR error */
-    case intel_mce_ucr_srar:
     case intel_mce_fatal:
         *result = MCER_RESET;
         break;
@@ -961,10 +998,8 @@ static int intel_recoverable_scan(u64 st
     /* SRAR error */
     else if ( ser_support && !(status & MCi_STATUS_OVER) 
                 && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
-                && (status & MCi_STATUS_AR) ) {
-        mce_printk(MCE_VERBOSE, "MCE: No SRAR error defined currently.\n");
-        return 0;
-    }
+                && (status & MCi_STATUS_AR) && (status & MCi_STATUS_EN) )
+        return 1;
     /* SRAO error */
     else if (ser_support && !(status & MCi_STATUS_PCC)
                 && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
diff -r 1515484353c6 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Thu Oct 13 10:09:28 2011 +0200
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Sat Oct 22 01:40:41 2011 +0800
@@ -151,7 +151,7 @@ struct mca_error_handler
     */
     int (*owned_error)(uint64_t status);
     void (*recovery_handler)(struct mca_binfo *binfo,
-                    enum mce_result *result);
+                    enum mce_result *result, struct cpu_user_regs *regs);
 };
 
 /* Global variables */

Attachment: srar-1.patch
Description: srar-1.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.