[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] mca: Fix several issues for MCA UCR error handling



# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1253605052 -3600
# Node ID 8c4685fc198ef4b5ea8accf30cb0b6b828cef54f
# Parent  bcb6b95b30b13efa9635f8b8e1b7ff57c50dae3d
mca: Fix several issues for MCA UCR error handling

This patch fixes several issues in MCA UCR error handling on the
latest Intel platforms, including:
1) For UCR error, the MCA error code is 0xC0 ~ 0xCF instead of just 0xC0
2) Synchronization issues when clearing the error-finding flag and the
global MCIP flag. Without this fix, in some cases the MCIP flag can't be cleared.

Signed-off-by: Liping Ke <liping.ke@xxxxxxxxx>
---
 xen/arch/x86/cpu/mcheck/mce_intel.c |   73 +++++++++++++++---------------------
 1 files changed, 32 insertions(+), 41 deletions(-)

diff -r bcb6b95b30b1 -r 8c4685fc198e xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Sep 22 08:36:40 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Sep 22 08:37:32 2009 +0100
@@ -45,7 +45,6 @@ static atomic_t found_error = ATOMIC_INI
 
 static void mce_barrier_enter(struct mce_softirq_barrier *);
 static void mce_barrier_exit(struct mce_softirq_barrier *);
-static int mce_barrier_last(struct mce_softirq_barrier *);
 
 #ifdef CONFIG_X86_MCE_THERMAL
 static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
@@ -339,7 +338,7 @@ void intel_UCR_handler(struct mcinfo_ban
     unsigned long mfn, gfn;
     uint32_t status;
 
-    printk(KERN_DEBUG "MCE: Enter EWB UCR recovery action\n");
+    printk(KERN_DEBUG "MCE: Enter UCR recovery action\n");
     result->result = MCA_NEED_RESET;
     if (bank->mc_addr != 0) {
          mfn = bank->mc_addr >> PAGE_SHIFT;
@@ -430,8 +429,10 @@ static int mce_action(mctelem_cookie_t m
         /* TODO: Add recovery actions here, such as page-offline, etc */
         memset(&mca_res, 0x0f, sizeof(mca_res));
         for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) {
-            if ( (mc_bank->mc_status & 0xffff) == 
-                        intel_recovery_handler[i].mca_code ) {
+            if ( ((mc_bank->mc_status & 0xffff) ==
+                        intel_recovery_handler[i].mca_code) ||
+                  ((mc_bank->mc_status & 0xfff0) ==
+                        intel_recovery_handler[i].mca_code)) {
                 /* For SRAR, OVER = 1 should have caused reset
                  * For SRAO, OVER = 1 skip recovery action, continue execution
                  */
@@ -439,10 +440,10 @@ static int mce_action(mctelem_cookie_t m
                     intel_recovery_handler[i].recovery_handler
                                 (mc_bank, mc_global, NULL, &mca_res);
                 else {
-                   if (!mc_global->mc_gstatus & MCG_STATUS_RIPV)
+                   if (!(mc_global->mc_gstatus & MCG_STATUS_RIPV))
                        mca_res.result = MCA_NEED_RESET;
                    else
-                       mca_res.result = MCA_NO_ACTION; 
+                       mca_res.result = MCA_NO_ACTION;
                 }
                 if (mca_res.result & MCA_OWNER)
                     mc_bank->mc_domid = mca_res.owner;
@@ -458,13 +459,14 @@ static int mce_action(mctelem_cookie_t m
                                 "recover action, RIPV=1, let it be.\n");
                 break;
             }
-            /* For SRAR, no defined recovery action should have caused reset
-             * in MCA Handler
-             */
-            if ( i >= INTEL_MAX_RECOVERY )
-                printk(KERN_DEBUG "MCE: No software recovery action found for "
-                                "this SRAO error\n");
         }
+        /* For SRAR, no defined recovery action should have caused reset
+         * in MCA Handler
+         */
+        if ( i >= INTEL_MAX_RECOVERY )
+            printk(KERN_DEBUG "MCE: No software recovery action found for "
+                            "this SRAO error\n");
+
     }
     return 1;
 }
@@ -622,16 +624,6 @@ static void mce_barrier_exit(struct mce_
       }
 }
 
-static int mce_barrier_last(struct mce_softirq_barrier *bar)
-{
-    int gen = atomic_read(&bar->ingen);
-    if ( atomic_read(&bar->ingen) == gen &&
-        atomic_read(&bar->val) == 1 ) {
-        return 1;
-    }
-    return 0;
-}
-
 #if 0
 static void mce_barrier(struct mce_softirq_barrier *bar)
 {
@@ -645,7 +637,7 @@ static void intel_machine_check(struct c
     uint64_t gstatus;
     mctelem_cookie_t mctc = NULL;
     struct mca_summary bs;
-    cpu_banks_t clear_bank; 
+    cpu_banks_t clear_bank;
 
     mce_spin_lock(&mce_logout_lock);
 
@@ -677,9 +669,11 @@ static void intel_machine_check(struct c
         }
         atomic_set(&found_error, 1);
 
-        printk(KERN_DEBUG "MCE: clear_bank map %lx\n", 
-                *((unsigned long*)clear_bank));
+        printk(KERN_DEBUG "MCE: clear_bank map %lx on CPU%d\n",
+                *((unsigned long*)clear_bank), smp_processor_id());
         mcheck_mca_clearbanks(clear_bank);
+       /* Print MCE error */
+        x86_mcinfo_dump(mctelem_dataptr(mctc));
 
     } else {
         if (mctc != NULL)
@@ -692,29 +686,26 @@ static void intel_machine_check(struct c
      */
     mce_barrier_enter(&mce_trap_bar);
     /* According to latest MCA OS writer guide, if no error bank found
-     * on all cpus, something unexpected happening, we can't do any 
+     * on all cpus, something unexpected happening, we can't do any
      * recovery job but to reset the system.
      */
     if (atomic_read(&found_error) == 0)
         mc_panic("Unexpected condition for the MCE handler, need reset\n");
-    if (mce_barrier_last(&mce_trap_bar)) {
-        printk(KERN_DEBUG "Choose one CPU to clear error finding flag\n ");
+    mce_barrier_exit(&mce_trap_bar);
+
+    /* Clear error finding flags after all cpus finishes above judgement */
+    mce_barrier_enter(&mce_trap_bar);
+    if (atomic_read(&found_error)) {
+        printk(KERN_DEBUG "MCE: Choose one CPU "
+                       "to clear error finding flag\n ");
         atomic_set(&found_error, 0);
     }
+    mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+    if ((gstatus & MCG_STATUS_MCIP) != 0) {
+        printk(KERN_DEBUG "MCE: Clear MCIP@ last step");
+        mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
+    }
     mce_barrier_exit(&mce_trap_bar);
-
-    /*
-     * Clear MCIP if it wasn't already. There is a small
-     * chance that more than 1 CPU will end up doing this,
-     * but that's OK.
-     */
-    if (bs.errcnt) {
-        mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
-        if ((gstatus & MCG_STATUS_MCIP) != 0)
-            mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
-        /* Print MCE error */
-        x86_mcinfo_dump(mctelem_dataptr(mctc));
-    }
 
     raise_softirq(MACHINE_CHECK_SOFTIRQ);
 }

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.