[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 5/6 ]Clean-up the mc_panic handler


  • To: Keir Fraser <keir.fraser@xxxxxxxxxxxxx>
  • From: "Jiang, Yunhong" <yunhong.jiang@xxxxxxxxx>
  • Date: Wed, 9 Jun 2010 22:32:17 +0800
  • Accept-language: en-US
  • Acceptlanguage: en-US
  • Cc: xen-devel <xen-devel@xxxxxxxxxxxxxxxxxxx>
  • Delivery-date: Wed, 09 Jun 2010 07:40:32 -0700
  • List-id: Xen developer discussion <xen-devel.lists.xensource.com>
  • Thread-index: AcsH4I+wIMmcoSrOTH+VPZ/xtu/BUA==
  • Thread-topic: [PATCH 5/6 ]Clean-up the mc_panic handler

Clean-up the mc_panic handler.

Firstly, mc_panic should only be called on one CPU, to avoid the printk output
being mixed.
Secondly, call the urgent handler in the MCE panic path so that the error-code
specific handler gets involved.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>

mce_intel.c |   94 ++++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 76 insertions(+), 18 deletions(-)

diff -r c436e1758236 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Wed Jun 09 17:29:47 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Wed Jun 09 17:40:14 2010 +0800
@@ -150,6 +150,7 @@ static DEFINE_SPINLOCK(mce_logout_lock);
 
 static atomic_t severity_cpu = ATOMIC_INIT(-1);
 static atomic_t found_error = ATOMIC_INIT(0);
+static cpumask_t mce_fatal_cpus;
 
 static void mce_barrier_enter(struct mce_softirq_barrier *);
 static void mce_barrier_exit(struct mce_softirq_barrier *);
@@ -320,6 +321,27 @@ static void mce_softirq(void)
     }
 
     mce_barrier_exit(&mce_inside_bar);
+}
+
+/*
+ * Run the urgent MCE action on the telemetry cookie mctc.  Return:
+ * -1: if the system can't be recovered (the caller marks this CPU fatal)
+ * 0: continue to the next step (also returned when mctc is NULL)
+ */
+static int mce_urgent_action(struct cpu_user_regs *regs,
+                              mctelem_cookie_t mctc)
+{
+    uint64_t gstatus;
+
+    if ( mctc == NULL)
+        return 0;
+
+    mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+    /* RIPV clear while in Xen: can't resume, as Xen is not pre-emptible */
+    if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
+        return -1;
+
+    return mce_action(regs, mctc) == MCER_RESET ? -1 : 0;
 }
 
 /* Machine Check owner judge algorithm:
@@ -693,6 +715,31 @@ struct mca_error_handler intel_mce_dhand
 struct mca_error_handler intel_mce_dhandlers[] =
             {{is_async_memerr, intel_memerr_dhandler}, {default_check, 
intel_default_dhandler}};
 
+static void intel_default_uhandler(int bnum,
+             struct mca_binfo *binfo,
+             struct mca_handle_result *result)
+{
+    uint64_t status = binfo->mib->mc_status;
+    enum intel_mce_type type;
+
+    type = intel_check_mce_type(status);
+
+    switch (type)
+    {
+    /* No specific handler took an SRAR or fatal error: ask for MCA_RESET */
+    case intel_mce_ucr_srar:
+    case intel_mce_fatal:
+        result->result = MCA_RESET;
+        break;
+    default:
+        result->result = MCA_NO_ACTION;
+        break;
+    }
+}
+
+struct mca_error_handler intel_mce_uhandlers[] =
+            {{default_check, intel_default_uhandler}};
+
 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
 {
     uint64_t gstatus;
@@ -724,17 +771,16 @@ static void intel_machine_check(struct c
              * clearing  the banks, and deal with the telemetry after reboot
              * (the MSRs are sticky)
              */
-            if (bs.pcc)
-                mc_panic("State lost due to machine check exception.\n");
-            if (!bs.ripv)
-                mc_panic("RIPV =0 can't resume execution!\n");
-            if (!bs.recoverable)
-                mc_panic("Machine check exception software recovery fail.\n");
+            if (bs.pcc || !bs.recoverable)
+                cpu_set(smp_processor_id(), mce_fatal_cpus);
         } else {
             if (mctc != NULL)
                 mctelem_commit(mctc);
         }
         atomic_set(&found_error, 1);
+
+        /* The last CPU will be take check/clean-up etc */
+        atomic_set(&severity_cpu, smp_processor_id());
 
         mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
                 *((unsigned long*)clear_bank), smp_processor_id());
@@ -745,25 +791,35 @@ static void intel_machine_check(struct c
     }
     mce_spin_unlock(&mce_logout_lock);
 
+    mce_barrier_enter(&mce_trap_bar);
+    if ( mctc != NULL && mce_urgent_action(regs, mctc))
+        cpu_set(smp_processor_id(), mce_fatal_cpus);
+    mce_barrier_exit(&mce_trap_bar);
     /*
      * Wait until everybody has processed the trap.
      */
     mce_barrier_enter(&mce_trap_bar);
-    /* According to latest MCA OS writer guide, if no error bank found
-     * on all cpus, something unexpected happening, we can't do any
-     * recovery job but to reset the system.
-     */
-    if (atomic_read(&found_error) == 0)
-        mc_panic("Unexpected condition for the MCE handler, need reset\n");
+    if (atomic_read(&severity_cpu) == smp_processor_id())
+    {
+        /* According to SDM, if no error bank found on any cpus,
+         * something unexpected happening, we can't do any
+         * recovery job but to reset the system.
+         */
+        if (atomic_read(&found_error) == 0)
+            mc_panic("MCE: No CPU found valid MCE, need reset\n");
+        if (!cpus_empty(mce_fatal_cpus))
+        {
+            char *ebufp, ebuf[96] = "MCE: Fatal error happened on CPUs ";
+            ebufp = ebuf + strlen(ebuf);
+            cpumask_scnprintf(ebufp, 95 - strlen(ebuf), mce_fatal_cpus);
+            mc_panic(ebuf);
+        }
+        atomic_set(&found_error, 0);
+    }
     mce_barrier_exit(&mce_trap_bar);
 
-    /* Clear error finding flags after all cpus finishes above judgement */
+    /* Clear flags after above fatal check */
     mce_barrier_enter(&mce_trap_bar);
-    if (atomic_read(&found_error)) {
-        mce_printk(MCE_CRITICAL, "MCE: Choose one CPU "
-                   "to clear error finding flag\n ");
-        atomic_set(&found_error, 0);
-    }
     mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
     if ((gstatus & MCG_STATUS_MCIP) != 0) {
         mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step");
@@ -1158,6 +1214,8 @@ static void intel_init_mce(void)
 
     mce_dhandlers = intel_mce_dhandlers;
     mce_dhandler_num = sizeof(intel_mce_dhandlers)/sizeof(struct 
mca_error_handler);
+    mce_uhandlers = intel_mce_uhandlers;
+    mce_uhandler_num = sizeof(intel_mce_uhandlers)/sizeof(struct 
mca_error_handler);
 }
 
 static int intel_init_mca_banks(void)


Attachment: mc_panic.patch
Description: mc_panic.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.