[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] [xen-unstable] mca: Fix several issues for MCA UCR error handling
# HG changeset patch # User Keir Fraser <keir.fraser@xxxxxxxxxx> # Date 1253605052 -3600 # Node ID 8c4685fc198ef4b5ea8accf30cb0b6b828cef54f # Parent bcb6b95b30b13efa9635f8b8e1b7ff57c50dae3d mca: Fix several issues for MCA UCR error handling This patch is for fixing several issues for MCA UCR error handling on latest Intel platforms, including: 1) For UCR error, the MCA error code is 0xC0 ~ 0xCF instead of just 0xC0 2) Synchronization issues for clearing error finding flag and clearing global MCIP flag. Otherwise, in some cases, MCIP flag can't be cleared. Signed-off-by: Liping Ke <liping.ke@xxxxxxxxx> --- xen/arch/x86/cpu/mcheck/mce_intel.c | 73 +++++++++++++++--------------------- 1 files changed, 32 insertions(+), 41 deletions(-) diff -r bcb6b95b30b1 -r 8c4685fc198e xen/arch/x86/cpu/mcheck/mce_intel.c --- a/xen/arch/x86/cpu/mcheck/mce_intel.c Tue Sep 22 08:36:40 2009 +0100 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Tue Sep 22 08:37:32 2009 +0100 @@ -45,7 +45,6 @@ static atomic_t found_error = ATOMIC_INI static void mce_barrier_enter(struct mce_softirq_barrier *); static void mce_barrier_exit(struct mce_softirq_barrier *); -static int mce_barrier_last(struct mce_softirq_barrier *); #ifdef CONFIG_X86_MCE_THERMAL static void unexpected_thermal_interrupt(struct cpu_user_regs *regs) @@ -339,7 +338,7 @@ void intel_UCR_handler(struct mcinfo_ban unsigned long mfn, gfn; uint32_t status; - printk(KERN_DEBUG "MCE: Enter EWB UCR recovery action\n"); + printk(KERN_DEBUG "MCE: Enter UCR recovery action\n"); result->result = MCA_NEED_RESET; if (bank->mc_addr != 0) { mfn = bank->mc_addr >> PAGE_SHIFT; @@ -430,8 +429,10 @@ static int mce_action(mctelem_cookie_t m /* TODO: Add recovery actions here, such as page-offline, etc */ memset(&mca_res, 0x0f, sizeof(mca_res)); for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) { - if ( (mc_bank->mc_status & 0xffff) == - intel_recovery_handler[i].mca_code ) { + if ( ((mc_bank->mc_status & 0xffff) == + intel_recovery_handler[i].mca_code) || + ((mc_bank->mc_status & 
0xfff0) == + intel_recovery_handler[i].mca_code)) { /* For SRAR, OVER = 1 should have caused reset * For SRAO, OVER = 1 skip recovery action, continue execution */ @@ -439,10 +440,10 @@ static int mce_action(mctelem_cookie_t m intel_recovery_handler[i].recovery_handler (mc_bank, mc_global, NULL, &mca_res); else { - if (!mc_global->mc_gstatus & MCG_STATUS_RIPV) + if (!(mc_global->mc_gstatus & MCG_STATUS_RIPV)) mca_res.result = MCA_NEED_RESET; else - mca_res.result = MCA_NO_ACTION; + mca_res.result = MCA_NO_ACTION; } if (mca_res.result & MCA_OWNER) mc_bank->mc_domid = mca_res.owner; @@ -458,13 +459,14 @@ static int mce_action(mctelem_cookie_t m "recover action, RIPV=1, let it be.\n"); break; } - /* For SRAR, no defined recovery action should have caused reset - * in MCA Handler - */ - if ( i >= INTEL_MAX_RECOVERY ) - printk(KERN_DEBUG "MCE: No software recovery action found for " - "this SRAO error\n"); } + /* For SRAR, no defined recovery action should have caused reset + * in MCA Handler + */ + if ( i >= INTEL_MAX_RECOVERY ) + printk(KERN_DEBUG "MCE: No software recovery action found for " + "this SRAO error\n"); + } return 1; } @@ -622,16 +624,6 @@ static void mce_barrier_exit(struct mce_ } } -static int mce_barrier_last(struct mce_softirq_barrier *bar) -{ - int gen = atomic_read(&bar->ingen); - if ( atomic_read(&bar->ingen) == gen && - atomic_read(&bar->val) == 1 ) { - return 1; - } - return 0; -} - #if 0 static void mce_barrier(struct mce_softirq_barrier *bar) { @@ -645,7 +637,7 @@ static void intel_machine_check(struct c uint64_t gstatus; mctelem_cookie_t mctc = NULL; struct mca_summary bs; - cpu_banks_t clear_bank; + cpu_banks_t clear_bank; mce_spin_lock(&mce_logout_lock); @@ -677,9 +669,11 @@ static void intel_machine_check(struct c } atomic_set(&found_error, 1); - printk(KERN_DEBUG "MCE: clear_bank map %lx\n", - *((unsigned long*)clear_bank)); + printk(KERN_DEBUG "MCE: clear_bank map %lx on CPU%d\n", + *((unsigned long*)clear_bank), smp_processor_id()); 
mcheck_mca_clearbanks(clear_bank); + /* Print MCE error */ + x86_mcinfo_dump(mctelem_dataptr(mctc)); } else { if (mctc != NULL) @@ -692,29 +686,26 @@ static void intel_machine_check(struct c */ mce_barrier_enter(&mce_trap_bar); /* According to latest MCA OS writer guide, if no error bank found - * on all cpus, something unexpected happening, we can't do any + * on all cpus, something unexpected happening, we can't do any * recovery job but to reset the system. */ if (atomic_read(&found_error) == 0) mc_panic("Unexpected condition for the MCE handler, need reset\n"); - if (mce_barrier_last(&mce_trap_bar)) { - printk(KERN_DEBUG "Choose one CPU to clear error finding flag\n "); + mce_barrier_exit(&mce_trap_bar); + + /* Clear error finding flags after all cpus finishes above judgement */ + mce_barrier_enter(&mce_trap_bar); + if (atomic_read(&found_error)) { + printk(KERN_DEBUG "MCE: Choose one CPU " + "to clear error finding flag\n "); atomic_set(&found_error, 0); } + mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus); + if ((gstatus & MCG_STATUS_MCIP) != 0) { + printk(KERN_DEBUG "MCE: Clear MCIP@ last step"); + mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP); + } mce_barrier_exit(&mce_trap_bar); - - /* - * Clear MCIP if it wasn't already. There is a small - * chance that more than 1 CPU will end up doing this, - * but that's OK. - */ - if (bs.errcnt) { - mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus); - if ((gstatus & MCG_STATUS_MCIP) != 0) - mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP); - /* Print MCE error */ - x86_mcinfo_dump(mctelem_dataptr(mctc)); - } raise_softirq(MACHINE_CHECK_SOFTIRQ); } _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |