[Xen-changelog] [xen-unstable] x86 mcheck: Replace hypervisor MCA telemetry structures with something
# HG changeset patch # User Keir Fraser <keir.fraser@xxxxxxxxxx> # Date 1237299770 0 # Node ID 9c1be8f2013be449a09f1af34a0b5c8820ce7c55 # Parent 0b1ce09f457762d93029b1b62e1139acb5fc92fd x86 mcheck: Replace hypervisor MCA telemetry structures with something more robust and designed to make terminal error telemetry available to the dom0 panic flow for diagnosis on reboot. Use common code for a lot of the AMD and Intel MCE handling code. Signed-off-by: Gavin Maltby <gavin.maltby@xxxxxxx> Signed-off-by: Frank van der Linden <frank.vanderlinden@xxxxxxx> --- xen/arch/x86/cpu/mcheck/Makefile | 1 xen/arch/x86/cpu/mcheck/amd_f10.c | 37 - xen/arch/x86/cpu/mcheck/amd_k8.c | 229 ------- xen/arch/x86/cpu/mcheck/amd_nonfatal.c | 146 +--- xen/arch/x86/cpu/mcheck/k7.c | 11 xen/arch/x86/cpu/mcheck/mce.c | 1001 +++++++++++++++++++-------------- xen/arch/x86/cpu/mcheck/mce.h | 98 ++- xen/arch/x86/cpu/mcheck/mce_intel.c | 391 +++--------- xen/arch/x86/cpu/mcheck/mctelem.c | 443 ++++++++++++++ xen/arch/x86/cpu/mcheck/mctelem.h | 71 ++ xen/arch/x86/cpu/mcheck/non-fatal.c | 87 +- xen/arch/x86/cpu/mcheck/p5.c | 15 xen/arch/x86/cpu/mcheck/winchip.c | 8 xen/arch/x86/cpu/mcheck/x86_mca.h | 8 xen/include/asm-x86/traps.h | 2 xen/include/public/arch-x86/xen-mca.h | 49 - 16 files changed, 1476 insertions(+), 1121 deletions(-) diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/Makefile --- a/xen/arch/x86/cpu/mcheck/Makefile Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/arch/x86/cpu/mcheck/Makefile Tue Mar 17 14:22:50 2009 +0000 @@ -2,6 +2,7 @@ obj-y += k7.o obj-y += k7.o obj-y += amd_k8.o obj-y += amd_f10.o +obj-y += mctelem.o obj-y += mce.o obj-y += mce_intel.o obj-y += non-fatal.o diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/amd_f10.c --- a/xen/arch/x86/cpu/mcheck/amd_f10.c Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/arch/x86/cpu/mcheck/amd_f10.c Tue Mar 17 14:22:50 2009 +0000 @@ -49,20 +49,21 @@ #include "x86_mca.h" -static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status) +static enum mca_extinfo +amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status) { struct mcinfo_extended mc_ext; /* Family 0x10 introduced additional MSR that belong to the * northbridge bank (4). */ - if (bank != 4) - return 0; + if (mi == NULL || bank != 4) + return MCA_EXTINFO_IGNORED; if (!(status & MCi_STATUS_VAL)) - return 0; + return MCA_EXTINFO_IGNORED; if (!(status & MCi_STATUS_MISCV)) - return 0; + return MCA_EXTINFO_IGNORED; memset(&mc_ext, 0, sizeof(mc_ext)); mc_ext.common.type = MC_TYPE_EXTENDED; @@ -78,23 +79,25 @@ static int amd_f10_handler(struct mc_inf rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value); x86_mcinfo_add(mi, &mc_ext); - return 1; + return MCA_EXTINFO_LOCAL; } extern void k8_machine_check(struct cpu_user_regs *regs, long error_code); /* AMD Family10 machine check */ -void amd_f10_mcheck_init(struct cpuinfo_x86 *c) +int amd_f10_mcheck_init(struct cpuinfo_x86 *c) { uint64_t value; uint32_t i; int cpu_nr; - machine_check_vector = k8_machine_check; - mc_callback_bank_extended = amd_f10_handler; + if (!cpu_has(c, X86_FEATURE_MCA)) + return 0; + + x86_mce_vector_register(k8_machine_check); + x86_mce_callback_register(amd_f10_handler); cpu_nr = smp_processor_id(); - wmb(); rdmsrl(MSR_IA32_MCG_CAP, value); if (value & MCG_CTL_P) /* Control register present ? 
*/ @@ -104,18 +107,9 @@ void amd_f10_mcheck_init(struct cpuinfo_ for (i = 0; i < nr_mce_banks; i++) { switch (i) { case 4: /* Northbridge */ - /* Enable error reporting of all errors, - * enable error checking and - * disable sync flooding */ - wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL); + /* Enable error reporting of all errors */ + wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL); wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL); - - /* XXX: We should write the value 0x1087821UL into - * to register F3x180 here, which sits in - * the PCI extended configuration space. - * Since this is not possible here, we can only hope, - * Dom0 is doing that. - */ break; default: @@ -128,4 +122,5 @@ void amd_f10_mcheck_init(struct cpuinfo_ set_in_cr4(X86_CR4_MCE); printk("CPU%i: AMD Family10h machine check reporting enabled.\n", cpu_nr); + return 1; } diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/amd_k8.c --- a/xen/arch/x86/cpu/mcheck/amd_k8.c Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/arch/x86/cpu/mcheck/amd_k8.c Tue Mar 17 14:22:50 2009 +0000 @@ -67,234 +67,27 @@ #include <asm/msr.h> #include "mce.h" -#include "x86_mca.h" /* Machine Check Handler for AMD K8 family series */ void k8_machine_check(struct cpu_user_regs *regs, long error_code) { - struct vcpu *vcpu = current; - struct domain *curdom; - struct mc_info *mc_data; - struct mcinfo_global mc_global; - struct mcinfo_bank mc_info; - uint64_t status, addrv, miscv, uc; - uint32_t i; - unsigned int cpu_nr; - uint32_t xen_impacted = 0; -#define DOM_NORMAL 0 -#define DOM0_TRAP 1 -#define DOMU_TRAP 2 -#define DOMU_KILLED 4 - uint32_t dom_state = DOM_NORMAL; - - /* This handler runs as interrupt gate. So IPIs from the - * polling service routine are defered until we finished. - */ - - /* Disable interrupts for the _vcpu_. It may not re-scheduled to - * an other physical CPU or the impacted process in the guest - * continues running with corrupted data, otherwise. */ - vcpu_schedule_lock_irq(vcpu); - - mc_data = x86_mcinfo_getptr(); - cpu_nr = smp_processor_id(); - BUG_ON(cpu_nr != vcpu->processor); - - curdom = vcpu->domain; - - memset(&mc_global, 0, sizeof(mc_global)); - mc_global.common.type = MC_TYPE_GLOBAL; - mc_global.common.size = sizeof(mc_global); - - mc_global.mc_domid = curdom->domain_id; /* impacted domain */ - - x86_mc_get_cpu_info(cpu_nr, &mc_global.mc_socketid, - &mc_global.mc_coreid, &mc_global.mc_core_threadid, - &mc_global.mc_apicid, NULL, NULL, NULL); - - mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */ - mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE; - rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus); - - /* Quick check, who is impacted */ - xen_impacted = is_idle_domain(curdom); - - /* Dom0 */ - x86_mcinfo_clear(mc_data); - x86_mcinfo_add(mc_data, &mc_global); - - for (i = 0; i < nr_mce_banks; i++) { - struct domain *d; - - rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status); - - if (!(status & MCi_STATUS_VAL)) - continue; - - /* An error happened in this bank. - * This is expected to be an uncorrectable error, - * since correctable errors get polled. 
- */ - uc = status & MCi_STATUS_UC; - - memset(&mc_info, 0, sizeof(mc_info)); - mc_info.common.type = MC_TYPE_BANK; - mc_info.common.size = sizeof(mc_info); - mc_info.mc_bank = i; - mc_info.mc_status = status; - - addrv = 0; - if (status & MCi_STATUS_ADDRV) { - rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv); - - d = maddr_get_owner(addrv); - if (d != NULL) - mc_info.mc_domid = d->domain_id; - } - - miscv = 0; - if (status & MCi_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv); - - mc_info.mc_addr = addrv; - mc_info.mc_misc = miscv; - - x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */ - - if (mc_callback_bank_extended) - mc_callback_bank_extended(mc_data, i, status); - - /* clear status */ - wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL); - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - - status = mc_global.mc_gstatus; - - /* clear MCIP or cpu enters shutdown state - * in case another MCE occurs. */ - status &= ~MCG_STATUS_MCIP; - wrmsrl(MSR_IA32_MCG_STATUS, status); - wmb(); - - /* For the details see the discussion "MCE/MCA concept" on xen-devel. - * The thread started here: - * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html - */ - - /* MCG_STATUS_RIPV: - * When this bit is not set, then the instruction pointer onto the stack - * to resume at is not valid. If xen is interrupted, then we panic anyway - * right below. Otherwise it is up to the guest to figure out if - * guest kernel or guest userland is affected and should kill either - * itself or the affected process. - */ - - /* MCG_STATUS_EIPV: - * Evaluation of EIPV is the job of the guest. - */ - - if (xen_impacted) { - /* Now we are going to panic anyway. Allow interrupts, so that - * printk on serial console can work. */ - vcpu_schedule_unlock_irq(vcpu); - - /* Uh, that means, machine check exception - * inside Xen occured. */ - printk("Machine check exception occured in Xen.\n"); - - /* if MCG_STATUS_EIPV indicates, the IP on the stack is related - * to the error then it makes sense to print a stack trace. - * That can be useful for more detailed error analysis and/or - * error case studies to figure out, if we can clear - * xen_impacted and kill a DomU instead - * (i.e. if a guest only control structure is affected, but then - * we must ensure the bad pages are not re-used again). - */ - if (status & MCG_STATUS_EIPV) { - printk("MCE: Instruction Pointer is related to the error. " - "Therefore, print the execution state.\n"); - show_execution_state(regs); - } - x86_mcinfo_dump(mc_data); - mc_panic("End of MCE. Use mcelog to decode above error codes.\n"); - } - - /* If Dom0 registered a machine check handler, which is only possible - * with a PV MCA driver, then ... */ - if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) { - dom_state = DOM0_TRAP; - - /* ... deliver machine check trap to Dom0. */ - send_guest_trap(dom0, 0, TRAP_machine_check); - - /* Xen may tell Dom0 now to notify the DomU. - * But this will happen through a hypercall. */ - } else - /* Dom0 did not register a machine check handler, but if DomU - * did so, then... */ - if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) { - dom_state = DOMU_TRAP; - - /* ... deliver machine check trap to DomU */ - send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check); - } else { - /* hmm... noone feels responsible to handle the error. - * So, do a quick check if a DomU is impacted or not. - */ - if (curdom == dom0) { - /* Dom0 is impacted. Since noone can't handle - * this error, panic! 
*/ - x86_mcinfo_dump(mc_data); - mc_panic("MCE occured in Dom0, which it can't handle\n"); - - /* UNREACHED */ - } else { - dom_state = DOMU_KILLED; - - /* Enable interrupts. This basically results in - * calling sti on the *physical* cpu. But after - * domain_crash() the vcpu pointer is invalid. - * Therefore, we must unlock the irqs before killing - * it. */ - vcpu_schedule_unlock_irq(vcpu); - - /* DomU is impacted. Kill it and continue. */ - domain_crash(curdom); - } - } - - - switch (dom_state) { - case DOM0_TRAP: - case DOMU_TRAP: - /* Enable interrupts. */ - vcpu_schedule_unlock_irq(vcpu); - - /* guest softirqs and event callbacks are scheduled - * immediately after this handler exits. */ - break; - case DOMU_KILLED: - /* Nothing to do here. */ - break; - default: - BUG(); - } + mcheck_cmn_handler(regs, error_code, mca_allbanks); } - /* AMD K8 machine check */ -void amd_k8_mcheck_init(struct cpuinfo_x86 *c) +int amd_k8_mcheck_init(struct cpuinfo_x86 *c) { uint64_t value; uint32_t i; int cpu_nr; - machine_check_vector = k8_machine_check; + /* Check for PPro style MCA; our caller has confirmed MCE support. */ + if (!cpu_has(c, X86_FEATURE_MCA)) + return 0; + + x86_mce_vector_register(k8_machine_check); cpu_nr = smp_processor_id(); - wmb(); rdmsrl(MSR_IA32_MCG_CAP, value); if (value & MCG_CTL_P) /* Control register present ? */ @@ -304,10 +97,8 @@ void amd_k8_mcheck_init(struct cpuinfo_x for (i = 0; i < nr_mce_banks; i++) { switch (i) { case 4: /* Northbridge */ - /* Enable error reporting of all errors, - * enable error checking and - * disable sync flooding */ - wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL); + /* Enable error reporting of all errors */ + wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL); wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL); break; @@ -321,4 +112,6 @@ void amd_k8_mcheck_init(struct cpuinfo_x set_in_cr4(X86_CR4_MCE); printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr); + + return 1; } diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/amd_nonfatal.c --- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c Tue Mar 17 14:22:50 2009 +0000 @@ -58,22 +58,23 @@ #include <xen/smp.h> #include <xen/timer.h> #include <xen/event.h> -#include <asm/processor.h> + +#include <asm/processor.h> #include <asm/system.h> #include <asm/msr.h> #include "mce.h" -#include "x86_mca.h" static struct timer mce_timer; -#define MCE_PERIOD MILLISECS(15000) +#define MCE_PERIOD MILLISECS(10000) #define MCE_MIN MILLISECS(2000) #define MCE_MAX MILLISECS(30000) static s_time_t period = MCE_PERIOD; static int hw_threshold = 0; static int adjust = 0; +static int variable_period = 1; /* The polling service routine: * Collects information of correctable errors and notifies @@ -81,99 +82,46 @@ static int adjust = 0; */ void mce_amd_checkregs(void *info) { - struct vcpu *vcpu = current; - struct mc_info *mc_data; - struct mcinfo_global mc_global; - struct mcinfo_bank mc_info; - uint64_t status, addrv, miscv; - unsigned int i; + mctelem_cookie_t mctc; + struct mca_summary bs; unsigned int event_enabled; - unsigned int cpu_nr; - int error_found; - - /* We don't need a slot yet. Only allocate one on error. 
*/ - mc_data = NULL; - - cpu_nr = smp_processor_id(); - BUG_ON(cpu_nr != vcpu->processor); + + mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs); + event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA); - error_found = 0; - - memset(&mc_global, 0, sizeof(mc_global)); - mc_global.common.type = MC_TYPE_GLOBAL; - mc_global.common.size = sizeof(mc_global); - - mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */ - mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */ - - x86_mc_get_cpu_info(cpu_nr, &mc_global.mc_socketid, - &mc_global.mc_coreid, &mc_global.mc_core_threadid, - &mc_global.mc_apicid, NULL, NULL, NULL); - - mc_global.mc_flags |= MC_FLAG_CORRECTABLE; - rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus); - - for (i = 0; i < nr_mce_banks; i++) { - struct domain *d; - - rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status); - - if (!(status & MCi_STATUS_VAL)) - continue; - - if (mc_data == NULL) { - /* Now we need a slot to fill in error telemetry. */ - mc_data = x86_mcinfo_getptr(); - BUG_ON(mc_data == NULL); - x86_mcinfo_clear(mc_data); - x86_mcinfo_add(mc_data, &mc_global); - } - - memset(&mc_info, 0, sizeof(mc_info)); - mc_info.common.type = MC_TYPE_BANK; - mc_info.common.size = sizeof(mc_info); - mc_info.mc_bank = i; - mc_info.mc_status = status; - - /* Increase polling frequency */ - error_found = 1; - - addrv = 0; - if (status & MCi_STATUS_ADDRV) { - rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv); - - d = maddr_get_owner(addrv); - if (d != NULL) - mc_info.mc_domid = d->domain_id; - } - - miscv = 0; - if (status & MCi_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv); - - mc_info.mc_addr = addrv; - mc_info.mc_misc = miscv; - x86_mcinfo_add(mc_data, &mc_info); - - if (mc_callback_bank_extended) - mc_callback_bank_extended(mc_data, i, status); - - /* clear status */ - wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL); - wmb(); - } - - if (error_found > 0) { - /* If Dom0 enabled the VIRQ_MCA event, then ... */ - if (event_enabled) - /* ... notify it. */ + + if (bs.errcnt && mctc != NULL) { + static uint64_t dumpcount = 0; + + /* If Dom0 enabled the VIRQ_MCA event, then notify it. + * Otherwise, if dom0 has had plenty of time to register + * the virq handler but still hasn't then dump telemetry + * to the Xen console. The call count may be incremented + * on multiple cpus at once and is indicative only - just + * a simple-minded attempt to avoid spamming the console + * for corrected errors in early startup. */ + + if (event_enabled) { + mctelem_commit(mctc); send_guest_global_virq(dom0, VIRQ_MCA); - else - /* ... or dump it */ - x86_mcinfo_dump(mc_data); - } - - adjust += error_found; + } else if (++dumpcount >= 10) { + x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc)); + mctelem_dismiss(mctc); + } else { + mctelem_dismiss(mctc); + } + + } else if (mctc != NULL) { + mctelem_dismiss(mctc); + } + + /* adjust is global and all cpus may attempt to increment it without + * synchronisation, so they race and the final adjust count + * (number of cpus seeing any error) is approximate. We can + * guarantee that if any cpu observes an error that the + * adjust count is at least 1. */ + if (bs.errcnt) + adjust++; } /* polling service routine invoker: @@ -188,7 +136,7 @@ static void mce_amd_work_fn(void *data) on_each_cpu(mce_amd_checkregs, data, 1, 1); if (adjust > 0) { - if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) { + if (!guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) { /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. 
*/ printk("MCE: polling routine found correctable error. " " Use mcelog to parse above error output.\n"); @@ -229,19 +177,19 @@ static void mce_amd_work_fn(void *data) } } - if (adjust > 0) { + if (variable_period && adjust > 0) { /* Increase polling frequency */ adjust++; /* adjust == 1 must have an effect */ period /= adjust; - } else { + } else if (variable_period) { /* Decrease polling frequency */ period *= 2; } - if (period > MCE_MAX) { + if (variable_period && period > MCE_MAX) { /* limit: Poll at least every 30s */ period = MCE_MAX; } - if (period < MCE_MIN) { + if (variable_period && period < MCE_MIN) { /* limit: Poll every 2s. * When this is reached an uncorrectable error * is expected to happen, if Dom0 does nothing. @@ -262,7 +210,7 @@ void amd_nonfatal_mcheck_init(struct cpu /* The threshold bitfields in MSR_IA32_MC4_MISC has * been introduced along with the SVME feature bit. */ - if (cpu_has(c, X86_FEATURE_SVME)) { + if (variable_period && cpu_has(c, X86_FEATURE_SVME)) { uint64_t value; /* hw threshold registers present */ diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/k7.c --- a/xen/arch/x86/cpu/mcheck/k7.c Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/arch/x86/cpu/mcheck/k7.c Tue Mar 17 14:22:50 2009 +0000 @@ -68,13 +68,16 @@ static fastcall void k7_machine_check(st /* AMD K7 machine check */ -void amd_k7_mcheck_init(struct cpuinfo_x86 *c) +int amd_k7_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; int i; - machine_check_vector = k7_machine_check; - wmb(); + /* Check for PPro style MCA; our caller has confirmed MCE support. */ + if (!cpu_has(c, X86_FEATURE_MCA)) + return 0; + + x86_mce_vector_register(k7_machine_check); rdmsr (MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? */ @@ -92,4 +95,6 @@ void amd_k7_mcheck_init(struct cpuinfo_x set_in_cr4 (X86_CR4_MCE); printk (KERN_INFO "CPU%d: AMD K7 machine check reporting enabled.\n", smp_processor_id()); + + return 1; } diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/mce.c --- a/xen/arch/x86/cpu/mcheck/mce.c Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/arch/x86/cpu/mcheck/mce.c Tue Mar 17 14:22:50 2009 +0000 @@ -10,104 +10,490 @@ #include <xen/smp.h> #include <xen/errno.h> #include <xen/console.h> - -#include <asm/processor.h> +#include <xen/sched.h> +#include <xen/sched-if.h> +#include <xen/cpumask.h> +#include <xen/event.h> +#include <xen/guest_access.h> + +#include <asm/processor.h> #include <asm/system.h> +#include <asm/msr.h> #include "mce.h" -#include "x86_mca.h" int mce_disabled = 0; unsigned int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ -/* XXX For now a fixed array is used. Later this should be changed - * to a dynamic allocated array with the size calculated in relation - * to physical cpus present in the machine. - * The more physical cpus are available, the more entries you need. - */ -#define MAX_MCINFO 20 - -struct mc_machine_notify { - struct mc_info mc; - uint32_t fetch_idx; - uint32_t valid; -}; - -struct mc_machine { - - /* Array structure used for collecting machine check error telemetry. */ - struct mc_info mc[MAX_MCINFO]; - - /* We handle multiple machine check reports lockless by - * iterating through the array using the producer/consumer concept. - */ - /* Producer array index to fill with machine check error data. - * Index must be increased atomically. */ - uint32_t error_idx; - - /* Consumer array index to fetch machine check error data from. - * Index must be increased atomically. 
*/ - uint32_t fetch_idx; - - /* Integer array holding the indeces of the mc array that allows - * a Dom0 to notify a DomU to re-fetch the same machine check error - * data. The notification and refetch also uses its own - * producer/consumer mechanism, because Dom0 may decide to not report - * every error to the impacted DomU. - */ - struct mc_machine_notify notify[MAX_MCINFO]; - - /* Array index to get fetch_idx from. - * Index must be increased atomically. */ - uint32_t notifyproducer_idx; - uint32_t notifyconsumer_idx; -}; - -/* Global variable with machine check information. */ -struct mc_machine mc_data; +static void mcinfo_clear(struct mc_info *); + +#define SEG_PL(segsel) ((segsel) & 0x3) + +#if 1 /* XXFM switch to 0 for putback */ + +#define x86_mcerr(str, err) _x86_mcerr(str, err) + +static int _x86_mcerr(const char *msg, int err) +{ + printk("x86_mcerr: %s, returning %d\n", + msg != NULL ? msg : "", err); + return err; +} +#else +#define x86_mcerr(str,err) +#endif + +cpu_banks_t mca_allbanks; /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code) -{ +{ printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); } +static x86_mce_vector_t _machine_check_vector = unexpected_machine_check; + +void x86_mce_vector_register(x86_mce_vector_t hdlr) +{ + _machine_check_vector = hdlr; + wmb(); +} + /* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code) = unexpected_machine_check; + +void machine_check_vector(struct cpu_user_regs *regs, long error_code) +{ + _machine_check_vector(regs, error_code); +} /* Init machine check callback handler * It is used to collect additional information provided by newer * CPU families/models without the need to duplicate the whole handler. * This avoids having many handlers doing almost nearly the same and each * with its own tweaks ands bugs. */ -int (*mc_callback_bank_extended)(struct mc_info *, uint16_t, uint64_t) = NULL; - - -static void amd_mcheck_init(struct cpuinfo_x86 *ci) -{ +static x86_mce_callback_t mc_callback_bank_extended = NULL; + +void x86_mce_callback_register(x86_mce_callback_t cbfunc) +{ + mc_callback_bank_extended = cbfunc; +} + +/* Utility function to perform MCA bank telemetry readout and to push that + * telemetry towards an interested dom0 for logging and diagnosis. + * The caller - #MC handler or MCA poll function - must arrange that we + * do not migrate cpus. */ + +/* XXFM Could add overflow counting? 
*/ +mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask, + struct mca_summary *sp) +{ + struct vcpu *v = current; + struct domain *d; + uint64_t gstatus, status, addr, misc; + struct mcinfo_global mcg; /* on stack */ + struct mcinfo_common *mic; + struct mcinfo_global *mig; /* on stack */ + mctelem_cookie_t mctc = NULL; + uint32_t uc = 0, pcc = 0; + struct mc_info *mci = NULL; + mctelem_class_t which = MC_URGENT; /* XXXgcc */ + unsigned int cpu_nr; + int errcnt = 0; + int i; + enum mca_extinfo cbret = MCA_EXTINFO_IGNORED; + + cpu_nr = smp_processor_id(); + BUG_ON(cpu_nr != v->processor); + + rdmsrl(MSR_IA32_MCG_STATUS, gstatus); + + memset(&mcg, 0, sizeof (mcg)); + mcg.common.type = MC_TYPE_GLOBAL; + mcg.common.size = sizeof (mcg); + if (v != NULL && ((d = v->domain) != NULL)) { + mcg.mc_domid = d->domain_id; + mcg.mc_vcpuid = v->vcpu_id; + } else { + mcg.mc_domid = -1; + mcg.mc_vcpuid = -1; + } + mcg.mc_gstatus = gstatus; /* MCG_STATUS */ + + switch (who) { + case MCA_MCE_HANDLER: + mcg.mc_flags = MC_FLAG_MCE; + which = MC_URGENT; + break; + + case MCA_POLLER: + case MCA_RESET: + mcg.mc_flags = MC_FLAG_POLLED; + which = MC_NONURGENT; + break; + + case MCA_CMCI_HANDLER: + mcg.mc_flags = MC_FLAG_CMCI; + which = MC_NONURGENT; + break; + + default: + BUG(); + } + + /* Retrieve detector information */ + x86_mc_get_cpu_info(cpu_nr, &mcg.mc_socketid, + &mcg.mc_coreid, &mcg.mc_core_threadid, + &mcg.mc_apicid, NULL, NULL, NULL); + + for (i = 0; i < 32 && i < nr_mce_banks; i++) { + struct mcinfo_bank mcb; /* on stack */ + + /* Skip bank if corresponding bit in bankmask is clear */ + if (!test_bit(i, bankmask)) + continue; + + rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status); + if (!(status & MCi_STATUS_VAL)) + continue; /* this bank has no valid telemetry */ + + /* If this is the first bank with valid MCA DATA, then + * try to reserve an entry from the urgent/nonurgent queue + * depending on whethere we are called from an exception or + * a poller; this can fail (for example dom0 may not + * yet have consumed past telemetry). */ + if (errcnt == 0) { + if ((mctc = mctelem_reserve(which)) != NULL) { + mci = mctelem_dataptr(mctc); + mcinfo_clear(mci); + } + } + + memset(&mcb, 0, sizeof (mcb)); + mcb.common.type = MC_TYPE_BANK; + mcb.common.size = sizeof (mcb); + mcb.mc_bank = i; + mcb.mc_status = status; + + /* form a mask of which banks have logged uncorrected errors */ + if ((status & MCi_STATUS_UC) != 0) + uc |= (1 << i); + + /* likewise for those with processor context corrupt */ + if ((status & MCi_STATUS_PCC) != 0) + pcc |= (1 << i); + + addr = misc = 0; + + if (status & MCi_STATUS_ADDRV) { + rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr); + d = maddr_get_owner(addr); + if (d != NULL && (who == MCA_POLLER || + who == MCA_CMCI_HANDLER)) + mcb.mc_domid = d->domain_id; + } + + if (status & MCi_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc); + + mcb.mc_addr = addr; + mcb.mc_misc = misc; + + if (who == MCA_CMCI_HANDLER) { + rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2); + rdtscll(mcb.mc_tsc); + } + + /* Increment the error count; if this is the first bank + * with a valid error then add the global info to the mcinfo. 
*/ + if (errcnt++ == 0 && mci != NULL) + x86_mcinfo_add(mci, &mcg); + + /* Add the bank data */ + if (mci != NULL) + x86_mcinfo_add(mci, &mcb); + + if (mc_callback_bank_extended && cbret != MCA_EXTINFO_GLOBAL) { + cbret = mc_callback_bank_extended(mci, i, status); + } + + /* Clear status */ + wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL); + wmb(); + } + + if (mci != NULL && errcnt > 0) { + x86_mcinfo_lookup(mic, mci, MC_TYPE_GLOBAL); + mig = (struct mcinfo_global *)mic; + if (pcc) + mcg.mc_flags |= MC_FLAG_UNCORRECTABLE; + else if (uc) + mcg.mc_flags |= MC_FLAG_RECOVERABLE; + else + mcg.mc_flags |= MC_FLAG_CORRECTABLE; + } + + + if (sp) { + sp->errcnt = errcnt; + sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0; + sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0; + sp->uc = uc; + sp->pcc = pcc; + } + + return mci != NULL ? mctc : NULL; /* may be NULL */ +} + +#define DOM_NORMAL 0 +#define DOM0_TRAP 1 +#define DOMU_TRAP 2 +#define DOMU_KILLED 4 + +/* Shared #MC handler. */ +void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code, + cpu_banks_t bankmask) +{ + int xen_state_lost, dom0_state_lost, domU_state_lost; + struct vcpu *v = current; + struct domain *curdom = v->domain; + domid_t domid = curdom->domain_id; + int ctx_xen, ctx_dom0, ctx_domU; + uint32_t dom_state = DOM_NORMAL; + mctelem_cookie_t mctc = NULL; + struct mca_summary bs; + struct mc_info *mci = NULL; + int irqlocked = 0; + uint64_t gstatus; + int ripv; + + /* This handler runs as interrupt gate. So IPIs from the + * polling service routine are defered until we're finished. + */ + + /* Disable interrupts for the _vcpu_. It may not re-scheduled to + * another physical CPU. */ + vcpu_schedule_lock_irq(v); + irqlocked = 1; + + /* Read global status; if it does not indicate machine check + * in progress then bail as long as we have a valid ip to return to. */ + rdmsrl(MSR_IA32_MCG_STATUS, gstatus); + ripv = ((gstatus & MCG_STATUS_RIPV) != 0); + if (!(gstatus & MCG_STATUS_MCIP) && ripv) { + add_taint(TAINT_MACHINE_CHECK); /* questionable */ + vcpu_schedule_unlock_irq(v); + irqlocked = 0; + goto cmn_handler_done; + } + + /* Go and grab error telemetry. We must choose whether to commit + * for logging or dismiss the cookie that is returned, and must not + * reference the cookie after that action. + */ + mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs); + if (mctc != NULL) + mci = (struct mc_info *)mctelem_dataptr(mctc); + + /* Clear MCIP or another #MC will enter shutdown state */ + gstatus &= ~MCG_STATUS_MCIP; + wrmsrl(MSR_IA32_MCG_STATUS, gstatus); + wmb(); + + /* If no valid errors and our stack is intact, we're done */ + if (ripv && bs.errcnt == 0) { + vcpu_schedule_unlock_irq(v); + irqlocked = 0; + goto cmn_handler_done; + } + + if (bs.uc || bs.pcc) + add_taint(TAINT_MACHINE_CHECK); + + /* Machine check exceptions will usually be for UC and/or PCC errors, + * but it is possible to configure machine check for some classes + * of corrected error. + * + * UC errors could compromise any domain or the hypervisor + * itself - for example a cache writeback of modified data that + * turned out to be bad could be for data belonging to anyone, not + * just the current domain. In the absence of known data poisoning + * to prevent consumption of such bad data in the system we regard + * all UC errors as terminal. It may be possible to attempt some + * heuristics based on the address affected, which guests have + * mappings to that mfn etc. + * + * PCC errors apply to the current context. 
+ * + * If MCG_STATUS indicates !RIPV then even a #MC that is not UC + * and not PCC is terminal - the return instruction pointer + * pushed onto the stack is bogus. If the interrupt context is + * the hypervisor or dom0 the game is over, otherwise we can + * limit the impact to a single domU but only if we trampoline + * somewhere safely - we can't return and unwind the stack. + * Since there is no trampoline in place we will treat !RIPV + * as terminal for any context. + */ + ctx_xen = SEG_PL(regs->cs) == 0; + ctx_dom0 = !ctx_xen && (domid == dom0->domain_id); + ctx_domU = !ctx_xen && !ctx_dom0; + + xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) || + !ripv; + dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv)); + domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv)); + + if (xen_state_lost) { + /* Now we are going to panic anyway. Allow interrupts, so that + * printk on serial console can work. */ + vcpu_schedule_unlock_irq(v); + irqlocked = 0; + + printk("Terminal machine check exception occured in " + "hypervisor context.\n"); + + /* If MCG_STATUS_EIPV indicates, the IP on the stack is related + * to the error then it makes sense to print a stack trace. + * That can be useful for more detailed error analysis and/or + * error case studies to figure out, if we can clear + * xen_impacted and kill a DomU instead + * (i.e. if a guest only control structure is affected, but then + * we must ensure the bad pages are not re-used again). + */ + if (bs.eipv & MCG_STATUS_EIPV) { + printk("MCE: Instruction Pointer is related to the " + "error, therefore print the execution state.\n"); + show_execution_state(regs); + } + + /* Commit the telemetry so that panic flow can find it. */ + if (mctc != NULL) { + x86_mcinfo_dump(mci); + mctelem_commit(mctc); + } + mc_panic("Hypervisor state lost due to machine check " + "exception.\n"); + /*NOTREACHED*/ + } + + /* + * Xen hypervisor state is intact. If dom0 state is lost then + * give it a chance to decide what to do if it has registered + * a handler for this event, otherwise panic. + * + * XXFM Could add some Solaris dom0 contract kill here? + */ + if (dom0_state_lost) { + if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) { + dom_state = DOM0_TRAP; + send_guest_trap(dom0, 0, TRAP_machine_check); + /* XXFM case of return with !ripv ??? */ + } else { + /* Commit telemetry for panic flow. */ + if (mctc != NULL) { + x86_mcinfo_dump(mci); + mctelem_commit(mctc); + } + mc_panic("Dom0 state lost due to machine check " + "exception\n"); + /*NOTREACHED*/ + } + } + + /* + * If a domU has lost state then send it a trap if it has registered + * a handler, otherwise crash the domain. + * XXFM Revisit this functionality. + */ + if (domU_state_lost) { + if (guest_has_trap_callback(v->domain, v->vcpu_id, + TRAP_machine_check)) { + dom_state = DOMU_TRAP; + send_guest_trap(curdom, v->vcpu_id, + TRAP_machine_check); + } else { + dom_state = DOMU_KILLED; + /* Enable interrupts. This basically results in + * calling sti on the *physical* cpu. But after + * domain_crash() the vcpu pointer is invalid. + * Therefore, we must unlock the irqs before killing + * it. */ + vcpu_schedule_unlock_irq(v); + irqlocked = 0; + + /* DomU is impacted. Kill it and continue. */ + domain_crash(curdom); + } + } + + switch (dom_state) { + case DOM0_TRAP: + case DOMU_TRAP: + /* Enable interrupts. */ + vcpu_schedule_unlock_irq(v); + irqlocked = 0; + + /* guest softirqs and event callbacks are scheduled + * immediately after this handler exits. 
*/ + break; + case DOMU_KILLED: + /* Nothing to do here. */ + break; + + case DOM_NORMAL: + vcpu_schedule_unlock_irq(v); + irqlocked = 0; + break; + } + +cmn_handler_done: + BUG_ON(irqlocked); + BUG_ON(!ripv); + + if (bs.errcnt) { + /* Not panicing, so forward telemetry to dom0 now if it + * is interested. */ + if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) { + if (mctc != NULL) + mctelem_commit(mctc); + send_guest_global_virq(dom0, VIRQ_MCA); + } else { + x86_mcinfo_dump(mci); + if (mctc != NULL) + mctelem_dismiss(mctc); + } + } else if (mctc != NULL) { + mctelem_dismiss(mctc); + } +} + +static int amd_mcheck_init(struct cpuinfo_x86 *ci) +{ + int rc = 0; switch (ci->x86) { case 6: - amd_k7_mcheck_init(ci); + rc = amd_k7_mcheck_init(ci); break; case 0xf: - amd_k8_mcheck_init(ci); + rc = amd_k8_mcheck_init(ci); break; case 0x10: - amd_f10_mcheck_init(ci); + rc = amd_f10_mcheck_init(ci); break; default: /* Assume that machine check support is available. * The minimum provided support is at least the K8. */ - amd_k8_mcheck_init(ci); - } + rc = amd_k8_mcheck_init(ci); + } + + return rc; } /*check the existence of Machine Check*/ @@ -116,50 +502,81 @@ int mce_available(struct cpuinfo_x86 *c) return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } +/* + * Check if bank 0 is usable for MCE. It isn't for AMD K7, + * and Intel P6 family before model 0x1a. + */ +int mce_firstbank(struct cpuinfo_x86 *c) +{ + if (c->x86 == 6) { + if (c->x86_vendor == X86_VENDOR_AMD) + return 1; + + if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a) + return 1; + } + + return 0; +} + /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) { + int inited = 0, i; + if (mce_disabled == 1) { printk(XENLOG_INFO "MCE support disabled by bootparam\n"); return; } + for (i = 0; i < MAX_NR_BANKS; i++) + set_bit(i,mca_allbanks); + + /* Enforce at least MCE support in CPUID information. Individual + * families may also need to enforce a check for MCA support. 
*/ if (!cpu_has(c, X86_FEATURE_MCE)) { printk(XENLOG_INFO "CPU%i: No machine check support available\n", smp_processor_id()); return; } - memset(&mc_data, 0, sizeof(struct mc_machine)); + mctelem_init(sizeof (struct mc_info)); switch (c->x86_vendor) { case X86_VENDOR_AMD: - amd_mcheck_init(c); + inited = amd_mcheck_init(c); break; case X86_VENDOR_INTEL: + switch (c->x86) { + case 5: #ifndef CONFIG_X86_64 - if (c->x86==5) - intel_p5_mcheck_init(c); + inited = intel_p5_mcheck_init(c); #endif - /*If it is P6 or P4 family, including CORE 2 DUO series*/ - if (c->x86 == 6 || c->x86==15) - { - printk(KERN_DEBUG "MCE: Intel newly family MC Init\n"); - intel_mcheck_init(c); + break; + + case 6: + case 15: + inited = intel_mcheck_init(c); + break; } break; #ifndef CONFIG_X86_64 case X86_VENDOR_CENTAUR: - if (c->x86==5) - winchip_mcheck_init(c); + if (c->x86==5) { + inited = winchip_mcheck_init(c); + } break; #endif default: break; } + + if (!inited) + printk(XENLOG_INFO "CPU%i: No machine check initialization\n", + smp_processor_id()); } @@ -176,190 +593,11 @@ custom_param("nomce", mcheck_disable); custom_param("nomce", mcheck_disable); custom_param("mce", mcheck_enable); - -#include <xen/guest_access.h> -#include <asm/traps.h> - -struct mc_info *x86_mcinfo_getptr(void) -{ - struct mc_info *mi; - uint32_t entry, next; - - for (;;) { - entry = mc_data.error_idx; - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mc_data.error_idx, entry, next) == entry) - break; - } - - mi = &(mc_data.mc[(entry % MAX_MCINFO)]); - BUG_ON(mc_data.error_idx < mc_data.fetch_idx); - - return mi; -} - -static int x86_mcinfo_matches_guest(const struct mc_info *mi, - const struct domain *d, const struct vcpu *v) -{ - struct mcinfo_common *mic; - struct mcinfo_global *mig; - - x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL); - mig = (struct mcinfo_global *)mic; - if (mig == NULL) - return 0; - - if (d->domain_id != mig->mc_domid) - return 0; - - if (v->vcpu_id != mig->mc_vcpuid) - return 0; - - return 1; -} - - -#define x86_mcinfo_mcdata(idx) (mc_data.mc[(idx % MAX_MCINFO)]) - -static struct mc_info *x86_mcinfo_getfetchptr(uint32_t *fetch_idx, - const struct domain *d, const struct vcpu *v) -{ - struct mc_info *mi; - - /* This function is called from the fetch hypercall with - * the mc_lock spinlock held. Thus, no need for locking here. - */ - mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx)); - if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) { - /* Bogus domU command detected. */ - *fetch_idx = 0; - return NULL; - } - - *fetch_idx = mc_data.fetch_idx; - mc_data.fetch_idx++; - BUG_ON(mc_data.fetch_idx > mc_data.error_idx); - - return mi; -} - - -static void x86_mcinfo_marknotified(struct xen_mc_notifydomain *mc_notifydomain) -{ - struct mc_machine_notify *mn; - struct mcinfo_common *mic = NULL; - struct mcinfo_global *mig; - struct domain *d; - int i; - - /* This function is called from the notifier hypercall with - * the mc_notify_lock spinlock held. Thus, no need for locking here. - */ - - /* First invalidate entries for guests that disappeared after - * notification (e.g. shutdown/crash). This step prevents the - * notification array from filling up with stalling/leaking entries. - */ - for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) { - mn = &(mc_data.notify[(i % MAX_MCINFO)]); - x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL); - BUG_ON(mic == NULL); - mig = (struct mcinfo_global *)mic; - d = get_domain_by_id(mig->mc_domid); - if (d == NULL) { - /* Domain does not exist. 
*/ - mn->valid = 0; - } - if ((!mn->valid) && (i == mc_data.notifyconsumer_idx)) - mc_data.notifyconsumer_idx++; - } - - /* Now put in the error telemetry. Since all error data fetchable - * by domUs are uncorrectable errors, they are very important. - * So we dump them before overriding them. When a guest takes that long, - * then we can assume something bad already happened (crash, hang, etc.) - */ - mn = &(mc_data.notify[(mc_data.notifyproducer_idx % MAX_MCINFO)]); - - if (mn->valid) { - struct mcinfo_common *mic = NULL; - struct mcinfo_global *mig; - - /* To not loose the information, we dump it. */ - x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL); - BUG_ON(mic == NULL); - mig = (struct mcinfo_global *)mic; - printk(XENLOG_WARNING "Domain ID %u was notified by Dom0 to " - "fetch machine check error telemetry. But Domain ID " - "did not do that in time.\n", - mig->mc_domid); - x86_mcinfo_dump(&mn->mc); - } - - memcpy(&mn->mc, &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)), - sizeof(struct mc_info)); - mn->fetch_idx = mc_notifydomain->fetch_idx; - mn->valid = 1; - - mc_data.notifyproducer_idx++; - - /* By design there can never be more notifies than machine check errors. - * If that ever happens, then we hit a bug. */ - BUG_ON(mc_data.notifyproducer_idx > mc_data.fetch_idx); - BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx); -} - -static struct mc_info *x86_mcinfo_getnotifiedptr(uint32_t *fetch_idx, - const struct domain *d, const struct vcpu *v) -{ - struct mc_machine_notify *mn = NULL; - uint32_t i; - int found; - - /* This function is called from the fetch hypercall with - * the mc_notify_lock spinlock held. Thus, no need for locking here. - */ - - /* The notifier data is filled in the order guests get notified, but - * guests may fetch them in a different order. That's why we need - * the game with valid/invalid entries. */ - found = 0; - for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) { - mn = &(mc_data.notify[(i % MAX_MCINFO)]); - if (!mn->valid) { - if (i == mc_data.notifyconsumer_idx) - mc_data.notifyconsumer_idx++; - continue; - } - if (x86_mcinfo_matches_guest(&mn->mc, d, v)) { - found = 1; - break; - } - } - - if (!found) { - /* This domain has never been notified. This must be - * a bogus domU command. */ - *fetch_idx = 0; - return NULL; - } - - BUG_ON(mn == NULL); - *fetch_idx = mn->fetch_idx; - mn->valid = 0; - - BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx); - return &mn->mc; -} - - -void x86_mcinfo_clear(struct mc_info *mi) +static void mcinfo_clear(struct mc_info *mi) { memset(mi, 0, sizeof(struct mc_info)); x86_mcinfo_nentries(mi) = 0; } - int x86_mcinfo_add(struct mc_info *mi, void *mcinfo) { @@ -380,7 +618,7 @@ int x86_mcinfo_add(struct mc_info *mi, v end2 = (unsigned long)((uint8_t *)mic_index + mic->size); if (end1 < end2) - return -ENOSPC; /* No space. Can't add entry. */ + return x86_mcerr("mcinfo_add: no more sparc", -ENOSPC); /* there's enough space. add entry. */ memcpy(mic_index, mic, mic->size); @@ -388,7 +626,6 @@ int x86_mcinfo_add(struct mc_info *mi, v return 0; } - /* Dump machine check information in a format, * mcelog can parse. 
This is used only when @@ -404,7 +641,7 @@ void x86_mcinfo_dump(struct mc_info *mi) if (mic == NULL) return; mc_global = (struct mcinfo_global *)mic; - if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) { + if (mc_global->mc_flags & MC_FLAG_MCE) { printk(XENLOG_WARNING "CPU%d: Machine Check Exception: %16"PRIx64"\n", mc_global->mc_coreid, mc_global->mc_gstatus); @@ -424,7 +661,7 @@ void x86_mcinfo_dump(struct mc_info *mi) goto next; mc_bank = (struct mcinfo_bank *)mic; - + printk(XENLOG_WARNING "Bank %d: %16"PRIx64, mc_bank->mc_bank, mc_bank->mc_status); @@ -440,8 +677,6 @@ next: break; } while (1); } - - static void do_mc_get_cpu_info(void *v) { @@ -533,183 +768,141 @@ void x86_mc_get_cpu_info(unsigned cpu, u } } +#if BITS_PER_LONG == 64 + +#define ID2COOKIE(id) ((mctelem_cookie_t)(id)) +#define COOKIE2ID(c) ((uint64_t)(c)) + +#elif BITS_PER_LONG == 32 + +#define ID2COOKIE(id) ((mctelem_cookie_t)(uint32_t)((id) & 0xffffffffU)) +#define COOKIE2ID(c) ((uint64_t)(uint32_t)(c)) + +#elif defined(BITS_PER_LONG) +#error BITS_PER_LONG has unexpected value +#else +#error BITS_PER_LONG definition absent +#endif + /* Machine Check Architecture Hypercall */ long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc) { long ret = 0; struct xen_mc curop, *op = &curop; struct vcpu *v = current; - struct domain *domU; struct xen_mc_fetch *mc_fetch; - struct xen_mc_notifydomain *mc_notifydomain; struct xen_mc_physcpuinfo *mc_physcpuinfo; - struct mc_info *mi; - uint32_t flags; - uint32_t fetch_idx; - uint16_t vcpuid; - /* Use a different lock for the notify hypercall in order to allow - * a DomU to fetch mc data while Dom0 notifies another DomU. */ - static DEFINE_SPINLOCK(mc_lock); - static DEFINE_SPINLOCK(mc_notify_lock); + uint32_t flags, cmdflags; int nlcpu; xen_mc_logical_cpu_t *log_cpus = NULL; + mctelem_cookie_t mctc; + mctelem_class_t which; if ( copy_from_guest(op, u_xen_mc, 1) ) - return -EFAULT; + return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT); if ( op->interface_version != XEN_MCA_INTERFACE_VERSION ) - return -EACCES; - - switch ( op->cmd ) { + return x86_mcerr("do_mca: interface version mismatch", -EACCES); + + switch (op->cmd) { case XEN_MC_fetch: - /* This hypercall is for any domain */ mc_fetch = &op->u.mc_fetch; - - switch (mc_fetch->flags) { - case XEN_MC_CORRECTABLE: - /* But polling mode is Dom0 only, because - * correctable errors are reported to Dom0 only */ - if ( !IS_PRIV(v->domain) ) - return -EPERM; + cmdflags = mc_fetch->flags; + + /* This hypercall is for Dom0 only */ + if (!IS_PRIV(v->domain) ) + return x86_mcerr(NULL, -EPERM); + + switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) { + case XEN_MC_NONURGENT: + which = MC_NONURGENT; break; - case XEN_MC_TRAP: + case XEN_MC_URGENT: + which = MC_URGENT; break; + default: - return -EFAULT; + return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL); } flags = XEN_MC_OK; - spin_lock(&mc_lock); - - if ( IS_PRIV(v->domain) ) { - /* this must be Dom0. So a notify hypercall - * can't have happened before. */ - mi = x86_mcinfo_getfetchptr(&fetch_idx, dom0, v); + + if (cmdflags & XEN_MC_ACK) { + mctelem_cookie_t cookie = ID2COOKIE(mc_fetch->fetch_id); + mctelem_ack(which, cookie); } else { - /* Hypercall comes from an unprivileged domain */ - domU = v->domain; - if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) { - /* Dom0 must have notified this DomU before - * via the notify hypercall. 
*/ - mi = x86_mcinfo_getnotifiedptr(&fetch_idx, domU, v); + if (guest_handle_is_null(mc_fetch->data)) + return x86_mcerr("do_mca fetch: guest buffer " + "invalid", -EINVAL); + + if ((mctc = mctelem_consume_oldest_begin(which))) { + struct mc_info *mcip = mctelem_dataptr(mctc); + if (copy_to_guest(mc_fetch->data, mcip, 1)) { + ret = -EFAULT; + flags |= XEN_MC_FETCHFAILED; + mc_fetch->fetch_id = 0; + } else { + mc_fetch->fetch_id = COOKIE2ID(mctc); + } + mctelem_consume_oldest_end(mctc); } else { - /* Xen notified the DomU. */ - mi = x86_mcinfo_getfetchptr(&fetch_idx, domU, v); + /* There is no data */ + flags |= XEN_MC_NODATA; + mc_fetch->fetch_id = 0; } - } - - if (mi) { - memcpy(&mc_fetch->mc_info, mi, - sizeof(struct mc_info)); - } else { - /* There is no data for a bogus DomU command. */ - flags |= XEN_MC_NODATA; - memset(&mc_fetch->mc_info, 0, sizeof(struct mc_info)); - } - - mc_fetch->flags = flags; - mc_fetch->fetch_idx = fetch_idx; - - if ( copy_to_guest(u_xen_mc, op, 1) ) - ret = -EFAULT; - - spin_unlock(&mc_lock); + + mc_fetch->flags = flags; + if (copy_to_guest(u_xen_mc, op, 1) != 0) + ret = -EFAULT; + } + break; case XEN_MC_notifydomain: - /* This hypercall is for Dom0 only */ + return x86_mcerr("do_mca notify unsupported", -EINVAL); + + case XEN_MC_physcpuinfo: if ( !IS_PRIV(v->domain) ) - return -EPERM; - - spin_lock(&mc_notify_lock); - - mc_notifydomain = &op->u.mc_notifydomain; - domU = get_domain_by_id(mc_notifydomain->mc_domid); - vcpuid = mc_notifydomain->mc_vcpuid; - - if ((domU == NULL) || (domU == dom0)) { - /* It's not possible to notify a non-existent domain - * or the dom0. */ - spin_unlock(&mc_notify_lock); - return -EACCES; - } - - if (vcpuid >= MAX_VIRT_CPUS) { - /* It's not possible to notify a vcpu, Xen can't - * assign to a domain. */ - spin_unlock(&mc_notify_lock); - return -EACCES; - } - - mc_notifydomain->flags = XEN_MC_OK; - - mi = &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)); - if (!x86_mcinfo_matches_guest(mi, domU, domU->vcpu[vcpuid])) { - /* The error telemetry is not for the guest, Dom0 - * wants to notify. 
*/ - mc_notifydomain->flags |= XEN_MC_NOMATCH; - } else if ( guest_has_trap_callback(domU, vcpuid, - TRAP_machine_check) ) - { - /* Send notification */ - if ( send_guest_trap(domU, vcpuid, TRAP_machine_check) ) - mc_notifydomain->flags |= XEN_MC_NOTDELIVERED; - } else - mc_notifydomain->flags |= XEN_MC_CANNOTHANDLE; - -#ifdef DEBUG - /* sanity check - these two flags are mutually exclusive */ - if ((flags & XEN_MC_CANNOTHANDLE) && (flags & XEN_MC_NOTDELIVERED)) - BUG(); -#endif - - if ( copy_to_guest(u_xen_mc, op, 1) ) - ret = -EFAULT; - - if (ret == 0) { - x86_mcinfo_marknotified(mc_notifydomain); - } - - spin_unlock(&mc_notify_lock); - break; - - case XEN_MC_physcpuinfo: - if ( !IS_PRIV(v->domain) ) - return -EPERM; - - mc_physcpuinfo = &op->u.mc_physcpuinfo; - nlcpu = num_online_cpus(); - - if (!guest_handle_is_null(mc_physcpuinfo->info)) { - if (mc_physcpuinfo->ncpus <= 0) - return -EINVAL; - nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus); - log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu); - if (log_cpus == NULL) - return -ENOMEM; - - if (on_each_cpu(do_mc_get_cpu_info, log_cpus, - 1, 1) != 0) { - xfree(log_cpus); - return -EIO; - } - } - - mc_physcpuinfo->ncpus = nlcpu; - - if (copy_to_guest(u_xen_mc, op, 1)) { - if (log_cpus != NULL) - xfree(log_cpus); - return -EFAULT; - } - - if (!guest_handle_is_null(mc_physcpuinfo->info)) { - if (copy_to_guest(mc_physcpuinfo->info, - log_cpus, nlcpu)) - ret = -EFAULT; - xfree(log_cpus); - } + return x86_mcerr("do_mca cpuinfo", -EPERM); + + mc_physcpuinfo = &op->u.mc_physcpuinfo; + nlcpu = num_online_cpus(); + + if (!guest_handle_is_null(mc_physcpuinfo->info)) { + if (mc_physcpuinfo->ncpus <= 0) + return x86_mcerr("do_mca cpuinfo: ncpus <= 0", + -EINVAL); + nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus); + log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu); + if (log_cpus == NULL) + return x86_mcerr("do_mca cpuinfo", -ENOMEM); + + if (on_each_cpu(do_mc_get_cpu_info, log_cpus, + 1, 1) != 0) { + xfree(log_cpus); + return x86_mcerr("do_mca cpuinfo", -EIO); + } + } + + mc_physcpuinfo->ncpus = nlcpu; + + if (copy_to_guest(u_xen_mc, op, 1)) { + if (log_cpus != NULL) + xfree(log_cpus); + return x86_mcerr("do_mca cpuinfo", -EFAULT); + } + + if (!guest_handle_is_null(mc_physcpuinfo->info)) { + if (copy_to_guest(mc_physcpuinfo->info, + log_cpus, nlcpu)) + ret = -EFAULT; + xfree(log_cpus); + } + break; + + default: + return x86_mcerr("do_mca: bad command", -EINVAL); } return ret; diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/mce.h --- a/xen/arch/x86/cpu/mcheck/mce.h Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/arch/x86/cpu/mcheck/mce.h Tue Mar 17 14:22:50 2009 +0000 @@ -1,38 +1,98 @@ +#ifndef _MCE_H + +#define _MCE_H + #include <xen/init.h> +#include <xen/smp.h> #include <asm/types.h> #include <asm/traps.h> #include <asm/atomic.h> #include <asm/percpu.h> +#include "x86_mca.h" +#include "mctelem.h" /* Init functions */ -void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c); -void amd_k7_mcheck_init(struct cpuinfo_x86 *c); -void amd_k8_mcheck_init(struct cpuinfo_x86 *c); -void amd_f10_mcheck_init(struct cpuinfo_x86 *c); +int amd_k7_mcheck_init(struct cpuinfo_x86 *c); +int amd_k8_mcheck_init(struct cpuinfo_x86 *c); +int amd_f10_mcheck_init(struct cpuinfo_x86 *c); +int intel_p5_mcheck_init(struct cpuinfo_x86 *c); +int winchip_mcheck_init(struct cpuinfo_x86 *c); +int intel_mcheck_init(struct cpuinfo_x86 *c); void intel_mcheck_timer(struct cpuinfo_x86 *c); -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); -void intel_mcheck_init(struct 
cpuinfo_x86 *c); void mce_intel_feature_init(struct cpuinfo_x86 *c); - -void winchip_mcheck_init(struct cpuinfo_x86 *c); - -/* Function pointer used in the handlers to collect additional information - * provided by newer CPU families/models without the need to duplicate - * the whole handler resulting in various handlers each with its own - * tweaks and bugs */ -extern int (*mc_callback_bank_extended)(struct mc_info *mi, - uint16_t bank, uint64_t status); - +void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c); int mce_available(struct cpuinfo_x86 *c); +int mce_firstbank(struct cpuinfo_x86 *c); /* Helper functions used for collecting error telemetry */ struct mc_info *x86_mcinfo_getptr(void); -void x86_mcinfo_clear(struct mc_info *mi); -int x86_mcinfo_add(struct mc_info *mi, void *mcinfo); -void x86_mcinfo_dump(struct mc_info *mi); void mc_panic(char *s); void x86_mc_get_cpu_info(unsigned, uint32_t *, uint16_t *, uint16_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *); + + +/* Register a handler for machine check exceptions. */ +typedef void (*x86_mce_vector_t)(struct cpu_user_regs *, long); +extern void x86_mce_vector_register(x86_mce_vector_t); + +/* Common generic MCE handler that implementations may nominate + * via x86_mce_vector_register. */ +extern void mcheck_cmn_handler(struct cpu_user_regs *, long, cpu_banks_t); + +/* Utility function to "logout" all architectural MCA telemetry from the MCA + * banks of the current processor. A cookie is returned which may be + * uses to reference the data so logged (the cookie can be NULL if + * no logout structures were available). The caller can also pass a pointer + * to a structure which will be completed with some summary information + * of the MCA data observed in the logout operation. */ + +enum mca_source { + MCA_MCE_HANDLER, + MCA_POLLER, + MCA_CMCI_HANDLER, + MCA_RESET +}; + +enum mca_extinfo { + MCA_EXTINFO_LOCAL, + MCA_EXTINFO_GLOBAL, + MCA_EXTINFO_IGNORED +}; + +struct mca_summary { + uint32_t errcnt; /* number of banks with valid errors */ + int ripv; /* meaningful on #MC */ + int eipv; /* meaningful on #MC */ + uint32_t uc; /* bitmask of banks with UC */ + uint32_t pcc; /* bitmask of banks with PCC */ +}; + +extern cpu_banks_t mca_allbanks; + +extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, cpu_banks_t, + struct mca_summary *); + +/* Register a callback to be made during bank telemetry logout. + * This callback is only available to those machine check handlers + * that call to the common mcheck_cmn_handler or who use the common + * telemetry logout function mcheck_mca_logout in error polling. + * + * This can be used to collect additional information (typically non- + * architectural) provided by newer CPU families/models without the need + * to duplicate the whole handler resulting in various handlers each with + * its own tweaks and bugs. The callback receives an struct mc_info pointer + * which it can use with x86_mcinfo_add to add additional telemetry, + * the current MCA bank number we are reading telemetry from, and the + * MCi_STATUS value for that bank. 
+ */ +typedef enum mca_extinfo (*x86_mce_callback_t) + (struct mc_info *, uint16_t, uint64_t); +extern void x86_mce_callback_register(x86_mce_callback_t); + +int x86_mcinfo_add(struct mc_info *mi, void *mcinfo); +void x86_mcinfo_dump(struct mc_info *mi); + +#endif /* _MCE_H */ diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/mce_intel.c --- a/xen/arch/x86/cpu/mcheck/mce_intel.c Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Tue Mar 17 14:22:50 2009 +0000 @@ -14,6 +14,7 @@ DEFINE_PER_CPU(cpu_banks_t, mce_banks_ow static int nr_intel_ext_msrs = 0; static int cmci_support = 0; +static int firstbank; #ifdef CONFIG_X86_MCE_THERMAL static void unexpected_thermal_interrupt(struct cpu_user_regs *regs) @@ -115,222 +116,51 @@ static void intel_init_thermal(struct cp } #endif /* CONFIG_X86_MCE_THERMAL */ -static inline void intel_get_extended_msrs(struct mcinfo_extended *mc_ext) -{ - if (nr_intel_ext_msrs == 0) - return; +static enum mca_extinfo +intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status) +{ + struct mcinfo_extended mc_ext; + + if (mci == NULL || nr_intel_ext_msrs == 0 || !(status & MCG_STATUS_EIPV)) + return MCA_EXTINFO_IGNORED; /* this function will called when CAP(9).MCG_EXT_P = 1 */ - memset(mc_ext, 0, sizeof(struct mcinfo_extended)); - mc_ext->common.type = MC_TYPE_EXTENDED; - mc_ext->common.size = sizeof(mc_ext); - mc_ext->mc_msrs = 10; - - mc_ext->mc_msr[0].reg = MSR_IA32_MCG_EAX; - rdmsrl(MSR_IA32_MCG_EAX, mc_ext->mc_msr[0].value); - mc_ext->mc_msr[1].reg = MSR_IA32_MCG_EBX; - rdmsrl(MSR_IA32_MCG_EBX, mc_ext->mc_msr[1].value); - mc_ext->mc_msr[2].reg = MSR_IA32_MCG_ECX; - rdmsrl(MSR_IA32_MCG_ECX, mc_ext->mc_msr[2].value); - - mc_ext->mc_msr[3].reg = MSR_IA32_MCG_EDX; - rdmsrl(MSR_IA32_MCG_EDX, mc_ext->mc_msr[3].value); - mc_ext->mc_msr[4].reg = MSR_IA32_MCG_ESI; - rdmsrl(MSR_IA32_MCG_ESI, mc_ext->mc_msr[4].value); - mc_ext->mc_msr[5].reg = MSR_IA32_MCG_EDI; - rdmsrl(MSR_IA32_MCG_EDI, mc_ext->mc_msr[5].value); - - mc_ext->mc_msr[6].reg = MSR_IA32_MCG_EBP; - rdmsrl(MSR_IA32_MCG_EBP, mc_ext->mc_msr[6].value); - mc_ext->mc_msr[7].reg = MSR_IA32_MCG_ESP; - rdmsrl(MSR_IA32_MCG_ESP, mc_ext->mc_msr[7].value); - mc_ext->mc_msr[8].reg = MSR_IA32_MCG_EFLAGS; - rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext->mc_msr[8].value); - mc_ext->mc_msr[9].reg = MSR_IA32_MCG_EIP; - rdmsrl(MSR_IA32_MCG_EIP, mc_ext->mc_msr[9].value); -} - -/* machine_check_poll might be called by following types: - * 1. called when do mcheck_init. - * 2. called in cmci interrupt handler - * 3. called in polling handler - * It will generate a new mc_info item if found CE/UC errors. DOM0 is the - * consumer. - */ -static struct mc_info *machine_check_poll(int calltype) -{ - struct mc_info *mi = NULL; - int exceptions = (read_cr4() & X86_CR4_MCE); - int i, nr_unit = 0, uc = 0, pcc = 0; - uint64_t status, addr; - struct mcinfo_global mcg; - struct mcinfo_extended mce; - unsigned int cpu; - struct domain *d; - - cpu = smp_processor_id(); - - memset(&mcg, 0, sizeof(mcg)); - mcg.common.type = MC_TYPE_GLOBAL; - mcg.common.size = sizeof(mcg); - /* If called from cpu-reset check, don't need to fill them. 
- * If called from cmci context, we'll try to fill domid by memory addr - */ - mcg.mc_domid = -1; - mcg.mc_vcpuid = -1; - if (calltype == MC_FLAG_POLLED || calltype == MC_FLAG_RESET) - mcg.mc_flags = MC_FLAG_POLLED; - else if (calltype == MC_FLAG_CMCI) - mcg.mc_flags = MC_FLAG_CMCI; - x86_mc_get_cpu_info( - cpu, &mcg.mc_socketid, &mcg.mc_coreid, - &mcg.mc_core_threadid, &mcg.mc_apicid, NULL, NULL, NULL); - rdmsrl(MSR_IA32_MCG_STATUS, mcg.mc_gstatus); - - for ( i = 0; i < nr_mce_banks; i++ ) { - struct mcinfo_bank mcb; - /* For CMCI, only owners checks the owned MSRs */ - if ( !test_bit(i, __get_cpu_var(mce_banks_owned)) && - (calltype & MC_FLAG_CMCI) ) - continue; - rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status); - - if (! (status & MCi_STATUS_VAL) ) - continue; - /* - * Uncorrected events are handled by the exception - * handler when it is enabled. But when the exception - * is disabled such as when mcheck_init, log everything. - */ - if ((status & MCi_STATUS_UC) && exceptions) - continue; - - if (status & MCi_STATUS_UC) - uc = 1; - if (status & MCi_STATUS_PCC) - pcc = 1; - - if (!mi) { - mi = x86_mcinfo_getptr(); - if (!mi) { - printk(KERN_ERR "mcheck_poll: Failed to get mc_info entry\n"); - return NULL; - } - x86_mcinfo_clear(mi); - } - memset(&mcb, 0, sizeof(mcb)); - mcb.common.type = MC_TYPE_BANK; - mcb.common.size = sizeof(mcb); - mcb.mc_bank = i; - mcb.mc_status = status; - if (status & MCi_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + 4 * i, mcb.mc_misc); - if (status & MCi_STATUS_ADDRV) { - rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr); - d = maddr_get_owner(addr); - if ( d && (calltype == MC_FLAG_CMCI || calltype == MC_FLAG_POLLED) ) - mcb.mc_domid = d->domain_id; - } - if (cmci_support) - rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2); - if (calltype == MC_FLAG_CMCI) - rdtscll(mcb.mc_tsc); - x86_mcinfo_add(mi, &mcb); - nr_unit++; - add_taint(TAINT_MACHINE_CHECK); - /* Clear state for this bank */ - wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0); - printk(KERN_DEBUG "mcheck_poll: bank%i CPU%d status[%"PRIx64"]\n", - i, cpu, status); - printk(KERN_DEBUG "mcheck_poll: CPU%d, SOCKET%d, CORE%d, APICID[%d], " - "thread[%d]\n", cpu, mcg.mc_socketid, - mcg.mc_coreid, mcg.mc_apicid, mcg.mc_core_threadid); - - } - /* if pcc = 1, uc must be 1 */ - if (pcc) - mcg.mc_flags |= MC_FLAG_UNCORRECTABLE; - else if (uc) - mcg.mc_flags |= MC_FLAG_RECOVERABLE; - else /* correctable */ - mcg.mc_flags |= MC_FLAG_CORRECTABLE; - - if (nr_unit && nr_intel_ext_msrs && - (mcg.mc_gstatus & MCG_STATUS_EIPV)) { - intel_get_extended_msrs(&mce); - x86_mcinfo_add(mi, &mce); - } - if (nr_unit) - x86_mcinfo_add(mi, &mcg); - /* Clear global state */ - return mi; -} - -static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code) -{ - /* MACHINE CHECK Error handler will be sent in another patch, - * simply copy old solutions here. This code will be replaced - * by upcoming machine check patches - */ - - int recover=1; - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? 
*/ - recover=0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i=0; i<nr_mce_banks; i++) { - rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); - if (high & (1<<31)) { - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); - high &= ~(1<<31); - if (high & (1<<27)) { - rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); - printk ("[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - printk (" at %08x%08x", ahigh, alow); - } - printk ("\n"); - } - } - - if (recover & 2) - mc_panic ("CPU context corrupt"); - if (recover & 1) - mc_panic ("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. - */ - for (i=0; i<nr_mce_banks; i++) { - u32 msr; - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr (msr, low, high); - if (high&(1<<31)) { - /* Clear it */ - wrmsr(msr, 0UL, 0UL); - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); + memset(&mc_ext, 0, sizeof(struct mcinfo_extended)); + mc_ext.common.type = MC_TYPE_EXTENDED; + mc_ext.common.size = sizeof(mc_ext); + mc_ext.mc_msrs = 10; + + mc_ext.mc_msr[0].reg = MSR_IA32_MCG_EAX; + rdmsrl(MSR_IA32_MCG_EAX, mc_ext.mc_msr[0].value); + mc_ext.mc_msr[1].reg = MSR_IA32_MCG_EBX; + rdmsrl(MSR_IA32_MCG_EBX, mc_ext.mc_msr[1].value); + mc_ext.mc_msr[2].reg = MSR_IA32_MCG_ECX; + rdmsrl(MSR_IA32_MCG_ECX, mc_ext.mc_msr[2].value); + + mc_ext.mc_msr[3].reg = MSR_IA32_MCG_EDX; + rdmsrl(MSR_IA32_MCG_EDX, mc_ext.mc_msr[3].value); + mc_ext.mc_msr[4].reg = MSR_IA32_MCG_ESI; + rdmsrl(MSR_IA32_MCG_ESI, mc_ext.mc_msr[4].value); + mc_ext.mc_msr[5].reg = MSR_IA32_MCG_EDI; + rdmsrl(MSR_IA32_MCG_EDI, mc_ext.mc_msr[5].value); + + mc_ext.mc_msr[6].reg = MSR_IA32_MCG_EBP; + rdmsrl(MSR_IA32_MCG_EBP, mc_ext.mc_msr[6].value); + mc_ext.mc_msr[7].reg = MSR_IA32_MCG_ESP; + rdmsrl(MSR_IA32_MCG_ESP, mc_ext.mc_msr[7].value); + mc_ext.mc_msr[8].reg = MSR_IA32_MCG_EFLAGS; + rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext.mc_msr[8].value); + mc_ext.mc_msr[9].reg = MSR_IA32_MCG_EIP; + rdmsrl(MSR_IA32_MCG_EIP, mc_ext.mc_msr[9].value); + + x86_mcinfo_add(mci, &mc_ext); + + return MCA_EXTINFO_GLOBAL; +} + +static void intel_machine_check(struct cpu_user_regs * regs, long error_code) +{ + mcheck_cmn_handler(regs, error_code, mca_allbanks); } static DEFINE_SPINLOCK(cmci_discover_lock); @@ -369,6 +199,8 @@ static void cmci_discover(void) unsigned long flags; int i; struct mc_info *mi = NULL; + mctelem_cookie_t mctc; + struct mca_summary bs; printk(KERN_DEBUG "CMCI: find owner on CPU%d\n", smp_processor_id()); @@ -385,12 +217,20 @@ static void cmci_discover(void) * MCi_status (error_count bit 38~52) is not cleared, * the CMCI interrupt will never be triggered again. 
*/ - mi = machine_check_poll(MC_FLAG_CMCI); - if (mi) { - x86_mcinfo_dump(mi); - if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) + + mctc = mcheck_mca_logout( + MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs); + + if (bs.errcnt && mctc != NULL) { + if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) { + mctelem_commit(mctc); send_guest_global_virq(dom0, VIRQ_MCA); - } + } else { + x86_mcinfo_dump(mi); + mctelem_dismiss(mctc); + } + } else if (mctc != NULL) + mctelem_dismiss(mctc); printk(KERN_DEBUG "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n", smp_processor_id(), @@ -487,17 +327,26 @@ fastcall void smp_cmci_interrupt(struct fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs) { struct mc_info *mi = NULL; - int cpu = smp_processor_id(); + mctelem_cookie_t mctc; + struct mca_summary bs; ack_APIC_irq(); irq_enter(); - printk(KERN_DEBUG "CMCI: cmci_intr happen on CPU%d\n", cpu); - mi = machine_check_poll(MC_FLAG_CMCI); - if (mi) { - x86_mcinfo_dump(mi); - if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) + + mctc = mcheck_mca_logout( + MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs); + + if (bs.errcnt && mctc != NULL) { + if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) { + mctelem_commit(mctc); send_guest_global_virq(dom0, VIRQ_MCA); - } + } else { + x86_mcinfo_dump(mi); + mctelem_dismiss(mctc); + } + } else if (mctc != NULL) + mctelem_dismiss(mctc); + irq_exit(); } @@ -527,28 +376,28 @@ static void mce_cap_init(struct cpuinfo_ printk (KERN_INFO "CPU%d: Intel Extended MCE MSRs (%d) available\n", smp_processor_id(), nr_intel_ext_msrs); } - /* for most of p6 family, bank 0 is an alias bios MSR. - * But after model>1a, bank 0 is available*/ - if ( c->x86 == 6 && c->x86_vendor == X86_VENDOR_INTEL - && c->x86_model < 0x1A) - firstbank = 1; - else - firstbank = 0; + firstbank = mce_firstbank(c); } static void mce_init(void) { u32 l, h; int i; - struct mc_info *mi; + mctelem_cookie_t mctc; + struct mca_summary bs; + clear_in_cr4(X86_CR4_MCE); + /* log the machine checks left over from the previous reset. * This also clears all registers*/ - mi = machine_check_poll(MC_FLAG_RESET); + mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs); + /* in the boot up stage, don't inject to DOM0, but print out */ - if (mi) - x86_mcinfo_dump(mi); + if (bs.errcnt && mctc != NULL) { + x86_mcinfo_dump(mctelem_dataptr(mctc)); + mctelem_dismiss(mctc); + } set_in_cr4(X86_CR4_MCE); rdmsr (MSR_IA32_MCG_CAP, l, h); @@ -573,71 +422,19 @@ static void mce_init(void) } /* p4/p6 family have similar MCA initialization process */ -void intel_mcheck_init(struct cpuinfo_x86 *c) +int intel_mcheck_init(struct cpuinfo_x86 *c) { mce_cap_init(c); printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", smp_processor_id()); + /* machine check is available */ - machine_check_vector = intel_machine_check; + x86_mce_vector_register(intel_machine_check); + x86_mce_callback_register(intel_get_extended_msrs); + mce_init(); mce_intel_feature_init(c); mce_set_owner(); -} - -/* - * Periodic polling timer for "silent" machine check errors. If the - * poller finds an MCE, poll faster. 
When the poller finds no more - * errors, poll slower -*/ -static struct timer mce_timer; - -#define MCE_PERIOD 4000 -#define MCE_MIN 2000 -#define MCE_MAX 32000 - -static u64 period = MCE_PERIOD; -static int adjust = 0; - -static void mce_intel_checkregs(void *info) -{ - struct mc_info *mi; - - if( !mce_available(¤t_cpu_data)) - return; - mi = machine_check_poll(MC_FLAG_POLLED); - if (mi) - { - x86_mcinfo_dump(mi); - adjust++; - if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) - send_guest_global_virq(dom0, VIRQ_MCA); - } -} - -static void mce_intel_work_fn(void *data) -{ - on_each_cpu(mce_intel_checkregs, data, 1, 1); - if (adjust) { - period = period / (adjust + 1); - printk(KERN_DEBUG "mcheck_poll: Find error, shorten interval " - "to %"PRIu64"\n", period); - } - else { - period *= 2; - } - if (period > MCE_MAX) - period = MCE_MAX; - if (period < MCE_MIN) - period = MCE_MIN; - set_timer(&mce_timer, NOW() + MILLISECS(period)); - adjust = 0; -} - -void intel_mcheck_timer(struct cpuinfo_x86 *c) -{ - printk(KERN_DEBUG "mcheck_poll: Init_mcheck_timer\n"); - init_timer(&mce_timer, mce_intel_work_fn, NULL, 0); - set_timer(&mce_timer, NOW() + MILLISECS(MCE_PERIOD)); -} - + + return 1; +} diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/mctelem.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/cpu/mcheck/mctelem.c Tue Mar 17 14:22:50 2009 +0000 @@ -0,0 +1,443 @@ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +/* + * mctelem.c - x86 Machine Check Telemetry Transport + */ + +#include <xen/init.h> +#include <xen/types.h> +#include <xen/kernel.h> +#include <xen/config.h> +#include <xen/smp.h> +#include <xen/errno.h> +#include <xen/sched.h> +#include <xen/sched-if.h> +#include <xen/cpumask.h> +#include <xen/event.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> + +#include "mce.h" + +struct mctelem_ent { + struct mctelem_ent *mcte_next; /* next in chronological order */ + struct mctelem_ent *mcte_prev; /* previous in chronological order */ + uint32_t mcte_flags; /* See MCTE_F_* below */ + uint32_t mcte_refcnt; /* Reference count */ + void *mcte_data; /* corresponding data payload */ +}; + +#define MCTE_F_HOME_URGENT 0x0001U /* free to urgent freelist */ +#define MCTE_F_HOME_NONURGENT 0x0002U /* free to nonurgent freelist */ +#define MCTE_F_CLASS_URGENT 0x0004U /* in use - urgent errors */ +#define MCTE_F_CLASS_NONURGENT 0x0008U /* in use - nonurgent errors */ +#define MCTE_F_STATE_FREE 0x0010U /* on a freelist */ +#define MCTE_F_STATE_UNCOMMITTED 0x0020U /* reserved; on no list */ +#define MCTE_F_STATE_COMMITTED 0x0040U /* on a committed list */ +#define MCTE_F_STATE_PROCESSING 0x0080U /* on a processing list */ + +#define MCTE_F_MASK_HOME (MCTE_F_HOME_URGENT | MCTE_F_HOME_NONURGENT) +#define MCTE_F_MASK_CLASS (MCTE_F_CLASS_URGENT | MCTE_F_CLASS_NONURGENT) +#define MCTE_F_MASK_STATE (MCTE_F_STATE_FREE | \ + MCTE_F_STATE_UNCOMMITTED | \ + MCTE_F_STATE_COMMITTED | \ + MCTE_F_STATE_PROCESSING) + +#define MCTE_HOME(tep) ((tep)->mcte_flags & MCTE_F_MASK_HOME) + +#define MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS) +#define MCTE_SET_CLASS(tep, new) do { \ + (tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \ + (tep)->mcte_flags |= MCTE_F_CLASS_##new; } while (0) + 
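The mcte_flags word above packs three orthogonal facts about each telemetry entry: which freelist it returns home to, which class it is currently serving, and where it sits in its lifecycle. The standalone sketch below is not part of the patch; it uses simplified names and a user-space assert in place of BUG_ON, and walks a single urgent entry through the reserve, commit, consume and free transitions to show how the state bits are swapped:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define F_HOME_URGENT       0x0001U
#define F_CLASS_URGENT      0x0004U
#define F_STATE_FREE        0x0010U
#define F_STATE_UNCOMMITTED 0x0020U
#define F_STATE_COMMITTED   0x0040U
#define F_STATE_PROCESSING  0x0080U
#define F_MASK_STATE (F_STATE_FREE | F_STATE_UNCOMMITTED | \
                      F_STATE_COMMITTED | F_STATE_PROCESSING)

/* Simplified stand-in for the state-transition macro: check the old
 * state, clear the state bits, set the new state. */
static void transition(uint32_t *flags, uint32_t from, uint32_t to)
{
        assert((*flags & F_MASK_STATE) == from);
        *flags = (*flags & ~F_MASK_STATE) | to;
}

int main(void)
{
        /* An urgent entry begins life free on its home freelist. */
        uint32_t flags = F_HOME_URGENT | F_STATE_FREE;

        transition(&flags, F_STATE_FREE, F_STATE_UNCOMMITTED);      /* reserve */
        flags |= F_CLASS_URGENT;                                    /* class set at reserve */
        transition(&flags, F_STATE_UNCOMMITTED, F_STATE_COMMITTED); /* commit */
        transition(&flags, F_STATE_COMMITTED, F_STATE_PROCESSING);  /* consume */
        transition(&flags, F_STATE_PROCESSING, F_STATE_FREE);       /* ack, back to freelist */

        printf("final flags: %#x\n", flags);
        return 0;
}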
+#define MCTE_STATE(tep) ((tep)->mcte_flags & MCTE_F_MASK_STATE) +#define MCTE_TRANSITION_STATE(tep, old, new) do { \ + BUG_ON(MCTE_STATE(tep) != (MCTE_F_STATE_##old)); \ + (tep)->mcte_flags &= ~MCTE_F_MASK_STATE; \ + (tep)->mcte_flags |= (MCTE_F_STATE_##new); } while (0) + +#define MC_URGENT_NENT 10 +#define MC_NONURGENT_NENT 20 + +#define MC_NCLASSES (MC_NONURGENT + 1) + +#define COOKIE2MCTE(c) ((struct mctelem_ent *)(c)) +#define MCTE2COOKIE(tep) ((mctelem_cookie_t)(tep)) + +static struct mc_telem_ctl { + /* Linked lists that thread the array members together. + * + * The free lists are singly-linked via mcte_next, and we allocate + * from them by atomically unlinking an element from the head. + * Consumed entries are returned to the head of the free list. + * When an entry is reserved off the free list it is not linked + * on any list until it is committed or dismissed. + * + * The committed list grows at the head and we do not maintain a + * tail pointer; insertions are performed atomically. The head + * thus has the most-recently committed telemetry, i.e. the + * list is in reverse chronological order. The committed list + * is singly-linked via mcte_prev pointers, and mcte_next is NULL. + * When we move telemetry from the committed list to the processing + * list we atomically unlink the committed list and keep a pointer + * to the head of that list; we then traverse the list following + * mcte_prev and fill in mcte_next to doubly-link the list, and then + * append the tail of the list onto the processing list. If we panic + * during this manipulation of the committed list we still have + * the pointer to its head so we can recover all entries during + * the panic flow (albeit in reverse chronological order). + * + * The processing list is updated in a controlled context, and + * we can lock it for updates. The head of the processing list + * always has the oldest telemetry, and we append (as above) + * at the tail of the processing list. */ + struct mctelem_ent *mctc_free[MC_NCLASSES]; + struct mctelem_ent *mctc_committed[MC_NCLASSES]; + struct mctelem_ent *mctc_processing_head[MC_NCLASSES]; + struct mctelem_ent *mctc_processing_tail[MC_NCLASSES]; + /* + * Telemetry array + */ + struct mctelem_ent *mctc_elems; +} mctctl; + +/* Lock protecting all processing lists */ +static DEFINE_SPINLOCK(processing_lock); + +static void *cmpxchgptr(void *ptr, void *old, void *new) +{ + unsigned long *ulp = (unsigned long *)ptr; + unsigned long a = (unsigned long)old; + unsigned long b = (unsigned long)new; + + return (void *)cmpxchg(ulp, a, b); +} + +/* Free an entry to its native free list; the entry must not be linked on + * any list. + */ +static void mctelem_free(struct mctelem_ent *tep) +{ + mctelem_class_t target = MCTE_HOME(tep) == MCTE_F_HOME_URGENT ? + MC_URGENT : MC_NONURGENT; + struct mctelem_ent **freelp; + struct mctelem_ent *oldhead; + + BUG_ON(tep->mcte_refcnt != 0); + BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE); + + tep->mcte_prev = NULL; + freelp = &mctctl.mctc_free[target]; + for (;;) { + oldhead = *freelp; + tep->mcte_next = oldhead; + wmb(); + if (cmpxchgptr(freelp, oldhead, tep) == oldhead) + break; + } +} + +/* Increment the reference count of an entry that is not linked on to + * any list and which only the caller has a pointer to. + */ +static void mctelem_hold(struct mctelem_ent *tep) +{ + tep->mcte_refcnt++; +} + +/* Increment the reference count on an entry that is linked at the head of + * a processing list. The caller is responsible for locking the list. 
+ */ +static void mctelem_processing_hold(struct mctelem_ent *tep) +{ + int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ? + MC_URGENT : MC_NONURGENT; + + BUG_ON(tep != mctctl.mctc_processing_head[which]); + tep->mcte_refcnt++; +} + +/* Decrement the reference count on an entry that is linked at the head of + * a processing list. The caller is responsible for locking the list. + */ +static void mctelem_processing_release(struct mctelem_ent *tep) +{ + int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ? + MC_URGENT : MC_NONURGENT; + + BUG_ON(tep != mctctl.mctc_processing_head[which]); + if (--tep->mcte_refcnt == 0) { + MCTE_TRANSITION_STATE(tep, PROCESSING, FREE); + mctctl.mctc_processing_head[which] = tep->mcte_next; + mctelem_free(tep); + } +} + +void mctelem_init(int reqdatasz) +{ + static int called = 0; + static int datasz = 0, realdatasz = 0; + char *datarr; + int i; + + BUG_ON(MC_URGENT != 0 || MC_NONURGENT != 1 || MC_NCLASSES != 2); + + /* Called from mcheck_init for all processors; initialize for the + * first call only (no race here since the boot cpu completes + * init before others start up). */ + if (++called == 1) { + realdatasz = reqdatasz; + datasz = (reqdatasz & ~0xf) + 0x10; /* 16 byte roundup */ + } else { + BUG_ON(reqdatasz != realdatasz); + return; + } + + if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent, + MC_URGENT_NENT + MC_NONURGENT_NENT)) == NULL || + (datarr = xmalloc_bytes((MC_URGENT_NENT + MC_NONURGENT_NENT) * + datasz)) == NULL) { + if (mctctl.mctc_elems) + xfree(mctctl.mctc_elems); + printk("Allocations for MCA telemetry failed\n"); + return; + } + + for (i = 0; i < MC_URGENT_NENT + MC_NONURGENT_NENT; i++) { + struct mctelem_ent *tep, **tepp; + + tep = mctctl.mctc_elems + i; + tep->mcte_flags = MCTE_F_STATE_FREE; + tep->mcte_refcnt = 0; + tep->mcte_data = datarr + i * datasz; + + if (i < MC_URGENT_NENT) { + tepp = &mctctl.mctc_free[MC_URGENT]; + tep->mcte_flags |= MCTE_F_HOME_URGENT; + } else { + tepp = &mctctl.mctc_free[MC_NONURGENT]; + tep->mcte_flags |= MCTE_F_HOME_NONURGENT; + } + + tep->mcte_next = *tepp; + tep->mcte_prev = NULL; + *tepp = tep; + } +} + +/* incremented non-atomically when reserve fails */ +static int mctelem_drop_count; + +/* Reserve a telemetry entry, or return NULL if none available. + * If we return an entry then the caller must subsequently call exactly one of + * mctelem_unreserve or mctelem_commit for that entry. + */ +mctelem_cookie_t mctelem_reserve(mctelem_class_t which) +{ + struct mctelem_ent **freelp; + struct mctelem_ent *oldhead, *newhead; + mctelem_class_t target = (which == MC_URGENT) ? 
+ MC_URGENT : MC_NONURGENT; + + freelp = &mctctl.mctc_free[target]; + for (;;) { + if ((oldhead = *freelp) == NULL) { + if (which == MC_URGENT && target == MC_URGENT) { + /* raid the non-urgent freelist */ + target = MC_NONURGENT; + freelp = &mctctl.mctc_free[target]; + continue; + } else { + mctelem_drop_count++; + return (NULL); + } + } + + newhead = oldhead->mcte_next; + if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) { + struct mctelem_ent *tep = oldhead; + + mctelem_hold(tep); + MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED); + tep->mcte_next = NULL; + tep->mcte_prev = NULL; + if (which == MC_URGENT) + MCTE_SET_CLASS(tep, URGENT); + else + MCTE_SET_CLASS(tep, NONURGENT); + return MCTE2COOKIE(tep); + } + } +} + +void *mctelem_dataptr(mctelem_cookie_t cookie) +{ + struct mctelem_ent *tep = COOKIE2MCTE(cookie); + + return tep->mcte_data; +} + +/* Release a previously reserved entry back to the freelist without + * submitting it for logging. The entry must not be linked on to any + * list - that's how mctelem_reserve handed it out. + */ +void mctelem_dismiss(mctelem_cookie_t cookie) +{ + struct mctelem_ent *tep = COOKIE2MCTE(cookie); + + tep->mcte_refcnt--; + MCTE_TRANSITION_STATE(tep, UNCOMMITTED, FREE); + mctelem_free(tep); +} + +/* Commit an entry with completed telemetry for logging. The caller must + * not reference the entry after this call. Note that we add entries + * at the head of the committed list, so that list therefore has entries + * in reverse chronological order. + */ +void mctelem_commit(mctelem_cookie_t cookie) +{ + struct mctelem_ent *tep = COOKIE2MCTE(cookie); + struct mctelem_ent **commlp; + struct mctelem_ent *oldhead; + mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ? + MC_URGENT : MC_NONURGENT; + + BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL); + MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED); + + commlp = &mctctl.mctc_committed[target]; + for (;;) { + oldhead = *commlp; + tep->mcte_prev = oldhead; + wmb(); + if (cmpxchgptr(commlp, oldhead, tep) == oldhead) + break; + } +} + +/* Move telemetry from committed list to processing list, reversing the + * list into chronological order. The processing list has been + * locked by the caller, and may be non-empty. We append the + * reversed committed list on to the tail of the processing list. + * The committed list may grow even while we run, so use atomic + * operations to swap NULL to the freelist head. + * + * Note that "chronological order" means the order in which producers + * won additions to the processing list, which may not reflect the + * strict chronological order of the associated events if events are + * closely spaced in time and contend for the processing list at once. + */ + +static struct mctelem_ent *dangling[MC_NCLASSES]; + +static void mctelem_append_processing(mctelem_class_t which) +{ + mctelem_class_t target = which == MC_URGENT ? + MC_URGENT : MC_NONURGENT; + struct mctelem_ent **commlp = &mctctl.mctc_committed[target]; + struct mctelem_ent **proclhp = &mctctl.mctc_processing_head[target]; + struct mctelem_ent **procltp = &mctctl.mctc_processing_tail[target]; + struct mctelem_ent *tep, *ltep; + + /* Check for an empty list; no race since we hold the processing lock */ + if (*commlp == NULL) + return; + + /* Atomically unlink the committed list, and keep a pointer to + * the list we unlink in a well-known location so it can be + * picked up in panic code should we panic between this unlink + * and the append to the processing list. 
*/ + for (;;) { + dangling[target] = *commlp; + wmb(); + if (cmpxchgptr(commlp, dangling[target], NULL) == + dangling[target]) + break; + } + + if (dangling[target] == NULL) + return; + + /* Traverse the list following the previous pointers (reverse + * chronological order). For each entry fill in the next pointer + * and transition the element state. */ + for (tep = dangling[target], ltep = NULL; tep != NULL; + tep = tep->mcte_prev) { + MCTE_TRANSITION_STATE(tep, COMMITTED, PROCESSING); + tep->mcte_next = ltep; + ltep = tep; + } + + /* ltep points to the head of a chronologically ordered linked + * list of telemetry entries ending at the most recent entry + * dangling[target] if mcte_next is followed; tack this on to + * the processing list. + */ + if (*proclhp == NULL) { + *proclhp = ltep; + *procltp = dangling[target]; + } else { + (*procltp)->mcte_next = ltep; + ltep->mcte_prev = *procltp; + *procltp = dangling[target]; + } + wmb(); + dangling[target] = NULL; + wmb(); +} + +mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which) +{ + mctelem_class_t target = (which == MC_URGENT) ? + MC_URGENT : MC_NONURGENT; + struct mctelem_ent *tep; + + spin_lock(&processing_lock); + mctelem_append_processing(target); + if ((tep = mctctl.mctc_processing_head[target]) == NULL) { + spin_unlock(&processing_lock); + return NULL; + } + + mctelem_processing_hold(tep); + wmb(); + spin_unlock(&processing_lock); + return MCTE2COOKIE(tep); +} + +void mctelem_consume_oldest_end(mctelem_cookie_t cookie) +{ + struct mctelem_ent *tep = COOKIE2MCTE(cookie); + + spin_lock(&processing_lock); + mctelem_processing_release(tep); + wmb(); + spin_unlock(&processing_lock); +} + +void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie) +{ + mctelem_class_t target = (which == MC_URGENT) ? + MC_URGENT : MC_NONURGENT; + struct mctelem_ent *tep = COOKIE2MCTE(cookie); + + if (tep == NULL) + return; + + spin_lock(&processing_lock); + if (tep == mctctl.mctc_processing_head[target]) + mctelem_processing_release(tep); + wmb(); + spin_unlock(&processing_lock); +} diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/mctelem.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/cpu/mcheck/mctelem.h Tue Mar 17 14:22:50 2009 +0000 @@ -0,0 +1,71 @@ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#ifndef _MCTELEM_H + +#define _MCTELEM_H + +#include <xen/init.h> +#include <xen/smp.h> +#include <asm/traps.h> + +/* Helper functions used for collecting error telemetry. + * + * mctelem_init preallocates a number of data areas for use during + * machine check data "logout". Two classes are distinguished - + * urgent uses, intended for use from machine check exception handlers, + * and non-urgent uses intended for use from error pollers. + * Associated with each logout entry of whatever class is a data area + * sized per the single argument to mctelem_init. mcelem_init should be + * called from MCA init code before anybody has the chance to change the + * machine check vector with mcheck_mca_logout or to use mcheck_mca_logout. + * + * To reserve an entry of a given class for use in logout, call + * mctelem_reserve (or use the common handler functions which do all this + * for you). 
This returns an opaque cookie, or NULL if no elements are + * available. Elements are reserved with an atomic operation so no deadlock + * will occur if, for example, a machine check exception interrupts a + * scheduled error poll. The implementation will raid free non-urgent + * entries if all urgent entries are in use when an urgent request is received. + * Once an entry is reserved the caller must eventually perform exactly + * one of two actions: mctelem_commit or mctelem_dismiss. + * + * On mctelem_commit the entry is appended to a processing list; mctelem_dismiss + * frees the element without processing. After either call the cookie + * must not be referenced again. + * + * To consume committed telemetry call mctelem_consume_oldest_begin + * which will return a cookie referencing the oldest (first committed) + * entry of the requested class. Access the associated data using + * mctelem_dataptr and when finished use mctelem_consume_oldest_end - in the + * begin .. end bracket you are guaranteed that the entry canot be freed + * even if it is ack'd elsewhere). Once the ultimate consumer of the + * telemetry has processed it to stable storage it should acknowledge + * the telemetry quoting the cookie id, at which point we will free + * the element from the processing list. + */ + +typedef struct mctelem_cookie *mctelem_cookie_t; + +typedef enum mctelem_class { + MC_URGENT, + MC_NONURGENT +} mctelem_class_t; + +extern void mctelem_init(int); +extern mctelem_cookie_t mctelem_reserve(mctelem_class_t); +extern void *mctelem_dataptr(mctelem_cookie_t); +extern void mctelem_commit(mctelem_cookie_t); +extern void mctelem_dismiss(mctelem_cookie_t); +extern mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t); +extern void mctelem_consume_oldest_end(mctelem_cookie_t); +extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t); + +#endif diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/non-fatal.c --- a/xen/arch/x86/cpu/mcheck/non-fatal.c Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/arch/x86/cpu/mcheck/non-fatal.c Tue Mar 17 14:22:50 2009 +0000 @@ -14,46 +14,76 @@ #include <xen/smp.h> #include <xen/timer.h> #include <xen/errno.h> +#include <xen/event.h> +#include <xen/sched.h> #include <asm/processor.h> #include <asm/system.h> #include <asm/msr.h> #include "mce.h" -#include "x86_mca.h" -int firstbank = 0; + +static cpu_banks_t bankmask; static struct timer mce_timer; -#define MCE_PERIOD MILLISECS(15000) +#define MCE_PERIOD MILLISECS(8000) +#define MCE_PERIOD_MIN MILLISECS(2000) +#define MCE_PERIOD_MAX MILLISECS(16000) + +static uint64_t period = MCE_PERIOD; +static int adjust = 0; +static int variable_period = 1; static void mce_checkregs (void *info) { - u32 low, high; - int i; + mctelem_cookie_t mctc; + struct mca_summary bs; + static uint64_t dumpcount = 0; - for (i=firstbank; i<nr_mce_banks; i++) { - rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high); + mctc = mcheck_mca_logout(MCA_POLLER, bankmask, &bs); - if (high & (1<<31)) { - printk(KERN_INFO "MCE: The hardware reports a non " - "fatal, correctable incident occurred on " - "CPU %d.\n", - smp_processor_id()); - printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low); + if (bs.errcnt && mctc != NULL) { + adjust++; - /* Scrub the error so we don't pick it up in MCE_RATE seconds time. */ - wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + /* If Dom0 enabled the VIRQ_MCA event, then notify it. + * Otherwise, if dom0 has had plenty of time to register + * the virq handler but still hasn't then dump telemetry + * to the Xen console. 
The call count may be incremented + * on multiple cpus at once and is indicative only - just + * a simple-minded attempt to avoid spamming the console + * for corrected errors in early startup. + */ - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); + if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) { + mctelem_commit(mctc); + send_guest_global_virq(dom0, VIRQ_MCA); + } else if (++dumpcount >= 10) { + x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc)); + mctelem_dismiss(mctc); + } else { + mctelem_dismiss(mctc); } + } else if (mctc != NULL) { + mctelem_dismiss(mctc); } } static void mce_work_fn(void *data) { on_each_cpu(mce_checkregs, NULL, 1, 1); - set_timer(&mce_timer, NOW() + MCE_PERIOD); + + if (variable_period) { + if (adjust) + period /= (adjust + 1); + else + period *= 2; + if (period > MCE_PERIOD_MAX) + period = MCE_PERIOD_MAX; + if (period < MCE_PERIOD_MIN) + period = MCE_PERIOD_MIN; + } + + set_timer(&mce_timer, NOW() + period); + adjust = 0; } static int __init init_nonfatal_mce_checker(void) @@ -63,13 +93,17 @@ static int __init init_nonfatal_mce_chec /* Check for MCE support */ if (!mce_available(c)) return -ENODEV; + + memcpy(&bankmask, &mca_allbanks, sizeof (cpu_banks_t)); + if (mce_firstbank(c) == 1) + clear_bit(0, bankmask); + /* * Check for non-fatal errors every MCE_RATE s */ switch (c->x86_vendor) { case X86_VENDOR_AMD: if (c->x86 == 6) { /* K7 */ - firstbank = 1; init_timer(&mce_timer, mce_work_fn, NULL, 0); set_timer(&mce_timer, NOW() + MCE_PERIOD); break; @@ -80,15 +114,14 @@ static int __init init_nonfatal_mce_chec break; case X86_VENDOR_INTEL: - /* p5 family is different. P4/P6 and latest CPUs shares the - * same polling methods - */ + /* + * The P5 family is different. P4/P6 and latest CPUs share the + * same polling methods. 
+ */ if ( c->x86 != 5 ) { - /* some CPUs or banks don't support cmci, we need to - * enable this feature anyway - */ - intel_mcheck_timer(c); + init_timer(&mce_timer, mce_work_fn, NULL, 0); + set_timer(&mce_timer, NOW() + MCE_PERIOD); } break; } diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/p5.c --- a/xen/arch/x86/cpu/mcheck/p5.c Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/arch/x86/cpu/mcheck/p5.c Tue Mar 17 14:22:50 2009 +0000 @@ -16,7 +16,7 @@ #include "x86_mca.h" /* Machine check handler for Pentium class Intel */ -static fastcall void pentium_machine_check(struct cpu_user_regs * regs, long error_code) +static void pentium_machine_check(struct cpu_user_regs * regs, long error_code) { u32 loaddr, hi, lotype; rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); @@ -28,19 +28,14 @@ static fastcall void pentium_machine_che } /* Set up machine check reporting for processors with Intel style MCE */ -void intel_p5_mcheck_init(struct cpuinfo_x86 *c) +int intel_p5_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; - /*Check for MCE support */ - if( !cpu_has(c, X86_FEATURE_MCE) ) - return; - /* Default P5 to off as its often misconnected */ if(mce_disabled != -1) - return; - machine_check_vector = pentium_machine_check; - wmb(); + return 0; + x86_mce_vector_register(pentium_machine_check); /* Read registers before enabling */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); @@ -50,4 +45,6 @@ void intel_p5_mcheck_init(struct cpuinfo /* Enable MCE */ set_in_cr4(X86_CR4_MCE); printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); + + return 1; } diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/winchip.c --- a/xen/arch/x86/cpu/mcheck/winchip.c Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/arch/x86/cpu/mcheck/winchip.c Tue Mar 17 14:22:50 2009 +0000 @@ -16,22 +16,24 @@ #include "mce.h" /* Machine check handler for WinChip C6 */ -static fastcall void winchip_machine_check(struct cpu_user_regs * regs, long error_code) +static void winchip_machine_check(struct cpu_user_regs * regs, long error_code) { printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK); } /* Set up machine check reporting on the Winchip C6 series */ -void winchip_mcheck_init(struct cpuinfo_x86 *c) +int winchip_mcheck_init(struct cpuinfo_x86 *c) { u32 lo, hi; - machine_check_vector = winchip_machine_check; + wmb(); + x86_mce_vector_register(winchip_machine_check); rdmsr(MSR_IDT_FCR1, lo, hi); lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */ lo&= ~(1<<4); /* Enable MCE */ wrmsr(MSR_IDT_FCR1, lo, hi); set_in_cr4(X86_CR4_MCE); printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); + return (1); } diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/x86_mca.h --- a/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Mar 17 14:22:50 2009 +0000 @@ -16,6 +16,10 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef X86_MCA_H + +#define X86_MCA_H /* The MCA/MCE MSRs should not be used anywhere else. 
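As the p5.c and winchip.c conversions just above show, each per-vendor init routine now registers its #MC handler through x86_mce_vector_register() and reports through its return value whether machine-check handling was actually enabled, rather than assigning the machine_check_vector pointer directly. A minimal sketch of that shape for a hypothetical vendor follows; vendor_machine_check and vendor_mcheck_init are illustrative names, not code from this changeset:

/* Assumes the usual mcheck includes: <xen/types.h>, <asm/processor.h>,
 * <asm/msr.h> and "mce.h". */
static void vendor_machine_check(struct cpu_user_regs *regs, long error_code)
{
        /* Defer to the common handler across all banks. */
        mcheck_cmn_handler(regs, error_code, mca_allbanks);
}

int vendor_mcheck_init(struct cpuinfo_x86 *c)
{
        if (!cpu_has(c, X86_FEATURE_MCA))
                return 0;       /* no MCA support: nothing enabled */

        x86_mce_vector_register(vendor_machine_check);
        set_in_cr4(X86_CR4_MCE);
        return 1;               /* machine check reporting enabled */
}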
@@ -73,6 +77,9 @@ /* reserved bits */ #define MCi_STATUS_OTHER_RESERVED2 0x0180000000000000ULL +/* Bitfield of MSR_K8_HWCR register */ +#define K8_HWCR_MCi_STATUS_WREN (1ULL << 18) + /*Intel Specific bitfield*/ #define CMCI_THRESHOLD 0x2 @@ -87,3 +94,4 @@ extern unsigned int nr_mce_banks; extern unsigned int nr_mce_banks; extern int firstbank; +#endif /* X86_MCA_H */ diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/include/asm-x86/traps.h --- a/xen/include/asm-x86/traps.h Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/include/asm-x86/traps.h Tue Mar 17 14:22:50 2009 +0000 @@ -28,7 +28,7 @@ struct softirq_trap { struct cpu_user_regs; -extern void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code); +extern void machine_check_vector(struct cpu_user_regs *regs, long error_code); /** * guest_has_trap_callback diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/include/public/arch-x86/xen-mca.h --- a/xen/include/public/arch-x86/xen-mca.h Tue Mar 17 14:21:18 2009 +0000 +++ b/xen/include/public/arch-x86/xen-mca.h Tue Mar 17 14:22:50 2009 +0000 @@ -56,13 +56,20 @@ /* Hypercall */ #define __HYPERVISOR_mca __HYPERVISOR_arch_0 -#define XEN_MCA_INTERFACE_VERSION 0x03000002 - -/* IN: Dom0 calls hypercall from MC event handler. */ -#define XEN_MC_CORRECTABLE 0x0 -/* IN: Dom0/DomU calls hypercall from MC trap handler. */ -#define XEN_MC_TRAP 0x1 -/* XEN_MC_CORRECTABLE and XEN_MC_TRAP are mutually exclusive. */ +/* + * The xen-unstable repo has interface version 0x03000001; out interface + * is incompatible with that and any future minor revisions, so we + * choose a different version number range that is numerically less + * than that used in xen-unstable. + */ +#define XEN_MCA_INTERFACE_VERSION 0x01ecc002 + +/* IN: Dom0 calls hypercall to retrieve nonurgent telemetry */ +#define XEN_MC_NONURGENT 0x0001 +/* IN: Dom0/DomU calls hypercall to retrieve urgent telemetry */ +#define XEN_MC_URGENT 0x0002 +/* IN: Dom0 acknowledges previosly-fetched telemetry */ +#define XEN_MC_ACK 0x0004 /* OUT: All is ok */ #define XEN_MC_OK 0x0 @@ -110,6 +117,7 @@ struct mcinfo_common { #define MC_FLAG_POLLED (1 << 3) #define MC_FLAG_RESET (1 << 4) #define MC_FLAG_CMCI (1 << 5) +#define MC_FLAG_MCE (1 << 6) /* contains global x86 mc information */ struct mcinfo_global { struct mcinfo_common common; @@ -174,6 +182,7 @@ struct mc_info { uint8_t mi_data[MCINFO_MAXSIZE - sizeof(uint32_t)]; }; typedef struct mc_info mc_info_t; +DEFINE_XEN_GUEST_HANDLE(mc_info_t); #define __MC_MSR_ARRAYSIZE 8 #define __MC_NMSRS 1 @@ -274,14 +283,14 @@ DEFINE_XEN_GUEST_HANDLE(xen_mc_logical_c #define XEN_MC_fetch 1 struct xen_mc_fetch { /* IN/OUT variables. */ - uint32_t flags; - -/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */ -/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */ + uint32_t flags; /* IN: XEN_MC_NONURGENT, XEN_MC_URGENT, + XEN_MC_ACK if ack'ing an earlier fetch */ + /* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, + XEN_MC_NODATA, XEN_MC_NOMATCH */ + uint64_t fetch_id; /* OUT: id for ack, IN: id we are ack'ing */ /* OUT variables. */ - uint32_t fetch_idx; /* only useful for Dom0 for the notify hypercall */ - struct mc_info mc_info; + XEN_GUEST_HANDLE(mc_info_t) data; }; typedef struct xen_mc_fetch xen_mc_fetch_t; DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t); @@ -296,7 +305,6 @@ struct xen_mc_notifydomain { uint16_t mc_domid; /* The unprivileged domain to notify. */ uint16_t mc_vcpuid; /* The vcpu in mc_domid to notify. * Usually echo'd value from the fetch hypercall. */ - uint32_t fetch_idx; /* echo'd value from the fetch hypercall. 
*/ /* IN/OUT variables. */ uint32_t flags; @@ -316,15 +324,16 @@ struct xen_mc_physcpuinfo { XEN_GUEST_HANDLE(xen_mc_logical_cpu_t) info; }; +typedef union { + struct xen_mc_fetch mc_fetch; + struct xen_mc_notifydomain mc_notifydomain; + struct xen_mc_physcpuinfo mc_physcpuinfo; +} xen_mc_arg_t; + struct xen_mc { uint32_t cmd; uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */ - union { - struct xen_mc_fetch mc_fetch; - struct xen_mc_notifydomain mc_notifydomain; - struct xen_mc_physcpuinfo mc_physcpuinfo; - uint8_t pad[MCINFO_HYPERCALLSIZE]; - } u; + xen_mc_arg_t u; }; typedef struct xen_mc xen_mc_t; DEFINE_XEN_GUEST_HANDLE(xen_mc_t); _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
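One further note on the mctelem freelists added above: mctelem_reserve and mctelem_free form a lock-free LIFO stack, popping and pushing the list head in a compare-and-swap retry loop (cmpxchgptr) so that a machine check exception can safely interrupt a scheduled poll without deadlocking on a list lock. The user-space sketch below reproduces just that pattern with C11 atomics standing in for Xen's cmpxchg; it is illustrative only, not hypervisor code:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct ent {
        struct ent *next;
        int id;
};

static _Atomic(struct ent *) freelist;

/* Push an entry onto the head of the stack; cf. mctelem_free. */
static void push(struct ent *e)
{
        struct ent *old = atomic_load(&freelist);
        do {
                e->next = old;      /* old is refreshed on CAS failure */
        } while (!atomic_compare_exchange_weak(&freelist, &old, e));
}

/* Pop the head of the stack, or NULL if empty; cf. mctelem_reserve. */
static struct ent *pop(void)
{
        struct ent *old = atomic_load(&freelist);
        while (old != NULL &&
               !atomic_compare_exchange_weak(&freelist, &old, old->next))
                ;
        return old;
}

int main(void)
{
        struct ent a = { .id = 1 }, b = { .id = 2 };
        push(&a);
        push(&b);
        struct ent *e = pop();
        printf("popped id %d\n", e ? e->id : -1);   /* LIFO: prints 2 */
        return 0;
}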