[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH 1/2] MCA support with page offlining
This is xen part. [1/2] xen part: mca-support-with-page-offlining-xen.patch Signed-off-by: Kazuhiro Suzuki <kaz@xxxxxxxxxxxxxx> Thanks, KAZ diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/amd_f10.c --- a/xen/arch/x86/cpu/mcheck/amd_f10.c Tue Dec 09 16:28:02 2008 +0000 +++ b/xen/arch/x86/cpu/mcheck/amd_f10.c Mon Dec 15 14:25:07 2008 +0900 @@ -82,8 +82,6 @@ } -extern void k8_machine_check(struct cpu_user_regs *regs, long error_code); - /* AMD Family10 machine check */ void amd_f10_mcheck_init(struct cpuinfo_x86 *c) { @@ -91,7 +89,7 @@ uint32_t i; int cpu_nr; - machine_check_vector = k8_machine_check; + machine_check_vector = x86_machine_check; mc_callback_bank_extended = amd_f10_handler; cpu_nr = smp_processor_id(); wmb(); diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/amd_k8.c --- a/xen/arch/x86/cpu/mcheck/amd_k8.c Tue Dec 09 16:28:02 2008 +0000 +++ b/xen/arch/x86/cpu/mcheck/amd_k8.c Mon Dec 15 14:25:07 2008 +0900 @@ -69,220 +69,8 @@ #include "mce.h" #include "x86_mca.h" +extern int mce_bootlog; -/* Machine Check Handler for AMD K8 family series */ -void k8_machine_check(struct cpu_user_regs *regs, long error_code) -{ - struct vcpu *vcpu = current; - struct domain *curdom; - struct mc_info *mc_data; - struct mcinfo_global mc_global; - struct mcinfo_bank mc_info; - uint64_t status, addrv, miscv, uc; - uint32_t i; - unsigned int cpu_nr; - uint32_t xen_impacted = 0; -#define DOM_NORMAL 0 -#define DOM0_TRAP 1 -#define DOMU_TRAP 2 -#define DOMU_KILLED 4 - uint32_t dom_state = DOM_NORMAL; - - /* This handler runs as interrupt gate. So IPIs from the - * polling service routine are defered until we finished. - */ - - /* Disable interrupts for the _vcpu_. It may not re-scheduled to - * an other physical CPU or the impacted process in the guest - * continues running with corrupted data, otherwise. 
*/ - vcpu_schedule_lock_irq(vcpu); - - mc_data = x86_mcinfo_getptr(); - cpu_nr = smp_processor_id(); - curdom = vcpu->domain; - - memset(&mc_global, 0, sizeof(mc_global)); - mc_global.common.type = MC_TYPE_GLOBAL; - mc_global.common.size = sizeof(mc_global); - - mc_global.mc_domid = curdom->domain_id; /* impacted domain */ - mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */ - BUG_ON(cpu_nr != vcpu->processor); - mc_global.mc_core_threadid = 0; - mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */ -#if 0 /* TODO: on which socket is this physical core? - It's not clear to me how to figure this out. */ - mc_global.mc_socketid = ???; -#endif - mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE; - rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus); - - /* Quick check, who is impacted */ - xen_impacted = is_idle_domain(curdom); - - /* Dom0 */ - x86_mcinfo_clear(mc_data); - x86_mcinfo_add(mc_data, &mc_global); - - for (i = 0; i < nr_mce_banks; i++) { - struct domain *d; - - rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status); - - if (!(status & MCi_STATUS_VAL)) - continue; - - /* An error happened in this bank. - * This is expected to be an uncorrectable error, - * since correctable errors get polled. 
- */ - uc = status & MCi_STATUS_UC; - - memset(&mc_info, 0, sizeof(mc_info)); - mc_info.common.type = MC_TYPE_BANK; - mc_info.common.size = sizeof(mc_info); - mc_info.mc_bank = i; - mc_info.mc_status = status; - - addrv = 0; - if (status & MCi_STATUS_ADDRV) { - rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv); - - d = maddr_get_owner(addrv); - if (d != NULL) - mc_info.mc_domid = d->domain_id; - } - - miscv = 0; - if (status & MCi_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv); - - mc_info.mc_addr = addrv; - mc_info.mc_misc = miscv; - - x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */ - - if (mc_callback_bank_extended) - mc_callback_bank_extended(mc_data, i, status); - - /* clear status */ - wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL); - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - - status = mc_global.mc_gstatus; - - /* clear MCIP or cpu enters shutdown state - * in case another MCE occurs. */ - status &= ~MCG_STATUS_MCIP; - wrmsrl(MSR_IA32_MCG_STATUS, status); - wmb(); - - /* For the details see the discussion "MCE/MCA concept" on xen-devel. - * The thread started here: - * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html - */ - - /* MCG_STATUS_RIPV: - * When this bit is not set, then the instruction pointer onto the stack - * to resume at is not valid. If xen is interrupted, then we panic anyway - * right below. Otherwise it is up to the guest to figure out if - * guest kernel or guest userland is affected and should kill either - * itself or the affected process. - */ - - /* MCG_STATUS_EIPV: - * Evaluation of EIPV is the job of the guest. - */ - - if (xen_impacted) { - /* Now we are going to panic anyway. Allow interrupts, so that - * printk on serial console can work. */ - vcpu_schedule_unlock_irq(vcpu); - - /* Uh, that means, machine check exception - * inside Xen occured. 
*/ - printk("Machine check exception occured in Xen.\n"); - - /* if MCG_STATUS_EIPV indicates, the IP on the stack is related - * to the error then it makes sense to print a stack trace. - * That can be useful for more detailed error analysis and/or - * error case studies to figure out, if we can clear - * xen_impacted and kill a DomU instead - * (i.e. if a guest only control structure is affected, but then - * we must ensure the bad pages are not re-used again). - */ - if (status & MCG_STATUS_EIPV) { - printk("MCE: Instruction Pointer is related to the error. " - "Therefore, print the execution state.\n"); - show_execution_state(regs); - } - x86_mcinfo_dump(mc_data); - panic("End of MCE. Use mcelog to decode above error codes.\n"); - } - - /* If Dom0 registered a machine check handler, which is only possible - * with a PV MCA driver, then ... */ - if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) { - dom_state = DOM0_TRAP; - - /* ... deliver machine check trap to Dom0. */ - send_guest_trap(dom0, 0, TRAP_machine_check); - - /* Xen may tell Dom0 now to notify the DomU. - * But this will happen through a hypercall. */ - } else - /* Dom0 did not register a machine check handler, but if DomU - * did so, then... */ - if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) { - dom_state = DOMU_TRAP; - - /* ... deliver machine check trap to DomU */ - send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check); - } else { - /* hmm... noone feels responsible to handle the error. - * So, do a quick check if a DomU is impacted or not. - */ - if (curdom == dom0) { - /* Dom0 is impacted. Since noone can't handle - * this error, panic! */ - x86_mcinfo_dump(mc_data); - panic("MCE occured in Dom0, which it can't handle\n"); - - /* UNREACHED */ - } else { - dom_state = DOMU_KILLED; - - /* Enable interrupts. This basically results in - * calling sti on the *physical* cpu. But after - * domain_crash() the vcpu pointer is invalid. 
- * Therefore, we must unlock the irqs before killing - * it. */ - vcpu_schedule_unlock_irq(vcpu); - - /* DomU is impacted. Kill it and continue. */ - domain_crash(curdom); - } - } - - - switch (dom_state) { - case DOM0_TRAP: - case DOMU_TRAP: - /* Enable interrupts. */ - vcpu_schedule_unlock_irq(vcpu); - - /* guest softirqs and event callbacks are scheduled - * immediately after this handler exits. */ - break; - case DOMU_KILLED: - /* Nothing to do here. */ - break; - default: - BUG(); - } -} /* AMD K8 machine check */ @@ -292,7 +80,7 @@ uint32_t i; int cpu_nr; - machine_check_vector = k8_machine_check; + machine_check_vector = x86_machine_check; cpu_nr = smp_processor_id(); wmb(); @@ -300,6 +88,17 @@ if (value & MCG_CTL_P) /* Control register present ? */ wrmsrl (MSR_IA32_MCG_CTL, 0xffffffffffffffffULL); nr_mce_banks = value & MCG_CAP_COUNT; + + /* Log the machine checks left over from the previous reset. + This also clears all registers */ + for (i=0; i<nr_mce_banks; i++) { + u64 status; + rdmsrl(MSR_IA32_MC0_STATUS + i*4, status); + if (status & MCi_STATUS_VAL) { + x86_machine_check(NULL, mce_bootlog ? -1 : -2); + break; + } + } for (i = 0; i < nr_mce_banks; i++) { switch (i) { diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/amd_nonfatal.c --- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c Tue Dec 09 16:28:02 2008 +0000 +++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c Mon Dec 15 14:25:07 2008 +0900 @@ -65,117 +65,12 @@ #include "mce.h" #include "x86_mca.h" -static struct timer mce_timer; +static int hw_threshold = 0; -#define MCE_PERIOD MILLISECS(15000) -#define MCE_MIN MILLISECS(2000) -#define MCE_MAX MILLISECS(30000) +extern struct timer mce_timer; -static s_time_t period = MCE_PERIOD; -static int hw_threshold = 0; -static int adjust = 0; - -/* The polling service routine: - * Collects information of correctable errors and notifies - * Dom0 via an event. 
- */ -void mce_amd_checkregs(void *info) -{ - struct vcpu *vcpu = current; - struct mc_info *mc_data; - struct mcinfo_global mc_global; - struct mcinfo_bank mc_info; - uint64_t status, addrv, miscv; - unsigned int i; - unsigned int event_enabled; - unsigned int cpu_nr; - int error_found; - - /* We don't need a slot yet. Only allocate one on error. */ - mc_data = NULL; - - cpu_nr = smp_processor_id(); - event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA); - error_found = 0; - - memset(&mc_global, 0, sizeof(mc_global)); - mc_global.common.type = MC_TYPE_GLOBAL; - mc_global.common.size = sizeof(mc_global); - - mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */ - mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */ - BUG_ON(cpu_nr != vcpu->processor); - mc_global.mc_core_threadid = 0; - mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */ -#if 0 /* TODO: on which socket is this physical core? - It's not clear to me how to figure this out. */ - mc_global.mc_socketid = ???; -#endif - mc_global.mc_flags |= MC_FLAG_CORRECTABLE; - rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus); - - for (i = 0; i < nr_mce_banks; i++) { - struct domain *d; - - rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status); - - if (!(status & MCi_STATUS_VAL)) - continue; - - if (mc_data == NULL) { - /* Now we need a slot to fill in error telemetry. 
*/ - mc_data = x86_mcinfo_getptr(); - BUG_ON(mc_data == NULL); - x86_mcinfo_clear(mc_data); - x86_mcinfo_add(mc_data, &mc_global); - } - - memset(&mc_info, 0, sizeof(mc_info)); - mc_info.common.type = MC_TYPE_BANK; - mc_info.common.size = sizeof(mc_info); - mc_info.mc_bank = i; - mc_info.mc_status = status; - - /* Increase polling frequency */ - error_found = 1; - - addrv = 0; - if (status & MCi_STATUS_ADDRV) { - rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv); - - d = maddr_get_owner(addrv); - if (d != NULL) - mc_info.mc_domid = d->domain_id; - } - - miscv = 0; - if (status & MCi_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv); - - mc_info.mc_addr = addrv; - mc_info.mc_misc = miscv; - x86_mcinfo_add(mc_data, &mc_info); - - if (mc_callback_bank_extended) - mc_callback_bank_extended(mc_data, i, status); - - /* clear status */ - wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL); - wmb(); - } - - if (error_found > 0) { - /* If Dom0 enabled the VIRQ_MCA event, then ... */ - if (event_enabled) - /* ... notify it. */ - send_guest_global_virq(dom0, VIRQ_MCA); - else - /* ... or dump it */ - x86_mcinfo_dump(mc_data); - } - - adjust += error_found; -} +extern s_time_t period; +extern int adjust; /* polling service routine invoker: * Adjust poll frequency at runtime. 
No error means slow polling frequency, @@ -186,7 +81,7 @@ */ static void mce_amd_work_fn(void *data) { - on_each_cpu(mce_amd_checkregs, data, 1, 1); + on_each_cpu(x86_mce_checkregs, data, 1, 1); if (adjust > 0) { if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) { diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/mce.c --- a/xen/arch/x86/cpu/mcheck/mce.c Tue Dec 09 16:28:02 2008 +0000 +++ b/xen/arch/x86/cpu/mcheck/mce.c Mon Dec 15 14:25:07 2008 +0900 @@ -7,6 +7,9 @@ #include <xen/types.h> #include <xen/kernel.h> #include <xen/config.h> +#include <xen/sched.h> +#include <xen/sched-if.h> +#include <xen/paging.h> #include <xen/smp.h> #include <xen/errno.h> @@ -18,6 +21,12 @@ int mce_disabled = 0; unsigned int nr_mce_banks; +int mce_bootlog = 1; + +#define MAX_PAGE_OFFLINING 1024 + +static struct page_info *page_offlining[MAX_PAGE_OFFLINING]; +static int num_page_offlining = 0; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ @@ -136,6 +145,9 @@ intel_p5_mcheck_init(c); if (c->x86==6) intel_p6_mcheck_init(c); +#else + if (c->x86==6) + intel_p4_mcheck_init(c); #endif if (c->x86==15) intel_p4_mcheck_init(c); @@ -159,9 +171,19 @@ mce_disabled = 1; } +/* mce=off disables machine check. + mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + mce=nobootlog Don't log MCEs from before booting. */ static void __init mcheck_enable(char *str) { - mce_disabled = -1; + if (*str == '=') + str++; + if (!strcmp(str, "off")) + mce_disabled = 1; + else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) + mce_bootlog = str[0] == 'b'; + else + printk("mce= argument %s ignored.", str); } custom_param("nomce", mcheck_disable); @@ -221,6 +243,12 @@ /* This function is called from the fetch hypercall with * the mc_lock spinlock held. Thus, no need for locking here. */ + + /* Return NULL if no data is available. 
*/ + if (mc_data.fetch_idx == mc_data.error_idx) { + *fetch_idx = mc_data.fetch_idx; + return NULL; + } mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx)); if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) { /* Bogus domU command detected. */ @@ -431,6 +459,272 @@ } while (1); } +static int x86_page_offlining(unsigned long maddr, struct domain *d) +{ + int i; + struct page_info *pg; + + if (!mfn_valid(maddr >> PAGE_SHIFT)) { + printk(XENLOG_ERR "Page offlining: ( %lx ) invalid.\n", maddr); + return -1; + } + + /* convert physical address to physical page number */ + pg = maddr_to_page(maddr); + + if (pg == NULL) { + printk(XENLOG_ERR "Page offlining: ( %lx ) not found.\n", + maddr); + return -1; + } + + /* check whether a page number have been already registered or not */ + for (i = 0; i < num_page_offlining; i++) + if (page_offlining[i] == pg) + goto out; + + /* limitation check and already having attribute 'reserved' */ + if (num_page_offlining == MAX_PAGE_OFFLINING || + pg->count_info & PGC_reserved) { + printk(XENLOG_DEBUG "Page offlining: ( %lx ) failure.\n", + maddr); + return 1; + } + + /* add attribute 'reserved' and register the page */ + get_page(pg, d); + pg->count_info |= PGC_reserved; + page_offlining[num_page_offlining++] = pg; + + out: + printk(XENLOG_DEBUG "Page offlining: ( %lx ) success.\n", maddr); + return 0; +} + + +/* Machine Check Handler for AMD K8 family series and Intel P4/Xeon family */ +void x86_machine_check(struct cpu_user_regs *regs, long error_code) +{ + struct vcpu *vcpu = current; + struct domain *curdom; + struct mc_info *mc_data; + struct mcinfo_global mc_global; + struct mcinfo_bank mc_info; + uint64_t status, addrv, miscv, uc; + uint32_t i; + unsigned int cpu_nr; + uint32_t xen_impacted = 0; +#define DOM_NORMAL 0 +#define DOM0_TRAP 1 +#define DOMU_TRAP 2 +#define DOMU_KILLED 4 + uint32_t dom_state = DOM_NORMAL; + + /* This handler runs as interrupt gate. 
So IPIs from the + * polling service routine are defered until we finished. + */ + + /* Disable interrupts for the _vcpu_. It may not re-scheduled to + * an other physical CPU or the impacted process in the guest + * continues running with corrupted data, otherwise. */ + vcpu_schedule_lock_irq(vcpu); + + mc_data = x86_mcinfo_getptr(); + cpu_nr = smp_processor_id(); + curdom = vcpu->domain; + + memset(&mc_global, 0, sizeof(mc_global)); + mc_global.common.type = MC_TYPE_GLOBAL; + mc_global.common.size = sizeof(mc_global); + + mc_global.mc_domid = curdom->domain_id; /* impacted domain */ + mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */ + BUG_ON(cpu_nr != vcpu->processor); + mc_global.mc_core_threadid = 0; + mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */ +#if 0 /* TODO: on which socket is this physical core? + It's not clear to me how to figure this out. */ + mc_global.mc_socketid = ???; +#endif + mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE; + rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus); + + /* Quick check, who is impacted */ + xen_impacted = is_idle_domain(curdom); + + /* Dom0 */ + x86_mcinfo_clear(mc_data); + x86_mcinfo_add(mc_data, &mc_global); + + for (i = 0; i < nr_mce_banks; i++) { + struct domain *d; + + rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status); + + if (!(status & MCi_STATUS_VAL)) + continue; + + /* An error happened in this bank. + * This is expected to be an uncorrectable error, + * since correctable errors get polled. 
+ */ + uc = status & MCi_STATUS_UC; + + memset(&mc_info, 0, sizeof(mc_info)); + mc_info.common.type = MC_TYPE_BANK; + mc_info.common.size = sizeof(mc_info); + mc_info.mc_bank = i; + mc_info.mc_status = status; + + addrv = 0; + if (status & MCi_STATUS_ADDRV) { + rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv); + + d = maddr_get_owner(addrv); + if (d != NULL) { + mc_info.mc_domid = d->domain_id; + + /* Page offlining */ + x86_page_offlining(addrv, d); + } + } + + miscv = 0; + if (status & MCi_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv); + + mc_info.mc_addr = addrv; + mc_info.mc_misc = miscv; + + x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */ + + if (mc_callback_bank_extended) + mc_callback_bank_extended(mc_data, i, status); + + /* clear status */ + wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL); + wmb(); + add_taint(TAINT_MACHINE_CHECK); + } + + /* Never do anything final for the previous reset */ + if (!regs) { + vcpu_schedule_unlock_irq(vcpu); + return; + } + + status = mc_global.mc_gstatus; + + /* clear MCIP or cpu enters shutdown state + * in case another MCE occurs. */ + status &= ~MCG_STATUS_MCIP; + wrmsrl(MSR_IA32_MCG_STATUS, status); + wmb(); + + /* For the details see the discussion "MCE/MCA concept" on xen-devel. + * The thread started here: + * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html + */ + + /* MCG_STATUS_RIPV: + * When this bit is not set, then the instruction pointer onto the stack + * to resume at is not valid. If xen is interrupted, then we panic anyway + * right below. Otherwise it is up to the guest to figure out if + * guest kernel or guest userland is affected and should kill either + * itself or the affected process. + */ + + /* MCG_STATUS_EIPV: + * Evaluation of EIPV is the job of the guest. + */ + + if (xen_impacted) { + /* Now we are going to panic anyway. Allow interrupts, so that + * printk on serial console can work. 
*/ + vcpu_schedule_unlock_irq(vcpu); + + /* Uh, that means, machine check exception + * inside Xen occured. */ + printk("Machine check exception occured in Xen.\n"); + + /* if MCG_STATUS_EIPV indicates, the IP on the stack is related + * to the error then it makes sense to print a stack trace. + * That can be useful for more detailed error analysis and/or + * error case studies to figure out, if we can clear + * xen_impacted and kill a DomU instead + * (i.e. if a guest only control structure is affected, but then + * we must ensure the bad pages are not re-used again). + */ + if (status & MCG_STATUS_EIPV) { + printk("MCE: Instruction Pointer is related to the error. " + "Therefore, print the execution state.\n"); + show_execution_state(regs); + } + x86_mcinfo_dump(mc_data); + panic("End of MCE. Use mcelog to decode above error codes.\n"); + } + + /* If Dom0 registered a machine check handler, which is only possible + * with a PV MCA driver, then ... */ + if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) { + dom_state = DOM0_TRAP; + + /* ... deliver machine check trap to Dom0. */ + send_guest_trap(dom0, 0, TRAP_machine_check); + + /* Xen may tell Dom0 now to notify the DomU. + * But this will happen through a hypercall. */ + } else + /* Dom0 did not register a machine check handler, but if DomU + * did so, then... */ + if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) { + dom_state = DOMU_TRAP; + + /* ... deliver machine check trap to DomU */ + send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check); + } else { + /* hmm... noone feels responsible to handle the error. + * So, do a quick check if a DomU is impacted or not. + */ + if (curdom == dom0) { + /* Dom0 is impacted. Since noone can't handle + * this error, panic! */ + x86_mcinfo_dump(mc_data); + panic("MCE occured in Dom0, which it can't handle\n"); + + /* UNREACHED */ + } else { + dom_state = DOMU_KILLED; + + /* Enable interrupts. 
This basically results in + * calling sti on the *physical* cpu. But after + * domain_crash() the vcpu pointer is invalid. + * Therefore, we must unlock the irqs before killing + * it. */ + vcpu_schedule_unlock_irq(vcpu); + + /* DomU is impacted. Kill it and continue. */ + domain_crash(curdom); + } + } + + + switch (dom_state) { + case DOM0_TRAP: + case DOMU_TRAP: + /* Enable interrupts. */ + vcpu_schedule_unlock_irq(vcpu); + + /* guest softirqs and event callbacks are scheduled + * immediately after this handler exits. */ + break; + case DOMU_KILLED: + /* Nothing to do here. */ + break; + default: + BUG(); + } +} /* Machine Check Architecture Hypercall */ @@ -564,7 +858,7 @@ if ( copy_to_guest(u_xen_mc, op, 1) ) ret = -EFAULT; - if (ret == 0) { + if (ret == 0 && mc_notifydomain->flags == XEN_MC_OK) { x86_mcinfo_marknotified(mc_notifydomain); } diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/non-fatal.c --- a/xen/arch/x86/cpu/mcheck/non-fatal.c Tue Dec 09 16:28:02 2008 +0000 +++ b/xen/arch/x86/cpu/mcheck/non-fatal.c Mon Dec 15 14:25:07 2008 +0900 @@ -14,16 +14,158 @@ #include <xen/smp.h> #include <xen/timer.h> #include <xen/errno.h> +#include <xen/event.h> #include <asm/processor.h> #include <asm/system.h> #include <asm/msr.h> #include "mce.h" +#include "x86_mca.h" static int firstbank; -static struct timer mce_timer; -#define MCE_PERIOD MILLISECS(15000) +struct timer mce_timer; + +s_time_t period = MCE_PERIOD; +int adjust = 0; + +/* The polling service routine: + * Collects information of correctable errors and notifies + * Dom0 via an event. + */ +void x86_mce_checkregs(void *info) +{ + struct vcpu *vcpu = current; + struct mc_info *mc_data; + struct mcinfo_global mc_global; + struct mcinfo_bank mc_info; + uint64_t status, addrv, miscv; + unsigned int i; + unsigned int event_enabled; + unsigned int cpu_nr; + int error_found; + + /* We don't need a slot yet. Only allocate one on error. 
*/ + mc_data = NULL; + + cpu_nr = smp_processor_id(); + event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA); + error_found = 0; + + memset(&mc_global, 0, sizeof(mc_global)); + mc_global.common.type = MC_TYPE_GLOBAL; + mc_global.common.size = sizeof(mc_global); + + mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */ + mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */ + BUG_ON(cpu_nr != vcpu->processor); + mc_global.mc_core_threadid = 0; + mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */ +#if 0 /* TODO: on which socket is this physical core? + It's not clear to me how to figure this out. */ + mc_global.mc_socketid = ???; +#endif + mc_global.mc_flags |= MC_FLAG_CORRECTABLE; + rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus); + + for (i = 0; i < nr_mce_banks; i++) { + struct domain *d; + + rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status); + + if (!(status & MCi_STATUS_VAL)) + continue; + + if (mc_data == NULL) { + /* Now we need a slot to fill in error telemetry. */ + mc_data = x86_mcinfo_getptr(); + BUG_ON(mc_data == NULL); + x86_mcinfo_clear(mc_data); + x86_mcinfo_add(mc_data, &mc_global); + } + + memset(&mc_info, 0, sizeof(mc_info)); + mc_info.common.type = MC_TYPE_BANK; + mc_info.common.size = sizeof(mc_info); + mc_info.mc_bank = i; + mc_info.mc_status = status; + + /* Increase polling frequency */ + error_found = 1; + + addrv = 0; + if (status & MCi_STATUS_ADDRV) { + rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv); + + d = maddr_get_owner(addrv); + if (d != NULL) + mc_info.mc_domid = d->domain_id; + } + + miscv = 0; + if (status & MCi_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv); + + mc_info.mc_addr = addrv; + mc_info.mc_misc = miscv; + x86_mcinfo_add(mc_data, &mc_info); + + if (mc_callback_bank_extended) + mc_callback_bank_extended(mc_data, i, status); + + /* clear status */ + wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL); + wmb(); + } + + if (error_found > 0) { + /* If Dom0 enabled the VIRQ_MCA event, then ... 
*/ + if (event_enabled) + /* ... notify it. */ + send_guest_global_virq(dom0, VIRQ_MCA); + else + /* ... or dump it */ + x86_mcinfo_dump(mc_data); + } + + adjust += error_found; +} + +static void p4_mce_work_fn(void *data) +{ + on_each_cpu(x86_mce_checkregs, NULL, 1, 1); + + if (adjust > 0) { + if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) { + /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */ + printk("MCE: polling routine found correctable error. " + " Use mcelog to parse above error output.\n"); + } + } + + if (adjust > 0) { + /* Increase polling frequency */ + adjust++; /* adjust == 1 must have an effect */ + period /= adjust; + } else { + /* Decrease polling frequency */ + period *= 2; + } + if (period > MCE_MAX) { + /* limit: Poll at least every 30s */ + period = MCE_MAX; + } + if (period < MCE_MIN) { + /* limit: Poll every 2s. + * When this is reached an uncorrectable error + * is expected to happen, if Dom0 does nothing. + */ + period = MCE_MIN; + } + + set_timer(&mce_timer, NOW() + period); + adjust = 0; +} static void mce_checkregs (void *info) { @@ -85,6 +227,15 @@ break; case X86_VENDOR_INTEL: + if (c->x86 == 15 /* P4/Xeon */ +#ifdef CONFIG_X86_64 + || c->x86 == 6 +#endif + ) { + init_timer(&mce_timer, p4_mce_work_fn, NULL, 0); + set_timer(&mce_timer, NOW() + period); + break; + } init_timer(&mce_timer, mce_work_fn, NULL, 0); set_timer(&mce_timer, NOW() + MCE_PERIOD); break; diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/p4.c --- a/xen/arch/x86/cpu/mcheck/p4.c Tue Dec 09 16:28:02 2008 +0000 +++ b/xen/arch/x86/cpu/mcheck/p4.c Mon Dec 15 14:25:07 2008 +0900 @@ -15,6 +15,7 @@ #include <asm/apic.h> #include "mce.h" +#include "x86_mca.h" /* as supported by the P4/Xeon family */ struct intel_mce_extended_msrs { @@ -32,6 +33,7 @@ }; static int mce_num_extended_msrs = 0; +extern int mce_bootlog; #ifdef CONFIG_X86_MCE_P4THERMAL @@ -158,85 +160,13 @@ return mce_num_extended_msrs; } -static fastcall void intel_machine_check(struct cpu_user_regs * regs, 
long error_code) -{ - int recover=1; - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int i; - struct intel_mce_extended_msrs dbg; - - rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover=0; - - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - if (intel_get_extended_msrs(&dbg)) { - printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags); - printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx); - printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", - dbg.esi, dbg.edi, dbg.ebp, dbg.esp); - } - - for (i=0; i<nr_mce_banks; i++) { - rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); - if (high & (1<<31)) { - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); - high &= ~(1<<31); - if (high & (1<<27)) { - rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); - printk ("[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - printk (" at %08x%08x", ahigh, alow); - } - printk ("\n"); - } - } - - if (recover & 2) - panic ("CPU context corrupt"); - if (recover & 1) - panic ("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. 
- */ - for (i=0; i<nr_mce_banks; i++) { - u32 msr; - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr (msr, low, high); - if (high&(1<<31)) { - /* Clear it */ - wrmsr(msr, 0UL, 0UL); - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); -} - void intel_p4_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; int i; - machine_check_vector = intel_machine_check; + machine_check_vector = x86_machine_check; wmb(); printk (KERN_INFO "Intel machine check architecture supported.\n"); @@ -244,6 +174,17 @@ if (l & (1<<8)) /* Control register present ? */ wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); nr_mce_banks = l & 0xff; + + /* Log the machine checks left over from the previous reset. + This also clears all registers */ + for (i=0; i<nr_mce_banks; i++) { + u64 status; + rdmsrl(MSR_IA32_MC0_STATUS + i*4, status); + if (status & MCi_STATUS_VAL) { + x86_machine_check(NULL, mce_bootlog ? -1 : -2); + break; + } + } for (i=0; i<nr_mce_banks; i++) { wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/x86_mca.h --- a/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Dec 09 16:28:02 2008 +0000 +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Mon Dec 15 14:25:07 2008 +0900 @@ -70,3 +70,11 @@ /* reserved bits */ #define MCi_STATUS_OTHER_RESERVED2 0x0180000000000000ULL +/* Polling period */ +#define MCE_PERIOD MILLISECS(15000) +#define MCE_MIN MILLISECS(2000) +#define MCE_MAX MILLISECS(30000) + +/* Common routines */ +void x86_machine_check(struct cpu_user_regs *regs, long error_code); +void x86_mce_checkregs(void *info); diff -r 6595393a3d28 xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Tue Dec 09 16:28:02 2008 +0000 +++ b/xen/arch/x86/traps.c Mon Dec 15 14:25:07 2008 +0900 @@ -726,8 +726,10 @@ if ( !opt_allow_hugepage ) __clear_bit(X86_FEATURE_PSE, &d); __clear_bit(X86_FEATURE_PGE, &d); +#ifndef __x86_64__ __clear_bit(X86_FEATURE_MCE, &d); __clear_bit(X86_FEATURE_MCA, &d); +#endif 
__clear_bit(X86_FEATURE_PSE36, &d); } switch ( (uint32_t)regs->eax ) diff -r 6595393a3d28 xen/common/page_alloc.c --- a/xen/common/page_alloc.c Tue Dec 09 16:28:02 2008 +0000 +++ b/xen/common/page_alloc.c Mon Dec 15 14:25:07 2008 +0900 @@ -338,8 +338,14 @@ /* Find smallest order which can satisfy the request. */ for ( j = order; j <= MAX_ORDER; j++ ) - if ( !list_empty(&heap(node, zone, j)) ) - goto found; + if ( !list_empty(&heap(node, zone, j)) ) { + pg = list_entry(heap(node, zone, j).next, struct page_info, list); + if (!(pg->count_info & PGC_reserved)) + goto found; + else + printk(XENLOG_DEBUG "Page %p(%lx) is not to be allocated.\n", + pg, page_to_maddr(pg)); + } } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */ /* Pick next node, wrapping around if needed. */ @@ -402,11 +408,22 @@ unsigned long mask; unsigned int i, node = phys_to_nid(page_to_maddr(pg)); struct domain *d; + int reserved = 0; ASSERT(zone < NR_ZONES); ASSERT(order <= MAX_ORDER); ASSERT(node >= 0); ASSERT(node < num_online_nodes()); + + for ( i = 0; i < (1 << order); i++) { + reserved += !!(pg[i].count_info & PGC_reserved); + if (!!(pg[i].count_info & PGC_reserved)) + printk(XENLOG_DEBUG "Page %p(%lx) is not to be freed\n", + &pg[i], page_to_maddr(&pg[i])); + } + + if (reserved) + return; for ( i = 0; i < (1 << order); i++ ) { diff -r 6595393a3d28 xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Tue Dec 09 16:28:02 2008 +0000 +++ b/xen/include/asm-x86/mm.h Mon Dec 15 14:25:07 2008 +0900 @@ -142,8 +142,11 @@ /* 3-bit PAT/PCD/PWT cache-attribute hint. */ #define PGC_cacheattr_base 26 #define PGC_cacheattr_mask (7U<<PGC_cacheattr_base) - /* 26-bit count of references to this frame. */ -#define PGC_count_mask ((1U<<26)-1) + /* Set for special pages, which can never be used */ +#define _PGC_reserved 25 +#define PGC_reserved (1U<<_PGC_reserved) + /* 25-bit count of references to this frame. 
*/ +#define PGC_count_mask ((1U<<25)-1) #define is_xen_heap_page(page) is_xen_heap_mfn(page_to_mfn(page)) #define is_xen_heap_mfn(mfn) ({ \ _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |
servers 24x7x365 and backed by RackSpace's Fanatical Support®. |