[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH 2/2] MCA support with page offlining
This is linux/x86_64 part. [2/2] linux/x86_64 part: mca-support-with-page-offlining-linux.patch Signed-off-by: Kazuhiro Suzuki <kaz@xxxxxxxxxxxxxx> Thanks, KAZ diff -r ff9683032b76 arch/x86_64/Kconfig --- a/arch/x86_64/Kconfig Sat Dec 13 16:00:43 2008 +0000 +++ b/arch/x86_64/Kconfig Mon Dec 15 14:25:11 2008 +0900 @@ -472,7 +472,6 @@ config X86_MCE bool "Machine check support" if EMBEDDED - depends on !X86_64_XEN default y help Include a machine check error handler to report hardware errors. diff -r ff9683032b76 arch/x86_64/kernel/apic-xen.c --- a/arch/x86_64/kernel/apic-xen.c Sat Dec 13 16:00:43 2008 +0000 +++ b/arch/x86_64/kernel/apic-xen.c Mon Dec 15 14:25:11 2008 +0900 @@ -60,6 +60,14 @@ int setup_profiling_timer(unsigned int multiplier) { return -EINVAL; +} + +void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, + unsigned char msg_type, unsigned char mask) +{ + unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; + apic_write(reg, v); } void smp_local_timer_interrupt(struct pt_regs *regs) diff -r ff9683032b76 arch/x86_64/kernel/entry-xen.S --- a/arch/x86_64/kernel/entry-xen.S Sat Dec 13 16:00:43 2008 +0000 +++ b/arch/x86_64/kernel/entry-xen.S Mon Dec 15 14:25:11 2008 +0900 @@ -1259,12 +1259,13 @@ #ifdef CONFIG_X86_MCE /* runs on exception stack */ ENTRY(machine_check) - INTR_FRAME +/* INTR_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_machine_check - jmp paranoid_exit1 - CFI_ENDPROC + paranoidentry do_machine_check */ + zeroentry do_machine_check +/* jmp paranoid_exit1 + CFI_ENDPROC */ END(machine_check) #endif diff -r ff9683032b76 arch/x86_64/kernel/mce-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/x86_64/kernel/mce-xen.c Mon Dec 15 14:25:11 2008 +0900 @@ -0,0 +1,633 @@ +/* + * Machine check handler. + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. 
+ */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/rcupdate.h> +#include <linux/kallsyms.h> +#include <linux/sysdev.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/capability.h> +#include <linux/cpu.h> +#include <linux/percpu.h> +#include <linux/ctype.h> +#include <asm/processor.h> +#include <asm/mce.h> +#include <asm/kdebug.h> +#include <asm/uaccess.h> +#include <asm/smp.h> + +#include <xen/evtchn.h> +#include <xen/interface/vcpu.h> + +#define MISC_MCELOG_MINOR 227 + +atomic_t mce_entry; + +static int mce_dont_init; + +/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, + 3: never panic or exit (for testing only) */ +static int tolerant = 1; +static int mce_bootlog = 1; + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +struct mce_log mcelog = { + MCE_LOG_SIGNATURE, + MCE_LOG_LEN, +}; + +void mce_log(struct mce *mce) +{ + unsigned next, entry; + mce->finished = 0; + wmb(); + for (;;) { + entry = rcu_dereference(mcelog.next); + /* The rmb forces the compiler to reload next in each + iteration */ + rmb(); + for (;;) { + /* When the buffer fills up discard new entries. Assume + that the earlier errors are the more interesting. */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, &mcelog.flags); + return; + } + /* Old left over entry. Skip. 
*/ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; + } + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + wmb(); + mcelog.entry[entry].finished = 1; + wmb(); +} + +static void print_mce(struct mce *m) +{ + printk(KERN_EMERG "\n" + KERN_EMERG "HARDWARE ERROR\n" + KERN_EMERG + "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + m->cpu, m->mcgstatus, m->bank, m->status); + if (m->rip) { + printk(KERN_EMERG + "RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->rip); + if (m->cs == __KERNEL_CS) + print_symbol("{%s}", m->rip); + printk("\n"); + } + printk(KERN_EMERG "TSC %Lx ", m->tsc); + if (m->addr) + printk("ADDR %Lx ", m->addr); + if (m->misc) + printk("MISC %Lx ", m->misc); + printk("\n"); + printk(KERN_EMERG "This is not a software problem!\n"); + printk(KERN_EMERG + "Run through mcelog --ascii to decode and contact your hardware vendor\n"); +} + +static void mce_panic(char *msg, struct mce *backup, unsigned long start) +{ + int i; + oops_begin(); + for (i = 0; i < MCE_LOG_LEN; i++) { + unsigned long tsc = mcelog.entry[i].tsc; + if (time_before(tsc, start)) + continue; + print_mce(&mcelog.entry[i]); + if (backup && mcelog.entry[i].tsc == backup->tsc) + backup = NULL; + } + if (backup) + print_mce(backup); + if (tolerant >= 3) + printk("Fake panic: %s\n", msg); + else + panic(msg); +} + +static int mce_available(struct cpuinfo_x86 *c) +{ + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) +{ + if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { + m->rip = regs->rip; + m->cs = regs->cs; + } else { + m->rip = 0; + m->cs = 0; + } +} + +/* + * The actual machine check handler + */ + +void do_machine_check(struct pt_regs * regs, long error_code) +{ + xen_mc_t mc; + struct mc_info *mi; + struct mcinfo_common 
*mic = NULL; + struct mcinfo_global *mc_global; + struct mcinfo_bank *mc_bank; + struct mce m, panicm; + int nowayout = (tolerant < 1); + int kill_it = 0; + u64 mcestart = 0; + int ret; + int panicm_found = 0; + int domid = 0; + + atomic_inc(&mce_entry); + + mc.cmd = XEN_MC_fetch; + mc.interface_version = XEN_MCA_INTERFACE_VERSION; + mc.u.mc_fetch.flags = + (regs == NULL && !error_code)? XEN_MC_CORRECTABLE: XEN_MC_TRAP; + ret = HYPERVISOR_mca(&mc); + if (ret) { + printk("HYPERVISOR_mca: fetch failed: %d\n", ret); + goto out; + } + /* Do nothing if no machine check log left over from the previous + reset */ + if (mc.u.mc_fetch.flags & XEN_MC_NODATA) + return; + + mi = &mc.u.mc_fetch.mc_info; + + /* first convert the global info */ + x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL); + if (mic == NULL) + goto out; + + mc_global = (struct mcinfo_global *)mic; + + memset(&m, 0, sizeof(struct mce)); + m.cpu = mc_global->mc_coreid; + m.mcgstatus = mc_global->mc_gstatus; + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + kill_it = 1; + + rdtscll(mcestart); + barrier(); + + /* then the bank information */ + x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */ + do { + if (mic == NULL) + goto out; + if (mic->type != MC_TYPE_BANK) + continue; + + mc_bank = (struct mcinfo_bank *)mic; + m.misc = 0; + m.addr = 0; + m.bank = mc_bank->mc_bank; + m.tsc = 0; + + m.status = mc_bank->mc_status; + if ((m.status & MCI_STATUS_VAL) == 0) + continue; + + if (m.status & MCI_STATUS_EN) { + /* In theory _OVER could be a nowayout too, but + assume any overflowed errors were no fatal. */ + nowayout |= !!(m.status & MCI_STATUS_PCC); + kill_it |= !!(m.status & MCI_STATUS_UC); + } + + if (m.status & MCI_STATUS_MISCV) + m.misc = mc_bank->mc_misc; + if (m.status & MCI_STATUS_ADDRV) { + m.addr = mc_bank->mc_addr; + domid = mc_bank->mc_domid; + } + + mce_get_rip(&m, regs); + if (error_code >= 0) + rdtscll(m.tsc); + if (error_code != -2) + mce_log(&m); + + /* Did this bank cause the exception? 
*/ + /* Assume that the bank with uncorrectable errors did it, + and that there is only a single one. */ + if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { + panicm = m; + panicm_found = 1; + } + + add_taint(TAINT_MACHINE_CHECK); + + mic = x86_mcinfo_next(mic); /* next entry */ + if ((mic == NULL) || (mic->size == 0)) + break; + } while (1); + + /* Never do anything final in the polling timer */ + if (!regs) + goto out; + + /* If we didn't find an uncorrectable error, pick + the last one (shouldn't happen, just being safe). */ + if (!panicm_found) + panicm = m; + if (nowayout) + mce_panic("Machine check", &panicm, mcestart); + if (kill_it) { + int user_space = 0; + + if (is_initial_xendomain() && domid != 0) { + struct xen_mc_notifydomain *mc_notifydomain; + + mc.cmd = XEN_MC_notifydomain; + mc_notifydomain = &mc.u.mc_notifydomain; + mc_notifydomain->mc_domid = domid; + mc_notifydomain->mc_vcpuid = mc_global->mc_vcpuid; + + ret = HYPERVISOR_mca(&mc); + if (ret) { + printk("HYPERVISOR_mca: notifydomain failed: %d\n", ret); + goto out; + } + if (mc_notifydomain->flags == XEN_MC_OK) { + /* Notify success */ + goto out; + } else if (tolerant < 3) { + /* Shutdown remote domain if tolerant < 3 */ + struct sched_remote_shutdown remote_shutdown = { + .domain_id = domid, + .reason = SHUTDOWN_crash + }; + VOID(HYPERVISOR_sched_op( + SCHEDOP_remote_shutdown, + &remote_shutdown)); + goto out; + } + } + + if (m.mcgstatus & MCG_STATUS_RIPV) + user_space = panicm.rip && (panicm.cs & 3); + + /* When the machine was in user space and the CPU didn't get + confused it's normally not necessary to panic, unless you + are paranoid (tolerant == 0) + + RED-PEN could be more tolerant for MCEs in idle, + but most likely they occur at boot anyways, where + it is best to just halt the machine. 
*/ + if ((!user_space && (panic_on_oops || tolerant < 2)) || + (unsigned)current->pid <= 1) + mce_panic("Uncorrected machine check", &panicm, mcestart); + + /* do_exit takes an awful lot of locks and has as + slight risk of deadlocking. If you don't want that + don't set tolerant >= 2 */ + if (tolerant < 3) + do_exit(SIGBUS); + } + + out: + atomic_dec(&mce_entry); +} + +static irqreturn_t mcheck_event_handler(int irq, void *dev_id, struct pt_regs *regs) +{ + if (mce_available(¤t_cpu_data)) + do_machine_check(NULL, 0); + return IRQ_HANDLED; +} + +/* + * Initialize Machine Checks for a CPU. + */ +static void mce_init(void *dummy) +{ + /* Log the machine checks left over from the previous reset. + This also clears all registers */ + if (is_initial_xendomain()) + do_machine_check(NULL, mce_bootlog ? -1 : -2); +} + +/* Add per CPU specific workarounds here */ +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { + /* Lots of broken BIOS around that don't clear them + by default and leave crap in there. Don't log. */ + mce_bootlog = 0; + } + +} + +static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + break; + case X86_VENDOR_AMD: + mce_amd_feature_init(c); + break; + default: + break; + } +} + +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off. 
+ */ +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +{ + static cpumask_t mce_cpus = CPU_MASK_NONE; + + mce_cpu_quirks(c); + + if (mce_dont_init || + cpu_test_and_set(smp_processor_id(), mce_cpus) || + !mce_available(c)) + return; + + if (smp_processor_id() == 0) { + if (bind_virq_to_irqhandler(VIRQ_MCA, 0, + mcheck_event_handler, SA_INTERRUPT, + "mce0", NULL) < 0) { + printk(KERN_ERR "Cannot bind mcheck_event_handler\n"); + return; + } + } + + mce_init(NULL); + mce_cpu_features(c); +} + +/* + * Character device to read and clear the MCE log. + */ + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + rdtscll(cpu_tsc[smp_processor_id()]); +} + +static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) +{ + unsigned long *cpu_tsc; + static DECLARE_MUTEX(mce_read_sem); + unsigned next; + char __user *buf = ubuf; + int i, err; + + cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + + down(&mce_read_sem); + next = rcu_dereference(mcelog.next); + + /* Only supports full reads right now */ + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { + up(&mce_read_sem); + kfree(cpu_tsc); + return -EINVAL; + } + + err = 0; + for (i = 0; i < next; i++) { + unsigned long start = jiffies; + while (!mcelog.entry[i].finished) { + if (!time_before(jiffies, start + 2)) { + memset(mcelog.entry + i,0, sizeof(struct mce)); + continue; + } + cpu_relax(); + } + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); + buf += sizeof(struct mce); + } + + memset(mcelog.entry, 0, next * sizeof(struct mce)); + mcelog.next = 0; + + synchronize_sched(); + + /* Collect entries that were still getting written before the synchronize. 
*/ + + on_each_cpu(collect_tscs, cpu_tsc, 1, 1); + for (i = next; i < MCE_LOG_LEN; i++) { + if (mcelog.entry[i].finished && + mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { + err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce)); + smp_rmb(); + buf += sizeof(struct mce); + memset(&mcelog.entry[i], 0, sizeof(struct mce)); + } + } + up(&mce_read_sem); + kfree(cpu_tsc); + return err ? -EFAULT : buf - ubuf; +} + +static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) +{ + int __user *p = (int __user *)arg; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static struct file_operations mce_chrdev_ops = { + .read = mce_read, + .ioctl = mce_ioctl, +}; + +static struct miscdevice mce_log_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +/* + * Old style boot options parsing. Only for compatibility. + */ + +static int __init mcheck_disable(char *str) +{ + mce_dont_init = 1; + return 1; +} + +/* mce=off disables machine check. Note you can reenable it later + using sysfs. + mce=TOLERANCELEVEL (number, see above) + mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + mce=nobootlog Don't log MCEs from before booting. */ +static int __init mcheck_enable(char *str) +{ + if (*str == '=') + str++; + if (!strcmp(str, "off")) + mce_dont_init = 1; + else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) + mce_bootlog = str[0] == 'b'; + else if (isdigit(str[0])) + get_option(&str, &tolerant); + else + printk("mce= argument %s ignored. 
Please use /sys", str); + return 1; +} + +__setup("nomce", mcheck_disable); +__setup("mce", mcheck_enable); + +/* + * Sysfs support + */ + +/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. + Only one CPU is active at this time, the others get readded later using + CPU hotplug. */ +static int mce_resume(struct sys_device *dev) +{ + mce_init(NULL); + return 0; +} + +static struct sysdev_class mce_sysclass = { + .resume = mce_resume, + set_kset_name("machinecheck"), +}; + +DEFINE_PER_CPU(struct sys_device, device_mce); + +/* Why are there no generic functions for this? */ +#define ACCESSOR(name, var, start) \ + static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ + return sprintf(buf, "%lx\n", (unsigned long)var); \ + } \ + static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ + char *end; \ + unsigned long new = simple_strtoul(buf, &end, 0); \ + if (end == buf) return -EINVAL; \ + var = new; \ + start; \ + return end-buf; \ + } \ + static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); + +ACCESSOR(tolerant,tolerant,) + +/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ +static __cpuinit int mce_create_device(unsigned int cpu) +{ + int err; + if (!mce_available(&cpu_data[cpu])) + return -EIO; + + per_cpu(device_mce,cpu).id = cpu; + per_cpu(device_mce,cpu).cls = &mce_sysclass; + + err = sysdev_register(&per_cpu(device_mce,cpu)); + + if (!err) { + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); + } + return err; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void mce_remove_device(unsigned int cpu) +{ + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_unregister(&per_cpu(device_mce,cpu)); +} + +/* Get notified when a cpu comes on/off. Be hotplug friendly. 
*/ +static int +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + mce_create_device(cpu); + break; + case CPU_DEAD: + mce_remove_device(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier = { + .notifier_call = mce_cpu_callback, +}; +#endif + +static __init int mce_init_device(void) +{ + int err; + int i = 0; + + if (!mce_available(&boot_cpu_data)) + return -EIO; + err = sysdev_class_register(&mce_sysclass); + + for_each_online_cpu(i) { + mce_create_device(i); + } + + register_hotcpu_notifier(&mce_cpu_notifier); + misc_register(&mce_log_device); + return err; +} + +device_initcall(mce_init_device); diff -r ff9683032b76 buildconfigs/linux-defconfig_xen_x86_64 --- a/buildconfigs/linux-defconfig_xen_x86_64 Sat Dec 13 16:00:43 2008 +0000 +++ b/buildconfigs/linux-defconfig_xen_x86_64 Mon Dec 15 14:25:11 2008 +0900 @@ -142,6 +142,9 @@ CONFIG_HOTPLUG_CPU=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_SWIOTLB=y +CONFIG_X86_MCE=y +CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y CONFIG_KEXEC=y # CONFIG_CRASH_DUMP is not set CONFIG_PHYSICAL_START=0x200000 diff -r ff9683032b76 include/asm-x86_64/mach-xen/asm/hw_irq.h --- a/include/asm-x86_64/mach-xen/asm/hw_irq.h Sat Dec 13 16:00:43 2008 +0000 +++ b/include/asm-x86_64/mach-xen/asm/hw_irq.h Mon Dec 15 14:25:11 2008 +0900 @@ -51,8 +51,10 @@ #define CALL_FUNCTION_VECTOR 0xfc /* fb free - please don't readd KDB here because it's useless (hint - think what a NMI bit does to a vector) */ +#endif #define THERMAL_APIC_VECTOR 0xfa #define THRESHOLD_APIC_VECTOR 0xf9 +#ifndef CONFIG_XEN /* f8 free */ #define INVALIDATE_TLB_VECTOR_END 0xf7 #define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */ diff -r ff9683032b76 include/asm-x86_64/mach-xen/asm/hypercall.h --- a/include/asm-x86_64/mach-xen/asm/hypercall.h Sat Dec 13 16:00:43 2008 +0000 +++ 
b/include/asm-x86_64/mach-xen/asm/hypercall.h Mon Dec 15 14:25:11 2008 +0900 @@ -405,4 +405,10 @@ return _hypercall2(int, kexec_op, op, args); } +static inline int __must_check +HYPERVISOR_mca( + const xen_mc_t *mc) +{ + return _hypercall1(int, mca, mc); +} #endif /* __HYPERCALL_H__ */ _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
Lists.xenproject.org is hosted with RackSpace, monitoring our servers 24x7x365 and backed by RackSpace's Fanatical Support®.