
[Xen-changelog] [xen-unstable] x86 mcheck: Replace hypervisor MCA telemetry structures with something



# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1237299770 0
# Node ID 9c1be8f2013be449a09f1af34a0b5c8820ce7c55
# Parent  0b1ce09f457762d93029b1b62e1139acb5fc92fd
x86 mcheck: Replace hypervisor MCA telemetry structures with something
more robust and designed to make terminal error telemetry available to
the dom0 panic flow for diagnosis on reboot.

Use common code for much of the AMD and Intel MCE handling.
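
For context, the new mctelem interfaces are used in a reserve/commit-or-dismiss
pattern on the producer side (the #MC handler and the pollers) and a
consume/ack pattern in the dom0 fetch hypercall.  As a rough sketch only,
using the calls visible in the diff below (example_poll_once is an invented
name, and locking and bank detail are elided):

    static void example_poll_once(void)
    {
        struct mca_summary bs;
        mctelem_cookie_t mctc;

        /* Read any valid bank telemetry into a reserved mctelem entry. */
        mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs);

        if (bs.errcnt && mctc != NULL) {
            /* Committing makes the entry visible to consumers. */
            mctelem_commit(mctc);
            send_guest_global_virq(dom0, VIRQ_MCA);
        } else if (mctc != NULL) {
            /* Nothing useful was logged; release the entry. */
            mctelem_dismiss(mctc);
        }
    }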

Signed-off-by: Gavin Maltby <gavin.maltby@xxxxxxx>
Signed-off-by: Frank van der Linden <frank.vanderlinden@xxxxxxx>
---
 xen/arch/x86/cpu/mcheck/Makefile       |    1 
 xen/arch/x86/cpu/mcheck/amd_f10.c      |   37 -
 xen/arch/x86/cpu/mcheck/amd_k8.c       |  229 -------
 xen/arch/x86/cpu/mcheck/amd_nonfatal.c |  146 +---
 xen/arch/x86/cpu/mcheck/k7.c           |   11 
 xen/arch/x86/cpu/mcheck/mce.c          | 1001 +++++++++++++++++++--------------
 xen/arch/x86/cpu/mcheck/mce.h          |   98 ++-
 xen/arch/x86/cpu/mcheck/mce_intel.c    |  391 +++---------
 xen/arch/x86/cpu/mcheck/mctelem.c      |  443 ++++++++++++++
 xen/arch/x86/cpu/mcheck/mctelem.h      |   71 ++
 xen/arch/x86/cpu/mcheck/non-fatal.c    |   87 +-
 xen/arch/x86/cpu/mcheck/p5.c           |   15 
 xen/arch/x86/cpu/mcheck/winchip.c      |    8 
 xen/arch/x86/cpu/mcheck/x86_mca.h      |    8 
 xen/include/asm-x86/traps.h            |    2 
 xen/include/public/arch-x86/xen-mca.h  |   49 -
 16 files changed, 1476 insertions(+), 1121 deletions(-)

diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile  Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/Makefile  Tue Mar 17 14:22:50 2009 +0000
@@ -2,6 +2,7 @@ obj-y += k7.o
 obj-y += k7.o
 obj-y += amd_k8.o
 obj-y += amd_f10.o
+obj-y += mctelem.o
 obj-y += mce.o
 obj-y += mce_intel.o
 obj-y += non-fatal.o
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/amd_f10.c
--- a/xen/arch/x86/cpu/mcheck/amd_f10.c Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c Tue Mar 17 14:22:50 2009 +0000
@@ -49,20 +49,21 @@
 #include "x86_mca.h"
 
 
-static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
+static enum mca_extinfo
+amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
 {
        struct mcinfo_extended mc_ext;
 
        /* Family 0x10 introduced additional MSR that belong to the
         * northbridge bank (4). */
-       if (bank != 4)
-               return 0;
+       if (mi == NULL || bank != 4)
+               return MCA_EXTINFO_IGNORED;
 
        if (!(status & MCi_STATUS_VAL))
-               return 0;
+               return MCA_EXTINFO_IGNORED;
 
        if (!(status & MCi_STATUS_MISCV))
-               return 0;
+               return MCA_EXTINFO_IGNORED;
 
        memset(&mc_ext, 0, sizeof(mc_ext));
        mc_ext.common.type = MC_TYPE_EXTENDED;
@@ -78,23 +79,25 @@ static int amd_f10_handler(struct mc_inf
        rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
        
        x86_mcinfo_add(mi, &mc_ext);
-       return 1;
+       return MCA_EXTINFO_LOCAL;
 }
 
 
 extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
 
 /* AMD Family10 machine check */
-void amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
+int amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
 { 
        uint64_t value;
        uint32_t i;
        int cpu_nr;
 
-       machine_check_vector = k8_machine_check;
-       mc_callback_bank_extended = amd_f10_handler;
+       if (!cpu_has(c, X86_FEATURE_MCA))
+               return 0;
+
+       x86_mce_vector_register(k8_machine_check);
+       x86_mce_callback_register(amd_f10_handler);
        cpu_nr = smp_processor_id();
-       wmb();
 
        rdmsrl(MSR_IA32_MCG_CAP, value);
        if (value & MCG_CTL_P)  /* Control register present ? */
@@ -104,18 +107,9 @@ void amd_f10_mcheck_init(struct cpuinfo_
        for (i = 0; i < nr_mce_banks; i++) {
                switch (i) {
                case 4: /* Northbridge */
-                       /* Enable error reporting of all errors,
-                        * enable error checking and
-                        * disable sync flooding */
-                       wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+                       /* Enable error reporting of all errors */
+                       wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL);
                        wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
-
-                       /* XXX: We should write the value 0x1087821UL into
-                        * to register F3x180 here, which sits in
-                        * the PCI extended configuration space.
-                        * Since this is not possible here, we can only hope,
-                        * Dom0 is doing that.
-                        */
                        break;
 
                default:
@@ -128,4 +122,5 @@ void amd_f10_mcheck_init(struct cpuinfo_
 
        set_in_cr4(X86_CR4_MCE);
        printk("CPU%i: AMD Family10h machine check reporting enabled.\n", 
cpu_nr);
+       return 1;
 }
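
The per-bank callback registered with x86_mce_callback_register() now returns
an enum mca_extinfo rather than an int, and the common logout code stops
invoking it for subsequent banks once a callback has answered
MCA_EXTINFO_GLOBAL.  A minimal callback skeleton, assuming only the types and
enum values visible in this patch (illustrative, not part of the changeset):

    static enum mca_extinfo
    example_extinfo_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
    {
        struct mcinfo_extended mc_ext;

        if (mi == NULL || !(status & MCi_STATUS_VAL))
            return MCA_EXTINFO_IGNORED;     /* nothing to add for this bank */

        memset(&mc_ext, 0, sizeof(mc_ext));
        mc_ext.common.type = MC_TYPE_EXTENDED;
        mc_ext.common.size = sizeof(mc_ext);
        /* ... read model-specific MSRs into mc_ext.mc_msr[] here ... */

        x86_mcinfo_add(mi, &mc_ext);
        return MCA_EXTINFO_LOCAL;           /* extra data is cpu/bank local */
    }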
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/amd_k8.c
--- a/xen/arch/x86/cpu/mcheck/amd_k8.c  Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_k8.c  Tue Mar 17 14:22:50 2009 +0000
@@ -67,234 +67,27 @@
 #include <asm/msr.h>
 
 #include "mce.h"
-#include "x86_mca.h"
 
 
 /* Machine Check Handler for AMD K8 family series */
 void k8_machine_check(struct cpu_user_regs *regs, long error_code)
 {
-       struct vcpu *vcpu = current;
-       struct domain *curdom;
-       struct mc_info *mc_data;
-       struct mcinfo_global mc_global;
-       struct mcinfo_bank mc_info;
-       uint64_t status, addrv, miscv, uc;
-       uint32_t i;
-       unsigned int cpu_nr;
-       uint32_t xen_impacted = 0;
-#define DOM_NORMAL     0
-#define DOM0_TRAP      1
-#define DOMU_TRAP      2
-#define DOMU_KILLED    4
-       uint32_t dom_state = DOM_NORMAL;
-
-       /* This handler runs as interrupt gate. So IPIs from the
-        * polling service routine are defered until we finished.
-        */
-
-        /* Disable interrupts for the _vcpu_. It may not re-scheduled to
-        * an other physical CPU or the impacted process in the guest
-        * continues running with corrupted data, otherwise. */
-        vcpu_schedule_lock_irq(vcpu);
-
-       mc_data = x86_mcinfo_getptr();
-       cpu_nr = smp_processor_id();
-       BUG_ON(cpu_nr != vcpu->processor);
-
-       curdom = vcpu->domain;
-
-       memset(&mc_global, 0, sizeof(mc_global));
-       mc_global.common.type = MC_TYPE_GLOBAL;
-       mc_global.common.size = sizeof(mc_global);
-
-       mc_global.mc_domid = curdom->domain_id; /* impacted domain */
-
-       x86_mc_get_cpu_info(cpu_nr, &mc_global.mc_socketid,
-           &mc_global.mc_coreid, &mc_global.mc_core_threadid,
-           &mc_global.mc_apicid, NULL, NULL, NULL);
-
-       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-       mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
-       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
-       /* Quick check, who is impacted */
-       xen_impacted = is_idle_domain(curdom);
-
-       /* Dom0 */
-       x86_mcinfo_clear(mc_data);
-       x86_mcinfo_add(mc_data, &mc_global);
-
-       for (i = 0; i < nr_mce_banks; i++) {
-               struct domain *d;
-
-               rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
-
-               if (!(status & MCi_STATUS_VAL))
-                       continue;
-
-               /* An error happened in this bank.
-                * This is expected to be an uncorrectable error,
-                * since correctable errors get polled.
-                */
-               uc = status & MCi_STATUS_UC;
-
-               memset(&mc_info, 0, sizeof(mc_info));
-               mc_info.common.type = MC_TYPE_BANK;
-               mc_info.common.size = sizeof(mc_info);
-               mc_info.mc_bank = i;
-               mc_info.mc_status = status;
-
-               addrv = 0;
-               if (status & MCi_STATUS_ADDRV) {
-                       rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
-                       
-                       d = maddr_get_owner(addrv);
-                       if (d != NULL)
-                               mc_info.mc_domid = d->domain_id;
-               }
-
-               miscv = 0;
-               if (status & MCi_STATUS_MISCV)
-                       rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
-
-               mc_info.mc_addr = addrv;
-               mc_info.mc_misc = miscv;
-
-               x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
-
-               if (mc_callback_bank_extended)
-                       mc_callback_bank_extended(mc_data, i, status);
-
-               /* clear status */
-               wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
-               wmb();
-               add_taint(TAINT_MACHINE_CHECK);
-       }
-
-       status = mc_global.mc_gstatus;
-
-       /* clear MCIP or cpu enters shutdown state
-        * in case another MCE occurs. */
-       status &= ~MCG_STATUS_MCIP;
-       wrmsrl(MSR_IA32_MCG_STATUS, status);
-       wmb();
-
-       /* For the details see the discussion "MCE/MCA concept" on xen-devel.
-        * The thread started here:
-        * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
-        */
-
-       /* MCG_STATUS_RIPV: 
-        * When this bit is not set, then the instruction pointer onto the stack
-        * to resume at is not valid. If xen is interrupted, then we panic anyway
-        * right below. Otherwise it is up to the guest to figure out if 
-        * guest kernel or guest userland is affected and should kill either
-        * itself or the affected process.
-        */
-
-       /* MCG_STATUS_EIPV:
-        * Evaluation of EIPV is the job of the guest.
-        */
-
-       if (xen_impacted) {
-               /* Now we are going to panic anyway. Allow interrupts, so that
-                * printk on serial console can work. */
-               vcpu_schedule_unlock_irq(vcpu);
-
-               /* Uh, that means, machine check exception
-                * inside Xen occured. */
-               printk("Machine check exception occured in Xen.\n");
-
-               /* if MCG_STATUS_EIPV indicates, the IP on the stack is related
-                * to the error then it makes sense to print a stack trace.
-                * That can be useful for more detailed error analysis and/or
-                * error case studies to figure out, if we can clear
-                * xen_impacted and kill a DomU instead
-                * (i.e. if a guest only control structure is affected, but then
-                * we must ensure the bad pages are not re-used again).
-                */
-               if (status & MCG_STATUS_EIPV) {
-                       printk("MCE: Instruction Pointer is related to the error. "
-                               "Therefore, print the execution state.\n");
-                       show_execution_state(regs);
-               }
-               x86_mcinfo_dump(mc_data);
-               mc_panic("End of MCE. Use mcelog to decode above error codes.\n");
-       }
-
-       /* If Dom0 registered a machine check handler, which is only possible
-        * with a PV MCA driver, then ... */
-       if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
-               dom_state = DOM0_TRAP;
-
-               /* ... deliver machine check trap to Dom0. */
-               send_guest_trap(dom0, 0, TRAP_machine_check);
-
-               /* Xen may tell Dom0 now to notify the DomU.
-                * But this will happen through a hypercall. */
-       } else
-               /* Dom0 did not register a machine check handler, but if DomU
-                * did so, then... */
-                if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) {
-                       dom_state = DOMU_TRAP;
-
-                       /* ... deliver machine check trap to DomU */
-                       send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
-       } else {
-               /* hmm... noone feels responsible to handle the error.
-                * So, do a quick check if a DomU is impacted or not.
-                */
-               if (curdom == dom0) {
-                       /* Dom0 is impacted. Since noone can't handle
-                        * this error, panic! */
-                       x86_mcinfo_dump(mc_data);
-                       mc_panic("MCE occured in Dom0, which it can't handle\n");
-
-                       /* UNREACHED */
-               } else {
-                       dom_state = DOMU_KILLED;
-
-                       /* Enable interrupts. This basically results in
-                        * calling sti on the *physical* cpu. But after
-                        * domain_crash() the vcpu pointer is invalid.
-                        * Therefore, we must unlock the irqs before killing
-                        * it. */
-                       vcpu_schedule_unlock_irq(vcpu);
-
-                       /* DomU is impacted. Kill it and continue. */
-                       domain_crash(curdom);
-               }
-       }
-
-
-       switch (dom_state) {
-       case DOM0_TRAP:
-       case DOMU_TRAP:
-               /* Enable interrupts. */
-               vcpu_schedule_unlock_irq(vcpu);
-
-               /* guest softirqs and event callbacks are scheduled
-                * immediately after this handler exits. */
-               break;
-       case DOMU_KILLED:
-               /* Nothing to do here. */
-               break;
-       default:
-               BUG();
-       }
+       mcheck_cmn_handler(regs, error_code, mca_allbanks);
 }
 
-
 /* AMD K8 machine check */
-void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
+int amd_k8_mcheck_init(struct cpuinfo_x86 *c)
 {
        uint64_t value;
        uint32_t i;
        int cpu_nr;
 
-       machine_check_vector = k8_machine_check;
+       /* Check for PPro style MCA; our caller has confirmed MCE support. */
+       if (!cpu_has(c, X86_FEATURE_MCA))
+               return 0;
+
+       x86_mce_vector_register(k8_machine_check);
        cpu_nr = smp_processor_id();
-       wmb();
 
        rdmsrl(MSR_IA32_MCG_CAP, value);
        if (value & MCG_CTL_P)  /* Control register present ? */
@@ -304,10 +97,8 @@ void amd_k8_mcheck_init(struct cpuinfo_x
        for (i = 0; i < nr_mce_banks; i++) {
                switch (i) {
                case 4: /* Northbridge */
-                       /* Enable error reporting of all errors,
-                        * enable error checking and
-                        * disable sync flooding */
-                       wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+                       /* Enable error reporting of all errors */
+                       wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL);
                        wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
                        break;
 
@@ -321,4 +112,6 @@ void amd_k8_mcheck_init(struct cpuinfo_x
 
        set_in_cr4(X86_CR4_MCE);
        printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr);
+
+       return 1;
 }
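
With the old hand-rolled K8 #MC handler gone, a family handler only supplies a
bank mask to mcheck_cmn_handler(); mca_allbanks is built in mce.c by setting
every bit up to MAX_NR_BANKS.  A hypothetical handler that wanted to exclude a
bank could build its own mask the same way (sketch only, assuming cpu_banks_t
is the bitmap type used for mca_allbanks; nothing like this exists in the
patch):

    static void example_machine_check(struct cpu_user_regs *regs, long error_code)
    {
        cpu_banks_t mybanks;
        int i;

        memset(mybanks, 0, sizeof(mybanks));
        for (i = 0; i < nr_mce_banks && i < MAX_NR_BANKS; i++)
            if (i != 4)                 /* e.g. leave the northbridge bank out */
                set_bit(i, mybanks);

        mcheck_cmn_handler(regs, error_code, mybanks);
    }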
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/amd_nonfatal.c
--- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c    Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c    Tue Mar 17 14:22:50 2009 +0000
@@ -58,22 +58,23 @@
 #include <xen/smp.h>
 #include <xen/timer.h>
 #include <xen/event.h>
-#include <asm/processor.h> 
+
+#include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/msr.h>
 
 #include "mce.h"
-#include "x86_mca.h"
 
 static struct timer mce_timer;
 
-#define MCE_PERIOD MILLISECS(15000)
+#define MCE_PERIOD MILLISECS(10000)
 #define MCE_MIN    MILLISECS(2000)
 #define MCE_MAX    MILLISECS(30000)
 
 static s_time_t period = MCE_PERIOD;
 static int hw_threshold = 0;
 static int adjust = 0;
+static int variable_period = 1;
 
 /* The polling service routine:
  * Collects information of correctable errors and notifies
@@ -81,99 +82,46 @@ static int adjust = 0;
  */
 void mce_amd_checkregs(void *info)
 {
-       struct vcpu *vcpu = current;
-       struct mc_info *mc_data;
-       struct mcinfo_global mc_global;
-       struct mcinfo_bank mc_info;
-       uint64_t status, addrv, miscv;
-       unsigned int i;
+       mctelem_cookie_t mctc;
+       struct mca_summary bs;
        unsigned int event_enabled;
-       unsigned int cpu_nr;
-       int error_found;
-
-       /* We don't need a slot yet. Only allocate one on error. */
-       mc_data = NULL;
-
-       cpu_nr = smp_processor_id();
-       BUG_ON(cpu_nr != vcpu->processor);
+
+       mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs);
+
        event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
-       error_found = 0;
-
-       memset(&mc_global, 0, sizeof(mc_global));
-       mc_global.common.type = MC_TYPE_GLOBAL;
-       mc_global.common.size = sizeof(mc_global);
-
-       mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
-       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-
-       x86_mc_get_cpu_info(cpu_nr, &mc_global.mc_socketid,
-           &mc_global.mc_coreid, &mc_global.mc_core_threadid,
-           &mc_global.mc_apicid, NULL, NULL, NULL);
-
-       mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
-       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
-       for (i = 0; i < nr_mce_banks; i++) {
-               struct domain *d;
-
-               rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
-
-               if (!(status & MCi_STATUS_VAL))
-                       continue;
-
-               if (mc_data == NULL) {
-                       /* Now we need a slot to fill in error telemetry. */
-                       mc_data = x86_mcinfo_getptr();
-                       BUG_ON(mc_data == NULL);
-                       x86_mcinfo_clear(mc_data);
-                       x86_mcinfo_add(mc_data, &mc_global);
-               }
-
-               memset(&mc_info, 0, sizeof(mc_info));
-               mc_info.common.type = MC_TYPE_BANK;
-               mc_info.common.size = sizeof(mc_info);
-               mc_info.mc_bank = i;
-               mc_info.mc_status = status;
-
-               /* Increase polling frequency */
-               error_found = 1;
-
-               addrv = 0;
-               if (status & MCi_STATUS_ADDRV) {
-                       rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
-
-                       d = maddr_get_owner(addrv);
-                       if (d != NULL)
-                               mc_info.mc_domid = d->domain_id;
-               }
-
-               miscv = 0;
-               if (status & MCi_STATUS_MISCV)
-                       rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
-
-               mc_info.mc_addr = addrv;
-               mc_info.mc_misc = miscv;
-               x86_mcinfo_add(mc_data, &mc_info);
-
-               if (mc_callback_bank_extended)
-                       mc_callback_bank_extended(mc_data, i, status);
-
-               /* clear status */
-               wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
-               wmb();
-       }
-
-       if (error_found > 0) {
-               /* If Dom0 enabled the VIRQ_MCA event, then ... */
-               if (event_enabled)
-                       /* ... notify it. */
+
+       if (bs.errcnt && mctc != NULL) {
+               static uint64_t dumpcount = 0;
+
+               /* If Dom0 enabled the VIRQ_MCA event, then notify it.
+                * Otherwise, if dom0 has had plenty of time to register
+                * the virq handler but still hasn't then dump telemetry
+                * to the Xen console.  The call count may be incremented
+                * on multiple cpus at once and is indicative only - just
+                * a simple-minded attempt to avoid spamming the console
+                * for corrected errors in early startup. */
+
+               if (event_enabled) {
+                       mctelem_commit(mctc);
                        send_guest_global_virq(dom0, VIRQ_MCA);
-               else
-                       /* ... or dump it */
-                       x86_mcinfo_dump(mc_data);
-       }
-
-       adjust += error_found;
+               } else if (++dumpcount >= 10) {
+                       x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));
+                       mctelem_dismiss(mctc);
+               } else {
+                       mctelem_dismiss(mctc);
+               }
+               
+       } else if (mctc != NULL) {
+               mctelem_dismiss(mctc);
+       }
+
+       /* adjust is global and all cpus may attempt to increment it without
+        * synchronisation, so they race and the final adjust count
+        * (number of cpus seeing any error) is approximate.  We can
+        * guarantee that if any cpu observes an error that the
+        * adjust count is at least 1. */
+       if (bs.errcnt)
+               adjust++;
 }
 
 /* polling service routine invoker:
@@ -188,7 +136,7 @@ static void mce_amd_work_fn(void *data)
        on_each_cpu(mce_amd_checkregs, data, 1, 1);
 
        if (adjust > 0) {
-               if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+               if (!guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
                        /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
                        printk("MCE: polling routine found correctable error. "
                                " Use mcelog to parse above error output.\n");
@@ -229,19 +177,19 @@ static void mce_amd_work_fn(void *data)
                }
        }
 
-       if (adjust > 0) {
+       if (variable_period && adjust > 0) {
                /* Increase polling frequency */
                adjust++; /* adjust == 1 must have an effect */
                period /= adjust;
-       } else {
+       } else if (variable_period) {
                /* Decrease polling frequency */
                period *= 2;
        }
-       if (period > MCE_MAX) {
+       if (variable_period && period > MCE_MAX) {
                /* limit: Poll at least every 30s */
                period = MCE_MAX;
        }
-       if (period < MCE_MIN) {
+       if (variable_period && period < MCE_MIN) {
                /* limit: Poll every 2s.
                 * When this is reached an uncorrectable error
                 * is expected to happen, if Dom0 does nothing.
@@ -262,7 +210,7 @@ void amd_nonfatal_mcheck_init(struct cpu
 
        /* The threshold bitfields in MSR_IA32_MC4_MISC has
         * been introduced along with the SVME feature bit. */
-       if (cpu_has(c, X86_FEATURE_SVME)) {
+       if (variable_period && cpu_has(c, X86_FEATURE_SVME)) {
                uint64_t value;
 
                /* hw threshold registers present */
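
The poller's interval adaptation in mce_amd_work_fn() is unchanged in spirit
but is now gated on variable_period: a pass that saw errors shortens the next
period, a quiet pass doubles it, and the result is clamped to [MCE_MIN,
MCE_MAX].  A compact restatement of that arithmetic (illustration only;
example_next_period is not a function in this patch):

    static s_time_t example_next_period(s_time_t period, int adjust)
    {
        if (!variable_period)
            return period;
        if (adjust > 0)
            period /= adjust + 1;       /* errors seen: poll more often */
        else
            period *= 2;                /* quiet: back off */
        if (period > MCE_MAX)
            period = MCE_MAX;           /* poll at least every 30s */
        if (period < MCE_MIN)
            period = MCE_MIN;           /* poll at most every 2s */
        return period;
    }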
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/k7.c
--- a/xen/arch/x86/cpu/mcheck/k7.c      Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/k7.c      Tue Mar 17 14:22:50 2009 +0000
@@ -68,13 +68,16 @@ static fastcall void k7_machine_check(st
 
 
 /* AMD K7 machine check */
-void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
+int amd_k7_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 l, h;
        int i;
 
-       machine_check_vector = k7_machine_check;
-       wmb();
+       /* Check for PPro style MCA; our caller has confirmed MCE support. */
+       if (!cpu_has(c, X86_FEATURE_MCA))
+               return 0;
+
+       x86_mce_vector_register(k7_machine_check);
 
        rdmsr (MSR_IA32_MCG_CAP, l, h);
        if (l & (1<<8)) /* Control register present ? */
@@ -92,4 +95,6 @@ void amd_k7_mcheck_init(struct cpuinfo_x
        set_in_cr4 (X86_CR4_MCE);
        printk (KERN_INFO "CPU%d: AMD K7 machine check reporting enabled.\n",
                smp_processor_id());
+
+       return 1;
 }
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c     Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/mce.c     Tue Mar 17 14:22:50 2009 +0000
@@ -10,104 +10,490 @@
 #include <xen/smp.h>
 #include <xen/errno.h>
 #include <xen/console.h>
-
-#include <asm/processor.h> 
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/cpumask.h>
+#include <xen/event.h>
+#include <xen/guest_access.h>
+
+#include <asm/processor.h>
 #include <asm/system.h>
+#include <asm/msr.h>
 
 #include "mce.h"
-#include "x86_mca.h"
 
 int mce_disabled = 0;
 unsigned int nr_mce_banks;
 
 EXPORT_SYMBOL_GPL(nr_mce_banks);       /* non-fatal.o */
 
-/* XXX For now a fixed array is used. Later this should be changed
- * to a dynamic allocated array with the size calculated in relation
- * to physical cpus present in the machine.
- * The more physical cpus are available, the more entries you need.
- */
-#define MAX_MCINFO     20
-
-struct mc_machine_notify {
-       struct mc_info mc;
-       uint32_t fetch_idx;
-       uint32_t valid;
-};
-
-struct mc_machine {
-
-       /* Array structure used for collecting machine check error telemetry. */
-       struct mc_info mc[MAX_MCINFO];
-
-       /* We handle multiple machine check reports lockless by
-        * iterating through the array using the producer/consumer concept.
-        */
-       /* Producer array index to fill with machine check error data.
-        * Index must be increased atomically. */
-       uint32_t error_idx;
-
-       /* Consumer array index to fetch machine check error data from.
-        * Index must be increased atomically. */
-       uint32_t fetch_idx;
-
-       /* Integer array holding the indeces of the mc array that allows
-         * a Dom0 to notify a DomU to re-fetch the same machine check error
-         * data. The notification and refetch also uses its own 
-        * producer/consumer mechanism, because Dom0 may decide to not report
-        * every error to the impacted DomU.
-        */
-       struct mc_machine_notify notify[MAX_MCINFO];
-
-       /* Array index to get fetch_idx from.
-        * Index must be increased atomically. */
-       uint32_t notifyproducer_idx;
-       uint32_t notifyconsumer_idx;
-};
-
-/* Global variable with machine check information. */
-struct mc_machine mc_data;
+static void mcinfo_clear(struct mc_info *);
+
+#define        SEG_PL(segsel) ((segsel) & 0x3)
+
+#if 1  /* XXFM switch to 0 for putback */
+
+#define        x86_mcerr(str, err) _x86_mcerr(str, err)
+
+static int _x86_mcerr(const char *msg, int err)
+{
+       printk("x86_mcerr: %s, returning %d\n",
+           msg != NULL ? msg : "", err);
+       return err;
+}
+#else
+#define x86_mcerr(str,err)
+#endif
+
+cpu_banks_t mca_allbanks;
 
 /* Handle unconfigured int18 (should never happen) */
 static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
-{      
+{
        printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
                smp_processor_id());
 }
 
 
+static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
+
+void x86_mce_vector_register(x86_mce_vector_t hdlr)
+{
+       _machine_check_vector = hdlr;
+       wmb();
+}
+
 /* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code) = unexpected_machine_check;
+
+void machine_check_vector(struct cpu_user_regs *regs, long error_code)
+{
+       _machine_check_vector(regs, error_code);
+}
 
 /* Init machine check callback handler
  * It is used to collect additional information provided by newer
  * CPU families/models without the need to duplicate the whole handler.
  * This avoids having many handlers doing almost nearly the same and each
  * with its own tweaks ands bugs. */
-int (*mc_callback_bank_extended)(struct mc_info *, uint16_t, uint64_t) = NULL;
-
-
-static void amd_mcheck_init(struct cpuinfo_x86 *ci)
-{
+static x86_mce_callback_t mc_callback_bank_extended = NULL;
+
+void x86_mce_callback_register(x86_mce_callback_t cbfunc)
+{
+       mc_callback_bank_extended = cbfunc;
+}
+
+/* Utility function to perform MCA bank telemetry readout and to push that
+ * telemetry towards an interested dom0 for logging and diagnosis.
+ * The caller - #MC handler or MCA poll function - must arrange that we
+ * do not migrate cpus. */
+
+/* XXFM Could add overflow counting? */
+mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
+    struct mca_summary *sp)
+{
+       struct vcpu *v = current;
+       struct domain *d;
+       uint64_t gstatus, status, addr, misc;
+       struct mcinfo_global mcg;       /* on stack */
+       struct mcinfo_common *mic;
+       struct mcinfo_global *mig;      /* on stack */
+       mctelem_cookie_t mctc = NULL;
+       uint32_t uc = 0, pcc = 0;
+       struct mc_info *mci = NULL;
+       mctelem_class_t which = MC_URGENT;      /* XXXgcc */
+       unsigned int cpu_nr;
+       int errcnt = 0;
+       int i;
+       enum mca_extinfo cbret = MCA_EXTINFO_IGNORED;
+
+       cpu_nr = smp_processor_id();
+       BUG_ON(cpu_nr != v->processor);
+
+       rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+
+       memset(&mcg, 0, sizeof (mcg));
+       mcg.common.type = MC_TYPE_GLOBAL;
+       mcg.common.size = sizeof (mcg);
+       if (v != NULL && ((d = v->domain) != NULL)) {
+               mcg.mc_domid = d->domain_id;
+               mcg.mc_vcpuid = v->vcpu_id;
+       } else {
+               mcg.mc_domid = -1;
+               mcg.mc_vcpuid = -1;
+       }
+       mcg.mc_gstatus = gstatus;       /* MCG_STATUS */
+
+       switch (who) {
+       case MCA_MCE_HANDLER:
+               mcg.mc_flags = MC_FLAG_MCE;
+               which = MC_URGENT;
+               break;
+
+       case MCA_POLLER:
+       case MCA_RESET:
+               mcg.mc_flags = MC_FLAG_POLLED;
+               which = MC_NONURGENT;
+               break;
+
+       case MCA_CMCI_HANDLER:
+               mcg.mc_flags = MC_FLAG_CMCI;
+               which = MC_NONURGENT;
+               break;
+
+       default:
+               BUG();
+       }
+
+       /* Retrieve detector information */
+       x86_mc_get_cpu_info(cpu_nr, &mcg.mc_socketid,
+           &mcg.mc_coreid, &mcg.mc_core_threadid,
+           &mcg.mc_apicid, NULL, NULL, NULL);
+
+       for (i = 0; i < 32 && i < nr_mce_banks; i++) {
+               struct mcinfo_bank mcb;         /* on stack */
+
+               /* Skip bank if corresponding bit in bankmask is clear */
+               if (!test_bit(i, bankmask))
+                       continue;
+
+               rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+               if (!(status & MCi_STATUS_VAL))
+                       continue;       /* this bank has no valid telemetry */
+
+               /* If this is the first bank with valid MCA DATA, then
+                * try to reserve an entry from the urgent/nonurgent queue
+                * depending on whether we are called from an exception or
+                * a poller;  this can fail (for example dom0 may not
+                * yet have consumed past telemetry). */
+               if (errcnt == 0) {
+                       if ((mctc = mctelem_reserve(which)) != NULL) {
+                               mci = mctelem_dataptr(mctc);
+                               mcinfo_clear(mci);
+                       }
+               }
+
+               memset(&mcb, 0, sizeof (mcb));
+               mcb.common.type = MC_TYPE_BANK;
+               mcb.common.size = sizeof (mcb);
+               mcb.mc_bank = i;
+               mcb.mc_status = status;
+
+               /* form a mask of which banks have logged uncorrected errors */
+               if ((status & MCi_STATUS_UC) != 0)
+                       uc |= (1 << i);
+
+               /* likewise for those with processor context corrupt */
+               if ((status & MCi_STATUS_PCC) != 0)
+                       pcc |= (1 << i);
+
+               addr = misc = 0;
+
+               if (status & MCi_STATUS_ADDRV) {
+                       rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
+                       d = maddr_get_owner(addr);
+                       if (d != NULL && (who == MCA_POLLER ||
+                           who == MCA_CMCI_HANDLER))
+                               mcb.mc_domid = d->domain_id;
+               }
+
+               if (status & MCi_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
+
+               mcb.mc_addr = addr;
+               mcb.mc_misc = misc;
+
+               if (who == MCA_CMCI_HANDLER) {
+                       rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
+                       rdtscll(mcb.mc_tsc);
+               }
+
+               /* Increment the error count;  if this is the first bank
+                * with a valid error then add the global info to the mcinfo. */
+               if (errcnt++ == 0 && mci != NULL)
+                       x86_mcinfo_add(mci, &mcg);
+
+               /* Add the bank data */
+               if (mci != NULL)
+                       x86_mcinfo_add(mci, &mcb);
+
+               if (mc_callback_bank_extended && cbret != MCA_EXTINFO_GLOBAL) {
+                       cbret = mc_callback_bank_extended(mci, i, status);
+               }
+
+               /* Clear status */
+               wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+               wmb();
+       }
+
+       if (mci != NULL && errcnt > 0) {
+               x86_mcinfo_lookup(mic, mci, MC_TYPE_GLOBAL);
+               mig = (struct mcinfo_global *)mic;
+               if (pcc)
+                       mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
+               else if (uc)
+                       mcg.mc_flags |= MC_FLAG_RECOVERABLE;
+               else
+                       mcg.mc_flags |= MC_FLAG_CORRECTABLE;
+       }
+
+
+       if (sp) {
+               sp->errcnt = errcnt;
+               sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
+               sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
+               sp->uc = uc;
+               sp->pcc = pcc;
+       }
+
+       return mci != NULL ? mctc : NULL;       /* may be NULL */
+}
+
+#define DOM_NORMAL     0
+#define DOM0_TRAP      1
+#define DOMU_TRAP      2
+#define DOMU_KILLED    4
+
+/* Shared #MC handler. */
+void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
+    cpu_banks_t bankmask)
+{
+       int xen_state_lost, dom0_state_lost, domU_state_lost;
+       struct vcpu *v = current;
+       struct domain *curdom = v->domain;
+       domid_t domid = curdom->domain_id;
+       int ctx_xen, ctx_dom0, ctx_domU;
+       uint32_t dom_state = DOM_NORMAL;
+       mctelem_cookie_t mctc = NULL;
+       struct mca_summary bs;
+       struct mc_info *mci = NULL;
+       int irqlocked = 0;
+       uint64_t gstatus;
+       int ripv;
+
+       /* This handler runs as an interrupt gate. So IPIs from the
+        * polling service routine are deferred until we're finished.
+        */
+
+       /* Disable interrupts for the _vcpu_. It must not be re-scheduled to
+        * another physical CPU. */
+       vcpu_schedule_lock_irq(v);
+       irqlocked = 1;
+
+       /* Read global status;  if it does not indicate machine check
+        * in progress then bail as long as we have a valid ip to return to. */
+       rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+       ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
+       if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
+               add_taint(TAINT_MACHINE_CHECK); /* questionable */
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+               goto cmn_handler_done;
+       }
+
+       /* Go and grab error telemetry.  We must choose whether to commit
+        * for logging or dismiss the cookie that is returned, and must not
+        * reference the cookie after that action.
+        */
+       mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs);
+       if (mctc != NULL)
+               mci = (struct mc_info *)mctelem_dataptr(mctc);
+
+       /* Clear MCIP or another #MC will enter shutdown state */
+       gstatus &= ~MCG_STATUS_MCIP;
+       wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
+       wmb();
+
+       /* If no valid errors and our stack is intact, we're done */
+       if (ripv && bs.errcnt == 0) {
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+               goto cmn_handler_done;
+       }
+
+       if (bs.uc || bs.pcc)
+               add_taint(TAINT_MACHINE_CHECK);
+
+       /* Machine check exceptions will usually be for UC and/or PCC errors,
+        * but it is possible to configure machine check for some classes
+        * of corrected error.
+        *
+        * UC errors could compromise any domain or the hypervisor
+        * itself - for example a cache writeback of modified data that
+        * turned out to be bad could be for data belonging to anyone, not
+        * just the current domain.  In the absence of known data poisoning
+        * to prevent consumption of such bad data in the system we regard
+        * all UC errors as terminal.  It may be possible to attempt some
+        * heuristics based on the address affected, which guests have
+        * mappings to that mfn etc.
+        *
+        * PCC errors apply to the current context.
+        *
+        * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
+        * and not PCC is terminal - the return instruction pointer
+        * pushed onto the stack is bogus.  If the interrupt context is
+        * the hypervisor or dom0 the game is over, otherwise we can
+        * limit the impact to a single domU but only if we trampoline
+        * somewhere safely - we can't return and unwind the stack.
+        * Since there is no trampoline in place we will treat !RIPV
+        * as terminal for any context.
+        */
+       ctx_xen = SEG_PL(regs->cs) == 0;
+       ctx_dom0 = !ctx_xen && (domid == dom0->domain_id);
+       ctx_domU = !ctx_xen && !ctx_dom0;
+
+       xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
+           !ripv;
+       dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
+       domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));
+
+       if (xen_state_lost) {
+               /* Now we are going to panic anyway. Allow interrupts, so that
+                * printk on serial console can work. */
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+
+               printk("Terminal machine check exception occurred in "
+                   "hypervisor context.\n");
+
+               /* If MCG_STATUS_EIPV indicates that the IP on the stack is
+                * related to the error then it makes sense to print a stack trace.
+                * That can be useful for more detailed error analysis and/or
+                * error case studies to figure out if we can clear
+                * xen_state_lost and kill a DomU instead
+                * (i.e. if a guest only control structure is affected, but then
+                * we must ensure the bad pages are not re-used again).
+                */
+               if (bs.eipv) {
+                       printk("MCE: Instruction Pointer is related to the "
+                           "error, therefore print the execution state.\n");
+                       show_execution_state(regs);
+               }
+
+               /* Commit the telemetry so that panic flow can find it. */
+               if (mctc != NULL) {
+                       x86_mcinfo_dump(mci);
+                       mctelem_commit(mctc);
+               }
+               mc_panic("Hypervisor state lost due to machine check "
+                   "exception.\n");
+               /*NOTREACHED*/
+       }
+
+       /*
+        * Xen hypervisor state is intact.  If dom0 state is lost then
+        * give it a chance to decide what to do if it has registered
+        * a handler for this event, otherwise panic.
+        *
+        * XXFM Could add some Solaris dom0 contract kill here?
+        */
+       if (dom0_state_lost) {
+               if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
+                       dom_state = DOM0_TRAP;
+                       send_guest_trap(dom0, 0, TRAP_machine_check);
+                       /* XXFM case of return with !ripv ??? */
+               } else {
+                       /* Commit telemetry for panic flow. */
+                       if (mctc != NULL) {
+                               x86_mcinfo_dump(mci);
+                               mctelem_commit(mctc);
+                       }
+                       mc_panic("Dom0 state lost due to machine check "
+                           "exception\n");
+                       /*NOTREACHED*/
+               }
+       }
+
+       /*
+        * If a domU has lost state then send it a trap if it has registered
+        * a handler, otherwise crash the domain.
+        * XXFM Revisit this functionality.
+        */
+       if (domU_state_lost) {
+               if (guest_has_trap_callback(v->domain, v->vcpu_id,
+                   TRAP_machine_check)) {
+                       dom_state = DOMU_TRAP;
+                       send_guest_trap(curdom, v->vcpu_id,
+                           TRAP_machine_check);
+               } else {
+                       dom_state = DOMU_KILLED;
+                       /* Enable interrupts. This basically results in
+                        * calling sti on the *physical* cpu. But after
+                        * domain_crash() the vcpu pointer is invalid.
+                        * Therefore, we must unlock the irqs before killing
+                        * it. */
+                       vcpu_schedule_unlock_irq(v);
+                       irqlocked = 0;
+
+                       /* DomU is impacted. Kill it and continue. */
+                       domain_crash(curdom);
+               }
+       }
+
+       switch (dom_state) {
+       case DOM0_TRAP:
+       case DOMU_TRAP:
+               /* Enable interrupts. */
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+
+               /* guest softirqs and event callbacks are scheduled
+                * immediately after this handler exits. */
+               break;
+       case DOMU_KILLED:
+               /* Nothing to do here. */
+               break;
+
+       case DOM_NORMAL:
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+               break;
+       }
+
+cmn_handler_done:
+       BUG_ON(irqlocked);
+       BUG_ON(!ripv);
+
+       if (bs.errcnt) {
+               /* Not panicking, so forward telemetry to dom0 now if it
+                * is interested. */
+               if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+                       if (mctc != NULL)
+                               mctelem_commit(mctc);
+                       send_guest_global_virq(dom0, VIRQ_MCA);
+               } else {
+                       x86_mcinfo_dump(mci);
+                       if (mctc != NULL)
+                               mctelem_dismiss(mctc);
+               }
+       } else if (mctc != NULL) {
+               mctelem_dismiss(mctc);
+       }
+}
+
+static int amd_mcheck_init(struct cpuinfo_x86 *ci)
+{
+       int rc = 0;
 
        switch (ci->x86) {
        case 6:
-               amd_k7_mcheck_init(ci);
+               rc = amd_k7_mcheck_init(ci);
                break;
 
        case 0xf:
-               amd_k8_mcheck_init(ci);
+               rc = amd_k8_mcheck_init(ci);
                break;
 
        case 0x10:
-               amd_f10_mcheck_init(ci);
+               rc = amd_f10_mcheck_init(ci);
                break;
 
        default:
                /* Assume that machine check support is available.
                 * The minimum provided support is at least the K8. */
-               amd_k8_mcheck_init(ci);
-       }
+               rc = amd_k8_mcheck_init(ci);
+       }
+
+       return rc;
 }
 
 /*check the existence of Machine Check*/
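
The terminal-error classification in mcheck_cmn_handler() above reduces to a
small predicate: any UC error dooms everything, !RIPV always dooms the
hypervisor, and dom0 or a domU is additionally lost if it was the interrupted
context and the error is PCC or !RIPV.  Restated as a helper (purely
illustrative; the handler open-codes this):

    /* Illustrative restatement of the *_state_lost expressions above. */
    static int example_ctx_state_lost(const struct mca_summary *bs, int ripv,
        int is_this_ctx)
    {
        return bs->uc != 0 || (is_this_ctx && (bs->pcc || !ripv));
    }
    /* For Xen itself !RIPV is always terminal, since there is no trampoline
     * to resume through:
     *   xen_state_lost = example_ctx_state_lost(&bs, ripv, ctx_xen) || !ripv;
     */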
@@ -116,50 +502,81 @@ int mce_available(struct cpuinfo_x86 *c)
        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
+/*
+ * Check if bank 0 is usable for MCE. It isn't for AMD K7,
+ * and Intel P6 family before model 0x1a.
+ */
+int mce_firstbank(struct cpuinfo_x86 *c)
+{
+       if (c->x86 == 6) {
+               if (c->x86_vendor == X86_VENDOR_AMD)
+                       return 1;
+
+               if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a)
+                       return 1;
+       }
+
+       return 0;
+}
+
 /* This has to be run for each processor */
 void mcheck_init(struct cpuinfo_x86 *c)
 {
+       int inited = 0, i;
+
        if (mce_disabled == 1) {
                printk(XENLOG_INFO "MCE support disabled by bootparam\n");
                return;
        }
 
+       for (i = 0; i < MAX_NR_BANKS; i++)
+               set_bit(i,mca_allbanks);
+
+       /* Enforce at least MCE support in CPUID information.  Individual
+        * families may also need to enforce a check for MCA support. */
        if (!cpu_has(c, X86_FEATURE_MCE)) {
                printk(XENLOG_INFO "CPU%i: No machine check support 
available\n",
                        smp_processor_id());
                return;
        }
 
-       memset(&mc_data, 0, sizeof(struct mc_machine));
+       mctelem_init(sizeof (struct mc_info));
 
        switch (c->x86_vendor) {
        case X86_VENDOR_AMD:
-               amd_mcheck_init(c);
+               inited = amd_mcheck_init(c);
                break;
 
        case X86_VENDOR_INTEL:
+               switch (c->x86) {
+               case 5:
 #ifndef CONFIG_X86_64
-               if (c->x86==5)
-                       intel_p5_mcheck_init(c);
+                       inited = intel_p5_mcheck_init(c);
 #endif
-               /*If it is P6 or P4 family, including CORE 2 DUO series*/
-               if (c->x86 == 6 || c->x86==15)
-               {
-                       printk(KERN_DEBUG "MCE: Intel newly family MC Init\n");
-                       intel_mcheck_init(c);
+                       break;
+
+               case 6:
+               case 15:
+                       inited = intel_mcheck_init(c);
+                       break;
                }
                break;
 
 #ifndef CONFIG_X86_64
        case X86_VENDOR_CENTAUR:
-               if (c->x86==5)
-                       winchip_mcheck_init(c);
+               if (c->x86==5) {
+                       inited = winchip_mcheck_init(c);
+               }
                break;
 #endif
 
        default:
                break;
        }
+
+       if (!inited)
+               printk(XENLOG_INFO "CPU%i: No machine check initialization\n",
+                   smp_processor_id());
 }
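
mce_firstbank(), added above, lets bank-enable loops skip MC bank 0 on parts
where it is unusable (AMD K7, and Intel family 6 before model 0x1a); its real
callers are in the Intel and common init code changed elsewhere in this patch.
A hypothetical use, assuming the conventional MSR_IA32_MC0_CTL bank layout
(sketch only):

    static void example_enable_banks(struct cpuinfo_x86 *c)
    {
        int i;

        /* Start at the first usable bank rather than bank 0. */
        for (i = mce_firstbank(c); i < nr_mce_banks; i++) {
            wrmsrl(MSR_IA32_MC0_CTL + 4 * i, 0xffffffffffffffffULL);
            wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
        }
    }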
 
 
@@ -176,190 +593,11 @@ custom_param("nomce", mcheck_disable);
 custom_param("nomce", mcheck_disable);
 custom_param("mce", mcheck_enable);
 
-
-#include <xen/guest_access.h>
-#include <asm/traps.h>
-
-struct mc_info *x86_mcinfo_getptr(void)
-{
-       struct mc_info *mi;
-       uint32_t entry, next;
-
-       for (;;) {
-               entry = mc_data.error_idx;
-               smp_rmb();
-               next = entry + 1;
-               if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
-                       break;
-       }
-
-       mi = &(mc_data.mc[(entry % MAX_MCINFO)]);
-       BUG_ON(mc_data.error_idx < mc_data.fetch_idx);
-
-       return mi;
-}
-
-static int x86_mcinfo_matches_guest(const struct mc_info *mi,
-                       const struct domain *d, const struct vcpu *v)
-{
-       struct mcinfo_common *mic;
-       struct mcinfo_global *mig;
-
-       x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
-       mig = (struct mcinfo_global *)mic;
-       if (mig == NULL)
-               return 0;
-
-       if (d->domain_id != mig->mc_domid)
-               return 0;
-
-       if (v->vcpu_id != mig->mc_vcpuid)
-               return 0;
-
-       return 1;
-}
-
-
-#define x86_mcinfo_mcdata(idx) (mc_data.mc[(idx % MAX_MCINFO)])
-
-static struct mc_info *x86_mcinfo_getfetchptr(uint32_t *fetch_idx,
-                               const struct domain *d, const struct vcpu *v)
-{
-       struct mc_info *mi;
-
-       /* This function is called from the fetch hypercall with
-        * the mc_lock spinlock held. Thus, no need for locking here.
-        */
-       mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
-       if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
-               /* Bogus domU command detected. */
-               *fetch_idx = 0;
-               return NULL;
-       }
-
-       *fetch_idx = mc_data.fetch_idx;
-       mc_data.fetch_idx++;
-       BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
-
-       return mi;
-}
-
-
-static void x86_mcinfo_marknotified(struct xen_mc_notifydomain *mc_notifydomain)
-{
-       struct mc_machine_notify *mn;
-       struct mcinfo_common *mic = NULL;
-       struct mcinfo_global *mig;
-       struct domain *d;
-       int i;
-
-       /* This function is called from the notifier hypercall with
-        * the mc_notify_lock spinlock held. Thus, no need for locking here.
-        */
-
-       /* First invalidate entries for guests that disappeared after
-        * notification (e.g. shutdown/crash). This step prevents the
-        * notification array from filling up with stalling/leaking entries.
-        */
-       for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
-               mn = &(mc_data.notify[(i % MAX_MCINFO)]);
-               x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
-               BUG_ON(mic == NULL);
-               mig = (struct mcinfo_global *)mic;
-               d = get_domain_by_id(mig->mc_domid);
-               if (d == NULL) {
-                       /* Domain does not exist. */
-                       mn->valid = 0;
-               }
-               if ((!mn->valid) && (i == mc_data.notifyconsumer_idx))
-                       mc_data.notifyconsumer_idx++;
-       }
-
-       /* Now put in the error telemetry. Since all error data fetchable
-        * by domUs are uncorrectable errors, they are very important.
-        * So we dump them before overriding them. When a guest takes that long,
-        * then we can assume something bad already happened (crash, hang, etc.)
-        */
-       mn = &(mc_data.notify[(mc_data.notifyproducer_idx % MAX_MCINFO)]);
-
-       if (mn->valid) {
-               struct mcinfo_common *mic = NULL;
-               struct mcinfo_global *mig;
-
-               /* To not loose the information, we dump it. */
-               x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
-               BUG_ON(mic == NULL);
-               mig = (struct mcinfo_global *)mic;
-               printk(XENLOG_WARNING "Domain ID %u was notified by Dom0 to "
-                       "fetch machine check error telemetry. But Domain ID "
-                       "did not do that in time.\n",
-                       mig->mc_domid);
-               x86_mcinfo_dump(&mn->mc);
-       }
-
-       memcpy(&mn->mc, &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)),
-               sizeof(struct mc_info));
-       mn->fetch_idx = mc_notifydomain->fetch_idx;
-       mn->valid = 1;
-
-       mc_data.notifyproducer_idx++;
-
-       /* By design there can never be more notifies than machine check errors.
-        * If that ever happens, then we hit a bug. */
-       BUG_ON(mc_data.notifyproducer_idx > mc_data.fetch_idx);
-       BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
-}
-
-static struct mc_info *x86_mcinfo_getnotifiedptr(uint32_t *fetch_idx,
-                               const struct domain *d, const struct vcpu *v)
-{
-       struct mc_machine_notify *mn = NULL;
-       uint32_t i;
-       int found;
-
-       /* This function is called from the fetch hypercall with
-        * the mc_notify_lock spinlock held. Thus, no need for locking here.
-        */
-
-       /* The notifier data is filled in the order guests get notified, but
-        * guests may fetch them in a different order. That's why we need
-        * the game with valid/invalid entries. */
-       found = 0;
-       for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
-               mn = &(mc_data.notify[(i % MAX_MCINFO)]);
-               if (!mn->valid) {
-                       if (i == mc_data.notifyconsumer_idx)
-                               mc_data.notifyconsumer_idx++;
-                       continue;
-               }
-               if (x86_mcinfo_matches_guest(&mn->mc, d, v)) {
-                       found = 1;
-                       break;
-               }
-       }
-
-       if (!found) {
-               /* This domain has never been notified. This must be
-                * a bogus domU command. */
-               *fetch_idx = 0;
-               return NULL;
-       }
-
-       BUG_ON(mn == NULL);
-       *fetch_idx = mn->fetch_idx;
-       mn->valid = 0;
-
-       BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
-       return &mn->mc;
-}
-
-
-void x86_mcinfo_clear(struct mc_info *mi)
+static void mcinfo_clear(struct mc_info *mi)
 {
        memset(mi, 0, sizeof(struct mc_info));
        x86_mcinfo_nentries(mi) = 0;
 }
-
 
 int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
 {
@@ -380,7 +618,7 @@ int x86_mcinfo_add(struct mc_info *mi, v
        end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
 
        if (end1 < end2)
-               return -ENOSPC; /* No space. Can't add entry. */
+               return x86_mcerr("mcinfo_add: no more space", -ENOSPC);
 
        /* there's enough space. add entry. */
        memcpy(mic_index, mic, mic->size);
@@ -388,7 +626,6 @@ int x86_mcinfo_add(struct mc_info *mi, v
 
        return 0;
 }
-
 
 /* Dump machine check information in a format,
  * mcelog can parse. This is used only when
@@ -404,7 +641,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
        if (mic == NULL)
                return;
        mc_global = (struct mcinfo_global *)mic;
-       if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+       if (mc_global->mc_flags & MC_FLAG_MCE) {
                printk(XENLOG_WARNING
                        "CPU%d: Machine Check Exception: %16"PRIx64"\n",
                        mc_global->mc_coreid, mc_global->mc_gstatus);
@@ -424,7 +661,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
                        goto next;
 
                mc_bank = (struct mcinfo_bank *)mic;
-       
+
                printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
                        mc_bank->mc_bank,
                        mc_bank->mc_status);
@@ -440,8 +677,6 @@ next:
                        break;
        } while (1);
 }
-
-
 
 static void do_mc_get_cpu_info(void *v)
 {
@@ -533,183 +768,141 @@ void x86_mc_get_cpu_info(unsigned cpu, u
        }
 }
 
+#if BITS_PER_LONG == 64
+
+#define        ID2COOKIE(id)   ((mctelem_cookie_t)(id))
+#define        COOKIE2ID(c) ((uint64_t)(c))
+
+#elif BITS_PER_LONG == 32
+
+#define        ID2COOKIE(id)   ((mctelem_cookie_t)(uint32_t)((id) & 0xffffffffU))
+#define        COOKIE2ID(c)    ((uint64_t)(uint32_t)(c))
+
+#elif defined(BITS_PER_LONG)
+#error BITS_PER_LONG has unexpected value
+#else
+#error BITS_PER_LONG definition absent
+#endif
+
 /* Machine Check Architecture Hypercall */
 long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
 {
        long ret = 0;
        struct xen_mc curop, *op = &curop;
        struct vcpu *v = current;
-       struct domain *domU;
        struct xen_mc_fetch *mc_fetch;
-       struct xen_mc_notifydomain *mc_notifydomain;
        struct xen_mc_physcpuinfo *mc_physcpuinfo;
-       struct mc_info *mi;
-       uint32_t flags;
-       uint32_t fetch_idx;
-        uint16_t vcpuid;
-       /* Use a different lock for the notify hypercall in order to allow
-        * a DomU to fetch mc data while Dom0 notifies another DomU. */
-       static DEFINE_SPINLOCK(mc_lock);
-       static DEFINE_SPINLOCK(mc_notify_lock);
+       uint32_t flags, cmdflags;
        int nlcpu;
        xen_mc_logical_cpu_t *log_cpus = NULL;
+       mctelem_cookie_t mctc;
+       mctelem_class_t which;
 
        if ( copy_from_guest(op, u_xen_mc, 1) )
-               return -EFAULT;
+               return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
 
        if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
-               return -EACCES;
-
-       switch ( op->cmd ) {
+               return x86_mcerr("do_mca: interface version mismatch", -EACCES);
+
+       switch (op->cmd) {
        case XEN_MC_fetch:
-               /* This hypercall is for any domain */
                mc_fetch = &op->u.mc_fetch;
-
-               switch (mc_fetch->flags) {
-               case XEN_MC_CORRECTABLE:
-                       /* But polling mode is Dom0 only, because
-                        * correctable errors are reported to Dom0 only */
-                       if ( !IS_PRIV(v->domain) )
-                               return -EPERM;
+               cmdflags = mc_fetch->flags;
+
+               /* This hypercall is for Dom0 only */
+               if (!IS_PRIV(v->domain) )
+                       return x86_mcerr(NULL, -EPERM);
+
+               switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) {
+               case XEN_MC_NONURGENT:
+                       which = MC_NONURGENT;
                        break;
 
-               case XEN_MC_TRAP:
+               case XEN_MC_URGENT:
+                       which = MC_URGENT;
                        break;
+
                default:
-                       return -EFAULT;
+                       return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
                }
 
                flags = XEN_MC_OK;
-               spin_lock(&mc_lock);
-
-               if ( IS_PRIV(v->domain) ) {
-                       /* this must be Dom0. So a notify hypercall
-                        * can't have happened before. */
-                       mi = x86_mcinfo_getfetchptr(&fetch_idx, dom0, v);
+
+               if (cmdflags & XEN_MC_ACK) {
+                       mctelem_cookie_t cookie = ID2COOKIE(mc_fetch->fetch_id);
+                       mctelem_ack(which, cookie);
                } else {
-                       /* Hypercall comes from an unprivileged domain */
-                       domU = v->domain;
-                       if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
-                               /* Dom0 must have notified this DomU before
-                                * via the notify hypercall. */
-                               mi = x86_mcinfo_getnotifiedptr(&fetch_idx, domU, v);
+                       if (guest_handle_is_null(mc_fetch->data))
+                               return x86_mcerr("do_mca fetch: guest buffer "
+                                   "invalid", -EINVAL);
+
+                       if ((mctc = mctelem_consume_oldest_begin(which))) {
+                               struct mc_info *mcip = mctelem_dataptr(mctc);
+                               if (copy_to_guest(mc_fetch->data, mcip, 1)) {
+                                       ret = -EFAULT;
+                                       flags |= XEN_MC_FETCHFAILED;
+                                       mc_fetch->fetch_id = 0;
+                               } else {
+                                       mc_fetch->fetch_id = COOKIE2ID(mctc);
+                               }
+                               mctelem_consume_oldest_end(mctc);
                        } else {
-                               /* Xen notified the DomU. */
-                               mi = x86_mcinfo_getfetchptr(&fetch_idx, domU, v);
+                               /* There is no data */
+                               flags |= XEN_MC_NODATA;
+                               mc_fetch->fetch_id = 0;
                        }
-               }
-
-               if (mi) {
-                       memcpy(&mc_fetch->mc_info, mi,
-                               sizeof(struct mc_info));
-               } else {
-                       /* There is no data for a bogus DomU command. */
-                       flags |= XEN_MC_NODATA;
-                       memset(&mc_fetch->mc_info, 0, sizeof(struct mc_info));
-               }
-
-               mc_fetch->flags = flags;
-               mc_fetch->fetch_idx = fetch_idx;
-
-               if ( copy_to_guest(u_xen_mc, op, 1) )
-                       ret = -EFAULT;
-
-               spin_unlock(&mc_lock);
+
+                       mc_fetch->flags = flags;
+                       if (copy_to_guest(u_xen_mc, op, 1) != 0)
+                               ret = -EFAULT;
+               }
+
                break;
 
        case XEN_MC_notifydomain:
-               /* This hypercall is for Dom0 only */
+               return x86_mcerr("do_mca notify unsupported", -EINVAL);
+
+       case XEN_MC_physcpuinfo:
                if ( !IS_PRIV(v->domain) )
-                       return -EPERM;
-
-               spin_lock(&mc_notify_lock);
-
-               mc_notifydomain = &op->u.mc_notifydomain;
-               domU = get_domain_by_id(mc_notifydomain->mc_domid);
-               vcpuid = mc_notifydomain->mc_vcpuid;
-
-               if ((domU == NULL) || (domU == dom0)) {
-                       /* It's not possible to notify a non-existent domain
-                        * or the dom0. */
-                       spin_unlock(&mc_notify_lock);
-                       return -EACCES;
-               }
-
-               if (vcpuid >= MAX_VIRT_CPUS) {
-                       /* It's not possible to notify a vcpu, Xen can't
-                        * assign to a domain. */
-                       spin_unlock(&mc_notify_lock);
-                       return -EACCES;
-               }
-
-               mc_notifydomain->flags = XEN_MC_OK;
-
-               mi = &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx));
-               if (!x86_mcinfo_matches_guest(mi, domU, domU->vcpu[vcpuid])) {
-                       /* The error telemetry is not for the guest, Dom0
-                        * wants to notify. */
-                       mc_notifydomain->flags |= XEN_MC_NOMATCH;
-               } else if ( guest_has_trap_callback(domU, vcpuid,
-                                               TRAP_machine_check) )
-               {
-                       /* Send notification */
-                       if ( send_guest_trap(domU, vcpuid, TRAP_machine_check) )
-                               mc_notifydomain->flags |= XEN_MC_NOTDELIVERED;
-               } else
-                       mc_notifydomain->flags |= XEN_MC_CANNOTHANDLE;
-
-#ifdef DEBUG
-               /* sanity check - these two flags are mutually exclusive */
-               if ((flags & XEN_MC_CANNOTHANDLE) && (flags & XEN_MC_NOTDELIVERED))
-                       BUG();
-#endif
-
-               if ( copy_to_guest(u_xen_mc, op, 1) )
-                       ret = -EFAULT;
-
-               if (ret == 0) {
-                       x86_mcinfo_marknotified(mc_notifydomain);
-               }
-
-               spin_unlock(&mc_notify_lock);
-               break;
-
-       case XEN_MC_physcpuinfo:
-              if ( !IS_PRIV(v->domain) )
-                      return -EPERM;
- 
-              mc_physcpuinfo = &op->u.mc_physcpuinfo;
-              nlcpu = num_online_cpus();
- 
-              if (!guest_handle_is_null(mc_physcpuinfo->info)) {
-                      if (mc_physcpuinfo->ncpus <= 0)
-                              return -EINVAL;
-                      nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus);
-                      log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
-                      if (log_cpus == NULL)
-                              return -ENOMEM;
- 
-                      if (on_each_cpu(do_mc_get_cpu_info, log_cpus,
-                          1, 1) != 0) {
-                              xfree(log_cpus);
-                              return -EIO;
-                      }
-              }
- 
-              mc_physcpuinfo->ncpus = nlcpu;
- 
-              if (copy_to_guest(u_xen_mc, op, 1)) {
-                      if (log_cpus != NULL)
-                              xfree(log_cpus);
-                      return -EFAULT;
-              }
- 
-              if (!guest_handle_is_null(mc_physcpuinfo->info)) {
-                      if (copy_to_guest(mc_physcpuinfo->info,
-                          log_cpus, nlcpu))
-                              ret = -EFAULT;
-                      xfree(log_cpus);
-              }
+                       return x86_mcerr("do_mca cpuinfo", -EPERM);
+
+               mc_physcpuinfo = &op->u.mc_physcpuinfo;
+               nlcpu = num_online_cpus();
+
+               if (!guest_handle_is_null(mc_physcpuinfo->info)) {
+                       if (mc_physcpuinfo->ncpus <= 0)
+                               return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
+                                   -EINVAL);
+                       nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus);
+                       log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
+                       if (log_cpus == NULL)
+                               return x86_mcerr("do_mca cpuinfo", -ENOMEM);
+
+                       if (on_each_cpu(do_mc_get_cpu_info, log_cpus,
+                           1, 1) != 0) {
+                               xfree(log_cpus);
+                               return x86_mcerr("do_mca cpuinfo", -EIO);
+                       }
+               }
+
+               mc_physcpuinfo->ncpus = nlcpu;
+
+               if (copy_to_guest(u_xen_mc, op, 1)) {
+                       if (log_cpus != NULL)
+                               xfree(log_cpus);
+                       return x86_mcerr("do_mca cpuinfo", -EFAULT);
+               }
+
+               if (!guest_handle_is_null(mc_physcpuinfo->info)) {
+                       if (copy_to_guest(mc_physcpuinfo->info,
+                           log_cpus, nlcpu))
+                               ret = -EFAULT;
+                       xfree(log_cpus);
+               }
+               break;
+
+       default:
+               return x86_mcerr("do_mca: bad command", -EINVAL);
        }
 
        return ret;
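
For reference, a minimal sketch of how a dom0 agent might drive the reworked
fetch/ack cycle above.  do_xen_mca_hypercall() and buf are hypothetical
stand-ins; the real path for issuing __HYPERVISOR_mca from dom0
(privcmd/libxc) is not part of this changeset.

    struct xen_mc mc;

    memset(&mc, 0, sizeof(mc));
    mc.interface_version = XEN_MCA_INTERFACE_VERSION;
    mc.cmd = XEN_MC_fetch;
    mc.u.mc_fetch.flags = XEN_MC_URGENT;            /* or XEN_MC_NONURGENT */
    set_xen_guest_handle(mc.u.mc_fetch.data, buf);  /* buf: struct mc_info * */

    if (do_xen_mca_hypercall(&mc) == 0 &&
        !(mc.u.mc_fetch.flags & (XEN_MC_NODATA | XEN_MC_FETCHFAILED))) {
            uint64_t id = mc.u.mc_fetch.fetch_id;

            /* ... parse the struct mc_info now in buf ... */

            /* Tell Xen the telemetry is safely stored so the entry can be freed. */
            mc.u.mc_fetch.flags = XEN_MC_URGENT | XEN_MC_ACK;
            mc.u.mc_fetch.fetch_id = id;
            (void) do_xen_mca_hypercall(&mc);
    }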
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h     Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/mce.h     Tue Mar 17 14:22:50 2009 +0000
@@ -1,38 +1,98 @@
+#ifndef _MCE_H
+
+#define _MCE_H
+
 #include <xen/init.h>
+#include <xen/smp.h>
 #include <asm/types.h>
 #include <asm/traps.h>
 #include <asm/atomic.h>
 #include <asm/percpu.h>
 
+#include "x86_mca.h"
+#include "mctelem.h"
 
 /* Init functions */
-void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
-void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
-void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
-void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
+int amd_k7_mcheck_init(struct cpuinfo_x86 *c);
+int amd_k8_mcheck_init(struct cpuinfo_x86 *c);
+int amd_f10_mcheck_init(struct cpuinfo_x86 *c);
 
+int intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+int winchip_mcheck_init(struct cpuinfo_x86 *c);
+int intel_mcheck_init(struct cpuinfo_x86 *c);
 
 void intel_mcheck_timer(struct cpuinfo_x86 *c);
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
-void intel_mcheck_init(struct cpuinfo_x86 *c);
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
-
-void winchip_mcheck_init(struct cpuinfo_x86 *c);
-
-/* Function pointer used in the handlers to collect additional information
- * provided by newer CPU families/models without the need to duplicate
- * the whole handler resulting in various handlers each with its own
- * tweaks and bugs */
-extern int (*mc_callback_bank_extended)(struct mc_info *mi,
-               uint16_t bank, uint64_t status);
-
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
 
 int mce_available(struct cpuinfo_x86 *c);
+int mce_firstbank(struct cpuinfo_x86 *c);
 /* Helper functions used for collecting error telemetry */
 struct mc_info *x86_mcinfo_getptr(void);
-void x86_mcinfo_clear(struct mc_info *mi);
-int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
-void x86_mcinfo_dump(struct mc_info *mi);
 void mc_panic(char *s);
 void x86_mc_get_cpu_info(unsigned, uint32_t *, uint16_t *, uint16_t *,
                         uint32_t *, uint32_t *, uint32_t *, uint32_t *);
+
+
+/* Register a handler for machine check exceptions. */
+typedef void (*x86_mce_vector_t)(struct cpu_user_regs *, long);
+extern void x86_mce_vector_register(x86_mce_vector_t);
+
+/* Common generic MCE handler that implementations may nominate
+ * via x86_mce_vector_register. */
+extern void mcheck_cmn_handler(struct cpu_user_regs *, long, cpu_banks_t);
+
+/* Utility function to "logout" all architectural MCA telemetry from the MCA
+ * banks of the current processor.  A cookie is returned which may be
+ * used to reference the data so logged (the cookie can be NULL if
+ * no logout structures were available).  The caller can also pass a pointer
+ * to a structure which will be completed with some summary information
+ * of the MCA data observed in the logout operation. */
+
+enum mca_source {
+       MCA_MCE_HANDLER,
+       MCA_POLLER,
+       MCA_CMCI_HANDLER,
+       MCA_RESET
+};
+
+enum mca_extinfo {
+       MCA_EXTINFO_LOCAL,
+       MCA_EXTINFO_GLOBAL,
+       MCA_EXTINFO_IGNORED
+};
+
+struct mca_summary {
+       uint32_t        errcnt; /* number of banks with valid errors */
+       int             ripv;   /* meaningful on #MC */
+       int             eipv;   /* meaningful on #MC */
+       uint32_t        uc;     /* bitmask of banks with UC */
+       uint32_t        pcc;    /* bitmask of banks with PCC */
+};
+
+extern cpu_banks_t mca_allbanks;
+
+extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, cpu_banks_t,
+    struct mca_summary *);
+
+/* Register a callback to be made during bank telemetry logout.
+ * This callback is only available to those machine check handlers
+ * that call to the common mcheck_cmn_handler or who use the common
+ * telemetry logout function mcheck_mca_logout in error polling.
+ *
+ * This can be used to collect additional information (typically non-
+ * architectural) provided by newer CPU families/models without the need
+ * to duplicate the whole handler resulting in various handlers each with
+ * its own tweaks and bugs.  The callback receives a struct mc_info pointer
+ * which it can use with x86_mcinfo_add to add additional telemetry,
+ * the current MCA bank number we are reading telemetry from, and the
+ * MCi_STATUS value for that bank.
+ */
+typedef enum mca_extinfo (*x86_mce_callback_t)
+    (struct mc_info *, uint16_t, uint64_t);
+extern void x86_mce_callback_register(x86_mce_callback_t);
+
+int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
+void x86_mcinfo_dump(struct mc_info *mi);
+
+#endif /* _MCE_H */
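
The registration interfaces above replace the old machine_check_vector and
mc_callback_bank_extended globals.  As a rough sketch of the pattern (the
my_* names and the bank number are hypothetical, only the registration flow
matters), a new family/model handler would plug in like this:

    static enum mca_extinfo
    my_extinfo_callback(struct mc_info *mi, uint16_t bank, uint64_t status)
    {
            if (mi == NULL || bank != 4 /* example bank of interest */)
                    return MCA_EXTINFO_IGNORED;
            if (!(status & MCi_STATUS_VAL))
                    return MCA_EXTINFO_IGNORED;
            /* rdmsrl() any extra MSRs and x86_mcinfo_add(mi, ...) them here. */
            return MCA_EXTINFO_LOCAL;
    }

    static void my_machine_check(struct cpu_user_regs *regs, long error_code)
    {
            mcheck_cmn_handler(regs, error_code, mca_allbanks);
    }

    int my_mcheck_init(struct cpuinfo_x86 *c)
    {
            x86_mce_vector_register(my_machine_check);
            x86_mce_callback_register(my_extinfo_callback);
            return 1;
    }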
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Mar 17 14:22:50 2009 +0000
@@ -14,6 +14,7 @@ DEFINE_PER_CPU(cpu_banks_t, mce_banks_ow
 
 static int nr_intel_ext_msrs = 0;
 static int cmci_support = 0;
+static int firstbank;
 
 #ifdef CONFIG_X86_MCE_THERMAL
 static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
@@ -115,222 +116,51 @@ static void intel_init_thermal(struct cp
 }
 #endif /* CONFIG_X86_MCE_THERMAL */
 
-static inline void intel_get_extended_msrs(struct mcinfo_extended *mc_ext)
-{
-    if (nr_intel_ext_msrs == 0)
-        return;
+static enum mca_extinfo
+intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
+{
+    struct mcinfo_extended mc_ext;
+
+    if (mci == NULL || nr_intel_ext_msrs == 0 || !(status & MCG_STATUS_EIPV))
+        return MCA_EXTINFO_IGNORED;
 
     /* this function will called when CAP(9).MCG_EXT_P = 1 */
-    memset(mc_ext, 0, sizeof(struct mcinfo_extended));
-    mc_ext->common.type = MC_TYPE_EXTENDED;
-    mc_ext->common.size = sizeof(mc_ext);
-    mc_ext->mc_msrs = 10;
-
-    mc_ext->mc_msr[0].reg = MSR_IA32_MCG_EAX;
-    rdmsrl(MSR_IA32_MCG_EAX, mc_ext->mc_msr[0].value);
-    mc_ext->mc_msr[1].reg = MSR_IA32_MCG_EBX;
-    rdmsrl(MSR_IA32_MCG_EBX, mc_ext->mc_msr[1].value);
-    mc_ext->mc_msr[2].reg = MSR_IA32_MCG_ECX;
-    rdmsrl(MSR_IA32_MCG_ECX, mc_ext->mc_msr[2].value);
-
-    mc_ext->mc_msr[3].reg = MSR_IA32_MCG_EDX;
-    rdmsrl(MSR_IA32_MCG_EDX, mc_ext->mc_msr[3].value);
-    mc_ext->mc_msr[4].reg = MSR_IA32_MCG_ESI;
-    rdmsrl(MSR_IA32_MCG_ESI, mc_ext->mc_msr[4].value);
-    mc_ext->mc_msr[5].reg = MSR_IA32_MCG_EDI;
-    rdmsrl(MSR_IA32_MCG_EDI, mc_ext->mc_msr[5].value);
-
-    mc_ext->mc_msr[6].reg = MSR_IA32_MCG_EBP;
-    rdmsrl(MSR_IA32_MCG_EBP, mc_ext->mc_msr[6].value);
-    mc_ext->mc_msr[7].reg = MSR_IA32_MCG_ESP;
-    rdmsrl(MSR_IA32_MCG_ESP, mc_ext->mc_msr[7].value);
-    mc_ext->mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
-    rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext->mc_msr[8].value);
-    mc_ext->mc_msr[9].reg = MSR_IA32_MCG_EIP;
-    rdmsrl(MSR_IA32_MCG_EIP, mc_ext->mc_msr[9].value);
-}
-
-/* machine_check_poll might be called by following types:
- * 1. called when do mcheck_init.
- * 2. called in cmci interrupt handler
- * 3. called in polling handler
- * It will generate a new mc_info item if found CE/UC errors. DOM0 is the 
- * consumer.
- */
-static struct mc_info *machine_check_poll(int calltype)
-{
-    struct mc_info *mi = NULL;
-    int exceptions = (read_cr4() & X86_CR4_MCE);
-    int i, nr_unit = 0, uc = 0, pcc = 0;
-    uint64_t status, addr;
-    struct mcinfo_global mcg;
-    struct mcinfo_extended mce;
-    unsigned int cpu;
-    struct domain *d;
-
-    cpu = smp_processor_id();
-
-    memset(&mcg, 0, sizeof(mcg));
-    mcg.common.type = MC_TYPE_GLOBAL;
-    mcg.common.size = sizeof(mcg);
-    /* If called from cpu-reset check, don't need to fill them.
-     * If called from cmci context, we'll try to fill domid by memory addr
-     */
-    mcg.mc_domid = -1;
-    mcg.mc_vcpuid = -1;
-    if (calltype == MC_FLAG_POLLED || calltype == MC_FLAG_RESET)
-        mcg.mc_flags = MC_FLAG_POLLED;
-    else if (calltype == MC_FLAG_CMCI)
-        mcg.mc_flags = MC_FLAG_CMCI;
-    x86_mc_get_cpu_info(
-        cpu, &mcg.mc_socketid, &mcg.mc_coreid,
-        &mcg.mc_core_threadid, &mcg.mc_apicid, NULL, NULL, NULL);
-    rdmsrl(MSR_IA32_MCG_STATUS, mcg.mc_gstatus);
-
-    for ( i = 0; i < nr_mce_banks; i++ ) {
-        struct mcinfo_bank mcb;
-        /* For CMCI, only owners checks the owned MSRs */
-        if ( !test_bit(i, __get_cpu_var(mce_banks_owned)) &&
-             (calltype & MC_FLAG_CMCI) )
-            continue;
-        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
-
-        if (! (status & MCi_STATUS_VAL) )
-            continue;
-        /*
-         * Uncorrected events are handled by the exception
-         * handler when it is enabled. But when the exception
-         * is disabled such as when mcheck_init, log everything.
-         */
-        if ((status & MCi_STATUS_UC) && exceptions)
-            continue;
-
-        if (status & MCi_STATUS_UC)
-            uc = 1;
-        if (status & MCi_STATUS_PCC)
-            pcc = 1;
-
-        if (!mi) {
-            mi = x86_mcinfo_getptr();
-            if (!mi) {
-                printk(KERN_ERR "mcheck_poll: Failed to get mc_info entry\n");
-                return NULL;
-            }
-            x86_mcinfo_clear(mi);
-        }
-        memset(&mcb, 0, sizeof(mcb));
-        mcb.common.type = MC_TYPE_BANK;
-        mcb.common.size = sizeof(mcb);
-        mcb.mc_bank = i;
-        mcb.mc_status = status;
-        if (status & MCi_STATUS_MISCV)
-            rdmsrl(MSR_IA32_MC0_MISC + 4 * i, mcb.mc_misc);
-        if (status & MCi_STATUS_ADDRV) {
-            rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
-            d = maddr_get_owner(addr);
-        if ( d && (calltype == MC_FLAG_CMCI || calltype == MC_FLAG_POLLED) )
-                mcb.mc_domid = d->domain_id;
-        }
-        if (cmci_support)
-            rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
-        if (calltype == MC_FLAG_CMCI)
-            rdtscll(mcb.mc_tsc);
-        x86_mcinfo_add(mi, &mcb);
-        nr_unit++;
-        add_taint(TAINT_MACHINE_CHECK);
-        /* Clear state for this bank */
-        wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0);
-        printk(KERN_DEBUG "mcheck_poll: bank%i CPU%d status[%"PRIx64"]\n", 
-                i, cpu, status);
-        printk(KERN_DEBUG "mcheck_poll: CPU%d, SOCKET%d, CORE%d, APICID[%d], "
-                "thread[%d]\n", cpu, mcg.mc_socketid, 
-                mcg.mc_coreid, mcg.mc_apicid, mcg.mc_core_threadid);
- 
-    }
-    /* if pcc = 1, uc must be 1 */
-    if (pcc)
-        mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
-    else if (uc)
-        mcg.mc_flags |= MC_FLAG_RECOVERABLE;
-    else /* correctable */
-        mcg.mc_flags |= MC_FLAG_CORRECTABLE;
-
-    if (nr_unit && nr_intel_ext_msrs && 
-                    (mcg.mc_gstatus & MCG_STATUS_EIPV)) {
-        intel_get_extended_msrs(&mce);
-        x86_mcinfo_add(mi, &mce);
-    }
-    if (nr_unit) 
-        x86_mcinfo_add(mi, &mcg);
-    /* Clear global state */
-    return mi;
-}
-
-static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code)
-{
-    /* MACHINE CHECK Error handler will be sent in another patch,
-     * simply copy old solutions here. This code will be replaced
-     * by upcoming machine check patches
-     */
-
-    int recover=1;
-    u32 alow, ahigh, high, low;
-    u32 mcgstl, mcgsth;
-    int i;
-   
-    rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
-    if (mcgstl & (1<<0))       /* Recoverable ? */
-        recover=0;
-    
-    printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
-           smp_processor_id(), mcgsth, mcgstl);
-    
-    for (i=0; i<nr_mce_banks; i++) {
-        rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
-        if (high & (1<<31)) {
-            if (high & (1<<29))
-                recover |= 1;
-            if (high & (1<<25))
-                recover |= 2;
-            printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
-            high &= ~(1<<31);
-            if (high & (1<<27)) {
-                rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
-                printk ("[%08x%08x]", ahigh, alow);
-            }
-            if (high & (1<<26)) {
-                rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
-                printk (" at %08x%08x", ahigh, alow);
-            }
-            printk ("\n");
-        }
-    }
-    
-    if (recover & 2)
-        mc_panic ("CPU context corrupt");
-    if (recover & 1)
-        mc_panic ("Unable to continue");
-    
-    printk(KERN_EMERG "Attempting to continue.\n");
-    /* 
-     * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
-     * recoverable/continuable.This will allow BIOS to look at the MSRs
-     * for errors if the OS could not log the error.
-     */
-    for (i=0; i<nr_mce_banks; i++) {
-        u32 msr;
-        msr = MSR_IA32_MC0_STATUS+i*4;
-        rdmsr (msr, low, high);
-        if (high&(1<<31)) {
-            /* Clear it */
-            wrmsr(msr, 0UL, 0UL);
-            /* Serialize */
-            wmb();
-            add_taint(TAINT_MACHINE_CHECK);
-        }
-    }
-    mcgstl &= ~(1<<2);
-    wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+    memset(&mc_ext, 0, sizeof(struct mcinfo_extended));
+    mc_ext.common.type = MC_TYPE_EXTENDED;
+    mc_ext.common.size = sizeof(mc_ext);
+    mc_ext.mc_msrs = 10;
+
+    mc_ext.mc_msr[0].reg = MSR_IA32_MCG_EAX;
+    rdmsrl(MSR_IA32_MCG_EAX, mc_ext.mc_msr[0].value);
+    mc_ext.mc_msr[1].reg = MSR_IA32_MCG_EBX;
+    rdmsrl(MSR_IA32_MCG_EBX, mc_ext.mc_msr[1].value);
+    mc_ext.mc_msr[2].reg = MSR_IA32_MCG_ECX;
+    rdmsrl(MSR_IA32_MCG_ECX, mc_ext.mc_msr[2].value);
+
+    mc_ext.mc_msr[3].reg = MSR_IA32_MCG_EDX;
+    rdmsrl(MSR_IA32_MCG_EDX, mc_ext.mc_msr[3].value);
+    mc_ext.mc_msr[4].reg = MSR_IA32_MCG_ESI;
+    rdmsrl(MSR_IA32_MCG_ESI, mc_ext.mc_msr[4].value);
+    mc_ext.mc_msr[5].reg = MSR_IA32_MCG_EDI;
+    rdmsrl(MSR_IA32_MCG_EDI, mc_ext.mc_msr[5].value);
+
+    mc_ext.mc_msr[6].reg = MSR_IA32_MCG_EBP;
+    rdmsrl(MSR_IA32_MCG_EBP, mc_ext.mc_msr[6].value);
+    mc_ext.mc_msr[7].reg = MSR_IA32_MCG_ESP;
+    rdmsrl(MSR_IA32_MCG_ESP, mc_ext.mc_msr[7].value);
+    mc_ext.mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
+    rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext.mc_msr[8].value);
+    mc_ext.mc_msr[9].reg = MSR_IA32_MCG_EIP;
+    rdmsrl(MSR_IA32_MCG_EIP, mc_ext.mc_msr[9].value);
+
+    x86_mcinfo_add(mci, &mc_ext);
+
+    return MCA_EXTINFO_GLOBAL;
+}
+
+static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
+{
+       mcheck_cmn_handler(regs, error_code, mca_allbanks);
 }
 
 static DEFINE_SPINLOCK(cmci_discover_lock);
@@ -369,6 +199,8 @@ static void cmci_discover(void)
     unsigned long flags;
     int i;
     struct mc_info *mi = NULL;
+    mctelem_cookie_t mctc;
+    struct mca_summary bs;
 
     printk(KERN_DEBUG "CMCI: find owner on CPU%d\n", smp_processor_id());
 
@@ -385,12 +217,20 @@ static void cmci_discover(void)
      * MCi_status (error_count bit 38~52) is not cleared,
      * the CMCI interrupt will never be triggered again.
      */
-    mi = machine_check_poll(MC_FLAG_CMCI);
-    if (mi) {
-        x86_mcinfo_dump(mi);
-        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
+
+    mctc = mcheck_mca_logout(
+        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
+
+    if (bs.errcnt && mctc != NULL) {
+        if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+            mctelem_commit(mctc);
             send_guest_global_virq(dom0, VIRQ_MCA);
-    }
+        } else {
+            x86_mcinfo_dump(mctelem_dataptr(mctc));
+            mctelem_dismiss(mctc);
+       }
+    } else if (mctc != NULL)
+        mctelem_dismiss(mctc);
 
     printk(KERN_DEBUG "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n", 
            smp_processor_id(), 
@@ -487,17 +327,26 @@ fastcall void smp_cmci_interrupt(struct 
 fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
 {
     struct mc_info *mi = NULL;
-    int cpu = smp_processor_id();
+    mctelem_cookie_t mctc;
+    struct mca_summary bs;
 
     ack_APIC_irq();
     irq_enter();
-    printk(KERN_DEBUG "CMCI: cmci_intr happen on CPU%d\n", cpu);
-    mi = machine_check_poll(MC_FLAG_CMCI);
-    if (mi) {
-        x86_mcinfo_dump(mi);
-        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
+
+    mctc = mcheck_mca_logout(
+        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
+
+    if (bs.errcnt && mctc != NULL) {
+        if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+            mctelem_commit(mctc);
             send_guest_global_virq(dom0, VIRQ_MCA);
-    }
+        } else {
+            x86_mcinfo_dump(mctelem_dataptr(mctc));
+            mctelem_dismiss(mctc);
+       }
+    } else if (mctc != NULL)
+        mctelem_dismiss(mctc);
+
     irq_exit();
 }
 
@@ -527,28 +376,28 @@ static void mce_cap_init(struct cpuinfo_
         printk (KERN_INFO "CPU%d: Intel Extended MCE MSRs (%d) available\n",
             smp_processor_id(), nr_intel_ext_msrs);
     }
-    /* for most of p6 family, bank 0 is an alias bios MSR.
-     * But after model>1a, bank 0 is available*/
-    if ( c->x86 == 6 && c->x86_vendor == X86_VENDOR_INTEL
-            && c->x86_model < 0x1A)
-        firstbank = 1;
-    else
-        firstbank = 0;
+    firstbank = mce_firstbank(c);
 }
 
 static void mce_init(void)
 {
     u32 l, h;
     int i;
-    struct mc_info *mi;
+    mctelem_cookie_t mctc;
+    struct mca_summary bs;
+
     clear_in_cr4(X86_CR4_MCE);
+
     /* log the machine checks left over from the previous reset.
      * This also clears all registers*/
 
-    mi = machine_check_poll(MC_FLAG_RESET);
+    mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs);
+
     /* in the boot up stage, don't inject to DOM0, but print out */
-    if (mi)
-        x86_mcinfo_dump(mi);
+    if (bs.errcnt && mctc != NULL) {
+        x86_mcinfo_dump(mctelem_dataptr(mctc));
+        mctelem_dismiss(mctc);
+    }
 
     set_in_cr4(X86_CR4_MCE);
     rdmsr (MSR_IA32_MCG_CAP, l, h);
@@ -573,71 +422,19 @@ static void mce_init(void)
 }
 
 /* p4/p6 family have similar MCA initialization process */
-void intel_mcheck_init(struct cpuinfo_x86 *c)
+int intel_mcheck_init(struct cpuinfo_x86 *c)
 {
     mce_cap_init(c);
     printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
             smp_processor_id());
+
     /* machine check is available */
-    machine_check_vector = intel_machine_check;
+    x86_mce_vector_register(intel_machine_check);
+    x86_mce_callback_register(intel_get_extended_msrs);
+
     mce_init();
     mce_intel_feature_init(c);
     mce_set_owner();
-}
-
-/*
- * Periodic polling timer for "silent" machine check errors. If the
- * poller finds an MCE, poll faster. When the poller finds no more 
- * errors, poll slower
-*/
-static struct timer mce_timer;
-
-#define MCE_PERIOD 4000
-#define MCE_MIN    2000
-#define MCE_MAX    32000
-
-static u64 period = MCE_PERIOD;
-static int adjust = 0;
-
-static void mce_intel_checkregs(void *info)
-{
-    struct mc_info *mi;
-
-    if( !mce_available(&current_cpu_data))
-        return;
-    mi = machine_check_poll(MC_FLAG_POLLED);
-    if (mi)
-    {
-        x86_mcinfo_dump(mi);
-        adjust++;
-        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
-            send_guest_global_virq(dom0, VIRQ_MCA);
-    }
-}
-
-static void mce_intel_work_fn(void *data)
-{
-    on_each_cpu(mce_intel_checkregs, data, 1, 1);
-    if (adjust) {
-        period = period / (adjust + 1);
-        printk(KERN_DEBUG "mcheck_poll: Find error, shorten interval "
-               "to %"PRIu64"\n", period);
-    }
-    else {
-        period *= 2;
-    }
-    if (period > MCE_MAX) 
-        period = MCE_MAX;
-    if (period < MCE_MIN)
-        period = MCE_MIN;
-    set_timer(&mce_timer, NOW() + MILLISECS(period));
-    adjust = 0;
-}
-
-void intel_mcheck_timer(struct cpuinfo_x86 *c)
-{
-    printk(KERN_DEBUG "mcheck_poll: Init_mcheck_timer\n");
-    init_timer(&mce_timer, mce_intel_work_fn, NULL, 0);
-    set_timer(&mce_timer, NOW() + MILLISECS(MCE_PERIOD));
-}
-
+
+    return 1;
+}
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/mctelem.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/mctelem.c Tue Mar 17 14:22:50 2009 +0000
@@ -0,0 +1,443 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+/*
+ * mctelem.c - x86 Machine Check Telemetry Transport
+ */
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/config.h>
+#include <xen/smp.h>
+#include <xen/errno.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/cpumask.h>
+#include <xen/event.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+
+struct mctelem_ent {
+       struct mctelem_ent *mcte_next;  /* next in chronological order */
+       struct mctelem_ent *mcte_prev;  /* previous in chronological order */
+       uint32_t mcte_flags;            /* See MCTE_F_* below */
+       uint32_t mcte_refcnt;           /* Reference count */
+       void *mcte_data;                /* corresponding data payload */
+};
+
+#define        MCTE_F_HOME_URGENT              0x0001U /* free to urgent freelist */
+#define        MCTE_F_HOME_NONURGENT           0x0002U /* free to nonurgent freelist */
+#define        MCTE_F_CLASS_URGENT             0x0004U /* in use - urgent errors */
+#define        MCTE_F_CLASS_NONURGENT          0x0008U /* in use - nonurgent errors */
+#define        MCTE_F_STATE_FREE               0x0010U /* on a freelist */
+#define        MCTE_F_STATE_UNCOMMITTED        0x0020U /* reserved; on no list */
+#define        MCTE_F_STATE_COMMITTED          0x0040U /* on a committed list */
+#define        MCTE_F_STATE_PROCESSING         0x0080U /* on a processing list */
+
+#define        MCTE_F_MASK_HOME        (MCTE_F_HOME_URGENT | MCTE_F_HOME_NONURGENT)
+#define        MCTE_F_MASK_CLASS       (MCTE_F_CLASS_URGENT | MCTE_F_CLASS_NONURGENT)
+#define        MCTE_F_MASK_STATE       (MCTE_F_STATE_FREE | \
+                               MCTE_F_STATE_UNCOMMITTED | \
+                               MCTE_F_STATE_COMMITTED | \
+                               MCTE_F_STATE_PROCESSING)
+
+#define        MCTE_HOME(tep) ((tep)->mcte_flags & MCTE_F_MASK_HOME)
+
+#define        MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS)
+#define        MCTE_SET_CLASS(tep, new) do { \
+    (tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \
+    (tep)->mcte_flags |= MCTE_F_CLASS_##new; } while (0)
+
+#define        MCTE_STATE(tep) ((tep)->mcte_flags & MCTE_F_MASK_STATE)
+#define        MCTE_TRANSITION_STATE(tep, old, new) do { \
+    BUG_ON(MCTE_STATE(tep) != (MCTE_F_STATE_##old)); \
+    (tep)->mcte_flags &= ~MCTE_F_MASK_STATE; \
+    (tep)->mcte_flags |= (MCTE_F_STATE_##new); } while (0)
+
+#define        MC_URGENT_NENT          10
+#define        MC_NONURGENT_NENT       20
+
+#define        MC_NCLASSES             (MC_NONURGENT + 1)
+
+#define        COOKIE2MCTE(c)          ((struct mctelem_ent *)(c))
+#define        MCTE2COOKIE(tep)        ((mctelem_cookie_t)(tep))
+
+static struct mc_telem_ctl {
+       /* Linked lists that thread the array members together.
+        *
+        * The free lists are singly-linked via mcte_next, and we allocate
+        * from them by atomically unlinking an element from the head.
+        * Consumed entries are returned to the head of the free list.
+        * When an entry is reserved off the free list it is not linked
+        * on any list until it is committed or dismissed.
+        *
+        * The committed list grows at the head and we do not maintain a
+        * tail pointer; insertions are performed atomically.  The head
+        * thus has the most-recently committed telemetry, i.e. the
+        * list is in reverse chronological order.  The committed list
+        * is singly-linked via mcte_prev pointers, and mcte_next is NULL.
+        * When we move telemetry from the committed list to the processing
+        * list we atomically unlink the committed list and keep a pointer
+        * to the head of that list;  we then traverse the list following
+        * mcte_prev and fill in mcte_next to doubly-link the list, and then
+        * append the tail of the list onto the processing list.  If we panic
+        * during this manipulation of the committed list we still have
+        * the pointer to its head so we can recover all entries during
+        * the panic flow (albeit in reverse chronological order).
+        *
+        * The processing list is updated in a controlled context, and
+        * we can lock it for updates.  The head of the processing list
+        * always has the oldest telemetry, and we append (as above)
+        * at the tail of the processing list. */
+       struct mctelem_ent *mctc_free[MC_NCLASSES];
+       struct mctelem_ent *mctc_committed[MC_NCLASSES];
+       struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
+       struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
+       /*
+        * Telemetry array
+        */
+       struct mctelem_ent *mctc_elems;
+} mctctl;
+
+/* Lock protecting all processing lists */
+static DEFINE_SPINLOCK(processing_lock);
+
+static void *cmpxchgptr(void *ptr, void *old, void *new)
+{
+       unsigned long *ulp = (unsigned long *)ptr;
+       unsigned long a = (unsigned long)old;
+       unsigned long b = (unsigned long)new;
+
+       return (void *)cmpxchg(ulp, a, b);
+}
+
+/* Free an entry to its native free list; the entry must not be linked on
+ * any list.
+ */
+static void mctelem_free(struct mctelem_ent *tep)
+{
+       mctelem_class_t target = MCTE_HOME(tep) == MCTE_F_HOME_URGENT ?
+           MC_URGENT : MC_NONURGENT;
+       struct mctelem_ent **freelp;
+       struct mctelem_ent *oldhead;
+
+       BUG_ON(tep->mcte_refcnt != 0);
+       BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);
+
+       tep->mcte_prev = NULL;
+       freelp = &mctctl.mctc_free[target];
+       for (;;) {
+               oldhead = *freelp;
+               tep->mcte_next = oldhead;
+               wmb();
+               if (cmpxchgptr(freelp, oldhead, tep) == oldhead)
+                       break;
+       }
+}
+
+/* Increment the reference count of an entry that is not linked on to
+ * any list and which only the caller has a pointer to.
+ */
+static void mctelem_hold(struct mctelem_ent *tep)
+{
+       tep->mcte_refcnt++;
+}
+
+/* Increment the reference count on an entry that is linked at the head of
+ * a processing list.  The caller is responsible for locking the list.
+ */
+static void mctelem_processing_hold(struct mctelem_ent *tep)
+{
+       int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+           MC_URGENT : MC_NONURGENT;
+
+       BUG_ON(tep != mctctl.mctc_processing_head[which]);
+       tep->mcte_refcnt++;
+}
+
+/* Decrement the reference count on an entry that is linked at the head of
+ * a processing list.  The caller is responsible for locking the list.
+ */
+static void mctelem_processing_release(struct mctelem_ent *tep)
+{
+       int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+           MC_URGENT : MC_NONURGENT;
+
+       BUG_ON(tep != mctctl.mctc_processing_head[which]);
+       if (--tep->mcte_refcnt == 0) {
+               MCTE_TRANSITION_STATE(tep, PROCESSING, FREE);
+               mctctl.mctc_processing_head[which] = tep->mcte_next;
+               mctelem_free(tep);
+       }
+}
+
+void mctelem_init(int reqdatasz)
+{
+       static int called = 0;
+       static int datasz = 0, realdatasz = 0;
+       char *datarr;
+       int i;
+       
+       BUG_ON(MC_URGENT != 0 || MC_NONURGENT != 1 || MC_NCLASSES != 2);
+
+       /* Called from mcheck_init for all processors; initialize for the
+        * first call only (no race here since the boot cpu completes
+        * init before others start up). */
+       if (++called == 1) {
+               realdatasz = reqdatasz;
+               datasz = (reqdatasz & ~0xf) + 0x10;     /* 16 byte roundup */
+       } else {
+               BUG_ON(reqdatasz != realdatasz);
+               return;
+       }
+
+       if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent,
+           MC_URGENT_NENT + MC_NONURGENT_NENT)) == NULL ||
+           (datarr = xmalloc_bytes((MC_URGENT_NENT + MC_NONURGENT_NENT) *
+           datasz)) == NULL) {
+               if (mctctl.mctc_elems)
+                       xfree(mctctl.mctc_elems);
+               printk("Allocations for MCA telemetry failed\n");
+               return;
+       }
+
+       for (i = 0; i < MC_URGENT_NENT + MC_NONURGENT_NENT; i++) {
+               struct mctelem_ent *tep, **tepp;
+
+               tep = mctctl.mctc_elems + i;
+               tep->mcte_flags = MCTE_F_STATE_FREE;
+               tep->mcte_refcnt = 0;
+               tep->mcte_data = datarr + i * datasz;
+
+               if (i < MC_URGENT_NENT) {
+                       tepp = &mctctl.mctc_free[MC_URGENT];
+                       tep->mcte_flags |= MCTE_F_HOME_URGENT;
+               } else {
+                       tepp = &mctctl.mctc_free[MC_NONURGENT];
+                       tep->mcte_flags |= MCTE_F_HOME_NONURGENT;
+               }
+
+               tep->mcte_next = *tepp;
+               tep->mcte_prev = NULL;
+               *tepp = tep;
+       }
+}
+
+/* incremented non-atomically when reserve fails */
+static int mctelem_drop_count;
+
+/* Reserve a telemetry entry, or return NULL if none available.
+ * If we return an entry then the caller must subsequently call exactly one of
+ * mctelem_unreserve or mctelem_commit for that entry.
+ */
+mctelem_cookie_t mctelem_reserve(mctelem_class_t which)
+{
+       struct mctelem_ent **freelp;
+       struct mctelem_ent *oldhead, *newhead;
+       mctelem_class_t target = (which == MC_URGENT) ?
+           MC_URGENT : MC_NONURGENT;
+
+       freelp = &mctctl.mctc_free[target];
+       for (;;) {
+               if ((oldhead = *freelp) == NULL) {
+                       if (which == MC_URGENT && target == MC_URGENT) {
+                               /* raid the non-urgent freelist */
+                               target = MC_NONURGENT;
+                               freelp = &mctctl.mctc_free[target];
+                               continue;
+                       } else {
+                               mctelem_drop_count++;
+                               return (NULL);
+                       }
+               }
+
+               newhead = oldhead->mcte_next;
+               if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) {
+                       struct mctelem_ent *tep = oldhead;
+
+                       mctelem_hold(tep);
+                       MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);
+                       tep->mcte_next = NULL;
+                       tep->mcte_prev = NULL;
+                       if (which == MC_URGENT)
+                               MCTE_SET_CLASS(tep, URGENT);
+                       else
+                               MCTE_SET_CLASS(tep, NONURGENT);
+                       return MCTE2COOKIE(tep);
+               }
+       }
+}
+
+void *mctelem_dataptr(mctelem_cookie_t cookie)
+{
+       struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+       return tep->mcte_data;
+}
+
+/* Release a previously reserved entry back to the freelist without
+ * submitting it for logging.  The entry must not be linked on to any
+ * list - that's how mctelem_reserve handed it out.
+ */
+void mctelem_dismiss(mctelem_cookie_t cookie)
+{
+       struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+       tep->mcte_refcnt--;
+       MCTE_TRANSITION_STATE(tep, UNCOMMITTED, FREE);
+       mctelem_free(tep);
+}
+
+/* Commit an entry with completed telemetry for logging.  The caller must
+ * not reference the entry after this call.  Note that we add entries
+ * at the head of the committed list, so that list therefore has entries
+ * in reverse chronological order.
+ */
+void mctelem_commit(mctelem_cookie_t cookie)
+{
+       struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+       struct mctelem_ent **commlp;
+       struct mctelem_ent *oldhead;
+       mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+           MC_URGENT : MC_NONURGENT;
+
+       BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL);
+       MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED);
+
+       commlp = &mctctl.mctc_committed[target];
+       for (;;) {
+               oldhead = *commlp;
+               tep->mcte_prev = oldhead;
+               wmb();
+               if (cmpxchgptr(commlp, oldhead, tep) == oldhead)
+                       break;
+       }
+}
+
+/* Move telemetry from committed list to processing list, reversing the
+ * list into chronological order.  The processing list has been
+ * locked by the caller, and may be non-empty.  We append the
+ * reversed committed list on to the tail of the processing list.
+ * The committed list may grow even while we run, so use atomic
+ * operations to swap NULL into the committed list head.
+ *
+ * Note that "chronological order" means the order in which producers
+ * won additions to the processing list, which may not reflect the
+ * strict chronological order of the associated events if events are
+ * closely spaced in time and contend for the processing list at once.
+ */
+
+static struct mctelem_ent *dangling[MC_NCLASSES];
+
+static void mctelem_append_processing(mctelem_class_t which)
+{
+       mctelem_class_t target = which == MC_URGENT ?
+           MC_URGENT : MC_NONURGENT;
+       struct mctelem_ent **commlp = &mctctl.mctc_committed[target];
+       struct mctelem_ent **proclhp = &mctctl.mctc_processing_head[target];
+       struct mctelem_ent **procltp = &mctctl.mctc_processing_tail[target];
+       struct mctelem_ent *tep, *ltep;
+
+       /* Check for an empty list; no race since we hold the processing lock */
+       if (*commlp == NULL)
+               return;
+
+       /* Atomically unlink the committed list, and keep a pointer to
+        * the list we unlink in a well-known location so it can be
+        * picked up in panic code should we panic between this unlink
+        * and the append to the processing list. */
+       for (;;) {
+               dangling[target] = *commlp;
+               wmb();
+               if (cmpxchgptr(commlp, dangling[target], NULL) ==
+                   dangling[target])
+                       break;
+       }
+
+       if (dangling[target] == NULL)
+               return;
+
+       /* Traverse the list following the previous pointers (reverse
+        * chronological order).  For each entry fill in the next pointer
+        * and transition the element state.  */
+       for (tep = dangling[target], ltep = NULL; tep != NULL;
+           tep = tep->mcte_prev) {
+               MCTE_TRANSITION_STATE(tep, COMMITTED, PROCESSING);
+               tep->mcte_next = ltep;
+               ltep = tep;
+       }
+
+       /* ltep points to the head of a chronologically ordered linked
+        * list of telemetry entries ending at the most recent entry
+        * dangling[target] if mcte_next is followed; tack this on to
+        * the processing list.
+        */
+       if (*proclhp == NULL) {
+               *proclhp = ltep;
+               *procltp = dangling[target];
+       } else {
+               (*procltp)->mcte_next = ltep;
+               ltep->mcte_prev = *procltp;
+               *procltp = dangling[target];
+       }
+       wmb();
+       dangling[target] = NULL;
+       wmb();
+}
+
+mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
+{
+       mctelem_class_t target = (which == MC_URGENT) ?
+           MC_URGENT : MC_NONURGENT;
+       struct mctelem_ent *tep;
+
+       spin_lock(&processing_lock);
+       mctelem_append_processing(target);
+       if ((tep = mctctl.mctc_processing_head[target]) == NULL) {
+               spin_unlock(&processing_lock);
+               return NULL;
+       }
+
+       mctelem_processing_hold(tep);
+       wmb();
+       spin_unlock(&processing_lock);
+       return MCTE2COOKIE(tep);
+}
+
+void mctelem_consume_oldest_end(mctelem_cookie_t cookie)
+{
+       struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+       spin_lock(&processing_lock);
+       mctelem_processing_release(tep);
+       wmb();
+       spin_unlock(&processing_lock);
+}
+
+void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie)
+{
+       mctelem_class_t target = (which == MC_URGENT) ?
+           MC_URGENT : MC_NONURGENT;
+       struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+       if (tep == NULL)
+               return;
+
+       spin_lock(&processing_lock);
+       if (tep == mctctl.mctc_processing_head[target])
+               mctelem_processing_release(tep);
+       wmb();
+       spin_unlock(&processing_lock);
+}
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/mctelem.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/mctelem.h Tue Mar 17 14:22:50 2009 +0000
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#ifndef _MCTELEM_H
+
+#define        _MCTELEM_H
+
+#include <xen/init.h>
+#include <xen/smp.h>
+#include <asm/traps.h>
+
+/* Helper functions used for collecting error telemetry.
+ *
+ * mctelem_init preallocates a number of data areas for use during
+ * machine check data "logout".  Two classes are distinguished -
+ * urgent uses, intended for use from machine check exception handlers,
+ * and non-urgent uses intended for use from error pollers.
+ * Associated with each logout entry of whatever class is a data area
+ * sized per the single argument to mctelem_init.  mctelem_init should be
+ * called from MCA init code before anybody has the chance to change the
+ * machine check vector or to use mcheck_mca_logout.
+ *
+ * To reserve an entry of a given class for use in logout, call
+ * mctelem_reserve (or use the common handler functions which do all this
+ * for you).  This returns an opaque cookie, or NULL if no elements are
+ * available.  Elements are reserved with an atomic operation so no deadlock
+ * will occur if, for example, a machine check exception interrupts a
+ * scheduled error poll.  The implementation will raid free non-urgent
+ * entries if all urgent entries are in use when an urgent request is received.
+ * Once an entry is reserved the caller must eventually perform exactly
+ * one of two actions: mctelem_commit or mctelem_dismiss.
+ *
+ * On mctelem_commit the entry is appended to a processing list; mctelem_dismiss
+ * frees the element without processing.  After either call the cookie
+ * must not be referenced again.
+ *
+ * To consume committed telemetry call mctelem_consume_oldest_begin
+ * which will return a cookie referencing the oldest (first committed)
+ * entry of the requested class.  Access the associated data using
+ * mctelem_dataptr and when finished use mctelem_consume_oldest_end - in the
+ * begin .. end bracket you are guaranteed that the entry cannot be freed
+ * even if it is ack'd elsewhere.  Once the ultimate consumer of the
+ * telemetry has processed it to stable storage it should acknowledge
+ * the telemetry quoting the cookie id, at which point we will free
+ * the element from the processing list.
+ */
+
+typedef struct mctelem_cookie *mctelem_cookie_t;
+
+typedef enum mctelem_class {
+       MC_URGENT,
+       MC_NONURGENT
+} mctelem_class_t;
+
+extern void mctelem_init(int);
+extern mctelem_cookie_t mctelem_reserve(mctelem_class_t);
+extern void *mctelem_dataptr(mctelem_cookie_t);
+extern void mctelem_commit(mctelem_cookie_t);
+extern void mctelem_dismiss(mctelem_cookie_t);
+extern mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t);
+extern void mctelem_consume_oldest_end(mctelem_cookie_t);
+extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t);
+
+#endif
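
Putting the above together, the intended lifecycle looks roughly like the
sketch below.  This is illustrative only; mcheck_mca_logout and the do_mca
fetch path are the real producer and consumer in this changeset.

    /* Producer side (exception handler or poller). */
    mctelem_cookie_t mctc = mctelem_reserve(MC_URGENT);
    if (mctc != NULL) {
            struct mc_info *mci = mctelem_dataptr(mctc);
            /* ... build telemetry in *mci with x86_mcinfo_add() ... */
            mctelem_commit(mctc);           /* or mctelem_dismiss(mctc) */
    }

    /* Consumer side (XEN_MC_fetch), oldest entry first; the begin/end
     * bracket guarantees the entry is not freed while it is copied out. */
    mctelem_cookie_t c = mctelem_consume_oldest_begin(MC_URGENT);
    if (c != NULL) {
            /* ... copy mctelem_dataptr(c) to its destination ... */
            mctelem_consume_oldest_end(c);
            /* Later, once the data is on stable storage: */
            mctelem_ack(MC_URGENT, c);
    }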
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c       Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c       Tue Mar 17 14:22:50 2009 +0000
@@ -14,46 +14,76 @@
 #include <xen/smp.h>
 #include <xen/timer.h>
 #include <xen/errno.h>
+#include <xen/event.h>
+#include <xen/sched.h>
 #include <asm/processor.h> 
 #include <asm/system.h>
 #include <asm/msr.h>
 
 #include "mce.h"
-#include "x86_mca.h"
-int firstbank = 0;
+
+static cpu_banks_t bankmask;
 static struct timer mce_timer;
 
-#define MCE_PERIOD MILLISECS(15000)
+#define MCE_PERIOD MILLISECS(8000)
+#define MCE_PERIOD_MIN MILLISECS(2000)
+#define MCE_PERIOD_MAX MILLISECS(16000)
+
+static uint64_t period = MCE_PERIOD;
+static int adjust = 0;
+static int variable_period = 1;
 
 static void mce_checkregs (void *info)
 {
-       u32 low, high;
-       int i;
+       mctelem_cookie_t mctc;
+       struct mca_summary bs;
+       static uint64_t dumpcount = 0;
 
-       for (i=firstbank; i<nr_mce_banks; i++) {
-               rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
+       mctc = mcheck_mca_logout(MCA_POLLER, bankmask, &bs);
 
-               if (high & (1<<31)) {
-                       printk(KERN_INFO "MCE: The hardware reports a non "
-                               "fatal, correctable incident occurred on "
-                               "CPU %d.\n",
-                               smp_processor_id());
-                       printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
+       if (bs.errcnt && mctc != NULL) {
+               adjust++;
 
-                       /* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
-                       wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
+               /* If Dom0 enabled the VIRQ_MCA event, then notify it.
+                * Otherwise, if dom0 has had plenty of time to register
+                * the virq handler but still hasn't then dump telemetry
+                * to the Xen console.  The call count may be incremented
+                * on multiple cpus at once and is indicative only - just
+                * a simple-minded attempt to avoid spamming the console
+                * for corrected errors in early startup.
+                */
 
-                       /* Serialize */
-                       wmb();
-                       add_taint(TAINT_MACHINE_CHECK);
+               if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+                       mctelem_commit(mctc);
+                       send_guest_global_virq(dom0, VIRQ_MCA);
+               } else if (++dumpcount >= 10) {
+                       x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));
+                       mctelem_dismiss(mctc);
+               } else {
+                       mctelem_dismiss(mctc);
                }
+       } else if (mctc != NULL) {
+               mctelem_dismiss(mctc);
        }
 }
 
 static void mce_work_fn(void *data)
 { 
        on_each_cpu(mce_checkregs, NULL, 1, 1);
-       set_timer(&mce_timer, NOW() + MCE_PERIOD);
+
+       if (variable_period) {
+               if (adjust)
+                       period /= (adjust + 1);
+               else
+                       period *= 2;
+               if (period > MCE_PERIOD_MAX)
+                       period = MCE_PERIOD_MAX;
+               if (period < MCE_PERIOD_MIN)
+                       period = MCE_PERIOD_MIN;
+       }
+
+       set_timer(&mce_timer, NOW() + period);
+       adjust = 0;
 }
 
 static int __init init_nonfatal_mce_checker(void)
@@ -63,13 +93,17 @@ static int __init init_nonfatal_mce_chec
        /* Check for MCE support */
        if (!mce_available(c))
                return -ENODEV;
+
+       memcpy(&bankmask, &mca_allbanks, sizeof (cpu_banks_t));
+       if (mce_firstbank(c) == 1)
+               clear_bit(0, bankmask);
+
        /*
         * Check for non-fatal errors every MCE_RATE s
         */
        switch (c->x86_vendor) {
        case X86_VENDOR_AMD:
                if (c->x86 == 6) { /* K7 */
-                       firstbank = 1;
                        init_timer(&mce_timer, mce_work_fn, NULL, 0);
                        set_timer(&mce_timer, NOW() + MCE_PERIOD);
                        break;
@@ -80,15 +114,14 @@ static int __init init_nonfatal_mce_chec
                break;
 
        case X86_VENDOR_INTEL:
-               /* p5 family is different. P4/P6 and latest CPUs shares the
-                * same polling methods
-               */
+               /*
+                * The P5 family is different. P4/P6 and latest CPUs share the
+                * same polling methods.
+                */
                if ( c->x86 != 5 )
                {
-                       /* some CPUs or banks don't support cmci, we need to 
-                        * enable this feature anyway
-                        */
-                       intel_mcheck_timer(c);
+                       init_timer(&mce_timer, mce_work_fn, NULL, 0);
+                       set_timer(&mce_timer, NOW() + MCE_PERIOD);
                }
                break;
        }
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/p5.c
--- a/xen/arch/x86/cpu/mcheck/p5.c      Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/p5.c      Tue Mar 17 14:22:50 2009 +0000
@@ -16,7 +16,7 @@
 #include "x86_mca.h"
 
 /* Machine check handler for Pentium class Intel */
-static fastcall void pentium_machine_check(struct cpu_user_regs * regs, long error_code)
+static void pentium_machine_check(struct cpu_user_regs * regs, long error_code)
 {
        u32 loaddr, hi, lotype;
        rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
@@ -28,19 +28,14 @@ static fastcall void pentium_machine_che
 }
 
 /* Set up machine check reporting for processors with Intel style MCE */
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
+int intel_p5_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 l, h;
        
-       /*Check for MCE support */
-       if( !cpu_has(c, X86_FEATURE_MCE) )
-               return; 
-
        /* Default P5 to off as its often misconnected */
        if(mce_disabled != -1)
-               return;
-       machine_check_vector = pentium_machine_check;
-       wmb();
+               return 0;
+       x86_mce_vector_register(pentium_machine_check);
 
        /* Read registers before enabling */
        rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
@@ -50,4 +45,6 @@ void intel_p5_mcheck_init(struct cpuinfo
        /* Enable MCE */
        set_in_cr4(X86_CR4_MCE);
        printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
+
+       return 1;
 }
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/winchip.c
--- a/xen/arch/x86/cpu/mcheck/winchip.c Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/winchip.c Tue Mar 17 14:22:50 2009 +0000
@@ -16,22 +16,24 @@
 #include "mce.h"
 
 /* Machine check handler for WinChip C6 */
-static fastcall void winchip_machine_check(struct cpu_user_regs * regs, long error_code)
+static void winchip_machine_check(struct cpu_user_regs * regs, long error_code)
 {
        printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
        add_taint(TAINT_MACHINE_CHECK);
 }
 
 /* Set up machine check reporting on the Winchip C6 series */
-void winchip_mcheck_init(struct cpuinfo_x86 *c)
+int winchip_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 lo, hi;
-       machine_check_vector = winchip_machine_check;
+
        wmb();
+       x86_mce_vector_register(winchip_machine_check);
        rdmsr(MSR_IDT_FCR1, lo, hi);
        lo|= (1<<2);    /* Enable EIERRINT (int 18 MCE) */
        lo&= ~(1<<4);   /* Enable MCE */
        wrmsr(MSR_IDT_FCR1, lo, hi);
        set_in_cr4(X86_CR4_MCE);
        printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
+       return (1);
 }
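
As with the P5 path, winchip_mcheck_init() now returns nonzero once reporting
is enabled, so a caller can tell whether any vendor-specific init took effect.
A hypothetical consumer (the variable name and message wording are invented
for the example) might look like:

    int inited = 0;

    switch (c->x86_vendor) {
    case X86_VENDOR_CENTAUR:
        if (c->x86 == 5)
            inited = winchip_mcheck_init(c);
        break;
    case X86_VENDOR_INTEL:
        if (c->x86 == 5)
            inited = intel_p5_mcheck_init(c);
        break;
    }

    if (!inited)
        printk(KERN_INFO "CPU%d: no machine check reporting enabled\n",
               smp_processor_id());
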
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Mar 17 14:22:50 2009 +0000
@@ -16,6 +16,10 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
+
+#ifndef X86_MCA_H
+
+#define X86_MCA_H
 
 
 /* The MCA/MCE MSRs should not be used anywhere else.
@@ -73,6 +77,9 @@
 /* reserved bits */
 #define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
 
+/* Bitfield of MSR_K8_HWCR register */
+#define K8_HWCR_MCi_STATUS_WREN                (1ULL << 18)
+
 /*Intel Specific bitfield*/
 #define CMCI_THRESHOLD                 0x2
 
@@ -87,3 +94,4 @@ extern unsigned int nr_mce_banks;
 extern unsigned int nr_mce_banks;
 extern int firstbank;
 
+#endif /* X86_MCA_H */
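
The new K8_HWCR_MCi_STATUS_WREN constant names bit 18 of the AMD HWCR MSR,
which must be set before software may write nonzero values into the
MCi_STATUS registers (useful when injecting or clearing telemetry). A guarded
write could be sketched as follows, assuming the usual MSR_K8_HWCR
definition; this is an illustration, not code from the changeset:

    uint64_t hwcr;

    rdmsrl(MSR_K8_HWCR, hwcr);
    wrmsrl(MSR_K8_HWCR, hwcr | K8_HWCR_MCi_STATUS_WREN);
    /* ... write the MCi_STATUS MSR(s) here ... */
    wrmsrl(MSR_K8_HWCR, hwcr);      /* restore the previous HWCR value */
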
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/include/asm-x86/traps.h
--- a/xen/include/asm-x86/traps.h       Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/include/asm-x86/traps.h       Tue Mar 17 14:22:50 2009 +0000
@@ -28,7 +28,7 @@ struct softirq_trap {
 
 struct cpu_user_regs;
 
-extern void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code);
+extern void machine_check_vector(struct cpu_user_regs *regs, long error_code);
  
 /**
  * guest_has_trap_callback
diff -r 0b1ce09f4577 -r 9c1be8f2013b xen/include/public/arch-x86/xen-mca.h
--- a/xen/include/public/arch-x86/xen-mca.h     Tue Mar 17 14:21:18 2009 +0000
+++ b/xen/include/public/arch-x86/xen-mca.h     Tue Mar 17 14:22:50 2009 +0000
@@ -56,13 +56,20 @@
 /* Hypercall */
 #define __HYPERVISOR_mca __HYPERVISOR_arch_0
 
-#define XEN_MCA_INTERFACE_VERSION 0x03000002
-
-/* IN: Dom0 calls hypercall from MC event handler. */
-#define XEN_MC_CORRECTABLE  0x0
-/* IN: Dom0/DomU calls hypercall from MC trap handler. */
-#define XEN_MC_TRAP         0x1
-/* XEN_MC_CORRECTABLE and XEN_MC_TRAP are mutually exclusive. */
+/*
+ * The xen-unstable repo has interface version 0x03000001; our interface
+ * is incompatible with that and any future minor revisions, so we
+ * choose a different version number range that is numerically less
+ * than that used in xen-unstable.
+ */
+#define XEN_MCA_INTERFACE_VERSION 0x01ecc002
+
+/* IN: Dom0 calls hypercall to retrieve nonurgent telemetry */
+#define XEN_MC_NONURGENT  0x0001
+/* IN: Dom0/DomU calls hypercall to retrieve urgent telemetry */
+#define XEN_MC_URGENT     0x0002
+/* IN: Dom0 acknowledges previously-fetched telemetry */
+#define XEN_MC_ACK        0x0004
 
 /* OUT: All is ok */
 #define XEN_MC_OK           0x0
@@ -110,6 +117,7 @@ struct mcinfo_common {
 #define MC_FLAG_POLLED         (1 << 3)
 #define MC_FLAG_RESET          (1 << 4)
 #define MC_FLAG_CMCI           (1 << 5)
+#define MC_FLAG_MCE            (1 << 6)
 /* contains global x86 mc information */
 struct mcinfo_global {
     struct mcinfo_common common;
@@ -174,6 +182,7 @@ struct mc_info {
     uint8_t mi_data[MCINFO_MAXSIZE - sizeof(uint32_t)];
 };
 typedef struct mc_info mc_info_t;
+DEFINE_XEN_GUEST_HANDLE(mc_info_t);
 
 #define __MC_MSR_ARRAYSIZE 8
 #define __MC_NMSRS 1
@@ -274,14 +283,14 @@ DEFINE_XEN_GUEST_HANDLE(xen_mc_logical_c
 #define XEN_MC_fetch            1
 struct xen_mc_fetch {
     /* IN/OUT variables. */
-    uint32_t flags;
-
-/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
-/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */
+    uint32_t flags;    /* IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
+                           XEN_MC_ACK if ack'ing an earlier fetch */
+                       /* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED,
+                          XEN_MC_NODATA, XEN_MC_NOMATCH */
+    uint64_t fetch_id; /* OUT: id for ack, IN: id we are ack'ing */
 
     /* OUT variables. */
-    uint32_t fetch_idx;  /* only useful for Dom0 for the notify hypercall */
-    struct mc_info mc_info;
+    XEN_GUEST_HANDLE(mc_info_t) data;
 };
 typedef struct xen_mc_fetch xen_mc_fetch_t;
 DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
@@ -296,7 +305,6 @@ struct xen_mc_notifydomain {
     uint16_t mc_domid;    /* The unprivileged domain to notify. */
     uint16_t mc_vcpuid;   /* The vcpu in mc_domid to notify.
                            * Usually echo'd value from the fetch hypercall. */
-    uint32_t fetch_idx;   /* echo'd value from the fetch hypercall. */
 
     /* IN/OUT variables. */
     uint32_t flags;
@@ -316,15 +324,16 @@ struct xen_mc_physcpuinfo {
        XEN_GUEST_HANDLE(xen_mc_logical_cpu_t) info;
 };
 
+typedef union {
+    struct xen_mc_fetch        mc_fetch;
+    struct xen_mc_notifydomain mc_notifydomain;
+    struct xen_mc_physcpuinfo  mc_physcpuinfo;
+} xen_mc_arg_t;
+
 struct xen_mc {
     uint32_t cmd;
     uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
-    union {
-        struct xen_mc_fetch        mc_fetch;
-        struct xen_mc_notifydomain mc_notifydomain;
-        struct xen_mc_physcpuinfo  mc_physcpuinfo;
-        uint8_t pad[MCINFO_HYPERCALLSIZE];
-    } u;
+    xen_mc_arg_t u;
 };
 typedef struct xen_mc xen_mc_t;
 DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
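
With the revised fetch interface, dom0 passes a guest handle to its own
struct mc_info buffer and acknowledges consumed telemetry by id rather than
echoing a fetch_idx. A rough dom0-side sequence is sketched below; the
wrapper name do_xen_mc_hypercall() is a placeholder for however the caller
issues __HYPERVISOR_mca, and error handling is deliberately simplified:

    struct mc_info mi;
    struct xen_mc mc;

    memset(&mc, 0, sizeof(mc));
    mc.cmd = XEN_MC_fetch;
    mc.interface_version = XEN_MCA_INTERFACE_VERSION;
    mc.u.mc_fetch.flags = XEN_MC_NONURGENT;
    set_xen_guest_handle(mc.u.mc_fetch.data, &mi);

    if (do_xen_mc_hypercall(&mc) == 0 &&
        !(mc.u.mc_fetch.flags & (XEN_MC_FETCHFAILED | XEN_MC_NODATA))) {
        uint64_t id = mc.u.mc_fetch.fetch_id;   /* OUT: id to ack later */

        /* ... decode mi here ... */

        mc.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK;
        mc.u.mc_fetch.fetch_id = id;            /* IN: id we are ack'ing */
        (void)do_xen_mc_hypercall(&mc);
    }
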

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 

