[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 2/2] Provide ERST interface to Xen MCE



Provide ERST interface to Xen MCE

This patch provides an interface for Xen MCE to write, read and clear
error records through ERST.

Signed-off-by: Liu, Jinsong <jinsong.liu@xxxxxxxxx>

 b/xen/arch/x86/cpu/mcheck/mce-apei.c |  129 +++++++++++++++++++++++++++++++++++
 xen/arch/x86/cpu/mcheck/Makefile     |    1 
 xen/arch/x86/cpu/mcheck/mce.h        |   24 ++++++
 xen/arch/x86/time.c                  |    5 +
 xen/include/xen/cper.h               |  113 ++++++++++++++++++++++++++++++
 5 files changed, 272 insertions(+)

diff -r 4c224a1c98c9 xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile  Fri Aug 20 17:38:07 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/Makefile  Fri Aug 20 20:59:52 2010 +0800
@@ -4,6 +4,7 @@ obj-y += amd_f10.o
 obj-y += amd_f10.o
 obj-y += mctelem.o
 obj-y += mce.o
+obj-y += mce-apei.o
 obj-y += mce_intel.o
 obj-y += mce_amd_quirks.o
 obj-y += non-fatal.o
diff -r 4c224a1c98c9 xen/arch/x86/cpu/mcheck/mce-apei.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/mce-apei.c        Fri Aug 20 20:59:52 2010 +0800
@@ -0,0 +1,129 @@
+/*
+ * Bridge between MCE and APEI
+ *
+ * On some machines, corrected memory errors are reported via APEI
+ * generic hardware error source (GHES) instead of corrected Machine
+ * Check. These corrected memory errors can be reported to user space
+ * through /dev/mcelog via faking a corrected Machine Check, so that
+ * the error memory page can be offlined by /sbin/mcelog if the error
+ * count for one page is beyond the threshold.
+ *
+ * For fatal MCE, save MCE record into persistent storage via ERST, so
+ * that the MCE record can be logged after reboot via ERST.
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@xxxxxxxxx>
+ *   Ported by: Liu, Jinsong <jinsong.liu@xxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <xen/kernel.h>
+#include <xen/cper.h>
+#include <xen/errno.h>
+#include <acpi/acpi.h>
+#include <acpi/apei.h>
+
+#include "mce.h"
+
+#define CPER_CREATOR_MCE                                               \
+       UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c,     \
+               0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_MCE                                          \
+       UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96,     \
+               0x04, 0x4a, 0x38, 0xfc)
+
+#pragma pack(1)
+/*
+ * CPER specification (in UEFI specification 2.3 appendix N) requires
+ * byte-packed records: header, section descriptor, then the section body.
+ */
+struct cper_mce_record {
+       struct cper_record_header hdr;          /* CPER record header */
+       struct cper_section_descriptor sec_hdr; /* descriptor of the section */
+       struct mce mce;                         /* section body: the MCE */
+} __packed;
+/* Reset to default packing */
+#pragma pack()
+
+/* Wrap *m in a single-section, fatal CPER record and pass it to erst_write() */
+int apei_write_mce(struct mce *m)
+{
+       struct cper_mce_record rec;
+
+       if (!m)
+               return -EINVAL;
+
+       memset(&rec, 0, sizeof(rec));
+       /* Section descriptor: one primary section carrying the MCE */
+       rec.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
+       rec.sec_hdr.section_severity = CPER_SER_FATAL;
+       rec.sec_hdr.flags = CPER_SEC_PRIMARY;
+       rec.sec_hdr.revision = CPER_SEC_REV;
+       rec.sec_hdr.validation_bits = 0; /* fru_id, fru_text not valid */
+       rec.sec_hdr.section_length = sizeof(rec.mce);
+       rec.sec_hdr.section_offset = (char *)&rec.mce - (char *)&rec;
+
+       /* Record header */
+       memcpy(rec.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+       rec.hdr.signature_end = CPER_SIG_END;
+       rec.hdr.revision = CPER_RECORD_REV;
+       rec.hdr.record_length = sizeof(rec);
+       rec.hdr.section_count = 1;
+       rec.hdr.error_severity = CPER_SER_FATAL;
+       rec.hdr.validation_bits = 0; /* timestamp, platform/partition id unset */
+       rec.hdr.notification_type = CPER_NOTIFY_MCE;
+       rec.hdr.creator_id = CPER_CREATOR_MCE;
+       rec.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+       rec.hdr.record_id = cper_next_record_id();
+
+       memcpy(&rec.mce, m, sizeof(*m));
+       return erst_write(&rec.hdr);
+}
+
+/* Read the next ERST record into *m and its id into *record_id.  Returns
+ * sizeof(*m) on success; -EINVAL / -EIO are returned converted to size_t. */
+size_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+       struct cper_mce_record rcd;
+       long len; /* signed: erst_read_next() presumably fails with a value < 0 */
+
+       if (!m || !record_id)
+               return -EINVAL;
+
+       len = erst_read_next(&rcd.hdr, sizeof(rcd));
+       if (len <= 0)
+               return len;
+       /* Can not skip other records in storage via ERST unless clear them */
+       if ((size_t)len != sizeof(rcd) ||
+           uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
+               printk(KERN_WARNING
+                       "MCE-APEI: Can not skip the unknown record in ERST");
+               return -EIO;
+       }
+       memcpy(m, &rcd.mce, sizeof(*m));
+       *record_id = rcd.hdr.record_id;
+       return sizeof(*m);
+}
+
+/* Check whether ERST holds any record: returns erst_get_record_count() */
+int apei_check_mce(void)
+{
+       return erst_get_record_count();
+}
+
+int apei_clear_mce(u64 record_id)
+{      /* Hands record_id to erst_clear() and returns that call's result */
+       return erst_clear(record_id);
+}
diff -r 4c224a1c98c9 xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h     Fri Aug 20 17:38:07 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce.h     Fri Aug 20 20:59:52 2010 +0800
@@ -186,4 +186,28 @@ static inline int mce_bank_msr(uint32_t 
         return 1;
     return 0;
 }
+
+/* Fields are zero when not available; apei_write_mce() stores this as-is */
+struct mce {
+    __u64 status;
+    __u64 misc;
+    __u64 addr;
+    __u64 mcgstatus;
+    __u64 ip;
+    __u64 tsc;      /* cpu time stamp counter */
+    __u64 time;     /* wall time_t when error was detected */
+    __u8  cpuvendor;        /* cpu vendor as encoded in system.h */
+    __u8  inject_flags;     /* software inject flags */
+    __u16  pad;     /* fills bytes 58-59 so cpuid starts at offset 60 */
+    __u32 cpuid;    /* CPUID 1 EAX */
+    __u8  cs;               /* code segment */
+    __u8  bank;     /* machine check bank */
+    __u8  cpu;      /* cpu number; obsolete; use extcpu now */
+    __u8  finished;   /* entry is valid */
+    __u32 extcpu;   /* linux cpu number that detected the error */
+    __u32 socketid; /* CPU socket ID */
+    __u32 apicid;   /* CPU initial apic ID */
+    __u64 mcgcap;   /* MCGCAP MSR: machine check capabilities of CPU */
+};
+
 #endif /* _MCE_H */
diff -r 4c224a1c98c9 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Fri Aug 20 17:38:07 2010 +0800
+++ b/xen/arch/x86/time.c       Fri Aug 20 20:59:52 2010 +0800
@@ -1501,6 +1501,11 @@ unsigned long get_localtime(struct domai
         + d->time_offset_seconds;
 }
 
+unsigned long get_sec(void)
+{   /* wc_sec + (wc_nsec + NOW()) / 10^9; presumably ns -> whole seconds */
+    return wc_sec + (wc_nsec + NOW()) / 1000000000ULL;
+}
+
 /* "cmos_utc_offset" is the difference between UTC time and CMOS time. */
 static long cmos_utc_offset; /* in seconds */
 
diff -r 4c224a1c98c9 xen/include/xen/cper.h
--- a/xen/include/xen/cper.h    Fri Aug 20 17:38:07 2010 +0800
+++ b/xen/include/xen/cper.h    Fri Aug 20 20:59:52 2010 +0800
@@ -23,16 +23,129 @@
 #define LINUX_CPER_H
 
 #include <xen/types.h>
+#include <xen/string.h>
+
+extern unsigned long get_sec(void);
 
 typedef struct {
        __u8 b[16];
 } uuid_le;
+
+static inline int uuid_le_cmp(const uuid_le u1, const uuid_le u2)
+{       /* memcmp()-style result: zero iff all bytes of u1 and u2 match */
+        return memcmp(u1.b, u2.b, sizeof(u1.b));
+}
+
+/* Next CPER record id; seeded once with get_sec() in the upper 32 bits.
+ * NOTE(review): static inline, so each including file has its own counter. */
+static inline u64 cper_next_record_id(void)
+{
+       static u64 record_id;
+       if (!record_id) /* widen first: unsigned long may be only 32 bits */
+               record_id = (u64)get_sec() << 32;
+       return ++record_id;
+}
+
+#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)               \
+((uuid_le)                                                             \
+{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
+   (b) & 0xff, ((b) >> 8) & 0xff,                                      \
+   (c) & 0xff, ((c) >> 8) & 0xff,                                      \
+   (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
 
 /* CPER record signature and the size */
 #define CPER_SIG_RECORD                                "CPER"
 #define CPER_SIG_SIZE                          4
 /* Used in signature_end field in struct cper_record_header */
 #define CPER_SIG_END                           0xffffffff
+
+/*
+ * CPER record header revision, used in revision field in struct
+ * cper_record_header
+ */
+#define CPER_RECORD_REV                                0x0100
+
+/*
+ * Severity definition for error_severity in struct cper_record_header
+ * and section_severity in struct cper_section_descriptor
+ */
+#define CPER_SER_RECOVERABLE                   0x0
+#define CPER_SER_FATAL                         0x1
+#define CPER_SER_CORRECTED                     0x2
+#define CPER_SER_INFORMATIONAL                 0x3
+
+/*
+ * Notification type used to generate error record, used in
+ * notification_type in struct cper_record_header
+ *
+ * Corrected Machine Check
+ */
+#define CPER_NOTIFY_CMC                                                \
+       UUID_LE(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4,     \
+               0xEB, 0xD4, 0xF8, 0x90)
+/* Corrected Platform Error */
+#define CPER_NOTIFY_CPE                                                \
+       UUID_LE(0x4E292F96, 0xD843, 0x4a55, 0xA8, 0xC2, 0xD4, 0x81,     \
+               0xF2, 0x7E, 0xBE, 0xEE)
+/* Machine Check Exception */
+#define CPER_NOTIFY_MCE                                                \
+       UUID_LE(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB,     \
+               0xE1, 0x49, 0x13, 0xBB)
+/* PCI Express Error */
+#define CPER_NOTIFY_PCIE                                               \
+       UUID_LE(0xCF93C01F, 0x1A16, 0x4dfc, 0xB8, 0xBC, 0x9C, 0x4D,     \
+               0xAF, 0x67, 0xC1, 0x04)
+/* INIT Record (for IPF) */
+#define CPER_NOTIFY_INIT                                               \
+       UUID_LE(0xCC5263E8, 0x9308, 0x454a, 0x89, 0xD0, 0x34, 0x0B,     \
+               0xD3, 0x9B, 0xC9, 0x8E)
+/* Non-Maskable Interrupt */
+#define CPER_NOTIFY_NMI                                                \
+       UUID_LE(0x5BAD89FF, 0xB7E6, 0x42c9, 0x81, 0x4A, 0xCF, 0x24,     \
+               0x85, 0xD6, 0xE9, 0x8A)
+/* BOOT Error Record */
+#define CPER_NOTIFY_BOOT                                               \
+       UUID_LE(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62,     \
+               0xD4, 0x64, 0xB3, 0x8F)
+/* DMA Remapping Error */
+#define CPER_NOTIFY_DMAR                                               \
+       UUID_LE(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E,     \
+               0x72, 0x2D, 0xEB, 0x41)
+
+/*
+ * Flags bits definitions for flags in struct cper_record_header
+ * If set, the error has been recovered
+ */
+#define CPER_HW_ERROR_FLAGS_RECOVERED          0x1
+/* If set, the error is for previous boot */
+#define CPER_HW_ERROR_FLAGS_PREVERR            0x2
+/* If set, the error is injected for testing */
+#define CPER_HW_ERROR_FLAGS_SIMULATED          0x4
+
+/*
+ * CPER section header revision, used in revision field in struct
+ * cper_section_descriptor
+ */
+#define CPER_SEC_REV                           0x0100
+
+/*
+ * Validation bits definition for validation_bits in struct
+ * cper_section_descriptor. If set, corresponding fields in struct
+ * cper_section_descriptor contain valid information.
+ *
+ * corresponds to fru_id
+ */
+#define CPER_SEC_VALID_FRU_ID                  0x1
+/* corresponds fru_text */
+#define CPER_SEC_VALID_FRU_TEXT                        0x2
+
+/*
+ * Flags bits definitions for flags in struct cper_section_descriptor
+ *
+ * If set, the section is associated with the error condition
+ * directly, and should be focused on
+ */
+#define CPER_SEC_PRIMARY                       0x0001
 
 /*
  * All tables and structs must be byte-packed to match CPER

Attachment: mce-erst-2.patch
Description: mce-erst-2.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.