[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] MCA interfaces between XEN/DOM0, let DOM0 know the MCA recovery action



# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1237569929 0
# Node ID 891af2c54155afc4ca47a8e8eb8f6865b2f76f0f
# Parent  cc60defe5b9697ab0e068caa4fd1f8798bfe5104
MCA interfaces between XEN/DOM0, let DOM0 know the MCA recovery action

Signed-off-by: Jiang, yunhong <yunhong.jiang@xxxxxxxxx>
Signed-off-by: Ke, liping <liping.ke@xxxxxxxxx>
---
 xen/arch/x86/cpu/mcheck/x86_mca.h     |   47 +++++++++++++++++++++++++
 xen/include/public/arch-x86/xen-mca.h |   63 ++++++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+)

diff -r cc60defe5b96 -r 891af2c54155 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Fri Mar 20 17:24:53 2009 +0000
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Fri Mar 20 17:25:29 2009 +0000
@@ -87,6 +87,53 @@ typedef DECLARE_BITMAP(cpu_banks_t, MAX_
 typedef DECLARE_BITMAP(cpu_banks_t, MAX_NR_BANKS);
 DECLARE_PER_CPU(cpu_banks_t, mce_banks_owned);
 
+/* Below interfaces are defined for MCA internal processing:
+ * a. pre_handler will be called early in MCA ISR context, mainly for early
+ *    need_reset detection for avoiding log missing. Also, it is used to judge
+ *    impacted DOMAIN if possible.
+ * b. mca_error_handler is actually a (error_action_index,
+ *    recovery_handler pointer) pair. The defined recovery_handler
+ *    performs the actual recovery operations such as page_offline, cpu_offline
+ *    in softIRQ context when the per_bank MCA error matching the corresponding
+ *    mca_code index. If pre_handler can't judge the impacted domain,
+ *    recovery_handler must figure it out.
+*/
+
+/* Result flag (bit 0): MCA error was recovered successfully by the recovery action */
+#define MCA_RECOVERED (0x1 << 0)
+/* Result flag (bit 1): MCA error impacts the DOMAIN given in the owner field below */
+#define MCA_OWNER (0x1 << 1)
+/* Result flag (bit 2): MCA error can't be recovered and needs a reset */
+#define MCA_NEED_RESET (0x1 << 2)
+/* Result flag (bit 3): MCA error needs further recovery actions in softIRQ context */
+#define MCA_MORE_ACTION (0x1 << 3)
+
+struct mca_handle_result
+{
+    uint32_t result;
+    /* Used when result & MCA_OWNER */
+    domid_t owner;
+    /* Used by mca_error_handler, when result & MCA_RECOVERED */
+    struct recovery_action *action;
+};
+
+extern void (*mca_prehandler)( struct cpu_user_regs *regs,
+                        struct mca_handle_result *result);
+
+struct mca_error_handler
+{
+    /* Assume corresponding recovery action could be uniquely
+     * identified by mca_code. Otherwise, we might need to have
+     * a separate function to decode the corresponding actions
+     * for the particular mca error later.
+    */
+    uint16_t mca_code;
+    void (*recovery_handler)( struct mcinfo_bank *bank,
+                    struct mcinfo_global *global,
+                    struct mcinfo_extended *extension,
+                    struct mca_handle_result *result);
+};
+
 /* Global variables */
 extern int mce_disabled;
 extern unsigned int nr_mce_banks;
diff -r cc60defe5b96 -r 891af2c54155 xen/include/public/arch-x86/xen-mca.h
--- a/xen/include/public/arch-x86/xen-mca.h     Fri Mar 20 17:24:53 2009 +0000
+++ b/xen/include/public/arch-x86/xen-mca.h     Fri Mar 20 17:25:29 2009 +0000
@@ -104,6 +104,7 @@
 #define MC_TYPE_GLOBAL          0
 #define MC_TYPE_BANK            1
 #define MC_TYPE_EXTENDED        2
+#define MC_TYPE_RECOVERY        3
 
 struct mcinfo_common {
     uint16_t type;      /* structure type */
@@ -171,6 +172,68 @@ struct mcinfo_extended {
     */
     struct mcinfo_msr mc_msr[10];
 };
+
+/* Recovery Action flags. Giving recovery result information to DOM0 */
+
+/* Xen takes successful recovery action, the error is recovered */
+#define REC_ACTION_RECOVERED (0x1 << 0)
+/* No action is performed by XEN */
+#define REC_ACTION_NONE (0x1 << 1)
+/* It's possible DOM0 might take action ownership in some case */
+#define REC_ACTION_NEED_RESET (0x1 << 2)
+
+/* Different Recovery Action types, if the action is performed successfully,
+ * REC_ACTION_RECOVERED flag will be returned.
+ */
+
+/* Page Offline Action */
+#define MC_ACTION_PAGE_OFFLINE (0x1 << 0)
+/* CPU offline Action */
+#define MC_ACTION_CPU_OFFLINE (0x1 << 1)
+/* L3 cache disable Action */
+#define MC_ACTION_CACHE_SHRINK (0x1 << 2)
+
+/* The interface below is used between XEN/DOM0 for passing XEN's recovery
+ * action information to DOM0.
+ * Usage scenario: after offlining a broken page, XEN might pass its page
+ * offline recovery action result to DOM0. DOM0 will save the information in
+ * non-volatile memory for further proactive actions, such as offlining the
+ * easily broken page earlier on the next reboot.
+*/
+struct page_offline_action
+{
+    /* Params for passing the offlined page number to DOM0 */
+    uint64_t mfn;
+    uint64_t status;
+};
+
+struct cpu_offline_action
+{
+    /* Params for passing the identity of the offlined CPU to DOM0 */
+    uint32_t mc_socketid;
+    uint16_t mc_coreid;
+    uint16_t mc_core_threadid;
+};
+
+#define MAX_UNION_SIZE 16
+struct mc_recovery
+{
+    uint16_t mc_bank; /* bank nr */
+    uint8_t action_flags;
+    uint8_t action_types;
+    union {
+        struct page_offline_action page_retire;
+        struct cpu_offline_action cpu_offline;
+        uint8_t pad[MAX_UNION_SIZE];
+    } action_info;
+};
+
+struct mcinfo_recovery
+{
+    struct mcinfo_common common;
+    struct mc_recovery mc_action;
+};
+
 
 #define MCINFO_HYPERCALLSIZE   1024
 #define MCINFO_MAXSIZE         768

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.