x86: show remote CPU state upon fatal NMI Quite frequently the watchdog would hit an innocent CPU, e.g. one trying to acquire a spin lock a remote CPU holds for extended periods of time, or a random CPU in TSC calibration rendezvous. In such cases the register and stack dump for that CPU doesn't really help in the analysis of the problem. To keep things reasonable on large systems, only log CS:RIP by default. This can be overridden via a new extension to the "nmi=" command line option such that full register/stack state will get dumped. Signed-off-by: Jan Beulich --- a/docs/misc/xen-command-line.markdown +++ b/docs/misc/xen-command-line.markdown @@ -1155,7 +1155,7 @@ Use the MWAIT idle driver (with model sp of the ACPI based one. ### nmi -> `= ignore | dom0 | fatal` +> `= ignore | dom0 | fatal [,show-all]` > Default: `fatal` for a debug build, or `dom0` for a non-debug build @@ -1163,6 +1163,9 @@ Specify what Xen should do in the event `ignore` discards the error; `dom0` causes Xen to report the error to dom0, while 'fatal' causes Xen to print diagnostics and then hang. +The `show-all` modifier forces all CPUs' full state to be dumped upon +fatal NMIs (normally a result of the watchdog kicking in). + ### noapic Instruct Xen to ignore any IOAPICs that are present in the system, and --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -84,10 +84,11 @@ * dom0: The NMI is virtualised to DOM0. * ignore: The NMI error is cleared and ignored. 
*/
+static char __read_mostly opt_nmi[16] =
 #ifdef NDEBUG
-static char __read_mostly opt_nmi[10] = "dom0";
+    "dom0";
 #else
-static char __read_mostly opt_nmi[10] = "fatal";
+    "fatal";
 #endif
 string_param("nmi", opt_nmi);
 
@@ -525,6 +526,43 @@ void vcpu_show_execution_state(struct vc
     vcpu_unpause(v);
 }
 
+static cpumask_t nmi_show_state_mask;
+static bool_t opt_nmi_show_all;
+
+/* Initcall: latch the optional ",show-all" modifier of the "nmi=" option. */
+static int __init get_nmi_show_all(void)
+{
+    const char *s = strchr(opt_nmi, ',');
+
+    if ( s && !strcmp(s + 1, "show-all") )
+        opt_nmi_show_all = 1;
+
+    return 0;
+}
+presmp_initcall(get_nmi_show_all);
+
+/*
+ * NMI callback installed by fatal_trap(): report the state of a CPU whose
+ * bit is set in nmi_show_state_mask, then clear that bit.  With "show-all"
+ * the full state is dumped, otherwise just CS:RIP plus a symbol, the latter
+ * only when the CPU was not in guest mode.
+ * Returns 1 if this CPU was reported, 0 if it was not flagged.
+ */
+static int nmi_show_execution_state(const struct cpu_user_regs *regs, int cpu)
+{
+    if ( !cpumask_test_cpu(cpu, &nmi_show_state_mask) )
+        return 0;
+
+    if ( opt_nmi_show_all )
+        show_execution_state(regs);
+    else
+        printk(XENLOG_ERR "CPU%d @ %04x:%08lx (%pS)\n", cpu, regs->cs, regs->rip,
+               guest_mode(regs) ? NULL : _p(regs->rip));
+    cpumask_clear_cpu(cpu, &nmi_show_state_mask);
+
+    return 1;
+}
+
 static const char *trapstr(unsigned int trapnr)
 {
     static const char * const strings[] = {
@@ -570,6 +608,15 @@ void fatal_trap(const struct cpu_user_re
             printk("Faulting linear address: %p\n", _p(cr2));
             show_page_walk(cr2);
         }
+        else if ( trapnr == TRAP_nmi )
+        {
+            cpumask_andnot(&nmi_show_state_mask, &cpu_online_map,
+                           cpumask_of(smp_processor_id()));
+            set_nmi_callback(nmi_show_execution_state);
+            smp_send_nmi_allbutself();
+            while ( !cpumask_empty(&nmi_show_state_mask) )
+                cpu_relax();
+        }
     }
 
     panic("FATAL TRAP: vector = %d (%s)\n"