[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [PATCH v3] x86/NMI: Allow processing unknown NMIs when watchdog is enabled
On 27/08/14 17:25, Ross Lagerwall wrote: > Change NMI processing so that if watchdog=force is passed on the > command-line and the NMI is not caused by a perf counter overflow (i.e. > likely not a watchdog "tick"), the NMI is handled by the unknown NMI > handler. > > This allows injection of NMIs from IPMI controllers that don't set the > IOCK/SERR bits to trigger the unknown NMI handler rather than be > ignored. > > Signed-off-by: Ross Lagerwall <ross.lagerwall@xxxxxxxxxx> > CC: Keir Fraser <keir@xxxxxxx> > CC: Jan Beulich <jbeulich@xxxxxxxx> > --- > > Changes since V2: > Fix processing when watchdog is turned off. > > Changes since V1: > Most of the original behavior is now restored. Now, only if > watchdog=force is given on the command-line and the NMI is not a > watchdog tick (i.e. the perf counter has not overflowed) will the > unknown NMI handler be invoked. > > docs/misc/xen-command-line.markdown | 6 ++++-- > xen/arch/x86/nmi.c | 39 > +++++++++++++++++++++++++++++++++++-- > xen/arch/x86/traps.c | 7 ++++--- > xen/include/asm-x86/apic.h | 2 +- > xen/include/asm-x86/nmi.h | 3 +++ > 5 files changed, 49 insertions(+), 8 deletions(-) > > diff --git a/docs/misc/xen-command-line.markdown > b/docs/misc/xen-command-line.markdown > index a8cab59..1cbcca4 100644 > --- a/docs/misc/xen-command-line.markdown > +++ b/docs/misc/xen-command-line.markdown > @@ -1245,12 +1245,14 @@ As the BTS virtualisation is not 100% safe and > because of the nehalem quirk > don't use the vpmu flag on production systems with Intel cpus! > > ### watchdog > -> `= <boolean>` > +> `= force | <boolean>` > > > Default: `false` > > Run an NMI watchdog on each processor. If a processor is stuck for > -longer than the **watchdog\_timeout**, a panic occurs. > +longer than the **watchdog\_timeout**, a panic occurs. When `force` is > +specified, in addition to running an NMI watchdog on each processor, > +unknown NMIs will still be processed. > > ### watchdog\_timeout > > `= <integer>` > diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c > index c4427a6..873427f 100644 > --- a/xen/arch/x86/nmi.c > +++ b/xen/arch/x86/nmi.c > @@ -43,7 +43,18 @@ static DEFINE_PER_CPU(unsigned int, nmi_timer_ticks); > > /* opt_watchdog: If true, run a watchdog NMI on each processor. */ > bool_t __initdata opt_watchdog = 0; > -boolean_param("watchdog", opt_watchdog); > + > +/* watchdog_force: If true, process unknown NMIs when running the watchdog. > */ > +bool_t watchdog_force = 0; > + > +static void __init parse_watchdog(char * s) > +{ > + opt_watchdog = !!parse_bool(s); A lot of code in Xen gets the use of parse_bool() wrong. In this case, opt_watchdog will be set if garbage is passed in the parameter, which is a change in behaviour from before. I recommend: switch ( parse_bool(s) ) { case 0: break; default: if ( !strcmp(s, "force") ) watchdog_force = 1; /* fall through */ case 1: opt_watchdog = 1; } ~Andrew > + > + if ( !strcmp(s, "force") ) > + watchdog_force = 1; > +} > +custom_param("watchdog", parse_watchdog); > > /* opt_watchdog_timeout: Number of seconds to wait before panic. */ > static unsigned int opt_watchdog_timeout = 5; > @@ -82,6 +93,7 @@ int nmi_active; > #define K7_EVNTSEL_USR (1 << 16) > #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 > #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING > +#define K7_EVENT_WIDTH 32 > > #define P6_EVNTSEL0_ENABLE (1 << 22) > #define P6_EVNTSEL_INT (1 << 20) > @@ -89,10 +101,12 @@ int nmi_active; > #define P6_EVNTSEL_USR (1 << 16) > #define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 > #define CORE_EVENT_CPU_CLOCKS_NOT_HALTED 0x3c > +#define P6_EVENT_WIDTH 32 > > #define P4_ESCR_EVENT_SELECT(N) ((N)<<25) > #define P4_CCCR_OVF_PMI0 (1<<26) > #define P4_CCCR_OVF_PMI1 (1<<27) > +#define P4_CCCR_OVF (1<<31) > #define P4_CCCR_THRESHOLD(N) ((N)<<20) > #define P4_CCCR_COMPLEMENT (1<<19) > #define P4_CCCR_COMPARE (1<<18) > @@ -432,8 +446,10 @@ int __init watchdog_setup(void) > return 0; > } > > -void nmi_watchdog_tick(const struct cpu_user_regs *regs) > +/* Returns false if this was not a watchdog NMI, true otherwise */ > +bool_t nmi_watchdog_tick(const struct cpu_user_regs *regs) > { > + bool_t watchdog_tick = 1; > unsigned int sum = this_cpu(nmi_timer_ticks); > > if ( (this_cpu(last_irq_sums) == sum) && watchdog_enabled() ) > @@ -459,8 +475,15 @@ void nmi_watchdog_tick(const struct cpu_user_regs *regs) > > if ( nmi_perfctr_msr ) > { > + uint64_t msr_content; > + > + /* Work out if this is a watchdog tick by checking for overflow. */ > if ( nmi_perfctr_msr == MSR_P4_IQ_PERFCTR0 ) > { > + rdmsrl(MSR_P4_IQ_CCCR0, msr_content); > + if ( !(msr_content & P4_CCCR_OVF) ) > + watchdog_tick = 0; > + > /* > * P4 quirks: > * - An overflown perfctr will assert its interrupt > @@ -473,14 +496,26 @@ void nmi_watchdog_tick(const struct cpu_user_regs *regs) > } > else if ( nmi_perfctr_msr == MSR_P6_PERFCTR0 ) > { > + rdmsrl(MSR_P6_PERFCTR0, msr_content); > + if ( msr_content & (1ULL << P6_EVENT_WIDTH) ) > + watchdog_tick = 0; > + > /* > * Only P6 based Pentium M need to re-unmask the apic vector but > * it doesn't hurt other P6 variants. > */ > apic_write(APIC_LVTPC, APIC_DM_NMI); > } > + else if ( nmi_perfctr_msr == MSR_K7_PERFCTR0 ) > + { > + rdmsrl(MSR_K7_PERFCTR0, msr_content); > + if ( msr_content & (1ULL << K7_EVENT_WIDTH) ) > + watchdog_tick = 0; > + } > write_watchdog_counter(NULL); > } > + > + return watchdog_tick; > } > > /* > diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c > index 71be2ae..7f5306f 100644 > --- a/xen/arch/x86/traps.c > +++ b/xen/arch/x86/traps.c > @@ -3306,14 +3306,15 @@ void do_nmi(const struct cpu_user_regs *regs) > { > unsigned int cpu = smp_processor_id(); > unsigned char reason; > + bool_t handle_unknown = 0; > > ++nmi_count(cpu); > > if ( nmi_callback(regs, cpu) ) > return; > > - if ( nmi_watchdog ) > - nmi_watchdog_tick(regs); > + if ( !nmi_watchdog || (!nmi_watchdog_tick(regs) && watchdog_force) ) > + handle_unknown = 1; > > /* Only the BSP gets external NMIs from the system. */ > if ( cpu == 0 ) > @@ -3323,7 +3324,7 @@ void do_nmi(const struct cpu_user_regs *regs) > pci_serr_error(regs); > if ( reason & 0x40 ) > io_check_error(regs); > - if ( !(reason & 0xc0) && !nmi_watchdog ) > + if ( !(reason & 0xc0) && handle_unknown ) > unknown_nmi_error(regs, reason); > } > } > diff --git a/xen/include/asm-x86/apic.h b/xen/include/asm-x86/apic.h > index 5d7623f..6697245 100644 > --- a/xen/include/asm-x86/apic.h > +++ b/xen/include/asm-x86/apic.h > @@ -206,7 +206,7 @@ extern void release_lapic_nmi(void); > extern void self_nmi(void); > extern void disable_timer_nmi_watchdog(void); > extern void enable_timer_nmi_watchdog(void); > -extern void nmi_watchdog_tick (const struct cpu_user_regs *regs); > +extern bool_t nmi_watchdog_tick (const struct cpu_user_regs *regs); > extern int APIC_init_uniprocessor (void); > extern void disable_APIC_timer(void); > extern void enable_APIC_timer(void); > diff --git a/xen/include/asm-x86/nmi.h b/xen/include/asm-x86/nmi.h > index 4504fe1..2c92db9 100644 > --- a/xen/include/asm-x86/nmi.h > +++ b/xen/include/asm-x86/nmi.h > @@ -8,6 +8,9 @@ struct cpu_user_regs; > > /* Watchdog boolean from the command line */ > extern bool_t opt_watchdog; > + > +/* Watchdog force parameter from the command line */ > +extern bool_t watchdog_force; > > typedef int (*nmi_callback_t)(const struct cpu_user_regs *regs, int cpu); > _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |