
[Xen-devel] [PATCH] x86/kexec: harden kexec path by entering with NMIs latched



On certain occasions the platform might generate NMIs during the kexec
transition: cascades of NMIs following the first one, Master Aborts
escalated following IOMMU shutdown on the transition itself, etc.
The purgatory code is currently unprepared for any sort of exception or
interrupt handling, including NMI handling.  This results in
intermittent failures to enter the kdump kernel on certain events or
certain platforms, caused by a Triple Fault in purgatory.

It's possible to start loading the kdump kernel from NMI context, with
NMIs latched.  This postpones handling of further NMIs until the new
kernel enables regular interrupts itself, which should unlatch NMIs as
soon as the first interrupt arrives.
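
As a rough illustration only (not part of the patch), the sequence this
implies could look like the sketch below, where set_nmi_gate(),
send_self_nmi() and jump_to_purgatory() are hypothetical placeholders:
the self-NMI sets the hardware NMI latch, and because the handler never
executes IRET the latch stays in effect all the way into purgatory.

    /* Hypothetical sketch; helper names are placeholders, not Xen APIs. */
    static void noreturn nmi_entry(void)
    {
        /* Runs as the NMI handler, so further NMIs are now held latched. */
        jump_to_purgatory();          /* never returns and never IRETs */
    }

    static void noreturn kexec_with_nmis_latched(void)
    {
        set_nmi_gate(nmi_entry);      /* point the NMI vector at the stub */
        send_self_nmi();              /* queue an NMI to this CPU */
        for ( ; ; )
            halt();                   /* wait for the self-NMI to arrive */
    }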

Signed-off-by: Igor Druzhinin <igor.druzhinin@xxxxxxxxxx>
---
 xen/arch/x86/crash.c         | 69 ++++++++++++++++++++++++--------------------
 xen/arch/x86/machine_kexec.c | 38 ++++++++++++++++++------
 xen/include/asm-x86/apic.h   |  1 +
 3 files changed, 67 insertions(+), 41 deletions(-)

diff --git a/xen/arch/x86/crash.c b/xen/arch/x86/crash.c
index 8d74258..e3dee4c 100644
--- a/xen/arch/x86/crash.c
+++ b/xen/arch/x86/crash.c
@@ -35,6 +35,41 @@ static cpumask_t waiting_to_crash;
 static unsigned int crashing_cpu;
 static DEFINE_PER_CPU_READ_MOSTLY(bool, crash_save_done);
 
+void crash_self_nmi(void)
+{
+    /* Poor mans self_nmi().  __stop_this_cpu() has reverted the LAPIC
+     * back to its boot state, so we are unable to rely on the regular
+     * apic_* functions, due to 'x2apic_enabled' being possibly wrong.
+     * (The likely scenario is that we have reverted from x2apic mode to
+     * xapic, at which point #GPFs will occur if we use the apic_*
+     * functions)
+     */
+    switch ( current_local_apic_mode() )
+    {
+        u32 apic_id;
+
+    case APIC_MODE_X2APIC:
+        apic_id = apic_rdmsr(APIC_ID);
+
+        apic_wrmsr(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL
+                   | ((u64)apic_id << 32));
+        break;
+
+    case APIC_MODE_XAPIC:
+        apic_id = GET_xAPIC_ID(apic_mem_read(APIC_ID));
+
+        while ( apic_mem_read(APIC_ICR) & APIC_ICR_BUSY )
+            cpu_relax();
+
+        apic_mem_write(APIC_ICR2, apic_id << 24);
+        apic_mem_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL);
+        break;
+
+    default:
+        break;
+    }
+}
+
 /* This becomes the NMI handler for non-crashing CPUs, when Xen is crashing. */
 static void noreturn do_nmi_crash(const struct cpu_user_regs *regs)
 {
@@ -71,14 +106,7 @@ static void noreturn do_nmi_crash(const struct cpu_user_regs *regs)
         cpumask_clear_cpu(cpu, &waiting_to_crash);
     }
 
-    /* Poor mans self_nmi().  __stop_this_cpu() has reverted the LAPIC
-     * back to its boot state, so we are unable to rely on the regular
-     * apic_* functions, due to 'x2apic_enabled' being possibly wrong.
-     * (The likely scenario is that we have reverted from x2apic mode to
-     * xapic, at which point #GPFs will occur if we use the apic_*
-     * functions)
-     *
-     * The ICR and APIC ID of the LAPIC are still valid even during
+    /* The ICR and APIC ID of the LAPIC are still valid even during
      * software disable (Intel SDM Vol 3, 10.4.7.2).  As a result, we
      * can deliberately queue up another NMI at the LAPIC which will not
      * be delivered as the hardware NMI latch is currently in effect.
@@ -86,30 +114,7 @@ static void noreturn do_nmi_crash(const struct cpu_user_regs *regs)
      * non-fatal MCE), the LAPIC will force us back here rather than
      * wandering back into regular Xen code.
      */
-    switch ( current_local_apic_mode() )
-    {
-        u32 apic_id;
-
-    case APIC_MODE_X2APIC:
-        apic_id = apic_rdmsr(APIC_ID);
-
-        apic_wrmsr(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL
-                   | ((u64)apic_id << 32));
-        break;
-
-    case APIC_MODE_XAPIC:
-        apic_id = GET_xAPIC_ID(apic_mem_read(APIC_ID));
-
-        while ( apic_mem_read(APIC_ICR) & APIC_ICR_BUSY )
-            cpu_relax();
-
-        apic_mem_write(APIC_ICR2, apic_id << 24);
-        apic_mem_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL);
-        break;
-
-    default:
-        break;
-    }
+    crash_self_nmi();
 
     for ( ; ; )
         halt();
diff --git a/xen/arch/x86/machine_kexec.c b/xen/arch/x86/machine_kexec.c
index b70d5a6..3bdbdd1 100644
--- a/xen/arch/x86/machine_kexec.c
+++ b/xen/arch/x86/machine_kexec.c
@@ -22,6 +22,7 @@
 #include <asm/hpet.h>
 #include <asm/page.h>
 #include <asm/machine_kexec.h>
+#include <asm/apic.h>
 
 /*
  * Add a mapping for a page to the page tables used during kexec.
@@ -145,10 +146,25 @@ void machine_reboot_kexec(struct kexec_image *image)
     BUG();
 }
 
+static struct kexec_image *kexec_image;
+static void do_kexec_crash(void)
+{
+    unsigned long reloc_flags = 0;
+
+    if ( kexec_image->arch == EM_386 )
+        reloc_flags |= KEXEC_RELOC_FLAG_COMPAT;
+
+    kexec_reloc(page_to_maddr(kexec_image->control_code_page),
+                page_to_maddr(kexec_image->aux_page),
+                kexec_image->head, kexec_image->entry_maddr, reloc_flags);
+}
+
 void machine_kexec(struct kexec_image *image)
 {
     int i;
-    unsigned long reloc_flags = 0;
+    unsigned int cpu = smp_processor_id();
+
+    kexec_image = image;
 
     /* We are about to permenantly jump out of the Xen context into the kexec
      * purgatory code.  We really dont want to be still servicing interupts.
@@ -172,16 +188,20 @@ void machine_kexec(struct kexec_image *image)
         _update_gate_addr_lower(&idt_tables[i][TRAP_machine_check], &trap_nop);
     }
 
-    /* Explicitly enable NMIs on this CPU.  Some crashdump kernels do
-     * not like running with NMIs disabled. */
-    enable_nmis();
+    /* We're entering kexec with NMIs latched, which should keep the kexec
+     * path from being diverted until the new kernel enables interrupts for
+     * itself (this doesn't cover a sudden MCE, but there's nothing we can
+     * do about that).  If an NMI does arrive at this point, it merely
+     * expedites our entry into the kexec path.
+     */
+    _update_gate_addr_lower(&idt_tables[cpu][TRAP_nmi], &do_kexec_crash);
 
-    if ( image->arch == EM_386 )
-        reloc_flags |= KEXEC_RELOC_FLAG_COMPAT;
+    /* Explicitly enable NMIs on this CPU and self-NMI into the handler. */
+    enable_nmis();
+    crash_self_nmi();
 
-    kexec_reloc(page_to_maddr(image->control_code_page),
-                page_to_maddr(image->aux_page),
-                image->head, image->entry_maddr, reloc_flags);
+    for ( ; ; )
+        halt();
 }
 
 int machine_kexec_get(xen_kexec_range_t *range)
diff --git a/xen/include/asm-x86/apic.h b/xen/include/asm-x86/apic.h
index 9d7ec93..e6dee89 100644
--- a/xen/include/asm-x86/apic.h
+++ b/xen/include/asm-x86/apic.h
@@ -184,6 +184,7 @@ extern void disable_lapic_nmi_watchdog(void);
 extern int reserve_lapic_nmi(void);
 extern void release_lapic_nmi(void);
 extern void self_nmi(void);
+extern void crash_self_nmi(void);
 extern void disable_timer_nmi_watchdog(void);
 extern void enable_timer_nmi_watchdog(void);
 extern bool nmi_watchdog_tick(const struct cpu_user_regs *regs);
-- 
2.7.4



 

