
Re: [Xen-devel] [PATCH V5] x86/kexec: Change NMI and MCE handling on kexec path



On 12/12/2012 12:50, "Jan Beulich" <JBeulich@xxxxxxxx> wrote:

>>   * Explicitly enable NMIs.
>> 
>> Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
> 
> Acked-by: Jan Beulich <jbeulich@xxxxxxxx>
> 
> (but for committing I'd want at least one other knowledgeable
> person's ack)

It looks okay to me as well. Should I wait for an Ack from Tim too?

 -- Keir

>> --
>> 
>> Changes since v4:
>>  * Update stale comments, correct spelling, and adjust coding style.
>> 
>> Changes since v3:
>>  * Added IDT entry helpers to safely update NMI/MCE entries.
>> 
>> Changes since v2:
>> 
>>  * Rework the alteration of the stack tables to completely remove the
>>    possibility of a PV domain getting very lucky with the "NMI or MCE in
>>    a 1 instruction race window on sysret" and managing to execute code
>>    in the hypervisor context.
>>  * Make use of set_ist() from the previous patch in the series to avoid
>>    open-coding the IST manipulation.
>> 
>> Changes since v1:
>>  * Reintroduce atomic_dec(&waiting_for_crash_ipi); which got missed
>>    during the original refactoring.
>>  * Fold trap_nop into the middle of enable_nmis to reuse the iret.
>>  * Expand comments in areas as per Tim's suggestions.
>> 
>> diff -r ef8c1b607b10 -r 96b068439bc4 xen/arch/x86/crash.c
>> --- a/xen/arch/x86/crash.c
>> +++ b/xen/arch/x86/crash.c
>> @@ -32,41 +32,127 @@
>>  
>>  static atomic_t waiting_for_crash_ipi;
>>  static unsigned int crashing_cpu;
>> +static DEFINE_PER_CPU_READ_MOSTLY(bool_t, crash_save_done);
>>  
>> -static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu)
>> +/* This becomes the NMI handler for non-crashing CPUs, when Xen is crashing. */
>> +void __attribute__((noreturn)) do_nmi_crash(struct cpu_user_regs *regs)
>>  {
>> -    /* Don't do anything if this handler is invoked on crashing cpu.
>> -     * Otherwise, system will completely hang. Crashing cpu can get
>> -     * an NMI if system was initially booted with nmi_watchdog parameter.
>> +    int cpu = smp_processor_id();
>> +
>> +    /* nmi_shootdown_cpus() should ensure that this assertion is correct. */
>> +    ASSERT( cpu != crashing_cpu );
>> +
>> +    /* Save crash information and shut down CPU.  Attempt only once. */
>> +    if ( ! this_cpu(crash_save_done) )
>> +    {
>> +        /* Disable the interrupt stack table for the MCE handler.  This
>> +         * prevents race conditions between clearing MCIP and receiving a
>> +         * new MCE, during which the exception frame would be clobbered
>> +         * and the MCE handler fall into an infinite loop.  We are soon
>> +         * going to disable the NMI watchdog, so the loop would not be
>> +         * caught.
>> +         *
>> +         * We do not need to change the NMI IST, as the nmi_crash
>> +         * handler is immune to corrupt exception frames, by virtue of
>> +         * being designed never to return.
>> +         *
>> +         * This update is safe from a security point of view, as this
>> +         * pcpu is never going to try to sysret back to a PV vcpu.
>> +         */
>> +        set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
>> +
>> +        kexec_crash_save_cpu();
>> +        __stop_this_cpu();
>> +
>> +        this_cpu(crash_save_done) = 1;
>> +        atomic_dec(&waiting_for_crash_ipi);
>> +    }
>> +
>> +    /* Poor man's self_nmi().  __stop_this_cpu() has reverted the LAPIC
>> +     * back to its boot state, so we are unable to rely on the regular
>> +     * apic_* functions, due to 'x2apic_enabled' being possibly wrong.
>> +     * (The likely scenario is that we have reverted from x2apic mode to
>> +     * xapic, at which point #GPFs will occur if we use the apic_*
>> +     * functions)
>> +     *
>> +     * The ICR and APIC ID of the LAPIC are still valid even during
>> +     * software disable (Intel SDM Vol 3, 10.4.7.2).  As a result, we
>> +     * can deliberately queue up another NMI at the LAPIC which will not
>> +     * be delivered as the hardware NMI latch is currently in effect.
>> +     * This means that if NMIs become unlatched (e.g. following a
>> +     * non-fatal MCE), the LAPIC will force us back here rather than
>> +     * wandering back into regular Xen code.
>>       */
>> -    if ( cpu == crashing_cpu )
>> -        return 1;
>> -    local_irq_disable();
>> +    switch ( current_local_apic_mode() )
>> +    {
>> +        u32 apic_id;
>>  
>> -    kexec_crash_save_cpu();
>> +    case APIC_MODE_X2APIC:
>> +        apic_id = apic_rdmsr(APIC_ID);
>>  
>> -    __stop_this_cpu();
>> +        apic_wrmsr(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL | ((u64)apic_id << 32));
>> +        break;
>>  
>> -    atomic_dec(&waiting_for_crash_ipi);
>> +    case APIC_MODE_XAPIC:
>> +        apic_id = GET_xAPIC_ID(apic_mem_read(APIC_ID));
>> +
>> +        while ( apic_mem_read(APIC_ICR) & APIC_ICR_BUSY )
>> +            cpu_relax();
>> +
>> +        apic_mem_write(APIC_ICR2, apic_id << 24);
>> +        apic_mem_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL);
>> +        break;
>> +
>> +    default:
>> +        break;
>> +    }
>>  
>>      for ( ; ; )
>>          halt();
>> -
>> -    return 1;
>>  }
>>  
>>  static void nmi_shootdown_cpus(void)
>>  {
>>      unsigned long msecs;
>> +    int i, cpu = smp_processor_id();
>>  
>>      local_irq_disable();
>>  
>> -    crashing_cpu = smp_processor_id();
>> +    crashing_cpu = cpu;
>>      local_irq_count(crashing_cpu) = 0;
>>  
>>      atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
>> -    /* Would it be better to replace the trap vector here? */
>> -    set_nmi_callback(crash_nmi_callback);
>> +
>> +    /* Change NMI trap handlers.  Non-crashing pcpus get nmi_crash which
>> +     * invokes do_nmi_crash (above), which causes them to write state and
>> +     * fall into a loop.  The crashing pcpu gets the nop handler to
>> +     * cause it to return to this function ASAP.
>> +     */
>> +    for ( i = 0; i < nr_cpu_ids; ++i )
>> +        if ( idt_tables[i] )
>> +        {
>> +
>> +            if ( i == cpu )
>> +            {
>> +                /* Disable the interrupt stack tables for this cpu's MCE
>> +                 * and NMI handlers, and alter the NMI handler to have
>> +                 * no operation.  Disabling the stack tables prevents
>> +                 * stack corruption race conditions, while changing the
>> +                 * handler helps prevent cascading faults; we are
>> +                 * certainly going to crash by this point.
>> +                 *
>> +                 * This update is safe from a security point of view, as
>> +                 * this pcpu is never going to try to sysret back to a
>> +                 * PV vcpu.
>> +                 */
>> +                _set_gate_lower(&idt_tables[i][TRAP_nmi], 14, 0, &trap_nop);
>> +                set_ist(&idt_tables[i][TRAP_machine_check], IST_NONE);
>> +            }
>> +            else
>> +                /* Do not update stack table for other pcpus. */
>> +                _update_gate_addr_lower(&idt_tables[i][TRAP_nmi], &nmi_crash);
>> +        }
>> +
>>      /* Ensure the new callback function is set before sending out the NMI. */
>>      wmb();
>>  
>> diff -r ef8c1b607b10 -r 96b068439bc4 xen/arch/x86/machine_kexec.c
>> --- a/xen/arch/x86/machine_kexec.c
>> +++ b/xen/arch/x86/machine_kexec.c
>> @@ -81,12 +81,31 @@ void machine_kexec(xen_kexec_image_t *im
>>          .base = (unsigned long)(boot_cpu_gdt_table - FIRST_RESERVED_GDT_ENTRY),
>>          .limit = LAST_RESERVED_GDT_BYTE
>>      };
>> +    int i;
>>  
>>      /* We are about to permanently jump out of the Xen context into the kexec
>>       * purgatory code.  We really don't want to be still servicing interrupts.
>>       */
>>      local_irq_disable();
>>  
>> +    /* Now regular interrupts are disabled, we need to reduce the impact
>> +     * of interrupts not disabled by 'cli'.
>> +     *
>> +     * The NMI handlers have already been set up by nmi_shootdown_cpus().  All
>> +     * pcpus other than us have the nmi_crash handler, while we have the nop
>> +     * handler.
>> +     *
>> +     * The MCE handlers touch extensive areas of Xen code and data.  At this
>> +     * point, there is nothing we can usefully do, so set the nop handler.
>> +     */
>> +    for ( i = 0; i < nr_cpu_ids; ++i )
>> +        if ( idt_tables[i] )
>> +            _update_gate_addr_lower(&idt_tables[i][TRAP_machine_check], &trap_nop);
>> +
>> +    /* Explicitly enable NMIs on this CPU.  Some crashdump kernels do
>> +     * not like running with NMIs disabled. */
>> +    enable_nmis();
>> +
>>      /*
>>       * compat_machine_kexec() returns to idle pagetables, which requires us
>>       * to be running on a static GDT mapping (idle pagetables have no GDT
>> diff -r ef8c1b607b10 -r 96b068439bc4 xen/arch/x86/x86_64/entry.S
>> --- a/xen/arch/x86/x86_64/entry.S
>> +++ b/xen/arch/x86/x86_64/entry.S
>> @@ -635,11 +635,45 @@ ENTRY(nmi)
>>          movl  $TRAP_nmi,4(%rsp)
>>          jmp   handle_ist_exception
>>  
>> +ENTRY(nmi_crash)
>> +        cli
>> +        pushq $0
>> +        movl $TRAP_nmi,4(%rsp)
>> +        SAVE_ALL
>> +        movq %rsp,%rdi
>> +        callq do_nmi_crash /* Does not return */
>> +        ud2
>> +
>>  ENTRY(machine_check)
>>          pushq $0
>>          movl  $TRAP_machine_check,4(%rsp)
>>          jmp   handle_ist_exception
>>  
>> +/* Enable NMIs.  No special register assumptions. Only %rax is not preserved. */
>> +ENTRY(enable_nmis)
>> +        movq  %rsp, %rax /* Grab RSP before pushing */
>> +
>> +        /* Set up stack frame */
>> +        pushq $0               /* SS */
>> +        pushq %rax             /* RSP */
>> +        pushfq                 /* RFLAGS */
>> +        pushq $__HYPERVISOR_CS /* CS */
>> +        leaq  1f(%rip),%rax
>> +        pushq %rax             /* RIP */
>> +
>> +        iretq /* Disable the hardware NMI latch */
>> +1:
>> +        retq
>> +
>> +/* No op trap handler.  Required for kexec crash path.  This is not
>> + * declared with the ENTRY() macro to avoid wasted alignment space.
>> + */
>> +.globl trap_nop
>> +trap_nop:
>> +        iretq
>> +
>> +
>> +
>>  .section .rodata, "a", @progbits
>>  
>>  ENTRY(exception_table)
>> diff -r ef8c1b607b10 -r 96b068439bc4 xen/include/asm-x86/desc.h
>> --- a/xen/include/asm-x86/desc.h
>> +++ b/xen/include/asm-x86/desc.h
>> @@ -106,6 +106,21 @@ typedef struct {
>>      u64 a, b;
>>  } idt_entry_t;
>>  
>> +/* Write the lower 64 bits of an IDT Entry. This relies on the upper 32
>> + * bits of the address not changing, which is a safe assumption as all
>> + * functions we are likely to load will live inside the 1GB
>> + * code/data/bss address range.
>> + *
>> + * Ideally, we would use cmpxchg16b, but this is not supported on some
>> + * old AMD 64bit capable processors, and has no safe equivalent.
>> + */
>> +static inline void _write_gate_lower(volatile idt_entry_t *gate,
>> +                                     const idt_entry_t *new)
>> +{
>> +    ASSERT(gate->b == new->b);
>> +    gate->a = new->a;
>> +}
>> +
>>  #define _set_gate(gate_addr,type,dpl,addr)               \
>>  do {                                                     \
>>      (gate_addr)->a = 0;                                  \
>> @@ -122,6 +137,36 @@ do {
>>          (1UL << 47);                                     \
>>  } while (0)
>>  
>> +static inline void _set_gate_lower(idt_entry_t *gate, unsigned long type,
>> +                                   unsigned long dpl, void *addr)
>> +{
>> +    idt_entry_t idte;
>> +    idte.b = gate->b;
>> +    idte.a =
>> +        (((unsigned long)(addr) & 0xFFFF0000UL) << 32) |
>> +        ((unsigned long)(dpl) << 45) |
>> +        ((unsigned long)(type) << 40) |
>> +        ((unsigned long)(addr) & 0xFFFFUL) |
>> +        ((unsigned long)__HYPERVISOR_CS64 << 16) |
>> +        (1UL << 47);
>> +    _write_gate_lower(gate, &idte);
>> +}
>> +
>> +/* Update the lower half handler of an IDT Entry, without changing any
>> + * other configuration. */
>> +static inline void _update_gate_addr_lower(idt_entry_t *gate, void *addr)
>> +{
>> +    idt_entry_t idte;
>> +    idte.a = gate->a;
>> +
>> +    idte.b = ((unsigned long)(addr) >> 32);
>> +    idte.a &= 0x0000FFFFFFFF0000ULL;
>> +    idte.a |= (((unsigned long)(addr) & 0xFFFF0000UL) << 32) |
>> +        ((unsigned long)(addr) & 0xFFFFUL);
>> +
>> +    _write_gate_lower(gate, &idte);
>> +}
>> +
>>  #define _set_tssldt_desc(desc,addr,limit,type)           \
>>  do {                                                     \
>>      (desc)[0].b = (desc)[1].b = 0;                       \
>> diff -r ef8c1b607b10 -r 96b068439bc4 xen/include/asm-x86/processor.h
>> --- a/xen/include/asm-x86/processor.h
>> +++ b/xen/include/asm-x86/processor.h
>> @@ -527,6 +527,7 @@ void do_ ## _name(struct cpu_user_regs *
>>  DECLARE_TRAP_HANDLER(divide_error);
>>  DECLARE_TRAP_HANDLER(debug);
>>  DECLARE_TRAP_HANDLER(nmi);
>> +DECLARE_TRAP_HANDLER(nmi_crash);
>>  DECLARE_TRAP_HANDLER(int3);
>>  DECLARE_TRAP_HANDLER(overflow);
>>  DECLARE_TRAP_HANDLER(bounds);
>> @@ -545,6 +546,9 @@ DECLARE_TRAP_HANDLER(alignment_check);
>>  DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
>>  #undef DECLARE_TRAP_HANDLER
>>  
>> +void trap_nop(void);
>> +void enable_nmis(void);
>> +
>>  void syscall_enter(void);
>>  void sysenter_entry(void);
>>  void sysenter_eflags_saved(void);
> 
> 



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

