[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH 2/3] x86: Port the basic alternative mechanism from Linux to Xen



On 26/05/2014 08:27, Feng Wu wrote:
> This patch ports the basic alternative mechanism from Linux to Xen.
> With this mechanism, we can patch code based on the CPU features.
>
> Signed-off-by: Feng Wu <feng.wu@xxxxxxxxx>
> ---
>  xen/arch/x86/Makefile             |   1 +
>  xen/arch/x86/alternative.c        | 223 
> ++++++++++++++++++++++++++++++++++++++
>  xen/arch/x86/setup.c              |   5 +
>  xen/arch/x86/traps.c              |  15 +++
>  xen/arch/x86/xen.lds.S            |  22 ++++
>  xen/include/asm-x86/alternative.h |  77 +++++++++++++
>  xen/include/asm-x86/traps.h       |   2 +
>  7 files changed, 345 insertions(+)
>  create mode 100644 xen/arch/x86/alternative.c
>  create mode 100644 xen/include/asm-x86/alternative.h
>
> diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
> index d502bdf..3734884 100644
> --- a/xen/arch/x86/Makefile
> +++ b/xen/arch/x86/Makefile
> @@ -58,6 +58,7 @@ obj-y += crash.o
>  obj-y += tboot.o
>  obj-y += hpet.o
>  obj-y += xstate.o
> +obj-y += alternative.o
>  
>  obj-$(crash_debug) += gdbstub.o
>  
> diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c
> new file mode 100644
> index 0000000..af8864e
> --- /dev/null
> +++ b/xen/arch/x86/alternative.c
> @@ -0,0 +1,223 @@
> +/******************************************************************************
> + * alternative.c
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
> + */
> +
> +#include <xen/types.h>
> +#include <asm/processor.h>
> +#include <asm/nops.h>
> +#include <asm/alternative.h>
> +#include <xen/init.h>
> +#include <asm/system.h>
> +#include <asm/traps.h>
> +
> +#define MAX_PATCH_LEN (255-1)

Where does this number come from?  An individual instruction can't be
longer than 15 bytes.

> +
> +extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
> +
> +#ifdef K8_NOP1
> +static const unsigned char k8nops[] = {
> +    K8_NOP1,
> +    K8_NOP2,
> +    K8_NOP3,
> +    K8_NOP4,
> +    K8_NOP5,
> +    K8_NOP6,
> +    K8_NOP7,
> +    K8_NOP8
> +};
> +static const unsigned char * const k8_nops[ASM_NOP_MAX+1] = {
> +    NULL,
> +    k8nops,
> +    k8nops + 1,
> +    k8nops + 1 + 2,
> +    k8nops + 1 + 2 + 3,
> +    k8nops + 1 + 2 + 3 + 4,
> +    k8nops + 1 + 2 + 3 + 4 + 5,
> +    k8nops + 1 + 2 + 3 + 4 + 5 + 6,
> +    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7
> +};
> +#endif
> +
> +#ifdef P6_NOP1
> +static const unsigned char p6nops[] = {
> +    P6_NOP1,
> +    P6_NOP2,
> +    P6_NOP3,
> +    P6_NOP4,
> +    P6_NOP5,
> +    P6_NOP6,
> +    P6_NOP7,
> +    P6_NOP8
> +};
> +static const unsigned char * const p6_nops[ASM_NOP_MAX+1] = {
> +    NULL,
> +    p6nops,
> +    p6nops + 1,
> +    p6nops + 1 + 2,
> +    p6nops + 1 + 2 + 3,
> +    p6nops + 1 + 2 + 3 + 4,
> +    p6nops + 1 + 2 + 3 + 4 + 5,
> +    p6nops + 1 + 2 + 3 + 4 + 5 + 6,
> +    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7
> +};
> +#endif

What is the purpose of these pairs of tables?  Only the underscore
variant is used later on.

> +
> +const unsigned char * const *ideal_nops = p6_nops;
> +
> +void __init arch_init_ideal_nops(void)
> +{
> +    switch (boot_cpu_data.x86_vendor)
> +    {
> +    case X86_VENDOR_INTEL:
> +        /*
> +         * Due to a decoder implementation quirk, some
> +         * specific Intel CPUs actually perform better with
> +         * the "k8_nops" than with the SDM-recommended NOPs.
> +         */
> +        if ( boot_cpu_data.x86 == 6 &&
> +            boot_cpu_data.x86_model >= 0x0f &&
> +            boot_cpu_data.x86_model != 0x1c &&
> +            boot_cpu_data.x86_model != 0x26 &&
> +            boot_cpu_data.x86_model != 0x27 &&
> +            boot_cpu_data.x86_model < 0x30 )
> +            ideal_nops = k8_nops;
> +        else
> +            ideal_nops = p6_nops;
> +        break;
> +    default:
> +            ideal_nops = k8_nops;
> +    }
> +}

Surely, given the statement in the middle, the better default for
ideal_nops would be k8_nops, with a simple if in
arch_init_ideal_nops().  Also, it could quite easily be static and called
from "alternative_instructions()", allowing it not to be exported, and
for ideal_nops to also be static and __init.

> +
> +/* Use this to add nops to a buffer, then text_poke the whole buffer. */
> +static void __init add_nops(void *insns, unsigned int len)
> +{
> +    while ( len > 0 )
> +    {
> +        unsigned int noplen = len;
> +        if ( noplen > ASM_NOP_MAX )
> +                noplen = ASM_NOP_MAX;
> +        memcpy(insns, ideal_nops[noplen], noplen);
> +        insns += noplen;
> +        len -= noplen;
> +    }
> +}
> +
> +/*
> + * text_poke_early - Update instructions on a live kernel at boot time
> + * @addr: address to modify
> + * @opcode: source of the copy
> + * @len: length to copy
> + *
> + * When you use this code to patch more than one byte of an instruction
> + * you need to make sure that other CPUs cannot execute this code in 
> parallel.
> + * Also no thread must be currently preempted in the middle of these
> + * instructions. And on the local CPU you need to be protected again NMI or 
> MCE
> + * handlers seeing an inconsistent instruction while you patch.
> + */
> +static void *__init text_poke_early(void *addr, const void *opcode, size_t 
> len)
> +{
> +    int tmp;
> +    unsigned long flags;

newline, as per style

> +    local_irq_save(flags);
> +    memcpy(addr, opcode, len);
> +    /*
> +     * CPUID is a barrier to speculative execution.
> +     * Prefetched instructions are automatically
> +     * invalidated when modified.
> +     */
> +    asm volatile("cpuid"
> +                 : "=a" (tmp)
> +                 : "0" (1)
> +                 : "ebx", "ecx", "edx", "memory");

sync_core() from processor.h

> +
> +    local_irq_restore(flags);
> +    /*
> +     * Could also do a CLFLUSH here to speed up CPU recovery; but
> +     * that causes hangs on some VIA CPUs.
> +     */
> +    return addr;
> +}
> +
> +/*
> + * Replace instructions with better alternatives for this CPU type.
> + * This runs before SMP is initialized to avoid SMP problems with
> + * self modifying code. This implies that asymmetric systems where
> + * APs have less capabilities than the boot processor are not handled.
> + * Tough. Make sure you disable such features by hand.
> + */
> +
> +static void __init apply_alternatives(struct alt_instr *start, struct 
> alt_instr *end)
> +{
> +    struct alt_instr *a;
> +    u8 *instr, *replacement;
> +    u8 insnbuf[MAX_PATCH_LEN];
> +
> +    printk("%s: alt table %p -> %p\n", __func__, start, end);
> +    /*
> +     * The scan order should be from start to end. A later scanned
> +     * alternative code can overwrite a previous scanned alternative code.
> +     * Some kernel functions (e.g. memcpy, memset, etc) use this order to
> +     * patch code.
> +     *
> +     * So be careful if you want to change the scan order to any other
> +     * order.
> +     */
> +    for ( a = start; a < end; a++ )
> +    {
> +        instr = (u8 *)&a->instr_offset + a->instr_offset;
> +        replacement = (u8 *)&a->repl_offset + a->repl_offset;
> +        BUG_ON(a->replacementlen > a->instrlen);
> +        BUG_ON(a->instrlen > sizeof(insnbuf));
> +        BUG_ON(a->cpuid >= NCAPINTS * 32);
> +        if ( !boot_cpu_has(a->cpuid) )
> +            continue;
> +
> +        memcpy(insnbuf, replacement, a->replacementlen);
> +
> +        /* 0xe8 is a relative jump; fix the offset. */
> +        if ( *insnbuf == 0xe8 && a->replacementlen == 5 )
> +            *(s32 *)(insnbuf + 1) += replacement - instr;

0xe8 is the call instruction.  Calling it "a relative jump" is slightly
misleading.  Why does it need special treatment here?

> +
> +        add_nops(insnbuf + a->replacementlen,
> +                 a->instrlen - a->replacementlen);
> +        text_poke_early(instr, insnbuf, a->instrlen);
> +    }
> +}
> +
> +void __init alternative_instructions(void)

This function would be more descriptive as
"patch_alternative_instructions" or so.

> +{
> +    /*
> +     * The patching is not fully atomic, so try to avoid local interruptions
> +     * that might execute the to be patched code.
> +     * Other CPUs are not running.
> +     */
> +    stop_nmi();

This stopping and starting of NMIs can be done with set_nmi_callback(),
providing a function which returns 1.

It might be a good idea to tweak set_nmi_callback() to return the old
callback, so it can be restored later, rather than potentially
clobbering the virq nmi callback.

> +    /*
> +     * Don't stop machine check exceptions while patching.
> +     * MCEs only happen when something got corrupted and in this
> +     * case we must do something about the corruption.
> +     * Ignoring it is worse than a unlikely patching race.
> +     * Also machine checks tend to be broadcast and if one CPU
> +     * goes into machine check the others follow quickly, so we don't
> +     * expect a machine check to cause undue problems during to code
> +     * patching.
> +     */
> +
> +    apply_alternatives(__alt_instructions, __alt_instructions_end);
> +    restart_nmi();
> +}

Can you put a local-variable block in here please?

> diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
> index 508649d..7635868 100644
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -48,6 +48,8 @@
>  #include <asm/setup.h>
>  #include <xen/cpu.h>
>  #include <asm/nmi.h>
> +#include <asm/nops.h>
> +#include <asm/alternative.h>
>  
>  /* opt_nosmp: If true, secondary processors are ignored. */
>  static bool_t __initdata opt_nosmp;
> @@ -1288,6 +1290,9 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>      if ( cpu_has_fsgsbase )
>          set_in_cr4(X86_CR4_FSGSBASE);
>  
> +    arch_init_ideal_nops();
> +    alternative_instructions();
> +
>      local_irq_enable();
>  
>      pt_pci_init();
> diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
> index 1722912..4108c8b 100644
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -111,6 +111,8 @@ integer_param("debug_stack_lines", debug_stack_lines);
>  static bool_t __devinitdata opt_ler;
>  boolean_param("ler", opt_ler);
>  
> +static int ignore_nmis;
> +
>  #define stack_words_per_line 4
>  #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
>  
> @@ -3303,6 +3305,9 @@ void do_nmi(struct cpu_user_regs *regs)
>  
>      ++nmi_count(cpu);
>  
> +    if ( ignore_nmis )
> +        return;
> +
>      if ( nmi_callback(regs, cpu) )
>          return;
>  
> @@ -3322,6 +3327,16 @@ void do_nmi(struct cpu_user_regs *regs)
>      }
>  }
>  
> +void stop_nmi(void)
> +{
> +   ignore_nmis++;

These aren't needed, but if they were, they should be __init.

> +}
> +
> +void restart_nmi(void)
> +{
> +   ignore_nmis--;
> +}
> +
>  void set_nmi_callback(nmi_callback_t callback)
>  {
>      nmi_callback = callback;
> diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
> index 17db361..64bdb18 100644
> --- a/xen/arch/x86/xen.lds.S
> +++ b/xen/arch/x86/xen.lds.S
> @@ -147,6 +147,28 @@ SECTIONS
>    . = ALIGN(STACK_SIZE);
>    __init_end = .;
>  
> +  /*
> +   * struct alt_inst entries. From the header (alternative.h):
> +   * "Alternative instructions for different CPU types or capabilities"
> +   * Think locking instructions on spinlocks.
> +   */
> +  . = ALIGN(8);
> +  .altinstructions : {
> +       __alt_instructions = .;
> +       *(.altinstructions)
> +       __alt_instructions_end = .;
> +  }
> +
> +  /*
> +   * And here are the replacement instructions. The linker sticks
> +   * them as binary blobs. The .altinstructions has enough data to
> +   * get the address and the length of them to patch the kernel safely.
> +   */
> +  .altinstr_replacement : {
> +       *(.altinstr_replacement)
> +  }
> +  . = ALIGN(STACK_SIZE);
> +

All patching is done in __start_xen(), before APs are brought up.  These
regions should be inside the .init section so they get reclaimed after boot.

~Andrew

>    .bss : {                     /* BSS */
>         __bss_start = .;
>         *(.bss.stack_aligned)
> diff --git a/xen/include/asm-x86/alternative.h 
> b/xen/include/asm-x86/alternative.h
> new file mode 100644
> index 0000000..18c0e85
> --- /dev/null
> +++ b/xen/include/asm-x86/alternative.h
> @@ -0,0 +1,77 @@
> +#ifndef __X86_ALTERNATIVE_H__
> +#define __X86_ALTERNATIVE_H__
> +
> +#ifdef __ASSEMBLY__
> +.macro altinstruction_entry orig alt feature orig_len alt_len
> +        .long \orig - .
> +        .long \alt - .
> +        .word \feature
> +        .byte \orig_len
> +        .byte \alt_len
> +.endm
> +#else
> +#include <xen/types.h>
> +
> +struct alt_instr {
> +    s32 instr_offset;       /* original instruction */
> +    s32 repl_offset;        /* offset to replacement instruction */
> +    u16 cpuid;              /* cpuid bit set for replacement */
> +    u8  instrlen;           /* length of original instruction */
> +    u8  replacementlen;     /* length of new instruction, <= instrlen */
> +};
> +
> +extern void alternative_instructions(void);
> +
> +#define OLDINSTR(oldinstr)      "661:\n\t" oldinstr "\n662:\n"
> +
> +#define b_replacement(number)   "663"#number
> +#define e_replacement(number)   "664"#number
> +
> +#define alt_slen "662b-661b"
> +#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
> +
> +#define ALTINSTR_ENTRY(feature, number)                                      
>  \
> +        " .long 661b - .\n"                             /* label           
> */ \
> +        " .long " b_replacement(number)"f - .\n"        /* new instruction 
> */ \
> +        " .word " __stringify(feature) "\n"             /* feature bit     
> */ \
> +        " .byte " alt_slen "\n"                         /* source len      
> */ \
> +        " .byte " alt_rlen(number) "\n"                 /* replacement len */
> +
> +#define DISCARD_ENTRY(number)                           /* rlen <= slen */   
>  \
> +        " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
> +
> +#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */    
>  \
> +        b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) 
> ":\n\t"
> +
> +/* alternative assembly primitive: */
> +#define ALTERNATIVE(oldinstr, newinstr, feature)                        \
> +        OLDINSTR(oldinstr)                                              \
> +        ".pushsection .altinstructions,\"a\"\n"                         \
> +        ALTINSTR_ENTRY(feature, 1)                                      \
> +        ".popsection\n"                                                 \
> +        ".pushsection .discard,\"aw\",@progbits\n"                      \
> +        DISCARD_ENTRY(1)                                                \
> +        ".popsection\n"                                                 \
> +        ".pushsection .altinstr_replacement, \"ax\"\n"                  \
> +        ALTINSTR_REPLACEMENT(newinstr, feature, 1)                      \
> +        ".popsection"
> +
> +/*
> + * Alternative instructions for different CPU types or capabilities.
> + *
> + * This allows to use optimized instructions even on generic binary
> + * kernels.
> + *
> + * length of oldinstr must be longer or equal the length of newinstr
> + * It can be padded with nops as needed.
> + *
> + * For non barrier like inlines please define new variants
> + * without volatile and memory clobber.
> + */
> +#define alternative(oldinstr, newinstr, feature)                        \
> +        asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : 
> "memory")
> +
> +#endif  /*  __ASSEMBLY__  */
> +
> +#endif /* __X86_ALTERNATIVE_H__ */
> +
> diff --git a/xen/include/asm-x86/traps.h b/xen/include/asm-x86/traps.h
> index 556b133..9b1dc2c 100644
> --- a/xen/include/asm-x86/traps.h
> +++ b/xen/include/asm-x86/traps.h
> @@ -54,4 +54,6 @@ uint32_t guest_io_read(unsigned int port, unsigned int 
> bytes,
>  void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
>                      struct vcpu *, struct cpu_user_regs *);
>  
> +extern void stop_nmi(void);
> +extern void restart_nmi(void);
>  #endif /* ASM_TRAP_H */


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.