Re: [Xen-devel] [PATCH 2/3] x86: Port the basic alternative mechanism from Linux to Xen
> -----Original Message----- > From: Andrew Cooper [mailto:amc96@xxxxxxxxxxxxxxxx] On Behalf Of > Andrew Cooper > Sent: Monday, May 26, 2014 11:41 PM > To: Wu, Feng; xen-devel@xxxxxxxxxxxxx > Cc: keir.xen@xxxxxxxxx; JBeulich@xxxxxxxx > Subject: Re: [Xen-devel] [PATCH 2/3] x86: Port the basic alternative mechanism > from Linux to Xen > > On 26/05/2014 08:27, Feng Wu wrote: > > This patch ports the basic alternative mechanism from Linux to Xen. > > With this mechanism, we can patch code based on the CPU features. > > > > Signed-off-by: Feng Wu <feng.wu@xxxxxxxxx> > > --- > > xen/arch/x86/Makefile | 1 + > > xen/arch/x86/alternative.c | 223 > ++++++++++++++++++++++++++++++++++++++ > > xen/arch/x86/setup.c | 5 + > > xen/arch/x86/traps.c | 15 +++ > > xen/arch/x86/xen.lds.S | 22 ++++ > > xen/include/asm-x86/alternative.h | 77 +++++++++++++ > > xen/include/asm-x86/traps.h | 2 + > > 7 files changed, 345 insertions(+) > > create mode 100644 xen/arch/x86/alternative.c > > create mode 100644 xen/include/asm-x86/alternative.h > > > > diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile > > index d502bdf..3734884 100644 > > --- a/xen/arch/x86/Makefile > > +++ b/xen/arch/x86/Makefile > > @@ -58,6 +58,7 @@ obj-y += crash.o > > obj-y += tboot.o > > obj-y += hpet.o > > obj-y += xstate.o > > +obj-y += alternative.o > > > > obj-$(crash_debug) += gdbstub.o > > > > diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c > > new file mode 100644 > > index 0000000..af8864e > > --- /dev/null > > +++ b/xen/arch/x86/alternative.c > > @@ -0,0 +1,223 @@ > > > +/************************************************************** > **************** > > + * alternative.c > > + * > > + * This program is free software; you can redistribute it and/or modify > > + * it under the terms of the GNU General Public License as published by > > + * the Free Software Foundation; either version 2 of the License, or > > + * (at your option) any later version. > > + * > > + * This program is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > + * GNU General Public License for more details. > > + * > > + * You should have received a copy of the GNU General Public License > > + * along with this program; if not, write to the Free Software > > + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, > USA. > > + */ > > + > > +#include <xen/types.h> > > +#include <asm/processor.h> > > +#include <asm/nops.h> > > +#include <asm/alternative.h> > > +#include <xen/init.h> > > +#include <asm/system.h> > > +#include <asm/traps.h> > > + > > +#define MAX_PATCH_LEN (255-1) > > Where does this number come from? An individual instruction can't be > longer than 15 bytes. 
> > > + > > +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; > > + > > +#ifdef K8_NOP1 > > +static const unsigned char k8nops[] = { > > + K8_NOP1, > > + K8_NOP2, > > + K8_NOP3, > > + K8_NOP4, > > + K8_NOP5, > > + K8_NOP6, > > + K8_NOP7, > > + K8_NOP8 > > +}; > > +static const unsigned char * const k8_nops[ASM_NOP_MAX+1] = { > > + NULL, > > + k8nops, > > + k8nops + 1, > > + k8nops + 1 + 2, > > + k8nops + 1 + 2 + 3, > > + k8nops + 1 + 2 + 3 + 4, > > + k8nops + 1 + 2 + 3 + 4 + 5, > > + k8nops + 1 + 2 + 3 + 4 + 5 + 6, > > + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 > > +}; > > +#endif > > + > > +#ifdef P6_NOP1 > > +static const unsigned char p6nops[] = { > > + P6_NOP1, > > + P6_NOP2, > > + P6_NOP3, > > + P6_NOP4, > > + P6_NOP5, > > + P6_NOP6, > > + P6_NOP7, > > + P6_NOP8 > > +}; > > +static const unsigned char * const p6_nops[ASM_NOP_MAX+1] = { > > + NULL, > > + p6nops, > > + p6nops + 1, > > + p6nops + 1 + 2, > > + p6nops + 1 + 2 + 3, > > + p6nops + 1 + 2 + 3 + 4, > > + p6nops + 1 + 2 + 3 + 4 + 5, > > + p6nops + 1 + 2 + 3 + 4 + 5 + 6, > > + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 > > +}; > > +#endif > > What is the purpose of these pairs of tables? Only the underscore > variant is used later on. They define different types of NOP operations, which is used by "ideal_nops". > > > + > > +const unsigned char * const *ideal_nops = p6_nops; > > + > > +void __init arch_init_ideal_nops(void) > > +{ > > + switch (boot_cpu_data.x86_vendor) > > + { > > + case X86_VENDOR_INTEL: > > + /* > > + * Due to a decoder implementation quirk, some > > + * specific Intel CPUs actually perform better with > > + * the "k8_nops" than with the SDM-recommended NOPs. > > + */ > > + if ( boot_cpu_data.x86 == 6 && > > + boot_cpu_data.x86_model >= 0x0f && > > + boot_cpu_data.x86_model != 0x1c && > > + boot_cpu_data.x86_model != 0x26 && > > + boot_cpu_data.x86_model != 0x27 && > > + boot_cpu_data.x86_model < 0x30 ) > > + ideal_nops = k8_nops; > > + else > > + ideal_nops = p6_nops; > > + break; > > + default: > > + ideal_nops = k8_nops; > > + } > > +} > > Surely given the statement in the middle, the better default for > ideal_nops() would be the k8_nops, and a simple if in > arch_init_ideal_ops(). Also, it could quite easily be static and called > from "alternative_instructions()", allowing it not to be exported, and > for ideal_nops to also be static and __init. Why do you think the default value should be "k8_nops"? > > > + > > +/* Use this to add nops to a buffer, then text_poke the whole buffer. */ > > +static void __init add_nops(void *insns, unsigned int len) > > +{ > > + while ( len > 0 ) > > + { > > + unsigned int noplen = len; > > + if ( noplen > ASM_NOP_MAX ) > > + noplen = ASM_NOP_MAX; > > + memcpy(insns, ideal_nops[noplen], noplen); > > + insns += noplen; > > + len -= noplen; > > + } > > +} > > + > > +/* > > + * text_poke_early - Update instructions on a live kernel at boot time > > + * @addr: address to modify > > + * @opcode: source of the copy > > + * @len: length to copy > > + * > > + * When you use this code to patch more than one byte of an instruction > > + * you need to make sure that other CPUs cannot execute this code in > parallel. > > + * Also no thread must be currently preempted in the middle of these > > + * instructions. And on the local CPU you need to be protected again NMI or > MCE > > + * handlers seeing an inconsistent instruction while you patch. 
> > + */ > > +static void *__init text_poke_early(void *addr, const void *opcode, size_t > len) > > +{ > > + int tmp; > > + unsigned long flags; > > newline, as per style > > > + local_irq_save(flags); > > + memcpy(addr, opcode, len); > > + /* > > + * CPUID is a barrier to speculative execution. > > + * Prefetched instructions are automatically > > + * invalidated when modified. > > + */ > > + asm volatile("cpuid" > > + : "=a" (tmp) > > + : "0" (1) > > + : "ebx", "ecx", "edx", "memory"); > > sync_core() from processor.h > > > + > > + local_irq_restore(flags); > > + /* > > + * Could also do a CLFLUSH here to speed up CPU recovery; but > > + * that causes hangs on some VIA CPUs. > > + */ > > + return addr; > > +} > > + > > +/* > > + * Replace instructions with better alternatives for this CPU type. > > + * This runs before SMP is initialized to avoid SMP problems with > > + * self modifying code. This implies that asymmetric systems where > > + * APs have less capabilities than the boot processor are not handled. > > + * Tough. Make sure you disable such features by hand. > > + */ > > + > > +static void __init apply_alternatives(struct alt_instr *start, struct > > alt_instr > *end) > > +{ > > + struct alt_instr *a; > > + u8 *instr, *replacement; > > + u8 insnbuf[MAX_PATCH_LEN]; > > + > > + printk("%s: alt table %p -> %p\n", __func__, start, end); > > + /* > > + * The scan order should be from start to end. A later scanned > > + * alternative code can overwrite a previous scanned alternative code. > > + * Some kernel functions (e.g. memcpy, memset, etc) use this order to > > + * patch code. > > + * > > + * So be careful if you want to change the scan order to any other > > + * order. > > + */ > > + for ( a = start; a < end; a++ ) > > + { > > + instr = (u8 *)&a->instr_offset + a->instr_offset; > > + replacement = (u8 *)&a->repl_offset + a->repl_offset; > > + BUG_ON(a->replacementlen > a->instrlen); > > + BUG_ON(a->instrlen > sizeof(insnbuf)); > > + BUG_ON(a->cpuid >= NCAPINTS * 32); > > + if ( !boot_cpu_has(a->cpuid) ) > > + continue; > > + > > + memcpy(insnbuf, replacement, a->replacementlen); > > + > > + /* 0xe8 is a relative jump; fix the offset. */ > > + if ( *insnbuf == 0xe8 && a->replacementlen == 5 ) > > + *(s32 *)(insnbuf + 1) += replacement - instr; > > 0xe8 is the call instruction. Calling it "a relative jump" is slightly > misleading. Why does it need special treatment here? > > > + > > + add_nops(insnbuf + a->replacementlen, > > + a->instrlen - a->replacementlen); > > + text_poke_early(instr, insnbuf, a->instrlen); > > + } > > +} > > + > > +void __init alternative_instructions(void) > > This function would be more descriptive as > "patch_alternative_instructions" or so. > > > +{ > > + /* > > + * The patching is not fully atomic, so try to avoid local > > interruptions > > + * that might execute the to be patched code. > > + * Other CPUs are not running. > > + */ > > + stop_nmi(); > > This stopping and starting nmis can be done with set_nmi_callback(), > providing a function which returns 1. > > It might be a good idea to tweak set_nmi_callback() to return the old > callback, so it can be returned later, rather than potentially > clobbering the virq nmi callback. > > > + /* > > + * Don't stop machine check exceptions while patching. > > + * MCEs only happen when something got corrupted and in this > > + * case we must do something about the corruption. > > + * Ignoring it is worse than a unlikely patching race. 
> > + * Also machine checks tend to be broadcast and if one CPU > > + * goes into machine check the others follow quickly, so we don't > > + * expect a machine check to cause undue problems during to code > > + * patching. > > + */ > > + > > + apply_alternatives(__alt_instructions, __alt_instructions_end); > > + restart_nmi(); > > +} > > Can you put a local-variable block in here please? What do you mean by this? > > > diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c > > index 508649d..7635868 100644 > > --- a/xen/arch/x86/setup.c > > +++ b/xen/arch/x86/setup.c > > @@ -48,6 +48,8 @@ > > #include <asm/setup.h> > > #include <xen/cpu.h> > > #include <asm/nmi.h> > > +#include <asm/nops.h> > > +#include <asm/alternative.h> > > > > /* opt_nosmp: If true, secondary processors are ignored. */ > > static bool_t __initdata opt_nosmp; > > @@ -1288,6 +1290,9 @@ void __init noreturn __start_xen(unsigned long > mbi_p) > > if ( cpu_has_fsgsbase ) > > set_in_cr4(X86_CR4_FSGSBASE); > > > > + arch_init_ideal_nops(); > > + alternative_instructions(); > > + > > local_irq_enable(); > > > > pt_pci_init(); > > diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c > > index 1722912..4108c8b 100644 > > --- a/xen/arch/x86/traps.c > > +++ b/xen/arch/x86/traps.c > > @@ -111,6 +111,8 @@ integer_param("debug_stack_lines", > debug_stack_lines); > > static bool_t __devinitdata opt_ler; > > boolean_param("ler", opt_ler); > > > > +static int ignore_nmis; > > + > > #define stack_words_per_line 4 > > #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp) > > > > @@ -3303,6 +3305,9 @@ void do_nmi(struct cpu_user_regs *regs) > > > > ++nmi_count(cpu); > > > > + if ( ignore_nmis ) > > + return; > > + > > if ( nmi_callback(regs, cpu) ) > > return; > > > > @@ -3322,6 +3327,16 @@ void do_nmi(struct cpu_user_regs *regs) > > } > > } > > > > +void stop_nmi(void) > > +{ > > + ignore_nmis++; > > These arn't needed, but if they were, they should be __init. > > > +} > > + > > +void restart_nmi(void) > > +{ > > + ignore_nmis--; > > +} > > + > > void set_nmi_callback(nmi_callback_t callback) > > { > > nmi_callback = callback; > > diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S > > index 17db361..64bdb18 100644 > > --- a/xen/arch/x86/xen.lds.S > > +++ b/xen/arch/x86/xen.lds.S > > @@ -147,6 +147,28 @@ SECTIONS > > . = ALIGN(STACK_SIZE); > > __init_end = .; > > > > + /* > > + * struct alt_inst entries. From the header (alternative.h): > > + * "Alternative instructions for different CPU types or capabilities" > > + * Think locking instructions on spinlocks. > > + */ > > + . = ALIGN(8); > > + .altinstructions : { > > + __alt_instructions = .; > > + *(.altinstructions) > > + __alt_instructions_end = .; > > + } > > + > > + /* > > + * And here are the replacement instructions. The linker sticks > > + * them as binary blobs. The .altinstructions has enough data to > > + * get the address and the length of them to patch the kernel safely. > > + */ > > + .altinstr_replacement : { > > + *(.altinstr_replacement) > > + } > > + . = ALIGN(STACK_SIZE); > > + > > All patching is done in __start_xen(), before APs are brought up. These > regions should be inside the .init section so they get reclaimed after boot. 
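A minimal sketch of the placement being suggested here, assuming the __init_end marker already present in xen.lds.S; the exact ordering and alignment are illustrative and not taken from this series:

        . = ALIGN(8);
        .altinstructions : {
               __alt_instructions = .;
               *(.altinstructions)
               __alt_instructions_end = .;
        }
        .altinstr_replacement : {
               *(.altinstr_replacement)
        }
        . = ALIGN(STACK_SIZE);
        __init_end = .;        /* both sections now fall inside the reclaimed init region */

With the sections placed ahead of __init_end, the alt_instr table and the replacement blobs are freed together with the rest of the init region once alternative_instructions() has run from __start_xen().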
> > ~Andrew > > > .bss : { /* BSS */ > > __bss_start = .; > > *(.bss.stack_aligned) > > diff --git a/xen/include/asm-x86/alternative.h > b/xen/include/asm-x86/alternative.h > > new file mode 100644 > > index 0000000..18c0e85 > > --- /dev/null > > +++ b/xen/include/asm-x86/alternative.h > > @@ -0,0 +1,77 @@ > > +#ifndef __X86_ALTERNATIVE_H__ > > +#define __X86_ALTERNATIVE_H__ > > + > > +#ifdef __ASSEMBLY__ > > +.macro altinstruction_entry orig alt feature orig_len alt_len > > + .long \orig - . > > + .long \alt - . > > + .word \feature > > + .byte \orig_len > > + .byte \alt_len > > +.endm > > +#else > > +#include <xen/types.h> > > + > > +struct alt_instr { > > + s32 instr_offset; /* original instruction */ > > + s32 repl_offset; /* offset to replacement instruction */ > > + u16 cpuid; /* cpuid bit set for replacement */ > > + u8 instrlen; /* length of original instruction */ > > + u8 replacementlen; /* length of new instruction, <= instrlen */ > > +}; > > + > > +extern void alternative_instructions(void); > > + > > +#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" > > + > > +#define b_replacement(number) "663"#number > > +#define e_replacement(number) "664"#number > > + > > +#define alt_slen "662b-661b" > > +#define alt_rlen(number) > e_replacement(number)"f-"b_replacement(number)"f" > > + > > +#define ALTINSTR_ENTRY(feature, number) > \ > > + " .long 661b - .\n" /* label > */ \ > > + " .long " b_replacement(number)"f - .\n" /* new > instruction */ \ > > + " .word " __stringify(feature) "\n" /* feature bit > */ \ > > + " .byte " alt_slen "\n" /* source len > */ \ > > + " .byte " alt_rlen(number) "\n" /* > replacement len */ > > + > > +#define DISCARD_ENTRY(number) /* rlen > <= slen */ \ > > + " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" > > + > > +#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* > replacement */ \ > > + b_replacement(number)":\n\t" newinstr "\n" > e_replacement(number) ":\n\t" > > + > > +/* alternative assembly primitive: */ > > +#define ALTERNATIVE(oldinstr, newinstr, feature) > \ > > + OLDINSTR(oldinstr) > \ > > + ".pushsection .altinstructions,\"a\"\n" > \ > > + ALTINSTR_ENTRY(feature, 1) > \ > > + ".popsection\n" > \ > > + ".pushsection .discard,\"aw\",@progbits\n" > \ > > + DISCARD_ENTRY(1) > \ > > + ".popsection\n" > \ > > + ".pushsection .altinstr_replacement, \"ax\"\n" > \ > > + ALTINSTR_REPLACEMENT(newinstr, feature, 1) > \ > > + ".popsection" > > + > > +/* > > + * Alternative instructions for different CPU types or capabilities. > > + * > > + * This allows to use optimized instructions even on generic binary > > + * kernels. > > + * > > + * length of oldinstr must be longer or equal the length of newinstr > > + * It can be padded with nops as needed. > > + * > > + * For non barrier like inlines please define new variants > > + * without volatile and memory clobber. 
> > + */ > > +#define alternative(oldinstr, newinstr, feature) > \ > > + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : > "memory") > > + > > +#endif /* __ASSEMBLY__ */ > > + > > +#endif /* __X86_ALTERNATIVE_H__ */ > > + > > diff --git a/xen/include/asm-x86/traps.h b/xen/include/asm-x86/traps.h > > index 556b133..9b1dc2c 100644 > > --- a/xen/include/asm-x86/traps.h > > +++ b/xen/include/asm-x86/traps.h > > @@ -54,4 +54,6 @@ uint32_t guest_io_read(unsigned int port, unsigned int > bytes, > > void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data, > > struct vcpu *, struct cpu_user_regs *); > > > > +extern void stop_nmi(void); > > +extern void restart_nmi(void); > > #endif /* ASM_TRAP_H */ Thanks, Feng
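For readers following along, a minimal usage sketch of the alternative() macro defined above; the barrier choice and the X86_FEATURE_XMM2 feature bit are picked purely for illustration and do not come from this series:

    #include <asm/alternative.h>
    #include <asm/cpufeature.h>

    /* Hypothetical example: emit MFENCE on CPUs that advertise SSE2 and
     * keep a serialising locked add elsewhere.  The 3-byte MFENCE is
     * shorter than the 6-byte locked add, so the replacementlen <= instrlen
     * requirement checked in apply_alternatives() is satisfied. */
    static inline void example_mb(void)
    {
        alternative("lock; addl $0,-4(%%rsp)", "mfence", X86_FEATURE_XMM2);
    }

At build time the locked add is emitted into .text while an alt_instr record and the MFENCE bytes go into .altinstructions and .altinstr_replacement; at boot, alternative_instructions() patches the site on SSE2-capable CPUs and pads the remaining three bytes with NOPs taken from ideal_nops.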