[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] [xen stable-4.5] x86: suppress SMEP and SMAP while running 32-bit PV guest code
commit f3325975e2cdfd85aa6b49280897f4683f6aa896 Author: Jan Beulich <jbeulich@xxxxxxxx> AuthorDate: Fri May 27 14:46:31 2016 +0200 Commit: Jan Beulich <jbeulich@xxxxxxxx> CommitDate: Fri May 27 14:46:31 2016 +0200 x86: suppress SMEP and SMAP while running 32-bit PV guest code Since such guests' kernel code runs in ring 1, their memory accesses, at the paging layer, are supervisor mode ones, and hence subject to SMAP/SMEP checks. Such guests cannot be expected to be aware of those two features though (and so far we also don't expose the respective feature flags), and hence may suffer page faults they cannot deal with. While the placement of the re-enabling slightly weakens the intended protection, it was selected such that 64-bit paths would remain unaffected where possible. At the expense of a further performance hit the re-enabling could be put right next to the CLACs. Note that this introduces a number of extra TLB flushes - CR4.SMEP transitioning from 0 to 1 always causes a flush, and it transitioning from 1 to 0 may also do so. Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> Reviewed-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> x86/compat: Cleanup and further debugging of SMAP/SMEP fixup * Abstract (X86_CR4_SMEP | X86_CR4_SMAP) behind XEN_CR4_PV32_BITS to avoid open-coding the individual bits which are fixed up behind a 32bit PV guest's back. * Show cr4_pv32_mask in the BUG register dump Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> Reviewed-by: Wei Liu <wei.liu2@xxxxxxxxxx> x86: refine debugging of SMEP/SMAP fix Instead of just latching cr4_pv32_mask into %rdx, correct the found wrong value in %cr4 (to avoid triggering another BUG). Also there is one more place for XEN_CR4_PV32_BITS to be used. 
Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> Reviewed-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> x86: make SMEP/SMAP suppression tolerate NMI/MCE at the "wrong" time There is one instruction boundary where any kind of interruption would break the assumptions cr4_pv32_restore's debug mode checking makes on the correlation between the CR4 register value and its in-memory cache. Correct this (see the code comment) even in non-debug mode, or else a subsequent cr4_pv32_restore would also be misguided into thinking the features are enabled when they really aren't. Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> Reviewed-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> master commit: ea3e8edfdbabfb17f0d39ed128716ec464f348b8 master date: 2016-05-13 18:15:45 +0100 master commit: ad4aa3619f436e3ed79eea8498ac18aa8d5e6b83 master date: 2016-05-16 13:11:05 +0100 master commit: e5e73163ec40b409151f2170d8e406a72b515ff2 master date: 2016-05-17 16:41:35 +0200 master commit: 9e28baf22ec98a64f68757eff39df72173d5f1bb master date: 2016-05-17 16:42:15 +0200 --- xen/arch/x86/setup.c | 10 +++++ xen/arch/x86/x86_64/asm-offsets.c | 1 + xen/arch/x86/x86_64/compat/entry.S | 81 ++++++++++++++++++++++++++++++++++++-- xen/arch/x86/x86_64/entry.S | 58 ++++++++++++++++++++++++++- xen/include/asm-x86/asm_defns.h | 18 ++++++++- xen/include/asm-x86/processor.h | 12 +++--- 6 files changed, 169 insertions(+), 11 deletions(-) diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index 2828c71..ac283f6 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -66,6 +66,8 @@ invbool_param("smep", disable_smep); static bool_t __initdata disable_smap; invbool_param("smap", disable_smap); +unsigned long __read_mostly cr4_pv32_mask; + /* Boot dom0 in pvh mode */ static bool_t __initdata opt_dom0pvh; boolean_param("dom0pvh", opt_dom0pvh); @@ -1285,6 +1287,8 @@ void __init noreturn __start_xen(unsigned long mbi_p) if ( cpu_has_smap ) set_in_cr4(X86_CR4_SMAP); + cr4_pv32_mask = mmu_cr4_features & 
XEN_CR4_PV32_BITS; + if ( cpu_has_fsgsbase ) set_in_cr4(X86_CR4_FSGSBASE); @@ -1403,7 +1407,10 @@ void __init noreturn __start_xen(unsigned long mbi_p) * copy_from_user(). */ if ( cpu_has_smap ) + { + cr4_pv32_mask &= ~X86_CR4_SMAP; write_cr4(read_cr4() & ~X86_CR4_SMAP); + } /* * We're going to setup domain0 using the module(s) that we stashed safely @@ -1416,7 +1423,10 @@ void __init noreturn __start_xen(unsigned long mbi_p) panic("Could not set up DOM0 guest OS"); if ( cpu_has_smap ) + { write_cr4(read_cr4() | X86_CR4_SMAP); + cr4_pv32_mask |= X86_CR4_SMAP; + } /* Scrub RAM that is still free and so may go to an unprivileged domain. */ scrub_heap_pages(); diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c index 447c650..9477848 100644 --- a/xen/arch/x86/x86_64/asm-offsets.c +++ b/xen/arch/x86/x86_64/asm-offsets.c @@ -135,6 +135,7 @@ void __dummy__(void) OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); OFFSET(CPUINFO_processor_id, struct cpu_info, processor_id); OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); + OFFSET(CPUINFO_cr4, struct cpu_info, cr4); DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info)); BLANK(); diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S index 5b0af61..2fe56e2 100644 --- a/xen/arch/x86/x86_64/compat/entry.S +++ b/xen/arch/x86/x86_64/compat/entry.S @@ -16,14 +16,16 @@ ENTRY(compat_hypercall) ASM_CLAC pushq $0 SAVE_VOLATILE type=TRAP_syscall compat=1 + CR4_PV32_RESTORE cmpb $0,untrusted_msi(%rip) UNLIKELY_START(ne, msi_check) movl $HYPERCALL_VECTOR,%edi call check_for_unexpected_msi - LOAD_C_CLOBBERED + LOAD_C_CLOBBERED compat=1 ax=0 UNLIKELY_END(msi_check) + movl UREGS_rax(%rsp),%eax GET_CURRENT(%rbx) cmpl $NR_hypercalls,%eax @@ -33,7 +35,6 @@ UNLIKELY_END(msi_check) pushq UREGS_rbx(%rsp); pushq %rcx; pushq %rdx; pushq %rsi; pushq %rdi pushq UREGS_rbp+5*8(%rsp) leaq compat_hypercall_args_table(%rip),%r10 - movl %eax,%eax movl $6,%ecx subb 
(%r10,%rax,1),%cl movq %rsp,%rdi @@ -48,7 +49,6 @@ UNLIKELY_END(msi_check) #define SHADOW_BYTES 16 /* Shadow EIP + shadow hypercall # */ #else /* Relocate argument registers and zero-extend to 64 bits. */ - movl %eax,%eax /* Hypercall # */ xchgl %ecx,%esi /* Arg 2, Arg 4 */ movl %edx,%edx /* Arg 3 */ movl %edi,%r8d /* Arg 5 */ @@ -174,6 +174,46 @@ compat_bad_hypercall: /* %rbx: struct vcpu, interrupts disabled */ ENTRY(compat_restore_all_guest) ASSERT_INTERRUPTS_DISABLED +.Lcr4_orig: + ASM_NOP8 /* testb $3,UREGS_cs(%rsp) */ + ASM_NOP2 /* jpe .Lcr4_alt_end */ + ASM_NOP8 /* mov CPUINFO_cr4...(%rsp), %rax */ + ASM_NOP6 /* and $..., %rax */ + ASM_NOP8 /* mov %rax, CPUINFO_cr4...(%rsp) */ + ASM_NOP3 /* mov %rax, %cr4 */ + ASM_NOP8 /* cmp %rax, CPUINFO_cr4...(%rsp) */ + ASM_NOP2 /* jne 1b */ +.Lcr4_orig_end: + .pushsection .altinstr_replacement, "ax" +.Lcr4_alt: + testb $3,UREGS_cs(%rsp) + jpe .Lcr4_alt_end + mov CPUINFO_cr4-CPUINFO_guest_cpu_user_regs(%rsp), %rax + and $~XEN_CR4_PV32_BITS, %rax +1: + mov %rax, CPUINFO_cr4-CPUINFO_guest_cpu_user_regs(%rsp) + mov %rax, %cr4 + /* + * An NMI or MCE may have occurred between the previous two + * instructions, leaving register and cache in a state where + * the next exit from the guest would trigger the BUG in + * cr4_pv32_restore. If this happened, the cached value is no + * longer what we just set it to, which we can utilize to + * correct that state. Note that we do not have to fear this + * loop to cause a live lock: If NMIs/MCEs occurred at that + * high a rate, we'd be live locked anyway. 
+ */ + cmp %rax, CPUINFO_cr4-CPUINFO_guest_cpu_user_regs(%rsp) + jne 1b +.Lcr4_alt_end: + .section .altinstructions, "a" + altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, \ + (.Lcr4_orig_end - .Lcr4_orig), \ + (.Lcr4_alt_end - .Lcr4_alt) + altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, \ + (.Lcr4_orig_end - .Lcr4_orig), \ + (.Lcr4_alt_end - .Lcr4_alt) + .popsection RESTORE_ALL adj=8 compat=1 .Lft0: iretq @@ -210,6 +250,38 @@ compat_failsafe_callback: _ASM_PRE_EXTABLE(.Lft0, .Lfx0) _ASM_EXTABLE(.Ldf0, compat_failsafe_callback) +/* This mustn't modify registers other than %rax. */ +ENTRY(cr4_pv32_restore) + push %rdx + GET_CPUINFO_FIELD(cr4, %rdx) + mov (%rdx), %rax + test $XEN_CR4_PV32_BITS, %eax + jnz 0f + or cr4_pv32_mask(%rip), %rax + mov %rax, %cr4 + mov %rax, (%rdx) + pop %rdx + ret +0: +#ifndef NDEBUG + /* Check that _all_ of the bits intended to be set actually are. */ + mov %cr4, %rax + and cr4_pv32_mask(%rip), %eax + cmp cr4_pv32_mask(%rip), %eax + je 1f + /* Cause cr4_pv32_mask to be visible in the BUG register dump. */ + mov cr4_pv32_mask(%rip), %rdx + /* Avoid coming back here while handling the #UD we cause below. 
*/ + mov %cr4, %rcx + or %rdx, %rcx + mov %rcx, %cr4 + ud2 +1: +#endif + pop %rdx + xor %eax, %eax + ret + /* %rdx: trap_bounce, %rbx: struct vcpu */ ENTRY(compat_post_handle_exception) testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) @@ -220,6 +292,7 @@ ENTRY(compat_post_handle_exception) jmp compat_test_all_events ENTRY(compat_syscall) + CR4_PV32_RESTORE cmpb $0,VCPU_syscall32_disables_events(%rbx) movzwl VCPU_syscall32_sel(%rbx),%esi movq VCPU_syscall32_addr(%rbx),%rax @@ -244,6 +317,7 @@ UNLIKELY_END(compat_syscall_gpf) jmp .Lcompat_bounce_exception ENTRY(compat_sysenter) + CR4_PV32_RESTORE movq VCPU_trap_ctxt(%rbx),%rcx cmpb $TRAP_gp_fault,UREGS_entry_vector(%rsp) movzwl VCPU_sysenter_sel(%rbx),%eax @@ -257,6 +331,7 @@ ENTRY(compat_sysenter) jmp compat_test_all_events ENTRY(compat_int80_direct_trap) + CR4_PV32_RESTORE call compat_create_bounce_frame jmp compat_test_all_events diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S index 2d25d57..91d20d4 100644 --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -482,6 +482,7 @@ ENTRY(dom_crash_sync_extable) ENTRY(common_interrupt) SAVE_ALL CLAC + CR4_PV32_RESTORE movq %rsp,%rdi callq do_IRQ jmp ret_from_intr @@ -502,13 +503,67 @@ ENTRY(page_fault) GLOBAL(handle_exception) SAVE_ALL CLAC handle_exception_saved: + GET_CURRENT(%rbx) testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) jz exception_with_ints_disabled + +.Lcr4_pv32_orig: + jmp .Lcr4_pv32_done + .skip (.Lcr4_pv32_alt_end - .Lcr4_pv32_alt) - (. 
- .Lcr4_pv32_orig), 0xcc + .pushsection .altinstr_replacement, "ax" +.Lcr4_pv32_alt: + mov VCPU_domain(%rbx),%rax +.Lcr4_pv32_alt_end: + .section .altinstructions, "a" + altinstruction_entry .Lcr4_pv32_orig, .Lcr4_pv32_alt, \ + X86_FEATURE_SMEP, \ + (.Lcr4_pv32_alt_end - .Lcr4_pv32_alt), \ + (.Lcr4_pv32_alt_end - .Lcr4_pv32_alt) + altinstruction_entry .Lcr4_pv32_orig, .Lcr4_pv32_alt, \ + X86_FEATURE_SMAP, \ + (.Lcr4_pv32_alt_end - .Lcr4_pv32_alt), \ + (.Lcr4_pv32_alt_end - .Lcr4_pv32_alt) + .popsection + + testb $3,UREGS_cs(%rsp) + jz .Lcr4_pv32_done + cmpb $0,DOMAIN_is_32bit_pv(%rax) + je .Lcr4_pv32_done + call cr4_pv32_restore + /* + * An NMI or #MC may occur between clearing CR4.SMEP / CR4.SMAP in + * compat_restore_all_guest and it actually returning to guest + * context, in which case the guest would run with the two features + * enabled. The only bad that can happen from this is a kernel mode + * #PF which the guest doesn't expect. Rather than trying to make the + * NMI/#MC exit path honor the intended CR4 setting, simply check + * whether the wrong CR4 was in use when the #PF occurred, and exit + * back to the guest (which will in turn clear the two CR4 bits) to + * re-execute the instruction. If we get back here, the CR4 bits + * should then be found clear (unless another NMI/#MC occurred at + * exactly the right time), and we'll continue processing the + * exception as normal. 
+ */ + test %rax,%rax + jnz .Lcr4_pv32_done + /* + * The below effectively is + * if ( regs->entry_vector == TRAP_page_fault && + * (regs->error_code & PFEC_page_present) && + * !(regs->error_code & ~(PFEC_write_access|PFEC_insn_fetch)) ) + * goto compat_test_all_events; + */ + mov $PFEC_page_present,%al + cmpb $TRAP_page_fault,UREGS_entry_vector(%rsp) + jne .Lcr4_pv32_done + xor UREGS_error_code(%rsp),%eax + test $~(PFEC_write_access|PFEC_insn_fetch),%eax + jz compat_test_all_events +.Lcr4_pv32_done: sti 1: movq %rsp,%rdi movzbl UREGS_entry_vector(%rsp),%eax leaq exception_table(%rip),%rdx - GET_CURRENT(%rbx) PERFC_INCR(exceptions, %rax, %rbx) callq *(%rdx,%rax,8) testb $3,UREGS_cs(%rsp) @@ -637,6 +692,7 @@ ENTRY(nmi) movl $TRAP_nmi,4(%rsp) handle_ist_exception: SAVE_ALL CLAC + CR4_PV32_RESTORE testb $3,UREGS_cs(%rsp) jz 1f /* Interrupted guest context. Copy the context to stack bottom. */ diff --git a/xen/include/asm-x86/asm_defns.h b/xen/include/asm-x86/asm_defns.h index 1674c7c..41ffeff 100644 --- a/xen/include/asm-x86/asm_defns.h +++ b/xen/include/asm-x86/asm_defns.h @@ -178,6 +178,16 @@ void ret_from_intr(void); #define ASM_STAC ASM_AC(STAC) #define ASM_CLAC ASM_AC(CLAC) + +#define CR4_PV32_RESTORE \ + 667: ASM_NOP5; \ + .pushsection .altinstr_replacement, "ax"; \ + 668: call cr4_pv32_restore; \ + .section .altinstructions, "a"; \ + altinstruction_entry 667b, 668b, X86_FEATURE_SMEP, 5, 5; \ + altinstruction_entry 667b, 668b, X86_FEATURE_SMAP, 5, 5; \ + .popsection + #else static always_inline void clac(void) { @@ -277,14 +287,18 @@ static always_inline void stac(void) * * For the way it is used in RESTORE_ALL, this macro must preserve EFLAGS.ZF. 
*/ -.macro LOAD_C_CLOBBERED compat=0 +.macro LOAD_C_CLOBBERED compat=0 ax=1 .if !\compat movq UREGS_r11(%rsp),%r11 movq UREGS_r10(%rsp),%r10 movq UREGS_r9(%rsp),%r9 movq UREGS_r8(%rsp),%r8 -.endif +.if \ax movq UREGS_rax(%rsp),%rax +.endif +.elseif \ax + movl UREGS_rax(%rsp),%eax +.endif movq UREGS_rcx(%rsp),%rcx movq UREGS_rdx(%rsp),%rdx movq UREGS_rsi(%rsp),%rsi diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h index 3cf8b2f..397680a 100644 --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -134,16 +134,18 @@ #define TF_kernel_mode (1<<_TF_kernel_mode) /* #PF error code values. */ -#define PFEC_page_present (1U<<0) -#define PFEC_write_access (1U<<1) -#define PFEC_user_mode (1U<<2) -#define PFEC_reserved_bit (1U<<3) -#define PFEC_insn_fetch (1U<<4) +#define PFEC_page_present (_AC(1,U) << 0) +#define PFEC_write_access (_AC(1,U) << 1) +#define PFEC_user_mode (_AC(1,U) << 2) +#define PFEC_reserved_bit (_AC(1,U) << 3) +#define PFEC_insn_fetch (_AC(1,U) << 4) #define PFEC_page_paged (1U<<5) #define PFEC_page_shared (1U<<6) #define XEN_MINIMAL_CR4 (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) +#define XEN_CR4_PV32_BITS (X86_CR4_SMEP | X86_CR4_SMAP) + #define XEN_SYSCALL_MASK (X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF| \ X86_EFLAGS_NT|X86_EFLAGS_DF|X86_EFLAGS_IF| \ X86_EFLAGS_TF) -- generated by git-patchbot for /home/xen/git/xen.git#stable-4.5 _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |