[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH, RFC 4/5] x86: avoid unlikely taken forward branches



... since these get statically mispredicted by most CPUs and increase
the cache footprint. This mostly concerns hypercall tracing and vm86
mode handling.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1330,7 +1330,7 @@ asmlinkage void do_page_fault(struct cpu
  * during early boot (an issue was seen once, but was most likely a hardware 
  * problem).
  */
-asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
+asmlinkage void __init do_early_page_fault(struct cpu_user_regs *regs)
 {
     static int stuck;
     static unsigned long prev_eip, prev_cr2;
--- a/xen/arch/x86/x86_32/entry.S
+++ b/xen/arch/x86/x86_32/entry.S
@@ -142,7 +142,7 @@ restore_all_xen:
 ENTRY(hypercall)
         subl $4,%esp
         FIXUP_RING0_GUEST_STACK
-        SAVE_ALL(1f,1f)
+        SAVE_ALL(,1f)
 1:      sti
         GET_CURRENT(%ebx)
         cmpl  $NR_hypercalls,%eax
@@ -182,12 +182,14 @@ ENTRY(hypercall)
 #define SHADOW_BYTES 24 /* 6 shadow parameters */
 #endif
         cmpb  $0,tb_init_done
-        je    1f
+UNLIKELY_START(ne, trace)
         call  trace_hypercall
         /* Now restore all the registers that trace_hypercall clobbered */
         movl  UREGS_eax+SHADOW_BYTES(%esp),%eax /* Hypercall # */
+UNLIKELY_END(trace)
+        call *hypercall_table(,%eax,4)
+        movl  %eax,UREGS_eax+SHADOW_BYTES(%esp) # save the return value
 #undef SHADOW_BYTES
-1:      call *hypercall_table(,%eax,4)
         addl  $24,%esp     # Discard the shadow parameters
 #ifndef NDEBUG
         /* Deliberately corrupt real parameter regs used by this hypercall. */
@@ -197,13 +199,10 @@ ENTRY(hypercall)
         jne   skip_clobber # If EIP has changed then don't clobber
         movzb hypercall_args_table(,%ecx,1),%ecx
         movl  %esp,%edi
-        movl  %eax,%esi
         movl  $0xDEADBEEF,%eax
         rep   stosl
-        movl  %esi,%eax
 skip_clobber:
 #endif
-        movl %eax,UREGS_eax(%esp)       # save the return value
 
 test_all_events:
         xorl %ecx,%ecx
@@ -293,8 +292,8 @@ create_bounce_frame:
         jz   ring1 /* jump if returning to an existing ring-1 activation */
         movl VCPU_kernel_sp(%ebx),%esi
 .Lft6:  mov  VCPU_kernel_ss(%ebx),%gs
-        testl $X86_EFLAGS_VM,UREGS_eflags+4(%esp)
-        jz   .Lnvm86_1
+        testl $X86_EFLAGS_VM,%ecx
+UNLIKELY_START(nz, bounce_vm86_1)
         subl $16,%esi       /* push ES/DS/FS/GS (VM86 stack frame) */
         movl UREGS_es+4(%esp),%eax
 .Lft7:  movl %eax,%gs:(%esi)
@@ -304,7 +303,7 @@ create_bounce_frame:
 .Lft9:  movl %eax,%gs:8(%esi)
         movl UREGS_gs+4(%esp),%eax
 .Lft10: movl %eax,%gs:12(%esi)
-.Lnvm86_1:
+UNLIKELY_END(bounce_vm86_1)
         subl $8,%esi        /* push SS/ESP (inter-priv iret) */
         movl UREGS_esp+4(%esp),%eax
 .Lft11: movl %eax,%gs:(%esi)
@@ -346,17 +345,10 @@ ring1:  /* obtain ss/esp from oldss/olde
         movl TRAPBOUNCE_error_code(%edx),%eax
 .Lft17: movl %eax,%gs:(%esi)
 1:      testb $TBF_FAILSAFE,%cl
-        jz   2f
+UNLIKELY_START(nz, bounce_failsafe)
         subl $16,%esi                # add DS/ES/FS/GS to failsafe stack frame
         testl $X86_EFLAGS_VM,UREGS_eflags+4(%esp)
-        jz   .Lnvm86_2
-        xorl %eax,%eax               # VM86: we write zero selector values
-.Lft18: movl %eax,%gs:(%esi)
-.Lft19: movl %eax,%gs:4(%esi)
-.Lft20: movl %eax,%gs:8(%esi)
-.Lft21: movl %eax,%gs:12(%esi)
-        jmp  2f
-.Lnvm86_2:
+        jnz  .Lvm86_2
         movl UREGS_ds+4(%esp),%eax   # non-VM86: write real selector values
 .Lft22: movl %eax,%gs:(%esi)
         movl UREGS_es+4(%esp),%eax
@@ -365,13 +357,22 @@ ring1:  /* obtain ss/esp from oldss/olde
 .Lft24: movl %eax,%gs:8(%esi)
         movl UREGS_gs+4(%esp),%eax
 .Lft25: movl %eax,%gs:12(%esi)
-2:      testl $X86_EFLAGS_VM,UREGS_eflags+4(%esp)
-        jz   .Lnvm86_3
+        jmp  .Lnvm86_3
+.Lvm86_2:
+        xorl %eax,%eax               # VM86: we write zero selector values
+.Lft18: movl %eax,%gs:(%esi)
+.Lft19: movl %eax,%gs:4(%esi)
+.Lft20: movl %eax,%gs:8(%esi)
+.Lft21: movl %eax,%gs:12(%esi)
+UNLIKELY_END(bounce_failsafe)
+        testl $X86_EFLAGS_VM,UREGS_eflags+4(%esp)
+UNLIKELY_START(nz, bounce_vm86_3)
         xorl %eax,%eax      /* zero DS-GS, just as a real CPU would */
         movl %eax,UREGS_ds+4(%esp)
         movl %eax,UREGS_es+4(%esp)
         movl %eax,UREGS_fs+4(%esp)
         movl %eax,UREGS_gs+4(%esp)
+UNLIKELY_END(bounce_vm86_3)
 .Lnvm86_3:
         /* Rewrite our stack frame and return to ring 1. */
         /* IA32 Ref. Vol. 3: TF, VM, RF and NT flags are cleared on trap. */
@@ -564,6 +565,7 @@ ENTRY(spurious_interrupt_bug)
         pushl $TRAP_spurious_int<<16
         jmp   handle_exception
 
+       .pushsection .init.text, "ax", @progbits
 ENTRY(early_page_fault)
         SAVE_ALL(1f,1f)
 1:      movl  %esp,%eax
@@ -571,6 +573,7 @@ ENTRY(early_page_fault)
         call  do_early_page_fault
         addl  $4,%esp
         jmp   restore_all_xen
+       .popsection
 
 handle_nmi_mce:
 #ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -49,7 +49,7 @@ ENTRY(compat_hypercall)
 #define SHADOW_BYTES 0  /* No on-stack shadow state */
 #endif
         cmpb  $0,tb_init_done(%rip)
-        je    1f
+UNLIKELY_START(ne, compat_trace)
         call  trace_hypercall
         /* Now restore all the registers that trace_hypercall clobbered */
         movl  UREGS_rax+SHADOW_BYTES(%rsp),%eax   /* Hypercall #  */
@@ -60,7 +60,8 @@ ENTRY(compat_hypercall)
         movl  UREGS_rdi+SHADOW_BYTES(%rsp),%r8d   /* Arg 5        */
         movl  UREGS_rbp+SHADOW_BYTES(%rsp),%r9d   /* Arg 6        */
 #undef SHADOW_BYTES
-1:      leaq  compat_hypercall_table(%rip),%r10
+UNLIKELY_END(compat_trace)
+        leaq  compat_hypercall_table(%rip),%r10
         PERFC_INCR(PERFC_hypercalls, %rax, %rbx)
         callq *(%r10,%rax,8)
 #ifndef NDEBUG
@@ -295,7 +296,7 @@ compat_create_bounce_frame:
 .Lft8:  movl  %eax,%fs:(%rsi)           # ERROR CODE
 1:
         testb $TBF_FAILSAFE,%cl
-        jz    2f
+UNLIKELY_START(nz, compat_bounce_failsafe)
         subl  $4*4,%esi
         movl  %gs,%eax
 .Lft9:  movl  %eax,%fs:3*4(%rsi)        # GS
@@ -304,7 +305,7 @@ compat_create_bounce_frame:
 .Lft11: movl  %eax,%fs:1*4(%rsi)        # ES
         movl  %ds,%eax
 .Lft12: movl  %eax,%fs:0*4(%rsi)        # DS
-2:
+UNLIKELY_END(compat_bounce_failsafe)
         /* Rewrite our stack frame and return to guest-OS mode. */
         /* IA32 Ref. Vol. 3: TF, VM, RF and NT flags are cleared on trap. */
         andl  $~(X86_EFLAGS_VM|X86_EFLAGS_RF|\
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -148,7 +148,7 @@ ENTRY(syscall_enter)
 #define SHADOW_BYTES 0  /* No on-stack shadow state */
 #endif
         cmpb  $0,tb_init_done(%rip)
-        je    1f
+UNLIKELY_START(ne, trace)
         call  trace_hypercall
         /* Now restore all the registers that trace_hypercall clobbered */
         movq  UREGS_rax+SHADOW_BYTES(%rsp),%rax   /* Hypercall #  */
@@ -159,7 +159,8 @@ ENTRY(syscall_enter)
         movq  UREGS_r8 +SHADOW_BYTES(%rsp),%r8    /* Arg 5        */
         movq  UREGS_r9 +SHADOW_BYTES(%rsp),%r9    /* Arg 6        */
 #undef SHADOW_BYTES
-1:      leaq  hypercall_table(%rip),%r10
+UNLIKELY_END(trace)
+        leaq  hypercall_table(%rip),%r10
         PERFC_INCR(PERFC_hypercalls, %rax, %rbx)
         callq *(%r10,%rax,8)
 #ifndef NDEBUG
@@ -341,11 +342,12 @@ create_bounce_frame:
 2:      andq  $~0xf,%rsi                # Stack frames are 16-byte aligned.
         movq  $HYPERVISOR_VIRT_START,%rax
         cmpq  %rax,%rsi
-        jb    1f                        # In +ve address space? Then okay.
         movq  $HYPERVISOR_VIRT_END+60,%rax
+        sbb   %ecx,%ecx                 # In +ve address space? Then okay.
         cmpq  %rax,%rsi
-        jb    domain_crash_synchronous  # Above Xen private area? Then okay.
-1:      movb  TRAPBOUNCE_flags(%rdx),%cl
+        adc   %ecx,%ecx                 # Above Xen private area? Then okay.
+        jg    domain_crash_synchronous
+        movb  TRAPBOUNCE_flags(%rdx),%cl
         subq  $40,%rsi
         movq  UREGS_ss+8(%rsp),%rax
 .Lft2:  movq  %rax,32(%rsi)             # SS
@@ -376,7 +378,7 @@ create_bounce_frame:
         movl  TRAPBOUNCE_error_code(%rdx),%eax
 .Lft7:  movq  %rax,(%rsi)               # ERROR CODE
 1:      testb $TBF_FAILSAFE,%cl
-        jz    2f
+UNLIKELY_START(nz, bounce_failsafe)
         subq  $32,%rsi
         movl  %gs,%eax
 .Lft8:  movq  %rax,24(%rsi)             # GS
@@ -386,7 +388,8 @@ create_bounce_frame:
 .Lft10: movq  %rax,8(%rsi)              # ES
         movl  %ds,%eax
 .Lft11: movq  %rax,(%rsi)               # DS
-2:      subq  $16,%rsi
+UNLIKELY_END(bounce_failsafe)
+        subq  $16,%rsi
         movq  UREGS_r11+8(%rsp),%rax
 .Lft12: movq  %rax,8(%rsi)              # R11
         movq  UREGS_rcx+8(%rsp),%rax
@@ -601,11 +604,13 @@ ENTRY(double_fault)
         call  do_double_fault
         ud2
 
+       .pushsection .init.text, "ax", @progbits
 ENTRY(early_page_fault)
         SAVE_ALL
         movq  %rsp,%rdi
         call  do_early_page_fault
         jmp   restore_all_xen
+       .popsection
 
 handle_ist_exception:
         SAVE_ALL
--- a/xen/include/asm-x86/asm_defns.h
+++ b/xen/include/asm-x86/asm_defns.h
@@ -32,4 +32,18 @@
 #define _ASM_EXTABLE(from, to)     _ASM__EXTABLE(, from, to)
 #define _ASM_PRE_EXTABLE(from, to) _ASM__EXTABLE(.pre, from, to)
 
+#ifdef __ASSEMBLY__
+
+#define UNLIKELY_START(cond, tag)                      \
+       j##cond .Lunlikely.tag;                         \
+       .subsection 1;                                  \
+       .Lunlikely.tag:
+
+#define UNLIKELY_END(tag)                              \
+       jmp .Llikely.tag;                               \
+       .subsection 0;                                  \
+       .Llikely.tag:
+
+#endif
+
 #endif /* __X86_ASM_DEFNS_H__ */
--- a/xen/include/asm-x86/x86_32/asm_defns.h
+++ b/xen/include/asm-x86/x86_32/asm_defns.h
@@ -1,6 +1,7 @@
 #ifndef __X86_32_ASM_DEFNS_H__
 #define __X86_32_ASM_DEFNS_H__
 
+#include <xen/stringify.h>
 #include <asm/percpu.h>
 
 #ifdef CONFIG_FRAME_POINTER
@@ -53,12 +54,14 @@
         mov   %es,%esi;                                 \
         mov   $(__HYPERVISOR_DS),%ecx;                  \
         jnz   86f;                                      \
-        .text 1;                                        \
+        .subsection 1;                                  \
         86:   call setup_vm86_frame;                    \
         jmp   vm86_lbl;                                 \
         .previous;                                      \
+        .ifnes __stringify(xen_lbl), "";                \
         testb $3,UREGS_cs(%esp);                        \
         jz    xen_lbl;                                  \
+        .endif;                                         \
         /*                                              \
          * We are the outermost Xen context, but our    \
          * life is complicated by NMIs and MCEs. These  \
--- /dev/null
+++ b/xen/include/xen/stringify.h
@@ -0,0 +1,12 @@
+#ifndef __XEN_STRINGIFY_H
+#define __XEN_STRINGIFY_H
+
+/* Indirect stringification.  Doing two levels allows the parameter to be a
+ * macro itself.  For example, compile with -DFOO=bar, __stringify(FOO)
+ * converts to "bar".
+ */
+
+#define __stringify_1(x...)    #x
+#define __stringify(x...)      __stringify_1(x)
+
+#endif /* !__XEN_STRINGIFY_H */


Attachment: x86-forward-branches.patch
Description: Text document

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with Rackspace, which monitors our
servers 24x7x365 and backs them with Rackspace's Fanatical Support®.