[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

RE: [Xen-devel] [PATCH] x86: add SSE-based copy_page()



Jan --

I assume the 12% speedup was measured on a benchmark...
Have you measured how much faster the copy_page_sse2
routine (standalone) is than the memcpy?  Is it a
factor of 2?

Thanks,
Dan

> -----Original Message-----
> From: Jan Beulich [mailto:jbeulich@xxxxxxxxxx]
> Sent: Wednesday, November 12, 2008 2:38 AM
> To: xen-devel@xxxxxxxxxxxxxxxxxxx
> Subject: [Xen-devel] [PATCH] x86: add SSE-based copy_page()
> 
> 
> On top of the highmem assistance hypercalls added earlier, this provides
> a performance improvement of another 12% (measured on Xeon E5345) for
> the page copying case.
> 
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>
> 
> Index: 2008-10-27/xen/arch/x86/Makefile
> ===================================================================
> --- 2008-10-27.orig/xen/arch/x86/Makefile     2008-11-11 
> 16:19:45.000000000 +0100
> +++ 2008-10-27/xen/arch/x86/Makefile  2008-11-11 
> 16:18:36.000000000 +0100
> @@ -11,6 +11,7 @@ subdir-$(x86_64) += x86_64
>  obj-y += apic.o
>  obj-y += bitops.o
>  obj-y += clear_page.o
> +obj-y += copy_page.o
>  obj-y += compat.o
>  obj-y += delay.o
>  obj-y += dmi_scan.o
> Index: 2008-10-27/xen/arch/x86/copy_page.S
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ 2008-10-27/xen/arch/x86/copy_page.S       2008-06-03 
> 14:24:57.000000000 +0200
> @@ -0,0 +1,66 @@
> +#include <xen/config.h>
> +#include <asm/page.h>
> +
> +#ifdef __i386__
> +#define src_reg %esi
> +#define dst_reg %edi
> +#define WORD_SIZE 4
> +#define tmp1_reg %eax
> +#define tmp2_reg %edx
> +#define tmp3_reg %ebx
> +#define tmp4_reg %ebp
> +#else
> +#define src_reg %rsi
> +#define dst_reg %rdi
> +#define WORD_SIZE 8
> +#define tmp1_reg %r8
> +#define tmp2_reg %r9
> +#define tmp3_reg %r10
> +#define tmp4_reg %r11
> +#endif
> +
> +ENTRY(copy_page_sse2)
> +#ifdef __i386__
> +        push    %ebx
> +        push    %ebp
> +        push    %esi
> +        push    %edi
> +        mov     6*4(%esp), src_reg
> +        mov     5*4(%esp), dst_reg
> +#endif
> +        mov     $PAGE_SIZE/(4*WORD_SIZE)-3, %ecx
> +
> +        prefetchnta 2*4*WORD_SIZE(src_reg)
> +        mov     (src_reg), tmp1_reg
> +        mov     WORD_SIZE(src_reg), tmp2_reg
> +        mov     2*WORD_SIZE(src_reg), tmp3_reg
> +        mov     3*WORD_SIZE(src_reg), tmp4_reg
> +
> +0:      prefetchnta 3*4*WORD_SIZE(src_reg)
> +1:      add     $4*WORD_SIZE, src_reg
> +        movnti  tmp1_reg, (dst_reg)
> +        mov     (src_reg), tmp1_reg
> +        dec     %ecx
> +        movnti  tmp2_reg, WORD_SIZE(dst_reg)
> +        mov     WORD_SIZE(src_reg), tmp2_reg
> +        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
> +        mov     2*WORD_SIZE(src_reg), tmp3_reg
> +        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
> +        lea     4*WORD_SIZE(dst_reg), dst_reg
> +        mov     3*WORD_SIZE(src_reg), tmp4_reg
> +        jg      0b
> +        jpe     1b
> +
> +        movnti  tmp1_reg, (dst_reg)
> +        movnti  tmp2_reg, WORD_SIZE(dst_reg)
> +        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
> +        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
> +
> +#ifdef __i386__
> +        pop     %edi
> +        pop     %esi
> +        pop     %ebp
> +        pop     %ebx
> +#endif
> +        sfence
> +        ret
> Index: 2008-10-27/xen/arch/x86/domain.c
> ===================================================================
> --- 2008-10-27.orig/xen/arch/x86/domain.c     2008-11-11 
> 14:55:44.000000000 +0100
> +++ 2008-10-27/xen/arch/x86/domain.c  2008-11-11 
> 16:24:48.000000000 +0100
> @@ -183,7 +183,8 @@ static int setup_compat_l4(struct vcpu *
>      /* This page needs to look like a pagetable so that it 
> can be shadowed */
>      pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
>  
> -    l4tab = copy_page(page_to_virt(pg), idle_pg_table);
> +    l4tab = page_to_virt(pg);
> +    copy_page(l4tab, idle_pg_table);
>      l4tab[0] = l4e_empty();
>      l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
>          l4e_from_page(pg, __PAGE_HYPERVISOR);
> Index: 2008-10-27/xen/arch/x86/domain_build.c
> ===================================================================
> --- 2008-10-27.orig/xen/arch/x86/domain_build.c       
> 2008-11-11 16:19:45.000000000 +0100
> +++ 2008-10-27/xen/arch/x86/domain_build.c    2008-11-11 
> 16:18:36.000000000 +0100
> @@ -467,8 +467,9 @@ int __init construct_dom0(
>      /* WARNING: The new domain must have its 'processor' 
> field filled in! */
>      l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc 
> += PAGE_SIZE;
>      l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc 
> += 4*PAGE_SIZE;
> -    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
> -    for (i = 0; i < 4; i++) {
> +    for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) {
> +        copy_page(l2tab + i * L2_PAGETABLE_ENTRIES,
> +                  idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES);
>          l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
>          l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
>              l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, 
> __PAGE_HYPERVISOR);
> Index: 2008-10-27/xen/include/asm-x86/page.h
> ===================================================================
> --- 2008-10-27.orig/xen/include/asm-x86/page.h        
> 2008-11-11 16:19:45.000000000 +0100
> +++ 2008-10-27/xen/include/asm-x86/page.h     2008-11-11 
> 16:18:36.000000000 +0100
> @@ -215,7 +215,10 @@ void clear_page_sse2(void *);
>  #define clear_page(_p)      (cpu_has_xmm2 ?                  
>            \
>                               clear_page_sse2((void *)(_p)) : 
>            \
>                               (void)memset((void *)(_p), 0, 
> PAGE_SIZE))
> -#define copy_page(_t,_f)    memcpy((void *)(_t), (void 
> *)(_f), PAGE_SIZE)
> +void copy_page_sse2(void *, const void *);
> +#define copy_page(_t,_f)    (cpu_has_xmm2 ?                  
>            \
> +                             copy_page_sse2(_t, _f) :        
>            \
> +                             (void)memcpy(_t, _f, PAGE_SIZE))
>  
>  #define mfn_valid(mfn)      ((mfn) < max_page)
>  
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxxxxxxxx
> http://lists.xensource.com/xen-devel
>

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.