[Xen-changelog] [linux-2.6.18-xen] x86-64: provide a memset() that can deal with 4Gb or above at a time
# HG changeset patch
# User Jan Beulich <jbeulich@xxxxxxxx>
# Date 1332754669 -7200
# Node ID 2748f5d7597de35e59318f4bad8ed762e8ce32aa
# Parent b7fea0d6bf234c86e65ddf7781fdf63bb046cba6
x86-64: provide a memset() that can deal with 4Gb or above at a time

Now that a corresponding change got accepted into Linux 3.4, let's fix
this in our code too. It is particularly required by the memset()
invoked from __alloc_bootmem_core(), which can be called with sizes
beyond 4Gb out of alloc_node_mem_map() when CONFIG_FLAT_NODE_MEM_MAP
is defined (starting at around 300Gb).

In order to not affect the native kernel (which is unlikely to be
affected anyway, as it usually sets up separate maps for each node [as
long as NUMA is defined], and hence would require said amount of
memory per node [and SPARSEMEM not to be used] for the problem to
become visible, plus in this tree we're not really concerned about
fixing native problems), introduce a Xen-specific clone of the
original file.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---

diff -r b7fea0d6bf23 -r 2748f5d7597d arch/x86_64/lib/memset-xen.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/arch/x86_64/lib/memset-xen.S	Mon Mar 26 11:37:49 2012 +0200
@@ -0,0 +1,122 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+/*
+ * ISO C memset - set a memory block to a byte value.
+ *
+ * rdi   destination
+ * rsi   value (char)
+ * rdx   count (bytes)
+ *
+ * rax   original destination
+ */
+	.globl __memset
+	.globl memset
+	.p2align 4
+memset:
+__memset:
+	movq %rdi,%r10
+
+	/* expand byte value */
+	movzbl %sil,%ecx
+	movabs $0x0101010101010101,%rax
+	imulq  %rcx,%rax
+
+	/* align dst */
+	movl  %edi,%r9d
+	andl  $7,%r9d
+	jnz   .Lbad_alignment
+.Lafter_bad_alignment:
+
+	movq %rdx,%rcx
+	shrq $6,%rcx
+	jz   .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decq %rcx
+	movq %rax,(%rdi)
+	movq %rax,8(%rdi)
+	movq %rax,16(%rdi)
+	movq %rax,24(%rdi)
+	movq %rax,32(%rdi)
+	movq %rax,40(%rdi)
+	movq %rax,48(%rdi)
+	movq %rax,56(%rdi)
+	leaq 64(%rdi),%rdi
+	jnz  .Lloop_64
+
+	/* Handle tail in loops. The loops should be faster than hard
+	   to predict jump tables. */
+	.p2align 4
+.Lhandle_tail:
+	movl %edx,%ecx
+	andl $63&(~7),%ecx
+	jz   .Lhandle_7
+	shrl $3,%ecx
+	.p2align 4
+.Lloop_8:
+	decl %ecx
+	movq %rax,(%rdi)
+	leaq 8(%rdi),%rdi
+	jnz  .Lloop_8
+
+.Lhandle_7:
+	andl $7,%edx
+	jz   .Lende
+	.p2align 4
+.Lloop_1:
+	decl %edx
+	movb %al,(%rdi)
+	leaq 1(%rdi),%rdi
+	jnz  .Lloop_1
+
+.Lende:
+	movq %r10,%rax
+	ret
+
+.Lbad_alignment:
+	cmpq $7,%rdx
+	jbe  .Lhandle_7
+	movq %rax,(%rdi)	/* unaligned store */
+	movq $8,%r8
+	subq %r9,%r8
+	addq %r8,%rdi
+	subq %r8,%rdx
+	jmp  .Lafter_bad_alignment
+
+	/* Some CPUs run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad memset
+	.quad memset_c
+	.byte X86_FEATURE_REP_GOOD
+	.byte memset_c_end-memset_c
+	.byte memset_c_end-memset_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+ /* rdi	destination
+  * rsi	value
+  * rdx	count
+  */
+memset_c:
+	movq %rdi,%r9
+	movq %rdx,%rcx
+	andl $7,%edx
+	shrq $3,%rcx
+	/* expand byte value */
+	movzbl %sil,%esi
+	movabs $0x0101010101010101,%rax
+	imulq  %rsi,%rax
+	rep
+	stosq
+	movl %edx,%ecx
+	rep
+	stosb
+	movq %r9,%rax
+	ret
+memset_c_end:
+	.previous
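The movzbl/movabs/imulq sequence at the top of memset (and again in
memset_c) replicates the fill byte into all eight byte lanes of a
64-bit word. Below is a small stand-alone C sketch of the same trick,
not part of the patch; the 0xAB value is just an example:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned char c = 0xAB;	/* arbitrary fill byte */

	/* Multiplying the zero-extended byte by 0x0101010101010101 copies
	 * it into every byte of the 64-bit word; no lane can carry into
	 * the next one because each partial product is at most 0xFF. */
	uint64_t pattern = (uint64_t)c * 0x0101010101010101ULL;

	printf("pattern = 0x%016llx\n",
	       (unsigned long long)pattern);	/* 0xabababababababab */
	return 0;
}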
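As for the 4Gb limit the patch is about: the file consistently keeps
the byte count in 64-bit registers (movq %rdx,%rcx with shrq) before
any stores happen. A minimal C sketch of the failure mode this avoids
follows; it is purely illustrative (the 5 GiB figure and variable
names are made up and not from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* A bootmem-style request well above 4Gb, e.g. a flat node mem map. */
	uint64_t count = 5ULL * 1024 * 1024 * 1024;	/* 5 GiB, illustrative */

	/* 64-bit bookkeeping, as in the patched code: */
	uint64_t written64 = (count >> 3) * 8 + (count & 7);

	/* 32-bit bookkeeping: the high half of the count is silently lost
	 * before a single byte is stored. */
	uint32_t count32 = (uint32_t)count;
	uint64_t written32 = (uint64_t)(count32 >> 3) * 8 + (count32 & 7);

	printf("requested    : %llu bytes\n", (unsigned long long)count);
	printf("64-bit count : %llu bytes written\n", (unsigned long long)written64);
	printf("32-bit count : %llu bytes written\n", (unsigned long long)written32);
	return 0;
}

The tail paths in the assembly can still use 32-bit operations
(andl $7,%edx and andl $63&(~7),%ecx) because after the 64-byte loop
only the low six bits of the count remain relevant.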