diff options
author | Ingo Molnar <mingo@elte.hu> | 2009-04-06 09:02:57 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-04-06 09:02:57 +0200 |
commit | f541ae326fa120fa5c57433e4d9a133df212ce41 (patch) | |
tree | bdbd94ec72cfc601118051cb35e8617d55510177 /arch/x86/lib/memcpy_64.S | |
parent | e255357764f92afcafafbd4879b222b8c752065a (diff) | |
parent | 0221c81b1b8eb0cbb6b30a0ced52ead32d2b4e4c (diff) |
Merge branch 'linus' into perfcounters/core-v2
Merge reason: we have gathered quite a few conflicts, need to merge upstream
Conflicts:
arch/powerpc/kernel/Makefile
arch/x86/ia32/ia32entry.S
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/unistd_32.h
arch/x86/include/asm/unistd_64.h
arch/x86/kernel/cpu/common.c
arch/x86/kernel/irq.c
arch/x86/kernel/syscall_table_32.S
arch/x86/mm/iomap_32.c
include/linux/sched.h
kernel/Makefile
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/lib/memcpy_64.S')
-rw-r--r-- | arch/x86/lib/memcpy_64.S | 143 |
1 files changed, 81 insertions, 62 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index c22981fa2f3a..ad5441ed1b57 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,30 +1,38 @@ /* Copyright 2002 Andi Kleen */ #include <linux/linkage.h> -#include <asm/dwarf2.h> + #include <asm/cpufeature.h> +#include <asm/dwarf2.h> /* * memcpy - Copy a memory block. * - * Input: - * rdi destination - * rsi source - * rdx count - * + * Input: + * rdi destination + * rsi source + * rdx count + * * Output: * rax original destination - */ + */ +/* + * memcpy_c() - fast string ops (REP MOVSQ) based variant. + * + * Calls to this get patched into the kernel image via the + * alternative instructions framework: + */ ALIGN memcpy_c: CFI_STARTPROC - movq %rdi,%rax - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx + movq %rdi, %rax + + movl %edx, %ecx + shrl $3, %ecx + andl $7, %edx rep movsq - movl %edx,%ecx + movl %edx, %ecx rep movsb ret CFI_ENDPROC @@ -33,99 +41,110 @@ ENDPROC(memcpy_c) ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - movq %rdi,%rax - movl %edx,%ecx - shrl $6,%ecx + /* + * Put the number of full 64-byte blocks into %ecx. + * Tail portion is handled at the end: + */ + movq %rdi, %rax + movl %edx, %ecx + shrl $6, %ecx jz .Lhandle_tail .p2align 4 .Lloop_64: + /* + * We decrement the loop index here - and the zero-flag is + * checked at the end of the loop (instructions inbetween do + * not change the zero flag): + */ decl %ecx - movq (%rsi),%r11 - movq 8(%rsi),%r8 + /* + * Move in blocks of 4x16 bytes: + */ + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r8 + movq %r11, 0*8(%rdi) + movq %r8, 1*8(%rdi) - movq %r11,(%rdi) - movq %r8,1*8(%rdi) + movq 2*8(%rsi), %r9 + movq 3*8(%rsi), %r10 + movq %r9, 2*8(%rdi) + movq %r10, 3*8(%rdi) - movq 2*8(%rsi),%r9 - movq 3*8(%rsi),%r10 + movq 4*8(%rsi), %r11 + movq 5*8(%rsi), %r8 + movq %r11, 4*8(%rdi) + movq %r8, 5*8(%rdi) - movq %r9,2*8(%rdi) - movq %r10,3*8(%rdi) + movq 6*8(%rsi), %r9 + movq 7*8(%rsi), %r10 + movq %r9, 6*8(%rdi) + movq %r10, 7*8(%rdi) - movq 4*8(%rsi),%r11 - movq 5*8(%rsi),%r8 + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi - movq %r11,4*8(%rdi) - movq %r8,5*8(%rdi) - - movq 6*8(%rsi),%r9 - movq 7*8(%rsi),%r10 - - movq %r9,6*8(%rdi) - movq %r10,7*8(%rdi) - - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi jnz .Lloop_64 .Lhandle_tail: - movl %edx,%ecx - andl $63,%ecx - shrl $3,%ecx + movl %edx, %ecx + andl $63, %ecx + shrl $3, %ecx jz .Lhandle_7 + .p2align 4 .Lloop_8: decl %ecx - movq (%rsi),%r8 - movq %r8,(%rdi) - leaq 8(%rdi),%rdi - leaq 8(%rsi),%rsi + movq (%rsi), %r8 + movq %r8, (%rdi) + leaq 8(%rdi), %rdi + leaq 8(%rsi), %rsi jnz .Lloop_8 .Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende + movl %edx, %ecx + andl $7, %ecx + jz .Lend + .p2align 4 .Lloop_1: - movb (%rsi),%r8b - movb %r8b,(%rdi) + movb (%rsi), %r8b + movb %r8b, (%rdi) incq %rdi incq %rsi decl %ecx jnz .Lloop_1 -.Lende: - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx +.Lend: ret -.Lfinal: CFI_ENDPROC ENDPROC(memcpy) ENDPROC(__memcpy) - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ + /* + * Some CPUs run faster using the string copy instructions. + * It is also a lot simpler. Use this when possible: + */ - .section .altinstr_replacement,"ax" + .section .altinstr_replacement, "ax" 1: .byte 0xeb /* jmp <disp8> */ .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ 2: .previous - .section .altinstructions,"a" + + .section .altinstructions, "a" .align 8 .quad memcpy .quad 1b .byte X86_FEATURE_REP_GOOD - /* Replace only beginning, memcpy is used to apply alternatives, so it - * is silly to overwrite itself with nops - reboot is only outcome... */ + + /* + * Replace only beginning, memcpy is used to apply alternatives, + * so it is silly to overwrite itself with nops - reboot is the + * only outcome... + */ .byte 2b - 1b .byte 2b - 1b .previous |