From: H.J. Lu Date: Thu, 16 Jul 2009 14:00:34 +0000 (-0700) Subject: memcmp implementation for x86-64 using SSE2. X-Git-Tag: upstream/2.30~13470 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e26c9b84155f31b37730fec7621f1d9a805b314d;p=external%2Fglibc.git memcmp implementation for x86-64 using SSE2. --- diff --git a/ChangeLog b/ChangeLog index c355ea4..87db19e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2009-07-15 H.J. Lu + + * sysdeps/x86_64/memcmp.S: New file. + 2009-07-15 Ulrich Drepper * sysdeps/x86-64/dl-trampoline.h: Remove after integrating code into... diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S new file mode 100644 index 0000000..165f42e --- /dev/null +++ b/sysdeps/x86_64/memcmp.S @@ -0,0 +1,359 @@ +/* memcmp with SSE2 + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include + + .text +ENTRY (memcmp) + test %rdx, %rdx + jz L(finz) + cmpq $1, %rdx + jle L(finr1b) + subq %rdi, %rsi + movq %rdx, %r10 + cmpq $32, %r10 + jge L(gt32) + /* Handle small chunks and last block of less than 32 bytes. */ +L(small): + testq $1, %r10 + jz L(s2b) + movzbl (%rdi), %eax + movzbl (%rdi, %rsi), %edx + subq $1, %r10 + je L(finz1) + addq $1, %rdi + subl %edx, %eax + jnz L(exit) +L(s2b): + testq $2, %r10 + jz L(s4b) + movzwl (%rdi), %eax + movzwl (%rdi, %rsi), %edx + subq $2, %r10 + je L(fin2_7) + addq $2, %rdi + cmpl %edx, %eax + jnz L(fin2_7) +L(s4b): + testq $4, %r10 + jz L(s8b) + movl (%rdi), %eax + movl (%rdi, %rsi), %edx + subq $4, %r10 + je L(fin2_7) + addq $4, %rdi + cmpl %edx, %eax + jnz L(fin2_7) +L(s8b): + testq $8, %r10 + jz L(s16b) + movq (%rdi), %rax + movq (%rdi, %rsi), %rdx + subq $8, %r10 + je L(fin2_7) + addq $8, %rdi + cmpq %rdx, %rax + jnz L(fin2_7) +L(s16b): + movdqu (%rdi), %xmm1 + movdqu (%rdi, %rsi), %xmm0 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + xorl %eax, %eax + subl $0xffff, %edx + jz L(finz) + bsfl %edx, %ecx + leaq (%rdi, %rcx), %rcx + movzbl (%rcx), %eax + movzbl (%rsi, %rcx), %edx + jmp L(finz1) + + .p2align 4,, 4 +L(finr1b): + movzbl (%rdi), %eax + movzbl (%rsi), %edx +L(finz1): + subl %edx, %eax +L(exit): + ret + + .p2align 4,, 4 +L(fin2_7): + cmpq %rdx, %rax + jz L(finz) + movq %rax, %r11 + subq %rdx, %r11 + bsfq %r11, %rcx + sarq $3, %rcx + salq $3, %rcx + sarq %cl, %rax + movzbl %al, %eax + sarq %cl, %rdx + movzbl %dl, %edx + subl %edx, %eax + ret + + .p2align 4,, 4 +L(finz): + xorl %eax, %eax + ret + + /* For blocks bigger than 32 bytes + 1. Advance one of the addr pointer to be 16B aligned. + 2. Treat the case of both addr pointers aligned to 16B + separately to avoid movdqu. + 3. Handle any blocks of greater than 64 consecutive bytes with + unrolling to reduce branches. + 4. At least one addr pointer is 16B aligned, use memory version + of pcmbeqb. + */ + .p2align 4,, 4 +L(gt32): + movq %rdx, %r11 + addq %rdi, %r11 + movq %rdi, %r8 + + andq $15, %r8 + jz L(16am) + /* Both pointers may be misaligned. */ + movdqu (%rdi), %xmm1 + movdqu (%rdi, %rsi), %xmm0 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + subl $0xffff, %edx + jnz L(neq) + neg %r8 + leaq 16(%rdi, %r8), %rdi +L(16am): + /* Handle two 16B aligned pointers separately. */ + testq $15, %rsi + jz L(ATR) + testq $16, %rdi + jz L(A32) + movdqu (%rdi, %rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi +L(A32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + /* Pre-unroll to be ready for unrolled 64B loop. */ + testq $32, %rdi + jz L(A64) + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(A64): + movq %r11, %r10 + andq $-64, %r10 + cmpq %r10, %rdi + jge L(mt32) + +L(A64main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A64main) + +L(mt32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(A32main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A32main) +L(mt16): + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + + .p2align 4,, 4 +L(neq): + bsfl %edx, %ecx + movzbl (%rdi, %rcx), %eax + addq %rdi, %rsi + movzbl (%rsi,%rcx), %edx + jmp L(finz1) + + .p2align 4,, 4 +L(ATR): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + testq $16, %rdi + jz L(ATR32) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + je L(mt16) + +L(ATR32): + movq %r11, %r10 + andq $-64, %r10 + testq $32, %rdi + jz L(ATR64) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(ATR64): + cmpq %rdi, %r10 + je L(mt32) + +L(ATR64main): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + jne L(ATR64main) + + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(ATR32res): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %r10, %rdi + jne L(ATR32res) + + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + /* Align to 16byte to improve instruction fetch. */ + .p2align 4,, 4 +END(memcmp) + +#undef bcmp +weak_alias (memcmp, bcmp) +libc_hidden_builtin_def (memcmp)