From be13f7bff66e1850f9057dd813d6e7be022d9516 Mon Sep 17 00:00:00 2001 From: Liubov Dmitrieva Date: Sat, 15 Oct 2011 11:10:08 -0400 Subject: [PATCH] Optimized memcmp and wmemcmp for x86-64 and x86-32 --- ChangeLog | 29 + NEWS | 2 +- string/test-memcmp.c | 47 +- sysdeps/i386/i686/multiarch/Makefile | 3 +- sysdeps/i386/i686/multiarch/memcmp-sse4.S | 396 ++++-- sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 565 +++++--- sysdeps/i386/i686/multiarch/wmemcmp-c.c | 5 + sysdeps/i386/i686/multiarch/wmemcmp-sse4.S | 4 + sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S | 4 + sysdeps/i386/i686/multiarch/wmemcmp.S | 59 + sysdeps/x86_64/multiarch/Makefile | 3 +- sysdeps/x86_64/multiarch/memcmp-sse4.S | 192 ++- sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1997 +++++++++++++++++++++++++++ sysdeps/x86_64/multiarch/memcmp.S | 19 +- sysdeps/x86_64/multiarch/wmemcmp-c.c | 5 + sysdeps/x86_64/multiarch/wmemcmp-sse4.S | 4 + sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 + sysdeps/x86_64/multiarch/wmemcmp.S | 47 + wcsmbs/wmemcmp.c | 21 +- 19 files changed, 3070 insertions(+), 336 deletions(-) create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-c.c create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-sse4.S create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp.S create mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-c.c create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse4.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp.S diff --git a/ChangeLog b/ChangeLog index 49f091a..414611a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +2011-09-27 Liubov Dmitrieva + + * sysdeps/x86_64/multiarch/Makefile: (sysdep_routines): Add + memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c + * sysdeps/x86_64/multiarch/memcmp-ssse3: New file. + * sysdeps/x86_64/multiarch/memcmp.S: Update. Add __memcmp_ssse3. 
+ * sysdeps/x86_64/multiarch/memcmp-sse4.S: Update. + (USE_AS_WMEMCMP): New macro. + Fixing indents. + * sysdeps/x86_64/multiarch/wmemcmp.S: New file. + * sysdeps/x86_64/multiarch/wmemcmp-ssse3.S: New file. + * sysdeps/x86_64/multiarch/wmemcmp-sse4.S: New file. + * sysdeps/x86_64/multiarch/wmemcmp-c.S: New file. + * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add + wmemcmp-ssse3 wmemcmp-sse4 wmemcmp-c + * sysdeps/i386/i686/multiarch/wmemcmp.S: New file. + * sysdeps/i386/i686/multiarch/wmemcmp-c.c: New file. + * sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S: New file. + * sysdeps/i386/i686/multiarch/wmemcmp-sse4.S: New file. + * sysdeps/i386/i686/multiarch/memcmp-sse4.S: Update. + (USE_AS_WMEMCMP): New macro. + * sysdeps/i386/i686/multiarch/memcmp-ssse3: Likewise. + * sysdeps/string/test-memcmp.c: Update. + Fix simple_wmemcmp. + Add new tests. + * wcsmbs/wmemcmp.c: Update. + (WMEMCMP): New macro. + Fix overflow bug. + 2011-10-12 Andreas Jaeger [BZ #13268] diff --git a/NEWS b/NEWS index 7e9b2c1..cdb2973 100644 --- a/NEWS +++ b/NEWS @@ -33,7 +33,7 @@ Version 2.15 * Optimized strchr and strrchr for SSE on x86-32. Contributed by Liubov Dmitrieva. -* Optimized memchr, memrchr, rawmemchr for x86-64 and x86-32. +* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp for x86-64 and x86-32. Contributed by Liubov Dmitrieva. * New interfaces: scandirat, scandirat64 diff --git a/string/test-memcmp.c b/string/test-memcmp.c index 4675bd9..f246d3a 100644 --- a/string/test-memcmp.c +++ b/string/test-memcmp.c @@ -29,9 +29,21 @@ # define MEMCPY wmemcpy # define SIMPLE_MEMCMP simple_wmemcmp # define CHAR wchar_t -# define MAX_CHAR 256000 -# define UCHAR uint32_t +# define UCHAR wchar_t # define CHARBYTES 4 +# define CHAR__MIN WCHAR_MIN +# define CHAR__MAX WCHAR_MAX +int +simple_wmemcmp (const wchar_t *s1, const wchar_t *s2, size_t n) +{ + int ret = 0; + /* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. 
+ */ + while (n-- && (ret = *s1 < *s2 ? -1 : *s1 == *s2 ? 0 : 1) == 0) {s1++; s2++;} + return ret; +} #else # define MEMCMP memcmp # define MEMCPY memcpy @@ -40,18 +52,20 @@ # define MAX_CHAR 255 # define UCHAR unsigned char # define CHARBYTES 1 -#endif - -typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); +# define CHAR__MIN CHAR_MIN +# define CHAR__MAX CHAR_MAX int -SIMPLE_MEMCMP (const CHAR *s1, const CHAR *s2, size_t n) +simple_memcmp (const char *s1, const char *s2, size_t n) { int ret = 0; - while (n-- && (ret = *(UCHAR *) s1++ - *(UCHAR *) s2++) == 0); + while (n-- && (ret = *(unsigned char *) s1++ - *(unsigned char *) s2++) == 0); return ret; } +#endif + +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); IMPL (SIMPLE_MEMCMP, 0) IMPL (MEMCMP, 1) @@ -121,7 +135,7 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result) s2 = (CHAR *) (buf2 + align2); for (i = 0; i < len; i++) - s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % MAX_CHAR; + s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % CHAR__MAX; s1[len] = align1; s2[len] = align2; @@ -412,8 +426,8 @@ check1 (void) s2[99] = 1; s1[100] = 116; s2[100] = 116; - s1[101] = -13; - s2[101] = -13; + s1[101] = CHAR__MIN; + s2[101] = CHAR__MAX; s1[102] = -109; s2[102] = -109; s1[103] = 1; @@ -434,8 +448,8 @@ check1 (void) s2[110] = -109; s1[111] = 1; s2[111] = 1; - s1[112] = 20; - s2[112] = 20; + s1[112] = CHAR__MAX; + s2[112] = CHAR__MIN; s1[113] = -13; s2[113] = -13; s1[114] = -109; @@ -444,9 +458,12 @@ check1 (void) s2[115] = 1; n = 116; - exp_result = SIMPLE_MEMCMP (s1, s2, n); - FOR_EACH_IMPL (impl, 0) - check_result (impl, s1, s2, n, exp_result); + for (size_t i = 0; i < n; i++) + { + exp_result = SIMPLE_MEMCMP (s1 + i, s2 + i, n - i); + FOR_EACH_IMPL (impl, 0) + check_result (impl, s1 + i, s2 + i, n - i, exp_result); + } } int diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 8a4c219..98d1ad6 100644 --- 
a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -17,7 +17,8 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \ wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \ memrchr-sse2 memrchr-sse2-bsf memrchr-c \ - rawmemchr-sse2 rawmemchr-sse2-bsf + rawmemchr-sse2 rawmemchr-sse2-bsf \ + wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S index b1ed778..1f5dbc1 100644 --- a/sysdeps/i386/i686/multiarch/memcmp-sse4.S +++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S @@ -1,5 +1,5 @@ -/* memcmp with SSE4.2 - Copyright (C) 2010 Free Software Foundation, Inc. +/* memcmp with SSE4.2, wmemcmp with SSE4.2 + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. 
@@ -20,84 +20,97 @@ #ifndef NOT_IN_libc -#include -#include "asm-syntax.h" +# include -#ifndef MEMCMP -# define MEMCMP __memcmp_sse4_2 -#endif +# ifndef MEMCMP +# define MEMCMP __memcmp_sse4_2 +# endif -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) -#define PARMS 4 -#define BLK1 PARMS -#define BLK2 BLK1+4 -#define LEN BLK2+4 -#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1 + 4 +# define LEN BLK2 + 4 +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) -#ifdef SHARED -# define JMPTBL(I, B) I - B +# ifdef SHARED +# define JMPTBL(I, B) I - B /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - /* We first load PC into EBX. */ \ - call __i686.get_pc_thunk.bx; \ - /* Get the address of the jump table. */ \ - addl $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table and adjuested EDX/ESI. Go. */ \ - jmp *%ebx - - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - ALIGN (4) - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret -#else -# define JMPTBL(I, B) I + jump table with relative offsets. 
INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +/* We first load PC into EBX. */ \ + call __i686.get_pc_thunk.bx; \ +/* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ +/* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ +/* We loaded the jump table and adjuested EDX/ESI. Go. */ \ + jmp *%ebx +# else +# define JMPTBL(I, B) I /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) -#endif + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. 
+*/ .section .text.sse4.2,"ax",@progbits ENTRY (MEMCMP) movl BLK1(%esp), %eax movl BLK2(%esp), %edx movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(return0) +# else cmp $1, %ecx jbe L(less1bytes) +# endif + pxor %xmm0, %xmm0 cmp $64, %ecx ja L(64bytesormore) cmp $8, %ecx - PUSH (%ebx) + +# ifndef USE_AS_WMEMCMP + PUSH (%ebx) + jb L(less8bytes) +# else jb L(less8bytes) + PUSH (%ebx) +# endif + add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less8bytes): mov (%eax), %bl cmpb (%edx), %bl @@ -141,22 +154,49 @@ L(less8bytes): mov 6(%eax), %bl cmpb 6(%edx), %bl je L(0bytes) + L(nonzero): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(above) neg %eax L(above): ret CFI_PUSH (%ebx) +# endif - ALIGN (4) + .p2align 4 L(0bytes): - POP (%ebx) + POP (%ebx) xor %eax, %eax ret - ALIGN (4) +# ifdef USE_AS_WMEMCMP + +/* for wmemcmp, case N == 1 */ + + .p2align 4 +L(less8bytes): + mov (%eax), %ecx + cmp (%edx), %ecx + je L(return0) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + + .p2align 4 +L(return0): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less1bytes): jb L(0bytesend) movzbl (%eax), %eax @@ -164,14 +204,14 @@ L(less1bytes): sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(0bytesend): xor %eax, %eax ret - - ALIGN (4) +# endif + .p2align 4 L(64bytesormore): - PUSH (%ebx) + PUSH (%ebx) mov %ecx, %ebx mov $64, %ecx sub $64, %ebx @@ -208,7 +248,14 @@ L(64bytesormore_loop): add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) - ALIGN (4) +# ifdef USE_AS_WMEMCMP + +/* Label needs only for table_64bytes filling */ +L(unreal_case): +/* no code here */ + +# endif + .p2align 4 L(find_16diff): sub $16, %ecx L(find_32diff): @@ -218,9 +265,9 @@ L(find_48diff): L(find_64diff): add %ecx, %edx add %ecx, %eax - jmp L(16bytes) - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 
L(16bytes): mov -16(%eax), %ecx mov -16(%edx), %ebx @@ -243,8 +290,30 @@ L(4bytes): mov $0, %eax jne L(find_diff) RETURN +# else + .p2align 4 +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + cmp -4(%edx), %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# endif - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(49bytes): movdqu -49(%eax), %xmm1 movdqu -49(%edx), %xmm2 @@ -285,7 +354,7 @@ L(5bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(50bytes): mov $-50, %ebx movdqu -50(%eax), %xmm1 @@ -330,7 +399,7 @@ L(2bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(51bytes): mov $-51, %ebx movdqu -51(%eax), %xmm1 @@ -378,8 +447,8 @@ L(1bytes): mov $0, %eax jne L(end) RETURN - - ALIGN (4) +# endif + .p2align 4 L(52bytes): movdqu -52(%eax), %xmm1 movdqu -52(%edx), %xmm2 @@ -402,13 +471,18 @@ L(20bytes): ptest %xmm2, %xmm0 jnc L(less16bytes) mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(53bytes): movdqu -53(%eax), %xmm1 movdqu -53(%edx), %xmm2 @@ -440,7 +514,7 @@ L(21bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(54bytes): movdqu -54(%eax), %xmm1 movdqu -54(%edx), %xmm2 @@ -476,7 +550,7 @@ L(22bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(55bytes): movdqu -55(%eax), %xmm1 movdqu -55(%edx), %xmm2 @@ -513,8 +587,8 @@ L(23bytes): mov $0, %eax jne L(end) RETURN - - ALIGN (4) +# endif + .p2align 4 L(56bytes): movdqu -56(%eax), %xmm1 movdqu -56(%edx), %xmm2 @@ -538,18 +612,27 @@ L(24bytes): jnc L(less16bytes) mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -8(%edx), %ebx cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif jne L(find_diff) mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), 
%ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(57bytes): movdqu -57(%eax), %xmm1 movdqu -57(%edx), %xmm2 @@ -585,7 +668,7 @@ L(25bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(58bytes): movdqu -58(%eax), %xmm1 movdqu -58(%edx), %xmm2 @@ -627,7 +710,7 @@ L(26bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(59bytes): movdqu -59(%eax), %xmm1 movdqu -59(%edx), %xmm2 @@ -668,8 +751,8 @@ L(27bytes): mov $0, %eax jne L(end) RETURN - - ALIGN (4) +# endif + .p2align 4 L(60bytes): movdqu -60(%eax), %xmm1 movdqu -60(%edx), %xmm2 @@ -691,22 +774,38 @@ L(28bytes): pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 jnc L(less16bytes) + mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -12(%edx), %ebx cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif jne L(find_diff) + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -8(%edx), %ebx cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif jne L(find_diff) + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(61bytes): movdqu -61(%eax), %xmm1 movdqu -61(%edx), %xmm2 @@ -749,7 +848,7 @@ L(29bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(62bytes): movdqu -62(%eax), %xmm1 movdqu -62(%edx), %xmm2 @@ -792,7 +891,7 @@ L(30bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(63bytes): movdqu -63(%eax), %xmm1 movdqu -63(%edx), %xmm2 @@ -838,8 +937,9 @@ L(31bytes): mov $0, %eax jne L(end) RETURN +# endif - ALIGN (4) + .p2align 4 L(64bytes): movdqu -64(%eax), %xmm1 movdqu -64(%edx), %xmm2 @@ -863,28 +963,45 @@ L(32bytes): jnc L(less16bytes) mov -16(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -16(%edx), %ebx cmp %ebx, %ecx +# else + cmp -16(%edx), %ecx +# endif jne L(find_diff) mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -12(%edx), %ebx cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif 
jne L(find_diff) mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -8(%edx), %ebx cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif jne L(find_diff) mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less16bytes): add %ebx, %eax add %ebx, %edx @@ -910,9 +1027,35 @@ L(less16bytes): mov $0, %eax jne L(find_diff) RETURN +# else + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + cmp (%edx), %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + cmp 4(%edx), %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + cmp 8(%edx), %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + cmp 12(%edx), %ecx + + mov $0, %eax + jne L(find_diff) + RETURN +# endif - ALIGN (4) + .p2align 4 L(find_diff): +# ifndef USE_AS_WMEMCMP cmpb %bl, %cl jne L(end) cmp %bx, %cx @@ -923,17 +1066,29 @@ L(find_diff): jne L(end) cmp %bx, %cx L(end): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(bigger) neg %eax L(bigger): ret +# else + POP (%ebx) + mov $1, %eax + jg L(bigger) + neg %eax + ret + + .p2align 4 +L(bigger): + ret +# endif END (MEMCMP) .section .rodata.sse4.2,"a",@progbits - ALIGN (2) + .p2align 2 .type L(table_64bytes), @object +# ifndef USE_AS_WMEMCMP L(table_64bytes): .int JMPTBL (L(0bytes), L(table_64bytes)) .int JMPTBL (L(1bytes), L(table_64bytes)) @@ -1000,5 +1155,72 @@ L(table_64bytes): .int JMPTBL (L(62bytes), L(table_64bytes)) .int JMPTBL (L(63bytes), L(table_64bytes)) .int JMPTBL (L(64bytes), L(table_64bytes)) - .size L(table_64bytes), .-L(table_64bytes) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int 
JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), 
L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +# endif #endif diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S index 2e0d15f..eab85c1 100644 --- a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S +++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S @@ -1,5 +1,5 @@ -/* memcmp with SSSE3 - Copyright (C) 2010 Free Software Foundation, Inc. +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. 
@@ -20,47 +20,64 @@ #ifndef NOT_IN_libc -#include -#include "asm-syntax.h" +# include -#ifndef MEMCMP -# define MEMCMP __memcmp_ssse3 -#endif +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1+4 +# define LEN BLK2+4 +# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret +# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state -#define PARMS 4 -#define BLK1 PARMS -#define BLK2 BLK1+4 -#define LEN BLK2+4 -#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret -#define RETURN RETURN_END; cfi_restore_state; cfi_remember_state +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. 
+*/ - .section .text.ssse3,"ax",@progbits + atom_text_section ENTRY (MEMCMP) movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(zero) +# endif + movl BLK1(%esp), %eax cmp $48, %ecx movl BLK2(%esp), %edx jae L(48bytesormore) + +# ifndef USE_AS_WMEMCMP cmp $1, %ecx jbe L(less1bytes) - PUSH (%ebx) +# endif + + PUSH (%ebx) add %ecx, %edx add %ecx, %eax jmp L(less48bytes) - ALIGN (4) - CFI_POP (%ebx) + CFI_POP (%ebx) + +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less1bytes): jb L(zero) movb (%eax), %cl @@ -71,29 +88,30 @@ L(less1bytes): neg %eax L(1bytesend): ret +# endif - ALIGN (4) + .p2align 4 L(zero): - mov $0, %eax + xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(48bytesormore): - PUSH (%ebx) - PUSH (%esi) - PUSH (%edi) + PUSH (%ebx) + PUSH (%esi) + PUSH (%edi) cfi_remember_state - movdqu (%eax), %xmm3 - movdqu (%edx), %xmm0 + movdqu (%eax), %xmm3 + movdqu (%edx), %xmm0 movl %eax, %edi movl %edx, %esi - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx lea 16(%edi), %edi - sub $0xffff, %edx + sub $0xffff, %edx lea 16(%esi), %esi - jnz L(less16bytes) + jnz L(less16bytes) mov %edi, %edx and $0xf, %edx xor %edx, %edi @@ -104,6 +122,7 @@ L(48bytesormore): jz L(shr_0) xor %edx, %esi +# ifndef USE_AS_WMEMCMP cmp $8, %edx jae L(next_unaligned_table) cmp $0, %edx @@ -122,7 +141,7 @@ L(48bytesormore): je L(shr_6) jmp L(shr_7) - ALIGN (4) + .p2align 2 L(next_unaligned_table): cmp $8, %edx je L(shr_8) @@ -139,8 +158,17 @@ L(next_unaligned_table): cmp $14, %edx je L(shr_14) jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif - ALIGN (4) + .p2align 4 L(shr_0): cmp $80, %ecx jae L(shr_0_gobble) @@ -159,13 +187,13 @@ L(shr_0): lea (%ecx, %edi,1), %eax lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_0_gobble): lea 
-48(%ecx), %ecx movdqa (%esi), %xmm0 @@ -205,13 +233,14 @@ L(shr_0_gobble_loop_next): jnz L(exit) lea (%ecx, %edi,1), %eax lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_1): cmp $80, %ecx lea -48(%ecx), %ecx @@ -235,13 +264,13 @@ L(shr_1): jnz L(exit) lea (%ecx, %edi,1), %eax lea 1(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_1_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -288,14 +317,14 @@ L(shr_1_gobble_next): lea (%ecx, %edi,1), %eax lea 1(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_2): cmp $80, %ecx lea -48(%ecx), %ecx @@ -319,13 +348,13 @@ L(shr_2): jnz L(exit) lea (%ecx, %edi,1), %eax lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_2_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -372,13 +401,13 @@ L(shr_2_gobble_next): lea (%ecx, %edi,1), %eax lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_3): cmp $80, %ecx lea -48(%ecx), %ecx @@ -402,13 +431,13 @@ L(shr_3): jnz L(exit) lea (%ecx, %edi,1), %eax lea 3(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_3_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -455,13 +484,14 @@ L(shr_3_gobble_next): lea (%ecx, %edi,1), %eax lea 3(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_4): cmp $80, 
%ecx lea -48(%ecx), %ecx @@ -485,13 +515,13 @@ L(shr_4): jnz L(exit) lea (%ecx, %edi,1), %eax lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_4_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -538,13 +568,14 @@ L(shr_4_gobble_next): lea (%ecx, %edi,1), %eax lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_5): cmp $80, %ecx lea -48(%ecx), %ecx @@ -568,13 +599,13 @@ L(shr_5): jnz L(exit) lea (%ecx, %edi,1), %eax lea 5(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_5_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -621,13 +652,13 @@ L(shr_5_gobble_next): lea (%ecx, %edi,1), %eax lea 5(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_6): cmp $80, %ecx lea -48(%ecx), %ecx @@ -651,13 +682,13 @@ L(shr_6): jnz L(exit) lea (%ecx, %edi,1), %eax lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_6_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -704,13 +735,13 @@ L(shr_6_gobble_next): lea (%ecx, %edi,1), %eax lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_7): cmp $80, %ecx lea -48(%ecx), %ecx @@ -734,13 +765,13 @@ L(shr_7): jnz L(exit) lea (%ecx, %edi,1), %eax lea 7(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_7_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ 
-787,13 +818,14 @@ L(shr_7_gobble_next): lea (%ecx, %edi,1), %eax lea 7(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_8): cmp $80, %ecx lea -48(%ecx), %ecx @@ -817,13 +849,13 @@ L(shr_8): jnz L(exit) lea (%ecx, %edi,1), %eax lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_8_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -870,13 +902,14 @@ L(shr_8_gobble_next): lea (%ecx, %edi,1), %eax lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_9): cmp $80, %ecx lea -48(%ecx), %ecx @@ -900,13 +933,13 @@ L(shr_9): jnz L(exit) lea (%ecx, %edi,1), %eax lea 9(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_9_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -953,13 +986,13 @@ L(shr_9_gobble_next): lea (%ecx, %edi,1), %eax lea 9(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_10): cmp $80, %ecx lea -48(%ecx), %ecx @@ -983,13 +1016,13 @@ L(shr_10): jnz L(exit) lea (%ecx, %edi,1), %eax lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_10_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1036,13 +1069,13 @@ L(shr_10_gobble_next): lea (%ecx, %edi,1), %eax lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_11): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1066,13 +1099,13 @@ 
L(shr_11): jnz L(exit) lea (%ecx, %edi,1), %eax lea 11(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_11_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1119,13 +1152,14 @@ L(shr_11_gobble_next): lea (%ecx, %edi,1), %eax lea 11(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_12): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1149,13 +1183,13 @@ L(shr_12): jnz L(exit) lea (%ecx, %edi,1), %eax lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_12_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1202,13 +1236,14 @@ L(shr_12_gobble_next): lea (%ecx, %edi,1), %eax lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_13): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1232,13 +1267,13 @@ L(shr_13): jnz L(exit) lea (%ecx, %edi,1), %eax lea 13(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_13_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1285,13 +1320,13 @@ L(shr_13_gobble_next): lea (%ecx, %edi,1), %eax lea 13(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_14): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1315,13 +1350,13 @@ L(shr_14): jnz L(exit) lea (%ecx, %edi,1), %eax lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_14_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ 
-1368,13 +1403,13 @@ L(shr_14_gobble_next): lea (%ecx, %edi,1), %eax lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_15): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1398,13 +1433,13 @@ L(shr_15): jnz L(exit) lea (%ecx, %edi,1), %eax lea 15(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_15_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1451,13 +1486,14 @@ L(shr_15_gobble_next): lea (%ecx, %edi,1), %eax lea 15(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(exit): pmovmskb %xmm1, %ebx sub $0xffff, %ebx @@ -1465,9 +1501,12 @@ L(exit): lea -16(%esi), %esi lea -16(%edi), %edi mov %ebx, %edx + L(first16bytes): add %eax, %esi L(less16bytes): + +# ifndef USE_AS_WMEMCMP test %dl, %dl jz L(next_24_bytes) @@ -1492,61 +1531,61 @@ L(less16bytes): test $0x40, %dl jnz L(Byte22) L(Byte23): - movzbl -9(%edi), %eax - movzbl -9(%esi), %edx + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte16): - movzbl -16(%edi), %eax - movzbl -16(%esi), %edx + movzbl -16(%edi), %eax + movzbl -16(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte17): - movzbl -15(%edi), %eax - movzbl -15(%esi), %edx + movzbl -15(%edi), %eax + movzbl -15(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte18): - movzbl -14(%edi), %eax - movzbl -14(%esi), %edx + movzbl -14(%edi), %eax + movzbl -14(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte19): - movzbl -13(%edi), %eax - movzbl -13(%esi), %edx + movzbl -13(%edi), %eax + movzbl -13(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte20): - movzbl -12(%edi), %eax - movzbl -12(%esi), %edx + movzbl -12(%edi), %eax + movzbl 
-12(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte21): - movzbl -11(%edi), %eax - movzbl -11(%esi), %edx + movzbl -11(%edi), %eax + movzbl -11(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte22): - movzbl -10(%edi), %eax - movzbl -10(%esi), %edx + movzbl -10(%edi), %eax + movzbl -10(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(next_24_bytes): lea 8(%edi), %edi lea 8(%esi), %esi @@ -1571,20 +1610,69 @@ L(next_24_bytes): test $0x40, %dh jnz L(Byte22) - ALIGN (4) + .p2align 4 L(Byte31): - movzbl -9(%edi), %eax - movzbl -9(%esi), %edx + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx sub %edx, %eax RETURN_END +# else + +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%edi), %eax + cmp -16(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word): + mov -12(%edi), %eax + cmp -12(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%edi), %eax + cmp -8(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word): + mov -4(%edi), %eax + cmp -4(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(nequal_bigger) + neg %eax + RETURN + + .p2align 4 +L(nequal_bigger): + RETURN_END +# endif CFI_PUSH (%ebx) - ALIGN (4) + + .p2align 4 L(more8bytes): cmp $16, %ecx jae L(more16bytes) cmp $8, %ecx je L(8bytes) +# ifndef USE_AS_WMEMCMP cmp $9, %ecx je L(9bytes) cmp $10, %ecx @@ -1598,13 +1686,17 @@ L(more8bytes): cmp $14, %ecx je L(14bytes) jmp L(15bytes) +# else + jmp L(12bytes) +# endif - ALIGN (4) + .p2align 4 L(more16bytes): cmp $24, %ecx jae L(more24bytes) cmp $16, %ecx je L(16bytes) +# ifndef USE_AS_WMEMCMP cmp $17, %ecx je L(17bytes) cmp $18, %ecx @@ -1618,13 +1710,17 @@ L(more16bytes): cmp $22, %ecx je L(22bytes) jmp L(23bytes) +# else + jmp L(20bytes) +# endif - ALIGN (4) + .p2align 4 
L(more24bytes): cmp $32, %ecx jae L(more32bytes) cmp $24, %ecx je L(24bytes) +# ifndef USE_AS_WMEMCMP cmp $25, %ecx je L(25bytes) cmp $26, %ecx @@ -1638,13 +1734,17 @@ L(more24bytes): cmp $30, %ecx je L(30bytes) jmp L(31bytes) +# else + jmp L(28bytes) +# endif - ALIGN (4) + .p2align 4 L(more32bytes): cmp $40, %ecx jae L(more40bytes) cmp $32, %ecx je L(32bytes) +# ifndef USE_AS_WMEMCMP cmp $33, %ecx je L(33bytes) cmp $34, %ecx @@ -1658,11 +1758,35 @@ L(more32bytes): cmp $38, %ecx je L(38bytes) jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + .p2align 4 +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) +# ifndef USE_AS_WMEMCMP + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif - ALIGN (4) + .p2align 4 L(more40bytes): cmp $40, %ecx je L(40bytes) +# ifndef USE_AS_WMEMCMP cmp $41, %ecx je L(41bytes) cmp $42, %ecx @@ -1677,23 +1801,7 @@ L(more40bytes): je L(46bytes) jmp L(47bytes) - ALIGN (4) -L(less48bytes): - cmp $8, %ecx - jae L(more8bytes) - cmp $2, %ecx - je L(2bytes) - cmp $3, %ecx - je L(3bytes) - cmp $4, %ecx - je L(4bytes) - cmp $5, %ecx - je L(5bytes) - cmp $6, %ecx - je L(6bytes) - jmp L(7bytes) - - ALIGN (4) + .p2align 4 L(44bytes): mov -44(%eax), %ecx mov -44(%edx), %ebx @@ -1750,11 +1858,64 @@ L(4bytes): cmp %ebx, %ecx mov $0, %eax jne L(find_diff) - POP (%ebx) + POP (%ebx) + ret + CFI_PUSH (%ebx) +# else + .p2align 4 +L(44bytes): + mov -44(%eax), %ecx + cmp -44(%edx), %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + cmp -40(%edx), %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + cmp -36(%edx), %ecx + jne L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + cmp -32(%edx), %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + cmp -28(%edx), %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + cmp -24(%edx), %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + cmp 
-20(%edx), %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + xor %eax, %eax + cmp -4(%edx), %ecx + jne L(find_diff) + POP (%ebx) ret CFI_PUSH (%ebx) +# endif - ALIGN (4) +# ifndef USE_AS_WMEMCMP + + .p2align 4 L(45bytes): mov -45(%eax), %ecx mov -45(%edx), %ebx @@ -1814,11 +1975,11 @@ L(5bytes): cmp -1(%edx), %cl mov $0, %eax jne L(end) - POP (%ebx) + POP (%ebx) ret CFI_PUSH (%ebx) - ALIGN (4) + .p2align 4 L(46bytes): mov -46(%eax), %ecx mov -46(%edx), %ebx @@ -1882,11 +2043,11 @@ L(2bytes): cmp %bh, %ch mov $0, %eax jne L(end) - POP (%ebx) + POP (%ebx) ret CFI_PUSH (%ebx) - ALIGN (4) + .p2align 4 L(47bytes): movl -47(%eax), %ecx movl -47(%edx), %ebx @@ -1953,11 +2114,11 @@ L(3bytes): cmpb -1(%edx), %al mov $0, %eax jne L(end) - POP (%ebx) + POP (%ebx) ret CFI_PUSH (%ebx) - ALIGN (4) + .p2align 4 L(find_diff): cmpb %bl, %cl jne L(end) @@ -1968,14 +2129,30 @@ L(find_diff): cmp %bl, %cl jne L(end) cmp %bx, %cx + + .p2align 4 L(end): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(bigger) neg %eax L(bigger): ret +# else -END (MEMCMP) +/* for wmemcmp */ + .p2align 4 +L(find_diff): + POP (%ebx) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + .p2align 4 +L(find_diff_bigger): + ret + +# endif +END (MEMCMP) #endif diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/sysdeps/i386/i686/multiarch/wmemcmp-c.c new file mode 100644 index 0000000..94ff615 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wmemcmp-c.c @@ -0,0 +1,5 @@ +#ifndef NOT_IN_libc +# define WMEMCMP __wmemcmp_ia32 +#endif + +#include "wcsmbs/wmemcmp.c" diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S new file mode 100644 index 0000000..1a857c7 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ 
+#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_2 + +#include "memcmp-sse4.S" diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S new file mode 100644 index 0000000..a41ef95 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/wmemcmp.S b/sysdeps/i386/i686/multiarch/wmemcmp.S new file mode 100644 index 0000000..5080c14 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wmemcmp.S @@ -0,0 +1,59 @@ +/* Multiple versions of wmemcmp + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* Define multiple versions only for the definition in libc. 
*/ + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function + __i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __wmemcmp_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wmemcmp_ssse3@GOTOFF(%ebx), %eax + testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wmemcmp_sse4_2@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(wmemcmp) +#endif diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index a5254dc..e0bb984 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -15,7 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \ - strrchr-sse2-no-bsf strchr-sse2-no-bsf + strrchr-sse2-no-bsf strchr-sse2-no-bsf \ + memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S index fc439bb..28dd505 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -1,5 +1,5 @@ -/* memcmp with SSE4.1 - Copyright (C) 2010 Free Software Foundation, Inc. 
+/* memcmp with SSE4.1, wmemcmp with SSE4.1 + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -20,43 +20,54 @@ #ifndef NOT_IN_libc -#include -#include "asm-syntax.h" +# include -#ifndef MEMCMP -# define MEMCMP __memcmp_sse4_1 -#endif +# ifndef MEMCMP +# define MEMCMP __memcmp_sse4_1 +# endif -#ifndef ALIGN -# define ALIGN(n) .p2align n -#endif +# ifndef ALIGN +# define ALIGN(n) .p2align n +# endif -#define JMPTBL(I, B) (I - B) +# define JMPTBL(I, B) (I - B) -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ lea TABLE(%rip), %r11; \ movslq (%r11, INDEX, SCALE), %rcx; \ add %r11, %rcx; \ jmp *%rcx; \ ud2 +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + .section .text.sse4.1,"ax",@progbits ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx +# endif pxor %xmm0, %xmm0 cmp $79, %rdx ja L(79bytesormore) +# ifndef USE_AS_WMEMCMP cmp $1, %rdx je L(firstbyte) +# endif add %rdx, %rsi add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +# ifndef USE_AS_WMEMCMP ALIGN (4) L(firstbyte): movzbl (%rdi), %eax movzbl (%rsi), %ecx sub %ecx, %eax ret +# endif ALIGN (4) L(79bytesormore): @@ -308,11 +319,11 @@ L(less32bytesin256): ALIGN (4) L(512bytesormore): -#ifdef DATA_CACHE_SIZE_HALF +# ifdef DATA_CACHE_SIZE_HALF mov $DATA_CACHE_SIZE_HALF, %r8 -#else +# else mov __x86_64_data_cache_size_half(%rip), %r8 -#endif +# endif mov %r8, %r9 shr $1, %r8 add %r9, %r8 @@ -624,11 +635,11 @@ L(less32bytesin256in2alinged): ALIGN (4) L(512bytesormorein2aligned): -#ifdef DATA_CACHE_SIZE_HALF +# ifdef DATA_CACHE_SIZE_HALF mov $DATA_CACHE_SIZE_HALF, %r8 -#else +# else mov __x86_64_data_cache_size_half(%rip), %r8 -#endif +# endif mov %r8, %r9 shr $1, %r8 add %r9, %r8 @@ -667,6 +678,7 @@ L(64bytesormore_loopin2aligned): BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) 
L(L2_L3_cache_aglined): sub $64, %rdx + ALIGN (4) L(L2_L3_aligned_128bytes_loop): prefetchnta 0x1c0(%rdi) @@ -803,13 +815,19 @@ L(12bytes): jne L(diffin8bytes) L(4bytes): mov -4(%rsi), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%rdi), %eax cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif jne L(diffin4bytes) L(0bytes): xor %eax, %eax ret +# ifndef USE_AS_WMEMCMP +/* unreal case for wmemcmp */ ALIGN (4) L(65bytes): movdqu -65(%rdi), %xmm1 @@ -1017,6 +1035,7 @@ L(1bytes): movzbl -1(%rsi), %ecx sub %ecx, %eax ret +# endif ALIGN (4) L(68bytes): @@ -1047,13 +1066,20 @@ L(20bytes): pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 jnc L(less16bytes) - mov -4(%rdi), %eax mov -4(%rsi), %ecx + +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif jne L(diffin4bytes) xor %eax, %eax ret +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ ALIGN (4) L(69bytes): movdqu -69(%rsi), %xmm1 @@ -1161,6 +1187,7 @@ L(23bytes): jne L(diffin8bytes) xor %eax, %eax ret +# endif ALIGN (4) L(72bytes): @@ -1191,13 +1218,16 @@ L(24bytes): pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 jnc L(less16bytes) - mov -8(%rdi), %rax + mov -8(%rsi), %rcx + mov -8(%rdi), %rax cmp %rax, %rcx jne L(diffin8bytes) xor %eax, %eax ret +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ ALIGN (4) L(73bytes): movdqu -73(%rsi), %xmm1 @@ -1312,7 +1342,7 @@ L(27bytes): jne L(diffin4bytes) xor %eax, %eax ret - +# endif ALIGN (4) L(76bytes): movdqu -76(%rsi), %xmm1 @@ -1346,13 +1376,19 @@ L(28bytes): mov -12(%rsi), %rcx cmp %rax, %rcx jne L(diffin8bytes) - mov -4(%rdi), %eax mov -4(%rsi), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif jne L(diffin4bytes) xor %eax, %eax ret +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ ALIGN (4) L(77bytes): movdqu -77(%rsi), %xmm1 @@ -1474,7 +1510,7 @@ L(31bytes): jne L(diffin8bytes) xor %eax, %eax ret - +# endif ALIGN (4) L(64bytes): movdqu -64(%rdi), %xmm2 @@ -1527,7 +1563,17 @@ 
L(diffin8bytes): jne L(diffin4bytes) shr $32, %rcx shr $32, %rax + +# ifdef USE_AS_WMEMCMP +/* for wmemcmp */ + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret +# endif + L(diffin4bytes): +# ifndef USE_AS_WMEMCMP cmp %cx, %ax jne L(diffin2bytes) shr $16, %ecx @@ -1546,11 +1592,28 @@ L(end): and $0xff, %ecx sub %ecx, %eax ret +# else + +/* for wmemcmp */ + mov $1, %eax + jl L(nequal_bigger) + neg %eax + ret + + ALIGN (4) +L(nequal_bigger): + ret + +L(unreal_case): + xor %eax, %eax + ret +# endif END (MEMCMP) .section .rodata.sse4.1,"a",@progbits ALIGN (3) +# ifndef USE_AS_WMEMCMP L(table_64bytes): .int JMPTBL (L(0bytes), L(table_64bytes)) .int JMPTBL (L(1bytes), L(table_64bytes)) @@ -1632,4 +1695,87 @@ L(table_64bytes): .int JMPTBL (L(77bytes), L(table_64bytes)) .int JMPTBL (L(78bytes), L(table_64bytes)) .int JMPTBL (L(79bytes), L(table_64bytes)) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + 
.int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), 
L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) +# endif #endif diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S new file mode 100644 index 0000000..b3a2ca1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S @@ -0,0 +1,1997 @@ +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef NOT_IN_libc + +# include + +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +# ifndef ALIGN +# define ALIGN(n) .p2align n +# endif + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + atom_text_section +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx + test %rdx, %rdx + jz L(equal) +# endif + mov %rdx, %rcx + mov %rdi, %rdx + cmp $48, %rcx; + jae L(48bytesormore) /* LEN => 48 */ + + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +/* ECX >= 32. */ +L(48bytesormore): + movdqu (%rdi), %xmm3 + movdqu (%rsi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%rdi), %rdi + lea 16(%rsi), %rsi + sub $0xffff, %edx + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %rdx, %rdi + sub %rdx, %rsi + add %rdx, %rcx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %rdx, %rsi + +# ifndef USE_AS_WMEMCMP + cmp $8, %edx + jae L(next_unaligned_table) + cmp $0, %edx + je L(shr_0) + cmp $1, %edx + je L(shr_1) + cmp $2, %edx + je L(shr_2) + cmp $3, %edx + je L(shr_3) + cmp $4, %edx + je L(shr_4) + cmp $5, %edx + je L(shr_5) + cmp $6, %edx + je L(shr_6) + jmp L(shr_7) + + ALIGN (2) +L(next_unaligned_table): + cmp $8, %edx + je L(shr_8) + cmp $9, %edx + je L(shr_9) + cmp $10, %edx + je L(shr_10) + cmp $11, %edx + je L(shr_11) + cmp $12, %edx + je L(shr_12) + cmp $13, %edx + je L(shr_13) + cmp $14, %edx + je L(shr_14) + jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif + + ALIGN (4) +L(shr_0): + cmp $80, %rcx + lea -48(%rcx), %rcx + jae L(shr_0_gobble) + xor %eax, %eax + movdqa (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + movdqa 16(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) 
+L(shr_0_gobble): + movdqa (%rsi), %xmm0 + xor %eax, %eax + pcmpeqb (%rdi), %xmm0 + sub $32, %rcx + movdqa 16(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %rcx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%rsi), %xmm0 + movdqa 48(%rsi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%rdi), %xmm0 + pcmpeqb 48(%rdi), %xmm2 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %rcx + jge L(next) + inc %edx + add $32, %rcx +L(next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_1): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $1, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $1, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $1, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_1_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $1, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $1, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $1, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $1, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_1_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_1_gobble_next) + inc %edx + add $32, %rcx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, 
%xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 1(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + + ALIGN (4) +L(shr_2): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $2, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $2, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_2_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $2, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $2, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $2, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $2, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_2_gobble_next) + inc %edx + add $32, %rcx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 2(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_3): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $3, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $3, %rsi + add %rcx, %rsi + add %rcx, 
%rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_3_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $3, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $3, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $3, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $3, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_3_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_3_gobble_next) + inc %edx + add $32, %rcx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 3(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + ALIGN (4) +L(shr_4): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $4, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $4, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_4_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $4, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $4, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $4, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $4, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_4_gobble_loop) + pand 
%xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_4_gobble_next) + inc %edx + add $32, %rcx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 4(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_5): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $5, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $5, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_5_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $5, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $5, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $5, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $5, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_5_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_5_gobble_next) + inc %edx + add $32, %rcx +L(shr_5_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 5(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_6): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $6, 
%xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $6, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_6_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $6, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $6, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $6, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $6, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_6_gobble_next) + inc %edx + add $32, %rcx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 6(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_7): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_7_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $7, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $7, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_7_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $7, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $7, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $7, 48(%rsi), %xmm3 + sbb 
$0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $7, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_7_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_7_gobble_next) + inc %edx + add $32, %rcx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 7(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + ALIGN (4) +L(shr_8): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $8, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $8, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_8_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $8, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $8, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $8, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $8, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_8_gobble_next) + inc %edx + add $32, %rcx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 8(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_9): + cmp $80, 
%rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $9, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $9, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_9_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $9, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $9, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $9, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $9, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_9_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_9_gobble_next) + inc %edx + add $32, %rcx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 9(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_10): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $10, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $10, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_10_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $10, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $10, 
16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $10, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $10, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_10_gobble_next) + inc %edx + add $32, %rcx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 10(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_11): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $11, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_11_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $11, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $11, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $11, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $11, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_11_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_11_gobble_next) + inc %edx + add $32, %rcx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, 
%xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 11(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + ALIGN (4) +L(shr_12): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $12, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_12_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $12, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $12, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $12, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $12, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_12_gobble_next) + inc %edx + add $32, %rcx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 12(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_13): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + 
jnz L(exit) + add $13, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_13_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $13, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $13, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $13, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $13, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_13_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_13_gobble_next) + inc %edx + add $32, %rcx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 13(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_14): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $14, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_14_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $14, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $14, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $14, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $14, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 
48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_14_gobble_next) + inc %edx + add $32, %rcx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 14(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_15): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $15, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $15, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_15_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $15, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $15, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $15, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $15, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_15_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_15_gobble_next) + inc %edx + add $32, %rcx +L(shr_15_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 15(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) +# endif + ALIGN (4) +L(exit): + pmovmskb %xmm1, %r8d + sub $0xffff, %r8d + jz L(first16bytes) + lea -16(%rsi), %rsi + lea -16(%rdi), %rdi + mov %r8d, %edx +L(first16bytes): + add 
%rax, %rsi +L(less16bytes): +# ifndef USE_AS_WMEMCMP + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) + + movzbl -9(%rdi), %eax + movzbl -9(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte16): + movzbl -16(%rdi), %eax + movzbl -16(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte17): + movzbl -15(%rdi), %eax + movzbl -15(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte18): + movzbl -14(%rdi), %eax + movzbl -14(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte19): + movzbl -13(%rdi), %eax + movzbl -13(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte20): + movzbl -12(%rdi), %eax + movzbl -12(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte21): + movzbl -11(%rdi), %eax + movzbl -11(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte22): + movzbl -10(%rdi), %eax + movzbl -10(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(next_24_bytes): + lea 8(%rdi), %rdi + lea 8(%rsi), %rsi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz L(Byte22) + + mov -9(%rdi), %eax + and $0xff, %eax + mov -9(%rsi), %edx + and $0xff, %edx + sub %edx, %eax + ret +# else +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) + ret + + ALIGN (4) +L(second_double_word): + mov -12(%rdi), %eax + cmp -12(%rsi), %eax + jne L(find_diff) + ret + + ALIGN (4) +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) + 
ret + + ALIGN (4) +L(fourth_double_word): + mov -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) + ret +# endif + + ALIGN (4) +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) + cmp $0, %ecx + je L(0bytes) +# ifndef USE_AS_WMEMCMP + cmp $1, %ecx + je L(1bytes) + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif + + ALIGN (4) +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) +# ifndef USE_AS_WMEMCMP + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) +# else + jmp L(12bytes) +# endif + + ALIGN (4) +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) +# ifndef USE_AS_WMEMCMP + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp L(23bytes) +# else + jmp L(20bytes) +# endif + + ALIGN (4) +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) +# ifndef USE_AS_WMEMCMP + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) +# else + jmp L(28bytes) +# endif + + ALIGN (4) +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) +# ifndef USE_AS_WMEMCMP + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + ALIGN (4) +L(more40bytes): + cmp $40, %ecx + je L(40bytes) 
+# ifndef USE_AS_WMEMCMP + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + ALIGN (4) +L(44bytes): + movl -44(%rdi), %eax + movl -44(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + movl -40(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + movl -36(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + movl -32(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + movl -28(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + movl -24(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + movl -20(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + movl -16(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + movl -12(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + movl -8(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + movl -4(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# else + ALIGN (4) +L(44bytes): + movl -44(%rdi), %eax + cmp -44(%rsi), %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + cmp -40(%rsi), %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + cmp -36(%rsi), %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + cmp -32(%rsi), %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + cmp -28(%rsi), %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + cmp -24(%rsi), %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + cmp -20(%rsi), %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + cmp 
-12(%rsi), %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + ALIGN (4) +L(45bytes): + movl -45(%rdi), %eax + movl -45(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(41bytes): + movl -41(%rdi), %eax + movl -41(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(37bytes): + movl -37(%rdi), %eax + movl -37(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(33bytes): + movl -33(%rdi), %eax + movl -33(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(29bytes): + movl -29(%rdi), %eax + movl -29(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(25bytes): + movl -25(%rdi), %eax + movl -25(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(21bytes): + movl -21(%rdi), %eax + movl -21(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(17bytes): + movl -17(%rdi), %eax + movl -17(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(13bytes): + movl -13(%rdi), %eax + movl -13(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(9bytes): + movl -9(%rdi), %eax + movl -9(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(5bytes): + movl -5(%rdi), %eax + movl -5(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(1bytes): + movzbl -1(%rdi), %eax + cmpb -1(%rsi), %al + jne L(set) + xor %eax, %eax + ret + + ALIGN (4) +L(46bytes): + movl -46(%rdi), %eax + movl -46(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(42bytes): + movl -42(%rdi), %eax + movl -42(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(38bytes): + movl -38(%rdi), %eax + movl -38(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(34bytes): + movl -34(%rdi), %eax + movl -34(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(30bytes): + movl -30(%rdi), %eax + movl -30(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(26bytes): + movl -26(%rdi), %eax + movl -26(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(22bytes): + 
movl -22(%rdi), %eax + movl -22(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(18bytes): + movl -18(%rdi), %eax + movl -18(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(14bytes): + movl -14(%rdi), %eax + movl -14(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(10bytes): + movl -10(%rdi), %eax + movl -10(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(6bytes): + movl -6(%rdi), %eax + movl -6(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(2bytes): + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmpb %cl, %al + jne L(set) + cmp %ecx, %eax + jne L(set) + xor %eax, %eax + ret + + ALIGN (4) +L(47bytes): + movl -47(%rdi), %eax + movl -47(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(43bytes): + movl -43(%rdi), %eax + movl -43(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(39bytes): + movl -39(%rdi), %eax + movl -39(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(35bytes): + movl -35(%rdi), %eax + movl -35(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(31bytes): + movl -31(%rdi), %eax + movl -31(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(27bytes): + movl -27(%rdi), %eax + movl -27(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(23bytes): + movl -23(%rdi), %eax + movl -23(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(19bytes): + movl -19(%rdi), %eax + movl -19(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(15bytes): + movl -15(%rdi), %eax + movl -15(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(11bytes): + movl -11(%rdi), %eax + movl -11(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(7bytes): + movl -7(%rdi), %eax + movl -7(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(3bytes): + movzwl -3(%rdi), %eax + movzwl -3(%rsi), %ecx + cmpb %cl, %al + jne L(set) + cmp %ecx, %eax + jne L(set) + movzbl -1(%rdi), %eax + cmpb -1(%rsi), %al + jne L(set) + xor %eax, %eax + ret + + ALIGN (4) +L(find_diff): + cmpb %cl, %al + jne L(set) + cmpw %cx, %ax + jne L(set) + shr $16, %eax + shr $16, %ecx + cmpb %cl, %al + jne 
L(set) + +/* We get there only if we already know there is a +difference. */ + + cmp %ecx, %eax +L(set): + sbb %eax, %eax + sbb $-1, %eax + ret +# else + +/* for wmemcmp */ + ALIGN (4) +L(find_diff): + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + ALIGN (4) +L(find_diff_bigger): + ret +# endif + + ALIGN (4) +L(equal): + xor %eax, %eax + ret + +END (MEMCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S index 301ab28..8bf8f3a 100644 --- a/sysdeps/x86_64/multiarch/memcmp.S +++ b/sysdeps/x86_64/multiarch/memcmp.S @@ -1,5 +1,5 @@ /* Multiple versions of memcmp - Copyright (C) 2010 Free Software Foundation, Inc. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -29,11 +29,20 @@ ENTRY(memcmp) cmpl $0, KIND_OFFSET+__cpu_features(%rip) jne 1f call __init_cpu_features -1: leaq __memcmp_sse2(%rip), %rax - testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) - jz 2f + +1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jnz 2f + leaq __memcmp_sse2(%rip), %rax + ret + +2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) + jz 3f leaq __memcmp_sse4_1(%rip), %rax -2: ret + ret + +3: leaq __memcmp_ssse3(%rip), %rax + ret + END(memcmp) # undef ENTRY diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c new file mode 100644 index 0000000..793f059 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-c.c @@ -0,0 +1,5 @@ +#ifndef NOT_IN_libc +# define WMEMCMP __wmemcmp_sse2 +#endif + +#include "wcsmbs/wmemcmp.c" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S new file mode 100644 index 0000000..b07973a --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_1 + +#include "memcmp-sse4.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S 
b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S new file mode 100644 index 0000000..a41ef95 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S new file mode 100644 index 0000000..7c3b7ed --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp.S @@ -0,0 +1,47 @@ +/* Multiple versions of wmemcmp + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA.  */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc.
 */ +#ifndef NOT_IN_libc + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features(%rip) + jne 1f + call __init_cpu_features + +1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jnz 2f + leaq __wmemcmp_sse2(%rip), %rax + ret + +2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) + jz 3f + leaq __wmemcmp_sse4_1(%rip), %rax + ret + +3: leaq __wmemcmp_ssse3(%rip), %rax + ret + +END(wmemcmp) +#endif diff --git a/wcsmbs/wmemcmp.c b/wcsmbs/wmemcmp.c index c6a321b..e7edc87 100644 --- a/wcsmbs/wmemcmp.c +++ b/wcsmbs/wmemcmp.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996, 1997 Free Software Foundation, Inc. +/* Copyright (C) 1996, 1997, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996. @@ -19,9 +19,12 @@ #include <wchar.h> +#ifndef WMEMCMP +# define WMEMCMP wmemcmp +#endif int -wmemcmp (s1, s2, n) +WMEMCMP (s1, s2, n) const wchar_t *s1; const wchar_t *s2; size_t n; @@ -34,19 +37,19 @@ wmemcmp (s1, s2, n) c1 = (wint_t) s1[0]; c2 = (wint_t) s2[0]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; c1 = (wint_t) s1[1]; c2 = (wint_t) s2[1]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; c1 = (wint_t) s1[2]; c2 = (wint_t) s2[2]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; c1 = (wint_t) s1[3]; c2 = (wint_t) s2[3]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; s1 += 4; s2 += 4; n -= 4; @@ -57,7 +60,7 @@ c1 = (wint_t) s1[0]; c2 = (wint_t) s2[0]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; ++s1; ++s2; --n; @@ -67,7 +70,7 @@ c1 = (wint_t) s1[0]; c2 = (wint_t) s2[0]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; ++s1; ++s2; --n; @@ -77,7 +80,7 @@ c1 = (wint_t) s1[0]; c2 = (wint_t) s2[0]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; } return 0; -- 2.7.4