From 0fda545d5f9253d7b2b7832c37cd8f57c5619da4 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 7 Aug 2009 22:51:02 -0700 Subject: [PATCH] Add SSSE3-optimized implementation of str{,n}cmp for x86-64. --- ChangeLog | 9 ++ sysdeps/x86_64/multiarch/Makefile | 2 +- sysdeps/x86_64/multiarch/strcmp-ssse3.S | 3 + sysdeps/x86_64/multiarch/strcmp.S | 12 +- sysdeps/x86_64/multiarch/strncmp-ssse3.S | 4 + sysdeps/x86_64/strcmp.S | 211 ++++++++++++++++++++++++------- 6 files changed, 194 insertions(+), 47 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S create mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S diff --git a/ChangeLog b/ChangeLog index 0defb11..abdb802 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,14 @@ 2009-08-07 Ulrich Drepper + * sysdeps/x86_64/strcmp.S: Add support to compile with + USE_SSSE3. In this case palignr is used. + * sysdeps/x86_64/multiarch/strcmp.S (strcmp): If SSE4.2 is not + available but SSSE3 is, pick __str{,n}cmp_ssse3. + * sysdeps/x86_64/multiarch/Makefile [subdir=string] (sysdep_routines): + Add strcmp-ssse3 and strncmp-ssse3. + * sysdeps/x86_64/multiarch/strcmp-ssse3.S: New file. + * sysdeps/x86_64/multiarch/strncmp-ssse3.S: New file. + * sysdeps/x86_64/multiarch/strcspn-c.c (STRCSPN_SSE42): Avoid warning through fake initialization. 
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index b066402..0ded3b3 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,7 +4,7 @@ gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) -sysdep_routines += stpncpy-c strncpy-c +sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-strcspn-c.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S new file mode 100644 index 0000000..98cecb8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-ssse3.S @@ -0,0 +1,3 @@ +#define USE_SSSE3 1 +#define STRCMP __strcmp_ssse3 +#include "../strcmp.S" diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S index 1a31573..05adf1e 100644 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -34,6 +34,7 @@ mov %r9, %r11 #define STRCMP_SSE42 __strncmp_sse42 +#define STRCMP_SSSE3 __strncmp_ssse3 #define STRCMP_SSE2 __strncmp_sse2 #define __GI_STRCMP __GI_strncmp #else @@ -41,6 +42,7 @@ #ifndef STRCMP #define STRCMP strcmp #define STRCMP_SSE42 __strcmp_sse42 +#define STRCMP_SSSE3 __strcmp_ssse3 #define STRCMP_SSE2 __strcmp_sse2 #define __GI_STRCMP __GI_strcmp #endif @@ -60,10 +62,14 @@ ENTRY(STRCMP) cmpl $0, __cpu_features+KIND_OFFSET(%rip) jne 1f call __init_cpu_features -1: leaq STRCMP_SSE2(%rip), %rax - testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) - jz 2f +1: leaq STRCMP_SSE42(%rip), %rax + testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jnz 2f + leaq STRCMP_SSSE3(%rip), %rax + testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jnz 2f + leaq STRCMP_SSE2(%rip), %rax 2: ret END(STRCMP) diff --git 
a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S new file mode 100644 index 0000000..a320a3e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_SSSE3 1 +#define STRCMP __strncmp_ssse3 +#define USE_AS_STRNCMP +#include "../strcmp.S" diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index 340a64b..650ec17 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -51,7 +51,12 @@ # endif #endif +#ifndef USE_SSSE3 .text +#else + .section .text.ssse3,"ax",@progbits +#endif + ENTRY (BP_SYM (STRCMP)) #ifdef NOT_IN_libc /* Simple version since we can't use SSE registers in ld.so. */ @@ -244,9 +249,13 @@ LABEL(gobble_ashr_1): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 /* store for next cycle */ +#ifndef USE_SSSE3 psrldq $1, %xmm3 pslldq $15, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -269,9 +278,13 @@ LABEL(gobble_ashr_1): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 /* store for next cycle */ +#ifndef USE_SSSE3 psrldq $1, %xmm3 - pslldq $15, %xmm2 + pslldq $15, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -363,9 +376,13 @@ LABEL(gobble_ashr_2): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $2, %xmm3 pslldq $14, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -389,9 +406,13 @@ LABEL(gobble_ashr_2): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $2, %xmm3 - pslldq $14, %xmm2 - por %xmm3, %xmm2 + pslldq $14, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $2, %xmm3, %xmm2 /* merge 
into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -477,9 +498,13 @@ LABEL(gobble_ashr_3): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $3, %xmm3 pslldq $13, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -503,9 +528,13 @@ LABEL(gobble_ashr_3): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $3, %xmm3 - pslldq $13, %xmm2 - por %xmm3, %xmm2 + pslldq $13, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -591,9 +620,13 @@ LABEL(gobble_ashr_4): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $4, %xmm3 pslldq $12, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -617,9 +650,13 @@ LABEL(gobble_ashr_4): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $4, %xmm3 - pslldq $12, %xmm2 - por %xmm3, %xmm2 + pslldq $12, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -705,9 +742,13 @@ LABEL(gobble_ashr_5): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $5, %xmm3 pslldq $11, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -731,9 +772,13 @@ LABEL(gobble_ashr_5): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $5, %xmm3 - pslldq $11, %xmm2 - por %xmm3, %xmm2 + pslldq $11, %xmm2 + por %xmm3, %xmm2 /* 
merge into one 16byte value */ +#else + palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -819,9 +864,13 @@ LABEL(gobble_ashr_6): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $6, %xmm3 pslldq $10, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -845,9 +894,13 @@ LABEL(gobble_ashr_6): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $6, %xmm3 - pslldq $10, %xmm2 - por %xmm3, %xmm2 + pslldq $10, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -933,9 +986,13 @@ LABEL(gobble_ashr_7): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $7, %xmm3 pslldq $9, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -959,9 +1016,13 @@ LABEL(gobble_ashr_7): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $7, %xmm3 - pslldq $9, %xmm2 - por %xmm3, %xmm2 + pslldq $9, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1047,9 +1108,13 @@ LABEL(gobble_ashr_8): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $8, %xmm3 pslldq $8, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1073,9 +1138,13 @@ LABEL(gobble_ashr_8): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $8, %xmm3 - pslldq 
$8, %xmm2 - por %xmm3, %xmm2 + pslldq $8, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1161,9 +1230,13 @@ LABEL(gobble_ashr_9): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $9, %xmm3 pslldq $7, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1187,9 +1260,13 @@ LABEL(gobble_ashr_9): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $9, %xmm3 - pslldq $7, %xmm2 - por %xmm3, %xmm2 + pslldq $7, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1275,9 +1352,13 @@ LABEL(gobble_ashr_10): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $10, %xmm3 pslldq $6, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1301,9 +1382,13 @@ LABEL(gobble_ashr_10): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $10, %xmm3 - pslldq $6, %xmm2 - por %xmm3, %xmm2 + pslldq $6, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1389,9 +1474,13 @@ LABEL(gobble_ashr_11): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $11, %xmm3 pslldq $5, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1415,9 +1504,13 @@ LABEL(gobble_ashr_11): movdqa 
(%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $11, %xmm3 - pslldq $5, %xmm2 - por %xmm3, %xmm2 + pslldq $5, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1503,9 +1596,13 @@ LABEL(gobble_ashr_12): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $12, %xmm3 pslldq $4, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1529,9 +1626,13 @@ LABEL(gobble_ashr_12): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $12, %xmm3 - pslldq $4, %xmm2 - por %xmm3, %xmm2 + pslldq $4, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1617,9 +1718,13 @@ LABEL(gobble_ashr_13): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $13, %xmm3 pslldq $3, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1643,9 +1748,13 @@ LABEL(gobble_ashr_13): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $13, %xmm3 - pslldq $3, %xmm2 - por %xmm3, %xmm2 + pslldq $3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1731,9 +1840,13 @@ LABEL(gobble_ashr_14): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $14, %xmm3 pslldq $2, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif 
pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1757,9 +1870,13 @@ LABEL(gobble_ashr_14): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $14, %xmm3 - pslldq $2, %xmm2 - por %xmm3, %xmm2 + pslldq $2, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1847,9 +1964,13 @@ LABEL(gobble_ashr_15): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $15, %xmm3 pslldq $1, %xmm2 - por %xmm3, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1873,9 +1994,13 @@ LABEL(gobble_ashr_15): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 +#ifndef USE_SSSE3 psrldq $15, %xmm3 - pslldq $1, %xmm2 - por %xmm3, %xmm2 + pslldq $1, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ +#else + palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ +#endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 -- 2.7.4