From be13f7bff66e1850f9057dd813d6e7be022d9516 Mon Sep 17 00:00:00 2001 From: Liubov Dmitrieva Date: Sat, 15 Oct 2011 11:10:08 -0400 Subject: [PATCH] Optimized memcmp and wmemcmp for x86-64 and x86-32 --- ChangeLog | 29 + NEWS | 2 +- string/test-memcmp.c | 47 +- sysdeps/i386/i686/multiarch/Makefile | 3 +- sysdeps/i386/i686/multiarch/memcmp-sse4.S | 396 ++++-- sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 565 +++++--- sysdeps/i386/i686/multiarch/wmemcmp-c.c | 5 + sysdeps/i386/i686/multiarch/wmemcmp-sse4.S | 4 + sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S | 4 + sysdeps/i386/i686/multiarch/wmemcmp.S | 59 + sysdeps/x86_64/multiarch/Makefile | 3 +- sysdeps/x86_64/multiarch/memcmp-sse4.S | 192 ++- sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1997 +++++++++++++++++++++++++++ sysdeps/x86_64/multiarch/memcmp.S | 19 +- sysdeps/x86_64/multiarch/wmemcmp-c.c | 5 + sysdeps/x86_64/multiarch/wmemcmp-sse4.S | 4 + sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 + sysdeps/x86_64/multiarch/wmemcmp.S | 47 + wcsmbs/wmemcmp.c | 21 +- 19 files changed, 3070 insertions(+), 336 deletions(-) create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-c.c create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-sse4.S create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp.S create mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-c.c create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse4.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp.S diff --git a/ChangeLog b/ChangeLog index 49f091a..414611a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +2011-09-27 Liubov Dmitrieva + + * sysdeps/x86_64/multiarch/Makefile: (sysdep_routines): Add + memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c + * sysdeps/x86_64/multiarch/memcmp-ssse3: New file. + * sysdeps/x86_64/multiarch/memcmp.S: Update. Add __memcmp_ssse3. 
+ * sysdeps/x86_64/multiarch/memcmp-sse4.S: Update. + (USE_AS_WMEMCMP): New macro. + Fixing indents. + * sysdeps/x86_64/multiarch/wmemcmp.S: New file. + * sysdeps/x86_64/multiarch/wmemcmp-ssse3.S: New file. + * sysdeps/x86_64/multiarch/wmemcmp-sse4.S: New file. + * sysdeps/x86_64/multiarch/wmemcmp-c.S: New file. + * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add + wmemcmp-ssse3 wmemcmp-sse4 wmemcmp-c + * sysdeps/i386/i686/multiarch/wmemcmp.S: New file. + * sysdeps/i386/i686/multiarch/wmemcmp-c.c: New file. + * sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S: New file. + * sysdeps/i386/i686/multiarch/wmemcmp-sse4.S: New file. + * sysdeps/i386/i686/multiarch/memcmp-sse4.S: Update. + (USE_AS_WMEMCMP): New macro. + * sysdeps/i386/i686/multiarch/memcmp-ssse3: Likewise. + * sysdeps/string/test-memcmp.c: Update. + Fix simple_wmemcmp. + Add new tests. + * wcsmbs/wmemcmp.c: Update. + (WMEMCMP): New macro. + Fix overflow bug. + 2011-10-12 Andreas Jaeger [BZ #13268] diff --git a/NEWS b/NEWS index 7e9b2c1..cdb2973 100644 --- a/NEWS +++ b/NEWS @@ -33,7 +33,7 @@ Version 2.15 * Optimized strchr and strrchr for SSE on x86-32. Contributed by Liubov Dmitrieva. -* Optimized memchr, memrchr, rawmemchr for x86-64 and x86-32. +* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp for x86-64 and x86-32. Contributed by Liubov Dmitrieva. * New interfaces: scandirat, scandirat64 diff --git a/string/test-memcmp.c b/string/test-memcmp.c index 4675bd9..f246d3a 100644 --- a/string/test-memcmp.c +++ b/string/test-memcmp.c @@ -29,9 +29,21 @@ # define MEMCPY wmemcpy # define SIMPLE_MEMCMP simple_wmemcmp # define CHAR wchar_t -# define MAX_CHAR 256000 -# define UCHAR uint32_t +# define UCHAR wchar_t # define CHARBYTES 4 +# define CHAR__MIN WCHAR_MIN +# define CHAR__MAX WCHAR_MAX +int +simple_wmemcmp (const wchar_t *s1, const wchar_t *s2, size_t n) +{ + int ret = 0; + /* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. 
+ */ + while (n-- && (ret = *s1 < *s2 ? -1 : *s1 == *s2 ? 0 : 1) == 0) {s1++; s2++;} + return ret; +} #else # define MEMCMP memcmp # define MEMCPY memcpy @@ -40,18 +52,20 @@ # define MAX_CHAR 255 # define UCHAR unsigned char # define CHARBYTES 1 -#endif - -typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); +# define CHAR__MIN CHAR_MIN +# define CHAR__MAX CHAR_MAX int -SIMPLE_MEMCMP (const CHAR *s1, const CHAR *s2, size_t n) +simple_memcmp (const char *s1, const char *s2, size_t n) { int ret = 0; - while (n-- && (ret = *(UCHAR *) s1++ - *(UCHAR *) s2++) == 0); + while (n-- && (ret = *(unsigned char *) s1++ - *(unsigned char *) s2++) == 0); return ret; } +#endif + +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); IMPL (SIMPLE_MEMCMP, 0) IMPL (MEMCMP, 1) @@ -121,7 +135,7 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result) s2 = (CHAR *) (buf2 + align2); for (i = 0; i < len; i++) - s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % MAX_CHAR; + s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % CHAR__MAX; s1[len] = align1; s2[len] = align2; @@ -412,8 +426,8 @@ check1 (void) s2[99] = 1; s1[100] = 116; s2[100] = 116; - s1[101] = -13; - s2[101] = -13; + s1[101] = CHAR__MIN; + s2[101] = CHAR__MAX; s1[102] = -109; s2[102] = -109; s1[103] = 1; @@ -434,8 +448,8 @@ check1 (void) s2[110] = -109; s1[111] = 1; s2[111] = 1; - s1[112] = 20; - s2[112] = 20; + s1[112] = CHAR__MAX; + s2[112] = CHAR__MIN; s1[113] = -13; s2[113] = -13; s1[114] = -109; @@ -444,9 +458,12 @@ check1 (void) s2[115] = 1; n = 116; - exp_result = SIMPLE_MEMCMP (s1, s2, n); - FOR_EACH_IMPL (impl, 0) - check_result (impl, s1, s2, n, exp_result); + for (size_t i = 0; i < n; i++) + { + exp_result = SIMPLE_MEMCMP (s1 + i, s2 + i, n - i); + FOR_EACH_IMPL (impl, 0) + check_result (impl, s1 + i, s2 + i, n - i, exp_result); + } } int diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 8a4c219..98d1ad6 100644 --- 
a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -17,7 +17,8 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \ wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \ memrchr-sse2 memrchr-sse2-bsf memrchr-c \ - rawmemchr-sse2 rawmemchr-sse2-bsf + rawmemchr-sse2 rawmemchr-sse2-bsf \ + wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S index b1ed778..1f5dbc1 100644 --- a/sysdeps/i386/i686/multiarch/memcmp-sse4.S +++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S @@ -1,5 +1,5 @@ -/* memcmp with SSE4.2 - Copyright (C) 2010 Free Software Foundation, Inc. +/* memcmp with SSE4.2, wmemcmp with SSE4.2 + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. 
@@ -20,84 +20,97 @@ #ifndef NOT_IN_libc -#include -#include "asm-syntax.h" +# include -#ifndef MEMCMP -# define MEMCMP __memcmp_sse4_2 -#endif +# ifndef MEMCMP +# define MEMCMP __memcmp_sse4_2 +# endif -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) -#define PARMS 4 -#define BLK1 PARMS -#define BLK2 BLK1+4 -#define LEN BLK2+4 -#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1 + 4 +# define LEN BLK2 + 4 +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) -#ifdef SHARED -# define JMPTBL(I, B) I - B +# ifdef SHARED +# define JMPTBL(I, B) I - B /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - /* We first load PC into EBX. */ \ - call __i686.get_pc_thunk.bx; \ - /* Get the address of the jump table. */ \ - addl $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table and adjuested EDX/ESI. Go. */ \ - jmp *%ebx - - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - ALIGN (4) - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret -#else -# define JMPTBL(I, B) I + jump table with relative offsets. 
INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +/* We first load PC into EBX. */ \ + call __i686.get_pc_thunk.bx; \ +/* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ +/* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ +/* We loaded the jump table and adjuested EDX/ESI. Go. */ \ + jmp *%ebx +# else +# define JMPTBL(I, B) I /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) -#endif + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. 
+*/ .section .text.sse4.2,"ax",@progbits ENTRY (MEMCMP) movl BLK1(%esp), %eax movl BLK2(%esp), %edx movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(return0) +# else cmp $1, %ecx jbe L(less1bytes) +# endif + pxor %xmm0, %xmm0 cmp $64, %ecx ja L(64bytesormore) cmp $8, %ecx - PUSH (%ebx) + +# ifndef USE_AS_WMEMCMP + PUSH (%ebx) + jb L(less8bytes) +# else jb L(less8bytes) + PUSH (%ebx) +# endif + add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less8bytes): mov (%eax), %bl cmpb (%edx), %bl @@ -141,22 +154,49 @@ L(less8bytes): mov 6(%eax), %bl cmpb 6(%edx), %bl je L(0bytes) + L(nonzero): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(above) neg %eax L(above): ret CFI_PUSH (%ebx) +# endif - ALIGN (4) + .p2align 4 L(0bytes): - POP (%ebx) + POP (%ebx) xor %eax, %eax ret - ALIGN (4) +# ifdef USE_AS_WMEMCMP + +/* for wmemcmp, case N == 1 */ + + .p2align 4 +L(less8bytes): + mov (%eax), %ecx + cmp (%edx), %ecx + je L(return0) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + + .p2align 4 +L(return0): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less1bytes): jb L(0bytesend) movzbl (%eax), %eax @@ -164,14 +204,14 @@ L(less1bytes): sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(0bytesend): xor %eax, %eax ret - - ALIGN (4) +# endif + .p2align 4 L(64bytesormore): - PUSH (%ebx) + PUSH (%ebx) mov %ecx, %ebx mov $64, %ecx sub $64, %ebx @@ -208,7 +248,14 @@ L(64bytesormore_loop): add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) - ALIGN (4) +# ifdef USE_AS_WMEMCMP + +/* Label needs only for table_64bytes filling */ +L(unreal_case): +/* no code here */ + +# endif + .p2align 4 L(find_16diff): sub $16, %ecx L(find_32diff): @@ -218,9 +265,9 @@ L(find_48diff): L(find_64diff): add %ecx, %edx add %ecx, %eax - jmp L(16bytes) - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 
L(16bytes): mov -16(%eax), %ecx mov -16(%edx), %ebx @@ -243,8 +290,30 @@ L(4bytes): mov $0, %eax jne L(find_diff) RETURN +# else + .p2align 4 +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + cmp -4(%edx), %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# endif - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(49bytes): movdqu -49(%eax), %xmm1 movdqu -49(%edx), %xmm2 @@ -285,7 +354,7 @@ L(5bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(50bytes): mov $-50, %ebx movdqu -50(%eax), %xmm1 @@ -330,7 +399,7 @@ L(2bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(51bytes): mov $-51, %ebx movdqu -51(%eax), %xmm1 @@ -378,8 +447,8 @@ L(1bytes): mov $0, %eax jne L(end) RETURN - - ALIGN (4) +# endif + .p2align 4 L(52bytes): movdqu -52(%eax), %xmm1 movdqu -52(%edx), %xmm2 @@ -402,13 +471,18 @@ L(20bytes): ptest %xmm2, %xmm0 jnc L(less16bytes) mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(53bytes): movdqu -53(%eax), %xmm1 movdqu -53(%edx), %xmm2 @@ -440,7 +514,7 @@ L(21bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(54bytes): movdqu -54(%eax), %xmm1 movdqu -54(%edx), %xmm2 @@ -476,7 +550,7 @@ L(22bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(55bytes): movdqu -55(%eax), %xmm1 movdqu -55(%edx), %xmm2 @@ -513,8 +587,8 @@ L(23bytes): mov $0, %eax jne L(end) RETURN - - ALIGN (4) +# endif + .p2align 4 L(56bytes): movdqu -56(%eax), %xmm1 movdqu -56(%edx), %xmm2 @@ -538,18 +612,27 @@ L(24bytes): jnc L(less16bytes) mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -8(%edx), %ebx cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif jne L(find_diff) mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), 
%ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(57bytes): movdqu -57(%eax), %xmm1 movdqu -57(%edx), %xmm2 @@ -585,7 +668,7 @@ L(25bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(58bytes): movdqu -58(%eax), %xmm1 movdqu -58(%edx), %xmm2 @@ -627,7 +710,7 @@ L(26bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(59bytes): movdqu -59(%eax), %xmm1 movdqu -59(%edx), %xmm2 @@ -668,8 +751,8 @@ L(27bytes): mov $0, %eax jne L(end) RETURN - - ALIGN (4) +# endif + .p2align 4 L(60bytes): movdqu -60(%eax), %xmm1 movdqu -60(%edx), %xmm2 @@ -691,22 +774,38 @@ L(28bytes): pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 jnc L(less16bytes) + mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -12(%edx), %ebx cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif jne L(find_diff) + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -8(%edx), %ebx cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif jne L(find_diff) + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(61bytes): movdqu -61(%eax), %xmm1 movdqu -61(%edx), %xmm2 @@ -749,7 +848,7 @@ L(29bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(62bytes): movdqu -62(%eax), %xmm1 movdqu -62(%edx), %xmm2 @@ -792,7 +891,7 @@ L(30bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(63bytes): movdqu -63(%eax), %xmm1 movdqu -63(%edx), %xmm2 @@ -838,8 +937,9 @@ L(31bytes): mov $0, %eax jne L(end) RETURN +# endif - ALIGN (4) + .p2align 4 L(64bytes): movdqu -64(%eax), %xmm1 movdqu -64(%edx), %xmm2 @@ -863,28 +963,45 @@ L(32bytes): jnc L(less16bytes) mov -16(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -16(%edx), %ebx cmp %ebx, %ecx +# else + cmp -16(%edx), %ecx +# endif jne L(find_diff) mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -12(%edx), %ebx cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif 
jne L(find_diff) mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -8(%edx), %ebx cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif jne L(find_diff) mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less16bytes): add %ebx, %eax add %ebx, %edx @@ -910,9 +1027,35 @@ L(less16bytes): mov $0, %eax jne L(find_diff) RETURN +# else + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + cmp (%edx), %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + cmp 4(%edx), %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + cmp 8(%edx), %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + cmp 12(%edx), %ecx + + mov $0, %eax + jne L(find_diff) + RETURN +# endif - ALIGN (4) + .p2align 4 L(find_diff): +# ifndef USE_AS_WMEMCMP cmpb %bl, %cl jne L(end) cmp %bx, %cx @@ -923,17 +1066,29 @@ L(find_diff): jne L(end) cmp %bx, %cx L(end): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(bigger) neg %eax L(bigger): ret +# else + POP (%ebx) + mov $1, %eax + jg L(bigger) + neg %eax + ret + + .p2align 4 +L(bigger): + ret +# endif END (MEMCMP) .section .rodata.sse4.2,"a",@progbits - ALIGN (2) + .p2align 2 .type L(table_64bytes), @object +# ifndef USE_AS_WMEMCMP L(table_64bytes): .int JMPTBL (L(0bytes), L(table_64bytes)) .int JMPTBL (L(1bytes), L(table_64bytes)) @@ -1000,5 +1155,72 @@ L(table_64bytes): .int JMPTBL (L(62bytes), L(table_64bytes)) .int JMPTBL (L(63bytes), L(table_64bytes)) .int JMPTBL (L(64bytes), L(table_64bytes)) - .size L(table_64bytes), .-L(table_64bytes) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int 
JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), 
L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +# endif #endif diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S index 2e0d15f..eab85c1 100644 --- a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S +++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S @@ -1,5 +1,5 @@ -/* memcmp with SSSE3 - Copyright (C) 2010 Free Software Foundation, Inc. +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. 
@@ -20,47 +20,64 @@ #ifndef NOT_IN_libc -#include -#include "asm-syntax.h" +# include -#ifndef MEMCMP -# define MEMCMP __memcmp_ssse3 -#endif +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1+4 +# define LEN BLK2+4 +# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret +# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state -#define PARMS 4 -#define BLK1 PARMS -#define BLK2 BLK1+4 -#define LEN BLK2+4 -#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret -#define RETURN RETURN_END; cfi_restore_state; cfi_remember_state +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. 
+*/ - .section .text.ssse3,"ax",@progbits + atom_text_section ENTRY (MEMCMP) movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(zero) +# endif + movl BLK1(%esp), %eax cmp $48, %ecx movl BLK2(%esp), %edx jae L(48bytesormore) + +# ifndef USE_AS_WMEMCMP cmp $1, %ecx jbe L(less1bytes) - PUSH (%ebx) +# endif + + PUSH (%ebx) add %ecx, %edx add %ecx, %eax jmp L(less48bytes) - ALIGN (4) - CFI_POP (%ebx) + CFI_POP (%ebx) + +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less1bytes): jb L(zero) movb (%eax), %cl @@ -71,29 +88,30 @@ L(less1bytes): neg %eax L(1bytesend): ret +# endif - ALIGN (4) + .p2align 4 L(zero): - mov $0, %eax + xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(48bytesormore): - PUSH (%ebx) - PUSH (%esi) - PUSH (%edi) + PUSH (%ebx) + PUSH (%esi) + PUSH (%edi) cfi_remember_state - movdqu (%eax), %xmm3 - movdqu (%edx), %xmm0 + movdqu (%eax), %xmm3 + movdqu (%edx), %xmm0 movl %eax, %edi movl %edx, %esi - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx lea 16(%edi), %edi - sub $0xffff, %edx + sub $0xffff, %edx lea 16(%esi), %esi - jnz L(less16bytes) + jnz L(less16bytes) mov %edi, %edx and $0xf, %edx xor %edx, %edi @@ -104,6 +122,7 @@ L(48bytesormore): jz L(shr_0) xor %edx, %esi +# ifndef USE_AS_WMEMCMP cmp $8, %edx jae L(next_unaligned_table) cmp $0, %edx @@ -122,7 +141,7 @@ L(48bytesormore): je L(shr_6) jmp L(shr_7) - ALIGN (4) + .p2align 2 L(next_unaligned_table): cmp $8, %edx je L(shr_8) @@ -139,8 +158,17 @@ L(next_unaligned_table): cmp $14, %edx je L(shr_14) jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif - ALIGN (4) + .p2align 4 L(shr_0): cmp $80, %ecx jae L(shr_0_gobble) @@ -159,13 +187,13 @@ L(shr_0): lea (%ecx, %edi,1), %eax lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_0_gobble): lea 
-48(%ecx), %ecx movdqa (%esi), %xmm0 @@ -205,13 +233,14 @@ L(shr_0_gobble_loop_next): jnz L(exit) lea (%ecx, %edi,1), %eax lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_1): cmp $80, %ecx lea -48(%ecx), %ecx @@ -235,13 +264,13 @@ L(shr_1): jnz L(exit) lea (%ecx, %edi,1), %eax lea 1(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_1_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -288,14 +317,14 @@ L(shr_1_gobble_next): lea (%ecx, %edi,1), %eax lea 1(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_2): cmp $80, %ecx lea -48(%ecx), %ecx @@ -319,13 +348,13 @@ L(shr_2): jnz L(exit) lea (%ecx, %edi,1), %eax lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_2_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -372,13 +401,13 @@ L(shr_2_gobble_next): lea (%ecx, %edi,1), %eax lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_3): cmp $80, %ecx lea -48(%ecx), %ecx @@ -402,13 +431,13 @@ L(shr_3): jnz L(exit) lea (%ecx, %edi,1), %eax lea 3(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_3_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -455,13 +484,14 @@ L(shr_3_gobble_next): lea (%ecx, %edi,1), %eax lea 3(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_4): cmp $80, 
%ecx lea -48(%ecx), %ecx @@ -485,13 +515,13 @@ L(shr_4): jnz L(exit) lea (%ecx, %edi,1), %eax lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_4_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -538,13 +568,14 @@ L(shr_4_gobble_next): lea (%ecx, %edi,1), %eax lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_5): cmp $80, %ecx lea -48(%ecx), %ecx @@ -568,13 +599,13 @@ L(shr_5): jnz L(exit) lea (%ecx, %edi,1), %eax lea 5(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_5_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -621,13 +652,13 @@ L(shr_5_gobble_next): lea (%ecx, %edi,1), %eax lea 5(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_6): cmp $80, %ecx lea -48(%ecx), %ecx @@ -651,13 +682,13 @@ L(shr_6): jnz L(exit) lea (%ecx, %edi,1), %eax lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_6_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -704,13 +735,13 @@ L(shr_6_gobble_next): lea (%ecx, %edi,1), %eax lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_7): cmp $80, %ecx lea -48(%ecx), %ecx @@ -734,13 +765,13 @@ L(shr_7): jnz L(exit) lea (%ecx, %edi,1), %eax lea 7(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_7_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ 
-787,13 +818,14 @@ L(shr_7_gobble_next): lea (%ecx, %edi,1), %eax lea 7(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_8): cmp $80, %ecx lea -48(%ecx), %ecx @@ -817,13 +849,13 @@ L(shr_8): jnz L(exit) lea (%ecx, %edi,1), %eax lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_8_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -870,13 +902,14 @@ L(shr_8_gobble_next): lea (%ecx, %edi,1), %eax lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_9): cmp $80, %ecx lea -48(%ecx), %ecx @@ -900,13 +933,13 @@ L(shr_9): jnz L(exit) lea (%ecx, %edi,1), %eax lea 9(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_9_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -953,13 +986,13 @@ L(shr_9_gobble_next): lea (%ecx, %edi,1), %eax lea 9(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_10): cmp $80, %ecx lea -48(%ecx), %ecx @@ -983,13 +1016,13 @@ L(shr_10): jnz L(exit) lea (%ecx, %edi,1), %eax lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_10_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1036,13 +1069,13 @@ L(shr_10_gobble_next): lea (%ecx, %edi,1), %eax lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_11): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1066,13 +1099,13 @@ 
L(shr_11): jnz L(exit) lea (%ecx, %edi,1), %eax lea 11(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_11_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1119,13 +1152,14 @@ L(shr_11_gobble_next): lea (%ecx, %edi,1), %eax lea 11(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_12): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1149,13 +1183,13 @@ L(shr_12): jnz L(exit) lea (%ecx, %edi,1), %eax lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_12_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1202,13 +1236,14 @@ L(shr_12_gobble_next): lea (%ecx, %edi,1), %eax lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_13): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1232,13 +1267,13 @@ L(shr_13): jnz L(exit) lea (%ecx, %edi,1), %eax lea 13(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_13_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1285,13 +1320,13 @@ L(shr_13_gobble_next): lea (%ecx, %edi,1), %eax lea 13(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_14): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1315,13 +1350,13 @@ L(shr_14): jnz L(exit) lea (%ecx, %edi,1), %eax lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_14_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ 
-1368,13 +1403,13 @@ L(shr_14_gobble_next): lea (%ecx, %edi,1), %eax lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_15): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1398,13 +1433,13 @@ L(shr_15): jnz L(exit) lea (%ecx, %edi,1), %eax lea 15(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_15_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1451,13 +1486,14 @@ L(shr_15_gobble_next): lea (%ecx, %edi,1), %eax lea 15(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(exit): pmovmskb %xmm1, %ebx sub $0xffff, %ebx @@ -1465,9 +1501,12 @@ L(exit): lea -16(%esi), %esi lea -16(%edi), %edi mov %ebx, %edx + L(first16bytes): add %eax, %esi L(less16bytes): + +# ifndef USE_AS_WMEMCMP test %dl, %dl jz L(next_24_bytes) @@ -1492,61 +1531,61 @@ L(less16bytes): test $0x40, %dl jnz L(Byte22) L(Byte23): - movzbl -9(%edi), %eax - movzbl -9(%esi), %edx + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte16): - movzbl -16(%edi), %eax - movzbl -16(%esi), %edx + movzbl -16(%edi), %eax + movzbl -16(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte17): - movzbl -15(%edi), %eax - movzbl -15(%esi), %edx + movzbl -15(%edi), %eax + movzbl -15(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte18): - movzbl -14(%edi), %eax - movzbl -14(%esi), %edx + movzbl -14(%edi), %eax + movzbl -14(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte19): - movzbl -13(%edi), %eax - movzbl -13(%esi), %edx + movzbl -13(%edi), %eax + movzbl -13(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte20): - movzbl -12(%edi), %eax - movzbl -12(%esi), %edx + movzbl -12(%edi), %eax + movzbl 
-12(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte21): - movzbl -11(%edi), %eax - movzbl -11(%esi), %edx + movzbl -11(%edi), %eax + movzbl -11(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte22): - movzbl -10(%edi), %eax - movzbl -10(%esi), %edx + movzbl -10(%edi), %eax + movzbl -10(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(next_24_bytes): lea 8(%edi), %edi lea 8(%esi), %esi @@ -1571,20 +1610,69 @@ L(next_24_bytes): test $0x40, %dh jnz L(Byte22) - ALIGN (4) + .p2align 4 L(Byte31): - movzbl -9(%edi), %eax - movzbl -9(%esi), %edx + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx sub %edx, %eax RETURN_END +# else + +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%edi), %eax + cmp -16(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word): + mov -12(%edi), %eax + cmp -12(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%edi), %eax + cmp -8(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word): + mov -4(%edi), %eax + cmp -4(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(nequal_bigger) + neg %eax + RETURN + + .p2align 4 +L(nequal_bigger): + RETURN_END +# endif CFI_PUSH (%ebx) - ALIGN (4) + + .p2align 4 L(more8bytes): cmp $16, %ecx jae L(more16bytes) cmp $8, %ecx je L(8bytes) +# ifndef USE_AS_WMEMCMP cmp $9, %ecx je L(9bytes) cmp $10, %ecx @@ -1598,13 +1686,17 @@ L(more8bytes): cmp $14, %ecx je L(14bytes) jmp L(15bytes) +# else + jmp L(12bytes) +# endif - ALIGN (4) + .p2align 4 L(more16bytes): cmp $24, %ecx jae L(more24bytes) cmp $16, %ecx je L(16bytes) +# ifndef USE_AS_WMEMCMP cmp $17, %ecx je L(17bytes) cmp $18, %ecx @@ -1618,13 +1710,17 @@ L(more16bytes): cmp $22, %ecx je L(22bytes) jmp L(23bytes) +# else + jmp L(20bytes) +# endif - ALIGN (4) + .p2align 4 
L(more24bytes): cmp $32, %ecx jae L(more32bytes) cmp $24, %ecx je L(24bytes) +# ifndef USE_AS_WMEMCMP cmp $25, %ecx je L(25bytes) cmp $26, %ecx @@ -1638,13 +1734,17 @@ L(more24bytes): cmp $30, %ecx je L(30bytes) jmp L(31bytes) +# else + jmp L(28bytes) +# endif - ALIGN (4) + .p2align 4 L(more32bytes): cmp $40, %ecx jae L(more40bytes) cmp $32, %ecx je L(32bytes) +# ifndef USE_AS_WMEMCMP cmp $33, %ecx je L(33bytes) cmp $34, %ecx @@ -1658,11 +1758,35 @@ L(more32bytes): cmp $38, %ecx je L(38bytes) jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + .p2align 4 +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) +# ifndef USE_AS_WMEMCMP + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif - ALIGN (4) + .p2align 4 L(more40bytes): cmp $40, %ecx je L(40bytes) +# ifndef USE_AS_WMEMCMP cmp $41, %ecx je L(41bytes) cmp $42, %ecx @@ -1677,23 +1801,7 @@ L(more40bytes): je L(46bytes) jmp L(47bytes) - ALIGN (4) -L(less48bytes): - cmp $8, %ecx - jae L(more8bytes) - cmp $2, %ecx - je L(2bytes) - cmp $3, %ecx - je L(3bytes) - cmp $4, %ecx - je L(4bytes) - cmp $5, %ecx - je L(5bytes) - cmp $6, %ecx - je L(6bytes) - jmp L(7bytes) - - ALIGN (4) + .p2align 4 L(44bytes): mov -44(%eax), %ecx mov -44(%edx), %ebx @@ -1750,11 +1858,64 @@ L(4bytes): cmp %ebx, %ecx mov $0, %eax jne L(find_diff) - POP (%ebx) + POP (%ebx) + ret + CFI_PUSH (%ebx) +# else + .p2align 4 +L(44bytes): + mov -44(%eax), %ecx + cmp -44(%edx), %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + cmp -40(%edx), %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + cmp -36(%edx), %ecx + jne L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + cmp -32(%edx), %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + cmp -28(%edx), %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + cmp -24(%edx), %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + cmp 
-20(%edx), %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + xor %eax, %eax + cmp -4(%edx), %ecx + jne L(find_diff) + POP (%ebx) ret CFI_PUSH (%ebx) +# endif - ALIGN (4) +# ifndef USE_AS_WMEMCMP + + .p2align 4 L(45bytes): mov -45(%eax), %ecx mov -45(%edx), %ebx @@ -1814,11 +1975,11 @@ L(5bytes): cmp -1(%edx), %cl mov $0, %eax jne L(end) - POP (%ebx) + POP (%ebx) ret CFI_PUSH (%ebx) - ALIGN (4) + .p2align 4 L(46bytes): mov -46(%eax), %ecx mov -46(%edx), %ebx @@ -1882,11 +2043,11 @@ L(2bytes): cmp %bh, %ch mov $0, %eax jne L(end) - POP (%ebx) + POP (%ebx) ret CFI_PUSH (%ebx) - ALIGN (4) + .p2align 4 L(47bytes): movl -47(%eax), %ecx movl -47(%edx), %ebx @@ -1953,11 +2114,11 @@ L(3bytes): cmpb -1(%edx), %al mov $0, %eax jne L(end) - POP (%ebx) + POP (%ebx) ret CFI_PUSH (%ebx) - ALIGN (4) + .p2align 4 L(find_diff): cmpb %bl, %cl jne L(end) @@ -1968,14 +2129,30 @@ L(find_diff): cmp %bl, %cl jne L(end) cmp %bx, %cx + + .p2align 4 L(end): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(bigger) neg %eax L(bigger): ret +# else -END (MEMCMP) +/* for wmemcmp */ + .p2align 4 +L(find_diff): + POP (%ebx) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + .p2align 4 +L(find_diff_bigger): + ret + +# endif +END (MEMCMP) #endif diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/sysdeps/i386/i686/multiarch/wmemcmp-c.c new file mode 100644 index 0000000..94ff615 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wmemcmp-c.c @@ -0,0 +1,5 @@ +#ifndef NOT_IN_libc +# define WMEMCMP __wmemcmp_ia32 +#endif + +#include "wcsmbs/wmemcmp.c" diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S new file mode 100644 index 0000000..1a857c7 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ 
+#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_2 + +#include "memcmp-sse4.S" diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S new file mode 100644 index 0000000..a41ef95 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/wmemcmp.S b/sysdeps/i386/i686/multiarch/wmemcmp.S new file mode 100644 index 0000000..5080c14 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wmemcmp.S @@ -0,0 +1,59 @@ +/* Multiple versions of wmemcmp + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* Define multiple versions only for the definition in libc. 
*/ + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function + __i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __wmemcmp_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wmemcmp_ssse3@GOTOFF(%ebx), %eax + testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wmemcmp_sse4_2@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(wmemcmp) +#endif diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index a5254dc..e0bb984 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -15,7 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \ - strrchr-sse2-no-bsf strchr-sse2-no-bsf + strrchr-sse2-no-bsf strchr-sse2-no-bsf \ + memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S index fc439bb..28dd505 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -1,5 +1,5 @@ -/* memcmp with SSE4.1 - Copyright (C) 2010 Free Software Foundation, Inc. 
+/* memcmp with SSE4.1, wmemcmp with SSE4.1 + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -20,43 +20,54 @@ #ifndef NOT_IN_libc -#include -#include "asm-syntax.h" +# include -#ifndef MEMCMP -# define MEMCMP __memcmp_sse4_1 -#endif +# ifndef MEMCMP +# define MEMCMP __memcmp_sse4_1 +# endif -#ifndef ALIGN -# define ALIGN(n) .p2align n -#endif +# ifndef ALIGN +# define ALIGN(n) .p2align n +# endif -#define JMPTBL(I, B) (I - B) +# define JMPTBL(I, B) (I - B) -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ lea TABLE(%rip), %r11; \ movslq (%r11, INDEX, SCALE), %rcx; \ add %r11, %rcx; \ jmp *%rcx; \ ud2 +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + .section .text.sse4.1,"ax",@progbits ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx +# endif pxor %xmm0, %xmm0 cmp $79, %rdx ja L(79bytesormore) +# ifndef USE_AS_WMEMCMP cmp $1, %rdx je L(firstbyte) +# endif add %rdx, %rsi add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +# ifndef USE_AS_WMEMCMP ALIGN (4) L(firstbyte): movzbl (%rdi), %eax movzbl (%rsi), %ecx sub %ecx, %eax ret +# endif ALIGN (4) L(79bytesormore): @@ -308,11 +319,11 @@ L(less32bytesin256): ALIGN (4) L(512bytesormore): -#ifdef DATA_CACHE_SIZE_HALF +# ifdef DATA_CACHE_SIZE_HALF mov $DATA_CACHE_SIZE_HALF, %r8 -#else +# else mov __x86_64_data_cache_size_half(%rip), %r8 -#endif +# endif mov %r8, %r9 shr $1, %r8 add %r9, %r8 @@ -624,11 +635,11 @@ L(less32bytesin256in2alinged): ALIGN (4) L(512bytesormorein2aligned): -#ifdef DATA_CACHE_SIZE_HALF +# ifdef DATA_CACHE_SIZE_HALF mov $DATA_CACHE_SIZE_HALF, %r8 -#else +# else mov __x86_64_data_cache_size_half(%rip), %r8 -#endif +# endif mov %r8, %r9 shr $1, %r8 add %r9, %r8 @@ -667,6 +678,7 @@ L(64bytesormore_loopin2aligned): BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) 
L(L2_L3_cache_aglined): sub $64, %rdx + ALIGN (4) L(L2_L3_aligned_128bytes_loop): prefetchnta 0x1c0(%rdi) @@ -803,13 +815,19 @@ L(12bytes): jne L(diffin8bytes) L(4bytes): mov -4(%rsi), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%rdi), %eax cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif jne L(diffin4bytes) L(0bytes): xor %eax, %eax ret +# ifndef USE_AS_WMEMCMP +/* unreal case for wmemcmp */ ALIGN (4) L(65bytes): movdqu -65(%rdi), %xmm1 @@ -1017,6 +1035,7 @@ L(1bytes): movzbl -1(%rsi), %ecx sub %ecx, %eax ret +# endif ALIGN (4) L(68bytes): @@ -1047,13 +1066,20 @@ L(20bytes): pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 jnc L(less16bytes) - mov -4(%rdi), %eax mov -4(%rsi), %ecx + +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif jne L(diffin4bytes) xor %eax, %eax ret +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ ALIGN (4) L(69bytes): movdqu -69(%rsi), %xmm1 @@ -1161,6 +1187,7 @@ L(23bytes): jne L(diffin8bytes) xor %eax, %eax ret +# endif ALIGN (4) L(72bytes): @@ -1191,13 +1218,16 @@ L(24bytes): pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 jnc L(less16bytes) - mov -8(%rdi), %rax + mov -8(%rsi), %rcx + mov -8(%rdi), %rax cmp %rax, %rcx jne L(diffin8bytes) xor %eax, %eax ret +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ ALIGN (4) L(73bytes): movdqu -73(%rsi), %xmm1 @@ -1312,7 +1342,7 @@ L(27bytes): jne L(diffin4bytes) xor %eax, %eax ret - +# endif ALIGN (4) L(76bytes): movdqu -76(%rsi), %xmm1 @@ -1346,13 +1376,19 @@ L(28bytes): mov -12(%rsi), %rcx cmp %rax, %rcx jne L(diffin8bytes) - mov -4(%rdi), %eax mov -4(%rsi), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif jne L(diffin4bytes) xor %eax, %eax ret +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ ALIGN (4) L(77bytes): movdqu -77(%rsi), %xmm1 @@ -1474,7 +1510,7 @@ L(31bytes): jne L(diffin8bytes) xor %eax, %eax ret - +# endif ALIGN (4) L(64bytes): movdqu -64(%rdi), %xmm2 @@ -1527,7 +1563,17 @@ 
L(diffin8bytes): jne L(diffin4bytes) shr $32, %rcx shr $32, %rax + +# ifdef USE_AS_WMEMCMP +/* for wmemcmp */ + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret +# endif + L(diffin4bytes): +# ifndef USE_AS_WMEMCMP cmp %cx, %ax jne L(diffin2bytes) shr $16, %ecx @@ -1546,11 +1592,28 @@ L(end): and $0xff, %ecx sub %ecx, %eax ret +# else + +/* for wmemcmp */ + mov $1, %eax + jl L(nequal_bigger) + neg %eax + ret + + ALIGN (4) +L(nequal_bigger): + ret + +L(unreal_case): + xor %eax, %eax + ret +# endif END (MEMCMP) .section .rodata.sse4.1,"a",@progbits ALIGN (3) +# ifndef USE_AS_WMEMCMP L(table_64bytes): .int JMPTBL (L(0bytes), L(table_64bytes)) .int JMPTBL (L(1bytes), L(table_64bytes)) @@ -1632,4 +1695,87 @@ L(table_64bytes): .int JMPTBL (L(77bytes), L(table_64bytes)) .int JMPTBL (L(78bytes), L(table_64bytes)) .int JMPTBL (L(79bytes), L(table_64bytes)) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + 
.int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), 
L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) +# endif #endif diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S new file mode 100644 index 0000000..b3a2ca1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S @@ -0,0 +1,1997 @@ +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef NOT_IN_libc + +# include + +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +# ifndef ALIGN +# define ALIGN(n) .p2align n +# endif + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + atom_text_section +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx + test %rdx, %rdx + jz L(equal) +# endif + mov %rdx, %rcx + mov %rdi, %rdx + cmp $48, %rcx; + jae L(48bytesormore) /* LEN => 48 */ + + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +/* ECX >= 32. */ +L(48bytesormore): + movdqu (%rdi), %xmm3 + movdqu (%rsi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%rdi), %rdi + lea 16(%rsi), %rsi + sub $0xffff, %edx + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %rdx, %rdi + sub %rdx, %rsi + add %rdx, %rcx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %rdx, %rsi + +# ifndef USE_AS_WMEMCMP + cmp $8, %edx + jae L(next_unaligned_table) + cmp $0, %edx + je L(shr_0) + cmp $1, %edx + je L(shr_1) + cmp $2, %edx + je L(shr_2) + cmp $3, %edx + je L(shr_3) + cmp $4, %edx + je L(shr_4) + cmp $5, %edx + je L(shr_5) + cmp $6, %edx + je L(shr_6) + jmp L(shr_7) + + ALIGN (2) +L(next_unaligned_table): + cmp $8, %edx + je L(shr_8) + cmp $9, %edx + je L(shr_9) + cmp $10, %edx + je L(shr_10) + cmp $11, %edx + je L(shr_11) + cmp $12, %edx + je L(shr_12) + cmp $13, %edx + je L(shr_13) + cmp $14, %edx + je L(shr_14) + jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif + + ALIGN (4) +L(shr_0): + cmp $80, %rcx + lea -48(%rcx), %rcx + jae L(shr_0_gobble) + xor %eax, %eax + movdqa (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + movdqa 16(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) 
+L(shr_0_gobble): + movdqa (%rsi), %xmm0 + xor %eax, %eax + pcmpeqb (%rdi), %xmm0 + sub $32, %rcx + movdqa 16(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %rcx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%rsi), %xmm0 + movdqa 48(%rsi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%rdi), %xmm0 + pcmpeqb 48(%rdi), %xmm2 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %rcx + jge L(next) + inc %edx + add $32, %rcx +L(next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_1): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $1, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $1, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $1, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_1_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $1, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $1, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $1, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $1, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_1_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_1_gobble_next) + inc %edx + add $32, %rcx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, 
%xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 1(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + + ALIGN (4) +L(shr_2): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $2, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $2, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_2_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $2, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $2, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $2, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $2, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_2_gobble_next) + inc %edx + add $32, %rcx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 2(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_3): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $3, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $3, %rsi + add %rcx, %rsi + add %rcx, 
%rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_3_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $3, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $3, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $3, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $3, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_3_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_3_gobble_next) + inc %edx + add $32, %rcx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 3(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + ALIGN (4) +L(shr_4): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $4, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $4, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_4_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $4, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $4, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $4, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $4, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_4_gobble_loop) + pand 
%xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_4_gobble_next) + inc %edx + add $32, %rcx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 4(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_5): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $5, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $5, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_5_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $5, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $5, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $5, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $5, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_5_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_5_gobble_next) + inc %edx + add $32, %rcx +L(shr_5_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 5(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_6): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $6, 
%xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $6, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_6_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $6, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $6, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $6, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $6, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_6_gobble_next) + inc %edx + add $32, %rcx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 6(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_7): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_7_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $7, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $7, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_7_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $7, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $7, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $7, 48(%rsi), %xmm3 + sbb 
$0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $7, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_7_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_7_gobble_next) + inc %edx + add $32, %rcx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 7(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + ALIGN (4) +L(shr_8): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $8, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $8, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_8_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $8, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $8, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $8, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $8, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_8_gobble_next) + inc %edx + add $32, %rcx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 8(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_9): + cmp $80, 
%rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $9, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $9, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_9_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $9, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $9, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $9, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $9, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_9_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_9_gobble_next) + inc %edx + add $32, %rcx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 9(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_10): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $10, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $10, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_10_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $10, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $10, 
16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $10, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $10, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_10_gobble_next) + inc %edx + add $32, %rcx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 10(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_11): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $11, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_11_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $11, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $11, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $11, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $11, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_11_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_11_gobble_next) + inc %edx + add $32, %rcx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, 
%xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 11(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + ALIGN (4) +L(shr_12): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $12, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_12_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $12, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $12, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $12, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $12, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_12_gobble_next) + inc %edx + add $32, %rcx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 12(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + ALIGN (4) +L(shr_13): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + 
jnz L(exit) + add $13, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_13_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $13, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $13, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $13, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $13, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_13_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_13_gobble_next) + inc %edx + add $32, %rcx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 13(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_14): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $14, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_14_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $14, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $14, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $14, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $14, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 
48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_14_gobble_next) + inc %edx + add $32, %rcx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 14(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_15): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $15, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $15, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + ALIGN (4) +L(shr_15_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $15, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $15, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $15, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $15, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_15_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_15_gobble_next) + inc %edx + add $32, %rcx +L(shr_15_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 15(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) +# endif + ALIGN (4) +L(exit): + pmovmskb %xmm1, %r8d + sub $0xffff, %r8d + jz L(first16bytes) + lea -16(%rsi), %rsi + lea -16(%rdi), %rdi + mov %r8d, %edx +L(first16bytes): + add 
%rax, %rsi +L(less16bytes): +# ifndef USE_AS_WMEMCMP + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) + + movzbl -9(%rdi), %eax + movzbl -9(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte16): + movzbl -16(%rdi), %eax + movzbl -16(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte17): + movzbl -15(%rdi), %eax + movzbl -15(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte18): + movzbl -14(%rdi), %eax + movzbl -14(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte19): + movzbl -13(%rdi), %eax + movzbl -13(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte20): + movzbl -12(%rdi), %eax + movzbl -12(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte21): + movzbl -11(%rdi), %eax + movzbl -11(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(Byte22): + movzbl -10(%rdi), %eax + movzbl -10(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(next_24_bytes): + lea 8(%rdi), %rdi + lea 8(%rsi), %rsi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz L(Byte22) + + mov -9(%rdi), %eax + and $0xff, %eax + mov -9(%rsi), %edx + and $0xff, %edx + sub %edx, %eax + ret +# else +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) + ret + + ALIGN (4) +L(second_double_word): + mov -12(%rdi), %eax + cmp -12(%rsi), %eax + jne L(find_diff) + ret + + ALIGN (4) +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) + 
ret + + ALIGN (4) +L(fourth_double_word): + mov -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) + ret +# endif + + ALIGN (4) +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) + cmp $0, %ecx + je L(0bytes) +# ifndef USE_AS_WMEMCMP + cmp $1, %ecx + je L(1bytes) + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif + + ALIGN (4) +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) +# ifndef USE_AS_WMEMCMP + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) +# else + jmp L(12bytes) +# endif + + ALIGN (4) +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) +# ifndef USE_AS_WMEMCMP + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp L(23bytes) +# else + jmp L(20bytes) +# endif + + ALIGN (4) +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) +# ifndef USE_AS_WMEMCMP + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) +# else + jmp L(28bytes) +# endif + + ALIGN (4) +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) +# ifndef USE_AS_WMEMCMP + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + ALIGN (4) +L(more40bytes): + cmp $40, %ecx + je L(40bytes) 
+# ifndef USE_AS_WMEMCMP + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + ALIGN (4) +L(44bytes): + movl -44(%rdi), %eax + movl -44(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + movl -40(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + movl -36(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + movl -32(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + movl -28(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + movl -24(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + movl -20(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + movl -16(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + movl -12(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + movl -8(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + movl -4(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# else + ALIGN (4) +L(44bytes): + movl -44(%rdi), %eax + cmp -44(%rsi), %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + cmp -40(%rsi), %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + cmp -36(%rsi), %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + cmp -32(%rsi), %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + cmp -28(%rsi), %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + cmp -24(%rsi), %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + cmp -20(%rsi), %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + cmp 
-12(%rsi), %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + ALIGN (4) +L(45bytes): + movl -45(%rdi), %eax + movl -45(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(41bytes): + movl -41(%rdi), %eax + movl -41(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(37bytes): + movl -37(%rdi), %eax + movl -37(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(33bytes): + movl -33(%rdi), %eax + movl -33(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(29bytes): + movl -29(%rdi), %eax + movl -29(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(25bytes): + movl -25(%rdi), %eax + movl -25(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(21bytes): + movl -21(%rdi), %eax + movl -21(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(17bytes): + movl -17(%rdi), %eax + movl -17(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(13bytes): + movl -13(%rdi), %eax + movl -13(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(9bytes): + movl -9(%rdi), %eax + movl -9(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(5bytes): + movl -5(%rdi), %eax + movl -5(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(1bytes): + movzbl -1(%rdi), %eax + cmpb -1(%rsi), %al + jne L(set) + xor %eax, %eax + ret + + ALIGN (4) +L(46bytes): + movl -46(%rdi), %eax + movl -46(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(42bytes): + movl -42(%rdi), %eax + movl -42(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(38bytes): + movl -38(%rdi), %eax + movl -38(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(34bytes): + movl -34(%rdi), %eax + movl -34(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(30bytes): + movl -30(%rdi), %eax + movl -30(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(26bytes): + movl -26(%rdi), %eax + movl -26(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(22bytes): + 
movl -22(%rdi), %eax + movl -22(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(18bytes): + movl -18(%rdi), %eax + movl -18(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(14bytes): + movl -14(%rdi), %eax + movl -14(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(10bytes): + movl -10(%rdi), %eax + movl -10(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(6bytes): + movl -6(%rdi), %eax + movl -6(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(2bytes): + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmpb %cl, %al + jne L(set) + cmp %ecx, %eax + jne L(set) + xor %eax, %eax + ret + + ALIGN (4) +L(47bytes): + movl -47(%rdi), %eax + movl -47(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(43bytes): + movl -43(%rdi), %eax + movl -43(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(39bytes): + movl -39(%rdi), %eax + movl -39(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(35bytes): + movl -35(%rdi), %eax + movl -35(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(31bytes): + movl -31(%rdi), %eax + movl -31(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(27bytes): + movl -27(%rdi), %eax + movl -27(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(23bytes): + movl -23(%rdi), %eax + movl -23(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(19bytes): + movl -19(%rdi), %eax + movl -19(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(15bytes): + movl -15(%rdi), %eax + movl -15(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(11bytes): + movl -11(%rdi), %eax + movl -11(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(7bytes): + movl -7(%rdi), %eax + movl -7(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(3bytes): + movzwl -3(%rdi), %eax + movzwl -3(%rsi), %ecx + cmpb %cl, %al + jne L(set) + cmp %ecx, %eax + jne L(set) + movzbl -1(%rdi), %eax + cmpb -1(%rsi), %al + jne L(set) + xor %eax, %eax + ret + + ALIGN (4) +L(find_diff): + cmpb %cl, %al + jne L(set) + cmpw %cx, %ax + jne L(set) + shr $16, %eax + shr $16, %ecx + cmpb %cl, %al + jne 
L(set) + +/* We get there only if we already know there is a +difference. */ + + cmp %ecx, %eax +L(set): + sbb %eax, %eax + sbb $-1, %eax + ret +# else + +/* for wmemcmp */ + ALIGN (4) +L(find_diff): + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + ALIGN (4) +L(find_diff_bigger): + ret +# endif + + ALIGN (4) +L(equal): + xor %eax, %eax + ret + +END (MEMCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S index 301ab28..8bf8f3a 100644 --- a/sysdeps/x86_64/multiarch/memcmp.S +++ b/sysdeps/x86_64/multiarch/memcmp.S @@ -1,5 +1,5 @@ /* Multiple versions of memcmp - Copyright (C) 2010 Free Software Foundation, Inc. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -29,11 +29,20 @@ ENTRY(memcmp) cmpl $0, KIND_OFFSET+__cpu_features(%rip) jne 1f call __init_cpu_features -1: leaq __memcmp_sse2(%rip), %rax - testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) - jz 2f + +1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jnz 2f + leaq __memcmp_sse2(%rip), %rax + ret + +2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) + jz 3f leaq __memcmp_sse4_1(%rip), %rax -2: ret + ret + +3: leaq __memcmp_ssse3(%rip), %rax + ret + END(memcmp) # undef ENTRY diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c new file mode 100644 index 0000000..793f059 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-c.c @@ -0,0 +1,5 @@ +#ifndef NOT_IN_libc +# define WMEMCMP __wmemcmp_sse2 +#endif + +#include "wcsmbs/wmemcmp.c" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S new file mode 100644 index 0000000..b07973a --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_1 + +#include "memcmp-sse4.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S 
b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S new file mode 100644 index 0000000..a41ef95 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S new file mode 100644 index 0000000..7c3b7ed --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp.S @@ -0,0 +1,47 @@ +/* Multiple versions of wmemcmp + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA.  */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc.
 */ +#ifndef NOT_IN_libc + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features(%rip) + jne 1f + call __init_cpu_features + +1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jnz 2f + leaq __wmemcmp_sse2(%rip), %rax + ret + +2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) + jz 3f + leaq __wmemcmp_sse4_1(%rip), %rax + ret + +3: leaq __wmemcmp_ssse3(%rip), %rax + ret + +END(wmemcmp) +#endif diff --git a/wcsmbs/wmemcmp.c b/wcsmbs/wmemcmp.c index c6a321b..e7edc87 100644 --- a/wcsmbs/wmemcmp.c +++ b/wcsmbs/wmemcmp.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996, 1997 Free Software Foundation, Inc. +/* Copyright (C) 1996, 1997, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996. @@ -19,9 +19,12 @@ #include <wchar.h> +#ifndef WMEMCMP +# define WMEMCMP wmemcmp +#endif int -wmemcmp (s1, s2, n) +WMEMCMP (s1, s2, n) const wchar_t *s1; const wchar_t *s2; size_t n; @@ -34,19 +37,19 @@ wmemcmp (s1, s2, n) c1 = (wint_t) s1[0]; c2 = (wint_t) s2[0]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; c1 = (wint_t) s1[1]; c2 = (wint_t) s2[1]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; c1 = (wint_t) s1[2]; c2 = (wint_t) s2[2]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; c1 = (wint_t) s1[3]; c2 = (wint_t) s2[3]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; s1 += 4; s2 += 4; n -= 4; @@ -57,7 +60,7 @@ c1 = (wint_t) s1[0]; c2 = (wint_t) s2[0]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; ++s1; ++s2; --n; @@ -67,7 +70,7 @@ c1 = (wint_t) s1[0]; c2 = (wint_t) s2[0]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; ++s1; ++s2; --n; @@ -77,7 +80,7 @@ c1 = (wint_t) s1[0]; c2 = (wint_t) s2[0]; if (c1 - c2 != 0) - return c1 - c2; + return c1 > c2 ? 1 : -1; } return 0; -- 2.7.4