From e0c06e310c23ebf2bc20f7f4324aec2ec3d34861 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 27 Oct 2020 14:40:52 +0000 Subject: [PATCH] [X86] Regenerate popcnt tests. NFCI. Merge prefixes where possible, use 'X86' instead of 'X32' (which we try to only use for gnux32 triple tests). --- llvm/test/CodeGen/X86/popcnt.ll | 1988 +++++++++++++++++++-------------------- 1 file changed, 994 insertions(+), 994 deletions(-) diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll index cc6f315..3fe9871 100644 --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -1,29 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X32,X32-NOSSE +; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X86,X86-NOSSE ; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X32-POPCNT +; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X86-POPCNT ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT -; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefixes=X32,X32-SSE2 -; RUN: llc < %s -mtriple=i686-unknown -mattr=ssse3 | FileCheck %s --check-prefixes=X32,X32-SSSE3 +; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=ssse3 | FileCheck %s --check-prefixes=X86,X86-SSSE3 define i8 @cnt8(i8 %x) nounwind readnone { -; X32-LABEL: cnt8: -; X32: # %bb.0: -; X32-NEXT: movb {{[0-9]+}}(%esp), %cl -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrb %al -; X32-NEXT: andb $85, %al -; X32-NEXT: subb %al, %cl -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: andb $51, %al -; X32-NEXT: shrb $2, %cl -; X32-NEXT: andb $51, %cl -; X32-NEXT: addb %al, %cl -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrb $4, %al -; X32-NEXT: addb %cl, %al -; X32-NEXT: andb $15, %al -; X32-NEXT: retl +; X86-LABEL: cnt8: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb %al +; X86-NEXT: andb $85, %al +; X86-NEXT: subb %al, %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andb $51, %al +; X86-NEXT: shrb $2, %cl +; X86-NEXT: andb $51, %cl +; X86-NEXT: addb %al, %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $4, %al +; X86-NEXT: addb %cl, %al +; X86-NEXT: andb $15, %al +; X86-NEXT: retl ; ; X64-LABEL: cnt8: ; X64: # %bb.0: @@ -44,12 +44,12 @@ define i8 @cnt8(i8 %x) nounwind readnone { ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; -; X32-POPCNT-LABEL: cnt8: -; X32-POPCNT: # %bb.0: -; X32-POPCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-POPCNT-NEXT: popcntl %eax, %eax -; X32-POPCNT-NEXT: # kill: def $al killed $al killed $eax -; X32-POPCNT-NEXT: retl +; X86-POPCNT-LABEL: cnt8: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: popcntl %eax, %eax +; X86-POPCNT-NEXT: # kill: def $al killed $al killed $eax +; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: cnt8: ; X64-POPCNT: # %bb.0: @@ -62,28 +62,28 @@ define i8 @cnt8(i8 %x) nounwind readnone { } define i16 @cnt16(i16 %x) nounwind readnone { -; X32-LABEL: cnt16: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl %ecx -; X32-NEXT: andl $21845, %ecx # imm = 0x5555 -; X32-NEXT: subl %ecx, %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $13107, %ecx # imm = 0x3333 -; X32-NEXT: shrl $2, %eax -; X32-NEXT: andl $13107, %eax # imm = 0x3333 -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $4, %ecx -; X32-NEXT: addl %eax, %ecx -; X32-NEXT: andl $3855, %ecx # imm = 0xF0F -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shll $8, %eax -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %ah, %eax -; X32-NEXT: # kill: def $ax killed $ax killed $eax -; X32-NEXT: retl +; X86-LABEL: cnt16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl %ecx +; X86-NEXT: andl $21845, %ecx # imm = 0x5555 +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $13107, %ecx # imm = 0x3333 +; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $13107, %eax # imm = 0x3333 +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: andl $3855, %ecx # imm = 0xF0F +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shll $8, %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl %ah, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl ; ; X64-LABEL: cnt16: ; X64: # %bb.0: @@ -107,10 +107,10 @@ define i16 @cnt16(i16 %x) nounwind readnone { ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; -; X32-POPCNT-LABEL: cnt16: -; X32-POPCNT: # %bb.0: -; X32-POPCNT-NEXT: popcntw {{[0-9]+}}(%esp), %ax -; X32-POPCNT-NEXT: retl +; X86-POPCNT-LABEL: cnt16: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: popcntw {{[0-9]+}}(%esp), %ax +; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: cnt16: ; X64-POPCNT: # %bb.0: @@ -121,25 +121,25 @@ define i16 @cnt16(i16 %x) nounwind readnone { } define i32 @cnt32(i32 %x) nounwind readnone { -; X32-LABEL: cnt32: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl %ecx -; X32-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X32-NEXT: subl %ecx, %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X32-NEXT: shrl $2, %eax -; X32-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $4, %ecx -; X32-NEXT: addl %eax, %ecx -; X32-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; X32-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; X32-NEXT: shrl $24, %eax -; X32-NEXT: retl +; X86-LABEL: cnt32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 +; X86-NEXT: shrl $24, %eax +; X86-NEXT: retl ; ; X64-LABEL: cnt32: ; X64: # %bb.0: @@ -160,10 +160,10 @@ define i32 @cnt32(i32 %x) nounwind readnone { ; X64-NEXT: shrl $24, %eax ; X64-NEXT: retq ; -; X32-POPCNT-LABEL: cnt32: -; X32-POPCNT: # %bb.0: -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax -; X32-POPCNT-NEXT: retl +; X86-POPCNT-LABEL: cnt32: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: cnt32: ; X64-POPCNT: # %bb.0: @@ -174,43 +174,43 @@ define i32 @cnt32(i32 %x) nounwind readnone { } define i64 @cnt64(i64 %x) nounwind readnone { -; X32-NOSSE-LABEL: cnt64: -; X32-NOSSE: # %bb.0: -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOSSE-NEXT: movl %ecx, %edx -; X32-NOSSE-NEXT: shrl %edx -; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; X32-NOSSE-NEXT: subl %edx, %ecx -; X32-NOSSE-NEXT: movl %ecx, %edx -; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X32-NOSSE-NEXT: shrl $2, %ecx -; X32-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X32-NOSSE-NEXT: addl %edx, %ecx -; X32-NOSSE-NEXT: movl %ecx, %edx -; X32-NOSSE-NEXT: shrl $4, %edx -; X32-NOSSE-NEXT: addl %ecx, %edx -; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; X32-NOSSE-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %ecx -; X32-NOSSE-NEXT: movl %eax, %edx -; X32-NOSSE-NEXT: shrl %edx -; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; X32-NOSSE-NEXT: subl %edx, %eax -; X32-NOSSE-NEXT: movl %eax, %edx -; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X32-NOSSE-NEXT: shrl $2, %eax -; X32-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X32-NOSSE-NEXT: addl %edx, %eax -; X32-NOSSE-NEXT: movl %eax, %edx -; X32-NOSSE-NEXT: shrl $4, %edx -; X32-NOSSE-NEXT: addl %eax, %edx -; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %eax -; X32-NOSSE-NEXT: addl %ecx, %eax -; X32-NOSSE-NEXT: xorl %edx, %edx -; X32-NOSSE-NEXT: retl +; X86-NOSSE-LABEL: cnt64: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl %ecx, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %ecx +; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %ecx, %edx +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %ecx +; X86-NOSSE-NEXT: movl %eax, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %eax +; X86-NOSSE-NEXT: movl %eax, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %eax +; X86-NOSSE-NEXT: movl %eax, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: addl %ecx, %eax +; X86-NOSSE-NEXT: xorl %edx, %edx +; X86-NOSSE-NEXT: retl ; ; X64-LABEL: cnt64: ; X64: # %bb.0: @@ -235,146 +235,146 @@ define i64 @cnt64(i64 %x) nounwind readnone { ; X64-NEXT: shrq $56, %rax ; X64-NEXT: retq ; -; X32-POPCNT-LABEL: cnt64: -; X32-POPCNT: # %bb.0: -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax -; X32-POPCNT-NEXT: addl %ecx, %eax -; X32-POPCNT-NEXT: xorl %edx, %edx -; X32-POPCNT-NEXT: retl +; X86-POPCNT-LABEL: cnt64: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: addl %ecx, %eax +; X86-POPCNT-NEXT: xorl %edx, %edx +; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: cnt64: ; X64-POPCNT: # %bb.0: ; X64-POPCNT-NEXT: popcntq %rdi, %rax ; X64-POPCNT-NEXT: retq ; -; X32-SSE2-LABEL: cnt64: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrlw $1, %xmm1 -; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE2-NEXT: psubb %xmm1, %xmm0 -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE2-NEXT: pand %xmm1, %xmm2 -; X32-SSE2-NEXT: psrlw $2, %xmm0 -; X32-SSE2-NEXT: pand %xmm1, %xmm0 -; X32-SSE2-NEXT: paddb %xmm2, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrlw $4, %xmm1 -; X32-SSE2-NEXT: paddb %xmm0, %xmm1 -; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE2-NEXT: pxor %xmm0, %xmm0 -; X32-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X32-SSE2-NEXT: movd %xmm0, %eax -; X32-SSE2-NEXT: xorl %edx, %edx -; X32-SSE2-NEXT: retl +; X86-SSE2-LABEL: cnt64: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; -; X32-SSSE3-LABEL: cnt64: -; X32-SSSE3: # %bb.0: -; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2 -; X32-SSSE3-NEXT: pand %xmm0, %xmm2 -; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4 -; X32-SSSE3-NEXT: psrlw $4, %xmm1 -; X32-SSSE3-NEXT: pand %xmm0, %xmm1 -; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3 -; X32-SSSE3-NEXT: paddb %xmm4, %xmm3 -; X32-SSSE3-NEXT: pxor %xmm0, %xmm0 -; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0 -; X32-SSSE3-NEXT: movd %xmm0, %eax -; X32-SSSE3-NEXT: xorl %edx, %edx -; X32-SSSE3-NEXT: retl +; X86-SSSE3-LABEL: cnt64: +; X86-SSSE3: # %bb.0: +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X86-SSSE3-NEXT: pshufb %xmm2, %xmm4 +; X86-SSSE3-NEXT: psrlw $4, %xmm1 +; X86-SSSE3-NEXT: pand %xmm0, %xmm1 +; X86-SSSE3-NEXT: pshufb %xmm1, %xmm3 +; X86-SSSE3-NEXT: paddb %xmm4, %xmm3 +; X86-SSSE3-NEXT: pxor %xmm0, %xmm0 +; X86-SSSE3-NEXT: psadbw %xmm3, %xmm0 +; X86-SSSE3-NEXT: movd %xmm0, %eax +; X86-SSSE3-NEXT: xorl %edx, %edx +; X86-SSSE3-NEXT: retl %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) ret i64 %cnt } define i128 @cnt128(i128 %x) nounwind readnone { -; X32-NOSSE-LABEL: cnt128: -; X32-NOSSE: # %bb.0: -; X32-NOSSE-NEXT: pushl %ebx -; X32-NOSSE-NEXT: pushl %edi -; X32-NOSSE-NEXT: pushl %esi -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NOSSE-NEXT: movl %edi, %ebx -; X32-NOSSE-NEXT: shrl %ebx -; X32-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X32-NOSSE-NEXT: subl %ebx, %edi -; X32-NOSSE-NEXT: movl %edi, %ebx -; X32-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 -; X32-NOSSE-NEXT: shrl $2, %edi -; X32-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X32-NOSSE-NEXT: addl %ebx, %edi -; X32-NOSSE-NEXT: movl %edi, %ebx -; X32-NOSSE-NEXT: shrl $4, %ebx -; X32-NOSSE-NEXT: addl %edi, %ebx -; X32-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X32-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %edi -; X32-NOSSE-NEXT: movl %esi, %ebx -; X32-NOSSE-NEXT: shrl %ebx -; X32-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X32-NOSSE-NEXT: subl %ebx, %esi -; X32-NOSSE-NEXT: movl %esi, %ebx -; X32-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 -; X32-NOSSE-NEXT: shrl $2, %esi -; X32-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 -; X32-NOSSE-NEXT: addl %ebx, %esi -; X32-NOSSE-NEXT: movl %esi, %ebx -; X32-NOSSE-NEXT: shrl $4, %ebx -; X32-NOSSE-NEXT: addl %esi, %ebx -; X32-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X32-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %esi -; X32-NOSSE-NEXT: addl %edi, %esi -; X32-NOSSE-NEXT: movl %edx, %edi -; X32-NOSSE-NEXT: shrl %edi -; X32-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X32-NOSSE-NEXT: subl %edi, %edx -; X32-NOSSE-NEXT: movl %edx, %edi -; X32-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X32-NOSSE-NEXT: shrl $2, %edx -; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X32-NOSSE-NEXT: addl %edi, %edx -; X32-NOSSE-NEXT: movl %edx, %edi -; X32-NOSSE-NEXT: shrl $4, %edi -; X32-NOSSE-NEXT: addl %edx, %edi -; X32-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F -; X32-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %edx -; X32-NOSSE-NEXT: movl %ecx, %edi -; X32-NOSSE-NEXT: shrl %edi -; X32-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X32-NOSSE-NEXT: subl %edi, %ecx -; X32-NOSSE-NEXT: movl %ecx, %edi -; X32-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X32-NOSSE-NEXT: shrl $2, %ecx -; X32-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X32-NOSSE-NEXT: addl %edi, %ecx -; X32-NOSSE-NEXT: movl %ecx, %edi -; X32-NOSSE-NEXT: shrl $4, %edi -; X32-NOSSE-NEXT: addl %ecx, %edi -; X32-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F -; X32-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %ecx -; X32-NOSSE-NEXT: addl %edx, %ecx -; X32-NOSSE-NEXT: addl %esi, %ecx -; X32-NOSSE-NEXT: movl %ecx, (%eax) -; X32-NOSSE-NEXT: movl $0, 12(%eax) -; X32-NOSSE-NEXT: movl $0, 8(%eax) -; X32-NOSSE-NEXT: movl $0, 4(%eax) -; X32-NOSSE-NEXT: popl %esi -; X32-NOSSE-NEXT: popl %edi -; X32-NOSSE-NEXT: popl %ebx -; X32-NOSSE-NEXT: retl $4 +; X86-NOSSE-LABEL: cnt128: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: pushl %edi +; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NOSSE-NEXT: movl %edi, %ebx +; X86-NOSSE-NEXT: shrl %ebx +; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %ebx, %edi +; X86-NOSSE-NEXT: movl %edi, %ebx +; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %edi +; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %ebx, %edi +; X86-NOSSE-NEXT: movl %edi, %ebx +; X86-NOSSE-NEXT: shrl $4, %ebx +; X86-NOSSE-NEXT: addl %edi, %ebx +; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edi +; X86-NOSSE-NEXT: movl %esi, %ebx +; X86-NOSSE-NEXT: shrl %ebx +; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %ebx, %esi +; X86-NOSSE-NEXT: movl %esi, %ebx +; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %ebx, %esi +; X86-NOSSE-NEXT: movl %esi, %ebx +; X86-NOSSE-NEXT: shrl $4, %ebx +; X86-NOSSE-NEXT: addl %esi, %ebx +; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %esi +; X86-NOSSE-NEXT: addl %edi, %esi +; X86-NOSSE-NEXT: movl %edx, %edi +; X86-NOSSE-NEXT: shrl %edi +; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edi, %edx +; X86-NOSSE-NEXT: movl %edx, %edi +; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edi, %edx +; X86-NOSSE-NEXT: movl %edx, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %edx, %edi +; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: shrl %edi +; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %ecx +; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %ecx, %edi +; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %ecx +; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: addl %esi, %ecx +; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: movl $0, 12(%eax) +; X86-NOSSE-NEXT: movl $0, 8(%eax) +; X86-NOSSE-NEXT: movl $0, 4(%eax) +; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: popl %edi +; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: retl $4 ; ; X64-LABEL: cnt128: ; X64: # %bb.0: @@ -416,23 +416,23 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq ; -; X32-POPCNT-LABEL: cnt128: -; X32-POPCNT: # %bb.0: -; X32-POPCNT-NEXT: pushl %esi -; X32-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx -; X32-POPCNT-NEXT: addl %ecx, %edx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi -; X32-POPCNT-NEXT: addl %ecx, %esi -; X32-POPCNT-NEXT: addl %edx, %esi -; X32-POPCNT-NEXT: movl %esi, (%eax) -; X32-POPCNT-NEXT: movl $0, 12(%eax) -; X32-POPCNT-NEXT: movl $0, 8(%eax) -; X32-POPCNT-NEXT: movl $0, 4(%eax) -; X32-POPCNT-NEXT: popl %esi -; X32-POPCNT-NEXT: retl $4 +; X86-POPCNT-LABEL: cnt128: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %esi +; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X86-POPCNT-NEXT: addl %ecx, %edx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: addl %ecx, %esi +; X86-POPCNT-NEXT: addl %edx, %esi +; X86-POPCNT-NEXT: movl %esi, (%eax) +; X86-POPCNT-NEXT: movl $0, 12(%eax) +; X86-POPCNT-NEXT: movl $0, 8(%eax) +; X86-POPCNT-NEXT: movl $0, 4(%eax) +; X86-POPCNT-NEXT: popl %esi +; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128: ; X64-POPCNT: # %bb.0: @@ -442,129 +442,129 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X64-POPCNT-NEXT: xorl %edx, %edx ; X64-POPCNT-NEXT: retq ; -; X32-SSE2-LABEL: cnt128: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrlw $1, %xmm1 -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; X32-SSE2-NEXT: pand %xmm2, %xmm1 -; X32-SSE2-NEXT: psubb %xmm1, %xmm0 -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE2-NEXT: pand %xmm1, %xmm3 -; X32-SSE2-NEXT: psrlw $2, %xmm0 -; X32-SSE2-NEXT: pand %xmm1, %xmm0 -; X32-SSE2-NEXT: paddb %xmm3, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE2-NEXT: psrlw $4, %xmm3 -; X32-SSE2-NEXT: paddb %xmm0, %xmm3 -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE2-NEXT: pand %xmm0, %xmm3 -; X32-SSE2-NEXT: pxor %xmm4, %xmm4 -; X32-SSE2-NEXT: psadbw %xmm4, %xmm3 -; X32-SSE2-NEXT: movd %xmm3, %ecx -; X32-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; X32-SSE2-NEXT: movdqa %xmm3, %xmm5 -; X32-SSE2-NEXT: psrlw $1, %xmm5 -; X32-SSE2-NEXT: pand %xmm2, %xmm5 -; X32-SSE2-NEXT: psubb %xmm5, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X32-SSE2-NEXT: pand %xmm1, %xmm2 -; X32-SSE2-NEXT: psrlw $2, %xmm3 -; X32-SSE2-NEXT: pand %xmm1, %xmm3 -; X32-SSE2-NEXT: paddb %xmm2, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm1 -; X32-SSE2-NEXT: psrlw $4, %xmm1 -; X32-SSE2-NEXT: paddb %xmm3, %xmm1 -; X32-SSE2-NEXT: pand %xmm0, %xmm1 -; X32-SSE2-NEXT: psadbw %xmm4, %xmm1 -; X32-SSE2-NEXT: movd %xmm1, %edx -; X32-SSE2-NEXT: addl %ecx, %edx -; X32-SSE2-NEXT: movl %edx, (%eax) -; X32-SSE2-NEXT: movl $0, 12(%eax) -; X32-SSE2-NEXT: movl $0, 8(%eax) -; X32-SSE2-NEXT: movl $0, 4(%eax) -; X32-SSE2-NEXT: retl $4 +; X86-SSE2-LABEL: cnt128: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: psadbw %xmm4, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %ecx +; X86-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: psrlw $1, %xmm5 +; X86-SSE2-NEXT: pand %xmm2, %xmm5 +; X86-SSE2-NEXT: psubb %xmm5, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: paddb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: psadbw %xmm4, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %edx +; X86-SSE2-NEXT: addl %ecx, %edx +; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl $0, 12(%eax) +; X86-SSE2-NEXT: movl $0, 8(%eax) +; X86-SSE2-NEXT: movl $0, 4(%eax) +; X86-SSE2-NEXT: retl $4 ; -; X32-SSSE3-LABEL: cnt128: -; X32-SSSE3: # %bb.0: -; X32-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2 -; X32-SSSE3-NEXT: pand %xmm0, %xmm2 -; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4 -; X32-SSSE3-NEXT: psrlw $4, %xmm1 -; X32-SSSE3-NEXT: pand %xmm0, %xmm1 -; X32-SSSE3-NEXT: movdqa %xmm3, %xmm2 -; X32-SSSE3-NEXT: pshufb %xmm1, %xmm2 -; X32-SSSE3-NEXT: paddb %xmm4, %xmm2 -; X32-SSSE3-NEXT: pxor %xmm1, %xmm1 -; X32-SSSE3-NEXT: psadbw %xmm1, %xmm2 -; X32-SSSE3-NEXT: movd %xmm2, %ecx -; X32-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; X32-SSSE3-NEXT: movdqa %xmm2, %xmm4 -; X32-SSSE3-NEXT: pand %xmm0, %xmm4 -; X32-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; X32-SSSE3-NEXT: pshufb %xmm4, %xmm5 -; X32-SSSE3-NEXT: psrlw $4, %xmm2 -; X32-SSSE3-NEXT: pand %xmm0, %xmm2 -; X32-SSSE3-NEXT: pshufb %xmm2, %xmm3 -; X32-SSSE3-NEXT: paddb %xmm5, %xmm3 -; X32-SSSE3-NEXT: psadbw %xmm1, %xmm3 -; X32-SSSE3-NEXT: movd %xmm3, %edx -; X32-SSSE3-NEXT: addl %ecx, %edx -; X32-SSSE3-NEXT: movl %edx, (%eax) -; X32-SSSE3-NEXT: movl $0, 12(%eax) -; X32-SSSE3-NEXT: movl $0, 8(%eax) -; X32-SSSE3-NEXT: movl $0, 4(%eax) -; X32-SSSE3-NEXT: retl $4 +; X86-SSSE3-LABEL: cnt128: +; X86-SSSE3: # %bb.0: +; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X86-SSSE3-NEXT: pshufb %xmm2, %xmm4 +; X86-SSSE3-NEXT: psrlw $4, %xmm1 +; X86-SSSE3-NEXT: pand %xmm0, %xmm1 +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm2 +; X86-SSSE3-NEXT: pshufb %xmm1, %xmm2 +; X86-SSSE3-NEXT: paddb %xmm4, %xmm2 +; X86-SSSE3-NEXT: pxor %xmm1, %xmm1 +; X86-SSSE3-NEXT: psadbw %xmm1, %xmm2 +; X86-SSSE3-NEXT: movd %xmm2, %ecx +; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X86-SSSE3-NEXT: movdqa %xmm2, %xmm4 +; X86-SSSE3-NEXT: pand %xmm0, %xmm4 +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm5 +; X86-SSSE3-NEXT: pshufb %xmm4, %xmm5 +; X86-SSSE3-NEXT: psrlw $4, %xmm2 +; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: pshufb %xmm2, %xmm3 +; X86-SSSE3-NEXT: paddb %xmm5, %xmm3 +; X86-SSSE3-NEXT: psadbw %xmm1, %xmm3 +; X86-SSSE3-NEXT: movd %xmm3, %edx +; X86-SSSE3-NEXT: addl %ecx, %edx +; X86-SSSE3-NEXT: movl %edx, (%eax) +; X86-SSSE3-NEXT: movl $0, 12(%eax) +; X86-SSSE3-NEXT: movl $0, 8(%eax) +; X86-SSSE3-NEXT: movl $0, 4(%eax) +; X86-SSSE3-NEXT: retl $4 %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) ret i128 %cnt } define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat { -; X32-LABEL: cnt64_noimplicitfloat: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl %ecx, %edx -; X32-NEXT: shrl %edx -; X32-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; X32-NEXT: subl %edx, %ecx -; X32-NEXT: movl %ecx, %edx -; X32-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X32-NEXT: shrl $2, %ecx -; X32-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl %ecx, %edx -; X32-NEXT: shrl $4, %edx -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; X32-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 -; X32-NEXT: shrl $24, %ecx -; X32-NEXT: movl %eax, %edx -; X32-NEXT: shrl %edx -; X32-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; X32-NEXT: subl %edx, %eax -; X32-NEXT: movl %eax, %edx -; X32-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X32-NEXT: shrl $2, %eax -; X32-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X32-NEXT: addl %edx, %eax -; X32-NEXT: movl %eax, %edx -; X32-NEXT: shrl $4, %edx -; X32-NEXT: addl %eax, %edx -; X32-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; X32-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 -; X32-NEXT: shrl $24, %eax -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: retl +; X86-LABEL: cnt64_noimplicitfloat: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl %edx +; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NEXT: subl %edx, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NEXT: shrl $2, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $4, %edx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shrl %edx +; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NEXT: subl %edx, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shrl $4, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X86-NEXT: shrl $24, %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl ; ; X64-LABEL: cnt64_noimplicitfloat: ; X64: # %bb.0: @@ -589,13 +589,13 @@ define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat { ; X64-NEXT: shrq $56, %rax ; X64-NEXT: retq ; -; X32-POPCNT-LABEL: cnt64_noimplicitfloat: -; X32-POPCNT: # %bb.0: -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax -; X32-POPCNT-NEXT: addl %ecx, %eax -; X32-POPCNT-NEXT: xorl %edx, %edx -; X32-POPCNT-NEXT: retl +; X86-POPCNT-LABEL: cnt64_noimplicitfloat: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: addl %ecx, %eax +; X86-POPCNT-NEXT: xorl %edx, %edx +; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: cnt64_noimplicitfloat: ; X64-POPCNT: # %bb.0: @@ -606,26 +606,26 @@ define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat { } define i32 @cnt32_optsize(i32 %x) nounwind readnone optsize { -; X32-LABEL: cnt32_optsize: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl %ecx -; X32-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X32-NEXT: subl %ecx, %eax -; X32-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X32-NEXT: movl %eax, %edx -; X32-NEXT: andl %ecx, %edx -; X32-NEXT: shrl $2, %eax -; X32-NEXT: andl %ecx, %eax -; X32-NEXT: addl %edx, %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $4, %ecx -; X32-NEXT: addl %eax, %ecx -; X32-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; X32-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; X32-NEXT: shrl $24, %eax -; X32-NEXT: retl +; X86-LABEL: cnt32_optsize: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl %ecx, %edx +; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 +; X86-NEXT: shrl $24, %eax +; X86-NEXT: retl ; ; X64-LABEL: cnt32_optsize: ; X64: # %bb.0: @@ -647,10 +647,10 @@ define i32 @cnt32_optsize(i32 %x) nounwind readnone optsize { ; X64-NEXT: shrl $24, %eax ; X64-NEXT: retq ; -; X32-POPCNT-LABEL: cnt32_optsize: -; X32-POPCNT: # %bb.0: -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax -; X32-POPCNT-NEXT: retl +; X86-POPCNT-LABEL: cnt32_optsize: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: cnt32_optsize: ; X64-POPCNT: # %bb.0: @@ -661,52 +661,52 @@ define i32 @cnt32_optsize(i32 %x) nounwind readnone optsize { } define i64 @cnt64_optsize(i64 %x) nounwind readnone optsize { -; X32-NOSSE-LABEL: cnt64_optsize: -; X32-NOSSE: # %bb.0: -; X32-NOSSE-NEXT: pushl %ebx -; X32-NOSSE-NEXT: pushl %edi -; X32-NOSSE-NEXT: pushl %esi -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOSSE-NEXT: movl %ecx, %edx -; X32-NOSSE-NEXT: shrl %edx -; X32-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 -; X32-NOSSE-NEXT: andl %esi, %edx -; X32-NOSSE-NEXT: subl %edx, %ecx -; X32-NOSSE-NEXT: movl $858993459, %edx # imm = 0x33333333 -; X32-NOSSE-NEXT: movl %ecx, %edi -; X32-NOSSE-NEXT: andl %edx, %edi -; X32-NOSSE-NEXT: shrl $2, %ecx -; X32-NOSSE-NEXT: andl %edx, %ecx -; X32-NOSSE-NEXT: addl %edi, %ecx -; X32-NOSSE-NEXT: movl %ecx, %edi -; X32-NOSSE-NEXT: shrl $4, %edi -; X32-NOSSE-NEXT: addl %ecx, %edi -; X32-NOSSE-NEXT: movl $252645135, %ecx # imm = 0xF0F0F0F -; X32-NOSSE-NEXT: andl %ecx, %edi -; X32-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %edi -; X32-NOSSE-NEXT: movl %eax, %ebx -; X32-NOSSE-NEXT: shrl %ebx -; X32-NOSSE-NEXT: andl %esi, %ebx -; X32-NOSSE-NEXT: subl %ebx, %eax -; X32-NOSSE-NEXT: movl %eax, %esi -; X32-NOSSE-NEXT: andl %edx, %esi -; X32-NOSSE-NEXT: shrl $2, %eax -; X32-NOSSE-NEXT: andl %edx, %eax -; X32-NOSSE-NEXT: addl %esi, %eax -; X32-NOSSE-NEXT: movl %eax, %edx -; X32-NOSSE-NEXT: shrl $4, %edx -; X32-NOSSE-NEXT: addl %eax, %edx -; X32-NOSSE-NEXT: andl %ecx, %edx -; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %eax -; X32-NOSSE-NEXT: addl %edi, %eax -; X32-NOSSE-NEXT: xorl %edx, %edx -; X32-NOSSE-NEXT: popl %esi -; X32-NOSSE-NEXT: popl %edi -; X32-NOSSE-NEXT: popl %ebx -; X32-NOSSE-NEXT: retl +; X86-NOSSE-LABEL: cnt64_optsize: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: pushl %edi +; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl %ecx, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %esi, %edx +; X86-NOSSE-NEXT: subl %edx, %ecx +; X86-NOSSE-NEXT: movl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: andl %edx, %edi +; X86-NOSSE-NEXT: shrl $2, %ecx +; X86-NOSSE-NEXT: andl %edx, %ecx +; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %ecx, %edi +; X86-NOSSE-NEXT: movl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %ecx, %edi +; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edi +; X86-NOSSE-NEXT: movl %eax, %ebx +; X86-NOSSE-NEXT: shrl %ebx +; X86-NOSSE-NEXT: andl %esi, %ebx +; X86-NOSSE-NEXT: subl %ebx, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: andl %edx, %esi +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl %edx, %eax +; X86-NOSSE-NEXT: addl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: andl %ecx, %edx +; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: addl %edi, %eax +; X86-NOSSE-NEXT: xorl %edx, %edx +; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: popl %edi +; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: retl ; ; X64-LABEL: cnt64_optsize: ; X64: # %bb.0: @@ -731,154 +731,154 @@ define i64 @cnt64_optsize(i64 %x) nounwind readnone optsize { ; X64-NEXT: shrq $56, %rax ; X64-NEXT: retq ; -; X32-POPCNT-LABEL: cnt64_optsize: -; X32-POPCNT: # %bb.0: -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax -; X32-POPCNT-NEXT: addl %ecx, %eax -; X32-POPCNT-NEXT: xorl %edx, %edx -; X32-POPCNT-NEXT: retl +; X86-POPCNT-LABEL: cnt64_optsize: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: addl %ecx, %eax +; X86-POPCNT-NEXT: xorl %edx, %edx +; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: cnt64_optsize: ; X64-POPCNT: # %bb.0: ; X64-POPCNT-NEXT: popcntq %rdi, %rax ; X64-POPCNT-NEXT: retq ; -; X32-SSE2-LABEL: cnt64_optsize: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrlw $1, %xmm1 -; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE2-NEXT: psubb %xmm1, %xmm0 -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE2-NEXT: pand %xmm1, %xmm2 -; X32-SSE2-NEXT: psrlw $2, %xmm0 -; X32-SSE2-NEXT: pand %xmm1, %xmm0 -; X32-SSE2-NEXT: paddb %xmm2, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrlw $4, %xmm1 -; X32-SSE2-NEXT: paddb %xmm0, %xmm1 -; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE2-NEXT: pxor %xmm0, %xmm0 -; X32-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X32-SSE2-NEXT: movd %xmm0, %eax -; X32-SSE2-NEXT: xorl %edx, %edx -; X32-SSE2-NEXT: retl +; X86-SSE2-LABEL: cnt64_optsize: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; -; X32-SSSE3-LABEL: cnt64_optsize: -; X32-SSSE3: # %bb.0: -; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2 -; X32-SSSE3-NEXT: pand %xmm0, %xmm2 -; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4 -; X32-SSSE3-NEXT: psrlw $4, %xmm1 -; X32-SSSE3-NEXT: pand %xmm0, %xmm1 -; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3 -; X32-SSSE3-NEXT: paddb %xmm4, %xmm3 -; X32-SSSE3-NEXT: pxor %xmm0, %xmm0 -; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0 -; X32-SSSE3-NEXT: movd %xmm0, %eax -; X32-SSSE3-NEXT: xorl %edx, %edx -; X32-SSSE3-NEXT: retl +; X86-SSSE3-LABEL: cnt64_optsize: +; X86-SSSE3: # %bb.0: +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X86-SSSE3-NEXT: pshufb %xmm2, %xmm4 +; X86-SSSE3-NEXT: psrlw $4, %xmm1 +; X86-SSSE3-NEXT: pand %xmm0, %xmm1 +; X86-SSSE3-NEXT: pshufb %xmm1, %xmm3 +; X86-SSSE3-NEXT: paddb %xmm4, %xmm3 +; X86-SSSE3-NEXT: pxor %xmm0, %xmm0 +; X86-SSSE3-NEXT: psadbw %xmm3, %xmm0 +; X86-SSSE3-NEXT: movd %xmm0, %eax +; X86-SSSE3-NEXT: xorl %edx, %edx +; X86-SSSE3-NEXT: retl %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) ret i64 %cnt } define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { -; X32-NOSSE-LABEL: cnt128_optsize: -; X32-NOSSE: # %bb.0: -; X32-NOSSE-NEXT: pushl %ebp -; X32-NOSSE-NEXT: pushl %ebx -; X32-NOSSE-NEXT: pushl %edi -; X32-NOSSE-NEXT: pushl %esi -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NOSSE-NEXT: movl %ebx, %ecx -; X32-NOSSE-NEXT: shrl %ecx -; X32-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X32-NOSSE-NEXT: andl %edi, %ecx -; X32-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X32-NOSSE-NEXT: subl %ecx, %ebx -; X32-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X32-NOSSE-NEXT: movl %ebx, %ebp -; X32-NOSSE-NEXT: andl %ecx, %ebp -; X32-NOSSE-NEXT: shrl $2, %ebx -; X32-NOSSE-NEXT: andl %ecx, %ebx -; X32-NOSSE-NEXT: addl %ebp, %ebx -; X32-NOSSE-NEXT: movl %ebx, %ebp -; X32-NOSSE-NEXT: shrl $4, %ebp -; X32-NOSSE-NEXT: addl %ebx, %ebp -; X32-NOSSE-NEXT: movl %eax, %ebx -; X32-NOSSE-NEXT: shrl %ebx -; X32-NOSSE-NEXT: andl %edi, %ebx -; X32-NOSSE-NEXT: subl %ebx, %eax -; X32-NOSSE-NEXT: movl %eax, %ebx -; X32-NOSSE-NEXT: andl %ecx, %ebx -; X32-NOSSE-NEXT: shrl $2, %eax -; X32-NOSSE-NEXT: andl %ecx, %eax -; X32-NOSSE-NEXT: addl %ebx, %eax -; X32-NOSSE-NEXT: movl %eax, %edi -; X32-NOSSE-NEXT: shrl $4, %edi -; X32-NOSSE-NEXT: addl %eax, %edi -; X32-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F -; X32-NOSSE-NEXT: andl %ebx, %ebp -; X32-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %eax -; X32-NOSSE-NEXT: andl %ebx, %edi -; X32-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %edi -; X32-NOSSE-NEXT: addl %eax, %edi -; X32-NOSSE-NEXT: movl %esi, %eax -; X32-NOSSE-NEXT: shrl %eax -; X32-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 -; X32-NOSSE-NEXT: andl %ebp, %eax -; X32-NOSSE-NEXT: subl %eax, %esi -; X32-NOSSE-NEXT: movl %esi, %eax -; X32-NOSSE-NEXT: andl %ecx, %eax -; X32-NOSSE-NEXT: shrl $2, %esi -; X32-NOSSE-NEXT: andl %ecx, %esi -; X32-NOSSE-NEXT: addl %eax, %esi -; X32-NOSSE-NEXT: movl %esi, %eax -; X32-NOSSE-NEXT: shrl $4, %eax -; X32-NOSSE-NEXT: addl %esi, %eax -; X32-NOSSE-NEXT: movl %edx, %esi -; X32-NOSSE-NEXT: shrl %esi -; X32-NOSSE-NEXT: andl %ebp, %esi -; X32-NOSSE-NEXT: subl %esi, %edx -; X32-NOSSE-NEXT: movl %edx, %esi -; X32-NOSSE-NEXT: andl %ecx, %esi -; X32-NOSSE-NEXT: shrl $2, %edx -; X32-NOSSE-NEXT: andl %ecx, %edx -; X32-NOSSE-NEXT: addl %esi, %edx -; X32-NOSSE-NEXT: movl %edx, %ecx -; X32-NOSSE-NEXT: shrl $4, %ecx -; X32-NOSSE-NEXT: addl %edx, %ecx -; X32-NOSSE-NEXT: andl %ebx, %eax -; X32-NOSSE-NEXT: andl %ebx, %ecx -; X32-NOSSE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %eax -; X32-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %ecx -; X32-NOSSE-NEXT: addl %eax, %ecx -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NOSSE-NEXT: addl %edi, %ecx -; X32-NOSSE-NEXT: xorl %edx, %edx -; X32-NOSSE-NEXT: movl %edx, 12(%eax) -; X32-NOSSE-NEXT: movl %edx, 8(%eax) -; X32-NOSSE-NEXT: movl %edx, 4(%eax) -; X32-NOSSE-NEXT: movl %ecx, (%eax) -; X32-NOSSE-NEXT: popl %esi -; X32-NOSSE-NEXT: popl %edi -; X32-NOSSE-NEXT: popl %ebx -; X32-NOSSE-NEXT: popl %ebp -; X32-NOSSE-NEXT: retl $4 +; X86-NOSSE-LABEL: cnt128_optsize: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %ebp +; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: pushl %edi +; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOSSE-NEXT: movl %ebx, %ecx +; X86-NOSSE-NEXT: shrl %ecx +; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %edi, %ecx +; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %ecx, %ebx +; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 +; X86-NOSSE-NEXT: movl %ebx, %ebp +; X86-NOSSE-NEXT: andl %ecx, %ebp +; X86-NOSSE-NEXT: shrl $2, %ebx +; X86-NOSSE-NEXT: andl %ecx, %ebx +; X86-NOSSE-NEXT: addl %ebp, %ebx +; X86-NOSSE-NEXT: movl %ebx, %ebp +; X86-NOSSE-NEXT: shrl $4, %ebp +; X86-NOSSE-NEXT: addl %ebx, %ebp +; X86-NOSSE-NEXT: movl %eax, %ebx +; X86-NOSSE-NEXT: shrl %ebx +; X86-NOSSE-NEXT: andl %edi, %ebx +; X86-NOSSE-NEXT: subl %ebx, %eax +; X86-NOSSE-NEXT: movl %eax, %ebx +; X86-NOSSE-NEXT: andl %ecx, %ebx +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: addl %ebx, %eax +; X86-NOSSE-NEXT: movl %eax, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %eax, %edi +; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %ebx, %ebp +; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: andl %ebx, %edi +; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edi +; X86-NOSSE-NEXT: addl %eax, %edi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %ebp, %eax +; X86-NOSSE-NEXT: subl %eax, %esi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: addl %eax, %esi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: shrl $4, %eax +; X86-NOSSE-NEXT: addl %esi, %eax +; X86-NOSSE-NEXT: movl %edx, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl %ebp, %esi +; X86-NOSSE-NEXT: subl %esi, %edx +; X86-NOSSE-NEXT: movl %edx, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: shrl $2, %edx +; X86-NOSSE-NEXT: andl %ecx, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %edx, %ecx +; X86-NOSSE-NEXT: shrl $4, %ecx +; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: andl %ebx, %eax +; X86-NOSSE-NEXT: andl %ebx, %ecx +; X86-NOSSE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %ecx +; X86-NOSSE-NEXT: addl %eax, %ecx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: xorl %edx, %edx +; X86-NOSSE-NEXT: movl %edx, 12(%eax) +; X86-NOSSE-NEXT: movl %edx, 8(%eax) +; X86-NOSSE-NEXT: movl %edx, 4(%eax) +; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: popl %edi +; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: popl %ebp +; X86-NOSSE-NEXT: retl $4 ; ; X64-LABEL: cnt128_optsize: ; X64: # %bb.0: @@ -920,24 +920,24 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq ; -; X32-POPCNT-LABEL: cnt128_optsize: -; X32-POPCNT: # %bb.0: -; X32-POPCNT-NEXT: pushl %esi -; X32-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx -; X32-POPCNT-NEXT: addl %ecx, %edx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi -; X32-POPCNT-NEXT: addl %ecx, %esi -; X32-POPCNT-NEXT: addl %edx, %esi -; X32-POPCNT-NEXT: xorl %ecx, %ecx -; X32-POPCNT-NEXT: movl %ecx, 12(%eax) -; X32-POPCNT-NEXT: movl %ecx, 8(%eax) -; X32-POPCNT-NEXT: movl %ecx, 4(%eax) -; X32-POPCNT-NEXT: movl %esi, (%eax) -; X32-POPCNT-NEXT: popl %esi -; X32-POPCNT-NEXT: retl $4 +; X86-POPCNT-LABEL: cnt128_optsize: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %esi +; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X86-POPCNT-NEXT: addl %ecx, %edx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: addl %ecx, %esi +; X86-POPCNT-NEXT: addl %edx, %esi +; X86-POPCNT-NEXT: xorl %ecx, %ecx +; X86-POPCNT-NEXT: movl %ecx, 12(%eax) +; X86-POPCNT-NEXT: movl %ecx, 8(%eax) +; X86-POPCNT-NEXT: movl %ecx, 4(%eax) +; X86-POPCNT-NEXT: movl %esi, (%eax) +; X86-POPCNT-NEXT: popl %esi +; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128_optsize: ; X64-POPCNT: # %bb.0: @@ -947,114 +947,114 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X64-POPCNT-NEXT: xorl %edx, %edx ; X64-POPCNT-NEXT: retq ; -; X32-SSE2-LABEL: cnt128_optsize: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrlw $1, %xmm1 -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; X32-SSE2-NEXT: pand %xmm2, %xmm1 -; X32-SSE2-NEXT: psubb %xmm1, %xmm0 -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE2-NEXT: pand %xmm1, %xmm3 -; X32-SSE2-NEXT: psrlw $2, %xmm0 -; X32-SSE2-NEXT: pand %xmm1, %xmm0 -; X32-SSE2-NEXT: paddb %xmm3, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE2-NEXT: psrlw $4, %xmm3 -; X32-SSE2-NEXT: paddb %xmm0, %xmm3 -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE2-NEXT: pand %xmm0, %xmm3 -; X32-SSE2-NEXT: pxor %xmm4, %xmm4 -; X32-SSE2-NEXT: psadbw %xmm4, %xmm3 -; X32-SSE2-NEXT: movd %xmm3, %ecx -; X32-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; X32-SSE2-NEXT: movdqa %xmm3, %xmm5 -; X32-SSE2-NEXT: psrlw $1, %xmm5 -; X32-SSE2-NEXT: pand %xmm2, %xmm5 -; X32-SSE2-NEXT: psubb %xmm5, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X32-SSE2-NEXT: pand %xmm1, %xmm2 -; X32-SSE2-NEXT: psrlw $2, %xmm3 -; X32-SSE2-NEXT: pand %xmm1, %xmm3 -; X32-SSE2-NEXT: paddb %xmm2, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm1 -; X32-SSE2-NEXT: psrlw $4, %xmm1 -; X32-SSE2-NEXT: paddb %xmm3, %xmm1 -; X32-SSE2-NEXT: pand %xmm0, %xmm1 -; X32-SSE2-NEXT: psadbw %xmm4, %xmm1 -; X32-SSE2-NEXT: movd %xmm1, %edx -; X32-SSE2-NEXT: addl %ecx, %edx -; X32-SSE2-NEXT: xorl %ecx, %ecx -; X32-SSE2-NEXT: movl %ecx, 12(%eax) -; X32-SSE2-NEXT: movl %ecx, 8(%eax) -; X32-SSE2-NEXT: movl %ecx, 4(%eax) -; X32-SSE2-NEXT: movl %edx, (%eax) -; X32-SSE2-NEXT: retl $4 +; X86-SSE2-LABEL: cnt128_optsize: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: psadbw %xmm4, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %ecx +; X86-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: psrlw $1, %xmm5 +; X86-SSE2-NEXT: pand %xmm2, %xmm5 +; X86-SSE2-NEXT: psubb %xmm5, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: paddb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: psadbw %xmm4, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %edx +; X86-SSE2-NEXT: addl %ecx, %edx +; X86-SSE2-NEXT: xorl %ecx, %ecx +; X86-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: retl $4 ; -; X32-SSSE3-LABEL: cnt128_optsize: -; X32-SSSE3: # %bb.0: -; X32-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2 -; X32-SSSE3-NEXT: pand %xmm0, %xmm2 -; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4 -; X32-SSSE3-NEXT: psrlw $4, %xmm1 -; X32-SSSE3-NEXT: pand %xmm0, %xmm1 -; X32-SSSE3-NEXT: movdqa %xmm3, %xmm2 -; X32-SSSE3-NEXT: pshufb %xmm1, %xmm2 -; X32-SSSE3-NEXT: paddb %xmm4, %xmm2 -; X32-SSSE3-NEXT: pxor %xmm1, %xmm1 -; X32-SSSE3-NEXT: psadbw %xmm1, %xmm2 -; X32-SSSE3-NEXT: movd %xmm2, %ecx -; X32-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; X32-SSSE3-NEXT: movdqa %xmm2, %xmm4 -; X32-SSSE3-NEXT: pand %xmm0, %xmm4 -; X32-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; X32-SSSE3-NEXT: pshufb %xmm4, %xmm5 -; X32-SSSE3-NEXT: psrlw $4, %xmm2 -; X32-SSSE3-NEXT: pand %xmm0, %xmm2 -; X32-SSSE3-NEXT: pshufb %xmm2, %xmm3 -; X32-SSSE3-NEXT: paddb %xmm5, %xmm3 -; X32-SSSE3-NEXT: psadbw %xmm1, %xmm3 -; X32-SSSE3-NEXT: movd %xmm3, %edx -; X32-SSSE3-NEXT: addl %ecx, %edx -; X32-SSSE3-NEXT: xorl %ecx, %ecx -; X32-SSSE3-NEXT: movl %ecx, 12(%eax) -; X32-SSSE3-NEXT: movl %ecx, 8(%eax) -; X32-SSSE3-NEXT: movl %ecx, 4(%eax) -; X32-SSSE3-NEXT: movl %edx, (%eax) -; X32-SSSE3-NEXT: retl $4 +; X86-SSSE3-LABEL: cnt128_optsize: +; X86-SSSE3: # %bb.0: +; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X86-SSSE3-NEXT: pshufb %xmm2, %xmm4 +; X86-SSSE3-NEXT: psrlw $4, %xmm1 +; X86-SSSE3-NEXT: pand %xmm0, %xmm1 +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm2 +; X86-SSSE3-NEXT: pshufb %xmm1, %xmm2 +; X86-SSSE3-NEXT: paddb %xmm4, %xmm2 +; X86-SSSE3-NEXT: pxor %xmm1, %xmm1 +; X86-SSSE3-NEXT: psadbw %xmm1, %xmm2 +; X86-SSSE3-NEXT: movd %xmm2, %ecx +; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X86-SSSE3-NEXT: movdqa %xmm2, %xmm4 +; X86-SSSE3-NEXT: pand %xmm0, %xmm4 +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm5 +; X86-SSSE3-NEXT: pshufb %xmm4, %xmm5 +; X86-SSSE3-NEXT: psrlw $4, %xmm2 +; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: pshufb %xmm2, %xmm3 +; X86-SSSE3-NEXT: paddb %xmm5, %xmm3 +; X86-SSSE3-NEXT: psadbw %xmm1, %xmm3 +; X86-SSSE3-NEXT: movd %xmm3, %edx +; X86-SSSE3-NEXT: addl %ecx, %edx +; X86-SSSE3-NEXT: xorl %ecx, %ecx +; X86-SSSE3-NEXT: movl %ecx, 12(%eax) +; X86-SSSE3-NEXT: movl %ecx, 8(%eax) +; X86-SSSE3-NEXT: movl %ecx, 4(%eax) +; X86-SSSE3-NEXT: movl %edx, (%eax) +; X86-SSSE3-NEXT: retl $4 %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) ret i128 %cnt } define i32 @cnt32_pgso(i32 %x) nounwind readnone !prof !14 { -; X32-LABEL: cnt32_pgso: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl %ecx -; X32-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X32-NEXT: subl %ecx, %eax -; X32-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X32-NEXT: movl %eax, %edx -; X32-NEXT: andl %ecx, %edx -; X32-NEXT: shrl $2, %eax -; X32-NEXT: andl %ecx, %eax -; X32-NEXT: addl %edx, %eax -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $4, %ecx -; X32-NEXT: addl %eax, %ecx -; X32-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; X32-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; X32-NEXT: shrl $24, %eax -; X32-NEXT: retl +; X86-LABEL: cnt32_pgso: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl %ecx, %edx +; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 +; X86-NEXT: shrl $24, %eax +; X86-NEXT: retl ; ; X64-LABEL: cnt32_pgso: ; X64: # %bb.0: @@ -1076,10 +1076,10 @@ define i32 @cnt32_pgso(i32 %x) nounwind readnone !prof !14 { ; X64-NEXT: shrl $24, %eax ; X64-NEXT: retq ; -; X32-POPCNT-LABEL: cnt32_pgso: -; X32-POPCNT: # %bb.0: -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax -; X32-POPCNT-NEXT: retl +; X86-POPCNT-LABEL: cnt32_pgso: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: cnt32_pgso: ; X64-POPCNT: # %bb.0: @@ -1090,52 +1090,52 @@ define i32 @cnt32_pgso(i32 %x) nounwind readnone !prof !14 { } define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 { -; X32-NOSSE-LABEL: cnt64_pgso: -; X32-NOSSE: # %bb.0: -; X32-NOSSE-NEXT: pushl %ebx -; X32-NOSSE-NEXT: pushl %edi -; X32-NOSSE-NEXT: pushl %esi -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NOSSE-NEXT: movl %ecx, %edx -; X32-NOSSE-NEXT: shrl %edx -; X32-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 -; X32-NOSSE-NEXT: andl %esi, %edx -; X32-NOSSE-NEXT: subl %edx, %ecx -; X32-NOSSE-NEXT: movl $858993459, %edx # imm = 0x33333333 -; X32-NOSSE-NEXT: movl %ecx, %edi -; X32-NOSSE-NEXT: andl %edx, %edi -; X32-NOSSE-NEXT: shrl $2, %ecx -; X32-NOSSE-NEXT: andl %edx, %ecx -; X32-NOSSE-NEXT: addl %edi, %ecx -; X32-NOSSE-NEXT: movl %ecx, %edi -; X32-NOSSE-NEXT: shrl $4, %edi -; X32-NOSSE-NEXT: addl %ecx, %edi -; X32-NOSSE-NEXT: movl $252645135, %ecx # imm = 0xF0F0F0F -; X32-NOSSE-NEXT: andl %ecx, %edi -; X32-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %edi -; X32-NOSSE-NEXT: movl %eax, %ebx -; X32-NOSSE-NEXT: shrl %ebx -; X32-NOSSE-NEXT: andl %esi, %ebx -; X32-NOSSE-NEXT: subl %ebx, %eax -; X32-NOSSE-NEXT: movl %eax, %esi -; X32-NOSSE-NEXT: andl %edx, %esi -; X32-NOSSE-NEXT: shrl $2, %eax -; X32-NOSSE-NEXT: andl %edx, %eax -; X32-NOSSE-NEXT: addl %esi, %eax -; X32-NOSSE-NEXT: movl %eax, %edx -; X32-NOSSE-NEXT: shrl $4, %edx -; X32-NOSSE-NEXT: addl %eax, %edx -; X32-NOSSE-NEXT: andl %ecx, %edx -; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %eax -; X32-NOSSE-NEXT: addl %edi, %eax -; X32-NOSSE-NEXT: xorl %edx, %edx -; X32-NOSSE-NEXT: popl %esi -; X32-NOSSE-NEXT: popl %edi -; X32-NOSSE-NEXT: popl %ebx -; X32-NOSSE-NEXT: retl +; X86-NOSSE-LABEL: cnt64_pgso: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: pushl %edi +; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl %ecx, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %esi, %edx +; X86-NOSSE-NEXT: subl %edx, %ecx +; X86-NOSSE-NEXT: movl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: andl %edx, %edi +; X86-NOSSE-NEXT: shrl $2, %ecx +; X86-NOSSE-NEXT: andl %edx, %ecx +; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %ecx, %edi +; X86-NOSSE-NEXT: movl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %ecx, %edi +; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edi +; X86-NOSSE-NEXT: movl %eax, %ebx +; X86-NOSSE-NEXT: shrl %ebx +; X86-NOSSE-NEXT: andl %esi, %ebx +; X86-NOSSE-NEXT: subl %ebx, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: andl %edx, %esi +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl %edx, %eax +; X86-NOSSE-NEXT: addl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: andl %ecx, %edx +; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: addl %edi, %eax +; X86-NOSSE-NEXT: xorl %edx, %edx +; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: popl %edi +; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: retl ; ; X64-LABEL: cnt64_pgso: ; X64: # %bb.0: @@ -1160,154 +1160,154 @@ define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 { ; X64-NEXT: shrq $56, %rax ; X64-NEXT: retq ; -; X32-POPCNT-LABEL: cnt64_pgso: -; X32-POPCNT: # %bb.0: -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax -; X32-POPCNT-NEXT: addl %ecx, %eax -; X32-POPCNT-NEXT: xorl %edx, %edx -; X32-POPCNT-NEXT: retl +; X86-POPCNT-LABEL: cnt64_pgso: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: addl %ecx, %eax +; X86-POPCNT-NEXT: xorl %edx, %edx +; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: cnt64_pgso: ; X64-POPCNT: # %bb.0: ; X64-POPCNT-NEXT: popcntq %rdi, %rax ; X64-POPCNT-NEXT: retq ; -; X32-SSE2-LABEL: cnt64_pgso: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrlw $1, %xmm1 -; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE2-NEXT: psubb %xmm1, %xmm0 -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE2-NEXT: pand %xmm1, %xmm2 -; X32-SSE2-NEXT: psrlw $2, %xmm0 -; X32-SSE2-NEXT: pand %xmm1, %xmm0 -; X32-SSE2-NEXT: paddb %xmm2, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrlw $4, %xmm1 -; X32-SSE2-NEXT: paddb %xmm0, %xmm1 -; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE2-NEXT: pxor %xmm0, %xmm0 -; X32-SSE2-NEXT: psadbw %xmm1, %xmm0 -; X32-SSE2-NEXT: movd %xmm0, %eax -; X32-SSE2-NEXT: xorl %edx, %edx -; X32-SSE2-NEXT: retl +; X86-SSE2-LABEL: cnt64_pgso: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl %edx, %edx +; X86-SSE2-NEXT: retl ; -; X32-SSSE3-LABEL: cnt64_pgso: -; X32-SSSE3: # %bb.0: -; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2 -; X32-SSSE3-NEXT: pand %xmm0, %xmm2 -; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4 -; X32-SSSE3-NEXT: psrlw $4, %xmm1 -; X32-SSSE3-NEXT: pand %xmm0, %xmm1 -; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3 -; X32-SSSE3-NEXT: paddb %xmm4, %xmm3 -; X32-SSSE3-NEXT: pxor %xmm0, %xmm0 -; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0 -; X32-SSSE3-NEXT: movd %xmm0, %eax -; X32-SSSE3-NEXT: xorl %edx, %edx -; X32-SSSE3-NEXT: retl +; X86-SSSE3-LABEL: cnt64_pgso: +; X86-SSSE3: # %bb.0: +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X86-SSSE3-NEXT: pshufb %xmm2, %xmm4 +; X86-SSSE3-NEXT: psrlw $4, %xmm1 +; X86-SSSE3-NEXT: pand %xmm0, %xmm1 +; X86-SSSE3-NEXT: pshufb %xmm1, %xmm3 +; X86-SSSE3-NEXT: paddb %xmm4, %xmm3 +; X86-SSSE3-NEXT: pxor %xmm0, %xmm0 +; X86-SSSE3-NEXT: psadbw %xmm3, %xmm0 +; X86-SSSE3-NEXT: movd %xmm0, %eax +; X86-SSSE3-NEXT: xorl %edx, %edx +; X86-SSSE3-NEXT: retl %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) ret i64 %cnt } define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { -; X32-NOSSE-LABEL: cnt128_pgso: -; X32-NOSSE: # %bb.0: -; X32-NOSSE-NEXT: pushl %ebp -; X32-NOSSE-NEXT: pushl %ebx -; X32-NOSSE-NEXT: pushl %edi -; X32-NOSSE-NEXT: pushl %esi -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NOSSE-NEXT: movl %ebx, %ecx -; X32-NOSSE-NEXT: shrl %ecx -; X32-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X32-NOSSE-NEXT: andl %edi, %ecx -; X32-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X32-NOSSE-NEXT: subl %ecx, %ebx -; X32-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X32-NOSSE-NEXT: movl %ebx, %ebp -; X32-NOSSE-NEXT: andl %ecx, %ebp -; X32-NOSSE-NEXT: shrl $2, %ebx -; X32-NOSSE-NEXT: andl %ecx, %ebx -; X32-NOSSE-NEXT: addl %ebp, %ebx -; X32-NOSSE-NEXT: movl %ebx, %ebp -; X32-NOSSE-NEXT: shrl $4, %ebp -; X32-NOSSE-NEXT: addl %ebx, %ebp -; X32-NOSSE-NEXT: movl %eax, %ebx -; X32-NOSSE-NEXT: shrl %ebx -; X32-NOSSE-NEXT: andl %edi, %ebx -; X32-NOSSE-NEXT: subl %ebx, %eax -; X32-NOSSE-NEXT: movl %eax, %ebx -; X32-NOSSE-NEXT: andl %ecx, %ebx -; X32-NOSSE-NEXT: shrl $2, %eax -; X32-NOSSE-NEXT: andl %ecx, %eax -; X32-NOSSE-NEXT: addl %ebx, %eax -; X32-NOSSE-NEXT: movl %eax, %edi -; X32-NOSSE-NEXT: shrl $4, %edi -; X32-NOSSE-NEXT: addl %eax, %edi -; X32-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F -; X32-NOSSE-NEXT: andl %ebx, %ebp -; X32-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %eax -; X32-NOSSE-NEXT: andl %ebx, %edi -; X32-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %edi -; X32-NOSSE-NEXT: addl %eax, %edi -; X32-NOSSE-NEXT: movl %esi, %eax -; X32-NOSSE-NEXT: shrl %eax -; X32-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 -; X32-NOSSE-NEXT: andl %ebp, %eax -; X32-NOSSE-NEXT: subl %eax, %esi -; X32-NOSSE-NEXT: movl %esi, %eax -; X32-NOSSE-NEXT: andl %ecx, %eax -; X32-NOSSE-NEXT: shrl $2, %esi -; X32-NOSSE-NEXT: andl %ecx, %esi -; X32-NOSSE-NEXT: addl %eax, %esi -; X32-NOSSE-NEXT: movl %esi, %eax -; X32-NOSSE-NEXT: shrl $4, %eax -; X32-NOSSE-NEXT: addl %esi, %eax -; X32-NOSSE-NEXT: movl %edx, %esi -; X32-NOSSE-NEXT: shrl %esi -; X32-NOSSE-NEXT: andl %ebp, %esi -; X32-NOSSE-NEXT: subl %esi, %edx -; X32-NOSSE-NEXT: movl %edx, %esi -; X32-NOSSE-NEXT: andl %ecx, %esi -; X32-NOSSE-NEXT: shrl $2, %edx -; X32-NOSSE-NEXT: andl %ecx, %edx -; X32-NOSSE-NEXT: addl %esi, %edx -; X32-NOSSE-NEXT: movl %edx, %ecx -; X32-NOSSE-NEXT: shrl $4, %ecx -; X32-NOSSE-NEXT: addl %edx, %ecx -; X32-NOSSE-NEXT: andl %ebx, %eax -; X32-NOSSE-NEXT: andl %ebx, %ecx -; X32-NOSSE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %eax -; X32-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101 -; X32-NOSSE-NEXT: shrl $24, %ecx -; X32-NOSSE-NEXT: addl %eax, %ecx -; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NOSSE-NEXT: addl %edi, %ecx -; X32-NOSSE-NEXT: xorl %edx, %edx -; X32-NOSSE-NEXT: movl %edx, 12(%eax) -; X32-NOSSE-NEXT: movl %edx, 8(%eax) -; X32-NOSSE-NEXT: movl %edx, 4(%eax) -; X32-NOSSE-NEXT: movl %ecx, (%eax) -; X32-NOSSE-NEXT: popl %esi -; X32-NOSSE-NEXT: popl %edi -; X32-NOSSE-NEXT: popl %ebx -; X32-NOSSE-NEXT: popl %ebp -; X32-NOSSE-NEXT: retl $4 +; X86-NOSSE-LABEL: cnt128_pgso: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %ebp +; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: pushl %edi +; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOSSE-NEXT: movl %ebx, %ecx +; X86-NOSSE-NEXT: shrl %ecx +; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %edi, %ecx +; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %ecx, %ebx +; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 +; X86-NOSSE-NEXT: movl %ebx, %ebp +; X86-NOSSE-NEXT: andl %ecx, %ebp +; X86-NOSSE-NEXT: shrl $2, %ebx +; X86-NOSSE-NEXT: andl %ecx, %ebx +; X86-NOSSE-NEXT: addl %ebp, %ebx +; X86-NOSSE-NEXT: movl %ebx, %ebp +; X86-NOSSE-NEXT: shrl $4, %ebp +; X86-NOSSE-NEXT: addl %ebx, %ebp +; X86-NOSSE-NEXT: movl %eax, %ebx +; X86-NOSSE-NEXT: shrl %ebx +; X86-NOSSE-NEXT: andl %edi, %ebx +; X86-NOSSE-NEXT: subl %ebx, %eax +; X86-NOSSE-NEXT: movl %eax, %ebx +; X86-NOSSE-NEXT: andl %ecx, %ebx +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: addl %ebx, %eax +; X86-NOSSE-NEXT: movl %eax, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %eax, %edi +; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %ebx, %ebp +; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: andl %ebx, %edi +; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edi +; X86-NOSSE-NEXT: addl %eax, %edi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %ebp, %eax +; X86-NOSSE-NEXT: subl %eax, %esi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: addl %eax, %esi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: shrl $4, %eax +; X86-NOSSE-NEXT: addl %esi, %eax +; X86-NOSSE-NEXT: movl %edx, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl %ebp, %esi +; X86-NOSSE-NEXT: subl %esi, %edx +; X86-NOSSE-NEXT: movl %edx, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: shrl $2, %edx +; X86-NOSSE-NEXT: andl %ecx, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %edx, %ecx +; X86-NOSSE-NEXT: shrl $4, %ecx +; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: andl %ebx, %eax +; X86-NOSSE-NEXT: andl %ebx, %ecx +; X86-NOSSE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %ecx +; X86-NOSSE-NEXT: addl %eax, %ecx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: xorl %edx, %edx +; X86-NOSSE-NEXT: movl %edx, 12(%eax) +; X86-NOSSE-NEXT: movl %edx, 8(%eax) +; X86-NOSSE-NEXT: movl %edx, 4(%eax) +; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: popl %edi +; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: popl %ebp +; X86-NOSSE-NEXT: retl $4 ; ; X64-LABEL: cnt128_pgso: ; X64: # %bb.0: @@ -1349,24 +1349,24 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq ; -; X32-POPCNT-LABEL: cnt128_pgso: -; X32-POPCNT: # %bb.0: -; X32-POPCNT-NEXT: pushl %esi -; X32-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx -; X32-POPCNT-NEXT: addl %ecx, %edx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi -; X32-POPCNT-NEXT: addl %ecx, %esi -; X32-POPCNT-NEXT: addl %edx, %esi -; X32-POPCNT-NEXT: xorl %ecx, %ecx -; X32-POPCNT-NEXT: movl %ecx, 12(%eax) -; X32-POPCNT-NEXT: movl %ecx, 8(%eax) -; X32-POPCNT-NEXT: movl %ecx, 4(%eax) -; X32-POPCNT-NEXT: movl %esi, (%eax) -; X32-POPCNT-NEXT: popl %esi -; X32-POPCNT-NEXT: retl $4 +; X86-POPCNT-LABEL: cnt128_pgso: +; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %esi +; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X86-POPCNT-NEXT: addl %ecx, %edx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: addl %ecx, %esi +; X86-POPCNT-NEXT: addl %edx, %esi +; X86-POPCNT-NEXT: xorl %ecx, %ecx +; X86-POPCNT-NEXT: movl %ecx, 12(%eax) +; X86-POPCNT-NEXT: movl %ecx, 8(%eax) +; X86-POPCNT-NEXT: movl %ecx, 4(%eax) +; X86-POPCNT-NEXT: movl %esi, (%eax) +; X86-POPCNT-NEXT: popl %esi +; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128_pgso: ; X64-POPCNT: # %bb.0: @@ -1376,89 +1376,89 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X64-POPCNT-NEXT: xorl %edx, %edx ; X64-POPCNT-NEXT: retq ; -; X32-SSE2-LABEL: cnt128_pgso: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrlw $1, %xmm1 -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; X32-SSE2-NEXT: pand %xmm2, %xmm1 -; X32-SSE2-NEXT: psubb %xmm1, %xmm0 -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE2-NEXT: pand %xmm1, %xmm3 -; X32-SSE2-NEXT: psrlw $2, %xmm0 -; X32-SSE2-NEXT: pand %xmm1, %xmm0 -; X32-SSE2-NEXT: paddb %xmm3, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE2-NEXT: psrlw $4, %xmm3 -; X32-SSE2-NEXT: paddb %xmm0, %xmm3 -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE2-NEXT: pand %xmm0, %xmm3 -; X32-SSE2-NEXT: pxor %xmm4, %xmm4 -; X32-SSE2-NEXT: psadbw %xmm4, %xmm3 -; X32-SSE2-NEXT: movd %xmm3, %ecx -; X32-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; X32-SSE2-NEXT: movdqa %xmm3, %xmm5 -; X32-SSE2-NEXT: psrlw $1, %xmm5 -; X32-SSE2-NEXT: pand %xmm2, %xmm5 -; X32-SSE2-NEXT: psubb %xmm5, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X32-SSE2-NEXT: pand %xmm1, %xmm2 -; X32-SSE2-NEXT: psrlw $2, %xmm3 -; X32-SSE2-NEXT: pand %xmm1, %xmm3 -; X32-SSE2-NEXT: paddb %xmm2, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm1 -; X32-SSE2-NEXT: psrlw $4, %xmm1 -; X32-SSE2-NEXT: paddb %xmm3, %xmm1 -; X32-SSE2-NEXT: pand %xmm0, %xmm1 -; X32-SSE2-NEXT: psadbw %xmm4, %xmm1 -; X32-SSE2-NEXT: movd %xmm1, %edx -; X32-SSE2-NEXT: addl %ecx, %edx -; X32-SSE2-NEXT: xorl %ecx, %ecx -; X32-SSE2-NEXT: movl %ecx, 12(%eax) -; X32-SSE2-NEXT: movl %ecx, 8(%eax) -; X32-SSE2-NEXT: movl %ecx, 4(%eax) -; X32-SSE2-NEXT: movl %edx, (%eax) -; X32-SSE2-NEXT: retl $4 +; X86-SSE2-LABEL: cnt128_pgso: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: psubb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: paddb %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: paddb %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: psadbw %xmm4, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %ecx +; X86-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: psrlw $1, %xmm5 +; X86-SSE2-NEXT: pand %xmm2, %xmm5 +; X86-SSE2-NEXT: psubb %xmm5, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlw $2, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: paddb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: paddb %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: psadbw %xmm4, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %edx +; X86-SSE2-NEXT: addl %ecx, %edx +; X86-SSE2-NEXT: xorl %ecx, %ecx +; X86-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: retl $4 ; -; X32-SSSE3-LABEL: cnt128_pgso: -; X32-SSSE3: # %bb.0: -; X32-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2 -; X32-SSSE3-NEXT: pand %xmm0, %xmm2 -; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4 -; X32-SSSE3-NEXT: psrlw $4, %xmm1 -; X32-SSSE3-NEXT: pand %xmm0, %xmm1 -; X32-SSSE3-NEXT: movdqa %xmm3, %xmm2 -; X32-SSSE3-NEXT: pshufb %xmm1, %xmm2 -; X32-SSSE3-NEXT: paddb %xmm4, %xmm2 -; X32-SSSE3-NEXT: pxor %xmm1, %xmm1 -; X32-SSSE3-NEXT: psadbw %xmm1, %xmm2 -; X32-SSSE3-NEXT: movd %xmm2, %ecx -; X32-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; X32-SSSE3-NEXT: movdqa %xmm2, %xmm4 -; X32-SSSE3-NEXT: pand %xmm0, %xmm4 -; X32-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; X32-SSSE3-NEXT: pshufb %xmm4, %xmm5 -; X32-SSSE3-NEXT: psrlw $4, %xmm2 -; X32-SSSE3-NEXT: pand %xmm0, %xmm2 -; X32-SSSE3-NEXT: pshufb %xmm2, %xmm3 -; X32-SSSE3-NEXT: paddb %xmm5, %xmm3 -; X32-SSSE3-NEXT: psadbw %xmm1, %xmm3 -; X32-SSSE3-NEXT: movd %xmm3, %edx -; X32-SSSE3-NEXT: addl %ecx, %edx -; X32-SSSE3-NEXT: xorl %ecx, %ecx -; X32-SSSE3-NEXT: movl %ecx, 12(%eax) -; X32-SSSE3-NEXT: movl %ecx, 8(%eax) -; X32-SSSE3-NEXT: movl %ecx, 4(%eax) -; X32-SSSE3-NEXT: movl %edx, (%eax) -; X32-SSSE3-NEXT: retl $4 +; X86-SSSE3-LABEL: cnt128_pgso: +; X86-SSSE3: # %bb.0: +; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X86-SSSE3-NEXT: pshufb %xmm2, %xmm4 +; X86-SSSE3-NEXT: psrlw $4, %xmm1 +; X86-SSSE3-NEXT: pand %xmm0, %xmm1 +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm2 +; X86-SSSE3-NEXT: pshufb %xmm1, %xmm2 +; X86-SSSE3-NEXT: paddb %xmm4, %xmm2 +; X86-SSSE3-NEXT: pxor %xmm1, %xmm1 +; X86-SSSE3-NEXT: psadbw %xmm1, %xmm2 +; X86-SSSE3-NEXT: movd %xmm2, %ecx +; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X86-SSSE3-NEXT: movdqa %xmm2, %xmm4 +; X86-SSSE3-NEXT: pand %xmm0, %xmm4 +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm5 +; X86-SSSE3-NEXT: pshufb %xmm4, %xmm5 +; X86-SSSE3-NEXT: psrlw $4, %xmm2 +; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: pshufb %xmm2, %xmm3 +; X86-SSSE3-NEXT: paddb %xmm5, %xmm3 +; X86-SSSE3-NEXT: psadbw %xmm1, %xmm3 +; X86-SSSE3-NEXT: movd %xmm3, %edx +; X86-SSSE3-NEXT: addl %ecx, %edx +; X86-SSSE3-NEXT: xorl %ecx, %ecx +; X86-SSSE3-NEXT: movl %ecx, 12(%eax) +; X86-SSSE3-NEXT: movl %ecx, 8(%eax) +; X86-SSSE3-NEXT: movl %ecx, 4(%eax) +; X86-SSSE3-NEXT: movl %edx, (%eax) +; X86-SSSE3-NEXT: retl $4 %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) ret i128 %cnt } -- 2.7.4