From: Simon Pilgrim
Date: Fri, 21 Oct 2016 10:50:52 +0000 (+0000)
Subject: [X86][AVX] Add 32-bit target tests for vector lzcnt/tzcnt to demonstrate missed folding opportunities
X-Git-Tag: llvmorg-4.0.0-rc1~6606
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f447fbf913dcdbe651764e452c8abc1d32656e46;p=platform%2Fupstream%2Fllvm.git

[X86][AVX] Add 32-bit target tests for vector lzcnt/tzcnt to demonstrate missed folding opportunities

llvm-svn: 284816
---

diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
index a145621..e288f32 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -3,6 +3,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
+;
+; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
 
 define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-LABEL: testv4i64:
@@ -101,6 +104,36 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
 ; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv4i64:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4
+; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
+; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; X32-AVX-NEXT: vpsrlw $8, %ymm2, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm1, %ymm2
+; X32-AVX-NEXT: vpsrlw $8, %ymm1, %ymm1
+; X32-AVX-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; X32-AVX-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
+; X32-AVX-NEXT: vpsrld $16, %ymm2, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm1, %ymm2
+; X32-AVX-NEXT: vpsrld $16, %ymm1, %ymm1
+; X32-AVX-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; X32-AVX-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
+; X32-AVX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 0)
 ret <4 x i64> %out
 }
@@ -203,6 +236,36 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
 ; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv4i64u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4
+; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
+; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; X32-AVX-NEXT: vpsrlw $8, %ymm2, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm1, %ymm2
+; X32-AVX-NEXT: vpsrlw $8, %ymm1, %ymm1
+; X32-AVX-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; X32-AVX-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
+; X32-AVX-NEXT: vpsrld $16, %ymm2, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm1, %ymm2
+; X32-AVX-NEXT: vpsrld $16, %ymm1, %ymm1
+; X32-AVX-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; X32-AVX-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
+; X32-AVX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 -1)
 ret <4 x i64> %out
 }
@@ -290,6 +353,31 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
 ; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv8i32:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4
+; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
+; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; X32-AVX-NEXT: vpsrlw $8, %ymm2, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm1, %ymm2
+; X32-AVX-NEXT: vpsrlw $8, %ymm1, %ymm1
+; X32-AVX-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; X32-AVX-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsrld $16, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: vpsrld $16, %ymm1, %ymm1
+; X32-AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 0)
 ret <8 x i32> %out
 }
@@ -377,6 +465,31 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
 ; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv8i32u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4
+; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
+; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; X32-AVX-NEXT: vpsrlw $8, %ymm2, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm1, %ymm2
+; X32-AVX-NEXT: vpsrlw $8, %ymm1, %ymm1
+; X32-AVX-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; X32-AVX-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsrld $16, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: vpsrld $16, %ymm1, %ymm1
+; X32-AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 -1)
 ret <8 x i32> %out
 }
@@ -445,6 +558,26 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT: retq
+;
+; X32-AVX-LABEL: testv16i16:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4
+; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
+; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: vpsrlw $8, %ymm1, %ymm1
+; X32-AVX-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 0)
 ret <16 x i16> %out
 }
@@ -512,6 +645,26 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT: retq
+;
+; X32-AVX-LABEL: testv16i16u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4
+; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
+; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: vpsrlw $8, %ymm1, %ymm1
+; X32-AVX-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 -1)
 ret <16 x i16> %out
 }
@@ -586,6 +739,21 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
 ; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv32i8:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT: vpand %ymm1, %ymm2, %ymm1
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 0)
 ret <32 x i8> %out
 }
@@ -660,6 +828,21 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
 ; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv32i8u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT: vpand %ymm1, %ymm2, %ymm1
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 -1)
 ret <32 x i8> %out
 }
@@ -674,6 +857,37 @@ define <4 x i64> @foldv4i64() nounwind {
 ; AVX512: ## BB#0:
 ; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
 ; AVX512-NEXT: retq
+;
+; X32-AVX-LABEL: foldv4i64:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [256,0,4294967295,4294967295,0,0,255,0]
+; X32-AVX-NEXT: vpand %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm1, %ymm4
+; X32-AVX-NEXT: vpand %ymm0, %ymm4, %ymm0
+; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm5
+; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm2
+; X32-AVX-NEXT: vpsrlw $8, %ymm2, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm2
+; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0
+; X32-AVX-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1
+; X32-AVX-NEXT: vpsrld $16, %ymm1, %ymm1
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT: vpsrld $16, %ymm0, %ymm0
+; X32-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4294967295,0,0,4294967295,4294967295,0,4294967295]
+; X32-AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
+; X32-AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
 ret <4 x i64> %out
 }
@@ -688,6 +902,37 @@ define <4 x i64> @foldv4i64u() nounwind {
 ; AVX512: ## BB#0:
 ; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
 ; AVX512-NEXT: retq
+;
+; X32-AVX-LABEL: foldv4i64u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [256,0,4294967295,4294967295,0,0,255,0]
+; X32-AVX-NEXT: vpand %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm1, %ymm4
+; X32-AVX-NEXT: vpand %ymm0, %ymm4, %ymm0
+; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm5
+; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm2
+; X32-AVX-NEXT: vpsrlw $8, %ymm2, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm2
+; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0
+; X32-AVX-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1
+; X32-AVX-NEXT: vpsrld $16, %ymm1, %ymm1
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT: vpsrld $16, %ymm0, %ymm0
+; X32-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4294967295,0,0,4294967295,4294967295,0,4294967295]
+; X32-AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
+; X32-AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
 ret <4 x i64> %out
 }
@@ -702,6 +947,11 @@ define <8 x i32> @foldv8i32() nounwind {
 ; AVX512: ## BB#0:
 ; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
 ; AVX512-NEXT: retq
+;
+; X32-AVX-LABEL: foldv8i32:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
+; X32-AVX-NEXT: retl
 %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> , i1 0)
 ret <8 x i32> %out
 }
@@ -716,6 +966,11 @@ define <8 x i32> @foldv8i32u() nounwind {
 ; AVX512: ## BB#0:
 ; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
 ; AVX512-NEXT: retq
+;
+; X32-AVX-LABEL: foldv8i32u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
+; X32-AVX-NEXT: retl
 %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> , i1 -1)
 ret <8 x i32> %out
 }
@@ -730,6 +985,11 @@ define <16 x i16> @foldv16i16() nounwind {
 ; AVX512: ## BB#0:
 ; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
 ; AVX512-NEXT: retq
+;
+; X32-AVX-LABEL: foldv16i16:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
+; X32-AVX-NEXT: retl
 %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> , i1 0)
 ret <16 x i16> %out
 }
@@ -744,6 +1004,11 @@ define <16 x i16> @foldv16i16u() nounwind {
 ; AVX512: ## BB#0:
 ; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
 ; AVX512-NEXT: retq
+;
+; X32-AVX-LABEL: foldv16i16u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
+; X32-AVX-NEXT: retl
 %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> , i1 -1)
 ret <16 x i16> %out
 }
@@ -758,6 +1023,11 @@ define <32 x i8> @foldv32i8() nounwind {
 ; AVX512: ## BB#0:
 ; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
 ; AVX512-NEXT: retq
+;
+; X32-AVX-LABEL: foldv32i8:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
+; X32-AVX-NEXT: retl
 %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> , i1 0)
 ret <32 x i8> %out
 }
@@ -772,6 +1042,11 @@ define <32 x i8> @foldv32i8u() nounwind {
 ; AVX512: ## BB#0:
 ; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
 ; AVX512-NEXT: retq
+;
+; X32-AVX-LABEL: foldv32i8u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
+; X32-AVX-NEXT: retl
 %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> , i1 -1)
 ret <32 x i8> %out
 }
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
index ffac692..db9081d 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -3,6 +3,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
+;
+; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
 
 define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-LABEL: testv4i64:
@@ -88,6 +91,23 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
 ; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv4i64:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
 ret <4 x i64> %out
 }
@@ -161,6 +181,23 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
 ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv4i64u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
 ret <4 x i64> %out
 }
@@ -269,6 +306,28 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv8i32:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2
+; X32-AVX-NEXT: vpsubd %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; X32-AVX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
+; X32-AVX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
 ret <8 x i32> %out
 }
@@ -354,6 +413,28 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
 ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv8i32u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2
+; X32-AVX-NEXT: vpsubd %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; X32-AVX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
+; X32-AVX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
 ret <8 x i32> %out
 }
@@ -450,6 +531,25 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv16i16:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsubw {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsllw $8, %ymm0, %ymm1
+; X32-AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
 ret <16 x i16> %out
 }
@@ -546,6 +646,25 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv16i16u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsubw {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsllw $8, %ymm0, %ymm1
+; X32-AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
 ret <16 x i16> %out
 }
@@ -627,6 +746,22 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv32i8:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsubb {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
 ret <32 x i8> %out
 }
@@ -708,78 +843,160 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
 ; AVX512CD-NEXT: retq
+;
+; X32-AVX-LABEL: testv32i8u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsubb {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
 ret <32 x i8> %out
 }
 
 define <4 x i64> @foldv4i64() nounwind {
-; ALL-LABEL: foldv4i64:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
-; ALL-NEXT: retq
+; AVX-LABEL: foldv4i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: foldv4i64:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [256,0,4294967295,4294967295,0,0,255,0]
+; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
 ret <4 x i64> %out
 }
 
 define <4 x i64> @foldv4i64u() nounwind {
-; ALL-LABEL: foldv4i64u:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
-; ALL-NEXT: retq
+; AVX-LABEL: foldv4i64u:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: foldv4i64u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [256,0,4294967295,4294967295,0,0,255,0]
+; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
+; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
 %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
 ret <4 x i64> %out
 }
 
 define <8 x i32> @foldv8i32() nounwind {
-; ALL-LABEL: foldv8i32:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; ALL-NEXT: retq
+; AVX-LABEL: foldv8i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: foldv8i32:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; X32-AVX-NEXT: retl
 %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 0)
 ret <8 x i32> %out
 }
 
 define <8 x i32> @foldv8i32u() nounwind {
-; ALL-LABEL: foldv8i32u:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; ALL-NEXT: retq
+; AVX-LABEL: foldv8i32u:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: foldv8i32u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; X32-AVX-NEXT: retl
 %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 -1)
 ret <8 x i32> %out
 }
 
 define <16 x i16> @foldv16i16() nounwind {
-; ALL-LABEL: foldv16i16:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; ALL-NEXT: retq
+; AVX-LABEL: foldv16i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: foldv16i16:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; X32-AVX-NEXT: retl
 %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> , i1 0)
 ret <16 x i16> %out
 }
 
 define <16 x i16> @foldv16i16u() nounwind {
-; ALL-LABEL: foldv16i16u:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; ALL-NEXT: retq
+; AVX-LABEL: foldv16i16u:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: foldv16i16u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; X32-AVX-NEXT: retl
 %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> , i1 -1)
 ret <16 x i16> %out
 }
 
 define <32 x i8> @foldv32i8() nounwind {
-; ALL-LABEL: foldv32i8:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; ALL-NEXT: retq
+; AVX-LABEL: foldv32i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: foldv32i8:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; X32-AVX-NEXT: retl
 %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> , i1 0)
 ret <32 x i8> %out
 }
 
 define <32 x i8> @foldv32i8u() nounwind {
-; ALL-LABEL: foldv32i8u:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; ALL-NEXT: retq
+; AVX-LABEL: foldv32i8u:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: foldv32i8u:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; X32-AVX-NEXT: retl
 %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> , i1 -1)
 ret <32 x i8> %out
 }
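
Note: the missed fold these X32-AVX runs demonstrate is easiest to see in foldv4i64 above. On the 64-bit targets the constant cttz call already folds to a single constant-pool load (the "vmovaps {{.*#+}} ymm0 = [8,0,64,0]" checks), while the i686 run still emits the full negate/and/bit-twiddling expansion. A hedged sketch of the checks foldv4i64 could carry once the same fold fires on the 32-bit target (hypothetical future output, not produced by this commit; the i64 results 8, 0, 64, 0 would print as eight i32 lanes on i686, matching how the other X32-AVX constants are rendered above):

; X32-AVX-LABEL: foldv4i64:
; X32-AVX: # BB#0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0]
; X32-AVX-NEXT: retl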