From 7920527796eacd596d362ab5266e23189565892e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 24 Mar 2021 14:05:43 +0000
Subject: [PATCH] [X86][AVX] combineBitcastvxi1 - improve handling of vectors
 truncated to vXi1

If we're truncating to vXi1 from a wider type, then prefer the original
wider vector, as it simplifies folding the separate truncations +
extensions.

On AVX1 this is only worth it for v8i1 cases, not v4i1, where we're
always better off truncating down to v4i32 for movmsk.

Helps with some regressions encountered in D96609
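
As an illustration, consider an i8 bitcast of a v8i1 truncation, as in
this minimal IR sketch (the function name is illustrative and not an
existing test, but the pattern matches the updated trunc_v8i32_v8i1
checks below). It can now keep the 256-bit v8i32 source and feed
vmovmskps directly, instead of first packing the truncated result down
to 128 bits:

  define i8 @bitcast_trunc_v8i32_v8i1(<8 x i32> %a) {
    ; truncate each i32 lane to an i1 ...
    %t = trunc <8 x i32> %a to <8 x i1>
    ; ... and bitcast the v8i1 mask to an i8 bitmask
    %b = bitcast <8 x i1> %t to i8
    ret i8 %b
  }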
---
 llvm/lib/Target/X86/X86ISelLowering.cpp         | 23 +++++---
 llvm/test/CodeGen/X86/vector-reduce-and-bool.ll | 73 +++++++++++------------
 llvm/test/CodeGen/X86/vector-reduce-or-bool.ll  | 77 +++++++++++++------------
 llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll | 73 +++++++++++------------
 4 files changed, 124 insertions(+), 122 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bdb09b9..350a1b7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39201,17 +39201,22 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
       Op, DemandedBits, DemandedElts, DAG, Depth);
 }
 
-// Helper to peek through bitops/setcc to determine size of source vector.
+// Helper to peek through bitops/trunc/setcc to determine size of source vector.
 // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
-static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
+static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
+                                      bool AllowTruncate) {
   switch (Src.getOpcode()) {
+  case ISD::TRUNCATE:
+    if (!AllowTruncate)
+      return false;
+    LLVM_FALLTHROUGH;
   case ISD::SETCC:
     return Src.getOperand(0).getValueSizeInBits() == Size;
   case ISD::AND:
   case ISD::XOR:
   case ISD::OR:
-    return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
-           checkBitcastSrcVectorSize(Src.getOperand(1), Size);
+    return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
+           checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
   }
   return false;
 }
@@ -39266,6 +39271,7 @@ static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
                                           SDValue Src, const SDLoc &DL) {
   switch (Src.getOpcode()) {
   case ISD::SETCC:
+  case ISD::TRUNCATE:
     return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
   case ISD::AND:
   case ISD::XOR:
@@ -39349,7 +39355,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
     SExtVT = MVT::v4i32;
     // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
     // sign-extend to a 256-bit operation to avoid truncation.
-    if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
+    if (Subtarget.hasAVX() &&
+        checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
       SExtVT = MVT::v4i64;
       PropagateSExt = true;
     }
@@ -39361,8 +39368,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
     // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
     // 256-bit because the shuffle is cheaper than sign extending the result of
     // the compare.
-    if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
-                               checkBitcastSrcVectorSize(Src, 512))) {
+    if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
+                               checkBitcastSrcVectorSize(Src, 512, true))) {
       SExtVT = MVT::v8i32;
       PropagateSExt = true;
     }
@@ -39387,7 +39394,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
       break;
     }
     // Split if this is a <64 x i8> comparison result.
-    if (checkBitcastSrcVectorSize(Src, 512)) {
+    if (checkBitcastSrcVectorSize(Src, 512, false)) {
       SExtVT = MVT::v64i8;
       break;
     }
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index 2958f7e..6965d2bc 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -219,16 +219,25 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: trunc_v4i64_v4i1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX-NEXT:    vmovmskps %xmm0, %eax
-; AVX-NEXT:    cmpb $15, %al
-; AVX-NEXT:    sete %al
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_v4i64_v4i1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vmovmskps %xmm0, %eax
+; AVX1-NEXT:    cmpb $15, %al
+; AVX1-NEXT:    sete %al
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_v4i64_v4i1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $63, %ymm0, %ymm0
+; AVX2-NEXT:    vmovmskpd %ymm0, %eax
+; AVX2-NEXT:    cmpb $15, %al
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_v4i64_v4i1:
 ; AVX512F:       # %bb.0:
@@ -296,14 +305,11 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 ;
 ; AVX1-LABEL: trunc_v8i32_v8i1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    cmpb $-1, %al
 ; AVX1-NEXT:    sete %al
 ; AVX1-NEXT:    vzeroupper
@@ -311,11 +317,8 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 ;
 ; AVX2-LABEL: trunc_v8i32_v8i1:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vmovmskps %ymm0, %eax
 ; AVX2-NEXT:    cmpb $-1, %al
 ; AVX2-NEXT:    sete %al
 ; AVX2-NEXT:    vzeroupper
@@ -536,17 +539,14 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
 ;
 ; AVX1-LABEL: trunc_v8i64_v8i1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    cmpb $-1, %al
 ; AVX1-NEXT:    sete %al
 ; AVX1-NEXT:    vzeroupper
@@ -557,11 +557,8 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vmovmskps %ymm0, %eax
 ; AVX2-NEXT:    cmpb $-1, %al
 ; AVX2-NEXT:    sete %al
 ; AVX2-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index 283939f..1363f5f 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -212,16 +212,25 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
 ; SSE-NEXT:    setne %al
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: trunc_v4i64_v4i1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX-NEXT:    vmovmskps %xmm0, %eax
-; AVX-NEXT:    testb %al, %al
-; AVX-NEXT:    setne %al
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_v4i64_v4i1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vmovmskps %xmm0, %eax
+; AVX1-NEXT:    testb %al, %al
+; AVX1-NEXT:    setne %al
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_v4i64_v4i1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $63, %ymm0, %ymm0
+; AVX2-NEXT:    vmovmskpd %ymm0, %eax
+; AVX2-NEXT:    testb %al, %al
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_v4i64_v4i1:
 ; AVX512F:       # %bb.0:
@@ -285,25 +294,21 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 ;
 ; AVX1-LABEL: trunc_v8i32_v8i1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovmskb %xmm0, %eax
-; AVX1-NEXT:    testl $43690, %eax # imm = 0xAAAA
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovmskps %ymm0, %eax
+; AVX1-NEXT:    testb %al, %al
 ; AVX1-NEXT:    setne %al
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: trunc_v8i32_v8i1:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovmskb %xmm0, %eax
-; AVX2-NEXT:    testl $43690, %eax # imm = 0xAAAA
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vmovmskps %ymm0, %eax
+; AVX2-NEXT:    testb %al, %al
 ; AVX2-NEXT:    setne %al
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -521,17 +526,15 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
 ;
 ; AVX1-LABEL: trunc_v8i64_v8i1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovmskb %xmm0, %eax
-; AVX1-NEXT:    testl $43690, %eax # imm = 0xAAAA
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovmskps %ymm0, %eax
+; AVX1-NEXT:    testb %al, %al
 ; AVX1-NEXT:    setne %al
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -541,11 +544,9 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovmskb %xmm0, %eax
-; AVX2-NEXT:    testl $43690, %eax # imm = 0xAAAA
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vmovmskps %ymm0, %eax
+; AVX2-NEXT:    testb %al, %al
 ; AVX2-NEXT:    setne %al
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index a60dbcd..96b4d88 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -215,16 +215,25 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
 ; SSE-NEXT:    setnp %al
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: trunc_v4i64_v4i1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX-NEXT:    vmovmskps %xmm0, %eax
-; AVX-NEXT:    testb %al, %al
-; AVX-NEXT:    setnp %al
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_v4i64_v4i1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vmovmskps %xmm0, %eax
+; AVX1-NEXT:    testb %al, %al
+; AVX1-NEXT:    setnp %al
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_v4i64_v4i1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $63, %ymm0, %ymm0
+; AVX2-NEXT:    vmovmskpd %ymm0, %eax
+; AVX2-NEXT:    testb %al, %al
+; AVX2-NEXT:    setnp %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_v4i64_v4i1:
 ; AVX512F:       # %bb.0:
@@ -290,14 +299,11 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 ;
 ; AVX1-LABEL: trunc_v8i32_v8i1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    testb %al, %al
 ; AVX1-NEXT:    setnp %al
 ; AVX1-NEXT:    vzeroupper
@@ -305,11 +311,8 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 ;
 ; AVX2-LABEL: trunc_v8i32_v8i1:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vmovmskps %ymm0, %eax
 ; AVX2-NEXT:    testb %al, %al
 ; AVX2-NEXT:    setnp %al
 ; AVX2-NEXT:    vzeroupper
@@ -548,17 +551,14 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
 ;
 ; AVX1-LABEL: trunc_v8i64_v8i1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    testb %al, %al
 ; AVX1-NEXT:    setnp %al
 ; AVX1-NEXT:    vzeroupper
@@ -569,11 +569,8 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vmovmskps %ymm0, %eax
 ; AVX2-NEXT:    testb %al, %al
 ; AVX2-NEXT:    setnp %al
 ; AVX2-NEXT:    vzeroupper
-- 
2.7.4