From a71c0ed471eb06e2879d8687b427dcf6333474c3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 24 Mar 2019 16:30:35 +0000
Subject: [PATCH] [X86][AVX] Start shuffle combining from
 ZERO_EXTEND_VECTOR_INREG (PR40685)

Just enable this for AVX for now as SSE41 introduces extra register
moves for the PMOVZX(PSHUFD(V)) -> UNPCKH(V,0) pattern (but otherwise
helps reduce port5 usage on Intel targets).

Only AVX support is required for PR40685 as the issue is due to
8i8->8i32 zext shuffle leftovers.

llvm-svn: 356858
---
 llvm/lib/Target/X86/X86ISelLowering.cpp          |  16 +++-
 llvm/test/CodeGen/X86/avg.ll                     | 110 ++++++++++-----
 llvm/test/CodeGen/X86/avx512-vec-cmp.ll          |   7 +-
 llvm/test/CodeGen/X86/known-bits-vector.ll       |  16 ++--
 llvm/test/CodeGen/X86/psubus.ll                  |  12 +--
 llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll    |  44 ++++-----
 llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll    |  96 +++++++++-----
 llvm/test/CodeGen/X86/vector-reduce-umax.ll      |  25 +++---
 llvm/test/CodeGen/X86/vector-reduce-umin.ll      |  25 +++---
 llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll |  17 ++--
 llvm/test/CodeGen/X86/vector-zext.ll             | 103 ++++++++++++-------
 11 files changed, 248 insertions(+), 223 deletions(-)
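For context before the code change: PR40685 boils down to an 8 x i8 -> 8 x i32
zero extension whose legalization leaves ZERO_EXTEND_VECTOR_INREG nodes behind.
A minimal IR reproducer in that spirit (illustrative only; the function name is
made up here, not taken from the bug report):

    define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %a) {
      %z = zext <8 x i8> %a to <8 x i32>
      ret <8 x i32> %z
    }

With this patch, the leftover in-register zero-extension shuffles from cases
like this are fed into the generic x86 shuffle combiner on AVX targets.
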
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2bb03bf..67631f7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1888,6 +1888,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
+  setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::SETCC);
@@ -42597,7 +42598,8 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
   // Disabling for widening legalization for now. We can enable if we find a
   // case that needs it. Otherwise it can be deleted when we switch to
   // widening legalization.
@@ -42613,6 +42615,15 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG) {
       TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType()))
     return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0));
 
+  // Attempt to combine as a shuffle.
+  // TODO: SSE41 support
+  if (Subtarget.hasAVX() && N->getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) {
+    SDValue Op(N, 0);
+    if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
+      if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+        return Res;
+  }
+
   return SDValue();
 }
 
@@ -42679,7 +42690,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
-  case ISD::ANY_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG);
+  case ISD::ANY_EXTEND_VECTOR_INREG:
+  case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, Subtarget);
   case ISD::SETCC:  return combineSetCC(N, DAG, Subtarget);
   case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
   case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
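The PMOVZX(PSHUFD(V)) -> UNPCKH(V,0) trade-off named in the commit message is
visible throughout the test churn below. Schematically, for the word-to-dword
case (the before/after sequences are taken from the known-bits-vector.ll diff
in this patch):

    ; before: move the high half down, then zero extend the low words
    vpshufd   {{.*#+}} xmm0 = xmm0[2,3,0,1]
    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    ; after: interleave the high words with zero, one shuffle instead of two
    vpxor      %xmm1, %xmm1, %xmm1
    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]

The VPXOR is a zeroing idiom that recent Intel cores resolve at register
rename, so on AVX this nets out to one shuffle (port 5) uop fewer; on SSE41
the same rewrite currently costs extra register moves, hence the TODO above.
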
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 943fb4e..7bdcd9f 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -2047,78 +2047,80 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    pushq %r13
 ; AVX1-NEXT:    pushq %r12
 ; AVX1-NEXT:    pushq %rbx
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; AVX1-NEXT:    vpextrq $1, %xmm5, %r15
-; AVX1-NEXT:    vmovq %xmm5, %r12
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX1-NEXT:    vpextrq $1, %xmm6, %r15
+; AVX1-NEXT:    vmovq %xmm6, %r12
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm2, %r11
 ; AVX1-NEXT:    vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
 ; AVX1-NEXT:    vpextrq $1, %xmm2, %r13
 ; AVX1-NEXT:    vmovq %xmm2, %r14
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
-; AVX1-NEXT:    vpextrq $1, %xmm5, %rbx
-; AVX1-NEXT:    vmovq %xmm5, %rdx
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX1-NEXT:    vpextrq $1, %xmm6, %rbx
+; AVX1-NEXT:    vmovq %xmm6, %rdx
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %r9
 ; AVX1-NEXT:    vmovq %xmm1, %r10
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
-; AVX1-NEXT:    vmovd %xmm6, %esi
-; AVX1-NEXT:    vpextrd $1, %xmm6, %edi
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX1-NEXT:    vmovd %xmm7, %esi
+; AVX1-NEXT:    vpextrd $1, %xmm7, %edi
+; AVX1-NEXT:    vpextrd $2, %xmm7, %ecx
+; AVX1-NEXT:    vpextrd $3, %xmm7, %ebp
+; AVX1-NEXT:    vpextrd $3, %xmm6, %eax
+; AVX1-NEXT:    leal -1(%rbp,%rax), %eax
+; AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX1-NEXT:    vpextrd $2, %xmm6, %eax
-; AVX1-NEXT:    vpextrd $3, %xmm6, %ebp
-; AVX1-NEXT:    vpextrd $3, %xmm5, %ecx
-; AVX1-NEXT:    leal -1(%rbp,%rcx), %ecx
-; AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT:    vpextrd $2, %xmm5, %ecx
-; AVX1-NEXT:    leal -1(%rax,%rcx), %eax
+; AVX1-NEXT:    leal -1(%rcx,%rax), %eax
 ; AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT:    vpextrd $1, %xmm5, %ecx
-; AVX1-NEXT:    leal -1(%rdi,%rcx), %eax
+; AVX1-NEXT:    vpextrd $1, %xmm6, %eax
+; AVX1-NEXT:    leal -1(%rdi,%rax), %eax
 ; AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT:    vmovd %xmm5, %ecx
-; AVX1-NEXT:    leal -1(%rsi,%rcx), %r8d
-; AVX1-NEXT:    vpextrq $1, %xmm4, %rcx
+; AVX1-NEXT:    vmovd %xmm6, %eax
+; AVX1-NEXT:    leal -1(%rsi,%rax), %r8d
+; AVX1-NEXT:    vpextrq $1, %xmm5, %rax
 ; AVX1-NEXT:    leal -1(%r15,%rbx), %r15d
-; AVX1-NEXT:    vmovq %xmm4, %rsi
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT:    vmovq %xmm5, %rsi
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
 ; AVX1-NEXT:    leal -1(%r12,%rdx), %edx
-; AVX1-NEXT:    vmovd %xmm2, %r12d
+; AVX1-NEXT:    vmovd %xmm4, %r12d
 ; AVX1-NEXT:    leal -1(%r11,%r9), %r11d
-; AVX1-NEXT:    vpextrd $1, %xmm2, %edi
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    leal -1(%rax,%r10), %r10d
+; AVX1-NEXT:    vpextrd $2, %xmm4, %edi
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT:    leal -1(%rcx,%r10), %r10d
 ; AVX1-NEXT:    vpextrd $2, %xmm2, %ebx
-; AVX1-NEXT:    leal -1(%r13,%rcx), %r9d
-; AVX1-NEXT:    vpextrd $3, %xmm2, %ecx
+; AVX1-NEXT:    leal -1(%r13,%rax), %r9d
+; AVX1-NEXT:    vpextrd $3, %xmm2, %eax
 ; AVX1-NEXT:    leal -1(%r14,%rsi), %esi
-; AVX1-NEXT:    vpextrd $3, %xmm3, %eax
-; AVX1-NEXT:    leal -1(%rcx,%rax), %ecx
-; AVX1-NEXT:    vpextrd $2, %xmm3, %eax
-; AVX1-NEXT:    leal -1(%rbx,%rax), %ebx
-; AVX1-NEXT:    vpextrd $1, %xmm3, %eax
-; AVX1-NEXT:    leal -1(%rdi,%rax), %eax
+; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
+; AVX1-NEXT:    leal -1(%rax,%rcx), %eax
+; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT:    leal -1(%rbx,%rcx), %ebx
+; AVX1-NEXT:    vpextrd $2, %xmm3, %ecx
+; AVX1-NEXT:    leal -1(%rdi,%rcx), %ecx
 ; AVX1-NEXT:    vmovd %xmm3, %edi
 ; AVX1-NEXT:    leal -1(%r12,%rdi), %edi
-; AVX1-NEXT:    vpextrq $1, %xmm0, %r12
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm1, %r13
+; AVX1-NEXT:    vpextrq $1, %xmm8, %r12
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r13
 ; AVX1-NEXT:    leal -1(%r12,%r13), %r12d
-; AVX1-NEXT:    vmovq %xmm0, %r13
-; AVX1-NEXT:    vmovq %xmm1, %r14
+; AVX1-NEXT:    vmovq %xmm8, %r13
+; AVX1-NEXT:    vmovq %xmm0, %r14
 ; AVX1-NEXT:    leal -1(%r13,%r14), %ebp
 ; AVX1-NEXT:    shrl %ebp
 ; AVX1-NEXT:    vmovd %ebp, %xmm0
@@ -2138,12 +2140,12 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    vpinsrb $7, %r15d, %xmm0, %xmm0
 ; AVX1-NEXT:    shrl %edi
 ; AVX1-NEXT:    vpinsrb $8, %edi, %xmm0, %xmm0
-; AVX1-NEXT:    shrl %eax
-; AVX1-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %ecx
+; AVX1-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
 ; AVX1-NEXT:    shrl %ebx
 ; AVX1-NEXT:    vpinsrb $10, %ebx, %xmm0, %xmm0
-; AVX1-NEXT:    shrl %ecx
-; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    shrl %eax
+; AVX1-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; AVX1-NEXT:    shrl %r8d
 ; AVX1-NEXT:    vpinsrb $12, %r8d, %xmm0, %xmm0
 ; AVX1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index 2040ccc..5b9b400 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -940,9 +940,10 @@ define <2 x i64> @test46(<2 x float> %x, <2 x float> %y) #0 {
 ; AVX512-LABEL: test46:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x00]
-; AVX512-NEXT:    vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0]
-; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
+; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
+; AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x14,0xc1]
+; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x54,0x05,A,A,A,A]
 ; AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI47_0-4, kind: reloc_riprel_4byte
 ; AVX512-NEXT:    retq ## encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll
index d0eaed8..53a43b9 100644
--- a/llvm/test/CodeGen/X86/known-bits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-bits-vector.ll
@@ -76,15 +76,15 @@ define <4 x i32> @knownbits_mask_shuffle_sext(<8 x i16> %a0) nounwind {
 ; X32-LABEL: knownbits_mask_shuffle_sext:
 ; X32:       # %bb.0:
 ; X32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X32-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: knownbits_mask_shuffle_sext:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X64-NEXT:    retq
 %1 = and <8 x i16> %a0,
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32>
@@ -96,15 +96,15 @@ define <4 x i32> @knownbits_mask_shuffle_shuffle_sext(<8 x i16> %a0) nounwind {
 ; X32-LABEL: knownbits_mask_shuffle_shuffle_sext:
 ; X32:       # %bb.0:
 ; X32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X32-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: knownbits_mask_shuffle_shuffle_sext:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X64-NEXT:    retq
 %1 = and <8 x i16> %a0,
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32>
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 39dbf40..e5c3abc 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -1950,16 +1950,10 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
 ; AVX2-NEXT:    vpminud %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpsubusw %xmm1, %xmm3, %xmm1
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsubusw %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: psubus_16i32_max:
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index c34f46b..62bcc54 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -352,18 +352,16 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
 ; AVX1-NEXT:    vpsllw $7, %xmm3, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm3
-; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpsllw $7, %xmm5, %xmm5
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
-; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpsllw $7, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
@@ -864,18 +862,16 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
 ; AVX1-NEXT:    vpsllw $7, %xmm3, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm3
-; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpsllw $7, %xmm5, %xmm5
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
-; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpsllw $7, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
@@ -888,16 +884,14 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 32b49a7..f40a079 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -253,18 +253,16 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
 ; AVX1-NEXT:    vpsllw $7, %xmm4, %xmm4
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm4
-; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT:    vpsllw $7, %xmm6, %xmm6
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4,5,6,7]
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpsllw $7, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsubb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
@@ -286,18 +284,16 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
 ; AVX1-NEXT:    vpsllw $7, %xmm4, %xmm4
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm4
-; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpsllw $7, %xmm6, %xmm6
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6],xmm6[7]
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpsllw $7, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
@@ -688,18 +684,16 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
 ; AVX1-NEXT:    vpsllw $7, %xmm4, %xmm4
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm4
-; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT:    vpsllw $7, %xmm6, %xmm6
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4,5,6,7]
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpsllw $7, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm4
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm5, %xmm5
@@ -708,39 +702,35 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
 ; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
 ; AVX1-NEXT:    vpackuswb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
-; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm5
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
 ; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
 ; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
-; AVX1-NEXT:    vpackuswb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpsubb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT:    vpsllw $7, %xmm5, %xmm5
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm5
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpsllw $7, %xmm7, %xmm7
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7]
-; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT:    vpackuswb %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
 ; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT:    vpackuswb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpsllw $7, %xmm6, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6],xmm6[7]
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsubb %xmm4, %xmm0, %xmm5
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm6, %xmm6
@@ -753,16 +743,14 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
 ; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm4, %xmm4
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
-; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
index 32ea3f5..c56ca54 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -772,32 +772,29 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ; AVX1-LABEL: test_v2i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm1
+; AVX1-NEXT:    vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_v2i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm1
+; AVX2-NEXT:    vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_v2i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512BW-NEXT:    vpmaxuq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vpmaxuq %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
index e3adb9f..d7e9838 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
@@ -771,32 +771,29 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ; AVX1-LABEL: test_v2i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_v2i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm1
+; AVX2-NEXT:    vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_v2i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512BW-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vpminuq %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index bb73165..1d66383 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -307,18 +307,17 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
 ;
 ; AVX1-LABEL: var_shift_v8i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
-; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
 ; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index 1b90798..c3c019c 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -123,15 +123,14 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
 ;
 ; AVX1-LABEL: zext_32i8_to_32i16:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -585,15 +584,14 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
 ;
 ; AVX1-LABEL: zext_16i16_to_16i32:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -884,15 +882,14 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
 ;
 ; AVX1-LABEL: zext_8i32_to_8i64:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1736,11 +1733,33 @@ define <2 x i64> @shuf_zext_16i8_to_2i64_offset6(<16 x i8> %A) nounwind uwtable
 ; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuf_zext_16i8_to_2i64_offset6:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuf_zext_16i8_to_2i64_offset6:
+; AVX2-SLOW:       # %bb.0: # %entry
+; AVX2-SLOW-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuf_zext_16i8_to_2i64_offset6:
+; AVX2-FAST:       # %bb.0: # %entry
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: shuf_zext_16i8_to_2i64_offset6:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: shuf_zext_16i8_to_2i64_offset6:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    retq
 entry:
 %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <16 x i32>
 %Z = bitcast <16 x i8> %B to <2 x i64>
@@ -1824,11 +1843,33 @@ define <2 x i64> @shuf_zext_8i16_to_2i64_offset6(<8 x i16> %A) nounwind uwtable
 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuf_zext_8i16_to_2i64_offset6:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuf_zext_8i16_to_2i64_offset6:
+; AVX2-SLOW:       # %bb.0: # %entry
+; AVX2-SLOW-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuf_zext_8i16_to_2i64_offset6:
+; AVX2-FAST:       # %bb.0: # %entry
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512F-LABEL: shuf_zext_8i16_to_2i64_offset6:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: shuf_zext_8i16_to_2i64_offset6:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    retq
 entry:
 %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32>
 %Z = bitcast <8 x i16> %B to <2 x i64>
--
2.7.4