From 4742715eb7fbd90b2d7ca807980bdca7e552fcd2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 30 Jun 2023 14:42:38 +0100
Subject: [PATCH] [DAG] Fold (*ext (*_extend_vector_inreg x)) -> (*_extend_vector_inreg x)

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  22 ++++
 llvm/test/CodeGen/X86/masked_store.ll         |  82 ++++++--------
 llvm/test/CodeGen/X86/pr45563-2.ll            | 154 +++++++++++++-------------
 llvm/test/CodeGen/X86/pr45833.ll              | 137 +++++++++++------------
 llvm/test/CodeGen/X86/vector-bo-select.ll     |  48 +++-----
 llvm/test/CodeGen/X86/vector-fshl-128.ll      |   3 +-
 6 files changed, 221 insertions(+), 225 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ada1176..d654752 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13011,6 +13011,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
     return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));
 
+  // fold (sext (aext_extend_vector_inreg x)) -> (sext_extend_vector_inreg x)
+  // fold (sext (sext_extend_vector_inreg x)) -> (sext_extend_vector_inreg x)
+  if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
+      N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
+    return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT,
+                       N0.getOperand(0));
+
   // fold (sext (sext_inreg x)) -> (sext (trunc x))
   if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG) {
     SDValue N00 = N0.getOperand(0);
@@ -13269,6 +13276,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
     return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
 
+  // fold (zext (aext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x)
+  // fold (zext (zext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x)
+  if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
+      N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG)
+    return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT,
+                       N0.getOperand(0));
+
   // fold (zext (truncate x)) -> (zext x) or
   //      (zext (truncate x)) -> (truncate x)
   // This is valid when the truncated bits of x are already zero.
@@ -13535,6 +13549,14 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { N0.getOpcode() == ISD::SIGN_EXTEND) return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); + // fold (aext (aext_extend_vector_inreg x)) -> (aext_extend_vector_inreg x) + // fold (aext (zext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x) + // fold (aext (sext_extend_vector_inreg x)) -> (sext_extend_vector_inreg x) + if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) + return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); + // fold (aext (truncate (load x))) -> (aext (smaller load x)) // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n))) if (N0.getOpcode() == ISD::TRUNCATE) { diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index ac538e0..9ae1e69 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -6130,42 +6130,37 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge ; ; AVX1-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: ; AVX1: ## %bb.0: -; AVX1-NEXT: vmovaps (%rsi), %ymm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %ymm0 +; AVX1-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX1-NEXT: vmovaps 64(%rsi), %ymm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtd 48(%rdi), %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtd 32(%rdi), %xmm3, %xmm5 ; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpcmpgtd 16(%rdi), %xmm3, %xmm5 -; AVX1-NEXT: vpcmpgtd (%rdi), %xmm3, %xmm6 -; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpacksswb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpacksswb %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpgtd 80(%rdi), %xmm3, %xmm5 ; AVX1-NEXT: vpcmpgtd 64(%rdi), %xmm3, %xmm6 -; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpacksswb %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-NEXT: vpslld $31, %xmm6, %xmm6 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpcmpgtd 16(%rdi), %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtd (%rdi), %xmm3, %xmm8 +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,3],xmm8[4,5],xmm3[6,7] +; AVX1-NEXT: vpslld $31, %xmm8, %xmm8 +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3],xmm7[4,5],xmm3[6,7] ; AVX1-NEXT: vpslld $31, %xmm7, %xmm7 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-NEXT: vmaskmovps %ymm1, %ymm6, (%rdx) -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-NEXT: vmaskmovps %ymm0, %ymm7, (%rdx) +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7] ; AVX1-NEXT: vpslld $31, %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-NEXT: vmaskmovps %ymm2, %ymm1, 64(%rdx) -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, 64(%rdx) +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmaskmovps %ymm0, %ymm1, 32(%rdx) +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, 32(%rdx) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -6178,27 +6173,20 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge ; AVX2-NEXT: vpcmpgtd 32(%rdi), %ymm3, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpgtd (%rdi), %ymm3, %ymm5 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpcmpgtd 64(%rdi), %ymm3, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpacksswb %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 -; AVX2-NEXT: vpmaskmovd %ymm1, %ymm5, 32(%rdx) -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdx) -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpgtd 64(%rdi), %ymm3, %ymm5 +; AVX2-NEXT: vpcmpgtd (%rdi), %ymm3, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] +; AVX2-NEXT: vpslld $31, %ymm6, %ymm6 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm6, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, 64(%rdx) +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, 32(%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6442,8 +6430,8 @@ define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) #0 { ; ; AVX2-LABEL: undefshuffle: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,2,u,u,u,4,u,u,u,6,u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u] ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/pr45563-2.ll b/llvm/test/CodeGen/X86/pr45563-2.ll index 7f5adef..72877e1 100644 --- a/llvm/test/CodeGen/X86/pr45563-2.ll +++ b/llvm/test/CodeGen/X86/pr45563-2.ll @@ -28,22 +28,22 @@ define <9 x float> @mload_split9(<9 x i1> %mask, ptr %addr, <9 x float> %dst) { ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm3 -; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm2 -; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u] +; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm4 +; CHECK-NEXT: vblendvps %ymm1, %ymm4, %ymm0, %ymm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u] ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 ; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm1, %ymm2 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) -; CHECK-NEXT: vblendvps %xmm1, %xmm2, %xmm4, %xmm0 +; CHECK-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm0 ; CHECK-NEXT: vmovss %xmm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -70,37 +70,37 @@ define <13 x float> @mload_split13(<13 x i1> %mask, ptr %addr, <13 x float> %dst ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm3 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm4 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; 
CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],mem[0],xmm5[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],mem[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm2 -; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm6, %xmm6 +; CHECK-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm6 +; CHECK-NEXT: vblendvps %ymm1, %ymm6, %ymm0, %ymm0 +; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3 ; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm3, %ymm3 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) -; CHECK-NEXT: vblendvps %xmm1, %xmm3, %xmm5, %xmm0 +; CHECK-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, 32(%rdi) ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm0 -; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm6, %xmm0 +; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovss %xmm0, 48(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -127,39 +127,39 @@ define <14 x float> @mload_split14(<14 x i1> %mask, ptr %addr, <14 x float> %dst ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm3 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, 
%xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3] ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm2 -; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm3[u],zero,xmm3[u] +; CHECK-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 +; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm5 +; CHECK-NEXT: vblendvps %ymm1, %ymm5, %ymm0, %ymm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm2[u],zero,xmm2[u] ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 -; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm3, %ymm3 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm5 +; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm5, %ymm5 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) -; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm0 -; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm0 +; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vmovlps %xmm0, 48(%rdi) -; CHECK-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm0 +; CHECK-NEXT: vblendvps %xmm2, %xmm5, %xmm3, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -191,16 +191,17 @@ define <17 x float> @mload_split17(<17 x i1> %mask, ptr %addr, <17 x float> %dst ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: vmovd %esi, %xmm3 -; CHECK-NEXT: vpinsrb $2, %edx, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $6, %r8d, %xmm3, %xmm3 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; CHECK-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm3, %xmm3 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $8, %r9d, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $10, 
{{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 ; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; CHECK-NEXT: vmaskmovps (%rdi), %ymm3, %ymm4 @@ -221,7 +222,6 @@ define <17 x float> @mload_split17(<17 x i1> %mask, ptr %addr, <17 x float> %dst ; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm3, %ymm4 ; CHECK-NEXT: vblendvps %ymm3, %ymm4, %ymm1, %ymm1 ; CHECK-NEXT: vmovd %r10d, %xmm3 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 ; CHECK-NEXT: vmaskmovps 64(%rdi), %ymm3, %ymm4 ; CHECK-NEXT: vblendvps %xmm3, %xmm4, %xmm0, %xmm0 @@ -264,16 +264,17 @@ define <23 x float> @mload_split23(<23 x i1> %mask, ptr %addr, <23 x float> %dst ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: vmovd %esi, %xmm4 -; CHECK-NEXT: vpinsrb $2, %edx, %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $4, %ecx, %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $6, %r8d, %xmm4, %xmm4 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; CHECK-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm4, %xmm4 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 -; CHECK-NEXT: vpinsrb $8, %r9d, %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; CHECK-NEXT: vmaskmovps (%rdi), %ymm4, %ymm5 @@ -294,15 +295,16 @@ define <23 x float> @mload_split23(<23 x i1> %mask, ptr %addr, <23 x float> %dst ; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm4, %ymm5 ; CHECK-NEXT: vblendvps %ymm4, %ymm5, %ymm2, %ymm2 ; CHECK-NEXT: vmovd %r10d, %xmm4 +; CHECK-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4 ; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 ; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm4, %xmm4 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), 
%xmm4, %xmm4 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6 ; CHECK-NEXT: vmaskmovps 64(%rdi), %ymm6, %ymm6 diff --git a/llvm/test/CodeGen/X86/pr45833.ll b/llvm/test/CodeGen/X86/pr45833.ll index d32fdfb..04c342b 100644 --- a/llvm/test/CodeGen/X86/pr45833.ll +++ b/llvm/test/CodeGen/X86/pr45833.ll @@ -28,14 +28,14 @@ define void @mstore_split9(<9 x float> %value, ptr %addr, <9 x i1> %mask) { ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm4 -; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,u,u,u],zero,xmm4[u,u,u],zero,xmm4[u,u,u],zero,xmm4[u,u,u] +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[8,u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u] ; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 ; CHECK-NEXT: vmaskmovps %ymm1, %ymm4, 32(%rdi) ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) @@ -69,25 +69,25 @@ define void @mstore_split13(<13 x float> %value, ptr %addr, <13 x i1> %mask) { ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm4 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm5 -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm4 +; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm3 
= xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; CHECK-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) -; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 +; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; CHECK-NEXT: vmaskmovps %ymm1, %ymm3, 32(%rdi) +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, 32(%rdi) +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v13f32.p0(<13 x float> %value, ptr %addr, i32 4, <13 x i1>%mask) @@ -119,20 +119,20 @@ define void @mstore_split14(<14 x float> %value, ptr %addr, <14 x i1> %mask) { ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm4 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; CHECK-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm4[u],zero,xmm4[u] +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm3[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm3[u],zero,xmm3[u] ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] @@ -167,7 +167,6 @@ define void @mstore_split17(<17 x float> %value, ptr %addr, <17 x i1> %mask) { ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: vmovd %eax, %xmm3 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 ; CHECK-NEXT: vmaskmovps %ymm2, %ymm3, 64(%rdi) ; CHECK-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -185,16 +184,17 @@ define void @mstore_split17(<17 x float> %value, ptr %addr, <17 x i1> %mask) { ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 32(%rdi) ; CHECK-NEXT: vmovd %esi, %xmm1 -; CHECK-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $6, %r8d, %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) @@ -213,15 +213,15 @@ define void @mstore_split23(<23 x float> %value, ptr %addr, <23 x i1> %mask) { ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = 
mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] @@ -231,6 +231,7 @@ define void @mstore_split23(<23 x float> %value, ptr %addr, <23 x i1> %mask) { ; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] ; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 @@ -245,35 +246,35 @@ define void @mstore_split23(<23 x float> %value, ptr %addr, <23 x i1> %mask) { ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 ; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; CHECK-NEXT: vmaskmovps %ymm2, %ymm3, 32(%rdi) -; CHECK-NEXT: vmovd %esi, %xmm2 -; CHECK-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $6, %r8d, %xmm2, %xmm2 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; CHECK-NEXT: vmovd %eax, %xmm2 +; CHECK-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, %r9d, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, (%rdi) -; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 64(%rdi) +; CHECK-NEXT: vmovd %esi, %xmm1 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1 ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm1 ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: 
vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, 64(%rdi) +; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v23f32.p0(<23 x float> %value, ptr %addr, i32 4, <23 x i1>%mask) diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll index 94ba152..3e44e2c 100644 --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -202,8 +202,7 @@ define <16 x float> @fadd_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; ; AVX2-LABEL: fadd_v16f32_swap: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 @@ -301,8 +300,7 @@ define <16 x float> @fadd_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; ; AVX2-LABEL: fadd_v16f32_commute_swap: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 @@ -506,8 +504,7 @@ define <16 x float> @fsub_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 @@ -601,8 +598,7 @@ define <16 x float> @fsub_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 @@ -824,8 +820,7 @@ define <16 x float> @fmul_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; ; AVX2-LABEL: fmul_v16f32_swap: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 @@ -927,8 +922,7 @@ define <16 x float> @fmul_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; ; AVX2-LABEL: fmul_v16f32_commute_swap: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 @@ -1157,8 +1151,7 @@ define <16 x float> @fdiv_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; ; AVX2-LABEL: fdiv_v16f32_swap: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 @@ -1260,8 +1253,7 @@ define <16 x float> @fdiv_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; ; AVX2-LABEL: fdiv_v16f32_commute_swap: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = 
xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 @@ -2417,8 +2409,7 @@ define <16 x i32> @sub_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i3 ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 @@ -2512,8 +2503,7 @@ define <16 x i32> @sub_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 @@ -3375,8 +3365,7 @@ define <16 x i32> @shl_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i3 ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 @@ -3513,8 +3502,7 @@ define <16 x i32> @shl_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 @@ -4127,8 +4115,7 @@ define <16 x i32> @lshr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 @@ -4334,8 +4321,7 @@ define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 @@ -4980,8 +4966,7 @@ define <16 x i32> @ashr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 @@ -5187,8 +5172,7 @@ define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index 35689ec..2b97280 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -782,8 +782,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -- 2.7.4
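The per-element reasoning behind the new combines: a *_EXTEND_VECTOR_INREG node already produced its low elements by extending the low elements of x, so extending that result again is no wider than extending x's elements once to the final width (for the any-extend case the undefined upper bits are simply chosen to match the outer extension). Below is a minimal standalone sketch of that identity on scalars, assuming only the C++ standard library; it is an illustration, not part of the patch.

// Not part of the patch: for every i8 value, extending in two steps gives
// the same result as extending once, which is the per-element identity
// behind folding (*ext (*_extend_vector_inreg x)) into a single
// (*_extend_vector_inreg x).
#include <cassert>
#include <cstdint>

int main() {
  for (int v = -128; v <= 127; ++v) {
    int8_t x = static_cast<int8_t>(v);

    // sext(sext(x)): i8 -> i16 -> i32 matches the direct i8 -> i32 sext.
    assert(static_cast<int32_t>(static_cast<int16_t>(x)) ==
           static_cast<int32_t>(x));

    // zext(zext(x)): i8 -> i16 -> i32 matches the direct i8 -> i32 zext.
    uint8_t u = static_cast<uint8_t>(x);
    assert(static_cast<uint32_t>(static_cast<uint16_t>(u)) ==
           static_cast<uint32_t>(u));
  }
  return 0;
}

In the X86 test diffs above, this is what collapses the paired vpmovzxbw + vpmovzxwd extensions into a single vpmovzxbd of the original mask vector.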