From e07a982693353aad85590301357035e9d583bbbc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 28 Jun 2020 13:00:15 +0100
Subject: [PATCH] [X86] combineScalarToVector - handle (v2i64
 scalar_to_vector(aextload)) as well as (v2i64 scalar_to_vector(aext))

We already fold (v2i64 scalar_to_vector(aext)) -> (v2i64 bitcast(v4i32
scalar_to_vector(x))); this adds support for similar aextload cases and
also handles v2f64 cases that wrap the i64 extension behind bitcasts.
An IR sketch of the pattern is included after the patch.

Fixes the remaining issue with PR39016
---
 llvm/lib/Target/X86/X86ISelLowering.cpp            | 26 ++++++++++++++++------
 .../CodeGen/X86/bitcast-int-to-vector-bool-sext.ll | 10 ++++-----
 .../CodeGen/X86/bitcast-int-to-vector-bool-zext.ll | 10 ++++-----
 llvm/test/CodeGen/X86/dagcombine-cse.ll            |  2 --
 llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll     |  6 ++---
 llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll    |  4 ++--
 6 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bef29f0..d943d75 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48038,13 +48038,25 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
                        Src.getOperand(1));
 
   // Reduce v2i64 to v4i32 if we don't need the upper bits.
-  // TODO: Move to DAGCombine?
-  if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
-      Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
-      Src.getOperand(0).getScalarValueSizeInBits() <= 32)
-    return DAG.getBitcast(
-        VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
-                        DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
+  // TODO: Move to DAGCombine/SimplifyDemandedBits?
+  if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+    auto IsAnyExt64 = [](SDValue Op) {
+      if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
+        return SDValue();
+      if (Op.getOpcode() == ISD::ANY_EXTEND &&
+          Op.getOperand(0).getScalarValueSizeInBits() <= 32)
+        return Op.getOperand(0);
+      if (auto *Ld = dyn_cast<LoadSDNode>(Op))
+        if (Ld->getExtensionType() == ISD::EXTLOAD &&
+            Ld->getMemoryVT().getScalarSizeInBits() <= 32)
+          return Op;
+      return SDValue();
+    };
+    if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
+      return DAG.getBitcast(
+          VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
+                          DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
+  }
 
   // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
   if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index cb2af23..f380516 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -204,9 +204,8 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
 ;
 ; AVX1-LABEL: ext_i4_4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT:    vmovq %rdi, %xmm0
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovd %edi, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
@@ -432,9 +431,8 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
 ;
 ; AVX1-LABEL: ext_i8_8i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT:    vmovq %rdi, %xmm0
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovd %edi, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
 ; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 0f76c35..20135e8 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -260,9 +260,8 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
 ;
 ; AVX1-LABEL: ext_i4_4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT:    vmovq %rdi, %xmm0
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovd %edi, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
@@ -555,9 +554,8 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
 ;
 ; AVX1-LABEL: ext_i8_8i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT:    vmovq %rdi, %xmm0
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovd %edi, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
 ; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm0
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll
index 91a86ba..9ca88d2 100644
--- a/llvm/test/CodeGen/X86/dagcombine-cse.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll
@@ -18,8 +18,6 @@ define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) nounwind {
 ; X64-NEXT:    addl %edx, %esi
 ; X64-NEXT:    movslq %esi, %rax
 ; X64-NEXT:    movl (%rdi,%rax), %eax
-; X64-NEXT:    movq %rax, %xmm0
-; X64-NEXT:    movd %xmm0, %eax
 ; X64-NEXT:    retq
 entry:
 	%tmp7 = mul i32 %idxY, %ref_frame_stride		; <i32> [#uses=2]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 1de6103..04064f1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -3570,7 +3570,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(i16* %ptr) {
 ; SSE-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movzwl (%rdi), %eax
-; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    retq
@@ -3578,7 +3578,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(i16* %ptr) {
 ; AVX1-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    movzwl (%rdi), %eax
-; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vmovd %eax, %xmm0
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    retq
@@ -3591,7 +3591,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(i16* %ptr) {
 ; XOPAVX1-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    movzwl (%rdi), %eax
-; XOPAVX1-NEXT:    vmovq %rax, %xmm0
+; XOPAVX1-NEXT:    vmovd %eax, %xmm0
 ; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; XOPAVX1-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 6bbb895..635d94b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -7683,7 +7683,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
 ; AVX1-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    movzwl (%rdi), %eax
-; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vmovd %eax, %xmm0
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -7697,7 +7697,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
 ; XOPAVX1-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    movzwl (%rdi), %eax
-; XOPAVX1-NEXT:    vmovq %rax, %xmm0
+; XOPAVX1-NEXT:    vmovd %eax, %xmm0
 ; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-- 
2.7.4
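
For illustration (not part of the original commit; the function name below is
hypothetical, and the IR is modeled on the insert_dup_mem_* tests touched
above), this is the shape of code the combine now narrows. The splat only
reads the low 16 bits of the inserted i64 element, so the sext relaxes to a
32-bit extending load in the DAG, the (v2i64 scalar_to_vector (i64 aextload))
is rebuilt as a bitcast of (v4i32 scalar_to_vector), and codegen drops from
movq %rax, %xmm0 to movd %eax, %xmm0, as seen in the test diffs:

  ; Hypothetical reproducer: the upper bits of the i64 extension are dead,
  ; because the shufflevector only demands the low 16 bits of element 0.
  define <8 x i16> @splat_lo16_from_load(i16* %ptr) {
    %ld = load i16, i16* %ptr, align 2
    %ext = sext i16 %ld to i64
    %ins = insertelement <2 x i64> undef, i64 %ext, i32 0
    %cast = bitcast <2 x i64> %ins to <8 x i16>
    %splat = shufflevector <8 x i16> %cast, <8 x i16> undef, <8 x i32> zeroinitializer
    ret <8 x i16> %splat
  }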