From: Simon Pilgrim
Date: Fri, 29 Jul 2016 21:05:10 +0000 (+0000)
Subject: [X86][AVX] Fix VBROADCASTF128 selection bug (PR28770)
X-Git-Tag: llvmorg-4.0.0-rc1~13777
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f107ffa8f027c5dc18a075f254d1a33243ada641;p=platform%2Fupstream%2Fllvm.git

[X86][AVX] Fix VBROADCASTF128 selection bug (PR28770)

Support for lowering to VBROADCASTF128 etc. in D22460 was not correctly
ensuring that the only users of the 128-bit vector load were the
insertions of the vector into the lower/upper subvectors.

llvm-svn: 277214
---

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4ab79db..01389e7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4750,6 +4750,13 @@ static SDValue peekThroughBitcasts(SDValue V) {
   return V;
 }
 
+static SDValue peekThroughOneUseBitcasts(SDValue V) {
+  while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
+         V.getOperand(0).hasOneUse())
+    V = V.getOperand(0);
+  return V;
+}
+
 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                         unsigned MaskEltSizeInBits,
                                         SmallVectorImpl<uint64_t> &RawMask) {
@@ -12785,6 +12792,14 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
   return SDValue();
 }
 
+static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
+    if (llvm::all_of(ValidUsers,
+                     [&I](SDValue V) { return V.getNode() != *I; }))
+      return false;
+  return true;
+}
+
 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
 // simple superregister reference or explicit instructions to insert
 // the upper bits of a vector.
@@ -12818,23 +12833,26 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
       OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
     auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
     if (Idx2 && Idx2->getZExtValue() == 0) {
+      SDValue SubVec2 = Vec.getOperand(1);
       // If needed, look through bitcasts to get to the load.
-      SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1));
-      if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
+      if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
         bool Fast;
         unsigned Alignment = FirstLd->getAlignment();
         unsigned AS = FirstLd->getAddressSpace();
         const X86TargetLowering *TLI = Subtarget.getTargetLowering();
         if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), OpVT,
                                     AS, Alignment, &Fast) && Fast) {
-          SDValue Ops[] = { SubVec2, SubVec };
+          SDValue Ops[] = {SubVec2, SubVec};
           if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
             return Ld;
         }
-
-        // If lower/upper loads are the same then lower to a VBROADCASTF128.
-        if (SubVec2 == peekThroughBitcasts(SubVec))
-          return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
+      }
+      // If lower/upper loads are the same and the only users of the load, then
+      // lower to a VBROADCASTF128/VBROADCASTI128/etc.
+      if (SubVec2 == SubVec &&
+          isa<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)) &&
+          areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
+        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
       }
     }
   }
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll b/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll
index c2646e5..975d499 100644
--- a/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll
@@ -97,3 +97,135 @@ define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
 }
+
+define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
+; X32-LABEL: test_broadcast_2f64_4f64_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X32-NEXT: vmovaps %xmm1, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_2f64_4f64_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X64-NEXT: vmovaps %xmm1, (%rsi)
+; X64-NEXT: retq
+ %1 = load <2 x double>, <2 x double>* %p0
+ %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ store <2 x double> %1, <2 x double>* %p1
+ ret <4 x double> %2
+}
+
+define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
+; X32-LABEL: test_broadcast_2i64_4i64_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X32-NEXT: vmovaps %xmm1, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_2i64_4i64_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X64-NEXT: vmovaps %xmm1, (%rsi)
+; X64-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64>* %p0
+ %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ store <2 x i64> %1, <2 x i64>* %p1
+ ret <4 x i64> %2
+}
+
+define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
+; X32-LABEL: test_broadcast_4f32_8f32_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X32-NEXT: vmovaps %xmm1, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4f32_8f32_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X64-NEXT: vmovaps %xmm1, (%rsi)
+; X64-NEXT: retq
+ %1 = load <4 x float>, <4 x float>* %p0
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ store <4 x float> %1, <4 x float>* %p1
+ ret <8 x float> %2
+}
+
+define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
+; X32-LABEL: test_broadcast_4i32_8i32_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X32-NEXT: vmovaps %xmm1, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4i32_8i32_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X64-NEXT: vmovaps %xmm1, (%rsi)
+; X64-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32>* %p0
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ store <4 x i32> %1, <4 x i32>* %p1
+ ret <8 x i32> %2
+}
+
+define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
+; X32-LABEL: test_broadcast_8i16_16i16_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X32-NEXT: vmovaps %xmm1, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_8i16_16i16_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X64-NEXT: vmovaps %xmm1, (%rsi)
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16> *%p0
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i16> %1, <8 x i16>* %p1
+ ret <16 x i16> %2
+}
+
+define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
+; X32-LABEL: test_broadcast_16i8_32i8_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X32-NEXT: vmovaps %xmm1, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_16i8_32i8_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X64-NEXT: vmovaps %xmm1, (%rsi)
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8> *%p0
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ store <16 x i8> %1, <16 x i8>* %p1
+ ret <32 x i8> %2
+}
diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll
index 308c626..7e1fff8 100644
--- a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll
@@ -115,3 +115,153 @@ define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
  %3 = add <32 x i8> %2, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32>
  ret <32 x i8> %3
 }
+
+define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
+; X32-LABEL: test_broadcast_2f64_4f64_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovapd (%ecx), %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X32-NEXT: vaddpd LCPI6_0, %ymm0, %ymm0
+; X32-NEXT: vmovapd %xmm1, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_2f64_4f64_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovapd (%rdi), %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X64-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovapd %xmm1, (%rsi)
+; X64-NEXT: retq
+ %1 = load <2 x double>, <2 x double>* %p0
+ %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %3 = fadd <4 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0>
+ store <2 x double> %1, <2 x double>* %p1
+ ret <4 x double> %3
+}
+
+define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
+; X32-LABEL: test_broadcast_2i64_4i64_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovdqa (%ecx), %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
+; X32-NEXT: vpaddq LCPI7_0, %ymm0, %ymm0
+; X32-NEXT: vmovdqa %xmm1, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_2i64_4i64_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovdqa (%rdi), %xmm1
+; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
+; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovdqa %xmm1, (%rsi)
+; X64-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64>* %p0
+ %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %3 = add <4 x i64> %2, <i64 1, i64 2, i64 3, i64 4>
+ store <2 x i64> %1, <2 x i64>* %p1
+ ret <4 x i64> %3
+}
+
+define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
+; X32-LABEL: test_broadcast_4f32_8f32_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X32-NEXT: vaddps LCPI8_0, %ymm0, %ymm0
+; X32-NEXT: vmovaps %xmm1, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4f32_8f32_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
+; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovaps %xmm1, (%rsi)
+; X64-NEXT: retq
+ %1 = load <4 x float>, <4 x float>* %p0
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %3 = fadd <8 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
+ store <4 x float> %1, <4 x float>* %p1
+ ret <8 x float> %3
+}
+
+define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
+; X32-LABEL: test_broadcast_4i32_8i32_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovdqa (%ecx), %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
+; X32-NEXT: vpaddd LCPI9_0, %ymm0, %ymm0
+; X32-NEXT: vmovdqa %xmm1, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4i32_8i32_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovdqa (%rdi), %xmm1
+; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
+; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovdqa %xmm1, (%rsi)
+; X64-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32>* %p0
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %3 = add <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ store <4 x i32> %1, <4 x i32>* %p1
+ ret <8 x i32> %3
+}
+
+define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
+; X32-LABEL: test_broadcast_8i16_16i16_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovdqa (%ecx), %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
+; X32-NEXT: vpaddw LCPI10_0, %ymm0, %ymm0
+; X32-NEXT: vmovdqa %xmm1, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_8i16_16i16_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovdqa (%rdi), %xmm1
+; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
+; X64-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovdqa %xmm1, (%rsi)
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16> *%p0
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = add <16 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16>
+ store <8 x i16> %1, <8 x i16>* %p1
+ ret <16 x i16> %3
+}
+
+define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
+; X32-LABEL: test_broadcast_16i8_32i8_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovdqa (%ecx), %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
+; X32-NEXT: vpaddb LCPI11_0, %ymm0, %ymm0
+; X32-NEXT: vmovdqa %xmm1, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_16i8_32i8_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovdqa (%rdi), %xmm1
+; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
+; X64-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovdqa %xmm1, (%rsi)
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8> *%p0
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %3 = add <32 x i8> %2, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32>
+ store <16 x i8> %1, <16 x i8>* %p1
+ ret <32 x i8> %3
+}
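
Note for readers of the patch: the new guard only forms the SUBV_BROADCAST when the two INSERT_SUBVECTOR nodes are the sole users of the 128-bit load, which is exactly what the *_reuse tests above exercise (the loaded value is also stored, so the load must be kept). The following standalone C++ sketch models that check in isolation; the Node struct, its Users list, and the driver in main() are illustrative stand-ins for SDNode and its use list, not LLVM's actual API.

// Standalone sketch of the "only users" check added by this patch. Node and
// its Users vector are hypothetical stand-ins for SDNode/SDValue.
#include <algorithm>
#include <cassert>
#include <vector>

struct Node {
  const char *Name;
  std::vector<Node *> Users; // nodes that consume this node's result
};

// True only if every user of N is listed in ValidUsers, mirroring the intent
// of areOnlyUsersOf(SDNode *, ArrayRef<SDValue>) in X86ISelLowering.cpp.
static bool areOnlyUsersOf(const Node *N,
                           const std::vector<Node *> &ValidUsers) {
  return std::all_of(N->Users.begin(), N->Users.end(), [&](Node *U) {
    return std::find(ValidUsers.begin(), ValidUsers.end(), U) !=
           ValidUsers.end();
  });
}

int main() {
  // Pattern being matched: a 128-bit load inserted into the low half (Vec)
  // and the high half (Op) of a 256-bit vector.
  Node Load{"load128", {}};
  Node Vec{"insert_lo", {}};
  Node Op{"insert_hi", {}};

  // The load feeds only the two insertions: broadcasting is safe.
  Load.Users = {&Vec, &Op};
  assert(areOnlyUsersOf(&Load, {&Op, &Vec}));

  // PR28770 case: the load has an extra user (the store in the *_reuse
  // tests), so no VBROADCASTF128/VBROADCASTI128 may be formed.
  Node Store{"store128", {}};
  Load.Users = {&Vec, &Op, &Store};
  assert(!areOnlyUsersOf(&Load, {&Op, &Vec}));
  return 0;
}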