From b2921c7ca00afe7d1dd9194a60fb6460e94aa4a9 Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Thu, 10 Jul 2014 18:04:55 +0000 Subject: [PATCH] [DAG] Further improve the logic in DAGCombiner that folds a pair of shuffles into a single shuffle if the resulting mask is legal. This patch teaches the DAGCombiner how to fold shuffles according to the following new rules: 1. shuffle(shuffle(x, y), undef) -> x 2. shuffle(shuffle(x, y), undef) -> y 3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef) 4. shuffle(shuffle(x, y), undef) -> shuffle(y, undef) The backend avoids to combine shuffles according to rules 3. and 4. if the resulting shuffle does not have a legal mask. This is to avoid introducing illegal shuffles that are potentially expanded into a sub-optimal sequence of target specific dag nodes during vector legalization. Added test case combine-vec-shuffle-2.ll to verify that we correctly triggers the new rules when combining shuffles. llvm-svn: 212748 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 65 +++++++--- llvm/test/CodeGen/X86/combine-vec-shuffle-2.ll | 164 +++++++++++++++++++++++++ llvm/test/CodeGen/X86/widen_shuffle-1.ll | 4 +- 3 files changed, 218 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/X86/combine-vec-shuffle-2.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f6077b0..7c42e4d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10677,17 +10677,12 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { } // If this shuffle node is simply a swizzle of another shuffle node, - // and it reverses the swizzle of the previous shuffle then we can - // optimize shuffle(shuffle(x, undef), undef) -> x. + // then try to simplify it. if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && N1.getOpcode() == ISD::UNDEF) { ShuffleVectorSDNode *OtherSV = cast(N0); - // Shuffle nodes can only reverse shuffles with a single non-undef value. - if (N0.getOperand(1).getOpcode() != ISD::UNDEF) - return SDValue(); - // The incoming shuffle must be of the same type as the result of the // current shuffle. assert(OtherSV->getOperand(0).getValueType() == VT && @@ -10704,27 +10699,69 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { Idx = OtherSV->getMaskElt(Idx); Mask.push_back(Idx); } + + bool CommuteOperands = false; + if (N0.getOperand(1).getOpcode() != ISD::UNDEF) { + // To be valid, the combine shuffle mask should only reference elements + // from one of the two vectors in input to the inner shufflevector. + bool IsValidMask = true; + for (unsigned i = 0; i != NumElts && IsValidMask; ++i) + // See if the combined mask only reference undefs or elements coming + // from the first shufflevector operand. + IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] < NumElts; + + if (!IsValidMask) { + IsValidMask = true; + for (unsigned i = 0; i != NumElts && IsValidMask; ++i) + // Check that all the elements come from the second shuffle operand. + IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] >= NumElts; + CommuteOperands = IsValidMask; + } + + // Early exit if the combined shuffle mask is not valid. + if (!IsValidMask) + return SDValue(); + } + // See if this pair of shuffles can be safely folded according to either + // of the following rules: + // shuffle(shuffle(x, y), undef) -> x + // shuffle(shuffle(x, undef), undef) -> x + // shuffle(shuffle(x, y), undef) -> y bool IsIdentityMask = true; + unsigned BaseMaskIndex = CommuteOperands ? NumElts : 0; for (unsigned i = 0; i != NumElts && IsIdentityMask; ++i) { // Skip Undefs. if (Mask[i] < 0) continue; // The combined shuffle must map each index to itself. - IsIdentityMask = (unsigned)Mask[i] == i; - } - - if (IsIdentityMask) - // optimize shuffle(shuffle(x, undef), undef) -> x. + IsIdentityMask = (unsigned)Mask[i] == i + BaseMaskIndex; + } + + if (IsIdentityMask) { + if (CommuteOperands) + // optimize shuffle(shuffle(x, y), undef) -> y. + return OtherSV->getOperand(1); + + // optimize shuffle(shuffle(x, undef), undef) -> x + // optimize shuffle(shuffle(x, y), undef) -> x return OtherSV->getOperand(0); + } // It may still be beneficial to combine the two shuffles if the // resulting shuffle is legal. - // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3). - if (TLI.isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1, + if (TLI.isShuffleMaskLegal(Mask, VT)) { + if (!CommuteOperands) + // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3). + // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(x, undef, M3) + return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1, + &Mask[0]); + + // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(undef, y, M3) + return DAG.getVectorShuffle(VT, SDLoc(N), N1, N0->getOperand(1), &Mask[0]); + } } return SDValue(); diff --git a/llvm/test/CodeGen/X86/combine-vec-shuffle-2.ll b/llvm/test/CodeGen/X86/combine-vec-shuffle-2.ll new file mode 100644 index 0000000..7ab7f80 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-vec-shuffle-2.ll @@ -0,0 +1,164 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s + +; Check that DAGCombiner correctly folds the following pairs of shuffles +; using the following rules: +; 1. shuffle(shuffle(x, y), undef) -> x +; 2. shuffle(shuffle(x, y), undef) -> y +; 3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef) +; 4. shuffle(shuffle(x, y), undef) -> shuffle(undef, y) +; +; Rules 3. and 4. are used only if the resulting shuffle mask is legal. + +define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test1 +; Mask: [3,0,0,1] +; CHECK: pshufd $67 +; CHECK-NEXT: ret + + +define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test2 +; Mask: [2,0,0,3] +; CHECK: pshufd $-62 +; CHECK-NEXT: ret + + +define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test3 +; Mask: [2,0,0,3] +; CHECK: pshufd $-62 +; CHECK-NEXT: ret + + +define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test4 +; Mask: [0,0,0,1] +; CHECK: pshufd $64 +; CHECK-NEXT: ret + + +define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test5 +; Mask: [1,1] +; CHECK: movhlps +; CHECK-NEXT: ret + + +define <4 x i32> @test6(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test6 +; Mask: [2,0,0,0] +; CHECK: pshufd $2 +; CHECK-NEXT: ret + + +define <4 x i32> @test7(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test7 +; Mask: [0,2,0,2] +; CHECK: pshufd $-120 +; CHECK-NEXT: ret + + +define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test8 +; Mask: [1,0,3,0] +; CHECK: pshufd $49 +; CHECK-NEXT: ret + + +define <4 x i32> @test9(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test9 +; Mask: [1,3,0,2] +; CHECK: pshufd $-115 +; CHECK-NEXT: ret + + +define <4 x i32> @test10(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test10 +; Mask: [1,0,1,0] +; CHECK: pshufd $17 +; CHECK-NEXT: ret + + +define <4 x i32> @test11(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test11 +; Mask: [1,0,2,1] +; CHECK: pshufd $97 +; CHECK-NEXT: ret + + +define <4 x i32> @test12(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test12 +; Mask: [0,0,0,0] +; CHECK: pshufd $0 +; CHECK-NEXT: ret + + +; The following pair of shuffles is folded into vector %A. +define <4 x i32> @test13(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test13 +; CHECK-NOT: pshufd +; CHECK: ret + + +; The following pair of shuffles is folded into vector %B. +define <4 x i32> @test14(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test14 +; CHECK-NOT: pshufd +; CHECK: ret + diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll b/llvm/test/CodeGen/X86/widen_shuffle-1.ll index 803402b..a355b75 100644 --- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll +++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll @@ -33,7 +33,9 @@ entry: define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind { entry: ; CHECK-LABEL: shuf3: -; CHECK: shufps +; CHECK-NOT: movlhps +; CHECK-NOT: shufps +; CHECK: pshufd %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> -- 2.7.4