From bd5555cc3f1c0d31c96511c6e041a8f1b2f34302 Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Tue, 15 Jul 2014 13:26:28 +0000 Subject: [PATCH] [DAGCombiner] Add more rules to fold shuffles. This patch adds two new rules to the DAGCombiner: 1. shuffle (shuffle A, Undef, M0), B, M1 -> shuffle A, B, M2 2. shuffle (shuffle A, Undef, M0), A, M1 -> shuffle A, Undef, M2 We only do this if the combined shuffle is legal for the target. Example: ;; define <4 x float> @test(<4 x float> %a, <4 x float> %b) { %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2> ret <4 x float> %2 } ;; (using llc -mcpu=corei7 -march=x86-64) Before, the x86 backend generated: pshufd $120, %xmm0, %xmm0 shufps $-108, %xmm0, %xmm1 movaps %xmm1, %xmm0 Now the x86 backend generates: movsd %xmm1, %xmm0 llvm-svn: 213069 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 +++-- llvm/test/CodeGen/X86/combine-vec-shuffle-4.ll | 122 +++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/X86/combine-vec-shuffle-4.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 9d91b3e..28031bb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10780,11 +10780,13 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { } // Try to fold according to rules: - // shuffle(shuffle A, B, M0), B, M1) -> shuffle(A, B, M2) - // shuffle(shuffle A, B, M0), A, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, B, M0), B, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, B, M0), A, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, Undef, M0), A, M1) -> shuffle(A, Undef, M2) // Don't try to fold shuffles with illegal type. 
if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && - TLI.isTypeLegal(VT)) { + N1.getOpcode() != ISD::UNDEF && TLI.isTypeLegal(VT)) { ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0); // The incoming shuffle must be of the same type as the result of the @@ -10795,7 +10797,8 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { SDValue SV0 = OtherSV->getOperand(0); SDValue SV1 = OtherSV->getOperand(1); bool HasSameOp0 = N1 == SV0; - if (!HasSameOp0 && N1 != SV1) + bool IsSV1Undef = SV1.getOpcode() == ISD::UNDEF; + if (!HasSameOp0 && !IsSV1Undef && N1 != SV1) // Early exit. return SDValue(); @@ -10810,17 +10813,24 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { continue; } - if (Idx < (int)NumElts) + if (Idx < (int)NumElts) { Idx = OtherSV->getMaskElt(Idx); - else + if (IsSV1Undef && Idx >= (int) NumElts) + Idx = -1; // Propagate Undef. + } else Idx = HasSameOp0 ? Idx - NumElts : Idx; Mask.push_back(Idx); } // Avoid introducing shuffles with illegal mask. - if (TLI.isShuffleMaskLegal(Mask, VT)) + if (TLI.isShuffleMaskLegal(Mask, VT)) { + if (IsSV1Undef) + // shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, Undef, M0), A, M1) -> shuffle(A, Undef, M2) + return DAG.getVectorShuffle(VT, SDLoc(N), SV0, N1, &Mask[0]); return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]); + } } return SDValue(); diff --git a/llvm/test/CodeGen/X86/combine-vec-shuffle-4.ll b/llvm/test/CodeGen/X86/combine-vec-shuffle-4.ll new file mode 100644 index 0000000..45c624a --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-vec-shuffle-4.ll @@ -0,0 +1,122 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s + +; Verify that we fold shuffles according to rule: +; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2) + +define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x 
i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test1 +; Mask: [4,5,2,3] +; CHECK: movsd +; CHECK: ret + +define <4 x float> @test2(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test2 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x float> @test3(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test3 +; Mask: [0,1,4,u] +; CHECK: movlhps +; CHECK: ret + +define <4 x float> @test4(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; FIXME: this should be lowered as a single movhlps. However, the backend +; wrongly thinks that shuffle mask [6,7,2,3] is not legal. Therefore, we +; end up with the sub-optimal sequence 'movhlps, palignr'. 
+; CHECK-LABEL: test4 +; Mask: [6,7,2,3] +; CHECK: movhlps +; CHECK: palignr $8 +; CHECK: ret + +define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test5 +; Mask: [0,1,6,7] +; CHECK: blendps $12 +; CHECK: ret + +; Verify that we fold shuffles according to rule: +; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) + +define <4 x float> @test6(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test6 +; Mask: [0,1,2,3] +; CHECK-NOT: pshufd +; CHECK-NOT: shufps +; CHECK-NOT: movlhps +; CHECK: ret + +define <4 x float> @test7(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test7 +; Mask: [0,1,0,1] +; CHECK-NOT: pshufd +; CHECK-NOT: shufps +; CHECK: movlhps +; CHECK-NEXT: ret + +define <4 x float> @test8(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test8 +; Mask: [0,1,0,u] +; CHECK-NOT: pshufd +; CHECK-NOT: shufps +; CHECK: movlhps +; CHECK-NEXT: ret + +define <4 x float> @test9(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test9 +; Mask: [2,3,2,3] +; CHECK-NOT: movlhps +; CHECK-NOT: palignr +; CHECK: movhlps +; CHECK-NEXT: ret + +define <4 x float> @test10(<4 x float> %a) { + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; 
CHECK-LABEL: test10 +; Mask: [0,1,2,3] +; CHECK-NOT: pshufd +; CHECK-NOT: shufps +; CHECK-NOT: movlhps +; CHECK: ret + -- 2.7.4