From 6f9e1ea0efb93388c9301e672b7a73e8216ffa44 Mon Sep 17 00:00:00 2001 From: David Green Date: Sun, 8 May 2022 10:32:41 +0100 Subject: [PATCH] [VectorCombine] Attempt to fold select shuffles from reductions Given a commutative reduction leading from a shuffle, the order of the lanes on the shuffle are not important for the result. This means we can reorder the shuffle to something simpler, which we try shuffling the first vector lanes first. This was D123494. The new shuffle may not be profitable though, and if it is not we can try the folding of select shuffles from D123911. This, with some adjustment as the output lane ordering is now unimportant, can allow the final shuffle to simplify given the inputs to the patterns from D123911. Where as each transformation on their own are not profitable, the combination is. We can only support a single shuffle when called from reductions, but we are able to sort the ReconstructMask, potentially allowing it to simplify to an identity or concat mask. Differential Revision: https://reviews.llvm.org/D125086 --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 16 +++- .../VectorCombine/AArch64/select-shuffle.ll | 88 ++++++++++++---------- 2 files changed, 62 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index cf46090..c206a5c 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -104,7 +104,7 @@ private: bool scalarizeLoadExtract(Instruction &I); bool foldShuffleOfBinops(Instruction &I); bool foldShuffleFromReductions(Instruction &I); - bool foldSelectShuffle(Instruction &I); + bool foldSelectShuffle(Instruction &I, bool FromReduction = false); void replaceValue(Value &Old, Value &New) { Old.replaceAllUsesWith(&New); @@ -1223,7 +1223,9 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) { replaceValue(*Shuffle, *NewShuffle); } - return false; + // See if we can re-use foldSelectShuffle, getting it to reduce the size of + // the shuffle into a nicer order, as it can ignore the order of the shuffles. + return foldSelectShuffle(*Shuffle, true); } /// This method looks for groups of shuffles acting on binops, of the form: @@ -1236,7 +1238,7 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) { /// the shuffle to a form where only parts of a and b need to be computed. On /// architectures with no obvious "select" shuffle, this can reduce the total /// number of operations if the target reports them as cheaper. -bool VectorCombine::foldSelectShuffle(Instruction &I) { +bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { auto *SVI = dyn_cast(&I); auto *VT = dyn_cast(I.getType()); if (!SVI || !VT) @@ -1275,6 +1277,10 @@ bool VectorCombine::foldSelectShuffle(Instruction &I) { }; if (!collectShuffles(Op0) || !collectShuffles(Op1)) return false; + // From a reduction, we need to be processing a single shuffle, otherwise the + // other uses will not be lane-invariant. + if (FromReduction && Shuffles.size() > 1) + return false; // For each of the output shuffles, we try to sort all the first vector // elements to the beginning, followed by the second array elements at the @@ -1328,6 +1334,10 @@ bool VectorCombine::foldSelectShuffle(Instruction &I) { } } + // For reductions, we know that the lane ordering out doesn't alter the + // result. In-order can help simplify the shuffle away. + if (FromReduction) + sort(ReconstructMask); ReconstructMasks.push_back(ReconstructMask); } diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll index 3d6507a..d6eab5a 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll @@ -22,11 +22,13 @@ define <16 x i32> @test1(<16 x i32> %x, <16 x i32> %y) { define i32 @test1_reduce(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: @test1_reduce( -; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> -; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]] -; CHECK-NEXT: [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]] -; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[S3]]) ; CHECK-NEXT: ret i32 [[R]] ; @@ -512,23 +514,27 @@ define dso_local i32 @full(i8* nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> ; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP62]], [[TMP64]] ; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP65]] -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> -; CHECK-NEXT: [[REORDER2:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = add nsw <16 x i32> [[TMP68]], [[REORDER2]] -; CHECK-NEXT: [[TMP70:%.*]] = sub nsw <16 x i32> [[TMP68]], [[REORDER2]] -; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32> -; CHECK-NEXT: [[REORDER3:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32> -; CHECK-NEXT: [[TMP72:%.*]] = add nsw <16 x i32> [[TMP71]], [[REORDER3]] -; CHECK-NEXT: [[TMP73:%.*]] = sub nsw <16 x i32> [[TMP71]], [[REORDER3]] -; CHECK-NEXT: [[TMP74:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> -; CHECK-NEXT: [[TMP75:%.*]] = lshr <16 x i32> [[TMP74]], -; CHECK-NEXT: [[TMP76:%.*]] = and <16 x i32> [[TMP75]], -; CHECK-NEXT: [[TMP77:%.*]] = mul nuw <16 x i32> [[TMP76]], -; CHECK-NEXT: [[TMP78:%.*]] = add <16 x i32> [[TMP77]], [[TMP74]] -; CHECK-NEXT: [[TMP79:%.*]] = xor <16 x i32> [[TMP78]], [[TMP77]] -; CHECK-NEXT: [[TMP80:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP79]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP80]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP80]], 16 +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP72:%.*]] = add nsw <16 x i32> [[TMP68]], [[TMP70]] +; CHECK-NEXT: [[TMP73:%.*]] = sub nsw <16 x i32> [[TMP69]], [[TMP71]] +; CHECK-NEXT: [[TMP74:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> +; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> +; CHECK-NEXT: [[TMP77:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> +; CHECK-NEXT: [[TMP78:%.*]] = add nsw <16 x i32> [[TMP74]], [[TMP76]] +; CHECK-NEXT: [[TMP79:%.*]] = sub nsw <16 x i32> [[TMP75]], [[TMP77]] +; CHECK-NEXT: [[TMP80:%.*]] = shufflevector <16 x i32> [[TMP78]], <16 x i32> [[TMP79]], <16 x i32> +; CHECK-NEXT: [[TMP81:%.*]] = lshr <16 x i32> [[TMP80]], +; CHECK-NEXT: [[TMP82:%.*]] = and <16 x i32> [[TMP81]], +; CHECK-NEXT: [[TMP83:%.*]] = mul nuw <16 x i32> [[TMP82]], +; CHECK-NEXT: [[TMP84:%.*]] = add <16 x i32> [[TMP83]], [[TMP80]] +; CHECK-NEXT: [[TMP85:%.*]] = xor <16 x i32> [[TMP84]], [[TMP83]] +; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP85]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP86]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP86]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] @@ -719,23 +725,27 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi ; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> ; CHECK-NEXT: [[TMP58:%.*]] = add nsw <16 x i32> [[TMP54]], [[TMP56]] ; CHECK-NEXT: [[TMP59:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP57]] -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> -; CHECK-NEXT: [[REORDER192:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP60]], [[REORDER192]] -; CHECK-NEXT: [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP60]], [[REORDER192]] -; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> -; CHECK-NEXT: [[REORDER193:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> -; CHECK-NEXT: [[TMP64:%.*]] = add nsw <16 x i32> [[TMP63]], [[REORDER193]] -; CHECK-NEXT: [[TMP65:%.*]] = sub nsw <16 x i32> [[TMP63]], [[REORDER193]] -; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> -; CHECK-NEXT: [[TMP67:%.*]] = lshr <16 x i32> [[TMP66]], -; CHECK-NEXT: [[TMP68:%.*]] = and <16 x i32> [[TMP67]], -; CHECK-NEXT: [[TMP69:%.*]] = mul nuw <16 x i32> [[TMP68]], -; CHECK-NEXT: [[TMP70:%.*]] = add <16 x i32> [[TMP69]], [[TMP66]] -; CHECK-NEXT: [[TMP71:%.*]] = xor <16 x i32> [[TMP70]], [[TMP69]] -; CHECK-NEXT: [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP71]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP72]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP72]], 16 +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = add nsw <16 x i32> [[TMP60]], [[TMP62]] +; CHECK-NEXT: [[TMP65:%.*]] = sub nsw <16 x i32> [[TMP61]], [[TMP63]] +; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> +; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> +; CHECK-NEXT: [[TMP70:%.*]] = add nsw <16 x i32> [[TMP66]], [[TMP68]] +; CHECK-NEXT: [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP67]], [[TMP69]] +; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = lshr <16 x i32> [[TMP72]], +; CHECK-NEXT: [[TMP74:%.*]] = and <16 x i32> [[TMP73]], +; CHECK-NEXT: [[TMP75:%.*]] = mul nuw <16 x i32> [[TMP74]], +; CHECK-NEXT: [[TMP76:%.*]] = add <16 x i32> [[TMP75]], [[TMP72]] +; CHECK-NEXT: [[TMP77:%.*]] = xor <16 x i32> [[TMP76]], [[TMP75]] +; CHECK-NEXT: [[TMP78:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP77]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP78]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP78]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] -- 2.7.4