From cbcdd747e85b8d33b821d94d8114b971f31fd0d2 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 21 Feb 2023 10:20:41 -0800 Subject: [PATCH] [SLP]Do not swap not counted extractelements. No need to swap extractelements, which were not excluded from the list during cost analysis. Swapping them leads to incorrect cost calculation and makes vector code appear more profitable than it actually is. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../AArch64/extractelements-to-shuffle.ll | 38 ++++++++++++---------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b0c0c5f..8d22823 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -642,7 +642,7 @@ tryToGatherExtractElements(SmallVectorImpl &VL, !isa(EI->getIndexOperand()) || is_contained(UndefVectorExtracts, I)) continue; - if (Mask[I] == UndefMaskElem) + if (Mask[I] == UndefMaskElem && !isa(GatheredExtracts[I])) std::swap(VL[I], GatheredExtracts[I]); } return Res; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll index db68c8a..61aa911 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -4,24 +4,26 @@ define void @test(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) { ; CHECK-LABEL: @test( ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1:%.*]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP0:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> [[TMP8]], 
i64 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP9]], i64 [[TMP6]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> , i64 [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32> -; CHECK-NEXT: br label [[TMP15:%.*]] -; CHECK: 15: -; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i32> [ [[TMP20:%.*]], [[TMP15]] ], [ [[TMP14]], [[TMP3:%.*]] ] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP16]], <4 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i32> zeroinitializer, [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> zeroinitializer, [[TMP17]] -; CHECK-NEXT: [[TMP20]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> [[TMP19]], <4 x i32> -; CHECK-NEXT: br label [[TMP15]] +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP0:%.*]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 +; CHECK-NEXT: br label [[TMP17:%.*]] +; CHECK: 17: +; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[TMP22:%.*]], [[TMP17]] ], [ [[TMP6]], [[TMP3:%.*]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP9]], [[TMP3]] ] +; CHECK-NEXT: [[TMP20:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP13]], [[TMP3]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi i32 [ 0, 
[[TMP17]] ], [ [[TMP16]], [[TMP3]] ] +; CHECK-NEXT: [[TMP22]] = or i32 [[TMP18]], 0 +; CHECK-NEXT: br label [[TMP17]] ; %4 = extractelement <2 x i64> %1, i64 0 %5 = or i64 %4, 0 -- 2.7.4