From 485c8ce7337b11b8afe4a32a319960a5d7672986 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 29 Mar 2021 14:41:53 +0100 Subject: [PATCH] Revert "[LV] Move runtime pointer size check to LVP::plan()." This reverts commit 25fbe803d4dbcf8ff3a3a9ca161f5b9a68353ed0. This breaks a clang test which filters for the wrong remark type. --- .../Vectorize/LoopVectorizationLegality.h | 10 +++-- .../Vectorize/LoopVectorizationLegality.cpp | 33 +++++++++++++- .../Vectorize/LoopVectorizationPlanner.h | 16 +------ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 50 ++++++---------------- llvm/test/LTO/X86/diagnostic-handler-remarks.ll | 4 +- .../LoopVectorize/{X86 => }/runtime-limit.ll | 14 +++--- 6 files changed, 60 insertions(+), 67 deletions(-) rename llvm/test/Transforms/LoopVectorize/{X86 => }/runtime-limit.ll (86%) diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index bfaad81..f9a8be3 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -177,6 +177,8 @@ private: /// followed by a non-expert user. class LoopVectorizationRequirements { public: + LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {} + /// Track the 1st floating-point instruction that can not be reassociated. 
void addExactFPMathInst(Instruction *I) { if (I && !ExactFPMathInst) @@ -185,19 +187,19 @@ public: void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } + bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints); Instruction *getExactFPInst() { return ExactFPMathInst; } bool canVectorizeFPMath(const LoopVectorizeHints &Hints) const { return !ExactFPMathInst || Hints.allowReordering(); } - unsigned getNumRuntimePointerChecks() const { - return NumRuntimePointerChecks; - } - private: unsigned NumRuntimePointerChecks = 0; Instruction *ExactFPMathInst = nullptr; + + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter &ORE; }; /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 016f61a..939fbe3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -37,8 +37,11 @@ static cl::opt<bool> EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); -// TODO: Move size-based thresholds out of legality checking, make cost based -// decisions instead of hard thresholds.
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( + "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks with a " + "vectorize(enable) pragma.")); + static cl::opt<unsigned> VectorizeSCEVCheckThreshold( "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed.")); @@ -243,6 +246,32 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { } } +bool LoopVectorizationRequirements::doesNotMeet( + Function *F, Loop *L, const LoopVectorizeHints &Hints) { + const char *PassName = Hints.vectorizeAnalysisPassName(); + bool Failed = false; + + // Test if runtime memcheck thresholds are exceeded. + bool PragmaThresholdReached = + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; + bool ThresholdReached = + NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; + if ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached) { + ORE.emit([&]() { + return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", + L->getStartLoc(), + L->getHeader()) + << "loop not vectorized: cannot prove it is safe to reorder " + "memory operations"; + }); + LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Failed = true; + } + + return Failed; +} + // Return true if the inner loop \p Lp is uniform with regard to the outer loop // \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes // executing the inner loop will execute the same iterations).
This check is diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index d5306e5..70e1226 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -34,9 +34,6 @@ namespace llvm { class LoopVectorizationLegality; class LoopVectorizationCostModel; class PredicatedScalarEvolution; -class LoopVectorizationRequirements; -class LoopVectorizeHints; -class OptimizationRemarkEmitter; class VPRecipeBuilder; /// VPlan-based builder utility analogous to IRBuilder. @@ -223,12 +220,6 @@ class LoopVectorizationPlanner { PredicatedScalarEvolution &PSE; - const LoopVectorizeHints &Hints; - - LoopVectorizationRequirements &Requirements; - - OptimizationRemarkEmitter *ORE; - SmallVector<VPlanPtr, 4> VPlans; /// A builder used to construct the current plan. @@ -246,12 +237,9 @@ public: LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI, - PredicatedScalarEvolution &PSE, - const LoopVectorizeHints &Hints, - LoopVectorizationRequirements &Requirements, - OptimizationRemarkEmitter *ORE) + PredicatedScalarEvolution &PSE) : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), - PSE(PSE), Hints(Hints), Requirements(Requirements), ORE(ORE) {} + PSE(PSE) {} /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 039aa89..077b786 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -197,11 +197,6 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); -static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( - "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, - cl::desc("The maximum allowed number of runtime memory checks with a " - "vectorize(enable) pragma.")); - // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, // that predication is preferred, and this lists all options. I.e., the // vectorizer will try to fold the tail-loop (epilogue) into the vector body @@ -7779,30 +7774,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - auto SelectedVF = CM.selectVectorizationFactor(MaxVF); - - // Check if it is profitable to vectorize with runtime checks.
- unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); - if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { - bool PragmaThresholdReached = - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; - bool ThresholdReached = - NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; - if ((ThresholdReached && !Hints.allowReordering()) || - PragmaThresholdReached) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "CantReorderMemOps", - OrigLoop->getStartLoc(), - OrigLoop->getHeader()) - << "loop not vectorized: cannot prove it is safe to reorder " - "memory operations"; - }); - LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - Hints.emitRemarkWithHints(); - return VectorizationFactor::Disabled(); - } - } - return SelectedVF; + return CM.selectVectorizationFactor(MaxVF); } void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { @@ -9419,8 +9391,7 @@ static bool processLoopInVPlanNativePath( LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, - LoopVectorizationRequirements &Requirements) { + ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); @@ -9438,8 +9409,7 @@ static bool processLoopInVPlanNativePath( // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, - Requirements, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); // Get user vectorization factor.
ElementCount UserVF = Hints.getWidth(); @@ -9567,7 +9537,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { PredicatedScalarEvolution PSE(*SE, *L); // Check if it is legal to vectorize the loop. - LoopVectorizationRequirements Requirements; + LoopVectorizationRequirements Requirements(*ORE); LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, &Requirements, &Hints, DB, AC, BFI, PSI); if (!LVL.canVectorize(EnableVPlanNativePath)) { @@ -9588,7 +9558,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // pipeline. if (!L->isInnermost()) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, - ORE, BFI, PSI, Hints, Requirements); + ORE, BFI, PSI, Hints); assert(L->isInnermost() && "Inner loop expected."); @@ -9667,8 +9637,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, - Requirements, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); @@ -9689,6 +9658,13 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Identify the diagnostic messages that should be produced. 
std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; + if (Requirements.doesNotMeet(F, L, Hints)) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " + "requirements.\n"); + Hints.emitRemarkWithHints(); + return false; + } + if (VF.Width.isScalar()) { LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( diff --git a/llvm/test/LTO/X86/diagnostic-handler-remarks.ll b/llvm/test/LTO/X86/diagnostic-handler-remarks.ll index 85e4a62..f38293d 100644 --- a/llvm/test/LTO/X86/diagnostic-handler-remarks.ll +++ b/llvm/test/LTO/X86/diagnostic-handler-remarks.ll @@ -6,14 +6,14 @@ ; Confirm that there are -pass-remarks. ; RUN: llvm-lto -use-new-pm=false \ ; RUN: -pass-remarks=inline \ -; RUN: -exported-symbol _func2 -pass-remarks-missed=loop-vectorize \ +; RUN: -exported-symbol _func2 -pass-remarks-analysis=loop-vectorize \ ; RUN: -exported-symbol _main -o %t.o %t.bc 2>&1 | \ ; RUN: FileCheck %s -allow-empty -check-prefix=REMARKS ; RUN: llvm-nm %t.o | FileCheck %s -check-prefix NM ; RUN: llvm-lto -use-new-pm=false \ ; RUN: -pass-remarks=inline -use-diagnostic-handler \ -; RUN: -exported-symbol _func2 -pass-remarks-missed=loop-vectorize \ +; RUN: -exported-symbol _func2 -pass-remarks-analysis=loop-vectorize \ ; RUN: -exported-symbol _main -o %t.o %t.bc 2>&1 | \ ; RUN: FileCheck %s -allow-empty -check-prefix=REMARKS_DH ; RUN: llvm-nm %t.o | FileCheck %s -check-prefix NM diff --git a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/runtime-limit.ll similarity index 86% rename from llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll rename to llvm/test/Transforms/LoopVectorize/runtime-limit.ll index 7e93ef4..a7f692c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-limit.ll @@ -1,19 +1,17 @@ -; RUN: opt < %s -loop-vectorize -dce -instcombine
-pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE -; RUN: opt < %s -loop-vectorize -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux" - ; First loop produced diagnostic pass remark. -;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2) +;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1) ; Second loop produces diagnostic analysis remark. ;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations ; First loop produced diagnostic pass remark. -;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2) +;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1) ; Second loop produces diagnostic pass remark. -;OVERRIDE: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations +;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1) ; We are vectorizing with 6 runtime checks. 
;CHECK-LABEL: func1x6( @@ -58,7 +56,7 @@ for.end: ; preds = %for.body ;CHECK: ret ; We vectorize with 12 checks if a vectorization hint is provided. ;OVERRIDE-LABEL: func2x6( -;OVERRIDE-NOT: <4 x i32> +;OVERRIDE: <4 x i32> ;OVERRIDE: ret define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) { entry: -- 2.7.4