From ede6d608f4a92ef508b91f0a37f5b9dc1de45f47 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 22 Nov 2022 09:28:55 -0500
Subject: [PATCH] [VectorCombine] switch on opcode to compile faster

This follows 87debdadaf18 to further eliminate wasting time calling
helper functions only to early return to the main run loop.

Once again, this results in significant savings based on experimental
data:
https://llvm-compile-time-tracker.com/compare.php?from=01023bfcd33f922ed8c934ce563e54abe8bfe246&to=3dce4f70b73e48ccb045decb634c185e6b4c67d5&stat=instructions:u

This is NFCI other than making the pass faster. The total cost of
VectorCombine runs in an -O3 build appears to be well under 0.1% of
compile-time now, so there's not much left to do AFAICT.

There's a TODO about making the code cleaner, but it probably doesn't
change timing much. I didn't include those changes here because it
requires updating much more code.
---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 89 +++++++++++++++++++------
 1 file changed, 68 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 7769aaf..fe80158 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -85,6 +85,9 @@ private:
 
   InstructionWorklist Worklist;
 
+  // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
+  // parameter. That should be updated to specific sub-classes because the
+  // run loop was changed to dispatch on opcode.
   bool vectorizeLoadInsert(Instruction &I);
   bool widenSubvectorLoad(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
@@ -271,8 +274,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
 /// This removes a shuffle in IR and may allow combining of other loaded values.
 bool VectorCombine::widenSubvectorLoad(Instruction &I) {
   // Match subvector insert of fixed vector.
-  auto *Shuf = dyn_cast<ShuffleVectorInst>(&I);
-  if (!Shuf || !Shuf->isIdentityWithPadding())
+  auto *Shuf = cast<ShuffleVectorInst>(&I);
+  if (!Shuf->isIdentityWithPadding())
     return false;
 
   // Allow a non-canonical shuffle mask that is choosing elements from op1.
@@ -1061,8 +1064,8 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment,
 //   %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
 //   store i32 %b, i32* %1
 bool VectorCombine::foldSingleElementStore(Instruction &I) {
-  StoreInst *SI = dyn_cast<StoreInst>(&I);
-  if (!SI || !SI->isSimple() ||
+  auto *SI = cast<StoreInst>(&I);
+  if (!SI->isSimple() ||
       !isa<FixedVectorType>(SI->getValueOperand()->getType()))
     return false;
 
@@ -1371,10 +1374,7 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
 /// architectures with no obvious "select" shuffle, this can reduce the total
 /// number of operations if the target reports them as cheaper.
 bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
-  auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
-  if (!SVI)
-    return false;
-
+  auto *SVI = cast<ShuffleVectorInst>(&I);
   auto *VT = cast<FixedVectorType>(I.getType());
   auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
   auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
@@ -1698,26 +1698,73 @@ bool VectorCombine::run() {
   bool MadeChange = false;
   auto FoldInst = [this, &MadeChange](Instruction &I) {
     Builder.SetInsertPoint(&I);
-    if (!TryEarlyFoldsOnly) {
-      if (isa<FixedVectorType>(I.getType())) {
+    bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
+    auto Opcode = I.getOpcode();
+
+    // These folds should be beneficial regardless of when this pass is run
+    // in the optimization pipeline.
+    // The type checking is for run-time efficiency. We can avoid wasting time
+    // dispatching to folding functions if there's no chance of matching.
+    if (IsFixedVectorType) {
+      switch (Opcode) {
+      case Instruction::InsertElement:
+        MadeChange |= vectorizeLoadInsert(I);
+        break;
+      case Instruction::ShuffleVector:
+        MadeChange |= widenSubvectorLoad(I);
+        break;
+      case Instruction::Load:
+        MadeChange |= scalarizeLoadExtract(I);
+        break;
+      default:
+        MadeChange |= scalarizeBinopOrCmp(I);
+        break;
+      }
+    }
+    if (Opcode == Instruction::Store)
+      MadeChange |= foldSingleElementStore(I);
+
+
+    // If this is an early pipeline invocation of this pass, we are done.
+    if (TryEarlyFoldsOnly)
+      return;
+
+    // Otherwise, try folds that improve codegen but may interfere with
+    // early IR canonicalizations.
+    // The type checking is for run-time efficiency. We can avoid wasting time
+    // dispatching to folding functions if there's no chance of matching.
+    if (IsFixedVectorType) {
+      switch (Opcode) {
+      case Instruction::InsertElement:
         MadeChange |= foldInsExtFNeg(I);
-        MadeChange |= foldBitcastShuf(I);
+        break;
+      case Instruction::ShuffleVector:
         MadeChange |= foldShuffleOfBinops(I);
         MadeChange |= foldSelectShuffle(I);
-      } else {
-        MadeChange |= foldExtractExtract(I);
-        MadeChange |= foldExtractedCmps(I);
+        break;
+      case Instruction::BitCast:
+        MadeChange |= foldBitcastShuf(I);
+        break;
+      }
+    } else {
+      switch (Opcode) {
+      case Instruction::Call:
         MadeChange |= foldShuffleFromReductions(I);
+        break;
+      case Instruction::ICmp:
+      case Instruction::FCmp:
+        MadeChange |= foldExtractExtract(I);
+        break;
+      default:
+        if (I.isBinaryOp()) {
+          MadeChange |= foldExtractExtract(I);
+          MadeChange |= foldExtractedCmps(I);
+        }
+        break;
       }
     }
-    if (isa<FixedVectorType>(I.getType())) {
-      MadeChange |= vectorizeLoadInsert(I);
-      MadeChange |= widenSubvectorLoad(I);
-      MadeChange |= scalarizeBinopOrCmp(I);
-      MadeChange |= scalarizeLoadExtract(I);
-    }
-    MadeChange |= foldSingleElementStore(I);
   };
+
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
     if (!DT.isReachableFromEntry(&BB))
-- 
2.7.4
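
Illustration only, not part of the patch: a minimal, self-contained C++ sketch
of the dispatch pattern the run loop adopts above. All names here (Opcode,
Inst, foldInsert, foldShuffle, foldLoad, foldOneInst) are hypothetical
stand-ins rather than LLVM APIs. The driver does the cheap pre-check and a
single switch on an opcode-like tag, so it only calls fold helpers that can
possibly match, instead of letting every helper dyn_cast the instruction and
early-return.

// Hypothetical sketch of opcode-switch dispatch; not LLVM code.
#include <cstdio>

enum class Opcode { InsertElement, ShuffleVector, Load, Store, Other };

struct Inst {
  Opcode Op;
  bool IsFixedVectorTy; // stand-in for isa<FixedVectorType>(I.getType())
};

// Hypothetical fold helpers; each may assume the driver already matched the
// opcode, mirroring the dyn_cast<>() -> cast<>() changes in the patch.
static bool foldInsert(Inst &) { std::puts("try insert fold"); return false; }
static bool foldShuffle(Inst &) { std::puts("try shuffle fold"); return false; }
static bool foldLoad(Inst &) { std::puts("try load fold"); return false; }

static bool foldOneInst(Inst &I) {
  bool MadeChange = false;
  // Cheap type pre-check up front, then a single opcode dispatch.
  if (I.IsFixedVectorTy) {
    switch (I.Op) {
    case Opcode::InsertElement:
      MadeChange |= foldInsert(I);
      break;
    case Opcode::ShuffleVector:
      MadeChange |= foldShuffle(I);
      break;
    case Opcode::Load:
      MadeChange |= foldLoad(I);
      break;
    default:
      break;
    }
  }
  return MadeChange;
}

int main() {
  Inst I{Opcode::ShuffleVector, /*IsFixedVectorTy=*/true};
  std::printf("changed: %d\n", foldOneInst(I));
  return 0;
}

The same reasoning drives the dyn_cast-to-cast changes in the patch itself:
once the caller guarantees the opcode, the helper no longer needs to repeat
its own type check before doing any work.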