From 9912232b461ab76b08497021019084360b137060 Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Tue, 8 Oct 2019 17:32:56 +0000 Subject: [PATCH] Revert "[LoopVectorize][PowerPC] Estimate int and float register pressure separately in loop-vectorize" Also Revert "[LoopVectorize] Fix non-debug builds after rL374017" This reverts commit 9f41deccc0e648a006c9f38e11919f181b6c7e0a. This reverts commit 18b6fe07bcf44294f200bd2b526cb737ed275c04. The patch is breaking a PowerPC internal build. After checking with the author, I am reverting it on his behalf for now because of the timezone difference. llvm-svn: 374091 --- llvm/include/llvm/Analysis/TargetTransformInfo.h | 35 +--- .../llvm/Analysis/TargetTransformInfoImpl.h | 15 +- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 2 + llvm/lib/Analysis/TargetTransformInfo.cpp | 12 +- .../Target/AArch64/AArch64TargetTransformInfo.h | 3 +- llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 3 +- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 35 +--- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h | 8 +- .../Target/SystemZ/SystemZTargetTransformInfo.cpp | 3 +- .../Target/SystemZ/SystemZTargetTransformInfo.h | 2 +- .../WebAssembly/WebAssemblyTargetTransformInfo.cpp | 5 +- .../WebAssembly/WebAssemblyTargetTransformInfo.h | 2 +- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 3 +- llvm/lib/Target/X86/X86TargetTransformInfo.h | 2 +- llvm/lib/Target/XCore/XCoreTargetTransformInfo.h | 3 +- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 4 +- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 153 ++++++------------ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../Transforms/LoopVectorize/PowerPC/reg-usage.ll | 179 --------------------- .../LoopVectorize/X86/reg-usage-debug.ll | 12 +- .../test/Transforms/LoopVectorize/X86/reg-usage.ll | 34 +--- 21 files changed, 86 insertions(+), 431 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h 
b/llvm/include/llvm/Analysis/TargetTransformInfo.h index abea2af..6da2d7f 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -788,23 +788,10 @@ public: /// Additional properties of an operand's values. enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 }; - /// \return the number of registers in the target-provided register class. - unsigned getNumberOfRegisters(unsigned ClassID) const; - - /// \return the target-provided register class ID for the provided type, - /// accounting for type promotion and other type-legalization techniques that the target might apply. - /// However, it specifically does not account for the scalarization or splitting of vector types. - /// Should a vector type require scalarization or splitting into multiple underlying vector registers, - /// that type should be mapped to a register class containing no registers. - /// Specifically, this is designed to provide a simple, high-level view of the register allocation - /// later performed by the backend. These register classes don't necessarily map onto the - /// register classes used by the backend. - /// FIXME: It's not currently possible to determine how many registers - /// are used by the provided type. - unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const; - - /// \return the target-provided register class name - const char* getRegisterClassName(unsigned ClassID) const; + /// \return The number of scalar or vector registers that the target has. + /// If 'Vectors' is true, it returns the number of vector registers. If it is + /// set to false, it returns the number of scalar registers. + unsigned getNumberOfRegisters(bool Vector) const; /// \return The width of the largest scalar or vector register type. 
unsigned getRegisterBitWidth(bool Vector) const; @@ -1256,9 +1243,7 @@ public: Type *Ty) = 0; virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty) = 0; - virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0; - virtual unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const = 0; - virtual const char* getRegisterClassName(unsigned ClassID) const = 0; + virtual unsigned getNumberOfRegisters(bool Vector) = 0; virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned getMinVectorRegisterBitWidth() = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; @@ -1601,14 +1586,8 @@ public: Type *Ty) override { return Impl.getIntImmCost(IID, Idx, Imm, Ty); } - unsigned getNumberOfRegisters(unsigned ClassID) const override { - return Impl.getNumberOfRegisters(ClassID); - } - unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const override { - return Impl.getRegisterClassForType(Vector, Ty); - } - const char* getRegisterClassName(unsigned ClassID) const override { - return Impl.getRegisterClassName(ClassID); + unsigned getNumberOfRegisters(bool Vector) override { + return Impl.getNumberOfRegisters(Vector); } unsigned getRegisterBitWidth(bool Vector) const override { return Impl.getRegisterBitWidth(Vector); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index f850d9a..2f10117 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -354,20 +354,7 @@ public: return TTI::TCC_Free; } - unsigned getNumberOfRegisters(unsigned ClassID) const { return 8; } - - unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const { - return Vector ? 
1 : 0; - }; - - const char* getRegisterClassName(unsigned ClassID) const { - switch (ClassID) { - default: - return "Generic::Unknown Register Class"; - case 0: return "Generic::ScalarRC"; - case 1: return "Generic::VectorRC"; - } - } + unsigned getNumberOfRegisters(bool Vector) { return 8; } unsigned getRegisterBitWidth(bool Vector) const { return 32; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 1cf9cc6..75e0f84 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -519,6 +519,8 @@ public: /// \name Vector TTI Implementations /// @{ + unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; } + unsigned getRegisterBitWidth(bool Vector) const { return 32; } /// Estimate the overhead of scalarizing an instruction. Insert and Extract diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index aa93e1d..f3d20ce 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -466,16 +466,8 @@ int TargetTransformInfo::getIntImmCost(Intrinsic::ID IID, unsigned Idx, return Cost; } -unsigned TargetTransformInfo::getNumberOfRegisters(unsigned ClassID) const { - return TTIImpl->getNumberOfRegisters(ClassID); -} - -unsigned TargetTransformInfo::getRegisterClassForType(bool Vector, Type *Ty) const { - return TTIImpl->getRegisterClassForType(Vector, Ty); -} - -const char* TargetTransformInfo::getRegisterClassName(unsigned ClassID) const { - return TTIImpl->getRegisterClassName(ClassID); +unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const { + return TTIImpl->getNumberOfRegisters(Vector); } unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index d5ef0e5..95cda63 100644 --- 
a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -85,8 +85,7 @@ public: bool enableInterleavedAccessVectorization() { return true; } - unsigned getNumberOfRegisters(unsigned ClassID) const { - bool Vector = (ClassID == 1); + unsigned getNumberOfRegisters(bool Vector) { if (Vector) { if (ST->hasNEON()) return 32; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index b878ea3..47e98da 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -122,8 +122,7 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(unsigned ClassID) const { - bool Vector = (ClassID == 1); + unsigned getNumberOfRegisters(bool Vector) { if (Vector) { if (ST->hasNEON()) return 16; diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 7643351..40e5366 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -594,37 +594,10 @@ bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; } -unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const { - assert(ClassID == GPRRC || ClassID == FPRRC || - ClassID == VRRC || ClassID == VSXRC); - if (ST->hasVSX()) { - assert(ClassID == GPRRC || ClassID == VSXRC); - return ClassID == GPRRC ? 32 : 64; - } - assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC); - return 32; -} - -unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const { - if (Vector) - return ST->hasVSX() ? VSXRC : VRRC; - else if (Ty && Ty->getScalarType()->isFloatTy()) - return ST->hasVSX() ? 
VSXRC : FPRRC; - else - return GPRRC; -} - -const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const { - - switch (ClassID) { - default: - llvm_unreachable("unknown register class"); - return "PPC::unknown register class"; - case GPRRC: return "PPC::GPRRC"; - case FPRRC: return "PPC::FPRRC"; - case VRRC: return "PPC::VRRC"; - case VSXRC: return "PPC::VSXRC"; - } +unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) { + if (Vector && !ST->hasAltivec() && !ST->hasQPX()) + return 0; + return ST->hasVSX() ? 64 : 32; } unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const { diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 294c970..5d76ee4 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -72,13 +72,7 @@ public: TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const; bool enableInterleavedAccessVectorization(); - - enum PPCRegisterClass { - GPRRC, FPRRC, VRRC, VSXRC - }; - unsigned getNumberOfRegisters(unsigned ClassID) const; - unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const; - const char* getRegisterClassName(unsigned ClassID) const; + unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector) const; unsigned getCacheLineSize(); unsigned getPrefetchDistance(); diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 11c99aa..8d45e67 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -304,8 +304,7 @@ bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, C2.ScaleCost, C2.SetupCost); } -unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const { - bool Vector = (ClassID == 1); +unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) { if (!Vector) // 
Discount the stack pointer. Also leave out %r0, since it can't // be used in an address. diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index e59bade..16ce2ef 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -56,7 +56,7 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(unsigned ClassID) const; + unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector) const; unsigned getCacheLineSize() { return 256; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 1c53e90..46ef765 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -25,11 +25,10 @@ WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const { return TargetTransformInfo::PSK_FastHardware; } -unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const { - unsigned Result = BaseT::getNumberOfRegisters(ClassID); +unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) { + unsigned Result = BaseT::getNumberOfRegisters(Vector); // For SIMD, use at least 16 registers, as a rough guess. 
- bool Vector = (ClassID == 1); if (Vector) Result = std::max(Result, 16u); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index f0ecc73..1b11b4b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -53,7 +53,7 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(unsigned ClassID) const; + unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector) const; unsigned getArithmeticInstrCost( unsigned Opcode, Type *Ty, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 2f419b7..b634da1 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -116,8 +116,7 @@ llvm::Optional X86TTIImpl::getCacheAssociativity( llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); } -unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { - bool Vector = (ClassID == 1); +unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { if (Vector && !ST->hasSSE1()) return 0; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 3ff1896..9b948db 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -116,7 +116,7 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(unsigned ClassID) const; + unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector) const; unsigned getLoadStoreVecRegBitWidth(unsigned AS) const; unsigned getMaxInterleaveFactor(unsigned VF); diff --git a/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h b/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h index 58df1f2..3fecaaa 100644 --- a/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h 
+++ b/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h @@ -40,8 +40,7 @@ public: : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} - unsigned getNumberOfRegisters(unsigned ClassID) const { - bool Vector = (ClassID == 1); + unsigned getNumberOfRegisters(bool Vector) { if (Vector) { return 0; } diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 7f11917..852bbef 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1386,9 +1386,7 @@ void Cost::RateFormula(const Formula &F, // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as // additional instruction (at least fill). - // TODO: Need distinguish register class? - unsigned TTIRegNum = TTI->getNumberOfRegisters( - TTI->getRegisterClassForType(false, F.getType())) - 1; + unsigned TTIRegNum = TTI->getNumberOfRegisters(false) - 1; if (C.NumRegs > TTIRegNum) { // Cost already exceeded TTIRegNum, then only newly added register can add // new instructions. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 18cc61f..7e95038 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -983,11 +983,10 @@ public: /// of a loop. struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. - /// The key is ClassID of target-provided register class. - SmallMapVector LoopInvariantRegs; + unsigned LoopInvariantRegs; + /// Holds the maximum number of concurrent live intervals in the loop. - /// The key is ClassID of target-provided register class. 
- SmallMapVector MaxLocalUsers; + unsigned MaxLocalUsers; }; /// \return Returns information about the register usages of the loop for the @@ -4963,14 +4962,9 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { // Select the largest VF which doesn't require more registers than existing // ones. + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); for (int i = RUs.size() - 1; i >= 0; --i) { - bool Selected = true; - for (auto& pair : RUs[i].MaxLocalUsers) { - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); - if (pair.second > TargetNumRegisters) - Selected = false; - } - if (Selected) { + if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { MaxVF = VFs[i]; break; } @@ -5121,12 +5115,22 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, if (TC > 1 && TC < TinyTripCountInterleaveThreshold) return 1; + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); + LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters + << " registers\n"); + + if (VF == 1) { + if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumScalarRegs; + } else { + if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumVectorRegs; + } + RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. - for (auto& pair : R.MaxLocalUsers) { - pair.second = std::max(pair.second, 1U); - } + R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); // We calculate the interleave count using the following formula. 
// Subtract the number of loop invariants from the number of available @@ -5139,35 +5143,13 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, // We also want power of two interleave counts to ensure that the induction // variable of the vector loop wraps to zero, when tail is folded by masking; // this currently happens when OptForSize, in which case IC is set to 1 above. - unsigned IC = UINT_MAX; + unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / + R.MaxLocalUsers); - for (auto& pair : R.MaxLocalUsers) { - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); - LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters - << " registers of " - << TTI.getRegisterClassName(pair.first) << " register class\n"); - if (VF == 1) { - if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumScalarRegs; - } else { - if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumVectorRegs; - } - unsigned MaxLocalUsers = pair.second; - unsigned LoopInvariantRegs = 0; - if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) - LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; - - unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); - // Don't count the induction variable as interleaved. - if (EnableIndVarRegisterHeur) { - TmpIC = - PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / - std::max(1U, (MaxLocalUsers - 1))); - } - - IC = std::min(IC, TmpIC); - } + // Don't count the induction variable as interleaved. + if (EnableIndVarRegisterHeur) + IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / + std::max(1U, (R.MaxLocalUsers - 1))); // Clamp the interleave ranges to reasonable counts. 
unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); @@ -5349,7 +5331,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { const DataLayout &DL = TheFunction->getParent()->getDataLayout(); SmallVector RUs(VFs.size()); - SmallVector, 8> MaxUsages(VFs.size()); + SmallVector MaxUsages(VFs.size(), 0); LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); @@ -5379,45 +5361,21 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { - // Count the number of live intervals. - SmallMapVector RegUsage; - if (VFs[j] == 1) { - for (auto Inst : OpenIntervals) { - unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); - if (RegUsage.find(ClassID) == RegUsage.end()) - RegUsage[ClassID] = 1; - else - RegUsage[ClassID] += 1; - } - } else { - collectUniformsAndScalars(VFs[j]); - for (auto Inst : OpenIntervals) { - // Skip ignored values for VF > 1. - if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) - continue; - if (isScalarAfterVectorization(Inst, VFs[j])) { - unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); - if (RegUsage.find(ClassID) == RegUsage.end()) - RegUsage[ClassID] = 1; - else - RegUsage[ClassID] += 1; - } else { - unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); - if (RegUsage.find(ClassID) == RegUsage.end()) - RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); - else - RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); - } - } + MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); + continue; } - - for (auto& pair : RegUsage) { - if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) - MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); - else - MaxUsages[j][pair.first] = pair.second; + collectUniformsAndScalars(VFs[j]); + // Count the number of live intervals. 
+ unsigned RegUsage = 0; + for (auto Inst : OpenIntervals) { + // Skip ignored values for VF > 1. + if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || + isScalarAfterVectorization(Inst, VFs[j])) + continue; + RegUsage += GetRegUsage(Inst->getType(), VFs[j]); } + MaxUsages[j] = std::max(MaxUsages[j], RegUsage); } LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " @@ -5428,34 +5386,18 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { } for (unsigned i = 0, e = VFs.size(); i < e; ++i) { - SmallMapVector Invariant; - - for (auto Inst : LoopInvariants) { - unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); - unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); - if (Invariant.find(ClassID) == Invariant.end()) - Invariant[ClassID] = Usage; - else - Invariant[ClassID] += Usage; + unsigned Invariant = 0; + if (VFs[i] == 1) + Invariant = LoopInvariants.size(); + else { + for (auto Inst : LoopInvariants) + Invariant += GetRegUsage(Inst->getType(), VFs[i]); } LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " - << MaxUsages[i].size() << " item\n"); - for (const auto& Pair : MaxUsages[i]) { - (void)Pair; - LLVM_DEBUG(dbgs() << "LV(REG): RegisterClass: " - << TTI.getRegisterClassName(Pair.first) - << ", " << Pair.second << " registers \n"); - } - LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " - << Invariant.size() << " item\n"); - for (const auto& Pair : Invariant) { - (void)Pair; - LLVM_DEBUG(dbgs() << "LV(REG): RegisterClass: " - << TTI.getRegisterClassName(Pair.first) - << ", " << Pair.second << " registers \n"); - } + LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); + LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant + << '\n'); RU.LoopInvariantRegs = Invariant; RU.MaxLocalUsers = MaxUsages[i]; @@ -7820,8 +7762,7 @@ bool LoopVectorizePass::runImpl( // The second condition is 
necessary because, even if the target has no // vector registers, loop vectorization may still enable scalar // interleaving. - if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && - TTI->getMaxInterleaveFactor(1) < 2) + if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) return false; bool Changed = false; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a22153b..99428c6 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5237,7 +5237,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // If the target claims to have no vector registers don't attempt // vectorization. - if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) + if (!TTI->getNumberOfRegisters(true)) return false; // Don't vectorize when the attribute NoImplicitFloat is used. diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll deleted file mode 100644 index 55593c3..0000000 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll +++ /dev/null @@ -1,179 +0,0 @@ -; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=powerpc64-unknown-linux -S -mcpu=pwr8 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR8 -; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR9 -; REQUIRES: asserts - -@a = global [1024 x i8] zeroinitializer, align 16 -@b = global [1024 x i8] zeroinitializer, align 16 - -define i32 @foo() { -; -; CHECK-LABEL: foo - -; CHECK: LV(REG): VF = 8 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 7 registers -; 
CHECK-NEXT: LV(REG): Found invariant usage: 0 item -; CHECK: LV(REG): VF = 16 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 13 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item - -; CHECK-PWR8: LV(REG): VF = 16 -; CHECK-PWR8-NEXT: LV(REG): Found max usage: 2 item -; CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers -; CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 13 registers -; CHECK-PWR8-NEXT: LV(REG): Found invariant usage: 0 item -; CHECK-PWR8: Setting best plan to VF=16, UF=4 - -; CHECK-PWR9: LV(REG): VF = 8 -; CHECK-PWR9-NEXT: LV(REG): Found max usage: 2 item -; CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers -; CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 7 registers -; CHECK-PWR9-NEXT: LV(REG): Found invariant usage: 0 item -; CHECK-PWR9: Setting best plan to VF=8, UF=8 - - -entry: - br label %for.body - -for.cond.cleanup: - %add.lcssa = phi i32 [ %add, %for.body ] - ret i32 %add.lcssa - -for.body: - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ] - %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv - %0 = load i8, i8* %arrayidx, align 1 - %conv = zext i8 %0 to i32 - %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv - %1 = load i8, i8* %arrayidx2, align 1 - %conv3 = zext i8 %1 to i32 - %sub = sub nsw i32 %conv, %conv3 - %ispos = icmp sgt i32 %sub, -1 - %neg = sub nsw i32 0, %sub - %2 = select i1 %ispos, i32 %sub, i32 %neg - %add = add nsw i32 %2, %s.015 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 1024 - br i1 %exitcond, label %for.cond.cleanup, label %for.body -} - -define i32 @goo() { -; For indvars.iv used in a computating chain only feeding into getelementptr or cmp, -; it will not 
have vector version and the vector register usage will not exceed the -; available vector register number. -; CHECK-LABEL: goo -; CHECK: LV(REG): VF = 8 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 7 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item -; CHECK: LV(REG): VF = 16 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 13 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item -; CHECK: LV(REG): VF = 16 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 13 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item - -; CHECK: Setting best plan to VF=16, UF=4 - -entry: - br label %for.body - -for.cond.cleanup: ; preds = %for.body - %add.lcssa = phi i32 [ %add, %for.body ] - ret i32 %add.lcssa - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ] - %tmp1 = add nsw i64 %indvars.iv, 3 - %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %tmp1 - %tmp = load i8, i8* %arrayidx, align 1 - %conv = zext i8 %tmp to i32 - %tmp2 = add nsw i64 %indvars.iv, 2 - %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %tmp2 - %tmp3 = load i8, i8* %arrayidx2, align 1 - %conv3 = zext i8 %tmp3 to i32 - %sub = sub nsw i32 %conv, %conv3 - %ispos = icmp sgt i32 %sub, -1 - %neg = sub nsw i32 0, %sub - %tmp4 = select i1 %ispos, i32 %sub, i32 %neg - %add = add nsw i32 %tmp4, %s.015 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 1024 - br i1 %exitcond, label %for.cond.cleanup, label %for.body -} - -define i64 @bar(i64* 
nocapture %a) { -; CHECK-LABEL: bar -; CHECK: LV(REG): VF = 2 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item - -; CHECK: Setting best plan to VF=2, UF=12 - -entry: - br label %for.body - -for.cond.cleanup: - %add2.lcssa = phi i64 [ %add2, %for.body ] - ret i64 %add2.lcssa - -for.body: - %i.012 = phi i64 [ 0, %entry ], [ %inc, %for.body ] - %s.011 = phi i64 [ 0, %entry ], [ %add2, %for.body ] - %arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.012 - %0 = load i64, i64* %arrayidx, align 8 - %add = add nsw i64 %0, %i.012 - store i64 %add, i64* %arrayidx, align 8 - %add2 = add nsw i64 %add, %s.011 - %inc = add nuw nsw i64 %i.012, 1 - %exitcond = icmp eq i64 %inc, 1024 - br i1 %exitcond, label %for.cond.cleanup, label %for.body -} - -@d = external global [0 x i64], align 8 -@e = external global [0 x i32], align 4 -@c = external global [0 x i32], align 4 - -define void @hoo(i32 %n) { -; CHECK-LABEL: hoo -; CHECK: LV(REG): VF = 4 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 2 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item -; CHECK: LV(REG): VF = 1 -; CHECK-NEXT: LV(REG): Found max usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item -; CHECK: Setting best plan to VF=1, UF=12 - -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 %indvars.iv - %tmp = load i64, i64* %arrayidx, align 8 - %arrayidx1 = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 %tmp - %tmp1 = load i32, i32* %arrayidx1, align 4 - %arrayidx3 = 
getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 %indvars.iv - store i32 %tmp1, i32* %arrayidx3, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 10000 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll index b6254a4..8205092 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll @@ -22,11 +22,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; CHECK: LV: Checking a loop in "test_g" -; CHECK: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers +; CHECK: LV(REG): Found max usage: 2 define i32 @test_g(i32* nocapture readonly %a, i32 %n) local_unnamed_addr !dbg !6 { entry: @@ -64,11 +60,7 @@ for.end: ; preds = %for.end.loopexit, % } ; CHECK: LV: Checking a loop in "test" -; CHECK: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 1 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers +; CHECK: LV(REG): Found max usage: 2 define i32 @test(i32* nocapture readonly %a, i32 %n) local_unnamed_addr { entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll index cae9360..9b276aa 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll @@ 
-11,15 +11,9 @@ define i32 @foo() { ; ; CHECK-LABEL: foo ; CHECK: LV(REG): VF = 8 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 7 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item +; CHECK-NEXT: LV(REG): Found max usage: 7 ; CHECK: LV(REG): VF = 16 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 13 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item +; CHECK-NEXT: LV(REG): Found max usage: 13 entry: br label %for.body @@ -53,15 +47,9 @@ define i32 @goo() { ; available vector register number. ; CHECK-LABEL: goo ; CHECK: LV(REG): VF = 8 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 7 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item +; CHECK-NEXT: LV(REG): Found max usage: 7 ; CHECK: LV(REG): VF = 16 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 13 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item +; CHECK-NEXT: LV(REG): Found max usage: 13 entry: br label %for.body @@ -93,11 +81,8 @@ for.body: ; preds = %for.body, %entry define i64 @bar(i64* nocapture %a) { ; CHECK-LABEL: bar ; CHECK: LV(REG): VF = 2 -; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item - +; CHECK: LV(REG): Found max usage: 3 +; entry: br label %for.body @@ -128,11 +113,8 @@ define void @hoo(i32 %n) { ; so the max usage of AVX512 vector register will be 
2. ; AVX512F-LABEL: bar ; AVX512F: LV(REG): VF = 16 -; AVX512F-CHECK: LV(REG): Found max usage: 2 item -; AVX512F-CHECK: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers -; AVX512F-CHECK: LV(REG): RegisterClass: Generic::VectorRC, 2 registers -; AVX512F-CHECK: LV(REG): Found invariant usage: 0 item - +; AVX512F: LV(REG): Found max usage: 2 +; entry: br label %for.body -- 2.7.4