From: Andrew Litteken Date: Thu, 17 Sep 2020 20:12:08 +0000 (-0500) Subject: [IRSim] Adding a basic similarity identifier. X-Git-Tag: llvmorg-13-init~11075 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b63bfc2030dedf1f94fd074805993b53c5a71caf;p=platform%2Fupstream%2Fllvm.git [IRSim] Adding a basic similarity identifier. This takes the mapped instructions from the IRInstructionMapper, and passes it to the Suffix Tree to find the repeated substrings. Within each set of repeated substrings, the IRSimilarityCandidates are compared against one another for structure, and ensuring that the operands in the instructions are used in the same way. Each of these structurally similarity IRSimilarityCandidates are contained in a SimilarityGroup. Tests checking for identifying identity of structure, different isomorphic structure, and different nonisomoprhic structure are found in unittests/Analysis/IRSimilarityIdentifierTest.cpp. Differential Revision: https://reviews.llvm.org/D86972 --- diff --git a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h index a5b1fac..dceb531 100644 --- a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h +++ b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -41,6 +41,9 @@ // An IRSimilarityCandidate is a region of IRInstructionData (wrapped // Instructions), usually used to denote a region of similarity has been found. // +// A SimilarityGroup is a set of IRSimilarityCandidates that are structurally +// similar to one another. +// //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H @@ -575,6 +578,122 @@ public: iterator end() const { return std::next(iterator(back())); } }; +typedef std::vector SimilarityGroup; +typedef std::vector SimilarityGroupList; + +/// This class puts all the pieces of the IRInstructionData, +/// IRInstructionMapper, IRSimilarityCandidate together. +/// +/// It first feeds the Module or vector of Modules into the IRInstructionMapper, +/// and puts all the mapped instructions into a single long list of +/// IRInstructionData. +/// +/// The list of unsigned integers is given to the Suffix Tree or similar data +/// structure to find repeated subsequences. We construct an +/// IRSimilarityCandidate for each instance of the subsequence. We compare them +/// against one another since These repeated subsequences can have different +/// structure. For each different kind of structure found, we create a +/// similarity group. +/// +/// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are +/// structurally similar to one another, while C is different we would have two +/// SimilarityGroups: +/// +/// SimilarityGroup 1: SimilarityGroup 2 +/// A, B, D C +/// +/// A list of the different similarity groups is then returned after +/// analyzing the module. +class IRSimilarityIdentifier { +public: + IRSimilarityIdentifier() + : Mapper(&InstDataAllocator, &InstDataListAllocator) {} + + /// \param M the module to find similarity in. + explicit IRSimilarityIdentifier(Module &M) + : Mapper(&InstDataAllocator, &InstDataListAllocator) { + findSimilarity(M); + } + +private: + /// Map the instructions in the module to unsigned integers, using mapping + /// already present in the Mapper if possible. + /// + /// \param [in] M Module - To map to integers. + /// \param [in,out] InstrList - The vector to append IRInstructionData to. + /// \param [in,out] IntegerMapping - The vector to append integers to. + void populateMapper(Module &M, std::vector &InstrList, + std::vector &IntegerMapping); + + /// Map the instructions in the modules vector to unsigned integers, using + /// mapping already present in the mapper if possible. + /// + /// \param [in] Modules - The list of modules to use to populate the mapper + /// \param [in,out] InstrList - The vector to append IRInstructionData to. + /// \param [in,out] IntegerMapping - The vector to append integers to. + void populateMapper(ArrayRef> &Modules, + std::vector &InstrList, + std::vector &IntegerMapping); + + /// Find the similarity candidates in \p InstrList and corresponding + /// \p UnsignedVec + /// + /// \param [in,out] InstrList - The vector to append IRInstructionData to. + /// \param [in,out] IntegerMapping - The vector to append integers to. + /// candidates found in the program. + void findCandidates(std::vector &InstrList, + std::vector &IntegerMapping); + +public: + // Find the IRSimilarityCandidates in the \p Modules and group by structural + // similarity in a SimilarityGroup, each group is returned in a + // SimilarityGroupList. + // + // \param [in] Modules - the modules to analyze. + // \returns The groups of similarity ranges found in the modules. + SimilarityGroupList & + findSimilarity(ArrayRef> Modules); + + // Find the IRSimilarityCandidates in the given Module grouped by structural + // similarity in a SimilarityGroup, contained inside a SimilarityGroupList. + // + // \param [in] M - the module to analyze. + // \returns The groups of similarity ranges found in the module. + SimilarityGroupList &findSimilarity(Module &M); + + // Clears \ref SimilarityCandidates if it is already filled by a previous run. + void resetSimilarityCandidates() { + // If we've already analyzed a Module or set of Modules, so we must clear + // the SimilarityCandidates to make sure we do not have only old values + // hanging around. + if (SimilarityCandidates.hasValue()) + SimilarityCandidates->clear(); + else + SimilarityCandidates = SimilarityGroupList(); + } + + // \returns The groups of similarity ranges found in the most recently passed + // set of modules. + Optional &getSimilarity() { + return SimilarityCandidates; + } + +private: + /// The allocator for IRInstructionData. + SpecificBumpPtrAllocator InstDataAllocator; + + /// The allocator for IRInstructionDataLists. + SpecificBumpPtrAllocator InstDataListAllocator; + + /// Map Instructions to unsigned integers and wraps the Instruction in an + /// instance of IRInstructionData. + IRInstructionMapper Mapper; + + /// The SimilarityGroups found with the most recent run of \ref + /// findSimilarity. None if there is no recent run. + Optional SimilarityCandidates; +}; + } // end namespace IRSimilarity } // end namespace llvm diff --git a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp index 8933f54..c482fd5 100644 --- a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp +++ b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/User.h" +#include "llvm/Support/SuffixTree.h" using namespace llvm; using namespace IRSimilarity; @@ -389,7 +390,6 @@ bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A, {B, OperValsB, ValueNumberMappingB})) return false; } - return true; } @@ -405,3 +405,242 @@ bool IRSimilarityCandidate::overlap(const IRSimilarityCandidate &A, return DoesOverlap(A, B) || DoesOverlap(B, A); } + +void IRSimilarityIdentifier::populateMapper( + Module &M, std::vector &InstrList, + std::vector &IntegerMapping) { + + std::vector InstrListForModule; + std::vector IntegerMappingForModule; + // Iterate over the functions in the module to map each Instruction in each + // BasicBlock to an unsigned integer. + for (Function &F : M) { + + if (F.empty()) + continue; + + for (BasicBlock &BB : F) { + + if (BB.sizeWithoutDebug() < 2) + continue; + + // BB has potential to have similarity since it has a size greater than 2 + // and can therefore match other regions greater than 2. Map it to a list + // of unsigned integers. + Mapper.convertToUnsignedVec(BB, InstrListForModule, + IntegerMappingForModule); + } + } + + // Insert the InstrListForModule at the end of the overall InstrList so that + // we can have a long InstrList for the entire set of Modules being analyzed. + InstrList.insert(InstrList.end(), InstrListForModule.begin(), + InstrListForModule.end()); + // Do the same as above, but for IntegerMapping. + IntegerMapping.insert(IntegerMapping.end(), IntegerMappingForModule.begin(), + IntegerMappingForModule.end()); +} + +void IRSimilarityIdentifier::populateMapper( + ArrayRef> &Modules, + std::vector &InstrList, + std::vector &IntegerMapping) { + + // Iterate over, and map the instructions in each module. + for (const std::unique_ptr &M : Modules) + populateMapper(*M, InstrList, IntegerMapping); +} + +/// From a repeated subsequence, find all the different instances of the +/// subsequence from the \p InstrList, and create an IRSimilarityCandidate from +/// the IRInstructionData in subsequence. +/// +/// \param [in] Mapper - The instruction mapper for sanity checks. +/// \param [in] InstrList - The vector that holds the instruction data. +/// \param [in] IntegerMapping - The vector that holds the mapped integers. +/// \param [out] CandsForRepSubstring - The vector to store the generated +/// IRSimilarityCandidates. +static void createCandidatesFromSuffixTree( + IRInstructionMapper Mapper, std::vector &InstrList, + std::vector &IntegerMapping, SuffixTree::RepeatedSubstring &RS, + std::vector &CandsForRepSubstring) { + + unsigned StringLen = RS.Length; + unsigned NumFound = 0; + unsigned PreviousIdx = 0; + + // Create an IRSimilarityCandidate for instance of this subsequence \p RS. + for (const unsigned &StartIdx : RS.StartIndices) { + unsigned EndIdx = StartIdx + StringLen - 1; + + // Check that this subsequence does not contain an illegal instruction. + bool ContainsIllegal = false; + for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) { + unsigned Key = IntegerMapping[CurrIdx]; + if (Key > Mapper.IllegalInstrNumber) { + ContainsIllegal = true; + break; + } + } + + // If we have an illegal instruction, we should not create an + // IRSimilarityCandidate for this region. + if (ContainsIllegal) + continue; + + PreviousIdx = EndIdx; + NumFound++; + + // We are getting iterators to the instructions in this region of code + // by advancing the start and end indices from the start of the + // InstrList. + std::vector::iterator StartIt = InstrList.begin(); + std::advance(StartIt, StartIdx); + std::vector::iterator EndIt = InstrList.begin(); + std::advance(EndIt, EndIdx); + + CandsForRepSubstring.emplace_back(StartIdx, StringLen, *StartIt, *EndIt); + } +} + +/// From the list of IRSimilarityCandidates, perform a comparison between each +/// IRSimilarityCandidate to determine if there are overlapping +/// IRInstructionData, or if they do not have the same structure. +/// +/// \param [in] CandsForRepSubstring - The vector containing the +/// IRSimilarityCandidates. +/// \param [out] StructuralGroups - the mapping of unsigned integers to vector +/// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the +/// vector are structurally similar to one another. +static void findCandidateStructures( + std::vector &CandsForRepSubstring, + DenseMap &StructuralGroups) { + std::vector::iterator CandIt, CandEndIt, InnerCandIt, + InnerCandEndIt; + + // IRSimilarityCandidates each have a structure for operand use. It is + // possible that two instances of the same subsequences have different + // structure. Each type of structure found is assigned a number. This + // DenseMap maps an IRSimilarityCandidate to which type of similarity + // discovered it fits within. + DenseMap CandToGroup; + + // Find the compatibility from each candidate to the others to determine + // which candidates overlap and which have the same structure by mapping + // each structure to a different group. + bool SameStructure; + bool Inserted; + unsigned CurrentGroupNum = 0; + unsigned OuterGroupNum; + DenseMap::iterator CandToGroupIt; + DenseMap::iterator CandToGroupItInner; + DenseMap::iterator CurrentGroupPair; + + // Iterate over the candidates to determine its structural and overlapping + // compatibility with other instructions + for (CandIt = CandsForRepSubstring.begin(), + CandEndIt = CandsForRepSubstring.end(); + CandIt != CandEndIt; CandIt++) { + + // Determine if it has an assigned structural group already. + CandToGroupIt = CandToGroup.find(&*CandIt); + if (CandToGroupIt == CandToGroup.end()) { + // If not, we assign it one, and add it to our mapping. + std::tie(CandToGroupIt, Inserted) = + CandToGroup.insert(std::make_pair(&*CandIt, CurrentGroupNum++)); + } + + // Get the structural group number from the iterator. + OuterGroupNum = CandToGroupIt->second; + + // Check if we already have a list of IRSimilarityCandidates for the current + // structural group. Create one if one does not exist. + CurrentGroupPair = StructuralGroups.find(OuterGroupNum); + if (CurrentGroupPair == StructuralGroups.end()) + std::tie(CurrentGroupPair, Inserted) = StructuralGroups.insert( + std::make_pair(OuterGroupNum, SimilarityGroup({*CandIt}))); + + // Iterate over the IRSimilarityCandidates following the current + // IRSimilarityCandidate in the list to determine whether the two + // IRSimilarityCandidates are compatible. This is so we do not repeat pairs + // of IRSimilarityCandidates. + for (InnerCandIt = std::next(CandIt), + InnerCandEndIt = CandsForRepSubstring.end(); + InnerCandIt != InnerCandEndIt; InnerCandIt++) { + + // We check if the inner item has a group already, if it does, we skip it. + CandToGroupItInner = CandToGroup.find(&*InnerCandIt); + if (CandToGroupItInner != CandToGroup.end()) + continue; + + // Otherwise we determine if they have the same structure and add it to + // vector if they match. + SameStructure = + IRSimilarityCandidate::compareStructure(*CandIt, *InnerCandIt); + if (!SameStructure) + continue; + + CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum)); + CurrentGroupPair->second.push_back(*InnerCandIt); + } + } +} + +void IRSimilarityIdentifier::findCandidates( + std::vector &InstrList, + std::vector &IntegerMapping) { + SuffixTree ST(IntegerMapping); + + std::vector CandsForRepSubstring; + std::vector NewCandidateGroups; + + DenseMap StructuralGroups; + + // Iterate over the subsequences found by the Suffix Tree to create + // IRSimilarityCandidates for each repeated subsequence and determine which + // instances are structurally similar to one another. + for (auto It = ST.begin(), Et = ST.end(); It != Et; ++It) { + createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, *It, + CandsForRepSubstring); + + if (CandsForRepSubstring.size() < 2) + continue; + + findCandidateStructures(CandsForRepSubstring, StructuralGroups); + for (std::pair &Group : StructuralGroups) + // We only add the group if it contains more than one + // IRSimilarityCandidate. If there is only one, that means there is no + // other repeated subsequence with the same structure. + if (Group.second.size() > 1) + SimilarityCandidates->push_back(Group.second); + + CandsForRepSubstring.clear(); + StructuralGroups.clear(); + NewCandidateGroups.clear(); + } +} + +SimilarityGroupList &IRSimilarityIdentifier::findSimilarity( + ArrayRef> Modules) { + resetSimilarityCandidates(); + + std::vector InstrList; + std::vector IntegerMapping; + + populateMapper(Modules, InstrList, IntegerMapping); + findCandidates(InstrList, IntegerMapping); + + return SimilarityCandidates.getValue(); +} + +SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) { + resetSimilarityCandidates(); + + std::vector InstrList; + std::vector IntegerMapping; + + populateMapper(M, InstrList, IntegerMapping); + findCandidates(InstrList, IntegerMapping); + + return SimilarityCandidates.getValue(); +} diff --git a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp index 4f5fcbb..4370f16 100644 --- a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp +++ b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp @@ -38,7 +38,14 @@ void getVectors(Module &M, IRInstructionMapper &Mapper, Mapper.convertToUnsignedVec(BB, InstrList, UnsignedVec); } -// Checks that different opcodes are mapped to different values. +void getSimilarities( + Module &M, + std::vector> &SimilarityCandidates) { + IRSimilarityIdentifier Identifier; + SimilarityCandidates = Identifier.findSimilarity(M); +} + +// Checks that different opcodes are mapped to different values TEST(IRInstructionMapper, OpcodeDifferentiation) { StringRef ModuleString = R"( define i32 @f(i32 %a, i32 %b) { @@ -1625,3 +1632,193 @@ TEST(IRSimilarityCandidate, DifferentNameSameStructure) { ASSERT_TRUE(longSimCandCompare(InstrList, true)); } + +// Checks that two sets of identical instructions are found to be the same. +// Both sequences of adds have the same operand ordering, and the same +// instructions, making them strcturally equivalent. +TEST(IRSimilarityIdentifier, IdentitySimilarity) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = sub i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %a, %b + %3 = sub i32 %b, %a + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 1); + for (std::vector &Cands : SimilarityCandidates) { + ASSERT_TRUE(Cands.size() == 2); + unsigned InstIdx = 0; + for (IRSimilarityCandidate &Cand : Cands) { + ASSERT_TRUE(Cand.getStartIdx() == InstIdx); + InstIdx += 3; + } + } +} + +// Checks that incorrect sequences are not found as similar. In this case, +// we have different sequences of instructions. +TEST(IRSimilarityIdentifier, InstructionDifference) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b, i32 %c, i32 %d) { + bb0: + %0 = sub i32 %a, %b + %1 = add i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %c, %d + %3 = sub i32 %d, %c + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 0); +} + +// This test checks to see whether we can detect similarity for commutativen +// instructions where the operands have been reversed. Right now, we cannot. +TEST(IRSimilarityIdentifier, CommutativeSimilarity) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %a, %b + %3 = add i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 0); +} + +// Check that we are not finding commutative similarity in non commutative +// instructions. That is, while the instruction and operands used are the same +// in the two subtraction sequences, they cannot be counted as the same since +// a subtraction is not commutative. +TEST(IRSimilarityIdentifier, NonCommutativeDifference) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = sub i32 %a, %b + %1 = sub i32 %b, %a + br label %bb1 + bb1: + %2 = sub i32 %a, %b + %3 = sub i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 0); +} + +// Check that we find similarity despite changing the register names. +TEST(IRSimilarityIdentifier, MappingSimilarity) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b, i32 %c, i32 %d) { + bb0: + %0 = add i32 %a, %b + %1 = sub i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %c, %d + %3 = sub i32 %d, %c + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 1); + for (std::vector &Cands : SimilarityCandidates) { + ASSERT_TRUE(Cands.size() == 2); + unsigned InstIdx = 0; + for (IRSimilarityCandidate &Cand : Cands) { + ASSERT_TRUE(Cand.getStartIdx() == InstIdx); + InstIdx += 3; + } + } +} + +// Checks that constants are detected as the same operand in each use in the +// sequences of instructions. Also checks that we can find structural +// equivalence using constants. In this case the 1 has the same use pattern as +// %a. +TEST(IRSimilarityIdentifier, ConstantMappingSimilarity) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 1, %b + %1 = icmp sgt i32 %b, 1 + br label %bb1 + bb1: + %2 = add i32 %a, %b + %3 = icmp sgt i32 %b, %a + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 1); + for (std::vector &Cands : SimilarityCandidates) { + ASSERT_TRUE(Cands.size() == 2); + unsigned InstIdx = 0; + for (IRSimilarityCandidate &Cand : Cands) { + ASSERT_TRUE(Cand.getStartIdx() == InstIdx); + InstIdx += 3; + } + } +} + +// Check that constants are uniquely identified. i.e. two different constants +// are not considered the same. This means that this should not find any +// structural similarity. +TEST(IRSimilarityIdentifier, ConstantMappingDifference) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 1, %b + %1 = icmp sgt i32 %b, 2 + br label %bb1 + bb1: + %2 = add i32 %a, %b + %3 = icmp slt i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 0); +}