// An IRSimilarityCandidate is a region of IRInstructionData (wrapped
// Instructions), usually used to denote a region of similarity has been found.
//
+// A SimilarityGroup is a set of IRSimilarityCandidates that are structurally
+// similar to one another.
+//
//===----------------------------------------------------------------------===//
#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
iterator end() const { return std::next(iterator(back())); }
};
+typedef std::vector<IRSimilarityCandidate> SimilarityGroup;
+typedef std::vector<SimilarityGroup> SimilarityGroupList;
+
+/// This class puts all the pieces of the IRInstructionData,
+/// IRInstructionMapper, IRSimilarityCandidate together.
+///
+/// It first feeds the Module or vector of Modules into the IRInstructionMapper,
+/// and puts all the mapped instructions into a single long list of
+/// IRInstructionData.
+///
+/// The list of unsigned integers is given to the Suffix Tree or similar data
+/// structure to find repeated subsequences. We construct an
+/// IRSimilarityCandidate for each instance of the subsequence. We compare them
+/// against one another since These repeated subsequences can have different
+/// structure. For each different kind of structure found, we create a
+/// similarity group.
+///
+/// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are
+/// structurally similar to one another, while C is different we would have two
+/// SimilarityGroups:
+///
+/// SimilarityGroup 1: SimilarityGroup 2
+/// A, B, D C
+///
+/// A list of the different similarity groups is then returned after
+/// analyzing the module.
+class IRSimilarityIdentifier {
+public:
+ IRSimilarityIdentifier()
+ : Mapper(&InstDataAllocator, &InstDataListAllocator) {}
+
+ /// \param M the module to find similarity in.
+ explicit IRSimilarityIdentifier(Module &M)
+ : Mapper(&InstDataAllocator, &InstDataListAllocator) {
+ findSimilarity(M);
+ }
+
+private:
+ /// Map the instructions in the module to unsigned integers, using mapping
+ /// already present in the Mapper if possible.
+ ///
+ /// \param [in] M Module - To map to integers.
+ /// \param [in,out] InstrList - The vector to append IRInstructionData to.
+ /// \param [in,out] IntegerMapping - The vector to append integers to.
+ void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList,
+ std::vector<unsigned> &IntegerMapping);
+
+ /// Map the instructions in the modules vector to unsigned integers, using
+ /// mapping already present in the mapper if possible.
+ ///
+ /// \param [in] Modules - The list of modules to use to populate the mapper
+ /// \param [in,out] InstrList - The vector to append IRInstructionData to.
+ /// \param [in,out] IntegerMapping - The vector to append integers to.
+ void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules,
+ std::vector<IRInstructionData *> &InstrList,
+ std::vector<unsigned> &IntegerMapping);
+
+ /// Find the similarity candidates in \p InstrList and corresponding
+ /// \p UnsignedVec
+ ///
+ /// \param [in,out] InstrList - The vector to append IRInstructionData to.
+ /// \param [in,out] IntegerMapping - The vector to append integers to.
+ /// candidates found in the program.
+ void findCandidates(std::vector<IRInstructionData *> &InstrList,
+ std::vector<unsigned> &IntegerMapping);
+
+public:
+ // Find the IRSimilarityCandidates in the \p Modules and group by structural
+ // similarity in a SimilarityGroup, each group is returned in a
+ // SimilarityGroupList.
+ //
+ // \param [in] Modules - the modules to analyze.
+ // \returns The groups of similarity ranges found in the modules.
+ SimilarityGroupList &
+ findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules);
+
+ // Find the IRSimilarityCandidates in the given Module grouped by structural
+ // similarity in a SimilarityGroup, contained inside a SimilarityGroupList.
+ //
+ // \param [in] M - the module to analyze.
+ // \returns The groups of similarity ranges found in the module.
+ SimilarityGroupList &findSimilarity(Module &M);
+
+ // Clears \ref SimilarityCandidates if it is already filled by a previous run.
+ void resetSimilarityCandidates() {
+ // If we've already analyzed a Module or set of Modules, so we must clear
+ // the SimilarityCandidates to make sure we do not have only old values
+ // hanging around.
+ if (SimilarityCandidates.hasValue())
+ SimilarityCandidates->clear();
+ else
+ SimilarityCandidates = SimilarityGroupList();
+ }
+
+ // \returns The groups of similarity ranges found in the most recently passed
+ // set of modules.
+ Optional<SimilarityGroupList> &getSimilarity() {
+ return SimilarityCandidates;
+ }
+
+private:
+ /// The allocator for IRInstructionData.
+ SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
+
+ /// The allocator for IRInstructionDataLists.
+ SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator;
+
+ /// Map Instructions to unsigned integers and wraps the Instruction in an
+ /// instance of IRInstructionData.
+ IRInstructionMapper Mapper;
+
+ /// The SimilarityGroups found with the most recent run of \ref
+ /// findSimilarity. None if there is no recent run.
+ Optional<SimilarityGroupList> SimilarityCandidates;
+};
+
} // end namespace IRSimilarity
} // end namespace llvm
#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/User.h"
+#include "llvm/Support/SuffixTree.h"
using namespace llvm;
using namespace IRSimilarity;
{B, OperValsB, ValueNumberMappingB}))
return false;
}
-
return true;
}
return DoesOverlap(A, B) || DoesOverlap(B, A);
}
+
+void IRSimilarityIdentifier::populateMapper(
+ Module &M, std::vector<IRInstructionData *> &InstrList,
+ std::vector<unsigned> &IntegerMapping) {
+
+ std::vector<IRInstructionData *> InstrListForModule;
+ std::vector<unsigned> IntegerMappingForModule;
+ // Iterate over the functions in the module to map each Instruction in each
+ // BasicBlock to an unsigned integer.
+ for (Function &F : M) {
+
+ if (F.empty())
+ continue;
+
+ for (BasicBlock &BB : F) {
+
+ if (BB.sizeWithoutDebug() < 2)
+ continue;
+
+ // BB has potential to have similarity since it has a size greater than 2
+ // and can therefore match other regions greater than 2. Map it to a list
+ // of unsigned integers.
+ Mapper.convertToUnsignedVec(BB, InstrListForModule,
+ IntegerMappingForModule);
+ }
+ }
+
+ // Insert the InstrListForModule at the end of the overall InstrList so that
+ // we can have a long InstrList for the entire set of Modules being analyzed.
+ InstrList.insert(InstrList.end(), InstrListForModule.begin(),
+ InstrListForModule.end());
+ // Do the same as above, but for IntegerMapping.
+ IntegerMapping.insert(IntegerMapping.end(), IntegerMappingForModule.begin(),
+ IntegerMappingForModule.end());
+}
+
+void IRSimilarityIdentifier::populateMapper(
+ ArrayRef<std::unique_ptr<Module>> &Modules,
+ std::vector<IRInstructionData *> &InstrList,
+ std::vector<unsigned> &IntegerMapping) {
+
+ // Iterate over, and map the instructions in each module.
+ for (const std::unique_ptr<Module> &M : Modules)
+ populateMapper(*M, InstrList, IntegerMapping);
+}
+
+/// From a repeated subsequence, find all the different instances of the
+/// subsequence from the \p InstrList, and create an IRSimilarityCandidate from
+/// the IRInstructionData in subsequence.
+///
+/// \param [in] Mapper - The instruction mapper for sanity checks.
+/// \param [in] InstrList - The vector that holds the instruction data.
+/// \param [in] IntegerMapping - The vector that holds the mapped integers.
+/// \param [out] CandsForRepSubstring - The vector to store the generated
+/// IRSimilarityCandidates.
+static void createCandidatesFromSuffixTree(
+ IRInstructionMapper Mapper, std::vector<IRInstructionData *> &InstrList,
+ std::vector<unsigned> &IntegerMapping, SuffixTree::RepeatedSubstring &RS,
+ std::vector<IRSimilarityCandidate> &CandsForRepSubstring) {
+
+ unsigned StringLen = RS.Length;
+ unsigned NumFound = 0;
+ unsigned PreviousIdx = 0;
+
+ // Create an IRSimilarityCandidate for instance of this subsequence \p RS.
+ for (const unsigned &StartIdx : RS.StartIndices) {
+ unsigned EndIdx = StartIdx + StringLen - 1;
+
+ // Check that this subsequence does not contain an illegal instruction.
+ bool ContainsIllegal = false;
+ for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) {
+ unsigned Key = IntegerMapping[CurrIdx];
+ if (Key > Mapper.IllegalInstrNumber) {
+ ContainsIllegal = true;
+ break;
+ }
+ }
+
+ // If we have an illegal instruction, we should not create an
+ // IRSimilarityCandidate for this region.
+ if (ContainsIllegal)
+ continue;
+
+ PreviousIdx = EndIdx;
+ NumFound++;
+
+ // We are getting iterators to the instructions in this region of code
+ // by advancing the start and end indices from the start of the
+ // InstrList.
+ std::vector<IRInstructionData *>::iterator StartIt = InstrList.begin();
+ std::advance(StartIt, StartIdx);
+ std::vector<IRInstructionData *>::iterator EndIt = InstrList.begin();
+ std::advance(EndIt, EndIdx);
+
+ CandsForRepSubstring.emplace_back(StartIdx, StringLen, *StartIt, *EndIt);
+ }
+}
+
+/// From the list of IRSimilarityCandidates, perform a comparison between each
+/// IRSimilarityCandidate to determine if there are overlapping
+/// IRInstructionData, or if they do not have the same structure.
+///
+/// \param [in] CandsForRepSubstring - The vector containing the
+/// IRSimilarityCandidates.
+/// \param [out] StructuralGroups - the mapping of unsigned integers to vector
+/// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the
+/// vector are structurally similar to one another.
+static void findCandidateStructures(
+ std::vector<IRSimilarityCandidate> &CandsForRepSubstring,
+ DenseMap<unsigned, SimilarityGroup> &StructuralGroups) {
+ std::vector<IRSimilarityCandidate>::iterator CandIt, CandEndIt, InnerCandIt,
+ InnerCandEndIt;
+
+ // IRSimilarityCandidates each have a structure for operand use. It is
+ // possible that two instances of the same subsequences have different
+ // structure. Each type of structure found is assigned a number. This
+ // DenseMap maps an IRSimilarityCandidate to which type of similarity
+ // discovered it fits within.
+ DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
+
+ // Find the compatibility from each candidate to the others to determine
+ // which candidates overlap and which have the same structure by mapping
+ // each structure to a different group.
+ bool SameStructure;
+ bool Inserted;
+ unsigned CurrentGroupNum = 0;
+ unsigned OuterGroupNum;
+ DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupIt;
+ DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupItInner;
+ DenseMap<unsigned, SimilarityGroup>::iterator CurrentGroupPair;
+
+ // Iterate over the candidates to determine its structural and overlapping
+ // compatibility with other instructions
+ for (CandIt = CandsForRepSubstring.begin(),
+ CandEndIt = CandsForRepSubstring.end();
+ CandIt != CandEndIt; CandIt++) {
+
+ // Determine if it has an assigned structural group already.
+ CandToGroupIt = CandToGroup.find(&*CandIt);
+ if (CandToGroupIt == CandToGroup.end()) {
+ // If not, we assign it one, and add it to our mapping.
+ std::tie(CandToGroupIt, Inserted) =
+ CandToGroup.insert(std::make_pair(&*CandIt, CurrentGroupNum++));
+ }
+
+ // Get the structural group number from the iterator.
+ OuterGroupNum = CandToGroupIt->second;
+
+ // Check if we already have a list of IRSimilarityCandidates for the current
+ // structural group. Create one if one does not exist.
+ CurrentGroupPair = StructuralGroups.find(OuterGroupNum);
+ if (CurrentGroupPair == StructuralGroups.end())
+ std::tie(CurrentGroupPair, Inserted) = StructuralGroups.insert(
+ std::make_pair(OuterGroupNum, SimilarityGroup({*CandIt})));
+
+ // Iterate over the IRSimilarityCandidates following the current
+ // IRSimilarityCandidate in the list to determine whether the two
+ // IRSimilarityCandidates are compatible. This is so we do not repeat pairs
+ // of IRSimilarityCandidates.
+ for (InnerCandIt = std::next(CandIt),
+ InnerCandEndIt = CandsForRepSubstring.end();
+ InnerCandIt != InnerCandEndIt; InnerCandIt++) {
+
+ // We check if the inner item has a group already, if it does, we skip it.
+ CandToGroupItInner = CandToGroup.find(&*InnerCandIt);
+ if (CandToGroupItInner != CandToGroup.end())
+ continue;
+
+ // Otherwise we determine if they have the same structure and add it to
+ // vector if they match.
+ SameStructure =
+ IRSimilarityCandidate::compareStructure(*CandIt, *InnerCandIt);
+ if (!SameStructure)
+ continue;
+
+ CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum));
+ CurrentGroupPair->second.push_back(*InnerCandIt);
+ }
+ }
+}
+
+void IRSimilarityIdentifier::findCandidates(
+ std::vector<IRInstructionData *> &InstrList,
+ std::vector<unsigned> &IntegerMapping) {
+ SuffixTree ST(IntegerMapping);
+
+ std::vector<IRSimilarityCandidate> CandsForRepSubstring;
+ std::vector<SimilarityGroup> NewCandidateGroups;
+
+ DenseMap<unsigned, SimilarityGroup> StructuralGroups;
+
+ // Iterate over the subsequences found by the Suffix Tree to create
+ // IRSimilarityCandidates for each repeated subsequence and determine which
+ // instances are structurally similar to one another.
+ for (auto It = ST.begin(), Et = ST.end(); It != Et; ++It) {
+ createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, *It,
+ CandsForRepSubstring);
+
+ if (CandsForRepSubstring.size() < 2)
+ continue;
+
+ findCandidateStructures(CandsForRepSubstring, StructuralGroups);
+ for (std::pair<unsigned, SimilarityGroup> &Group : StructuralGroups)
+ // We only add the group if it contains more than one
+ // IRSimilarityCandidate. If there is only one, that means there is no
+ // other repeated subsequence with the same structure.
+ if (Group.second.size() > 1)
+ SimilarityCandidates->push_back(Group.second);
+
+ CandsForRepSubstring.clear();
+ StructuralGroups.clear();
+ NewCandidateGroups.clear();
+ }
+}
+
+SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
+ ArrayRef<std::unique_ptr<Module>> Modules) {
+ resetSimilarityCandidates();
+
+ std::vector<IRInstructionData *> InstrList;
+ std::vector<unsigned> IntegerMapping;
+
+ populateMapper(Modules, InstrList, IntegerMapping);
+ findCandidates(InstrList, IntegerMapping);
+
+ return SimilarityCandidates.getValue();
+}
+
+SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
+ resetSimilarityCandidates();
+
+ std::vector<IRInstructionData *> InstrList;
+ std::vector<unsigned> IntegerMapping;
+
+ populateMapper(M, InstrList, IntegerMapping);
+ findCandidates(InstrList, IntegerMapping);
+
+ return SimilarityCandidates.getValue();
+}
Mapper.convertToUnsignedVec(BB, InstrList, UnsignedVec);
}
-// Checks that different opcodes are mapped to different values.
+void getSimilarities(
+ Module &M,
+ std::vector<std::vector<IRSimilarityCandidate>> &SimilarityCandidates) {
+ IRSimilarityIdentifier Identifier;
+ SimilarityCandidates = Identifier.findSimilarity(M);
+}
+
+// Checks that different opcodes are mapped to different values
TEST(IRInstructionMapper, OpcodeDifferentiation) {
StringRef ModuleString = R"(
define i32 @f(i32 %a, i32 %b) {
ASSERT_TRUE(longSimCandCompare(InstrList, true));
}
+
+// Checks that two sets of identical instructions are found to be the same.
+// Both sequences of adds have the same operand ordering, and the same
+// instructions, making them strcturally equivalent.
+TEST(IRSimilarityIdentifier, IdentitySimilarity) {
+ StringRef ModuleString = R"(
+ define i32 @f(i32 %a, i32 %b) {
+ bb0:
+ %0 = add i32 %a, %b
+ %1 = sub i32 %b, %a
+ br label %bb1
+ bb1:
+ %2 = add i32 %a, %b
+ %3 = sub i32 %b, %a
+ ret i32 0
+ })";
+ LLVMContext Context;
+ std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+ std::vector<std::vector<IRSimilarityCandidate>> SimilarityCandidates;
+ getSimilarities(*M, SimilarityCandidates);
+
+ ASSERT_TRUE(SimilarityCandidates.size() == 1);
+ for (std::vector<IRSimilarityCandidate> &Cands : SimilarityCandidates) {
+ ASSERT_TRUE(Cands.size() == 2);
+ unsigned InstIdx = 0;
+ for (IRSimilarityCandidate &Cand : Cands) {
+ ASSERT_TRUE(Cand.getStartIdx() == InstIdx);
+ InstIdx += 3;
+ }
+ }
+}
+
+// Checks that incorrect sequences are not found as similar. In this case,
+// we have different sequences of instructions.
+TEST(IRSimilarityIdentifier, InstructionDifference) {
+ StringRef ModuleString = R"(
+ define i32 @f(i32 %a, i32 %b, i32 %c, i32 %d) {
+ bb0:
+ %0 = sub i32 %a, %b
+ %1 = add i32 %b, %a
+ br label %bb1
+ bb1:
+ %2 = add i32 %c, %d
+ %3 = sub i32 %d, %c
+ ret i32 0
+ })";
+ LLVMContext Context;
+ std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+ std::vector<std::vector<IRSimilarityCandidate>> SimilarityCandidates;
+ getSimilarities(*M, SimilarityCandidates);
+
+ ASSERT_TRUE(SimilarityCandidates.size() == 0);
+}
+
+// This test checks to see whether we can detect similarity for commutativen
+// instructions where the operands have been reversed. Right now, we cannot.
+TEST(IRSimilarityIdentifier, CommutativeSimilarity) {
+ StringRef ModuleString = R"(
+ define i32 @f(i32 %a, i32 %b) {
+ bb0:
+ %0 = add i32 %a, %b
+ %1 = add i32 %b, %a
+ br label %bb1
+ bb1:
+ %2 = add i32 %a, %b
+ %3 = add i32 %a, %b
+ ret i32 0
+ })";
+ LLVMContext Context;
+ std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+ std::vector<std::vector<IRSimilarityCandidate>> SimilarityCandidates;
+ getSimilarities(*M, SimilarityCandidates);
+
+ ASSERT_TRUE(SimilarityCandidates.size() == 0);
+}
+
+// Check that we are not finding commutative similarity in non commutative
+// instructions. That is, while the instruction and operands used are the same
+// in the two subtraction sequences, they cannot be counted as the same since
+// a subtraction is not commutative.
+TEST(IRSimilarityIdentifier, NonCommutativeDifference) {
+ StringRef ModuleString = R"(
+ define i32 @f(i32 %a, i32 %b) {
+ bb0:
+ %0 = sub i32 %a, %b
+ %1 = sub i32 %b, %a
+ br label %bb1
+ bb1:
+ %2 = sub i32 %a, %b
+ %3 = sub i32 %a, %b
+ ret i32 0
+ })";
+ LLVMContext Context;
+ std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+ std::vector<std::vector<IRSimilarityCandidate>> SimilarityCandidates;
+ getSimilarities(*M, SimilarityCandidates);
+
+ ASSERT_TRUE(SimilarityCandidates.size() == 0);
+}
+
+// Check that we find similarity despite changing the register names.
+TEST(IRSimilarityIdentifier, MappingSimilarity) {
+ StringRef ModuleString = R"(
+ define i32 @f(i32 %a, i32 %b, i32 %c, i32 %d) {
+ bb0:
+ %0 = add i32 %a, %b
+ %1 = sub i32 %b, %a
+ br label %bb1
+ bb1:
+ %2 = add i32 %c, %d
+ %3 = sub i32 %d, %c
+ ret i32 0
+ })";
+ LLVMContext Context;
+ std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+ std::vector<std::vector<IRSimilarityCandidate>> SimilarityCandidates;
+ getSimilarities(*M, SimilarityCandidates);
+
+ ASSERT_TRUE(SimilarityCandidates.size() == 1);
+ for (std::vector<IRSimilarityCandidate> &Cands : SimilarityCandidates) {
+ ASSERT_TRUE(Cands.size() == 2);
+ unsigned InstIdx = 0;
+ for (IRSimilarityCandidate &Cand : Cands) {
+ ASSERT_TRUE(Cand.getStartIdx() == InstIdx);
+ InstIdx += 3;
+ }
+ }
+}
+
+// Checks that constants are detected as the same operand in each use in the
+// sequences of instructions. Also checks that we can find structural
+// equivalence using constants. In this case the 1 has the same use pattern as
+// %a.
+TEST(IRSimilarityIdentifier, ConstantMappingSimilarity) {
+ StringRef ModuleString = R"(
+ define i32 @f(i32 %a, i32 %b) {
+ bb0:
+ %0 = add i32 1, %b
+ %1 = icmp sgt i32 %b, 1
+ br label %bb1
+ bb1:
+ %2 = add i32 %a, %b
+ %3 = icmp sgt i32 %b, %a
+ ret i32 0
+ })";
+ LLVMContext Context;
+ std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+ std::vector<std::vector<IRSimilarityCandidate>> SimilarityCandidates;
+ getSimilarities(*M, SimilarityCandidates);
+
+ ASSERT_TRUE(SimilarityCandidates.size() == 1);
+ for (std::vector<IRSimilarityCandidate> &Cands : SimilarityCandidates) {
+ ASSERT_TRUE(Cands.size() == 2);
+ unsigned InstIdx = 0;
+ for (IRSimilarityCandidate &Cand : Cands) {
+ ASSERT_TRUE(Cand.getStartIdx() == InstIdx);
+ InstIdx += 3;
+ }
+ }
+}
+
+// Check that constants are uniquely identified. i.e. two different constants
+// are not considered the same. This means that this should not find any
+// structural similarity.
+TEST(IRSimilarityIdentifier, ConstantMappingDifference) {
+ StringRef ModuleString = R"(
+ define i32 @f(i32 %a, i32 %b) {
+ bb0:
+ %0 = add i32 1, %b
+ %1 = icmp sgt i32 %b, 2
+ br label %bb1
+ bb1:
+ %2 = add i32 %a, %b
+ %3 = icmp slt i32 %a, %b
+ ret i32 0
+ })";
+ LLVMContext Context;
+ std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+ std::vector<std::vector<IRSimilarityCandidate>> SimilarityCandidates;
+ getSimilarities(*M, SimilarityCandidates);
+
+ ASSERT_TRUE(SimilarityCandidates.size() == 0);
+}