From 13906755427611fe2d1b1bf915e87b97ddffd236 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Mon, 19 Jun 2023 10:39:13 -0700 Subject: [PATCH] [SpecialCaseList] Remove TrigramIndex MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit `TrigramIndex` was added back in https://reviews.llvm.org/D27188 as an optimization to make `SpecialCaseList::match()` faster. I've found that `TrigramIndex` actually makes the function slower and it has no functional use, so we can remove it. I grabbed the list of queries passed to `SpecialCaseList::match()` on a random very large file (`AArch64ISelLowering.cpp`) and measured the runtime to call `match()` on all of them with [this line](https://github.com/llvm/llvm-project/blob/8e1f820bb4eadf5c0704818f6063e0db1006e32d/llvm/lib/Support/SpecialCaseList.cpp#L64) disabled and then enabled. ``` $ hyperfine --warmup 3 'GTEST_FILTER="SpecialCaseListTest.Large" USE_TRIGRAMS=1 build/unittests/Support/SupportTests' 'GTEST_FILTER="SpecialCaseListTest.Large" USE_TRIGRAMS=0 build/unittests/Support/SupportTests' Benchmark 1: GTEST_FILTER="SpecialCaseListTest.Large" USE_TRIGRAMS=1 build/unittests/Support/SupportTests Time (mean ± σ): 575.9 ms ± 20.3 ms [User: 573.1 ms, System: 2.7 ms] Range (min … max): 555.5 ms … 620.0 ms 10 runs Benchmark 2: GTEST_FILTER="SpecialCaseListTest.Large" USE_TRIGRAMS=0 build/unittests/Support/SupportTests Time (mean ± σ): 283.4 ms ± 6.7 ms [User: 280.3 ms, System: 3.0 ms] Range (min … max): 277.0 ms … 294.9 ms 10 runs Summary 'GTEST_FILTER="SpecialCaseListTest.Large" USE_TRIGRAMS=0 build/unittests/Support/SupportTests' ran 2.03 ± 0.09 times faster than 'GTEST_FILTER="SpecialCaseListTest.Large" USE_TRIGRAMS=1 build/unittests/Support/SupportTests' ``` Using `perf` I found that most of the runtime in `TrigramIndex::isDefinitelyOut()` comes from a division operation that seems to come from `std::unordered_map`: https://github.com/llvm/llvm-project/blob/8e1f820bb4eadf5c0704818f6063e0db1006e32d/llvm/include/llvm/Support/TrigramIndex.h#L62 Removing `TrigramIndex` will make it easier to potentially switch to using `GlobPattern` instead of a full regex for `SpecialCaseList`. See discussion in https://reviews.llvm.org/D152762 for details. Reviewed By: MaskRay, #sanitizers, vitalybuka Differential Revision: https://reviews.llvm.org/D153171 --- llvm/include/llvm/Support/SpecialCaseList.h | 5 +- llvm/include/llvm/Support/TrigramIndex.h | 67 ----------- llvm/lib/Support/CMakeLists.txt | 1 - llvm/lib/Support/SpecialCaseList.cpp | 3 - llvm/lib/Support/TrigramIndex.cpp | 107 ----------------- llvm/unittests/Support/CMakeLists.txt | 1 - llvm/unittests/Support/TrigramIndexTest.cpp | 131 --------------------- llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn | 1 - .../gn/secondary/llvm/unittests/Support/BUILD.gn | 1 - 9 files changed, 1 insertion(+), 316 deletions(-) delete mode 100644 llvm/include/llvm/Support/TrigramIndex.h delete mode 100644 llvm/lib/Support/TrigramIndex.cpp delete mode 100644 llvm/unittests/Support/TrigramIndexTest.cpp diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h index 0d56c4b..b6d1b56 100644 --- a/llvm/include/llvm/Support/SpecialCaseList.h +++ b/llvm/include/llvm/Support/SpecialCaseList.h @@ -54,7 +54,6 @@ #include "llvm/ADT/StringMap.h" #include "llvm/Support/Regex.h" -#include "llvm/Support/TrigramIndex.h" #include #include #include @@ -128,7 +127,6 @@ protected: private: StringMap Strings; - TrigramIndex Trigrams; std::vector, unsigned>> RegExes; }; @@ -155,5 +153,4 @@ protected: } // namespace llvm -#endif // LLVM_SUPPORT_SPECIALCASELIST_H - +#endif // LLVM_SUPPORT_SPECIALCASELIST_H diff --git a/llvm/include/llvm/Support/TrigramIndex.h b/llvm/include/llvm/Support/TrigramIndex.h deleted file mode 100644 index 0bfac49..0000000 --- a/llvm/include/llvm/Support/TrigramIndex.h +++ /dev/null @@ -1,67 +0,0 @@ -//===-- TrigramIndex.h - a heuristic for SpecialCaseList --------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -//===----------------------------------------------------------------------===// -// -// TrigramIndex implements a heuristic for SpecialCaseList that allows to -// filter out ~99% incoming queries when all regular expressions in the -// SpecialCaseList are simple wildcards with '*' and '.'. If rules are more -// complicated, the check is defeated and it will always pass the queries to a -// full regex. -// -// The basic idea is that in order for a wildcard to match a query, the query -// needs to have all trigrams which occur in the wildcard. We create a trigram -// index (trigram -> list of rules with it) and then count trigrams in the query -// for each rule. If the count for one of the rules reaches the expected value, -// the check passes the query to a regex. If none of the rules got enough -// trigrams, the check tells that the query is definitely not matched by any -// of the rules, and no regex matching is needed. -// A similar idea was used in Google Code Search as described in the blog post: -// https://swtch.com/~rsc/regexp/regexp4.html -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_SUPPORT_TRIGRAMINDEX_H -#define LLVM_SUPPORT_TRIGRAMINDEX_H - -#include "llvm/ADT/SmallVector.h" -#include -#include -#include - -namespace llvm { -class StringRef; - -class TrigramIndex { - public: - /// Inserts a new Regex into the index. - void insert(const std::string &Regex); - - /// Returns true, if special case list definitely does not have a line - /// that matches the query. Returns false, if it's not sure. - bool isDefinitelyOut(StringRef Query) const; - - /// Returned true, iff the heuristic is defeated and not useful. - /// In this case isDefinitelyOut always returns false. - bool isDefeated() { return Defeated; } - private: - // If true, the rules are too complicated for the check to work, and full - // regex matching is needed for every rule. - bool Defeated = false; - // The minimum number of trigrams which should match for a rule to have a - // chance to match the query. The number of elements equals the number of - // regex rules in the SpecialCaseList. - std::vector Counts; - // Index holds a list of rules indices for each trigram. The same indices - // are used in Counts to store per-rule limits. - // If a trigram is too common (>4 rules with it), we stop tracking it, - // which increases the probability for a need to match using regex, but - // decreases the costs in the regular case. - std::unordered_map> Index{256}; -}; - -} // namespace llvm - -#endif // LLVM_SUPPORT_TRIGRAMINDEX_H diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index e6bff11..976ae88d 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -226,7 +226,6 @@ add_llvm_component_library(LLVMSupport TimeProfiler.cpp Timer.cpp ToolOutputFile.cpp - TrigramIndex.cpp Twine.cpp TypeSize.cpp Unicode.cpp diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 4650347..64f66e0 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -37,7 +37,6 @@ bool SpecialCaseList::Matcher::insert(std::string Regexp, Strings[Regexp] = LineNumber; return true; } - Trigrams.insert(Regexp); // Replace * with .* for (size_t pos = 0; (pos = Regexp.find('*', pos)) != std::string::npos; @@ -61,8 +60,6 @@ unsigned SpecialCaseList::Matcher::match(StringRef Query) const { auto It = Strings.find(Query); if (It != Strings.end()) return It->second; - if (Trigrams.isDefinitelyOut(Query)) - return false; for (const auto &RegExKV : RegExes) if (RegExKV.first->match(Query)) return RegExKV.second; diff --git a/llvm/lib/Support/TrigramIndex.cpp b/llvm/lib/Support/TrigramIndex.cpp deleted file mode 100644 index 40a20cc..0000000 --- a/llvm/lib/Support/TrigramIndex.cpp +++ /dev/null @@ -1,107 +0,0 @@ -//===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// TrigramIndex implements a heuristic for SpecialCaseList that allows to -// filter out ~99% incoming queries when all regular expressions in the -// SpecialCaseList are simple wildcards with '*' and '.'. If rules are more -// complicated, the check is defeated and it will always pass the queries to a -// full regex. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/TrigramIndex.h" -#include "llvm/ADT/StringRef.h" -#include - -using namespace llvm; - -static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}"; - -static bool isAdvancedMetachar(unsigned Char) { - return strchr(RegexAdvancedMetachars, Char) != nullptr; -} - -void TrigramIndex::insert(const std::string &Regex) { - if (Defeated) return; - std::set Was; - unsigned Cnt = 0; - unsigned Tri = 0; - unsigned Len = 0; - bool Escaped = false; - for (unsigned Char : Regex) { - if (!Escaped) { - // Regular expressions allow escaping symbols by preceding it with '\'. - if (Char == '\\') { - Escaped = true; - continue; - } - if (isAdvancedMetachar(Char)) { - // This is a more complicated regex than we can handle here. - Defeated = true; - return; - } - if (Char == '.' || Char == '*') { - Tri = 0; - Len = 0; - continue; - } - } - if (Escaped && Char >= '1' && Char <= '9') { - Defeated = true; - return; - } - // We have already handled escaping and can reset the flag. - Escaped = false; - Tri = ((Tri << 8) + Char) & 0xFFFFFF; - Len++; - if (Len < 3) - continue; - // We don't want the index to grow too much for the popular trigrams, - // as they are weak signals. It's ok to still require them for the - // rules we have already processed. It's just a small additional - // computational cost. - if (Index[Tri].size() >= 4) - continue; - Cnt++; - if (!Was.count(Tri)) { - // Adding the current rule to the index. - Index[Tri].push_back(Counts.size()); - Was.insert(Tri); - } - } - if (!Cnt) { - // This rule does not have remarkable trigrams to rely on. - // We have to always call the full regex chain. - Defeated = true; - return; - } - Counts.push_back(Cnt); -} - -bool TrigramIndex::isDefinitelyOut(StringRef Query) const { - if (Defeated) - return false; - std::vector CurCounts(Counts.size()); - unsigned Tri = 0; - for (size_t I = 0; I < Query.size(); I++) { - Tri = ((Tri << 8) + Query[I]) & 0xFFFFFF; - if (I < 2) - continue; - const auto &II = Index.find(Tri); - if (II == Index.end()) - continue; - for (size_t J : II->second) { - CurCounts[J]++; - // If we have reached a desired limit, we have to look at the query - // more closely by running a full regex. - if (CurCounts[J] >= Counts[J]) - return false; - } - } - return true; -} diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt index 9c2c62b..e828640 100644 --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -89,7 +89,6 @@ add_llvm_unittest(SupportTests TypeSizeTest.cpp TypeTraitsTest.cpp TrailingObjectsTest.cpp - TrigramIndexTest.cpp UnicodeTest.cpp VersionTupleTest.cpp VirtualFileSystemTest.cpp diff --git a/llvm/unittests/Support/TrigramIndexTest.cpp b/llvm/unittests/Support/TrigramIndexTest.cpp deleted file mode 100644 index 42b3fcd..0000000 --- a/llvm/unittests/Support/TrigramIndexTest.cpp +++ /dev/null @@ -1,131 +0,0 @@ -//===- TrigramIndexTest.cpp - Unit tests for TrigramIndex -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/TrigramIndex.h" -#include "llvm/ADT/STLExtras.h" -#include "gtest/gtest.h" - -#include -#include - -using namespace llvm; - -namespace { - -class TrigramIndexTest : public ::testing::Test { -protected: - std::unique_ptr makeTrigramIndex( - std::vector Rules) { - std::unique_ptr TI = - std::make_unique(); - for (auto &Rule : Rules) - TI->insert(Rule); - return TI; - } -}; - -TEST_F(TrigramIndexTest, Empty) { - std::unique_ptr TI = - makeTrigramIndex({}); - EXPECT_FALSE(TI->isDefeated()); - EXPECT_TRUE(TI->isDefinitelyOut("foo")); -} - -TEST_F(TrigramIndexTest, Basic) { - std::unique_ptr TI = - makeTrigramIndex({"*hello*", "*wor.d*"}); - EXPECT_FALSE(TI->isDefeated()); - EXPECT_TRUE(TI->isDefinitelyOut("foo")); -} - -TEST_F(TrigramIndexTest, NoTrigramsInRules) { - std::unique_ptr TI = - makeTrigramIndex({"b.r", "za*az"}); - EXPECT_TRUE(TI->isDefeated()); - EXPECT_FALSE(TI->isDefinitelyOut("foo")); - EXPECT_FALSE(TI->isDefinitelyOut("bar")); - EXPECT_FALSE(TI->isDefinitelyOut("zakaz")); -} - -TEST_F(TrigramIndexTest, NoTrigramsInARule) { - std::unique_ptr TI = - makeTrigramIndex({"*hello*", "*wo.ld*"}); - EXPECT_TRUE(TI->isDefeated()); - EXPECT_FALSE(TI->isDefinitelyOut("foo")); -} - -TEST_F(TrigramIndexTest, RepetitiveRule) { - std::unique_ptr TI = - makeTrigramIndex({"*bar*bar*bar*bar*bar", "bar*bar"}); - EXPECT_FALSE(TI->isDefeated()); - EXPECT_TRUE(TI->isDefinitelyOut("foo")); - EXPECT_TRUE(TI->isDefinitelyOut("bar")); - EXPECT_FALSE(TI->isDefinitelyOut("barbara")); - EXPECT_FALSE(TI->isDefinitelyOut("bar+bar")); -} - -TEST_F(TrigramIndexTest, PopularTrigram) { - std::unique_ptr TI = - makeTrigramIndex({"*aaa*", "*aaaa*", "*aaaaa*", "*aaaaa*", "*aaaaaa*"}); - EXPECT_TRUE(TI->isDefeated()); -} - -TEST_F(TrigramIndexTest, PopularTrigram2) { - std::unique_ptr TI = - makeTrigramIndex({"class1.h", "class2.h", "class3.h", "class4.h", "class.h"}); - EXPECT_TRUE(TI->isDefeated()); -} - -TEST_F(TrigramIndexTest, TooComplicatedRegex) { - std::unique_ptr TI = - makeTrigramIndex({"[0-9]+"}); - EXPECT_TRUE(TI->isDefeated()); -} - -TEST_F(TrigramIndexTest, TooComplicatedRegex2) { - std::unique_ptr TI = - makeTrigramIndex({"foo|bar"}); - EXPECT_TRUE(TI->isDefeated()); -} - -TEST_F(TrigramIndexTest, EscapedSymbols) { - std::unique_ptr TI = - makeTrigramIndex({"*c\\+\\+*", "*hello\\\\world*", "a\\tb", "a\\0b"}); - EXPECT_FALSE(TI->isDefeated()); - EXPECT_FALSE(TI->isDefinitelyOut("c++")); - EXPECT_TRUE(TI->isDefinitelyOut("c\\+\\+")); - EXPECT_FALSE(TI->isDefinitelyOut("hello\\world")); - EXPECT_TRUE(TI->isDefinitelyOut("hello\\\\world")); - EXPECT_FALSE(TI->isDefinitelyOut("atb")); - EXPECT_TRUE(TI->isDefinitelyOut("a\\tb")); - EXPECT_TRUE(TI->isDefinitelyOut("a\tb")); - EXPECT_FALSE(TI->isDefinitelyOut("a0b")); -} - -TEST_F(TrigramIndexTest, Backreference1) { - std::unique_ptr TI = - makeTrigramIndex({"*foo\\1*"}); - EXPECT_TRUE(TI->isDefeated()); -} - -TEST_F(TrigramIndexTest, Backreference2) { - std::unique_ptr TI = - makeTrigramIndex({"*foo\\2*"}); - EXPECT_TRUE(TI->isDefeated()); -} - -TEST_F(TrigramIndexTest, Sequence) { - std::unique_ptr TI = - makeTrigramIndex({"class1.h", "class2.h", "class3.h", "class4.h"}); - EXPECT_FALSE(TI->isDefeated()); - EXPECT_FALSE(TI->isDefinitelyOut("class1")); - EXPECT_TRUE(TI->isDefinitelyOut("class.h")); - EXPECT_TRUE(TI->isDefinitelyOut("class")); -} - -} // namespace diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn index 8c1a29a..989e805 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn @@ -141,7 +141,6 @@ static_library("Support") { "TimeProfiler.cpp", "Timer.cpp", "ToolOutputFile.cpp", - "TrigramIndex.cpp", "Twine.cpp", "TypeSize.cpp", "Unicode.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn index 7f17a69..5143c36 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn @@ -89,7 +89,6 @@ unittest("SupportTests") { "TimerTest.cpp", "ToolOutputFileTest.cpp", "TrailingObjectsTest.cpp", - "TrigramIndexTest.cpp", "TypeNameTest.cpp", "TypeSizeTest.cpp", "TypeTraitsTest.cpp", -- 2.7.4