Support escaping in TrigramIndex.
author: Ivan Krasin <krasin@chromium.org>
Fri, 2 Dec 2016 23:30:16 +0000 (23:30 +0000)
committer: Ivan Krasin <krasin@chromium.org>
Fri, 2 Dec 2016 23:30:16 +0000 (23:30 +0000)
Summary:
This is a follow-up to r288303, which introduced TrigramIndex
to speed up SpecialCaseList in cases where all rules are
simple wildcards, such as *hello*wor.d*.

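This change adds support for escaping, so that it is possible to
specify rules like *c\+\+abi*. A character preceded by '\' is indexed
as a literal character, except that an escaped digit 1-9 (a
backreference such as \1) still defeats the index.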

Reviewers: pcc

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D27318

llvm-svn: 288553

llvm/lib/Support/TrigramIndex.cpp
llvm/unittests/Support/SpecialCaseListTest.cpp
llvm/unittests/Support/TrigramIndexTest.cpp

index bba996e..85ab528 100644 (file)
@@ -26,28 +26,41 @@ using namespace llvm;
 
 static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}";
 
-static bool isSimpleWildcard(StringRef Str) {
-  // Check for regex metacharacters other than '*' and '.'.
-  return Str.find_first_of(RegexAdvancedMetachars) == StringRef::npos;
+static bool isAdvancedMetachar(unsigned Char) {
+  return strchr(RegexAdvancedMetachars, Char) != nullptr;
 }
 
 void TrigramIndex::insert(std::string Regex) {
   if (Defeated) return;
-  if (!isSimpleWildcard(Regex)) {
-    Defeated = true;
-    return;
-  }
-
   std::set<unsigned> Was;
   unsigned Cnt = 0;
   unsigned Tri = 0;
   unsigned Len = 0;
+  bool Escaped = false;
   for (unsigned Char : Regex) {
-    if (Char == '.' || Char == '*') {
-      Tri = 0;
-      Len = 0;
-      continue;
+    if (!Escaped) {
+      // Regular expressions allow escaping a symbol by preceding it with '\'.
+      if (Char == '\\') {
+        Escaped = true;
+        continue;
+      }
+      if (isAdvancedMetachar(Char)) {
+        // This is a more complicated regex than we can handle here.
+        Defeated = true;
+        return;
+      }
+      if (Char == '.' || Char == '*') {
+        Tri = 0;
+        Len = 0;
+        continue;
+      }
+    }
+    if (Escaped && Char >= '1' && Char <= '9') {
+      Defeated = true;
+      return;
     }
+    // We have already handled escaping and can reset the flag.
+    Escaped = false;
     Tri = ((Tri << 8) + Char) & 0xFFFFFF;
     Len++;
     if (Len < 3)
index 4647499..e86eecb 100644 (file)
@@ -178,4 +178,15 @@ TEST_F(SpecialCaseListTest, PopularTrigram) {
   EXPECT_TRUE(SCL->inSection("fun", "aaaabbbaaa"));
 }
 
+TEST_F(SpecialCaseListTest, EscapedSymbols) {
+  std::unique_ptr<SpecialCaseList> SCL = makeSpecialCaseList("src:*c\\+\\+abi*\n"
+                                                             "src:*hello\\\\world*\n");
+  EXPECT_TRUE(SCL->inSection("src", "dir/c++abi"));
+  EXPECT_FALSE(SCL->inSection("src", "dir/c\\+\\+abi"));
+  EXPECT_FALSE(SCL->inSection("src", "c\\+\\+abi"));
+  EXPECT_TRUE(SCL->inSection("src", "C:\\hello\\world"));
+  EXPECT_TRUE(SCL->inSection("src", "hello\\world"));
+  EXPECT_FALSE(SCL->inSection("src", "hello\\\\world"));
+}
+
 }
index 9f61e7e..fb0ad17 100644 (file)
@@ -94,9 +94,29 @@ TEST_F(TrigramIndexTest, TooComplicatedRegex2) {
   EXPECT_TRUE(TI->isDefeated());
 }
 
-TEST_F(TrigramIndexTest, SpecialSymbol) {
+TEST_F(TrigramIndexTest, EscapedSymbols) {
   std::unique_ptr<TrigramIndex> TI =
-      makeTrigramIndex({"*c\\+\\+*"});
+      makeTrigramIndex({"*c\\+\\+*", "*hello\\\\world*", "a\\tb", "a\\0b"});
+  EXPECT_FALSE(TI->isDefeated());
+  EXPECT_FALSE(TI->isDefinitelyOut("c++"));
+  EXPECT_TRUE(TI->isDefinitelyOut("c\\+\\+"));
+  EXPECT_FALSE(TI->isDefinitelyOut("hello\\world"));
+  EXPECT_TRUE(TI->isDefinitelyOut("hello\\\\world"));
+  EXPECT_FALSE(TI->isDefinitelyOut("atb"));
+  EXPECT_TRUE(TI->isDefinitelyOut("a\\tb"));
+  EXPECT_TRUE(TI->isDefinitelyOut("a\tb"));
+  EXPECT_FALSE(TI->isDefinitelyOut("a0b"));
+}
+
+TEST_F(TrigramIndexTest, Backreference1) {
+  std::unique_ptr<TrigramIndex> TI =
+      makeTrigramIndex({"*foo\\1*"});
+  EXPECT_TRUE(TI->isDefeated());
+}
+
+TEST_F(TrigramIndexTest, Backreference2) {
+  std::unique_ptr<TrigramIndex> TI =
+      makeTrigramIndex({"*foo\\2*"});
   EXPECT_TRUE(TI->isDefeated());
 }