From 75453b057b6b76889ff3a5bca99f8099c628b389 Mon Sep 17 00:00:00 2001
From: Ivan Krasin
Date: Fri, 2 Dec 2016 23:30:16 +0000
Subject: [PATCH] Support escaping in TrigramIndex.

Summary:
This is a follow up to r288303, where I have introduced TrigramIndex
to speed up SpecialCaseList for the cases when all rules are
simple wildcards, like *hello*wor.d*.
Here, I add support for escaping, so that it's possible to
specify rules like *c\+\+abi*.

Reviewers: pcc

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D27318

llvm-svn: 288553
---
 llvm/lib/Support/TrigramIndex.cpp              | 37 +++++++++++++++++---------
 llvm/unittests/Support/SpecialCaseListTest.cpp | 11 ++++++++
 llvm/unittests/Support/TrigramIndexTest.cpp    | 24 +++++++++++++++--
 3 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Support/TrigramIndex.cpp b/llvm/lib/Support/TrigramIndex.cpp
index bba996e..85ab528 100644
--- a/llvm/lib/Support/TrigramIndex.cpp
+++ b/llvm/lib/Support/TrigramIndex.cpp
@@ -26,28 +26,41 @@ using namespace llvm;
 
 static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}";
 
-static bool isSimpleWildcard(StringRef Str) {
-  // Check for regex metacharacters other than '*' and '.'.
-  return Str.find_first_of(RegexAdvancedMetachars) == StringRef::npos;
+static bool isAdvancedMetachar(unsigned Char) {
+  return strchr(RegexAdvancedMetachars, Char) != nullptr;
 }
 
 void TrigramIndex::insert(std::string Regex) {
   if (Defeated) return;
-  if (!isSimpleWildcard(Regex)) {
-    Defeated = true;
-    return;
-  }
-
   std::set<unsigned> Was;
   unsigned Cnt = 0;
   unsigned Tri = 0;
   unsigned Len = 0;
+  bool Escaped = false;
   for (unsigned Char : Regex) {
-    if (Char == '.' || Char == '*') {
-      Tri = 0;
-      Len = 0;
-      continue;
+    if (!Escaped) {
+      // Regular expressions allow escaping symbols by preceding it with '\'.
+      if (Char == '\\') {
+        Escaped = true;
+        continue;
+      }
+      if (isAdvancedMetachar(Char)) {
+        // This is a more complicated regex than we can handle here.
+        Defeated = true;
+        return;
+      }
+      if (Char == '.' || Char == '*') {
+        Tri = 0;
+        Len = 0;
+        continue;
+      }
+    }
+    if (Escaped && Char >= '1' && Char <= '9') {
+      Defeated = true;
+      return;
     }
+    // We have already handled escaping and can reset the flag.
+    Escaped = false;
     Tri = ((Tri << 8) + Char) & 0xFFFFFF;
     Len++;
     if (Len < 3)
diff --git a/llvm/unittests/Support/SpecialCaseListTest.cpp b/llvm/unittests/Support/SpecialCaseListTest.cpp
index 4647499..e86eecb 100644
--- a/llvm/unittests/Support/SpecialCaseListTest.cpp
+++ b/llvm/unittests/Support/SpecialCaseListTest.cpp
@@ -178,4 +178,15 @@ TEST_F(SpecialCaseListTest, PopularTrigram) {
   EXPECT_TRUE(SCL->inSection("fun", "aaaabbbaaa"));
 }
 
+TEST_F(SpecialCaseListTest, EscapedSymbols) {
+  std::unique_ptr<SpecialCaseList> SCL = makeSpecialCaseList("src:*c\\+\\+abi*\n"
+                                                             "src:*hello\\\\world*\n");
+  EXPECT_TRUE(SCL->inSection("src", "dir/c++abi"));
+  EXPECT_FALSE(SCL->inSection("src", "dir/c\\+\\+abi"));
+  EXPECT_FALSE(SCL->inSection("src", "c\\+\\+abi"));
+  EXPECT_TRUE(SCL->inSection("src", "C:\\hello\\world"));
+  EXPECT_TRUE(SCL->inSection("src", "hello\\world"));
+  EXPECT_FALSE(SCL->inSection("src", "hello\\\\world"));
+}
+
 }
diff --git a/llvm/unittests/Support/TrigramIndexTest.cpp b/llvm/unittests/Support/TrigramIndexTest.cpp
index 9f61e7e..fb0ad17 100644
--- a/llvm/unittests/Support/TrigramIndexTest.cpp
+++ b/llvm/unittests/Support/TrigramIndexTest.cpp
@@ -94,9 +94,29 @@ TEST_F(TrigramIndexTest, TooComplicatedRegex2) {
   EXPECT_TRUE(TI->isDefeated());
 }
 
-TEST_F(TrigramIndexTest, SpecialSymbol) {
+TEST_F(TrigramIndexTest, EscapedSymbols) {
   std::unique_ptr<TrigramIndex> TI =
-      makeTrigramIndex({"*c\\+\\+*"});
+      makeTrigramIndex({"*c\\+\\+*", "*hello\\\\world*", "a\\tb", "a\\0b"});
+  EXPECT_FALSE(TI->isDefeated());
+  EXPECT_FALSE(TI->isDefinitelyOut("c++"));
+  EXPECT_TRUE(TI->isDefinitelyOut("c\\+\\+"));
+  EXPECT_FALSE(TI->isDefinitelyOut("hello\\world"));
+  EXPECT_TRUE(TI->isDefinitelyOut("hello\\\\world"));
+  EXPECT_FALSE(TI->isDefinitelyOut("atb"));
+  EXPECT_TRUE(TI->isDefinitelyOut("a\\tb"));
+  EXPECT_TRUE(TI->isDefinitelyOut("a\tb"));
+  EXPECT_FALSE(TI->isDefinitelyOut("a0b"));
+}
+
+TEST_F(TrigramIndexTest, Backreference1) {
+  std::unique_ptr<TrigramIndex> TI =
+      makeTrigramIndex({"*foo\\1*"});
+  EXPECT_TRUE(TI->isDefeated());
+}
+
+TEST_F(TrigramIndexTest, Backreference2) {
+  std::unique_ptr<TrigramIndex> TI =
+      makeTrigramIndex({"*foo\\2*"});
   EXPECT_TRUE(TI->isDefeated());
 }
 
-- 
2.7.4