From 77a719cb9ebc01db7d6936d1b4dd2bfcfa38b39b Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Mon, 5 Mar 2018 17:34:33 +0000 Subject: [PATCH] [clangd] Fix unintentionally loose fuzzy matching, and the tests masking it. Summary: The intent was that [ar] doesn't match "FooBar"; the first character must match a Head character (hard requirement, not just a low score). This matches VSCode, and was "tested" but the tests were defective. The tests expected matches("FooBar") to fail for lack of a match. But instead it fails because the string should be annotated - matches("FooB[ar]"). This patch makes matches("FooBar") ignore annotations, as was intended. Fixing the code to reject weak matches for the first char causes problems: - [bre] no longer matches "HTMLBRElement". We allow matching against an uppercase char even if we don't think it's head. Only do this if there's at least one lowercase, to avoid triggering on MACROS - [print] no longer matches "sprintf". This is hard to fix without false positives (e.g. [int] vs "sprintf"]) This patch leaves this case broken. A future patch will add a dictionary providing custom segmentation to common names from the standard library. Fixed a couple of index tests that indirectly relied on broken fuzzy matching. Added const in a couple of missing places for consistency with new code. Subscribers: klimek, ilya-biryukov, jkorous-apple, ioeric, cfe-commits Differential Revision: https://reviews.llvm.org/D44003 llvm-svn: 326721 --- clang-tools-extra/clangd/FuzzyMatch.cpp | 51 ++++++++++++++++------ clang-tools-extra/clangd/FuzzyMatch.h | 10 +++-- .../unittests/clangd/FuzzyMatchTests.cpp | 34 ++++++++++----- clang-tools-extra/unittests/clangd/IndexTests.cpp | 28 +++++------- 4 files changed, 76 insertions(+), 47 deletions(-) diff --git a/clang-tools-extra/clangd/FuzzyMatch.cpp b/clang-tools-extra/clangd/FuzzyMatch.cpp index 68ba695..152f179 100644 --- a/clang-tools-extra/clangd/FuzzyMatch.cpp +++ b/clang-tools-extra/clangd/FuzzyMatch.cpp @@ -33,6 +33,8 @@ // Legal if the characters match. // - Moving down (consuming a pattern character) is never legal. // Never legal: all pattern characters must match something. +// Characters are matched case-insensitively. +// The first pattern character may only match the start of a word segment. // // The scoring is based on heuristics: // - when matching a character, apply a bonus or penalty depending on the @@ -74,13 +76,11 @@ static bool isAwful(int S) { return S < AwfulScore / 2; } static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score. FuzzyMatcher::FuzzyMatcher(StringRef Pattern) - : PatN(std::min(MaxPat, Pattern.size())), CaseSensitive(false), + : PatN(std::min(MaxPat, Pattern.size())), ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) { std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat); - for (int I = 0; I < PatN; ++I) { + for (int I = 0; I < PatN; ++I) LowPat[I] = lower(Pat[I]); - CaseSensitive |= LowPat[I] != Pat[I]; - } Scores[0][0][Miss] = {0, Miss}; Scores[0][0][Match] = {AwfulScore, Miss}; for (int P = 0; P <= PatN; ++P) @@ -88,7 +88,7 @@ FuzzyMatcher::FuzzyMatcher(StringRef Pattern) for (Action A : {Miss, Match}) Scores[P][W][A] = {AwfulScore, Miss}; if (PatN > 0) - calculateRoles(Pat, PatRole, PatN); + calculateRoles(Pat, PatRole, PatTypeSet, PatN); } Optional FuzzyMatcher::match(StringRef Word) { @@ -177,16 +177,21 @@ constexpr static uint8_t CharRoles[] = { template static T packedLookup(const uint8_t *Data, int I) { return static_cast((Data[I >> 2] >> ((I & 3) * 2)) & 3); } -void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) { +void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int &TypeSet, + int N) { assert(N > 0); + CharType Type = packedLookup(CharTypes, Text[0]); + TypeSet = 1 << Type; // Types holds a sliding window of (Prev, Curr, Next) types. // Initial value is (Empty, Empty, type of Text[0]). - int Types = packedLookup(CharTypes, Text[0]); + int Types = Type; // Rotate slides in the type of the next character. auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; }; for (int I = 0; I < N - 1; ++I) { // For each character, rotate in the next, and look up the role. - Rotate(packedLookup(CharTypes, Text[I + 1])); + Type = packedLookup(CharTypes, Text[I + 1]); + TypeSet |= 1 << Type; + Rotate(Type); *Out++ = packedLookup(CharRoles, Types); } // For the last character, the "next character" is Empty. @@ -214,7 +219,10 @@ bool FuzzyMatcher::init(StringRef NewWord) { ++P; } - calculateRoles(Word, WordRole, WordN); + // FIXME: some words are hard to tokenize algorithmically. + // e.g. vsprintf is V S Print F, and should match [pri] but not [int]. + // We could add a tokenization dictionary for common stdlib names. + calculateRoles(Word, WordRole, WordTypeSet, WordN); return true; } @@ -247,7 +255,7 @@ void FuzzyMatcher::buildGraph() { ? ScoreInfo{MatchMissScore, Match} : ScoreInfo{MissMissScore, Miss}; - if (LowPat[P] != LowWord[W]) { // No match possible. + if (!allowMatch(P, W)) { Score[Match] = {AwfulScore, Miss}; } else { auto &PreMatch = Scores[P][W]; @@ -261,7 +269,22 @@ void FuzzyMatcher::buildGraph() { } } -int FuzzyMatcher::skipPenalty(int W, Action Last) { +bool FuzzyMatcher::allowMatch(int P, int W) const { + if (LowPat[P] != LowWord[W]) + return false; + // We require a "strong" match for the first pattern character only. + if (P > 0) + return true; + // Obvious "strong match" for first char: match against a word head. + // We're banning matches outright, so conservatively accept some other cases + // where our segmentation might be wrong: + // - allow matching B in ABCDef (but not in NDEBUG) + // - we'd like to accept print in sprintf, but too many false positives + return WordRole[W] != Tail || + (Word[W] != LowWord[W] && WordTypeSet & 1 << Lower); +} + +int FuzzyMatcher::skipPenalty(int W, Action Last) const { int S = 0; if (WordRole[W] == Head) // Skipping a segment. S += 1; @@ -270,14 +293,14 @@ int FuzzyMatcher::skipPenalty(int W, Action Last) { return S; } -int FuzzyMatcher::matchBonus(int P, int W, Action Last) { +int FuzzyMatcher::matchBonus(int P, int W, Action Last) const { assert(LowPat[P] == LowWord[W]); int S = 1; // Bonus: pattern so far is a (case-insensitive) prefix of the word. if (P == W) // We can't skip pattern characters, so we must have matched all. ++S; // Bonus: case matches, or a Head in the pattern aligns with one in the word. - if ((Pat[P] == Word[W] && (CaseSensitive || P == W)) || + if ((Pat[P] == Word[W] && ((PatTypeSet & 1 << Upper) || P == W)) || (PatRole[P] == Head && WordRole[W] == Head)) ++S; // Penalty: matching inside a segment (and previous char wasn't matched). @@ -312,7 +335,7 @@ llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const { Scores[PatN][WordN][Miss].Score))) { OS << "Substring check passed, but all matches are forbidden\n"; } - if (!CaseSensitive) + if (!(PatTypeSet & 1 << Upper)) OS << "Lowercase query, so scoring ignores case\n"; // Traverse Matched table backwards to reconstruct the Pattern/Word mapping. diff --git a/clang-tools-extra/clangd/FuzzyMatch.h b/clang-tools-extra/clangd/FuzzyMatch.h index 21bf92c..13a25e2 100644 --- a/clang-tools-extra/clangd/FuzzyMatch.h +++ b/clang-tools-extra/clangd/FuzzyMatch.h @@ -56,16 +56,17 @@ private: bool init(llvm::StringRef Word); void buildGraph(); - void calculateRoles(const char *Text, CharRole *Out, int N); - int skipPenalty(int W, Action Last); - int matchBonus(int P, int W, Action Last); + void calculateRoles(const char *Text, CharRole *Out, int &Types, int N); + bool allowMatch(int P, int W) const; + int skipPenalty(int W, Action Last) const; + int matchBonus(int P, int W, Action Last) const; // Pattern data is initialized by the constructor, then constant. char Pat[MaxPat]; // Pattern data int PatN; // Length char LowPat[MaxPat]; // Pattern in lowercase CharRole PatRole[MaxPat]; // Pattern segmentation info - bool CaseSensitive; // Case-sensitive match if pattern has uppercase + int PatTypeSet; // Bitmask of 1<size()) + Annotated = None; + } + bool accepts(StringRef ActualAnnotated) const { + return !Annotated || ActualAnnotated == *Annotated; } std::string Word; - StringRef Annotated; + + friend raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) { + return OS << "'" << M.Word; + if (M.Annotated) + OS << "' as " << *M.Annotated; + } + +private: + Optional Annotated; }; -raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) { - return OS << "'" << M.Word << "' as " << M.Annotated; -} struct MatchesMatcher : public testing::MatcherInterface { ExpectedMatch Candidate; @@ -47,7 +58,7 @@ struct MatchesMatcher : public testing::MatcherInterface { FuzzyMatcher Matcher(Pattern); auto Result = Matcher.match(Candidate.Word); auto AnnotatedMatch = Matcher.dumpLast(*OS << "\n"); - return Result && AnnotatedMatch == Candidate.Annotated; + return Result && Candidate.accepts(AnnotatedMatch); } }; @@ -122,6 +133,7 @@ TEST(FuzzyMatch, Matches) { EXPECT_THAT("sl", matches("[S]Visual[L]oggerLogsList")); EXPECT_THAT("sllll", matches("[S]Visua[lL]ogger[L]ogs[L]ist")); EXPECT_THAT("Three", matches("H[T]ML[HRE]l[e]ment")); + EXPECT_THAT("b", Not(matches("NDEBUG"))); EXPECT_THAT("Three", matches("[Three]")); EXPECT_THAT("fo", Not(matches("barfoo"))); EXPECT_THAT("fo", matches("bar_[fo]o")); @@ -151,8 +163,9 @@ TEST(FuzzyMatch, Matches) { EXPECT_THAT("g", matches("zz[G]roup")); EXPECT_THAT("aaaa", matches("_a_[aaaa]")); // Prefer consecutive. - EXPECT_THAT("printf", matches("s[printf]")); - EXPECT_THAT("str", matches("o[str]eam")); + // These would ideally match, but would need special segmentation rules. + EXPECT_THAT("printf", Not(matches("s[printf]"))); + EXPECT_THAT("str", Not(matches("o[str]eam"))); } struct RankMatcher : public testing::MatcherInterface { @@ -188,9 +201,8 @@ struct RankMatcher : public testing::MatcherInterface { llvm::raw_string_ostream Info(Buf); auto AnnotatedMatch = Matcher.dumpLast(Info); - if (AnnotatedMatch != Str.Annotated) { - *OS << "\nMatched " << Str.Word << " as " << AnnotatedMatch - << " instead of " << Str.Annotated << "\n" + if (!Str.accepts(AnnotatedMatch)) { + *OS << "\nDoesn't match " << Str << ", but " << AnnotatedMatch << "\n" << Info.str(); Ok = false; } else if (LastScore && *LastScore < *Score) { diff --git a/clang-tools-extra/unittests/clangd/IndexTests.cpp b/clang-tools-extra/unittests/clangd/IndexTests.cpp index 3c61533..20eb452 100644 --- a/clang-tools-extra/unittests/clangd/IndexTests.cpp +++ b/clang-tools-extra/unittests/clangd/IndexTests.cpp @@ -116,14 +116,6 @@ TEST(MemIndexTest, MemIndexSymbolsRecycled) { EXPECT_TRUE(Symbols.expired()); } -TEST(MemIndexTest, MemIndexMatchSubstring) { - MemIndex I; - I.build(generateNumSymbols(5, 25)); - FuzzyFindRequest Req; - Req.Query = "5"; - EXPECT_THAT(match(I, Req), UnorderedElementsAre("5", "15", "25")); -} - TEST(MemIndexTest, MemIndexDeduplicate) { auto Symbols = generateNumSymbols(0, 10); @@ -166,46 +158,46 @@ TEST(MemIndexTest, FuzzyMatch) { TEST(MemIndexTest, MatchQualifiedNamesWithoutSpecificScope) { MemIndex I; - I.build(generateSymbols({"a::xyz", "b::yz", "yz"})); + I.build(generateSymbols({"a::y1", "b::y2", "y3"})); FuzzyFindRequest Req; Req.Query = "y"; - EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "b::yz", "yz")); + EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3")); } TEST(MemIndexTest, MatchQualifiedNamesWithGlobalScope) { MemIndex I; - I.build(generateSymbols({"a::xyz", "b::yz", "yz"})); + I.build(generateSymbols({"a::y1", "b::y2", "y3"})); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {""}; - EXPECT_THAT(match(I, Req), UnorderedElementsAre("yz")); + EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3")); } TEST(MemIndexTest, MatchQualifiedNamesWithOneScope) { MemIndex I; - I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"})); + I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"})); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {"a"}; - EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy")); + EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2")); } TEST(MemIndexTest, MatchQualifiedNamesWithMultipleScopes) { MemIndex I; - I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"})); + I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"})); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {"a", "b"}; - EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy", "b::yz")); + EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3")); } TEST(MemIndexTest, NoMatchNestedScopes) { MemIndex I; - I.build(generateSymbols({"a::xyz", "a::b::yy"})); + I.build(generateSymbols({"a::y1", "a::b::y2"})); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {"a"}; - EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz")); + EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1")); } TEST(MemIndexTest, IgnoreCases) { -- 2.7.4