[clangd] Fix unintentionally loose fuzzy matching, and the tests masking it.

author Sam McCall <sam.mccall@gmail.com>

Mon, 5 Mar 2018 17:34:33 +0000 (17:34 +0000)

committer Sam McCall <sam.mccall@gmail.com>

Mon, 5 Mar 2018 17:34:33 +0000 (17:34 +0000)
author Sam McCall <sam.mccall@gmail.com>
Mon, 5 Mar 2018 17:34:33 +0000 (17:34 +0000)
committer Sam McCall <sam.mccall@gmail.com>
Mon, 5 Mar 2018 17:34:33 +0000 (17:34 +0000)
diff --git a/clang-tools-extra/clangd/FuzzyMatch.cpp b/clang-tools-extra/clangd/FuzzyMatch.cpp

index 68ba695..152f179 100644 (file)
--- a/clang-tools-extra/clangd/FuzzyMatch.cpp
+++ b/clang-tools-extra/clangd/FuzzyMatch.cpp
@@ -33,6 +33,8 @@
  //    Legal if the characters match.
  //  - Moving down (consuming a pattern character) is never legal.
  //    Never legal: all pattern characters must match something.
+// Characters are matched case-insensitively.
+// The first pattern character may only match the start of a word segment.
  //
  // The scoring is based on heuristics:
  //  - when matching a character, apply a bonus or penalty depending on the
@@ -74,13 +76,11 @@ static bool isAwful(int S) { return S < AwfulScore / 2; }
  static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score.
  
  FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
-    : PatN(std::min<int>(MaxPat, Pattern.size())), CaseSensitive(false),
+    : PatN(std::min<int>(MaxPat, Pattern.size())),
        ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
    std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
-  for (int I = 0; I < PatN; ++I) {
+  for (int I = 0; I < PatN; ++I)
      LowPat[I] = lower(Pat[I]);
-    CaseSensitive |= LowPat[I] != Pat[I];
-  }
    Scores[0][0][Miss] = {0, Miss};
    Scores[0][0][Match] = {AwfulScore, Miss};
    for (int P = 0; P <= PatN; ++P)
@@ -88,7 +88,7 @@ FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
        for (Action A : {Miss, Match})
          Scores[P][W][A] = {AwfulScore, Miss};
    if (PatN > 0)
-    calculateRoles(Pat, PatRole, PatN);
+    calculateRoles(Pat, PatRole, PatTypeSet, PatN);
  }
  
  Optional<float> FuzzyMatcher::match(StringRef Word) {
@@ -177,16 +177,21 @@ constexpr static uint8_t CharRoles[] = {
  template <typename T> static T packedLookup(const uint8_t *Data, int I) {
    return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
  }
-void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) {
+void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int &TypeSet,
+                                  int N) {
    assert(N > 0);
+  CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
+  TypeSet = 1 << Type;
    // Types holds a sliding window of (Prev, Curr, Next) types.
    // Initial value is (Empty, Empty, type of Text[0]).
-  int Types = packedLookup<CharType>(CharTypes, Text[0]);
+  int Types = Type;
    // Rotate slides in the type of the next character.
    auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
    for (int I = 0; I < N - 1; ++I) {
      // For each character, rotate in the next, and look up the role.
-    Rotate(packedLookup<CharType>(CharTypes, Text[I + 1]));
+    Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
+    TypeSet |= 1 << Type;
+    Rotate(Type);
      *Out++ = packedLookup<CharRole>(CharRoles, Types);
    }
    // For the last character, the "next character" is Empty.
@@ -214,7 +219,10 @@ bool FuzzyMatcher::init(StringRef NewWord) {
        ++P;
    }
  
-  calculateRoles(Word, WordRole, WordN);
+  // FIXME: some words are hard to tokenize algorithmically.
+  // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
+  // We could add a tokenization dictionary for common stdlib names.
+  calculateRoles(Word, WordRole, WordTypeSet, WordN);
    return true;
  }
  
@@ -247,7 +255,7 @@ void FuzzyMatcher::buildGraph() {
                          ? ScoreInfo{MatchMissScore, Match}
                          : ScoreInfo{MissMissScore, Miss};
  
-      if (LowPat[P] != LowWord[W]) { // No match possible.
+      if (!allowMatch(P, W)) {
          Score[Match] = {AwfulScore, Miss};
        } else {
          auto &PreMatch = Scores[P][W];
@@ -261,7 +269,22 @@ void FuzzyMatcher::buildGraph() {
    }
  }
  
-int FuzzyMatcher::skipPenalty(int W, Action Last) {
+bool FuzzyMatcher::allowMatch(int P, int W) const {
+  if (LowPat[P] != LowWord[W])
+    return false;
+  // We require a "strong" match for the first pattern character only.
+  if (P > 0)
+    return true;
+  // Obvious "strong match" for first char: match against a word head.
+  // We're banning matches outright, so conservatively accept some other cases
+  // where our segmentation might be wrong:
+  //  - allow matching B in ABCDef (but not in NDEBUG)
+  //  - we'd like to accept print in sprintf, but too many false positives
+  return WordRole[W] != Tail ||
+         (Word[W] != LowWord[W] && WordTypeSet & 1 << Lower);
+}
+
+int FuzzyMatcher::skipPenalty(int W, Action Last) const {
    int S = 0;
    if (WordRole[W] == Head) // Skipping a segment.
      S += 1;
@@ -270,14 +293,14 @@ int FuzzyMatcher::skipPenalty(int W, Action Last) {
    return S;
  }
  
-int FuzzyMatcher::matchBonus(int P, int W, Action Last) {
+int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
    assert(LowPat[P] == LowWord[W]);
    int S = 1;
    // Bonus: pattern so far is a (case-insensitive) prefix of the word.
    if (P == W) // We can't skip pattern characters, so we must have matched all.
      ++S;
    // Bonus: case matches, or a Head in the pattern aligns with one in the word.
-  if ((Pat[P] == Word[W] && (CaseSensitive || P == W)) ||
+  if ((Pat[P] == Word[W] && ((PatTypeSet & 1 << Upper) || P == W)) ||
        (PatRole[P] == Head && WordRole[W] == Head))
      ++S;
    // Penalty: matching inside a segment (and previous char wasn't matched).
@@ -312,7 +335,7 @@ llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
                                Scores[PatN][WordN][Miss].Score))) {
      OS << "Substring check passed, but all matches are forbidden\n";
    }
-  if (!CaseSensitive)
+  if (!(PatTypeSet & 1 << Upper))
      OS << "Lowercase query, so scoring ignores case\n";
  
    // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
diff --git a/clang-tools-extra/clangd/FuzzyMatch.h b/clang-tools-extra/clangd/FuzzyMatch.h

index 21bf92c..13a25e2 100644 (file)
--- a/clang-tools-extra/clangd/FuzzyMatch.h
+++ b/clang-tools-extra/clangd/FuzzyMatch.h
@@ -56,16 +56,17 @@ private:
  
    bool init(llvm::StringRef Word);
    void buildGraph();
-  void calculateRoles(const char *Text, CharRole *Out, int N);
-  int skipPenalty(int W, Action Last);
-  int matchBonus(int P, int W, Action Last);
+  void calculateRoles(const char *Text, CharRole *Out, int &Types, int N);
+  bool allowMatch(int P, int W) const;
+  int skipPenalty(int W, Action Last) const;
+  int matchBonus(int P, int W, Action Last) const;
  
    // Pattern data is initialized by the constructor, then constant.
    char Pat[MaxPat];         // Pattern data
    int PatN;                 // Length
    char LowPat[MaxPat];      // Pattern in lowercase
    CharRole PatRole[MaxPat]; // Pattern segmentation info
-  bool CaseSensitive;       // Case-sensitive match if pattern has uppercase
+  int PatTypeSet;           // Bitmask of 1<<CharType
    float ScoreScale;         // Normalizes scores for the pattern length.
  
    // Word data is initialized on each call to match(), mostly by init().
@@ -73,6 +74,7 @@ private:
    int WordN;                  // Length
    char LowWord[MaxWord];      // Word in lowercase
    CharRole WordRole[MaxWord]; // Word segmentation info
+  int WordTypeSet;            // Bitmask of 1<<CharType
    bool WordContainsPattern;   // Simple substring check
  
    // Cumulative best-match score table.
diff --git a/clang-tools-extra/unittests/clangd/FuzzyMatchTests.cpp b/clang-tools-extra/unittests/clangd/FuzzyMatchTests.cpp

index 6efcdbe..b665fe3 100644 (file)
--- a/clang-tools-extra/unittests/clangd/FuzzyMatchTests.cpp
+++ b/clang-tools-extra/unittests/clangd/FuzzyMatchTests.cpp
@@ -20,16 +20,27 @@ using namespace llvm;
  using testing::Not;
  
  struct ExpectedMatch {
-  ExpectedMatch(StringRef Annotated) : Word(Annotated), Annotated(Annotated) {
+  // Annotations are optional, and will not be asserted if absent.
+  ExpectedMatch(StringRef Match) : Word(Match), Annotated(Match) {
      for (char C : "[]")
        Word.erase(std::remove(Word.begin(), Word.end(), C), Word.end());
+    if (Word.size() == Annotated->size())
+      Annotated = None;
+  }
+  bool accepts(StringRef ActualAnnotated) const {
+    return !Annotated || ActualAnnotated == *Annotated;
    }
    std::string Word;
-  StringRef Annotated;
+
+  friend raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) {
+    return OS << "'" << M.Word;
+    if (M.Annotated)
+      OS << "' as " << *M.Annotated;
+  }
+
+private:
+  Optional<StringRef> Annotated;
  };
-raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) {
-  return OS << "'" << M.Word << "' as " << M.Annotated;
-}
  
  struct MatchesMatcher : public testing::MatcherInterface<StringRef> {
    ExpectedMatch Candidate;
@@ -47,7 +58,7 @@ struct MatchesMatcher : public testing::MatcherInterface<StringRef> {
      FuzzyMatcher Matcher(Pattern);
      auto Result = Matcher.match(Candidate.Word);
      auto AnnotatedMatch = Matcher.dumpLast(*OS << "\n");
-    return Result && AnnotatedMatch == Candidate.Annotated;
+    return Result && Candidate.accepts(AnnotatedMatch);
    }
  };
  
@@ -122,6 +133,7 @@ TEST(FuzzyMatch, Matches) {
    EXPECT_THAT("sl", matches("[S]Visual[L]oggerLogsList"));
    EXPECT_THAT("sllll", matches("[S]Visua[lL]ogger[L]ogs[L]ist"));
    EXPECT_THAT("Three", matches("H[T]ML[HRE]l[e]ment"));
+  EXPECT_THAT("b", Not(matches("NDEBUG")));
    EXPECT_THAT("Three", matches("[Three]"));
    EXPECT_THAT("fo", Not(matches("barfoo")));
    EXPECT_THAT("fo", matches("bar_[fo]o"));
@@ -151,8 +163,9 @@ TEST(FuzzyMatch, Matches) {
    EXPECT_THAT("g", matches("zz[G]roup"));
  
    EXPECT_THAT("aaaa", matches("_a_[aaaa]")); // Prefer consecutive.
-  EXPECT_THAT("printf", matches("s[printf]"));
-  EXPECT_THAT("str", matches("o[str]eam"));
+  // These would ideally match, but would need special segmentation rules.
+  EXPECT_THAT("printf", Not(matches("s[printf]")));
+  EXPECT_THAT("str", Not(matches("o[str]eam")));
  }
  
  struct RankMatcher : public testing::MatcherInterface<StringRef> {
@@ -188,9 +201,8 @@ struct RankMatcher : public testing::MatcherInterface<StringRef> {
          llvm::raw_string_ostream Info(Buf);
          auto AnnotatedMatch = Matcher.dumpLast(Info);
  
-        if (AnnotatedMatch != Str.Annotated) {
-          *OS << "\nMatched " << Str.Word << " as " << AnnotatedMatch
-              << " instead of " << Str.Annotated << "\n"
+        if (!Str.accepts(AnnotatedMatch)) {
+          *OS << "\nDoesn't match " << Str << ", but " << AnnotatedMatch << "\n"
                << Info.str();
            Ok = false;
          } else if (LastScore && *LastScore < *Score) {
diff --git a/clang-tools-extra/unittests/clangd/IndexTests.cpp b/clang-tools-extra/unittests/clangd/IndexTests.cpp

index 3c61533..20eb452 100644 (file)
--- a/clang-tools-extra/unittests/clangd/IndexTests.cpp
+++ b/clang-tools-extra/unittests/clangd/IndexTests.cpp
@@ -116,14 +116,6 @@ TEST(MemIndexTest, MemIndexSymbolsRecycled) {
    EXPECT_TRUE(Symbols.expired());
  }
  
-TEST(MemIndexTest, MemIndexMatchSubstring) {
-  MemIndex I;
-  I.build(generateNumSymbols(5, 25));
-  FuzzyFindRequest Req;
-  Req.Query = "5";
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("5", "15", "25"));
-}
-
  TEST(MemIndexTest, MemIndexDeduplicate) {
    auto Symbols = generateNumSymbols(0, 10);
  
@@ -166,46 +158,46 @@ TEST(MemIndexTest, FuzzyMatch) {
  
  TEST(MemIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
    MemIndex I;
-  I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
    FuzzyFindRequest Req;
    Req.Query = "y";
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "b::yz", "yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
  }
  
  TEST(MemIndexTest, MatchQualifiedNamesWithGlobalScope) {
    MemIndex I;
-  I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
    FuzzyFindRequest Req;
    Req.Query = "y";
    Req.Scopes = {""};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3"));
  }
  
  TEST(MemIndexTest, MatchQualifiedNamesWithOneScope) {
    MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
    FuzzyFindRequest Req;
    Req.Query = "y";
    Req.Scopes = {"a"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2"));
  }
  
  TEST(MemIndexTest, MatchQualifiedNamesWithMultipleScopes) {
    MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
    FuzzyFindRequest Req;
    Req.Query = "y";
    Req.Scopes = {"a", "b"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy", "b::yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
  }
  
  TEST(MemIndexTest, NoMatchNestedScopes) {
    MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::b::yy"}));
+  I.build(generateSymbols({"a::y1", "a::b::y2"}));
    FuzzyFindRequest Req;
    Req.Query = "y";
    Req.Scopes = {"a"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1"));
  }
  
  TEST(MemIndexTest, IgnoreCases) {
author	Sam McCall <sam.mccall@gmail.com>
	Mon, 5 Mar 2018 17:34:33 +0000 (17:34 +0000)
committer	Sam McCall <sam.mccall@gmail.com>
	Mon, 5 Mar 2018 17:34:33 +0000 (17:34 +0000)
clang-tools-extra/clangd/FuzzyMatch.cpp		patch \| blob \| history
clang-tools-extra/clangd/FuzzyMatch.h		patch \| blob \| history
clang-tools-extra/unittests/clangd/FuzzyMatchTests.cpp		patch \| blob \| history
clang-tools-extra/unittests/clangd/IndexTests.cpp		patch \| blob \| history