// Legal if the characters match.
// - Moving down (consuming a pattern character) is never legal.
// Never legal: all pattern characters must match something.
+// Characters are matched case-insensitively.
+// The first pattern character may only match the start of a word segment.
//
// The scoring is based on heuristics:
// - when matching a character, apply a bonus or penalty depending on the
static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score.
FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
- : PatN(std::min<int>(MaxPat, Pattern.size())), CaseSensitive(false),
+ : PatN(std::min<int>(MaxPat, Pattern.size())),
ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
- for (int I = 0; I < PatN; ++I) {
+ for (int I = 0; I < PatN; ++I)
LowPat[I] = lower(Pat[I]);
- CaseSensitive |= LowPat[I] != Pat[I];
- }
Scores[0][0][Miss] = {0, Miss};
Scores[0][0][Match] = {AwfulScore, Miss};
for (int P = 0; P <= PatN; ++P)
for (Action A : {Miss, Match})
Scores[P][W][A] = {AwfulScore, Miss};
if (PatN > 0)
- calculateRoles(Pat, PatRole, PatN);
+ calculateRoles(Pat, PatRole, PatTypeSet, PatN);
}
Optional<float> FuzzyMatcher::match(StringRef Word) {
template <typename T> static T packedLookup(const uint8_t *Data, int I) {
return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
}
-void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) {
+void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int &TypeSet,
+ int N) {
assert(N > 0);
+ CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
+ TypeSet = 1 << Type;
// Types holds a sliding window of (Prev, Curr, Next) types.
// Initial value is (Empty, Empty, type of Text[0]).
- int Types = packedLookup<CharType>(CharTypes, Text[0]);
+ int Types = Type;
// Rotate slides in the type of the next character.
auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
for (int I = 0; I < N - 1; ++I) {
// For each character, rotate in the next, and look up the role.
- Rotate(packedLookup<CharType>(CharTypes, Text[I + 1]));
+ Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
+ TypeSet |= 1 << Type;
+ Rotate(Type);
*Out++ = packedLookup<CharRole>(CharRoles, Types);
}
// For the last character, the "next character" is Empty.
++P;
}
- calculateRoles(Word, WordRole, WordN);
+ // FIXME: some words are hard to tokenize algorithmically.
+ // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
+ // We could add a tokenization dictionary for common stdlib names.
+ calculateRoles(Word, WordRole, WordTypeSet, WordN);
return true;
}
? ScoreInfo{MatchMissScore, Match}
: ScoreInfo{MissMissScore, Miss};
- if (LowPat[P] != LowWord[W]) { // No match possible.
+ if (!allowMatch(P, W)) {
Score[Match] = {AwfulScore, Miss};
} else {
auto &PreMatch = Scores[P][W];
}
}
-int FuzzyMatcher::skipPenalty(int W, Action Last) {
+bool FuzzyMatcher::allowMatch(int P, int W) const {
+ if (LowPat[P] != LowWord[W])
+ return false;
+ // We require a "strong" match for the first pattern character only.
+ if (P > 0)
+ return true;
+ // Obvious "strong match" for first char: match against a word head.
+ // We're banning matches outright, so conservatively accept some other cases
+ // where our segmentation might be wrong:
+ // - allow matching B in ABCDef (but not in NDEBUG)
+ // - we'd like to accept print in sprintf, but too many false positives
+ return WordRole[W] != Tail ||
+ (Word[W] != LowWord[W] && WordTypeSet & 1 << Lower);
+}
+
+int FuzzyMatcher::skipPenalty(int W, Action Last) const {
int S = 0;
if (WordRole[W] == Head) // Skipping a segment.
S += 1;
return S;
}
-int FuzzyMatcher::matchBonus(int P, int W, Action Last) {
+int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
assert(LowPat[P] == LowWord[W]);
int S = 1;
// Bonus: pattern so far is a (case-insensitive) prefix of the word.
if (P == W) // We can't skip pattern characters, so we must have matched all.
++S;
// Bonus: case matches, or a Head in the pattern aligns with one in the word.
- if ((Pat[P] == Word[W] && (CaseSensitive || P == W)) ||
+ if ((Pat[P] == Word[W] && ((PatTypeSet & 1 << Upper) || P == W)) ||
(PatRole[P] == Head && WordRole[W] == Head))
++S;
// Penalty: matching inside a segment (and previous char wasn't matched).
Scores[PatN][WordN][Miss].Score))) {
OS << "Substring check passed, but all matches are forbidden\n";
}
- if (!CaseSensitive)
+ if (!(PatTypeSet & 1 << Upper))
OS << "Lowercase query, so scoring ignores case\n";
// Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
bool init(llvm::StringRef Word);
void buildGraph();
- void calculateRoles(const char *Text, CharRole *Out, int N);
- int skipPenalty(int W, Action Last);
- int matchBonus(int P, int W, Action Last);
+ void calculateRoles(const char *Text, CharRole *Out, int &Types, int N);
+ bool allowMatch(int P, int W) const;
+ int skipPenalty(int W, Action Last) const;
+ int matchBonus(int P, int W, Action Last) const;
// Pattern data is initialized by the constructor, then constant.
char Pat[MaxPat]; // Pattern data
int PatN; // Length
char LowPat[MaxPat]; // Pattern in lowercase
CharRole PatRole[MaxPat]; // Pattern segmentation info
- bool CaseSensitive; // Case-sensitive match if pattern has uppercase
+ int PatTypeSet; // Bitmask of 1<<CharType
float ScoreScale; // Normalizes scores for the pattern length.
// Word data is initialized on each call to match(), mostly by init().
int WordN; // Length
char LowWord[MaxWord]; // Word in lowercase
CharRole WordRole[MaxWord]; // Word segmentation info
+ int WordTypeSet; // Bitmask of 1<<CharType
bool WordContainsPattern; // Simple substring check
// Cumulative best-match score table.
using testing::Not;
struct ExpectedMatch {
- ExpectedMatch(StringRef Annotated) : Word(Annotated), Annotated(Annotated) {
+ // Annotations are optional, and will not be asserted if absent.
+ ExpectedMatch(StringRef Match) : Word(Match), Annotated(Match) {
for (char C : "[]")
Word.erase(std::remove(Word.begin(), Word.end(), C), Word.end());
+ if (Word.size() == Annotated->size())
+ Annotated = None;
+ }
+ bool accepts(StringRef ActualAnnotated) const {
+ return !Annotated || ActualAnnotated == *Annotated;
}
std::string Word;
- StringRef Annotated;
+
+ friend raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) {
+ return OS << "'" << M.Word;
+ if (M.Annotated)
+ OS << "' as " << *M.Annotated;
+ }
+
+private:
+ Optional<StringRef> Annotated;
};
-raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) {
- return OS << "'" << M.Word << "' as " << M.Annotated;
-}
struct MatchesMatcher : public testing::MatcherInterface<StringRef> {
ExpectedMatch Candidate;
FuzzyMatcher Matcher(Pattern);
auto Result = Matcher.match(Candidate.Word);
auto AnnotatedMatch = Matcher.dumpLast(*OS << "\n");
- return Result && AnnotatedMatch == Candidate.Annotated;
+ return Result && Candidate.accepts(AnnotatedMatch);
}
};
EXPECT_THAT("sl", matches("[S]Visual[L]oggerLogsList"));
EXPECT_THAT("sllll", matches("[S]Visua[lL]ogger[L]ogs[L]ist"));
EXPECT_THAT("Three", matches("H[T]ML[HRE]l[e]ment"));
+ EXPECT_THAT("b", Not(matches("NDEBUG")));
EXPECT_THAT("Three", matches("[Three]"));
EXPECT_THAT("fo", Not(matches("barfoo")));
EXPECT_THAT("fo", matches("bar_[fo]o"));
EXPECT_THAT("g", matches("zz[G]roup"));
EXPECT_THAT("aaaa", matches("_a_[aaaa]")); // Prefer consecutive.
- EXPECT_THAT("printf", matches("s[printf]"));
- EXPECT_THAT("str", matches("o[str]eam"));
+ // These would ideally match, but would need special segmentation rules.
+ EXPECT_THAT("printf", Not(matches("s[printf]")));
+ EXPECT_THAT("str", Not(matches("o[str]eam")));
}
struct RankMatcher : public testing::MatcherInterface<StringRef> {
llvm::raw_string_ostream Info(Buf);
auto AnnotatedMatch = Matcher.dumpLast(Info);
- if (AnnotatedMatch != Str.Annotated) {
- *OS << "\nMatched " << Str.Word << " as " << AnnotatedMatch
- << " instead of " << Str.Annotated << "\n"
+ if (!Str.accepts(AnnotatedMatch)) {
+ *OS << "\nDoesn't match " << Str << ", but " << AnnotatedMatch << "\n"
<< Info.str();
Ok = false;
} else if (LastScore && *LastScore < *Score) {
EXPECT_TRUE(Symbols.expired());
}
-TEST(MemIndexTest, MemIndexMatchSubstring) {
- MemIndex I;
- I.build(generateNumSymbols(5, 25));
- FuzzyFindRequest Req;
- Req.Query = "5";
- EXPECT_THAT(match(I, Req), UnorderedElementsAre("5", "15", "25"));
-}
-
TEST(MemIndexTest, MemIndexDeduplicate) {
auto Symbols = generateNumSymbols(0, 10);
TEST(MemIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
MemIndex I;
- I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+ I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
FuzzyFindRequest Req;
Req.Query = "y";
- EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "b::yz", "yz"));
+ EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
}
TEST(MemIndexTest, MatchQualifiedNamesWithGlobalScope) {
MemIndex I;
- I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+ I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
FuzzyFindRequest Req;
Req.Query = "y";
Req.Scopes = {""};
- EXPECT_THAT(match(I, Req), UnorderedElementsAre("yz"));
+ EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3"));
}
TEST(MemIndexTest, MatchQualifiedNamesWithOneScope) {
MemIndex I;
- I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+ I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
FuzzyFindRequest Req;
Req.Query = "y";
Req.Scopes = {"a"};
- EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy"));
+ EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2"));
}
TEST(MemIndexTest, MatchQualifiedNamesWithMultipleScopes) {
MemIndex I;
- I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+ I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
FuzzyFindRequest Req;
Req.Query = "y";
Req.Scopes = {"a", "b"};
- EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy", "b::yz"));
+ EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
}
TEST(MemIndexTest, NoMatchNestedScopes) {
MemIndex I;
- I.build(generateSymbols({"a::xyz", "a::b::yy"}));
+ I.build(generateSymbols({"a::y1", "a::b::y2"}));
FuzzyFindRequest Req;
Req.Query = "y";
Req.Scopes = {"a"};
- EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz"));
+ EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1"));
}
TEST(MemIndexTest, IgnoreCases) {