TUScheduler.cpp
URI.cpp
XRefs.cpp
+
index/CanonicalIncludes.cpp
index/FileIndex.cpp
index/Index.cpp
index/SymbolCollector.cpp
index/SymbolYAML.cpp
+ index/dex/Trigram.cpp
+
LINK_LIBS
clangAST
clangASTMatchers
--- /dev/null
+//===--- Token.h - Symbol Search primitive ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Token objects represent a characteristic of a symbol, which can be used to
+// perform efficient search. Tokens are keys for inverted index which are mapped
+// to the corresponding posting lists.
+//
+// The symbol std::cout might have the tokens:
+// * Scope "std::"
+// * Trigram "cou"
+// * Trigram "out"
+// * Type "std::ostream"
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+/// A Token represents an attribute of a symbol, such as a particular trigram
+/// present in the name (used for fuzzy search).
+///
+/// Tokens can be used to perform more sophisticated search queries by
+/// constructing complex iterator trees.
+struct Token {
+ /// Kind specifies Token type which defines semantics for the internal
+ /// representation. Each Kind has different representation stored in Data
+ /// field.
+ enum class Kind {
+ /// Represents trigram used for fuzzy search of unqualified symbol names.
+ ///
+ /// Data contains 3 bytes with trigram contents.
+ Trigram,
+ /// Scope primitives, e.g. "symbol belongs to namespace foo::bar".
+ ///
+ /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global
+ /// scope).
+ Scope,
+ /// Internal Token type for invalid/special tokens, e.g. empty tokens for
+ /// llvm::DenseMap.
+ Sentinel,
+ /// FIXME(kbobyrev): Add other Token Kinds
+ /// * Path with full or relative path to the directory in which symbol is
+ /// defined
+ /// * Type with qualified type name or its USR
+ };
+
+ Token(Kind TokenKind, llvm::StringRef Data)
+ : Data(Data), TokenKind(TokenKind) {}
+
+ bool operator==(const Token &Other) const {
+ return TokenKind == Other.TokenKind && Data == Other.Data;
+ }
+
+ /// Representation which is unique among Token with the same Kind.
+ std::string Data;
+ Kind TokenKind;
+
+ friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
+ return OS << T.Data;
+ }
+
+private:
+ friend llvm::hash_code hash_value(const Token &Token) {
+ return llvm::hash_combine(static_cast<int>(Token.TokenKind), Token.Data);
+ }
+};
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
+
+namespace llvm {
+
+// Support Tokens as DenseMap keys.
+template <> struct DenseMapInfo<clang::clangd::dex::Token> {
+ static inline clang::clangd::dex::Token getEmptyKey() {
+ return {clang::clangd::dex::Token::Kind::Sentinel, "EmptyKey"};
+ }
+
+ static inline clang::clangd::dex::Token getTombstoneKey() {
+ return {clang::clangd::dex::Token::Kind::Sentinel, "TombstoneKey"};
+ }
+
+ static unsigned getHashValue(const clang::clangd::dex::Token &Tag) {
+ return hash_value(Tag);
+ }
+
+ static bool isEqual(const clang::clangd::dex::Token &LHS,
+ const clang::clangd::dex::Token &RHS) {
+ return LHS == RHS;
+ }
+};
+
+} // namespace llvm
+
+#endif
--- /dev/null
+//===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Trigram.h"
+#include "../../FuzzyMatch.h"
+#include "Token.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+
+#include <cctype>
+#include <queue>
+#include <string>
+
+using namespace llvm;
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would
+// be generating unigrams and bigrams here, too. This would prevent symbol index
+// from applying fuzzy matching on a tremendous number of symbols and allow
+// supplementary retrieval for short queries.
+//
+// Short names (total segment length <3 characters) are currently ignored.
+std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
+ // Apply fuzzy matching text segmentation.
+ std::vector<CharRole> Roles(Identifier.size());
+ calculateRoles(Identifier,
+ llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
+
+ std::string LowercaseIdentifier = Identifier.lower();
+
+ // For each character, store indices of the characters to which fuzzy matching
+ // algorithm can jump. There are 3 possible variants:
+ //
+ // * Next Tail - next character from the same segment
+ // * Next Head - front character of the next segment
+ // * Skip-1-Next Head - front character of the skip-1-next segment
+ //
+ // Next stores tuples of three indices in the presented order, if a variant is
+ // not available then 0 is stored.
+ std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
+ unsigned NextTail = 0, NextHead = 0, NextNextHead = 0;
+ for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
+ Next[I] = {{NextTail, NextHead, NextNextHead}};
+ NextTail = Roles[I] == Tail ? I : 0;
+ if (Roles[I] == Head) {
+ NextNextHead = NextHead;
+ NextHead = I;
+ }
+ }
+
+ DenseSet<Token> UniqueTrigrams;
+ std::array<char, 4> Chars;
+ for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
+ // Skip delimiters.
+ if (Roles[I] != Head && Roles[I] != Tail)
+ continue;
+ for (const unsigned J : Next[I]) {
+ if (!J)
+ continue;
+ for (const unsigned K : Next[J]) {
+ if (!K)
+ continue;
+ Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J],
+ LowercaseIdentifier[K], 0}};
+ auto Trigram = Token(Token::Kind::Trigram, Chars.data());
+ // Push unique trigrams to the result.
+ if (!UniqueTrigrams.count(Trigram)) {
+ UniqueTrigrams.insert(Trigram);
+ }
+ }
+ }
+ }
+
+ std::vector<Token> Result;
+ for (const auto &Trigram : UniqueTrigrams)
+ Result.push_back(Trigram);
+
+ return Result;
+}
+
+// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short
+// inputs (total segment length <3 characters).
+std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
+ // Apply fuzzy matching text segmentation.
+ std::vector<CharRole> Roles(Query.size());
+ calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
+
+ std::string LowercaseQuery = Query.lower();
+
+ DenseSet<Token> UniqueTrigrams;
+ std::deque<char> Chars;
+
+ for (size_t I = 0; I < LowercaseQuery.size(); ++I) {
+ // If current symbol is delimiter, just skip it.
+ if (Roles[I] != Head && Roles[I] != Tail)
+ continue;
+
+ Chars.push_back(LowercaseQuery[I]);
+
+ if (Chars.size() > 3)
+ Chars.pop_front();
+ if (Chars.size() == 3) {
+ auto Trigram =
+ Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)));
+ // Push unique trigrams to the result.
+ if (!UniqueTrigrams.count(Trigram)) {
+ UniqueTrigrams.insert(Trigram);
+ }
+ }
+ }
+
+ std::vector<Token> Result;
+ for (const auto &Trigram : UniqueTrigrams)
+ Result.push_back(Trigram);
+
+ return Result;
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
--- /dev/null
+//===--- Trigram.h - Trigram generation for Fuzzy Matching ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Trigrams are attributes of the symbol unqualified name used to effectively
+// extract symbols which can be fuzzy-matched given user query from the inverted
+// index. To match query with the extracted set of trigrams Q, the set of
+// generated trigrams T for identifier (unqualified symbol name) should contain
+// all items of Q, i.e. Q ⊆ T.
+//
+// Trigram sets extracted from unqualified name and from query are different:
+// the set of query trigrams only contains consecutive sequences of three
+// characters (which is only a subset of all trigrams generated for an
+// identifier).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
+
+#include "Token.h"
+
+#include <string>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+/// Returns list of unique fuzzy-search trigrams from unqualified symbol.
+///
+/// First, given Identifier (unqualified symbol name) is segmented using
+/// FuzzyMatch API and lowercased. After segmentation, the following technique
+/// is applied for generating trigrams: for each letter or digit in the input
+/// string the algorithms looks for the possible next and skip-1-next symbols
+/// which can be jumped to during fuzzy matching. Each combination of such three
+/// symbols is inserted into the result.
+///
+/// Trigrams can start at any character in the input. Then we can choose to move
+/// to the next character, move to the start of the next segment, or skip over a
+/// segment.
+///
+/// Note: the returned list of trigrams does not have duplicates, if any trigram
+/// belongs to more than one class it is only inserted once.
+std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
+
+/// Returns list of unique fuzzy-search trigrams given a query.
+///
+/// Query is segmented using FuzzyMatch API and downcasted to lowercase. Then,
+/// the simplest trigrams - sequences of three consecutive letters and digits
+/// are extracted and returned after deduplication.
+std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
+
+#endif
CodeCompleteTests.cpp
CodeCompletionStringsTests.cpp
ContextTests.cpp
+ DexIndexTests.cpp
DraftStoreTests.cpp
- FileIndexTests.cpp
FileDistanceTests.cpp
+ FileIndexTests.cpp
FindSymbolsTests.cpp
FuzzyMatchTests.cpp
GlobalCompilationDatabaseTests.cpp
SourceCodeTests.cpp
SymbolCollectorTests.cpp
SyncAPI.cpp
+ TUSchedulerTests.cpp
TestFS.cpp
TestTU.cpp
ThreadingTests.cpp
TraceTests.cpp
- TUSchedulerTests.cpp
URITests.cpp
XRefsTests.cpp
)
--- /dev/null
+//===-- DexIndexTests.cpp ----------------------------*- C++ -*-----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "index/dex/Token.h"
+#include "index/dex/Trigram.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+testing::Matcher<std::vector<Token>>
+trigramsAre(std::initializer_list<std::string> Trigrams) {
+ std::vector<Token> Tokens;
+ for (const auto &Symbols : Trigrams) {
+ Tokens.push_back(Token(Token::Kind::Trigram, Symbols));
+ }
+ return testing::UnorderedElementsAreArray(Tokens);
+}
+
+TEST(DexIndexTrigrams, IdentifierTrigrams) {
+ EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("clangd"),
+ trigramsAre({"cla", "lan", "ang", "ngd"}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("abc_def"),
+ trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"}));
+
+ EXPECT_THAT(
+ generateIdentifierTrigrams("a_b_c_d_e_"),
+ trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"}));
+
+ EXPECT_THAT(
+ generateIdentifierTrigrams("unique_ptr"),
+ trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp",
+ "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("TUDecl"),
+ trigramsAre({"tud", "tde", "ude", "dec", "ecl"}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("IsOK"),
+ trigramsAre({"iso", "iok", "sok"}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"),
+ trigramsAre({
+ "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd",
+ "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk",
+ "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg",
+ "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk",
+ "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm",
+ }));
+}
+
+TEST(DexIndexTrigrams, QueryTrigrams) {
+ EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
+
+ EXPECT_THAT(generateQueryTrigrams("nl"), trigramsAre({}));
+
+ EXPECT_THAT(generateQueryTrigrams("clangd"),
+ trigramsAre({"cla", "lan", "ang", "ngd"}));
+
+ EXPECT_THAT(generateQueryTrigrams("abc_def"),
+ trigramsAre({"abc", "bcd", "cde", "def"}));
+
+ EXPECT_THAT(generateQueryTrigrams("a_b_c_d_e_"),
+ trigramsAre({"abc", "bcd", "cde"}));
+
+ EXPECT_THAT(generateQueryTrigrams("unique_ptr"),
+ trigramsAre({"uni", "niq", "iqu", "que", "uep", "ept", "ptr"}));
+
+ EXPECT_THAT(generateQueryTrigrams("TUDecl"),
+ trigramsAre({"tud", "ude", "dec", "ecl"}));
+
+ EXPECT_THAT(generateQueryTrigrams("IsOK"), trigramsAre({"iso", "sok"}));
+
+ EXPECT_THAT(generateQueryTrigrams("abc_defGhij__klm"),
+ trigramsAre({"abc", "bcd", "cde", "def", "efg", "fgh", "ghi",
+ "hij", "ijk", "jkl", "klm"}));
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang