From 2ec5a10db3c4b843db864fb6702e2b675887a6e8 Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Thu, 4 Oct 2018 14:08:11 +0000 Subject: [PATCH] [clangd] Remove one-segment-skipping from Dex trigrams. Summary: Currently queries like "ab" can match identifiers like a_yellow_bee. The value of allowing this for exactly one segment but no more seems dubious. It costs ~3% of overall ram (~9% of posting list ram) and some quality. Reviewers: ilya-biryukov, ioeric Subscribers: MaskRay, jkorous, arphaman, kadircet, cfe-commits Differential Revision: https://reviews.llvm.org/D52885 llvm-svn: 343777 --- clang-tools-extra/clangd/index/dex/Trigram.cpp | 6 ++---- clang-tools-extra/clangd/index/dex/Trigram.h | 2 +- clang-tools-extra/unittests/clangd/DexTests.cpp | 12 ++++-------- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/clang-tools-extra/clangd/index/dex/Trigram.cpp b/clang-tools-extra/clangd/index/dex/Trigram.cpp index eb5473d..641c899 100644 --- a/clang-tools-extra/clangd/index/dex/Trigram.cpp +++ b/clang-tools-extra/clangd/index/dex/Trigram.cpp @@ -36,17 +36,15 @@ std::vector generateIdentifierTrigrams(llvm::StringRef Identifier) { // // * Next Tail - next character from the same segment // * Next Head - front character of the next segment - // * Skip-1-Next Head - front character of the skip-1-next segment // // Next stores tuples of three indices in the presented order, if a variant is // not available then 0 is stored. std::vector> Next(LowercaseIdentifier.size()); - unsigned NextTail = 0, NextHead = 0, NextNextHead = 0; + unsigned NextTail = 0, NextHead = 0; for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) { - Next[I] = {{NextTail, NextHead, NextNextHead}}; + Next[I] = {{NextTail, NextHead}}; NextTail = Roles[I] == Tail ? I : 0; if (Roles[I] == Head) { - NextNextHead = NextHead; NextHead = I; } } diff --git a/clang-tools-extra/clangd/index/dex/Trigram.h b/clang-tools-extra/clangd/index/dex/Trigram.h index d0fb474..adce9f4 100644 --- a/clang-tools-extra/clangd/index/dex/Trigram.h +++ b/clang-tools-extra/clangd/index/dex/Trigram.h @@ -37,7 +37,7 @@ namespace dex { /// /// The symbol's name is broken into segments, e.g. "FooBar" has two segments. /// Trigrams can start at any character in the input. Then we can choose to move -/// to the next character, move to the start of the next segment, or stop. +/// to the next character or to the start of the next segment. /// /// Short trigrams (length 1-2) are used for short queries. These are: /// - prefixes of the identifier, of length 1 and 2 diff --git a/clang-tools-extra/unittests/clangd/DexTests.cpp b/clang-tools-extra/unittests/clangd/DexTests.cpp index 2931930..d7223ab 100644 --- a/clang-tools-extra/unittests/clangd/DexTests.cpp +++ b/clang-tools-extra/unittests/clangd/DexTests.cpp @@ -381,8 +381,7 @@ TEST(DexTrigrams, IdentifierTrigrams) { "cde", "def"})); EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"), - trigramsAre({"a", "a_", "ab", "abc", "abd", "acd", "ace", "bcd", - "bce", "bde", "cde"})); + trigramsAre({"a", "a_", "ab", "abc", "bcd", "cde"})); EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"), trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip", @@ -396,14 +395,11 @@ TEST(DexTrigrams, IdentifierTrigrams) { EXPECT_THAT(generateIdentifierTrigrams("IsOK"), trigramsAre({"i", "is", "io", "iso", "iok", "sok"})); - auto X = generateIdentifierTrigrams("abc_defGhij__klm"); EXPECT_THAT( generateIdentifierTrigrams("abc_defGhij__klm"), - trigramsAre({"a", "ab", "ad", "abc", "abd", "abg", "ade", "adg", - "adk", "agh", "agk", "bcd", "bcg", "bde", "bdg", "bdk", - "bgh", "bgk", "cde", "cdg", "cdk", "cgh", "cgk", "def", - "deg", "dek", "dgh", "dgk", "dkl", "efg", "efk", "egh", - "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk", "gkl", + trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "adg", "bcd", + "bde", "bdg", "cde", "cdg", "def", "deg", "dgh", "dgk", + "efg", "egh", "egk", "fgh", "fgk", "ghi", "ghk", "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm"})); } -- 2.7.4