From 50f3631057f717448ba34b4175daaa81215fbd5e Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Tue, 4 Sep 2018 16:16:50 +0000 Subject: [PATCH] [clangd] Define a compact binary serialization fomat for symbol slab/index. Summary: This is intended to replace the current YAML format for general use. It's ~10x more compact than YAML, and ~40% more compact than gzipped YAML: llvmidx.riff = 20M, llvmidx.yaml = 272M, llvmidx.yaml.gz = 32M It's also simpler/faster to read and write. The format is a RIFF container (chunks of (type, size, data)) with: - a compressed string table - simple binary encoding of symbols (with varints for compactness) It can be extended to include occurrences, Dex posting lists, etc. There's no rich backwards-compatibility scheme, but a version number is included so we can detect incompatible files and do ad-hoc back-compat. Alternatives considered: - compressed YAML or JSON: bulky and slow to load - llvm bitstream: confusing model and libraries are hard to use. My attempt produced slightly larger files, and the code was longer and slower. - protobuf or similar: would be really nice (esp for back-compat) but the dependency is a big hassle - ad-hoc binary format without a container: it seems clear we're going to add posting lists and occurrences here, and that they will benefit from sharing a string table. The container makes it easy to debug these pieces in isolation, and make them optional. Reviewers: ioeric Subscribers: mgorny, ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, cfe-commits Differential Revision: https://reviews.llvm.org/D51585 llvm-svn: 341375 --- clang-tools-extra/clangd/CMakeLists.txt | 2 + clang-tools-extra/clangd/RIFF.cpp | 88 +++++ clang-tools-extra/clangd/RIFF.h | 81 +++++ .../GlobalSymbolBuilderMain.cpp | 32 +- clang-tools-extra/clangd/index/Index.cpp | 46 +-- clang-tools-extra/clangd/index/Index.h | 41 ++- clang-tools-extra/clangd/index/Serialization.cpp | 366 +++++++++++++++++++++ clang-tools-extra/clangd/index/Serialization.h | 48 +++ clang-tools-extra/clangd/index/SymbolYAML.cpp | 17 +- clang-tools-extra/clangd/tool/ClangdMain.cpp | 1 + clang-tools-extra/unittests/clangd/CMakeLists.txt | 2 + clang-tools-extra/unittests/clangd/RIFFTests.cpp | 39 +++ .../unittests/clangd/SerializationTests.cpp | 138 ++++++++ .../unittests/clangd/SymbolCollectorTests.cpp | 79 ----- 14 files changed, 848 insertions(+), 132 deletions(-) create mode 100644 clang-tools-extra/clangd/RIFF.cpp create mode 100644 clang-tools-extra/clangd/RIFF.h create mode 100644 clang-tools-extra/clangd/index/Serialization.cpp create mode 100644 clang-tools-extra/clangd/index/Serialization.h create mode 100644 clang-tools-extra/unittests/clangd/RIFFTests.cpp create mode 100644 clang-tools-extra/unittests/clangd/SerializationTests.cpp diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index ef332ba..df859b4 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -29,6 +29,7 @@ add_clang_library(clangDaemon Protocol.cpp ProtocolHandlers.cpp Quality.cpp + RIFF.cpp SourceCode.cpp Threading.cpp Trace.cpp @@ -41,6 +42,7 @@ add_clang_library(clangDaemon index/Index.cpp index/MemIndex.cpp index/Merge.cpp + index/Serialization.cpp index/SymbolCollector.cpp index/SymbolYAML.cpp diff --git a/clang-tools-extra/clangd/RIFF.cpp b/clang-tools-extra/clangd/RIFF.cpp new file mode 100644 index 0000000..cb832db --- /dev/null +++ b/clang-tools-extra/clangd/RIFF.cpp @@ -0,0 +1,88 @@ +//===--- RIFF.cpp - Binary container file format --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "RIFF.h" +#include "llvm/Support/Endian.h" + +using namespace llvm; +namespace clang { +namespace clangd { +namespace riff { + +static Error makeError(const char *Msg) { + return createStringError(inconvertibleErrorCode(), Msg); +} + +Expected readChunk(StringRef &Stream) { + if (Stream.size() < 8) + return makeError("incomplete chunk header"); + Chunk C; + std::copy(Stream.begin(), Stream.begin() + 4, C.ID.begin()); + Stream = Stream.drop_front(4); + uint32_t Len = support::endian::read32le(Stream.take_front(4).begin()); + Stream = Stream.drop_front(4); + if (Stream.size() < Len) + return makeError("truncated chunk"); + C.Data = Stream.take_front(Len); + Stream = Stream.drop_front(Len); + if (Len % 2 & !Stream.empty()) { // Skip padding byte. + if (Stream.front()) + return makeError("nonzero padding byte"); + Stream = Stream.drop_front(); + } + return C; +}; + +raw_ostream &operator<<(raw_ostream &OS, const Chunk &C) { + OS.write(C.ID.begin(), C.ID.size()); + char Size[4]; + llvm::support::endian::write32le(Size, C.Data.size()); + OS.write(Size, sizeof(Size)); + OS << C.Data; + if (C.Data.size() % 2) + OS.write(0); + return OS; +} + +llvm::Expected readFile(llvm::StringRef Stream) { + auto RIFF = readChunk(Stream); + if (!RIFF) + return RIFF.takeError(); + if (RIFF->ID != fourCC("RIFF")) + return makeError("not a RIFF container"); + if (RIFF->Data.size() < 4) + return makeError("RIFF chunk too short"); + File F; + std::copy(RIFF->Data.begin(), RIFF->Data.begin() + 4, F.Type.begin()); + for (llvm::StringRef Body = RIFF->Data.drop_front(4); !Body.empty();) + if (auto Chunk = readChunk(Body)) { + F.Chunks.push_back(*Chunk); + } else + return Chunk.takeError(); + return F; +} + +raw_ostream &operator<<(raw_ostream &OS, const File &F) { + // To avoid copies, we serialize the outer RIFF chunk "by hand". + size_t DataLen = 4; // Predict length of RIFF chunk data. + for (const auto &C : F.Chunks) + DataLen += 4 + 4 + C.Data.size() + (C.Data.size() % 2); + OS << "RIFF"; + char Size[4]; + llvm::support::endian::write32le(Size, DataLen); + OS.write(Size, sizeof(Size)); + OS.write(F.Type.begin(), F.Type.size()); + for (const auto &C : F.Chunks) + OS << C; + return OS; +} + +} // namespace riff +} // namespace clangd +} // namespace clang diff --git a/clang-tools-extra/clangd/RIFF.h b/clang-tools-extra/clangd/RIFF.h new file mode 100644 index 0000000..f56d087 --- /dev/null +++ b/clang-tools-extra/clangd/RIFF.h @@ -0,0 +1,81 @@ +//===--- RIFF.h - Binary container file format -------------------*- C++-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Tools for reading and writing data in RIFF containers. +// +// A chunk consists of: +// - ID : char[4] +// - Length : uint32 +// - Data : byte[Length] +// - Padding : byte[Length % 2] +// The semantics of a chunk's Data are determined by its ID. +// The format makes it easy to skip over uninteresting or unknown chunks. +// +// A RIFF file is a single chunk with ID "RIFF". Its Data is: +// - Type : char[4] +// - Chunks : chunk[] +// +// This means that a RIFF file consists of: +// - "RIFF" : char[4] +// - File length - 8 : uint32 +// - File type : char[4] +// - Chunks : chunk[] +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_RIFF_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_RIFF_H +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ScopedPrinter.h" +#include + +namespace clang { +namespace clangd { +namespace riff { + +// A FourCC identifies a chunk in a file, or the type of file itself. +using FourCC = std::array; +// Get a FourCC from a string literal, e.g. fourCC("RIFF"). +inline constexpr FourCC fourCC(const char (&Literal)[5]) { + return FourCC{{Literal[0], Literal[1], Literal[2], Literal[3]}}; +} +// A chunk is a section in a RIFF container. +struct Chunk { + FourCC ID; + llvm::StringRef Data; +}; +inline bool operator==(const Chunk &L, const Chunk &R) { + return std::tie(L.ID, L.Data) == std::tie(R.ID, R.Data); +} +// A File is a RIFF container, which is a typed chunk sequence. +struct File { + FourCC Type; + std::vector Chunks; +}; +inline bool operator==(const File &L, const File &R) { + return std::tie(L.Type, L.Chunks) == std::tie(R.Type, R.Chunks); +} + +// Reads a single chunk from the start of Stream. +// Stream is updated to exclude the consumed chunk. +llvm::Expected readChunk(llvm::StringRef &Stream); + +// Serialize a single chunk to OS. +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Chunk &); + +// Parses a RIFF file consisting of a single RIFF chunk. +llvm::Expected readFile(llvm::StringRef Stream); + +// Serialize a RIFF file (i.e. a single RIFF chunk) to OS. +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const File &); + +} // namespace riff +} // namespace clangd +} // namespace clang +#endif diff --git a/clang-tools-extra/clangd/global-symbol-builder/GlobalSymbolBuilderMain.cpp b/clang-tools-extra/clangd/global-symbol-builder/GlobalSymbolBuilderMain.cpp index 84211c6..4fbc0a7 100644 --- a/clang-tools-extra/clangd/global-symbol-builder/GlobalSymbolBuilderMain.cpp +++ b/clang-tools-extra/clangd/global-symbol-builder/GlobalSymbolBuilderMain.cpp @@ -7,15 +7,16 @@ // //===----------------------------------------------------------------------===// // -// GlobalSymbolBuilder is a tool to generate YAML-format symbols across the -// whole project. This tools is for **experimental** only. Don't use it in -// production code. +// GlobalSymbolBuilder is a tool to extract symbols from a whole project. +// This tool is **experimental** only. Don't use it in production code. // //===----------------------------------------------------------------------===// +#include "RIFF.h" #include "index/CanonicalIncludes.h" #include "index/Index.h" #include "index/Merge.h" +#include "index/Serialization.h" #include "index/SymbolCollector.h" #include "index/SymbolYAML.h" #include "clang/Frontend/CompilerInstance.h" @@ -59,6 +60,14 @@ static llvm::cl::opt MergeOnTheFly( "MapReduce."), llvm::cl::init(true), llvm::cl::Hidden); +enum class Format { YAML, Binary }; +static llvm::cl::opt + Format("format", llvm::cl::desc("Format of the index to be written"), + llvm::cl::values( + clEnumValN(Format::YAML, "yaml", "human-readable YAML format"), + clEnumValN(Format::Binary, "binary", "binary RIFF format")), + llvm::cl::init(Format::YAML)); + /// Responsible for aggregating symbols from each processed file and producing /// the final results. All methods in this class must be thread-safe, /// 'consumeSymbols' may be called from multiple threads. @@ -210,8 +219,8 @@ int main(int argc, const char **argv) { llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); const char *Overview = R"( - This is an **experimental** tool to generate YAML-format project-wide symbols - for clangd (global code completion). It would be changed and deprecated + This is an **experimental** tool to extract symbols from a whole project + for clangd (global code completion). It will be changed and deprecated eventually. Don't use it in production code! Example usage for building index for the whole project using CMake compile @@ -262,7 +271,16 @@ int main(int argc, const char **argv) { } // Reduce phase: combine symbols with the same IDs. auto UniqueSymbols = Consumer->mergeResults(); - // Output phase: emit YAML for result symbols. - SymbolsToYAML(UniqueSymbols, llvm::outs()); + // Output phase: emit result symbols. + switch (clang::clangd::Format) { + case clang::clangd::Format::YAML: + SymbolsToYAML(UniqueSymbols, llvm::outs()); + break; + case clang::clangd::Format::Binary: { + clang::clangd::IndexFileOut Out; + Out.Symbols = &UniqueSymbols; + llvm::outs() << Out; + } + } return 0; } diff --git a/clang-tools-extra/clangd/index/Index.cpp b/clang-tools-extra/clangd/index/Index.cpp index 7cbc9205..48a7c68 100644 --- a/clang-tools-extra/clangd/index/Index.cpp +++ b/clang-tools-extra/clangd/index/Index.cpp @@ -10,6 +10,7 @@ #include "Index.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" #include "llvm/Support/SHA1.h" #include "llvm/Support/raw_ostream.h" @@ -28,21 +29,20 @@ SymbolID::SymbolID(StringRef USR) : HashValue(SHA1::hash(arrayRefFromStringRef(USR))) {} raw_ostream &operator<<(raw_ostream &OS, const SymbolID &ID) { - OS << toHex(toStringRef(ID.HashValue)); - return OS; + return OS << toHex(ID.raw()); } -std::string SymbolID::str() const { - std::string ID; - llvm::raw_string_ostream OS(ID); - OS << *this; - return OS.str(); +SymbolID SymbolID::fromRaw(llvm::StringRef Raw) { + SymbolID ID; + assert(Raw.size() == RawSize); + memcpy(ID.HashValue.data(), Raw.data(), RawSize); + return ID; } +std::string SymbolID::str() const { return toHex(raw()); } + void operator>>(StringRef Str, SymbolID &ID) { - std::string HexString = fromHex(Str); - assert(HexString.size() == ID.HashValue.size()); - std::copy(HexString.begin(), HexString.end(), ID.HashValue.begin()); + ID = SymbolID::fromRaw(fromHex(Str)); } raw_ostream &operator<<(raw_ostream &OS, SymbolOrigin O) { @@ -78,34 +78,18 @@ SymbolSlab::const_iterator SymbolSlab::find(const SymbolID &ID) const { } // Copy the underlying data of the symbol into the owned arena. -static void own(Symbol &S, llvm::UniqueStringSaver &Strings, - BumpPtrAllocator &Arena) { - // Intern replaces V with a reference to the same string owned by the arena. - auto Intern = [&](StringRef &V) { V = Strings.save(V); }; - - // We need to copy every StringRef field onto the arena. - Intern(S.Name); - Intern(S.Scope); - Intern(S.CanonicalDeclaration.FileURI); - Intern(S.Definition.FileURI); - - Intern(S.Signature); - Intern(S.CompletionSnippetSuffix); - - Intern(S.Documentation); - Intern(S.ReturnType); - for (auto &I : S.IncludeHeaders) - Intern(I.IncludeHeader); +static void own(Symbol &S, llvm::UniqueStringSaver &Strings) { + visitStrings(S, [&](StringRef &V) { V = Strings.save(V); }); } void SymbolSlab::Builder::insert(const Symbol &S) { auto R = SymbolIndex.try_emplace(S.ID, Symbols.size()); if (R.second) { Symbols.push_back(S); - own(Symbols.back(), UniqueStrings, Arena); + own(Symbols.back(), UniqueStrings); } else { auto &Copy = Symbols[R.first->second] = S; - own(Copy, UniqueStrings, Arena); + own(Copy, UniqueStrings); } } @@ -118,7 +102,7 @@ SymbolSlab SymbolSlab::Builder::build() && { BumpPtrAllocator NewArena; llvm::UniqueStringSaver Strings(NewArena); for (auto &S : Symbols) - own(S, Strings, NewArena); + own(S, Strings); return SymbolSlab(std::move(NewArena), std::move(Symbols)); } diff --git a/clang-tools-extra/clangd/index/Index.h b/clang-tools-extra/clangd/index/Index.h index bd21ead..2492370 100644 --- a/clang-tools-extra/clangd/index/Index.h +++ b/clang-tools-extra/clangd/index/Index.h @@ -84,26 +84,28 @@ public: return HashValue < Sym.HashValue; } + constexpr static size_t RawSize = 20; + llvm::StringRef raw() const { + return StringRef(reinterpret_cast(HashValue.data()), RawSize); + } + static SymbolID fromRaw(llvm::StringRef); // Returns a 40-bytes hex encoded string. std::string str() const; private: - static constexpr unsigned HashByteLength = 20; - - friend llvm::hash_code hash_value(const SymbolID &ID) { - // We already have a good hash, just return the first bytes. - static_assert(sizeof(size_t) <= HashByteLength, "size_t longer than SHA1!"); - size_t Result; - memcpy(&Result, ID.HashValue.data(), sizeof(size_t)); - return llvm::hash_code(Result); - } - friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const SymbolID &ID); friend void operator>>(llvm::StringRef Str, SymbolID &ID); - std::array HashValue; + std::array HashValue; }; +inline llvm::hash_code hash_value(const SymbolID &ID) { + // We already have a good hash, just return the first bytes. + assert(sizeof(size_t) <= SymbolID::RawSize && "size_t longer than SHA1!"); + size_t Result; + memcpy(&Result, ID.raw().data(), sizeof(size_t)); + return llvm::hash_code(Result); +} + // Write SymbolID into the given stream. SymbolID is encoded as a 40-bytes // hex string. llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const SymbolID &ID); @@ -246,6 +248,21 @@ struct Symbol { }; llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Symbol &S); +// Invokes Callback with each StringRef& contained in the Symbol. +// Useful for deduplicating backing strings. +template void visitStrings(Symbol &S, const Callback &CB) { + CB(S.Name); + CB(S.Scope); + CB(S.CanonicalDeclaration.FileURI); + CB(S.Definition.FileURI); + CB(S.Signature); + CB(S.CompletionSnippetSuffix); + CB(S.Documentation); + CB(S.ReturnType); + for (auto &Include : S.IncludeHeaders) + CB(Include.IncludeHeader); +} + // Computes query-independent quality score for a Symbol. // This currently falls in the range [1, ln(#indexed documents)]. // FIXME: this should probably be split into symbol -> signals diff --git a/clang-tools-extra/clangd/index/Serialization.cpp b/clang-tools-extra/clangd/index/Serialization.cpp new file mode 100644 index 0000000..df88142 --- /dev/null +++ b/clang-tools-extra/clangd/index/Serialization.cpp @@ -0,0 +1,366 @@ +//===-- Serialization.cpp - Binary serialization of index data ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#include "Serialization.h" +#include "../RIFF.h" +#include "llvm/Support/Compression.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +namespace clang { +namespace clangd { +namespace { +Error makeError(const Twine &Msg) { + return make_error(Msg, inconvertibleErrorCode()); +} + +// IO PRIMITIVES +// We use little-endian 32 bit ints, sometimes with variable-length encoding. + +StringRef consume(StringRef &Data, int N) { + StringRef Ret = Data.take_front(N); + Data = Data.drop_front(N); + return Ret; +} + +uint8_t consume8(StringRef &Data) { + uint8_t Ret = Data.front(); + Data = Data.drop_front(); + return Ret; +} + +uint32_t consume32(StringRef &Data) { + auto Ret = support::endian::read32le(Data.bytes_begin()); + Data = Data.drop_front(4); + return Ret; +} + +void write32(uint32_t I, raw_ostream &OS) { + char buf[4]; + support::endian::write32le(buf, I); + OS.write(buf, sizeof(buf)); +} + +// Variable-length int encoding (varint) uses the bottom 7 bits of each byte +// to encode the number, and the top bit to indicate whether more bytes follow. +// e.g. 9a 2f means [0x1a and keep reading, 0x2f and stop]. +// This represents 0x1a | 0x2f<<7 = 6042. +// A 32-bit integer takes 1-5 bytes to encode; small numbers are more compact. +void writeVar(uint32_t I, raw_ostream &OS) { + constexpr static uint8_t More = 1 << 7; + if (LLVM_LIKELY(I < 1 << 7)) { + OS.write(I); + return; + } + for (;;) { + OS.write(I | More); + I >>= 7; + if (I < 1 << 7) { + OS.write(I); + return; + } + } +} + +uint32_t consumeVar(StringRef &Data) { + constexpr static uint8_t More = 1 << 7; + uint8_t B = consume8(Data); + if (LLVM_LIKELY(!(B & More))) + return B; + uint32_t Val = B & ~More; + for (int Shift = 7; B & More && Shift < 32; Shift += 7) { + B = consume8(Data); + Val |= (B & ~More) << Shift; + } + return Val; +} + +// STRING TABLE ENCODING +// Index data has many string fields, and many strings are identical. +// We store each string once, and refer to them by index. +// +// The string table's format is: +// - UncompressedSize : uint32 +// - CompressedData : byte[CompressedSize] +// +// CompressedData is a zlib-compressed byte[UncompressedSize]. +// It contains a sequence of null-terminated strings, e.g. "foo\0bar\0". +// These are sorted to improve compression. + +// Maps each string to a canonical representation. +// Strings remain owned externally (e.g. by SymbolSlab). +class StringTableOut { + DenseSet Unique; + std::vector Sorted; + // Since strings are interned, look up can be by pointer. + DenseMap, unsigned> Index; + +public: + // Add a string to the table. Overwrites S if an identical string exists. + void intern(StringRef &S) { S = *Unique.insert(S).first; }; + // Finalize the table and write it to OS. No more strings may be added. + void finalize(raw_ostream &OS) { + Sorted = {Unique.begin(), Unique.end()}; + std::sort(Sorted.begin(), Sorted.end()); + for (unsigned I = 0; I < Sorted.size(); ++I) + Index.try_emplace({Sorted[I].data(), Sorted[I].size()}, I); + + std::string RawTable; + for (StringRef S : Sorted) { + RawTable.append(S); + RawTable.push_back(0); + } + SmallString<1> Compressed; + cantFail(zlib::compress(RawTable, Compressed)); + write32(RawTable.size(), OS); + OS << Compressed; + } + // Get the ID of an string, which must be interned. Table must be finalized. + unsigned index(StringRef S) const { + assert(!Sorted.empty() && "table not finalized"); + assert(Index.count({S.data(), S.size()}) && "string not interned"); + return Index.find({S.data(), S.size()})->second; + } +}; + +struct StringTableIn { + BumpPtrAllocator Arena; + std::vector Strings; +}; + +Expected readStringTable(StringRef Data) { + if (Data.size() < 4) + return makeError("Bad string table: not enough metadata"); + size_t UncompressedSize = consume32(Data); + SmallString<1> Uncompressed; + if (Error E = llvm::zlib::uncompress(Data, Uncompressed, UncompressedSize)) + return std::move(E); + + StringTableIn Table; + StringSaver Saver(Table.Arena); + for (StringRef Rest = Uncompressed; !Rest.empty();) { + auto Len = Rest.find(0); + if (Len == StringRef::npos) + return makeError("Bad string table: not null terminated"); + Table.Strings.push_back(Saver.save(consume(Rest, Len))); + Rest = Rest.drop_front(); + } + return Table; +} + +// SYMBOL ENCODING +// Each field of clangd::Symbol is encoded in turn (see implementation). +// - StringRef fields encode as varint (index into the string table) +// - enums encode as the underlying type +// - most numbers encode as varint + +// It's useful to the implementation to assume symbols have a bounded size. +constexpr size_t SymbolSizeBound = 512; +// To ensure the bounded size, restrict the number of include headers stored. +constexpr unsigned MaxIncludes = 50; + +void writeSymbol(const Symbol &Sym, const StringTableOut &Strings, + raw_ostream &OS) { + auto StartOffset = OS.tell(); + OS << Sym.ID.raw(); // TODO: once we start writing xrefs and posting lists, + // symbol IDs should probably be in a string table. + OS.write(static_cast(Sym.SymInfo.Kind)); + OS.write(static_cast(Sym.SymInfo.Lang)); + writeVar(Strings.index(Sym.Name), OS); + writeVar(Strings.index(Sym.Scope), OS); + for (const auto &Loc : {Sym.Definition, Sym.CanonicalDeclaration}) { + writeVar(Strings.index(Loc.FileURI), OS); + for (const auto &Endpoint : {Loc.Start, Loc.End}) { + writeVar(Endpoint.Line, OS); + writeVar(Endpoint.Column, OS); + } + } + writeVar(Sym.References, OS); + OS.write(Sym.IsIndexedForCodeCompletion); + OS.write(static_cast(Sym.Origin)); + writeVar(Strings.index(Sym.Signature), OS); + writeVar(Strings.index(Sym.CompletionSnippetSuffix), OS); + writeVar(Strings.index(Sym.Documentation), OS); + writeVar(Strings.index(Sym.ReturnType), OS); + + auto WriteInclude = [&](const Symbol::IncludeHeaderWithReferences &Include) { + writeVar(Strings.index(Include.IncludeHeader), OS); + writeVar(Include.References, OS); + }; + // There are almost certainly few includes, so we can just write them. + if (LLVM_LIKELY(Sym.IncludeHeaders.size() <= MaxIncludes)) { + writeVar(Sym.IncludeHeaders.size(), OS); + for (const auto &Include : Sym.IncludeHeaders) + WriteInclude(Include); + } else { + // If there are too many, make sure we truncate the least important. + using Pointer = const Symbol::IncludeHeaderWithReferences *; + std::vector Pointers; + for (const auto &Include : Sym.IncludeHeaders) + Pointers.push_back(&Include); + std::sort(Pointers.begin(), Pointers.end(), [](Pointer L, Pointer R) { + return L->References > R->References; + }); + Pointers.resize(MaxIncludes); + + writeVar(MaxIncludes, OS); + for (Pointer P : Pointers) + WriteInclude(*P); + } + + assert(OS.tell() - StartOffset < SymbolSizeBound && "Symbol length unsafe!"); + (void)StartOffset; // Unused in NDEBUG; +} + +Expected readSymbol(StringRef &Data, const StringTableIn &Strings) { + // Usually we can skip bounds checks because the buffer is huge. + // Near the end of the buffer, this would be unsafe. In this rare case, copy + // the data into a bigger buffer so we can again skip the checks. + if (LLVM_UNLIKELY(Data.size() < SymbolSizeBound)) { + std::string Buf(Data); + Buf.resize(SymbolSizeBound); + StringRef ExtendedData = Buf; + auto Ret = readSymbol(ExtendedData, Strings); + unsigned BytesRead = Buf.size() - ExtendedData.size(); + if (BytesRead > Data.size()) + return makeError("read past end of data"); + Data = Data.drop_front(BytesRead); + return Ret; + } + +#define READ_STRING(Field) \ + do { \ + auto StringIndex = consumeVar(Data); \ + if (LLVM_UNLIKELY(StringIndex >= Strings.Strings.size())) \ + return makeError("Bad string index"); \ + Field = Strings.Strings[StringIndex]; \ + } while (0) + + Symbol Sym; + Sym.ID = SymbolID::fromRaw(consume(Data, 20)); + Sym.SymInfo.Kind = static_cast(consume8(Data)); + Sym.SymInfo.Lang = static_cast(consume8(Data)); + READ_STRING(Sym.Name); + READ_STRING(Sym.Scope); + for (SymbolLocation *Loc : {&Sym.Definition, &Sym.CanonicalDeclaration}) { + READ_STRING(Loc->FileURI); + for (auto &Endpoint : {&Loc->Start, &Loc->End}) { + Endpoint->Line = consumeVar(Data); + Endpoint->Column = consumeVar(Data); + } + } + Sym.References = consumeVar(Data); + Sym.IsIndexedForCodeCompletion = consume8(Data); + Sym.Origin = static_cast(consume8(Data)); + READ_STRING(Sym.Signature); + READ_STRING(Sym.CompletionSnippetSuffix); + READ_STRING(Sym.Documentation); + READ_STRING(Sym.ReturnType); + unsigned IncludeHeaderN = consumeVar(Data); + if (IncludeHeaderN > MaxIncludes) + return makeError("too many IncludeHeaders"); + Sym.IncludeHeaders.resize(IncludeHeaderN); + for (auto &I : Sym.IncludeHeaders) { + READ_STRING(I.IncludeHeader); + I.References = consumeVar(Data); + } + +#undef READ_STRING + return Sym; +} + +} // namespace + +// FILE ENCODING +// A file is a RIFF chunk with type 'CdIx'. +// It contains the sections: +// - meta: version number +// - stri: string table +// - symb: symbols + +// The current versioning scheme is simple - non-current versions are rejected. +// This allows arbitrary format changes, which invalidate stored data. +// Later we may want to support some backward compatibility. +constexpr static uint32_t Version = 1; + +Expected readIndexFile(StringRef Data) { + auto RIFF = riff::readFile(Data); + if (!RIFF) + return RIFF.takeError(); + if (RIFF->Type != riff::fourCC("CdIx")) + return makeError("wrong RIFF type"); + StringMap Chunks; + for (const auto &Chunk : RIFF->Chunks) + Chunks.try_emplace(StringRef(Chunk.ID.data(), Chunk.ID.size()), Chunk.Data); + + for (StringRef RequiredChunk : {"meta", "stri"}) + if (!Chunks.count(RequiredChunk)) + return makeError("missing required chunk " + RequiredChunk); + + StringRef Meta = Chunks.lookup("meta"); + if (Meta.size() < 4 || consume32(Meta) != Version) + return makeError("wrong version"); + + auto Strings = readStringTable(Chunks.lookup("stri")); + if (!Strings) + return Strings.takeError(); + + IndexFileIn Result; + if (Chunks.count("symb")) { + StringRef SymbolData = Chunks.lookup("symb"); + SymbolSlab::Builder Symbols; + while (!SymbolData.empty()) + if (auto Sym = readSymbol(SymbolData, *Strings)) + Symbols.insert(*Sym); + else + return Sym.takeError(); + Result.Symbols = std::move(Symbols).build(); + } + return Result; +} + +raw_ostream &operator<<(raw_ostream &OS, const IndexFileOut &Data) { + assert(Data.Symbols && "An index file without symbols makes no sense!"); + riff::File RIFF; + RIFF.Type = riff::fourCC("CdIx"); + + SmallString<4> Meta; + { + raw_svector_ostream MetaOS(Meta); + write32(Version, MetaOS); + } + RIFF.Chunks.push_back({riff::fourCC("meta"), Meta}); + + StringTableOut Strings; + std::vector Symbols; + for (const auto &Sym : *Data.Symbols) { + Symbols.emplace_back(Sym); + visitStrings(Symbols.back(), [&](StringRef &S) { Strings.intern(S); }); + } + + std::string StringSection; + { + raw_string_ostream StringOS(StringSection); + Strings.finalize(StringOS); + } + RIFF.Chunks.push_back({riff::fourCC("stri"), StringSection}); + + std::string SymbolSection; + { + raw_string_ostream SymbolOS(SymbolSection); + for (const auto &Sym : Symbols) + writeSymbol(Sym, Strings, SymbolOS); + } + RIFF.Chunks.push_back({riff::fourCC("symb"), SymbolSection}); + + return OS << RIFF; +} + +} // namespace clangd +} // namespace clang diff --git a/clang-tools-extra/clangd/index/Serialization.h b/clang-tools-extra/clangd/index/Serialization.h new file mode 100644 index 0000000..7ad867f --- /dev/null +++ b/clang-tools-extra/clangd/index/Serialization.h @@ -0,0 +1,48 @@ +//===--- Serialization.h - Binary serialization of index data ----*- C++-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides a compact binary serialization of indexed symbols. +// +// It writes two sections: +// - a string table (which is compressed) +// - lists of encoded symbols +// +// The format has a simple versioning scheme: the version is embedded in the +// data and non-current versions are rejected when reading. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_RIFF_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_RIFF_H +#include "Index.h" +#include "llvm/Support/Error.h" + +namespace clang { +namespace clangd { + +// Specifies the contents of an index file to be written. +struct IndexFileOut { + const SymbolSlab *Symbols; + // TODO: Support serializing symbol occurrences. + // TODO: Support serializing Dex posting lists. +}; +// Serializes an index file. (This is a RIFF container chunk). +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const IndexFileOut &); + +// Holds the contents of an index file that was read. +struct IndexFileIn { + llvm::Optional Symbols; +}; +// Parse an index file. The input must be a RIFF container chunk. +llvm::Expected readIndexFile(llvm::StringRef); + +} // namespace clangd +} // namespace clang + +#endif diff --git a/clang-tools-extra/clangd/index/SymbolYAML.cpp b/clang-tools-extra/clangd/index/SymbolYAML.cpp index 8db4845..d90f4b0 100644 --- a/clang-tools-extra/clangd/index/SymbolYAML.cpp +++ b/clang-tools-extra/clangd/index/SymbolYAML.cpp @@ -9,6 +9,7 @@ #include "SymbolYAML.h" #include "Index.h" +#include "Serialization.h" #include "dex/DexIndex.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" @@ -189,10 +190,20 @@ std::unique_ptr loadIndex(llvm::StringRef SymbolFile, llvm::errs() << "Can't open " << SymbolFile << "\n"; return nullptr; } - auto Slab = symbolsFromYAML(Buffer.get()->getBuffer()); + StringRef Data = Buffer->get()->getBuffer(); + + llvm::Optional Slab; + if (Data.startswith("RIFF")) { // Magic for binary index file. + if (auto RIFF = readIndexFile(Data)) + Slab = std::move(RIFF->Symbols); + else + llvm::errs() << "Bad RIFF: " << llvm::toString(RIFF.takeError()) << "\n"; + } else { + Slab = symbolsFromYAML(Data); + } - return UseDex ? dex::DexIndex::build(std::move(Slab)) - : MemIndex::build(std::move(Slab), RefSlab()); + return UseDex ? dex::DexIndex::build(std::move(*Slab)) + : MemIndex::build(std::move(*Slab), RefSlab()); } } // namespace clangd diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index a71f0b6..ffc1fec 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -10,6 +10,7 @@ #include "ClangdLSPServer.h" #include "JSONRPCDispatcher.h" #include "Path.h" +#include "RIFF.h" #include "Trace.h" #include "index/SymbolYAML.h" #include "index/dex/DexIndex.h" diff --git a/clang-tools-extra/unittests/clangd/CMakeLists.txt b/clang-tools-extra/unittests/clangd/CMakeLists.txt index 92f8b96..dee2e4e 100644 --- a/clang-tools-extra/unittests/clangd/CMakeLists.txt +++ b/clang-tools-extra/unittests/clangd/CMakeLists.txt @@ -26,6 +26,8 @@ add_extra_unittest(ClangdTests HeadersTests.cpp IndexTests.cpp QualityTests.cpp + RIFFTests.cpp + SerializationTests.cpp SourceCodeTests.cpp SymbolCollectorTests.cpp SyncAPI.cpp diff --git a/clang-tools-extra/unittests/clangd/RIFFTests.cpp b/clang-tools-extra/unittests/clangd/RIFFTests.cpp new file mode 100644 index 0000000..d252edf --- /dev/null +++ b/clang-tools-extra/unittests/clangd/RIFFTests.cpp @@ -0,0 +1,39 @@ +//===-- RIFFTests.cpp - Binary container unit tests -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "RIFF.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace clang { +namespace clangd { +namespace { +using namespace llvm; +using ::testing::ElementsAre; + +TEST(RIFFTest, File) { + riff::File File{riff::fourCC("test"), + { + {riff::fourCC("even"), "abcd"}, + {riff::fourCC("oddd"), "abcde"}, + }}; + StringRef Serialized = StringRef("RIFF\x1e\0\0\0test" + "even\x04\0\0\0abcd" + "oddd\x05\0\0\0abcde\0", + 38); + + EXPECT_EQ(llvm::to_string(File), Serialized); + auto Parsed = riff::readFile(Serialized); + ASSERT_TRUE(bool(Parsed)) << Parsed.takeError(); + EXPECT_EQ(*Parsed, File); +} + +} // namespace +} // namespace clangd +} // namespace clang diff --git a/clang-tools-extra/unittests/clangd/SerializationTests.cpp b/clang-tools-extra/unittests/clangd/SerializationTests.cpp new file mode 100644 index 0000000..cc43096 --- /dev/null +++ b/clang-tools-extra/unittests/clangd/SerializationTests.cpp @@ -0,0 +1,138 @@ +//===-- SerializationTests.cpp - Binary and YAML serialization unit tests -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "index/Serialization.h" +#include "index/SymbolYAML.h" +#include "llvm/Support/ScopedPrinter.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using testing::UnorderedElementsAre; +using testing::UnorderedElementsAreArray; +namespace clang { +namespace clangd { +namespace { + +const char *YAML1 = R"( +--- +ID: 057557CEBF6E6B2DD437FBF60CC58F352D1DF856 +Name: 'Foo1' +Scope: 'clang::' +SymInfo: + Kind: Function + Lang: Cpp +CanonicalDeclaration: + FileURI: file:///path/foo.h + Start: + Line: 1 + Column: 0 + End: + Line: 1 + Column: 1 +IsIndexedForCodeCompletion: true +Documentation: 'Foo doc' +ReturnType: 'int' +IncludeHeaders: + - Header: 'include1' + References: 7 + - Header: 'include2' + References: 3 +... +)"; + +const char *YAML2 = R"( +--- +ID: 057557CEBF6E6B2DD437FBF60CC58F352D1DF858 +Name: 'Foo2' +Scope: 'clang::' +SymInfo: + Kind: Function + Lang: Cpp +CanonicalDeclaration: + FileURI: file:///path/bar.h + Start: + Line: 1 + Column: 0 + End: + Line: 1 + Column: 1 +IsIndexedForCodeCompletion: false +Signature: '-sig' +CompletionSnippetSuffix: '-snippet' +... +)"; + +MATCHER_P(QName, Name, "") { return (arg.Scope + arg.Name).str() == Name; } +MATCHER_P2(IncludeHeaderWithRef, IncludeHeader, References, "") { + return (arg.IncludeHeader == IncludeHeader) && (arg.References == References); +} + +TEST(SerializationTest, YAMLConversions) { + auto Symbols1 = symbolsFromYAML(YAML1); + ASSERT_EQ(Symbols1.size(), 1u); + const auto &Sym1 = *Symbols1.begin(); + EXPECT_THAT(Sym1, QName("clang::Foo1")); + EXPECT_EQ(Sym1.Signature, ""); + EXPECT_EQ(Sym1.Documentation, "Foo doc"); + EXPECT_EQ(Sym1.ReturnType, "int"); + EXPECT_EQ(Sym1.CanonicalDeclaration.FileURI, "file:///path/foo.h"); + EXPECT_TRUE(Sym1.IsIndexedForCodeCompletion); + EXPECT_THAT(Sym1.IncludeHeaders, + UnorderedElementsAre(IncludeHeaderWithRef("include1", 7u), + IncludeHeaderWithRef("include2", 3u))); + + auto Symbols2 = symbolsFromYAML(YAML2); + ASSERT_EQ(Symbols2.size(), 1u); + const auto &Sym2 = *Symbols2.begin(); + EXPECT_THAT(Sym2, QName("clang::Foo2")); + EXPECT_EQ(Sym2.Signature, "-sig"); + EXPECT_EQ(Sym2.ReturnType, ""); + EXPECT_EQ(Sym2.CanonicalDeclaration.FileURI, "file:///path/bar.h"); + EXPECT_FALSE(Sym2.IsIndexedForCodeCompletion); + + std::string ConcatenatedYAML; + { + llvm::raw_string_ostream OS(ConcatenatedYAML); + SymbolsToYAML(Symbols1, OS); + SymbolsToYAML(Symbols2, OS); + } + auto ConcatenatedSymbols = symbolsFromYAML(ConcatenatedYAML); + EXPECT_THAT(ConcatenatedSymbols, + UnorderedElementsAre(QName("clang::Foo1"), QName("clang::Foo2"))); +} + +std::vector YAMLFromSymbols(const SymbolSlab &Slab) { + std::vector Result; + for (const auto &Sym : Slab) + Result.push_back(SymbolToYAML(Sym)); + return Result; +} + +TEST(SerializationTest, BinaryConversions) { + // We reuse the test symbols from YAML. + auto Slab = symbolsFromYAML(std::string(YAML1) + YAML2); + ASSERT_EQ(Slab.size(), 2u); + + // Write to binary format, and parse again. + IndexFileOut Out; + Out.Symbols = &Slab; + std::string Serialized = llvm::to_string(Out); + + auto In = readIndexFile(Serialized); + ASSERT_TRUE(bool(In)) << In.takeError(); + ASSERT_TRUE(In->Symbols); + + // Assert the YAML serializations match, for nice comparisons and diffs. + EXPECT_THAT(YAMLFromSymbols(*In->Symbols), + UnorderedElementsAreArray(YAMLFromSymbols(Slab))); +} + +} // namespace +} // namespace clangd +} // namespace clang diff --git a/clang-tools-extra/unittests/clangd/SymbolCollectorTests.cpp b/clang-tools-extra/unittests/clangd/SymbolCollectorTests.cpp index cc3c7d1..6688dae 100644 --- a/clang-tools-extra/unittests/clangd/SymbolCollectorTests.cpp +++ b/clang-tools-extra/unittests/clangd/SymbolCollectorTests.cpp @@ -48,7 +48,6 @@ using testing::UnorderedElementsAreArray; MATCHER_P(Labeled, Label, "") { return (arg.Name + arg.Signature).str() == Label; } -MATCHER(HasReturnType, "") { return !arg.ReturnType.empty(); } MATCHER_P(ReturnType, D, "") { return arg.ReturnType == D; } MATCHER_P(Doc, D, "") { return arg.Documentation == D; } MATCHER_P(Snippet, S, "") { @@ -744,84 +743,6 @@ TEST_F(SymbolCollectorTest, Snippet) { Snippet("ff(${1:int x}, ${2:double y})")))); } -TEST_F(SymbolCollectorTest, YAMLConversions) { - const std::string YAML1 = R"( ---- -ID: 057557CEBF6E6B2DD437FBF60CC58F352D1DF856 -Name: 'Foo1' -Scope: 'clang::' -SymInfo: - Kind: Function - Lang: Cpp -CanonicalDeclaration: - FileURI: file:///path/foo.h - Start: - Line: 1 - Column: 0 - End: - Line: 1 - Column: 1 -IsIndexedForCodeCompletion: true -Documentation: 'Foo doc' -ReturnType: 'int' -IncludeHeaders: - - Header: 'include1' - References: 7 - - Header: 'include2' - References: 3 -... -)"; - const std::string YAML2 = R"( ---- -ID: 057557CEBF6E6B2DD437FBF60CC58F352D1DF858 -Name: 'Foo2' -Scope: 'clang::' -SymInfo: - Kind: Function - Lang: Cpp -CanonicalDeclaration: - FileURI: file:///path/bar.h - Start: - Line: 1 - Column: 0 - End: - Line: 1 - Column: 1 -IsIndexedForCodeCompletion: false -Signature: '-sig' -CompletionSnippetSuffix: '-snippet' -... -)"; - - auto Symbols1 = symbolsFromYAML(YAML1); - - EXPECT_THAT(Symbols1, - UnorderedElementsAre(AllOf(QName("clang::Foo1"), Labeled("Foo1"), - Doc("Foo doc"), ReturnType("int"), - DeclURI("file:///path/foo.h"), - ForCodeCompletion(true)))); - auto &Sym1 = *Symbols1.begin(); - EXPECT_THAT(Sym1.IncludeHeaders, - UnorderedElementsAre(IncludeHeaderWithRef("include1", 7u), - IncludeHeaderWithRef("include2", 3u))); - auto Symbols2 = symbolsFromYAML(YAML2); - EXPECT_THAT(Symbols2, UnorderedElementsAre(AllOf( - QName("clang::Foo2"), Labeled("Foo2-sig"), - Not(HasReturnType()), DeclURI("file:///path/bar.h"), - ForCodeCompletion(false)))); - - std::string ConcatenatedYAML; - { - llvm::raw_string_ostream OS(ConcatenatedYAML); - SymbolsToYAML(Symbols1, OS); - SymbolsToYAML(Symbols2, OS); - } - auto ConcatenatedSymbols = symbolsFromYAML(ConcatenatedYAML); - EXPECT_THAT(ConcatenatedSymbols, - UnorderedElementsAre(QName("clang::Foo1"), - QName("clang::Foo2"))); -} - TEST_F(SymbolCollectorTest, IncludeHeaderSameAsFileURI) { CollectorOpts.CollectIncludePath = true; runSymbolCollector("class Foo {};", /*Main=*/""); -- 2.7.4