1 // Copyright 2020 The Pigweed Authors
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
7 // https://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
21 namespace pw::tokenizer {
23 // Reads entries from a binary token string database. This class does not copy
24 // or modify the contents of the database.
26 // A binary token database is comprised of a 16-byte header followed by an array
27 // of 8-byte entries and a table of null-terminated strings. The header
28 // specifies the number of entries. Each entry contains information about a
29 // tokenized string: the token and removal date, if any. All fields are
35 // -----------------------------------
36 // 0 6 Magic number (TOKENS)
37 // 6 2 Version (00 00)
44 // -----------------------------------
46 // 4 4 Removal date (d m yy)
48 // Entries are sorted by token. A string table with a null-terminated string for
49 // each entry in order follows the entries.
51 // Entries are accessed by iterating over the database. A O(n) Find function is
52 // also provided. In typical use, a TokenDatabase is preprocessed by a
53 // Detokenizer into a std::unordered_map.
56 // Internal struct that describes how the underlying binary token database
57 // stores entries. RawEntries generally should not be used directly. Instead,
58 // use an Entry, which contains a pointer to the entry's string.
61 uint32_t date_removed;
64 static_assert(sizeof(RawEntry) == 8u);
66 // An entry in the token database. This struct adds the string to a RawEntry.
68 // The token calculated for this string.
71 // The date the token and string was removed from the database, or
72 // 0xFFFFFFFF if it was never removed. Dates are encoded such that natural
73 // integer sorting sorts from oldest to newest dates. The day is stored as
74 // an 8-bit day, 8-bit month, and 16-bit year, packed into a little-endian
76 uint32_t date_removed;
78 // The null-terminated string represented by this token.
82 // Iterator for TokenDatabase values. Note that this is not a normal STL-style
83 // iterator, since * returns a value instead of a reference.
// Creates an iterator positioned at raw_entry, with string pointing at the
// corresponding entry's text in the string table (entry() pairs the two).
86 constexpr Iterator(const RawEntry* raw_entry, const char* string)
87 : raw_(raw_entry), string_(string) {}
89 // Constructs a TokenDatabase::Entry for the entry this iterator refers to.
90 constexpr Entry entry() const {
91 return {raw_->token, raw_->date_removed, string_};
94 constexpr Iterator& operator++() {
96 // Move string_ to the character beyond the next null terminator.
97 while (*string_++ != '\0') {
101 constexpr Iterator operator++(int) {
102 Iterator previous(raw_, string_);
106 constexpr bool operator==(const Iterator& rhs) const {
107 return raw_ == rhs.raw_;
109 constexpr bool operator!=(const Iterator& rhs) const {
110 return raw_ != rhs.raw_;
113 // Dereferencing a TokenDatabase::Iterator returns an Entry, not an Entry&.
114 constexpr Entry operator*() const { return entry(); }
116 // Point directly into the underlying RawEntry. Strings are not accessible
117 // via this operator.
118 constexpr const RawEntry* operator->() const { return raw_; }
120 constexpr ptrdiff_t operator-(const Iterator& rhs) const {
121 return raw_ - rhs.raw_;
125 const RawEntry* raw_;
129 // A list of token entries returned from a Find operation. This object can be
130 // iterated over or indexed as an array.
// Captures the [begin, end) iterator range of matching entries.
133 constexpr Entries(const Iterator& begin, const Iterator& end)
134 : begin_(begin), end_(end) {}
136 // The number of entries in this list.
137 constexpr size_t size() const { return end_ - begin_; }
139 // True if the list is empty.
140 constexpr bool empty() const { return begin_ == end_; }
142 // Accesses the specified entry in this set. Returns an Entry object, which
143 // is constructed from the underlying raw entry. The index must be less than
144 // size(). This operation is O(n) in size().
145 Entry operator[](size_t index) const;
147 constexpr const Iterator& begin() const { return begin_; }
148 constexpr const Iterator& end() const { return end_; }
155 // Returns true if the provided data is a valid token database. This checks
156 // the magic number ("TOKENS"), version (which must be 0), and that there is
157 // one string for each entry in the database. A database with extra strings
158 // or other trailing data is considered valid.
159 template <typename ByteArray>
160 static constexpr bool IsValid(const ByteArray& bytes) {
161 return HasValidHeader(bytes) && EachEntryHasAString(bytes);
164 // Creates a TokenDatabase and checks if the provided data is valid at compile
165 // time. Accepts references to constexpr containers (array, span, string_view,
166 // etc.) with static storage duration. For example:
168 // constexpr char kMyData[] = ...;
169 // constexpr TokenDatabase db = TokenDatabase::Create<kMyData>();
171 template <const auto& kDatabaseBytes>
172 static constexpr TokenDatabase Create() {
174 HasValidHeader<decltype(kDatabaseBytes)>(kDatabaseBytes),
175 "Databases must start with a 16-byte header that begins with TOKENS.");
177 static_assert(EachEntryHasAString<decltype(kDatabaseBytes)>(kDatabaseBytes),
178 "The database must have at least one string for each entry.");
180 return TokenDatabase(std::data(kDatabaseBytes));
183 // Creates a TokenDatabase from the provided byte array. The array may be a
184 // span, array, or other container type. If the data is not valid, returns a
185 // default-constructed database for which ok() is false.
187 // Prefer the Create overload that takes the data as a template parameter
188 // whenever possible, since that function checks the integrity of the data at
190 template <typename ByteArray>
191 static constexpr TokenDatabase Create(const ByteArray& database_bytes) {
192 return IsValid<ByteArray>(database_bytes)
193 ? TokenDatabase(std::data(database_bytes))
194 : TokenDatabase(); // Invalid database.
196 // Creates a database with no data. ok() returns false.
197 constexpr TokenDatabase() : begin_{.data = nullptr}, end_{.data = nullptr} {}
199 // Returns all entries associated with this token. This is a O(n) operation.
200 Entries Find(uint32_t token) const;
202 // Returns the total number of entries (unique token-string pairs).
203 constexpr size_t size() const {
204 return (end_.data - begin_.data) / sizeof(RawEntry);
207 // True if this database was constructed with valid data.
208 constexpr bool ok() const { return begin_.data != nullptr; }
// begin() pairs the first RawEntry with end_.data, which points just past the
// entry array — i.e. at the first string of the string table.
210 Iterator begin() const { return Iterator(begin_.entry, end_.data); }
// end() needs no string pointer: iterator comparison only examines raw_.
211 Iterator end() const { return Iterator(end_.entry, nullptr); }
215 std::array<char, 6> magic;
217 uint32_t entry_count;
221 static_assert(sizeof(Header) == 2 * sizeof(RawEntry));
223 template <typename ByteArray>
224 static constexpr bool HasValidHeader(const ByteArray& bytes) {
225 static_assert(sizeof(*std::data(bytes)) == 1u);
227 if (std::size(bytes) < sizeof(Header)) {
231 // Check the magic number and version.
232 for (size_t i = 0; i < kMagicAndVersion.size(); ++i) {
233 if (bytes[i] != kMagicAndVersion[i]) {
241 template <typename ByteArray>
242 static constexpr bool EachEntryHasAString(const ByteArray& bytes) {
243 const size_t entries = ReadEntryCount(std::data(bytes));
245 // Check that the data is large enough to have a string table.
246 if (std::size(bytes) < StringTable(entries)) {
250 // Count the strings in the string table.
251 size_t string_count = 0;
252 for (auto i = std::begin(bytes) + StringTable(entries); i < std::end(bytes);
254 string_count += (*i == '\0') ? 1 : 0;
257 // Check that there is at least one string for each entry.
258 return string_count >= entries;
261 // Reads the number of entries from a database header. Cast the bytes to
262 // uint8_t to avoid sign extension if T is signed.
263 template <typename T>
264 static constexpr uint32_t ReadEntryCount(const T* header_bytes) {
265 const T* bytes = header_bytes + offsetof(Header, entry_count);
266 return static_cast<uint8_t>(bytes[0]) |
267 static_cast<uint8_t>(bytes[1]) << 8 |
268 static_cast<uint8_t>(bytes[2]) << 16 |
269 static_cast<uint8_t>(bytes[3]) << 24;
272 // Calculates the offset of the string table.
273 static constexpr size_t StringTable(size_t entries) {
274 return sizeof(Header) + entries * sizeof(RawEntry);
277 // The magic number that starts the table is "TOKENS". The version is encoded
278 // next as two bytes.
279 static constexpr std::array<char, 8> kMagicAndVersion = {
280 'T', 'O', 'K', 'E', 'N', 'S', '\0', '\0'};
282 template <typename Byte>
283 constexpr TokenDatabase(const Byte bytes[])
284 : TokenDatabase(bytes + sizeof(Header),
285 bytes + StringTable(ReadEntryCount(bytes))) {
286 static_assert(sizeof(Byte) == 1u);
289 // It is illegal to reinterpret_cast in constexpr functions, but acceptable to
290 // use unions. Instead of using a reinterpret_cast to change the byte pointer
291 // to a RawEntry pointer, have a separate overload for each byte pointer type
292 // and store them in a union.
293 constexpr TokenDatabase(const char* begin, const char* end)
294 : begin_{.data = begin}, end_{.data = end} {}
296 constexpr TokenDatabase(const unsigned char* begin, const unsigned char* end)
297 : begin_{.unsigned_data = begin}, end_{.unsigned_data = end} {}
299 constexpr TokenDatabase(const signed char* begin, const signed char* end)
300 : begin_{.signed_data = begin}, end_{.signed_data = end} {}
302 // Store the beginning and end pointers as a union to avoid breaking constexpr
303 // rules for reinterpret_cast.
305 const RawEntry* entry;
307 const unsigned char* unsigned_data;
308 const signed char* signed_data;
312 } // namespace pw::tokenizer