1 // Copyright 2020 The Pigweed Authors
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
7 // https://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
21 namespace pw::tokenizer {
23 // Reads entries from a binary token string database. This class does not copy
24 // or modify the contents of the database.
26 // A binary token database is comprised of a 16-byte header followed by an array
27 // of 8-byte entries and a table of null-terminated strings. The header
28 // specifies the number of entries. Each entry contains information about a
29 // tokenized string: the token and removal date, if any. All fields are
35 // -----------------------------------
36 // 0 6 Magic number (TOKENS)
37 // 6 2 Version (00 00)
44 // -----------------------------------
46 // 4 4 Removal date (d m yy)
48 // Entries are sorted by token. A string table with a null-terminated string for
49 // each entry in order follows the entries.
51 // Entries are accessed by iterating over the database. A O(n) Find function is
52 // also provided. In typical use, a TokenDatabase is preprocessed by a
53 // Detokenizer into a std::unordered_map.
56 // Internal struct that describes how the underlying binary token database
57 // stores entries. RawEntries generally should not be used directly. Instead,
58 // use an Entry, which contains a pointer to the entry's string.
61 uint32_t date_removed;
64 static_assert(sizeof(RawEntry) == 8u);
66 // An entry in the token database. This struct adds the string to a RawEntry.
68 // The token calculated for this string.
71 // The date the token and string was removed from the database, or
72 // 0xFFFFFFFF if it was never removed. Dates are encoded such that natural
73 // integer sorting sorts from oldest to newest dates. The day is stored as
74 // an 8-bit day, 8-bit month, and 16-bit year, packed into a little-endian
76 uint32_t date_removed;
78 // The null-terminated string represented by this token.
82 // Iterator for TokenDatabase values. Note that this is not a normal STL-style
83 // iterator, since * returns a value instead of a reference.
// Creates an iterator positioned at raw_entry, with string pointing at the
// corresponding entry's text in the string table (entry() pairs the two).
86 constexpr Iterator(const RawEntry* raw_entry, const char* string)
87 : raw_(raw_entry), string_(string) {}
89 // Constructs a TokenDatabase::Entry for the entry this iterator refers to.
90 constexpr Entry entry() const {
91 return {raw_->token, raw_->date_removed, string_};
94 constexpr Iterator& operator++() {
96 // Move string_ to the character beyond the next null terminator.
97 while (*string_++ != '\0') {
101 constexpr Iterator operator++(int) {
102 Iterator previous(raw_, string_);
106 constexpr bool operator==(const Iterator& rhs) const {
107 return raw_ == rhs.raw_;
109 constexpr bool operator!=(const Iterator& rhs) const {
110 return raw_ != rhs.raw_;
113 // Dereferencing a TokenDatabase::Iterator returns an Entry, not an Entry&.
114 constexpr Entry operator*() const { return entry(); }
116 // Point directly into the underlying RawEntry. Strings are not accessible
117 // via this operator.
118 constexpr const RawEntry* operator->() const { return raw_; }
120 constexpr ptrdiff_t operator-(const Iterator& rhs) const {
121 return raw_ - rhs.raw_;
125 const RawEntry* raw_;
129 // A list of token entries returned from a Find operation. This object can be
130 // iterated over or indexed as an array.
// Captures the [begin, end) iterator range of matching entries.
133 constexpr Entries(const Iterator& begin, const Iterator& end)
134 : begin_(begin), end_(end) {}
136 // The number of entries in this list.
137 constexpr size_t size() const { return end_ - begin_; }
139 // True if the list is empty.
140 constexpr bool empty() const { return begin_ == end_; }
142 // Accesses the specified entry in this set. Returns an Entry object, which
143 // is constructed from the underlying raw entry. The index must be less than
144 // size(). This operation is O(n) in size().
145 Entry operator[](size_t index) const;
147 constexpr const Iterator& begin() const { return begin_; }
148 constexpr const Iterator& end() const { return end_; }
155 // Returns true if the provided data is a valid token database. This checks
156 // the magic number ("TOKENS"), version (which must be 0), and that there is
157 // one string for each entry in the database. A database with extra strings
158 // or other trailing data is considered valid.
159 template <typename ByteArray>
160 static constexpr bool IsValid(const ByteArray& bytes) {
161 return HasValidHeader(bytes) && EachEntryHasAString(bytes);
164 // Creates a TokenDatabase and checks if the provided data is valid at compile
165 // time. Accepts references to constexpr containers (array, span, string_view,
166 // etc.) with static storage duration. For example:
168 // constexpr char kMyData[] = ...;
169 // constexpr TokenDatabase db = TokenDatabase::Create<kMyData>();
171 template <const auto& kDatabaseBytes>
172 static constexpr TokenDatabase Create() {
174 HasValidHeader<decltype(kDatabaseBytes)>(kDatabaseBytes),
175 "Databases must start with a 16-byte header that begins with TOKENS.");
177 static_assert(EachEntryHasAString<decltype(kDatabaseBytes)>(kDatabaseBytes),
178 "The database must have at least one string for each entry.");
180 return TokenDatabase(std::data(kDatabaseBytes));
183 // Creates a TokenDatabase from the provided byte array. The array may be a
184 // span, array, or other container type. If the data is not valid, returns a
185 // default-constructed database for which ok() is false.
187 // Prefer the Create overload that takes the data as a template parameter
188 // whenever possible, since that function checks the integrity of the data at
190 template <typename ByteArray>
191 static constexpr TokenDatabase Create(const ByteArray& database_bytes) {
192 return IsValid<ByteArray>(database_bytes)
193 ? TokenDatabase(std::data(database_bytes))
194 : TokenDatabase(); // Invalid database.
196 // Creates a database with no data. ok() returns false.
197 constexpr TokenDatabase() : begin_{.data = nullptr}, end_{.data = nullptr} {}
199 // Returns all entries associated with this token. This is a O(n) operation.
200 Entries Find(uint32_t token) const;
202 // Returns the total number of entries (unique token-string pairs).
203 constexpr size_t size() const {
204 return (end_.data - begin_.data) / sizeof(RawEntry);
207 // True if this database was constructed with valid data.
208 constexpr bool ok() const { return begin_.data != nullptr; }
// begin() pairs the first RawEntry with end_.data, which points just past the
// entry array — i.e. at the first string of the string table.
210 Iterator begin() const { return Iterator(begin_.entry, end_.data); }
// end() needs no string pointer: iterator comparison only examines raw_.
211 Iterator end() const { return Iterator(end_.entry, nullptr); }
215 std::array<char, 6> magic;
217 uint32_t entry_count;
221 static_assert(sizeof(Header) == 2 * sizeof(RawEntry));
223 template <typename ByteArray>
224 static constexpr bool HasValidHeader(const ByteArray& bytes) {
225 static_assert(sizeof(*std::data(bytes)) == 1u);
227 if (std::size(bytes) < sizeof(Header)) {
231 // Check the magic number and version.
232 for (size_t i = 0; i < kMagicAndVersion.size(); ++i) {
233 if (bytes[i] != kMagicAndVersion[i]) {
241 template <typename ByteArray>
242 static constexpr bool EachEntryHasAString(const ByteArray& bytes) {
243 const size_t entries = ReadEntryCount(std::data(bytes));
245 // Check that the data is large enough to have a string table.
246 if (std::size(bytes) < StringTable(entries)) {
250 // Count the strings in the string table.
251 size_t string_count = 0;
252 for (auto i = std::begin(bytes) + StringTable(entries); i < std::end(bytes);
254 string_count += (*i == '\0') ? 1 : 0;
257 // Check that there is at least one string for each entry.
258 return string_count >= entries;
261 // Reads the number of entries from a database header. Cast the bytes to
262 // uint8_t to avoid sign extension if T is signed.
263 template <typename T>
264 static constexpr uint32_t ReadEntryCount(const T* header_bytes) {
265 const T* bytes = header_bytes + offsetof(Header, entry_count);
266 return static_cast<uint8_t>(bytes[0]) |
267 static_cast<uint8_t>(bytes[1]) << 8 |
268 static_cast<uint8_t>(bytes[2]) << 16 |
269 static_cast<uint8_t>(bytes[3]) << 24;
272 // Calculates the offset of the string table.
273 static constexpr size_t StringTable(size_t entries) {
274 return sizeof(Header) + entries * sizeof(RawEntry);
277 // The magic number that starts the table is "TOKENS". The version is encoded
278 // next as two bytes.
279 static constexpr std::array<char, 8> kMagicAndVersion = {
280 'T', 'O', 'K', 'E', 'N', 'S', '\0', '\0'};
282 template <typename Byte>
283 constexpr TokenDatabase(const Byte bytes[])
284 : TokenDatabase(bytes + sizeof(Header),
285 bytes + StringTable(ReadEntryCount(bytes))) {
286 static_assert(sizeof(Byte) == 1u);
289 // It is illegal to reinterpret_cast in constexpr functions, but acceptable to
290 // use unions. Instead of using a reinterpret_cast to change the byte pointer
291 // to a RawEntry pointer, have a separate overload for each byte pointer type
292 // and store them in a union.
293 constexpr TokenDatabase(const char* begin, const char* end)
294 : begin_{.data = begin}, end_{.data = end} {}
296 constexpr TokenDatabase(const unsigned char* begin, const unsigned char* end)
297 : begin_{.unsigned_data = begin}, end_{.unsigned_data = end} {}
299 constexpr TokenDatabase(const signed char* begin, const signed char* end)
300 : begin_{.signed_data = begin}, end_{.signed_data = end} {}
302 // Store the beginning and end pointers as a union to avoid breaking constexpr
303 // rules for reinterpret_cast.
305 const RawEntry* entry;
307 const unsigned char* unsigned_data;
308 const signed char* signed_data;
312 } // namespace pw::tokenizer