1 // Copyright 2014 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Create a state machine for validating UTF-8. The algorithm in brief:
6 // 1. Convert the complete unicode range of code points, except for the
7 // surrogate code points, to an ordered array of sequences of bytes in
9 // 2. Convert individual bytes to ranges, starting from the right of each byte
10 // sequence. For each range, ensure the bytes on the left and the ranges
11 // on the right are the identical.
12 // 3. Convert the resulting list of ranges into a state machine, collapsing
14 // 4. Convert the state machine to an array of bytes.
15 // 5. Output as a C++ file.
18 // $ ninja -C out/Release build_utf8_validator_tables
19 // $ out/Release/build_utf8_validator_tables
20 // --output=base/i18n/utf8_validator_tables.cc
21 // $ git add base/i18n/utf8_validator_tables.cc
23 // Because the table is not expected to ever change, it is checked into the
24 // repository rather than being regenerated at build time.
26 // This code uses type uint8_t throughout to represent bytes, to avoid
27 // signed/unsigned char confusion.
40 #include "base/command_line.h"
41 #include "base/files/file_path.h"
42 #include "base/files/file_util.h"
43 #include "base/logging.h"
44 #include "base/memory/raw_ptr.h"
45 #include "base/numerics/safe_conversions.h"
46 #include "base/strings/stringprintf.h"
47 #include "third_party/icu/source/common/unicode/utf8.h"
51 const char kHelpText[] =
52 "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";
54 const char kProlog[] =
55 "// Copyright 2013 The Chromium Authors\n"
56 "// Use of this source code is governed by a BSD-style license that can "
58 "// found in the LICENSE file.\n"
60 "// This file is auto-generated by build_utf8_validator_tables.\n"
63 "#include \"base/i18n/utf8_validator_tables.h\"\n"
66 "namespace internal {\n"
68 "const uint8_t kUtf8ValidatorTables[] = {\n";
70 const char kEpilog[] =
73 "const size_t kUtf8ValidatorTablesSize = "
74 "std::size(kUtf8ValidatorTables);\n"
76 "} // namespace internal\n"
77 "} // namespace base\n";
79 // Ranges are inclusive at both ends--they represent [from, to]
82 // Ranges always start with just one byte.
83 explicit Range(uint8_t value) : from_(value), to_(value) {}
85 // Range objects are copyable and assignable to be used in STL
86 // containers. Since they only contain non-pointer POD types, the default copy
87 // constructor, assignment operator and destructor will work.
89 // Add a byte to the range. We intentionally only support adding a byte at the
90 // end, since that is the only operation the code needs.
91 void AddByte(uint8_t to) {
96 uint8_t from() const { return from_; }
97 uint8_t to() const { return to_; }
99 bool operator<(const Range& rhs) const {
100 return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to()));
103 bool operator==(const Range& rhs) const {
104 return from() == rhs.from() && to() == rhs.to();
112 // A vector of Ranges is like a simple regular expression--it corresponds to
113 // a set of strings of the same length that have bytes in each position in
114 // the appropriate range.
115 typedef std::vector<Range> StringSet;
117 // A UTF-8 "character" is represented by a sequence of bytes.
118 typedef std::vector<uint8_t> Character;
120 // In the second stage of the algorithm, we want to convert a large list of
121 // Characters into a small list of StringSets.
127 typedef std::vector<Pair> PairVector;
129 // A class to print a table of numbers in the same style as clang-format.
132 explicit TablePrinter(FILE* stream)
133 : stream_(stream), values_on_this_line_(0), current_offset_(0) {}
135 TablePrinter(const TablePrinter&) = delete;
136 TablePrinter& operator=(const TablePrinter&) = delete;
138 void PrintValue(uint8_t value) {
139 if (values_on_this_line_ == 0) {
141 } else if (values_on_this_line_ == kMaxValuesPerLine) {
142 fprintf(stream_.get(), " // 0x%02x\n ", current_offset_);
143 values_on_this_line_ = 0;
145 fprintf(stream_.get(), " 0x%02x,", static_cast<int>(value));
146 ++values_on_this_line_;
151 while (values_on_this_line_ < kMaxValuesPerLine) {
153 ++values_on_this_line_;
155 fprintf(stream_.get(), " // 0x%02x\n", current_offset_);
156 values_on_this_line_ = 0;
160 // stdio stream. Not owned.
161 raw_ptr<FILE> stream_;
163 // Number of values so far printed on this line.
164 int values_on_this_line_;
166 // Total values printed so far.
169 static const int kMaxValuesPerLine = 8;
172 // Start by filling a PairVector with characters. The resulting vector goes from
173 // "\x00" to "\xf4\x8f\xbf\xbf".
174 PairVector InitializeCharacters() {
176 for (int i = 0; i <= 0x10FFFF; ++i) {
177 if (i >= 0xD800 && i < 0xE000) {
178 // Surrogate codepoints are not permitted. Non-character code points are
179 // explicitly permitted.
183 unsigned int offset = 0;
184 UBool is_error = false;
185 U8_APPEND(bytes, offset, std::size(bytes), i, is_error);
187 DCHECK_GT(offset, 0u);
188 DCHECK_LE(offset, std::size(bytes));
189 Pair pair = {Character(bytes, bytes + offset), StringSet()};
190 vector.push_back(pair);
195 // Construct a new Pair from |character| and the concatenation of |new_range|
196 // and |existing_set|, and append it to |pairs|.
197 void ConstructPairAndAppend(const Character& character,
198 const Range& new_range,
199 const StringSet& existing_set,
201 Pair new_pair = {character, StringSet(1, new_range)};
203 new_pair.set.end(), existing_set.begin(), existing_set.end());
204 pairs->push_back(new_pair);
207 // Each pass over the PairVector strips one byte off the right-hand-side of the
208 // characters and adds a range to the set on the right. For example, the first
209 // pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",
210 // [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",
211 // [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",
212 // [\xa0-\xbf][\x80-\xbf]).
213 void MoveRightMostCharToSet(PairVector* pairs) {
214 PairVector new_pairs;
215 PairVector::const_iterator it = pairs->begin();
216 while (it != pairs->end() && it->character.empty()) {
217 new_pairs.push_back(*it);
220 CHECK(it != pairs->end());
221 Character unconverted_bytes(it->character.begin(), it->character.end() - 1);
222 Range new_range(it->character.back());
223 StringSet converted = it->set;
225 while (it != pairs->end()) {
226 const Pair& current_pair = *it++;
227 if (current_pair.character.size() == unconverted_bytes.size() + 1 &&
228 std::equal(unconverted_bytes.begin(),
229 unconverted_bytes.end(),
230 current_pair.character.begin()) &&
231 converted == current_pair.set) {
232 // The particular set of UTF-8 codepoints we are validating guarantees
233 // that each byte range will be contiguous. This would not necessarily be
234 // true for an arbitrary set of UTF-8 codepoints.
235 DCHECK_EQ(new_range.to() + 1, current_pair.character.back());
236 new_range.AddByte(current_pair.character.back());
239 ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
240 unconverted_bytes = Character(current_pair.character.begin(),
241 current_pair.character.end() - 1);
242 new_range = Range(current_pair.character.back());
243 converted = current_pair.set;
245 ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
246 new_pairs.swap(*pairs);
249 void MoveAllCharsToSets(PairVector* pairs) {
250 // Since each pass of the function moves one character, and UTF-8 sequences
251 // are at most 4 characters long, this simply runs the algorithm four times.
252 for (int i = 0; i < 4; ++i) {
253 MoveRightMostCharToSet(pairs);
256 for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();
258 DCHECK(it->character.empty());
263 // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
264 // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
265 // algorithm is working. Use the command-line option
266 // --vmodule=build_utf8_validator_tables=1 to see this output.
267 void LogStringSets(const PairVector& pairs) {
268 for (const auto& pair_it : pairs) {
269 std::string set_as_string;
270 for (auto set_it = pair_it.set.begin(); set_it != pair_it.set.end();
272 set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",
273 static_cast<int>(set_it->from()),
274 static_cast<int>(set_it->to()));
276 VLOG(1) << set_as_string;
280 // A single state in the state machine is represented by a sorted vector of
281 // start bytes and target states. All input bytes in the range between the start
282 // byte and the next entry in the vector (or 0xFF) result in a transition to the
286 uint8_t target_state;
289 typedef std::vector<StateRange> State;
291 // Generates a state where all bytes go to state 1 (invalid). This is also used
292 // as an initialiser for other states (since bytes from outside the desired
293 // range are invalid).
294 State GenerateInvalidState() {
295 const StateRange range = {0, 1};
296 return State(1, range);
299 // A map from a state (ie. a set of strings which will match from this state) to
300 // a number (which is an index into the array of states).
301 typedef std::map<StringSet, uint8_t> StateMap;
303 // Create a new state corresponding to |set|, add it |states| and |state_map|
304 // and return the index it was given in |states|.
305 uint8_t MakeState(const StringSet& set,
306 std::vector<State>* states,
307 StateMap* state_map) {
308 DCHECK(!set.empty());
309 const Range& range = set.front();
310 const StringSet rest(set.begin() + 1, set.end());
311 const StateMap::const_iterator where = state_map->find(rest);
312 const uint8_t target_state = where == state_map->end()
313 ? MakeState(rest, states, state_map)
315 DCHECK_LT(0, range.from());
316 DCHECK_LT(range.to(), 0xFF);
317 const StateRange new_state_initializer[] = {
319 {range.from(), target_state},
320 {static_cast<uint8_t>(range.to() + 1), 1}};
322 State(new_state_initializer,
323 new_state_initializer + std::size(new_state_initializer)));
324 const uint8_t new_state_number =
325 base::checked_cast<uint8_t>(states->size() - 1);
326 CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);
327 return new_state_number;
330 std::vector<State> GenerateStates(const PairVector& pairs) {
331 // States 0 and 1 are the initial/valid state and invalid state, respectively.
332 std::vector<State> states(2, GenerateInvalidState());
334 state_map.insert(std::make_pair(StringSet(), 0));
335 for (auto it = pairs.begin(); it != pairs.end(); ++it) {
336 DCHECK(it->character.empty());
337 DCHECK(!it->set.empty());
338 const Range& range = it->set.front();
339 const StringSet rest(it->set.begin() + 1, it->set.end());
340 const StateMap::const_iterator where = state_map.find(rest);
341 const uint8_t target_state = where == state_map.end()
342 ? MakeState(rest, &states, &state_map)
344 if (states[0].back().from == range.from()) {
345 DCHECK_EQ(1, states[0].back().target_state);
346 states[0].back().target_state = target_state;
347 DCHECK_LT(range.to(), 0xFF);
348 const StateRange new_range = {static_cast<uint8_t>(range.to() + 1), 1};
349 states[0].push_back(new_range);
351 DCHECK_LT(range.to(), 0xFF);
352 const StateRange new_range_initializer[] = {
353 {range.from(), target_state},
354 {static_cast<uint8_t>(range.to() + 1), 1}};
356 states[0].end(), new_range_initializer,
357 new_range_initializer + std::size(new_range_initializer));
363 // Output the generated states as a C++ table. Two tricks are used to compact
364 // the table: each state in the table starts with a shift value which indicates
365 // how many bits we can discard from the right-hand-side of the byte before
366 // doing the table lookup. Secondly, only the state-transitions for bytes
367 // with the top-bit set are included in the table; bytes without the top-bit set
368 // are just ASCII and are handled directly by the code.
369 void PrintStates(const std::vector<State>& states, FILE* stream) {
370 // First calculate the start-offset of each state. This allows the state
371 // machine to jump directly to the correct offset, avoiding an extra
372 // indirection. State 0 starts at offset 0.
373 std::vector<uint8_t> state_offset(1, 0);
374 std::vector<uint8_t> shifts;
377 for (const auto& state_it : states) {
378 // We want to set |shift| to the (0-based) index of the least-significant
379 // set bit in any of the ranges for this state, since this tells us how many
380 // bits we can discard and still determine what range a byte lies in. Sadly
381 // it appears that ffs() is not portable, so we do it clumsily.
383 for (auto range_it = state_it.begin(); range_it != state_it.end();
385 while (shift > 0 && range_it->from % (1 << shift) != 0) {
389 shifts.push_back(shift);
390 pos += 1 + (1 << (7 - shift));
391 state_offset.push_back(pos);
394 DCHECK_EQ(129, state_offset[1]);
396 fputs(kProlog, stream);
397 TablePrinter table_printer(stream);
399 for (uint8_t state_index = 0; state_index < states.size(); ++state_index) {
400 const uint8_t shift = shifts[state_index];
401 uint8_t next_range = 0;
402 uint8_t target_state = 1;
404 " // State %d, offset 0x%02x\n",
405 static_cast<int>(state_index),
406 static_cast<int>(state_offset[state_index]));
407 table_printer.PrintValue(shift);
408 for (int i = 0; i < 0x100; i += (1 << shift)) {
409 if (next_range < states[state_index].size() &&
410 states[state_index][next_range].from == i) {
411 target_state = states[state_index][next_range].target_state;
415 table_printer.PrintValue(state_offset[target_state]);
418 table_printer.NewLine();
421 fputs(kEpilog, stream);
426 int main(int argc, char* argv[]) {
427 base::CommandLine::Init(argc, argv);
428 logging::LoggingSettings settings;
429 settings.logging_dest =
430 logging::LOG_TO_SYSTEM_DEBUG_LOG | logging::LOG_TO_STDERR;
431 logging::InitLogging(settings);
432 if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) {
433 fwrite(kHelpText, 1, std::size(kHelpText), stdout);
436 base::FilePath filename =
437 base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");
439 FILE* output = stdout;
440 if (!filename.empty()) {
441 output = base::OpenFile(filename, "wb");
443 PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()
447 // Step 1: Enumerate the characters
448 PairVector pairs = InitializeCharacters();
449 // Step 2: Convert to sets.
450 MoveAllCharsToSets(&pairs);
452 LogStringSets(pairs);
454 // Step 3: Generate states.
455 std::vector<State> states = GenerateStates(pairs);
456 // Step 4/5: Print output
457 PrintStates(states, output);
459 if (!filename.empty()) {
460 if (!base::CloseFile(output))
461 PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()