From 4f13f155e9ffa2919ecc9cc55dba16fdaf9b63da Mon Sep 17 00:00:00 2001 From: DongHun Kwak Date: Tue, 8 Feb 2022 14:56:37 +0900 Subject: [PATCH] Imported Upstream version 20211101 --- BUILD | 2 - Makefile | 4 +- WORKSPACE | 8 - doc/syntax.html | 5 + doc/syntax.txt | 5 + re2/bitstate.cc | 6 +- re2/dfa.cc | 26 +-- re2/fuzzing/re2_fuzzer.cc | 111 +++++++-- re2/nfa.cc | 8 +- re2/onepass.cc | 4 +- re2/prog.cc | 6 + re2/prog.h | 21 +- re2/re2.cc | 17 +- re2/testing/backtrack.cc | 6 +- re2/testing/exhaustive_tester.cc | 4 +- re2/testing/tester.cc | 18 +- re2/unicode.py | 2 +- re2/unicode_casefold.cc | 36 ++- re2/unicode_groups.cc | 470 ++++++++++++++++++++++++++------------- re2/walker-inl.h | 5 +- 20 files changed, 516 insertions(+), 248 deletions(-) diff --git a/BUILD b/BUILD index 3dc27d5..00330b6 100644 --- a/BUILD +++ b/BUILD @@ -23,8 +23,6 @@ config_setting( values = {"cpu": "x64_windows"}, ) -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") - cc_library( name = "re2", srcs = [ diff --git a/Makefile b/Makefile index ba5e4f6..2409093 100644 --- a/Makefile +++ b/Makefile @@ -244,8 +244,8 @@ re2/perl_groups.cc: re2/make_perl_groups.pl perl $< > $@ .PRECIOUS: re2/unicode_%.cc -re2/unicode_%.cc: re2/make_unicode_%.py - python $< > $@ +re2/unicode_%.cc: re2/make_unicode_%.py re2/unicode.py + python3 $< > $@ endif .PHONY: distclean diff --git a/WORKSPACE b/WORKSPACE index ec1292e..b35619c 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -5,11 +5,3 @@ # Bazel (http://bazel.io/) WORKSPACE file for RE2. workspace(name = "com_googlesource_code_re2") - -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") - -http_archive( - name = "rules_cc", - strip_prefix = "rules_cc-main", - urls = ["https://github.com/bazelbuild/rules_cc/archive/main.zip"], -) diff --git a/doc/syntax.html b/doc/syntax.html index 47541e5..f0e0138 100644 --- a/doc/syntax.html +++ b/doc/syntax.html @@ -269,6 +269,7 @@ Coptic Cuneiform Cypriot +Cypro_Minoan Cyrillic Deseret Devanagari @@ -352,6 +353,7 @@ Old_Sogdian Old_South_Arabian Old_Turkic +Old_Uyghur Oriya Osage Osmanya @@ -383,6 +385,7 @@ Tai_Viet Takri Tamil +Tangsa Tangut Telugu Thaana @@ -390,8 +393,10 @@ Tibetan Tifinagh Tirhuta +Toto Ugaritic Vai +Vithkuqi Wancho Warang_Citi Yezidi diff --git a/doc/syntax.txt b/doc/syntax.txt index ce87866..c12a482 100644 --- a/doc/syntax.txt +++ b/doc/syntax.txt @@ -258,6 +258,7 @@ Common Coptic Cuneiform Cypriot +Cypro_Minoan Cyrillic Deseret Devanagari @@ -341,6 +342,7 @@ Old_Persian Old_Sogdian Old_South_Arabian Old_Turkic +Old_Uyghur Oriya Osage Osmanya @@ -372,6 +374,7 @@ Tai_Tham Tai_Viet Takri Tamil +Tangsa Tangut Telugu Thaana @@ -379,8 +382,10 @@ Thai Tibetan Tifinagh Tirhuta +Toto Ugaritic Vai +Vithkuqi Wancho Warang_Citi Yezidi diff --git a/re2/bitstate.cc b/re2/bitstate.cc index 320d1ee..877e548 100644 --- a/re2/bitstate.cc +++ b/re2/bitstate.cc @@ -293,9 +293,9 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, context_ = context; if (context_.data() == NULL) context_ = text; - if (prog_->anchor_start() && context_.begin() != text.begin()) + if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text)) return false; - if (prog_->anchor_end() && context_.end() != text.end()) + if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text)) return false; anchored_ = anchored || prog_->anchor_start(); longest_ = longest || prog_->anchor_end(); @@ -377,7 +377,7 @@ bool Prog::SearchBitState(const StringPiece& text, bool longest = kind != kFirstMatch; if (!b.Search(text, context, anchored, longest, match, nmatch)) return false; - if (kind == kFullMatch && match[0].end() != text.end()) + if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text)) return false; return true; } diff --git a/re2/dfa.cc b/re2/dfa.cc index 583303e..d47c7d5 100644 --- a/re2/dfa.cc +++ b/re2/dfa.cc @@ -1488,15 +1488,15 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { int lastbyte; if (run_forward) { - if (params->text.end() == params->context.end()) + if (EndPtr(params->text) == EndPtr(params->context)) lastbyte = kByteEndText; else - lastbyte = params->text.end()[0] & 0xFF; + lastbyte = EndPtr(params->text)[0] & 0xFF; } else { - if (params->text.begin() == params->context.begin()) + if (BeginPtr(params->text) == BeginPtr(params->context)) lastbyte = kByteEndText; else - lastbyte = params->text.begin()[-1] & 0xFF; + lastbyte = BeginPtr(params->text)[-1] & 0xFF; } State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire); @@ -1627,7 +1627,7 @@ bool DFA::AnalyzeSearch(SearchParams* params) { const StringPiece& context = params->context; // Sanity check: make sure that text lies within context. - if (text.begin() < context.begin() || text.end() > context.end()) { + if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) { LOG(DFATAL) << "context does not contain text"; params->start = DeadState; return true; @@ -1637,13 +1637,13 @@ bool DFA::AnalyzeSearch(SearchParams* params) { int start; uint32_t flags; if (params->run_forward) { - if (text.begin() == context.begin()) { + if (BeginPtr(text) == BeginPtr(context)) { start = kStartBeginText; flags = kEmptyBeginText|kEmptyBeginLine; - } else if (text.begin()[-1] == '\n') { + } else if (BeginPtr(text)[-1] == '\n') { start = kStartBeginLine; flags = kEmptyBeginLine; - } else if (Prog::IsWordChar(text.begin()[-1] & 0xFF)) { + } else if (Prog::IsWordChar(BeginPtr(text)[-1] & 0xFF)) { start = kStartAfterWordChar; flags = kFlagLastWord; } else { @@ -1651,13 +1651,13 @@ bool DFA::AnalyzeSearch(SearchParams* params) { flags = 0; } } else { - if (text.end() == context.end()) { + if (EndPtr(text) == EndPtr(context)) { start = kStartBeginText; flags = kEmptyBeginText|kEmptyBeginLine; - } else if (text.end()[0] == '\n') { + } else if (EndPtr(text)[0] == '\n') { start = kStartBeginLine; flags = kEmptyBeginLine; - } else if (Prog::IsWordChar(text.end()[0] & 0xFF)) { + } else if (Prog::IsWordChar(EndPtr(text)[0] & 0xFF)) { start = kStartAfterWordChar; flags = kFlagLastWord; } else { @@ -1837,9 +1837,9 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, using std::swap; swap(caret, dollar); } - if (caret && context.begin() != text.begin()) + if (caret && BeginPtr(context) != BeginPtr(text)) return false; - if (dollar && context.end() != text.end()) + if (dollar && EndPtr(context) != EndPtr(text)) return false; // Handle full match by running an anchored longest match diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc index af1129f..3082a76 100644 --- a/re2/fuzzing/re2_fuzzer.cc +++ b/re2/fuzzing/re2_fuzzer.cc @@ -5,20 +5,96 @@ #include #include #include -#include -#include +#include #include #include -#include "re2/prefilter.h" #include "re2/re2.h" #include "re2/regexp.h" +#include "re2/walker-inl.h" using re2::StringPiece; // NOT static, NOT signed. uint8_t dummy = 0; +// Walks kRegexpConcat and kRegexpAlternate subexpressions +// to determine their maximum length. +class SubexpressionWalker : public re2::Regexp::Walker { + public: + SubexpressionWalker() = default; + ~SubexpressionWalker() override = default; + + int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args) override { + switch (re->op()) { + case re2::kRegexpConcat: + case re2::kRegexpAlternate: { + int max = nchild_args; + for (int i = 0; i < nchild_args; i++) + max = std::max(max, child_args[i]); + return max; + } + + default: + break; + } + return -1; + } + + // Should never be called: we use Walk(), not WalkExponential(). + int ShortVisit(re2::Regexp* re, int parent_arg) override { + return parent_arg; + } + + private: + SubexpressionWalker(const SubexpressionWalker&) = delete; + SubexpressionWalker& operator=(const SubexpressionWalker&) = delete; +}; + +// Walks substrings (i.e. kRegexpLiteralString subexpressions) +// to determine their maximum length... in runes, but avoiding +// overheads due to UTF-8 encoding is worthwhile when fuzzing. +class SubstringWalker : public re2::Regexp::Walker { + public: + SubstringWalker() = default; + ~SubstringWalker() override = default; + + int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args) override { + switch (re->op()) { + case re2::kRegexpConcat: + case re2::kRegexpAlternate: + case re2::kRegexpStar: + case re2::kRegexpPlus: + case re2::kRegexpQuest: + case re2::kRegexpRepeat: + case re2::kRegexpCapture: { + int max = -1; + for (int i = 0; i < nchild_args; i++) + max = std::max(max, child_args[i]); + return max; + } + + case re2::kRegexpLiteralString: + return re->nrunes(); + + default: + break; + } + return -1; + } + + // Should never be called: we use Walk(), not WalkExponential(). + int ShortVisit(re2::Regexp* re, int parent_arg) override { + return parent_arg; + } + + private: + SubstringWalker(const SubstringWalker&) = delete; + SubstringWalker& operator=(const SubstringWalker&) = delete; +}; + void TestOneInput(StringPiece pattern, const RE2::Options& options, StringPiece text) { // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W. @@ -27,11 +103,15 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options, // generating such patterns that fall within the other limits, but result // in timeouts nonetheless. The marginal cost is high - even more so when // counted repetition is involved - whereas the marginal benefit is zero. + // Crudely limit the use of 'k', 'K', 's' and 'S' too because they become + // three-element character classes when case-insensitive and using UTF-8. // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain. int char_class = 0; int backslash_p = 0; // very expensive, so handle specially for (size_t i = 0; i < pattern.size(); i++) { - if (pattern[i] == '.') + if (pattern[i] == '.' || + pattern[i] == 'k' || pattern[i] == 'K' || + pattern[i] == 's' || pattern[i] == 'S') char_class++; if (pattern[i] != '\\') continue; @@ -59,27 +139,18 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options, if (!re.ok()) return; + // Don't waste time fuzzing programs with large subexpressions. + // They can cause bug reports due to fuzzer timeouts. And they + // aren't interesting for fuzzing purposes. + if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9) + return; + // Don't waste time fuzzing programs with large substrings. // They can cause bug reports due to fuzzer timeouts when they // are repetitions (e.g. hundreds of NUL bytes) and matching is // unanchored. And they aren't interesting for fuzzing purposes. - std::unique_ptr prefilter(re2::Prefilter::FromRE2(&re)); - if (prefilter == nullptr) + if (SubstringWalker().Walk(re.Regexp(), -1) > 9) return; - std::queue nodes; - nodes.push(prefilter.get()); - while (!nodes.empty()) { - re2::Prefilter* node = nodes.front(); - nodes.pop(); - if (node->op() == re2::Prefilter::ATOM) { - if (node->atom().size() > 9) - return; - } else if (node->op() == re2::Prefilter::AND || - node->op() == re2::Prefilter::OR) { - for (re2::Prefilter* sub : *node->subs()) - nodes.push(sub); - } - } // Don't waste time fuzzing high-size programs. // They can cause bug reports due to fuzzer timeouts. diff --git a/re2/nfa.cc b/re2/nfa.cc index e858451..c7339f8 100644 --- a/re2/nfa.cc +++ b/re2/nfa.cc @@ -456,14 +456,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, context = text; // Sanity check: make sure that text lies within context. - if (text.begin() < context.begin() || text.end() > context.end()) { + if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) { LOG(DFATAL) << "context does not contain text"; return false; } - if (prog_->anchor_start() && context.begin() != text.begin()) + if (prog_->anchor_start() && BeginPtr(context) != BeginPtr(text)) return false; - if (prog_->anchor_end() && context.end() != text.end()) + if (prog_->anchor_end() && EndPtr(context) != EndPtr(text)) return false; anchored |= prog_->anchor_start(); if (prog_->anchor_end()) { @@ -646,7 +646,7 @@ Prog::SearchNFA(const StringPiece& text, const StringPiece& context, } if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch)) return false; - if (kind == kFullMatch && match[0].end() != text.end()) + if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text)) return false; return true; } diff --git a/re2/onepass.cc b/re2/onepass.cc index 66a62d9..2639746 100644 --- a/re2/onepass.cc +++ b/re2/onepass.cc @@ -237,9 +237,9 @@ bool Prog::SearchOnePass(const StringPiece& text, StringPiece context = const_context; if (context.data() == NULL) context = text; - if (anchor_start() && context.begin() != text.begin()) + if (anchor_start() && BeginPtr(context) != BeginPtr(text)) return false; - if (anchor_end() && context.end() != text.end()) + if (anchor_end() && EndPtr(context) != EndPtr(text)) return false; if (anchor_end()) kind = kFullMatch; diff --git a/re2/prog.cc b/re2/prog.cc index 396b46c..55dc105 100644 --- a/re2/prog.cc +++ b/re2/prog.cc @@ -118,6 +118,7 @@ Prog::Prog() prefix_foldcase_(false), prefix_size_(0), list_count_(0), + bit_state_text_max_size_(0), dfa_mem_(0), dfa_first_(NULL), dfa_longest_(NULL) { @@ -640,6 +641,11 @@ void Prog::Flatten() { for (int i = 0; i < list_count_; ++i) list_heads_[flatmap[i]] = i; } + + // BitState allocates a bitmap of size list_count_ * (text.size()+1) + // for tracking pairs of possibilities that it has already explored. + const size_t kBitStateBitmapMaxSize = 256*1024; // max size in bits + bit_state_text_max_size_ = kBitStateBitmapMaxSize / list_count_ - 1; } void Prog::MarkSuccessors(SparseArray* rootmap, diff --git a/re2/prog.h b/re2/prog.h index 8ca9880..4af012a 100644 --- a/re2/prog.h +++ b/re2/prog.h @@ -207,6 +207,7 @@ class Prog { int list_count() { return list_count_; } int inst_count(InstOp op) { return inst_count_[op]; } uint16_t* list_heads() { return list_heads_.data(); } + size_t bit_state_text_max_size() { return bit_state_text_max_size_; } int64_t dfa_mem() { return dfa_mem_; } void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } bool anchor_start() { return anchor_start_; } @@ -429,10 +430,11 @@ class Prog { }; }; - int list_count_; // count of lists (see above) - int inst_count_[kNumInst]; // count of instructions by opcode - PODArray list_heads_; // sparse array enumerating list heads - // not populated if size_ is overly large + int list_count_; // count of lists (see above) + int inst_count_[kNumInst]; // count of instructions by opcode + PODArray list_heads_; // sparse array enumerating list heads + // not populated if size_ is overly large + size_t bit_state_text_max_size_; // upper bound (inclusive) on text.size() PODArray inst_; // pointer to instruction array PODArray onepass_nodes_; // data for OnePass nodes @@ -450,6 +452,17 @@ class Prog { Prog& operator=(const Prog&) = delete; }; +// std::string_view in MSVC has iterators that aren't just pointers and +// that don't allow comparisons between different objects - not even if +// those objects are views into the same string! Thus, we provide these +// conversion functions for convenience. +static inline const char* BeginPtr(const StringPiece& s) { + return s.data(); +} +static inline const char* EndPtr(const StringPiece& s) { + return s.data() + s.size(); +} + } // namespace re2 #endif // RE2_PROG_H_ diff --git a/re2/re2.cc b/re2/re2.cc index 128c8bd..c027133 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -690,15 +690,9 @@ bool RE2::Match(const StringPiece& text, if (options_.longest_match()) kind = Prog::kLongestMatch; - bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture); - - // BitState allocates a bitmap of size prog_->list_count() * text.size(). - // It also allocates a stack of 3-word structures which could potentially - // grow as large as prog_->list_count() * text.size(), but in practice is - // much smaller. - const int kMaxBitStateBitmapSize = 256*1024; // bitmap size <= max (bits) + bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture; bool can_bit_state = prog_->CanBitState(); - size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count(); + size_t bit_state_text_max_size = prog_->bit_state_text_max_size(); #ifdef RE2_HAVE_THREAD_LOCAL hooks::context = this; @@ -805,7 +799,8 @@ bool RE2::Match(const StringPiece& text, skipped_test = true; break; } - if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) { + if (can_bit_state && text.size() <= bit_state_text_max_size && + ncap > 1) { skipped_test = true; break; } @@ -852,7 +847,7 @@ bool RE2::Match(const StringPiece& text, LOG(ERROR) << "SearchOnePass inconsistency"; return false; } - } else if (can_bit_state && subtext1.size() <= bit_state_text_max) { + } else if (can_bit_state && subtext1.size() <= bit_state_text_max_size) { if (!prog_->SearchBitState(subtext1, text, anchor, kind, submatch, ncap)) { if (!skipped_test && options_.log_errors()) @@ -920,7 +915,7 @@ bool RE2::DoMatch(const StringPiece& text, } if (consumed != NULL) - *consumed = static_cast(vec[0].end() - text.begin()); + *consumed = static_cast(EndPtr(vec[0]) - BeginPtr(text)); if (n == 0 || args == NULL) { // We are not interested in results diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc index 216d259..920a453 100644 --- a/re2/testing/backtrack.cc +++ b/re2/testing/backtrack.cc @@ -103,9 +103,9 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context, context_ = context; if (context_.data() == NULL) context_ = text; - if (prog_->anchor_start() && text.begin() > context_.begin()) + if (prog_->anchor_start() && BeginPtr(text) > BeginPtr(context_)) return false; - if (prog_->anchor_end() && text.end() < context_.end()) + if (prog_->anchor_end() && EndPtr(text) < EndPtr(context_)) return false; anchored_ = anchored | prog_->anchor_start(); longest_ = longest | prog_->anchor_end(); @@ -267,7 +267,7 @@ bool Prog::UnsafeSearchBacktrack(const StringPiece& text, bool longest = kind != kFirstMatch; if (!b.Search(text, context, anchored, longest, match, nmatch)) return false; - if (kind == kFullMatch && match[0].end() != text.end()) + if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text)) return false; return true; } diff --git a/re2/testing/exhaustive_tester.cc b/re2/testing/exhaustive_tester.cc index bdac381..b0409c3 100644 --- a/re2/testing/exhaustive_tester.cc +++ b/re2/testing/exhaustive_tester.cc @@ -67,8 +67,8 @@ static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anc printf("-"); else printf("%td-%td", - m[i].begin() - input.begin(), - m[i].end() - input.begin()); + BeginPtr(m[i]) - BeginPtr(input), + EndPtr(m[i]) - BeginPtr(input)); } } diff --git a/re2/testing/tester.cc b/re2/testing/tester.cc index d2ec4fb..b0c22f2 100644 --- a/re2/testing/tester.cc +++ b/re2/testing/tester.cc @@ -117,8 +117,8 @@ static std::string FormatCapture(const StringPiece& text, if (s.data() == NULL) return "(?,?)"; return StringPrintf("(%td,%td)", - s.begin() - text.begin(), - s.end() - text.begin()); + BeginPtr(s) - BeginPtr(text), + EndPtr(s) - BeginPtr(text)); } // Returns whether text contains non-ASCII (>= 0x80) bytes. @@ -403,7 +403,7 @@ void TestInstance::RunSearch(Engine type, case kEngineRE2: case kEngineRE2a: case kEngineRE2b: { - if (!re2_ || text.end() != context.end()) { + if (!re2_ || EndPtr(text) != EndPtr(context)) { result->skipped = true; break; } @@ -418,8 +418,8 @@ void TestInstance::RunSearch(Engine type, result->matched = re2_->Match( context, - static_cast(text.begin() - context.begin()), - static_cast(text.end() - context.begin()), + static_cast(BeginPtr(text) - BeginPtr(context)), + static_cast(EndPtr(text) - BeginPtr(context)), re_anchor, result->submatch, nsubmatch); @@ -428,8 +428,8 @@ void TestInstance::RunSearch(Engine type, } case kEnginePCRE: { - if (!re_ || text.begin() != context.begin() || - text.end() != context.end()) { + if (!re_ || BeginPtr(text) != BeginPtr(context) || + EndPtr(text) != EndPtr(context)) { result->skipped = true; break; } @@ -606,9 +606,9 @@ void TestInstance::LogMatch(const char* prefix, Engine e, << " text " << CEscape(text) << " (" - << text.begin() - context.begin() + << BeginPtr(text) - BeginPtr(context) << "," - << text.end() - context.begin() + << EndPtr(text) - BeginPtr(context) << ") of context " << CEscape(context) << " (" << FormatKind(kind_) diff --git a/re2/unicode.py b/re2/unicode.py index e0f33ef..727bea5 100644 --- a/re2/unicode.py +++ b/re2/unicode.py @@ -13,7 +13,7 @@ import re from six.moves import urllib # Directory or URL where Unicode tables reside. -_UNICODE_DIR = "https://www.unicode.org/Public/13.0.0/ucd" +_UNICODE_DIR = "https://www.unicode.org/Public/14.0.0/ucd" # Largest valid Unicode code value. _RUNE_MAX = 0x10FFFF diff --git a/re2/unicode_casefold.cc b/re2/unicode_casefold.cc index 8424107..d9de282 100644 --- a/re2/unicode_casefold.cc +++ b/re2/unicode_casefold.cc @@ -7,7 +7,7 @@ namespace re2 { -// 1384 groups, 2798 pairs, 358 ranges +// 1424 groups, 2878 pairs, 367 ranges const CaseFold unicode_casefold[] = { { 65, 90, 32 }, { 97, 106, -32 }, @@ -299,8 +299,8 @@ const CaseFold unicode_casefold[] = { { 8579, 8580, OddEven }, { 9398, 9423, 26 }, { 9424, 9449, -26 }, - { 11264, 11310, 48 }, - { 11312, 11358, -48 }, + { 11264, 11311, 48 }, + { 11312, 11359, -48 }, { 11360, 11361, EvenOdd }, { 11362, 11362, -10743 }, { 11363, 11363, -3814 }, @@ -344,12 +344,13 @@ const CaseFold unicode_casefold[] = { { 42929, 42929, -42282 }, { 42930, 42930, -42261 }, { 42931, 42931, 928 }, - { 42932, 42943, EvenOdd }, - { 42946, 42947, EvenOdd }, + { 42932, 42947, EvenOdd }, { 42948, 42948, -48 }, { 42949, 42949, -42307 }, { 42950, 42950, -35384 }, { 42951, 42954, OddEven }, + { 42960, 42961, EvenOdd }, + { 42966, 42969, EvenOdd }, { 42997, 42998, OddEven }, { 43859, 43859, -928 }, { 43888, 43967, -38864 }, @@ -359,6 +360,14 @@ const CaseFold unicode_casefold[] = { { 66600, 66639, -40 }, { 66736, 66771, 40 }, { 66776, 66811, -40 }, + { 66928, 66938, 39 }, + { 66940, 66954, 39 }, + { 66956, 66962, 39 }, + { 66964, 66965, 39 }, + { 66967, 66977, -39 }, + { 66979, 66993, -39 }, + { 66995, 67001, -39 }, + { 67003, 67004, -39 }, { 68736, 68786, 64 }, { 68800, 68850, -64 }, { 71840, 71871, 32 }, @@ -368,9 +377,9 @@ const CaseFold unicode_casefold[] = { { 125184, 125217, 34 }, { 125218, 125251, -34 }, }; -const int num_unicode_casefold = 358; +const int num_unicode_casefold = 367; -// 1384 groups, 1414 pairs, 200 ranges +// 1424 groups, 1454 pairs, 205 ranges const CaseFold unicode_tolower[] = { { 65, 90, 32 }, { 181, 181, 775 }, @@ -521,7 +530,7 @@ const CaseFold unicode_tolower[] = { { 8544, 8559, 16 }, { 8579, 8579, OddEven }, { 9398, 9423, 26 }, - { 11264, 11310, 48 }, + { 11264, 11311, 48 }, { 11360, 11360, EvenOdd }, { 11362, 11362, -10743 }, { 11363, 11363, -3814 }, @@ -557,23 +566,28 @@ const CaseFold unicode_tolower[] = { { 42929, 42929, -42282 }, { 42930, 42930, -42261 }, { 42931, 42931, 928 }, - { 42932, 42942, EvenOddSkip }, - { 42946, 42946, EvenOdd }, + { 42932, 42946, EvenOddSkip }, { 42948, 42948, -48 }, { 42949, 42949, -42307 }, { 42950, 42950, -35384 }, { 42951, 42953, OddEvenSkip }, + { 42960, 42960, EvenOdd }, + { 42966, 42968, EvenOddSkip }, { 42997, 42997, OddEven }, { 43888, 43967, -38864 }, { 65313, 65338, 32 }, { 66560, 66599, 40 }, { 66736, 66771, 40 }, + { 66928, 66938, 39 }, + { 66940, 66954, 39 }, + { 66956, 66962, 39 }, + { 66964, 66965, 39 }, { 68736, 68786, 64 }, { 71840, 71871, 32 }, { 93760, 93791, 32 }, { 125184, 125217, 34 }, }; -const int num_unicode_tolower = 200; +const int num_unicode_tolower = 205; diff --git a/re2/unicode_groups.cc b/re2/unicode_groups.cc index 7b7a3c6..2a8d7da 100644 --- a/re2/unicode_groups.cc +++ b/re2/unicode_groups.cc @@ -15,6 +15,7 @@ static const URange16 C_range16[] = { { 1564, 1564 }, { 1757, 1757 }, { 1807, 1807 }, + { 2192, 2193 }, { 2274, 2274 }, { 6158, 6158 }, { 8203, 8207 }, @@ -46,6 +47,7 @@ static const URange16 Cf_range16[] = { { 1564, 1564 }, { 1757, 1757 }, { 1807, 1807 }, + { 2192, 2193 }, { 2274, 2274 }, { 6158, 6158 }, { 8203, 8207 }, @@ -124,8 +126,9 @@ static const URange16 L_range16[] = { { 2088, 2088 }, { 2112, 2136 }, { 2144, 2154 }, - { 2208, 2228 }, - { 2230, 2247 }, + { 2160, 2183 }, + { 2185, 2190 }, + { 2208, 2249 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, @@ -190,6 +193,7 @@ static const URange16 L_range16[] = { { 3114, 3129 }, { 3133, 3133 }, { 3160, 3162 }, + { 3165, 3165 }, { 3168, 3169 }, { 3200, 3200 }, { 3205, 3212 }, @@ -198,7 +202,7 @@ static const URange16 L_range16[] = { { 3242, 3251 }, { 3253, 3257 }, { 3261, 3261 }, - { 3294, 3294 }, + { 3293, 3294 }, { 3296, 3297 }, { 3313, 3314 }, { 3332, 3340 }, @@ -269,9 +273,8 @@ static const URange16 L_range16[] = { { 5761, 5786 }, { 5792, 5866 }, { 5873, 5880 }, - { 5888, 5900 }, - { 5902, 5905 }, - { 5920, 5937 }, + { 5888, 5905 }, + { 5919, 5937 }, { 5952, 5969 }, { 5984, 5996 }, { 5998, 6000 }, @@ -292,7 +295,7 @@ static const URange16 L_range16[] = { { 6688, 6740 }, { 6823, 6823 }, { 6917, 6963 }, - { 6981, 6987 }, + { 6981, 6988 }, { 7043, 7072 }, { 7086, 7087 }, { 7098, 7141 }, @@ -343,9 +346,7 @@ static const URange16 L_range16[] = { { 8517, 8521 }, { 8526, 8526 }, { 8579, 8580 }, - { 11264, 11310 }, - { 11312, 11358 }, - { 11360, 11492 }, + { 11264, 11492 }, { 11499, 11502 }, { 11506, 11507 }, { 11520, 11557 }, @@ -375,8 +376,7 @@ static const URange16 L_range16[] = { { 12704, 12735 }, { 12784, 12799 }, { 13312, 19903 }, - { 19968, 40956 }, - { 40960, 42124 }, + { 19968, 42124 }, { 42192, 42237 }, { 42240, 42508 }, { 42512, 42527 }, @@ -386,9 +386,11 @@ static const URange16 L_range16[] = { { 42656, 42725 }, { 42775, 42783 }, { 42786, 42888 }, - { 42891, 42943 }, - { 42946, 42954 }, - { 42997, 43009 }, + { 42891, 42954 }, + { 42960, 42961 }, + { 42963, 42963 }, + { 42965, 42969 }, + { 42994, 43009 }, { 43011, 43013 }, { 43015, 43018 }, { 43020, 43042 }, @@ -478,9 +480,20 @@ static const URange32 L_range32[] = { { 66776, 66811 }, { 66816, 66855 }, { 66864, 66915 }, + { 66928, 66938 }, + { 66940, 66954 }, + { 66956, 66962 }, + { 66964, 66965 }, + { 66967, 66977 }, + { 66979, 66993 }, + { 66995, 67001 }, + { 67003, 67004 }, { 67072, 67382 }, { 67392, 67413 }, { 67424, 67431 }, + { 67456, 67461 }, + { 67463, 67504 }, + { 67506, 67514 }, { 67584, 67589 }, { 67592, 67592 }, { 67594, 67637 }, @@ -516,9 +529,12 @@ static const URange32 L_range32[] = { { 69376, 69404 }, { 69415, 69415 }, { 69424, 69445 }, + { 69488, 69505 }, { 69552, 69572 }, { 69600, 69622 }, { 69635, 69687 }, + { 69745, 69746 }, + { 69749, 69749 }, { 69763, 69807 }, { 69840, 69864 }, { 69891, 69926 }, @@ -560,6 +576,7 @@ static const URange32 L_range32[] = { { 71296, 71338 }, { 71352, 71352 }, { 71424, 71450 }, + { 71488, 71494 }, { 71680, 71723 }, { 71840, 71903 }, { 71935, 71942 }, @@ -579,7 +596,7 @@ static const URange32 L_range32[] = { { 72272, 72272 }, { 72284, 72329 }, { 72349, 72349 }, - { 72384, 72440 }, + { 72368, 72440 }, { 72704, 72712 }, { 72714, 72750 }, { 72768, 72768 }, @@ -596,10 +613,12 @@ static const URange32 L_range32[] = { { 73648, 73648 }, { 73728, 74649 }, { 74880, 75075 }, + { 77712, 77808 }, { 77824, 78894 }, { 82944, 83526 }, { 92160, 92728 }, { 92736, 92766 }, + { 92784, 92862 }, { 92880, 92909 }, { 92928, 92975 }, { 92992, 92995 }, @@ -614,7 +633,10 @@ static const URange32 L_range32[] = { { 94208, 100343 }, { 100352, 101589 }, { 101632, 101640 }, - { 110592, 110878 }, + { 110576, 110579 }, + { 110581, 110587 }, + { 110589, 110590 }, + { 110592, 110882 }, { 110928, 110930 }, { 110948, 110951 }, { 110960, 111355 }, @@ -652,10 +674,16 @@ static const URange32 L_range32[] = { { 120714, 120744 }, { 120746, 120770 }, { 120772, 120779 }, + { 122624, 122654 }, { 123136, 123180 }, { 123191, 123197 }, { 123214, 123214 }, + { 123536, 123565 }, { 123584, 123627 }, + { 124896, 124902 }, + { 124904, 124907 }, + { 124909, 124910 }, + { 124912, 124926 }, { 124928, 125124 }, { 125184, 125251 }, { 125259, 125259 }, @@ -692,8 +720,8 @@ static const URange32 L_range32[] = { { 126625, 126627 }, { 126629, 126633 }, { 126635, 126651 }, - { 131072, 173789 }, - { 173824, 177972 }, + { 131072, 173791 }, + { 173824, 177976 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, @@ -1132,7 +1160,7 @@ static const URange16 Ll_range16[] = { { 8518, 8521 }, { 8526, 8526 }, { 8580, 8580 }, - { 11312, 11358 }, + { 11312, 11359 }, { 11361, 11361 }, { 11365, 11366 }, { 11368, 11368 }, @@ -1301,9 +1329,15 @@ static const URange16 Ll_range16[] = { { 42939, 42939 }, { 42941, 42941 }, { 42943, 42943 }, + { 42945, 42945 }, { 42947, 42947 }, { 42952, 42952 }, { 42954, 42954 }, + { 42961, 42961 }, + { 42963, 42963 }, + { 42965, 42965 }, + { 42967, 42967 }, + { 42969, 42969 }, { 42998, 42998 }, { 43002, 43002 }, { 43824, 43866 }, @@ -1316,6 +1350,10 @@ static const URange16 Ll_range16[] = { static const URange32 Ll_range32[] = { { 66600, 66639 }, { 66776, 66811 }, + { 66967, 66977 }, + { 66979, 66993 }, + { 66995, 67001 }, + { 67003, 67004 }, { 68800, 68850 }, { 71872, 71903 }, { 93792, 93823 }, @@ -1347,6 +1385,8 @@ static const URange32 Ll_range32[] = { { 120746, 120770 }, { 120772, 120777 }, { 120779, 120779 }, + { 122624, 122633 }, + { 122635, 122654 }, { 125218, 125251 }, }; static const URange16 Lm_range16[] = { @@ -1365,6 +1405,7 @@ static const URange16 Lm_range16[] = { { 2074, 2074 }, { 2084, 2084 }, { 2088, 2088 }, + { 2249, 2249 }, { 2417, 2417 }, { 3654, 3654 }, { 3782, 3782 }, @@ -1395,6 +1436,7 @@ static const URange16 Lm_range16[] = { { 42775, 42783 }, { 42864, 42864 }, { 42888, 42888 }, + { 42994, 42996 }, { 43000, 43001 }, { 43471, 43471 }, { 43494, 43494 }, @@ -1407,10 +1449,16 @@ static const URange16 Lm_range16[] = { { 65438, 65439 }, }; static const URange32 Lm_range32[] = { + { 67456, 67461 }, + { 67463, 67504 }, + { 67506, 67514 }, { 92992, 92995 }, { 94099, 94111 }, { 94176, 94177 }, { 94179, 94179 }, + { 110576, 110579 }, + { 110581, 110587 }, + { 110589, 110590 }, { 123191, 123197 }, { 125259, 125259 }, }; @@ -1438,8 +1486,9 @@ static const URange16 Lo_range16[] = { { 2048, 2069 }, { 2112, 2136 }, { 2144, 2154 }, - { 2208, 2228 }, - { 2230, 2247 }, + { 2160, 2183 }, + { 2185, 2190 }, + { 2208, 2248 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, @@ -1504,6 +1553,7 @@ static const URange16 Lo_range16[] = { { 3114, 3129 }, { 3133, 3133 }, { 3160, 3162 }, + { 3165, 3165 }, { 3168, 3169 }, { 3200, 3200 }, { 3205, 3212 }, @@ -1512,7 +1562,7 @@ static const URange16 Lo_range16[] = { { 3242, 3251 }, { 3253, 3257 }, { 3261, 3261 }, - { 3294, 3294 }, + { 3293, 3294 }, { 3296, 3297 }, { 3313, 3314 }, { 3332, 3340 }, @@ -1576,9 +1626,8 @@ static const URange16 Lo_range16[] = { { 5761, 5786 }, { 5792, 5866 }, { 5873, 5880 }, - { 5888, 5900 }, - { 5902, 5905 }, - { 5920, 5937 }, + { 5888, 5905 }, + { 5919, 5937 }, { 5952, 5969 }, { 5984, 5996 }, { 5998, 6000 }, @@ -1598,7 +1647,7 @@ static const URange16 Lo_range16[] = { { 6656, 6678 }, { 6688, 6740 }, { 6917, 6963 }, - { 6981, 6987 }, + { 6981, 6988 }, { 7043, 7072 }, { 7086, 7087 }, { 7098, 7141 }, @@ -1631,8 +1680,7 @@ static const URange16 Lo_range16[] = { { 12704, 12735 }, { 12784, 12799 }, { 13312, 19903 }, - { 19968, 40956 }, - { 40960, 40980 }, + { 19968, 40980 }, { 40982, 42124 }, { 42192, 42231 }, { 42240, 42507 }, @@ -1762,9 +1810,12 @@ static const URange32 Lo_range32[] = { { 69376, 69404 }, { 69415, 69415 }, { 69424, 69445 }, + { 69488, 69505 }, { 69552, 69572 }, { 69600, 69622 }, { 69635, 69687 }, + { 69745, 69746 }, + { 69749, 69749 }, { 69763, 69807 }, { 69840, 69864 }, { 69891, 69926 }, @@ -1806,6 +1857,7 @@ static const URange32 Lo_range32[] = { { 71296, 71338 }, { 71352, 71352 }, { 71424, 71450 }, + { 71488, 71494 }, { 71680, 71723 }, { 71935, 71942 }, { 71945, 71945 }, @@ -1824,7 +1876,7 @@ static const URange32 Lo_range32[] = { { 72272, 72272 }, { 72284, 72329 }, { 72349, 72349 }, - { 72384, 72440 }, + { 72368, 72440 }, { 72704, 72712 }, { 72714, 72750 }, { 72768, 72768 }, @@ -1841,10 +1893,12 @@ static const URange32 Lo_range32[] = { { 73648, 73648 }, { 73728, 74649 }, { 74880, 75075 }, + { 77712, 77808 }, { 77824, 78894 }, { 82944, 83526 }, { 92160, 92728 }, { 92736, 92766 }, + { 92784, 92862 }, { 92880, 92909 }, { 92928, 92975 }, { 93027, 93047 }, @@ -1854,7 +1908,7 @@ static const URange32 Lo_range32[] = { { 94208, 100343 }, { 100352, 101589 }, { 101632, 101640 }, - { 110592, 110878 }, + { 110592, 110882 }, { 110928, 110930 }, { 110948, 110951 }, { 110960, 111355 }, @@ -1862,9 +1916,15 @@ static const URange32 Lo_range32[] = { { 113776, 113788 }, { 113792, 113800 }, { 113808, 113817 }, + { 122634, 122634 }, { 123136, 123180 }, { 123214, 123214 }, + { 123536, 123565 }, { 123584, 123627 }, + { 124896, 124902 }, + { 124904, 124907 }, + { 124909, 124910 }, + { 124912, 124926 }, { 124928, 125124 }, { 126464, 126467 }, { 126469, 126495 }, @@ -1899,8 +1959,8 @@ static const URange32 Lo_range32[] = { { 126625, 126627 }, { 126629, 126633 }, { 126635, 126651 }, - { 131072, 173789 }, - { 173824, 177972 }, + { 131072, 173791 }, + { 173824, 177976 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, @@ -2351,7 +2411,7 @@ static const URange16 Lu_range16[] = { { 8510, 8511 }, { 8517, 8517 }, { 8579, 8579 }, - { 11264, 11310 }, + { 11264, 11311 }, { 11360, 11360 }, { 11362, 11364 }, { 11367, 11367 }, @@ -2516,15 +2576,23 @@ static const URange16 Lu_range16[] = { { 42938, 42938 }, { 42940, 42940 }, { 42942, 42942 }, + { 42944, 42944 }, { 42946, 42946 }, { 42948, 42951 }, { 42953, 42953 }, + { 42960, 42960 }, + { 42966, 42966 }, + { 42968, 42968 }, { 42997, 42997 }, { 65313, 65338 }, }; static const URange32 Lu_range32[] = { { 66560, 66599 }, { 66736, 66771 }, + { 66928, 66938 }, + { 66940, 66954 }, + { 66956, 66962 }, + { 66964, 66965 }, { 68736, 68786 }, { 71840, 71871 }, { 93760, 93791 }, @@ -2586,7 +2654,8 @@ static const URange16 M_range16[] = { { 2085, 2087 }, { 2089, 2093 }, { 2137, 2139 }, - { 2259, 2273 }, + { 2200, 2207 }, + { 2250, 2273 }, { 2275, 2307 }, { 2362, 2364 }, { 2366, 2383 }, @@ -2628,6 +2697,7 @@ static const URange16 M_range16[] = { { 3018, 3021 }, { 3031, 3031 }, { 3072, 3076 }, + { 3132, 3132 }, { 3134, 3140 }, { 3142, 3144 }, { 3146, 3149 }, @@ -2679,13 +2749,14 @@ static const URange16 M_range16[] = { { 4239, 4239 }, { 4250, 4253 }, { 4957, 4959 }, - { 5906, 5908 }, + { 5906, 5909 }, { 5938, 5940 }, { 5970, 5971 }, { 6002, 6003 }, { 6068, 6099 }, { 6109, 6109 }, { 6155, 6157 }, + { 6159, 6159 }, { 6277, 6278 }, { 6313, 6313 }, { 6432, 6443 }, @@ -2694,7 +2765,7 @@ static const URange16 M_range16[] = { { 6741, 6750 }, { 6752, 6780 }, { 6783, 6783 }, - { 6832, 6848 }, + { 6832, 6862 }, { 6912, 6916 }, { 6964, 6980 }, { 7019, 7027 }, @@ -2707,8 +2778,7 @@ static const URange16 M_range16[] = { { 7405, 7405 }, { 7412, 7412 }, { 7415, 7417 }, - { 7616, 7673 }, - { 7675, 7679 }, + { 7616, 7679 }, { 8400, 8432 }, { 11503, 11505 }, { 11647, 11647 }, @@ -2763,10 +2833,14 @@ static const URange32 M_range32[] = { { 68900, 68903 }, { 69291, 69292 }, { 69446, 69456 }, + { 69506, 69509 }, { 69632, 69634 }, { 69688, 69702 }, + { 69744, 69744 }, + { 69747, 69748 }, { 69759, 69762 }, { 69808, 69818 }, + { 69826, 69826 }, { 69888, 69890 }, { 69927, 69940 }, { 69957, 69958 }, @@ -2832,6 +2906,8 @@ static const URange32 M_range32[] = { { 94180, 94180 }, { 94192, 94193 }, { 113821, 113822 }, + { 118528, 118573 }, + { 118576, 118598 }, { 119141, 119145 }, { 119149, 119154 }, { 119163, 119170 }, @@ -2850,6 +2926,7 @@ static const URange32 M_range32[] = { { 122915, 122916 }, { 122918, 122922 }, { 123184, 123190 }, + { 123566, 123566 }, { 123628, 123631 }, { 125136, 125142 }, { 125252, 125258 }, @@ -2913,6 +2990,8 @@ static const URange16 Mc_range16[] = { { 4231, 4236 }, { 4239, 4239 }, { 4250, 4252 }, + { 5909, 5909 }, + { 5940, 5940 }, { 6070, 6070 }, { 6078, 6085 }, { 6087, 6088 }, @@ -3066,7 +3145,8 @@ static const URange16 Mn_range16[] = { { 2085, 2087 }, { 2089, 2093 }, { 2137, 2139 }, - { 2259, 2273 }, + { 2200, 2207 }, + { 2250, 2273 }, { 2275, 2306 }, { 2362, 2362 }, { 2364, 2364 }, @@ -3107,6 +3187,7 @@ static const URange16 Mn_range16[] = { { 3021, 3021 }, { 3072, 3072 }, { 3076, 3076 }, + { 3132, 3132 }, { 3134, 3136 }, { 3142, 3144 }, { 3146, 3149 }, @@ -3156,7 +3237,7 @@ static const URange16 Mn_range16[] = { { 4253, 4253 }, { 4957, 4959 }, { 5906, 5908 }, - { 5938, 5940 }, + { 5938, 5939 }, { 5970, 5971 }, { 6002, 6003 }, { 6068, 6069 }, @@ -3165,6 +3246,7 @@ static const URange16 Mn_range16[] = { { 6089, 6099 }, { 6109, 6109 }, { 6155, 6157 }, + { 6159, 6159 }, { 6277, 6278 }, { 6313, 6313 }, { 6432, 6434 }, @@ -3181,7 +3263,7 @@ static const URange16 Mn_range16[] = { { 6771, 6780 }, { 6783, 6783 }, { 6832, 6845 }, - { 6847, 6848 }, + { 6847, 6862 }, { 6912, 6915 }, { 6964, 6964 }, { 6966, 6970 }, @@ -3204,8 +3286,7 @@ static const URange16 Mn_range16[] = { { 7405, 7405 }, { 7412, 7412 }, { 7416, 7417 }, - { 7616, 7673 }, - { 7675, 7679 }, + { 7616, 7679 }, { 8400, 8412 }, { 8417, 8417 }, { 8421, 8432 }, @@ -3266,11 +3347,15 @@ static const URange32 Mn_range32[] = { { 68900, 68903 }, { 69291, 69292 }, { 69446, 69456 }, + { 69506, 69509 }, { 69633, 69633 }, { 69688, 69702 }, + { 69744, 69744 }, + { 69747, 69748 }, { 69759, 69761 }, { 69811, 69814 }, { 69817, 69818 }, + { 69826, 69826 }, { 69888, 69890 }, { 69927, 69931 }, { 69933, 69940 }, @@ -3350,6 +3435,8 @@ static const URange32 Mn_range32[] = { { 94095, 94098 }, { 94180, 94180 }, { 113821, 113822 }, + { 118528, 118573 }, + { 118576, 118598 }, { 119143, 119145 }, { 119163, 119170 }, { 119173, 119179 }, @@ -3367,6 +3454,7 @@ static const URange32 Mn_range32[] = { { 122915, 122916 }, { 122918, 122922 }, { 123184, 123190 }, + { 123566, 123566 }, { 123628, 123631 }, { 125136, 125142 }, { 125252, 125258 }, @@ -3491,6 +3579,7 @@ static const URange32 N_range32[] = { { 73664, 73684 }, { 74752, 74862 }, { 92768, 92777 }, + { 92864, 92873 }, { 93008, 93017 }, { 93019, 93025 }, { 93824, 93846 }, @@ -3567,6 +3656,7 @@ static const URange32 Nd_range32[] = { { 73040, 73049 }, { 73120, 73129 }, { 92768, 92777 }, + { 92864, 92873 }, { 93008, 93017 }, { 120782, 120831 }, { 123200, 123209 }, @@ -3693,7 +3783,7 @@ static const URange16 P_range16[] = { { 1545, 1546 }, { 1548, 1549 }, { 1563, 1563 }, - { 1566, 1567 }, + { 1565, 1567 }, { 1642, 1645 }, { 1748, 1748 }, { 1792, 1805 }, @@ -3732,6 +3822,7 @@ static const URange16 P_range16[] = { { 6816, 6822 }, { 6824, 6829 }, { 7002, 7008 }, + { 7037, 7038 }, { 7164, 7167 }, { 7227, 7231 }, { 7294, 7295 }, @@ -3756,7 +3847,7 @@ static const URange16 P_range16[] = { { 11632, 11632 }, { 11776, 11822 }, { 11824, 11855 }, - { 11858, 11858 }, + { 11858, 11869 }, { 12289, 12291 }, { 12296, 12305 }, { 12308, 12319 }, @@ -3814,6 +3905,7 @@ static const URange32 P_range32[] = { { 68505, 68508 }, { 69293, 69293 }, { 69461, 69465 }, + { 69510, 69513 }, { 69703, 69709 }, { 69819, 69820 }, { 69822, 69825 }, @@ -3832,6 +3924,7 @@ static const URange32 P_range32[] = { { 71105, 71127 }, { 71233, 71235 }, { 71264, 71276 }, + { 71353, 71353 }, { 71484, 71486 }, { 71739, 71739 }, { 72004, 72006 }, @@ -3844,6 +3937,7 @@ static const URange32 P_range32[] = { { 73463, 73464 }, { 73727, 73727 }, { 74864, 74868 }, + { 77809, 77810 }, { 92782, 92783 }, { 92917, 92917 }, { 92983, 92987 }, @@ -3873,6 +3967,7 @@ static const URange16 Pd_range16[] = { { 11802, 11802 }, { 11834, 11835 }, { 11840, 11840 }, + { 11869, 11869 }, { 12316, 12316 }, { 12336, 12336 }, { 12448, 12448 }, @@ -3928,6 +4023,10 @@ static const URange16 Pe_range16[] = { { 11813, 11813 }, { 11815, 11815 }, { 11817, 11817 }, + { 11862, 11862 }, + { 11864, 11864 }, + { 11866, 11866 }, + { 11868, 11868 }, { 12297, 12297 }, { 12299, 12299 }, { 12301, 12301 }, @@ -4007,7 +4106,7 @@ static const URange16 Po_range16[] = { { 1545, 1546 }, { 1548, 1549 }, { 1563, 1563 }, - { 1566, 1567 }, + { 1565, 1567 }, { 1642, 1645 }, { 1748, 1748 }, { 1792, 1805 }, @@ -4044,6 +4143,7 @@ static const URange16 Po_range16[] = { { 6816, 6822 }, { 6824, 6829 }, { 7002, 7008 }, + { 7037, 7038 }, { 7164, 7167 }, { 7227, 7231 }, { 7294, 7295 }, @@ -4072,7 +4172,7 @@ static const URange16 Po_range16[] = { { 11836, 11839 }, { 11841, 11841 }, { 11843, 11855 }, - { 11858, 11858 }, + { 11858, 11860 }, { 12289, 12291 }, { 12349, 12349 }, { 12539, 12539 }, @@ -4128,6 +4228,7 @@ static const URange32 Po_range32[] = { { 68409, 68415 }, { 68505, 68508 }, { 69461, 69465 }, + { 69510, 69513 }, { 69703, 69709 }, { 69819, 69820 }, { 69822, 69825 }, @@ -4146,6 +4247,7 @@ static const URange32 Po_range32[] = { { 71105, 71127 }, { 71233, 71235 }, { 71264, 71276 }, + { 71353, 71353 }, { 71484, 71486 }, { 71739, 71739 }, { 72004, 72006 }, @@ -4158,6 +4260,7 @@ static const URange32 Po_range32[] = { { 73463, 73464 }, { 73727, 73727 }, { 74864, 74868 }, + { 77809, 77810 }, { 92782, 92783 }, { 92917, 92917 }, { 92983, 92987 }, @@ -4215,6 +4318,10 @@ static const URange16 Ps_range16[] = { { 11814, 11814 }, { 11816, 11816 }, { 11842, 11842 }, + { 11861, 11861 }, + { 11863, 11863 }, + { 11865, 11865 }, + { 11867, 11867 }, { 12296, 12296 }, { 12298, 12298 }, { 12300, 12300 }, @@ -4279,6 +4386,7 @@ static const URange16 S_range16[] = { { 1789, 1790 }, { 2038, 2038 }, { 2046, 2047 }, + { 2184, 2184 }, { 2546, 2547 }, { 2554, 2555 }, { 2801, 2801 }, @@ -4317,7 +4425,7 @@ static const URange16 S_range16[] = { { 8274, 8274 }, { 8314, 8316 }, { 8330, 8332 }, - { 8352, 8383 }, + { 8352, 8384 }, { 8448, 8449 }, { 8451, 8454 }, { 8456, 8457 }, @@ -4379,8 +4487,10 @@ static const URange16 S_range16[] = { { 43867, 43867 }, { 43882, 43883 }, { 64297, 64297 }, - { 64434, 64449 }, - { 65020, 65021 }, + { 64434, 64450 }, + { 64832, 64847 }, + { 64975, 64975 }, + { 65020, 65023 }, { 65122, 65122 }, { 65124, 65126 }, { 65129, 65129 }, @@ -4409,13 +4519,14 @@ static const URange32 S_range32[] = { { 92988, 92991 }, { 92997, 92997 }, { 113820, 113820 }, + { 118608, 118723 }, { 118784, 119029 }, { 119040, 119078 }, { 119081, 119140 }, { 119146, 119148 }, { 119171, 119172 }, { 119180, 119209 }, - { 119214, 119272 }, + { 119214, 119274 }, { 119296, 119361 }, { 119365, 119365 }, { 119552, 119638 }, @@ -4453,28 +4564,29 @@ static const URange32 S_range32[] = { { 127568, 127569 }, { 127584, 127589 }, { 127744, 128727 }, - { 128736, 128748 }, + { 128733, 128748 }, { 128752, 128764 }, { 128768, 128883 }, { 128896, 128984 }, { 128992, 129003 }, + { 129008, 129008 }, { 129024, 129035 }, { 129040, 129095 }, { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, { 129200, 129201 }, - { 129280, 129400 }, - { 129402, 129483 }, - { 129485, 129619 }, + { 129280, 129619 }, { 129632, 129645 }, { 129648, 129652 }, - { 129656, 129658 }, + { 129656, 129660 }, { 129664, 129670 }, - { 129680, 129704 }, - { 129712, 129718 }, - { 129728, 129730 }, - { 129744, 129750 }, + { 129680, 129708 }, + { 129712, 129722 }, + { 129728, 129733 }, + { 129744, 129753 }, + { 129760, 129767 }, + { 129776, 129782 }, { 129792, 129938 }, { 129940, 129994 }, }; @@ -4490,7 +4602,7 @@ static const URange16 Sc_range16[] = { { 3065, 3065 }, { 3647, 3647 }, { 6107, 6107 }, - { 8352, 8383 }, + { 8352, 8384 }, { 43064, 43064 }, { 65020, 65020 }, { 65129, 65129 }, @@ -4517,6 +4629,7 @@ static const URange16 Sk_range16[] = { { 751, 767 }, { 885, 885 }, { 900, 901 }, + { 2184, 2184 }, { 8125, 8125 }, { 8127, 8129 }, { 8141, 8143 }, @@ -4529,7 +4642,7 @@ static const URange16 Sk_range16[] = { { 42889, 42890 }, { 43867, 43867 }, { 43882, 43883 }, - { 64434, 64449 }, + { 64434, 64450 }, { 65342, 65342 }, { 65344, 65344 }, { 65507, 65507 }, @@ -4713,7 +4826,9 @@ static const URange16 So_range16[] = { { 43062, 43063 }, { 43065, 43065 }, { 43639, 43641 }, - { 65021, 65021 }, + { 64832, 64847 }, + { 64975, 64975 }, + { 65021, 65023 }, { 65508, 65508 }, { 65512, 65512 }, { 65517, 65518 }, @@ -4734,13 +4849,14 @@ static const URange32 So_range32[] = { { 92988, 92991 }, { 92997, 92997 }, { 113820, 113820 }, + { 118608, 118723 }, { 118784, 119029 }, { 119040, 119078 }, { 119081, 119140 }, { 119146, 119148 }, { 119171, 119172 }, { 119180, 119209 }, - { 119214, 119272 }, + { 119214, 119274 }, { 119296, 119361 }, { 119365, 119365 }, { 119552, 119638 }, @@ -4766,28 +4882,29 @@ static const URange32 So_range32[] = { { 127584, 127589 }, { 127744, 127994 }, { 128000, 128727 }, - { 128736, 128748 }, + { 128733, 128748 }, { 128752, 128764 }, { 128768, 128883 }, { 128896, 128984 }, { 128992, 129003 }, + { 129008, 129008 }, { 129024, 129035 }, { 129040, 129095 }, { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, { 129200, 129201 }, - { 129280, 129400 }, - { 129402, 129483 }, - { 129485, 129619 }, + { 129280, 129619 }, { 129632, 129645 }, { 129648, 129652 }, - { 129656, 129658 }, + { 129656, 129660 }, { 129664, 129670 }, - { 129680, 129704 }, - { 129712, 129718 }, - { 129728, 129730 }, - { 129744, 129750 }, + { 129680, 129708 }, + { 129712, 129722 }, + { 129728, 129733 }, + { 129744, 129753 }, + { 129760, 129767 }, + { 129776, 129782 }, { 129792, 129938 }, { 129940, 129994 }, }; @@ -4824,7 +4941,7 @@ static const URange32 Adlam_range32[] = { static const URange32 Ahom_range32[] = { { 71424, 71450 }, { 71453, 71467 }, - { 71472, 71487 }, + { 71472, 71494 }, }; static const URange32 Anatolian_Hieroglyphs_range32[] = { { 82944, 83526 }, @@ -4833,23 +4950,23 @@ static const URange16 Arabic_range16[] = { { 1536, 1540 }, { 1542, 1547 }, { 1549, 1562 }, - { 1564, 1564 }, - { 1566, 1566 }, + { 1564, 1566 }, { 1568, 1599 }, { 1601, 1610 }, { 1622, 1647 }, { 1649, 1756 }, { 1758, 1791 }, { 1872, 1919 }, - { 2208, 2228 }, - { 2230, 2247 }, - { 2259, 2273 }, + { 2160, 2190 }, + { 2192, 2193 }, + { 2200, 2273 }, { 2275, 2303 }, - { 64336, 64449 }, + { 64336, 64450 }, { 64467, 64829 }, - { 64848, 64911 }, + { 64832, 64911 }, { 64914, 64967 }, - { 65008, 65021 }, + { 64975, 64975 }, + { 65008, 65023 }, { 65136, 65140 }, { 65142, 65276 }, }; @@ -4901,8 +5018,8 @@ static const URange32 Avestan_range32[] = { { 68409, 68415 }, }; static const URange16 Balinese_range16[] = { - { 6912, 6987 }, - { 6992, 7036 }, + { 6912, 6988 }, + { 6992, 7038 }, }; static const URange16 Bamum_range16[] = { { 42656, 42743 }, @@ -4947,7 +5064,7 @@ static const URange16 Bopomofo_range16[] = { }; static const URange32 Brahmi_range32[] = { { 69632, 69709 }, - { 69714, 69743 }, + { 69714, 69749 }, { 69759, 69759 }, }; static const URange16 Braille_range16[] = { @@ -4964,6 +5081,9 @@ static const URange16 Canadian_Aboriginal_range16[] = { { 5120, 5759 }, { 6320, 6389 }, }; +static const URange32 Canadian_Aboriginal_range32[] = { + { 72368, 72383 }, +}; static const URange32 Carian_range32[] = { { 66208, 66256 }, }; @@ -5030,7 +5150,7 @@ static const URange16 Common_range16[] = { { 8294, 8304 }, { 8308, 8318 }, { 8320, 8334 }, - { 8352, 8383 }, + { 8352, 8384 }, { 8448, 8485 }, { 8487, 8489 }, { 8492, 8497 }, @@ -5043,7 +5163,7 @@ static const URange16 Common_range16[] = { { 10496, 11123 }, { 11126, 11157 }, { 11159, 11263 }, - { 11776, 11858 }, + { 11776, 11869 }, { 12272, 12283 }, { 12288, 12292 }, { 12294, 12294 }, @@ -5089,15 +5209,15 @@ static const URange32 Common_range32[] = { { 65936, 65948 }, { 66000, 66044 }, { 66273, 66299 }, - { 94178, 94179 }, { 113824, 113827 }, + { 118608, 118723 }, { 118784, 119029 }, { 119040, 119078 }, { 119081, 119142 }, { 119146, 119162 }, { 119171, 119172 }, { 119180, 119209 }, - { 119214, 119272 }, + { 119214, 119274 }, { 119520, 119539 }, { 119552, 119638 }, { 119648, 119672 }, @@ -5138,28 +5258,29 @@ static const URange32 Common_range32[] = { { 127568, 127569 }, { 127584, 127589 }, { 127744, 128727 }, - { 128736, 128748 }, + { 128733, 128748 }, { 128752, 128764 }, { 128768, 128883 }, { 128896, 128984 }, { 128992, 129003 }, + { 129008, 129008 }, { 129024, 129035 }, { 129040, 129095 }, { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, { 129200, 129201 }, - { 129280, 129400 }, - { 129402, 129483 }, - { 129485, 129619 }, + { 129280, 129619 }, { 129632, 129645 }, { 129648, 129652 }, - { 129656, 129658 }, + { 129656, 129660 }, { 129664, 129670 }, - { 129680, 129704 }, - { 129712, 129718 }, - { 129728, 129730 }, - { 129744, 129750 }, + { 129680, 129708 }, + { 129712, 129722 }, + { 129728, 129733 }, + { 129744, 129753 }, + { 129760, 129767 }, + { 129776, 129782 }, { 129792, 129938 }, { 129940, 129994 }, { 130032, 130041 }, @@ -5185,6 +5306,9 @@ static const URange32 Cypriot_range32[] = { { 67644, 67644 }, { 67647, 67647 }, }; +static const URange32 Cypro_Minoan_range32[] = { + { 77712, 77810 }, +}; static const URange16 Cyrillic_range16[] = { { 1024, 1156 }, { 1159, 1327 }, @@ -5268,6 +5392,12 @@ static const URange16 Ethiopic_range16[] = { { 43808, 43814 }, { 43816, 43822 }, }; +static const URange32 Ethiopic_range32[] = { + { 124896, 124902 }, + { 124904, 124907 }, + { 124909, 124910 }, + { 124912, 124926 }, +}; static const URange16 Georgian_range16[] = { { 4256, 4293 }, { 4295, 4295 }, @@ -5281,8 +5411,7 @@ static const URange16 Georgian_range16[] = { { 11565, 11565 }, }; static const URange16 Glagolitic_range16[] = { - { 11264, 11310 }, - { 11312, 11358 }, + { 11264, 11359 }, }; static const URange32 Glagolitic_range32[] = { { 122880, 122886 }, @@ -5402,14 +5531,15 @@ static const URange16 Han_range16[] = { { 12321, 12329 }, { 12344, 12347 }, { 13312, 19903 }, - { 19968, 40956 }, + { 19968, 40959 }, { 63744, 64109 }, { 64112, 64217 }, }; static const URange32 Han_range32[] = { + { 94178, 94179 }, { 94192, 94193 }, - { 131072, 173789 }, - { 173824, 177972 }, + { 131072, 173791 }, + { 173824, 177976 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, @@ -5460,7 +5590,7 @@ static const URange16 Hiragana_range16[] = { { 12445, 12447 }, }; static const URange32 Hiragana_range32[] = { - { 110593, 110878 }, + { 110593, 110879 }, { 110928, 110930 }, { 127488, 127488 }, }; @@ -5474,15 +5604,14 @@ static const URange16 Inherited_range16[] = { { 1611, 1621 }, { 1648, 1648 }, { 2385, 2388 }, - { 6832, 6848 }, + { 6832, 6862 }, { 7376, 7378 }, { 7380, 7392 }, { 7394, 7400 }, { 7405, 7405 }, { 7412, 7412 }, { 7416, 7417 }, - { 7616, 7673 }, - { 7675, 7679 }, + { 7616, 7679 }, { 8204, 8205 }, { 8400, 8432 }, { 12330, 12333 }, @@ -5494,6 +5623,8 @@ static const URange32 Inherited_range32[] = { { 66045, 66045 }, { 66272, 66272 }, { 70459, 70459 }, + { 118528, 118573 }, + { 118576, 118598 }, { 119143, 119145 }, { 119163, 119170 }, { 119173, 119179 }, @@ -5514,7 +5645,7 @@ static const URange16 Javanese_range16[] = { { 43486, 43487 }, }; static const URange32 Kaithi_range32[] = { - { 69760, 69825 }, + { 69760, 69826 }, { 69837, 69837 }, }; static const URange16 Kannada_range16[] = { @@ -5527,7 +5658,7 @@ static const URange16 Kannada_range16[] = { { 3270, 3272 }, { 3274, 3277 }, { 3285, 3286 }, - { 3294, 3294 }, + { 3293, 3294 }, { 3296, 3299 }, { 3302, 3311 }, { 3313, 3314 }, @@ -5542,7 +5673,11 @@ static const URange16 Katakana_range16[] = { { 65393, 65437 }, }; static const URange32 Katakana_range32[] = { + { 110576, 110579 }, + { 110581, 110587 }, + { 110589, 110590 }, { 110592, 110592 }, + { 110880, 110882 }, { 110948, 110951 }, }; static const URange16 Kayah_Li_range16[] = { @@ -5614,9 +5749,11 @@ static const URange16 Latin_range16[] = { { 8544, 8584 }, { 11360, 11391 }, { 42786, 42887 }, - { 42891, 42943 }, - { 42946, 42954 }, - { 42997, 43007 }, + { 42891, 42954 }, + { 42960, 42961 }, + { 42963, 42963 }, + { 42965, 42969 }, + { 42994, 43007 }, { 43824, 43866 }, { 43868, 43876 }, { 43878, 43881 }, @@ -5624,6 +5761,12 @@ static const URange16 Latin_range16[] = { { 65313, 65338 }, { 65345, 65370 }, }; +static const URange32 Latin_range32[] = { + { 67456, 67461 }, + { 67463, 67504 }, + { 67506, 67514 }, + { 122624, 122654 }, +}; static const URange16 Lepcha_range16[] = { { 7168, 7223 }, { 7227, 7241 }, @@ -5732,8 +5875,7 @@ static const URange32 Modi_range32[] = { static const URange16 Mongolian_range16[] = { { 6144, 6145 }, { 6148, 6148 }, - { 6150, 6158 }, - { 6160, 6169 }, + { 6150, 6169 }, { 6176, 6264 }, { 6272, 6314 }, }; @@ -5824,6 +5966,9 @@ static const URange32 Old_South_Arabian_range32[] = { static const URange32 Old_Turkic_range32[] = { { 68608, 68680 }, }; +static const URange32 Old_Uyghur_range32[] = { + { 69488, 69513 }, +}; static const URange16 Oriya_range16[] = { { 2817, 2819 }, { 2821, 2828 }, @@ -5945,8 +6090,8 @@ static const URange16 Syriac_range16[] = { { 2144, 2154 }, }; static const URange16 Tagalog_range16[] = { - { 5888, 5900 }, - { 5902, 5908 }, + { 5888, 5909 }, + { 5919, 5919 }, }; static const URange16 Tagbanwa_range16[] = { { 5984, 5996 }, @@ -5969,7 +6114,7 @@ static const URange16 Tai_Viet_range16[] = { { 43739, 43743 }, }; static const URange32 Takri_range32[] = { - { 71296, 71352 }, + { 71296, 71353 }, { 71360, 71369 }, }; static const URange16 Tamil_range16[] = { @@ -5994,6 +6139,10 @@ static const URange32 Tamil_range32[] = { { 73664, 73713 }, { 73727, 73727 }, }; +static const URange32 Tangsa_range32[] = { + { 92784, 92862 }, + { 92864, 92873 }, +}; static const URange32 Tangut_range32[] = { { 94176, 94176 }, { 94208, 100343 }, @@ -6005,11 +6154,12 @@ static const URange16 Telugu_range16[] = { { 3086, 3088 }, { 3090, 3112 }, { 3114, 3129 }, - { 3133, 3140 }, + { 3132, 3140 }, { 3142, 3144 }, { 3146, 3149 }, { 3157, 3158 }, { 3160, 3162 }, + { 3165, 3165 }, { 3168, 3171 }, { 3174, 3183 }, { 3191, 3199 }, @@ -6039,6 +6189,9 @@ static const URange32 Tirhuta_range32[] = { { 70784, 70855 }, { 70864, 70873 }, }; +static const URange32 Toto_range32[] = { + { 123536, 123566 }, +}; static const URange32 Ugaritic_range32[] = { { 66432, 66461 }, { 66463, 66463 }, @@ -6046,6 +6199,16 @@ static const URange32 Ugaritic_range32[] = { static const URange16 Vai_range16[] = { { 42240, 42539 }, }; +static const URange32 Vithkuqi_range32[] = { + { 66928, 66938 }, + { 66940, 66954 }, + { 66956, 66962 }, + { 66964, 66965 }, + { 66967, 66977 }, + { 66979, 66993 }, + { 66995, 67001 }, + { 67003, 67004 }, +}; static const URange32 Wancho_range32[] = { { 123584, 123641 }, { 123647, 123647 }, @@ -6066,7 +6229,7 @@ static const URange16 Yi_range16[] = { static const URange32 Zanabazar_Square_range32[] = { { 72192, 72263 }, }; -// 4001 16-bit ranges, 1602 32-bit ranges +// 4038 16-bit ranges, 1712 32-bit ranges const UGroup unicode_groups[] = { { "Adlam", +1, 0, 0, Adlam_range32, 3 }, { "Ahom", +1, 0, 0, Ahom_range32, 3 }, @@ -6085,22 +6248,23 @@ const UGroup unicode_groups[] = { { "Braille", +1, Braille_range16, 1, 0, 0 }, { "Buginese", +1, Buginese_range16, 2, 0, 0 }, { "Buhid", +1, Buhid_range16, 1, 0, 0 }, - { "C", +1, C_range16, 16, C_range32, 9 }, - { "Canadian_Aboriginal", +1, Canadian_Aboriginal_range16, 2, 0, 0 }, + { "C", +1, C_range16, 17, C_range32, 9 }, + { "Canadian_Aboriginal", +1, Canadian_Aboriginal_range16, 2, Canadian_Aboriginal_range32, 1 }, { "Carian", +1, 0, 0, Carian_range32, 1 }, { "Caucasian_Albanian", +1, 0, 0, Caucasian_Albanian_range32, 2 }, { "Cc", +1, Cc_range16, 2, 0, 0 }, - { "Cf", +1, Cf_range16, 13, Cf_range32, 7 }, + { "Cf", +1, Cf_range16, 14, Cf_range32, 7 }, { "Chakma", +1, 0, 0, Chakma_range32, 2 }, { "Cham", +1, Cham_range16, 4, 0, 0 }, { "Cherokee", +1, Cherokee_range16, 3, 0, 0 }, { "Chorasmian", +1, 0, 0, Chorasmian_range32, 1 }, { "Co", +1, Co_range16, 1, Co_range32, 2 }, - { "Common", +1, Common_range16, 91, Common_range32, 82 }, + { "Common", +1, Common_range16, 91, Common_range32, 83 }, { "Coptic", +1, Coptic_range16, 3, 0, 0 }, { "Cs", +1, Cs_range16, 1, 0, 0 }, { "Cuneiform", +1, 0, 0, Cuneiform_range32, 4 }, { "Cypriot", +1, 0, 0, Cypriot_range32, 6 }, + { "Cypro_Minoan", +1, 0, 0, Cypro_Minoan_range32, 1 }, { "Cyrillic", +1, Cyrillic_range16, 8, 0, 0 }, { "Deseret", +1, 0, 0, Deseret_range32, 1 }, { "Devanagari", +1, Devanagari_range16, 4, 0, 0 }, @@ -6110,16 +6274,16 @@ const UGroup unicode_groups[] = { { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 2 }, { "Elbasan", +1, 0, 0, Elbasan_range32, 1 }, { "Elymaic", +1, 0, 0, Elymaic_range32, 1 }, - { "Ethiopic", +1, Ethiopic_range16, 32, 0, 0 }, + { "Ethiopic", +1, Ethiopic_range16, 32, Ethiopic_range32, 4 }, { "Georgian", +1, Georgian_range16, 10, 0, 0 }, - { "Glagolitic", +1, Glagolitic_range16, 2, Glagolitic_range32, 5 }, + { "Glagolitic", +1, Glagolitic_range16, 1, Glagolitic_range32, 5 }, { "Gothic", +1, 0, 0, Gothic_range32, 1 }, { "Grantha", +1, 0, 0, Grantha_range32, 15 }, { "Greek", +1, Greek_range16, 33, Greek_range32, 3 }, { "Gujarati", +1, Gujarati_range16, 14, 0, 0 }, { "Gunjala_Gondi", +1, 0, 0, Gunjala_Gondi_range32, 6 }, { "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 }, - { "Han", +1, Han_range16, 11, Han_range32, 8 }, + { "Han", +1, Han_range16, 11, Han_range32, 9 }, { "Hangul", +1, Hangul_range16, 14, 0, 0 }, { "Hanifi_Rohingya", +1, 0, 0, Hanifi_Rohingya_range32, 2 }, { "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 }, @@ -6127,35 +6291,35 @@ const UGroup unicode_groups[] = { { "Hebrew", +1, Hebrew_range16, 9, 0, 0 }, { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 3 }, { "Imperial_Aramaic", +1, 0, 0, Imperial_Aramaic_range32, 2 }, - { "Inherited", +1, Inherited_range16, 20, Inherited_range32, 8 }, + { "Inherited", +1, Inherited_range16, 19, Inherited_range32, 10 }, { "Inscriptional_Pahlavi", +1, 0, 0, Inscriptional_Pahlavi_range32, 2 }, { "Inscriptional_Parthian", +1, 0, 0, Inscriptional_Parthian_range32, 2 }, { "Javanese", +1, Javanese_range16, 3, 0, 0 }, { "Kaithi", +1, 0, 0, Kaithi_range32, 2 }, { "Kannada", +1, Kannada_range16, 13, 0, 0 }, - { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 2 }, + { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 6 }, { "Kayah_Li", +1, Kayah_Li_range16, 2, 0, 0 }, { "Kharoshthi", +1, 0, 0, Kharoshthi_range32, 8 }, { "Khitan_Small_Script", +1, 0, 0, Khitan_Small_Script_range32, 2 }, { "Khmer", +1, Khmer_range16, 4, 0, 0 }, { "Khojki", +1, 0, 0, Khojki_range32, 2 }, { "Khudawadi", +1, 0, 0, Khudawadi_range32, 2 }, - { "L", +1, L_range16, 380, L_range32, 242 }, + { "L", +1, L_range16, 380, L_range32, 268 }, { "Lao", +1, Lao_range16, 11, 0, 0 }, - { "Latin", +1, Latin_range16, 32, 0, 0 }, + { "Latin", +1, Latin_range16, 34, Latin_range32, 4 }, { "Lepcha", +1, Lepcha_range16, 3, 0, 0 }, { "Limbu", +1, Limbu_range16, 5, 0, 0 }, { "Linear_A", +1, 0, 0, Linear_A_range32, 3 }, { "Linear_B", +1, 0, 0, Linear_B_range32, 7 }, { "Lisu", +1, Lisu_range16, 1, Lisu_range32, 1 }, - { "Ll", +1, Ll_range16, 611, Ll_range32, 34 }, - { "Lm", +1, Lm_range16, 55, Lm_range32, 6 }, - { "Lo", +1, Lo_range16, 290, Lo_range32, 199 }, + { "Ll", +1, Ll_range16, 617, Ll_range32, 40 }, + { "Lm", +1, Lm_range16, 57, Lm_range32, 12 }, + { "Lo", +1, Lo_range16, 290, Lo_range32, 211 }, { "Lt", +1, Lt_range16, 10, 0, 0 }, - { "Lu", +1, Lu_range16, 601, Lu_range32, 37 }, + { "Lu", +1, Lu_range16, 605, Lu_range32, 41 }, { "Lycian", +1, 0, 0, Lycian_range32, 1 }, { "Lydian", +1, 0, 0, Lydian_range32, 2 }, - { "M", +1, M_range16, 187, M_range32, 103 }, + { "M", +1, M_range16, 189, M_range32, 110 }, { "Mahajani", +1, 0, 0, Mahajani_range32, 1 }, { "Makasar", +1, 0, 0, Makasar_range32, 1 }, { "Malayalam", +1, Malayalam_range16, 7, 0, 0 }, @@ -6163,7 +6327,7 @@ const UGroup unicode_groups[] = { { "Manichaean", +1, 0, 0, Manichaean_range32, 2 }, { "Marchen", +1, 0, 0, Marchen_range32, 3 }, { "Masaram_Gondi", +1, 0, 0, Masaram_Gondi_range32, 7 }, - { "Mc", +1, Mc_range16, 109, Mc_range32, 66 }, + { "Mc", +1, Mc_range16, 111, Mc_range32, 66 }, { "Me", +1, Me_range16, 5, 0, 0 }, { "Medefaidrin", +1, 0, 0, Medefaidrin_range32, 1 }, { "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 }, @@ -6171,16 +6335,16 @@ const UGroup unicode_groups[] = { { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 3 }, { "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 }, { "Miao", +1, 0, 0, Miao_range32, 3 }, - { "Mn", +1, Mn_range16, 210, Mn_range32, 117 }, + { "Mn", +1, Mn_range16, 212, Mn_range32, 124 }, { "Modi", +1, 0, 0, Modi_range32, 2 }, - { "Mongolian", +1, Mongolian_range16, 6, Mongolian_range32, 1 }, + { "Mongolian", +1, Mongolian_range16, 5, Mongolian_range32, 1 }, { "Mro", +1, 0, 0, Mro_range32, 3 }, { "Multani", +1, 0, 0, Multani_range32, 5 }, { "Myanmar", +1, Myanmar_range16, 3, 0, 0 }, - { "N", +1, N_range16, 67, N_range32, 66 }, + { "N", +1, N_range16, 67, N_range32, 67 }, { "Nabataean", +1, 0, 0, Nabataean_range32, 2 }, { "Nandinagari", +1, 0, 0, Nandinagari_range32, 3 }, - { "Nd", +1, Nd_range16, 37, Nd_range32, 24 }, + { "Nd", +1, Nd_range16, 37, Nd_range32, 25 }, { "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 }, { "Newa", +1, 0, 0, Newa_range32, 2 }, { "Nko", +1, Nko_range16, 2, 0, 0 }, @@ -6198,26 +6362,27 @@ const UGroup unicode_groups[] = { { "Old_Sogdian", +1, 0, 0, Old_Sogdian_range32, 1 }, { "Old_South_Arabian", +1, 0, 0, Old_South_Arabian_range32, 1 }, { "Old_Turkic", +1, 0, 0, Old_Turkic_range32, 1 }, + { "Old_Uyghur", +1, 0, 0, Old_Uyghur_range32, 1 }, { "Oriya", +1, Oriya_range16, 14, 0, 0 }, { "Osage", +1, 0, 0, Osage_range32, 2 }, { "Osmanya", +1, 0, 0, Osmanya_range32, 2 }, - { "P", +1, P_range16, 132, P_range32, 53 }, + { "P", +1, P_range16, 133, P_range32, 56 }, { "Pahawh_Hmong", +1, 0, 0, Pahawh_Hmong_range32, 5 }, { "Palmyrene", +1, 0, 0, Palmyrene_range32, 1 }, { "Pau_Cin_Hau", +1, 0, 0, Pau_Cin_Hau_range32, 1 }, { "Pc", +1, Pc_range16, 6, 0, 0 }, - { "Pd", +1, Pd_range16, 17, Pd_range32, 1 }, - { "Pe", +1, Pe_range16, 72, 0, 0 }, + { "Pd", +1, Pd_range16, 18, Pd_range32, 1 }, + { "Pe", +1, Pe_range16, 76, 0, 0 }, { "Pf", +1, Pf_range16, 10, 0, 0 }, { "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 }, { "Phoenician", +1, 0, 0, Phoenician_range32, 2 }, { "Pi", +1, Pi_range16, 11, 0, 0 }, - { "Po", +1, Po_range16, 129, Po_range32, 52 }, - { "Ps", +1, Ps_range16, 75, 0, 0 }, + { "Po", +1, Po_range16, 130, Po_range32, 55 }, + { "Ps", +1, Ps_range16, 79, 0, 0 }, { "Psalter_Pahlavi", +1, 0, 0, Psalter_Pahlavi_range32, 3 }, { "Rejang", +1, Rejang_range16, 2, 0, 0 }, { "Runic", +1, Runic_range16, 2, 0, 0 }, - { "S", +1, S_range16, 148, S_range32, 81 }, + { "S", +1, S_range16, 151, S_range32, 83 }, { "Samaritan", +1, Samaritan_range16, 2, 0, 0 }, { "Saurashtra", +1, Saurashtra_range16, 2, 0, 0 }, { "Sc", +1, Sc_range16, 18, Sc_range32, 3 }, @@ -6226,9 +6391,9 @@ const UGroup unicode_groups[] = { { "Siddham", +1, 0, 0, Siddham_range32, 2 }, { "SignWriting", +1, 0, 0, SignWriting_range32, 3 }, { "Sinhala", +1, Sinhala_range16, 12, Sinhala_range32, 1 }, - { "Sk", +1, Sk_range16, 29, Sk_range32, 1 }, + { "Sk", +1, Sk_range16, 30, Sk_range32, 1 }, { "Sm", +1, Sm_range16, 53, Sm_range32, 11 }, - { "So", +1, So_range16, 112, So_range32, 70 }, + { "So", +1, So_range16, 114, So_range32, 72 }, { "Sogdian", +1, 0, 0, Sogdian_range32, 1 }, { "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 }, { "Soyombo", +1, 0, 0, Soyombo_range32, 1 }, @@ -6242,15 +6407,18 @@ const UGroup unicode_groups[] = { { "Tai_Viet", +1, Tai_Viet_range16, 2, 0, 0 }, { "Takri", +1, 0, 0, Takri_range32, 2 }, { "Tamil", +1, Tamil_range16, 16, Tamil_range32, 2 }, + { "Tangsa", +1, 0, 0, Tangsa_range32, 2 }, { "Tangut", +1, 0, 0, Tangut_range32, 4 }, - { "Telugu", +1, Telugu_range16, 12, 0, 0 }, + { "Telugu", +1, Telugu_range16, 13, 0, 0 }, { "Thaana", +1, Thaana_range16, 1, 0, 0 }, { "Thai", +1, Thai_range16, 2, 0, 0 }, { "Tibetan", +1, Tibetan_range16, 7, 0, 0 }, { "Tifinagh", +1, Tifinagh_range16, 3, 0, 0 }, { "Tirhuta", +1, 0, 0, Tirhuta_range32, 2 }, + { "Toto", +1, 0, 0, Toto_range32, 1 }, { "Ugaritic", +1, 0, 0, Ugaritic_range32, 2 }, { "Vai", +1, Vai_range16, 1, 0, 0 }, + { "Vithkuqi", +1, 0, 0, Vithkuqi_range32, 8 }, { "Wancho", +1, 0, 0, Wancho_range32, 2 }, { "Warang_Citi", +1, 0, 0, Warang_Citi_range32, 2 }, { "Yezidi", +1, 0, 0, Yezidi_range32, 3 }, @@ -6261,7 +6429,7 @@ const UGroup unicode_groups[] = { { "Zp", +1, Zp_range16, 1, 0, 0 }, { "Zs", +1, Zs_range16, 7, 0, 0 }, }; -const int num_unicode_groups = 192; +const int num_unicode_groups = 197; } // namespace re2 diff --git a/re2/walker-inl.h b/re2/walker-inl.h index 8e0f946..4d064a0 100644 --- a/re2/walker-inl.h +++ b/re2/walker-inl.h @@ -148,7 +148,8 @@ template void Regexp::Walker::Reset() { if (!stack_.empty()) { LOG(DFATAL) << "Stack not empty."; while (!stack_.empty()) { - delete[] stack_.top().child_args; + if (stack_.top().re->nsub_ > 1) + delete[] stack_.top().child_args; stack_.pop(); } } @@ -169,7 +170,7 @@ template T Regexp::Walker::WalkInternal(Regexp* re, T top_arg, for (;;) { T t; s = &stack_.top(); - Regexp* re = s->re; + re = s->re; switch (s->n) { case -1: { if (--max_visits_ < 0) { -- 2.7.4