From c04192c9e5fef3601690a75a4d3dd197c79aaf5b Mon Sep 17 00:00:00 2001 From: DongHun Kwak Date: Mon, 21 Nov 2016 16:59:01 +0900 Subject: [PATCH] Imported Upstream version 20160801 Change-Id: I3f2d32dbe405230b6c3d4c4d4c3156eac24d419d Signed-off-by: DongHun Kwak --- BUILD | 1 + Makefile | 1 + re2/dfa.cc | 92 +++++++--------- re2/filtered_re2.h | 6 +- re2/nfa.cc | 2 +- re2/onepass.cc | 58 +++++------ re2/parse.cc | 6 +- re2/prefilter.h | 6 +- re2/prefilter_tree.h | 6 +- re2/prog.cc | 225 +++++++++++++++++++++++++++++++--------- re2/prog.h | 49 ++------- re2/re2.cc | 66 ++++++------ re2/re2.h | 127 +++++++++++++++-------- re2/regexp.h | 8 +- re2/set.h | 6 +- re2/stringpiece.h | 8 +- re2/testing/compile_test.cc | 122 +++++++++++++--------- re2/testing/exhaustive_tester.h | 6 +- re2/testing/re2_arg_test.cc | 12 +-- re2/testing/re2_test.cc | 12 +++ re2/testing/regexp_generator.h | 8 +- re2/testing/string_generator.h | 8 +- re2/testing/tester.h | 8 +- re2/unicode_casefold.h | 8 +- re2/unicode_groups.h | 8 +- re2/walker-inl.h | 10 +- runtests | 2 +- util/benchmark.cc | 4 + util/benchmark.h | 8 +- util/bitmap.h | 92 ++++++++++++++++ util/flags.h | 8 +- util/logging.h | 8 +- util/mutex.h | 8 +- util/pcre.cc | 64 ++++++------ util/pcre.h | 60 ++++++----- util/random.h | 8 +- util/sparse_array.h | 8 +- util/sparse_set.h | 8 +- util/test.h | 6 +- util/thread.h | 6 +- util/utf.h | 7 +- util/util.h | 15 ++- util/valgrind.h | 7 +- 43 files changed, 726 insertions(+), 462 deletions(-) create mode 100644 util/bitmap.h diff --git a/BUILD b/BUILD index 3376943..5798eae 100644 --- a/BUILD +++ b/BUILD @@ -38,6 +38,7 @@ cc_library( "re2/unicode_groups.cc", "re2/unicode_groups.h", "re2/walker-inl.h", + "util/bitmap.h", "util/flags.h", "util/hash.cc", "util/logging.cc", diff --git a/Makefile b/Makefile index d498296..7ddf8b2 100644 --- a/Makefile +++ b/Makefile @@ -78,6 +78,7 @@ INSTALL_HFILES=\ HFILES=\ util/benchmark.h\ + util/bitmap.h\ util/flags.h\ util/logging.h\ util/mutex.h\ diff --git a/re2/dfa.cc b/re2/dfa.cc index e33bb01..fba7b64 100644 --- a/re2/dfa.cc +++ b/re2/dfa.cc @@ -114,25 +114,6 @@ class DFA { kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left }; -#ifndef STL_MSVC - // STL function structures for use with unordered_set. - struct StateEqual { - bool operator()(const State* a, const State* b) const { - if (a == b) - return true; - if (a == NULL || b == NULL) - return false; - if (a->ninst_ != b->ninst_) - return false; - if (a->flag_ != b->flag_) - return false; - for (int i = 0; i < a->ninst_; i++) - if (a->inst_[i] != b->inst_[i]) - return false; - return true; // they're equal - } - }; -#endif // STL_MSVC struct StateHash { size_t operator()(const State* a) const { if (a == NULL) @@ -140,39 +121,30 @@ class DFA { const char* s = reinterpret_cast(a->inst_); int len = a->ninst_ * sizeof a->inst_[0]; if (sizeof(size_t) == sizeof(uint32)) - return Hash32StringWithSeed(s, len, a->flag_); + return static_cast(Hash32StringWithSeed(s, len, a->flag_)); else return static_cast(Hash64StringWithSeed(s, len, a->flag_)); } -#ifdef STL_MSVC - // Less than operator. + }; + + struct StateEqual { bool operator()(const State* a, const State* b) const { if (a == b) - return false; + return true; if (a == NULL || b == NULL) - return a == NULL; + return false; if (a->ninst_ != b->ninst_) - return a->ninst_ < b->ninst_; + return false; if (a->flag_ != b->flag_) - return a->flag_ < b->flag_; - for (int i = 0; i < a->ninst_; ++i) + return false; + for (int i = 0; i < a->ninst_; i++) if (a->inst_[i] != b->inst_[i]) - return a->inst_[i] < b->inst_[i]; - return false; // they're equal + return false; + return true; // they're equal } - // The two public members are required by msvc. 4 and 8 are default values. - // Reference: http://msdn.microsoft.com/en-us/library/1s1byw77.aspx - static const size_t bucket_size = 4; - static const size_t min_buckets = 8; -#endif // STL_MSVC }; -#ifdef STL_MSVC - typedef unordered_set StateSet; -#else // !STL_MSVC typedef unordered_set StateSet; -#endif // STL_MSVC - private: // Special "firstbyte" values for a state. (Values >= 0 denote actual bytes.) @@ -645,7 +617,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) { fprintf(stderr, " -> FullMatchState\n"); return FullMatchState; } - // Fall through. + FALLTHROUGH_INTENDED; default: // Record iff id is the head of its list, which must // be the case if id-1 is the last of *its* list. :) @@ -726,7 +698,12 @@ DFA::State* DFA::CachedState(int* inst, int ninst, uint flag) { mutex_.AssertHeld(); // Look in the cache for a pre-existing state. - State state = {inst, ninst, flag}; + // We have to initialise the struct like this because otherwise + // MSVC will complain about the flexible array member. :( + State state; + state.inst_ = inst; + state.ninst_ = ninst; + state.flag_ = flag; StateSet::iterator it = state_cache_.find(&state); if (it != state_cache_.end()) { if (DebugDFA) @@ -770,16 +747,15 @@ DFA::State* DFA::CachedState(int* inst, int ninst, uint flag) { // Clear the cache. Must hold cache_mutex_.w or be in destructor. void DFA::ClearCache() { - // In case state_cache_ doesn't support deleting entries - // during iteration, copy into a vector and then delete. - vector v; - v.reserve(state_cache_.size()); - for (StateSet::iterator it = state_cache_.begin(); - it != state_cache_.end(); ++it) - v.push_back(*it); + StateSet::iterator begin = state_cache_.begin(); + StateSet::iterator end = state_cache_.end(); + while (begin != end) { + StateSet::iterator tmp = begin; + ++begin; + // Deallocate the blob of memory that we allocated in DFA::CachedState(). + delete[] reinterpret_cast(*tmp); + } state_cache_.clear(); - for (size_t i = 0; i < v.size(); i++) - delete[] reinterpret_cast(v[i]); } // Copies insts in state s to the work queue q. @@ -1716,7 +1692,7 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, State* s = RunStateOnByte(info->start, i); if (s == NULL) { // Synchronize with "quick check" above. - info->firstbyte.store(firstbyte, std::memory_order_release); + info->firstbyte.store(kFbUnknown, std::memory_order_release); return false; } if (s == info->start) @@ -1809,15 +1785,19 @@ DFA* Prog::GetDFA(MatchKind kind) { return dfa; // For a forward DFA, half the memory goes to each DFA. + // However, if it is a "many match" DFA, then there is + // no counterpart with which the memory must be shared. + // // For a reverse DFA, all the memory goes to the // "longest match" DFA, because RE2 never does reverse // "first match" searches. - int64 m = dfa_mem_/2; + int64 m = dfa_mem_; if (reversed_) { - if (kind == kLongestMatch || kind == kManyMatch) - m = dfa_mem_; - else - m = 0; + DCHECK_EQ(kind, kLongestMatch); + } else if (kind == kFirstMatch || kind == kLongestMatch) { + m /= 2; + } else { + DCHECK_EQ(kind, kManyMatch); } dfa = new DFA(this, kind, m); diff --git a/re2/filtered_re2.h b/re2/filtered_re2.h index f4b2be4..1035a12 100644 --- a/re2/filtered_re2.h +++ b/re2/filtered_re2.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_FILTERED_RE2_H_ +#define RE2_FILTERED_RE2_H_ + // The class FilteredRE2 is used as a wrapper to multiple RE2 regexps. // It provides a prefilter mechanism that helps in cutting down the // number of regexps that need to be actually searched. @@ -18,9 +21,6 @@ // indices of strings that were found in the text to get the actual // regexp matches. -#ifndef RE2_FILTERED_RE2_H_ -#define RE2_FILTERED_RE2_H_ - #include #include "re2/re2.h" diff --git a/re2/nfa.cc b/re2/nfa.cc index f084e60..1de5aa7 100644 --- a/re2/nfa.cc +++ b/re2/nfa.cc @@ -288,7 +288,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag, case kInstByteRange: if (!ip->Matches(c)) goto Next; - // Fallthrough intended. + FALLTHROUGH_INTENDED; case kInstMatch: // Save state; will pick up at next byte. diff --git a/re2/onepass.cc b/re2/onepass.cc index a6e3321..da90a86 100644 --- a/re2/onepass.cc +++ b/re2/onepass.cc @@ -191,13 +191,10 @@ static void ApplyCaptures(uint32 cond, const char* p, cap[i] = p; } -// Compute a node pointer. -// Basically (OneState*)(nodes + statesize*nodeindex) -// but the version with the C++ casts overflows 80 characters (and is ugly). -static inline OneState* IndexToNode(volatile uint8* nodes, int statesize, +// Computes the OneState* for the given nodeindex. +static inline OneState* IndexToNode(uint8* nodes, int statesize, int nodeindex) { - return reinterpret_cast( - const_cast(nodes + statesize*nodeindex)); + return reinterpret_cast(nodes + statesize*nodeindex); } bool Prog::SearchOnePass(const StringPiece& text, @@ -233,13 +230,10 @@ bool Prog::SearchOnePass(const StringPiece& text, if (anchor_end()) kind = kFullMatch; - // State and act are marked volatile to - // keep the compiler from re-ordering the - // memory accesses walking over the NFA. - // This is worth about 5%. - volatile OneState* state = onepass_start_; - volatile uint8* nodes = onepass_nodes_; - volatile uint32 statesize = onepass_statesize_; + uint8* nodes = onepass_nodes_; + int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32); + // start() is always mapped to the zeroth OneState. + OneState* state = IndexToNode(nodes, statesize, 0); uint8* bytemap = bytemap_; const char* bp = text.begin(); const char* ep = text.end(); @@ -374,7 +368,7 @@ struct InstCond { // Constructs and saves corresponding one-pass NFA on success. bool Prog::IsOnePass() { if (did_onepass_) - return onepass_start_ != NULL; + return onepass_nodes_ != NULL; did_onepass_ = true; if (start() == 0) // no match @@ -385,7 +379,7 @@ bool Prog::IsOnePass() { // Limit max node count to 65000 as a conservative estimate to // avoid overflowing 16-bit node index in encoding. int maxnodes = 2 + inst_count(kInstByteRange); - int statesize = sizeof(OneState) + bytemap_range_*sizeof(uint32); + int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32); if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes) return false; @@ -401,18 +395,20 @@ bool Prog::IsOnePass() { int* nodebyid = new int[size]; // indexed by ip memset(nodebyid, 0xFF, size*sizeof nodebyid[0]); - uint8* nodes = new uint8[maxnodes*statesize]; - uint8* nodep = nodes; + // Originally, nodes was a uint8[maxnodes*statesize], but that was + // unnecessarily optimistic: why allocate a large amount of memory + // upfront for a large program when it is unlikely to be one-pass? + vector nodes; Instq tovisit(size), workq(size); AddQ(&tovisit, start()); nodebyid[start()] = 0; - nodep += statesize; int nalloc = 1; + nodes.insert(nodes.end(), statesize, 0); for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { int id = *it; int nodeindex = nodebyid[id]; - OneState* node = IndexToNode(nodes, statesize, nodeindex); + OneState* node = IndexToNode(nodes.data(), statesize, nodeindex); // Flood graph using manual stack, filling in actions as found. // Default is none. @@ -452,14 +448,16 @@ bool Prog::IsOnePass() { if (nalloc >= maxnodes) { if (Debug) LOG(ERROR) << StringPrintf( - "Not OnePass: hit node limit %d > %d", nalloc, maxnodes); + "Not OnePass: hit node limit %d >= %d", nalloc, maxnodes); goto fail; } nextindex = nalloc; - nodep += statesize; - nodebyid[ip->out()] = nextindex; - nalloc++; AddQ(&tovisit, ip->out()); + nodebyid[ip->out()] = nalloc; + nalloc++; + nodes.insert(nodes.end(), statesize, 0); + // Update node because it might have been invalidated. + node = IndexToNode(nodes.data(), statesize, nodeindex); } for (int c = ip->lo(); c <= ip->hi(); c++) { int b = bytemap_[c]; @@ -587,7 +585,7 @@ bool Prog::IsOnePass() { int nodeindex = nodebyid[id]; if (nodeindex == -1) continue; - OneState* node = IndexToNode(nodes, statesize, nodeindex); + OneState* node = IndexToNode(nodes.data(), statesize, nodeindex); StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n", nodeindex, id, node->matchcond); for (int i = 0; i < bytemap_range_; i++) { @@ -602,16 +600,9 @@ bool Prog::IsOnePass() { LOG(ERROR) << "nodes:\n" << dump; } - // Overallocated earlier; cut down to actual size. - nodep = new uint8[nalloc*statesize]; - memmove(nodep, nodes, nalloc*statesize); - delete[] nodes; - nodes = nodep; - - onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]); - onepass_nodes_ = nodes; - onepass_statesize_ = statesize; dfa_mem_ -= nalloc*statesize; + onepass_nodes_ = new uint8[nalloc*statesize]; + memmove(onepass_nodes_, nodes.data(), nalloc*statesize); delete[] stack; delete[] nodebyid; @@ -620,7 +611,6 @@ bool Prog::IsOnePass() { fail: delete[] stack; delete[] nodebyid; - delete[] nodes; return false; } diff --git a/re2/parse.cc b/re2/parse.cc index f55dc8c..9cd9cc1 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -284,7 +284,7 @@ Rune ApplyFold(const CaseFold *f, Rune r) { case EvenOddSkip: // even <-> odd but only applies to every other if ((r - f->lo) % 2) return r; - // fall through + FALLTHROUGH_INTENDED; case EvenOdd: // even <-> odd if (r%2 == 0) return r + 1; @@ -293,7 +293,7 @@ Rune ApplyFold(const CaseFold *f, Rune r) { case OddEvenSkip: // odd <-> even but only applies to every other if ((r - f->lo) % 2) return r; - // fall through + FALLTHROUGH_INTENDED; case OddEven: // odd <-> even if (r%2 == 1) return r + 1; @@ -1354,7 +1354,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, // Single non-zero octal digit is a backreference; not supported. if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7') goto BadEscape; - // fall through + FALLTHROUGH_INTENDED; case '0': // consume up to three octal digits; already have one. code = c - '0'; diff --git a/re2/prefilter.h b/re2/prefilter.h index 531c189..e58efe8 100644 --- a/re2/prefilter.h +++ b/re2/prefilter.h @@ -2,13 +2,13 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_PREFILTER_H_ +#define RE2_PREFILTER_H_ + // Prefilter is the class used to extract string guards from regexps. // Rather than using Prefilter class directly, use FilteredRE2. // See filtered_re2.h -#ifndef RE2_PREFILTER_H_ -#define RE2_PREFILTER_H_ - #include "util/util.h" namespace re2 { diff --git a/re2/prefilter_tree.h b/re2/prefilter_tree.h index abea55d..a8ec589 100644 --- a/re2/prefilter_tree.h +++ b/re2/prefilter_tree.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_PREFILTER_TREE_H_ +#define RE2_PREFILTER_TREE_H_ + // The PrefilterTree class is used to form an AND-OR tree of strings // that would trigger each regexp. The 'prefilter' of each regexp is // added tp PrefilterTree, and then PrefilterTree is used to find all @@ -12,9 +15,6 @@ // favorite engine. PrefilterTree provides a set of strings (called // atoms) that the user of this class should use to do the string // matching. -// -#ifndef RE2_PREFILTER_TREE_H_ -#define RE2_PREFILTER_TREE_H_ #include "util/util.h" #include "util/sparse_array.h" diff --git a/re2/prog.cc b/re2/prog.cc index 9098dd4..5d8dd6c 100644 --- a/re2/prog.cc +++ b/re2/prog.cc @@ -6,6 +6,7 @@ // Tested by compile_test.cc #include "util/util.h" +#include "util/bitmap.h" #include "re2/prog.h" #include "re2/stringpiece.h" @@ -101,13 +102,12 @@ Prog::Prog() bytemap_range_(0), first_byte_(-1), flags_(0), - onepass_statesize_(0), + list_count_(0), inst_(NULL), + onepass_nodes_(NULL), dfa_first_(NULL), dfa_longest_(NULL), - dfa_mem_(0), - onepass_nodes_(NULL), - onepass_start_(NULL) { + dfa_mem_(0) { } Prog::~Prog() { @@ -313,70 +313,197 @@ uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) { return flags; } +// ByteMapBuilder implements a coloring algorithm. +// +// The first phase is a series of "mark and merge" batches: we mark one or more +// [lo-hi] ranges, then merge them into our internal state. Batching is not for +// performance; rather, it means that the ranges are treated indistinguishably. +// +// Internally, the ranges are represented using a bitmap that stores the splits +// and a vector that stores the colors; both of them are indexed by the ranges' +// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at +// hi (if not already split), then recolor each range in between. The color map +// (i.e. from the old color to the new color) is maintained for the lifetime of +// the batch and so underpins this somewhat obscure approach to set operations. +// +// The second phase builds the bytemap from our internal state: we recolor each +// range, then store the new color (which is now the byte class) in each of the +// corresponding array elements. Finally, we output the number of byte classes. +class ByteMapBuilder { + public: + ByteMapBuilder() { + // Initial state: the [0-255] range has color 256. + // This will avoid problems during the second phase, + // in which we assign byte classes numbered from 0. + splits_.Set(255); + colors_.resize(256); + colors_[255] = 256; + nextcolor_ = 257; + } + + void Mark(int lo, int hi); + void Merge(); + void Build(uint8* bytemap, int* bytemap_range); + + private: + int Recolor(int oldcolor); + + Bitmap256 splits_; + vector colors_; + int nextcolor_; + vector> colormap_; + vector> ranges_; + + DISALLOW_COPY_AND_ASSIGN(ByteMapBuilder); +}; + +void ByteMapBuilder::Mark(int lo, int hi) { + DCHECK_GE(lo, 0); + DCHECK_GE(hi, 0); + DCHECK_LE(lo, 255); + DCHECK_LE(hi, 255); + DCHECK_LE(lo, hi); + + // Ignore any [0-255] ranges. They cause us to recolor every range, which + // has no effect on the eventual result and is therefore a waste of time. + if (lo == 0 && hi == 255) + return; + + ranges_.emplace_back(lo, hi); +} + +void ByteMapBuilder::Merge() { + for (vector>::const_iterator it = ranges_.begin(); + it != ranges_.end(); + ++it) { + int lo = it->first-1; + int hi = it->second; + + if (0 <= lo && !splits_.Test(lo)) { + splits_.Set(lo); + int next = splits_.FindNextSetBit(lo+1); + colors_[lo] = colors_[next]; + } + if (!splits_.Test(hi)) { + splits_.Set(hi); + int next = splits_.FindNextSetBit(hi+1); + colors_[hi] = colors_[next]; + } + + int c = lo+1; + while (c < 256) { + int next = splits_.FindNextSetBit(c); + colors_[next] = Recolor(colors_[next]); + if (next == hi) + break; + c = next+1; + } + } + colormap_.clear(); + ranges_.clear(); +} + +void ByteMapBuilder::Build(uint8* bytemap, int* bytemap_range) { + // Assign byte classes numbered from 0. + nextcolor_ = 0; + + int c = 0; + while (c < 256) { + int next = splits_.FindNextSetBit(c); + uint8 b = static_cast(Recolor(colors_[next])); + while (c <= next) { + bytemap[c] = b; + c++; + } + } + + *bytemap_range = nextcolor_; +} + +int ByteMapBuilder::Recolor(int oldcolor) { + // Yes, this is a linear search. There can be at most 256 + // colors and there will typically be far fewer than that. + // Also, we need to consider keys *and* values in order to + // avoid recoloring a given range more than once per batch. + vector>::const_iterator it = + std::find_if(colormap_.begin(), colormap_.end(), + [&](const pair& kv) -> bool { + return kv.first == oldcolor || kv.second == oldcolor; + }); + if (it != colormap_.end()) + return it->second; + int newcolor = nextcolor_; + nextcolor_++; + colormap_.emplace_back(oldcolor, newcolor); + return newcolor; +} + void Prog::ComputeByteMap() { - // Fill in byte map with byte classes for the program. + // Fill in bytemap with byte classes for the program. // Ranges of bytes that are treated indistinguishably - // are mapped to a single byte class. - Bitmap<256> v; + // will be mapped to a single byte class. + ByteMapBuilder builder; + // Don't repeat the work for ^ and $. + bool marked_line_boundaries = false; // Don't repeat the work for \b and \B. - bool done_word_boundaries = false; + bool marked_word_boundaries = false; for (int id = 0; id < static_cast(size()); id++) { Inst* ip = inst(id); if (ip->opcode() == kInstByteRange) { int lo = ip->lo(); int hi = ip->hi(); - if (0 < lo) - v.Set(lo - 1); - v.Set(hi); + builder.Mark(lo, hi); if (ip->foldcase() && lo <= 'z' && hi >= 'a') { - if (lo < 'a') - lo = 'a'; - if (hi > 'z') - hi = 'z'; - if (lo <= hi) { - v.Set(lo + 'A' - 'a' - 1); - v.Set(hi + 'A' - 'a'); - } + int foldlo = lo; + int foldhi = hi; + if (foldlo < 'a') + foldlo = 'a'; + if (foldhi > 'z') + foldhi = 'z'; + if (foldlo <= foldhi) + builder.Mark(foldlo + 'A' - 'a', foldhi + 'A' - 'a'); } + // If this Inst is not the last Inst in its list AND the next Inst is + // also a ByteRange AND the Insts have the same out, defer the merge. + if (!ip->last() && + inst(id+1)->opcode() == kInstByteRange && + ip->out() == inst(id+1)->out()) + continue; + builder.Merge(); } else if (ip->opcode() == kInstEmptyWidth) { - if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine)) { - v.Set('\n' - 1); - v.Set('\n'); + if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) && + !marked_line_boundaries) { + builder.Mark('\n', '\n'); + builder.Merge(); + marked_line_boundaries = true; } - if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { - if (done_word_boundaries) - continue; - int j; - for (int i = 0; i < 256; i = j) { - for (j = i + 1; j < 256 && - Prog::IsWordChar(static_cast(i)) == - Prog::IsWordChar(static_cast(j)); - j++) - ; - if (0 < i) - v.Set(i - 1); - v.Set(j - 1); + if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) && + !marked_word_boundaries) { + // We require two batches here: the first for ranges that are word + // characters, the second for ranges that are not word characters. + for (bool isword : {true, false}) { + int j; + for (int i = 0; i < 256; i = j) { + for (j = i + 1; j < 256 && + Prog::IsWordChar(static_cast(i)) == + Prog::IsWordChar(static_cast(j)); + j++) + ; + if (Prog::IsWordChar(static_cast(i)) == isword) + builder.Mark(i, j - 1); + } + builder.Merge(); } - done_word_boundaries = true; + marked_word_boundaries = true; } } } - COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize); - uint8 n = 0; - uint32 bits = 0; - for (int i = 0; i < 256; i++) { - if ((i & 31) == 0) - bits = v.Word(i >> 5); - bytemap_[i] = n; - n += bits & 1; - bits >>= 1; - } - bytemap_range_ = bytemap_[255] + 1; + builder.Build(bytemap_, &bytemap_range_); - if (0) { // For debugging: use trivial byte map. + if (0) { // For debugging: use trivial bytemap. for (int i = 0; i < 256; i++) bytemap_[i] = static_cast(i); bytemap_range_ = 256; @@ -536,7 +663,7 @@ void Prog::EmitList(int root, SparseArray* rootmap, vector* flat, flat->back().set_opcode(kInstAltMatch); flat->back().set_out(static_cast(flat->size())); flat->back().out1_ = static_cast(flat->size())+1; - // Fall through. + FALLTHROUGH_INTENDED; case kInstAlt: stk->push_back(ip->out1()); diff --git a/re2/prog.h b/re2/prog.h index d5c8616..e1c7249 100644 --- a/re2/prog.h +++ b/re2/prog.h @@ -2,13 +2,13 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_PROG_H_ +#define RE2_PROG_H_ + // Compiled representation of regular expressions. // See regexp.h for the Regexp class, which represents a regular // expression symbolically. -#ifndef RE2_PROG_H__ -#define RE2_PROG_H__ - #include "util/util.h" #include "util/sparse_array.h" #include "util/sparse_set.h" @@ -16,38 +16,6 @@ namespace re2 { -// Simple fixed-size bitmap. -template -class Bitmap { - public: - Bitmap() { Reset(); } - int Size() { return Bits; } - - void Reset() { - for (int i = 0; i < Words; i++) - w_[i] = 0; - } - bool Get(int k) const { - return w_[k >> WordLog] & (1<<(k & 31)); - } - void Set(int k) { - w_[k >> WordLog] |= 1<<(k & 31); - } - void Clear(int k) { - w_[k >> WordLog] &= ~(1<<(k & 31)); - } - uint32 Word(int i) const { - return w_[i]; - } - - private: - static const int WordLog = 5; - static const int Words = (Bits+31)/32; - uint32 w_[Words]; - DISALLOW_COPY_AND_ASSIGN(Bitmap); -}; - - // Opcodes for Inst enum InstOp { kInstAlt = 0, // choose between out_ and out1_ @@ -291,7 +259,7 @@ class Prog { // for testing purposes. Returns number of states. int BuildEntireDFA(MatchKind kind); - // Compute byte map. + // Compute bytemap. void ComputeByteMap(); // Computes whether all matches must begin with the same first @@ -385,22 +353,19 @@ class Prog { int bytemap_range_; // bytemap_[x] < bytemap_range_ int first_byte_; // required first byte for match, or -1 if none int flags_; // regexp parse flags - int onepass_statesize_; // byte size of each OneState* node int list_count_; // count of lists (see above) int inst_count_[kNumInst]; // count of instructions by opcode Inst* inst_; // pointer to instruction array + uint8* onepass_nodes_; // data for OnePass nodes Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_ std::atomic dfa_first_; // DFA cached for kFirstMatch std::atomic dfa_longest_; // DFA cached for kLongestMatch and kFullMatch int64 dfa_mem_; // Maximum memory for DFAs. - uint8 bytemap_[256]; // map from input bytes to byte classes - - uint8* onepass_nodes_; // data for OnePass nodes - OneState* onepass_start_; // start node for OnePass program + uint8 bytemap_[256]; // map from input bytes to byte classes std::once_flag first_byte_once_; @@ -409,4 +374,4 @@ class Prog { } // namespace re2 -#endif // RE2_PROG_H__ +#endif // RE2_PROG_H_ diff --git a/re2/re2.cc b/re2/re2.cc index 6bc5edd..49388c7 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -962,6 +962,13 @@ bool RE2::Arg::parse_char(const char* str, int n, void* dest) { return true; } +bool RE2::Arg::parse_schar(const char* str, int n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = str[0]; + return true; +} + bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { if (n != 1) return false; if (dest == NULL) return true; @@ -1074,8 +1081,8 @@ bool RE2::Arg::parse_short_radix(const char* str, void* dest, int radix) { long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((short)r != r) return false; // Out of range + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range if (dest == NULL) return true; *(reinterpret_cast(dest)) = (short)r; return true; @@ -1086,10 +1093,10 @@ bool RE2::Arg::parse_ushort_radix(const char* str, void* dest, int radix) { unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((ushort)r != r) return false; // Out of range + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned short)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = (ushort)r; + *(reinterpret_cast(dest)) = (unsigned short)r; return true; } @@ -1098,10 +1105,10 @@ bool RE2::Arg::parse_int_radix(const char* str, void* dest, int radix) { long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((int)r != r) return false; // Out of range + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (int)r; return true; } @@ -1110,14 +1117,13 @@ bool RE2::Arg::parse_uint_radix(const char* str, void* dest, int radix) { unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((uint)r != r) return false; // Out of range + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned int)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (unsigned int)r; return true; } -#if RE2_HAVE_LONGLONG bool RE2::Arg::parse_longlong_radix(const char* str, int n, void* dest, @@ -1156,7 +1162,6 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str, *(reinterpret_cast(dest)) = r; return true; } -#endif static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) { if (n == 0) return false; @@ -1190,30 +1195,29 @@ bool RE2::Arg::parse_float(const char* str, int n, void* dest) { return parse_double_float(str, n, true, dest); } - -#define DEFINE_INTEGER_PARSERS(name) \ +#define DEFINE_INTEGER_PARSER(name) \ bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 10); \ - } \ + return parse_##name##_radix(str, n, dest, 10); \ + } \ bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 16); \ - } \ + return parse_##name##_radix(str, n, dest, 16); \ + } \ bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 8); \ - } \ + return parse_##name##_radix(str, n, dest, 8); \ + } \ bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 0); \ + return parse_##name##_radix(str, n, dest, 0); \ } -DEFINE_INTEGER_PARSERS(short); -DEFINE_INTEGER_PARSERS(ushort); -DEFINE_INTEGER_PARSERS(int); -DEFINE_INTEGER_PARSERS(uint); -DEFINE_INTEGER_PARSERS(long); -DEFINE_INTEGER_PARSERS(ulong); -DEFINE_INTEGER_PARSERS(longlong); -DEFINE_INTEGER_PARSERS(ulonglong); +DEFINE_INTEGER_PARSER(short); +DEFINE_INTEGER_PARSER(ushort); +DEFINE_INTEGER_PARSER(int); +DEFINE_INTEGER_PARSER(uint); +DEFINE_INTEGER_PARSER(long); +DEFINE_INTEGER_PARSER(ulong); +DEFINE_INTEGER_PARSER(longlong); +DEFINE_INTEGER_PARSER(ulonglong); -#undef DEFINE_INTEGER_PARSERS +#undef DEFINE_INTEGER_PARSER } // namespace re2 diff --git a/re2/re2.h b/re2/re2.h index d9139b2..cc35736 100644 --- a/re2/re2.h +++ b/re2/re2.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_RE2_H -#define RE2_RE2_H +#ifndef RE2_RE2_H_ +#define RE2_RE2_H_ // C++ interface to the re2 regular-expression library. // RE2 supports Perl-style regular expressions (with extensions like @@ -185,10 +185,6 @@ #include #include "re2/stringpiece.h" -#ifndef RE2_HAVE_LONGLONG -#define RE2_HAVE_LONGLONG 1 -#endif - namespace re2 { using std::string; @@ -686,10 +682,8 @@ class RE2 { static inline Arg CRadix(unsigned int* x); static inline Arg CRadix(long* x); static inline Arg CRadix(unsigned long* x); - #if RE2_HAVE_LONGLONG static inline Arg CRadix(long long* x); static inline Arg CRadix(unsigned long long* x); - #endif static inline Arg Hex(short* x); static inline Arg Hex(unsigned short* x); @@ -697,10 +691,8 @@ class RE2 { static inline Arg Hex(unsigned int* x); static inline Arg Hex(long* x); static inline Arg Hex(unsigned long* x); - #if RE2_HAVE_LONGLONG static inline Arg Hex(long long* x); static inline Arg Hex(unsigned long long* x); - #endif static inline Arg Octal(short* x); static inline Arg Octal(unsigned short* x); @@ -708,10 +700,8 @@ class RE2 { static inline Arg Octal(unsigned int* x); static inline Arg Octal(long* x); static inline Arg Octal(unsigned long* x); - #if RE2_HAVE_LONGLONG static inline Arg Octal(long long* x); static inline Arg Octal(unsigned long long* x); - #endif private: void Init(const StringPiece& pattern, const Options& options); @@ -783,28 +773,26 @@ class RE2::Arg { typedef bool (*Parser)(const char* str, int n, void* dest); // Type-specific parsers -#define MAKE_PARSER(type,name) \ - Arg(type* p) : arg_(p), parser_(name) { } \ - Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \ - +#define MAKE_PARSER(type, name) \ + Arg(type* p) : arg_(p), parser_(name) {} \ + Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} MAKE_PARSER(char, parse_char); - MAKE_PARSER(signed char, parse_char); + MAKE_PARSER(signed char, parse_schar); MAKE_PARSER(unsigned char, parse_uchar); + MAKE_PARSER(float, parse_float); + MAKE_PARSER(double, parse_double); + MAKE_PARSER(string, parse_string); + MAKE_PARSER(StringPiece, parse_stringpiece); + MAKE_PARSER(short, parse_short); MAKE_PARSER(unsigned short, parse_ushort); MAKE_PARSER(int, parse_int); MAKE_PARSER(unsigned int, parse_uint); MAKE_PARSER(long, parse_long); MAKE_PARSER(unsigned long, parse_ulong); - #if RE2_HAVE_LONGLONG MAKE_PARSER(long long, parse_longlong); MAKE_PARSER(unsigned long long, parse_ulonglong); - #endif - MAKE_PARSER(float, parse_float); - MAKE_PARSER(double, parse_double); - MAKE_PARSER(string, parse_string); - MAKE_PARSER(StringPiece, parse_stringpiece); #undef MAKE_PARSER @@ -823,21 +811,23 @@ class RE2::Arg { static bool parse_null (const char* str, int n, void* dest); static bool parse_char (const char* str, int n, void* dest); + static bool parse_schar (const char* str, int n, void* dest); static bool parse_uchar (const char* str, int n, void* dest); static bool parse_float (const char* str, int n, void* dest); static bool parse_double (const char* str, int n, void* dest); static bool parse_string (const char* str, int n, void* dest); static bool parse_stringpiece (const char* str, int n, void* dest); -#define DECLARE_INTEGER_PARSER(name) \ - private: \ - static bool parse_ ## name(const char* str, int n, void* dest); \ - static bool parse_ ## name ## _radix( \ - const char* str, int n, void* dest, int radix); \ - public: \ - static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \ - static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \ - static bool parse_ ## name ## _cradix(const char* str, int n, void* dest) +#define DECLARE_INTEGER_PARSER(name) \ + private: \ + static bool parse_##name(const char* str, int n, void* dest); \ + static bool parse_##name##_radix(const char* str, int n, void* dest, \ + int radix); \ + \ + public: \ + static bool parse_##name##_hex(const char* str, int n, void* dest); \ + static bool parse_##name##_octal(const char* str, int n, void* dest); \ + static bool parse_##name##_cradix(const char* str, int n, void* dest) DECLARE_INTEGER_PARSER(short); DECLARE_INTEGER_PARSER(ushort); @@ -845,12 +835,11 @@ class RE2::Arg { DECLARE_INTEGER_PARSER(uint); DECLARE_INTEGER_PARSER(long); DECLARE_INTEGER_PARSER(ulong); - #if RE2_HAVE_LONGLONG DECLARE_INTEGER_PARSER(longlong); DECLARE_INTEGER_PARSER(ulonglong); - #endif #undef DECLARE_INTEGER_PARSER + }; inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { } @@ -861,13 +850,16 @@ inline bool RE2::Arg::Parse(const char* str, int n) const { } // This part of the parser, appropriate only for ints, deals with bases -#define MAKE_INTEGER_PARSER(type, name) \ - inline RE2::Arg RE2::Hex(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \ - inline RE2::Arg RE2::Octal(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \ - inline RE2::Arg RE2::CRadix(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); } +#define MAKE_INTEGER_PARSER(type, name) \ + inline RE2::Arg RE2::Hex(type* ptr) { \ + return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \ + } \ + inline RE2::Arg RE2::Octal(type* ptr) { \ + return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \ + } \ + inline RE2::Arg RE2::CRadix(type* ptr) { \ + return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \ + } MAKE_INTEGER_PARSER(short, short) MAKE_INTEGER_PARSER(unsigned short, ushort) @@ -875,15 +867,62 @@ MAKE_INTEGER_PARSER(int, int) MAKE_INTEGER_PARSER(unsigned int, uint) MAKE_INTEGER_PARSER(long, long) MAKE_INTEGER_PARSER(unsigned long, ulong) -#if RE2_HAVE_LONGLONG MAKE_INTEGER_PARSER(long long, longlong) MAKE_INTEGER_PARSER(unsigned long long, ulonglong) -#endif #undef MAKE_INTEGER_PARSER +#ifndef SWIG +// Helper for writing global or static RE2s safely. +// Write +// static LazyRE2 re = {".*"}; +// and then use *re instead of writing +// static RE2 re(".*"); +// The former is more careful about multithreaded +// situations than the latter. +// +// N.B. This class never deletes the RE2 object that +// it constructs: that's a feature, so that it can be used +// for global and function static variables. +class LazyRE2 { + private: + struct NoArg {}; + + public: + typedef RE2 element_type; // support std::pointer_traits + + // Constructor omitted to preserve braced initialization in C++98. + + // Pretend to be a pointer to Type (never NULL due to on-demand creation): + RE2& operator*() const { return *get(); } + RE2* operator->() const { return get(); } + + // Named accessor/initializer: + RE2* get() const { + std::call_once(once_, [this]() { LazyRE2::Init(this); }); + return ptr_; + } + + // All data fields must be public to support {"foo"} initialization. + const char* pattern_; + RE2::CannedOptions options_; + NoArg barrier_against_excess_initializers_; + + mutable RE2* ptr_; + mutable std::once_flag once_; + + private: + static void Init(const LazyRE2* lazy_re2) { + lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_); + } + + void operator=(const LazyRE2&); // disallowed +}; +#endif // SWIG + } // namespace re2 using re2::RE2; +using re2::LazyRE2; -#endif /* RE2_RE2_H */ +#endif // RE2_RE2_H_ diff --git a/re2/regexp.h b/re2/regexp.h index 54720eb..607dc93 100644 --- a/re2/regexp.h +++ b/re2/regexp.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_REGEXP_H_ +#define RE2_REGEXP_H_ + // --- SPONSORED LINK -------------------------------------------------- // If you want to use this library for regular expression matching, // you should use re2/re2.h, which provides a class RE2 that @@ -83,9 +86,6 @@ // form accessible to clients, so that client code can analyze the // parsed regular expressions. -#ifndef RE2_REGEXP_H__ -#define RE2_REGEXP_H__ - #include "util/util.h" #include "re2/stringpiece.h" @@ -630,4 +630,4 @@ inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) } // namespace re2 -#endif // RE2_REGEXP_H__ +#endif // RE2_REGEXP_H_ diff --git a/re2/set.h b/re2/set.h index 5bcdb70..0e6d4b3 100644 --- a/re2/set.h +++ b/re2/set.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_SET_H -#define RE2_SET_H +#ifndef RE2_SET_H_ +#define RE2_SET_H_ #include #include @@ -52,4 +52,4 @@ class RE2::Set { } // namespace re2 -#endif // RE2_SET_H +#endif // RE2_SET_H_ diff --git a/re2/stringpiece.h b/re2/stringpiece.h index 1479d1a..0bf5b0c 100644 --- a/re2/stringpiece.h +++ b/re2/stringpiece.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_STRINGPIECE_H_ +#define RE2_STRINGPIECE_H_ + // A string-like object that points to a sized piece of memory. // // Functions or methods may use const StringPiece& parameters to accept either @@ -16,9 +19,6 @@ // // Arghh! I wish C++ literals were "string". -#ifndef STRINGS_STRINGPIECE_H__ -#define STRINGS_STRINGPIECE_H__ - #include #include #include @@ -182,4 +182,4 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) { // allow StringPiece to be logged extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece); -#endif // STRINGS_STRINGPIECE_H__ +#endif // RE2_STRINGPIECE_H_ diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc index 0c598e0..cd8406d 100644 --- a/re2/testing/compile_test.cc +++ b/re2/testing/compile_test.cc @@ -143,63 +143,89 @@ TEST(TestRegexpCompileToProg, Simple) { EXPECT_EQ(failed, 0); } -// The distinct byte ranges involved in the Latin-1 dot ([^\n]). -static struct Latin1ByteRange { - int lo; - int hi; -} latin1ranges[] = { - { 0x00, 0x09 }, - { 0x0A, 0x0A }, - { 0x0B, 0xFF }, -}; - -TEST(TestCompile, Latin1Ranges) { - Regexp* re = Regexp::Parse(".", Regexp::PerlX|Regexp::Latin1, NULL); +static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags, + string* bytemap) { + Regexp* re = Regexp::Parse(pattern, flags, NULL); EXPECT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(0); EXPECT_TRUE(prog != NULL); - EXPECT_EQ(prog->bytemap_range(), arraysize(latin1ranges)); - for (int i = 0; i < arraysize(latin1ranges); i++) - for (int j = latin1ranges[i].lo; j <= latin1ranges[i].hi; j++) - EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j; + *bytemap = prog->DumpByteMap(); delete prog; + re->Decref(); } -// The distinct byte ranges involved in the UTF-8 dot ([^\n]). -// Once, erroneously split between 0x3f and 0x40 because it is -// a 6-bit boundary. -static struct UTF8ByteRange { - int lo; - int hi; -} utf8ranges[] = { - { 0x00, 0x09 }, - { 0x0A, 0x0A }, - { 0x0B, 0x7F }, - { 0x80, 0x8F }, - { 0x90, 0x9F }, - { 0xA0, 0xBF }, - { 0xC0, 0xC1 }, - { 0xC2, 0xDF }, - { 0xE0, 0xE0 }, - { 0xE1, 0xEF }, - { 0xF0, 0xF0 }, - { 0xF1, 0xF3 }, - { 0xF4, 0xF4 }, - { 0xF5, 0xFF }, -}; +TEST(TestCompile, Latin1Ranges) { + // The distinct byte ranges involved in the Latin-1 dot ([^\n]). + + string bytemap; + + DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap); + EXPECT_EQ("[00-09] -> 0\n" + "[0a-0a] -> 1\n" + "[0b-ff] -> 0\n", + bytemap); +} + +TEST(TestCompile, OtherByteMapTests) { + string bytemap; + + // Test that "absent" ranges are mapped to the same byte class. + DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap); + EXPECT_EQ("[00-2f] -> 0\n" + "[30-39] -> 1\n" + "[3a-40] -> 0\n" + "[41-46] -> 1\n" + "[47-60] -> 0\n" + "[61-66] -> 1\n" + "[67-ff] -> 0\n", + bytemap); + + // Test the byte classes for \b. + DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap); + EXPECT_EQ("[00-2f] -> 0\n" + "[30-39] -> 1\n" + "[3a-40] -> 0\n" + "[41-5a] -> 1\n" + "[5b-5e] -> 0\n" + "[5f-5f] -> 1\n" + "[60-60] -> 0\n" + "[61-7a] -> 1\n" + "[7b-ff] -> 0\n", + bytemap); + + // Bug in the ASCII case-folding optimization created too many byte classes. + DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap); + EXPECT_EQ("[00-5e] -> 0\n" + "[5f-5f] -> 1\n" + "[60-ff] -> 0\n", + bytemap); +} TEST(TestCompile, UTF8Ranges) { - Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL); - EXPECT_TRUE(re != NULL); - Prog* prog = re->CompileToProg(0); - EXPECT_TRUE(prog != NULL); - EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges)); - for (int i = 0; i < arraysize(utf8ranges); i++) - for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++) - EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j; - delete prog; - re->Decref(); + // The distinct byte ranges involved in the UTF-8 dot ([^\n]). + // Once, erroneously split between 0x3f and 0x40 because it is + // a 6-bit boundary. + + string bytemap; + + DumpByteMap(".", Regexp::PerlX, &bytemap); + EXPECT_EQ("[00-09] -> 0\n" + "[0a-0a] -> 1\n" + "[0b-7f] -> 0\n" + "[80-8f] -> 2\n" + "[90-9f] -> 3\n" + "[a0-bf] -> 4\n" + "[c0-c1] -> 1\n" + "[c2-df] -> 5\n" + "[e0-e0] -> 6\n" + "[e1-ef] -> 7\n" + "[f0-f0] -> 8\n" + "[f1-f3] -> 9\n" + "[f4-f4] -> 10\n" + "[f5-ff] -> 1\n", + bytemap); } TEST(TestCompile, InsufficientMemory) { diff --git a/re2/testing/exhaustive_tester.h b/re2/testing/exhaustive_tester.h index 1facb97..a8f39eb 100644 --- a/re2/testing/exhaustive_tester.h +++ b/re2/testing/exhaustive_tester.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H__ -#define RE2_TESTING_EXHAUSTIVE_TESTER_H__ +#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H_ +#define RE2_TESTING_EXHAUSTIVE_TESTER_H_ #include #include @@ -92,4 +92,4 @@ void EgrepTest(int maxatoms, int maxops, const string& alphabet, } // namespace re2 -#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H__ +#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H_ diff --git a/re2/testing/re2_arg_test.cc b/re2/testing/re2_arg_test.cc index d843ffa..06c58f1 100644 --- a/re2/testing/re2_arg_test.cc +++ b/re2/testing/re2_arg_test.cc @@ -106,27 +106,27 @@ const int kNumStrings = arraysize(kSuccessTable); } \ } -TEST(REArgTest, Int16Test) { +TEST(RE2ArgTest, Int16Test) { PARSE_FOR_TYPE(int16, 0); } -TEST(REArgTest, Uint16Test) { +TEST(RE2ArgTest, Uint16Test) { PARSE_FOR_TYPE(uint16, 1); } -TEST(REArgTest, IntTest) { +TEST(RE2ArgTest, IntTest) { PARSE_FOR_TYPE(int, 2); } -TEST(REArgTest, UInt32Test) { +TEST(RE2ArgTest, Uint32Test) { PARSE_FOR_TYPE(uint32, 3); } -TEST(REArgTest, Iint64Test) { +TEST(RE2ArgTest, Int64Test) { PARSE_FOR_TYPE(int64, 4); } -TEST(REArgTest, Uint64Test) { +TEST(RE2ArgTest, Uint64Test) { PARSE_FOR_TYPE(uint64, 5); } diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index 81d5fef..830b3f7 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -1411,6 +1411,18 @@ TEST(RE2, UnicodeClasses) { EXPECT_EQ("鋒", c); } +TEST(RE2, LazyRE2) { + // Test with and without options. + static LazyRE2 a = {"a"}; + static LazyRE2 b = {"b", RE2::Latin1}; + + EXPECT_EQ("a", a->pattern()); + EXPECT_EQ(RE2::Options::EncodingUTF8, a->options().encoding()); + + EXPECT_EQ("b", b->pattern()); + EXPECT_EQ(RE2::Options::EncodingLatin1, b->options().encoding()); +} + // Bug reported by saito. 2009/02/17 TEST(RE2, NullVsEmptyString) { RE2 re(".*"); diff --git a/re2/testing/regexp_generator.h b/re2/testing/regexp_generator.h index 3ba0d70..06ea4c4 100644 --- a/re2/testing/regexp_generator.h +++ b/re2/testing/regexp_generator.h @@ -2,12 +2,12 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_TESTING_REGEXP_GENERATOR_H_ +#define RE2_TESTING_REGEXP_GENERATOR_H_ + // Regular expression generator: generates all possible // regular expressions within given parameters (see below for details). -#ifndef RE2_TESTING_REGEXP_GENERATOR_H__ -#define RE2_TESTING_REGEXP_GENERATOR_H__ - #include #include #include "util/random.h" @@ -67,4 +67,4 @@ vector Split(const StringPiece& sep, const StringPiece& s); } // namespace re2 -#endif // RE2_TESTING_REGEXP_GENERATOR_H__ +#endif // RE2_TESTING_REGEXP_GENERATOR_H_ diff --git a/re2/testing/string_generator.h b/re2/testing/string_generator.h index 52e5e22..ff5a711 100644 --- a/re2/testing/string_generator.h +++ b/re2/testing/string_generator.h @@ -2,13 +2,13 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_TESTING_STRING_GENERATOR_H_ +#define RE2_TESTING_STRING_GENERATOR_H_ + // String generator: generates all possible strings of up to // maxlen letters using the set of letters in alpha. // Fetch strings using a Java-like Next()/HasNext() interface. -#ifndef RE2_TESTING_STRING_GENERATOR_H__ -#define RE2_TESTING_STRING_GENERATOR_H__ - #include #include #include "util/util.h" @@ -55,4 +55,4 @@ class StringGenerator { } // namespace re2 -#endif // RE2_TESTING_STRING_GENERATOR_H__ +#endif // RE2_TESTING_STRING_GENERATOR_H_ diff --git a/re2/testing/tester.h b/re2/testing/tester.h index e9fae85..07291d2 100644 --- a/re2/testing/tester.h +++ b/re2/testing/tester.h @@ -2,12 +2,12 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_TESTING_TESTER_H_ +#define RE2_TESTING_TESTER_H_ + // Comparative tester for regular expression matching. // Checks all implementations against each other. -#ifndef RE2_TESTING_TESTER_H__ -#define RE2_TESTING_TESTER_H__ - #include "re2/stringpiece.h" #include "re2/prog.h" #include "re2/regexp.h" @@ -118,4 +118,4 @@ bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text); } // namespace re2 -#endif // RE2_TESTING_TESTER_H__ +#endif // RE2_TESTING_TESTER_H_ diff --git a/re2/unicode_casefold.h b/re2/unicode_casefold.h index 1671140..164ca41 100644 --- a/re2/unicode_casefold.h +++ b/re2/unicode_casefold.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_UNICODE_CASEFOLD_H_ +#define RE2_UNICODE_CASEFOLD_H_ + // Unicode case folding tables. // The Unicode case folding tables encode the mapping from one Unicode point @@ -36,9 +39,6 @@ // The grouped form also allows for efficient fold range calculations // rather than looping one character at a time. -#ifndef RE2_UNICODE_CASEFOLD_H__ -#define RE2_UNICODE_CASEFOLD_H__ - #include "util/util.h" namespace re2 { @@ -72,4 +72,4 @@ extern Rune ApplyFold(const CaseFold *f, Rune r); } // namespace re2 -#endif // RE2_UNICODE_CASEFOLD_H__ +#endif // RE2_UNICODE_CASEFOLD_H_ diff --git a/re2/unicode_groups.h b/re2/unicode_groups.h index fc1c253..d61cd83 100644 --- a/re2/unicode_groups.h +++ b/re2/unicode_groups.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_UNICODE_GROUPS_H_ +#define RE2_UNICODE_GROUPS_H_ + // Unicode character groups. // The codes get split into ranges of 16-bit codes @@ -15,9 +18,6 @@ // to 16.5 kB of data but make the data harder to use; // we don't bother. -#ifndef RE2_UNICODE_GROUPS_H__ -#define RE2_UNICODE_GROUPS_H__ - #include "util/util.h" namespace re2 { @@ -61,4 +61,4 @@ extern const int num_perl_groups; } // namespace re2 -#endif // RE2_UNICODE_GROUPS_H__ +#endif // RE2_UNICODE_GROUPS_H_ diff --git a/re2/walker-inl.h b/re2/walker-inl.h index bdcf7f5..6a1113a 100644 --- a/re2/walker-inl.h +++ b/re2/walker-inl.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_WALKER_INL_H_ +#define RE2_WALKER_INL_H_ + // Helper class for traversing Regexps without recursion. // Clients should declare their own subclasses that override // the PreVisit and PostVisit methods, which are called before @@ -10,9 +13,6 @@ // Not quite the Visitor pattern, because (among other things) // the Visitor pattern is recursive. -#ifndef RE2_WALKER_INL_H__ -#define RE2_WALKER_INL_H__ - #include "re2/regexp.h" namespace re2 { @@ -187,7 +187,7 @@ template T Regexp::Walker::WalkInternal(Regexp* re, T top_arg, s->child_args = &s->child_arg; else if (re->nsub_ > 1) s->child_args = new T[re->nsub_]; - // Fall through. + FALLTHROUGH_INTENDED; } default: { if (re->nsub_ > 0) { @@ -241,4 +241,4 @@ template T Regexp::Walker::WalkExponential(Regexp* re, T top_arg, } // namespace re2 -#endif // RE2_WALKER_INL_H__ +#endif // RE2_WALKER_INL_H_ diff --git a/runtests b/runtests index aadcb92..ce7c18d 100755 --- a/runtests +++ b/runtests @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env sh success=true for i diff --git a/util/benchmark.cc b/util/benchmark.cc index b77e22d..20b6765 100644 --- a/util/benchmark.cc +++ b/util/benchmark.cc @@ -44,7 +44,11 @@ static int64 nsec() { return ticks.QuadPart; #else struct timespec tp; +#ifdef CLOCK_PROCESS_CPUTIME_ID + if(clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &tp) < 0) +#else if(clock_gettime(CLOCK_REALTIME, &tp) < 0) +#endif return -1; return (int64)tp.tv_sec*1000*1000*1000 + tp.tv_nsec; #endif diff --git a/util/benchmark.h b/util/benchmark.h index 31bbd53..694565f 100644 --- a/util/benchmark.h +++ b/util/benchmark.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_UTIL_BENCHMARK_H__ -#define RE2_UTIL_BENCHMARK_H__ +#ifndef UTIL_BENCHMARK_H_ +#define UTIL_BENCHMARK_H_ namespace testing { struct Benchmark { @@ -14,7 +14,7 @@ struct Benchmark { int hi; int threadlo; int threadhi; - + void Register(); Benchmark(const char* name, void (*f)(int)) { Clear(name); fn = f; Register(); } Benchmark(const char* name, void (*f)(int, int), int l, int h) { Clear(name); fnr = f; lo = l; hi = h; Register(); } @@ -38,4 +38,4 @@ int NumCPUs(); ::testing::Benchmark* _benchmark_##f = \ (new ::testing::Benchmark(#f, f, lo, hi)) -#endif // RE2_UTIL_BENCHMARK_H__ +#endif // UTIL_BENCHMARK_H_ diff --git a/util/bitmap.h b/util/bitmap.h new file mode 100644 index 0000000..8a93d81 --- /dev/null +++ b/util/bitmap.h @@ -0,0 +1,92 @@ +// Copyright 2016 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_BITMAP_H_ +#define UTIL_BITMAP_H_ + +#ifdef _MSC_VER +#include +#endif +#include "util/util.h" + +namespace re2 { + +class Bitmap256 { + public: + Bitmap256() { + memset(words_, 0, sizeof words_); + } + + // Tests the bit with index c. + bool Test(int c) const { + DCHECK_GE(c, 0); + DCHECK_LE(c, 255); + + return (words_[c / 64] & (1ULL << (c % 64))) != 0; + } + + // Sets the bit with index c. + void Set(int c) { + DCHECK_GE(c, 0); + DCHECK_LE(c, 255); + + words_[c / 64] |= (1ULL << (c % 64)); + } + + // Finds the next non-zero bit with index >= c. + // Returns -1 if no such bit exists. + int FindNextSetBit(int c) const; + + private: + // Finds the least significant non-zero bit in n. + static int FindLSBSet(uint64 n) { + DCHECK_NE(n, 0); + +#if defined(__GNUC__) + return __builtin_ctzll(n); +#elif defined(_MSC_VER) + unsigned long c; + _BitScanForward64(&c, n); + return static_cast(c); +#else +#error "bit scan forward not implemented" +#endif + } + + uint64 words_[4]; +}; + +int Bitmap256::FindNextSetBit(int c) const { + DCHECK_GE(c, 0); + DCHECK_LE(c, 255); + + // Check the word that contains the bit. Mask out any lower bits. + int i = c / 64; + uint64 word = words_[i] & (~0ULL << (c % 64)); + if (word != 0) + return (i * 64) + FindLSBSet(word); + + // Check any following words. + i++; + switch (i) { + case 1: + if (words_[1] != 0) + return (1 * 64) + FindLSBSet(words_[1]); + FALLTHROUGH_INTENDED; + case 2: + if (words_[2] != 0) + return (2 * 64) + FindLSBSet(words_[2]); + FALLTHROUGH_INTENDED; + case 3: + if (words_[3] != 0) + return (3 * 64) + FindLSBSet(words_[3]); + FALLTHROUGH_INTENDED; + default: + return -1; + } +} + +} // namespace re2 + +#endif // UTIL_BITMAP_H_ diff --git a/util/flags.h b/util/flags.h index 98d5c06..1fd5c91 100644 --- a/util/flags.h +++ b/util/flags.h @@ -2,14 +2,14 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef UTIL_FLAGS_H_ +#define UTIL_FLAGS_H_ + // Simplified version of Google's command line flags. // Does not support parsing the command line. // If you want to do that, see // https://gflags.github.io/gflags/ -#ifndef RE2_UTIL_FLAGS_H__ -#define RE2_UTIL_FLAGS_H__ - #define DEFINE_flag(type, name, deflt, desc) \ namespace re2 { type FLAGS_##name = deflt; } @@ -24,4 +24,4 @@ #define DECLARE_int32(name) DECLARE_flag(int32, name) #define DECLARE_string(name) DECLARE_flag(string, name) -#endif // RE2_UTIL_FLAGS_H__ +#endif // UTIL_FLAGS_H_ diff --git a/util/logging.h b/util/logging.h index feac199..1573b18 100644 --- a/util/logging.h +++ b/util/logging.h @@ -2,10 +2,10 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Simplified version of Google's logging. +#ifndef UTIL_LOGGING_H_ +#define UTIL_LOGGING_H_ -#ifndef RE2_UTIL_LOGGING_H__ -#define RE2_UTIL_LOGGING_H__ +// Simplified version of Google's logging. #include /* for fwrite */ #include @@ -106,4 +106,4 @@ class LogMessageFatal : public LogMessage { #pragma warning(pop) #endif -#endif // RE2_UTIL_LOGGING_H__ +#endif // UTIL_LOGGING_H_ diff --git a/util/mutex.h b/util/mutex.h index 8a13226..81121a4 100644 --- a/util/mutex.h +++ b/util/mutex.h @@ -2,14 +2,14 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef UTIL_MUTEX_H_ +#define UTIL_MUTEX_H_ + /* * A simple mutex wrapper, supporting locks and read-write locks. * You should assume the locks are *not* re-entrant. */ -#ifndef RE2_UTIL_MUTEX_H_ -#define RE2_UTIL_MUTEX_H_ - #include #if !defined(_WIN32) @@ -210,4 +210,4 @@ class WriterMutexLock { } // namespace re2 -#endif /* #define RE2_UTIL_MUTEX_H_ */ +#endif // UTIL_MUTEX_H_ diff --git a/util/pcre.cc b/util/pcre.cc index 9a3f32d..87affdc 100644 --- a/util/pcre.cc +++ b/util/pcre.cc @@ -754,6 +754,13 @@ bool PCRE::Arg::parse_char(const char* str, int n, void* dest) { return true; } +bool PCRE::Arg::parse_schar(const char* str, int n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = str[0]; + return true; +} + bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) { if (n != 1) return false; if (dest == NULL) return true; @@ -838,8 +845,8 @@ bool PCRE::Arg::parse_short_radix(const char* str, void* dest, int radix) { long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((short)r != r) return false; // Out of range + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range if (dest == NULL) return true; *(reinterpret_cast(dest)) = (short)r; return true; @@ -850,10 +857,10 @@ bool PCRE::Arg::parse_ushort_radix(const char* str, void* dest, int radix) { unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((ushort)r != r) return false; // Out of range + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned short)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = (ushort)r; + *(reinterpret_cast(dest)) = (unsigned short)r; return true; } @@ -862,10 +869,10 @@ bool PCRE::Arg::parse_int_radix(const char* str, void* dest, int radix) { long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((int)r != r) return false; // Out of range + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (int)r; return true; } @@ -874,10 +881,10 @@ bool PCRE::Arg::parse_uint_radix(const char* str, void* dest, int radix) { unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((uint)r != r) return false; // Out of range + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned int)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (unsigned int)r; return true; } @@ -970,30 +977,29 @@ bool PCRE::Arg::parse_float(const char* str, int n, void* dest) { return true; } - -#define DEFINE_INTEGER_PARSERS(name) \ +#define DEFINE_INTEGER_PARSER(name) \ bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 10); \ - } \ + return parse_##name##_radix(str, n, dest, 10); \ + } \ bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 16); \ - } \ + return parse_##name##_radix(str, n, dest, 16); \ + } \ bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 8); \ - } \ + return parse_##name##_radix(str, n, dest, 8); \ + } \ bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 0); \ + return parse_##name##_radix(str, n, dest, 0); \ } -DEFINE_INTEGER_PARSERS(short); -DEFINE_INTEGER_PARSERS(ushort); -DEFINE_INTEGER_PARSERS(int); -DEFINE_INTEGER_PARSERS(uint); -DEFINE_INTEGER_PARSERS(long); -DEFINE_INTEGER_PARSERS(ulong); -DEFINE_INTEGER_PARSERS(longlong); -DEFINE_INTEGER_PARSERS(ulonglong); +DEFINE_INTEGER_PARSER(short); +DEFINE_INTEGER_PARSER(ushort); +DEFINE_INTEGER_PARSER(int); +DEFINE_INTEGER_PARSER(uint); +DEFINE_INTEGER_PARSER(long); +DEFINE_INTEGER_PARSER(ulong); +DEFINE_INTEGER_PARSER(longlong); +DEFINE_INTEGER_PARSER(ulonglong); -#undef DEFINE_INTEGER_PARSERS +#undef DEFINE_INTEGER_PARSER } // namespace re2 diff --git a/util/pcre.h b/util/pcre.h index 20b10c0..9ccdf35 100644 --- a/util/pcre.h +++ b/util/pcre.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef UTIL_PCRE_H_ +#define UTIL_PCRE_H_ + // This is a variant of PCRE's pcrecpp.h, originally written at Google. // The main changes are the addition of the HitLimit method and // compilation as PCRE in namespace re2. @@ -570,13 +573,18 @@ class PCRE::Arg { typedef bool (*Parser)(const char* str, int n, void* dest); // Type-specific parsers -#define MAKE_PARSER(type,name) \ - Arg(type* p) : arg_(p), parser_(name) { } \ - Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \ - +#define MAKE_PARSER(type, name) \ + Arg(type* p) : arg_(p), parser_(name) {} \ + Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} MAKE_PARSER(char, parse_char); + MAKE_PARSER(signed char, parse_schar); MAKE_PARSER(unsigned char, parse_uchar); + MAKE_PARSER(float, parse_float); + MAKE_PARSER(double, parse_double); + MAKE_PARSER(string, parse_string); + MAKE_PARSER(StringPiece, parse_stringpiece); + MAKE_PARSER(short, parse_short); MAKE_PARSER(unsigned short, parse_ushort); MAKE_PARSER(int, parse_int); @@ -585,10 +593,6 @@ class PCRE::Arg { MAKE_PARSER(unsigned long, parse_ulong); MAKE_PARSER(long long, parse_longlong); MAKE_PARSER(unsigned long long, parse_ulonglong); - MAKE_PARSER(float, parse_float); - MAKE_PARSER(double, parse_double); - MAKE_PARSER(string, parse_string); - MAKE_PARSER(StringPiece, parse_stringpiece); #undef MAKE_PARSER @@ -608,21 +612,23 @@ class PCRE::Arg { static bool parse_null (const char* str, int n, void* dest); static bool parse_char (const char* str, int n, void* dest); + static bool parse_schar (const char* str, int n, void* dest); static bool parse_uchar (const char* str, int n, void* dest); static bool parse_float (const char* str, int n, void* dest); static bool parse_double (const char* str, int n, void* dest); static bool parse_string (const char* str, int n, void* dest); static bool parse_stringpiece (const char* str, int n, void* dest); -#define DECLARE_INTEGER_PARSER(name) \ - private: \ - static bool parse_ ## name(const char* str, int n, void* dest); \ - static bool parse_ ## name ## _radix( \ - const char* str, int n, void* dest, int radix); \ - public: \ - static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \ - static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \ - static bool parse_ ## name ## _cradix(const char* str, int n, void* dest) +#define DECLARE_INTEGER_PARSER(name) \ + private: \ + static bool parse_##name(const char* str, int n, void* dest); \ + static bool parse_##name##_radix(const char* str, int n, void* dest, \ + int radix); \ + \ + public: \ + static bool parse_##name##_hex(const char* str, int n, void* dest); \ + static bool parse_##name##_octal(const char* str, int n, void* dest); \ + static bool parse_##name##_cradix(const char* str, int n, void* dest) DECLARE_INTEGER_PARSER(short); DECLARE_INTEGER_PARSER(ushort); @@ -634,6 +640,7 @@ class PCRE::Arg { DECLARE_INTEGER_PARSER(ulonglong); #undef DECLARE_INTEGER_PARSER + }; inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { } @@ -644,13 +651,16 @@ inline bool PCRE::Arg::Parse(const char* str, int n) const { } // This part of the parser, appropriate only for ints, deals with bases -#define MAKE_INTEGER_PARSER(type, name) \ - inline PCRE::Arg Hex(type* ptr) { \ - return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _hex); } \ - inline PCRE::Arg Octal(type* ptr) { \ - return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _octal); } \ - inline PCRE::Arg CRadix(type* ptr) { \ - return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _cradix); } +#define MAKE_INTEGER_PARSER(type, name) \ + inline PCRE::Arg Hex(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \ + } \ + inline PCRE::Arg Octal(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \ + } \ + inline PCRE::Arg CRadix(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \ + } MAKE_INTEGER_PARSER(short, short); MAKE_INTEGER_PARSER(unsigned short, ushort); @@ -664,3 +674,5 @@ MAKE_INTEGER_PARSER(unsigned long long, ulonglong); #undef MAKE_INTEGER_PARSER } // namespace re2 + +#endif // UTIL_PCRE_H_ diff --git a/util/random.h b/util/random.h index 6c6e701..6c67b2c 100644 --- a/util/random.h +++ b/util/random.h @@ -2,10 +2,10 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Modified from Google perftools's tcmalloc_unittest.cc. +#ifndef UTIL_RANDOM_H_ +#define UTIL_RANDOM_H_ -#ifndef RE2_UTIL_RANDOM_H__ -#define RE2_UTIL_RANDOM_H__ +// Modified from Google perftools's tcmalloc_unittest.cc. #include "util/util.h" @@ -26,4 +26,4 @@ class ACMRandom { } // namespace re2 -#endif // RE2_UTIL_RANDOM_H__ +#endif // UTIL_RANDOM_H_ diff --git a/util/sparse_array.h b/util/sparse_array.h index 8f71fa0..d37a10a 100644 --- a/util/sparse_array.h +++ b/util/sparse_array.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef UTIL_SPARSE_ARRAY_H_ +#define UTIL_SPARSE_ARRAY_H_ + // DESCRIPTION // // SparseArray(m) is a map from integers in [0, m) to T values. @@ -89,9 +92,6 @@ // immediately become inaccessible, but they are only guaranteed to be // destroyed when the SparseArray destructor is called. -#ifndef RE2_UTIL_SPARSE_ARRAY_H__ -#define RE2_UTIL_SPARSE_ARRAY_H__ - #include "util/util.h" namespace re2 { @@ -462,4 +462,4 @@ template bool SparseArray::less(const IndexValue& a, } // namespace re2 -#endif // RE2_UTIL_SPARSE_ARRAY_H__ +#endif // UTIL_SPARSE_ARRAY_H_ diff --git a/util/sparse_set.h b/util/sparse_set.h index 9dd41ee..537a094 100644 --- a/util/sparse_set.h +++ b/util/sparse_set.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef UTIL_SPARSE_SET_H_ +#define UTIL_SPARSE_SET_H_ + // DESCRIPTION // // SparseSet(m) is a set of integers in [0, m). @@ -44,9 +47,6 @@ // // See sparse_array.h for implementation details -#ifndef RE2_UTIL_SPARSE_SET_H__ -#define RE2_UTIL_SPARSE_SET_H__ - #include "util/util.h" namespace re2 { @@ -182,4 +182,4 @@ class SparseSet { } // namespace re2 -#endif // RE2_UTIL_SPARSE_SET_H__ +#endif // UTIL_SPARSE_SET_H_ diff --git a/util/test.h b/util/test.h index 3701eab..4bdd343 100644 --- a/util/test.h +++ b/util/test.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_UTIL_TEST_H__ -#define RE2_UTIL_TEST_H__ +#ifndef UTIL_TEST_H_ +#define UTIL_TEST_H_ #include "util/util.h" #include "util/flags.h" @@ -42,4 +42,4 @@ class MallocCounter { }; } // namespace testing -#endif // RE2_UTIL_TEST_H__ +#endif // UTIL_TEST_H_ diff --git a/util/thread.h b/util/thread.h index fb67bdc..f9ecaf6 100644 --- a/util/thread.h +++ b/util/thread.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_UTIL_THREAD_H__ -#define RE2_UTIL_THREAD_H__ +#ifndef UTIL_THREAD_H_ +#define UTIL_THREAD_H_ #ifdef _WIN32 #include @@ -30,4 +30,4 @@ class Thread { bool joinable_; }; -#endif // RE2_UTIL_THREAD_H__ +#endif // UTIL_THREAD_H_ diff --git a/util/utf.h b/util/utf.h index 06ff8f0..85b4297 100644 --- a/util/utf.h +++ b/util/utf.h @@ -14,8 +14,9 @@ * This file and rune.cc have been converted to compile as C++ code * in name space re2. */ -#ifndef RE2_UTIL_UTF_H__ -#define RE2_UTIL_UTF_H__ + +#ifndef UTIL_UTF_H_ +#define UTIL_UTF_H_ #include @@ -40,4 +41,4 @@ char* utfrune(const char*, Rune); } // namespace re2 -#endif // RE2_UTIL_UTF_H__ +#endif // UTIL_UTF_H_ diff --git a/util/util.h b/util/util.h index 1e78aa3..27c075f 100644 --- a/util/util.h +++ b/util/util.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_UTIL_UTIL_H__ -#define RE2_UTIL_UTIL_H__ +#ifndef UTIL_UTIL_H_ +#define UTIL_UTIL_H_ // C #include @@ -33,6 +33,7 @@ #include #include // For std::call_once #include +#include // Use std names. using std::set; @@ -58,6 +59,8 @@ using std::unordered_set; #define strtoull _strtoui64 #define vsnprintf vsnprintf_s +#pragma warning(disable: 4200) // zero-sized array + #endif namespace re2 { @@ -71,9 +74,7 @@ typedef uint32_t uint32; typedef int64_t int64; typedef uint64_t uint64; -typedef unsigned long ulong; typedef unsigned int uint; -typedef unsigned short ushort; // Prevent the compiler from complaining about or optimizing away variables // that appear unused. @@ -101,6 +102,10 @@ template struct CompileAssert {}; #define arraysize(array) (int)(sizeof(array)/sizeof((array)[0])) +#ifndef FALLTHROUGH_INTENDED +#define FALLTHROUGH_INTENDED do { } while (0) +#endif + #ifndef NO_THREAD_SAFETY_ANALYSIS #define NO_THREAD_SAFETY_ANALYSIS #endif @@ -138,4 +143,4 @@ bool RunningOnValgrind(); #include "util/mutex.h" #include "util/utf.h" -#endif // RE2_UTIL_UTIL_H__ +#endif // UTIL_UTIL_H_ diff --git a/util/valgrind.h b/util/valgrind.h index ca10b1a..2200a22 100644 --- a/util/valgrind.h +++ b/util/valgrind.h @@ -55,6 +55,8 @@ ---------------------------------------------------------------- */ +#ifndef UTIL_VALGRIND_H_ +#define UTIL_VALGRIND_H_ /* This file is for inclusion into client (your!) code. @@ -70,9 +72,6 @@ problem, you can compile with the NVALGRIND symbol defined (gcc -DNVALGRIND) so that client requests are not even compiled in. */ -#ifndef __VALGRIND_H -#define __VALGRIND_H - #include /* Nb: this file might be included in a file compiled with -ansi. So @@ -4514,4 +4513,4 @@ VALGRIND_PRINTF_BACKTRACE(const char *format, ...) #undef PLAT_ppc32_aix5 #undef PLAT_ppc64_aix5 -#endif /* __VALGRIND_H */ +#endif // UTIL_VALGRIND_H_ -- 2.7.4