Imported Upstream version 20170301

author DongHun Kwak <dh0128.kwak@samsung.com>

Tue, 20 Feb 2018 06:34:04 +0000 (15:34 +0900)

committer DongHun Kwak <dh0128.kwak@samsung.com>

Tue, 20 Feb 2018 06:34:05 +0000 (15:34 +0900)
author DongHun Kwak <dh0128.kwak@samsung.com>
Tue, 20 Feb 2018 06:34:04 +0000 (15:34 +0900)
committer DongHun Kwak <dh0128.kwak@samsung.com>
Tue, 20 Feb 2018 06:34:05 +0000 (15:34 +0900)
diff --git a/re2/compile.cc b/re2/compile.cc

index 472bbfa..9a19bde 100644 (file)
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -10,8 +10,7 @@
  
  #include <stdint.h>
  #include <string.h>
-#include <sys/types.h>
-#include <map>
+#include <unordered_map>
  #include <utility>
  
  #include "util/logging.h"
@@ -246,7 +245,7 @@ class Compiler : public Regexp::Walker<Frag> {
  
    int64_t max_mem_;    // Total memory budget.
  
-  std::map<uint64_t, int> rune_cache_;
+  std::unordered_map<uint64_t, int> rune_cache_;
    Frag rune_range_;
  
    RE2::Anchor anchor_;  // anchor mode for RE2::Set
@@ -496,7 +495,7 @@ static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
  int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
                                     int next) {
    uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
-  std::map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
+  std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
    if (it != rune_cache_.end())
      return it->second;
    int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
@@ -1211,7 +1210,7 @@ Prog* Compiler::Finish() {
    if (max_mem_ <= 0) {
      prog_->set_dfa_mem(1<<20);
    } else {
-    int64_t m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);
+    int64_t m = max_mem_ - sizeof(Prog) - prog_->size_*sizeof(Prog::Inst);
      if (m < 0)
        m = 0;
      prog_->set_dfa_mem(m);
diff --git a/re2/dfa.cc b/re2/dfa.cc

index 2e60910..08992e1 100644 (file)
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -25,7 +25,6 @@
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
-#include <sys/types.h>
  #include <algorithm>
  #include <atomic>
  #include <map>
diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc

index f9326cf..daef0be 100644 (file)
--- a/re2/fuzzing/re2_fuzzer.cc
+++ b/re2/fuzzing/re2_fuzzer.cc
@@ -20,6 +20,12 @@ void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
    if (!re.ok())
      return;
  
+  // Don't waste time fuzzing high-size programs.
+  // (They can cause bug reports due to fuzzer timeouts.)
+  int size = re.ProgramSize();
+  if (size > 9999)
+    return;
+
    // Don't waste time fuzzing high-fanout programs.
    // (They can also cause bug reports due to fuzzer timeouts.)
    std::map<int, int> histogram;
@@ -50,7 +56,7 @@ void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
  
  // Entry point for libFuzzer.
  extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  if (size == 0 || size > 1000000)
+  if (size == 0 || size > 1024)
      return 0;
  
    // The one-at-a-time hash by Bob Jenkins.
@@ -86,15 +92,5 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
    StringPiece text(ptr, len);
    Test(pattern, options, text);
  
-  for (int i = 2; i <= 4; i++) {
-    if (len < i)
-      break;
-
-    int frac = len / i;
-    pattern = StringPiece(ptr, frac);
-    text = StringPiece(ptr + frac, len - frac);
-    Test(pattern, options, text);
-  }
-
    return 0;
  }
diff --git a/re2/parse.cc b/re2/parse.cc

index 03a43a9..0729b75 100644 (file)
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -42,6 +42,13 @@
  
  namespace re2 {
  
+// Reduce the maximum repeat count by an order of magnitude when fuzzing.
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+static const int kMaxRepeat = 100;
+#else
+static const int kMaxRepeat = 1000;
+#endif
+
  // Regular expression parse state.
  // The list of parsed regexps so far is maintained as a vector of
  // Regexp pointers called the stack.  Left parenthesis and vertical
@@ -473,6 +480,23 @@ bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s,
    Regexp::ParseFlags fl = flags_;
    if (nongreedy)
      fl = fl ^ NonGreedy;
+
+  // Squash **, ++ and ??. Regexp::Star() et al. handle this too, but
+  // they're mostly for use during simplification, not during parsing.
+  if (op == stacktop_->op() && fl == stacktop_->parse_flags())
+    return true;
+
+  // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
+  // op is a repeat, we just have to check that stacktop_->op() is too,
+  // then adjust stacktop_.
+  if ((stacktop_->op() == kRegexpStar ||
+       stacktop_->op() == kRegexpPlus ||
+       stacktop_->op() == kRegexpQuest) &&
+      fl == stacktop_->parse_flags()) {
+    stacktop_->op_ = kRegexpStar;
+    return true;
+  }
+
    Regexp* re = new Regexp(op, fl);
    re->AllocSub(1);
    re->down_ = stacktop_->down_;
@@ -541,7 +565,7 @@ int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
  bool Regexp::ParseState::PushRepetition(int min, int max,
                                          const StringPiece& s,
                                          bool nongreedy) {
-  if ((max != -1 && max < min) || min > 1000 || max > 1000) {
+  if ((max != -1 && max < min) || min > kMaxRepeat || max > kMaxRepeat) {
      status_->set_code(kRegexpRepeatSize);
      status_->set_error_arg(s);
      return false;
@@ -564,7 +588,7 @@ bool Regexp::ParseState::PushRepetition(int min, int max,
    stacktop_ = re;
    if (min >= 2 || max >= 2) {
      RepetitionWalker w;
-    if (w.Walk(stacktop_, 1000) == 0) {
+    if (w.Walk(stacktop_, kMaxRepeat) == 0) {
        status_->set_code(kRegexpRepeatSize);
        status_->set_error_arg(s);
        return false;
diff --git a/re2/prog.cc b/re2/prog.cc

index 714bff8..0eb1fa9 100644 (file)
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -459,7 +459,7 @@ void Prog::ComputeByteMap() {
    // Don't repeat the work for \b and \B.
    bool marked_word_boundaries = false;
  
-  for (int id = 0; id < static_cast<int>(size()); id++) {
+  for (int id = 0; id < size(); id++) {
      Inst* ip = inst(id);
      if (ip->opcode() == kInstByteRange) {
        int lo = ip->lo();
@@ -521,23 +521,68 @@ void Prog::ComputeByteMap() {
    }
  }
  
+// Prog::Flatten() implements a graph rewriting algorithm.
+//
+// The overall process is similar to epsilon removal, but retains some epsilon
+// transitions: those from Capture and EmptyWidth instructions; and those from
+// nullable subexpressions. (The latter avoids quadratic blowup in transitions
+// in the worst case.) It might be best thought of as Alt instruction elision.
+//
+// In conceptual terms, it divides the Prog into "trees" of instructions, then
+// traverses the "trees" in order to produce "lists" of instructions. A "tree"
+// is one or more instructions that grow from one "root" instruction to one or
+// more "leaf" instructions; if a "tree" has exactly one instruction, then the
+// "root" is also the "leaf". In most cases, a "root" is the successor of some
+// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
+// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
+// EmptyWidth or Match instruction. However, this is insufficient for handling
+// nested nullable subexpressions correctly, so in some cases, a "root" is the
+// dominator of the instructions reachable from some "successor root" (i.e. it
+// has an unreachable predecessor) and is considered a "dominator root". Since
+// only Alt instructions can be "dominator roots" (other instructions would be
+// "leaves"), only Alt instructions require their predecessors to be computed.
+//
+// Dividing the Prog into "trees" comprises two passes: marking the "successor
+// roots" and the predecessors; and marking the "dominator roots". Walking the
+// Prog in its entirety causes the "successor roots" to be marked in a certain
+// order during the first pass. Iteration over the "successor roots" occurs in
+// reverse order of their marking during the second pass; by working backwards
+// and flooding the graph no further than "leaves" and already marked "roots",
+// it becomes possible to mark "dominator roots" without doing excessive work.
+//
+// Traversing the "trees" is just iterating over the "roots" in order of their
+// marking and flooding the graph no further than "leaves" and "roots". When a
+// "leaf" is reached, the instruction is copied with its successor remapped to
+// its "root" number. When a "root" is reached, a Nop instruction is generated
+// with its successor remapped similarly. As each "list" is produced, its last
+// instruction is marked as such. After all of the "lists" have been produced,
+// a pass over their instructions remaps their successors to bytecode offsets.
  void Prog::Flatten() {
    if (did_flatten_)
      return;
    did_flatten_ = true;
  
-  // Scratch structures. It's important that these are reused by EmitList()
-  // because we call it in a loop and it would thrash the heap otherwise.
-  SparseSet q(size());
+  // Scratch structures. It's important that these are reused by functions
+  // that we call in loops because they would thrash the heap otherwise.
+  SparseSet reachable(size());
    std::vector<int> stk;
    stk.reserve(size());
  
-  // First pass: Marks "roots".
+  // First pass: Marks "successor roots" and predecessors.
    // Builds the mapping from inst-ids to root-ids.
    SparseArray<int> rootmap(size());
-  MarkRoots(&rootmap, &q, &stk);
+  SparseArray<int> predmap(size());
+  std::vector<std::vector<int>> predvec;
+  MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
+
+  // Second pass: Marks "dominator roots".
+  for (SparseArray<int>::const_iterator i = rootmap.end() - 1;
+       i != rootmap.begin();
+       --i) {
+    MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
+  }
  
-  // Second pass: Emits "lists". Remaps outs to root-ids.
+  // Third pass: Emits "lists". Remaps outs to root-ids.
    // Builds the mapping from root-ids to flat-ids.
    std::vector<int> flatmap(rootmap.size());
    std::vector<Inst> flat;
@@ -546,7 +591,7 @@ void Prog::Flatten() {
         i != rootmap.end();
         ++i) {
      flatmap[i->value()] = static_cast<int>(flat.size());
-    EmitList(i->index(), &rootmap, &flat, &q, &stk);
+    EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
      flat.back().set_last();
    }
  
@@ -554,7 +599,7 @@ void Prog::Flatten() {
    for (int i = 0; i < kNumInst; i++)
      inst_count_[i] = 0;
  
-  // Third pass: Remaps outs to flat-ids.
+  // Fourth pass: Remaps outs to flat-ids.
    // Counts instructions by opcode.
    for (int id = 0; id < static_cast<int>(flat.size()); id++) {
      Inst* ip = &flat[id];
@@ -586,8 +631,10 @@ void Prog::Flatten() {
    memmove(inst_, flat.data(), size_ * sizeof *inst_);
  }
  
-void Prog::MarkRoots(SparseArray<int>* rootmap, SparseSet* q,
-                     std::vector<int>* stk) {
+void Prog::MarkSuccessors(SparseArray<int>* rootmap,
+                          SparseArray<int>* predmap,
+                          std::vector<std::vector<int>>* predvec,
+                          SparseSet* reachable, std::vector<int>* stk) {
    // Mark the kInstFail instruction.
    rootmap->set_new(0, rootmap->size());
  
@@ -597,16 +644,16 @@ void Prog::MarkRoots(SparseArray<int>* rootmap, SparseSet* q,
    if (!rootmap->has_index(start()))
      rootmap->set_new(start(), rootmap->size());
  
-  q->clear();
+  reachable->clear();
    stk->clear();
    stk->push_back(start_unanchored());
    while (!stk->empty()) {
      int id = stk->back();
      stk->pop_back();
    Loop:
-    if (q->contains(id))
+    if (reachable->contains(id))
        continue;
-    q->insert_new(id);
+    reachable->insert_new(id);
  
      Inst* ip = inst(id);
      switch (ip->opcode()) {
@@ -616,6 +663,14 @@ void Prog::MarkRoots(SparseArray<int>* rootmap, SparseSet* q,
  
        case kInstAltMatch:
        case kInstAlt:
+        // Mark this instruction as a predecessor of each out.
+        for (int out : {ip->out(), ip->out1()}) {
+          if (!predmap->has_index(out)) {
+            predmap->set_new(out, static_cast<int>(predvec->size()));
+            predvec->emplace_back();
+          }
+          (*predvec)[predmap->get_existing(out)].emplace_back(id);
+        }
          stk->push_back(ip->out1());
          id = ip->out();
          goto Loop;
@@ -623,7 +678,7 @@ void Prog::MarkRoots(SparseArray<int>* rootmap, SparseSet* q,
        case kInstByteRange:
        case kInstCapture:
        case kInstEmptyWidth:
-        // Mark the out of this instruction.
+        // Mark the out of this instruction as a "root".
          if (!rootmap->has_index(ip->out()))
            rootmap->set_new(ip->out(), rootmap->size());
          id = ip->out();
@@ -640,19 +695,83 @@ void Prog::MarkRoots(SparseArray<int>* rootmap, SparseSet* q,
    }
  }
  
+void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
+                         SparseArray<int>* predmap,
+                         std::vector<std::vector<int>>* predvec,
+                         SparseSet* reachable, std::vector<int>* stk) {
+  reachable->clear();
+  stk->clear();
+  stk->push_back(root);
+  while (!stk->empty()) {
+    int id = stk->back();
+    stk->pop_back();
+  Loop:
+    if (reachable->contains(id))
+      continue;
+    reachable->insert_new(id);
+
+    if (id != root && rootmap->has_index(id)) {
+      // We reached another "tree" via epsilon transition.
+      continue;
+    }
+
+    Inst* ip = inst(id);
+    switch (ip->opcode()) {
+      default:
+        LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
+        break;
+
+      case kInstAltMatch:
+      case kInstAlt:
+        stk->push_back(ip->out1());
+        id = ip->out();
+        goto Loop;
+
+      case kInstByteRange:
+      case kInstCapture:
+      case kInstEmptyWidth:
+        break;
+
+      case kInstNop:
+        id = ip->out();
+        goto Loop;
+
+      case kInstMatch:
+      case kInstFail:
+        break;
+    }
+  }
+
+  for (SparseSet::const_iterator i = reachable->begin();
+       i != reachable->end();
+       ++i) {
+    int id = *i;
+    if (predmap->has_index(id)) {
+      for (int pred : (*predvec)[predmap->get_existing(id)]) {
+        if (!reachable->contains(pred)) {
+          // id has a predecessor that cannot be reached from root!
+          // Therefore, id must be a "root" too - mark it as such.
+          if (!rootmap->has_index(id))
+            rootmap->set_new(id, rootmap->size());
+        }
+      }
+    }
+  }
+}
+
  void Prog::EmitList(int root, SparseArray<int>* rootmap,
-                    std::vector<Inst>* flat, SparseSet* q,
-                    std::vector<int>* stk) {
-  q->clear();
+                    std::vector<Inst>* flat,
+                    SparseSet* reachable, std::vector<int>* stk) {
+  reachable->clear();
    stk->clear();
    stk->push_back(root);
    while (!stk->empty()) {
      int id = stk->back();
      stk->pop_back();
    Loop:
-    if (q->contains(id))
+    if (reachable->contains(id))
        continue;
-    q->insert_new(id);
+    reachable->insert_new(id);
  
      if (id != root && rootmap->has_index(id)) {
        // We reached another "tree" via epsilon transition. Emit a kInstNop
diff --git a/re2/prog.h b/re2/prog.h

index 57a771c..43a00af 100644 (file)
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -332,16 +332,28 @@ class Prog {
    // operation in the sense that the old instructions are lost.
    void Flatten();
  
-  // Marks the "roots" in the Prog: the outs of kInstByteRange, kInstCapture
-  // and kInstEmptyWidth instructions.
-  void MarkRoots(SparseArray<int>* rootmap, SparseSet* q,
-                 std::vector<int>* stk);
-
-  // Emits one "list" via "tree" traversal from the given "root" instruction.
-  // The new instructions are appended to the given vector.
+  // Walks the Prog; the "successor roots" or predecessors of the reachable
+  // instructions are marked in rootmap or predmap/predvec, respectively.
+  // reachable and stk are preallocated scratch structures.
+  void MarkSuccessors(SparseArray<int>* rootmap,
+                      SparseArray<int>* predmap,
+                      std::vector<std::vector<int>>* predvec,
+                      SparseSet* reachable, std::vector<int>* stk);
+
+  // Walks the Prog from the given "root" instruction; the "dominator root"
+  // of the reachable instructions (if such exists) is marked in rootmap.
+  // reachable and stk are preallocated scratch structures.
+  void MarkDominator(int root, SparseArray<int>* rootmap,
+                     SparseArray<int>* predmap,
+                     std::vector<std::vector<int>>* predvec,
+                     SparseSet* reachable, std::vector<int>* stk);
+
+  // Walks the Prog from the given "root" instruction; the reachable
+  // instructions are emitted in "list" form and appended to flat.
+  // reachable and stk are preallocated scratch structures.
    void EmitList(int root, SparseArray<int>* rootmap,
-                std::vector<Inst>* flat, SparseSet* q,
-                std::vector<int>* stk);
+                std::vector<Inst>* flat,
+                SparseSet* reachable, std::vector<int>* stk);
  
   private:
    friend class Compiler;
diff --git a/re2/re2.cc b/re2/re2.cc

index fa29100..84cf992 100644 (file)
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -399,7 +399,13 @@ int RE2::GlobalReplace(string *str,
    const char* lastend = NULL;
    string out;
    int count = 0;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  // Iterate just once when fuzzing. Otherwise, we easily get bogged down
+  // and coverage is unlikely to improve despite significant expense.
+  while (p == str->data()) {
+#else
    while (p <= ep) {
+#endif
      if (!re.Match(*str, static_cast<size_t>(p - str->data()),
                    str->size(), UNANCHORED, vec, nvec))
        break;
diff --git a/re2/re2.h b/re2/re2.h

index c49de05..49559d5 100644 (file)
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -181,7 +181,6 @@
  
  #include <stddef.h>
  #include <stdint.h>
-#include <sys/types.h>
  #include <algorithm>
  #include <map>
  #include <mutex>
diff --git a/re2/regexp.cc b/re2/regexp.cc

index 950c985..712f92d 100644 (file)
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -190,31 +190,45 @@ Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
    return re;
  }
  
-Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
-  if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
+Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
+  // Squash **, ++ and ??.
+  if (op == sub->op() && flags == sub->parse_flags())
      return sub;
-  Regexp* re = new Regexp(kRegexpPlus, flags);
+
+  // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
+  // op is Star/Plus/Quest, we just have to check that sub->op() is too.
+  if ((sub->op() == kRegexpStar ||
+       sub->op() == kRegexpPlus ||
+       sub->op() == kRegexpQuest) &&
+      flags == sub->parse_flags()) {
+    // If sub is Star, no need to rewrite it.
+    if (sub->op() == kRegexpStar)
+      return sub;
+
+    // Rewrite sub to Star.
+    Regexp* re = new Regexp(kRegexpStar, flags);
+    re->AllocSub(1);
+    re->sub()[0] = sub->sub()[0]->Incref();
+    sub->Decref();  // We didn't consume the reference after all.
+    return re;
+  }
+
+  Regexp* re = new Regexp(op, flags);
    re->AllocSub(1);
    re->sub()[0] = sub;
    return re;
  }
  
+Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
+  return StarPlusOrQuest(kRegexpPlus, sub, flags);
+}
+
  Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
-  if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
-    return sub;
-  Regexp* re = new Regexp(kRegexpStar, flags);
-  re->AllocSub(1);
-  re->sub()[0] = sub;
-  return re;
+  return StarPlusOrQuest(kRegexpStar, sub, flags);
  }
  
  Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
-  if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
-    return sub;
-  Regexp* re = new Regexp(kRegexpQuest, flags);
-  re->AllocSub(1);
-  re->sub()[0] = sub;
-  return re;
+  return StarPlusOrQuest(kRegexpQuest, sub, flags);
  }
  
  Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
diff --git a/re2/regexp.h b/re2/regexp.h

index e77d0d8..9b5d98d 100644 (file)
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -460,6 +460,10 @@ class Regexp {
    // Computes whether Regexp is already simple.
    bool ComputeSimple();
  
+  // Constructor that generates a Star, Plus or Quest,
+  // squashing the pair if sub is also a Star, Plus or Quest.
+  static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
+
    // Constructor that generates a concatenation or alternation,
    // enforcing the limit on the number of subexpressions for
    // a particular Regexp.
@@ -528,7 +532,7 @@ class Regexp {
    // exponential blowup in space requirements.
    // uint16_t to control space usage.
    // The standard regexp routines will never generate a
-  // ref greater than the maximum repeat count (1000),
+  // ref greater than the maximum repeat count (kMaxRepeat),
    // but even so, Incref and Decref consult an overflow map
    // when ref_ reaches kMaxRef.
    uint16_t ref_;
diff --git a/re2/stringpiece.cc b/re2/stringpiece.cc

index d25351f..ef2e287 100644 (file)
--- a/re2/stringpiece.cc
+++ b/re2/stringpiece.cc
@@ -8,7 +8,7 @@
  
  #include "util/util.h"
  
-using re2::StringPiece;
+namespace re2 {
  
  const StringPiece::size_type StringPiece::npos;  // initialized in stringpiece.h
  
@@ -61,3 +61,5 @@ std::ostream& operator<<(std::ostream& o, const StringPiece& p) {
    o.write(p.data(), p.size());
    return o;
  }
+
+}  // namespace re2
diff --git a/re2/stringpiece.h b/re2/stringpiece.h

index 082d476..133cc82 100644 (file)
--- a/re2/stringpiece.h
+++ b/re2/stringpiece.h
@@ -115,24 +115,28 @@ class StringPiece {
    StringPiece substr(size_type pos = 0, size_type n = npos) const;
  
    int compare(const StringPiece& x) const {
-    int r = memcmp(data_, x.data_, std::min(size_, x.size_));
-    if (r == 0) {
-      if (size_ < x.size_) r = -1;
-      else if (size_ > x.size_) r = +1;
+    size_type min_size = std::min(size(), x.size());
+    if (min_size > 0) {
+      int r = memcmp(data(), x.data(), min_size);
+      if (r < 0) return -1;
+      if (r > 0) return 1;
      }
-    return r;
+    if (size() < x.size()) return -1;
+    if (size() > x.size()) return 1;
+    return 0;
    }
  
    // Does "this" start with "x"?
    bool starts_with(const StringPiece& x) const {
-    return size_ >= x.size_ &&
-           memcmp(data_, x.data_, x.size_) == 0;
+    return x.empty() ||
+           (size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0);
    }
  
    // Does "this" end with "x"?
    bool ends_with(const StringPiece& x) const {
-    return size_ >= x.size_ &&
-           memcmp(data_ + size_ - x.size_, x.data_, x.size_) == 0;
+    return x.empty() ||
+           (size() >= x.size() &&
+            memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0);
    }
  
    bool contains(const StringPiece& s) const {
@@ -150,8 +154,10 @@ class StringPiece {
  };
  
  inline bool operator==(const StringPiece& x, const StringPiece& y) {
-  return x.size() == y.size() &&
-         memcmp(x.data(), y.data(), x.size()) == 0;
+  StringPiece::size_type len = x.size();
+  if (len != y.size()) return false;
+  return x.data() == y.data() || len == 0 ||
+         memcmp(x.data(), y.data(), len) == 0;
  }
  
  inline bool operator!=(const StringPiece& x, const StringPiece& y) {
@@ -159,8 +165,9 @@ inline bool operator!=(const StringPiece& x, const StringPiece& y) {
  }
  
  inline bool operator<(const StringPiece& x, const StringPiece& y) {
-  int r = memcmp(x.data(), y.data(), std::min(x.size(), y.size()));
-  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
+  StringPiece::size_type min_size = std::min(x.size(), y.size());
+  int r = min_size == 0 ? 0 : memcmp(x.data(), y.data(), min_size);
+  return (r < 0) || (r == 0 && x.size() < y.size());
  }
  
  inline bool operator>(const StringPiece& x, const StringPiece& y) {
@@ -175,9 +182,9 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) {
    return !(x < y);
  }
  
-}  // namespace re2
-
  // Allow StringPiece to be logged.
-std::ostream& operator<<(std::ostream& o, const re2::StringPiece& p);
+std::ostream& operator<<(std::ostream& o, const StringPiece& p);
+
+}  // namespace re2
  
  #endif  // RE2_STRINGPIECE_H_
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc

index eb723bc..87171d6 100644 (file)
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -316,4 +316,44 @@ TEST(TestCompile, Bug26705922) {
              reverse);
  }
  
+TEST(TestCompile, Bug35237384) {
+  // Bug in the compiler caused inefficient bytecode to be generated for
+  // nested nullable subexpressions.
+
+  string forward;
+
+  Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
+  EXPECT_EQ("3+ byte [61-61] -> 3\n"
+            "4. nop -> 5\n"
+            "5+ byte [61-61] -> 5\n"
+            "6. nop -> 7\n"
+            "7+ byte [61-61] -> 7\n"
+            "8. match! 0\n",
+            forward);
+
+  Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
+  EXPECT_EQ("3+ nop -> 6\n"
+            "4+ nop -> 8\n"
+            "5. nop -> 21\n"
+            "6+ byte [61-61] -> 6\n"
+            "7. nop -> 3\n"
+            "8+ byte [62-62] -> 8\n"
+            "9. nop -> 3\n"
+            "10+ byte [61-61] -> 10\n"
+            "11. nop -> 21\n"
+            "12+ byte [62-62] -> 12\n"
+            "13. nop -> 21\n"
+            "14+ byte [61-61] -> 14\n"
+            "15. nop -> 18\n"
+            "16+ byte [62-62] -> 16\n"
+            "17. nop -> 18\n"
+            "18+ nop -> 14\n"
+            "19+ nop -> 16\n"
+            "20. match! 0\n"
+            "21+ nop -> 10\n"
+            "22+ nop -> 12\n"
+            "23. nop -> 18\n",
+      forward);
+}
+
  }  // namespace re2
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc

index 95cb0e6..ec80fc4 100644 (file)
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc
@@ -115,6 +115,17 @@ static Test tests[] = {
    { "ab|cd", "alt{str{ab}str{cd}}" },
    { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
  
+  // Test squashing of **, ++, ?? et cetera.
+  { "(?:(?:a)*)*", "star{lit{a}}" },
+  { "(?:(?:a)+)+", "plus{lit{a}}" },
+  { "(?:(?:a)?)?", "que{lit{a}}" },
+  { "(?:(?:a)*)+", "star{lit{a}}" },
+  { "(?:(?:a)*)?", "star{lit{a}}" },
+  { "(?:(?:a)+)*", "star{lit{a}}" },
+  { "(?:(?:a)+)?", "star{lit{a}}" },
+  { "(?:(?:a)?)*", "star{lit{a}}" },
+  { "(?:(?:a)?)+", "star{lit{a}}" },
+
    // Test flattening.
    { "(?:a)", "lit{a}" },
    { "(?:ab)(?:cd)", "str{abcd}" },
diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc

index 33f8a8c..6bdb2af 100644 (file)
--- a/re2/testing/simplify_test.cc
+++ b/re2/testing/simplify_test.cc
@@ -231,6 +231,17 @@ static Test tests[] = {
    // capture. The new capture must have the cap of the old capture.
    // Failure to copy it results in cap=0 -> ToString() logs a fatal error.
    { "(a*aab)", "(aa+b)" },
+
+  // Test squashing of **, ++, ?? et cetera.
+  { "(?:(?:a){0,}){0,}", "a*" },
+  { "(?:(?:a){1,}){1,}", "a+" },
+  { "(?:(?:a){0,1}){0,1}", "a?" },
+  { "(?:(?:a){0,}){1,}", "a*" },
+  { "(?:(?:a){0,}){0,1}", "a*" },
+  { "(?:(?:a){1,}){0,}", "a*" },
+  { "(?:(?:a){1,}){0,1}", "a*" },
+  { "(?:(?:a){0,1}){0,}", "a*" },
+  { "(?:(?:a){0,1}){1,}", "a*" },
  };
  
  TEST(TestSimplify, SimpleRegexps) {
diff --git a/re2/testing/tester.cc b/re2/testing/tester.cc

index d7fe319..f2372b2 100644 (file)
--- a/re2/testing/tester.cc
+++ b/re2/testing/tester.cc
@@ -7,7 +7,6 @@
  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>
-#include <sys/types.h>
  #include <string>
  
  #include "util/util.h"
diff --git a/util/sparse_array.h b/util/sparse_array.h

index b9853a7..eb0498e 100644 (file)
--- a/util/sparse_array.h
+++ b/util/sparse_array.h
@@ -273,7 +273,7 @@ class SparseArray {
    iterator SetInternal(bool allow_overwrite, int i, U&& v) {  // NOLINT
      DebugCheckInvariants();
      if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
-      assert(!"illegal index");
+      assert(false && "illegal index");
        // Semantically, end() would be better here, but we already know
        // the user did something stupid, so begin() insulates them from
        // dereferencing an invalid pointer.
diff --git a/util/sparse_set.h b/util/sparse_set.h

index b431643..191ac7c 100644 (file)
--- a/util/sparse_set.h
+++ b/util/sparse_set.h
@@ -132,7 +132,7 @@ class SparseSetT {
    iterator InsertInternal(bool allow_existing, int i) {
      DebugCheckInvariants();
      if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
-      assert(!"illegal index");
+      assert(false && "illegal index");
        // Semantically, end() would be better here, but we already know
        // the user did something stupid, so begin() insulates them from
        // dereferencing an invalid pointer.
author	DongHun Kwak <dh0128.kwak@samsung.com>
	Tue, 20 Feb 2018 06:34:04 +0000 (15:34 +0900)
committer	DongHun Kwak <dh0128.kwak@samsung.com>
	Tue, 20 Feb 2018 06:34:05 +0000 (15:34 +0900)
re2/compile.cc		patch \| blob \| history
re2/dfa.cc		patch \| blob \| history
re2/fuzzing/re2_fuzzer.cc		patch \| blob \| history
re2/parse.cc		patch \| blob \| history
re2/prog.cc		patch \| blob \| history
re2/prog.h		patch \| blob \| history
re2/re2.cc		patch \| blob \| history
re2/re2.h		patch \| blob \| history
re2/regexp.cc		patch \| blob \| history
re2/regexp.h		patch \| blob \| history
re2/stringpiece.cc		patch \| blob \| history
re2/stringpiece.h		patch \| blob \| history
re2/testing/compile_test.cc		patch \| blob \| history
re2/testing/parse_test.cc		patch \| blob \| history
re2/testing/simplify_test.cc		patch \| blob \| history
re2/testing/tester.cc		patch \| blob \| history
util/sparse_array.h		patch \| blob \| history
util/sparse_set.h		patch \| blob \| history