2013-09-14 Tim Shen <timshen91@gmail.com>

author timshen <timshen@138bc75d-0d04-0410-961f-82ee72b054a4>

Sat, 14 Sep 2013 14:23:44 +0000 (14:23 +0000)

committer timshen <timshen@138bc75d-0d04-0410-961f-82ee72b054a4>

Sat, 14 Sep 2013 14:23:44 +0000 (14:23 +0000)
author timshen <timshen@138bc75d-0d04-0410-961f-82ee72b054a4>
Sat, 14 Sep 2013 14:23:44 +0000 (14:23 +0000)
committer timshen <timshen@138bc75d-0d04-0410-961f-82ee72b054a4>
Sat, 14 Sep 2013 14:23:44 +0000 (14:23 +0000)
diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog

index 8e0bfb7..9689105 100644 (file)
--- a/libstdc++-v3/ChangeLog
+++ b/libstdc++-v3/ChangeLog
@@ -1,3 +1,28 @@
+2013-09-14  Tim Shen  <timshen91@gmail.com>
+
+       * include/bits/regex.h (regex_match<>, regex_search<>):
+       Change regex_executor callers. Now use the executor's return value
+       instead of checking __m[0].matched to find out if it's successful.
+       (regex_search<>): Move the search logic to regex_executor.
+       * include/bits/regex_automaton.h: Add some new _Opcode. Refactor
+       _NFA::_M_insert_*.
+       * include/bits/regex_automaton.tcc: Add DEBUG dump for new
+       _Opcode. Refactor _NFA::_M_insert_*.
+       * include/bits/regex_compiler.h (_Compiler<>::_M_get_nfa):
+       Use make_shared instead of constructing by hand.
+       * include/bits/regex_compiler.tcc: Implement _Compiler<>::_M_assertion.
+       * include/bits/regex_constants.h: Fix indentation and line breaking.
+       * include/bits/regex_executor.h: Add _ResultsEntry to support
+       greedy/ungreedy mode. Move regex_search logic here.
+       * include/bits/regex_executor.tcc: Implement assertions and
+       greedy/ungreedy matching.
+       * include/bits/regex_scanner.h: Add a new token _S_token_ungreedy.
+       * include/bits/regex_scanner.tcc: Parse a new token _S_token_ungreedy.
+       * testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc: New.
+       * testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc: New.
+       * testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc:
+       Fix comment.
+
  2013-09-13  Paolo Carlini  <paolo.carlini@oracle.com>
  
         PR libstdc++/58415
diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h

index 412465a..659bee1 100644 (file)
--- a/libstdc++-v3/include/bits/regex.h
+++ b/libstdc++-v3/include/bits/regex.h
@@ -2106,14 +2106,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        template<typename, typename, typename, typename>
         friend class __detail::_BFSExecutor;
  
-      template<typename _Bp, typename _Ap, typename _Ch_type, typename _Rx_traits>
+      template<typename _Bp, typename _Ap,
+       typename _Ch_type, typename _Rx_traits>
         friend bool
         regex_match(_Bp, _Bp, match_results<_Bp, _Ap>&,
                     const basic_regex<_Ch_type,
                     _Rx_traits>&,
                     regex_constants::match_flag_type);
  
-      template<typename _Bp, typename _Ap, typename _Ch_type, typename _Rx_traits>
+      template<typename _Bp, typename _Ap,
+       typename _Ch_type, typename _Rx_traits>
         friend bool
         regex_search(_Bp, _Bp, match_results<_Bp, _Ap>&,
                      const basic_regex<_Ch_type,
@@ -2213,8 +2215,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      {
        if (__re._M_automaton == nullptr)
         return false;
-      __detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match();
-      if (__m.size() > 0 && __m[0].matched)
+      if (__detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match())
         {
           for (auto __it : __m)
             if (!__it.matched)
@@ -2373,29 +2374,22 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      {
        if (__re._M_automaton == nullptr)
         return false;
-      auto __cur = __first;
-      // Continue when __cur == __last
-      do
+      if (__detail::__get_executor(__first, __last, __m, __re, __flags)
+         ->_M_search())
         {
-         __detail::__get_executor(__cur, __last, __m, __re, __flags)
-           ->_M_search_from_first();
-         if (__m.size() > 0 && __m[0].matched)
-           {
-             for (auto __it : __m)
-               if (!__it.matched)
-                 __it.first = __it.second = __last;
-             __m.at(__m.size()).first = __first;
-             __m.at(__m.size()).second = __m[0].first;
-             __m.at(__m.size()+1).first = __m[0].second;
-             __m.at(__m.size()+1).second = __last;
-             __m.at(__m.size()).matched =
-               (__m.prefix().first != __m.prefix().second);
-             __m.at(__m.size()+1).matched =
-               (__m.suffix().first != __m.suffix().second);
-             return true;
-           }
+         for (auto __it : __m)
+           if (!__it.matched)
+             __it.first = __it.second = __last;
+         __m.at(__m.size()).first = __first;
+         __m.at(__m.size()).second = __m[0].first;
+         __m.at(__m.size()+1).first = __m[0].second;
+         __m.at(__m.size()+1).second = __last;
+         __m.at(__m.size()).matched =
+           (__m.prefix().first != __m.prefix().second);
+         __m.at(__m.size()+1).matched =
+           (__m.suffix().first != __m.suffix().second);
+         return true;
         }
-      while (__cur++ != __last);
        return false;
      }
  
diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h

index 7755175..94a14ce 100644 (file)
--- a/libstdc++-v3/include/bits/regex_automaton.h
+++ b/libstdc++-v3/include/bits/regex_automaton.h
@@ -51,14 +51,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
    /// that represents the regular expression.
    enum _Opcode
    {
-      _S_opcode_unknown       =   0,
-      _S_opcode_alternative   =   1,
-      _S_opcode_backref       =   2,
-      _S_opcode_subexpr_begin =   4,
-      _S_opcode_subexpr_end   =   5,
-      _S_opcode_dummy         =   6,
-      _S_opcode_match         = 100,
-      _S_opcode_accept        = 255
+      _S_opcode_unknown,
+      _S_opcode_alternative,
+      _S_opcode_backref,
+      _S_opcode_line_begin_assertion,
+      _S_opcode_line_end_assertion,
+      _S_opcode_word_boundry,
+      _S_opcode_subexpr_lookahead,
+      _S_opcode_subexpr_begin,
+      _S_opcode_subexpr_end,
+      _S_opcode_dummy,
+      _S_opcode_match,
+      _S_opcode_accept,
    };
  
    template<typename _CharT, typename _TraitsT>
@@ -72,35 +76,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        _StateIdT    _M_next;             // outgoing transition
        union // Since they are mutually exclusive.
        {
-       _StateIdT    _M_alt;            // for _S_opcode_alternative
         unsigned int _M_subexpr;        // for _S_opcode_subexpr_*
         unsigned int _M_backref_index;  // for _S_opcode_backref
+       struct
+       {
+         // for _S_opcode_alternative.
+         _StateIdT  _M_quant_index;
+         // for _S_opcode_alternative or _S_opcode_subexpr_lookahead
+         _StateIdT  _M_alt;
+         // for _S_opcode_word_boundry or _S_opcode_subexpr_lookahead or
+         // quantifiers (ungreedy if set to true)
+         bool       _M_neg;
+       };
        };
-      _MatcherT    _M_matches;          // for _S_opcode_match
+      _MatcherT      _M_matches;        // for _S_opcode_match
  
        explicit _State(_OpcodeT __opcode)
        : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
        { }
  
-      _State(const _MatcherT& __m)
-      : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id),
-       _M_matches(__m)
-      { }
-
-      _State(_OpcodeT __opcode, unsigned __index)
-      : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
-      {
-       if (__opcode == _S_opcode_subexpr_begin
-           || __opcode == _S_opcode_subexpr_end)
-         _M_subexpr = __index;
-       else if (__opcode == _S_opcode_backref)
-         _M_backref_index = __index;
-      }
-
-      _State(_StateIdT __next, _StateIdT __alt)
-      : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
-      { }
-
  #ifdef _GLIBCXX_DEBUG
        std::ostream&
        _M_print(std::ostream& ostr) const;
@@ -141,7 +135,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  
        _NFA(_FlagT __f)
        : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
-      _M_has_backref(false)
+      _M_has_backref(false), _M_quant_count(0)
        { }
  
        _FlagT
@@ -163,23 +157,30 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        _StateIdT
        _M_insert_accept()
        {
-       this->push_back(_StateT(_S_opcode_accept));
-       _M_accepting_states.insert(this->size()-1);
-       return this->size()-1;
+       auto __ret = _M_insert_state(_StateT(_S_opcode_accept));
+       _M_accepting_states.insert(__ret);
+       return __ret;
        }
  
        _StateIdT
-      _M_insert_alt(_StateIdT __next, _StateIdT __alt)
+      _M_insert_alt(_StateIdT __next, _StateIdT __alt, bool __neg)
        {
-       this->push_back(_StateT(__next, __alt));
-       return this->size()-1;
+       _StateT __tmp(_S_opcode_alternative);
+       // It labels every quantifier to make greedy comparison easier in BFS
+       // approach.
+       __tmp._M_quant_index = _M_quant_count++;
+       __tmp._M_next = __next;
+       __tmp._M_alt = __alt;
+       __tmp._M_neg = __neg;
+       return _M_insert_state(__tmp);
        }
  
        _StateIdT
        _M_insert_matcher(_MatcherT __m)
        {
-       this->push_back(_StateT(__m));
-       return this->size()-1;
+       _StateT __tmp(_S_opcode_match);
+       __tmp._M_matches = __m;
+       return _M_insert_state(__tmp);
        }
  
        _StateIdT
@@ -187,29 +188,53 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        {
         auto __id = _M_subexpr_count++;
         _M_paren_stack.push_back(__id);
-       this->push_back(_StateT(_S_opcode_subexpr_begin, __id));
-       return this->size()-1;
+       _StateT __tmp(_S_opcode_subexpr_begin);
+       __tmp._M_subexpr = __id;
+       return _M_insert_state(__tmp);
        }
  
        _StateIdT
        _M_insert_subexpr_end()
        {
-       this->push_back(_StateT(_S_opcode_subexpr_end, _M_paren_stack.back()));
+       _StateT __tmp(_S_opcode_subexpr_end);
+       __tmp._M_subexpr = _M_paren_stack.back();
         _M_paren_stack.pop_back();
-       return this->size()-1;
+       return _M_insert_state(__tmp);
        }
  
        _StateIdT
        _M_insert_backref(unsigned int __index);
  
        _StateIdT
-      _M_insert_dummy()
+      _M_insert_line_begin()
+      { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); }
+
+      _StateIdT
+      _M_insert_line_end()
+      { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); }
+
+      _StateIdT
+      _M_insert_word_bound(bool __neg)
        {
-       this->push_back(_StateT(_S_opcode_dummy));
-       return this->size()-1;
+       _StateT __tmp(_S_opcode_word_boundry);
+       __tmp._M_neg = __neg;
+       return _M_insert_state(__tmp);
        }
  
        _StateIdT
+      _M_insert_lookahead(_StateIdT __alt, bool __neg)
+      {
+       _StateT __tmp(_S_opcode_subexpr_lookahead);
+       __tmp._M_alt = __alt;
+       __tmp._M_neg = __neg;
+       return _M_insert_state(__tmp);
+      }
+
+      _StateIdT
+      _M_insert_dummy()
+      { return _M_insert_state(_StateT(_S_opcode_dummy)); }
+
+      _StateIdT
        _M_insert_state(_StateT __s)
        {
         this->push_back(__s);
@@ -230,6 +255,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        _FlagT                    _M_flags;
        _StateIdT                 _M_start_state;
        _SizeT                    _M_subexpr_count;
+      _SizeT                    _M_quant_count;
        bool                      _M_has_backref;
      };
  
diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc

index 2d34b95..13af984 100644 (file)
--- a/libstdc++-v3/include/bits/regex_automaton.tcc
+++ b/libstdc++-v3/include/bits/regex_automaton.tcc
@@ -80,6 +80,31 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                  << __id << " -> " << _M_alt
                  << " [label=\"epsilon\", tailport=\"n\"];\n";
           break;
+       case _S_opcode_backref:
+         __ostr << __id << " [label=\"" << __id << "\\nBACKREF "
+                << _M_subexpr << "\"];\n"
+                << __id << " -> " << _M_next << " [label=\"<match>\"];\n";
+         break;
+       case _S_opcode_line_begin_assertion:
+         __ostr << __id << " [label=\"" << __id << "\\nLINE_BEGIN \"];\n"
+                << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
+         break;
+       case _S_opcode_line_end_assertion:
+         __ostr << __id << " [label=\"" << __id << "\\nLINE_END \"];\n"
+                << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
+         break;
+       case _S_opcode_word_boundry:
+         __ostr << __id << " [label=\"" << __id << "\\nWORD_BOUNDRY "
+                << _M_neg << "\"];\n"
+                << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
+         break;
+       case _S_opcode_subexpr_lookahead:
+         __ostr << __id << " [label=\"" << __id << "\\nLOOK_AHEAD\"];\n"
+                << __id << " -> " << _M_next
+                << " [label=\"epsilon\", tailport=\"s\"];\n"
+                << __id << " -> " << _M_alt
+                << " [label=\"<assert>\", tailport=\"n\"];\n";
+         break;
         case _S_opcode_subexpr_begin:
           __ostr << __id << " [label=\"" << __id << "\\nSBEGIN "
                  << _M_subexpr << "\"];\n"
@@ -90,10 +115,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                  << _M_subexpr << "\"];\n"
                  << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
           break;
-       case _S_opcode_backref:
-         __ostr << __id << " [label=\"" << __id << "\\nBACKREF "
-                << _M_subexpr << "\"];\n"
-                << __id << " -> " << _M_next << " [label=\"<match>\"];\n";
+       case _S_opcode_dummy:
           break;
         case _S_opcode_match:
           __ostr << __id << " [label=\"" << __id << "\\nMATCH\"];\n"
@@ -102,8 +124,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
         case _S_opcode_accept:
           __ostr << __id << " [label=\"" << __id << "\\nACC\"];\n" ;
           break;
-       case _S_opcode_dummy:
-         break;
         default:
           _GLIBCXX_DEBUG_ASSERT(false);
           break;
@@ -141,8 +161,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
         if (__index == __it)
           __throw_regex_error(regex_constants::error_backref);
        _M_has_backref = true;
-      this->push_back(_StateT(_S_opcode_backref, __index));
-      return this->size()-1;
+      _StateT __tmp(_S_opcode_backref);
+      __tmp._M_backref_index = __index;
+      return _M_insert_state(__tmp);
      }
  
    template<typename _CharT, typename _TraitsT>
@@ -152,7 +173,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        for (auto& __it : *this)
         {
           while (__it._M_next >= 0 && (*this)[__it._M_next]._M_opcode
-                == _S_opcode_dummy)
+                == _S_opcode_dummy)
             __it._M_next = (*this)[__it._M_next]._M_next;
           if (__it._M_opcode == _S_opcode_alternative)
             while (__it._M_alt >= 0 && (*this)[__it._M_alt]._M_opcode
diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h

index 96a0d29..3b85d3a 100644 (file)
--- a/libstdc++-v3/include/bits/regex_compiler.h
+++ b/libstdc++-v3/include/bits/regex_compiler.h
@@ -56,7 +56,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  
        std::shared_ptr<_RegexT>
        _M_get_nfa() const
-      { return std::shared_ptr<_RegexT>(new _RegexT(_M_nfa)); }
+      { return make_shared<_RegexT>(_M_nfa); }
  
      private:
        typedef _Scanner<_FwdIter>                              _ScannerT;
diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc

index a574e8e..8dc779b 100644 (file)
--- a/libstdc++-v3/include/bits/regex_compiler.tcc
+++ b/libstdc++-v3/include/bits/regex_compiler.tcc
@@ -96,7 +96,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
           __alt2._M_append(__end);
           _M_stack.push(_StateSeqT(_M_nfa,
                                    _M_nfa._M_insert_alt(__alt1._M_start,
-                                                       __alt2._M_start),
+                                                       __alt2._M_start, false),
                                    __end));
         }
      }
@@ -132,25 +132,34 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        return false;
      }
  
-  // TODO Implement it.
    template<typename _FwdIter, typename _CharT, typename _TraitsT>
      bool
      _Compiler<_FwdIter, _CharT, _TraitsT>::
      _M_assertion()
      {
-      // temporary place holders.
        if (_M_match_token(_ScannerT::_S_token_line_begin))
-       _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
+       _M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
+             _M_insert_line_begin()));
        else if (_M_match_token(_ScannerT::_S_token_line_end))
-       _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
+       _M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
+             _M_insert_line_end()));
        else if (_M_match_token(_ScannerT::_S_token_word_bound))
-       _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
-      else if (_M_match_token(_ScannerT::_S_token_neg_word_bound))
-       _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
+       // _M_value[0] == 'n' means it's negative, i.e. "not a word boundary".
+       _M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
+             _M_insert_word_bound(_M_value[0] == 'n')));
        else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
-       _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
-      else if (_M_match_token(_ScannerT::_S_token_subexpr_neg_lookahead_begin))
-       _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
+       {
+         auto __neg = _M_value[0] == 'n';
+         this->_M_disjunction();
+         if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
+           __throw_regex_error(regex_constants::error_paren);
+         auto __tmp = _M_pop();
+         __tmp._M_append(_M_nfa._M_insert_accept());
+         _M_stack.push(
+             _StateSeqT(
+               _M_nfa,
+               _M_nfa._M_insert_lookahead(__tmp._M_start, __neg)));
+       }
        else
         return false;
        return true;
@@ -161,40 +170,44 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      _Compiler<_FwdIter, _CharT, _TraitsT>::
      _M_quantifier()
      {
-      if (_M_match_token(_ScannerT::_S_token_closure0))
+      bool __neg = regex_constants::ECMAScript;
+      auto __init = [this, &__neg]()
         {
           if (_M_stack.empty())
             __throw_regex_error(regex_constants::error_badrepeat);
+         __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
+       };
+      if (_M_match_token(_ScannerT::_S_token_closure0))
+       {
+         __init();
           auto __e = _M_pop();
           _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
-                                                     __e._M_start));
+                                                     __e._M_start, __neg));
           __e._M_append(__r);
           _M_stack.push(__r);
         }
        else if (_M_match_token(_ScannerT::_S_token_closure1))
         {
-         if (_M_stack.empty())
-           __throw_regex_error(regex_constants::error_badrepeat);
+         __init();
           auto __e = _M_pop();
-         __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start));
+         __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start,
+                                            __neg));
           _M_stack.push(__e);
         }
        else if (_M_match_token(_ScannerT::_S_token_opt))
         {
-         if (_M_stack.empty())
-           __throw_regex_error(regex_constants::error_badrepeat);
+         __init();
           auto __e = _M_pop();
           auto __end = _M_nfa._M_insert_dummy();
           _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
-                                                     __e._M_start));
+                                                     __e._M_start, __neg));
           __e._M_append(__end);
           __r._M_append(__end);
           _M_stack.push(__r);
         }
        else if (_M_match_token(_ScannerT::_S_token_interval_begin))
         {
-         if (_M_stack.empty())
-           __throw_regex_error(regex_constants::error_badrepeat);
+         __init();
           if (!_M_match_token(_ScannerT::_S_token_dup_count))
             __throw_regex_error(regex_constants::error_badbrace);
           _StateSeqT __r(_M_pop());
@@ -206,23 +219,27 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
           if (_M_match_token(_ScannerT::_S_token_comma))
             if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
               {
-               int __n = _M_cur_int_value(10) - __min_rep;
-               if (__n < 0)
-                 __throw_regex_error(regex_constants::error_badbrace);
-               auto __end = _M_nfa._M_insert_dummy();
-               for (int __i = 0; __i < __n; ++__i)
-                 {
+               int __n = _M_cur_int_value(10) - __min_rep;
+               if (__n < 0)
+                 __throw_regex_error(regex_constants::error_badbrace);
+               auto __end = _M_nfa._M_insert_dummy();
+               for (int __i = 0; __i < __n; ++__i)
+                 {
                     auto __tmp = __r._M_clone();
-                   __e._M_append(_StateSeqT(_M_nfa, _M_nfa.
-                       _M_insert_alt(__tmp._M_start, __end), __tmp._M_end));
-                 }
+                   __e._M_append
+                     (_StateSeqT(_M_nfa,
+                                 _M_nfa._M_insert_alt(__tmp._M_start,
+                                                      __end, __neg),
+                                 __tmp._M_end));
+                 }
                 __e._M_append(__end);
               }
             else // {3,}
               {
                 auto __tmp = __r._M_clone();
-               _StateSeqT __s(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
-                                                           __tmp._M_start));
+               _StateSeqT __s(_M_nfa,
+                              _M_nfa._M_insert_alt(_S_invalid_state_id,
+                                                   __tmp._M_start, __neg));
                 __tmp._M_append(__s);
                 __e._M_append(__s);
               }
diff --git a/libstdc++-v3/include/bits/regex_constants.h b/libstdc++-v3/include/bits/regex_constants.h

index 23174be..10b962a 100644 (file)
--- a/libstdc++-v3/include/bits/regex_constants.h
+++ b/libstdc++-v3/include/bits/regex_constants.h
@@ -78,87 +78,87 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     * %set.
     */
    enum syntax_option_type : unsigned int
-    {
-      /**
-       * Specifies that the matching of regular expressions against a character
-       * sequence shall be performed without regard to case.
-       */
-      icase      = 1 << _S_icase,
-
-      /**
-       * Specifies that when a regular expression is matched against a character
-       * container sequence, no sub-expression matches are to be stored in the
-       * supplied match_results structure.
-       */
-      nosubs     = 1 << _S_nosubs,
-
-      /**
-       * Specifies that the regular expression engine should pay more attention to
-       * the speed with which regular expressions are matched, and less to the
-       * speed with which regular expression objects are constructed. Otherwise
-       * it has no detectable effect on the program output.
-       */
-      optimize   = 1 << _S_optimize,
-
-      /**
-       * Specifies that character ranges of the form [a-b] should be locale
-       * sensitive.
-       */
-      collate    = 1 << _S_collate,
-
-      /**
-       * Specifies that the grammar recognized by the regular expression engine is
-       * that used by ECMAScript in ECMA-262 [Ecma International, ECMAScript
-       * Language Specification, Standard Ecma-262, third edition, 1999], as
-       * modified in section [28.13].  This grammar is similar to that defined
-       * in the PERL scripting language but extended with elements found in the
-       * POSIX regular expression grammar.
-       */
-      ECMAScript = 1 << _S_ECMAScript,
-
-      /**
-       * Specifies that the grammar recognized by the regular expression engine is
-       * that used by POSIX basic regular expressions in IEEE Std 1003.1-2001,
-       * Portable Operating System Interface (POSIX), Base Definitions and
-       * Headers, Section 9, Regular Expressions [IEEE, Information Technology --
-       * Portable Operating System Interface (POSIX), IEEE Standard 1003.1-2001].
-       */
-      basic      = 1 << _S_basic,
-
-      /**
-       * Specifies that the grammar recognized by the regular expression engine is
-       * that used by POSIX extended regular expressions in IEEE Std 1003.1-2001,
-       * Portable Operating System Interface (POSIX), Base Definitions and Headers,
-       * Section 9, Regular Expressions.
-       */
-      extended   = 1 << _S_extended,
-
-      /**
-       * Specifies that the grammar recognized by the regular expression engine is
-       * that used by POSIX utility awk in IEEE Std 1003.1-2001.  This option is
-       * identical to syntax_option_type extended, except that C-style escape
-       * sequences are supported.  These sequences are:
-       * \\\\, \\a, \\b, \\f, \\n, \\r, \\t , \\v, \\&apos,, &apos,,
-       * and \\ddd (where ddd is one, two, or three octal digits).
-       */
-      awk        = 1 << _S_awk,
-
-      /**
-       * Specifies that the grammar recognized by the regular expression engine is
-       * that used by POSIX utility grep in IEEE Std 1003.1-2001.  This option is
-       * identical to syntax_option_type basic, except that newlines are treated
-       * as whitespace.
-       */
-      grep       = 1 << _S_grep,
-
-      /**
-       * Specifies that the grammar recognized by the regular expression engine is
-       * that used by POSIX utility grep when given the -E option in
-       * IEEE Std 1003.1-2001.  This option is identical to syntax_option_type
-       * extended, except that newlines are treated as whitespace.
-       */
-      egrep      = 1 << _S_egrep,
-    };
+  {
+    /**
+     * Specifies that the matching of regular expressions against a character
+     * sequence shall be performed without regard to case.
+     */
+    icase      = 1 << _S_icase,
+
+    /**
+     * Specifies that when a regular expression is matched against a character
+     * container sequence, no sub-expression matches are to be stored in the
+     * supplied match_results structure.
+     */
+    nosubs     = 1 << _S_nosubs,
+
+    /**
+     * Specifies that the regular expression engine should pay more attention to
+     * the speed with which regular expressions are matched, and less to the
+     * speed with which regular expression objects are constructed. Otherwise
+     * it has no detectable effect on the program output.
+     */
+    optimize   = 1 << _S_optimize,
+
+    /**
+     * Specifies that character ranges of the form [a-b] should be locale
+     * sensitive.
+     */
+    collate    = 1 << _S_collate,
+
+    /**
+     * Specifies that the grammar recognized by the regular expression engine is
+     * that used by ECMAScript in ECMA-262 [Ecma International, ECMAScript
+     * Language Specification, Standard Ecma-262, third edition, 1999], as
+     * modified in section [28.13].  This grammar is similar to that defined
+     * in the PERL scripting language but extended with elements found in the
+     * POSIX regular expression grammar.
+     */
+    ECMAScript = 1 << _S_ECMAScript,
+
+    /**
+     * Specifies that the grammar recognized by the regular expression engine is
+     * that used by POSIX basic regular expressions in IEEE Std 1003.1-2001,
+     * Portable Operating System Interface (POSIX), Base Definitions and
+     * Headers, Section 9, Regular Expressions [IEEE, Information Technology --
+     * Portable Operating System Interface (POSIX), IEEE Standard 1003.1-2001].
+     */
+    basic      = 1 << _S_basic,
+
+    /**
+     * Specifies that the grammar recognized by the regular expression engine is
+     * that used by POSIX extended regular expressions in IEEE Std 1003.1-2001,
+     * Portable Operating System Interface (POSIX), Base Definitions and
+     * Headers, Section 9, Regular Expressions.
+     */
+    extended   = 1 << _S_extended,
+
+    /**
+     * Specifies that the grammar recognized by the regular expression engine is
+     * that used by POSIX utility awk in IEEE Std 1003.1-2001.  This option is
+     * identical to syntax_option_type extended, except that C-style escape
+     * sequences are supported.  These sequences are:
+     * \\\\, \\a, \\b, \\f, \\n, \\r, \\t , \\v, \\&apos,, &apos,,
+     * and \\ddd (where ddd is one, two, or three octal digits).
+     */
+    awk        = 1 << _S_awk,
+
+    /**
+     * Specifies that the grammar recognized by the regular expression engine is
+     * that used by POSIX utility grep in IEEE Std 1003.1-2001.  This option is
+     * identical to syntax_option_type basic, except that newlines are treated
+     * as whitespace.
+     */
+    grep       = 1 << _S_grep,
+
+    /**
+     * Specifies that the grammar recognized by the regular expression engine is
+     * that used by POSIX utility grep when given the -E option in
+     * IEEE Std 1003.1-2001.  This option is identical to syntax_option_type
+     * extended, except that newlines are treated as whitespace.
+     */
+    egrep      = 1 << _S_egrep,
+  };
  
    constexpr inline syntax_option_type
    operator&(syntax_option_type __a, syntax_option_type __b)
diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h

index 6d66d88..3df33e0 100644 (file)
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -66,33 +66,46 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        { }
  
        // Set matched when string exactly match the pattern.
-      virtual void
+      virtual bool
        _M_match() = 0;
  
        // Set matched when some prefix of the string matches the pattern.
-      virtual void
-      _M_search_from_first() = 0;
+      virtual bool
+      _M_search() = 0;
  
      protected:
        typedef typename _NFA<_CharT, _TraitsT>::_SizeT _SizeT;
-      _Executor(_BiIter    __begin,
-               _BiIter    __end,
-               _ResultsT& __results,
-               _FlagT     __flags,
-               _SizeT     __size)
-      : _M_current(__begin), _M_end(__end), _M_results(__results),
-       _M_flags(__flags)
+      typedef typename _TraitsT::char_class_type      _ClassT;
+
+      _Executor(_BiIter         __begin,
+               _BiIter         __end,
+               _ResultsT&      __results,
+               _FlagT          __flags,
+               _SizeT          __size,
+               const _TraitsT& __traits)
+      : _M_current(__begin), _M_begin(__begin), _M_end(__end),
+      _M_results(__results), _M_flags(__flags), _M_traits(__traits)
        {
         __size += 2;
         _M_results.resize(__size);
-       for (auto __i = 0; __i < __size; __i++)
+       for (_SizeT __i = 0; __i < __size; ++__i)
           _M_results[__i].matched = false;
        }
  
-      _BiIter       _M_current;
-      _BiIter       _M_end;
-      _ResultsVec&  _M_results;
-      _FlagT        _M_flags;
+      bool
+      _M_is_word(_CharT __ch)
+      {
+       static const _CharT __s = 'w';
+       return _M_traits.isctype(__ch,
+                                _M_traits.lookup_classname(&__s, &__s+1));
+      }
+
+      _BiIter         _M_current;
+      const _BiIter   _M_begin;
+      const _BiIter   _M_end;
+      _ResultsVec&    _M_results;
+      const _TraitsT& _M_traits;
+      _FlagT          _M_flags;
      };
  
    // A _DFSExecutor perform a DFS on given NFA and input string. At the very
@@ -126,26 +139,51 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                    const _RegexT&  __nfa,
                    const _TraitsT& __traits,
                    _FlagT          __flags)
-      : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
-       _M_traits(__traits), _M_nfa(__nfa), _M_results_ret(this->_M_results)
+      : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count(),
+              __traits),
+       _M_traits(__traits), _M_nfa(__nfa), _M_cur_results(this->_M_results),
+       _M_start_state(__nfa._M_start())
        { }
  
-      void
+      bool
        _M_match()
-      { _M_dfs<true>(_M_nfa._M_start()); }
+      {
+       this->_M_current = this->_M_begin;
+       return _M_dfs<true>(_M_start_state);
+      }
  
-      void
+      bool
        _M_search_from_first()
-      { _M_dfs<false>(_M_nfa._M_start()); }
+      {
+       this->_M_current = this->_M_begin;
+       return _M_dfs<false>(_M_start_state);
+      }
+
+      bool
+      _M_search()
+      {
+       auto __cur = this->_M_begin;
+       do
+         {
+           this->_M_current = __cur;
+           if (_M_dfs<false>(_M_start_state))
+             return true;
+         }
+       // Continue when __cur == _M_end
+       while (__cur++ != this->_M_end);
+       return false;
+      }
  
      private:
        template<bool __match_mode>
         bool
         _M_dfs(_StateIdT __i);
  
-      _ResultsVec     _M_results_ret;
+      // To record current solution.
+      _ResultsVec     _M_cur_results;
        const _TraitsT& _M_traits;
        const _RegexT&  _M_nfa;
+      _StateIdT       _M_start_state;
      };
  
    // Like the DFS approach, it try every possible state transition; Unlike DFS,
@@ -170,35 +208,129 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
        typedef _NFA<_CharT, _TraitsT>                       _RegexT;
        typedef typename _BaseT::_ResultsT                   _ResultsT;
-      typedef typename _BaseT::_ResultsVec                 _ResultsVec;
-      typedef std::unique_ptr<_ResultsVec>                 _ResultsPtr;
+      // Here's a solution for greedy/ungreedy mode in BFS approach. We need to
+      // carefully work out how to compare two conflicting matching states.
+      //
+      // A matching state is a pair (where, when): `where` is an NFA node and
+      // `when` is a _BiIter indicating the next character to be matched.  Two
+      // matching states conflict when they have equivalent `where` and
+      // `when`.
+      //
+      // When two states conflict we must drop one and keep the other, because
+      // at most one of them can be the final optimal solution. Which one
+      // survives is decided by the greedy policy.
+      //
+      // The definition of `greedy`:
+      // Take the sequence of quantifiers in the NFA, sorted by their start
+      // position. Each matching state maintains a vector of the same length
+      // as that sequence, recording the repeat count of every quantifier. To
+      // compare two matching states, we lexicographically compare these two
+      // vectors. To win the comparison (to survive), a matching state needs
+      // to make its greedy quantifier counts larger and its ungreedy
+      // quantifier counts smaller.
+      //
+      // In the implementation, we record negative numbers for greedy
+      // quantifiers and positive numbers for ungreedy ones. Now a simple
+      // operator<() using lexicographical_compare will emit the answer.
+      //
+      // When two vectors are equal, the `where`, `when` and quantifier counts
+      // are all identical, which indicates the same answer, so operator<()
+      // just returns false.
+      struct _ResultsEntry
+      : private _BaseT::_ResultsVec
+      {
+      public:
+       _ResultsEntry(unsigned int __res_sz, unsigned int __sz)
+       : _BaseT::_ResultsVec(__res_sz), _M_quant_keys(__sz)
+       { }
+
+       sub_match<_BiIter>&
+       operator[](unsigned int __idx)
+       { return this->_BaseT::_ResultsVec::operator[](__idx); }
+
+       bool
+       operator<(const _ResultsEntry& __rhs) const
+       {
+         _GLIBCXX_DEBUG_ASSERT(_M_quant_keys.size()
+                               == __rhs._M_quant_keys.size());
+         return lexicographical_compare(_M_quant_keys.begin(),
+                                        _M_quant_keys.end(),
+                                        __rhs._M_quant_keys.begin(),
+                                        __rhs._M_quant_keys.end());
+       }
+
+       void
+       _M_inc(unsigned int __idx, bool __neg)
+       { _M_quant_keys[__idx] += __neg ? 1 : -1; }
+
+       typename _BaseT::_ResultsVec
+       _M_get()
+       { return *this; }
+
+      public:
+       std::vector<int> _M_quant_keys;
+      };
+
+      typedef std::unique_ptr<_ResultsEntry>               _ResultsPtr;
        typedef regex_constants::match_flag_type             _FlagT;
  
-      _BFSExecutor(_BiIter        __begin,
-                  _BiIter        __end,
-                  _ResultsT&     __results,
-                  const _RegexT& __nfa,
-                  _FlagT         __flags)
-      : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
-       _M_nfa(__nfa)
-      {
-       if (_M_nfa._M_start() != _S_invalid_state_id)
-         _M_covered[_M_nfa._M_start()] =
-           _ResultsPtr(new _ResultsVec(this->_M_results));
-       _M_e_closure();
-      }
+      _BFSExecutor(_BiIter         __begin,
+                  _BiIter         __end,
+                  _ResultsT&      __results,
+                  const _RegexT&  __nfa,
+                  const _TraitsT& __traits,
+                  _FlagT          __flags)
+      : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count(),
+              __traits),
+       _M_nfa(__nfa),
+       _M_cur_results(nullptr),
+       _M_start_state(__nfa._M_start())
+      { }
  
-      void
+      bool
        _M_match()
-      { _M_main_loop<true>(); }
+      {
+       _M_init(this->_M_begin);
+       return _M_main_loop<true>();
+      }
  
-      void
+      bool
        _M_search_from_first()
-      { _M_main_loop<false>(); }
+      {
+       _M_init(this->_M_begin);
+       return _M_main_loop<false>();
+      }
+
+      bool
+      _M_search()
+      {
+       auto __cur = this->_M_begin;
+       do
+         {
+           _M_init(__cur);
+           if (_M_main_loop<false>())
+             return true;
+         }
+       // Continue when __cur == _M_end
+       while (__cur++ != this->_M_end);
+       return false;
+      }
  
      private:
+      void
+      _M_init(_BiIter __cur)
+      {
+       _GLIBCXX_DEBUG_ASSERT(_M_start_state != _S_invalid_state_id);
+       this->_M_current = __cur;
+       _M_covered.clear();
+       _M_covered[_M_start_state] =
+         _ResultsPtr(new _ResultsEntry(this->_M_results.size(),
+                                       _M_nfa._M_quant_count));
+       _M_e_closure();
+      }
+
        template<bool __match_mode>
-       void
+       bool
         _M_main_loop();
  
        void
@@ -208,13 +340,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        _M_move();
  
        bool
-      _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const;
-
-      bool
-      _M_includes_some() const;
+      _M_includes_some();
  
-      std::map<_StateIdT, _ResultsPtr>     _M_covered;
-      const _RegexT&                       _M_nfa;
+      std::map<_StateIdT, _ResultsPtr> _M_covered;
+      // To record global optimal solution.
+      _ResultsPtr                      _M_cur_results;
+      const _RegexT&                   _M_nfa;
+      _StateIdT                        _M_start_state;
      };
  
   //@} regex-detail
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc

index 788d65e..b110c5d 100644 (file)
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -44,18 +44,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
         // This is not that certain. Need deeper investigate.
         return false;
        auto& __current = this->_M_current;
+      auto& __begin = this->_M_begin;
        auto& __end = this->_M_end;
-      auto& __results = _M_results_ret;
+      auto& __results = _M_cur_results;
        const auto& __state = _M_nfa[__i];
        bool __ret = false;
        switch (__state._M_opcode)
         {
         case _S_opcode_alternative:
-         // Greedy mode by default. For non-greedy mode,
-         // swap _M_alt and _M_next.
-         // TODO: Add greedy mode option.
-         __ret = _M_dfs<__match_mode>(__state._M_alt)
-           || _M_dfs<__match_mode>(__state._M_next);
+         // Greedy or not, this is a question ;)
+         if (!__state._M_neg)
+           __ret = _M_dfs<__match_mode>(__state._M_alt)
+             || _M_dfs<__match_mode>(__state._M_next);
+         else
+           __ret = _M_dfs<__match_mode>(__state._M_next)
+             || _M_dfs<__match_mode>(__state._M_alt);
           break;
         case _S_opcode_subexpr_begin:
           // Here's the critical part: if there's nothing changed since last
@@ -86,6 +89,52 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
           else
             __ret = _M_dfs<__match_mode>(__state._M_next);
           break;
+       case _S_opcode_line_begin_assertion:
+         if (__current == __begin)
+           __ret = _M_dfs<__match_mode>(__state._M_next);
+         break;
+       case _S_opcode_line_end_assertion:
+         if (__current == __end)
+           __ret = _M_dfs<__match_mode>(__state._M_next);
+         break;
+         // By definition.
+       case _S_opcode_word_boundry:
+           {
+             bool __ans = false;
+             if (__current == __begin && this->_M_is_word(*__current))
+               __ans = true;
+             else if (__current == __end && this->_M_is_word(*__current))
+               __ans = true;
+             else
+               {
+                 auto __pre = __current;
+                 --__pre;
+                 if (this->_M_is_word(*__current)
+                     != this->_M_is_word(*__pre))
+                   __ans = true;
+               }
+             if (__ans == !__state._M_neg)
+               __ret = _M_dfs<__match_mode>(__state._M_next);
+           }
+         break;
+         // Here __state._M_alt offers a single start node for a sub-NFA.
+         // We recursively invoke our algorithm to match the sub-NFA.
+       case _S_opcode_subexpr_lookahead:
+           {
+             _ResultsT __m;
+             // FIXME Here's not necessarily a DFSExecutor. But we need to
+             // refactor the whole NFA to a recursive tree structure first.
+             _DFSExecutor __sub(this->_M_current,
+                                this->_M_end,
+                                __m,
+                                this->_M_nfa,
+                                this->_M_traits,
+                                this->_M_flags);
+             __sub._M_start_state = __state._M_alt;
+             if (__sub._M_search_from_first() == !__state._M_neg)
+               __ret = _M_dfs<__match_mode>(__state._M_next);
+           }
+         break;
         case _S_opcode_match:
           if (__current != __end && __state._M_matches(*__current))
             {
@@ -138,19 +187,23 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
    template<typename _BiIter, typename _Alloc,
      typename _CharT, typename _TraitsT>
    template<bool __match_mode>
-    void _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
+    bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
      _M_main_loop()
      {
+      bool __ret = false;
        while (this->_M_current != this->_M_end)
         {
           if (!__match_mode)
-           if (_M_includes_some())
-             return;
+           // To keep regex_search greedy, no "return true" here.
+           __ret = _M_includes_some() || __ret;
           _M_move();
           ++this->_M_current;
           _M_e_closure();
         }
-      _M_includes_some();
+      __ret = _M_includes_some() || __ret;
+      if (__ret)
+       this->_M_results = _M_cur_results->_M_get();
+      return __ret;
      }
  
    template<typename _BiIter, typename _Alloc,
@@ -161,6 +214,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        auto& __current = this->_M_current;
        std::queue<_StateIdT> __q;
        std::vector<bool> __in_q(_M_nfa.size(), false);
+      auto& __begin = this->_M_begin;
+      auto& __end = this->_M_end;
+
        for (auto& __it : _M_covered)
         {
           __in_q[__it.first] = true;
@@ -173,18 +229,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
           __in_q[__u] = false;
           const auto& __state = _M_nfa[__u];
  
-         // Can be implemented using method, but there're too much arguments.
-         // I would use macro function before C++11, but lambda is a better
-         // choice, since hopefully compiler can inline it.
+         // This could be implemented as a method, but it would take too many
+         // arguments. Before C++11 I would use a function-like macro, but a
+         // lambda is a better choice, since hopefully the compiler inlines it.
           auto __add_visited_state = [&](_StateIdT __v)
           {
             if (__v == _S_invalid_state_id)
               return;
             if (_M_covered.count(__u) != 0
                 && (_M_covered.count(__v) == 0
-                   || _M_match_less_than(*_M_covered[__u], *_M_covered[__v])))
+                   || *_M_covered[__u] < *_M_covered[__v]))
               {
-               _M_covered[__v] = _ResultsPtr(new _ResultsVec(*_M_covered[__u]));
+               _M_covered[__v] =
+                 _ResultsPtr(new _ResultsEntry(*_M_covered[__u]));
                 // if a state is updated, it's outgoing neighbors should be
                 // reconsidered too. Push them to the queue.
                 if (!__in_q[__v])
@@ -195,19 +252,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
               }
           };
  
+         // Identical to DFS's switch part.
           switch (__state._M_opcode)
             {
+             // We need to maintain the quantifier count vector here. A
+             // quantifier must be concerned with an alt node.
               case _S_opcode_alternative:
-               __add_visited_state(__state._M_next);
-               __add_visited_state(__state._M_alt);
+               {
+                 __add_visited_state(__state._M_next);
+                 auto __back =
+                   _M_covered[__u]->_M_quant_keys[__state._M_quant_index];
+                 _M_covered[__u]->_M_inc(__state._M_quant_index,
+                                         __state._M_neg);
+                 __add_visited_state(__state._M_alt);
+                 _M_covered[__u]->_M_quant_keys[__state._M_quant_index]
+                   = __back;
+               }
                 break;
               case _S_opcode_subexpr_begin:
                 {
-                 auto& __cu = *_M_covered[__u];
-                 auto __back = __cu[__state._M_subexpr].first;
-                 __cu[__state._M_subexpr].first = __current;
-                 __add_visited_state(__state._M_next);
-                 __cu[__state._M_subexpr].first = __back;
+                 auto& __sub = (*_M_covered[__u])[__state._M_subexpr];
+                 if (!__sub.matched || __sub.first != __current)
+                   {
+                     auto __back = __sub.first;
+                     __sub.first = __current;
+                     __add_visited_state(__state._M_next);
+                     __sub.first = __back;
+                   }
                 }
                 break;
               case _S_opcode_subexpr_end:
@@ -220,10 +291,51 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                   __cu[__state._M_subexpr] = __back;
                 }
                 break;
+             case _S_opcode_line_begin_assertion:
+               if (__current == __begin)
+                 __add_visited_state(__state._M_next);
+               break;
+             case _S_opcode_line_end_assertion:
+               if (__current == __end)
+                 __add_visited_state(__state._M_next);
+               break;
+             case _S_opcode_word_boundry:
+                 {
+                   bool __ans = false;
+                   if (__current == __begin && this->_M_is_word(*__current))
+                     __ans = true;
+                   else if (__current == __end && this->_M_is_word(*__current))
+                     __ans = true;
+                   else
+                     {
+                       auto __pre = __current;
+                       --__pre;
+                       if (this->_M_is_word(*__current)
+                           != this->_M_is_word(*__pre))
+                         __ans = true;
+                     }
+                   if (__ans == !__state._M_neg)
+                     __add_visited_state(__state._M_next);
+                 }
+               break;
+             case _S_opcode_subexpr_lookahead:
+                 {
+                   _ResultsT __m;
+                   // Same comment as in DFS.
+                   _BFSExecutor __sub(this->_M_current,
+                                      this->_M_end,
+                                      __m,
+                                      this->_M_nfa,
+                                      this->_M_traits,
+                                      this->_M_flags);
+                   __sub._M_start_state = __state._M_alt;
+                   if (__sub._M_search_from_first() == !__state._M_neg)
+                     __add_visited_state(__state._M_next);
+                 }
+               break;
               case _S_opcode_match:
                 break;
               case _S_opcode_accept:
-               __add_visited_state(__state._M_next);
                 break;
               default:
                 _GLIBCXX_DEBUG_ASSERT(false);
@@ -244,7 +356,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
               && __state._M_matches(*this->_M_current))
             if (__state._M_next != _S_invalid_state_id)
               if (__next.count(__state._M_next) == 0
-                 || _M_match_less_than(*__it.second, *__next[__state._M_next]))
+                 || *__it.second < *__next[__state._M_next])
                 __next[__state._M_next] = move(__it.second);
         }
        _M_covered = move(__next);
@@ -253,37 +365,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
    template<typename _BiIter, typename _Alloc,
      typename _CharT, typename _TraitsT>
      bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
-    _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const
-    {
-      // TODO: Greedy and Non-greedy support
-      _GLIBCXX_DEBUG_ASSERT(__u.size() == __v.size());
-      auto __size = __u.size();
-      for (auto __i = 0; __i < __size; __i++)
-       {
-         auto __uit = __u[__i], __vit = __v[__i];
-         if (__uit.matched && !__vit.matched)
-           return true;
-         if (!__uit.matched && __vit.matched)
-           return false;
-         if (__uit.matched && __vit.matched)
-           {
-             // GREEDY
-             if (__uit.first != __vit.first)
-               return __uit.first < __vit.first;
-             if (__uit.second != __vit.second)
-               return __uit.second > __vit.second;
-           }
-       }
-      return false;
-    }
-
-  template<typename _BiIter, typename _Alloc,
-    typename _CharT, typename _TraitsT>
-    bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
-    _M_includes_some() const
+    _M_includes_some()
      {
        auto& __s = _M_nfa._M_final_states();
        auto& __t = _M_covered;
+      bool __succ = false;
        if (__s.size() > 0 && __t.size() > 0)
         {
           auto __first = __s.begin();
@@ -292,16 +378,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
             {
               if (*__first < __second->first)
                 ++__first;
-             else if (__second->first < *__first)
+             else if (*__first > __second->first)
                 ++__second;
               else
                 {
-                 this->_M_results = *__second->second;
-                 return true;
+                 if (_M_cur_results == nullptr
+                     || *__second->second < *_M_cur_results)
+                   _M_cur_results =
+                     _ResultsPtr(new _ResultsEntry(*__second->second));
+                 __succ = true;
+                 ++__first;
+                 ++__second;
                 }
             }
         }
-      return false;
+      return __succ;
      }
  
    template<typename _BiIter, typename _Alloc,
@@ -322,7 +413,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        if (__p->_M_has_backref)
         return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p,
                                               __re._M_traits, __flags));
-      return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, __flags));
+      return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p,
+                                           __re._M_traits, __flags));
      }
  
  _GLIBCXX_END_NAMESPACE_VERSION
diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h

index 064c183..824d6ce 100644 (file)
--- a/libstdc++-v3/include/bits/regex_scanner.h
+++ b/libstdc++-v3/include/bits/regex_scanner.h
@@ -69,7 +69,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
         _S_token_subexpr_begin,
         _S_token_subexpr_no_group_begin,
         _S_token_subexpr_lookahead_begin,
-       _S_token_subexpr_neg_lookahead_begin,
         _S_token_subexpr_end,
         _S_token_bracket_begin,
         _S_token_bracket_neg_begin,
@@ -84,10 +83,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
         _S_token_or,
         _S_token_closure0,
         _S_token_closure1,
+       _S_token_ungreedy,
         _S_token_line_begin,
         _S_token_line_end,
         _S_token_word_bound,
-       _S_token_neg_word_bound,
         _S_token_comma,
         _S_token_dup_count,
         _S_token_eof,
diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc

index 3303aa5..4b66157 100644 (file)
--- a/libstdc++-v3/include/bits/regex_scanner.tcc
+++ b/libstdc++-v3/include/bits/regex_scanner.tcc
@@ -210,11 +210,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                 {
                   ++_M_current;
                   _M_token = _S_token_subexpr_lookahead_begin;
+                 _M_value.assign(1, 'p');
                 }
               else if (*_M_current == '!')
                 {
                   ++_M_current;
-                 _M_token = _S_token_subexpr_neg_lookahead_begin;
+                 _M_token = _S_token_subexpr_lookahead_begin;
+                 _M_value.assign(1, 'n');
                 }
               else
                 __throw_regex_error(regex_constants::error_paren);
@@ -371,9 +373,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
           _M_value.assign(1, _M_escape_map.at(__c));
         }
        else if (__c == 'b')
-       _M_token = _S_token_word_bound;
+       {
+         _M_token = _S_token_word_bound;
+         _M_value.assign(1, 'p');
+       }
        else if (__c == 'B')
-       _M_token = _S_token_neg_word_bound;
+       {
+         _M_token = _S_token_word_bound;
+         _M_value.assign(1, 'n');
+       }
        // N3376 28.13
        else if (__c == 'd'
                || __c == 'D'
@@ -581,9 +589,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        case _S_token_subexpr_lookahead_begin:
         ostr << "lookahead subexpr begin\n";
         break;
-      case _S_token_subexpr_neg_lookahead_begin:
-       ostr << "neg lookahead subexpr begin\n";
-       break;
        case _S_token_subexpr_end:
         ostr << "subexpr end\n";
         break;
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc

new file mode 100644 (file)

index 0000000..82e9905
--- /dev/null
+++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc
@@ -0,0 +1,80 @@
+// { dg-options "-std=gnu++11" }
+// { dg-do run { xfail *-*-* } }
+
+//
+// 2013-09-14  Tim Shen <timshen91@gmail.com>
+//
+// Copyright (C) 2013 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// 28.11.3 regex_search
+// Tests ECMAScript assertion.
+
+#include <regex>
+#include <testsuite_hooks.h>
+
+using namespace std;
+
+void
+test01()
+{
+  bool test __attribute__((unused)) = true;
+
+  VERIFY(!regex_search("2123456", regex("^1234")));
+  VERIFY(regex_search("123456", regex("^1234")));
+  VERIFY(regex_search("123456", regex("(5|^)1234")));
+  VERIFY(regex_search("5123456", regex("(5|^)1234")));
+  VERIFY(!regex_search("1234562", regex("3456$")));
+  VERIFY(regex_search("123456", regex("3456$")));
+  VERIFY(!regex_search("123456", regex("(?=1234)56")));
+  VERIFY(regex_search("123456", regex("(?=1234)123456")));
+  VERIFY(regex_search("123456", regex("(?!1234)56")));
+  VERIFY(!regex_search("123456", regex("(?!1234)123456")));
+
+  VERIFY(regex_search("a-", regex("a\\b-")));
+  VERIFY(!regex_search("ab", regex("a\\bb")));
+  VERIFY(!regex_search("a-", regex("a\\B-")));
+  VERIFY(regex_search("ab", regex("a\\Bb")));
+
+  string s("This is a regular expression");
+  string sol[] =
+    {
+      "This",
+      "is",
+      "a",
+      "regular",
+      "expression",
+    };
+
+  regex re("\\b\\w*\\b");
+  int i = 0;
+  for (auto it = sregex_iterator(s.begin(), s.end(), re);
+       it != sregex_iterator() && i < 5;
+       ++it)
+    {
+      string s((*it)[0].first, (*it)[0].second);
+      VERIFY(s == sol[i++]);
+    }
+  VERIFY(i == 5);
+}
+
+int
+main()
+{
+  test01();
+  return 0;
+}
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc

new file mode 100644 (file)

index 0000000..ad37ec8
--- /dev/null
+++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc
@@ -0,0 +1,71 @@
+// { dg-options "-std=gnu++11" }
+
+//
+// 2013-09-14  Tim Shen <timshen91@gmail.com>
+//
+// Copyright (C) 2013 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// 28.11.3 regex_search
+// Tests ECMAScript greedy and ungreedy quantifiers.
+
+#include <regex>
+#include <testsuite_hooks.h>
+
+using namespace std;
+
+void
+test01()
+{
+  bool test __attribute__((unused)) = true;
+
+  cmatch m;
+#define TEST(i, s) VERIFY(m[i].matched && string(m[i].first, m[i].second) == s)
+  VERIFY(regex_search("aaaa", m, regex("a*")));
+  TEST(0, "aaaa");
+  VERIFY(regex_search("aaaa", m, regex("a*?")));
+  TEST(0, "");
+  VERIFY(regex_search("aaaa", m, regex("a+")));
+  TEST(0, "aaaa");
+  VERIFY(regex_search("aaaa", m, regex("a+?")));
+  TEST(0, "a");
+  VERIFY(regex_search("a", m, regex("a?")));
+  TEST(0, "a");
+  VERIFY(regex_search("a", m, regex("a??")));
+  TEST(0, "");
+  VERIFY(regex_search("", m, regex("a??")));
+  TEST(0, "");
+  VERIFY(regex_search("aaaa", m, regex("(a+)(a+)")));
+  TEST(1, "aaa");
+  TEST(2, "a");
+  VERIFY(regex_search("aaaa", m, regex("(a+?)(a+)")));
+  TEST(1, "a");
+  TEST(2, "aaa");
+  VERIFY(regex_search("aaaa", m, regex("(a+?)(a+)")));
+  TEST(1, "a");
+  TEST(2, "aaa");
+  VERIFY(regex_search("aaaa", m, regex("(a+?)(a+?)")));
+  TEST(1, "a");
+  TEST(2, "a");
+}
+
+int
+main()
+{
+  test01();
+  return 0;
+}
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc

index a2d290d..ec25875 100644 (file)
--- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc
+++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc
@@ -21,7 +21,7 @@
  // <http://www.gnu.org/licenses/>.
  
  // 28.11.3 regex_search
-// Tests BRE against a std::string target.
+// Tests ECMAScript against a std::string target.
  
  #include <regex>
  #include <testsuite_hooks.h>
author	timshen <timshen@138bc75d-0d04-0410-961f-82ee72b054a4>
	Sat, 14 Sep 2013 14:23:44 +0000 (14:23 +0000)
committer	timshen <timshen@138bc75d-0d04-0410-961f-82ee72b054a4>
	Sat, 14 Sep 2013 14:23:44 +0000 (14:23 +0000)
libstdc++-v3/ChangeLog		patch \| blob \| history
libstdc++-v3/include/bits/regex.h		patch \| blob \| history
libstdc++-v3/include/bits/regex_automaton.h		patch \| blob \| history
libstdc++-v3/include/bits/regex_automaton.tcc		patch \| blob \| history
libstdc++-v3/include/bits/regex_compiler.h		patch \| blob \| history
libstdc++-v3/include/bits/regex_compiler.tcc		patch \| blob \| history
libstdc++-v3/include/bits/regex_constants.h		patch \| blob \| history
libstdc++-v3/include/bits/regex_executor.h		patch \| blob \| history
libstdc++-v3/include/bits/regex_executor.tcc		patch \| blob \| history
libstdc++-v3/include/bits/regex_scanner.h		patch \| blob \| history
libstdc++-v3/include/bits/regex_scanner.tcc		patch \| blob \| history
libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc	[new file with mode: 0644]	patch \| blob
libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc	[new file with mode: 0644]	patch \| blob
libstdc++-v3/testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc		patch \| blob \| history