regex_automaton.h: _S_opcode_backref.
author Tim Shen <timshen91@gmail.com>
Sun, 18 Aug 2013 13:55:48 +0000 (13:55 +0000)
committer Tim Shen <timshen@gcc.gnu.org>
Sun, 18 Aug 2013 13:55:48 +0000 (13:55 +0000)
2013-08-18  Tim Shen  <timshen91@gmail.com>

* include/bits/regex_automaton.h: _S_opcode_backref.
* include/bits/regex_automaton.tcc: Backref automaton support.
* include/bits/regex_compiler.tcc: Parsing support.
* include/bits/regex_executor.h: Add _M_traits for _DFSExecutor.
* include/bits/regex_executor.tcc: Add _S_opcode_backref support.
* testsuite/28_regex/algorithms/regex_match/ecma/string_backref.cc: New.

From-SVN: r201825

libstdc++-v3/ChangeLog
libstdc++-v3/include/bits/regex_automaton.h
libstdc++-v3/include/bits/regex_automaton.tcc
libstdc++-v3/include/bits/regex_compiler.tcc
libstdc++-v3/include/bits/regex_executor.h
libstdc++-v3/include/bits/regex_executor.tcc
libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/string_backref.cc [new file with mode: 0644]

index 0c1cb43..6715e53 100644 (file)
@@ -1,3 +1,12 @@
+2013-08-18  Tim Shen  <timshen91@gmail.com>
+
+       * include/bits/regex_automaton.h: _S_opcode_backref.
+       * include/bits/regex_automaton.tcc: Backref automaton support.
+       * include/bits/regex_compiler.tcc: Parsing support.
+       * include/bits/regex_executor.h: Add _M_traits for _DFSExecutor.
+       * include/bits/regex_executor.tcc: Add _S_opcode_backref support.
+       * testsuite/28_regex/algorithms/regex_match/ecma/string_backref.cc: New.
+
 2013-08-16  Tim Shen  <timshen91@gmail.com>
 
        * include/bits/regex.h (regex_traits<>::transform_primary):
index 5817156..b58071e 100644 (file)
@@ -53,6 +53,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   {
       _S_opcode_unknown       =   0,
       _S_opcode_alternative   =   1,
+      _S_opcode_backref       =   2,
       _S_opcode_subexpr_begin =   4,
       _S_opcode_subexpr_end   =   5,
       _S_opcode_match         = 100,
@@ -66,11 +67,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       typedef int                        _OpcodeT;
       typedef _Matcher<_CharT>           _MatcherT;
 
-      _OpcodeT     _M_opcode;    // type of outgoing transition
-      _StateIdT    _M_next;      // outgoing transition
-      _StateIdT    _M_alt;       // for _S_opcode_alternative
-      unsigned int _M_subexpr;   // for _S_opcode_subexpr_*
-      _MatcherT    _M_matches;   // for _S_opcode_match
+      _OpcodeT     _M_opcode;           // type of outgoing transition
+      _StateIdT    _M_next;             // outgoing transition
+      union // Since they are mutually exclusive.
+      {
+        _StateIdT    _M_alt;            // for _S_opcode_alternative
+        unsigned int _M_subexpr;        // for _S_opcode_subexpr_*
+        unsigned int _M_backref_index;  // for _S_opcode_backref
+      };
+      _MatcherT    _M_matches;          // for _S_opcode_match
 
       explicit _State(_OpcodeT __opcode)
       : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
@@ -82,8 +87,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       { }
 
       _State(_OpcodeT __opcode, unsigned __index)
-      : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__index)
-      { }
+      : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
+      {
+        if (__opcode == _S_opcode_subexpr_begin
+            || __opcode == _S_opcode_subexpr_end)
+          _M_subexpr = __index;
+        else if (__opcode == _S_opcode_backref)
+          _M_backref_index = __index;
+      }
 
       _State(_StateIdT __next, _StateIdT __alt)
       : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
@@ -174,7 +185,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       _M_insert_subexpr_begin()
       {
         auto __id = _M_subexpr_count++;
-        _M_paren_stack.push(__id);
+        _M_paren_stack.push_back(__id);
         this->push_back(_StateT(_S_opcode_subexpr_begin, __id));
         return this->size()-1;
       }
@@ -182,26 +193,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       _StateIdT
       _M_insert_subexpr_end()
       {
-        this->push_back(_StateT(_S_opcode_subexpr_end, _M_paren_stack.top()));
-        _M_paren_stack.pop();
+        this->push_back(_StateT(_S_opcode_subexpr_end, _M_paren_stack.back()));
+        _M_paren_stack.pop_back();
         return this->size()-1;
       }
 
-      void
-      _M_set_backref(bool __b)
-      { _M_has_backref = __b; }
+      _StateIdT
+      _M_insert_backref(unsigned int __index);
 
 #ifdef _GLIBCXX_DEBUG
       std::ostream&
       _M_dot(std::ostream& __ostr) const;
 #endif
 
-      _FlagT                   _M_flags;
-      _StateIdT                _M_start_state;
-      _StateSet                _M_accepting_states;
-      _SizeT                   _M_subexpr_count;
-      bool                     _M_has_backref;
-      std::stack<unsigned int> _M_paren_stack;
+      _FlagT                    _M_flags;
+      _StateIdT                 _M_start_state;
+      _StateSet                 _M_accepting_states;
+      _SizeT                    _M_subexpr_count;
+      bool                      _M_has_backref;
+      std::vector<unsigned int> _M_paren_stack;
     };
 
   /// Describes a sequence of one or more %_State, its current start
index cf9c8eb..40a1547 100644 (file)
@@ -50,6 +50,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
         case _S_opcode_subexpr_end:
           ostr << "subexpr end next=" << _M_next << " index=" << _M_subexpr;
           break;
+        case _S_opcode_backref:
+          ostr << "backref next=" << _M_next << " index=" << _M_backref_index;
+          break;
         case _S_opcode_match:
           ostr << "match next=" << _M_next;
           break;
@@ -87,6 +90,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                  << _M_subexpr << "\"];\n"
                  << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
           break;
+        case _S_opcode_backref:
+          __ostr << __id << " [label=\"" << __id << "\\nBACKREF "
+                 << _M_subexpr << "\"];\n"
+                 << __id << " -> " << _M_next << " [label=\"<match>\"];\n";
+          break;
         case _S_opcode_match:
           __ostr << __id << " [label=\"" << __id << "\\nMATCH\"];\n"
                  << __id << " -> " << _M_next << " [label=\"<match>\"];\n";
@@ -116,6 +124,27 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #endif
 
   template<typename _CharT, typename _TraitsT>
+    _StateIdT _NFA<_CharT, _TraitsT>::
+    _M_insert_backref(unsigned int __index)
+    {
+      // To figure out whether a backref is valid, a stack is used to store
+      // unfinished sub-expressions. For example, when parsing
+      // "(a(b)(c\\1(d)))" at '\\1', _M_subexpr_count is 3, indicating that 3
+      // sub-expressions are parsed or partially parsed (in the stack), namely
+      // "(a..", "(b)" and "(c..".
+      // _M_paren_stack is {1, 3}, for incomplete "(a.." and "(c..". At this
+      // time, "\\2" is valid, but "\\1" and "\\3" are not.
+      if (__index >= _M_subexpr_count)
+        __throw_regex_error(regex_constants::error_backref);
+      for (auto __it : _M_paren_stack)
+        if (__index == __it)
+          __throw_regex_error(regex_constants::error_backref);
+      _M_has_backref = true;
+      this->push_back(_StateT(_S_opcode_backref, __index));
+      return this->size()-1;
+    }
+
+  template<typename _CharT, typename _TraitsT>
     _StateSeq<_CharT, _TraitsT>& _StateSeq<_CharT, _TraitsT>::
     operator=(const _StateSeq& __rhs)
     {
index 04301e4..2a5e2c6 100644 (file)
@@ -745,8 +745,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       if (_M_match_token(_ScannerT::_S_token_backref))
        {
          // __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value);
-          _M_state_store._M_set_backref(true);
-         //return true;
+         _M_stack.push(_StateSeqT(_M_state_store, _M_state_store.
+                                   _M_insert_backref(_M_cur_int_value(10))));
+         return true;
        }
       if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
        {
index afac8d0..0006a29 100644 (file)
@@ -82,10 +82,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
           __it.matched = false;
       }
 
-      _BiIter   _M_current;
-      _BiIter   _M_end;
+      _BiIter    _M_current;
+      _BiIter    _M_end;
       _ResultsT& _M_results;
-      _FlagT    _M_flags;
+      _FlagT     _M_flags;
     };
 
   template<typename _BiIter, typename _Alloc,
@@ -96,16 +96,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     public:
       typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
       typedef _NFA<_CharT, _TraitsT>                       _RegexT;
-      typedef typename _BaseT::_ResultsT                    _ResultsT;
+      typedef typename _BaseT::_ResultsT                   _ResultsT;
       typedef regex_constants::match_flag_type             _FlagT;
 
       _DFSExecutor(_BiIter        __begin,
                    _BiIter        __end,
-                   _ResultsT&      __results,
+                   _ResultsT&     __results,
                    const _RegexT& __nfa,
                    _FlagT         __flags)
       : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
-        _M_nfa(__nfa)
+        _M_traits(_TraitsT()), _M_nfa(__nfa)
       { }
 
       bool
@@ -121,6 +121,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
         bool
         _M_dfs(_StateIdT __i);
 
+      _TraitsT       _M_traits;
       const _RegexT& _M_nfa;
     };
 
index 32d1537..08b4915 100644 (file)
@@ -63,8 +63,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
           __ret = _M_dfs<__match_mode>(__state._M_next);
           break;
         case _S_opcode_subexpr_end:
-          __ret = _M_dfs<__match_mode>(__state._M_next);
           __results.at(__state._M_subexpr).second = __current;
+          __results.at(__state._M_subexpr).matched = true;
+          __ret = _M_dfs<__match_mode>(__state._M_next);
           __results.at(__state._M_subexpr).matched = __ret;
           break;
         case _S_opcode_match:
@@ -75,6 +76,30 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
               --__current;
             }
           break;
+        // First fetch the matched result from __results as __submatch;
+        // then compare it with
+        // (__current, __current + (__submatch.second - __submatch.first)).
+        // If matched, keep going; else just return to try another state.
+        case _S_opcode_backref:
+          {
+            auto& __submatch = __results.at(__state._M_backref_index);
+            if (!__submatch.matched)
+              break;
+            auto __last = __current;
+            for (auto __tmp = __submatch.first;
+                 __last != __end && __tmp != __submatch.second;
+                 ++__tmp)
+              ++__last;
+            if (_M_traits.transform(__submatch.first, __submatch.second)
+                == _M_traits.transform(__current, __last))
+              {
+                auto __backup = __current;
+                __current = __last;
+                __ret = _M_dfs<__match_mode>(__state._M_next);
+                __current = __backup;
+              }
+          }
+          break;
         case _S_opcode_accept:
           if (__match_mode)
             __ret = __current == __end;
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/string_backref.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/string_backref.cc
new file mode 100644 (file)
index 0000000..a828fea
--- /dev/null
@@ -0,0 +1,78 @@
+// { dg-options "-std=gnu++11" }
+
+//
+// 2013-08-10  Tim Shen <timshen91@gmail.com>
+//
+// Copyright (C) 2013 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// 28.11.2 regex_match
+// Tests ECMAScript back-reference against a std::string.
+
+#include <regex>
+#include <testsuite_hooks.h>
+
+using namespace std;
+
+void
+test01()
+{
+  bool test __attribute__((unused)) = true;
+
+  regex re("([A-Z])\\1*");
+  smatch m;
+  {
+    string s = "AAAA";
+    regex_match(s, m, re);
+    VERIFY( m[0].matched );
+    VERIFY( m[1].matched );
+    VERIFY( std::string(m[0].first, m[0].second) == "AAAA" );
+    VERIFY( std::string(m[1].first, m[1].second) == "A" );
+  }
+  {
+    string s = "BBBB";
+    regex_match(s, m, re);
+    VERIFY( m[0].matched );
+    VERIFY( m[1].matched );
+    VERIFY( std::string(m[0].first, m[0].second) == "BBBB" );
+    VERIFY( std::string(m[1].first, m[1].second) == "B" );
+  }
+  {
+    string s = "BBBA";
+    regex_match(s, m, re);
+    VERIFY( !m[0].matched );
+    VERIFY( !m[1].matched );
+  }
+  {
+    try
+      {
+        regex re("(a(b)(c\\1(d)))");
+        VERIFY( false );
+      }
+    catch (...)
+      {
+        VERIFY( true );
+      }
+  }
+}
+
+int
+main()
+{
+  test01();
+  return 0;
+}