+2013-08-18 Tim Shen <timshen91@gmail.com>
+
+ * include/bits/regex_automaton.h: _S_opcode_backref.
+ * include/bits/regex_automaton.tcc: Backref automaton support.
+ * include/bits/regex_compiler.tcc: Parsing support.
+ * include/bits/regex_executor.h: Add _M_traits for _DFSExecutor.
+ * include/bits/regex_executor.tcc: Add _S_opcode_backref support.
+ * testsuite/28_regex/algorithms/regex_match/ecma/string_backref.cc: New.
+
2013-08-16 Tim Shen <timshen91@gmail.com>
* include/bits/regex.h (regex_traits<>::transform_primary):
{
_S_opcode_unknown = 0,
_S_opcode_alternative = 1,
+ _S_opcode_backref = 2,
_S_opcode_subexpr_begin = 4,
_S_opcode_subexpr_end = 5,
_S_opcode_match = 100,
typedef int _OpcodeT;
typedef _Matcher<_CharT> _MatcherT;
- _OpcodeT _M_opcode; // type of outgoing transition
- _StateIdT _M_next; // outgoing transition
- _StateIdT _M_alt; // for _S_opcode_alternative
- unsigned int _M_subexpr; // for _S_opcode_subexpr_*
- _MatcherT _M_matches; // for _S_opcode_match
+ _OpcodeT _M_opcode; // type of outgoing transition
+ _StateIdT _M_next; // outgoing transition
+ union // Since they are mutually exclusive.
+ {
+ _StateIdT _M_alt; // for _S_opcode_alternative
+ unsigned int _M_subexpr; // for _S_opcode_subexpr_*
+ unsigned int _M_backref_index; // for _S_opcode_backref
+ };
+ _MatcherT _M_matches; // for _S_opcode_match
explicit _State(_OpcodeT __opcode)
: _M_opcode(__opcode), _M_next(_S_invalid_state_id)
{ }
_State(_OpcodeT __opcode, unsigned __index)
- : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__index)
- { }
+ : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
+ {
+ if (__opcode == _S_opcode_subexpr_begin
+ || __opcode == _S_opcode_subexpr_end)
+ _M_subexpr = __index;
+ else if (__opcode == _S_opcode_backref)
+ _M_backref_index = __index;
+ }
_State(_StateIdT __next, _StateIdT __alt)
: _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
_M_insert_subexpr_begin()
{
auto __id = _M_subexpr_count++;
- _M_paren_stack.push(__id);
+ _M_paren_stack.push_back(__id);
this->push_back(_StateT(_S_opcode_subexpr_begin, __id));
return this->size()-1;
}
_StateIdT
_M_insert_subexpr_end()
{
- this->push_back(_StateT(_S_opcode_subexpr_end, _M_paren_stack.top()));
- _M_paren_stack.pop();
+ this->push_back(_StateT(_S_opcode_subexpr_end, _M_paren_stack.back()));
+ _M_paren_stack.pop_back();
return this->size()-1;
}
- void
- _M_set_backref(bool __b)
- { _M_has_backref = __b; }
+ _StateIdT
+ _M_insert_backref(unsigned int __index);
#ifdef _GLIBCXX_DEBUG
std::ostream&
_M_dot(std::ostream& __ostr) const;
#endif
- _FlagT _M_flags;
- _StateIdT _M_start_state;
- _StateSet _M_accepting_states;
- _SizeT _M_subexpr_count;
- bool _M_has_backref;
- std::stack<unsigned int> _M_paren_stack;
+ _FlagT _M_flags;
+ _StateIdT _M_start_state;
+ _StateSet _M_accepting_states;
+ _SizeT _M_subexpr_count;
+ bool _M_has_backref;
+ std::vector<unsigned int> _M_paren_stack;
};
/// Describes a sequence of one or more %_State, its current start
case _S_opcode_subexpr_end:
ostr << "subexpr end next=" << _M_next << " index=" << _M_subexpr;
break;
+ case _S_opcode_backref:
+ ostr << "backref next=" << _M_next << " index=" << _M_backref_index;
+ break;
case _S_opcode_match:
ostr << "match next=" << _M_next;
break;
<< _M_subexpr << "\"];\n"
<< __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
break;
+ case _S_opcode_backref:
+   // Back-reference state: label the node with the referenced
+   // sub-expression index.  Read it through _M_backref_index, the union
+   // member that is assigned for this opcode, rather than through the
+   // _M_subexpr alias.
+   __ostr << __id << " [label=\"" << __id << "\\nBACKREF "
+          << _M_backref_index << "\"];\n"
+          << __id << " -> " << _M_next << " [label=\"<match>\"];\n";
+   break;
case _S_opcode_match:
__ostr << __id << " [label=\"" << __id << "\\nMATCH\"];\n"
<< __id << " -> " << _M_next << " [label=\"<match>\"];\n";
#endif
template<typename _CharT, typename _TraitsT>
+ _StateIdT _NFA<_CharT, _TraitsT>::
+ _M_insert_backref(unsigned int __index)
+ {
+   // Appends a back-reference state that refers to sub-expression __index
+   // and returns the id of the new state.  An invalid reference is
+   // reported through __throw_regex_error(regex_constants::error_backref).
+   //
+   // A back-reference may only name a sub-expression that has already
+   // been closed.  _M_paren_stack records the sub-expressions that have
+   // been opened but not yet finished.  For example, when parsing
+   // "(a(b)(c\\1(d)))" at '\\1', the unfinished "(a.." (index 1) and
+   // "(c.." (index 3) are on the stack, so "\\2" is valid, but "\\1" and
+   // "\\3" are not.
+   bool __closed = __index < _M_subexpr_count;
+   for (auto __open = _M_paren_stack.begin();
+        __closed && __open != _M_paren_stack.end(); ++__open)
+     __closed = *__open != __index;
+   if (!__closed)
+     __throw_regex_error(regex_constants::error_backref);
+
+   // Remember that this automaton contains at least one back-reference.
+   _M_has_backref = true;
+   this->push_back(_StateT(_S_opcode_backref, __index));
+   return this->size() - 1;
+ }
+
+ template<typename _CharT, typename _TraitsT>
_StateSeq<_CharT, _TraitsT>& _StateSeq<_CharT, _TraitsT>::
operator=(const _StateSeq& __rhs)
{
if (_M_match_token(_ScannerT::_S_token_backref))
{
// __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value);
- _M_state_store._M_set_backref(true);
- //return true;
+ _M_stack.push(_StateSeqT(_M_state_store, _M_state_store.
+ _M_insert_backref(_M_cur_int_value(10))));
+ return true;
}
if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
{
__it.matched = false;
}
- _BiIter _M_current;
- _BiIter _M_end;
+ _BiIter _M_current;
+ _BiIter _M_end;
_ResultsT& _M_results;
- _FlagT _M_flags;
+ _FlagT _M_flags;
};
template<typename _BiIter, typename _Alloc,
public:
typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
typedef _NFA<_CharT, _TraitsT> _RegexT;
- typedef typename _BaseT::_ResultsT _ResultsT;
+ typedef typename _BaseT::_ResultsT _ResultsT;
typedef regex_constants::match_flag_type _FlagT;
_DFSExecutor(_BiIter __begin,
_BiIter __end,
- _ResultsT& __results,
+ _ResultsT& __results,
const _RegexT& __nfa,
_FlagT __flags)
: _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
- _M_nfa(__nfa)
+ _M_traits(_TraitsT()), _M_nfa(__nfa)
{ }
bool
bool
_M_dfs(_StateIdT __i);
+ _TraitsT _M_traits;
const _RegexT& _M_nfa;
};
__ret = _M_dfs<__match_mode>(__state._M_next);
break;
case _S_opcode_subexpr_end:
- __ret = _M_dfs<__match_mode>(__state._M_next);
__results.at(__state._M_subexpr).second = __current;
+ __results.at(__state._M_subexpr).matched = true;
+ __ret = _M_dfs<__match_mode>(__state._M_next);
__results.at(__state._M_subexpr).matched = __ret;
break;
case _S_opcode_match:
--__current;
}
break;
+ // First fetch the matched result from __results as __submatch;
+ // then compare it with
+ // (__current, __current + (__submatch.second - __submatch.first))
+ // If matched, keep going; else just return to try another state.
+ case _S_opcode_backref:
+ {
+ auto& __submatch = __results.at(__state._M_backref_index);
+ if (!__submatch.matched)
+ break;
+ auto __last = __current;
+ for (auto __tmp = __submatch.first;
+ __last != __end && __tmp != __submatch.second;
+ ++__tmp)
+ ++__last;
+ if (_M_traits.transform(__submatch.first, __submatch.second)
+ == _M_traits.transform(__current, __last))
+ {
+ auto __backup = __current;
+ __current = __last;
+ __ret = _M_dfs<__match_mode>(__state._M_next);
+ __current = __backup;
+ }
+ }
+ break;
case _S_opcode_accept:
if (__match_mode)
__ret = __current == __end;
--- /dev/null
+// { dg-options "-std=gnu++11" }
+
+//
+// 2013-08-10 Tim Shen <timshen91@gmail.com>
+//
+// Copyright (C) 2013 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// 28.11.2 regex_match
+// Tests ECMAScript back-reference against a std::string.
+
+#include <regex>
+#include <testsuite_hooks.h>
+
+using namespace std;
+
+// Checks ECMAScript back-reference matching with regex_match on
+// std::string inputs, and that a back-reference to a sub-expression which
+// is still open is rejected when the regex is constructed.
+void
+test01()
+{
+  bool test __attribute__((unused)) = true;
+
+  regex re("([A-Z])\\1*");
+  smatch m;
+  {
+    string s = "AAAA";
+    regex_match(s, m, re);
+    VERIFY( m[0].matched );
+    VERIFY( m[1].matched );
+    VERIFY( std::string(m[0].first, m[0].second) == "AAAA" );
+    VERIFY( std::string(m[1].first, m[1].second) == "A" );
+  }
+  {
+    string s = "BBBB";
+    regex_match(s, m, re);
+    VERIFY( m[0].matched );
+    VERIFY( m[1].matched );
+    VERIFY( std::string(m[0].first, m[0].second) == "BBBB" );
+    VERIFY( std::string(m[1].first, m[1].second) == "B" );
+  }
+  {
+    // The trailing 'A' differs from the captured 'B', so nothing matches.
+    string s = "BBBA";
+    regex_match(s, m, re);
+    VERIFY( !m[0].matched );
+    VERIFY( !m[1].matched );
+  }
+  {
+    // "\\1" is used while group 1 is still open.  Require the specific
+    // error_backref failure instead of accepting any exception; any other
+    // exception type propagates and fails the test.
+    bool caught = false;
+    try
+      {
+        regex bad("(a(b)(c\\1(d)))");
+      }
+    catch (const std::regex_error& e)
+      {
+        caught = true;
+        VERIFY( e.code() == regex_constants::error_backref );
+      }
+    VERIFY( caught );
+  }
+}
+
+// Test driver: runs test01 and returns 0.
+int
+main()
+{
+ test01();
+ return 0;
+}