1 ///////////////////////////////////////////////////////////////////////////////
4 // Copyright 2008 Eric Niebler. Distributed under the Boost
5 // Software License, Version 1.0. (See accompanying file
6 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
8 #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
9 #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
11 // MS compatible compilers support #pragma once
16 #include <boost/config.hpp>
17 #include <boost/integer.hpp>
18 #include <boost/mpl/bool.hpp>
19 #include <boost/throw_exception.hpp>
20 #include <boost/numeric/conversion/converter.hpp>
21 #include <boost/xpressive/detail/detail_fwd.hpp>
22 #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
23 #include <boost/xpressive/detail/utility/literals.hpp>
24 #include <boost/xpressive/detail/utility/chset/chset.hpp>
25 #include <boost/xpressive/regex_constants.hpp>
27 namespace boost { namespace xpressive { namespace detail
37 ///////////////////////////////////////////////////////////////////////////////
40 template<typename Char, typename Class>
49 ///////////////////////////////////////////////////////////////////////////////
50 // char_overflow_handler
// Overflow policy handed to numeric::converter by parse_escape. It receives the
// outcome of a range check and rejects any value that is not in range for the
// target character type.
52 struct char_overflow_handler
// Does nothing when `result` is numeric::cInRange. For any other result it
// raises, through BOOST_THROW_EXCEPTION, an error tagged with
// regex_constants::error_escape (a regex_error according to the trailing
// comment on the signature).
54 void operator ()(numeric::range_check_result result) const // throw(regex_error)
56 if(numeric::cInRange != result)
58 BOOST_THROW_EXCEPTION(
60 regex_constants::error_escape
61 , "character escape too large to fit in target character type"
68 ///////////////////////////////////////////////////////////////////////////////
// Decodes one escape sequence and reports it as an escape_value.
//
//   begin  in/out cursor. The code dispatches on *begin, so it is expected to
//          address the character that follows the escape introducer.
//          NOTE(review): the introducer is presumably consumed by the caller --
//          confirm against the call sites.
//   end    end of the pattern.
//   tr     compiler traits; supplies flags() (tested for icase_) and traits().
//
// Result: type_ is escape_class with class_ filled in when the character at
// begin names a character class; otherwise type_ keeps its initial value
// escape_char and ch_ holds the decoded character.
//
// Errors: error_escape is raised through BOOST_XPR_ENSURE_ when the pattern ends
// prematurely, when a control letter is not in a-z / A-Z, or when a hex or
// Unicode escape does not have the required number of digits. Numeric escapes
// are passed through a converter whose overflow policy is char_overflow_handler.
71 template<typename FwdIter, typename CompilerTraits>
72 escape_value<typename iterator_value<FwdIter>::type, typename CompilerTraits::regex_traits::char_class_type>
73 parse_escape(FwdIter &begin, FwdIter end, CompilerTraits &tr)
75 using namespace regex_constants;
76 typedef typename iterator_value<FwdIter>::type char_type;
77 typedef typename CompilerTraits::regex_traits regex_traits;
78 typedef typename regex_traits::char_class_type char_class_type;
80 // define an unsigned type the same size as char_type
81 typedef typename boost::uint_t<CHAR_BIT * sizeof(char_type)>::least uchar_t;
// Compile-time guard: the "least" type selected above must not be wider than char_type.
82 BOOST_MPL_ASSERT_RELATION(sizeof(uchar_t), ==, sizeof(char_type));
// The spelling "converstion_traits" is the identifier actually used below; any
// rename has to change both sites together.
83 typedef numeric::conversion_traits<uchar_t, int> converstion_traits;
85 BOOST_XPR_ENSURE_(begin != end, error_escape, "unexpected end of pattern found");
// Numeric conversion applied to every numeric escape below, with
// char_overflow_handler as its overflow policy.
// NOTE(review): the converter is declared with <int, uchar_t> while the traits
// are <uchar_t, int>; presumably the traits decide the real source/target
// types -- confirm against the Boost.NumericConversion documentation.
86 numeric::converter<int, uchar_t, converstion_traits, char_overflow_handler> converter;
// Default result: zeroed fields, tagged as a plain character escape.
87 escape_value<char_type,char_class_type> esc = { 0, 0, 0, escape_char };
88 bool const icase = (0 != (regex_constants::icase_ & tr.flags()));
89 regex_traits const &rxtraits = tr.traits();
// First try the single character at begin as a character-class name.
// NOTE(review): `begin + 1` needs more than a forward iterator, despite the
// FwdIter parameter name.
92 esc.class_ = rxtraits.lookup_classname(begin, begin + 1, icase);
// Marks the result as a class escape. Presumably only reached when the lookup
// above succeeded; the guarding condition is not visible in this excerpt.
95 esc.type_ = escape_class;
// A value other than -1 from value(*begin, 8) is taken to mean that *begin is
// an octal digit, in which case an octal escape is parsed with toi (radix 8;
// 0777 is presumably the largest accepted value -- confirm toi's contract).
99 if(-1 != rxtraits.value(*begin, 8))
101 esc.ch_ = converter(toi(begin, end, rxtraits, 8, 0777));
// 'a' decodes to the alert character.
108 case BOOST_XPR_CHAR_(char_type, 'a'):
109 esc.ch_ = BOOST_XPR_CHAR_(char_type, '\a');
// 'e' decodes to character code 27 (ESC in ASCII).
113 case BOOST_XPR_CHAR_(char_type, 'e'):
114 esc.ch_ = converter(27);
// 'c' introduces a control escape: step to the following character, which
// must exist and must be a letter in a-z or A-Z.
118 case BOOST_XPR_CHAR_(char_type, 'c'):
119 BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
122 rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'a'), BOOST_XPR_CHAR_(char_type, 'z'), *begin)
123 || rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'A'), BOOST_XPR_CHAR_(char_type, 'Z'), *begin)
125 , "invalid escape control letter; must be one of a-z or A-Z"
127 // Convert to character according to ECMA-262, section 15.10.2.10:
// The remainder modulo 32 is the control code; in an ASCII-compatible
// encoding 'A'/'a' gives 1 and 'Z'/'z' gives 26.
128 esc.ch_ = converter(*begin % 32);
131 // formfeed character
132 case BOOST_XPR_CHAR_(char_type, 'f'):
133 esc.ch_ = BOOST_XPR_CHAR_(char_type, '\f');
// 'n' decodes to newline.
137 case BOOST_XPR_CHAR_(char_type, 'n'):
138 esc.ch_ = BOOST_XPR_CHAR_(char_type, '\n');
// 'r' decodes to carriage return.
142 case BOOST_XPR_CHAR_(char_type, 'r'):
143 esc.ch_ = BOOST_XPR_CHAR_(char_type, '\r');
// 't' decodes to horizontal tab.
147 case BOOST_XPR_CHAR_(char_type, 't'):
148 esc.ch_ = BOOST_XPR_CHAR_(char_type, '\t');
// 'v' decodes to vertical tab.
152 case BOOST_XPR_CHAR_(char_type, 'v'):
153 esc.ch_ = BOOST_XPR_CHAR_(char_type, '\v');
156 // hex escape sequence
157 case BOOST_XPR_CHAR_(char_type, 'x'):
158 BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
// toi advances begin past the digits it consumes (radix 16, limit 0xff); the
// check that follows requires exactly two characters between tmp and begin.
// NOTE(review): tmp is presumably the position saved just before this call;
// its assignment is not visible in this excerpt.
160 esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xff));
161 BOOST_XPR_ENSURE_(2 == std::distance(tmp, begin), error_escape, "invalid hex escape : "
162 "must be \\x HexDigit HexDigit");
164 // Unicode escape sequence
165 case BOOST_XPR_CHAR_(char_type, 'u'):
166 BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
// Same scheme as the hex escape, but with limit 0xffff and exactly four
// characters required between tmp and begin.
168 esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xffff));
169 BOOST_XPR_ENSURE_(4 == std::distance(tmp, begin), error_escape, "invalid Unicode escape : "
170 "must be \\u HexDigit HexDigit HexDigit HexDigit");
// An escaped backslash has no handling of its own: the assignment is commented
// out. NOTE(review): it presumably falls through to the generic handling
// announced by the comment below -- confirm.
173 case BOOST_XPR_CHAR_(char_type, '\\'):
174 //esc.ch_ = BOOST_XPR_CHAR_(char_type, '\\');
177 // all other escaped characters represent themselves
187 //////////////////////////////////////////////////////////////////////////
// Parses the contents of a character set from the pattern and records the
// characters, ranges and character classes it finds in `chset`.
//
//   chset  receives the parsed set via set_char / set_range / set_class.
//   begin, end and tr are used throughout the body, but their declarations are
//   not visible in this excerpt.
//
// Errors raised through BOOST_XPR_ENSURE_: error_brack when the pattern ends
// prematurely, error_range when a range's lower bound is greater than its upper
// bound, and error_ctype when a POSIX class name is unknown. Escapes are
// delegated to parse_escape.
190 template<typename FwdIter, typename RegexTraits, typename CompilerTraits>
191 inline void parse_charset
195 , compound_charset<RegexTraits> &chset
199 using namespace regex_constants;
200 typedef typename RegexTraits::char_type char_type;
201 typedef typename RegexTraits::char_class_type char_class_type;
202 BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
203 RegexTraits const &rxtraits = tr.traits();
204 bool const icase = (0 != (regex_constants::icase_ & tr.flags()));
// iprev is the scratch/saved position used to look ahead and to un-get tokens.
205 FwdIter iprev = FwdIter();
206 escape_value<char_type, char_class_type> esc = {0, 0, 0, escape_char};
209 // check to see if we have an inverse charset
// The tokenizer is handed iprev (freshly assigned from begin), so this is a
// look-ahead: begin itself is left where it was.
210 if(begin != end && token_charset_invert == tr.get_charset_token(iprev = begin, end))
216 // skip the end token if-and-only-if it is the first token in the charset
217 if(begin != end && token_charset_end == tr.get_charset_token(iprev = begin, end))
// Walk begin forward to iprev, adding each character passed over as a literal
// member of the set. NOTE(review): this relies on get_charset_token having
// advanced iprev past the token -- confirm.
219 for(; begin != iprev; ++begin)
221 chset.set_char(*begin, rxtraits, icase);
// Loop state: the current token, the pending lower/upper characters, and
// whether ch_prev currently holds a character that has not been added yet.
225 compiler_token_type tok;
226 char_type ch_prev = char_type(), ch_next = char_type();
227 bool have_prev = false;
229 BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
231 // remember the current position and grab the next token
233 tok = tr.get_charset_token(begin, end);
236 BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
// A hyphen that follows a pending character may introduce a range.
238 if(token_charset_hyphen == tok && have_prev)
240 // remember the current position
241 FwdIter iprev2 = begin;
244 // ch_prev is lower bound of a range
245 switch(tr.get_charset_token(begin, end))
247 case token_charset_hyphen:
248 case token_charset_invert:
249 begin = iprev2; // un-get these tokens and fall through
// Add the range ch_prev..ch_next after checking that the bounds are ordered.
// NOTE(review): the statement that sets ch_next for this path is not visible
// in this excerpt.
253 BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range");
254 chset.set_range(ch_prev, ch_next, rxtraits, icase);
// Upper bound given as the backspace token.
256 case token_charset_backspace:
257 ch_next = char_type(8); // backspace
258 BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range");
259 chset.set_range(ch_prev, ch_next, rxtraits, icase);
// Upper bound given as an escape: a range is only added when the escape
// decodes to a single character.
262 esc = parse_escape(begin, end, tr);
263 if(escape_char == esc.type_)
265 BOOST_XPR_ENSURE_(ch_prev <= esc.ch_, error_range, "invalid charset range");
266 chset.set_range(ch_prev, esc.ch_, rxtraits, icase);
270 case token_charset_end:
271 default: // not a range.
272 begin = iprev; // backup to hyphen token
// Add the pending character, then the character at begin (the hyphen, per the
// comment above) as a literal, stepping past it.
273 chset.set_char(ch_prev, rxtraits, icase);
274 chset.set_char(*begin++, rxtraits, icase);
// Adds the pending character to the set. NOTE(review): presumably guarded by
// have_prev; the guard is not visible in this excerpt.
281 chset.set_char(ch_prev, rxtraits, icase);
287 case token_charset_hyphen:
288 case token_charset_invert:
289 case token_charset_end:
290 case token_posix_charset_end:
291 begin = iprev; // un-get these tokens
296 case token_charset_backspace:
297 ch_prev = char_type(8); // backspace
// POSIX-style named class. `start` marks where the name begins; `tmp` is a
// second cursor used first to look ahead for an invert token.
301 case token_posix_charset_begin:
303 FwdIter tmp = begin, start = begin;
304 bool invert = (token_charset_invert == tr.get_charset_token(tmp, end));
// Consume literal tokens until something else shows up.
309 while(token_literal == (tok = tr.get_charset_token(begin, end)))
312 BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
// Properly terminated: the name is the text in [start, tmp) and must resolve
// to a non-zero class. NOTE(review): tmp is presumably updated inside the loop
// above to mark the end of the name -- the update is not visible here.
314 if(token_posix_charset_end == tok)
316 char_class_type chclass = rxtraits.lookup_classname(start, tmp, icase);
317 BOOST_XPR_ENSURE_(0 != chclass, error_ctype, "unknown class name");
318 chset.set_class(chclass, invert);
321 begin = iprev; // un-get this token
// Escape inside the set: decode it, then handle a plain character and a class
// escape separately.
328 esc = parse_escape(begin, end, tr);
329 if(escape_char == esc.type_)
334 else if(escape_class == esc.type_)
// The escaped class is added with its invert flag set to "the character at
// begin is an upper-case character"; that character is consumed by the
// post-increment.
336 char_class_type upper_ = lookup_classname(rxtraits, "upper");
337 BOOST_ASSERT(0 != upper_);
338 chset.set_class(esc.class_, rxtraits.isctype(*begin++, upper_));
// Loop condition: save the current position in iprev, fail with error_brack if
// the pattern is exhausted, then fetch the next token and keep looping until
// it is the end-of-set token.
352 while(BOOST_XPR_ENSURE_((iprev = begin) != end, error_brack, "unexpected end of pattern found"),
353 token_charset_end != (tok = tr.get_charset_token(begin, end)));
// Adds the last pending character. NOTE(review): presumably guarded by
// have_prev; the guard is not visible in this excerpt.
357 chset.set_char(ch_prev, rxtraits, icase);
366 }}} // namespace boost::xpressive::detail