boost/xpressive/detail/dynamic/parse_charset.hpp

   1 ///////////////////////////////////////////////////////////////////////////////
   2 // parse_charset.hpp
   3 //
   4 //  Copyright 2008 Eric Niebler. Distributed under the Boost
   5 //  Software License, Version 1.0. (See accompanying file
   6 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
   7
   8 #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
   9 #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
  10
  11 // MS compatible compilers support #pragma once
  12 #if defined(_MSC_VER)
  13 # pragma once
  14 #endif
  15
  16 #include <boost/config.hpp>
  17 #include <boost/integer.hpp>
  18 #include <boost/mpl/bool.hpp>
  19 #include <boost/throw_exception.hpp>
  20 #include <boost/numeric/conversion/converter.hpp>
  21 #include <boost/xpressive/detail/detail_fwd.hpp>
  22 #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
  23 #include <boost/xpressive/detail/utility/literals.hpp>
  24 #include <boost/xpressive/detail/utility/chset/chset.hpp>
  25 #include <boost/xpressive/regex_constants.hpp>
  26
  27 namespace boost { namespace xpressive { namespace detail
  28 {
  29
  30 enum escape_type
  31 {
  32     escape_char
  33   , escape_mark
  34   , escape_class
  35 };
  36
  37 ///////////////////////////////////////////////////////////////////////////////
  38 // escape_value
  39 //
  40 template<typename Char, typename Class>
  41 struct escape_value
  42 {
  43     Char ch_;
  44     int mark_nbr_;
  45     Class class_;
  46     escape_type type_;
  47 };
  48
  49 ///////////////////////////////////////////////////////////////////////////////
  50 // char_overflow_handler
  51 //
  52 struct char_overflow_handler
  53 {
  54     void operator ()(numeric::range_check_result result) const // throw(regex_error)
  55     {
  56         if(numeric::cInRange != result)
  57         {
  58             BOOST_THROW_EXCEPTION(
  59                 regex_error(
  60                     regex_constants::error_escape
  61                   , "character escape too large to fit in target character type"
  62                 )
  63             );
  64         }
  65     }
  66 };
  67
  68 ///////////////////////////////////////////////////////////////////////////////
  69 // parse_escape
  70 //
  71 template<typename FwdIter, typename CompilerTraits>
  72 escape_value<typename iterator_value<FwdIter>::type, typename CompilerTraits::regex_traits::char_class_type>
  73 parse_escape(FwdIter &begin, FwdIter end, CompilerTraits &tr)
  74 {
  75     using namespace regex_constants;
  76     typedef typename iterator_value<FwdIter>::type char_type;
  77     typedef typename CompilerTraits::regex_traits regex_traits;
  78     typedef typename regex_traits::char_class_type char_class_type;
  79
  80     // define an unsigned type the same size as char_type
  81     typedef typename boost::uint_t<CHAR_BIT * sizeof(char_type)>::least uchar_t;
  82     BOOST_MPL_ASSERT_RELATION(sizeof(uchar_t), ==, sizeof(char_type));
  83     typedef numeric::conversion_traits<uchar_t, int> converstion_traits;
  84
  85     BOOST_XPR_ENSURE_(begin != end, error_escape, "unexpected end of pattern found");
  86     numeric::converter<int, uchar_t, converstion_traits, char_overflow_handler> converter;
  87     escape_value<char_type,char_class_type> esc = { 0, 0, 0, escape_char };
  88     bool const icase = (0 != (regex_constants::icase_ & tr.flags()));
  89     regex_traits const &rxtraits = tr.traits();
  90     FwdIter tmp;
  91
  92     esc.class_ = rxtraits.lookup_classname(begin, begin + 1, icase);
  93     if(0 != esc.class_)
  94     {
  95         esc.type_ = escape_class;
  96         return esc;
  97     }
  98
  99     if(-1 != rxtraits.value(*begin, 8))
 100     {
 101         esc.ch_ = converter(toi(begin, end, rxtraits, 8, 0777));
 102         return esc;
 103     }
 104
 105     switch(*begin)
 106     {
 107     // bell character
 108     case BOOST_XPR_CHAR_(char_type, 'a'):
 109         esc.ch_ = BOOST_XPR_CHAR_(char_type, '\a');
 110         ++begin;
 111         break;
 112     // escape character
 113     case BOOST_XPR_CHAR_(char_type, 'e'):
 114         esc.ch_ = converter(27);
 115         ++begin;
 116         break;
 117     // control character
 118     case BOOST_XPR_CHAR_(char_type, 'c'):
 119         BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
 120         BOOST_XPR_ENSURE_
 121         (
 122             rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'a'), BOOST_XPR_CHAR_(char_type, 'z'), *begin)
 123          || rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'A'), BOOST_XPR_CHAR_(char_type, 'Z'), *begin)
 124           , error_escape
 125           , "invalid escape control letter; must be one of a-z or A-Z"
 126         );
 127         // Convert to character according to ECMA-262, section 15.10.2.10:
 128         esc.ch_ = converter(*begin % 32);
 129         ++begin;
 130         break;
 131     // formfeed character
 132     case BOOST_XPR_CHAR_(char_type, 'f'):
 133         esc.ch_ = BOOST_XPR_CHAR_(char_type, '\f');
 134         ++begin;
 135         break;
 136     // newline
 137     case BOOST_XPR_CHAR_(char_type, 'n'):
 138         esc.ch_ = BOOST_XPR_CHAR_(char_type, '\n');
 139         ++begin;
 140         break;
 141     // return
 142     case BOOST_XPR_CHAR_(char_type, 'r'):
 143         esc.ch_ = BOOST_XPR_CHAR_(char_type, '\r');
 144         ++begin;
 145         break;
 146     // horizontal tab
 147     case BOOST_XPR_CHAR_(char_type, 't'):
 148         esc.ch_ = BOOST_XPR_CHAR_(char_type, '\t');
 149         ++begin;
 150         break;
 151     // vertical tab
 152     case BOOST_XPR_CHAR_(char_type, 'v'):
 153         esc.ch_ = BOOST_XPR_CHAR_(char_type, '\v');
 154         ++begin;
 155         break;
 156     // hex escape sequence
 157     case BOOST_XPR_CHAR_(char_type, 'x'):
 158         BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
 159         tmp = begin;
 160         esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xff));
 161         BOOST_XPR_ENSURE_(2 == std::distance(tmp, begin), error_escape, "invalid hex escape : "
 162             "must be \\x HexDigit HexDigit");
 163         break;
 164     // Unicode escape sequence
 165     case BOOST_XPR_CHAR_(char_type, 'u'):
 166         BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
 167         tmp = begin;
 168         esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xffff));
 169         BOOST_XPR_ENSURE_(4 == std::distance(tmp, begin), error_escape, "invalid Unicode escape : "
 170             "must be \\u HexDigit HexDigit HexDigit HexDigit");
 171         break;
 172     // backslash
 173     case BOOST_XPR_CHAR_(char_type, '\\'):
 174         //esc.ch_ = BOOST_XPR_CHAR_(char_type, '\\');
 175         //++begin;
 176         //break;
 177     // all other escaped characters represent themselves
 178     default:
 179         esc.ch_ = *begin;
 180         ++begin;
 181         break;
 182     }
 183
 184     return esc;
 185 }
 186
 187 //////////////////////////////////////////////////////////////////////////
 188 // parse_charset
 189 //
 190 template<typename FwdIter, typename RegexTraits, typename CompilerTraits>
 191 inline void parse_charset
 192 (
 193     FwdIter &begin
 194   , FwdIter end
 195   , compound_charset<RegexTraits> &chset
 196   , CompilerTraits &tr
 197 )
 198 {
 199     using namespace regex_constants;
 200     typedef typename RegexTraits::char_type char_type;
 201     typedef typename RegexTraits::char_class_type char_class_type;
 202     BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
 203     RegexTraits const &rxtraits = tr.traits();
 204     bool const icase = (0 != (regex_constants::icase_ & tr.flags()));
 205     FwdIter iprev = FwdIter();
 206     escape_value<char_type, char_class_type> esc = {0, 0, 0, escape_char};
 207     bool invert = false;
 208
 209     // check to see if we have an inverse charset
 210     if(begin != end && token_charset_invert == tr.get_charset_token(iprev = begin, end))
 211     {
 212         begin = iprev;
 213         invert = true;
 214     }
 215
 216     // skip the end token if-and-only-if it is the first token in the charset
 217     if(begin != end && token_charset_end == tr.get_charset_token(iprev = begin, end))
 218     {
 219         for(; begin != iprev; ++begin)
 220         {
 221             chset.set_char(*begin, rxtraits, icase);
 222         }
 223     }
 224
 225     compiler_token_type tok;
 226     char_type ch_prev = char_type(), ch_next = char_type();
 227     bool have_prev = false;
 228
 229     BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
 230
 231     // remember the current position and grab the next token
 232     iprev = begin;
 233     tok = tr.get_charset_token(begin, end);
 234     do
 235     {
 236         BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
 237
 238         if(token_charset_hyphen == tok && have_prev)
 239         {
 240             // remember the current position
 241             FwdIter iprev2 = begin;
 242             have_prev = false;
 243
 244             // ch_prev is lower bound of a range
 245             switch(tr.get_charset_token(begin, end))
 246             {
 247             case token_charset_hyphen:
 248             case token_charset_invert:
 249                 begin = iprev2; // un-get these tokens and fall through
 250                 BOOST_FALLTHROUGH;
 251             case token_literal:
 252                 ch_next = *begin++;
 253                 BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range");
 254                 chset.set_range(ch_prev, ch_next, rxtraits, icase);
 255                 continue;
 256             case token_charset_backspace:
 257                 ch_next = char_type(8); // backspace
 258                 BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range");
 259                 chset.set_range(ch_prev, ch_next, rxtraits, icase);
 260                 continue;
 261             case token_escape:
 262                 esc = parse_escape(begin, end, tr);
 263                 if(escape_char == esc.type_)
 264                 {
 265                     BOOST_XPR_ENSURE_(ch_prev <= esc.ch_, error_range, "invalid charset range");
 266                     chset.set_range(ch_prev, esc.ch_, rxtraits, icase);
 267                     continue;
 268                 }
 269                 BOOST_FALLTHROUGH;
 270             case token_charset_end:
 271             default:                // not a range.
 272                 begin = iprev;      // backup to hyphen token
 273                 chset.set_char(ch_prev, rxtraits, icase);
 274                 chset.set_char(*begin++, rxtraits, icase);
 275                 continue;
 276             }
 277         }
 278
 279         if(have_prev)
 280         {
 281             chset.set_char(ch_prev, rxtraits, icase);
 282             have_prev = false;
 283         }
 284
 285         switch(tok)
 286         {
 287         case token_charset_hyphen:
 288         case token_charset_invert:
 289         case token_charset_end:
 290         case token_posix_charset_end:
 291             begin = iprev; // un-get these tokens
 292             ch_prev = *begin++;
 293             have_prev = true;
 294             continue;
 295
 296         case token_charset_backspace:
 297             ch_prev = char_type(8); // backspace
 298             have_prev = true;
 299             continue;
 300
 301         case token_posix_charset_begin:
 302             {
 303                 FwdIter tmp = begin, start = begin;
 304                 bool invert = (token_charset_invert == tr.get_charset_token(tmp, end));
 305                 if(invert)
 306                 {
 307                     begin = start = tmp;
 308                 }
 309                 while(token_literal == (tok = tr.get_charset_token(begin, end)))
 310                 {
 311                     tmp = ++begin;
 312                     BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
 313                 }
 314                 if(token_posix_charset_end == tok)
 315                 {
 316                     char_class_type chclass = rxtraits.lookup_classname(start, tmp, icase);
 317                     BOOST_XPR_ENSURE_(0 != chclass, error_ctype, "unknown class name");
 318                     chset.set_class(chclass, invert);
 319                     continue;
 320                 }
 321                 begin = iprev; // un-get this token
 322                 ch_prev = *begin++;
 323                 have_prev = true;
 324             }
 325             continue;
 326
 327         case token_escape:
 328             esc = parse_escape(begin, end, tr);
 329             if(escape_char == esc.type_)
 330             {
 331                 ch_prev = esc.ch_;
 332                 have_prev = true;
 333             }
 334             else if(escape_class == esc.type_)
 335             {
 336                 char_class_type upper_ = lookup_classname(rxtraits, "upper");
 337                 BOOST_ASSERT(0 != upper_);
 338                 chset.set_class(esc.class_, rxtraits.isctype(*begin++, upper_));
 339             }
 340             else
 341             {
 342                 BOOST_ASSERT(false);
 343             }
 344             continue;
 345
 346         default:
 347             ch_prev = *begin++;
 348             have_prev = true;
 349             continue;
 350         }
 351     }
 352     while(BOOST_XPR_ENSURE_((iprev = begin) != end, error_brack, "unexpected end of pattern found"),
 353           token_charset_end != (tok = tr.get_charset_token(begin, end)));
 354
 355     if(have_prev)
 356     {
 357         chset.set_char(ch_prev, rxtraits, icase);
 358     }
 359
 360     if(invert)
 361     {
 362         chset.inverse();
 363     }
 364 }
 365
 366 }}} // namespace boost::xpressive::detail
 367
 368 #endif