libs/spirit/example/lex/example5.cpp

   1 //  Copyright (c) 2001-2010 Hartmut Kaiser
   2 //
   3 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
   4 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
   5
   6 //  This example shows how to create a simple lexer recognizing a couple of
   7 //  different tokens aimed at a simple language and how to use this lexer with
   8 //  a grammar. It shows how to associate attributes to tokens and how to access the
   9 //  token attributes from inside the grammar.
  10 //
  11 //  Additionally, this example demonstrates how to define a token set usable
  12 //  as the skip parser during parsing, which allows several tokens to be
  13 //  ignored.
  14 //
  15 //  The main purpose of this example is to show how inheritance can be used to
  16 //  overload parts of a base grammar and add token definitions to a base lexer.
  17 //
  18 //  Further, it shows how you can use the 'omit' attribute type specifier
  19 //  for token definitions to force the token to have no attribute (expose an
  20 //  unused attribute).
  21 //
  22 //  This example recognizes a very simple programming language having
  23 //  assignment statements and if and while control structures. Look at the file
  24 //  example5.input for an example.
  25
  26 #include <boost/config/warning_disable.hpp>
  27 #include <boost/spirit/include/qi.hpp>
  28 #include <boost/spirit/include/lex_lexertl.hpp>
  29 #include <boost/spirit/include/phoenix_operator.hpp>
  30
  31 #include <iostream>
  32 #include <fstream>
  33 #include <string>
  34
  35 #include "example.hpp"
  36
  37 using namespace boost::spirit;
  38 using boost::phoenix::val;
  39
  40 ///////////////////////////////////////////////////////////////////////////////
  41 //  Token definition base, defines all tokens for the base grammar below
     //
     //  Nothing is registered with the lexer on construction: the constructor
     //  is empty, and a derived lexer is expected to call
     //  init_token_definitions() itself (example5_tokens does so after adding
     //  its own token first).
  42 ///////////////////////////////////////////////////////////////////////////////
  43 template <typename Lexer>
  44 struct example5_base_tokens : lex::lexer<Lexer>
  45 {
  46 protected:
  47     // this lexer is supposed to be used as a base type only
  48     example5_base_tokens() {}
  49
  50 public:
         // Assigns the patterns of the base tokens and adds them, together with
         // the single character tokens and the "WS" skip tokens, to this lexer.
  51     void init_token_definitions()
  52     {
  53         // define the tokens to match
  54         identifier = "[a-zA-Z_][a-zA-Z0-9_]*";
  55         constant = "[0-9]+";
  56         if_ = "if";
  57         while_ = "while";
  58
  59         // associate the tokens and the token set with the lexer
             // (the first operand is wrapped in token_def<> explicitly; a bare
             // '(' | ')' would be plain integer arithmetic on two chars)
  60         this->self += lex::token_def<>('(') | ')' | '{' | '}' | '=' | ';' | constant;
             // the identifier pattern also matches "if" and "while"; the keywords
             // are added ahead of it, the same ordering example5_tokens relies on
             // for its 'else' token
  61         this->self += if_ | while_ | identifier;
  62
  63         // define the whitespace to ignore (spaces, tabs, newlines and C-style
  64         // comments)
             // these definitions are added to the separate lexer state "WS", not
             // to the token set used above
  65         this->self("WS")
  66             =   lex::token_def<>("[ \\t\\n]+")
  67             |   "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"
  68             ;
  69     }
  70
  71     // these tokens have no attribute
  72     lex::token_def<lex::omit> if_, while_;
  73
  74     // The following two tokens have an associated attribute type, 'identifier'
  75     // carries a string (the identifier name) and 'constant' carries the
  76     // matched integer value.
  77     //
  78     // Note: any token attribute type explicitly specified in a token_def<>
  79     //       declaration needs to be listed during token type definition as
  80     //       well (see the typedef for the token_type below).
  81     //
  82     // The conversion of the matched input to an instance of this type occurs
  83     // once (on first access), which makes token attributes as efficient as
  84     // possible. Moreover, token instances are constructed once by the lexer
  85     // library. From this point on tokens are passed by reference only,
  86     // avoiding them being copied around.
  87     lex::token_def<std::string> identifier;
  88     lex::token_def<unsigned int> constant;
  89 };
  90
  91 ///////////////////////////////////////////////////////////////////////////////
  92 //  Grammar definition base, defines a basic language
     //
     //  The start rule is 'program'. The token definition object handed to the
     //  constructor has to provide the members identifier, constant, if_ and
     //  while_. All rules are public data members, which is what allows
     //  example5_grammar below to replace the definition of if_stmt.
  93 ///////////////////////////////////////////////////////////////////////////////
  94 template <typename Iterator, typename Lexer>
  95 struct example5_base_grammar
  96   : qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
  97 {
  98     template <typename TokenDef>
  99     example5_base_grammar(TokenDef const& tok)
 100       : example5_base_grammar::base_type(program)
 101     {
 102         using boost::spirit::_val;
 103
             // a program is one or more blocks
 104         program
 105             =  +block
 106             ;
 107
             // a block is a brace enclosed, possibly empty list of statements
 108         block
 109             =   '{' >> *statement >> '}'
 110             ;
 111
 112         statement
 113             =   assignment
 114             |   if_stmt
 115             |   while_stmt
 116             ;
 117
             // NOTE(review): _1 presumably refers to the string attribute of
             // tok.identifier, the first attribute of the sequence -- confirm
 118         assignment
 119             =   (tok.identifier >> '=' >> expression >> ';')
 120                 [
 121                     std::cout << val("assignment statement to: ") << _1 << "\n"
 122                 ]
 123             ;
 124
             // NOTE(review): tok.if_ is declared with lex::omit, so _1 is
             // presumably the attribute of 'expression' here -- confirm
 125         if_stmt
 126             =   (tok.if_ >> '(' >> expression >> ')' >> block)
 127                 [
 128                     std::cout << val("if expression: ") << _1 << "\n"
 129                 ]
 130             ;
 131
             // NOTE(review): as for if_stmt, _1 is presumably the attribute of
             // 'expression' since tok.while_ is declared with lex::omit
 132         while_stmt
 133             =   (tok.while_ >> '(' >> expression >> ')' >> block)
 134                 [
 135                     std::cout << val("while expression: ") << _1 << "\n"
 136                 ]
 137             ;
 138
 139         //  since expression has a variant return type accommodating for
 140         //  std::string and unsigned integer, both possible values may be
 141         //  returned to the calling rule
 142         expression
 143             =   tok.identifier [ _val = _1 ]
 144             |   tok.constant   [ _val = _1 ]
 145             ;
 146     }
 147
         // skipper used by the grammar and by every rule below
 148     typedef qi::in_state_skipper<Lexer> skipper_type;
 149
 150     qi::rule<Iterator, skipper_type> program, block, statement;
 151     qi::rule<Iterator, skipper_type> assignment, if_stmt;
 152     qi::rule<Iterator, skipper_type> while_stmt;
 153
 154     //  the expression is the only rule having a return value
 155     typedef boost::variant<unsigned int, std::string> expression_type;
 156     qi::rule<Iterator, expression_type(), skipper_type>  expression;
 157 };
 158
 159 ///////////////////////////////////////////////////////////////////////////////
 160 //  Token definition for derived lexer, defines additional tokens
     //
     //  Adds the 'else' keyword to the tokens of example5_base_tokens. The
     //  constructor registers 'else' first and only then asks the base class
     //  to add its own definitions.
 161 ///////////////////////////////////////////////////////////////////////////////
 162 template <typename Lexer>
 163 struct example5_tokens : example5_base_tokens<Lexer>
 164 {
 165     typedef example5_base_tokens<Lexer> base_type;
 166
 167     example5_tokens()
 168     {
 169         // define the additional token to match
 170         else_ = "else";
 171
 172         // associate the new token with the lexer, note we add 'else' before
 173         // anything else to add it to the token set before the identifier
 174         // token, otherwise "else" would be matched as an identifier
             // (the base class constructor is empty, so nothing has been added
             // to the lexer before this statement)
 175         this->self = else_;
 176
 177         // now add the token definitions from the base class
 178         this->base_type::init_token_definitions();
 179     }
 180
 181     // this token has no attribute
 182     lex::token_def<lex::omit> else_;
 183 };
 184
 185 ///////////////////////////////////////////////////////////////////////////////
 186 //  Derived grammar definition, defines a language extension
     //
     //  Extends the if statement of example5_base_grammar by an optional
     //  'else' branch. The token definition object has to provide an else_
     //  member in addition to the members used by the base grammar.
 187 ///////////////////////////////////////////////////////////////////////////////
 188 template <typename Iterator, typename Lexer>
 189 struct example5_grammar : example5_base_grammar<Iterator, Lexer>
 190 {
 191     template <typename TokenDef>
 192     example5_grammar(TokenDef const& tok)
 193       : example5_base_grammar<Iterator, Lexer>(tok)
 194     {
 195         // we alter the if_stmt only
             // NOTE(review): copy() presumably embeds the definition installed
             // by the base class by value; naming this->if_stmt directly on the
             // right hand side would make the rule refer to itself -- confirm
 196         this->if_stmt
 197             =   this->if_stmt.copy() >> -(tok.else_ >> this->block)
 198             ;
 199     }
 200 };
 201
 202 ///////////////////////////////////////////////////////////////////////////////
     //  Tokenizes and parses the contents of "example5.input" and reports on
     //  std::cout whether parsing succeeded.
 203 int main()
 204 {
 205     // iterator type used to expose the underlying input stream
 206     typedef std::string::iterator base_iterator_type;
 207
 208     // This is the lexer token type to use. The second template parameter lists
 209     // all attribute types used for token_def's during token definition (see
 210     // example5_base_tokens<> above). Here we use the predefined lexertl token
 211     // type, but any compatible token type may be used instead.
 212     //
 213     // If you don't list any token attribute types in the following declaration
 214     // (or just use the default token type: lexertl_token<base_iterator_type>)
 215     // it will compile and work just fine, just a bit less efficient. This is
 216     // because the token attribute will be generated from the matched input
 217     // sequence every time it is requested. But as soon as you specify at
 218     // least one token attribute type you'll have to list all attribute types
 219     // used for token_def<> declarations in the token definition class above,
 220     // otherwise compilation errors will occur.
 221     typedef lex::lexertl::token<
 222         base_iterator_type, boost::mpl::vector<unsigned int, std::string>
 223     > token_type;
 224
 225     // Here we use the lexertl based lexer engine.
 226     typedef lex::lexertl::lexer<token_type> lexer_type;
 227
 228     // This is the token definition type (derived from the given lexer type).
         // From here on the name example5_tokens denotes this typedef inside
         // main(), not the class template of the same name.
 229     typedef example5_tokens<lexer_type> example5_tokens;
 230
 231     // this is the iterator type exposed by the lexer
 232     typedef example5_tokens::iterator_type iterator_type;
 233
 234     // this is the type of the grammar to parse
 235     typedef example5_grammar<iterator_type, example5_tokens::lexer_def> example5_grammar;
 236
 237     // now we use the types defined above to create the lexer and grammar
 238     // object instances needed to invoke the parsing process
 239     example5_tokens tokens;                         // Our lexer
 240     example5_grammar calc(tokens);                  // Our parser
 241
         // read_from_file is not defined in this file; presumably it is
         // provided by example.hpp and returns the file contents
 242     std::string str (read_from_file("example5.input"));
 243
 244     // At this point we generate the iterator pair used to expose the
 245     // tokenized input stream.
 246     std::string::iterator it = str.begin();
 247     iterator_type iter = tokens.begin(it, str.end());
 248     iterator_type end = tokens.end();
 249
 250     // Parsing is done based on the token stream, not the character
 251     // stream read from the input.
 252     // Note how we use the lexer defined above as the skip parser. It must
 253     // be explicitly wrapped inside a state directive, switching the lexer
 254     // state for the duration of skipping whitespace.
 255     std::string ws("WS");
 256     bool r = qi::phrase_parse(iter, end, calc, qi::in_state(ws)[tokens.self]);
 257
         // success requires both a successful parse and that the token
         // iterator has reached the end
 258     if (r && iter == end)
 259     {
 260         std::cout << "-------------------------\n";
 261         std::cout << "Parsing succeeded\n";
 262         std::cout << "-------------------------\n";
 263     }
 264     else
 265     {
 266         std::cout << "-------------------------\n";
 267         std::cout << "Parsing failed\n";
 268         std::cout << "-------------------------\n";
 269     }
 270
 271     std::cout << "Bye... :-) \n\n";
         // the exit status is 0 whether or not parsing succeeded
 272     return 0;
 273 }