1 // Copyright (c) 2001-2010 Hartmut Kaiser
3 // Distributed under the Boost Software License, Version 1.0. (See accompanying
4 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 // This example shows how to create a simple lexer recognizing a couple of
7 // different tokens aimed at a simple language and how to use this lexer with
8 // a grammar. It shows how to associate attributes to tokens and how to access the
9 // token attributes from inside the grammar.
11 // Additionally, this example demonstrates how to define a token set usable
12 // as the skip parser during parsing, allowing several tokens to be ignored.
15 // The main purpose of this example is to show how inheritance can be used to
16 // overload parts of a base grammar and add token definitions to a base lexer.
18 // Further, it shows how you can use the 'omit' attribute type specifier
19 // for token definitions to force the token to have no attribute (expose an unused attribute).
22 // This example recognizes a very simple programming language having
23 // assignment statements and if and while control structures. Look at the file
24 // example5.input for an example.
26 #include <boost/config/warning_disable.hpp>
27 #include <boost/spirit/include/qi.hpp>
28 #include <boost/spirit/include/lex_lexertl.hpp>
29 #include <boost/spirit/include/phoenix_operator.hpp>
35 #include "example.hpp"
37 using namespace boost::spirit;
38 using boost::phoenix::val;
40 ///////////////////////////////////////////////////////////////////////////////
41 // Token definition base, defines all tokens for the base grammar below
42 ///////////////////////////////////////////////////////////////////////////////
// Base lexer definition. Declares the token definitions used by the base
// grammar (the attribute-less keywords if_ and while_, plus identifier and
// constant) and registers them with the lexer in init_token_definitions().
43 template <typename Lexer>
44 struct example5_base_tokens : lex::lexer<Lexer>
47 // this lexer is supposed to be used as a base type only
48 example5_base_tokens() {}
// Assigns the token patterns and adds the tokens to the lexer. The
// constructor above has an empty body and does not call this function; the
// derived lexer in this file invokes it explicitly.
51 void init_token_definitions()
53 // define the tokens to match
54 identifier = "[a-zA-Z_][a-zA-Z0-9_]*";
59 // associate the tokens and the token set with the lexer
// (the single-character tokens are written as character literals and
// 'constant' is registered in the same statement)
60 this->self += lex::token_def<>('(') | ')' | '{' | '}' | '=' | ';' | constant;
// NOTE(review): the keywords are added ahead of 'identifier', presumably so
// that keyword text is not matched by the more general identifier pattern.
61 this->self += if_ | while_ | identifier;
63 // define the whitespace to ignore (spaces, tabs, newlines and C-style
// comments): the first alternative matches a run of blanks, tabs and
// newlines, the second one matches a C-style block comment
66 = lex::token_def<>("[ \\t\\n]+")
67 | "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"
71 // these tokens have no attribute
72 lex::token_def<lex::omit> if_, while_;
74 // The following two tokens have an associated attribute type, 'identifier'
75 // carries a string (the identifier name) and 'constant' carries the
76 // matched integer value.
78 // Note: any token attribute type explicitly specified in a token_def<>
79 // declaration needs to be listed during token type definition as
80 // well (see the typedef for the token_type below).
82 // The conversion of the matched input to an instance of this type occurs
83 // once (on first access), which makes token attributes as efficient as
84 // possible. Moreover, token instances are constructed once by the lexer
85 // library. From this point on tokens are passed by reference only,
86 // avoiding them being copied around.
87 lex::token_def<std::string> identifier;
88 lex::token_def<unsigned int> constant;
91 ///////////////////////////////////////////////////////////////////////////////
92 // Grammar definition base, defines a basic language
93 ///////////////////////////////////////////////////////////////////////////////
// Base grammar. Operates on the token stream (Iterator) and skips input via
// qi::in_state_skipper<Lexer>. 'program' is the start rule handed to the
// grammar base class; 'expression' is the only rule that exposes an
// attribute (a variant of unsigned int and std::string).
94 template <typename Iterator, typename Lexer>
95 struct example5_base_grammar
96 : qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
// 'tok' is the token definition object; the rule definitions below refer to
// its identifier, constant, if_ and while_ members.
98 template <typename TokenDef>
99 example5_base_grammar(TokenDef const& tok)
100 : example5_base_grammar::base_type(program)
102 using boost::spirit::_val;
// a brace-enclosed sequence of zero or more statements
// (NOTE(review): presumably the right-hand side of the 'block' rule)
109 = '{' >> *statement >> '}'
// identifier '=' expression ';'
// (NOTE(review): presumably the right-hand side of the 'assignment' rule)
119 = (tok.identifier >> '=' >> expression >> ';')
// output expression reporting the matched statement; _1 is streamed after
// the fixed text
121 std::cout << val("assignment statement to: ") << _1 << "\n"
// if '(' expression ')' block
126 = (tok.if_ >> '(' >> expression >> ')' >> block)
128 std::cout << val("if expression: ") << _1 << "\n"
// while '(' expression ')' block
133 = (tok.while_ >> '(' >> expression >> ')' >> block)
135 std::cout << val("while expression: ") << _1 << "\n"
139 // since expression has a variant return type accommodating for
140 // std::string and unsigned integer, both possible values may be
141 // returned to the calling rule
// either alternative stores the attribute of the matched token in _val
143 = tok.identifier [ _val = _1 ]
144 | tok.constant [ _val = _1 ]
// skipper type shared by all rules of this grammar
148 typedef qi::in_state_skipper<Lexer> skipper_type;
// rules without an attribute
150 qi::rule<Iterator, skipper_type> program, block, statement;
151 qi::rule<Iterator, skipper_type> assignment, if_stmt;
152 qi::rule<Iterator, skipper_type> while_stmt;
154 // the expression is the only rule having a return value
155 typedef boost::variant<unsigned int, std::string> expression_type;
156 qi::rule<Iterator, expression_type(), skipper_type> expression;
159 ///////////////////////////////////////////////////////////////////////////////
160 // Token definition for derived lexer, defines additional tokens
161 ///////////////////////////////////////////////////////////////////////////////
// Derived lexer definition. Extends example5_base_tokens with the
// attribute-less 'else_' token and then pulls in the base class token
// definitions.
162 template <typename Lexer>
163 struct example5_tokens : example5_base_tokens<Lexer>
// alias for the base class, used below to call its
// init_token_definitions()
165 typedef example5_base_tokens<Lexer> base_type;
169 // define the additional token to match
172 // associate the new token with the lexer, note we add 'else' before
173 // anything else to add it to the token set before the identifier
174 // token, otherwise "else" would be matched as an identifier
177 // now add the token definitions from the base class
178 this->base_type::init_token_definitions();
181 // this token has no attribute
182 lex::token_def<lex::omit> else_;
185 ///////////////////////////////////////////////////////////////////////////////
186 // Derived grammar definition, defines a language extension
187 ///////////////////////////////////////////////////////////////////////////////
// Derived grammar. Reuses every rule of example5_base_grammar and extends
// the if statement with an optional else branch.
188 template <typename Iterator, typename Lexer>
189 struct example5_grammar : example5_base_grammar<Iterator, Lexer>
// 'tok' is forwarded to the base grammar and must additionally provide an
// 'else_' member, which is used below.
191 template <typename TokenDef>
192 example5_grammar(TokenDef const& tok)
193 : example5_base_grammar<Iterator, Lexer>(tok)
195 // we alter the if_stmt only
// new definition: a copy of the base if_stmt followed by an optional
// (unary minus) "else_ block" sequence
// NOTE(review): copy() is presumably required so that the new definition
// holds the old one by value instead of referring to the rule being
// redefined -- confirm against the Spirit documentation
197 = this->if_stmt.copy() >> -(tok.else_ >> this->block)
202 ///////////////////////////////////////////////////////////////////////////////
// Driver code: sets up the lexer and grammar types, reads the input file,
// tokenizes it and runs the parser, then reports success or failure.
205 // iterator type used to expose the underlying input stream
206 typedef std::string::iterator base_iterator_type;
208 // This is the lexer token type to use. The second template parameter lists
209 // all attribute types used for token_def's during token definition (see
210 // example5_base_tokens<> above). Here we use the predefined lexertl token
211 // type, but any compatible token type may be used instead.
213 // If you don't list any token attribute types in the following declaration
214 // (or just use the default token type: lexertl_token<base_iterator_type>)
215 // it will compile and work just fine, just a bit less efficient. This is
216 // because the token attribute will be generated from the matched input
217 // sequence every time it is requested. But as soon as you specify at
218 // least one token attribute type you'll have to list all attribute types
219 // used for token_def<> declarations in the token definition class above,
220 // otherwise compilation errors will occur.
// (unsigned int and std::string are the attribute types of the 'constant'
// and 'identifier' token definitions)
221 typedef lex::lexertl::token<
222 base_iterator_type, boost::mpl::vector<unsigned int, std::string>
225 // Here we use the lexertl based lexer engine.
226 typedef lex::lexertl::lexer<token_type> lexer_type;
228 // This is the token definition type (derived from the given lexer type).
// Note: the typedef reuses the name of the class template; from here on
// 'example5_tokens' denotes this instantiation.
229 typedef example5_tokens<lexer_type> example5_tokens;
231 // this is the iterator type exposed by the lexer
232 typedef example5_tokens::iterator_type iterator_type;
234 // this is the type of the grammar to parse
// (as above, the typedef reuses the name of the class template)
235 typedef example5_grammar<iterator_type, example5_tokens::lexer_def> example5_grammar;
237 // now we use the types defined above to create the lexer and grammar
238 // object instances needed to invoke the parsing process
239 example5_tokens tokens; // Our lexer
240 example5_grammar calc(tokens); // Our parser
// NOTE(review): read_from_file is not defined in this file; presumably it
// comes from "example.hpp" and returns the file content as a string
242 std::string str (read_from_file("example5.input"));
244 // At this point we generate the iterator pair used to expose the
245 // tokenized input stream.
246 std::string::iterator it = str.begin();
247 iterator_type iter = tokens.begin(it, str.end());
248 iterator_type end = tokens.end();
250 // Parsing is done based on the token stream, not the character
251 // stream read from the input.
252 // Note how we use the lexer defined above as the skip parser. It must
253 // be explicitly wrapped inside a state directive, switching the lexer
254 // state for the duration of skipping whitespace.
255 std::string ws("WS");
256 bool r = qi::phrase_parse(iter, end, calc, qi::in_state(ws)[tokens.self]);
// success requires both a successful parse and a fully consumed token
// stream
258 if (r && iter == end)
260 std::cout << "-------------------------\n";
261 std::cout << "Parsing succeeded\n";
262 std::cout << "-------------------------\n";
266 std::cout << "-------------------------\n";
267 std::cout << "Parsing failed\n";
268 std::cout << "-------------------------\n";
271 std::cout << "Bye... :-) \n\n";