Updated examples for the first of the 6.0 changes: removed write eof and added
[external/ragel.git] / examples / rlscan.rl
1 /*
2  * Lexes Ragel input files.
3  */
4
5 #include <iostream>
6 #include <stdlib.h>
7 #include <stdio.h>
#include <string.h>
8
9 using namespace std;
10
/*
 * XML escaping helpers. Every overload writes to standard output and
 * replaces '<', '>' and '&' with "&lt;", "&gt;" and "&amp;". All other
 * characters are emitted unchanged.
 */

/* Escape and emit a single character. This is the only place the
 * character-to-entity mapping lives; the string overloads go through it. */
void escapeXML( char c )
{
        switch ( c ) {
                case '<': std::cout << "&lt;"; break;
                case '>': std::cout << "&gt;"; break;
                case '&': std::cout << "&amp;"; break;
                default: std::cout << c; break;
        }
}

/* Escape and emit a NUL-terminated string. The data is only read, so the
 * pointer is const and string literals may be passed directly. */
void escapeXML( const char *data )
{
        for ( ; *data != 0; ++data )
                escapeXML( *data );
}

/* Escape and emit exactly len characters starting at data. The range need
 * not be NUL-terminated. A zero or negative len emits nothing. */
void escapeXML( const char *data, int len )
{
        for ( int i = 0; i < len; ++i )
                escapeXML( data[i] );
}
45
/*
 * Raw output helpers. Unlike escapeXML these emit their argument verbatim,
 * so they are used for the XML markup itself.
 */

/* Emit a NUL-terminated string. Takes a const pointer because callers pass
 * string literals, which cannot legally convert to a plain char pointer. */
inline void write( const char *data )
{
        std::cout << data;
}

/* Emit a single character. */
inline void write( char c )
{
        std::cout << c;
}

/* Emit exactly len characters starting at data (no NUL terminator needed). */
inline void write( const char *data, int len )
{
        std::cout.write( data, len );
}
60
61
/*
 * Ragel specification of the RagelScan machine. It defines three scanners
 * that hand control to one another with fgoto:
 *   main   - host code outside of Ragel sections, echoed XML-escaped.
 *   rlscan - the inside of a Ragel section, emitted as XML token elements.
 *   ilscan - inline action code between braces, echoed XML-escaped.
 * c_comment is a helper machine called with fcall to copy a C comment.
 * The actions refer to single_line and inline_depth, which are locals of
 * the function that contains the "write exec" statement.
 */
62 %%{
63         machine RagelScan;
64
           # Lexical building blocks used by the rlscan token patterns.
65         word = [a-zA-Z_][a-zA-Z_0-9]*;
66         integer = [0-9]+;
67         hex = '0x' [0-9a-fA-F] [0-9a-fA-F]*;
68
           # "default" is any character except NUL; NUL is named EOF.
69         default = ^0;
70         EOF = 0;
71
72         # Handles comments in outside code and inline blocks.
           # The caller has already emitted the opening "/*". Every
           # character consumed here, including the closing "*/", is passed
           # to escapeXML; fret then returns to the scanner that called in.
73         c_comment := 
74                 ( default* :>> '*/' )
75                 ${ escapeXML( fc ); }
76                 @{ fret; };
77
           # Echo the text of the current token, XML-escaped.
78         action emit {
79                 escapeXML( tokstart, tokend-tokstart );
80         }
81
82         #
83         # Inline action code
84         #
           # Entered from rlscan on '{' with inline_depth set to 1. String
           # literals and comments are matched as whole tokens so that any
           # braces inside them do not change inline_depth.
85
86         ilscan := |*
87
88                 "'" ( [^'\\] | /\\./ )* "'" => emit;
89                 '"' ( [^"\\] | /\\./ )* '"' => emit;
90                 '/*' {
91                         write( "/*" );
92                         fcall c_comment;
93                 };
94                 '//' [^\n]* '\n' => emit;
95
96                 '{' {
97                         write( '{' );
98                         inline_depth += 1; 
99                 };
100
101                 '}' {
102                         write( '}' );
103                         /* If dropping down to the last } then return 
104                          * to ragel code. */
105                         if ( --inline_depth == 0 ) {
106                                 write( "</inline>\n" );
107                                 fgoto rlscan;
108                         }
109                 };
110
                    # Any other non-NUL character is echoed, escaped.
111                 default => { escapeXML( *tokstart ); };
112         *|;
113
114         #
115         # Ragel Tokens
116         #
            # Scans the body of a Ragel section. Each recognised token is
            # written as an XML element on its own line.
117
118         rlscan := |*
                    # End of a multi-line section. In a single-line section
                    # the token is consumed without producing output.
119                 '}%%' {
120                         if ( !single_line ) {
121                                 write( "</section>\n" );
122                                 fgoto main;
123                         }
124                 };
125
                    # A newline ends a single-line section; otherwise it is
                    # consumed without producing output.
126                 '\n' {
127                         if ( single_line ) {
128                                 write( "</section>\n" );
129                                 fgoto main;
130                         }
131                 };
132
133                 # Word
134                 word {
135                         write( "<word>" );
136                         write( tokstart, tokend-tokstart );
137                         write( "</word>\n" );
138                 };
139
140                 # Decimal integer.
141                 integer {
142                         write( "<int>" );
143                         write( tokstart, tokend-tokstart );
144                         write( "</int>\n" );
145                 };
146
147                 # Hexadecimal integer.
148                 hex {
149                         write( "<hex>" );
150                         write( tokstart, tokend-tokstart );
151                         write( "</hex>\n" );
152                 };
153
154                 # Consume comments.
155                 '#' [^\n]* '\n';
156
157                 # Single literal string.
158                 "'" ( [^'\\] | /\\./ )* "'" {
159                         write( "<single_lit>" );
160                         escapeXML( tokstart, tokend-tokstart );
161                         write( "</single_lit>\n" );
162                 };
163
164                 # Double literal string.
165                 '"' ( [^"\\] | /\\./ )* '"' {
166                         write( "<double_lit>" );
167                         escapeXML( tokstart, tokend-tokstart );
168                         write( "</double_lit>\n" );
169                 };
170
171                 # Or literal.
172                 '[' ( [^\]\\] | /\\./ )* ']' {
173                         write( "<or_lit>" );
174                         escapeXML( tokstart, tokend-tokstart );
175                         write( "</or_lit>\n" );
176                 };
177
178                 # Regex Literal.
179                 '/' ( [^/\\] | /\\./ ) * '/' {
180                         write( "<re_lit>" );
181                         escapeXML( tokstart, tokend-tokstart );
182                         write( "</re_lit>\n" );
183                 };
184
185                 # Open an inline block
186                 '{' {
187                         inline_depth = 1;
188                         write( "<inline>{" );
189                         fgoto ilscan;
190                 };
191
                    # Any punctuation character not matched by a longer
                    # pattern above becomes a <symbol> element.
192                 punct {
193                         write( "<symbol>" );
194                         escapeXML( fc );
195                         write( "</symbol>\n" );
196                 };
197                 
                    # Every other non-NUL character is consumed without
                    # producing output.
198                 default;
199         *|;
200
201         #
202         # Outside code.
203         #
            # Start state of the machine. Host code is echoed XML-escaped
            # until a section opener switches to rlscan.
204
205         main := |*
206
207                 "'" ( [^'\\] | /\\./ )* "'" => emit;
208                 '"' ( [^"\\] | /\\./ )* '"' => emit;
209
210                 '/*' {
211                         escapeXML( tokstart, tokend-tokstart );
212                         fcall c_comment;
213                 };
214
215                 '//' [^\n]* '\n' => emit;
216
                    # Multi-line section opener; closed by '}%%' in rlscan.
217                 '%%{' { 
218                         write( "<section>\n" );
219                         single_line = false;
220                         fgoto rlscan;
221                 };
222
                    # Single-line section opener; closed by '\n' in rlscan.
223                 '%%' {
224                         write( "<section>\n" ); 
225                         single_line = true; 
226                         fgoto rlscan;
227                 };
228
229                 default { 
230                         escapeXML( *tokstart );
231                 };
232
233                 # EOF.
                    # A NUL character is accepted here and produces no output.
234                 EOF;
235         *|;
236 }%%
237
/* Ragel directive: emit the static data for the RagelScan machine here.
 * NOTE(review): "nofinal" presumably suppresses the first-final state
 * constant, which this file never references -- confirm in the Ragel guide. */
238 %% write data nofinal;
239
/* Size of the input buffer in bytes. A token still unfinished when the
 * buffer is full makes main() exit with "TOKEN TOO BIG". */
240 #define BUFSIZE 2048
241
/*
 * Reads standard input in blocks, runs the RagelScan machine over each
 * block and lets the scanner actions write the XML rendering to standard
 * output. Returns 0 on success. Calls exit(1) after printing "TOKEN TOO BIG"
 * or "PARSE ERROR" to standard error.
 */
242 int main()
243 {
244         std::ios::sync_with_stdio(false);
245
            /* Scanner state. None of these are assigned directly in this
             * function before "write init"; presumably the code Ragel
             * generates for "write init" and "write exec" uses these exact
             * names (stack/top for the fcall into c_comment) -- confirm
             * against the Ragel guide. */
246         int cs, act;
247         char *tokstart, *tokend;
248         int stack[1], top;
249
250         static char inbuf[BUFSIZE];
            /* State read and written by the scanner actions: whether the
             * current section was opened with "%%" rather than "%%{", and
             * the brace nesting depth inside inline action code. */
251         bool single_line = false;
252         int inline_depth = 0;
253
254         %% write init;
255
256         bool done = false;
            /* Number of bytes at the start of inbuf carried over from the
             * previous pass (the prefix of an unfinished token). */
257         int have = 0;
258         while ( !done ) {
259                 /* How much space is in the buffer? */
260                 int space = BUFSIZE - have;
261                 if ( space == 0 ) {
262                         /* Buffer is full. */
263                         cerr << "TOKEN TOO BIG" << endl;
264                         exit(1);
265                 }
266
267                 /* Read in a block. */
                    /* New data goes after the preserved prefix; p and pe
                     * delimit only the bytes read in this pass. */
268                 char *p = inbuf + have;
269                 cin.read( p, space );
270                 int len = cin.gcount();
271                 char *pe = p + len;
272                 char *eof = 0;
273
274                 /* Check for EOF. */
                    /* A read that returns no bytes ends the loop after one
                     * final pass with eof set to pe. */
275                 if ( len == 0 ) {
276                         eof = pe;
277                         done = true;
278                 }
279
280                 %% write exec;
281
282                 if ( cs == RagelScan_error ) {
283                         /* Machine failed before finding a token. */
284                         cerr << "PARSE ERROR" << endl;
285                         exit(1);
286                 }
287
                    /* NOTE(review): a null tokstart presumably means no token
                     * is in progress, so nothing has to be carried over. */
288                 if ( tokstart == 0 )
289                         have = 0;
290                 else {
291                         /* There is a prefix to preserve, shift it over. */
                            /* tokend is rebased so that it keeps the same
                             * offset from the moved tokstart. */
292                         have = pe - tokstart;
293                         memmove( inbuf, tokstart, have );
294                         tokend = inbuf + (tokend-tokstart);
295                         tokstart = inbuf;
296                 }
297         }
298         return 0;
299 }