Removed the old flex and bison files. Fixed a bug in regexes and OR literals:
[external/ragel.git] / rlcodegen / xmlscan.rl
1 /*
2  *  Copyright 2001-2007 Adrian Thurston <thurston@cs.queensu.ca>
3  */
4
5 /*  This file is part of Ragel.
6  *
7  *  Ragel is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  * 
12  *  Ragel is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  * 
17  *  You should have received a copy of the GNU General Public License
18  *  along with Ragel; if not, write to the Free Software
19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
20  */
21 #include <iostream>
22 #include <string.h>
23 #include "vector.h"
24 #include "xmlparse.h"
25 #include "rlcodegen.h"
26 #include "buffer.h"
27
28 using std::istream;
29 using std::cout;
30 using std::cerr;
31 using std::endl;
32
/* Capacity, in bytes, of the fixed input buffer held by each Scanner. */
#define BUFSIZE 4096

/* Have Ragel emit the static data for the Scanner machine at this point. */
%% machine Scanner;
%% write data;
39
/*
 * Lookup interface used to map a tag name onto its XMLTagHashPair entry.
 * Only the declarations live here; the definitions are not part of this file
 * (presumably produced by a perfect-hash generator -- confirm).
 */
class Perfect_Hash
{
public:
        /* Look up the len-byte name at str. xml_parse() in this file treats a
         * zero result as "tag not recognized". */
        static struct XMLTagHashPair *in_word_set( const char *str, unsigned int len );

private:
        /* Hash function over the len-byte key at str. */
        static inline unsigned int hash( const char *str, unsigned int len );
};
48
49 struct Scanner
50 {
51         Scanner( char *fileName, istream &input ) : 
52                 fileName(fileName),
53                 input(input), 
54                 curline(1), 
55                 curcol(1),
56                 p(0), pe(0), 
57                 done(false),
58                 data(0), data_len(0),
59                 value(0)
60         {
61                 %%{
62                         machine Scanner;
63                         write init;
64                 }%%
65         }
66         
67         int scan();
68         void adjustAttrPointers( int distance );
69         std::ostream &error();
70
71         char *fileName;
72         istream &input;
73
74         /* Scanner State. */
75         int cs, act, have, curline, curcol;
76         char *tokstart, *tokend;
77         char *p, *pe;
78         int done;
79
80         /* Token data */
81         char *data;
82         int data_len;
83         int value;
84         AttrMkList attrMkList;
85         Buffer buffer;
86         char *tag_id_start;
87         int tag_id_len;
88         int token_col, token_line;
89
90         char buf[BUFSIZE];
91 };
92
93
/* Token codes returned by Scanner::scan(). TK_NO_TOKEN is the "nothing
 * recognized yet" value that scan() starts from. */
#define TK_NO_TOKEN (-1)
#define TK_ERR 1
#define TK_SPACE 2
#define TK_EOF 3
#define TK_OpenTag 4
#define TK_CloseTag 5

/*
 * Record _tok as the pending token and remember where its text starts. It
 * expects `token`, `data` and `tokstart` to be in scope at the point of use.
 * The do/while(0) wrapper makes the two assignments a single statement, so
 * the macro stays correct under an unbraced if/else. Use it with a trailing
 * semicolon.
 */
#define ret_tok( _tok ) do { token = (_tok); data = tokstart; } while (0)
102
103 void Scanner::adjustAttrPointers( int distance )
104 {
105         for ( AttrMkList::Iter attr = attrMkList; attr.lte(); attr++ ) {
106                 attr->id -= distance;
107                 attr->value -= distance;
108         }
109 }
110
/* This is not claimed to be a proper XML parser; it only needs to be good
 * enough for the input this program reads. */
%%{
        machine Scanner;

        # Position bookkeeping: count columns, and remember where a token began.
        action bump_col { curcol++; }
        action mark_token { token_col = curcol; token_line = curline; }
        NL = '\n' @{ curcol = 0; curline++; };

        # Basic lexical pieces.
        WS = [\r\t ] | NL;
        id = [_a-zA-Z][_a-zA-Z0-9]*;
        literal = '"' ( [^"] | NL )* '"';

        # Attribute name: note where it starts and, on leaving, its length.
        action enter_attr_id { attr_id_start = p; }
        action exit_attr_id { attr_id_len = p - attr_id_start; }

        attr_id = id >enter_attr_id %exit_attr_id;

        # Attribute value: the quoted literal, quotes included. On leaving it,
        # the finished name/value pair is appended to the marker list.
        action enter_attr_value { attr_value_start = p; }
        action exit_attr_value
        {
                attr_value_len = p - attr_value_start;

                AttrMarker marker;
                marker.id = attr_id_start;
                marker.idLen = attr_id_len;
                marker.value = attr_value_start;
                marker.valueLen = attr_value_len;
                attrMkList.append( marker );
        }

        attr_value = literal >enter_attr_value %exit_attr_value;

        # One name="value" pair, with optional whitespace around and after it.
        attribute = attr_id WS* '=' WS* attr_value WS*;

        # Tag name: note where it starts and, on leaving, its length.
        action enter_tag_id { tag_id_start = p; }
        action exit_tag_id { tag_id_len = p - tag_id_start; }

        tag_id = id >enter_tag_id %exit_tag_id;

        main := |*
                # Opening tag, with an optional attribute list.
                ( '<' WS* tag_id ( WS+ attribute* )? '>' ) >mark_token $bump_col 
                        => { ret_tok( TK_OpenTag ); fbreak; };

                # Closing tag.
                ( '<' WS* '/' WS* tag_id WS* '>' ) >mark_token $bump_col 
                        => { ret_tok( TK_CloseTag ); fbreak; };

                # Character data between tags is collected one byte at a time.
                ( [^<&\0] | NL ) $bump_col 
                        => { buffer.append( *p ); };

                # The three entities that are decoded.
                "&amp;" $bump_col
                        => { buffer.append( '&' ); };
                "&lt;" $bump_col
                        => { buffer.append( '<' ); };
                "&gt;" $bump_col
                        => { buffer.append( '>' ); };
                
                # A zero byte marks the end of the input.
                0 >mark_token => { ret_tok( TK_EOF ); fbreak; };

        *|;
}%%
180
181 int Scanner::scan( )
182 {
183         int token = TK_NO_TOKEN;
184         int space, readlen;
185         char *attr_id_start;
186         char *attr_value_start;
187         int attr_id_len;
188         int attr_value_len;
189
190         attrMkList.empty();
191         buffer.clear();
192
193         while ( 1 ) {
194                 if ( p == pe ) {
195                         //printf("scanner: need more data\n");
196
197                         if ( tokstart == 0 )
198                                 have = 0;
199                         else {
200                                 /* There is data that needs to be shifted over. */
201                                 //printf("scanner: buffer broken mid token\n");
202                                 have = pe - tokstart;
203                                 memmove( buf, tokstart, have );
204
205                                 int distance = tokstart - buf;
206                                 tokend -= distance;
207                                 tag_id_start -= distance;
208                                 attr_id_start -= distance;
209                                 attr_value_start -= distance;
210                                 adjustAttrPointers( distance );
211                                 tokstart = buf;
212                         }
213
214                         p = buf + have;
215                         space = BUFSIZE - have;
216
217                         if ( space == 0 ) {
218                                 /* We filled up the buffer trying to scan a token. */
219                                 return TK_SPACE;
220                         }
221
222                         if ( done ) {
223                                 //printf("scanner: end of file\n");
224                                 p[0] = 0;
225                                 readlen = 1;
226                         }
227                         else {
228                                 input.read( p, space );
229                                 readlen = input.gcount();
230                                 if ( input.eof() ) {
231                                         //printf("scanner: setting done flag\n");
232                                         done = 1;
233                                 }
234                         }
235
236                         pe = p + readlen;
237                 }
238
239                 %% write exec;
240
241                 if ( cs == Scanner_error )
242                         return TK_ERR;
243
244                 if ( token != TK_NO_TOKEN ) {
245                         /* fbreak does not advance p, so we do it manually. */
246                         p = p + 1;
247                         data_len = p - data;
248                         return token;
249                 }
250         }
251 }
252
253
254 int xml_parse( istream &input, char *fileName )
255 {
256         Scanner scanner( fileName, input );
257         Parser parser( fileName );
258
259         parser.init();
260
261         while ( 1 ) {
262                 int token = scanner.scan();
263                 if ( token == TK_NO_TOKEN ) {
264                         cerr << PROGNAME << ": interal error: scanner returned NO_TOKEN" << endl;
265                         exit(1);
266                 }
267                 else if ( token == TK_EOF ) {
268                         parser.token( _eof, scanner.token_col, scanner.token_line );
269                         break;
270                 }
271                 else if ( token == TK_ERR ) {
272                         scanner.error() << "scanner error" << endl;
273                         break;
274                 }
275                 else if ( token == TK_SPACE ) {
276                         scanner.error() << "scanner is out of buffer space" << endl;
277                         break;
278                 }
279                 else {
280                         /* All other tokens are either open or close tags. */
281                         XMLTagHashPair *tagId = Perfect_Hash::in_word_set( 
282                                         scanner.tag_id_start, scanner.tag_id_len );
283
284                         XMLTag *tag = new XMLTag( tagId, token == TK_OpenTag ? 
285                                         XMLTag::Open : XMLTag::Close );
286
287                         if ( tagId != 0 ) {
288                                 /* Get attributes for open tags. */
289                                 if ( token == TK_OpenTag && scanner.attrMkList.length() > 0 ) {
290                                         tag->attrList = new AttrList;
291                                         for ( AttrMkList::Iter attr = scanner.attrMkList; 
292                                                         attr.lte(); attr++ )
293                                         {
294                                                 Attribute newAttr;
295                                                 newAttr.id = new char[attr->idLen+1];
296                                                 memcpy( newAttr.id, attr->id, attr->idLen );
297                                                 newAttr.id[attr->idLen] = 0;
298
299                                                 /* Exclude the surrounding quotes. */
300                                                 newAttr.value = new char[attr->valueLen-1];
301                                                 memcpy( newAttr.value, attr->value+1, attr->valueLen-2 );
302                                                 newAttr.value[attr->valueLen-2] = 0;
303
304                                                 tag->attrList->append( newAttr );
305                                         }
306                                 }
307
308                                 /* Get content for closing tags. */
309                                 if ( token == TK_CloseTag ) {
310                                         switch ( tagId->id ) {
311                                         case TAG_host: case TAG_option:
312                                         case TAG_t: case TAG_alphtype:
313                                         case TAG_text: case TAG_goto:
314                                         case TAG_call: case TAG_next:
315                                         case TAG_entry: case TAG_set_tokend:
316                                         case TAG_set_act: case TAG_start_state:
317                                         case TAG_state_actions: case TAG_action_table:
318                                         case TAG_cond_space: case TAG_c:
319                                                 tag->content = new char[scanner.buffer.length+1];
320                                                 memcpy( tag->content, scanner.buffer.data,
321                                                                 scanner.buffer.length );
322                                                 tag->content[scanner.buffer.length] = 0;
323                                                 break;
324                                         }
325                                 }
326                         }
327
328                         #if 0
329                         cerr << "parser_driver: " << (tag->type == XMLTag::Open ? "open" : "close") <<
330                                         ": " << (tag->tagId != 0 ? tag->tagId->name : "<unknown>") << endl;
331                         if ( tag->attrList != 0 ) {
332                                 for ( AttrList::Iter attr = *tag->attrList; attr.lte(); attr++ )
333                                         cerr << "    " << attr->id << ": " << attr->value << endl;
334                         }
335                         if ( tag->content != 0 )
336                                 cerr << "    content: " << tag->content << endl;
337                         #endif
338
339                         parser.token( tag, scanner.token_col, scanner.token_line );
340                 }
341         }
342
343         return 0;
344 }
345
346 std::ostream &Scanner::error()
347 {
348         cerr << fileName << ":" << curline << ":" << curcol << ": ";
349         return cerr;
350 }