Parse errors in the intermediate XML file now cause the backend to exit
[external/ragel.git] / redfsm / xmlscan.rl
1 /*
2  *  Copyright 2001-2007 Adrian Thurston <thurston@cs.queensu.ca>
3  */
4
5 /*  This file is part of Ragel.
6  *
7  *  Ragel is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  * 
12  *  Ragel is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  * 
17  *  You should have received a copy of the GNU General Public License
18  *  along with Ragel; if not, write to the Free Software
19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
20  */
21 #include <iostream>
22 #include <string.h>
23 #include "vector.h"
24 #include "xmlparse.h"
25 #include "buffer.h"
26
27 using std::istream;
28 using std::cout;
29 using std::cerr;
30 using std::endl;
31
32 #define BUFSIZE 4096
33
/* Ragel directive: emit the static data of the Scanner machine at this
 * point in the generated file. The Scanner_error constant that
 * Scanner::scan() compares cs against is expected to come from here
 * (NOTE(review): per the Ragel guide's description of "write data"). */
34 %%{
35         machine Scanner;
36         write data;
37 }%%
38
/*
 * Lookup interface for XML tag names. Only the declarations appear in this
 * file; the definitions are supplied elsewhere.
 */
class Perfect_Hash
{
public:
        /* Look up the len-byte name starting at str. xml_parse() treats a
         * null result as an unrecognised tag. */
        static struct XMLTagHashPair *in_word_set( const char *str, unsigned int len );

private:
        /* Hash of the len bytes starting at str. */
        static inline unsigned int hash( const char *str, unsigned int len );
};
47
48 struct Scanner
49 {
50         Scanner( char *fileName, istream &input ) : 
51                 fileName(fileName),
52                 input(input), 
53                 curline(1), 
54                 curcol(1),
55                 p(0), pe(0), 
56                 done(false),
57                 data(0), data_len(0),
58                 value(0)
59         {
60                 %%{
61                         machine Scanner;
62                         write init;
63                 }%%
64         }
65         
66         int scan();
67         void adjustAttrPointers( int distance );
68         std::ostream &error();
69
70         char *fileName;
71         istream &input;
72
73         /* Scanner State. */
74         int cs, act, have, curline, curcol;
75         char *tokstart, *tokend;
76         char *p, *pe;
77         int done;
78
79         /* Token data */
80         char *data;
81         int data_len;
82         int value;
83         AttrMkList attrMkList;
84         Buffer buffer;
85         char *tag_id_start;
86         int tag_id_len;
87         int token_col, token_line;
88
89         char buf[BUFSIZE];
90 };
91
92
/* Token kinds returned by Scanner::scan(). TK_NO_TOKEN is the internal
 * "nothing recognised yet" value and is never expected to reach the caller. */
#define TK_NO_TOKEN (-1)
#define TK_ERR 1
#define TK_SPACE 2
#define TK_EOF 3
#define TK_OpenTag 4
#define TK_CloseTag 5

/* Record the recognised token kind and where it starts. Expects `token`,
 * `data` and `tokstart` to be in scope at the point of use. Wrapped in
 * do/while(0) so that the two assignments behave as a single statement,
 * e.g. when used as the body of an unbraced if. */
#define ret_tok( _tok ) do { token = (_tok); data = tokstart; } while (0)
101
102 void Scanner::adjustAttrPointers( int distance )
103 {
104         for ( AttrMkList::Iter attr = attrMkList; attr.lte(); attr++ ) {
105                 attr->id -= distance;
106                 attr->value -= distance;
107         }
108 }
109
110 /* There is no claim that this is a proper XML parser, but it is good
 * enough for our purposes.
 *
 * Definition of the Scanner machine. Its main scanner recognises:
 *   - an open tag:  '<' name, optional attributes, '>'   -> TK_OpenTag
 *   - a close tag:  '<' '/' name '>'                     -> TK_CloseTag
 *   - a NUL byte, which Scanner::scan() appends once the
 *     input is exhausted                                 -> TK_EOF
 * Character data between tags and the entities &amp; &lt; &gt; do not
 * produce tokens; they are appended (entities decoded) to `buffer`.
 *
 * Position tracking: colup increments curcol for every character of the
 * patterns it is attached to; a newline resets curcol to 0 and increments
 * curline; start_tok records token_col/token_line at the first character
 * of a tag or of the EOF marker.
 *
 * The attribute actions use attr_id_start, attr_id_len, attr_value_start
 * and attr_value_len, which are locals of Scanner::scan(), where the code
 * for this machine is emitted by "write exec". Each completed attribute
 * value appends an AttrMarker to attrMkList; the marked value span still
 * includes the surrounding double quotes. tag_id_start/tag_id_len mark the
 * tag name inside the scan buffer. */
112 %%{
113         machine Scanner;
114
115         action colup { curcol++; }
116         action start_tok { token_col = curcol; token_line = curline; }
117         NL = '\n' @{ curcol = 0; curline++; };
118
119         WS = [\r\t ] | NL;
120         id = [_a-zA-Z][_a-zA-Z0-9]*;
121         literal = '"' ( [^"] | NL )* '"';
122
123         # Attribute identifiers.
124         action start_attr_id { attr_id_start = p; }
125         action leave_attr_id { attr_id_len = p - attr_id_start; }
126
127         attr_id = id >start_attr_id %leave_attr_id;
128
129         # Attribute values
130         action start_attr_value { attr_value_start = p; }
131         action leave_attr_value
132         {
133                 attr_value_len = p - attr_value_start;
134
135                 AttrMarker newAttr;
136                 newAttr.id = attr_id_start;
137                 newAttr.idLen = attr_id_len;
138                 newAttr.value = attr_value_start;
139                 newAttr.valueLen = attr_value_len;
140                 attrMkList.append( newAttr );
141         }
142
143         attr_value = literal >start_attr_value %leave_attr_value;
144
145         # Attribute list. 
146         attribute = attr_id WS* '=' WS* attr_value WS*;
147
148         # Tag identifiers.
149         action tag_id_start { tag_id_start = p; }
150         action leave_tag_id { tag_id_len = p - tag_id_start; }
151
152         tag_id = id >tag_id_start %leave_tag_id;
153
154         main := |*
155                 # Tags
156                 ( '<' WS* tag_id ( WS+ attribute* )? '>' ) >start_tok $colup 
157                         => { ret_tok( TK_OpenTag ); fbreak; };
158
159                 ( '<' WS* '/' WS* tag_id WS* '>' ) >start_tok $colup 
160                         => { ret_tok( TK_CloseTag ); fbreak; };
161
162                 # Data in between tags.
163                 ( [^<&\0] | NL ) $colup 
164                         => { buffer.append( *p ); };
165
166                 # Specials.
167                 "&amp;" $colup
168                         => { buffer.append( '&' ); };
169                 "&lt;" $colup
170                         => { buffer.append( '<' ); };
171                 "&gt;" $colup
172                         => { buffer.append( '>' ); };
173                 
174                 # EOF
175                 0 >start_tok => { ret_tok( TK_EOF ); fbreak; };
176
177         *|;
178 }%%
179
180 int Scanner::scan( )
181 {
182         int token = TK_NO_TOKEN;
183         int space = 0, readlen = 0;
184         char *attr_id_start = 0;
185         char *attr_value_start = 0;
186         int attr_id_len = 0;
187         int attr_value_len = 0;
188
189         attrMkList.empty();
190         buffer.clear();
191
192         while ( 1 ) {
193                 if ( p == pe ) {
194                         //printf("scanner: need more data\n");
195
196                         if ( tokstart == 0 )
197                                 have = 0;
198                         else {
199                                 /* There is data that needs to be shifted over. */
200                                 //printf("scanner: buffer broken mid token\n");
201                                 have = pe - tokstart;
202                                 memmove( buf, tokstart, have );
203
204                                 int distance = tokstart - buf;
205                                 tokend -= distance;
206                                 tag_id_start -= distance;
207                                 attr_id_start -= distance;
208                                 attr_value_start -= distance;
209                                 adjustAttrPointers( distance );
210                                 tokstart = buf;
211                         }
212
213                         p = buf + have;
214                         space = BUFSIZE - have;
215
216                         if ( space == 0 ) {
217                                 /* We filled up the buffer trying to scan a token. */
218                                 return TK_SPACE;
219                         }
220
221                         if ( done ) {
222                                 //printf("scanner: end of file\n");
223                                 p[0] = 0;
224                                 readlen = 1;
225                         }
226                         else {
227                                 input.read( p, space );
228                                 readlen = input.gcount();
229                                 if ( input.eof() ) {
230                                         //printf("scanner: setting done flag\n");
231                                         done = 1;
232                                 }
233                         }
234
235                         pe = p + readlen;
236                 }
237
238                 %% write exec;
239
240                 if ( cs == Scanner_error )
241                         return TK_ERR;
242
243                 if ( token != TK_NO_TOKEN ) {
244                         /* fbreak does not advance p, so we do it manually. */
245                         p = p + 1;
246                         data_len = p - data;
247                         return token;
248                 }
249         }
250 }
251
252 int xml_parse( std::istream &input, char *fileName, 
253                 bool outputActive, bool wantComplete )
254 {
255         Scanner scanner( fileName, input );
256         Parser parser( fileName, outputActive, wantComplete );
257
258         parser.init();
259
260         while ( 1 ) {
261                 int token = scanner.scan();
262                 if ( token == TK_NO_TOKEN ) {
263                         cerr << "xmlscan: interal error: scanner returned NO_TOKEN" << endl;
264                         exit(1);
265                 }
266                 else if ( token == TK_EOF ) {
267                         parser.token( _eof, scanner.token_col, scanner.token_line );
268                         break;
269                 }
270                 else if ( token == TK_ERR ) {
271                         scanner.error() << "scanner error" << endl;
272                         break;
273                 }
274                 else if ( token == TK_SPACE ) {
275                         scanner.error() << "scanner is out of buffer space" << endl;
276                         break;
277                 }
278                 else {
279                         /* All other tokens are either open or close tags. */
280                         XMLTagHashPair *tagId = Perfect_Hash::in_word_set( 
281                                         scanner.tag_id_start, scanner.tag_id_len );
282
283                         XMLTag *tag = new XMLTag( tagId, token == TK_OpenTag ? 
284                                         XMLTag::Open : XMLTag::Close );
285
286                         if ( tagId != 0 ) {
287                                 /* Get attributes for open tags. */
288                                 if ( token == TK_OpenTag && scanner.attrMkList.length() > 0 ) {
289                                         tag->attrList = new AttrList;
290                                         for ( AttrMkList::Iter attr = scanner.attrMkList; 
291                                                         attr.lte(); attr++ )
292                                         {
293                                                 Attribute newAttr;
294                                                 newAttr.id = new char[attr->idLen+1];
295                                                 memcpy( newAttr.id, attr->id, attr->idLen );
296                                                 newAttr.id[attr->idLen] = 0;
297
298                                                 /* Exclude the surrounding quotes. */
299                                                 newAttr.value = new char[attr->valueLen-1];
300                                                 memcpy( newAttr.value, attr->value+1, attr->valueLen-2 );
301                                                 newAttr.value[attr->valueLen-2] = 0;
302
303                                                 tag->attrList->append( newAttr );
304                                         }
305                                 }
306
307                                 /* Get content for closing tags. */
308                                 if ( token == TK_CloseTag ) {
309                                         switch ( tagId->id ) {
310                                         case TAG_host: case TAG_arg:
311                                         case TAG_t: case TAG_alphtype:
312                                         case TAG_text: case TAG_goto:
313                                         case TAG_call: case TAG_next:
314                                         case TAG_entry: case TAG_set_tokend:
315                                         case TAG_set_act: case TAG_start_state:
316                                         case TAG_error_state: case TAG_state_actions: 
317                                         case TAG_action_table: case TAG_cond_space: 
318                                         case TAG_c: case TAG_ex:
319                                                 tag->content = new char[scanner.buffer.length+1];
320                                                 memcpy( tag->content, scanner.buffer.data,
321                                                                 scanner.buffer.length );
322                                                 tag->content[scanner.buffer.length] = 0;
323                                                 break;
324                                         }
325                                 }
326                         }
327
328                         #if 0
329                         cerr << "parser_driver: " << (tag->type == XMLTag::Open ? "open" : "close") <<
330                                         ": " << (tag->tagId != 0 ? tag->tagId->name : "<unknown>") << endl;
331                         if ( tag->attrList != 0 ) {
332                                 for ( AttrList::Iter attr = *tag->attrList; attr.lte(); attr++ )
333                                         cerr << "    " << attr->id << ": " << attr->value << endl;
334                         }
335                         if ( tag->content != 0 )
336                                 cerr << "    content: " << tag->content << endl;
337                         #endif
338
339                         parser.token( tag, scanner.token_col, scanner.token_line );
340                 }
341         }
342
343         return 0;
344 }
345
346 std::ostream &Scanner::error()
347 {
348         gblErrorCount += 1;
349         cerr << fileName << ":" << curline << ":" << curcol << ": ";
350         return cerr;
351 }