08356999a051d8ecce11e492688058fbaa44ebd2
[external/ragel.git] / redfsm / xmlscan.rl
1 /*
2  *  Copyright 2001-2007 Adrian Thurston <thurston@cs.queensu.ca>
3  */
4
5 /*  This file is part of Ragel.
6  *
7  *  Ragel is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  * 
12  *  Ragel is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  * 
17  *  You should have received a copy of the GNU General Public License
18  *  along with Ragel; if not, write to the Free Software
19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
20  */
21 #include <iostream>
22 #include <string.h>
23 #include "vector.h"
24 #include "xmlparse.h"
25 #include "buffer.h"
26
27 using std::istream;
28 using std::cout;
29 using std::cerr;
30 using std::endl;
31
32 #define BUFSIZE 4096
33
34 %%{
35         machine XmlScanner;
36         write data;
37 }%%
38
/*
 * Keyword lookup interface for XML tag names.
 *
 * Only the declaration lives here; the definitions are not visible in this
 * file (the name and shape match a gperf-generated perfect hash -- confirm
 * against the build).
 */
class Perfect_Hash
{
public:
	/* Look up the len-byte name starting at str. xml_parse() treats a null
	 * result as an unrecognised tag. */
	static struct XMLTagHashPair *in_word_set( const char *str, unsigned int len );

private:
	/* Hash function used by the lookup; not callable from outside the class. */
	static inline unsigned int hash( const char *str, unsigned int len );
};
47
/*
 * Scanner state for the XML input read by xml_parse().
 *
 * Input is pulled from an istream into the fixed-size buf and tokenised by
 * the Ragel machine XmlScanner. Each call to scan() yields one token code
 * (TK_*). Tag names and attributes are exposed as pointers into buf, so they
 * are only valid until the next call to scan().
 */
struct XmlScanner
{
	/*
	 * fileName is kept only for error messages; input is the stream that is
	 * scanned. Every scalar and pointer member is given a defined value
	 * here: scan() adjusts tag_id_start when it shifts the buffer, which can
	 * happen before any tag has been seen, so it must not be left
	 * indeterminate.
	 */
	XmlScanner( const char *fileName, istream &input ) : 
		fileName(fileName),
		input(input), 
		cs(0), act(0), have(0),
		curline(1), 
		curcol(1),
		ts(0), te(0),
		p(0), pe(0), 
		done(0),
		data(0), data_len(0),
		value(0),
		tag_id_start(0), tag_id_len(0),
		token_col(0), token_line(0)
	{
		/* Let Ragel put the machine into its start state. */
		%%{
			machine XmlScanner;
			write init;
		}%%
	}
	
	/* Scan and return the next token code (one of the TK_* values). */
	int scan();

	/* Move every recorded attribute marker back by distance bytes. */
	void adjustAttrPointers( int distance );

	/* Count an error and return cerr primed with a file:line:col prefix. */
	std::ostream &error();

	const char *fileName;
	istream &input;

	/* Scanner State. */
	/* cs/act/ts/te/p/pe are the variables the generated machine operates on;
	 * have is the number of bytes of an unfinished token carried over to
	 * the front of buf; curline/curcol track the current input position. */
	int cs, act, have, curline, curcol;
	char *ts, *te;
	char *p, *pe;
	/* Set non-zero once the input stream is exhausted. */
	int done;

	/* Token data */
	/* data/data_len delimit the text of the token just returned. */
	char *data;
	int data_len;
	/* Initialised to 0; not otherwise referenced by the code in this file. */
	int value;
	/* Attribute markers collected while scanning an open tag. */
	AttrMkList attrMkList;
	/* Character data accumulated between tags. */
	Buffer buffer;
	/* Location and length of the most recent tag name within buf. */
	char *tag_id_start;
	int tag_id_len;
	/* Position at which the current token started. */
	int token_col, token_line;

	/* Raw input window that all of the pointers above refer into. */
	char buf[BUFSIZE];
};
91
92
/* Token codes returned by XmlScanner::scan(). TK_NO_TOKEN is the internal
 * "nothing recognised yet" value; xml_parse() treats it as an internal error
 * if it ever escapes. */
#define TK_NO_TOKEN (-1)
#define TK_ERR 1
#define TK_SPACE 2
#define TK_EOF 3
#define TK_OpenTag 4
#define TK_CloseTag 5

/* Record the recognised token and where its text starts. Expects token, data
 * and ts to be in scope at the point of use. Wrapped in do/while(0) so that
 * the two assignments behave as a single statement, e.g. under an unbraced
 * if/else. */
#define ret_tok( _tok ) do { token = (_tok); data = ts; } while (0)
101
102 void XmlScanner::adjustAttrPointers( int distance )
103 {
104         for ( AttrMkList::Iter attr = attrMkList; attr.lte(); attr++ ) {
105                 attr->id -= distance;
106                 attr->value -= distance;
107         }
108 }
109
110 /* There is no claim that this is a proper XML parser, but it is good
111  * enough for our purposes.
 *
 * Definition of the machine XmlScanner.
 *
 * Position tracking: colup increments curcol for every character of a
 * pattern it is attached to, NL sets curcol to 0 and increments curline, and
 * start_tok copies curcol/curline into token_col/token_line when a token
 * begins.
 *
 * Tags: tag_id records where the tag name starts (tag_id_start) and how long
 * it is (tag_id_len). Each attribute is id = "literal"; when the value is
 * left, an AttrMarker holding the id and value pointers and lengths is
 * appended to attrMkList. The value span covers the whole literal, so it
 * includes both double quotes.
 *
 * Scanner patterns, in order: an open tag yields TK_OpenTag and a close tag
 * yields TK_CloseTag (both break out of the machine); any character other
 * than '<', '&' and NUL is appended to buffer; the entities &amp; &lt; &gt;
 * append '&', '<' and '>'; a 0 byte yields TK_EOF.
 *
 * The action bodies refer to attr_id_start, attr_id_len, attr_value_start,
 * attr_value_len and (through ret_tok) token. Those are locals of
 * XmlScanner::scan(), which is where the machine is executed.
 */
112 %%{
113         machine XmlScanner;
114
115         action colup { curcol++; }
116         action start_tok { token_col = curcol; token_line = curline; }
117         NL = '\n' @{ curcol = 0; curline++; };
118
119         WS = [\r\t ] | NL;
120         id = [_a-zA-Z][_a-zA-Z0-9]*;
121         literal = '"' ( [^"] | NL )* '"';
122
123         # Attribute identifiers.
124         action start_attr_id { attr_id_start = p; }
125         action leave_attr_id { attr_id_len = p - attr_id_start; }
126
127         attr_id = id >start_attr_id %leave_attr_id;
128
129         # Attribute values
130         action start_attr_value { attr_value_start = p; }
131         action leave_attr_value
132         {
133                 attr_value_len = p - attr_value_start;
134
135                 AttrMarker newAttr;
136                 newAttr.id = attr_id_start;
137                 newAttr.idLen = attr_id_len;
138                 newAttr.value = attr_value_start;
139                 newAttr.valueLen = attr_value_len;
140                 attrMkList.append( newAttr );
141         }
142
143         attr_value = literal >start_attr_value %leave_attr_value;
144
145         # Attribute list. 
146         attribute = attr_id WS* '=' WS* attr_value WS*;
147
148         # Tag identifiers.
149         action tag_id_start { tag_id_start = p; }
150         action leave_tag_id { tag_id_len = p - tag_id_start; }
151
152         tag_id = id >tag_id_start %leave_tag_id;
153
154         main := |*
155                 # Tags
156                 ( '<' WS* tag_id ( WS+ attribute* )? '>' ) >start_tok $colup 
157                         => { ret_tok( TK_OpenTag ); fbreak; };
158
159                 ( '<' WS* '/' WS* tag_id WS* '>' ) >start_tok $colup 
160                         => { ret_tok( TK_CloseTag ); fbreak; };
161
162                 # Data in between tags.
163                 ( [^<&\0] | NL ) $colup 
164                         => { buffer.append( *p ); };
165
166                 # Specials.
167                 "&amp;" $colup
168                         => { buffer.append( '&' ); };
169                 "&lt;" $colup
170                         => { buffer.append( '<' ); };
171                 "&gt;" $colup
172                         => { buffer.append( '>' ); };
173                 
174                 # EOF
175                 0 >start_tok => { ret_tok( TK_EOF ); fbreak; };
176
177         *|;
178 }%%
179
/*
 * Scan the input until one token has been recognised and return its code.
 *
 * Returns:
 *   TK_OpenTag / TK_CloseTag / TK_EOF  a token was recognised; data and
 *                                      data_len describe its text in buf.
 *   TK_ERR                             the machine entered its error state.
 *   TK_SPACE                           a single token filled the whole
 *                                      buffer, so no more input can be read.
 *
 * The buffer is refilled from the input stream whenever the machine has
 * consumed everything in it. End of input is presented to the machine as a
 * single NUL byte, which it recognises as TK_EOF.
 */
int XmlScanner::scan( )
{
	/* These locals are referenced by name from the machine's embedded
	 * actions (and by ret_tok), so their names must not change. */
	int token = TK_NO_TOKEN;
	int space = 0, readlen = 0;
	char *attr_id_start = 0;
	char *attr_value_start = 0;
	int attr_id_len = 0;
	int attr_value_len = 0;

	/* Begin each token with no attribute markers and no buffered text
	 * (assumes AttrMkList::empty() discards the elements -- confirm). */
	attrMkList.empty();
	buffer.clear();

	while ( 1 ) {
		if ( p == pe ) {
			/* All buffered input has been consumed: refill. */

			if ( ts == 0 )
				have = 0;
			else {
				/* The buffer ended in the middle of a token. Move the
				 * partial token to the front and rebase every pointer
				 * that refers into it. */
				have = pe - ts;
				memmove( buf, ts, have );

				int distance = ts - buf;
				te -= distance;
				tag_id_start -= distance;
				attr_id_start -= distance;
				attr_value_start -= distance;
				adjustAttrPointers( distance );
				ts = buf;
			}

			p = buf + have;
			space = BUFSIZE - have;

			if ( space == 0 ) {
				/* We filled up the buffer trying to scan a token. */
				return TK_SPACE;
			}

			if ( done ) {
				/* Input is exhausted: hand the machine the NUL byte it
				 * treats as end of file. */
				p[0] = 0;
				readlen = 1;
			}
			else {
				input.read( p, space );
				readlen = input.gcount();

				/* Stop reading on end of file and also on any stream
				 * failure. A stream that is already failed or goes bad
				 * yields zero bytes without ever setting eofbit, so
				 * checking eof() alone would loop here forever. */
				if ( input.eof() || input.fail() )
					done = 1;
			}

			pe = p + readlen;
		}

		%% write exec;

		if ( cs == XmlScanner_error )
			return TK_ERR;

		if ( token != TK_NO_TOKEN ) {
			data_len = p - data;
			return token;
		}
	}
}
249
250 int xml_parse( std::istream &input, const char *fileName, 
251                 bool outputActive, bool wantComplete )
252 {
253         XmlScanner scanner( fileName, input );
254         XmlParser parser( fileName, outputActive, wantComplete );
255
256         parser.init();
257
258         while ( 1 ) {
259                 int token = scanner.scan();
260                 if ( token == TK_NO_TOKEN ) {
261                         cerr << "xmlscan: interal error: scanner returned NO_TOKEN" << endl;
262                         exit(1);
263                 }
264                 else if ( token == TK_EOF ) {
265                         parser.token( _eof, scanner.token_col, scanner.token_line );
266                         break;
267                 }
268                 else if ( token == TK_ERR ) {
269                         scanner.error() << "scanner error" << endl;
270                         break;
271                 }
272                 else if ( token == TK_SPACE ) {
273                         scanner.error() << "scanner is out of buffer space" << endl;
274                         break;
275                 }
276                 else {
277                         /* All other tokens are either open or close tags. */
278                         XMLTagHashPair *tagId = Perfect_Hash::in_word_set( 
279                                         scanner.tag_id_start, scanner.tag_id_len );
280
281                         XMLTag *tag = new XMLTag( tagId, token == TK_OpenTag ? 
282                                         XMLTag::Open : XMLTag::Close );
283
284                         if ( tagId != 0 ) {
285                                 /* Get attributes for open tags. */
286                                 if ( token == TK_OpenTag && scanner.attrMkList.length() > 0 ) {
287                                         tag->attrList = new AttrList;
288                                         for ( AttrMkList::Iter attr = scanner.attrMkList; 
289                                                         attr.lte(); attr++ )
290                                         {
291                                                 Attribute newAttr;
292                                                 newAttr.id = new char[attr->idLen+1];
293                                                 memcpy( newAttr.id, attr->id, attr->idLen );
294                                                 newAttr.id[attr->idLen] = 0;
295
296                                                 /* Exclude the surrounding quotes. */
297                                                 newAttr.value = new char[attr->valueLen-1];
298                                                 memcpy( newAttr.value, attr->value+1, attr->valueLen-2 );
299                                                 newAttr.value[attr->valueLen-2] = 0;
300
301                                                 tag->attrList->append( newAttr );
302                                         }
303                                 }
304
305                                 /* Get content for closing tags. */
306                                 if ( token == TK_CloseTag ) {
307                                         switch ( tagId->id ) {
308                                         case TAG_host: case TAG_arg:
309                                         case TAG_t: case TAG_alphtype:
310                                         case TAG_text: case TAG_goto:
311                                         case TAG_call: case TAG_next:
312                                         case TAG_entry: case TAG_set_tokend:
313                                         case TAG_set_act: case TAG_start_state:
314                                         case TAG_error_state: case TAG_state_actions: 
315                                         case TAG_action_table: case TAG_cond_space: 
316                                         case TAG_c: case TAG_ex: case TAG_eof_t:
317                                                 tag->content = new char[scanner.buffer.length+1];
318                                                 memcpy( tag->content, scanner.buffer.data,
319                                                                 scanner.buffer.length );
320                                                 tag->content[scanner.buffer.length] = 0;
321                                                 break;
322                                         }
323                                 }
324                         }
325
326                         #if 0
327                         cerr << "parser_driver: " << (tag->type == XMLTag::Open ? "open" : "close") <<
328                                         ": " << (tag->tagId != 0 ? tag->tagId->name : "<unknown>") << endl;
329                         if ( tag->attrList != 0 ) {
330                                 for ( AttrList::Iter attr = *tag->attrList; attr.lte(); attr++ )
331                                         cerr << "    " << attr->id << ": " << attr->value << endl;
332                         }
333                         if ( tag->content != 0 )
334                                 cerr << "    content: " << tag->content << endl;
335                         #endif
336
337                         parser.token( tag, scanner.token_col, scanner.token_line );
338                 }
339         }
340
341         return 0;
342 }
343
344 std::ostream &XmlScanner::error()
345 {
346         gblErrorCount += 1;
347         cerr << fileName << ":" << curline << ":" << curcol << ": ";
348         return cerr;
349 }