926c211ebfdd6d236ab0e4411994834a5ebef948
[external/ragel.git] / rlcodegen / xmlscan.rl
1 /*
2  *  Copyright 2001-2007 Adrian Thurston <thurston@cs.queensu.ca>
3  */
4
5 /*  This file is part of Ragel.
6  *
7  *  Ragel is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  * 
12  *  Ragel is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  * 
17  *  You should have received a copy of the GNU General Public License
18  *  along with Ragel; if not, write to the Free Software
19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
20  */
21 #include <iostream>
22 #include <string.h>
23 #include "vector.h"
24 #include "xmlparse.h"
25 #include "rlcodegen.h"
26 #include "buffer.h"
27
28 using std::istream;
29 using std::cout;
30 using std::endl;
31
/* Capacity, in bytes, of the Scanner's fixed input buffer (Scanner::buf). A
 * token has to fit inside it: Scanner::scan() returns TK_ERR when the buffer
 * fills up before the token is complete. */
#define BUFSIZE 4096

/* Ragel directive: emit the static data of the Scanner state machine here. */
%%{
        machine Scanner;
        write data;
}%%
38
/* Lookup of XML tag names. Only the declarations appear here; the definitions
 * live elsewhere (presumably generated by a perfect-hash tool -- confirm
 * against the build). */
class Perfect_Hash
{
public:
        /* Look up the name of length len starting at str. xml_parse() checks
         * the result against null before dereferencing it. */
        static struct XMLTagHashPair *in_word_set( const char *str, unsigned int len );

private:
        /* Hash of the name of length len starting at str; presumably only
         * used by in_word_set(). */
        static inline unsigned int hash( const char *str, unsigned int len );
};
47
48 struct Scanner
49 {
50         Scanner( istream &input ) : 
51                 input(input), 
52                 curline(1), 
53                 curcol(1),
54                 p(0), pe(0), 
55                 done(false),
56                 data(0), data_len(0),
57                 value(0)
58         {
59                 %%{
60                         machine Scanner;
61                         write init;
62                 }%%
63         }
64         
65         int scan();
66         void adjustAttrPointers( int distance );
67
68         istream &input;
69
70         /* Scanner State. */
71         int cs, act, have, curline, curcol;
72         char *tokstart, *tokend;
73         char *p, *pe;
74         int done;
75
76         /* Token data */
77         char *data;
78         int data_len;
79         int value;
80         AttrMkList attrMkList;
81         Buffer buffer;
82         char *tag_id_start;
83         int tag_id_len;
84         int token_col, token_line;
85
86         char buf[BUFSIZE];
87 };
88
89
/* Token codes returned by Scanner::scan(). TK_NO_TOKEN is the internal
 * "nothing recognised yet" marker. */
#define TK_NO_TOKEN (-1)
#define TK_ERR 1
#define TK_EOF 2
#define TK_OpenTag 3
#define TK_CloseTag 4

/* Record the token to return and remember where it starts. Expects `token`,
 * `data` and `tokstart` to be in scope at the point of use. Wrapped in
 * do/while(0) so that the expansion is a single statement and stays correct
 * under an unbraced if/else. */
#define ret_tok( _tok ) do { token = (_tok); data = tokstart; } while ( 0 )
97
98 void Scanner::adjustAttrPointers( int distance )
99 {
100         for ( AttrMkList::Iter attr = attrMkList; attr.lte(); attr++ ) {
101                 attr->id -= distance;
102                 attr->value -= distance;
103         }
104 }
105
106 int Scanner::scan( )
107 {
108         int token = TK_NO_TOKEN;
109         int space, readlen;
110         char *attr_id_start;
111         char *attr_value_start;
112         int attr_id_len;
113         int attr_value_len;
114
115         attrMkList.empty();
116         buffer.clear();
117
118         while ( 1 ) {
119                 if ( p == pe ) {
120                         //printf("scanner: need more data\n");
121
122                         if ( tokstart == 0 )
123                                 have = 0;
124                         else {
125                                 /* There is data that needs to be shifted over. */
126                                 //printf("scanner: buffer broken mid token\n");
127                                 have = pe - tokstart;
128                                 memmove( buf, tokstart, have );
129
130                                 int distance = tokstart - buf;
131                                 tokend -= distance;
132                                 tag_id_start -= distance;
133                                 attr_id_start -= distance;
134                                 attr_value_start -= distance;
135                                 adjustAttrPointers( distance );
136                                 tokstart = buf;
137                         }
138
139                         p = buf + have;
140                         space = BUFSIZE - have;
141
142                         if ( space == 0 ) {
143                                 /* We filled up the buffer trying to scan a token. */
144                                 //printf("scanner: out of buffer space, you have a really long tag\n");
145                                 return TK_ERR;
146                         }
147
148                         if ( done ) {
149                                 //printf("scanner: end of file\n");
150                                 p[0] = 0;
151                                 readlen = 1;
152                         }
153                         else {
154                                 input.read( p, space );
155                                 readlen = input.gcount();
156                                 if ( input.eof() ) {
157                                         //printf("scanner: setting done flag\n");
158                                         done = 1;
159                                 }
160                         }
161
162                         pe = p + readlen;
163                 }
164
165                 /* There is no claim that this is a proper XML parser, but it is good
166                  * enough for our purposes. */
167                 %%{
168                         machine Scanner;
169
170                         action colup { curcol++; }
171                         action start_tok { token_col = curcol; token_line = curline; }
172                         NL = '\n' @{ curcol = 0; curline++; };
173
174                         WS = [\r\t ] | NL;
175                         id = [_a-zA-Z][_a-zA-Z0-9]*;
176                         literal = '"' ( [^"] | NL )* '"';
177
178                         # Attribute identifiers.
179                         action start_attr_id { attr_id_start = p; }
180                         action leave_attr_id { attr_id_len = p - attr_id_start; }
181
182                         attr_id = id >start_attr_id %leave_attr_id;
183
184                         # Attribute values
185                         action start_attr_value { attr_value_start = p; }
186                         action leave_attr_value
187                         {
188                                 attr_value_len = p - attr_value_start;
189
190                                 AttrMarker newAttr;
191                                 newAttr.id = attr_id_start;
192                                 newAttr.idLen = attr_id_len;
193                                 newAttr.value = attr_value_start;
194                                 newAttr.valueLen = attr_value_len;
195                                 attrMkList.append( newAttr );
196                         }
197
198                         attr_value = literal >start_attr_value %leave_attr_value;
199
200                         # Attribute list. 
201                         attribute = attr_id WS* '=' WS* attr_value WS*;
202
203                         # Tag identifiers.
204                         action tag_id_start { tag_id_start = p; }
205                         action leave_tag_id { tag_id_len = p - tag_id_start; }
206
207                         tag_id = id >tag_id_start %leave_tag_id;
208
209                         main := |*
210                                 # Tags
211                                 ( '<' WS* tag_id ( WS+ attribute* )? '>' ) >start_tok $colup 
212                                         => { ret_tok( TK_OpenTag ); fbreak; };
213
214                                 ( '<' WS* '/' WS* tag_id WS* '>' ) >start_tok $colup 
215                                         => { ret_tok( TK_CloseTag ); fbreak; };
216
217                                 # Data in between tags.
218                                 ( [^<&\0] | NL ) $colup 
219                                         => { buffer.append( *p ); };
220
221                                 # Specials.
222                                 "&amp;" $colup
223                                         => { buffer.append( '&' ); };
224                                 "&lt;" $colup
225                                         => { buffer.append( '<' ); };
226                                 "&gt;" $colup
227                                         => { buffer.append( '>' ); };
228                                 
229                                 # EOF
230                                 0 >start_tok => { ret_tok( TK_EOF ); fbreak; };
231
232                         *|;
233
234                         write exec;
235                 }%%
236
237                 if ( cs == Scanner_error )
238                         return TK_ERR;
239
240                 if ( token != TK_NO_TOKEN ) {
241                         /* fbreak does not advance p, so we do it manually. */
242                         p = p + 1;
243                         data_len = p - data;
244                         return token;
245                 }
246         }
247 }
248
249
250 int xml_parse( istream &input, char *fileName )
251 {
252         Scanner scanner( input );
253         Parser parser( fileName );
254
255         parser.init();
256
257         while ( 1 ) {
258                 int token = scanner.scan();
259                 if ( token == TK_EOF ) {
260                         //cout << "parser_driver: EOF" << endl;
261                         parser.token( _eof );
262                         break;
263                 }
264                 else if ( token == TK_ERR ) {
265                         //cout << "parser_driver: ERR" << endl;
266                         break;
267                 }
268                 else {
269                         /* All other tokens are either open or close tags. */
270                         XMLTagHashPair *tagId = Perfect_Hash::in_word_set( 
271                                         scanner.tag_id_start, scanner.tag_id_len );
272
273                         XMLTag *tag = new XMLTag( tagId, token == TK_OpenTag ? 
274                                         XMLTag::Open : XMLTag::Close );
275
276                         if ( tagId != 0 ) {
277                                 /* Get attributes for open tags. */
278                                 if ( token == TK_OpenTag && scanner.attrMkList.length() > 0 ) {
279                                         tag->attrList = new AttrList;
280                                         for ( AttrMkList::Iter attr = scanner.attrMkList; 
281                                                         attr.lte(); attr++ )
282                                         {
283                                                 Attribute newAttr;
284                                                 newAttr.id = new char[attr->idLen+1];
285                                                 memcpy( newAttr.id, attr->id, attr->idLen );
286                                                 newAttr.id[attr->idLen] = 0;
287
288                                                 /* Exclude the surrounding quotes. */
289                                                 newAttr.value = new char[attr->valueLen-1];
290                                                 memcpy( newAttr.value, attr->value+1, attr->valueLen-2 );
291                                                 newAttr.value[attr->valueLen-2] = 0;
292
293                                                 tag->attrList->append( newAttr );
294                                         }
295                                 }
296
297                                 /* Get content for closing tags. */
298                                 if ( token == TK_CloseTag ) {
299                                         switch ( tagId->id ) {
300                                         case TAG_host: case TAG_option:
301                                         case TAG_t: case TAG_alphtype:
302                                         case TAG_text: case TAG_goto:
303                                         case TAG_call: case TAG_next:
304                                         case TAG_entry: case TAG_set_tokend:
305                                         case TAG_set_act: case TAG_start_state:
306                                         case TAG_state_actions: case TAG_action_table:
307                                         case TAG_cond_space: case TAG_c:
308                                                 tag->content = new char[scanner.buffer.length+1];
309                                                 memcpy( tag->content, scanner.buffer.data,
310                                                                 scanner.buffer.length );
311                                                 tag->content[scanner.buffer.length] = 0;
312                                                 break;
313                                         }
314                                 }
315                         }
316
317                         #if 0
318                         cout << "parser_driver: " << (tag->type == XMLTag::Open ? "open" : "close") <<
319                                         ": " << tag->tagId->name << endl;
320                         if ( tag->attrList != 0 ) {
321                                 for ( AttrList::Iter attr = *tag->attrList; attr.lte(); attr++ )
322                                         cout << "    " << attr->id << ": " << attr->value << endl;
323                         }
324                         if ( tag->content != 0 )
325                                 cout << "    content: " << tag->content << endl;
326                         #endif
327
328                         parser.token( tag, scanner.token_col, scanner.token_line );
329                 }
330         }
331
332         return 0;
333 }