In kelbt 0.13 the eof token is prefixed with the parser name. Also updated the
[external/ragel.git] / redfsm / xmlscan.rl
1 /*
2  *  Copyright 2001-2007 Adrian Thurston <thurston@cs.queensu.ca>
3  */
4
5 /*  This file is part of Ragel.
6  *
7  *  Ragel is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  * 
12  *  Ragel is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  * 
17  *  You should have received a copy of the GNU General Public License
18  *  along with Ragel; if not, write to the Free Software
19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
20  */
21 #include <iostream>
22 #include <string.h>
23 #include "vector.h"
24 #include "xmlparse.h"
25
26 using std::istream;
27 using std::cout;
28 using std::cerr;
29 using std::endl;
30
/* Ragel directive: emit the static data of the XmlScanner machine at this
 * point in the generated file. Per the Ragel guide this data includes the
 * <machine>_error constant, which is presumably where the XmlScanner_error
 * value tested in XmlScanner::scan() comes from. */
%%{
        machine XmlScanner;
        write data;
}%%
35
/*
 * Declaration of the tag-name lookup class. Only the declaration is visible
 * in this file; the member definitions live elsewhere.
 * NOTE(review): the shape of this class suggests it mirrors a declaration
 * generated by a perfect-hash tool -- keep it token-for-token in sync with
 * that declaration; confirm against the generated source.
 */
class Perfect_Hash
{
private:
        /* Hash helper for in_word_set(); not callable from outside the class. */
        static inline unsigned int hash (const char *str, unsigned int len);

public:
        /* Look up the len bytes at str. xml_parse() passes the scanned tag
         * name and treats a null result as an unknown tag. */
        static struct XMLTagHashPair *in_word_set (const char *str, unsigned int len);
};
44
45 XmlScanner::XmlScanner( const char *fileName, istream &input ) : 
46         fileName(fileName),
47         input(input), 
48         curline(1), 
49         curcol(1),
50         p(0), pe(0), 
51         done(false),
52         data(0), data_len(0),
53         value(0)
54 {
55         %%{
56                 machine XmlScanner;
57                 write init;
58         }%%
59 }
60
/* Token codes produced by XmlScanner::scan(). TK_NO_TOKEN is the value scan()
 * starts from and means that no token has been recognized yet. */
#define TK_NO_TOKEN (-1)
#define TK_ERR 1
#define TK_SPACE 2
#define TK_EOF 3
#define TK_OpenTag 4
#define TK_CloseTag 5

/* Record the recognized token: store its code in `token` and remember the
 * token start `ts` in `data`. Expects `token`, `data` and `ts` to be in scope
 * at the point of use. Wrapped in do/while(0) so that the expansion is a
 * single statement and stays correct under an unbraced if/else. */
#define ret_tok( _tok ) do { token = (_tok); data = ts; } while (0)
69
70 void XmlScanner::adjustAttrPointers( int distance )
71 {
72         for ( AttrMkList::Iter attr = attrMkList; attr.lte(); attr++ ) {
73                 attr->id -= distance;
74                 attr->value -= distance;
75         }
76 }
77
/*
 * Ragel definition of the XmlScanner machine.
 *
 * There is no claim that this is a proper XML parser, but it is good
 * enough for our purposes.
 *
 * Position tracking:
 *   colup      increments curcol. It is attached with '$' to the tag, data
 *              and entity patterns of the scanner.
 *   start_tok  copies curcol and curline into token_col and token_line. It
 *              is attached with '>' to the tag and EOF patterns.
 *   NL         matches a newline, sets curcol to 0 and increments curline.
 *
 * Markers recorded while a tag is matched (all taken from the cursor p):
 *   tag_id_start, tag_id_len          the tag name.
 *   attr_id_start, attr_id_len        the name of the current attribute.
 *   attr_value_start, attr_value_len  the attribute value. The value is
 *                                     matched by `literal`, so the span
 *                                     includes both double quotes.
 *   When an attribute value has been matched, the id and value markers are
 *   bundled into an AttrMarker and appended to attrMkList.
 *
 * Scanner patterns (main):
 *   opening tag, optionally with attributes   ret_tok( TK_OpenTag ), break.
 *   closing tag                               ret_tok( TK_CloseTag ), break.
 *   any byte other than '<', '&' and NUL      appended to buffer.
 *   "&amp;", "&lt;", "&gt;"                   append '&', '<', '>' to buffer.
 *   NUL byte                                  ret_tok( TK_EOF ), break.
 *                                             scan() feeds this byte once
 *                                             the input is exhausted.
 */
%%{
        machine XmlScanner;

        action colup { curcol++; }
        action start_tok { token_col = curcol; token_line = curline; }
        NL = '\n' @{ curcol = 0; curline++; };

        WS = [\r\t ] | NL;
        id = [_a-zA-Z][_a-zA-Z0-9]*;
        literal = '"' ( [^"] | NL )* '"';

        # Attribute identifiers.
        action start_attr_id { attr_id_start = p; }
        action leave_attr_id { attr_id_len = p - attr_id_start; }

        attr_id = id >start_attr_id %leave_attr_id;

        # Attribute values
        action start_attr_value { attr_value_start = p; }
        action leave_attr_value
        {
                attr_value_len = p - attr_value_start;

                AttrMarker newAttr;
                newAttr.id = attr_id_start;
                newAttr.idLen = attr_id_len;
                newAttr.value = attr_value_start;
                newAttr.valueLen = attr_value_len;
                attrMkList.append( newAttr );
        }

        attr_value = literal >start_attr_value %leave_attr_value;

        # Attribute list. 
        attribute = attr_id WS* '=' WS* attr_value WS*;

        # Tag identifiers.
        action tag_id_start { tag_id_start = p; }
        action leave_tag_id { tag_id_len = p - tag_id_start; }

        tag_id = id >tag_id_start %leave_tag_id;

        main := |*
                # Tags
                ( '<' WS* tag_id ( WS+ attribute* )? '>' ) >start_tok $colup 
                        => { ret_tok( TK_OpenTag ); fbreak; };

                ( '<' WS* '/' WS* tag_id WS* '>' ) >start_tok $colup 
                        => { ret_tok( TK_CloseTag ); fbreak; };

                # Data in between tags.
                ( [^<&\0] | NL ) $colup 
                        => { buffer.append( *p ); };

                # Specials.
                "&amp;" $colup
                        => { buffer.append( '&' ); };
                "&lt;" $colup
                        => { buffer.append( '<' ); };
                "&gt;" $colup
                        => { buffer.append( '>' ); };
                
                # EOF
                0 >start_tok => { ret_tok( TK_EOF ); fbreak; };

        *|;
}%%
147
148 int XmlScanner::scan( )
149 {
150         int token = TK_NO_TOKEN;
151         int space = 0, readlen = 0;
152         char *attr_id_start = 0;
153         char *attr_value_start = 0;
154         int attr_id_len = 0;
155         int attr_value_len = 0;
156
157         attrMkList.empty();
158         buffer.clear();
159
160         while ( 1 ) {
161                 if ( p == pe ) {
162                         //printf("scanner: need more data\n");
163
164                         if ( ts == 0 )
165                                 have = 0;
166                         else {
167                                 /* There is data that needs to be shifted over. */
168                                 //printf("scanner: buffer broken mid token\n");
169                                 have = pe - ts;
170                                 memmove( buf, ts, have );
171
172                                 int distance = ts - buf;
173                                 te -= distance;
174                                 tag_id_start -= distance;
175                                 attr_id_start -= distance;
176                                 attr_value_start -= distance;
177                                 adjustAttrPointers( distance );
178                                 ts = buf;
179                         }
180
181                         p = buf + have;
182                         space = XML_BUFSIZE - have;
183
184                         if ( space == 0 ) {
185                                 /* We filled up the buffer trying to scan a token. */
186                                 return TK_SPACE;
187                         }
188
189                         if ( done ) {
190                                 //printf("scanner: end of file\n");
191                                 p[0] = 0;
192                                 readlen = 1;
193                         }
194                         else {
195                                 input.read( p, space );
196                                 readlen = input.gcount();
197                                 if ( input.eof() ) {
198                                         //printf("scanner: setting done flag\n");
199                                         done = 1;
200                                 }
201                         }
202
203                         pe = p + readlen;
204                 }
205
206                 %% write exec;
207
208                 if ( cs == XmlScanner_error )
209                         return TK_ERR;
210
211                 if ( token != TK_NO_TOKEN ) {
212                         data_len = p - data;
213                         return token;
214                 }
215         }
216 }
217
218 int xml_parse( std::istream &input, const char *fileName, 
219                 bool outputActive, bool wantComplete, 
220                 XmlScanner &scanner, XmlParser &parser )
221 {
222         while ( 1 ) {
223                 int token = scanner.scan();
224                 if ( token == TK_NO_TOKEN ) {
225                         cerr << "xmlscan: interal error: scanner returned NO_TOKEN" << endl;
226                         exit(1);
227                 }
228                 else if ( token == TK_EOF ) {
229                         parser.token( XmlParser_tk_eof, scanner.token_col, scanner.token_line );
230                         break;
231                 }
232                 else if ( token == TK_ERR ) {
233                         scanner.error() << "scanner error" << endl;
234                         break;
235                 }
236                 else if ( token == TK_SPACE ) {
237                         scanner.error() << "scanner is out of buffer space" << endl;
238                         break;
239                 }
240                 else {
241                         /* All other tokens are either open or close tags. */
242                         XMLTagHashPair *tagId = Perfect_Hash::in_word_set( 
243                                         scanner.tag_id_start, scanner.tag_id_len );
244
245                         XMLTag *tag = new XMLTag( tagId, token == TK_OpenTag ? 
246                                         XMLTag::Open : XMLTag::Close );
247
248                         if ( tagId != 0 ) {
249                                 /* Get attributes for open tags. */
250                                 if ( token == TK_OpenTag && scanner.attrMkList.length() > 0 ) {
251                                         tag->attrList = new AttrList;
252                                         for ( AttrMkList::Iter attr = scanner.attrMkList; 
253                                                         attr.lte(); attr++ )
254                                         {
255                                                 Attribute newAttr;
256                                                 newAttr.id = new char[attr->idLen+1];
257                                                 memcpy( newAttr.id, attr->id, attr->idLen );
258                                                 newAttr.id[attr->idLen] = 0;
259
260                                                 /* Exclude the surrounding quotes. */
261                                                 newAttr.value = new char[attr->valueLen-1];
262                                                 memcpy( newAttr.value, attr->value+1, attr->valueLen-2 );
263                                                 newAttr.value[attr->valueLen-2] = 0;
264
265                                                 tag->attrList->append( newAttr );
266                                         }
267                                 }
268
269                                 /* Get content for closing tags. */
270                                 if ( token == TK_CloseTag ) {
271                                         switch ( tagId->id ) {
272                                         case TAG_host: case TAG_arg:
273                                         case TAG_t: case TAG_alphtype:
274                                         case TAG_text: case TAG_goto:
275                                         case TAG_call: case TAG_next:
276                                         case TAG_entry: case TAG_set_tokend:
277                                         case TAG_set_act: case TAG_start_state:
278                                         case TAG_error_state: case TAG_state_actions: 
279                                         case TAG_action_table: case TAG_cond_space: 
280                                         case TAG_c: case TAG_ex: case TAG_eof_t:
281                                                 tag->content = new char[scanner.buffer.length+1];
282                                                 memcpy( tag->content, scanner.buffer.data,
283                                                                 scanner.buffer.length );
284                                                 tag->content[scanner.buffer.length] = 0;
285                                                 break;
286                                         }
287                                 }
288                         }
289
290                         #if 0
291                         cerr << "parser_driver: " << (tag->type == XMLTag::Open ? "open" : "close") <<
292                                         ": " << (tag->tagId != 0 ? tag->tagId->name : "<unknown>") << endl;
293                         if ( tag->attrList != 0 ) {
294                                 for ( AttrList::Iter attr = *tag->attrList; attr.lte(); attr++ )
295                                         cerr << "    " << attr->id << ": " << attr->value << endl;
296                         }
297                         if ( tag->content != 0 )
298                                 cerr << "    content: " << tag->content << endl;
299                         #endif
300
301                         parser.token( tag, scanner.token_col, scanner.token_line );
302                 }
303         }
304
305         return 0;
306 }
307
308 std::ostream &XmlScanner::error()
309 {
310         gblErrorCount += 1;
311         cerr << fileName << ":" << curline << ":" << curcol << ": ";
312         return cerr;
313 }