2 * Copyright 2001-2007 Adrian Thurston <thurston@cs.queensu.ca>
5 /* This file is part of Ragel.
7 * Ragel is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * Ragel is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Ragel; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/* Lookup helpers that xml_parse() reaches through the Perfect_Hash:: scope.
 * hash: takes a string and its length and returns an unsigned int
 * (presumably the gperf-style hash of the string -- confirm). */
42 static inline unsigned int hash (const char *str, unsigned int len);
/* in_word_set: takes a name and its length and returns a pointer to an
 * XMLTagHashPair. xml_parse() later compares the tag's stored id pointer
 * against 0 before printing its name, so a failed lookup presumably yields
 * a null pointer -- confirm. */
45 static struct XMLTagHashPair *in_word_set (const char *str, unsigned int len);
/* Constructor: takes the name of the file being scanned and the stream to
 * read from. The member initializers and body are not visible in this view. */
50 Scanner( char *fileName, istream &input ) :
/* Moves the id/value pointers of every attrMkList entry back by `distance`. */
67 void adjustAttrPointers( int distance );
/* Writes a "fileName:curline:curcol: " prefix to cerr; callers stream the
 * message text into the result. */
68 std::ostream &error();
/* cs is compared against Scanner_error after the machine runs; have is the
 * number of bytes kept at the front of buf when the buffer is refilled;
 * curline/curcol are maintained by the NL and colup actions. The role of
 * act is not visible in this view. */
74 int cs, act, have, curline, curcol;
/* tokstart marks the beginning of the token being scanned; ret_tok copies
 * it into `data`. tokend is not used on any line visible in this view. */
75 char *tokstart, *tokend;
/* Attribute markers collected while scanning a tag; each entry holds
 * pointers into the scan buffer plus lengths. */
83 AttrMkList attrMkList;
/* Position recorded by the start_tok action and handed to the parser
 * together with each token. */
87 int token_col, token_line;
/* Initial value of the local `token` in the scan routine; it is compared
 * against this value to detect that a pattern action has stored a token. */
93 #define TK_NO_TOKEN (-1)
/* Stores the token kind in `token` and points `data` at tokstart. It expands
 * to two statements with no trailing semicolon and no enclosing braces; the
 * uses visible in this view all sit inside a braced action followed by
 * fbreak. */
100 #define ret_tok( _tok ) token = (_tok); data = tokstart
/* Rebases the attribute markers after the scan buffer contents have been
 * shifted: for every entry of attrMkList, the id and value pointers are
 * moved back by `distance`. The scan routine calls this right after its
 * memmove, passing the distance between tokstart and the start of buf.
 * (The braces of this function are not visible in this view.) */
102 void Scanner::adjustAttrPointers( int distance )
104 for ( AttrMkList::Iter attr = attrMkList; attr.lte(); attr++ ) {
105 attr->id -= distance;
106 attr->value -= distance;
110 /* There is no claim that this is a proper XML parser, but it is good
111 * enough for our purposes. */
# Position bookkeeping. colup adds one to curcol, start_tok copies the
# current position into token_col/token_line, and NL resets curcol to 0 and
# advances curline whenever a newline character is consumed.
115 action colup { curcol++; }
116 action start_tok { token_col = curcol; token_line = curline; }
117 NL = '\n' @{ curcol = 0; curline++; };
# id: a letter or underscore followed by letters, digits or underscores.
# literal: text enclosed in double-quote characters. It may span lines and
# ends at the first double-quote character after the opening one.
120 id = [_a-zA-Z][_a-zA-Z0-9]*;
121 literal = '"' ( [^"] | NL )* '"';
123 # Attribute identifiers.
# Remember where the attribute name starts and, on leaving it, its length.
124 action start_attr_id { attr_id_start = p; }
125 action leave_attr_id { attr_id_len = p - attr_id_start; }
127 attr_id = id >start_attr_id %leave_attr_id;
# Attribute values. The start pointer is taken when the literal is entered
# and the length when it is left, so the span includes both quote characters.
130 action start_attr_value { attr_value_start = p; }
# On leaving a value, the id/value pointers and lengths are stored in newAttr
# and appended to attrMkList. The entry holds pointers into the scan buffer.
# The braces and the declaration of newAttr are not visible in this view.
131 action leave_attr_value
133 attr_value_len = p - attr_value_start;
136 newAttr.id = attr_id_start;
137 newAttr.idLen = attr_id_len;
138 newAttr.value = attr_value_start;
139 newAttr.valueLen = attr_value_len;
140 attrMkList.append( newAttr );
143 attr_value = literal >start_attr_value %leave_attr_value;
# attribute: name, equals sign, value, with optional whitespace around the
# equals sign and after the value. WS is defined on a line not visible in
# this view.
146 attribute = attr_id WS* '=' WS* attr_value WS*;
# Tag names: remember the start pointer and, on leaving, the length.
149 action tag_id_start { tag_id_start = p; }
150 action leave_tag_id { tag_id_len = p - tag_id_start; }
152 tag_id = id >tag_id_start %leave_tag_id;
# Token patterns follow. The enclosing scanner definition is not visible in
# this view.
# Open tag: tag id optionally followed by whitespace and attributes. The
# first character runs start_tok, every character runs colup, and the match
# yields TK_OpenTag.
156 ( '<' WS* tag_id ( WS+ attribute* )? '>' ) >start_tok $colup
157 => { ret_tok( TK_OpenTag ); fbreak; };
# Close tag: a slash before the tag id and no attributes; yields TK_CloseTag.
159 ( '<' WS* '/' WS* tag_id WS* '>' ) >start_tok $colup
160 => { ret_tok( TK_CloseTag ); fbreak; };
162 # Data in between tags.
# Any single character other than <, & and NUL is appended to buffer.
163 ( [^<&\0] | NL ) $colup
164 => { buffer.append( *p ); };
# The patterns belonging to the next three actions are not visible in this
# view. The actions append a literal &, < and > to buffer (presumably for
# the corresponding XML entities -- confirm).
168 => { buffer.append( '&' ); };
170 => { buffer.append( '<' ); };
172 => { buffer.append( '>' ); };
# A NUL character is reported as TK_EOF.
175 0 >start_tok => { ret_tok( TK_EOF ); fbreak; };
/* Fragment of the scanning routine that xml_parse() invokes as
 * scanner.scan(). The function header, the loop structure, the return
 * statements and several other statements are not visible in this view, so
 * the comments below describe only the lines that are shown. */
/* Stays TK_NO_TOKEN until a pattern action assigns a token through ret_tok. */
182 int token = TK_NO_TOKEN;
/* Start of the attribute value currently being scanned; set by the
 * start_attr_value action. */
185 char *attr_value_start;
194 //printf("scanner: need more data\n");
199 /* There is data that needs to be shifted over. */
200 //printf("scanner: buffer broken mid token\n");
/* Keep the unfinished token: `have` is the number of bytes between tokstart
 * and pe, and those bytes are moved to the front of buf. */
201 have = pe - tokstart;
202 memmove( buf, tokstart, have );
/* Every saved pointer into the buffer has to move back by the same amount
 * as the data did. */
204 int distance = tokstart - buf;
206 tag_id_start -= distance;
207 attr_id_start -= distance;
208 attr_value_start -= distance;
209 adjustAttrPointers( distance );
/* Room left in buf after the retained bytes. */
214 space = BUFSIZE - have;
217 /* We filled up the buffer trying to scan a token. */
222 //printf("scanner: end of file\n");
/* Read at most `space` bytes to p; gcount() reports how many actually
 * arrived. */
227 input.read( p, space );
228 readlen = input.gcount();
230 //printf("scanner: setting done flag\n");
/* The state machine is in its error state. What happens next is on a line
 * not visible in this view; xml_parse() reports a TK_ERR token as
 * "scanner error". */
240 if ( cs == Scanner_error )
/* A pattern action stored a token via ret_tok. */
243 if ( token != TK_NO_TOKEN ) {
244 /* fbreak does not advance p, so we do it manually. */
/* Scans `input` as XML and hands every token to a Parser.
 *
 * input:        stream given to the Scanner.
 * fileName:     passed to both the Scanner and the Parser.
 * outputActive, wantComplete: forwarded unchanged to the Parser constructor.
 *
 * Only part of the function is visible in this view: the enclosing loop,
 * the return statements, several braces and a few declarations are on
 * lines that are not shown, so the return value is not documented here. */
252 int xml_parse( std::istream &input, char *fileName,
253 bool outputActive, bool wantComplete )
255 Scanner scanner( fileName, input );
256 Parser parser( fileName, outputActive, wantComplete );
/* Fetch the next token and dispatch on its kind. */
261 int token = scanner.scan();
262 if ( token == TK_NO_TOKEN ) {
/* NOTE(review): the message below spells "internal" as "interal". It is a
 * runtime string, so it is left unchanged here. */
263 cerr << "xmlscan: interal error: scanner returned NO_TOKEN" << endl;
266 else if ( token == TK_EOF ) {
/* End of input is forwarded to the parser as _eof together with the
 * position recorded by the scanner. */
267 parser.token( _eof, scanner.token_col, scanner.token_line );
270 else if ( token == TK_ERR ) {
271 scanner.error() << "scanner error" << endl;
274 else if ( token == TK_SPACE ) {
275 scanner.error() << "scanner is out of buffer space" << endl;
279 /* All other tokens are either open or close tags. */
/* Map the scanned tag name to its table entry. */
280 XMLTagHashPair *tagId = Perfect_Hash::in_word_set(
281 scanner.tag_id_start, scanner.tag_id_len );
/* The tag object is heap-allocated and later passed to parser.token();
 * who releases it is not visible in this view. */
283 XMLTag *tag = new XMLTag( tagId, token == TK_OpenTag ?
284 XMLTag::Open : XMLTag::Close );
287 /* Get attributes for open tags. */
288 if ( token == TK_OpenTag && scanner.attrMkList.length() > 0 ) {
289 tag->attrList = new AttrList;
290 for ( AttrMkList::Iter attr = scanner.attrMkList;
/* The markers point into the scan buffer, so id and value are copied into
 * freshly allocated, NUL-terminated strings. */
294 newAttr.id = new char[attr->idLen+1];
295 memcpy( newAttr.id, attr->id, attr->idLen );
296 newAttr.id[attr->idLen] = 0;
298 /* Exclude the surrounding quotes. */
/* valueLen counts both quote characters: valueLen-2 characters are copied
 * and one more byte holds the terminator, hence valueLen-1 bytes. */
299 newAttr.value = new char[attr->valueLen-1];
300 memcpy( newAttr.value, attr->value+1, attr->valueLen-2 );
301 newAttr.value[attr->valueLen-2] = 0;
303 tag->attrList->append( newAttr );
307 /* Get content for closing tags. */
308 if ( token == TK_CloseTag ) {
/* NOTE(review): tagId is dereferenced here, while the diagnostic output
 * further down allows for a null tag id. A guard may exist on a line not
 * visible in this view -- confirm. */
309 switch ( tagId->id ) {
310 case TAG_host: case TAG_arg:
311 case TAG_t: case TAG_alphtype:
312 case TAG_text: case TAG_goto:
313 case TAG_call: case TAG_next:
314 case TAG_entry: case TAG_set_tokend:
315 case TAG_set_act: case TAG_start_state:
316 case TAG_error_state: case TAG_state_actions:
317 case TAG_action_table: case TAG_cond_space:
318 case TAG_c: case TAG_ex:
/* For the tags listed above, the text collected in scanner.buffer is copied
 * into a NUL-terminated string and stored as the tag content. */
319 tag->content = new char[scanner.buffer.length+1];
320 memcpy( tag->content, scanner.buffer.data,
321 scanner.buffer.length );
322 tag->content[scanner.buffer.length] = 0;
/* Diagnostic dump of the tag (kind, name, attributes, content) to cerr.
 * Whatever condition guards it is not visible in this view. */
329 cerr << "parser_driver: " << (tag->type == XMLTag::Open ? "open" : "close") <<
330 ": " << (tag->tagId != 0 ? tag->tagId->name : "<unknown>") << endl;
331 if ( tag->attrList != 0 ) {
332 for ( AttrList::Iter attr = *tag->attrList; attr.lte(); attr++ )
333 cerr << " " << attr->id << ": " << attr->value << endl;
335 if ( tag->content != 0 )
336 cerr << " content: " << tag->content << endl;
/* Hand the finished tag to the parser with the position where the token
 * started. */
339 parser.token( tag, scanner.token_col, scanner.token_line );
/* Writes the "fileName:curline:curcol: " prefix to cerr; xml_parse() streams
 * the message text into the returned stream. The rest of the body,
 * including the return statement, is not visible in this view. */
346 std::ostream &Scanner::error()
348 cerr << fileName << ":" << curline << ":" << curcol << ": ";