2 www.sourceforge.net/projects/tinyxml
\r
3 Original code by Lee Thomason (www.grinninglizard.com)
\r
5 This software is provided 'as-is', without any express or implied
\r
6 warranty. In no event will the authors be held liable for any
\r
7 damages arising from the use of this software.
\r
9 Permission is granted to anyone to use this software for any
\r
10 purpose, including commercial applications, and to alter it and
\r
11 redistribute it freely, subject to the following restrictions:
\r
13 1. The origin of this software must not be misrepresented; you must
\r
14 not claim that you wrote the original software. If you use this
\r
15 software in a product, an acknowledgment in the product documentation
\r
16 would be appreciated but is not required.
\r
18 2. Altered source versions must be plainly marked as such, and
\r
19 must not be misrepresented as being the original software.
\r
21 3. This notice may not be removed or altered from any source
\r
28 #include "tinyxml.h"
\r
30 //#define DEBUG_PARSER
\r
31 #if defined( DEBUG_PARSER )
\r
32 # if defined( DEBUG ) && defined( _MSC_VER )
\r
33 # include <windows.h>
\r
34 # define TIXML_LOG OutputDebugString
\r
36 # define TIXML_LOG printf
\r
40 // Note that "PutString" hardcodes the same list. This
\r
41 // is less flexible than it appears. Changing the entries
\r
42 // or order will break putstring.
\r
43 TiXmlBase::Entity TiXmlBase::entity[ TiXmlBase::NUM_ENTITY ] =
\r
45 { "&", 5, '&' },
\r
48 { """, 6, '\"' },
\r
49 { "'", 6, '\'' }
\r
52 // Bunch of unicode info at:
\r
53 // http://www.unicode.org/faq/utf_bom.html
\r
54 // Including the basis of this table, which determines the #bytes in the
\r
55 // sequence from the lead byte. 1 placed for invalid sequences --
\r
56 // although the result will be junk, pass it through as much as possible.
\r
57 // Beware of the non-characters in UTF-8:
\r
58 // ef bb bf (Microsoft "lead bytes")
\r
62 const unsigned char TIXML_UTF_LEAD_0 = 0xefU;
\r
63 const unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
\r
64 const unsigned char TIXML_UTF_LEAD_2 = 0xbfU;
\r
66 const int TiXmlBase::utf8ByteTable[256] =
\r
68 // 0 1 2 3 4 5 6 7 8 9 a b c d e f
\r
69 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
\r
70 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
\r
71 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
\r
72 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
\r
73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
\r
74 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
\r
75 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
\r
76 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range
\r
77 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid
\r
78 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
\r
79 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
\r
80 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
\r
81 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte
\r
82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
\r
83 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte
\r
84 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
\r
88 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
\r
90 const unsigned long BYTE_MASK = 0xBF;
\r
91 const unsigned long BYTE_MARK = 0x80;
\r
92 const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
\r
96 else if ( input < 0x800 )
\r
98 else if ( input < 0x10000 )
\r
100 else if ( input < 0x200000 )
\r
103 { *length = 0; return; } // This code won't covert this correctly anyway.
\r
107 // Scary scary fall throughs.
\r
112 *output = (char)((input | BYTE_MARK) & BYTE_MASK);
\r
116 *output = (char)((input | BYTE_MARK) & BYTE_MASK);
\r
120 *output = (char)((input | BYTE_MARK) & BYTE_MASK);
\r
124 *output = (char)(input | FIRST_BYTE_MARK[*length]);
\r
129 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
\r
131 // This will only work for low-ascii, everything else is assumed to be a valid
\r
132 // letter. I'm not sure this is the best approach, but it is quite tricky trying
\r
133 // to figure out alphabetical vs. not across encoding. So take a very
\r
134 // conservative approach.
\r
136 // if ( encoding == TIXML_ENCODING_UTF8 )
\r
138 if ( anyByte < 127 )
\r
139 return isalpha( anyByte );
\r
141 return 1; // What else to do? The unicode set is huge...get the english ones right.
\r
145 // return isalpha( anyByte );
\r
150 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
\r
152 // This will only work for low-ascii, everything else is assumed to be a valid
\r
153 // letter. I'm not sure this is the best approach, but it is quite tricky trying
\r
154 // to figure out alphabetical vs. not across encoding. So take a very
\r
155 // conservative approach.
\r
157 // if ( encoding == TIXML_ENCODING_UTF8 )
\r
159 if ( anyByte < 127 )
\r
160 return isalnum( anyByte );
\r
162 return 1; // What else to do? The unicode set is huge...get the english ones right.
\r
166 // return isalnum( anyByte );
\r
171 class TiXmlParsingData
\r
173 friend class TiXmlDocument;
\r
175 void Stamp( const char* now, TiXmlEncoding encoding );
\r
177 const TiXmlCursor& Cursor() const { return cursor; }
\r
180 // Only used by the document!
\r
181 TiXmlParsingData( const char* start, int _tabsize, int row, int col )
\r
185 tabsize = _tabsize;
\r
190 TiXmlCursor cursor;
\r
196 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
\r
200 // Do nothing if the tabsize is 0.
\r
206 // Get the current row, column.
\r
207 int row = cursor.row;
\r
208 int col = cursor.col;
\r
209 const char* p = stamp;
\r
214 // Treat p as unsigned, so we have a happy compiler.
\r
215 const unsigned char* pU = (const unsigned char*)p;
\r
217 // Code contributed by Fletcher Dunn: (modified by lee)
\r
220 // We *should* never get here, but in case we do, don't
\r
221 // advance past the terminating null character, ever
\r
225 // bump down to the next line
\r
228 // Eat the character
\r
231 // Check for \r\n sequence, and treat this as a single character
\r
238 // bump down to the next line
\r
242 // Eat the character
\r
245 // Check for \n\r sequence, and treat this as a single
\r
246 // character. (Yes, this bizarre thing does occur still
\r
247 // on some arcane platforms...)
\r
254 // Eat the character
\r
257 // Skip to next tab stop
\r
258 col = (col / tabsize + 1) * tabsize;
\r
261 case TIXML_UTF_LEAD_0:
\r
262 if ( encoding == TIXML_ENCODING_UTF8 )
\r
264 if ( *(p+1) && *(p+2) )
\r
266 // In these cases, don't advance the column. These are
\r
268 if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
\r
270 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
\r
272 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
\r
275 { p +=3; ++col; } // A normal character.
\r
286 if ( encoding == TIXML_ENCODING_UTF8 )
\r
288 // Eat the 1 to 4 byte utf8 character.
\r
289 int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
\r
291 step = 1; // Error case from bad encoding, but handle gracefully.
\r
294 // Just advance one column, of course.
\r
307 assert( cursor.row >= -1 );
\r
308 assert( cursor.col >= -1 );
\r
314 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
\r
320 if ( encoding == TIXML_ENCODING_UTF8 )
\r
324 const unsigned char* pU = (const unsigned char*)p;
\r
326 // Skip the stupid Microsoft UTF-8 Byte order marks
\r
327 if ( *(pU+0)==TIXML_UTF_LEAD_0
\r
328 && *(pU+1)==TIXML_UTF_LEAD_1
\r
329 && *(pU+2)==TIXML_UTF_LEAD_2 )
\r
334 else if(*(pU+0)==TIXML_UTF_LEAD_0
\r
336 && *(pU+2)==0xbeU )
\r
341 else if(*(pU+0)==TIXML_UTF_LEAD_0
\r
343 && *(pU+2)==0xbfU )
\r
349 if ( IsWhiteSpace( *p ) ) // Still using old rules for white space.
\r
357 while ( *p && IsWhiteSpace( *p ) )
\r
364 #ifdef TIXML_USE_STL
\r
365 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
\r
369 if ( !in->good() ) return false;
\r
371 int c = in->peek();
\r
372 // At this scope, we can't get to a document. So fail silently.
\r
373 if ( !IsWhiteSpace( c ) || c <= 0 )
\r
376 *tag += (char) in->get();
\r
380 /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
\r
382 //assert( character > 0 && character < 128 ); // else it won't work in utf-8
\r
383 while ( in->good() )
\r
385 int c = in->peek();
\r
386 if ( c == character )
\r
388 if ( c <= 0 ) // Silent failure: can't get document at this scope
\r
398 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
\r
399 // "assign" optimization removes over 10% of the execution time.
\r
401 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
\r
403 // Oddly, not supported on some compilers,
\r
409 // Names start with letters or underscores.
\r
410 // Of course, in unicode, tinyxml has no idea what a letter *is*. The
\r
411 // algorithm is generous.
\r
413 // After that, they can be letters, underscores, numbers,
\r
414 // hyphens, or colons. (Colons are valid only for namespaces,
\r
415 // but tinyxml can't tell namespaces from names.)
\r
417 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
\r
419 const char* start = p;
\r
421 && ( IsAlphaNum( (unsigned char ) *p, encoding )
\r
427 //(*name) += *p; // expensive
\r
430 if ( p-start > 0 ) {
\r
431 name->assign( start, p-start );
\r
438 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
\r
440 // Presume an entity, and pull it out.
\r
445 if ( *(p+1) && *(p+1) == '#' && *(p+2) )
\r
447 unsigned long ucs = 0;
\r
448 ptrdiff_t delta = 0;
\r
451 if ( *(p+2) == 'x' )
\r
454 if ( !*(p+3) ) return 0;
\r
456 const char* q = p+3;
\r
457 q = strchr( q, ';' );
\r
459 if ( !q || !*q ) return 0;
\r
464 while ( *q != 'x' )
\r
466 if ( *q >= '0' && *q <= '9' )
\r
467 ucs += mult * (*q - '0');
\r
468 else if ( *q >= 'a' && *q <= 'f' )
\r
469 ucs += mult * (*q - 'a' + 10);
\r
470 else if ( *q >= 'A' && *q <= 'F' )
\r
471 ucs += mult * (*q - 'A' + 10 );
\r
481 if ( !*(p+2) ) return 0;
\r
483 const char* q = p+2;
\r
484 q = strchr( q, ';' );
\r
486 if ( !q || !*q ) return 0;
\r
491 while ( *q != '#' )
\r
493 if ( *q >= '0' && *q <= '9' )
\r
494 ucs += mult * (*q - '0');
\r
501 if ( encoding == TIXML_ENCODING_UTF8 )
\r
503 // convert the UCS to UTF-8
\r
504 ConvertUTF32ToUTF8( ucs, value, length );
\r
508 *value = (char)ucs;
\r
511 return p + delta + 1;
\r
514 // Now try to match it.
\r
515 for( i=0; i<NUM_ENTITY; ++i )
\r
517 if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
\r
519 assert( strlen( entity[i].str ) == entity[i].strLength );
\r
520 *value = entity[i].chr;
\r
522 return ( p + entity[i].strLength );
\r
526 // So it wasn't an entity, it's unrecognized, or something like that.
\r
527 *value = *p; // Don't put back the last one, since we return it!
\r
528 //*length = 1; // Leave unrecognized entities - this doesn't really work.
\r
529 // Just writes strange XML.
\r
534 bool TiXmlBase::StringEqual( const char* p,
\r
537 TiXmlEncoding encoding )
\r
551 while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
\r
562 while ( *q && *tag && *q == *tag )
\r
568 if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
\r
574 const char* TiXmlBase::ReadText( const char* p,
\r
575 TIXML_STRING * text,
\r
576 bool trimWhiteSpace,
\r
577 const char* endTag,
\r
578 bool caseInsensitive,
\r
579 TiXmlEncoding encoding )
\r
582 if ( !trimWhiteSpace // certain tags always keep whitespace
\r
583 || !condenseWhiteSpace ) // if true, whitespace is always kept
\r
585 // Keep all the white space.
\r
587 && !StringEqual( p, endTag, caseInsensitive, encoding )
\r
591 char cArr[4] = { 0, 0, 0, 0 };
\r
592 p = GetChar( p, cArr, &len, encoding );
\r
593 text->append( cArr, len );
\r
598 bool whitespace = false;
\r
600 // Remove leading white space:
\r
601 p = SkipWhiteSpace( p, encoding );
\r
603 && !StringEqual( p, endTag, caseInsensitive, encoding ) )
\r
605 if ( *p == '\r' || *p == '\n' )
\r
610 else if ( IsWhiteSpace( *p ) )
\r
617 // If we've found whitespace, add it before the
\r
618 // new character. Any whitespace just becomes a space.
\r
622 whitespace = false;
\r
625 char cArr[4] = { 0, 0, 0, 0 };
\r
626 p = GetChar( p, cArr, &len, encoding );
\r
628 (*text) += cArr[0]; // more efficient
\r
630 text->append( cArr, len );
\r
635 p += strlen( endTag );
\r
636 return ( p && *p ) ? p : 0;
\r
639 #ifdef TIXML_USE_STL
\r
641 void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
\r
643 // The basic issue with a document is that we don't know what we're
\r
644 // streaming. Read something presumed to be a tag (and hope), then
\r
645 // identify it, and call the appropriate stream method on the tag.
\r
647 // This "pre-streaming" will never read the closing ">" so the
\r
648 // sub-tag can orient itself.
\r
650 if ( !StreamTo( in, '<', tag ) )
\r
652 SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
656 while ( in->good() )
\r
658 int tagIndex = (int) tag->length();
\r
659 while ( in->good() && in->peek() != '>' )
\r
664 SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
667 (*tag) += (char) c;
\r
672 // We now have something we presume to be a node of
\r
673 // some sort. Identify it, and call the node to
\r
674 // continue streaming.
\r
675 TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
\r
679 node->StreamIn( in, tag );
\r
680 bool isElement = node->ToElement() != 0;
\r
684 // If this is the root element, we're done. Parsing will be
\r
685 // done by the >> operator.
\r
693 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
698 // We should have returned sooner.
\r
699 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
704 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
\r
708 // Parse away, at the document level. Since a document
\r
709 // contains nothing but other tags, most of what happens
\r
710 // here is skipping white space.
\r
713 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
717 // Note that, for a document, this needs to come
\r
718 // before the white space skip, so that parsing
\r
719 // starts from the pointer we are given.
\r
723 location.row = prevData->cursor.row;
\r
724 location.col = prevData->cursor.col;
\r
731 TiXmlParsingData data( p, TabSize(), location.row, location.col );
\r
732 location = data.Cursor();
\r
734 if ( encoding == TIXML_ENCODING_UNKNOWN )
\r
736 // Check for the Microsoft UTF-8 lead bytes.
\r
737 const unsigned char* pU = (const unsigned char*)p;
\r
738 if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
\r
739 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
\r
740 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
\r
742 encoding = TIXML_ENCODING_UTF8;
\r
743 useMicrosoftBOM = true;
\r
747 p = SkipWhiteSpace( p, encoding );
\r
750 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
756 TiXmlNode* node = Identify( p, encoding );
\r
759 p = node->Parse( p, &data, encoding );
\r
760 LinkEndChild( node );
\r
767 // Did we get encoding info?
\r
768 if ( encoding == TIXML_ENCODING_UNKNOWN
\r
769 && node->ToDeclaration() )
\r
771 TiXmlDeclaration* dec = node->ToDeclaration();
\r
772 const char* enc = dec->Encoding();
\r
776 encoding = TIXML_ENCODING_UTF8;
\r
777 else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
\r
778 encoding = TIXML_ENCODING_UTF8;
\r
779 else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
\r
780 encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
\r
782 encoding = TIXML_ENCODING_LEGACY;
\r
785 p = SkipWhiteSpace( p, encoding );
\r
789 if ( !firstChild ) {
\r
790 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
\r
798 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
\r
800 // The first error in a chain is more accurate - don't set again!
\r
804 assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
\r
807 errorDesc = errorString[ errorId ];
\r
809 errorLocation.Clear();
\r
810 if ( pError && data )
\r
812 data->Stamp( pError, encoding );
\r
813 errorLocation = data->Cursor();
\r
818 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
\r
820 TiXmlNode* returnNode = 0;
\r
822 p = SkipWhiteSpace( p, encoding );
\r
823 if( !p || !*p || *p != '<' )
\r
828 p = SkipWhiteSpace( p, encoding );
\r
835 // What is this thing?
\r
836 // - Elements start with a letter or underscore, but xml is reserved.
\r
837 // - Comments: <!--
\r
838 // - Declaration: <?xml
\r
839 // - Everything else is unknown to tinyxml.
\r
842 const char* xmlHeader = { "<?xml" };
\r
843 const char* commentHeader = { "<!--" };
\r
844 const char* dtdHeader = { "<!" };
\r
845 const char* cdataHeader = { "<![CDATA[" };
\r
847 if ( StringEqual( p, xmlHeader, true, encoding ) )
\r
849 #ifdef DEBUG_PARSER
\r
850 TIXML_LOG( "XML parsing Declaration\n" );
\r
852 returnNode = new TiXmlDeclaration();
\r
854 else if ( StringEqual( p, commentHeader, false, encoding ) )
\r
856 #ifdef DEBUG_PARSER
\r
857 TIXML_LOG( "XML parsing Comment\n" );
\r
859 returnNode = new TiXmlComment();
\r
861 else if ( StringEqual( p, cdataHeader, false, encoding ) )
\r
863 #ifdef DEBUG_PARSER
\r
864 TIXML_LOG( "XML parsing CDATA\n" );
\r
866 TiXmlText* text = new TiXmlText( "" );
\r
867 text->SetCDATA( true );
\r
870 else if ( StringEqual( p, dtdHeader, false, encoding ) )
\r
872 #ifdef DEBUG_PARSER
\r
873 TIXML_LOG( "XML parsing Unknown(1)\n" );
\r
875 returnNode = new TiXmlUnknown();
\r
877 else if ( IsAlpha( *(p+1), encoding )
\r
880 #ifdef DEBUG_PARSER
\r
881 TIXML_LOG( "XML parsing Element\n" );
\r
883 returnNode = new TiXmlElement( "" );
\r
887 #ifdef DEBUG_PARSER
\r
888 TIXML_LOG( "XML parsing Unknown(2)\n" );
\r
890 returnNode = new TiXmlUnknown();
\r
895 // Set the parent, so it can report errors
\r
896 returnNode->parent = this;
\r
901 #ifdef TIXML_USE_STL
\r
903 void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
\r
905 // We're called with some amount of pre-parsing. That is, some of "this"
\r
906 // element is in "tag". Go ahead and stream to the closing ">"
\r
907 while( in->good() )
\r
912 TiXmlDocument* document = GetDocument();
\r
914 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
917 (*tag) += (char) c ;
\r
923 if ( tag->length() < 3 ) return;
\r
925 // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
\r
926 // If not, identify and stream.
\r
928 if ( tag->at( tag->length() - 1 ) == '>'
\r
929 && tag->at( tag->length() - 2 ) == '/' )
\r
934 else if ( tag->at( tag->length() - 1 ) == '>' )
\r
936 // There is more. Could be:
\r
938 // cdata text (which looks like another node)
\r
943 StreamWhiteSpace( in, tag );
\r
945 // Do we have text?
\r
946 if ( in->good() && in->peek() != '<' )
\r
949 TiXmlText text( "" );
\r
950 text.StreamIn( in, tag );
\r
952 // What follows text is a closing tag or another node.
\r
953 // Go around again and figure it out.
\r
957 // We now have either a closing tag...or another node.
\r
958 // We should be at a "<", regardless.
\r
959 if ( !in->good() ) return;
\r
960 assert( in->peek() == '<' );
\r
961 int tagIndex = (int) tag->length();
\r
963 bool closingTag = false;
\r
964 bool firstCharFound = false;
\r
971 int c = in->peek();
\r
974 TiXmlDocument* document = GetDocument();
\r
976 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
986 // Early out if we find the CDATA id.
\r
987 if ( c == '[' && tag->size() >= 9 )
\r
989 size_t len = tag->size();
\r
990 const char* start = tag->c_str() + len - 9;
\r
991 if ( strcmp( start, "<![CDATA[" ) == 0 ) {
\r
992 assert( !closingTag );
\r
997 if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
\r
999 firstCharFound = true;
\r
1001 closingTag = true;
\r
1004 // If it was a closing tag, then read in the closing '>' to clean up the input stream.
\r
1005 // If it was not, the streaming will be done by the tag.
\r
1008 if ( !in->good() )
\r
1011 int c = in->get();
\r
1014 TiXmlDocument* document = GetDocument();
\r
1016 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
1019 assert( c == '>' );
\r
1022 // We are done, once we've found our closing tag.
\r
1027 // If not a closing tag, id it, and stream.
\r
1028 const char* tagloc = tag->c_str() + tagIndex;
\r
1029 TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
\r
1032 node->StreamIn( in, tag );
\r
1036 // No return: go around from the beginning: text, closing tag, or node.
\r
1043 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
\r
1045 p = SkipWhiteSpace( p, encoding );
\r
1046 TiXmlDocument* document = GetDocument();
\r
1050 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
\r
1056 data->Stamp( p, encoding );
\r
1057 location = data->Cursor();
\r
1062 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
\r
1066 p = SkipWhiteSpace( p+1, encoding );
\r
1069 const char* pErr = p;
\r
1071 p = ReadName( p, &value, encoding );
\r
1074 if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
\r
1078 TIXML_STRING endTag ("</");
\r
1081 // Check for and read attributes. Also look for an empty
\r
1082 // tag or an end tag.
\r
1086 p = SkipWhiteSpace( p, encoding );
\r
1089 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
\r
1098 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
\r
1103 else if ( *p == '>' )
\r
1105 // Done with attributes (if there were any.)
\r
1106 // Read the value -- which can include other
\r
1107 // elements -- read the end tag, and return.
\r
1109 p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens.
\r
1110 if ( !p || !*p ) {
\r
1111 // We were looking for the end tag, but found nothing.
\r
1112 // Fix for [ 1663758 ] Failure to report error on bad XML
\r
1113 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
\r
1117 // We should find the end tag now
\r
1121 // are both valid end tags.
\r
1122 if ( StringEqual( p, endTag.c_str(), false, encoding ) )
\r
1124 p += endTag.length();
\r
1125 p = SkipWhiteSpace( p, encoding );
\r
1126 if ( p && *p && *p == '>' ) {
\r
1130 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
\r
1135 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
\r
1141 // Try to read an attribute:
\r
1142 TiXmlAttribute* attrib = new TiXmlAttribute();
\r
1148 attrib->SetDocument( document );
\r
1150 p = attrib->Parse( p, data, encoding );
\r
1154 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
\r
1159 // Handle the strange case of double attributes:
\r
1160 #ifdef TIXML_USE_STL
\r
1161 TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
\r
1163 TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
\r
1167 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
\r
1172 attributeSet.Add( attrib );
\r
1179 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
\r
1181 TiXmlDocument* document = GetDocument();
\r
1183 // Read in text and elements in any order.
\r
1184 const char* pWithWhiteSpace = p;
\r
1185 p = SkipWhiteSpace( p, encoding );
\r
1191 // Take what we have, make a text element.
\r
1192 TiXmlText* textNode = new TiXmlText( "" );
\r
1199 if ( TiXmlBase::IsWhiteSpaceCondensed() )
\r
1201 p = textNode->Parse( p, data, encoding );
\r
1205 // Special case: we want to keep the white space
\r
1206 // so that leading spaces aren't removed.
\r
1207 p = textNode->Parse( pWithWhiteSpace, data, encoding );
\r
1210 if ( !textNode->Blank() )
\r
1211 LinkEndChild( textNode );
\r
1218 // Have we hit a new element or an end tag? This could also be
\r
1219 // a TiXmlText in the "CDATA" style.
\r
1220 if ( StringEqual( p, "</", false, encoding ) )
\r
1226 TiXmlNode* node = Identify( p, encoding );
\r
1229 p = node->Parse( p, data, encoding );
\r
1230 LinkEndChild( node );
\r
1238 pWithWhiteSpace = p;
\r
1239 p = SkipWhiteSpace( p, encoding );
\r
1244 if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
\r
1250 #ifdef TIXML_USE_STL
\r
1251 void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
\r
1253 while ( in->good() )
\r
1255 int c = in->get();
\r
1258 TiXmlDocument* document = GetDocument();
\r
1260 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
1263 (*tag) += (char) c;
\r
1275 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
\r
1277 TiXmlDocument* document = GetDocument();
\r
1278 p = SkipWhiteSpace( p, encoding );
\r
1282 data->Stamp( p, encoding );
\r
1283 location = data->Cursor();
\r
1285 if ( !p || !*p || *p != '<' )
\r
1287 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
\r
1293 while ( p && *p && *p != '>' )
\r
1302 document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
\r
1304 if ( p && *p == '>' )
\r
1309 #ifdef TIXML_USE_STL
\r
1310 void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
\r
1312 while ( in->good() )
\r
1314 int c = in->get();
\r
1317 TiXmlDocument* document = GetDocument();
\r
1319 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
1323 (*tag) += (char) c;
\r
1326 && tag->at( tag->length() - 2 ) == '-'
\r
1327 && tag->at( tag->length() - 3 ) == '-' )
\r
1337 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
\r
1339 TiXmlDocument* document = GetDocument();
\r
1342 p = SkipWhiteSpace( p, encoding );
\r
1346 data->Stamp( p, encoding );
\r
1347 location = data->Cursor();
\r
1349 const char* startTag = "<!--";
\r
1350 const char* endTag = "-->";
\r
1352 if ( !StringEqual( p, startTag, false, encoding ) )
\r
1355 document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
\r
1358 p += strlen( startTag );
\r
1360 // [ 1475201 ] TinyXML parses entities in comments
\r
1361 // Oops - ReadText doesn't work, because we don't want to parse the entities.
\r
1362 // p = ReadText( p, &value, false, endTag, false, encoding );
\r
1364 // from the XML spec:
\r
1366 [Definition: Comments may appear anywhere in a document outside other markup; in addition,
\r
1367 they may appear within the document type declaration at places allowed by the grammar.
\r
1368 They are not part of the document's character data; an XML processor MAY, but need not,
\r
1369 make it possible for an application to retrieve the text of comments. For compatibility,
\r
1370 the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity
\r
1371 references MUST NOT be recognized within comments.
\r
1373 An example of a comment:
\r
1375 <!-- declarations for <head> & <body> -->
\r
1379 // Keep all the white space.
\r
1380 while ( p && *p && !StringEqual( p, endTag, false, encoding ) )
\r
1382 value.append( p, 1 );
\r
1386 p += strlen( endTag );
\r
1392 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
\r
1394 p = SkipWhiteSpace( p, encoding );
\r
1395 if ( !p || !*p ) return 0;
\r
1399 data->Stamp( p, encoding );
\r
1400 location = data->Cursor();
\r
1402 // Read the name, the '=' and the value.
\r
1403 const char* pErr = p;
\r
1404 p = ReadName( p, &name, encoding );
\r
1407 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
\r
1410 p = SkipWhiteSpace( p, encoding );
\r
1411 if ( !p || !*p || *p != '=' )
\r
1413 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
\r
1418 p = SkipWhiteSpace( p, encoding );
\r
1421 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
\r
1426 const char SINGLE_QUOTE = '\'';
\r
1427 const char DOUBLE_QUOTE = '\"';
\r
1429 if ( *p == SINGLE_QUOTE )
\r
1432 end = "\'"; // single quote in string
\r
1433 p = ReadText( p, &value, false, end, false, encoding );
\r
1435 else if ( *p == DOUBLE_QUOTE )
\r
1438 end = "\""; // double quote in string
\r
1439 p = ReadText( p, &value, false, end, false, encoding );
\r
1443 // All attribute values should be in single or double quotes.
\r
1444 // But this is such a common error that the parser will try
\r
1445 // its best, even without them.
\r
1447 while ( p && *p // existence
\r
1448 && !IsWhiteSpace( *p ) // whitespace
\r
1449 && *p != '/' && *p != '>' ) // tag end
\r
1451 if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
\r
1452 // [ 1451649 ] Attribute values with trailing quotes not handled correctly
\r
1453 // We did not have an opening quote but seem to have a
\r
1454 // closing one. Give up and throw an error.
\r
1455 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
\r
1465 #ifdef TIXML_USE_STL
\r
1466 void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
\r
1468 while ( in->good() )
\r
1470 int c = in->peek();
\r
1471 if ( !cdata && (c == '<' ) )
\r
1477 TiXmlDocument* document = GetDocument();
\r
1479 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
1483 (*tag) += (char) c;
\r
1484 in->get(); // "commits" the peek made above
\r
1486 if ( cdata && c == '>' && tag->size() >= 3 ) {
\r
1487 size_t len = tag->size();
\r
1488 if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
\r
1489 // terminator of cdata.
\r
1497 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
\r
1500 TiXmlDocument* document = GetDocument();
\r
1504 data->Stamp( p, encoding );
\r
1505 location = data->Cursor();
\r
1508 const char* const startTag = "<![CDATA[";
\r
1509 const char* const endTag = "]]>";
\r
1511 if ( cdata || StringEqual( p, startTag, false, encoding ) )
\r
1515 if ( !StringEqual( p, startTag, false, encoding ) )
\r
1518 document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
\r
1521 p += strlen( startTag );
\r
1523 // Keep all the white space, ignore the encoding, etc.
\r
1525 && !StringEqual( p, endTag, false, encoding )
\r
1532 TIXML_STRING dummy;
\r
1533 p = ReadText( p, &dummy, false, endTag, false, encoding );
\r
1538 bool ignoreWhite = true;
\r
1540 const char* end = "<";
\r
1541 p = ReadText( p, &value, ignoreWhite, end, false, encoding );
\r
1543 return p-1; // don't truncate the '<'
\r
1548 #ifdef TIXML_USE_STL
\r
1549 void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
\r
1551 while ( in->good() )
\r
1553 int c = in->get();
\r
1556 TiXmlDocument* document = GetDocument();
\r
1558 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
\r
1561 (*tag) += (char) c;
\r
1572 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
\r
1574 p = SkipWhiteSpace( p, _encoding );
\r
1575 // Find the beginning, find the end, and look for
\r
1576 // the stuff in-between.
\r
1577 TiXmlDocument* document = GetDocument();
\r
1578 if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
\r
1580 if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
\r
1585 data->Stamp( p, _encoding );
\r
1586 location = data->Cursor();
\r
1602 p = SkipWhiteSpace( p, _encoding );
\r
1603 if ( StringEqual( p, "version", true, _encoding ) )
\r
1605 TiXmlAttribute attrib;
\r
1606 p = attrib.Parse( p, data, _encoding );
\r
1607 version = attrib.Value();
\r
1609 else if ( StringEqual( p, "encoding", true, _encoding ) )
\r
1611 TiXmlAttribute attrib;
\r
1612 p = attrib.Parse( p, data, _encoding );
\r
1613 encoding = attrib.Value();
\r
1615 else if ( StringEqual( p, "standalone", true, _encoding ) )
\r
1617 TiXmlAttribute attrib;
\r
1618 p = attrib.Parse( p, data, _encoding );
\r
1619 standalone = attrib.Value();
\r
1623 // Read over whatever it is.
\r
1624 while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
\r
1631 bool TiXmlText::Blank() const
\r
1633 for ( unsigned i=0; i<value.length(); i++ )
\r
1634 if ( !IsWhiteSpace( value[i] ) )
\r