1 /* This file is included!
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000-2017 Expat development team
11 Licensed under the MIT license:
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, sublicense, and/or sell copies of the Software, and to permit
18 persons to whom the Software is furnished to do so, subject to the
21 The above copyright notice and this permission notice shall be included
22 in all copies or substantial portions of the Software.
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
29 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30 USE OR OTHER DEALINGS IN THE SOFTWARE.
/* Fallback used when the including file has not supplied its own
   IS_INVALID_CHAR: the macro expands to 0, so no multi-byte sequence is
   ever reported as invalid by it. */
35 #ifndef IS_INVALID_CHAR
36 #define IS_INVALID_CHAR(enc, ptr, n) (0)
/* Case body for an n-byte lead sequence at ptr: it can yield
   XML_TOK_PARTIAL_CHAR, and it reports XML_TOK_INVALID at ptr (through
   *nextTokPtr) when IS_INVALID_CHAR rejects the sequence.
   Uses `enc` from the expansion site; it is not a macro parameter. */
39 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
42 return XML_TOK_PARTIAL_CHAR; \
43 if (IS_INVALID_CHAR(enc, ptr, n)) { \
44 *(nextTokPtr) = (ptr); \
45 return XML_TOK_INVALID; \
/* Expands INVALID_LEAD_CASE for 2-, 3- and 4-byte lead sequences and then
   ends with a path that stores ptr in *nextTokPtr and returns
   XML_TOK_INVALID.  Intended for use inside a switch on the byte type. */
50 #define INVALID_CASES(ptr, nextTokPtr) \
51 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
52 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
53 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
57 *(nextTokPtr) = (ptr); \
58 return XML_TOK_INVALID;
/* Case body for an n-byte sequence inside a name: it can yield
   XML_TOK_PARTIAL_CHAR, and returns XML_TOK_INVALID when IS_NAME_CHAR
   rejects the n-byte character at ptr. */
60 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
63 return XML_TOK_PARTIAL_CHAR; \
64 if (!IS_NAME_CHAR(enc, ptr, n)) { \
66 return XML_TOK_INVALID; \
/* Name-character checks for use inside a switch on the byte type: a
   minimum-width character failing IS_NAME_CHAR_MINBPC gives
   XML_TOK_INVALID; 2-, 3- and 4-byte sequences are delegated to
   CHECK_NAME_CASE. */
71 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
73 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
75 return XML_TOK_INVALID; \
84 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
85 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
86 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
/* Case body for an n-byte sequence at the start of a name: it can yield
   XML_TOK_PARTIAL_CHAR, and returns XML_TOK_INVALID when IS_NMSTRT_CHAR
   rejects the n-byte character at ptr. */
88 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
91 return XML_TOK_PARTIAL_CHAR; \
92 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
94 return XML_TOK_INVALID; \
/* Name-start checks for use inside a switch on the byte type: a
   minimum-width character failing IS_NMSTRT_CHAR_MINBPC gives
   XML_TOK_INVALID, an accepted one advances ptr by MINBPC(enc); 2-, 3- and
   4-byte sequences are delegated to CHECK_NMSTRT_CASE.
   Note that the macro modifies ptr. */
99 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
101 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
103 return XML_TOK_INVALID; \
107 ptr += MINBPC(enc); \
109 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
110 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
111 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
/* Name-decoration hook applied to every function defined in this file;
   this definition leaves the identifier unchanged. */
114 #define PREFIX(ident) ident
/* True when at least `count` minimum-width characters (count * MINBPC(enc)
   bytes) lie between ptr and end.  Arguments are not parenthesized in the
   expansion, so pass simple expressions only. */
118 #define HAS_CHARS(enc, ptr, end, count) \
119 (end - ptr >= count * MINBPC(enc))
/* True when at least one minimum-width character lies between ptr and end. */
121 #define HAS_CHAR(enc, ptr, end) \
122 HAS_CHARS(enc, ptr, end, 1)
/* Makes the enclosing function return XML_TOK_PARTIAL unless `count`
   characters are available (see HAS_CHARS).  Hidden control flow: this
   macro contains a return statement. */
124 #define REQUIRE_CHARS(enc, ptr, end, count) \
126 if (! HAS_CHARS(enc, ptr, end, count)) { \
127 return XML_TOK_PARTIAL; \
/* Single-character form of REQUIRE_CHARS: returns XML_TOK_PARTIAL from the
   enclosing function when no character is available at ptr. */
131 #define REQUIRE_CHAR(enc, ptr, end) \
132 REQUIRE_CHARS(enc, ptr, end, 1)
/* Scans a comment whose "<!-" prefix has already been consumed.
   The next character must be '-', otherwise XML_TOK_INVALID is returned.
   The body is then walked; after a "--" the following character must be
   '>', in which case XML_TOK_COMMENT is returned with *nextTokPtr set just
   past it, and anything else is XML_TOK_INVALID.  Invalid characters are
   rejected through INVALID_CASES.  XML_TOK_PARTIAL is returned when the
   input ends before the comment is complete. */
135 /* ptr points to character following "<!-" */
138 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
139 const char *end, const char **nextTokPtr)
141 if (HAS_CHAR(enc, ptr, end)) {
142 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
144 return XML_TOK_INVALID;
147 while (HAS_CHAR(enc, ptr, end)) {
148 switch (BYTE_TYPE(enc, ptr)) {
149 INVALID_CASES(ptr, nextTokPtr)
152 REQUIRE_CHAR(enc, ptr, end);
153 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
155 REQUIRE_CHAR(enc, ptr, end);
156 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
158 return XML_TOK_INVALID;
160 *nextTokPtr = ptr + MINBPC(enc);
161 return XML_TOK_COMMENT;
170 return XML_TOK_PARTIAL;
/* Scans a markup declaration whose "<!" prefix has already been consumed.
   Depending on the byte type of the first character it hands off to
   scanComment, returns XML_TOK_COND_SECT_OPEN (with *nextTokPtr one
   character further on), or rejects the input with XML_TOK_INVALID.
   Otherwise the declaration keyword is walked: whitespace (BT_S, BT_CR,
   BT_LF) ends it with XML_TOK_DECL_OPEN.  One branch looks a character
   ahead and rejects whitespace or '%' there, so that a keyword run
   together with '%' is not accepted (see the inline comment).
   XML_TOK_PARTIAL is returned when the input ends first. */
173 /* ptr points to character following "<!" */
176 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
177 const char *end, const char **nextTokPtr)
179 REQUIRE_CHAR(enc, ptr, end);
180 switch (BYTE_TYPE(enc, ptr)) {
182 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
184 *nextTokPtr = ptr + MINBPC(enc);
185 return XML_TOK_COND_SECT_OPEN;
192 return XML_TOK_INVALID;
194 while (HAS_CHAR(enc, ptr, end)) {
195 switch (BYTE_TYPE(enc, ptr)) {
197 REQUIRE_CHARS(enc, ptr, end, 2);
198 /* don't allow <!ENTITY% foo "whatever"> */
199 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
200 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
202 return XML_TOK_INVALID;
205 case BT_S: case BT_CR: case BT_LF:
207 return XML_TOK_DECL_OPEN;
214 return XML_TOK_INVALID;
217 return XML_TOK_PARTIAL;
/* Classifies the processing-instruction target spanning [ptr, end).
   *tokPtr is set to XML_TOK_PI up front.  Only a target exactly three
   characters long is examined further: its characters are inspected one
   after another and *tokPtr may be changed to XML_TOK_XML_DECL.
   Callers in this file treat a zero return value as an invalid target. */
221 PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
222 const char *end, int *tokPtr)
225 *tokPtr = XML_TOK_PI;
226 if (end - ptr != MINBPC(enc)*3)
228 switch (BYTE_TO_ASCII(enc, ptr)) {
238 switch (BYTE_TO_ASCII(enc, ptr)) {
248 switch (BYTE_TO_ASCII(enc, ptr)) {
259 *tokPtr = XML_TOK_XML_DECL;
/* Scans a processing instruction whose "<?" prefix has already been
   consumed.  `target` remembers where the PI target name starts.
   The first character must be a valid name-start character and the rest of
   the target valid name characters.  When the target ends, it is vetted by
   checkPiTarget; a zero result gives XML_TOK_INVALID.
   - Target followed by whitespace: the PI body is walked (invalid
     characters rejected through INVALID_CASES) until the terminator is
     found, and *nextTokPtr is set just past the closing '>'.
   - Otherwise the character after the next one must be '>' for the PI to
     be accepted; anything else is XML_TOK_INVALID.
   The success token is presumably the value checkPiTarget stored in `tok`
   -- confirm.  XML_TOK_PARTIAL is returned when the input ends first. */
263 /* ptr points to character following "<?" */
266 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
267 const char *end, const char **nextTokPtr)
270 const char *target = ptr;
271 REQUIRE_CHAR(enc, ptr, end);
272 switch (BYTE_TYPE(enc, ptr)) {
273 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
276 return XML_TOK_INVALID;
278 while (HAS_CHAR(enc, ptr, end)) {
279 switch (BYTE_TYPE(enc, ptr)) {
280 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
281 case BT_S: case BT_CR: case BT_LF:
282 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
284 return XML_TOK_INVALID;
287 while (HAS_CHAR(enc, ptr, end)) {
288 switch (BYTE_TYPE(enc, ptr)) {
289 INVALID_CASES(ptr, nextTokPtr)
292 REQUIRE_CHAR(enc, ptr, end);
293 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
294 *nextTokPtr = ptr + MINBPC(enc);
303 return XML_TOK_PARTIAL;
305 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
307 return XML_TOK_INVALID;
310 REQUIRE_CHAR(enc, ptr, end);
311 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
312 *nextTokPtr = ptr + MINBPC(enc);
318 return XML_TOK_INVALID;
321 return XML_TOK_PARTIAL;
/* Checks that the next six characters spell "CDATA[" (the CDATA_LSQB
   table).  Returns XML_TOK_PARTIAL when fewer than six characters are
   available, XML_TOK_INVALID on the first mismatching character, and
   XML_TOK_CDATA_SECT_OPEN when all six match. */
325 PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
326 const char *end, const char **nextTokPtr)
328 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
329 ASCII_T, ASCII_A, ASCII_LSQB };
332 REQUIRE_CHARS(enc, ptr, end, 6);
333 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
334 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
336 return XML_TOK_INVALID;
340 return XML_TOK_CDATA_SECT_OPEN;
/* Produces the next token from the inside of a CDATA section.
   For encodings whose minimum character width exceeds one byte, the
   available length is first masked down to a whole number of code units;
   XML_TOK_PARTIAL is returned on that path (presumably when no whole unit
   is left -- confirm).
   First character:
   - ']' followed by ']' and '>' gives XML_TOK_CDATA_SECT_CLOSE with
     *nextTokPtr just past the '>'.
   - A line end gives XML_TOK_DATA_NEWLINE; after a CR the next character
     is checked for LF.
   - Invalid characters are rejected through INVALID_CASES.
   Otherwise a run of ordinary characters is collected and returned as
   XML_TOK_DATA_CHARS; the run stops early at a truncated or invalid
   multi-byte sequence (LEAD_CASE). */
344 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
345 const char *end, const char **nextTokPtr)
349 if (MINBPC(enc) > 1) {
350 size_t n = end - ptr;
351 if (n & (MINBPC(enc) - 1)) {
352 n &= ~(MINBPC(enc) - 1);
354 return XML_TOK_PARTIAL;
358 switch (BYTE_TYPE(enc, ptr)) {
361 REQUIRE_CHAR(enc, ptr, end);
362 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
365 REQUIRE_CHAR(enc, ptr, end);
366 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
370 *nextTokPtr = ptr + MINBPC(enc);
371 return XML_TOK_CDATA_SECT_CLOSE;
374 REQUIRE_CHAR(enc, ptr, end);
375 if (BYTE_TYPE(enc, ptr) == BT_LF)
378 return XML_TOK_DATA_NEWLINE;
380 *nextTokPtr = ptr + MINBPC(enc);
381 return XML_TOK_DATA_NEWLINE;
382 INVALID_CASES(ptr, nextTokPtr)
387 while (HAS_CHAR(enc, ptr, end)) {
388 switch (BYTE_TYPE(enc, ptr)) {
389 #define LEAD_CASE(n) \
391 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
393 return XML_TOK_DATA_CHARS; \
397 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
406 return XML_TOK_DATA_CHARS;
413 return XML_TOK_DATA_CHARS;
/* Scans an end tag whose "</" prefix has already been consumed.
   The first character must be a valid name-start character and the
   following ones valid name characters.  Whitespace after the name may
   only be followed by more whitespace and then the tag terminator.
   Returns XML_TOK_END_TAG with *nextTokPtr just past the terminator,
   XML_TOK_INVALID for any unexpected character, and XML_TOK_PARTIAL when
   the input ends before the tag is complete. */
416 /* ptr points to character following "</" */
419 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
420 const char *end, const char **nextTokPtr)
422 REQUIRE_CHAR(enc, ptr, end);
423 switch (BYTE_TYPE(enc, ptr)) {
424 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
427 return XML_TOK_INVALID;
429 while (HAS_CHAR(enc, ptr, end)) {
430 switch (BYTE_TYPE(enc, ptr)) {
431 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
432 case BT_S: case BT_CR: case BT_LF:
433 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
434 switch (BYTE_TYPE(enc, ptr)) {
435 case BT_S: case BT_CR: case BT_LF:
438 *nextTokPtr = ptr + MINBPC(enc);
439 return XML_TOK_END_TAG;
442 return XML_TOK_INVALID;
445 return XML_TOK_PARTIAL;
448 /* no need to check qname syntax here,
449 since end-tag must match exactly */
454 *nextTokPtr = ptr + MINBPC(enc);
455 return XML_TOK_END_TAG;
458 return XML_TOK_INVALID;
461 return XML_TOK_PARTIAL;
/* Scans the digits of a hexadecimal character reference; the prefix up to
   and including the hex marker has already been consumed.
   The first character is validated by byte type, then the remaining ones
   are walked one at a time.  Returns XML_TOK_CHAR_REF with *nextTokPtr just
   past the character that ends the reference, XML_TOK_INVALID for an
   unexpected character, and XML_TOK_PARTIAL when the input ends first. */
464 /* ptr points to character following "&#X" */
467 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
468 const char *end, const char **nextTokPtr)
470 if (HAS_CHAR(enc, ptr, end)) {
471 switch (BYTE_TYPE(enc, ptr)) {
477 return XML_TOK_INVALID;
479 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
480 switch (BYTE_TYPE(enc, ptr)) {
485 *nextTokPtr = ptr + MINBPC(enc);
486 return XML_TOK_CHAR_REF;
489 return XML_TOK_INVALID;
493 return XML_TOK_PARTIAL;
/* Scans a numeric character reference whose "&#" prefix has already been
   consumed.  A lower-case 'x' at ptr hands the rest over to
   scanHexCharRef.  Otherwise the first character is validated by byte type
   and the remaining ones are walked one at a time.  Returns
   XML_TOK_CHAR_REF with *nextTokPtr just past the character that ends the
   reference, XML_TOK_INVALID for an unexpected character, and
   XML_TOK_PARTIAL when the input ends first. */
496 /* ptr points to character following "&#" */
499 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
500 const char *end, const char **nextTokPtr)
502 if (HAS_CHAR(enc, ptr, end)) {
503 if (CHAR_MATCHES(enc, ptr, ASCII_x))
504 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
505 switch (BYTE_TYPE(enc, ptr)) {
510 return XML_TOK_INVALID;
512 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
513 switch (BYTE_TYPE(enc, ptr)) {
517 *nextTokPtr = ptr + MINBPC(enc);
518 return XML_TOK_CHAR_REF;
521 return XML_TOK_INVALID;
525 return XML_TOK_PARTIAL;
/* Scans a reference whose leading "&" has already been consumed.
   A valid name-start character begins an entity name; one byte type is
   instead forwarded to scanCharRef (numeric reference); anything else is
   XML_TOK_INVALID.  The name is then walked with the name-character
   checks.  Returns XML_TOK_ENTITY_REF with *nextTokPtr just past the
   character that ends the reference, XML_TOK_INVALID for an unexpected
   character, and XML_TOK_PARTIAL when the input ends first. */
528 /* ptr points to character following "&" */
531 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
532 const char **nextTokPtr)
534 REQUIRE_CHAR(enc, ptr, end);
535 switch (BYTE_TYPE(enc, ptr)) {
536 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
538 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
541 return XML_TOK_INVALID;
543 while (HAS_CHAR(enc, ptr, end)) {
544 switch (BYTE_TYPE(enc, ptr)) {
545 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
547 *nextTokPtr = ptr + MINBPC(enc);
548 return XML_TOK_ENTITY_REF;
551 return XML_TOK_INVALID;
554 return XML_TOK_PARTIAL;
/* Scans the attribute list of a start tag, beginning inside the first
   attribute name.  Rough flow, as far as this code shows:
   - attribute name characters are validated with the name checks;
   - whitespace around the name is skipped and the opening quote is read
     into `open`, which must be BT_QUOT or BT_APOS;
   - the attribute value is walked; invalid characters are rejected and
     each reference inside the value is scanned with scanRef, whose
     XML_TOK_INVALID result is propagated;
   - after the closing quote, either another attribute name, whitespace,
     or the end of the tag must follow.
   Returns XML_TOK_START_TAG_WITH_ATTS or XML_TOK_EMPTY_ELEMENT_WITH_ATTS
   (the latter requires '>' right after the preceding character) with
   *nextTokPtr just past the tag, XML_TOK_INVALID on any syntax error, and
   XML_TOK_PARTIAL when the input ends first. */
557 /* ptr points to character following first character of attribute name */
560 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
561 const char **nextTokPtr)
566 while (HAS_CHAR(enc, ptr, end)) {
567 switch (BYTE_TYPE(enc, ptr)) {
568 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
573 return XML_TOK_INVALID;
577 REQUIRE_CHAR(enc, ptr, end);
578 switch (BYTE_TYPE(enc, ptr)) {
579 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
582 return XML_TOK_INVALID;
586 case BT_S: case BT_CR: case BT_LF:
591 REQUIRE_CHAR(enc, ptr, end);
592 t = BYTE_TYPE(enc, ptr);
602 return XML_TOK_INVALID;
614 REQUIRE_CHAR(enc, ptr, end);
615 open = BYTE_TYPE(enc, ptr);
616 if (open == BT_QUOT || open == BT_APOS)
625 return XML_TOK_INVALID;
629 /* in attribute value */
632 REQUIRE_CHAR(enc, ptr, end);
633 t = BYTE_TYPE(enc, ptr);
637 INVALID_CASES(ptr, nextTokPtr)
640 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
642 if (tok == XML_TOK_INVALID)
650 return XML_TOK_INVALID;
657 REQUIRE_CHAR(enc, ptr, end);
658 switch (BYTE_TYPE(enc, ptr)) {
669 return XML_TOK_INVALID;
671 /* ptr points to closing quote */
674 REQUIRE_CHAR(enc, ptr, end);
675 switch (BYTE_TYPE(enc, ptr)) {
676 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
677 case BT_S: case BT_CR: case BT_LF:
681 *nextTokPtr = ptr + MINBPC(enc);
682 return XML_TOK_START_TAG_WITH_ATTS;
686 REQUIRE_CHAR(enc, ptr, end);
687 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
689 return XML_TOK_INVALID;
691 *nextTokPtr = ptr + MINBPC(enc);
692 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
695 return XML_TOK_INVALID;
703 return XML_TOK_INVALID;
706 return XML_TOK_PARTIAL;
/* Scans markup whose leading "<" has already been consumed.
   Depending on the first character it either starts an element name or
   dispatches to scanComment, scanCdataSection, scanPi or scanEndTag;
   unexpected characters give XML_TOK_INVALID.
   For a start tag the element name is validated, then:
   - whitespace followed by a name-start character hands over to scanAtts;
   - the tag terminator gives XML_TOK_START_TAG_NO_ATTS;
   - the empty-element form requires '>' next and gives
     XML_TOK_EMPTY_ELEMENT_NO_ATTS.
   *nextTokPtr is set just past the tag on success.  XML_TOK_PARTIAL is
   returned when the input ends before the construct is complete. */
709 /* ptr points to character following "<" */
712 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
713 const char **nextTokPtr)
718 REQUIRE_CHAR(enc, ptr, end);
719 switch (BYTE_TYPE(enc, ptr)) {
720 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
723 REQUIRE_CHAR(enc, ptr, end);
724 switch (BYTE_TYPE(enc, ptr)) {
726 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
728 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
732 return XML_TOK_INVALID;
734 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
736 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
739 return XML_TOK_INVALID;
744 /* we have a start-tag */
745 while (HAS_CHAR(enc, ptr, end)) {
746 switch (BYTE_TYPE(enc, ptr)) {
747 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
752 return XML_TOK_INVALID;
756 REQUIRE_CHAR(enc, ptr, end);
757 switch (BYTE_TYPE(enc, ptr)) {
758 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
761 return XML_TOK_INVALID;
765 case BT_S: case BT_CR: case BT_LF:
768 while (HAS_CHAR(enc, ptr, end)) {
769 switch (BYTE_TYPE(enc, ptr)) {
770 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
775 case BT_S: case BT_CR: case BT_LF:
780 return XML_TOK_INVALID;
782 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
784 return XML_TOK_PARTIAL;
788 *nextTokPtr = ptr + MINBPC(enc);
789 return XML_TOK_START_TAG_NO_ATTS;
793 REQUIRE_CHAR(enc, ptr, end);
794 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
796 return XML_TOK_INVALID;
798 *nextTokPtr = ptr + MINBPC(enc);
799 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
802 return XML_TOK_INVALID;
805 return XML_TOK_PARTIAL;
/* Produces the next token from element content.
   For encodings whose minimum character width exceeds one byte, the
   available length is first masked down to a whole number of code units;
   XML_TOK_PARTIAL is returned on that path (presumably when no whole unit
   is left -- confirm).
   First character:
   - markup and references are delegated to scanLt and scanRef;
   - a CR at the very end of the input gives XML_TOK_TRAILING_CR, otherwise
     a line end gives XML_TOK_DATA_NEWLINE (a following LF is checked for);
   - ']' at the end of the input, or "]]" at the end of the input, gives
     XML_TOK_TRAILING_RSQB; a complete "]]" followed by '>' is rejected
     with XML_TOK_INVALID;
   - invalid characters are rejected through INVALID_CASES.
   Otherwise a run of ordinary characters is collected and returned as
   XML_TOK_DATA_CHARS.  The run stops at a truncated or invalid multi-byte
   sequence, and a "]]" followed by '>' inside the run is reported as
   XML_TOK_INVALID with *nextTokPtr two characters past the first ']'. */
809 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
810 const char **nextTokPtr)
814 if (MINBPC(enc) > 1) {
815 size_t n = end - ptr;
816 if (n & (MINBPC(enc) - 1)) {
817 n &= ~(MINBPC(enc) - 1);
819 return XML_TOK_PARTIAL;
823 switch (BYTE_TYPE(enc, ptr)) {
825 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
827 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
830 if (! HAS_CHAR(enc, ptr, end))
831 return XML_TOK_TRAILING_CR;
832 if (BYTE_TYPE(enc, ptr) == BT_LF)
835 return XML_TOK_DATA_NEWLINE;
837 *nextTokPtr = ptr + MINBPC(enc);
838 return XML_TOK_DATA_NEWLINE;
841 if (! HAS_CHAR(enc, ptr, end))
842 return XML_TOK_TRAILING_RSQB;
843 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
846 if (! HAS_CHAR(enc, ptr, end))
847 return XML_TOK_TRAILING_RSQB;
848 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
853 return XML_TOK_INVALID;
854 INVALID_CASES(ptr, nextTokPtr)
859 while (HAS_CHAR(enc, ptr, end)) {
860 switch (BYTE_TYPE(enc, ptr)) {
861 #define LEAD_CASE(n) \
863 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
865 return XML_TOK_DATA_CHARS; \
869 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
872 if (HAS_CHARS(enc, ptr, end, 2)) {
873 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
877 if (HAS_CHARS(enc, ptr, end, 3)) {
878 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
882 *nextTokPtr = ptr + 2*MINBPC(enc);
883 return XML_TOK_INVALID;
895 return XML_TOK_DATA_CHARS;
902 return XML_TOK_DATA_CHARS;
/* Scans what follows a "%" that has already been consumed.
   - Whitespace or another '%' right after it means the percent sign stands
     alone: XML_TOK_PERCENT.
   - A valid name-start character begins a parameter-entity name, which is
     walked with the name checks; XML_TOK_PARAM_ENTITY_REF is returned with
     *nextTokPtr just past the character that ends the reference.
   - Anything else is XML_TOK_INVALID.
   XML_TOK_PARTIAL is returned when the input ends first. */
905 /* ptr points to character following "%" */
908 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
909 const char **nextTokPtr)
911 REQUIRE_CHAR(enc, ptr, end);
912 switch (BYTE_TYPE(enc, ptr)) {
913 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
914 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
916 return XML_TOK_PERCENT;
919 return XML_TOK_INVALID;
921 while (HAS_CHAR(enc, ptr, end)) {
922 switch (BYTE_TYPE(enc, ptr)) {
923 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
925 *nextTokPtr = ptr + MINBPC(enc);
926 return XML_TOK_PARAM_ENTITY_REF;
929 return XML_TOK_INVALID;
932 return XML_TOK_PARTIAL;
/* Scans a name introduced by '#' (prologTok calls this with ptr one
   character past the introducer).
   The first character must be a valid name-start character and the rest
   valid name characters.  The name ends at whitespace or at a BT_RPAR,
   BT_GT, BT_PERCNT or BT_VERBAR character, giving XML_TOK_POUND_NAME; any
   other terminator is XML_TOK_INVALID.  If the input ends while still
   inside the name, the negated token -XML_TOK_POUND_NAME is returned.
   XML_TOK_PARTIAL is returned when no character is available at all. */
936 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
937 const char **nextTokPtr)
939 REQUIRE_CHAR(enc, ptr, end);
940 switch (BYTE_TYPE(enc, ptr)) {
941 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
944 return XML_TOK_INVALID;
946 while (HAS_CHAR(enc, ptr, end)) {
947 switch (BYTE_TYPE(enc, ptr)) {
948 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
949 case BT_CR: case BT_LF: case BT_S:
950 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
952 return XML_TOK_POUND_NAME;
955 return XML_TOK_INVALID;
958 return -XML_TOK_POUND_NAME;
/* Scans a quoted literal; `open` is the byte type of the opening quote
   (prologTok passes BT_QUOT or BT_APOS) and ptr is just past that quote.
   Invalid characters inside the literal are rejected through
   INVALID_CASES.  Once the literal is closed, the following character is
   inspected:
   - no further input: -XML_TOK_LITERAL (negated token);
   - whitespace, BT_GT, BT_PERCNT or BT_LSQB: XML_TOK_LITERAL;
   - anything else: XML_TOK_INVALID.
   XML_TOK_PARTIAL is returned when the input ends inside the literal. */
962 PREFIX(scanLit)(int open, const ENCODING *enc,
963 const char *ptr, const char *end,
964 const char **nextTokPtr)
966 while (HAS_CHAR(enc, ptr, end)) {
967 int t = BYTE_TYPE(enc, ptr);
969 INVALID_CASES(ptr, nextTokPtr)
975 if (! HAS_CHAR(enc, ptr, end))
976 return -XML_TOK_LITERAL;
978 switch (BYTE_TYPE(enc, ptr)) {
979 case BT_S: case BT_CR: case BT_LF:
980 case BT_GT: case BT_PERCNT: case BT_LSQB:
981 return XML_TOK_LITERAL;
983 return XML_TOK_INVALID;
990 return XML_TOK_PARTIAL;
/* Produces the next token from the prolog / DTD.
   For encodings whose minimum character width exceeds one byte, the
   available length is first masked down to a whole number of code units;
   XML_TOK_PARTIAL is returned on that path (presumably when no whole unit
   is left -- confirm).
   Dispatch on the first character, as far as this code shows:
   - quotes start a literal (scanLit with BT_QUOT or BT_APOS);
   - markup is forwarded to scanDecl or scanPi, or reported as
     XML_TOK_INSTANCE_START with *nextTokPtr backed up by one character;
   - whitespace runs give XML_TOK_PROLOG_S; a CR that is the last character
     of the input gives -XML_TOK_PROLOG_S because it may be the first half
     of a CR/LF pair, and a run is not ended in the middle of such a pair;
   - '%' is forwarded to scanPercent, '#'-names to scanPoundName;
   - punctuation gives XML_TOK_COMMA, XML_TOK_OPEN_BRACKET,
     XML_TOK_CLOSE_BRACKET, XML_TOK_COND_SECT_CLOSE, XML_TOK_OPEN_PAREN,
     XML_TOK_CLOSE_PAREN and its _ASTERISK/_QUESTION/_PLUS variants, and
     XML_TOK_DECL_CLOSE;
   - names are classified as XML_TOK_NAME, XML_TOK_NMTOKEN or
     XML_TOK_PREFIXED_NAME while being walked, and a trailing occurrence
     indicator gives XML_TOK_NAME_PLUS, XML_TOK_NAME_ASTERISK or
     XML_TOK_NAME_QUESTION (rejected with XML_TOK_INVALID for an NMTOKEN).
   Negated token codes are returned when the input ends right after a token
   whose interpretation depends on the next character (for example
   -XML_TOK_CLOSE_BRACKET and -XML_TOK_CLOSE_PAREN).
   XML_TOK_INVALID marks a syntax error. */
994 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
995 const char **nextTokPtr)
1000 if (MINBPC(enc) > 1) {
1001 size_t n = end - ptr;
1002 if (n & (MINBPC(enc) - 1)) {
1003 n &= ~(MINBPC(enc) - 1);
1005 return XML_TOK_PARTIAL;
1009 switch (BYTE_TYPE(enc, ptr)) {
1011 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1013 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1017 REQUIRE_CHAR(enc, ptr, end);
1018 switch (BYTE_TYPE(enc, ptr)) {
1020 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1022 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1029 *nextTokPtr = ptr - MINBPC(enc);
1030 return XML_TOK_INSTANCE_START;
1033 return XML_TOK_INVALID;
1036 if (ptr + MINBPC(enc) == end) {
1038 /* indicate that this might be part of a CR/LF pair */
1039 return -XML_TOK_PROLOG_S;
1042 case BT_S: case BT_LF:
1045 if (! HAS_CHAR(enc, ptr, end))
1047 switch (BYTE_TYPE(enc, ptr)) {
1048 case BT_S: case BT_LF:
1051 /* don't split CR/LF pair */
1052 if (ptr + MINBPC(enc) != end)
1057 return XML_TOK_PROLOG_S;
1061 return XML_TOK_PROLOG_S;
1063 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1065 *nextTokPtr = ptr + MINBPC(enc);
1066 return XML_TOK_COMMA;
1068 *nextTokPtr = ptr + MINBPC(enc);
1069 return XML_TOK_OPEN_BRACKET;
1072 if (! HAS_CHAR(enc, ptr, end))
1073 return -XML_TOK_CLOSE_BRACKET;
1074 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1075 REQUIRE_CHARS(enc, ptr, end, 2);
1076 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1077 *nextTokPtr = ptr + 2*MINBPC(enc);
1078 return XML_TOK_COND_SECT_CLOSE;
1082 return XML_TOK_CLOSE_BRACKET;
1084 *nextTokPtr = ptr + MINBPC(enc);
1085 return XML_TOK_OPEN_PAREN;
1088 if (! HAS_CHAR(enc, ptr, end))
1089 return -XML_TOK_CLOSE_PAREN;
1090 switch (BYTE_TYPE(enc, ptr)) {
1092 *nextTokPtr = ptr + MINBPC(enc);
1093 return XML_TOK_CLOSE_PAREN_ASTERISK;
1095 *nextTokPtr = ptr + MINBPC(enc);
1096 return XML_TOK_CLOSE_PAREN_QUESTION;
1098 *nextTokPtr = ptr + MINBPC(enc);
1099 return XML_TOK_CLOSE_PAREN_PLUS;
1100 case BT_CR: case BT_LF: case BT_S:
1101 case BT_GT: case BT_COMMA: case BT_VERBAR:
1104 return XML_TOK_CLOSE_PAREN;
1107 return XML_TOK_INVALID;
1109 *nextTokPtr = ptr + MINBPC(enc);
1112 *nextTokPtr = ptr + MINBPC(enc);
1113 return XML_TOK_DECL_CLOSE;
1115 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1116 #define LEAD_CASE(n) \
1117 case BT_LEAD ## n: \
1118 if (end - ptr < n) \
1119 return XML_TOK_PARTIAL_CHAR; \
1120 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1122 tok = XML_TOK_NAME; \
1125 if (IS_NAME_CHAR(enc, ptr, n)) { \
1127 tok = XML_TOK_NMTOKEN; \
1130 *nextTokPtr = ptr; \
1131 return XML_TOK_INVALID;
1132 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1145 tok = XML_TOK_NMTOKEN;
1149 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1154 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1156 tok = XML_TOK_NMTOKEN;
1162 return XML_TOK_INVALID;
1164 while (HAS_CHAR(enc, ptr, end)) {
1165 switch (BYTE_TYPE(enc, ptr)) {
1166 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1167 case BT_GT: case BT_RPAR: case BT_COMMA:
1168 case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1169 case BT_S: case BT_CR: case BT_LF:
1177 REQUIRE_CHAR(enc, ptr, end);
1178 tok = XML_TOK_PREFIXED_NAME;
1179 switch (BYTE_TYPE(enc, ptr)) {
1180 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1182 tok = XML_TOK_NMTOKEN;
1186 case XML_TOK_PREFIXED_NAME:
1187 tok = XML_TOK_NMTOKEN;
1193 if (tok == XML_TOK_NMTOKEN) {
1195 return XML_TOK_INVALID;
1197 *nextTokPtr = ptr + MINBPC(enc);
1198 return XML_TOK_NAME_PLUS;
1200 if (tok == XML_TOK_NMTOKEN) {
1202 return XML_TOK_INVALID;
1204 *nextTokPtr = ptr + MINBPC(enc);
1205 return XML_TOK_NAME_ASTERISK;
1207 if (tok == XML_TOK_NMTOKEN) {
1209 return XML_TOK_INVALID;
1211 *nextTokPtr = ptr + MINBPC(enc);
1212 return XML_TOK_NAME_QUESTION;
1215 return XML_TOK_INVALID;
/* Splits an attribute value into tokens.
   Returns XML_TOK_NONE on the initial empty-input path and XML_TOK_PARTIAL
   when less than one whole character is available (a defensive check, see
   the inline comment).  The value is then walked character by character:
   - multi-byte lead bytes skip their whole sequence (LEAD_CASE);
   - a reference at the start is handed to scanRef;
   - one character type is rejected with XML_TOK_INVALID (the inline
     comment ties this to entity references);
   - a line end at the start gives XML_TOK_DATA_NEWLINE (after a CR a
     following LF is checked for), or XML_TOK_TRAILING_CR when the CR is
     the last character of the input;
   - one character type at the start gives XML_TOK_ATTRIBUTE_VALUE_S;
   - whenever one of these special characters is met after ordinary data,
     the data collected so far is returned first as XML_TOK_DATA_CHARS.
   Reaching the end of the input also returns XML_TOK_DATA_CHARS. */
1222 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1223 const char *end, const char **nextTokPtr)
1227 return XML_TOK_NONE;
1228 else if (! HAS_CHAR(enc, ptr, end)) {
1229 /* This line cannot be executed. The incoming data has already
1230 * been tokenized once, so incomplete characters like this have
1231 * already been eliminated from the input. Retaining the paranoia
1232 * check is still valuable, however.
1234 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1237 while (HAS_CHAR(enc, ptr, end)) {
1238 switch (BYTE_TYPE(enc, ptr)) {
1239 #define LEAD_CASE(n) \
1240 case BT_LEAD ## n: ptr += n; break;
1241 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1245 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1247 return XML_TOK_DATA_CHARS;
1249 /* this is for inside entity references */
1251 return XML_TOK_INVALID;
1254 *nextTokPtr = ptr + MINBPC(enc);
1255 return XML_TOK_DATA_NEWLINE;
1258 return XML_TOK_DATA_CHARS;
1262 if (! HAS_CHAR(enc, ptr, end))
1263 return XML_TOK_TRAILING_CR;
1264 if (BYTE_TYPE(enc, ptr) == BT_LF)
1267 return XML_TOK_DATA_NEWLINE;
1270 return XML_TOK_DATA_CHARS;
1273 *nextTokPtr = ptr + MINBPC(enc);
1274 return XML_TOK_ATTRIBUTE_VALUE_S;
1277 return XML_TOK_DATA_CHARS;
1284 return XML_TOK_DATA_CHARS;
/* Splits an entity value into tokens.
   Returns XML_TOK_NONE on the initial empty-input path and XML_TOK_PARTIAL
   when less than one whole character is available (a defensive check, see
   the inline comment).  The value is then walked character by character:
   - multi-byte lead bytes skip their whole sequence (LEAD_CASE);
   - a reference at the start is handed to scanRef;
   - a '%' construct at the start is handed to scanPercent, and a bare
     XML_TOK_PERCENT result is turned into XML_TOK_INVALID;
   - a line end at the start gives XML_TOK_DATA_NEWLINE (after a CR a
     following LF is checked for), or XML_TOK_TRAILING_CR when the CR is
     the last character of the input;
   - whenever one of these special characters is met after ordinary data,
     the data collected so far is returned first as XML_TOK_DATA_CHARS.
   Reaching the end of the input also returns XML_TOK_DATA_CHARS. */
1288 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1289 const char *end, const char **nextTokPtr)
1293 return XML_TOK_NONE;
1294 else if (! HAS_CHAR(enc, ptr, end)) {
1295 /* This line cannot be executed. The incoming data has already
1296 * been tokenized once, so incomplete characters like this have
1297 * already been eliminated from the input. Retaining the paranoia
1298 * check is still valuable, however.
1300 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1303 while (HAS_CHAR(enc, ptr, end)) {
1304 switch (BYTE_TYPE(enc, ptr)) {
1305 #define LEAD_CASE(n) \
1306 case BT_LEAD ## n: ptr += n; break;
1307 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1311 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1313 return XML_TOK_DATA_CHARS;
1316 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1318 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1321 return XML_TOK_DATA_CHARS;
1324 *nextTokPtr = ptr + MINBPC(enc);
1325 return XML_TOK_DATA_NEWLINE;
1328 return XML_TOK_DATA_CHARS;
1332 if (! HAS_CHAR(enc, ptr, end))
1333 return XML_TOK_TRAILING_CR;
1334 if (BYTE_TYPE(enc, ptr) == BT_LF)
1337 return XML_TOK_DATA_NEWLINE;
1340 return XML_TOK_DATA_CHARS;
1347 return XML_TOK_DATA_CHARS;
/* Scans the body of an ignored conditional section.
   For encodings whose minimum character width exceeds one byte, the
   available length is first masked down to a whole number of code units.
   The body is walked character by character; invalid characters are
   rejected through INVALID_CASES.  Two sequences get special treatment:
   a character followed by '!' and '[' (an opening marker), and a character
   followed by ']' and '>' (a closing marker).  XML_TOK_IGNORE_SECT is
   returned on the closing-marker path; how opening markers affect that
   (for example a nesting count) should be confirmed against the full
   function.  XML_TOK_PARTIAL is returned when the input ends first. */
1353 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1354 const char *end, const char **nextTokPtr)
1357 if (MINBPC(enc) > 1) {
1358 size_t n = end - ptr;
1359 if (n & (MINBPC(enc) - 1)) {
1360 n &= ~(MINBPC(enc) - 1);
1364 while (HAS_CHAR(enc, ptr, end)) {
1365 switch (BYTE_TYPE(enc, ptr)) {
1366 INVALID_CASES(ptr, nextTokPtr)
1369 REQUIRE_CHAR(enc, ptr, end);
1370 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1372 REQUIRE_CHAR(enc, ptr, end);
1373 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1381 REQUIRE_CHAR(enc, ptr, end);
1382 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1384 REQUIRE_CHAR(enc, ptr, end);
1385 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1389 return XML_TOK_IGNORE_SECT;
1400 return XML_TOK_PARTIAL;
1403 #endif /* XML_DTD */
/* Checks the characters of a public identifier spanning [ptr, end), one
   minimum-width character at a time.  Characters are classified by byte
   type; a tab is tested for explicitly, and some characters are looked at
   by their ASCII value after a test that no bit above 0x7f is set.
   NOTE(review): badPtr presumably receives the position of the first
   offending character and the result is a boolean -- confirm against the
   full function. */
1406 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1407 const char **badPtr)
1411 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1412 switch (BYTE_TYPE(enc, ptr)) {
1436 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1443 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1446 switch (BYTE_TO_ASCII(enc, ptr)) {
/* Attribute extraction for an already validated tag.
   The scan is a three-state machine (other / inName / inValue) that starts
   in inName and has no end-of-input test of its own, which is why the
   input must be well-formed.  For each attribute with index below attsMax
   it records name, valuePtr (one character past the opening quote),
   valueEnd (at the closing quote) and a `normalized` flag.  The flag starts
   at 1 and is cleared when the value contains something that would need
   normalization: the visible cases are line ends inside the value, and
   spaces that are leading, doubled, or directly before the closing quote.
   `open` holds the byte type of the current opening quote so that only the
   matching quote closes the value. */
1460 /* This must only be called for a well-formed start-tag or empty
1461 element tag. Returns the number of attributes. Pointers to the
1462 first attsMax attributes are stored in atts.
1466 PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1467 int attsMax, ATTRIBUTE *atts)
1469 enum { other, inName, inValue } state = inName;
1471 int open = 0; /* defined when state == inValue;
1472 initialization just to shut up compilers */
1474 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1475 switch (BYTE_TYPE(enc, ptr)) {
1476 #define START_NAME \
1477 if (state == other) { \
1478 if (nAtts < attsMax) { \
1479 atts[nAtts].name = ptr; \
1480 atts[nAtts].normalized = 1; \
1484 #define LEAD_CASE(n) \
1485 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1486 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1495 if (state != inValue) {
1496 if (nAtts < attsMax)
1497 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1501 else if (open == BT_QUOT) {
1503 if (nAtts < attsMax)
1504 atts[nAtts].valueEnd = ptr;
1509 if (state != inValue) {
1510 if (nAtts < attsMax)
1511 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1515 else if (open == BT_APOS) {
1517 if (nAtts < attsMax)
1518 atts[nAtts].valueEnd = ptr;
1523 if (nAtts < attsMax)
1524 atts[nAtts].normalized = 0;
1527 if (state == inName)
1529 else if (state == inValue
1531 && atts[nAtts].normalized
1532 && (ptr == atts[nAtts].valuePtr
1533 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1534 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1535 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1536 atts[nAtts].normalized = 0;
1538 case BT_CR: case BT_LF:
1539 /* This case ensures that the first attribute name is counted
1540 Apart from that we could just change state on the quote. */
1541 if (state == inName)
1543 else if (state == inValue && nAtts < attsMax)
1544 atts[nAtts].normalized = 0;
1548 if (state != inValue)
/* Converts a character reference to its numeric value.
   ptr is advanced past the first two characters of the reference (the
   "&#" prefix).  If an 'x' follows, hexadecimal digits (0-9, A-F, a-f) are
   accumulated up to the ';'; otherwise decimal digits are accumulated up to
   the ';'.  There is no end-of-input test, so the reference must already
   have been validated by the scanner.  The running value is tested against
   0x110000 after every digit, which keeps it from growing without bound.
   The final value is passed through checkCharRefNumber and that result is
   returned. */
1558 static int PTRFASTCALL
1559 PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
1563 ptr += 2*MINBPC(enc);
1564 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1565 for (ptr += MINBPC(enc);
1566 !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1567 ptr += MINBPC(enc)) {
1568 int c = BYTE_TO_ASCII(enc, ptr);
1570 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1571 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1573 result |= (c - ASCII_0);
1575 case ASCII_A: case ASCII_B: case ASCII_C:
1576 case ASCII_D: case ASCII_E: case ASCII_F:
1578 result += 10 + (c - ASCII_A);
1580 case ASCII_a: case ASCII_b: case ASCII_c:
1581 case ASCII_d: case ASCII_e: case ASCII_f:
1583 result += 10 + (c - ASCII_a);
1586 if (result >= 0x110000)
1591 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1592 int c = BYTE_TO_ASCII(enc, ptr);
1594 result += (c - ASCII_0);
1595 if (result >= 0x110000)
1599 return checkCharRefNumber(result);
/* Recognizes the names of the predefined entities.
   The name length in characters, (end - ptr) / MINBPC(enc), selects the
   candidates:
   - names whose second character is 't' are distinguished by their first
     character;
   - the sequence a, m, p is matched character by character;
   - the sequences ending in u, o, t and in p, o, s are matched after a
     switch on the first character.
   NOTE(review): the values returned for a match and for no match are
   produced by statements outside this excerpt -- confirm there. */
1603 PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
1606 switch ((end - ptr)/MINBPC(enc)) {
1608 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1609 switch (BYTE_TO_ASCII(enc, ptr)) {
1618 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1620 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1622 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1628 switch (BYTE_TO_ASCII(enc, ptr)) {
1631 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1633 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1635 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1642 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1644 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1646 if (CHAR_MATCHES(enc, ptr, ASCII_s))
/* Compares the encoded name in [ptr1, end1) with the NUL-terminated ASCII
   string ptr2, one character at a time.  Returns 0 when the encoded name
   runs out before ptr2 does; a mismatching character also ends the
   comparison.  When every character of ptr2 has matched, the result is
   non-zero only if the encoded name has been consumed completely
   (ptr1 == end1), so a mere prefix match does not count. */
1657 PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
1658 const char *end1, const char *ptr2)
1660 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1661 if (end1 - ptr1 < MINBPC(enc)) {
1662 /* This line cannot be executed. The incoming data has already
1663 * been tokenized once, so incomplete characters like this have
1664 * already been eliminated from the input. Retaining the
1665 * paranoia check is still valuable, however.
1667 return 0; /* LCOV_EXCL_LINE */
1669 if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1672 return ptr1 == end1;
/* Returns the length in bytes of the name starting at ptr, computed as the
   distance between the final scan position and the start and cast to int.
   Multi-byte lead bytes advance the scan by their full sequence length
   (LEAD_CASE).  No end pointer is taken, so the scan relies on the input
   containing a character that ends the name. */
1675 static int PTRFASTCALL
1676 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1678 const char *start = ptr;
1680 switch (BYTE_TYPE(enc, ptr)) {
1681 #define LEAD_CASE(n) \
1682 case BT_LEAD ## n: ptr += n; break;
1683 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1697 return (int)(ptr - start);
/* Examines the byte type at ptr and returns a position in the same buffer.
   NOTE(review): going by the name this presumably skips leading whitespace
   and returns the first non-whitespace position -- confirm against the
   full function. */
1702 static const char * PTRFASTCALL
1703 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1706 switch (BYTE_TYPE(enc, ptr)) {
/* Advances the position record `pos` over the text between ptr and end.
   Multi-byte sequences are handled by LEAD_CASE.  On a line break the
   column is set to (XML_Size)-1; a CR directly followed by an LF is
   detected so that the pair is handled together.  pos->columnNumber is
   incremented as characters are consumed.
   NOTE(review): the -1 presumably wraps to 0 on a later increment, and
   line-number updates happen in statements outside this excerpt --
   confirm. */
1719 PREFIX(updatePosition)(const ENCODING *enc,
1724 while (HAS_CHAR(enc, ptr, end)) {
1725 switch (BYTE_TYPE(enc, ptr)) {
1726 #define LEAD_CASE(n) \
1727 case BT_LEAD ## n: \
1730 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1733 pos->columnNumber = (XML_Size)-1;
1740 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1742 pos->columnNumber = (XML_Size)-1;
1748 pos->columnNumber++;
/* Remove the helper macros so that this file can be included again with a
   different set of definitions. */
1753 #undef MULTIBYTE_CASES
1754 #undef INVALID_CASES
1755 #undef CHECK_NAME_CASE
1756 #undef CHECK_NAME_CASES
1757 #undef CHECK_NMSTRT_CASE
1758 #undef CHECK_NMSTRT_CASES
1760 #endif /* XML_TOK_IMPL_C */