2 * HTMLparser.c : an HTML 4.0 non-verifying parser
4 * See Copyright for the status of this software.
11 #ifdef LIBXML_HTML_ENABLED
20 #ifdef HAVE_SYS_STAT_H
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
50 #define HTML_MAX_NAMELEN 1000
51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
52 #define HTML_PARSER_BUFFER_SIZE 100
55 /* #define DEBUG_PUSH */
57 static int htmlOmittedDefaultValue = 1;
59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
61 static void htmlParseComment(htmlParserCtxtPtr ctxt);
63 /************************************************************************
65 * Some factorized error routines *
67 ************************************************************************/
71 * @ctxt: an HTML parser context
72 * @extra: extra information
74 * Handle a memory allocation failure
77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
91 "Memory allocation failed : %s\n", extra);
93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: first string forwarded to the error report
104 * @str2: second string forwarded to the error report
106 * Handle an HTML parser error (raised at XML_ERR_ERROR level), i.e. a violation of well-formedness constraints
108 static void LIBXML_ATTR_FORMAT(3,0)
109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
123 ctxt->wellFormed = 0;
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
133 * Handle an HTML parser error carrying an integer value (raised at XML_ERR_ERROR level), i.e. a violation of well-formedness constraints
135 static void LIBXML_ATTR_FORMAT(3,0)
136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
148 ctxt->wellFormed = 0;
151 /************************************************************************
153 * Parser stacks related functions and macros *
155 ************************************************************************/
159 * @ctxt: an HTML parser context
160 * @value: the element name
162 * Pushes a new element name on top of the name stack
164 * Returns 0 in case of error, the index in the stack otherwise
167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
173 if (ctxt->nameNr >= ctxt->nameMax) {
175 ctxt->nameTab = (const xmlChar * *)
176 xmlRealloc((xmlChar * *)ctxt->nameTab,
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
180 htmlErrMemory(ctxt, NULL);
184 ctxt->nameTab[ctxt->nameNr] = value;
186 return (ctxt->nameNr++);
190 * @ctxt: an HTML parser context
192 * Pops the top element name from the name stack
194 * Returns the name just removed
196 static const xmlChar *
197 htmlnamePop(htmlParserCtxtPtr ctxt)
201 if (ctxt->nameNr <= 0)
204 if (ctxt->nameNr < 0)
206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
210 ret = ctxt->nameTab[ctxt->nameNr];
211 ctxt->nameTab[ctxt->nameNr] = NULL;
217 * @ctxt: an HTML parser context
218 * @value: the node info
220 * Pushes a new node info record on top of the node info stack
222 * Returns 0 in case of error, the index in the stack otherwise
225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
247 * @ctxt: an HTML parser context
249 * Pops the top node info record from the node info stack
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
253 static htmlParserNodeInfo *
254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
256 if (ctxt->nodeInfoNr <= 0)
259 if (ctxt->nodeInfoNr < 0)
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
269 * Macros for accessing the content. Those should be used only by the parser,
272 * Dirty macros, i.e. one needs to make assumptions about the context to use them
274 * CUR_PTR return the current pointer to the xmlChar to be parsed.
275 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
276 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
277 * in UNICODE mode. This should be used internally by the parser
278 * only to compare to ASCII values otherwise it would break when
279 * running with UTF-8 encoding.
280 * NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
281 * to compare on ASCII based substring.
282 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
283 * it should be used only to compare on ASCII based substring.
284 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
285 * strings without newlines within the parser.
287 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
289 * CURRENT Returns the current char value, with the full decoding of
290 * UTF-8 if we are using this mode. It returns an int.
291 * NEXT Skip to the next character, this does the proper decoding
292 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
293 * NEXTL(l) Skip the current unicode character of l xmlChars long.
294 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
297 #define UPPER (toupper(*ctxt->input->cur))
299 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
301 #define NXT(val) ctxt->input->cur[(val)]
303 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
305 #define CUR_PTR ctxt->input->cur
306 #define BASE_PTR ctxt->input->base
308 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
309 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
310 xmlParserInputShrink(ctxt->input)
312 #define GROW if ((ctxt->progressive == 0) && \
313 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
314 xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
316 #define CURRENT ((int) (*ctxt->input->cur))
318 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
320 /* Imported from XML */
322 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
323 #define CUR ((int) (*ctxt->input->cur))
324 #define NEXT xmlNextChar(ctxt)
326 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
329 #define NEXTL(l) do { \
330 if (*(ctxt->input->cur) == '\n') { \
331 ctxt->input->line++; ctxt->input->col = 1; \
332 } else ctxt->input->col++; \
333 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
338 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
339 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
342 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
343 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
345 #define COPY_BUF(l,b,i,v) \
346 if (l == 1) b[i++] = (xmlChar) v; \
347 else i += xmlCopyChar(l,&b[i],v)
351 * @ctxt: the HTML parser context
353 * Try to find an encoding in the current data available in the input
354 * buffer; this is needed to try to switch to the proper encoding when
355 * one faces a character error.
356 * This is a heuristic: since it operates outside of parsing, it could
357 * try to use a meta which had been commented out; that's the reason it
358 * should only be used in case of error, not as a default.
360 * Returns an encoding string or NULL if not found, the string needs to
364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365 const xmlChar *start, *cur, *end;
367 if ((ctxt == NULL) || (ctxt->input == NULL) ||
368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369 (ctxt->input->buf->encoder != NULL))
371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
374 start = ctxt->input->cur;
375 end = ctxt->input->end;
376 /* we also expect the input buffer to be zero terminated */
380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
391 while (((*cur >= 'A') && (*cur <= 'Z')) ||
392 ((*cur >= 'a') && (*cur <= 'z')) ||
393 ((*cur >= '0') && (*cur <= '9')) ||
394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
398 return(xmlStrndup(start, cur - start));
403 * @ctxt: the HTML parser context
404 * @len: pointer to the length of the char read
406 * The current char value, if using UTF-8 this may actually span multiple
407 * bytes in the input buffer. Implement the end of line normalization:
408 * 2.11 End-of-Line Handling
409 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410 * char, then the encoding converter is plugged in automatically.
412 * Returns the current char value and its length
416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417 if (ctxt->instate == XML_PARSER_EOF)
420 if (ctxt->token != 0) {
424 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
426 * We are supposed to handle UTF8, check it's valid
427 * From rfc2044: encoding of the Unicode values on UTF-8:
429 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
430 * 0000 0000-0000 007F 0xxxxxxx
431 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
432 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
434 * Check for the 0x110000 limit too
436 const unsigned char *cur = ctxt->input->cur;
443 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
444 cur = ctxt->input->cur;
446 if ((cur[1] & 0xc0) != 0x80)
448 if ((c & 0xe0) == 0xe0) {
451 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
452 cur = ctxt->input->cur;
454 if ((cur[2] & 0xc0) != 0x80)
456 if ((c & 0xf0) == 0xf0) {
458 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
459 cur = ctxt->input->cur;
461 if (((c & 0xf8) != 0xf0) ||
462 ((cur[3] & 0xc0) != 0x80))
466 val = (cur[0] & 0x7) << 18;
467 val |= (cur[1] & 0x3f) << 12;
468 val |= (cur[2] & 0x3f) << 6;
469 val |= cur[3] & 0x3f;
473 val = (cur[0] & 0xf) << 12;
474 val |= (cur[1] & 0x3f) << 6;
475 val |= cur[2] & 0x3f;
480 val = (cur[0] & 0x1f) << 6;
481 val |= cur[1] & 0x3f;
484 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
485 "Char 0x%X out of allowed range\n", val);
489 if ((*ctxt->input->cur == 0) &&
490 (ctxt->input->cur < ctxt->input->end)) {
491 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
492 "Char 0x%X out of allowed range\n", 0);
498 return((int) *ctxt->input->cur);
502 * Assume it's a fixed length encoding (1) with
503 * a compatible encoding for the ASCII set, since
504 * XML constructs only use < 128 chars
507 if ((int) *ctxt->input->cur < 0x80)
508 return((int) *ctxt->input->cur);
511 * Humm this is bad, do an automatic flow conversion
515 xmlCharEncodingHandlerPtr handler;
517 guess = htmlFindEncoding(ctxt);
519 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
521 if (ctxt->input->encoding != NULL)
522 xmlFree((xmlChar *) ctxt->input->encoding);
523 ctxt->input->encoding = guess;
524 handler = xmlFindCharEncodingHandler((const char *) guess);
525 if (handler != NULL) {
526 xmlSwitchToEncoding(ctxt, handler);
528 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
529 "Unsupported encoding %s", guess, NULL);
532 ctxt->charset = XML_CHAR_ENCODING_UTF8;
535 return(xmlCurrentChar(ctxt, len));
539 * If we detect a UTF-8 error, that probably means that the
540 * input encoding didn't get properly advertised in the
541 * declaration header. Report the error and switch the encoding
542 * to ISO-Latin-1 (if you don't like this policy, just declare the
548 if (ctxt->input->end - ctxt->input->cur >= 4) {
549 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
550 ctxt->input->cur[0], ctxt->input->cur[1],
551 ctxt->input->cur[2], ctxt->input->cur[3]);
553 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
555 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
556 "Input is not proper UTF-8, indicate encoding !\n",
557 BAD_CAST buffer, NULL);
560 ctxt->charset = XML_CHAR_ENCODING_8859_1;
562 return((int) *ctxt->input->cur);
566 * htmlSkipBlankChars:
567 * @ctxt: the HTML parser context
569 * Skip all blank characters found at that point in the input stream.
571 * Returns the number of space chars skipped
575 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
578 while (IS_BLANK_CH(*(ctxt->input->cur))) {
579 if ((*ctxt->input->cur == 0) &&
580 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
583 if (*(ctxt->input->cur) == '\n') {
584 ctxt->input->line++; ctxt->input->col = 1;
585 } else ctxt->input->col++;
588 if (*ctxt->input->cur == 0)
589 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
598 /************************************************************************
600 * The list of HTML elements and their properties *
602 ************************************************************************/
605 * Start Tag: 1 means the start tag can be omitted
606 * End Tag: 1 means the end tag can be omitted
607 * 2 means it's forbidden (empty elements)
608 * 3 means the tag is stylistic and should be closed easily
609 * Depr: this element is deprecated
610 * DTD: 1 means that this element is valid only in the Loose DTD
611 * 2 means that this element is valid only in the Frameset DTD
613 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
614 , subElements , impliedsubelt , Attributes, userdata
617 /* Definitions and a couple of vars for HTML Elements */
619 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
620 #define NB_FONTSTYLE 8
621 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
623 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
624 #define NB_SPECIAL 16
625 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
626 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
627 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
628 #define NB_BLOCK NB_HEADING + NB_LIST + 14
629 #define FORMCTRL "input", "select", "textarea", "label", "button"
630 #define NB_FORMCTRL 5
633 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
635 #define LIST "ul", "ol", "dir", "menu"
638 #define NB_MODIFIER 0
639 #define FLOW BLOCK,INLINE
640 #define NB_FLOW NB_BLOCK + NB_INLINE
644 static const char* const html_flow[] = { FLOW, NULL } ;
645 static const char* const html_inline[] = { INLINE, NULL } ;
647 /* placeholders: elts with content but no subelements */
648 static const char* const html_pcdata[] = { NULL } ;
649 #define html_cdata html_pcdata
652 /* ... and for HTML Attributes */
654 #define COREATTRS "id", "class", "style", "title"
655 #define NB_COREATTRS 4
656 #define I18N "lang", "dir"
658 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
660 #define ATTRS COREATTRS,I18N,EVENTS
661 #define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS) /* entry count of ATTRS (COREATTRS, I18N, EVENTS) */
662 #define CELLHALIGN "align", "char", "charoff"
663 #define NB_CELLHALIGN 3
664 #define CELLVALIGN "valign"
665 #define NB_CELLVALIGN 1
667 static const char* const html_attrs[] = { ATTRS, NULL } ;
668 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
669 static const char* const core_attrs[] = { COREATTRS, NULL } ;
670 static const char* const i18n_attrs[] = { I18N, NULL } ;
673 /* Other declarations that should go inline ... */
674 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
675 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
676 "tabindex", "onfocus", "onblur", NULL } ;
677 static const char* const target_attr[] = { "target", NULL } ;
678 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
679 static const char* const alt_attr[] = { "alt", NULL } ;
680 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
681 static const char* const href_attrs[] = { "href", NULL } ;
682 static const char* const clear_attrs[] = { "clear", NULL } ;
683 static const char* const inline_p[] = { INLINE, "p", NULL } ;
685 static const char* const flow_param[] = { FLOW, "param", NULL } ;
686 static const char* const applet_attrs[] = { COREATTRS , "codebase",
687 "archive", "alt", "name", "height", "width", "align",
688 "hspace", "vspace", NULL } ;
689 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
690 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
691 static const char* const basefont_attrs[] =
692 { "id", "size", "color", "face", NULL } ;
693 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
694 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
695 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
696 static const char* const body_depr[] = { "background", "bgcolor", "text",
697 "link", "vlink", "alink", NULL } ;
698 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
699 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
702 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
703 static const char* const col_elt[] = { "col", NULL } ;
704 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
705 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
706 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
707 static const char* const compact_attr[] = { "compact", NULL } ;
708 static const char* const label_attr[] = { "label", NULL } ;
709 static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ; /* NULL sentinel ends the list, as in every sibling table */
710 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
711 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
712 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
713 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
714 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
715 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
716 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
717 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
718 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
719 static const char* const version_attr[] = { "version", NULL } ;
720 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
721 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
722 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
723 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
724 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
725 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
726 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
727 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
728 static const char* const align_attr[] = { "align", NULL } ;
729 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
730 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
731 static const char* const name_attr[] = { "name", NULL } ;
732 static const char* const action_attr[] = { "action", NULL } ;
733 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
734 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
735 static const char* const content_attr[] = { "content", NULL } ;
736 static const char* const type_attr[] = { "type", NULL } ;
737 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
738 static const char* const object_contents[] = { FLOW, "param", NULL } ;
739 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
740 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
741 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
742 static const char* const option_elt[] = { "option", NULL } ;
743 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
744 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
745 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
746 static const char* const width_attr[] = { "width", NULL } ;
747 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
748 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
749 static const char* const language_attr[] = { "language", NULL } ;
750 static const char* const select_content[] = { "optgroup", "option", NULL } ;
751 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
752 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
753 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
754 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
755 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
756 static const char* const tr_elt[] = { "tr", NULL } ;
757 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
758 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
759 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
760 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
761 static const char* const tr_contents[] = { "th", "td", NULL } ;
762 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
763 static const char* const li_elt[] = { "li", NULL } ;
764 static const char* const ul_depr[] = { "type", "compact", NULL} ;
765 static const char* const dir_attr[] = { "dir", NULL} ;
767 #define DECL (const char**)
769 static const htmlElemDesc
770 html40ElementTable[] = {
771 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
772 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
774 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
775 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
777 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
778 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
780 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
781 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
783 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
784 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
786 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
787 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
789 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
790 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
792 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
793 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
795 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
796 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
798 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
799 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
801 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
802 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
804 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
805 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
807 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
808 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
810 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
811 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
813 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
814 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
816 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
817 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
819 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
820 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
822 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
823 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
825 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
826 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
828 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
829 EMPTY , NULL , DECL col_attrs , NULL, NULL
831 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
832 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
834 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
835 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
837 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
838 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
840 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
841 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
843 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
844 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
846 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
847 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
849 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
850 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
852 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
855 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
856 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
858 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
859 EMPTY, NULL, DECL embed_attrs, NULL, NULL
861 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
862 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
864 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
865 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
867 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
868 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
870 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
871 EMPTY, NULL, NULL, DECL frame_attrs, NULL
873 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
874 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
876 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
877 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
879 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
880 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
882 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
883 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
885 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
886 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
888 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
889 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
891 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
892 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
894 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
895 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
897 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
898 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
900 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
901 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
903 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
904 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
906 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
907 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
909 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
910 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
912 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
913 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
915 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
916 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
918 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
919 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
921 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
922 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
924 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
925 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
927 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
928 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
930 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
931 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
933 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
934 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
936 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
937 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
939 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
940 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
942 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
943 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
945 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
946 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
948 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
949 DECL html_flow, "div", DECL html_attrs, NULL, NULL
951 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
952 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
954 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
955 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
957 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
958 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
960 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
961 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
963 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
964 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
966 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
967 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
969 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
970 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
972 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
973 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
975 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
976 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
978 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
979 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
981 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
982 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
984 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
985 DECL select_content, NULL, DECL select_attrs, NULL, NULL
987 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
988 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
990 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
991 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
993 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
994 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
996 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
997 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
999 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1000 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1002 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1003 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1005 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1006 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1008 { "table", 0, 0, 0, 0, 0, 0, 0, "",
1009 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1011 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1012 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1014 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1015 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1017 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1018 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1020 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1021 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1023 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1024 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1026 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1027 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1029 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1030 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1032 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1033 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1035 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1036 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1038 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1039 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1041 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1042 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1044 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1045 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1050 * start tags that imply the end of current element
/*
 * Layout: a flat sequence of NULL-terminated groups. The first string of a
 * group is a start tag; the strings that follow, up to the NULL, are the
 * open elements which that start tag implicitly closes.
 * htmlInitAutoClose() stores the address of each group's first string in
 * htmlStartCloseIndex, and htmlCheckAutoClose() scans a group for the
 * currently open tag.
 * NOTE(review): in this copy the "dt", "dd" and "tfoot" groups show no
 * terminating NULL before the next group starts, and the end of the
 * initializer is not visible; confirm against the full file that every
 * group is NULL-terminated, since both users of the table rely on it.
 * NOTE(review): FONTSTYLE (used in the "p" group) is a macro defined outside
 * this excerpt - presumably a comma-separated list of tag names; confirm.
 */
1052 static const char * const htmlStartClose[] = {
1053 "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1054 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1055 "listing", "xmp", "head", NULL,
1058 "body", "head", "style", "link", "title", "p", NULL,
1059 "frameset", "head", "style", "link", "title", "p", NULL,
1060 "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1061 "pre", "listing", "xmp", "head", "li", NULL,
1062 "hr", "p", "head", NULL,
1063 "h1", "p", "head", NULL,
1064 "h2", "p", "head", NULL,
1065 "h3", "p", "head", NULL,
1066 "h4", "p", "head", NULL,
1067 "h5", "p", "head", NULL,
1068 "h6", "p", "head", NULL,
1069 "dir", "p", "head", NULL,
1070 "address", "p", "head", "ul", NULL,
1071 "pre", "p", "head", "ul", NULL,
1072 "listing", "p", "head", NULL,
1073 "xmp", "p", "head", NULL,
1074 "blockquote", "p", "head", NULL,
1075 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1076 "xmp", "head", NULL,
1077 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1079 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1081 "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1082 "listing", "xmp", NULL,
1083 "ol", "p", "head", "ul", NULL,
1084 "menu", "p", "head", "ul", NULL,
1085 "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
1086 "div", "p", "head", NULL,
1087 "noscript", "p", NULL,
1088 "center", "font", "b", "i", "p", "head", NULL,
1089 "a", "a", "head", NULL,
1090 "caption", "p", NULL,
1091 "colgroup", "caption", "colgroup", "col", "p", NULL,
1092 "col", "caption", "col", "p", NULL,
1093 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1094 "listing", "xmp", "a", NULL,
1095 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1096 "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1097 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1098 "thead", "caption", "col", "colgroup", NULL,
1099 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1101 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1102 "tfoot", "tbody", "p", NULL,
1103 "optgroup", "option", NULL,
1104 "option", "option", NULL,
1105 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1106 "pre", "listing", "xmp", "a", NULL,
1107 /* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
1113 "strike", "head", NULL,
1114 "big", "head", NULL,
1115 "small", "head", NULL,
1118 "strong", "head", NULL,
1119 "dfn", "head", NULL,
1120 "code", "head", NULL,
1121 "samp", "head", NULL,
1122 "kbd", "head", NULL,
1123 "var", "head", NULL,
1124 "cite", "head", NULL,
1125 "abbr", "head", NULL,
1126 "acronym", "head", NULL,
1129 "img", "head", NULL,
1133 "font", "head", NULL,
1137 "map", "head", NULL,
1139 "sub", "head", NULL,
1140 "sup", "head", NULL,
1141 "span", "head", NULL,
1142 "bdo", "head", NULL,
1143 "iframe", "head", NULL,
1148 * The list of HTML elements which are supposed not to have
1149 * CDATA content and where a p element will be implied
1151 * TODO: extend that list by reading the HTML SGML DTD on
/*
 * htmlCheckParagraph() walks this array until it meets a NULL entry, so the
 * list has to stay NULL-terminated. The entries themselves are not shown in
 * this copy.
 */
1154 static const char *const htmlNoContentElements[] = {
1161 * The list of HTML attributes which are of content %Script;
1162 * NOTE: when adding ones, check htmlIsScriptAttribute() since
1163 * it assumes the name starts with 'on'
/*
 * htmlIsScriptAttribute() walks this array by element count
 * (sizeof array / sizeof element) rather than by looking for a sentinel.
 * The entries themselves are not shown in this copy.
 */
1165 static const char *const htmlScriptAttributes[] = {
1187 * This table is used by the htmlparser to know what to do with
1188 * broken html pages. By assigning different priorities to different
1189 * elements the parser can decide how to handle extra endtags.
1190 * Endtags are only allowed to close elements with lower or equal priority.
/*
 * (name, priority) pairs consulted by htmlGetEndPriority(). The final
 * {NULL, 100} entry both stops that function's scan and supplies the
 * priority returned for every element name that is not listed. The named
 * entries are not shown in this copy.
 */
1199 static const elementPriority htmlEndPriority[] = {
1211 {NULL, 100} /* Default priority */
/*
 * Lookup index over htmlStartClose: each slot that is in use points at the
 * first string of one NULL-terminated group. Filled in by
 * htmlInitAutoClose(), read by htmlCheckAutoClose().
 */
1214 static const char** htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index has been filled. */
1215 static int htmlStartCloseIndexinitialized = 0;
1217 /************************************************************************
1219 * functions to handle HTML specific data *
1221 ************************************************************************/
1224 * htmlInitAutoClose:
1226 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1227 * This is not reentrant. Call xmlInitParser() once before processing in
1228 * case of use in multithreaded programs.
/*
 * Builds htmlStartCloseIndex: one slot per NULL-terminated group of
 * htmlStartClose, pointing at the group's first string.
 * NOTE(review): the return type, the declarations of indx and i, any reset
 * of indx between the two loops, the step past each group's NULL and the
 * closing braces are not visible in this copy - confirm against the full
 * file before changing the loop bounds.
 */
1231 htmlInitAutoClose(void) {
/* Already built: nothing to do. */
1234 if (htmlStartCloseIndexinitialized) return;
/* Clear all 100 slots before filling any of them. */
1236 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
/* Take groups until the table ends or 100 - 1 slots have been used. */
1238 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
/* Record where the current group starts. */
1239 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
/* Advance i to the NULL that ends the current group. */
1240 while (htmlStartClose[i] != NULL) i++;
/* Later calls return immediately at the first test above. */
1243 htmlStartCloseIndexinitialized = 1;
1248 * @tag: The tag name in lowercase
1250 * Lookup the HTML tag in the ElementTable
1252 * Returns the related htmlElemDescPtr or NULL if not found.
/*
 * Linear search of html40ElementTable for an entry whose name matches @tag.
 * Returns a pointer to the matching entry.
 * NOTE(review): the declaration of i and the return used when no entry
 * matches are not visible in this copy (the doc header says NULL).
 */
1254 const htmlElemDesc *
1255 htmlTagLookup(const xmlChar *tag) {
/* The entry count comes from the array size, so the table needs no sentinel. */
1258 for (i = 0; i < (sizeof(html40ElementTable) /
1259 sizeof(html40ElementTable[0]));i++) {
/*
 * A zero result from xmlStrcasecmp() selects the entry (a case-insensitive
 * comparison, going by the function name).
 */
1260 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1261 return((htmlElemDescPtr) &html40ElementTable[i]);
1267 * htmlGetEndPriority:
1268 * @name: The name of the element to look up the priority for.
1270 * Return value: The "endtag" priority.
/*
 * Scans htmlEndPriority for @name and returns the priority of the entry the
 * scan stops on. The scan ends either on the entry whose name equals @name
 * or on the terminating entry whose name is NULL, so a name that is not
 * listed gets the priority stored in that last entry.
 * NOTE(review): the declaration of i and the loop body that advances it are
 * not visible in this copy.
 */
1273 htmlGetEndPriority (const xmlChar *name) {
1276 while ((htmlEndPriority[i].name != NULL) &&
1277 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1280 return(htmlEndPriority[i].priority);
1285 * htmlCheckAutoClose:
1286 * @newtag: The new tag name
1287 * @oldtag: The old tag name
1289 * Checks whether the new tag is one of the registered tags that implicitly close @oldtag.
1291 * Initializes the htmlStartCloseIndex for fast lookup of closing tag names on first use.
1293 * Returns 0 if no, 1 if yes.
/*
 * Two-step lookup in htmlStartClose:
 *   1. find the group whose first string equals @newtag, using
 *      htmlStartCloseIndex;
 *   2. look for @oldtag among the strings of that group.
 * NOTE(review): the declarations of i and indx, the handling of an empty
 * index slot, the loop exits and every return statement are not visible in
 * this copy; the doc header gives the result as 0 for no and 1 for yes.
 */
1296 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1299 const char **closed = NULL;
/* Build the index lazily on first use. */
1301 if (htmlStartCloseIndexinitialized == 0)
1302 htmlInitAutoClose();
/* Step 1: walk the index slots looking for the group headed by newtag. */
1304 /* inefficient, but not a big deal */
1305 for (indx = 0; indx < 100; indx++) {
1306 closed = htmlStartCloseIndex[indx];
/* *closed is the first string of the group, i.e. the start tag it describes. */
1309 if (xmlStrEqual(BAD_CAST * closed, newtag))
/* Step 2: turn the group pointer back into an index into htmlStartClose... */
1313 i = closed - htmlStartClose;
/* ...and scan up to the group's terminating NULL for oldtag. */
1315 while (htmlStartClose[i] != NULL) {
1316 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1325 * htmlAutoCloseOnClose:
1326 * @ctxt: an HTML parser context
1327 * @newtag: The new tag name
1328 * Note: there is no @force parameter; the function takes only @ctxt and @newtag.
1330 * The HTML DTD allows an ending tag to implicitly close other tags.
/*
 * Handles an end tag named @newtag in two phases:
 *   1. search the stack of open elements (ctxt->nameTab), from the most
 *      recently opened one downwards, for an element named @newtag, giving
 *      up if an element with a higher end priority is met first;
 *   2. while the current element (ctxt->name) is not @newtag, report its end
 *      through the SAX endElement callback.
 * NOTE(review): the declarations of i and priority, the statements taken
 * when the search stops or fails, and whatever removes the current element
 * from the stack inside the second loop are not visible in this copy.
 */
1333 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1335 const htmlElemDesc *info;
/* Priority of the end tag being processed; compared against each open element. */
1338 priority = htmlGetEndPriority(newtag);
/* Phase 1: top of the stack is nameTab[nameNr - 1]. */
1340 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1342 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1345 * A missplaced endtag can only close elements with lower
1346 * or equal priority, so if we find an element with higher
1347 * priority before we find an element with
1348 * matching name, we just ignore this endtag
1350 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
/* Phase 2: close open elements until the one named newtag is current. */
1356 while (!xmlStrEqual(newtag, ctxt->name)) {
1357 info = htmlTagLookup(ctxt->name);
/*
 * Only elements whose table entry has endTag == 3 produce a mismatch error;
 * unknown elements (info == NULL) and all other values are closed silently.
 * The meaning of the value 3 is defined with htmlElemDesc, outside this
 * excerpt.
 */
1358 if ((info != NULL) && (info->endTag == 3)) {
1359 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1360 "Opening and ending tag mismatch: %s and %s\n",
1361 newtag, ctxt->name);
/* The callback is optional: both the handler table and the slot may be NULL. */
1363 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1364 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1370 * htmlAutoCloseOnEnd:
1371 * @ctxt: an HTML parser context
1373 * Close all remaining tags at the end of the stream
/*
 * Reports the end of every element still open, one SAX endElement call per
 * stack entry.
 * NOTE(review): each call passes ctxt->name (the current element), not
 * nameTab[i], so the loop presumably removes the current element from the
 * stack on a line that is not visible in this copy - confirm. The
 * declaration of i and the statement guarded by the nameNr test are not
 * visible either.
 */
1376 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
/* No element is open. */
1380 if (ctxt->nameNr == 0)
/* One iteration per element that was open on entry. */
1382 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1383 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1384 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1391 * @ctxt: an HTML parser context
1392 * @newtag: The new tag name or NULL
1394 * The HTML DTD allows a tag to implicitly close other tags.
1395 * The list is kept in htmlStartClose array. This function is
1396 * called when a new tag has been detected and generates the
1397 * appropriate closes if possible/needed.
1398 * If newtag is NULL this means we are at the end of the resource
1399 * and every element that is still open gets closed.
/*
 * Two cases, selected by @newtag:
 *   - non-NULL: while the current element is one that @newtag implicitly
 *     closes (htmlCheckAutoClose), report its end through the SAX endElement
 *     callback;
 *   - NULL: hand over to htmlAutoCloseOnEnd().
 * NOTE(review): the statements that remove the element from the stack after
 * each endElement call, and whatever follows htmlAutoCloseOnEnd() inside the
 * NULL branch, are not visible in this copy.
 */
1402 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
/* Re-evaluated against the new current element on every iteration. */
1404 while ((newtag != NULL) && (ctxt->name != NULL) &&
1405 (htmlCheckAutoClose(newtag, ctxt->name))) {
1406 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1407 ctxt->sax->endElement(ctxt->userData, ctxt->name);
/* End of input: close whatever is still open. */
1410 if (newtag == NULL) {
1411 htmlAutoCloseOnEnd(ctxt);
/*
 * NOTE(review): this loop also requires newtag == NULL, the case already
 * routed to htmlAutoCloseOnEnd() just above, so it looks redundant -
 * confirm against the full file before relying on it or removing it.
 */
1414 while ((newtag == NULL) && (ctxt->name != NULL) &&
1415 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1416 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1417 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1418 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1426 * @doc: the HTML document
1427 * @name: The tag name
1428 * @elem: the HTML element
1430 * The HTML DTD allows a tag to implicitly close other tags.
1431 * The list is kept in htmlStartClose array. This function checks
1432 * if the element or one of its children would autoclose the given tag.
1435 * Returns 1 if autoclose, 0 otherwise
/*
 * Recursive check over the subtree rooted at @elem. In the lines shown here
 * @doc is only handed on to the recursive call.
 * NOTE(review): the declaration of child and the return taken when no
 * descendant matches are not visible in this copy.
 */
1438 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
/* A missing element is answered with 1. */
1441 if (elem == NULL) return(1);
/* An element carrying the same name as @name answers 0 for its whole subtree. */
1442 if (xmlStrEqual(name, elem->name)) return(0);
/* elem's own start tag is registered as closing @name. */
1443 if (htmlCheckAutoClose(elem->name, name)) return(1);
/* Otherwise the first child subtree that answers 1 decides. */
1444 child = elem->children;
1445 while (child != NULL) {
1446 if (htmlAutoCloseTag(doc, name, child)) return(1);
1447 child = child->next;
1454 * @doc: the HTML document
1455 * @elem: the HTML element
1457 * The HTML DTD allows a tag to implicitly close other tags.
1458 * The list is kept in htmlStartClose array. This function checks
1459 * if a tag is autoclosed by one of its children
1461 * Returns 1 if autoclosed, 0 otherwise
/*
 * Answers 1 for a NULL @elem, or as soon as htmlAutoCloseTag() reports that
 * one direct child's subtree would autoclose elem->name.
 * NOTE(review): the declaration of child and the return taken when no child
 * matches are not visible in this copy.
 */
1464 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1467 if (elem == NULL) return(1);
/* Check each direct child in document order. */
1468 child = elem->children;
1469 while (child != NULL) {
1470 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1471 child = child->next;
1478 * @ctxt: an HTML parser context
1479 * @newtag: The new tag name
1481 * The HTML DTD allows a tag to exist only implicitly. This function is
1482 * called when a new tag has been detected and generates the
1483 * appropriate implicit tags if they are missing.
/*
 * Opens the structural elements that @newtag needs but the document did not
 * spell out. Each implied element is pushed with htmlnamePush() and
 * announced through the SAX startElement callback with a NULL attribute
 * list. The order is "html" first, then either "head" or "body" depending on
 * @newtag.
 * NOTE(review): the declaration of i and the statement guarded by most of
 * the tests below are not visible in this copy; going by the comments that
 * are visible they are presumably early exits - confirm.
 */
1486 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
/* Guards: parser option, file-level switch, and the "html" tag itself. */
1489 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1491 if (!htmlOmittedDefaultValue)
1493 if (xmlStrEqual(newtag, BAD_CAST"html"))
/* Nothing is open yet: imply the root element. */
1495 if (ctxt->nameNr <= 0) {
1496 htmlnamePush(ctxt, BAD_CAST"html");
1497 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1498 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
/* "body" and "head" are tested on their own before the two branches below. */
1500 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
/*
 * First branch: at most one element is open and the new tag is one of
 * script, style, meta, link, title or base - a "head" is implied.
 */
1502 if ((ctxt->nameNr <= 1) &&
1503 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1504 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1505 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1506 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1507 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1508 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1509 if (ctxt->html >= 3) {
1510 /* we already saw or generated a <head> before */
1514 * dropped OBJECT ... i you put it first BODY will be
1517 htmlnamePush(ctxt, BAD_CAST"head");
1518 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1519 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
/*
 * Second branch: any other tag except noframes, frame and frameset - a
 * "body" is implied.
 */
1520 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1521 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1522 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1523 if (ctxt->html >= 10) {
1524 /* we already saw or generated a <body> before */
/* Look through the open elements for an existing "body" or "head". */
1527 for (i = 0;i < ctxt->nameNr;i++) {
1528 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1531 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1536 htmlnamePush(ctxt, BAD_CAST"body");
1537 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1538 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1543 * htmlCheckParagraph
1544 * @ctxt: an HTML parser context
1546 * Check whether a p element needs to be implied before inserting
1547 * characters in the current element.
1549 * Returns 1 if a paragraph has been inserted, 0 if not and -1
/*
 * Implies a "p" element in two places, each time with the same four steps:
 * htmlAutoClose() for "p", htmlCheckImplied() for "p", push "p" on the
 * element stack, then announce it through the SAX startElement callback
 * with a NULL attribute list.
 * NOTE(review): the declarations of tag and i, the test that selects the
 * first sequence, and every return statement are not visible in this copy.
 */
1554 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
/* First implied-paragraph sequence. */
1562 htmlAutoClose(ctxt, BAD_CAST"p");
1563 htmlCheckImplied(ctxt, BAD_CAST"p");
1564 htmlnamePush(ctxt, BAD_CAST"p");
1565 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
/* File-level switch, also tested by htmlCheckImplied(). */
1569 if (!htmlOmittedDefaultValue)
/* Second sequence: taken when tag is listed in htmlNoContentElements. */
1571 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1572 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1573 htmlAutoClose(ctxt, BAD_CAST"p");
1574 htmlCheckImplied(ctxt, BAD_CAST"p");
1575 htmlnamePush(ctxt, BAD_CAST"p");
1576 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1577 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1585 * htmlIsScriptAttribute:
1586 * @name: an attribute name
1588 * Check if an attribute is of content type Script
1590 * Returns 1 if the attribute is a script attribute, 0 otherwise
/*
 * Tests @name against the entries of htmlScriptAttributes, after a cheap
 * check on its first two characters.
 * NOTE(review): name[0] and name[1] are read directly; whether @name is
 * checked for NULL first is not visible in this copy. The declaration of i,
 * the rest of the for header and every return statement are not visible
 * either.
 */
1593 htmlIsScriptAttribute(const xmlChar *name) {
1599 * all script attributes start with 'on'
/* Prefix filter: anything not starting with "on" is rejected without a table scan. */
1601 if ((name[0] != 'o') || (name[1] != 'n'))
/* The bound is the element count of the array, so no sentinel entry is needed. */
1604 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1606 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1612 /************************************************************************
1614 * The list of HTML predefined entities *
1616 ************************************************************************/
1619 static const htmlEntityDesc html40EntitiesTable[] = {
1621 * the 4 absolute ones, plus apostrophe.
1623 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1624 { 38, "amp", "ampersand, U+0026 ISOnum" },
1625 { 39, "apos", "single quote" },
1626 { 60, "lt", "less-than sign, U+003C ISOnum" },
1627 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1630 * A bunch still in the 128-255 range
1631 * Replacing them depend really on the charset used.
1633 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1634 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1635 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1636 { 163, "pound","pound sign, U+00A3 ISOnum" },
1637 { 164, "curren","currency sign, U+00A4 ISOnum" },
1638 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1639 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1640 { 167, "sect", "section sign, U+00A7 ISOnum" },
1641 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1642 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1643 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1644 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1645 { 172, "not", "not sign, U+00AC ISOnum" },
1646 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1647 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1648 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1649 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1650 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1651 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1652 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1653 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1654 { 181, "micro","micro sign, U+00B5 ISOnum" },
1655 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1656 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1657 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1658 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1659 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1660 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1661 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1662 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1663 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1664 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1665 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1666 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1667 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1668 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1669 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1670 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1671 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1672 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1673 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1674 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1675 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1676 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1677 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1678 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1679 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1680 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1681 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1682 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1683 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1684 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1685 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1686 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1687 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1688 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1689 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1690 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1691 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1692 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1693 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1694 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1695 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1696 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1697 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1698 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1699 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1700 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1701 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1702 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1703 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1704 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1705 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1706 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1707 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1708 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1709 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1710 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1711 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1712 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1713 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1714 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1715 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1716 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1717 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1718 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1719 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1720 { 247, "divide","division sign, U+00F7 ISOnum" },
1721 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1722 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1723 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1724 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1725 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1726 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1727 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1728 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1730 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1731 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1732 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1733 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1734 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1737 * Anything below should really be kept as entities references
1739 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1741 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1742 { 732, "tilde","small tilde, U+02DC ISOdia" },
1744 { 913, "Alpha","greek capital letter alpha, U+0391" },
1745 { 914, "Beta", "greek capital letter beta, U+0392" },
1746 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1747 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1748 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1749 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1750 { 919, "Eta", "greek capital letter eta, U+0397" },
1751 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1752 { 921, "Iota", "greek capital letter iota, U+0399" },
1753 { 922, "Kappa","greek capital letter kappa, U+039A" },
1754 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1755 { 924, "Mu", "greek capital letter mu, U+039C" },
1756 { 925, "Nu", "greek capital letter nu, U+039D" },
1757 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1758 { 927, "Omicron","greek capital letter omicron, U+039F" },
1759 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1760 { 929, "Rho", "greek capital letter rho, U+03A1" },
1761 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1762 { 932, "Tau", "greek capital letter tau, U+03A4" },
1763 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1764 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1765 { 935, "Chi", "greek capital letter chi, U+03A7" },
1766 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1767 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1769 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1770 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1771 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1772 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1773 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1774 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1775 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1776 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1777 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1778 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1779 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1780 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1781 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1782 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1783 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1784 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1785 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1786 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1787 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1788 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1789 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1790 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1791 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1792 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1793 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1794 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1795 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1796 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1798 { 8194, "ensp", "en space, U+2002 ISOpub" },
1799 { 8195, "emsp", "em space, U+2003 ISOpub" },
1800 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1801 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1802 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1803 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1804 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1805 { 8211, "ndash","en dash, U+2013 ISOpub" },
1806 { 8212, "mdash","em dash, U+2014 ISOpub" },
1807 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1808 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1809 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1810 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1811 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1812 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1813 { 8224, "dagger","dagger, U+2020 ISOpub" },
1814 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1816 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1817 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1819 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1821 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1822 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1824 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1825 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1827 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1828 { 8260, "frasl","fraction slash, U+2044 NEW" },
1830 { 8364, "euro", "euro sign, U+20AC NEW" },
1832 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1833 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1834 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1835 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1836 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1837 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1838 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1839 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1840 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1841 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1842 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1843 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1844 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1845 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1846 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1847 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1849 { 8704, "forall","for all, U+2200 ISOtech" },
1850 { 8706, "part", "partial differential, U+2202 ISOtech" },
1851 { 8707, "exist","there exists, U+2203 ISOtech" },
1852 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1853 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1854 { 8712, "isin", "element of, U+2208 ISOtech" },
1855 { 8713, "notin","not an element of, U+2209 ISOtech" },
1856 { 8715, "ni", "contains as member, U+220B ISOtech" },
1857 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1858 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1859 { 8722, "minus","minus sign, U+2212 ISOtech" },
1860 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1861 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1862 { 8733, "prop", "proportional to, U+221D ISOtech" },
1863 { 8734, "infin","infinity, U+221E ISOtech" },
1864 { 8736, "ang", "angle, U+2220 ISOamso" },
1865 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1866 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1867 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1868 { 8746, "cup", "union = cup, U+222A ISOtech" },
1869 { 8747, "int", "integral, U+222B ISOtech" },
1870 { 8756, "there4","therefore, U+2234 ISOtech" },
1871 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1872 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1873 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1874 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1875 { 8801, "equiv","identical to, U+2261 ISOtech" },
1876 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1877 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1878 { 8834, "sub", "subset of, U+2282 ISOtech" },
1879 { 8835, "sup", "superset of, U+2283 ISOtech" },
1880 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1881 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1882 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1883 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1884 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1885 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1886 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1887 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1888 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1889 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1890 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1891 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1892 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1893 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1895 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1896 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1897 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1898 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1902 /************************************************************************
1904 * Commodity functions to handle entities *
1906 ************************************************************************/
1909 * Macro used to grow the current buffer.
/*
 * growBuffer(buffer): doubles buffer##_size and xmlRealloc()s the buffer to
 * the new size through the temporary tmp, so the old pointer is not
 * overwritten before the result has been checked. A failed reallocation is
 * reported through htmlErrMemory(ctxt, ...), so both ctxt and a matching
 * <buffer>_size variable must be in scope wherever the macro expands.
 * NOTE(review): the size is doubled with no overflow check visible here -
 * confirm that callers bound the growth.
 */
1911 #define growBuffer(buffer) { \
1913 buffer##_size *= 2; \
1914 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
1915 if (tmp == NULL) { \
1916 htmlErrMemory(ctxt, "growing buffer\n"); \
1925 * @name: the entity name
1927 * Lookup the given entity in EntitiesTable
1929 * TODO: the linear scan is really ugly, a hash table is really needed.
1931 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1933 const htmlEntityDesc *
1934 htmlEntityLookup(const xmlChar *name) {
/*
 * Linear scan over every entry of html40EntitiesTable. The entry count is
 * derived from the array size, so the loop tracks the table automatically.
 */
1937 for (i = 0;i < (sizeof(html40EntitiesTable)/
1938 sizeof(html40EntitiesTable[0]));i++) {
1939 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
/*
 * First entry whose name equals @name is returned. The table holds names
 * that differ only in case (dagger / Dagger, prime / Prime), so the lookup
 * depends on xmlStrEqual being case sensitive.
 */
1940 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1947 * htmlEntityValueLookup:
1948 * @value: the entity's unicode value
1950 * Lookup the given entity in EntitiesTable
1952 * TODO: the linear scan is really ugly, a hash table is really needed.
1954 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1956 const htmlEntityDesc *
1957 htmlEntityValueLookup(unsigned int value) {
/*
 * Scan for the first table entry whose code point is >= @value. The table
 * rows visible in this file are listed in ascending code point order, which
 * is what allows the decision to be taken at that first candidate.
 */
1960 for (i = 0;i < (sizeof(html40EntitiesTable)/
1961 sizeof(html40EntitiesTable[0]));i++) {
1962 if (html40EntitiesTable[i].value >= value) {
/*
 * Overshoot test: the candidate is strictly larger than @value, so no exact
 * entry exists (the statement taken in that case is outside this view -
 * presumably it ends the scan).
 */
1963 if (html40EntitiesTable[i].value > value)
/* Otherwise the candidate value equals @value: exact match. */
1965 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1973 * @out: a pointer to an array of bytes to store the result
1974 * @outlen: the length of @out
1975 * @in: a pointer to an array of UTF-8 chars
1976 * @inlen: the length of @in
1978 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1979 * plus HTML entities block of chars out.
1981 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1982 * The value of @inlen after return is the number of octets consumed
1983 * if the return value is non-negative, else unpredictable.
1984 * The value of @outlen after return is the number of octets produced.
1987 UTF8ToHtml(unsigned char* out, int *outlen,
1988 const unsigned char* in, int *inlen) {
/*
 * Converts UTF-8 input into ASCII, replacing each non-ASCII code point with
 * an entity reference. instart / outstart remember the initial positions so
 * that *inlen and *outlen can be reported as byte counts; processed is the
 * input position that *inlen is computed from.
 */
1989 const unsigned char* processed = in;
1990 const unsigned char* outend;
1991 const unsigned char* outstart = out;
1992 const unsigned char* instart = in;
1993 const unsigned char* inend;
/* The three output-related pointers are mandatory: -1 when any is NULL. */
1997 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2000 * initialization nothing to do
/* Both lengths are in/out parameters: on entry they give the buffer sizes. */
2006 inend = in + (*inlen);
2007 outend = out + (*outlen);
2008 while (in < inend) {
/*
 * Classify the lead byte d: below 0x80 is plain ASCII; 0x80..0xBF is a
 * continuation byte and cannot start a sequence; 0xC0..0xF7 starts a 2, 3
 * or 4 byte sequence, whose payload bits are masked into c and whose number
 * of expected continuation bytes is stored in trailing.
 */
2010 if (d < 0x80) { c= d; trailing= 0; }
2011 else if (d < 0xC0) {
2012 /* trailing byte in leading position */
2013 *outlen = out - outstart;
2014 *inlen = processed - instart;
2016 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2017 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2018 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2020 /* no chance for this in Ascii */
2021 *outlen = out - outstart;
2022 *inlen = processed - instart;
/* Truncated input: fewer bytes remain than the lead byte announced. */
2026 if (inend - in < trailing) {
/* Each continuation byte must match the bit pattern 10xxxxxx. */
2030 for ( ; trailing; trailing--) {
2031 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2037 /* assertion: c is a single UCS-4 value */
/*
 * NOTE(review): this space check uses >=, so it asks for more than one free
 * byte before a single byte is emitted - confirm this slack is intended.
 */
2039 if (out + 1 >= outend)
2044 const htmlEntityDesc * ent;
2049 * Try to lookup a predefined HTML entity for it
2052 ent = htmlEntityValueLookup(c);
/*
 * Decimal numeric reference body (#NNN) built in nbuf; presumably only used
 * when no named entity was found - the guarding condition is outside this
 * view.
 */
2054 snprintf(nbuf, sizeof(nbuf), "#%u", c);
/*
 * len is the length of the reference body; the extra 2 presumably accounts
 * for the leading ampersand and the trailing semicolon - confirm.
 */
2060 if (out + 2 + len >= outend)
2063 memcpy(out, cp, len);
/* Report bytes written to @out and bytes of @in accounted as processed. */
2069 *outlen = out - outstart;
2070 *inlen = processed - instart;
2075 * htmlEncodeEntities:
2076 * @out: a pointer to an array of bytes to store the result
2077 * @outlen: the length of @out
2078 * @in: a pointer to an array of UTF-8 chars
2079 * @inlen: the length of @in
2080 * @quoteChar: the quote character to escape (' or ") or zero.
2082 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2083 * plus HTML entities block of chars out.
2085 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2086 * The value of @inlen after return is the number of octets consumed
2087 * if the return value is non-negative, else unpredictable.
2088 * The value of @outlen after return is the number of octets produced.
2091 htmlEncodeEntities(unsigned char* out, int *outlen,
2092 const unsigned char* in, int *inlen, int quoteChar) {
/*
 * Same UTF-8 to ASCII-plus-entities conversion as UTF8ToHtml, except that
 * the markup characters and the caller-chosen quote character are escaped
 * as well (see the pass-through test further down).
 */
2093 const unsigned char* processed = in;
2094 const unsigned char* outend;
2095 const unsigned char* outstart = out;
2096 const unsigned char* instart = in;
2097 const unsigned char* inend;
/*
 * Argument validation: all four pointers, @in included, are tested here
 * (the statement executed on failure is outside this view).
 */
2101 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2103 outend = out + (*outlen);
2104 inend = in + (*inlen);
2105 while (in < inend) {
/*
 * Lead byte classification: ASCII, stray continuation byte (0x80..0xBF), or
 * start of a 2, 3 or 4 byte sequence with trailing continuation bytes.
 */
2107 if (d < 0x80) { c= d; trailing= 0; }
2108 else if (d < 0xC0) {
2109 /* trailing byte in leading position */
2110 *outlen = out - outstart;
2111 *inlen = processed - instart;
2113 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2114 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2115 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2117 /* no chance for this in Ascii */
2118 *outlen = out - outstart;
2119 *inlen = processed - instart;
/*
 * The remaining input length is checked once, before the continuation loop;
 * the loop below therefore reads *in++ without a per-byte bound test.
 */
2123 if (inend - in < trailing)
2126 while (trailing--) {
2127 if (((d= *in++) & 0xC0) != 0x80) {
2128 *outlen = out - outstart;
2129 *inlen = processed - instart;
2136 /* assertion: c is a single UCS-4 value */
/*
 * Pass-through set: ASCII characters other than the quote character, the
 * ampersand and the two angle brackets. Anything else takes the entity path.
 */
2137 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2138 (c != '&') && (c != '<') && (c != '>')) {
2143 const htmlEntityDesc * ent;
2149 * Try to lookup a predefined HTML entity for it
2151 ent = htmlEntityValueLookup(c);
/*
 * Decimal numeric reference body (#NNN); presumably the fallback when no
 * named entity exists - the guarding condition is outside this view.
 */
2153 snprintf(nbuf, sizeof(nbuf), "#%u", c);
/*
 * NOTE(review): this bound uses > where UTF8ToHtml uses >= for the same
 * expression - confirm which of the two is the intended limit.
 */
2159 if (out + 2 + len > outend)
2162 memcpy(out, cp, len);
/* Report bytes written to @out and bytes of @in accounted as processed. */
2168 *outlen = out - outstart;
2169 *inlen = processed - instart;
2173 /************************************************************************
2175 * Commodity functions to handle streams *
2177 ************************************************************************/
2180 * htmlNewInputStream:
2181 * @ctxt: an HTML parser context
2183 * Create a new input stream structure
2184 * Returns the new input stream or NULL
2186 static htmlParserInputPtr
2187 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
/*
 * Allocates one htmlParserInput with xmlMalloc and clears it with memset, so
 * every field starts as zero / NULL; the explicit assignments that follow
 * restate the same defaults field by field. An allocation failure is
 * reported through htmlErrMemory on @ctxt.
 * NOTE(review): the result is cast to xmlParserInputPtr while the variable
 * is declared htmlParserInputPtr - presumably the two are aliases; confirm.
 */
2188 htmlParserInputPtr input;
2190 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2191 if (input == NULL) {
2192 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2195 memset(input, 0, sizeof(htmlParserInput));
2196 input->filename = NULL;
2197 input->directory = NULL;
2205 input->version = NULL;
2206 input->consumed = 0;
2212 /************************************************************************
2214 * Commodity functions, cleanup needed ? *
2216 ************************************************************************/
2218 * all tags allowing pc data from the html 4.01 loose dtd
2219 * NOTE: it might be more appropriate to integrate this information
2220 * into the html40ElementTable array but I don't want to risk any
2221 * binary incompatibility
2223 static const char *allowPCData[] = {
/*
 * Lowercase element names. areBlanks compares this list against the current
 * element name and against the name of the last child, using xmlStrEqual,
 * to decide whether a run of whitespace has to be kept.
 */
2224 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
2225 "blockquote", "body", "button", "caption", "center", "cite", "code",
2226 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
2227 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
2228 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
2229 "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
2234 * @ctxt: an HTML parser context
2236 * @len: the size of @str
2238 * Is this a sequence of blank chars that one can ignore ?
2240 * Returns 1 if ignorable 0 otherwise.
2243 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
/*
 * Decides whether the @len characters at @str form whitespace that can be
 * dropped, based on the text itself, the next input character, the current
 * element name and the tree built so far under ctxt->node.
 */
2246 xmlNodePtr lastChild;
/* Step 1: a single non-blank character makes the run significant. */
2249 for (j = 0;j < len;j++)
2250 if (!(IS_BLANK_CH(str[j]))) return(0);
/*
 * Step 2: look at the character following the run. End of input makes the
 * blanks ignorable; anything other than the start of a tag makes them
 * significant.
 */
2252 if (CUR == 0) return(1);
2253 if (CUR != '<') return(0);
/*
 * Step 3: special cases keyed on the current element name (none, html,
 * head). The value returned by each test is outside this view.
 */
2254 if (ctxt->name == NULL)
2256 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2258 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2261 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2262 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2263 dtd = xmlGetIntSubset(ctxt->myDoc);
2264 if (dtd != NULL && dtd->ExternalID != NULL) {
/* The public identifier is matched case-insensitively (xmlStrcasecmp). */
2265 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2266 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
/* Step 4: decisions that depend on the already built tree. */
2271 if (ctxt->node == NULL) return(0);
2272 lastChild = xmlGetLastChild(ctxt->node);
/* Comment nodes are skipped backwards: they do not count as content here. */
2273 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2274 lastChild = lastChild->prev;
2275 if (lastChild == NULL) {
2276 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2277 (ctxt->node->content != NULL)) return(0);
2278 /* keep ws in constructs like ...<b> </b>...
2279 for all tags "b" allowing PCDATA */
2280 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2281 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2285 } else if (xmlNodeIsText(lastChild)) {
2288 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2289 for all tags "p" allowing PCDATA */
2290 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2291 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2301 * @URI: URI for the dtd, or NULL
2302 * @ExternalID: the external ID of the DTD, or NULL
2304 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2307 * Returns a new document, do not initialize the DTD if not provided
2310 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
/*
 * Builds an empty HTML document node: the xmlDoc is allocated with xmlMalloc,
 * zeroed with memset and then typed as XML_HTML_DOCUMENT_NODE. A failed
 * allocation is reported with a NULL context, since no parser context
 * exists at this point.
 */
2314 * Allocate a new document and fill the fields.
2316 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2318 htmlErrMemory(NULL, "HTML document creation failed\n");
2321 memset(cur, 0, sizeof(xmlDoc));
2323 cur->type = XML_HTML_DOCUMENT_NODE;
2324 cur->version = NULL;
2325 cur->intSubset = NULL;
2328 cur->children = NULL;
2329 cur->extSubset = NULL;
2331 cur->encoding = NULL;
/* Non-zero defaults: these are the fields the memset above does not cover. */
2332 cur->standalone = 1;
2333 cur->compression = 0;
2336 cur->_private = NULL;
2337 cur->charset = XML_CHAR_ENCODING_UTF8;
2338 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
/*
 * An internal subset named html is attached only when an identifier was
 * supplied (the second half of the condition is outside this view -
 * presumably it tests @URI).
 */
2339 if ((ExternalID != NULL) ||
2341 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2347 * @URI: URI for the dtd, or NULL
2348 * @ExternalID: the external ID of the DTD, or NULL
2350 * Creates a new HTML document
2352 * Returns a new document
2355 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
/*
 * Thin wrapper over htmlNewDocNoDtD. When the caller supplies neither
 * identifier, the HTML 4.0 Transitional public identifier and the loose.dtd
 * system identifier are substituted, so the document always gets a DTD;
 * otherwise the arguments are forwarded unchanged.
 */
2356 if ((URI == NULL) && (ExternalID == NULL))
2357 return(htmlNewDocNoDtD(
2358 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2359 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2361 return(htmlNewDocNoDtD(URI, ExternalID));
2365 /************************************************************************
2367 * The parser itself *
2368 * Relates to http://www.w3.org/TR/html40 *
2370 ************************************************************************/
2372 /************************************************************************
2374 * The parser itself *
2376 ************************************************************************/
/*
 * Forward declaration: htmlParseName falls back to this routine, which is
 * defined after it in the file.
 */
2378 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2381 * htmlParseHTMLName:
2382 * @ctxt: an HTML parser context
2384 * parse an HTML tag or attribute name, note that we convert it to lowercase
2385 * since HTML names are not case-sensitive.
2387 * Returns the Tag Name parsed or NULL
2390 static const xmlChar *
2391 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
/*
 * Reads a tag or attribute name at the current position, lowercasing it into
 * the stack buffer loc, and returns the interned copy from ctxt->dict.
 */
2393 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
/*
 * A name must start with an ASCII letter, underscore, colon or dot;
 * anything else yields NULL.
 */
2395 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2396 (CUR != ':') && (CUR != '.')) return(NULL);
/*
 * Copy loop, capped at HTML_PARSER_BUFFER_SIZE characters so that loc cannot
 * overflow. Adding 0x20 maps an ASCII uppercase letter to its lowercase
 * form.
 */
2398 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2399 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2400 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2402 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
/* loc is not NUL terminated: the length i is passed explicitly. */
2409 return(xmlDictLookup(ctxt->dict, loc, i));
2414 * htmlParseHTMLName_nonInvasive:
2415 * @ctxt: an HTML parser context
2417 * parse an HTML tag or attribute name, note that we convert it to lowercase
2418 * since HTML names are not case-sensitive, this doesn't consume the data
2419 * from the stream, it's a look-ahead
2421 * Returns the Tag Name parsed or NULL
2424 static const xmlChar *
2425 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
/*
 * Look-ahead variant of htmlParseHTMLName: characters are read with
 * NXT(1 + i), i.e. starting one character after the current position, and
 * are never consumed in the code visible here.
 */
2427 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
/*
 * NOTE(review): unlike htmlParseHTMLName, neither the start test nor the
 * loop below accepts a dot - confirm that the difference is intended.
 */
2429 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2430 (NXT(1) != ':')) return(NULL);
/* Copy at most HTML_PARSER_BUFFER_SIZE characters, lowercasing A to Z. */
2432 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2433 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2434 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2435 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2436 else loc[i] = NXT(1+i);
/* The name is interned in the dictionary with an explicit length. */
2440 return(xmlDictLookup(ctxt->dict, loc, i));
2446 * @ctxt: an HTML parser context
2448 * parse an HTML name, this routine is case sensitive.
2450 * Returns the Name parsed or NULL
2453 static const xmlChar *
2454 htmlParseName(htmlParserCtxtPtr ctxt) {
/*
 * Case-sensitive name parser with two paths: a pointer-walking fast path for
 * pure ASCII names, and htmlParseNameComplex for everything else.
 */
2462 * Accelerator for simple ASCII names
2464 in = ctxt->input->cur;
/* Start character: a to z (0x61..0x7A), A to Z (0x41..0x5A), _ or colon. */
2465 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2466 ((*in >= 0x41) && (*in <= 0x5A)) ||
2467 (*in == '_') || (*in == ':')) {
/* Following characters additionally allow digits (0x30..0x39), - and dot. */
2469 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2470 ((*in >= 0x41) && (*in <= 0x5A)) ||
2471 ((*in >= 0x30) && (*in <= 0x39)) ||
2472 (*in == '_') || (*in == '-') ||
2473 (*in == ':') || (*in == '.'))
2476 if (in == ctxt->input->end)
/*
 * The fast path is accepted only when the name is followed by a non-NUL
 * ASCII byte; the name is then interned directly from the input buffer and
 * the cursor, character count and column are advanced by its length.
 */
2479 if ((*in > 0) && (*in < 0x80)) {
2480 count = in - ctxt->input->cur;
2481 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2482 ctxt->input->cur = in;
2483 ctxt->nbChars += count;
2484 ctxt->input->col += count;
/* Every other case goes to the general parser. */
2488 return(htmlParseNameComplex(ctxt));
2491 static const xmlChar *
2492 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
/*
 * General name parser working on decoded characters c rather than raw
 * bytes. The input buffer base is remembered so that a relocation of the
 * buffer during parsing can be detected further down.
 */
2496 const xmlChar *base = ctxt->input->base;
2499 * Handler for more complex cases
2503 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2504 (!IS_LETTER(c) && (c != '_') &&
2509 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2510 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2511 (c == '.') || (c == '-') ||
2512 (c == '_') || (c == ':') ||
2513 (IS_COMBINING(c)) ||
2514 (IS_EXTENDER(c)))) {
/*
 * Periodic action once more than 100 characters have been counted; the
 * action itself is outside this view (presumably an input refill).
 */
2515 if (count++ > 100) {
2522 if (ctxt->input->base != base) {
2524 * We changed encoding from an unknown encoding
2525 * Input buffer changed location, so we better start again
2527 return(htmlParseNameComplex(ctxt));
/*
 * Sanity test: the computed name start (cur - len) must not lie before the
 * buffer base. The name is then interned straight from the input buffer.
 */
2531 if (ctxt->input->base > ctxt->input->cur - len)
2534 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2539 * htmlParseHTMLAttribute:
2540 * @ctxt: an HTML parser context
2541 * @stop: a char stop value
2543 * parse an HTML attribute value till the stop (quote), if
2544 * stop is 0 then it stops at the first space
2546 * Returns the attribute parsed or NULL
2550 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
/*
 * Parses an attribute value into a heap buffer, expanding character and
 * entity references along the way. out is the write cursor inside buffer.
 * Every code point c written below is encoded as UTF-8 by the same pattern:
 * one lead byte (0xC0, 0xE0 or 0xF0 prefix for 2, 3 or 4 byte forms, or the
 * value itself for a single byte), then one continuation byte (0x80 prefix,
 * 6 payload bits) per remaining 6-bit group, driven by the bits counter.
 */
2551 xmlChar *buffer = NULL;
2552 int buffer_size = 0;
2553 xmlChar *out = NULL;
2554 const xmlChar *name = NULL;
2555 const xmlChar *cur = NULL;
2556 const htmlEntityDesc * ent;
2559 * allocate a translation buffer.
2561 buffer_size = HTML_PARSER_BUFFER_SIZE;
2562 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2563 if (buffer == NULL) {
2564 htmlErrMemory(ctxt, "buffer allocation failed\n");
2570 * Ok loop until we reach one of the ending chars
2572 while ((CUR != 0) && (CUR != stop)) {
/*
 * stop == 0 means an unquoted value: it ends at the closing angle bracket
 * or at the first blank instead of at a quote character.
 */
2573 if ((stop == 0) && (CUR == '>')) break;
2574 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
/* A hash after the current character selects a numeric character reference. */
2576 if (NXT(1) == '#') {
2580 c = htmlParseCharRef(ctxt);
2582 { *out++ = c; bits= -6; }
2584 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2585 else if (c < 0x10000)
2586 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2588 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2590 for ( ; bits >= 0; bits-= 6) {
2591 *out++ = ((c >> bits) & 0x3F) | 0x80;
/*
 * Growth check used throughout this function: once fewer than 100 bytes of
 * headroom remain, the write offset is saved in indx and out is rebuilt
 * from buffer afterwards, because the buffer address may change while it
 * grows (the growth statement itself is outside this view - presumably
 * growBuffer).
 */
2594 if (out - buffer > buffer_size - 100) {
2595 int indx = out - buffer;
2598 out = &buffer[indx];
/* Named entity reference: name receives the parsed entity name. */
2601 ent = htmlParseEntityRef(ctxt, &name);
2604 if (out - buffer > buffer_size - 100) {
2605 int indx = out - buffer;
2608 out = &buffer[indx];
2610 } else if (ent == NULL) {
2614 if (out - buffer > buffer_size - 100) {
2615 int indx = out - buffer;
2618 out = &buffer[indx];
2626 if (out - buffer > buffer_size - 100) {
2627 int indx = out - buffer;
2630 out = &buffer[indx];
2634 { *out++ = c; bits= -6; }
2636 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2637 else if (c < 0x10000)
2638 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2640 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2642 for ( ; bits >= 0; bits-= 6) {
2643 *out++ = ((c >> bits) & 0x3F) | 0x80;
2651 if (out - buffer > buffer_size - 100) {
2652 int indx = out - buffer;
2655 out = &buffer[indx];
2659 { *out++ = c; bits= -6; }
2661 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2662 else if (c < 0x10000)
2663 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2665 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2667 for ( ; bits >= 0; bits-= 6) {
2668 *out++ = ((c >> bits) & 0x3F) | 0x80;
2678 * htmlParseEntityRef:
2679 * @ctxt: an HTML parser context
2680 * @str: location to store the entity name
2682 * parse an HTML entity reference
2684 * [68] EntityRef ::= '&' Name ';'
2686 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2687 * if non-NULL *str will have to be freed by the caller.
2689 const htmlEntityDesc *
2690 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
/*
 * Parses the name of an entity reference and looks it up in the entity
 * table through htmlEntityLookup.
 */
2691 const xmlChar *name;
2692 const htmlEntityDesc * ent = NULL;
/* The out parameter is reset first, so every early exit leaves it NULL. */
2694 if (str != NULL) *str = NULL;
2695 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
/*
 * The name comes from htmlParseName, whose visible return paths all yield
 * strings interned in ctxt->dict.
 * NOTE(review): that sits oddly with the header remark that the caller has
 * to free the returned name - confirm the ownership rule.
 */
2699 name = htmlParseName(ctxt);
2701 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2702 "htmlParseEntityRef: no name\n", NULL, NULL);
2710 * Lookup the entity in the table.
2712 ent = htmlEntityLookup(name);
2713 if (ent != NULL) /* OK that's ugly !!! */
2716 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2717 "htmlParseEntityRef: expecting ';'\n",
2728 * htmlParseAttValue:
2729 * @ctxt: an HTML parser context
2731 * parse a value for an attribute
2732 * Note: the parser won't do substitution of entities here, this
2733 * will be handled later in xmlStringGetNodeList, unless it was
2734 * asked for ctxt->replaceEntities != 0
2736 * Returns the AttValue parsed or NULL.
2740 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
/*
 * Dispatches on the quoting style of the attribute value and delegates the
 * actual parsing to htmlParseHTMLAttribute, passing the expected closing
 * quote as stop character (double quote, single quote, or 0 for an unquoted
 * value). Each branch reports its own error through htmlParseErr:
 * XML_ERR_ATTRIBUTE_NOT_FINISHED for the two quoted forms and
 * XML_ERR_ATTRIBUTE_WITHOUT_VALUE for the unquoted form. The conditions
 * guarding those reports are outside this view.
 */
2741 xmlChar *ret = NULL;
2745 ret = htmlParseHTMLAttribute(ctxt, '"');
2747 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2748 "AttValue: \" expected\n", NULL, NULL);
2751 } else if (CUR == '\'') {
2753 ret = htmlParseHTMLAttribute(ctxt, '\'');
2755 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2756 "AttValue: ' expected\n", NULL, NULL);
2761 * That's an HTMLism, the attribute value may not be quoted
2763 ret = htmlParseHTMLAttribute(ctxt, 0);
2765 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2766 "AttValue: no value found\n", NULL, NULL);
2773 * htmlParseSystemLiteral:
2774 * @ctxt: an HTML parser context
2776 * parse an HTML Literal
2778 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2780 * Returns the SystemLiteral parsed or NULL
2784 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
/*
 * Parses a quoted system literal and returns a copy made with xmlStrndup.
 * The two branches are identical except for the quote character that ends
 * the scan.
 */
2785 size_t len = 0, startPosition = 0;
2786 xmlChar *ret = NULL;
/*
 * The start of the literal is kept as an offset from BASE_PTR, not as a
 * pointer, and the copy later re-derives the address from BASE_PTR.
 * Presumably this keeps the position valid if the input buffer moves while
 * scanning - confirm. The test before it guards against a cursor that lies
 * before the base, which would make the unsigned offset meaningless.
 */
2791 if (CUR_PTR < BASE_PTR)
2793 startPosition = CUR_PTR - BASE_PTR;
2795 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
2799 if (!IS_CHAR_CH(CUR)) {
2800 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2801 "Unfinished SystemLiteral\n", NULL, NULL);
2803 ret = xmlStrndup((BASE_PTR+startPosition), len);
2806 } else if (CUR == '\'') {
2809 if (CUR_PTR < BASE_PTR)
2811 startPosition = CUR_PTR - BASE_PTR;
2813 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
2817 if (!IS_CHAR_CH(CUR)) {
2818 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2819 "Unfinished SystemLiteral\n", NULL, NULL);
2821 ret = xmlStrndup((BASE_PTR+startPosition), len);
2825 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2826 " or ' expected\n", NULL, NULL);
2833 * htmlParsePubidLiteral:
2834 * @ctxt: an HTML parser context
2836 * parse an HTML public literal
2838 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2840 * Returns the PubidLiteral parsed or NULL.
2844 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
/*
 * Parses a quoted public identifier and returns a copy made with xmlStrndup.
 * As in htmlParseSystemLiteral, the start is kept as an offset from BASE_PTR
 * and the copy address is re-derived from BASE_PTR.
 * The two scan loops differ: the double-quoted branch stops on the first
 * character that is not a PubidChar, while the single-quoted branch also
 * stops explicitly on the single quote. Presumably this is because the
 * single quote is itself a PubidChar and the double quote is not - confirm
 * against IS_PUBIDCHAR_CH.
 */
2845 size_t len = 0, startPosition = 0;
2846 xmlChar *ret = NULL;
2848 * Name ::= (Letter | '_') (NameChar)*
2853 if (CUR_PTR < BASE_PTR)
2855 startPosition = CUR_PTR - BASE_PTR;
2857 while (IS_PUBIDCHAR_CH(CUR)) {
2863 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2864 "Unfinished PubidLiteral\n", NULL, NULL);
2866 ret = xmlStrndup((BASE_PTR + startPosition), len);
2869 } else if (CUR == '\'') {
2872 if (CUR_PTR < BASE_PTR)
2874 startPosition = CUR_PTR - BASE_PTR;
2876 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
2882 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2883 "Unfinished PubidLiteral\n", NULL, NULL);
2885 ret = xmlStrndup((BASE_PTR + startPosition), len);
2889 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2890 "PubidLiteral \" or ' expected\n", NULL, NULL);
2898 * @ctxt: an HTML parser context
2900 * parse the content of an HTML SCRIPT or STYLE element
2901 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2902 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2903 * http://www.w3.org/TR/html4/types.html#type-script
2904 * http://www.w3.org/TR/html4/types.html#h-6.15
2905 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2907 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2908 * element and the value of intrinsic event attributes. User agents must
2909 * not evaluate script data as HTML markup but instead must pass it on as
2910 * data to a script engine.
2912 * - The content is passed like CDATA
2913 * - the attributes for style and scripting "onXXX" are also described
2914 * as CDATA but SGML allows entities references in attributes so their
2915 * processing is identical as other attributes
2918 htmlParseScript(htmlParserCtxtPtr ctxt) {
/*
 * Copies raw script / style content into a stack buffer and hands it to the
 * SAX layer in chunks. The buffer is 5 bytes larger than the flush
 * threshold (presumably slack for one multi-byte character plus a
 * terminator - confirm against COPY_BUF).
 */
2919 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2925 while (IS_CHAR_CH(cur)) {
/* A close-tag opener inside the content decides whether the block ends. */
2926 if ((cur == '<') && (NXT(1) == '/')) {
2928 * One should break here, the specification is clear:
2929 * Authors should therefore escape "</" within the content.
2930 * Escape mechanisms are specific to each scripting or
2931 * style sheet language.
2933 * In recovery mode, only break if end tag match the
2934 * current tag, effectively ignoring all tags inside the
2935 * script/style block and treating the entire block as
2938 if (ctxt->recovery) {
/*
 * cur+2 skips the two characters of the close-tag opener, so the
 * case-insensitive comparison starts at the tag name and covers exactly the
 * length of the current element name.
 */
2939 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2940 xmlStrlen(ctxt->name)) == 0)
2944 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2945 "Element %s embeds close tag\n",
/* Outside recovery mode: a letter after the opener is what counts as a tag. */
2949 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2950 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2956 COPY_BUF(l,buf,nbchar,cur);
/*
 * Intermediate flush once the threshold is reached: cdataBlock is preferred
 * and characters is the fallback.
 * NOTE(review): no ctxt->sax NULL or disableSAX test is visible on this
 * path, unlike the final flush below - confirm whether that is guarded
 * elsewhere.
 */
2957 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2958 if (ctxt->sax->cdataBlock!= NULL) {
2960 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2962 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2963 } else if (ctxt->sax->characters != NULL) {
2964 ctxt->sax->characters(ctxt->userData, buf, nbchar);
/*
 * Loop ended on an invalid character. A NUL character is tolerated in
 * progressive (push) mode and is not reported.
 */
2973 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2974 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2975 "Invalid char in CDATA 0x%X\n", cur);
2976 if (ctxt->input->cur < ctxt->input->end) {
/* Final flush of whatever is left in the buffer. */
2981 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2982 if (ctxt->sax->cdataBlock!= NULL) {
2984 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2986 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2987 } else if (ctxt->sax->characters != NULL) {
2988 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2995 * htmlParseCharDataInternal:
2996 * @ctxt: an HTML parser context
2997 * @readahead: optional read ahead character in ascii range
2999 * parse a CharData section.
3000 * if we are within a CDATA section ']]>' marks an end of section.
3002 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3006 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
/*
 * Accumulates character data in a stack buffer and delivers it to the SAX
 * callbacks in chunks. The buffer is 6 bytes larger than the flush
 * threshold, one more than in htmlParseScript (presumably the extra byte is
 * for the readahead character stored first - confirm).
 */
3007 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3013 buf[nbchar++] = readahead;
/*
 * Stop at the start of a tag or of a reference, unless ctxt->token holds
 * that same character, in which case it is treated as plain data.
 */
3017 while (((cur != '<') || (ctxt->token == '<')) &&
3018 ((cur != '&') || (ctxt->token == '&')) &&
3020 if (!(IS_CHAR(cur))) {
3021 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3022 "Invalid char in CDATA 0x%X\n", cur);
3024 COPY_BUF(l,buf,nbchar,cur);
3026 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3028 * Ok the segment is to be consumed as chars.
/*
 * Delivery policy, used for this intermediate flush and for the final one:
 * a blank-only chunk goes to characters when keepBlanks is set and to
 * ignorableWhitespace otherwise; any other chunk first triggers
 * htmlCheckParagraph and then goes to characters.
 */
3030 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3031 if (areBlanks(ctxt, buf, nbchar)) {
3032 if (ctxt->keepBlanks) {
3033 if (ctxt->sax->characters != NULL)
3034 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3036 if (ctxt->sax->ignorableWhitespace != NULL)
3037 ctxt->sax->ignorableWhitespace(ctxt->userData,
3041 htmlCheckParagraph(ctxt);
3042 if (ctxt->sax->characters != NULL)
3043 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3050 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3066 * Ok the segment is to be consumed as chars.
3068 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3069 if (areBlanks(ctxt, buf, nbchar)) {
3070 if (ctxt->keepBlanks) {
3071 if (ctxt->sax->characters != NULL)
3072 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3074 if (ctxt->sax->ignorableWhitespace != NULL)
3075 ctxt->sax->ignorableWhitespace(ctxt->userData,
3079 htmlCheckParagraph(ctxt);
3080 if (ctxt->sax->characters != NULL)
3081 ctxt->sax->characters(ctxt->userData, buf, nbchar);
/*
 * The parser state is switched to end of input here; the condition guarding
 * this assignment is outside this view.
 */
3089 ctxt->instate = XML_PARSER_EOF;
3094 * htmlParseCharData:
3095 * @ctxt: an HTML parser context
3097 * parse a CharData section.
3098 * if we are within a CDATA section ']]>' marks an end of section.
3100 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3104 htmlParseCharData(htmlParserCtxtPtr ctxt) {
/* Entry point without a readahead character: 0 is passed as @readahead. */
3105 htmlParseCharDataInternal(ctxt, 0);
3109 * htmlParseExternalID:
3110 * @ctxt: an HTML parser context
3111 * @publicID: a xmlChar** receiving PubidLiteral
3113 * Parse an External ID or a Public ID
3115 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3116 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3118 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3120 * Returns the SystemLiteral; in the second
3121 * case publicID receives the PubidLiteral. If strict is off
3122 * it is possible to return NULL and have publicID set.
3126 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
/*
 * Parses either a SYSTEM or a PUBLIC external identifier. URI collects the
 * system literal and stays NULL when none is parsed.
 */
3127 xmlChar *URI = NULL;
/*
 * Keywords are matched through UPPER / UPP(n), presumably the uppercased
 * current and look-ahead characters, which would make the match case
 * insensitive - confirm against the macro definitions.
 */
3129 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3130 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3131 (UPP(4) == 'E') && (UPP(5) == 'M')) {
/* A missing blank after the keyword is reported as an error. */
3133 if (!IS_BLANK_CH(CUR)) {
3134 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3135 "Space required after 'SYSTEM'\n", NULL, NULL);
3138 URI = htmlParseSystemLiteral(ctxt);
3140 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3141 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3143 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3144 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3145 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3147 if (!IS_BLANK_CH(CUR)) {
3148 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3149 "Space required after 'PUBLIC'\n", NULL, NULL);
/*
 * NOTE(review): publicID is dereferenced with no NULL test visible in this
 * function - callers presumably always pass a valid pointer; confirm.
 */
3152 *publicID = htmlParsePubidLiteral(ctxt);
3153 if (*publicID == NULL) {
3154 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3155 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
/*
 * After PUBLIC the system literal is optional: it is parsed only when a
 * quote character follows.
 */
3159 if ((CUR == '"') || (CUR == '\'')) {
3160 URI = htmlParseSystemLiteral(ctxt);
3168 * @ctxt: an XML parser context
3170 * parse an XML Processing Instruction.
3172 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3175 htmlParsePI(htmlParserCtxtPtr ctxt) {
/*
 * Parses a processing instruction and reports it through the
 * processingInstruction SAX callback. The previous parser state is saved in
 * state and restored on each of the exit paths visible here.
 */
3176 xmlChar *buf = NULL;
3178 int size = HTML_PARSER_BUFFER_SIZE;
3180 const xmlChar *target;
3181 xmlParserInputState state;
3184 if ((RAW == '<') && (NXT(1) == '?')) {
3185 state = ctxt->instate;
3186 ctxt->instate = XML_PARSER_PI;
3188 * this is a Processing Instruction.
3194 * Parse the target name and check for special support like
3197 target = htmlParseName(ctxt);
3198 if (target != NULL) {
/*
 * First callback site: presumably the case of a target with no data
 * following it - the guarding condition is outside this view.
 */
3205 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3206 (ctxt->sax->processingInstruction != NULL))
3207 ctxt->sax->processingInstruction(ctxt->userData,
3209 ctxt->instate = state;
/* Data buffer for the instruction content, grown on demand below. */
3212 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3214 htmlErrMemory(ctxt, NULL);
3215 ctxt->instate = state;
3219 if (!IS_BLANK(cur)) {
3220 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3221 "ParsePI: PI %s space expected\n", target, NULL);
/*
 * The content loop ends at the first closing angle bracket, not at the
 * two-character question mark plus bracket sequence of the XML grammar
 * quoted in the function header.
 */
3225 while (IS_CHAR(cur) && (cur != '>')) {
/*
 * Grow before fewer than 5 bytes remain (presumably room for one multi-byte
 * character plus a terminator). The reallocation goes through tmp, so buf
 * stays valid if it fails.
 */
3226 if (len + 5 >= size) {
3230 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3232 htmlErrMemory(ctxt, NULL);
3234 ctxt->instate = state;
3244 COPY_BUF(l,buf,len,cur);
3255 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3256 "ParsePI: PI %s never end ...\n", target, NULL);
/* Second callback site: the instruction was read with its data. */
3263 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3264 (ctxt->sax->processingInstruction != NULL))
3265 ctxt->sax->processingInstruction(ctxt->userData,
3270 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3271 "PI is not started correctly", NULL, NULL);
3273 ctxt->instate = state;
3279 * @ctxt: an HTML parser context
3281 * Parse an XML (SGML) comment <!-- .... -->
3283 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
/*
 * htmlParseComment: handle a comment when the input is at "<!--"; returns
 * immediately when it is not.  ctxt->instate is set to XML_PARSER_COMMENT
 * while parsing and the saved state is written back on the exit paths
 * visible here.  The comment text is collected in a buffer obtained from
 * xmlMallocAtomic() (initially HTML_PARSER_BUFFER_SIZE bytes, grown with
 * xmlRealloc()) and passed to the SAX comment callback when one is
 * installed and SAX is not disabled.  An unterminated comment is reported
 * as XML_ERR_COMMENT_NOT_FINISHED, quoting at most 50 characters of the
 * buffer (%.50s).
 * NOTE(review): some lines of this function are not visible in this
 * excerpt - confirm details against the full file.
 */
3286 htmlParseComment(htmlParserCtxtPtr ctxt) {
3287 xmlChar *buf = NULL;
3289 int size = HTML_PARSER_BUFFER_SIZE;
3293 xmlParserInputState state;
3296 * Check that there is a comment right here.
3298 if ((RAW != '<') || (NXT(1) != '!') ||
3299 (NXT(2) != '-') || (NXT(3) != '-')) return;
3301 state = ctxt->instate;
3302 ctxt->instate = XML_PARSER_COMMENT;
3305 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3307 htmlErrMemory(ctxt, "buffer allocation failed\n");
3308 ctxt->instate = state;
3322 while (IS_CHAR(cur) &&
3324 (r != '-') || (q != '-'))) {
/* Grow the buffer once fewer than 5 spare bytes remain. */
3325 if (len + 5 >= size) {
3329 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3332 htmlErrMemory(ctxt, "growing buffer failed\n");
3333 ctxt->instate = state;
3338 COPY_BUF(ql,buf,len,q);
3354 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3355 (!ctxt->disableSAX))
3356 ctxt->sax->comment(ctxt->userData, buf);
3358 ctxt->instate = state;
3363 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3364 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3370 * @ctxt: an HTML parser context
3372 * parse Reference declarations
3374 * [66] CharRef ::= '&#' [0-9]+ ';' |
3375 * '&#x' [0-9a-fA-F]+ ';'
3377 * Returns the value parsed (as an int)
/*
 * htmlParseCharRef: parse a numeric character reference at the current
 * position.  Two forms are recognised: "&#x" / "&#X" followed by
 * hexadecimal digits, and "&#" followed by decimal digits; digits are
 * accumulated into val until ';' is seen.  A character that is not a valid
 * digit for the form raises XML_ERR_INVALID_HEX_CHARREF or
 * XML_ERR_INVALID_DEC_CHARREF; input that is not at "&#" raises
 * XML_ERR_INVALID_CHARREF.  An unacceptable value is reported through
 * htmlParseErrInt() as XML_ERR_INVALID_CHAR.
 * NOTE(review): no upper bound on val is visible while digits accumulate
 * (val * 16 + ..., val * 10 + ...); confirm whether a line not shown in
 * this excerpt guards against overflow on a long digit run.
 */
3380 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3383 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3384 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3385 "htmlParseCharRef: context error\n",
/* Hexadecimal form: "&#x" or "&#X". */
3389 if ((CUR == '&') && (NXT(1) == '#') &&
3390 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3392 while (CUR != ';') {
3393 if ((CUR >= '0') && (CUR <= '9'))
3394 val = val * 16 + (CUR - '0');
3395 else if ((CUR >= 'a') && (CUR <= 'f'))
3396 val = val * 16 + (CUR - 'a') + 10;
3397 else if ((CUR >= 'A') && (CUR <= 'F'))
3398 val = val * 16 + (CUR - 'A') + 10;
3400 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3401 "htmlParseCharRef: missing semicolon\n",
/* Decimal form: "&#". */
3409 } else if ((CUR == '&') && (NXT(1) == '#')) {
3411 while (CUR != ';') {
3412 if ((CUR >= '0') && (CUR <= '9'))
3413 val = val * 10 + (CUR - '0');
3415 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3416 "htmlParseCharRef: missing semicolon\n",
3425 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3426 "htmlParseCharRef: invalid value\n", NULL, NULL);
3429 * Check the value IS_CHAR ...
3434 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3435 "htmlParseCharRef: invalid xmlChar value %d\n",
3443 * htmlParseDocTypeDecl:
3444 * @ctxt: an HTML parser context
3446 * parse a DOCTYPE declaration
3448 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3449 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
/*
 * htmlParseDocTypeDecl: parse a DOCTYPE declaration.  The DOCTYPE name is
 * read with htmlParseName() and the identifiers with htmlParseExternalID(),
 * which returns the URI and stores the public identifier in ExternalID.
 * The three values are forwarded to the SAX internalSubset callback when
 * one is installed and SAX is not disabled.  URI and ExternalID are
 * released with xmlFree() at the end of the function.
 * NOTE(review): some lines of this function are not visible in this
 * excerpt - confirm details against the full file.
 */
3453 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3454 const xmlChar *name;
3455 xmlChar *ExternalID = NULL;
3456 xmlChar *URI = NULL;
3459 * We know that '<!DOCTYPE' has been detected.
3466 * Parse the DOCTYPE name.
3468 name = htmlParseName(ctxt);
3470 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3471 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3475 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3481 * Check for SystemID and ExternalID
3483 URI = htmlParseExternalID(ctxt, &ExternalID);
3487 * We should be at the end of the DOCTYPE declaration.
3490 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3491 "DOCTYPE improperly terminated\n", NULL, NULL);
3492 /* We shouldn't try to resynchronize ... */
3497 * Create or update the document accordingly to the DOCTYPE
3499 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3500 (!ctxt->disableSAX))
3501 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3504 * Cleanup, since we don't use all those identifiers
3506 if (URI != NULL) xmlFree(URI);
3507 if (ExternalID != NULL) xmlFree(ExternalID);
3511 * htmlParseAttribute:
3512 * @ctxt: an HTML parser context
3513 * @value: a xmlChar ** used to store the value of the attribute
3515 * parse an attribute
3517 * [41] Attribute ::= Name Eq AttValue
3519 * [25] Eq ::= S? '=' S?
3523 * [NS 11] Attribute ::= QName Eq AttValue
3525 * Also the case QName == xmlns:??? is handled independently as a namespace
3528 * Returns the attribute name, and the value in *value.
/*
 * htmlParseAttribute: parse one attribute.  The attribute name is read
 * with htmlParseHTMLName() and the value with htmlParseAttValue(); a
 * missing name is reported as XML_ERR_NAME_REQUIRED.
 * NOTE(review): the lines that handle '=', store into *value and return
 * are not visible in this excerpt - confirm against the full file.
 */
3531 static const xmlChar *
3532 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3533 const xmlChar *name;
3534 xmlChar *val = NULL;
3537 name = htmlParseHTMLName(ctxt);
3539 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3540 "error parsing attribute name\n", NULL, NULL);
3551 val = htmlParseAttValue(ctxt);
3559 * htmlCheckEncodingDirect:
3560 * @ctxt: an HTML parser context
3561 * @encoding: the encoding name
3563 * Checks an attribute value to detect
3565 * If a new encoding is detected the parser is switched to decode
/*
 * htmlCheckEncodingDirect: switch the parser to the encoding named by
 * @encoding.  Leading spaces and tabs of the name are skipped and a copy
 * of the name is stored in ctxt->input->encoding.  The name is first
 * resolved with xmlParseCharEncoding(); if that fails,
 * xmlFindCharEncodingHandler() is tried as a fallback, and an unknown name
 * is reported as XML_ERR_UNSUPPORTED_ENCODING.  When the input buffer has
 * an encoder, a raw buffer and a parser buffer, the already-processed part
 * of the parser buffer is dropped and the remaining raw input is converted
 * through xmlCharEncInput().
 * NOTE(review): the statements following the two early guards below are
 * not visible in this excerpt (presumably returns); confirm against the
 * full file.
 */
3569 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
/* Guard: missing context/encoding, or HTML_PARSE_IGNORE_ENC is set. */
3571 if ((ctxt == NULL) || (encoding == NULL) ||
3572 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3575 /* do not change encoding */
3576 if (ctxt->input->encoding != NULL)
3579 if (encoding != NULL) {
3580 xmlCharEncoding enc;
3581 xmlCharEncodingHandlerPtr handler;
3583 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
/* Replace any stored encoding name with a copy of the new one. */
3585 if (ctxt->input->encoding != NULL)
3586 xmlFree((xmlChar *) ctxt->input->encoding);
3587 ctxt->input->encoding = xmlStrdup(encoding);
3589 enc = xmlParseCharEncoding((const char *) encoding);
3591 * registered set of known encodings
3593 if (enc != XML_CHAR_ENCODING_ERROR) {
/* UTF-16 / UCS-4 announced while the input buffer has no encoder. */
3594 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3595 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3596 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3597 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3598 (ctxt->input->buf != NULL) &&
3599 (ctxt->input->buf->encoder == NULL)) {
3600 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3601 "htmlCheckEncoding: wrong encoding meta\n",
3604 xmlSwitchEncoding(ctxt, enc);
3606 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3609 * fallback for unknown encodings
3611 handler = xmlFindCharEncodingHandler((const char *) encoding);
3612 if (handler != NULL) {
3613 xmlSwitchToEncoding(ctxt, handler);
3614 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3616 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3617 "htmlCheckEncoding: unknown encoding %s\n",
3622 if ((ctxt->input->buf != NULL) &&
3623 (ctxt->input->buf->encoder != NULL) &&
3624 (ctxt->input->buf->raw != NULL) &&
3625 (ctxt->input->buf->buffer != NULL)) {
3630 * convert as much as possible to the parser reading buffer.
/* processed = number of bytes between the buffer base and the cursor. */
3632 processed = ctxt->input->cur - ctxt->input->base;
3633 xmlBufShrink(ctxt->input->buf->buffer, processed);
3634 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3636 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3637 "htmlCheckEncoding: encoder error\n",
3640 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3646 * htmlCheckEncoding:
3647 * @ctxt: an HTML parser context
3648 * @attvalue: the attribute value
3650 * Checks an http-equiv attribute from a Meta tag to detect
3652 * If a new encoding is detected the parser is switched to decode
/*
 * htmlCheckEncoding: look for a "charset" token (case-insensitive) inside
 * @attvalue and, when an '=' is found, pass what follows on to
 * htmlCheckEncodingDirect().
 * NOTE(review): the pointer adjustments between the visible lines are not
 * shown in this excerpt; note that the '=' search below restarts from
 * @attvalue rather than from the "charset" match - confirm the intent
 * against the full file.
 */
3656 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3657 const xmlChar *encoding;
3662 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3663 if (encoding != NULL) {
3669 if (encoding && IS_BLANK_CH(*encoding))
3670 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3671 if (encoding && *encoding == '=') {
3673 htmlCheckEncodingDirect(ctxt, encoding);
3679 * @ctxt: an HTML parser context
3680 * @atts: the attributes values
3682 * Checks the attributes of a Meta tag
/*
 * htmlCheckMeta: scan the attributes of a META tag for encoding
 * information.  Attribute names are compared case-insensitively:
 *   - "http-equiv" with value "Content-Type" is recognised,
 *   - "charset" passes its value straight to htmlCheckEncodingDirect(),
 *   - "content" is recognised.
 * After the scan, when the http flag is set and a content value was seen,
 * the content value is handed to htmlCheckEncoding().
 * NOTE(review): the statements executed for "http-equiv" and "content",
 * and the way att/value advance through @atts, are not visible in this
 * excerpt - confirm against the full file.
 */
3685 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3687 const xmlChar *att, *value;
3689 const xmlChar *content = NULL;
3691 if ((ctxt == NULL) || (atts == NULL))
3696 while (att != NULL) {
3698 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3699 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3701 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3702 htmlCheckEncodingDirect(ctxt, value);
3703 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3707 if ((http) && (content != NULL))
3708 htmlCheckEncoding(ctxt, content);
3713 * htmlParseStartTag:
3714 * @ctxt: an HTML parser context
3716 * parse a start of tag either for rule element or
3717 * EmptyElement. In both cases we don't parse the tag closing chars.
3719 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3721 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3725 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3727 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3729 * Returns 0 in case of success, -1 in case of error and 1 if discarded
/*
 * htmlParseStartTag: parse a start tag when the input is at '<'.
 *
 * Steps visible in this excerpt:
 *   - the element name is read with htmlParseHTMLName(); an invalid name
 *     is reported as XML_ERR_NAME_REQUIRED and, in recovery mode, some
 *     common misconstructs are re-parsed as character data;
 *   - htmlAutoClose() and htmlCheckImplied() are run for the new name;
 *   - a misplaced <html>, <head> or nested <body> is reported as
 *     XML_HTML_STRUCURE_ERROR;
 *   - attributes are read with htmlParseAttribute() and appended as
 *     name/value pairs to the NULL-terminated atts array, which is
 *     allocated on first use (22 slots) and grown with xmlRealloc(); the
 *     current capacity is kept in ctxt->maxatts;
 *   - for a "meta" element with attributes, htmlCheckMeta() is called;
 *   - the name is pushed with htmlnamePush() and the SAX startElement
 *     callback is invoked;
 *   - attribute values (odd indices of atts) are released with xmlFree().
 *
 * Returns -1 when the input is not at '<' (other return statements are not
 * visible here).
 * NOTE(review): many lines of this function are not visible in this
 * excerpt - confirm details against the full file.
 */
3733 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3734 const xmlChar *name;
3735 const xmlChar *attname;
3737 const xmlChar **atts;
3744 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3745 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3746 "htmlParseStartTag: context error\n", NULL, NULL);
3749 if (ctxt->instate == XML_PARSER_EOF)
3751 if (CUR != '<') return -1;
3755 maxatts = ctxt->maxatts;
3758 name = htmlParseHTMLName(ctxt);
3760 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3761 "htmlParseStartTag: invalid element name\n",
3763 /* if recover preserve text on classic misconstructs */
3764 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3765 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3766 htmlParseCharDataInternal(ctxt, '<');
3771 /* Dump the bogus tag like browsers do */
3772 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3773 (ctxt->instate != XML_PARSER_EOF))
3777 if (xmlStrEqual(name, BAD_CAST"meta"))
3781 * Check for auto-closure of HTML elements.
3783 htmlAutoClose(ctxt, name);
3786 * Check for implied HTML elements.
3788 htmlCheckImplied(ctxt, name);
3791 * Avoid html at any level > 0, head at any level != 1
3792 * or any attempt to recurse body
3794 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3795 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3796 "htmlParseStartTag: misplaced <html> tag\n",
3801 if ((ctxt->nameNr != 1) &&
3802 (xmlStrEqual(name, BAD_CAST"head"))) {
3803 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3804 "htmlParseStartTag: misplaced <head> tag\n",
3809 if (xmlStrEqual(name, BAD_CAST"body")) {
/* Look for a <body> that is already open on the name stack. */
3811 for (indx = 0;indx < ctxt->nameNr;indx++) {
3812 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3813 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3814 "htmlParseStartTag: misplaced <body> tag\n",
3823 * Now parse the attributes, it ends up with the ending
3828 while ((IS_CHAR_CH(CUR)) &&
3830 ((CUR != '/') || (NXT(1) != '>'))) {
/* Remember the consumed-character count to detect a stalled iteration. */
3831 long cons = ctxt->nbChars;
3834 attname = htmlParseAttribute(ctxt, &attvalue);
3835 if (attname != NULL) {
3838 * Well formedness requires at most one declaration of an attribute
/* Names sit at even indices of atts, values at odd indices. */
3840 for (i = 0; i < nbatts;i += 2) {
3841 if (xmlStrEqual(atts[i], attname)) {
3842 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3843 "Attribute %s redefined\n", attname, NULL);
3844 if (attvalue != NULL)
3851 * Add the pair to atts
3854 maxatts = 22; /* allow for 10 attrs by default */
3855 atts = (const xmlChar **)
3856 xmlMalloc(maxatts * sizeof(xmlChar *));
3858 htmlErrMemory(ctxt, NULL);
3859 if (attvalue != NULL)
3864 ctxt->maxatts = maxatts;
3865 } else if (nbatts + 4 > maxatts) {
3869 n = (const xmlChar **) xmlRealloc((void *) atts,
3870 maxatts * sizeof(const xmlChar *));
3872 htmlErrMemory(ctxt, NULL);
3873 if (attvalue != NULL)
3879 ctxt->maxatts = maxatts;
/* Append the pair and keep the array terminated by two NULL slots. */
3881 atts[nbatts++] = attname;
3882 atts[nbatts++] = attvalue;
3883 atts[nbatts] = NULL;
3884 atts[nbatts + 1] = NULL;
3887 if (attvalue != NULL)
3889 /* Dump the bogus attribute string up to the next blank or
3890 * the end of the tag. */
3891 while ((IS_CHAR_CH(CUR)) &&
3892 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3893 ((CUR != '/') || (NXT(1) != '>')))
/* nbChars unchanged: nothing was consumed during this iteration. */
3899 if (cons == ctxt->nbChars) {
3900 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3901 "htmlParseStartTag: problem parsing attributes\n",
3908 * Handle specific association to the META tag
3910 if (meta && (nbatts != 0))
3911 htmlCheckMeta(ctxt, atts);
3914 * SAX: Start of Element !
3917 htmlnamePush(ctxt, name);
3918 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3920 ctxt->sax->startElement(ctxt->userData, name, atts);
3922 ctxt->sax->startElement(ctxt->userData, name, NULL);
/* Release the attribute values (odd indices). */
3927 for (i = 1;i < nbatts;i += 2) {
3928 if (atts[i] != NULL)
3929 xmlFree((xmlChar *) atts[i]);
3938 * @ctxt: an HTML parser context
3940 * parse an end of tag
3942 * [42] ETag ::= '</' Name S? '>'
3946 * [NS 9] ETag ::= '</' QName S? '>'
3948 * Returns 1 if the current level should be closed.
/*
 * htmlParseEndTag: parse an end tag when the input is at "</".
 *
 * Steps visible in this excerpt:
 *   - input not at "</" is reported as XML_ERR_LTSLASH_REQUIRED;
 *   - the name is read with htmlParseHTMLName();
 *   - a missing '>' is reported as XML_ERR_GT_REQUIRED and, in recovery
 *     mode, input is skipped up to the next '>' or the end of input;
 *   - html/body/head end tags get special handling when ctxt->depth > 0;
 *   - the name is searched for on the name stack from the top down; the
 *     error "Unexpected end tag" is reported as XML_ERR_TAG_NAME_MISMATCH;
 *   - htmlAutoCloseOnClose() is run, a remaining mismatch with ctxt->name
 *     is reported, and when ctxt->name equals the parsed name the SAX
 *     endElement callback is invoked and htmlNodeInfoPop() is called.
 * NOTE(review): return statements and several conditions are not visible
 * in this excerpt - confirm details against the full file.
 */
3952 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3954 const xmlChar *name;
3955 const xmlChar *oldname;
3958 if ((CUR != '<') || (NXT(1) != '/')) {
3959 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3960 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3965 name = htmlParseHTMLName(ctxt);
3969 * We should definitely be at the ending "S? '>'" part
3972 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3973 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3974 "End tag : expected '>'\n", NULL, NULL);
3975 if (ctxt->recovery) {
3977 * We're not at the ending > !!
3978 * Error, unless in recover mode where we search forwards
3981 while (CUR != '\0' && CUR != '>') NEXT;
3988 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3991 if ((ctxt->depth > 0) &&
3992 (xmlStrEqual(name, BAD_CAST "html") ||
3993 xmlStrEqual(name, BAD_CAST "body") ||
3994 xmlStrEqual(name, BAD_CAST "head"))) {
4000 * If the name read is not one of the element in the parsing stack
4001 * then return, it's just an error.
/* Walk the name stack from the innermost open element outwards. */
4003 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4004 if (xmlStrEqual(name, ctxt->nameTab[i]))
4008 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4009 "Unexpected end tag : %s\n", name, NULL);
4015 * Check for auto-closure of HTML elements.
4018 htmlAutoCloseOnClose(ctxt, name);
4021 * Well formedness constraints, opening and closing must match.
4022 * With the exception that the autoclose may have popped stuff out
4025 if (!xmlStrEqual(name, ctxt->name)) {
4026 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4027 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4028 "Opening and ending tag mismatch: %s and %s\n",
4036 oldname = ctxt->name;
4037 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4038 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4039 ctxt->sax->endElement(ctxt->userData, name);
4040 htmlNodeInfoPop(ctxt);
4052 * htmlParseReference:
4053 * @ctxt: an HTML parser context
4055 * parse and handle entity references in content,
4056 * this will end-up in a call to character() since this is either a
4057 * CharRef, or a predefined entity.
/*
 * htmlParseReference: handle a '&' reference found in content; returns
 * immediately when the input is not at '&'.
 *
 *   - "&#...": the value from htmlParseCharRef() is encoded as UTF-8 into
 *     out[] and delivered through the SAX characters callback.
 *   - otherwise htmlParseEntityRef() is used; when the entity is unknown
 *     (ent is NULL or its value is not > 0) a literal "&" followed by the
 *     name is delivered as text, without a trailing ';' (that call is
 *     commented out); a second UTF-8 encoding sequence then delivers c
 *     through the characters callback.
 * htmlCheckParagraph() is called before each delivery.
 * NOTE(review): several lines (declarations of out/c/bits/i, the
 * conditions selecting each branch) are not visible in this excerpt -
 * confirm details against the full file.
 */
4060 htmlParseReference(htmlParserCtxtPtr ctxt) {
4061 const htmlEntityDesc * ent;
4063 const xmlChar *name;
4064 if (CUR != '&') return;
4066 if (NXT(1) == '#') {
4070 c = htmlParseCharRef(ctxt);
/* UTF-8 encode c: lead byte chosen by range, then 6-bit continuation bytes. */
4074 if (c < 0x80) { out[i++]= c; bits= -6; }
4075 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4076 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4077 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4079 for ( ; bits >= 0; bits-= 6) {
4080 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4084 htmlCheckParagraph(ctxt);
4085 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4086 ctxt->sax->characters(ctxt->userData, out, i);
4088 ent = htmlParseEntityRef(ctxt, &name);
4090 htmlCheckParagraph(ctxt);
4091 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4092 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4095 if ((ent == NULL) || !(ent->value > 0)) {
4096 htmlCheckParagraph(ctxt);
4097 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4098 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4099 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4100 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
/* Same UTF-8 encoding scheme as in the "&#" branch above. */
4108 { out[i++]= c; bits= -6; }
4110 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4111 else if (c < 0x10000)
4112 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4114 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4116 for ( ; bits >= 0; bits-= 6) {
4117 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4121 htmlCheckParagraph(ctxt);
4122 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4123 ctxt->sax->characters(ctxt->userData, out, i);
4130 * @ctxt: an HTML parser context
4132 * Parse a content: comment, sub-element, reference or text.
4133 * Kept for compatibility with old code
/*
 * htmlParseContent: parse element content - end tags, start tags,
 * SCRIPT/STYLE bodies, misplaced DOCTYPE declarations, comments,
 * processing instructions, sub-elements (through htmlParseElement()),
 * references and character data.
 *
 * currentNode is a copy of ctxt->name and depth the name-stack depth, both
 * taken on entry; currentNode is released with xmlFree() on the exit paths
 * visible here.  Each iteration records ctxt->nbChars first so that an
 * iteration which consumed nothing can be reported as
 * XML_ERR_INTERNAL_ERROR.
 * NOTE(review): the loop header, returns/breaks and some calls are not
 * visible in this excerpt - confirm details against the full file.
 */
4137 htmlParseContent(htmlParserCtxtPtr ctxt) {
4138 xmlChar *currentNode;
4140 const xmlChar *name;
4142 currentNode = xmlStrdup(ctxt->name);
4143 depth = ctxt->nameNr;
4145 long cons = ctxt->nbChars;
4149 if (ctxt->instate == XML_PARSER_EOF)
4153 * Our tag or one of it's parent or children is ending.
4155 if ((CUR == '<') && (NXT(1) == '/')) {
4156 if (htmlParseEndTag(ctxt) &&
4157 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4158 if (currentNode != NULL)
4159 xmlFree(currentNode);
4162 continue; /* while */
/* '<' followed by a letter, '_' or ':' : looks like a start tag. */
4165 else if ((CUR == '<') &&
4166 ((IS_ASCII_LETTER(NXT(1))) ||
4167 (NXT(1) == '_') || (NXT(1) == ':'))) {
4168 name = htmlParseHTMLName_nonInvasive(ctxt);
4170 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4171 "htmlParseStartTag: invalid element name\n",
4173 /* Dump the bogus tag like browsers do */
4174 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4177 if (currentNode != NULL)
4178 xmlFree(currentNode);
4182 if (ctxt->name != NULL) {
4183 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4184 htmlAutoClose(ctxt, name);
4191 * Has this node been popped out during parsing of
4194 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4195 (!xmlStrEqual(currentNode, ctxt->name)))
4197 if (currentNode != NULL) xmlFree(currentNode);
4201 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4202 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4204 * Handle SCRIPT/STYLE separately
4206 htmlParseScript(ctxt);
4209 * Sometimes DOCTYPE arrives in the middle of the document
4211 if ((CUR == '<') && (NXT(1) == '!') &&
4212 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4213 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4214 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4216 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4217 "Misplaced DOCTYPE declaration\n",
4218 BAD_CAST "DOCTYPE" , NULL);
4219 htmlParseDocTypeDecl(ctxt);
4223 * First case : a comment
4225 if ((CUR == '<') && (NXT(1) == '!') &&
4226 (NXT(2) == '-') && (NXT(3) == '-')) {
4227 htmlParseComment(ctxt);
4231 * Second case : a Processing Instruction.
4233 else if ((CUR == '<') && (NXT(1) == '?')) {
4238 * Third case : a sub-element.
4240 else if (CUR == '<') {
4241 htmlParseElement(ctxt);
4245 * Fourth case : a reference. If if has not been resolved,
4246 * parsing returns it's Name, create the node
4248 else if (CUR == '&') {
4249 htmlParseReference(ctxt);
4253 * Fifth case : end of the resource
4255 else if (CUR == 0) {
4256 htmlAutoCloseOnEnd(ctxt);
4261 * Last case, text. Note that References are handled directly.
4264 htmlParseCharData(ctxt);
/* nbChars unchanged: nothing was consumed during this iteration. */
4267 if (cons == ctxt->nbChars) {
4268 if (ctxt->node != NULL) {
4269 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4270 "detected an error in element content\n",
4278 if (currentNode != NULL) xmlFree(currentNode);
4283 * @ctxt: an HTML parser context
4285 * parse an HTML element, this is highly recursive
4286 * this is kept for compatibility with previous code versions
4288 * [39] element ::= EmptyElemTag | STag content ETag
4290 * [41] Attribute ::= Name Eq AttValue
/*
 * htmlParseElement: parse one element - start tag, content and end -
 * calling htmlParseContent() in a loop for the content.
 *
 * Steps visible in this excerpt:
 *   - when ctxt->record_info is set, the begin position and line are
 *     captured in a local htmlParserNodeInfo;
 *   - the start tag is parsed with htmlParseStartTag();
 *   - the element description is looked up with htmlTagLookup(); an
 *     unknown tag is reported as XML_HTML_UNKNOWN_TAG;
 *   - an "/>" ending and an element that is empty per its description
 *     (info->empty) both trigger the SAX endElement callback;
 *   - a missing '>' is reported as XML_ERR_GT_REQUIRED, and the end
 *     position is recorded through xmlParserAddNodeInfo();
 *   - content is parsed while the current character is valid, stopping
 *     when the input cursor did not move or the name stack dropped below
 *     the depth recorded before the loop;
 *   - the end position is recorded, htmlAutoCloseOnEnd() is called at the
 *     end of valid input, and currentNode is freed.
 * NOTE(review): returns and several conditions are not visible in this
 * excerpt - confirm details against the full file.
 */
4294 htmlParseElement(htmlParserCtxtPtr ctxt) {
4295 const xmlChar *name;
4296 xmlChar *currentNode = NULL;
4297 const htmlElemDesc * info;
4298 htmlParserNodeInfo node_info;
4301 const xmlChar *oldptr;
4303 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4304 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4305 "htmlParseElement: context error\n", NULL, NULL);
4309 if (ctxt->instate == XML_PARSER_EOF)
4312 /* Capture start position */
4313 if (ctxt->record_info) {
4314 node_info.begin_pos = ctxt->input->consumed +
4315 (CUR_PTR - ctxt->input->base);
4316 node_info.begin_line = ctxt->input->line;
4319 failed = htmlParseStartTag(ctxt);
4321 if ((failed == -1) || (name == NULL)) {
4328 * Lookup the info for that element.
4330 info = htmlTagLookup(name);
4332 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4333 "Tag %s invalid\n", name, NULL);
4337 * Check for an Empty Element labeled the XML/SGML way
4339 if ((CUR == '/') && (NXT(1) == '>')) {
4341 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4342 ctxt->sax->endElement(ctxt->userData, name);
4350 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4351 "Couldn't find end of Start Tag %s\n", name, NULL);
4354 * end of parsing of this node.
4356 if (xmlStrEqual(name, ctxt->name)) {
4362 * Capture end position and add node
4364 if (ctxt->record_info) {
4365 node_info.end_pos = ctxt->input->consumed +
4366 (CUR_PTR - ctxt->input->base);
4367 node_info.end_line = ctxt->input->line;
4368 node_info.node = ctxt->node;
4369 xmlParserAddNodeInfo(ctxt, &node_info);
4375 * Check for an Empty Element from DTD definition
4377 if ((info != NULL) && (info->empty)) {
4378 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4379 ctxt->sax->endElement(ctxt->userData, name);
4385 * Parse the content of the element:
4387 currentNode = xmlStrdup(ctxt->name);
4388 depth = ctxt->nameNr;
4389 while (IS_CHAR_CH(CUR)) {
/* Stop when no input was consumed or the element was popped. */
4390 oldptr = ctxt->input->cur;
4391 htmlParseContent(ctxt);
4392 if (oldptr==ctxt->input->cur) break;
4393 if (ctxt->nameNr < depth) break;
4397 * Capture end position and add node
4399 if ( currentNode != NULL && ctxt->record_info ) {
4400 node_info.end_pos = ctxt->input->consumed +
4401 (CUR_PTR - ctxt->input->base);
4402 node_info.end_line = ctxt->input->line;
4403 node_info.node = ctxt->node;
4404 xmlParserAddNodeInfo(ctxt, &node_info);
4406 if (!IS_CHAR_CH(CUR)) {
4407 htmlAutoCloseOnEnd(ctxt);
4410 if (currentNode != NULL)
4411 xmlFree(currentNode);
/*
 * htmlParserFinishElementParsing: close the bookkeeping for the element
 * being finished.  When there is a current node and ctxt->record_info is
 * set, the end position, end line and node are stored in ctxt->nodeInfo,
 * the record is registered with xmlParserAddNodeInfo() and the node-info
 * stack is popped.  When the current character is not a valid character,
 * htmlAutoCloseOnEnd() is called.
 */
4415 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4417 * Capture end position and add node
4419 if ( ctxt->node != NULL && ctxt->record_info ) {
/* End offset = bytes already consumed + offset inside the current buffer. */
4420 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4421 (CUR_PTR - ctxt->input->base);
4422 ctxt->nodeInfo->end_line = ctxt->input->line;
4423 ctxt->nodeInfo->node = ctxt->node;
4424 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4425 htmlNodeInfoPop(ctxt);
4427 if (!IS_CHAR_CH(CUR)) {
4428 htmlAutoCloseOnEnd(ctxt);
4433 * htmlParseElementInternal:
4434 * @ctxt: an HTML parser context
4436 * parse an HTML element, new version, non recursive
4438 * [39] element ::= EmptyElemTag | STag content ETag
4440 * [41] Attribute ::= Name Eq AttValue
/*
 * htmlParseElementInternal: parse the start of one element without
 * recursing into its content.
 *
 * Steps visible in this excerpt:
 *   - when ctxt->record_info is set, the begin position and line are
 *     captured in a zero-initialised local htmlParserNodeInfo;
 *   - the start tag is parsed with htmlParseStartTag();
 *   - the element description is looked up with htmlTagLookup(); an
 *     unknown tag is reported as XML_HTML_UNKNOWN_TAG;
 *   - an "/>" ending and an element that is empty per its description
 *     (info->empty) both trigger the SAX endElement callback;
 *   - a missing '>' is reported as XML_ERR_GT_REQUIRED; on that path the
 *     node info is pushed and htmlParserFinishElementParsing() is called;
 *   - otherwise, when recording is on, the node info is pushed with
 *     htmlNodeInfoPush().
 * NOTE(review): returns and several conditions are not visible in this
 * excerpt - confirm details against the full file.
 */
4444 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4445 const xmlChar *name;
4446 const htmlElemDesc * info;
4447 htmlParserNodeInfo node_info = { 0, };
4450 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4451 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4452 "htmlParseElementInternal: context error\n", NULL, NULL);
4456 if (ctxt->instate == XML_PARSER_EOF)
4459 /* Capture start position */
4460 if (ctxt->record_info) {
4461 node_info.begin_pos = ctxt->input->consumed +
4462 (CUR_PTR - ctxt->input->base);
4463 node_info.begin_line = ctxt->input->line;
4466 failed = htmlParseStartTag(ctxt);
4468 if ((failed == -1) || (name == NULL)) {
4475 * Lookup the info for that element.
4477 info = htmlTagLookup(name);
4479 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4480 "Tag %s invalid\n", name, NULL);
4484 * Check for an Empty Element labeled the XML/SGML way
4486 if ((CUR == '/') && (NXT(1) == '>')) {
4488 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4489 ctxt->sax->endElement(ctxt->userData, name);
4497 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4498 "Couldn't find end of Start Tag %s\n", name, NULL);
4501 * end of parsing of this node.
4503 if (xmlStrEqual(name, ctxt->name)) {
4508 if (ctxt->record_info)
4509 htmlNodeInfoPush(ctxt, &node_info);
4510 htmlParserFinishElementParsing(ctxt);
4515 * Check for an Empty Element from DTD definition
4517 if ((info != NULL) && (info->empty)) {
4518 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4519 ctxt->sax->endElement(ctxt->userData, name);
4524 if (ctxt->record_info)
4525 htmlNodeInfoPush(ctxt, &node_info);
4529 * htmlParseContentInternal:
4530 * @ctxt: an HTML parser context
4532 * Parse a content: comment, sub-element, reference or text.
4533 * New version for non recursive htmlParseElementInternal
/*
 * htmlParseContentInternal: parse content for the non-recursive element
 * parser.  It handles the same cases as htmlParseContent() (end tags,
 * start tags, SCRIPT/STYLE, misplaced DOCTYPE, comments, PIs,
 * sub-elements, references, text) but sub-elements go through
 * htmlParseElementInternal().
 *
 * currentNode (a copy of ctxt->name) and depth (ctxt->nameNr) track the
 * element being filled; after an end tag, a popped node or a new
 * sub-element the old copy is freed and both are refreshed from ctxt.
 * htmlParserFinishElementParsing() is called where an element is found to
 * have ended.  Each iteration records ctxt->nbChars first so that an
 * iteration which consumed nothing can be reported as
 * XML_ERR_INTERNAL_ERROR.
 * NOTE(review): the loop header, breaks and some calls are not visible in
 * this excerpt - confirm details against the full file.
 */
4537 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4538 xmlChar *currentNode;
4540 const xmlChar *name;
4542 currentNode = xmlStrdup(ctxt->name);
4543 depth = ctxt->nameNr;
4545 long cons = ctxt->nbChars;
4549 if (ctxt->instate == XML_PARSER_EOF)
4553 * Our tag or one of it's parent or children is ending.
4555 if ((CUR == '<') && (NXT(1) == '/')) {
4556 if (htmlParseEndTag(ctxt) &&
4557 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4558 if (currentNode != NULL)
4559 xmlFree(currentNode);
/* Re-sync with the element that is now on top of the name stack. */
4561 currentNode = xmlStrdup(ctxt->name);
4562 depth = ctxt->nameNr;
4564 continue; /* while */
/* '<' followed by a letter, '_' or ':' : looks like a start tag. */
4567 else if ((CUR == '<') &&
4568 ((IS_ASCII_LETTER(NXT(1))) ||
4569 (NXT(1) == '_') || (NXT(1) == ':'))) {
4570 name = htmlParseHTMLName_nonInvasive(ctxt);
4572 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4573 "htmlParseStartTag: invalid element name\n",
4575 /* Dump the bogus tag like browsers do */
4576 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4579 htmlParserFinishElementParsing(ctxt);
4580 if (currentNode != NULL)
4581 xmlFree(currentNode);
4583 currentNode = xmlStrdup(ctxt->name);
4584 depth = ctxt->nameNr;
4588 if (ctxt->name != NULL) {
4589 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4590 htmlAutoClose(ctxt, name);
4597 * Has this node been popped out during parsing of
4600 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4601 (!xmlStrEqual(currentNode, ctxt->name)))
4603 htmlParserFinishElementParsing(ctxt);
4604 if (currentNode != NULL) xmlFree(currentNode);
4606 currentNode = xmlStrdup(ctxt->name);
4607 depth = ctxt->nameNr;
4611 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4612 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4614 * Handle SCRIPT/STYLE separately
4616 htmlParseScript(ctxt);
4619 * Sometimes DOCTYPE arrives in the middle of the document
4621 if ((CUR == '<') && (NXT(1) == '!') &&
4622 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4623 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4624 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4626 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4627 "Misplaced DOCTYPE declaration\n",
4628 BAD_CAST "DOCTYPE" , NULL);
4629 htmlParseDocTypeDecl(ctxt);
4633 * First case : a comment
4635 if ((CUR == '<') && (NXT(1) == '!') &&
4636 (NXT(2) == '-') && (NXT(3) == '-')) {
4637 htmlParseComment(ctxt);
4641 * Second case : a Processing Instruction.
4643 else if ((CUR == '<') && (NXT(1) == '?')) {
4648 * Third case : a sub-element.
4650 else if (CUR == '<') {
4651 htmlParseElementInternal(ctxt);
4652 if (currentNode != NULL) xmlFree(currentNode);
4654 currentNode = xmlStrdup(ctxt->name);
4655 depth = ctxt->nameNr;
4659 * Fourth case : a reference. If if has not been resolved,
4660 * parsing returns it's Name, create the node
4662 else if (CUR == '&') {
4663 htmlParseReference(ctxt);
4667 * Fifth case : end of the resource
4669 else if (CUR == 0) {
4670 htmlAutoCloseOnEnd(ctxt);
4675 * Last case, text. Note that References are handled directly.
4678 htmlParseCharData(ctxt);
/* nbChars unchanged: nothing was consumed during this iteration. */
4681 if (cons == ctxt->nbChars) {
4682 if (ctxt->node != NULL) {
4683 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4684 "detected an error in element content\n",
4692 if (currentNode != NULL) xmlFree(currentNode);
4697 * @ctxt: an HTML parser context
4699 * Parse a content: comment, sub-element, reference or text.
4700 * This is the entry point when called from parser.c
/*
 * __htmlParseContent: untyped entry point that casts @ctxt to
 * htmlParserCtxtPtr and forwards it to htmlParseContentInternal().
 * NOTE(review): any guard between the signature and the call is not
 * visible in this excerpt.
 */
4704 __htmlParseContent(void *ctxt) {
4706 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4710 * htmlParseDocument:
4711 * @ctxt: an HTML parser context
4713 * parse an HTML document (and build a tree if using the standard SAX
4716 * Returns 0, -1 in case of error (XML_ERR_INTERNAL_ERROR on a context error). The parser context is augmented
4717 * as a result of the parsing.
/*
 * htmlParseDocument: parse a whole HTML document from @ctxt.
 *
 * Steps visible in this excerpt:
 *   - the default SAX handler is initialised and the context validated
 *     (XML_ERR_INTERNAL_ERROR is returned for a NULL context or input);
 *   - line numbering is switched on and the SAX document locator is set;
 *   - when no encoding is recorded and at least 4 bytes are available,
 *     the encoding is guessed with xmlDetectCharEncoding() and applied
 *     with xmlSwitchEncoding();
 *   - an empty document is reported as XML_ERR_DOCUMENT_EMPTY;
 *   - SAX startDocument is called, then leading comments / PIs, an
 *     optional DOCTYPE and further comments / PIs are parsed;
 *   - the body is parsed with htmlParseContentInternal(), followed by
 *     htmlAutoCloseOnEnd() and SAX endDocument;
 *   - unless HTML_PARSE_NODEFDTD is set, the document's internal subset
 *     is looked up and an HTML 4.0 Transitional subset is created;
 *   - -1 is returned when the document is not well formed.
 * NOTE(review): several conditions and the final return are not visible
 * in this excerpt - confirm details against the full file.
 */
4721 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4723 xmlCharEncoding enc;
4728 htmlDefaultSAXHandlerInit();
4730 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4731 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4732 "htmlParseDocument: context error\n", NULL, NULL);
4733 return(XML_ERR_INTERNAL_ERROR);
4736 ctxt->linenumbers = 1;
4739 * SAX: beginning of the document processing.
4741 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4742 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
/* NOTE(review): compares the encoding pointer with an enum constant cast
 * to a pointer - presumably a NULL check; confirm the enum value. */
4744 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4745 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4747 * Get the 4 first bytes and decode the charset
4748 * if enc != XML_CHAR_ENCODING_NONE
4749 * plug some encoding conversion routines.
4755 enc = xmlDetectCharEncoding(&start[0], 4);
4756 if (enc != XML_CHAR_ENCODING_NONE) {
4757 xmlSwitchEncoding(ctxt, enc);
4762 * Wipe out everything which is before the first '<'
4766 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4767 "Document is empty\n", NULL, NULL);
4770 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4771 ctxt->sax->startDocument(ctxt->userData);
4775 * Parse possible comments and PIs before any content
4777 while (((CUR == '<') && (NXT(1) == '!') &&
4778 (NXT(2) == '-') && (NXT(3) == '-')) ||
4779 ((CUR == '<') && (NXT(1) == '?'))) {
4780 htmlParseComment(ctxt);
4787 * Then possibly doc type declaration(s) and more Misc
4788 * (doctypedecl Misc*)?
4790 if ((CUR == '<') && (NXT(1) == '!') &&
4791 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4792 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4793 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4795 htmlParseDocTypeDecl(ctxt);
4800 * Parse possible comments and PIs before any content
4802 while (((CUR == '<') && (NXT(1) == '!') &&
4803 (NXT(2) == '-') && (NXT(3) == '-')) ||
4804 ((CUR == '<') && (NXT(1) == '?'))) {
4805 htmlParseComment(ctxt);
4811 * Time to start parsing the tree itself
4813 htmlParseContentInternal(ctxt);
4819 htmlAutoCloseOnEnd(ctxt);
4823 * SAX: end of the document processing.
4825 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4826 ctxt->sax->endDocument(ctxt->userData);
4828 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4829 dtd = xmlGetIntSubset(ctxt->myDoc);
4831 ctxt->myDoc->intSubset =
4832 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4833 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4834 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4836 if (! ctxt->wellFormed) return(-1);
4841 /************************************************************************
4843 * Parser contexts handling *
4845 ************************************************************************/
4848 * htmlInitParserCtxt:
4849 * @ctxt: an HTML parser context
4851 * Initialize a parser context
4853 * Returns 0 in case of success and -1 in case of error
/*
 * htmlInitParserCtxt: initialise an already-allocated parser context.
 * Returns -1 when @ctxt is NULL.
 *
 * The context is zeroed, then given:
 *   - a dictionary from xmlDictCreate();
 *   - a freshly allocated SAX handler block, zeroed and later filled by
 *     copying sizeof(xmlSAXHandlerV1) bytes from htmlDefaultSAXHandler;
 *   - an input stack (5 slots), a node stack (10 slots) and a name stack
 *     (10 slots);
 *   - default field values (instate XML_PARSER_START, standalone -1,
 *     wellFormed 1, replaceEntities 0, record_info 0, ...), with
 *     ctxt->userData and the validation context's userData pointing back
 *     at the context itself.
 * Each allocation failure is reported through htmlErrMemory().
 * NOTE(review): the cleanup and return statements after each failure are
 * not visible in this excerpt - confirm against the full file.
 */
4857 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4859 htmlSAXHandler *sax;
4861 if (ctxt == NULL) return(-1);
4862 memset(ctxt, 0, sizeof(htmlParserCtxt));
4864 ctxt->dict = xmlDictCreate();
4865 if (ctxt->dict == NULL) {
4866 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4869 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4871 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4875 memset(sax, 0, sizeof(htmlSAXHandler));
4877 /* Allocate the Input stack */
4878 ctxt->inputTab = (htmlParserInputPtr *)
4879 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4880 if (ctxt->inputTab == NULL) {
4881 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4890 ctxt->version = NULL;
4891 ctxt->encoding = NULL;
4892 ctxt->standalone = -1;
4893 ctxt->instate = XML_PARSER_START;
4895 /* Allocate the Node stack */
4896 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4897 if (ctxt->nodeTab == NULL) {
4898 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4911 /* Allocate the Name stack */
4912 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4913 if (ctxt->nameTab == NULL) {
4914 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4930 ctxt->nodeInfoTab = NULL;
4931 ctxt->nodeInfoNr = 0;
4932 ctxt->nodeInfoMax = 0;
4934 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
/* Only the V1-sized prefix of the default handler is copied. */
4937 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4939 ctxt->userData = ctxt;
4941 ctxt->wellFormed = 1;
4942 ctxt->replaceEntities = 0;
4943 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4945 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4946 ctxt->vctxt.userData = ctxt;
4947 ctxt->vctxt.error = xmlParserValidityError;
4948 ctxt->vctxt.warning = xmlParserValidityWarning;
4949 ctxt->record_info = 0;
4952 ctxt->checkIndex = 0;
4953 ctxt->catalogs = NULL;
4954 xmlInitNodeInfoSeq(&ctxt->node_seq);
4959 * htmlFreeParserCtxt:
4960 * @ctxt: an HTML parser context
4962 * Free all the memory used by a parser context. However the parsed
4963 * document in ctxt->myDoc is not freed.
4967 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4969 xmlFreeParserCtxt(ctxt);
4973 * htmlNewParserCtxt:
4975 * Allocate and initialize a new parser context.
4977 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4981 htmlNewParserCtxt(void)
4983 xmlParserCtxtPtr ctxt;
4985 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4987 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4990 memset(ctxt, 0, sizeof(xmlParserCtxt));
4991 if (htmlInitParserCtxt(ctxt) < 0) {
4992 htmlFreeParserCtxt(ctxt);
4999 * htmlCreateMemoryParserCtxt:
5000 * @buffer: a pointer to a char array
5001 * @size: the size of the array
5003 * Create a parser context for an HTML in-memory document.
5005 * Returns the new parser context or NULL
5008 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5009 xmlParserCtxtPtr ctxt;
5010 xmlParserInputPtr input;
5011 xmlParserInputBufferPtr buf;
5018 ctxt = htmlNewParserCtxt();
5022 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5023 if (buf == NULL) return(NULL);
5025 input = xmlNewInputStream(ctxt);
5026 if (input == NULL) {
5027 xmlFreeParserCtxt(ctxt);
5031 input->filename = NULL;
5033 xmlBufResetInput(buf->buffer, input);
5035 inputPush(ctxt, input);
5040 * htmlCreateDocParserCtxt:
5041 * @cur: a pointer to an array of xmlChar
5042 * @encoding: a free form C string describing the HTML document encoding, or NULL
5044 * Create a parser context for an HTML document.
5046 * TODO: check the need to add encoding handling there
5048 * Returns the new parser context or NULL
5050 static htmlParserCtxtPtr
5051 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5053 htmlParserCtxtPtr ctxt;
5057 len = xmlStrlen(cur);
5058 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5062 if (encoding != NULL) {
5063 xmlCharEncoding enc;
5064 xmlCharEncodingHandlerPtr handler;
5066 if (ctxt->input->encoding != NULL)
5067 xmlFree((xmlChar *) ctxt->input->encoding);
5068 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5070 enc = xmlParseCharEncoding(encoding);
5072 * registered set of known encodings
5074 if (enc != XML_CHAR_ENCODING_ERROR) {
5075 xmlSwitchEncoding(ctxt, enc);
5076 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5077 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5078 "Unsupported encoding %s\n",
5079 (const xmlChar *) encoding, NULL);
5083 * fallback for unknown encodings
5085 handler = xmlFindCharEncodingHandler((const char *) encoding);
5086 if (handler != NULL) {
5087 xmlSwitchToEncoding(ctxt, handler);
5089 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5090 "Unsupported encoding %s\n",
5091 (const xmlChar *) encoding, NULL);
5098 #ifdef LIBXML_PUSH_ENABLED
5099 /************************************************************************
5101 * Progressive parsing interfaces *
5103 ************************************************************************/
5106 * htmlParseLookupSequence:
5107 * @ctxt: an HTML parser context
5108 * @first: the first char to lookup
5109 * @next: the next char to lookup or zero
5110 * @third: the third char to lookup or zero
5111 * @iscomment: flag to force checking inside comments
5113 * Try to find if a sequence (first, next, third) or just (first next) or
5114 * (first) is available in the input stream.
5115 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5116 * to avoid rescanning sequences of bytes; it DOES change the state of the
5117 * parser, so do not use it liberally.
5118 * This is basically similar to xmlParseLookupSequence()
5120 * Returns the index to the current parsing point if the full sequence
5121 * is available, -1 otherwise.
5124 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5125 xmlChar next, xmlChar third, int iscomment,
5129 htmlParserInputPtr in;
5133 char valdellim = 0x0;
5139 base = in->cur - in->base;
5143 if (ctxt->checkIndex > base)
5144 base = ctxt->checkIndex;
5146 if (in->buf == NULL) {
5150 buf = xmlBufContent(in->buf->buffer);
5151 len = xmlBufUse(in->buf->buffer);
5154 /* take into account the sequence length */
5159 for (; base < len; base++) {
5160 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5161 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5162 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5164 /* do not increment past <! - some people use <!--> */
5168 if (ignoreattrval) {
5169 if (buf[base] == '"' || buf[base] == '\'') {
5171 if (buf[base] == valdellim) {
5176 valdellim = buf[base];
5180 } else if (invalue) {
5187 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5188 (buf[base + 2] == '>')) {
5194 if (buf[base] == first) {
5196 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5198 } else if (next != 0) {
5199 if (buf[base + 1] != next)
5202 ctxt->checkIndex = 0;
5205 xmlGenericError(xmlGenericErrorContext,
5206 "HPP: lookup '%c' found at %d\n",
5208 else if (third == 0)
5209 xmlGenericError(xmlGenericErrorContext,
5210 "HPP: lookup '%c%c' found at %d\n",
5213 xmlGenericError(xmlGenericErrorContext,
5214 "HPP: lookup '%c%c%c' found at %d\n",
5215 first, next, third, base);
5217 return (base - (in->cur - in->base));
5220 if ((!incomment) && (!invalue))
5221 ctxt->checkIndex = base;
5224 xmlGenericError(xmlGenericErrorContext,
5225 "HPP: lookup '%c' failed\n", first);
5226 else if (third == 0)
5227 xmlGenericError(xmlGenericErrorContext,
5228 "HPP: lookup '%c%c' failed\n", first, next);
5230 xmlGenericError(xmlGenericErrorContext,
5231 "HPP: lookup '%c%c%c' failed\n", first, next,
5238 * htmlParseLookupChars:
5239 * @ctxt: an HTML parser context
5240 * @stop: Array of chars, which stop the lookup.
5241 * @stopLen: Length of stop-Array
5243 * Try to find if any char of the stop-Array is available in the input
5245 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5246 * to avoid rescanning sequences of bytes; it DOES change the state of the
5247 * parser, so do not use it liberally.
5249 * Returns the index to the current parsing point if a stopChar
5250 * is available, -1 otherwise.
5253 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5257 htmlParserInputPtr in;
5266 base = in->cur - in->base;
5270 if (ctxt->checkIndex > base)
5271 base = ctxt->checkIndex;
5273 if (in->buf == NULL) {
5277 buf = xmlBufContent(in->buf->buffer);
5278 len = xmlBufUse(in->buf->buffer);
5281 for (; base < len; base++) {
5282 if (!incomment && (base + 4 < len)) {
5283 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5284 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5286 /* do not increment past <! - some people use <!--> */
5293 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5294 (buf[base + 2] == '>')) {
5300 for (i = 0; i < stopLen; ++i) {
5301 if (buf[base] == stop[i]) {
5302 ctxt->checkIndex = 0;
5303 return (base - (in->cur - in->base));
5307 ctxt->checkIndex = base;
5312 * htmlParseTryOrFinish:
5313 * @ctxt: an HTML parser context
5314 * @terminate: last chunk indicator
5316 * Try to progress on parsing
5318 * Returns zero if no parsing was possible
5321 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5323 htmlParserInputPtr in;
5327 htmlParserNodeInfo node_info;
5330 switch (ctxt->instate) {
5331 case XML_PARSER_EOF:
5332 xmlGenericError(xmlGenericErrorContext,
5333 "HPP: try EOF\n"); break;
5334 case XML_PARSER_START:
5335 xmlGenericError(xmlGenericErrorContext,
5336 "HPP: try START\n"); break;
5337 case XML_PARSER_MISC:
5338 xmlGenericError(xmlGenericErrorContext,
5339 "HPP: try MISC\n");break;
5340 case XML_PARSER_COMMENT:
5341 xmlGenericError(xmlGenericErrorContext,
5342 "HPP: try COMMENT\n");break;
5343 case XML_PARSER_PROLOG:
5344 xmlGenericError(xmlGenericErrorContext,
5345 "HPP: try PROLOG\n");break;
5346 case XML_PARSER_START_TAG:
5347 xmlGenericError(xmlGenericErrorContext,
5348 "HPP: try START_TAG\n");break;
5349 case XML_PARSER_CONTENT:
5350 xmlGenericError(xmlGenericErrorContext,
5351 "HPP: try CONTENT\n");break;
5352 case XML_PARSER_CDATA_SECTION:
5353 xmlGenericError(xmlGenericErrorContext,
5354 "HPP: try CDATA_SECTION\n");break;
5355 case XML_PARSER_END_TAG:
5356 xmlGenericError(xmlGenericErrorContext,
5357 "HPP: try END_TAG\n");break;
5358 case XML_PARSER_ENTITY_DECL:
5359 xmlGenericError(xmlGenericErrorContext,
5360 "HPP: try ENTITY_DECL\n");break;
5361 case XML_PARSER_ENTITY_VALUE:
5362 xmlGenericError(xmlGenericErrorContext,
5363 "HPP: try ENTITY_VALUE\n");break;
5364 case XML_PARSER_ATTRIBUTE_VALUE:
5365 xmlGenericError(xmlGenericErrorContext,
5366 "HPP: try ATTRIBUTE_VALUE\n");break;
5367 case XML_PARSER_DTD:
5368 xmlGenericError(xmlGenericErrorContext,
5369 "HPP: try DTD\n");break;
5370 case XML_PARSER_EPILOG:
5371 xmlGenericError(xmlGenericErrorContext,
5372 "HPP: try EPILOG\n");break;
5374 xmlGenericError(xmlGenericErrorContext,
5375 "HPP: try PI\n");break;
5376 case XML_PARSER_SYSTEM_LITERAL:
5377 xmlGenericError(xmlGenericErrorContext,
5378 "HPP: try SYSTEM_LITERAL\n");break;
5385 if (in == NULL) break;
5386 if (in->buf == NULL)
5387 avail = in->length - (in->cur - in->base);
5389 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5390 if ((avail == 0) && (terminate)) {
5391 htmlAutoCloseOnEnd(ctxt);
5392 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5394 * SAX: end of the document processing.
5396 ctxt->instate = XML_PARSER_EOF;
5397 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5398 ctxt->sax->endDocument(ctxt->userData);
5409 switch (ctxt->instate) {
5410 case XML_PARSER_EOF:
5412 * Document parsing is done !
5415 case XML_PARSER_START:
5417 * Very first chars read from the document flow.
5420 if (IS_BLANK_CH(cur)) {
5422 if (in->buf == NULL)
5423 avail = in->length - (in->cur - in->base);
5425 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5427 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5428 ctxt->sax->setDocumentLocator(ctxt->userData,
5429 &xmlDefaultSAXLocator);
5430 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5431 (!ctxt->disableSAX))
5432 ctxt->sax->startDocument(ctxt->userData);
5436 if ((cur == '<') && (next == '!') &&
5437 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5438 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5439 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5442 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5445 xmlGenericError(xmlGenericErrorContext,
5446 "HPP: Parsing internal subset\n");
5448 htmlParseDocTypeDecl(ctxt);
5449 ctxt->instate = XML_PARSER_PROLOG;
5451 xmlGenericError(xmlGenericErrorContext,
5452 "HPP: entering PROLOG\n");
5455 ctxt->instate = XML_PARSER_MISC;
5457 xmlGenericError(xmlGenericErrorContext,
5458 "HPP: entering MISC\n");
5462 case XML_PARSER_MISC:
5464 if (in->buf == NULL)
5465 avail = in->length - (in->cur - in->base);
5467 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5469 * no chars in buffer
5474 * not enough chars in buffer
5485 if ((cur == '<') && (next == '!') &&
5486 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5488 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5491 xmlGenericError(xmlGenericErrorContext,
5492 "HPP: Parsing Comment\n");
5494 htmlParseComment(ctxt);
5495 ctxt->instate = XML_PARSER_MISC;
5496 } else if ((cur == '<') && (next == '?')) {
5498 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5501 xmlGenericError(xmlGenericErrorContext,
5502 "HPP: Parsing PI\n");
5505 ctxt->instate = XML_PARSER_MISC;
5506 } else if ((cur == '<') && (next == '!') &&
5507 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5508 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5509 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5512 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5515 xmlGenericError(xmlGenericErrorContext,
5516 "HPP: Parsing internal subset\n");
5518 htmlParseDocTypeDecl(ctxt);
5519 ctxt->instate = XML_PARSER_PROLOG;
5521 xmlGenericError(xmlGenericErrorContext,
5522 "HPP: entering PROLOG\n");
5524 } else if ((cur == '<') && (next == '!') &&
5528 ctxt->instate = XML_PARSER_START_TAG;
5530 xmlGenericError(xmlGenericErrorContext,
5531 "HPP: entering START_TAG\n");
5535 case XML_PARSER_PROLOG:
5537 if (in->buf == NULL)
5538 avail = in->length - (in->cur - in->base);
5540 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5545 if ((cur == '<') && (next == '!') &&
5546 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5548 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5551 xmlGenericError(xmlGenericErrorContext,
5552 "HPP: Parsing Comment\n");
5554 htmlParseComment(ctxt);
5555 ctxt->instate = XML_PARSER_PROLOG;
5556 } else if ((cur == '<') && (next == '?')) {
5558 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5561 xmlGenericError(xmlGenericErrorContext,
5562 "HPP: Parsing PI\n");
5565 ctxt->instate = XML_PARSER_PROLOG;
5566 } else if ((cur == '<') && (next == '!') &&
5570 ctxt->instate = XML_PARSER_START_TAG;
5572 xmlGenericError(xmlGenericErrorContext,
5573 "HPP: entering START_TAG\n");
5577 case XML_PARSER_EPILOG:
5578 if (in->buf == NULL)
5579 avail = in->length - (in->cur - in->base);
5581 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5585 if (IS_BLANK_CH(cur)) {
5586 htmlParseCharData(ctxt);
5592 if ((cur == '<') && (next == '!') &&
5593 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5595 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5598 xmlGenericError(xmlGenericErrorContext,
5599 "HPP: Parsing Comment\n");
5601 htmlParseComment(ctxt);
5602 ctxt->instate = XML_PARSER_EPILOG;
5603 } else if ((cur == '<') && (next == '?')) {
5605 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5608 xmlGenericError(xmlGenericErrorContext,
5609 "HPP: Parsing PI\n");
5612 ctxt->instate = XML_PARSER_EPILOG;
5613 } else if ((cur == '<') && (next == '!') &&
5617 ctxt->errNo = XML_ERR_DOCUMENT_END;
5618 ctxt->wellFormed = 0;
5619 ctxt->instate = XML_PARSER_EOF;
5621 xmlGenericError(xmlGenericErrorContext,
5622 "HPP: entering EOF\n");
5624 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5625 ctxt->sax->endDocument(ctxt->userData);
5629 case XML_PARSER_START_TAG: {
5630 const xmlChar *name;
5632 const htmlElemDesc * info;
5635 * no chars in buffer
5640 * not enough chars in buffer
5652 ctxt->instate = XML_PARSER_CONTENT;
5654 xmlGenericError(xmlGenericErrorContext,
5655 "HPP: entering CONTENT\n");
5660 ctxt->instate = XML_PARSER_END_TAG;
5661 ctxt->checkIndex = 0;
5663 xmlGenericError(xmlGenericErrorContext,
5664 "HPP: entering END_TAG\n");
5669 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5672 /* Capture start position */
5673 if (ctxt->record_info) {
5674 node_info.begin_pos = ctxt->input->consumed +
5675 (CUR_PTR - ctxt->input->base);
5676 node_info.begin_line = ctxt->input->line;
5680 failed = htmlParseStartTag(ctxt);
5682 if ((failed == -1) ||
5690 * Lookup the info for that element.
5692 info = htmlTagLookup(name);
5694 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5695 "Tag %s invalid\n", name, NULL);
5699 * Check for an Empty Element labeled the XML/SGML way
5701 if ((CUR == '/') && (NXT(1) == '>')) {
5703 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5704 ctxt->sax->endElement(ctxt->userData, name);
5706 ctxt->instate = XML_PARSER_CONTENT;
5708 xmlGenericError(xmlGenericErrorContext,
5709 "HPP: entering CONTENT\n");
5717 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5718 "Couldn't find end of Start Tag %s\n",
5722 * end of parsing of this node.
5724 if (xmlStrEqual(name, ctxt->name)) {
5729 if (ctxt->record_info)
5730 htmlNodeInfoPush(ctxt, &node_info);
5732 ctxt->instate = XML_PARSER_CONTENT;
5734 xmlGenericError(xmlGenericErrorContext,
5735 "HPP: entering CONTENT\n");
5741 * Check for an Empty Element from DTD definition
5743 if ((info != NULL) && (info->empty)) {
5744 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5745 ctxt->sax->endElement(ctxt->userData, name);
5749 if (ctxt->record_info)
5750 htmlNodeInfoPush(ctxt, &node_info);
5752 ctxt->instate = XML_PARSER_CONTENT;
5754 xmlGenericError(xmlGenericErrorContext,
5755 "HPP: entering CONTENT\n");
5759 case XML_PARSER_CONTENT: {
5762 * Handle preparsed entities and charRef
5764 if (ctxt->token != 0) {
5765 xmlChar chr[2] = { 0 , 0 } ;
5767 chr[0] = (xmlChar) ctxt->token;
5768 htmlCheckParagraph(ctxt);
5769 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5770 ctxt->sax->characters(ctxt->userData, chr, 1);
5772 ctxt->checkIndex = 0;
5774 if ((avail == 1) && (terminate)) {
5776 if ((cur != '<') && (cur != '&')) {
5777 if (ctxt->sax != NULL) {
5778 if (IS_BLANK_CH(cur)) {
5779 if (ctxt->keepBlanks) {
5780 if (ctxt->sax->characters != NULL)
5781 ctxt->sax->characters(
5782 ctxt->userData, &in->cur[0], 1);
5784 if (ctxt->sax->ignorableWhitespace != NULL)
5785 ctxt->sax->ignorableWhitespace(
5786 ctxt->userData, &in->cur[0], 1);
5789 htmlCheckParagraph(ctxt);
5790 if (ctxt->sax->characters != NULL)
5791 ctxt->sax->characters(
5792 ctxt->userData, &in->cur[0], 1);
5796 ctxt->checkIndex = 0;
5805 cons = ctxt->nbChars;
5806 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5807 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5809 * Handle SCRIPT/STYLE separately
5815 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5818 val = in->cur[idx + 2];
5819 if (val == 0) /* bad cut of input */
5822 htmlParseScript(ctxt);
5823 if ((cur == '<') && (next == '/')) {
5824 ctxt->instate = XML_PARSER_END_TAG;
5825 ctxt->checkIndex = 0;
5827 xmlGenericError(xmlGenericErrorContext,
5828 "HPP: entering END_TAG\n");
5834 * Sometimes DOCTYPE arrives in the middle of the document
5836 if ((cur == '<') && (next == '!') &&
5837 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5838 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5839 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5842 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5844 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5845 "Misplaced DOCTYPE declaration\n",
5846 BAD_CAST "DOCTYPE" , NULL);
5847 htmlParseDocTypeDecl(ctxt);
5848 } else if ((cur == '<') && (next == '!') &&
5849 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5851 (htmlParseLookupSequence(
5852 ctxt, '-', '-', '>', 1, 1) < 0))
5855 xmlGenericError(xmlGenericErrorContext,
5856 "HPP: Parsing Comment\n");
5858 htmlParseComment(ctxt);
5859 ctxt->instate = XML_PARSER_CONTENT;
5860 } else if ((cur == '<') && (next == '?')) {
5862 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5865 xmlGenericError(xmlGenericErrorContext,
5866 "HPP: Parsing PI\n");
5869 ctxt->instate = XML_PARSER_CONTENT;
5870 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5872 } else if ((cur == '<') && (next == '/')) {
5873 ctxt->instate = XML_PARSER_END_TAG;
5874 ctxt->checkIndex = 0;
5876 xmlGenericError(xmlGenericErrorContext,
5877 "HPP: entering END_TAG\n");
5880 } else if (cur == '<') {
5881 ctxt->instate = XML_PARSER_START_TAG;
5882 ctxt->checkIndex = 0;
5884 xmlGenericError(xmlGenericErrorContext,
5885 "HPP: entering START_TAG\n");
5888 } else if (cur == '&') {
5890 (htmlParseLookupChars(ctxt,
5891 BAD_CAST "; >/", 4) < 0))
5894 xmlGenericError(xmlGenericErrorContext,
5895 "HPP: Parsing Reference\n");
5897 /* TODO: check generation of subtrees if noent !!! */
5898 htmlParseReference(ctxt);
5901 * check that the text sequence is complete
5902 * before handing out the data to the parser
5903 * to avoid problems with erroneous end of
5907 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5909 ctxt->checkIndex = 0;
5911 xmlGenericError(xmlGenericErrorContext,
5912 "HPP: Parsing char data\n");
5914 htmlParseCharData(ctxt);
5917 if (cons == ctxt->nbChars) {
5918 if (ctxt->node != NULL) {
5919 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5920 "detected an error in element content\n",
5929 case XML_PARSER_END_TAG:
5933 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5935 htmlParseEndTag(ctxt);
5936 if (ctxt->nameNr == 0) {
5937 ctxt->instate = XML_PARSER_EPILOG;
5939 ctxt->instate = XML_PARSER_CONTENT;
5941 ctxt->checkIndex = 0;
5943 xmlGenericError(xmlGenericErrorContext,
5944 "HPP: entering CONTENT\n");
5947 case XML_PARSER_CDATA_SECTION:
5948 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5949 "HPP: internal error, state == CDATA\n",
5951 ctxt->instate = XML_PARSER_CONTENT;
5952 ctxt->checkIndex = 0;
5954 xmlGenericError(xmlGenericErrorContext,
5955 "HPP: entering CONTENT\n");
5958 case XML_PARSER_DTD:
5959 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5960 "HPP: internal error, state == DTD\n",
5962 ctxt->instate = XML_PARSER_CONTENT;
5963 ctxt->checkIndex = 0;
5965 xmlGenericError(xmlGenericErrorContext,
5966 "HPP: entering CONTENT\n");
5969 case XML_PARSER_COMMENT:
5970 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5971 "HPP: internal error, state == COMMENT\n",
5973 ctxt->instate = XML_PARSER_CONTENT;
5974 ctxt->checkIndex = 0;
5976 xmlGenericError(xmlGenericErrorContext,
5977 "HPP: entering CONTENT\n");
5981 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5982 "HPP: internal error, state == PI\n",
5984 ctxt->instate = XML_PARSER_CONTENT;
5985 ctxt->checkIndex = 0;
5987 xmlGenericError(xmlGenericErrorContext,
5988 "HPP: entering CONTENT\n");
5991 case XML_PARSER_ENTITY_DECL:
5992 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5993 "HPP: internal error, state == ENTITY_DECL\n",
5995 ctxt->instate = XML_PARSER_CONTENT;
5996 ctxt->checkIndex = 0;
5998 xmlGenericError(xmlGenericErrorContext,
5999 "HPP: entering CONTENT\n");
6002 case XML_PARSER_ENTITY_VALUE:
6003 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6004 "HPP: internal error, state == ENTITY_VALUE\n",
6006 ctxt->instate = XML_PARSER_CONTENT;
6007 ctxt->checkIndex = 0;
6009 xmlGenericError(xmlGenericErrorContext,
6010 "HPP: entering DTD\n");
6013 case XML_PARSER_ATTRIBUTE_VALUE:
6014 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6015 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6017 ctxt->instate = XML_PARSER_START_TAG;
6018 ctxt->checkIndex = 0;
6020 xmlGenericError(xmlGenericErrorContext,
6021 "HPP: entering START_TAG\n");
6024 case XML_PARSER_SYSTEM_LITERAL:
6025 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6026 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6028 ctxt->instate = XML_PARSER_CONTENT;
6029 ctxt->checkIndex = 0;
6031 xmlGenericError(xmlGenericErrorContext,
6032 "HPP: entering CONTENT\n");
6035 case XML_PARSER_IGNORE:
6036 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6037 "HPP: internal error, state == XML_PARSER_IGNORE\n",
6039 ctxt->instate = XML_PARSER_CONTENT;
6040 ctxt->checkIndex = 0;
6042 xmlGenericError(xmlGenericErrorContext,
6043 "HPP: entering CONTENT\n");
6046 case XML_PARSER_PUBLIC_LITERAL:
6047 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6048 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6050 ctxt->instate = XML_PARSER_CONTENT;
6051 ctxt->checkIndex = 0;
6053 xmlGenericError(xmlGenericErrorContext,
6054 "HPP: entering CONTENT\n");
6061 if ((avail == 0) && (terminate)) {
6062 htmlAutoCloseOnEnd(ctxt);
6063 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6065 * SAX: end of the document processing.
6067 ctxt->instate = XML_PARSER_EOF;
6068 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6069 ctxt->sax->endDocument(ctxt->userData);
6072 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6073 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6074 (ctxt->instate == XML_PARSER_EPILOG))) {
6076 dtd = xmlGetIntSubset(ctxt->myDoc);
6078 ctxt->myDoc->intSubset =
6079 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6080 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6081 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6084 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6091 * @ctxt: an HTML parser context
6092 * @chunk: a char array
6093 * @size: the size in bytes of the chunk
6094 * @terminate: last chunk indicator
6096 * Parse a Chunk of memory
6098 * Returns zero if no error, the xmlParserErrors otherwise.
6101 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6103 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6104 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6105 "htmlParseChunk: context error\n", NULL, NULL);
6106 return(XML_ERR_INTERNAL_ERROR);
6108 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6109 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6110 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6111 size_t cur = ctxt->input->cur - ctxt->input->base;
6114 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6116 ctxt->errNo = XML_PARSER_EOF;
6117 ctxt->disableSAX = 1;
6118 return (XML_PARSER_EOF);
6120 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6122 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6126 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6127 htmlParseTryOrFinish(ctxt, terminate);
6129 } else if (ctxt->instate != XML_PARSER_EOF) {
6130 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6131 xmlParserInputBufferPtr in = ctxt->input->buf;
6132 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6133 (in->raw != NULL)) {
6135 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6136 size_t current = ctxt->input->cur - ctxt->input->base;
6138 nbchars = xmlCharEncInput(in, terminate);
6140 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6141 "encoder error\n", NULL, NULL);
6142 return(XML_ERR_INVALID_ENCODING);
6144 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6148 htmlParseTryOrFinish(ctxt, terminate);
6150 if ((ctxt->instate != XML_PARSER_EOF) &&
6151 (ctxt->instate != XML_PARSER_EPILOG) &&
6152 (ctxt->instate != XML_PARSER_MISC)) {
6153 ctxt->errNo = XML_ERR_DOCUMENT_END;
6154 ctxt->wellFormed = 0;
6156 if (ctxt->instate != XML_PARSER_EOF) {
6157 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6158 ctxt->sax->endDocument(ctxt->userData);
6160 ctxt->instate = XML_PARSER_EOF;
6162 return((xmlParserErrors) ctxt->errNo);
6165 /************************************************************************
6167 * User entry points *
6169 ************************************************************************/
6172 * htmlCreatePushParserCtxt:
6173 * @sax: a SAX handler
6174 * @user_data: The user data returned on SAX callbacks
6175 * @chunk: a pointer to an array of chars
6176 * @size: number of chars in the array
6177 * @filename: an optional file name or URI
6178 * @enc: an optional encoding
6180 * Create a parser context for using the HTML parser in push mode
6181 * The value of @filename is used for fetching external entities
6182 * and error/warning reports.
6184 * Returns the new parser context or NULL
6187 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6188 const char *chunk, int size, const char *filename,
6189 xmlCharEncoding enc) {
6190 htmlParserCtxtPtr ctxt;
6191 htmlParserInputPtr inputStream;
6192 xmlParserInputBufferPtr buf;
6196 buf = xmlAllocParserInputBuffer(enc);
6197 if (buf == NULL) return(NULL);
6199 ctxt = htmlNewParserCtxt();
6201 xmlFreeParserInputBuffer(buf);
6204 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6205 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6207 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6209 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6210 if (ctxt->sax == NULL) {
6215 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6216 if (user_data != NULL)
6217 ctxt->userData = user_data;
6219 if (filename == NULL) {
6220 ctxt->directory = NULL;
6222 ctxt->directory = xmlParserGetDirectory(filename);
6225 inputStream = htmlNewInputStream(ctxt);
6226 if (inputStream == NULL) {
6227 xmlFreeParserCtxt(ctxt);
6232 if (filename == NULL)
6233 inputStream->filename = NULL;
6235 inputStream->filename = (char *)
6236 xmlCanonicPath((const xmlChar *) filename);
6237 inputStream->buf = buf;
6238 xmlBufResetInput(buf->buffer, inputStream);
6240 inputPush(ctxt, inputStream);
6242 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6243 (ctxt->input->buf != NULL)) {
6244 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6245 size_t cur = ctxt->input->cur - ctxt->input->base;
6247 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6249 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6251 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6254 ctxt->progressive = 1;
6258 #endif /* LIBXML_PUSH_ENABLED */
6262 * @cur: a pointer to an array of xmlChar
6263 * @encoding: a free form C string describing the HTML document encoding, or NULL
6264 * @sax: the SAX handler block
6265 * @userData: if using SAX, this pointer will be provided on callbacks.
6267 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6268 * to handle parse events. If sax is NULL, fall back to the default DOM
6269 * behavior and return a tree.
6271 * Returns the resulting document tree unless SAX is NULL or the document is
6276 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6278 htmlParserCtxtPtr ctxt;
6282 if (cur == NULL) return(NULL);
6285 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6286 if (ctxt == NULL) return(NULL);
6288 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6290 ctxt->userData = userData;
6293 htmlParseDocument(ctxt);
6297 ctxt->userData = NULL;
6299 htmlFreeParserCtxt(ctxt);
6306 * @cur: a pointer to an array of xmlChar
6307 * @encoding: a free form C string describing the HTML document encoding, or NULL
6309 * parse an HTML in-memory document and build a tree.
6311 * Returns the resulting document tree
6315 htmlParseDoc(xmlChar *cur, const char *encoding) {
6316 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6321 * htmlCreateFileParserCtxt:
6322 * @filename: the filename
6323 * @encoding: a free form C string describing the HTML document encoding, or NULL
6325 * Create a parser context for a file content.
6326 * Automatic support for ZLIB/Compress compressed document is provided
6327 * by default if found at compile-time.
6329 * Returns the new parser context or NULL
/* Builds a parser context for @filename: the context comes from
 * htmlNewParserCtxt(), the path is canonicalized with xmlCanonicPath(), the
 * content is loaded through xmlLoadExternalEntity() and the resulting input
 * stream is pushed on the context. The encoding name is then wrapped into a
 * "charset=<encoding>" string and handed to htmlCheckEncoding().
 * NOTE(review): the embedded numbering has gaps, so several guards, braces
 * and return statements of this function are not visible in this listing. */
6332 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6334 htmlParserCtxtPtr ctxt;
6335 htmlParserInputPtr inputStream;
6336 char *canonicFilename;
6337 /* htmlCharEncoding enc; */
/* content_line is the fixed prefix put in front of the encoding name. */
6338 xmlChar *content, *content_line = (xmlChar *) "charset=";
6340 if (filename == NULL)
6343 ctxt = htmlNewParserCtxt();
/* Canonical copy of the caller's path; released with xmlFree() once loaded. */
6347 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6348 if (canonicFilename == NULL) {
6349 #ifdef LIBXML_SAX1_ENABLED
/* SAX1 builds only: report the failure through the default error callback. */
6350 if (xmlDefaultSAXHandler.error != NULL) {
6351 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6354 xmlFreeParserCtxt(ctxt);
6358 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6359 xmlFree(canonicFilename);
/* Loading failed: the context is released before leaving. */
6360 if (inputStream == NULL) {
6361 xmlFreeParserCtxt(ctxt);
6365 inputPush(ctxt, inputStream);
6369 size_t l = strlen(encoding);
/* Room for the prefix, the encoding name and the terminating NUL. */
6372 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6374 strcpy ((char *)content, (char *)content_line);
6375 strcat ((char *)content, (char *)encoding);
6376 htmlCheckEncoding (ctxt, content);
6387 * @filename: the filename
6388 * @encoding: a free form C string describing the HTML document encoding, or NULL
6389 * @sax: the SAX handler block
6390 * @userData: if using SAX, this pointer will be provided on callbacks.
6392 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6393 * compressed document is provided by default if found at compile-time.
6394 * It uses the given SAX function block to handle the parsing callbacks.
6395 * If sax is NULL, fallback to the default DOM tree building routines.
6397 * Returns the resulting document tree unless SAX is NULL or the document is
/* Parses @filename with a context obtained from htmlCreateFileParserCtxt():
 * the caller's userData is installed for the duration of
 * htmlParseDocument(), cleared again afterwards, and the context is released
 * with htmlFreeParserCtxt(). Returns NULL when no context can be created.
 * NOTE(review): the lines that install/restore the SAX handler (oldsax) and
 * produce the return value are missing from this listing. */
6402 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6405 htmlParserCtxtPtr ctxt;
6406 htmlSAXHandlerPtr oldsax = NULL;
6410 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6411 if (ctxt == NULL) return(NULL);
6415 ctxt->userData = userData;
6418 htmlParseDocument(ctxt);
/* Drop the caller's pointer before the context is torn down. */
6423 ctxt->userData = NULL;
6425 htmlFreeParserCtxt(ctxt);
6432 * @filename: the filename
6433 * @encoding: a free form C string describing the HTML document encoding, or NULL
6435 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6436 * compressed document is provided by default if found at compile-time.
6438 * Returns the resulting document tree
/* Thin wrapper: delegates to htmlSAXParseFile() with a NULL SAX handler and
 * NULL user data, and returns that call's result unchanged. */
6442 htmlParseFile(const char *filename, const char *encoding) {
6443 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6447 * htmlHandleOmittedElem:
6450 * Set and return the previous value for handling HTML omitted tags.
6452 * Returns the previous value: 0 for no handling, 1 for auto insertion.
/* Replaces the file-level htmlOmittedDefaultValue setting with @val; the
 * value in force before the call is captured in `old` first. */
6456 htmlHandleOmittedElem(int val) {
6457 int old = htmlOmittedDefaultValue;
6459 htmlOmittedDefaultValue = val;
6464 * htmlElementAllowedHere:
6465 * @parent: HTML parent element
6466 * @elt: HTML element
6468 * Checks whether an HTML element may be a direct child of a parent element.
6469 * Note - doesn't check for deprecated elements
6471 * Returns 1 if allowed; 0 otherwise.
/* Searches the parent's subelts list for the element name @elt.
 * NOTE(review): the declaration of `p` and the return statements are missing
 * from this listing. */
6474 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
/* Nothing to search when an argument or the parent's child list is absent. */
6477 if ( ! elt || ! parent || ! parent->subelts )
/* Walk the list up to its NULL terminator; xmlStrcmp() == 0 is a match. */
6480 for ( p = parent->subelts; *p; ++p )
6481 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6487 * htmlElementStatusHere:
6488 * @parent: HTML parent element
6489 * @elt: HTML element
6491 * Checks whether an HTML element may be a direct child of a parent element
6492 * and, if so, whether it is valid or deprecated.
6494 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
/* Classifies @elt as a child of @parent: HTML_INVALID when either descriptor
 * is NULL or htmlElementAllowedHere() rejects the pairing; otherwise the
 * element's dtd field decides between HTML_VALID and HTML_DEPRECATED. */
6497 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6498 if ( ! parent || ! elt )
6499 return HTML_INVALID ;
6500 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6501 return HTML_INVALID ;
/* dtd == 0 yields HTML_VALID; any other value yields HTML_DEPRECATED. */
6503 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6507 * @elt: HTML element
6508 * @attr: HTML attribute
6509 * @legacy: whether to allow deprecated attributes
6511 * Checks whether an attribute is valid for an element
6512 * Has full knowledge of Required and Deprecated attributes
6514 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
/* Looks @attr up in the element's attribute lists, in this order: required,
 * optional, then (only when @legacy is non-zero) deprecated. Each list is
 * walked up to its NULL terminator and compared with xmlStrcmp().
 * NOTE(review): the declaration of `p` and the return for the optional-list
 * match are missing from this listing. */
6517 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6520 if ( !elt || ! attr )
6521 return HTML_INVALID ;
/* Required attributes take precedence. */
6523 if ( elt->attrs_req )
6524 for ( p = elt->attrs_req; *p; ++p)
6525 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6526 return HTML_REQUIRED ;
/* Optional attributes. */
6528 if ( elt->attrs_opt )
6529 for ( p = elt->attrs_opt; *p; ++p)
6530 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
/* Deprecated attributes are only consulted when the caller allows them. */
6533 if ( legacy && elt->attrs_depr )
6534 for ( p = elt->attrs_depr; *p; ++p)
6535 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6536 return HTML_DEPRECATED ;
/* No list matched. */
6538 return HTML_INVALID ;
6542 * @node: an htmlNodePtr in a tree
6543 * @legacy: whether to allow deprecated elements (YES is faster here
6544 * for Element nodes)
6546 * Checks whether the tree node is valid. Experimental (the author
6547 * only uses the HTML enhancements in a SAX parser)
6549 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6550 * legacy allowed) or htmlElementStatusHere (otherwise).
6551 * for Attribute nodes, a return from htmlAttrAllowed
6552 * for other nodes, HTML_NA (no checks performed)
6555 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6557 return HTML_INVALID ;
6559 switch ( node->type ) {
6560 case XML_ELEMENT_NODE:
6562 ? ( htmlElementAllowedHere (
6563 htmlTagLookup(node->parent->name) , node->name
6564 ) ? HTML_VALID : HTML_INVALID )
6565 : htmlElementStatusHere(
6566 htmlTagLookup(node->parent->name) ,
6567 htmlTagLookup(node->name) )
6569 case XML_ATTRIBUTE_NODE:
6570 return htmlAttrAllowed(
6571 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6572 default: return HTML_NA ;
6575 /************************************************************************
6577 * New set (2.6.0) of simpler and more flexible APIs *
6579 ************************************************************************/
6584 * Free a string if it is not owned by the "dict" dictionary in the
/* DICT_FREE(str): releases @str with xmlFree() when it is non-NULL and not
 * owned by the dictionary named `dict` at the expansion site (either there is
 * no dictionary or xmlDictOwns() reports 0). The macro expands to a bare `if`
 * statement that already ends in a semicolon, and it evaluates @str more
 * than once. */
6587 #define DICT_FREE(str) \
6588 if ((str) && ((!dict) || \
6589 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \
6590 xmlFree((char *)(str));
6594 * @ctxt: an HTML parser context
6596 * Reset a parser context
/* Returns @ctxt to a pristine state so that it can parse another document:
 * pending inputs are freed, per-document strings and the attached document
 * are released, and the state/flag fields are put back to initial values.
 * NOTE(review): the embedded numbering has gaps; among other things the
 * declaration of the `dict` variable used by DICT_FREE is not visible here. */
6599 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6601 xmlParserInputPtr input;
/* Pop and free every input stream still on the context. */
6610 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6611 xmlFreeInputStream(input);
/* Reset the first slot of the space table and point ctxt->space at it. */
6617 if (ctxt->spaceTab != NULL) {
6618 ctxt->spaceTab[0] = -1;
6619 ctxt->space = &ctxt->spaceTab[0];
/* Release per-document strings unless the dictionary owns them. */
6631 DICT_FREE(ctxt->version);
6632 ctxt->version = NULL;
6633 DICT_FREE(ctxt->encoding);
6634 ctxt->encoding = NULL;
6635 DICT_FREE(ctxt->directory);
6636 ctxt->directory = NULL;
6637 DICT_FREE(ctxt->extSubURI);
6638 ctxt->extSubURI = NULL;
6639 DICT_FREE(ctxt->extSubSystem);
6640 ctxt->extSubSystem = NULL;
/* A document still attached to the context is freed here. */
6641 if (ctxt->myDoc != NULL)
6642 xmlFreeDoc(ctxt->myDoc);
/* Parser state and status flags go back to their initial values. */
6645 ctxt->standalone = -1;
6646 ctxt->hasExternalSubset = 0;
6647 ctxt->hasPErefs = 0;
6650 ctxt->instate = XML_PARSER_START;
6653 ctxt->wellFormed = 1;
6654 ctxt->nsWellFormed = 1;
6655 ctxt->disableSAX = 0;
/* Validation callbacks are re-pointed at the parser's own reporters. */
6657 ctxt->vctxt.userData = ctxt;
6658 ctxt->vctxt.error = xmlParserValidityError;
6659 ctxt->vctxt.warning = xmlParserValidityWarning;
6660 ctxt->record_info = 0;
6662 ctxt->checkIndex = 0;
6664 ctxt->errNo = XML_ERR_OK;
6666 ctxt->charset = XML_CHAR_ENCODING_NONE;
6667 ctxt->catalogs = NULL;
6668 xmlInitNodeInfoSeq(&ctxt->node_seq);
/* attsDefault entries are released with xmlFree(). */
6670 if (ctxt->attsDefault != NULL) {
6671 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6672 ctxt->attsDefault = NULL;
/* attsSpecial is freed with no per-entry deallocator. */
6674 if (ctxt->attsSpecial != NULL) {
6675 xmlHashFree(ctxt->attsSpecial, NULL);
6676 ctxt->attsSpecial = NULL;
6681 * htmlCtxtUseOptions:
6682 * @ctxt: an HTML parser context
6683 * @options: a combination of htmlParserOption(s)
6685 * Applies the options to the parser context
6687 * Returns 0 in case of success, the set of unknown or unimplemented options
/* Translates the @options bit mask into settings on @ctxt. Each recognised
 * bit is removed from the local `options` value as it is handled.
 * NOTE(review): the embedded numbering has gaps (e.g. 6692-6695, 6710,
 * 6713-6714, 6720, 6723, 6748), so some assignments, the else branches and
 * the return statement are not visible in this listing. */
6691 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
/* Silence warnings on both the SAX handler and the validation context.
 * NOTE(review): the test uses HTML_PARSE_NOWARNING but the bit cleared and
 * recorded is XML_PARSE_NOWARNING; this relies on both constants having the
 * same value - confirm against the headers. The same mix appears for NOERROR
 * and PEDANTIC below. ctxt->sax is dereferenced without a NULL test. */
6696 if (options & HTML_PARSE_NOWARNING) {
6697 ctxt->sax->warning = NULL;
6698 ctxt->vctxt.warning = NULL;
6699 options -= XML_PARSE_NOWARNING;
6700 ctxt->options |= XML_PARSE_NOWARNING;
/* Silence errors: SAX error and fatalError plus the validation error hook. */
6702 if (options & HTML_PARSE_NOERROR) {
6703 ctxt->sax->error = NULL;
6704 ctxt->vctxt.error = NULL;
6705 ctxt->sax->fatalError = NULL;
6706 options -= XML_PARSE_NOERROR;
6707 ctxt->options |= XML_PARSE_NOERROR;
6709 if (options & HTML_PARSE_PEDANTIC) {
6711 options -= XML_PARSE_PEDANTIC;
6712 ctxt->options |= XML_PARSE_PEDANTIC;
/* Blank handling: with NOBLANKS, keepBlanks is cleared and ignorable
 * whitespace is routed to xmlSAX2IgnorableWhitespace. */
6715 if (options & XML_PARSE_NOBLANKS) {
6716 ctxt->keepBlanks = 0;
6717 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6718 options -= XML_PARSE_NOBLANKS;
6719 ctxt->options |= XML_PARSE_NOBLANKS;
/* NOTE(review): line 6720 is not shown; presumably the else branch of the
 * NOBLANKS test - confirm. */
6721 ctxt->keepBlanks = 1;
6722 if (options & HTML_PARSE_RECOVER) {
6724 options -= HTML_PARSE_RECOVER;
/* The remaining flags are only recorded in ctxt->options. */
6727 if (options & HTML_PARSE_COMPACT) {
6728 ctxt->options |= HTML_PARSE_COMPACT;
6729 options -= HTML_PARSE_COMPACT;
6731 if (options & XML_PARSE_HUGE) {
6732 ctxt->options |= XML_PARSE_HUGE;
6733 options -= XML_PARSE_HUGE;
6735 if (options & HTML_PARSE_NODEFDTD) {
6736 ctxt->options |= HTML_PARSE_NODEFDTD;
6737 options -= HTML_PARSE_NODEFDTD;
6739 if (options & HTML_PARSE_IGNORE_ENC) {
6740 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6741 options -= HTML_PARSE_IGNORE_ENC;
6743 if (options & HTML_PARSE_NOIMPLIED) {
6744 ctxt->options |= HTML_PARSE_NOIMPLIED;
6745 options -= HTML_PARSE_NOIMPLIED;
/* dictNames is cleared unconditionally. */
6747 ctxt->dictNames = 0;
6753 * @ctxt: an HTML parser context
6754 * @URL: the base URL to use for the document
6755 * @encoding: the document encoding, or NULL
6756 * @options: a combination of htmlParserOption(s)
6757 * @reuse: keep the context for reuse
6759 * Common front-end for the htmlRead functions
6761 * Returns the resulting document tree or NULL
/* Shared back end of the htmlRead and htmlCtxtRead entry points: applies the
 * options, optionally switches the input encoding, records the URL as the
 * input file name, then runs htmlParseDocument() on @ctxt. The visible
 * callers pass reuse = 0 from htmlRead functions and reuse = 1 from
 * htmlCtxtRead functions.
 * NOTE(review): the embedded numbering has gaps; the declaration of `ret`,
 * the test on `reuse` and the return statement are not visible here. */
6764 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6765 int options, int reuse)
6769 htmlCtxtUseOptions(ctxt, options);
6771 if (encoding != NULL) {
6772 xmlCharEncodingHandlerPtr hdlr;
/* Look up a handler for the requested encoding name. */
6774 hdlr = xmlFindCharEncodingHandler(encoding);
/* Switch the context to that handler and replace the encoding name stored
 * on the current input with a copy of @encoding.
 * NOTE(review): ctxt->input is dereferenced here without a NULL test,
 * whereas the URL handling below does test it. */
6776 xmlSwitchToEncoding(ctxt, hdlr);
6777 if (ctxt->input->encoding != NULL)
6778 xmlFree((xmlChar *) ctxt->input->encoding);
6779 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
/* Use the URL as the input's file name only when none is set yet. */
6782 if ((URL != NULL) && (ctxt->input != NULL) &&
6783 (ctxt->input->filename == NULL))
6784 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6785 htmlParseDocument(ctxt);
/* Before the context is freed, check whether the document shares the
 * context's dictionary (the action taken is on a line not shown here). */
6789 if ((ctxt->dictNames) &&
6791 (ret->dict == ctxt->dict))
6793 xmlFreeParserCtxt(ctxt);
6800 * @cur: a pointer to a zero terminated string
6801 * @URL: the base URL to use for the document
6802 * @encoding: the document encoding, or NULL
6803 * @options: a combination of htmlParserOption(s)
6805 * parse an HTML in-memory document and build a tree.
6807 * Returns the resulting document tree
/* Parses the zero-terminated string @cur. The context is created with a
 * NULL encoding; the caller's @encoding is passed to htmlDoRead() instead,
 * which is called with reuse = 0.
 * NOTE(review): guard lines between the visible statements are missing from
 * this listing. */
6810 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6812 htmlParserCtxtPtr ctxt;
6818 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6821 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6826 * @filename: a file or URL
6827 * @encoding: the document encoding, or NULL
6828 * @options: a combination of htmlParserOption(s)
6830 * parse an HTML file from the filesystem or the network.
6832 * Returns the resulting document tree
/* Parses the document named by @filename. Here @encoding is consumed by
 * htmlCreateFileParserCtxt(); htmlDoRead() then receives a NULL URL and a
 * NULL encoding, with reuse = 0.
 * NOTE(review): guard lines between the visible statements are missing from
 * this listing. */
6835 htmlReadFile(const char *filename, const char *encoding, int options)
6837 htmlParserCtxtPtr ctxt;
6840 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6843 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6848 * @buffer: a pointer to a char array
6849 * @size: the size of the array
6850 * @URL: the base URL to use for the document
6851 * @encoding: the document encoding, or NULL
6852 * @options: a combination of htmlParserOption(s)
6854 * parse an HTML in-memory document and build a tree.
6856 * Returns the resulting document tree
/* Parses @size bytes starting at @buffer. The context comes from the generic
 * xmlCreateMemoryParserCtxt(); its SAX block is then overwritten with the
 * HTML default handler before htmlDoRead() runs with reuse = 0.
 * NOTE(review): guard lines between the visible statements are missing from
 * this listing. */
6859 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6861 htmlParserCtxtPtr ctxt;
6864 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6867 htmlDefaultSAXHandlerInit();
/* Only sizeof(xmlSAXHandlerV1) bytes of the default HTML handler are copied. */
6868 if (ctxt->sax != NULL)
6869 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6870 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6875 * @fd: an open file descriptor
6876 * @URL: the base URL to use for the document
6877 * @encoding: the document encoding, or NULL
6878 * @options: a combination of htmlParserOption(s)
6880 * parse an HTML document from a file descriptor and build a tree.
6882 * Returns the resulting document tree
6885 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6887 htmlParserCtxtPtr ctxt;
6888 xmlParserInputBufferPtr input;
6889 xmlParserInputPtr stream;
6896 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6899 ctxt = xmlNewParserCtxt();
6901 xmlFreeParserInputBuffer(input);
6904 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6905 if (stream == NULL) {
6906 xmlFreeParserInputBuffer(input);
6907 xmlFreeParserCtxt(ctxt);
6910 inputPush(ctxt, stream);
6911 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6916 * @ioread: an I/O read function
6917 * @ioclose: an I/O close function
6918 * @ioctx: an I/O handler
6919 * @URL: the base URL to use for the document
6920 * @encoding: the document encoding, or NULL
6921 * @options: a combination of htmlParserOption(s)
6923 * parse an HTML document from I/O functions and source and build a tree.
6925 * Returns the resulting document tree
/* Parses a document obtained through caller-supplied I/O callbacks: the
 * callbacks are wrapped in a parser input buffer, attached to a new HTML
 * parser context as an input stream, and htmlDoRead() runs with reuse = 0.
 * NOTE(review): the embedded numbering has gaps; several guards, the ioclose
 * call and the failure returns are not visible in this listing. */
6928 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6929 void *ioctx, const char *URL, const char *encoding, int options)
6931 htmlParserCtxtPtr ctxt;
6932 xmlParserInputBufferPtr input;
6933 xmlParserInputPtr stream;
6939 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6940 XML_CHAR_ENCODING_NONE);
6941 if (input == NULL) {
6942 if (ioclose != NULL)
6946 ctxt = htmlNewParserCtxt();
6948 xmlFreeParserInputBuffer(input);
6951 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
/* Stream creation failed: both the buffer and the context are released. */
6952 if (stream == NULL) {
6953 xmlFreeParserInputBuffer(input);
6954 xmlFreeParserCtxt(ctxt);
6957 inputPush(ctxt, stream);
6958 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6963 * @ctxt: an HTML parser context
6964 * @cur: a pointer to a zero terminated string
6965 * @URL: the base URL to use for the document
6966 * @encoding: the document encoding, or NULL
6967 * @options: a combination of htmlParserOption(s)
6969 * parse an HTML in-memory document and build a tree.
6970 * This reuses the existing @ctxt parser context
6972 * Returns the resulting document tree
/* Parses the zero-terminated string @cur with an existing context: the
 * context is reset, a string input stream is created and pushed, and
 * htmlDoRead() runs with reuse = 1.
 * NOTE(review): the argument guards and the failure return are missing from
 * this listing. */
6975 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6976 const char *URL, const char *encoding, int options)
6978 xmlParserInputPtr stream;
6986 htmlCtxtReset(ctxt);
6988 stream = xmlNewStringInputStream(ctxt, cur);
6989 if (stream == NULL) {
6992 inputPush(ctxt, stream);
6993 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6998 * @ctxt: an HTML parser context
6999 * @filename: a file or URL
7000 * @encoding: the document encoding, or NULL
7001 * @options: a combination of htmlParserOption(s)
7003 * parse an HTML file from the filesystem or the network.
7004 * This reuses the existing @ctxt parser context
7006 * Returns the resulting document tree
/* Parses the document named by @filename with an existing context: the
 * context is reset, the file is loaded through xmlLoadExternalEntity() and
 * pushed as input, and htmlDoRead() runs with a NULL URL and reuse = 1.
 * NOTE(review): the guard bodies and the failure return are missing from
 * this listing. */
7009 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7010 const char *encoding, int options)
7012 xmlParserInputPtr stream;
7014 if (filename == NULL)
7020 htmlCtxtReset(ctxt);
7022 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7023 if (stream == NULL) {
7026 inputPush(ctxt, stream);
7027 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7031 * htmlCtxtReadMemory:
7032 * @ctxt: an HTML parser context
7033 * @buffer: a pointer to a char array
7034 * @size: the size of the array
7035 * @URL: the base URL to use for the document
7036 * @encoding: the document encoding, or NULL
7037 * @options: a combination of htmlParserOption(s)
7039 * parse an HTML in-memory document and build a tree.
7040 * This reuses the existing @ctxt parser context
7042 * Returns the resulting document tree
/* Parses @size bytes starting at @buffer with an existing context: the
 * context is reset, the memory block is wrapped in a parser input buffer
 * and an input stream, and htmlDoRead() runs with reuse = 1.
 * NOTE(review): the argument guards and the failure returns are missing from
 * this listing. */
7045 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7046 const char *URL, const char *encoding, int options)
7048 xmlParserInputBufferPtr input;
7049 xmlParserInputPtr stream;
7057 htmlCtxtReset(ctxt);
7059 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7060 if (input == NULL) {
7064 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
/* Stream creation failed: the buffer is released; the context is kept. */
7065 if (stream == NULL) {
7066 xmlFreeParserInputBuffer(input);
7070 inputPush(ctxt, stream);
7071 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7076 * @ctxt: an HTML parser context
7077 * @fd: an open file descriptor
7078 * @URL: the base URL to use for the document
7079 * @encoding: the document encoding, or NULL
7080 * @options: a combination of htmlParserOption(s)
7082 * parse an HTML document from a file descriptor and build a tree.
7083 * This reuses the existing @ctxt parser context
7085 * Returns the resulting document tree
/* Parses a document read from the file descriptor @fd with an existing
 * context: the context is reset, the descriptor is wrapped in a parser input
 * buffer and an input stream, and htmlDoRead() runs with reuse = 1.
 * NOTE(review): the argument guards, the NULL test on `input` and the
 * failure returns are missing from this listing. */
7088 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7089 const char *URL, const char *encoding, int options)
7091 xmlParserInputBufferPtr input;
7092 xmlParserInputPtr stream;
7100 htmlCtxtReset(ctxt);
7103 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7106 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
/* Stream creation failed: the buffer is released; the context is kept. */
7107 if (stream == NULL) {
7108 xmlFreeParserInputBuffer(input);
7111 inputPush(ctxt, stream);
7112 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7117 * @ctxt: an HTML parser context
7118 * @ioread: an I/O read function
7119 * @ioclose: an I/O close function
7120 * @ioctx: an I/O handler
7121 * @URL: the base URL to use for the document
7122 * @encoding: the document encoding, or NULL
7123 * @options: a combination of htmlParserOption(s)
7125 * parse an HTML document from I/O functions and source and build a tree.
7126 * This reuses the existing @ctxt parser context
7128 * Returns the resulting document tree
/* Parses a document obtained through caller-supplied I/O callbacks with an
 * existing context: the context is reset, the callbacks are wrapped in a
 * parser input buffer and an input stream, and htmlDoRead() runs with
 * reuse = 1.
 * NOTE(review): the embedded numbering has gaps; one parameter line (7133),
 * the argument guards, the ioclose call and the failure returns are not
 * visible in this listing. */
7131 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7132 xmlInputCloseCallback ioclose, void *ioctx,
7134 const char *encoding, int options)
7136 xmlParserInputBufferPtr input;
7137 xmlParserInputPtr stream;
7145 htmlCtxtReset(ctxt);
7147 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7148 XML_CHAR_ENCODING_NONE);
7149 if (input == NULL) {
7150 if (ioclose != NULL)
7154 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
/* Stream creation failed: the buffer is released; the context is kept. */
7155 if (stream == NULL) {
7156 xmlFreeParserInputBuffer(input);
7159 inputPush(ctxt, stream);
7160 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7163 #define bottom_HTMLparser
7164 #include "elfgcchack.h"
7165 #endif /* LIBXML_HTML_ENABLED */