2 * HTMLparser.c : an HTML 4.0 non-verifying parser
4 * See Copyright for the status of this software.
11 #ifdef LIBXML_HTML_ENABLED
20 #ifdef HAVE_SYS_STAT_H
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
47 #define HTML_MAX_NAMELEN 1000
48 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
49 #define HTML_PARSER_BUFFER_SIZE 100
52 /* #define DEBUG_PUSH */
54 static int htmlOmittedDefaultValue = 1;
56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
58 static void htmlParseComment(htmlParserCtxtPtr ctxt);
60 /************************************************************************
62 * Some factorized error routines *
64 ************************************************************************/
68 * @ctxt: an HTML parser context
69 * @extra: extra information
71 * Handle a memory allocation error
74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
76 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
85 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
88 "Memory allocation failed : %s\n", extra);
90 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string info
101 * @str2: string info
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
120 ctxt->wellFormed = 0;
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
145 ctxt->wellFormed = 0;
148 /************************************************************************
150 * Parser stacks related functions and macros *
152 ************************************************************************/
156 * @ctxt: an HTML parser context
157 * @value: the element name
159 * Pushes a new element name on top of the name stack
161 * Returns 0 in case of error, the index in the stack otherwise
164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
170 if (ctxt->nameNr >= ctxt->nameMax) {
172 ctxt->nameTab = (const xmlChar * *)
173 xmlRealloc((xmlChar * *)ctxt->nameTab,
175 sizeof(ctxt->nameTab[0]));
176 if (ctxt->nameTab == NULL) {
177 htmlErrMemory(ctxt, NULL);
181 ctxt->nameTab[ctxt->nameNr] = value;
183 return (ctxt->nameNr++);
187 * @ctxt: an HTML parser context
189 * Pops the top element name from the name stack
191 * Returns the name just removed
193 static const xmlChar *
194 htmlnamePop(htmlParserCtxtPtr ctxt)
198 if (ctxt->nameNr <= 0)
201 if (ctxt->nameNr < 0)
203 if (ctxt->nameNr > 0)
204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
207 ret = ctxt->nameTab[ctxt->nameNr];
208 ctxt->nameTab[ctxt->nameNr] = NULL;
214 * @ctxt: an HTML parser context
215 * @value: the node info
217 * Pushes a new node info on top of the node info stack
219 * Returns 0 in case of error, the index in the stack otherwise
222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
224 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
225 if (ctxt->nodeInfoMax == 0)
226 ctxt->nodeInfoMax = 5;
227 ctxt->nodeInfoMax *= 2;
228 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
229 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
231 sizeof(ctxt->nodeInfoTab[0]));
232 if (ctxt->nodeInfoTab == NULL) {
233 htmlErrMemory(ctxt, NULL);
237 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
238 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
239 return (ctxt->nodeInfoNr++);
244 * @ctxt: an HTML parser context
246 * Pops the top node info from the node info stack
248 * Returns 0 in case of error, the pointer to NodeInfo otherwise
250 static htmlParserNodeInfo *
251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
253 if (ctxt->nodeInfoNr <= 0)
256 if (ctxt->nodeInfoNr < 0)
258 if (ctxt->nodeInfoNr > 0)
259 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
261 ctxt->nodeInfo = NULL;
262 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266 * Macros for accessing the content. Those should be used only by the parser,
269 * Dirty macros, i.e. one needs to make assumptions on the context to use them
271 * CUR_PTR return the current pointer to the xmlChar to be parsed.
272 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
273 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
274 * in UNICODE mode. This should be used internally by the parser
275 * only to compare to ASCII values otherwise it would break when
276 * running with UTF-8 encoding.
277 * NXT(n) returns the n'th next xmlChar. Same as CUR, it should be used only
278 * to compare on ASCII based substring.
279 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
280 * it should be used only to compare on ASCII based substring.
281 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
282 * strings without newlines within the parser.
284 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
286 * CURRENT Returns the current char value, with the full decoding of
287 * UTF-8 if we are using this mode. It returns an int.
288 * NEXT Skip to the next character, this does the proper decoding
289 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
290 * NEXTL(l) Skip the current unicode character of l xmlChars long.
291 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
294 #define UPPER (toupper(*ctxt->input->cur))
296 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
298 #define NXT(val) ctxt->input->cur[(val)]
300 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
302 #define CUR_PTR ctxt->input->cur
304 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
305 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
306 xmlParserInputShrink(ctxt->input)
308 #define GROW if ((ctxt->progressive == 0) && \
309 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
310 xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
312 #define CURRENT ((int) (*ctxt->input->cur))
314 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
316 /* Imported from XML */
318 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
319 #define CUR ((int) (*ctxt->input->cur))
320 #define NEXT xmlNextChar(ctxt)
322 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
325 #define NEXTL(l) do { \
326 if (*(ctxt->input->cur) == '\n') { \
327 ctxt->input->line++; ctxt->input->col = 1; \
328 } else ctxt->input->col++; \
329 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
334 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
335 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
338 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
339 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
341 #define COPY_BUF(l,b,i,v) \
342 if (l == 1) b[i++] = (xmlChar) v; \
343 else i += xmlCopyChar(l,&b[i],v)
347 * @ctxt: the HTML parser context
349 * Try to find an encoding in the current data available in the input
350 * buffer. This is needed to try to switch to the proper encoding when
351 * one faces a character error.
352 * That's a heuristic, since it's operating outside of parsing it could
353 * try to use a meta which had been commented out, that's the reason it
354 * should only be used in case of error, not as a default.
356 * Returns an encoding string or NULL if not found, the string need to
360 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
361 const xmlChar *start, *cur, *end;
363 if ((ctxt == NULL) || (ctxt->input == NULL) ||
364 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
365 (ctxt->input->buf->encoder != NULL))
367 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
370 start = ctxt->input->cur;
371 end = ctxt->input->end;
372 /* we also expect the input buffer to be zero terminated */
376 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
379 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
382 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387 while (((*cur >= 'A') && (*cur <= 'Z')) ||
388 ((*cur >= 'a') && (*cur <= 'z')) ||
389 ((*cur >= '0') && (*cur <= '9')) ||
390 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
394 return(xmlStrndup(start, cur - start));
399 * @ctxt: the HTML parser context
400 * @len: pointer to the length of the char read
402 * The current char value, if using UTF-8 this may actually span multiple
403 * bytes in the input buffer. Implement the end of line normalization:
404 * 2.11 End-of-Line Handling
405 * If the encoding is unspecified, in the case we find an ISO-Latin-1
406 * char, then the encoding converter is plugged in automatically.
408 * Returns the current char value and its length
412 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
413 if (ctxt->instate == XML_PARSER_EOF)
416 if (ctxt->token != 0) {
420 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
422 * We are supposed to handle UTF8, check it's valid
423 * From rfc2044: encoding of the Unicode values on UTF-8:
425 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
426 * 0000 0000-0000 007F 0xxxxxxx
427 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
428 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
430 * Check for the 0x110000 limit too
432 const unsigned char *cur = ctxt->input->cur;
439 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
440 cur = ctxt->input->cur;
442 if ((cur[1] & 0xc0) != 0x80)
444 if ((c & 0xe0) == 0xe0) {
447 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
448 cur = ctxt->input->cur;
450 if ((cur[2] & 0xc0) != 0x80)
452 if ((c & 0xf0) == 0xf0) {
454 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
455 cur = ctxt->input->cur;
457 if (((c & 0xf8) != 0xf0) ||
458 ((cur[3] & 0xc0) != 0x80))
462 val = (cur[0] & 0x7) << 18;
463 val |= (cur[1] & 0x3f) << 12;
464 val |= (cur[2] & 0x3f) << 6;
465 val |= cur[3] & 0x3f;
469 val = (cur[0] & 0xf) << 12;
470 val |= (cur[1] & 0x3f) << 6;
471 val |= cur[2] & 0x3f;
476 val = (cur[0] & 0x1f) << 6;
477 val |= cur[1] & 0x3f;
480 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
481 "Char 0x%X out of allowed range\n", val);
485 if ((*ctxt->input->cur == 0) &&
486 (ctxt->input->cur < ctxt->input->end)) {
487 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
488 "Char 0x%X out of allowed range\n", 0);
494 return((int) *ctxt->input->cur);
498 * Assume it's a fixed length encoding (1) with
499 * a compatible encoding for the ASCII set, since
500 * XML constructs only use < 128 chars
503 if ((int) *ctxt->input->cur < 0x80)
504 return((int) *ctxt->input->cur);
507 * Humm this is bad, do an automatic flow conversion
511 xmlCharEncodingHandlerPtr handler;
513 guess = htmlFindEncoding(ctxt);
515 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
517 if (ctxt->input->encoding != NULL)
518 xmlFree((xmlChar *) ctxt->input->encoding);
519 ctxt->input->encoding = guess;
520 handler = xmlFindCharEncodingHandler((const char *) guess);
521 if (handler != NULL) {
522 xmlSwitchToEncoding(ctxt, handler);
524 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
525 "Unsupported encoding %s", guess, NULL);
528 ctxt->charset = XML_CHAR_ENCODING_UTF8;
531 return(xmlCurrentChar(ctxt, len));
535 * If we detect an UTF8 error that probably means that the
536 * input encoding didn't get properly advertised in the
537 * declaration header. Report the error and switch the encoding
538 * to ISO-Latin-1 (if you don't like this policy, just declare the
544 if (ctxt->input->end - ctxt->input->cur >= 4) {
545 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
546 ctxt->input->cur[0], ctxt->input->cur[1],
547 ctxt->input->cur[2], ctxt->input->cur[3]);
549 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
551 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
552 "Input is not proper UTF-8, indicate encoding !\n",
553 BAD_CAST buffer, NULL);
556 ctxt->charset = XML_CHAR_ENCODING_8859_1;
558 return((int) *ctxt->input->cur);
562 * htmlSkipBlankChars:
563 * @ctxt: the HTML parser context
565 * Skip all blank characters found at that point in the input stream.
567 * Returns the number of space chars skipped
571 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
574 while (IS_BLANK_CH(*(ctxt->input->cur))) {
575 if ((*ctxt->input->cur == 0) &&
576 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
579 if (*(ctxt->input->cur) == '\n') {
580 ctxt->input->line++; ctxt->input->col = 1;
581 } else ctxt->input->col++;
584 if (*ctxt->input->cur == 0)
585 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
594 /************************************************************************
596 * The list of HTML elements and their properties *
598 ************************************************************************/
601 * Start Tag: 1 means the start tag can be omitted
602 * End Tag: 1 means the end tag can be omitted
603 * 2 means it's forbidden (empty elements)
604 * 3 means the tag is stylistic and should be closed easily
605 * Depr: this element is deprecated
606 * DTD: 1 means that this element is valid only in the Loose DTD
607 * 2 means that this element is valid only in the Frameset DTD
609 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
610 , subElements , impliedsubelt , Attributes, userdata
613 /* Definitions and a couple of vars for HTML Elements */
615 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
616 #define NB_FONTSTYLE 8
617 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
619 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
620 #define NB_SPECIAL 16
621 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
622 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
623 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
624 #define NB_BLOCK NB_HEADING + NB_LIST + 14
625 #define FORMCTRL "input", "select", "textarea", "label", "button"
626 #define NB_FORMCTRL 5
629 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
631 #define LIST "ul", "ol", "dir", "menu"
634 #define NB_MODIFIER 0
635 #define FLOW BLOCK,INLINE
636 #define NB_FLOW NB_BLOCK + NB_INLINE
640 static const char* const html_flow[] = { FLOW, NULL } ;
641 static const char* const html_inline[] = { INLINE, NULL } ;
643 /* placeholders: elts with content but no subelements */
644 static const char* const html_pcdata[] = { NULL } ;
645 #define html_cdata html_pcdata
648 /* ... and for HTML Attributes */
650 #define COREATTRS "id", "class", "style", "title"
651 #define NB_COREATTRS 4
652 #define I18N "lang", "dir"
654 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
656 #define ATTRS COREATTRS,I18N,EVENTS
657 #define NB_ATTRS NB_COREATTRS + NB_I18N + NB_EVENTS /* number of names in ATTRS (COREATTRS, I18N, EVENTS); was misspelled NB_NB_COREATTRS */
658 #define CELLHALIGN "align", "char", "charoff"
659 #define NB_CELLHALIGN 3
660 #define CELLVALIGN "valign"
661 #define NB_CELLVALIGN 1
663 static const char* const html_attrs[] = { ATTRS, NULL } ;
664 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
665 static const char* const core_attrs[] = { COREATTRS, NULL } ;
666 static const char* const i18n_attrs[] = { I18N, NULL } ;
669 /* Other declarations that should go inline ... */
670 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
671 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
672 "tabindex", "onfocus", "onblur", NULL } ;
673 static const char* const target_attr[] = { "target", NULL } ;
674 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
675 static const char* const alt_attr[] = { "alt", NULL } ;
676 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
677 static const char* const href_attrs[] = { "href", NULL } ;
678 static const char* const clear_attrs[] = { "clear", NULL } ;
679 static const char* const inline_p[] = { INLINE, "p", NULL } ;
681 static const char* const flow_param[] = { FLOW, "param", NULL } ;
682 static const char* const applet_attrs[] = { COREATTRS , "codebase",
683 "archive", "alt", "name", "height", "width", "align",
684 "hspace", "vspace", NULL } ;
685 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
686 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
687 static const char* const basefont_attrs[] =
688 { "id", "size", "color", "face", NULL } ;
689 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
690 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
691 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
692 static const char* const body_depr[] = { "background", "bgcolor", "text",
693 "link", "vlink", "alink", NULL } ;
694 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
695 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
698 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
699 static const char* const col_elt[] = { "col", NULL } ;
700 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
701 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
702 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
703 static const char* const compact_attr[] = { "compact", NULL } ;
704 static const char* const label_attr[] = { "label", NULL } ;
705 static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ; /* allowed children of <fieldset>; NULL sentinel required, the element table keeps only the pointer, no length */
706 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
707 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
708 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
709 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
710 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
711 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
712 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
713 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
714 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
715 static const char* const version_attr[] = { "version", NULL } ;
716 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
717 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
718 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
719 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
720 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
721 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
722 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
723 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
724 static const char* const align_attr[] = { "align", NULL } ;
725 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
726 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
727 static const char* const name_attr[] = { "name", NULL } ;
728 static const char* const action_attr[] = { "action", NULL } ;
729 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
730 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
731 static const char* const content_attr[] = { "content", NULL } ;
732 static const char* const type_attr[] = { "type", NULL } ;
733 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
734 static const char* const object_contents[] = { FLOW, "param", NULL } ;
735 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
736 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
737 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
738 static const char* const option_elt[] = { "option", NULL } ;
739 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
740 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
741 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
742 static const char* const width_attr[] = { "width", NULL } ;
743 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
744 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
745 static const char* const language_attr[] = { "language", NULL } ;
746 static const char* const select_content[] = { "optgroup", "option", NULL } ;
747 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
748 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
749 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
750 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
751 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
752 static const char* const tr_elt[] = { "tr", NULL } ;
753 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
754 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
755 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
756 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
757 static const char* const tr_contents[] = { "th", "td", NULL } ;
758 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
759 static const char* const li_elt[] = { "li", NULL } ;
760 static const char* const ul_depr[] = { "type", "compact", NULL} ;
761 static const char* const dir_attr[] = { "dir", NULL} ;
763 #define DECL (const char**)
765 static const htmlElemDesc
766 html40ElementTable[] = {
767 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
768 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
770 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
771 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
773 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
774 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
776 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
777 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
779 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
780 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
782 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
783 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
785 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
786 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
788 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
789 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
791 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
792 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
794 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
795 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
797 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
798 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
800 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
801 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
803 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
804 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
806 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
807 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
809 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
810 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
812 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
813 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
815 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
816 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
818 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
819 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
821 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
822 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
824 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
825 EMPTY , NULL , DECL col_attrs , NULL, NULL
827 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
828 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
830 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
831 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
833 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
834 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
836 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
837 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
839 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
840 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
842 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
843 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
845 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
846 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
848 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
849 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
851 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
852 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
855 EMPTY, NULL, DECL embed_attrs, NULL, NULL
857 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
858 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
860 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
861 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
863 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
864 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
866 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
867 EMPTY, NULL, NULL, DECL frame_attrs, NULL
869 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
870 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
872 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
873 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
875 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
876 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
878 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
879 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
881 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
882 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
884 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
885 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
887 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
888 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
890 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
891 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
893 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
894 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
896 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
897 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
899 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
900 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
902 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
903 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
905 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
906 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
908 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
909 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
911 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
912 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
914 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
915 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
917 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
918 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
920 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
921 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
923 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
924 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
926 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
927 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
929 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
930 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
932 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
933 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
935 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
936 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
938 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
939 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
941 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
942 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
944 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
945 DECL html_flow, "div", DECL html_attrs, NULL, NULL
947 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
948 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
950 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
951 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
953 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
954 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
956 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
957 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
959 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
960 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
962 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
963 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
965 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
966 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
968 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
969 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
971 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
972 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
974 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
975 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
977 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
978 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
980 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
981 DECL select_content, NULL, DECL select_attrs, NULL, NULL
983 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
984 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
986 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
987 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
989 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
990 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
992 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
993 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
995 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
996 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
998 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
999 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1001 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1002 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1004 { "table", 0, 0, 0, 0, 0, 0, 0, "",
1005 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1007 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1008 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1010 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1011 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1013 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1014 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1016 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1017 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1019 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1020 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1022 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1023 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1025 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1026 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1028 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1029 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1031 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1032 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1034 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1035 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1037 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1038 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1040 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1041 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1046 * start tags that imply the end of current element
/*
 * Flat table of tag names made of NULL-terminated runs.
 * htmlInitAutoClose() records entries of this table in htmlStartCloseIndex,
 * and htmlCheckAutoClose() compares an indexed entry with the new start tag
 * and the entries that follow it, up to the next NULL, with the open tag.
 * NOTE(review): in this listing some rows end without a NULL and the
 * closing of the initializer is not shown; FONTSTYLE is not defined in the
 * visible part of the file (presumably a macro expanding to more strings).
 * Confirm the full initializer against the complete source.
 */
1048 static const char * const htmlStartClose[] = {
1049 "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1050 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1051 "listing", "xmp", "head", NULL,
1054 "body", "head", "style", "link", "title", "p", NULL,
1055 "frameset", "head", "style", "link", "title", "p", NULL,
1056 "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1057 "pre", "listing", "xmp", "head", "li", NULL,
1058 "hr", "p", "head", NULL,
1059 "h1", "p", "head", NULL,
1060 "h2", "p", "head", NULL,
1061 "h3", "p", "head", NULL,
1062 "h4", "p", "head", NULL,
1063 "h5", "p", "head", NULL,
1064 "h6", "p", "head", NULL,
1065 "dir", "p", "head", NULL,
1066 "address", "p", "head", "ul", NULL,
1067 "pre", "p", "head", "ul", NULL,
1068 "listing", "p", "head", NULL,
1069 "xmp", "p", "head", NULL,
1070 "blockquote", "p", "head", NULL,
1071 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1072 "xmp", "head", NULL,
1073 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1075 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1077 "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1078 "listing", "xmp", NULL,
1079 "ol", "p", "head", "ul", NULL,
1080 "menu", "p", "head", "ul", NULL,
1081 "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
1082 "div", "p", "head", NULL,
1083 "noscript", "p", NULL,
1084 "center", "font", "b", "i", "p", "head", NULL,
1086 "caption", "p", NULL,
1087 "colgroup", "caption", "colgroup", "col", "p", NULL,
1088 "col", "caption", "col", "p", NULL,
1089 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1090 "listing", "xmp", "a", NULL,
1091 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1092 "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1093 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1094 "thead", "caption", "col", "colgroup", NULL,
1095 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1097 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1098 "tfoot", "tbody", "p", NULL,
1099 "optgroup", "option", NULL,
1100 "option", "option", NULL,
1101 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1102 "pre", "listing", "xmp", "a", NULL,
1107 * The list of HTML elements which are supposed not to have
1108 * CDATA content and where a p element will be implied
1110 * TODO: extend that list by reading the HTML SGML DTD on
/*
 * Scanned by htmlCheckParagraph() up to its NULL entry.
 * NOTE(review): the initializer is not visible in this listing.
 */
1113 static const char *const htmlNoContentElements[] = {
1120 * The list of HTML attributes which are of content %Script;
1121 * NOTE: when adding ones, check htmlIsScriptAttribute() since
1122 * it assumes the name starts with 'on'
/*
 * htmlIsScriptAttribute() bounds its scan of this array with sizeof rather
 * than a NULL sentinel.
 * NOTE(review): the initializer is not visible in this listing.
 */
1124 static const char *const htmlScriptAttributes[] = {
1146 * This table is used by the htmlparser to know what to do with
1147 * broken html pages. By assigning different priorities to different
1148 * elements the parser can decide how to handle extra endtags.
1149 * Endtags are only allowed to close elements with lower or equal
/*
 * Searched by htmlGetEndPriority(). The row whose name is NULL stops the
 * search and supplies the priority used for names that are not listed.
 * NOTE(review): the rows before that last one are not visible in this
 * listing.
 */
1158 static const elementPriority htmlEndPriority[] = {
1170 {NULL, 100} /* Default priority */
/* 100-slot index of pointers into htmlStartClose, filled by htmlInitAutoClose(). */
1173 static const char** htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index has been filled. */
1174 static int htmlStartCloseIndexinitialized = 0;
1176 /************************************************************************
1178 * functions to handle HTML specific data *
1180 ************************************************************************/
1183 * htmlInitAutoClose:
1185 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1186 * This is not reentrant. Call xmlInitParser() once before processing in
1187 * case of use in multithreaded programs.
/*
 * Fills htmlStartCloseIndex with pointers into htmlStartClose. The work is
 * done once: later calls return immediately because of the initialized
 * flag.
 * NOTE(review): the declarations of indx and i, any reset of indx after the
 * clearing loop, and the step of i past each NULL entry are not visible in
 * this listing -- confirm against the complete file.
 */
1190 htmlInitAutoClose(void) {
/* Already built: nothing to do. */
1193 if (htmlStartCloseIndexinitialized) return;
/* Start from an all-NULL index. */
1195 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
/* The "100 - 1" bound leaves the last slot NULL. */
1197 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
/* Remember where this entry sits in the flat table. */
1198 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
/* Advance to the NULL entry that follows. */
1199 while (htmlStartClose[i] != NULL) i++;
1202 htmlStartCloseIndexinitialized = 1;
1207 * @tag: The tag name in lowercase
1209 * Lookup the HTML tag in the ElementTable
1211 * Returns the related htmlElemDescPtr or NULL if not found.
/*
 * Linear search of html40ElementTable by element name.
 * The comparison goes through xmlStrcasecmp(), so although the header
 * comment asks for a lowercase tag, matching is presumably case-insensitive
 * (confirm against the xmlstring API).
 * NOTE(review): the declaration of i and the not-found return path are not
 * visible in this listing; the header comment documents NULL for that case.
 */
1213 const htmlElemDesc *
1214 htmlTagLookup(const xmlChar *tag) {
/* The entry count is derived from the table's size, so no sentinel row is needed. */
1217 for (i = 0; i < (sizeof(html40ElementTable) /
1218 sizeof(html40ElementTable[0]));i++) {
/* A zero result from xmlStrcasecmp() selects this row; its address is returned. */
1219 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1220 return((htmlElemDescPtr) &html40ElementTable[i]);
1226 * htmlGetEndPriority:
1227 * @name: The name of the element to look up the priority for.
1229 * Return value: The "endtag" priority.
/*
 * Looks name up in htmlEndPriority and returns the priority of the row the
 * search stops on. The search stops at a row whose name equals name, or at
 * the row whose name is NULL, so an unlisted element gets that last row's
 * priority (marked "Default priority" in the table).
 * NOTE(review): the declaration of i and the loop body that advances it are
 * not visible in this listing.
 */
1232 htmlGetEndPriority (const xmlChar *name) {
1235 while ((htmlEndPriority[i].name != NULL) &&
1236 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1239 return(htmlEndPriority[i].priority);
1244 * htmlCheckAutoClose:
1245 * @newtag: The new tag name
1246 * @oldtag: The old tag name
1248 * Checks whether the new tag is one of the registered valid tags for
1250 * Initializes the htmlStartCloseIndex on first use, for fast lookup of closing tag names.
1252 * Returns 0 if no, 1 if yes.
/*
 * Decides whether opening newtag implies closing oldtag, using the runs of
 * names stored in htmlStartClose.
 * NOTE(review): the statements that act on each comparison (leaving the
 * scan, returning 0 or 1) and that step i are not visible in this listing;
 * the header comment documents 0 for no and 1 for yes.
 */
1255 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1258 const char **closed = NULL;
/* Build the index lazily on first use. */
1260 if (htmlStartCloseIndexinitialized == 0)
1261 htmlInitAutoClose();
1263 /* inefficient, but not a big deal */
1264 for (indx = 0; indx < 100; indx++) {
1265 closed = htmlStartCloseIndex[indx];
/* *closed is the indexed htmlStartClose entry; compare it with newtag. */
1268 if (xmlStrEqual(BAD_CAST * closed, newtag))
/* The pointer difference turns the index slot back into a position in htmlStartClose. */
1272 i = closed - htmlStartClose;
/* Walk the entries up to the terminating NULL, looking for oldtag. */
1274 while (htmlStartClose[i] != NULL) {
1275 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1284 * htmlAutoCloseOnClose:
1285 * @ctxt: an HTML parser context
1286 * @newtag: The new tag name
1287 * @force: force the tag closure
1289 * The HTML DTD allows an ending tag to implicitly close other tags.
/*
 * Handles an end tag (newtag) that may implicitly close elements recorded
 * in ctxt->nameTab.
 * NOTE(review): the header comment documents a @force argument that this
 * signature does not take. The declarations of i and priority, the actions
 * taken when the tests in the first loop succeed, and whatever makes
 * ctxt->name change inside the second loop are not visible in this listing
 * -- confirm against the complete file.
 */
1292 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1294 const htmlElemDesc *info;
1297 priority = htmlGetEndPriority(newtag);
/* Scan nameTab from its last entry down to its first, looking for newtag. */
1299 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1301 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1304 * A missplaced endtag can only close elements with lower
1305 * or equal priority, so if we find an element with higher
1306 * priority before we find an element with
1307 * matching name, we just ignore this endtag
1309 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
/* Report endElement for the current element until newtag is the current one. */
1315 while (!xmlStrEqual(newtag, ctxt->name)) {
1316 info = htmlTagLookup(ctxt->name);
/* Only elements whose descriptor has endTag == 3 are reported as a mismatch. */
1317 if ((info != NULL) && (info->endTag == 3)) {
1318 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1319 "Opening and ending tag mismatch: %s and %s\n",
1320 newtag, ctxt->name);
/* The SAX handler and its endElement callback are both optional. */
1322 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1323 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1329 * htmlAutoCloseOnEnd:
1330 * @ctxt: an HTML parser context
1332 * Close all remaining tags at the end of the stream
/*
 * End-of-stream cleanup: reports endElement through SAX once per entry
 * counted by ctxt->nameNr.
 * Each iteration passes ctxt->name rather than ctxt->nameTab[i], so the
 * loop relies on the current name changing between iterations.
 * NOTE(review): the statement that changes it, the declaration of i, and
 * the action taken when nameNr is 0 are not visible in this listing.
 */
1335 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1339 if (ctxt->nameNr == 0)
1341 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
/* The SAX handler and its endElement callback are both optional. */
1342 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1343 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1350 * @ctxt: an HTML parser context
1351 * @newtag: The new tag name or NULL
1353 * The HTML DTD allows a tag to implicitly close other tags.
1354 * The list is kept in htmlStartClose array. This function is
1355 * called when a new tag has been detected and generates the
1356 * appropriate closes if possible/needed.
1357 * If newtag is NULL this means we are at the end of the resource
1358 * and we should check
/*
 * With a non-NULL newtag: while htmlCheckAutoClose() reports that newtag
 * closes the current element, reports endElement for it through SAX.
 * With a NULL newtag: hands over to htmlAutoCloseOnEnd().
 * NOTE(review): the statement that makes ctxt->name change inside the loops
 * is not visible in this listing. If the NULL branch returns after calling
 * htmlAutoCloseOnEnd() (not visible here), the final loop, which requires
 * newtag == NULL, can never run -- confirm and consider removing it.
 */
1361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
/* Close every open element that the new start tag implicitly ends. */
1363 while ((newtag != NULL) && (ctxt->name != NULL) &&
1364 (htmlCheckAutoClose(newtag, ctxt->name))) {
1365 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1366 ctxt->sax->endElement(ctxt->userData, ctxt->name);
/* No new tag: treat this as the end of the input. */
1369 if (newtag == NULL) {
1370 htmlAutoCloseOnEnd(ctxt);
/* Closes a current head, body or html element when newtag is NULL. */
1373 while ((newtag == NULL) && (ctxt->name != NULL) &&
1374 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1375 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1376 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1377 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1378 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1385 * @doc: the HTML document
1386 * @name: The tag name
1387 * @elem: the HTML element
1389 * The HTML DTD allows a tag to implicitly close other tags.
1390 * The list is kept in htmlStartClose array. This function checks
1391 * if the element or one of its children would autoclose the
1394 * Returns 1 if autoclose, 0 otherwise
/*
 * Recursive test: would elem, or an element below it, auto-close an open
 * element called name?
 * doc is not examined here; it is only forwarded to the recursive call.
 * NOTE(review): the declaration of child and the final return are not
 * visible in this listing; the header comment documents 0 as the result
 * when nothing auto-closes.
 */
1397 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
/* A missing element is reported as auto-closing. */
1400 if (elem == NULL) return(1);
/* An element with the same name is reported as not auto-closing, without looking at its children. */
1401 if (xmlStrEqual(name, elem->name)) return(0);
/* Would a start tag elem->name close an open element called name? */
1402 if (htmlCheckAutoClose(elem->name, name)) return(1);
/* Otherwise ask each child in turn. */
1403 child = elem->children;
1404 while (child != NULL) {
1405 if (htmlAutoCloseTag(doc, name, child)) return(1);
1406 child = child->next;
1413 * @doc: the HTML document
1414 * @elem: the HTML element
1416 * The HTML DTD allows a tag to implicitly close other tags.
1417 * The list is kept in htmlStartClose array. This function checks
1418 * if a tag is autoclosed by one of its children
1420 * Returns 1 if autoclosed, 0 otherwise
/*
 * Reports whether any child of elem, checked with htmlAutoCloseTag(),
 * auto-closes elem itself (the name passed down is elem->name).
 * NOTE(review): the declaration of child and the final return are not
 * visible in this listing; the header comment documents 0 as the result
 * when no child auto-closes it.
 */
1423 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
/* A missing element is reported as auto-closed. */
1426 if (elem == NULL) return(1);
/* Stop at the first child that auto-closes elem. */
1427 child = elem->children;
1428 while (child != NULL) {
1429 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1430 child = child->next;
1437 * @ctxt: an HTML parser context
1438 * @newtag: The new tag name
1440 * The HTML DTD allows a tag to exist only implicitly. This function is
1441 * called when a new tag has been detected and generates the
1442 * appropriate implicit tags if missing
/*
 * Opens the elements that newtag implies but that are not open yet -- html,
 * then head or body -- pushing each with htmlnamePush() and reporting it
 * through the SAX startElement callback when one is set.
 * NOTE(review): the statements controlled by the bare tests below (the
 * option and flag checks, the ctxt->html thresholds, the nameTab scan) and
 * the declaration of i are not visible in this listing; presumably each
 * test leaves the function early -- confirm against the complete file.
 */
1445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1448 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1450 if (!htmlOmittedDefaultValue)
1452 if (xmlStrEqual(newtag, BAD_CAST"html"))
/* Nothing is open yet: open an implied html element first. */
1454 if (ctxt->nameNr <= 0) {
1455 htmlnamePush(ctxt, BAD_CAST"html");
1456 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1459 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
/* With at most one element open, these six tags are candidates for an implied head. */
1461 if ((ctxt->nameNr <= 1) &&
1462 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1463 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1464 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1465 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1466 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1467 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1468 if (ctxt->html >= 3) {
1469 /* we already saw or generated an <head> before */
1473 * dropped OBJECT ... i you put it first BODY will be
1476 htmlnamePush(ctxt, BAD_CAST"head");
1477 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1478 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
/* Every other tag except noframes, frame and frameset is a candidate for an implied body. */
1479 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1480 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1481 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1482 if (ctxt->html >= 10) {
1483 /* we already saw or generated a <body> before */
/* Look through the open elements for an existing body or head. */
1486 for (i = 0;i < ctxt->nameNr;i++) {
1487 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1490 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1495 htmlnamePush(ctxt, BAD_CAST"body");
1496 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1502 * htmlCheckParagraph
1503 * @ctxt: an HTML parser context
1505 * Check whether a p element needs to be implied before inserting
1506 * characters in the current element.
1508 * Returns 1 if a paragraph has been inserted, 0 if not and -1
/*
 * Implies a p element before character data is inserted. Both visible
 * sequences run htmlAutoClose() and htmlCheckImplied() for "p", push "p"
 * with htmlnamePush() and report it through SAX startElement when set.
 * NOTE(review): the declarations of tag and i, the assignment of tag, the
 * condition guarding the first sequence, and every return statement are not
 * visible in this listing; see the header comment for the documented
 * results.
 */
1513 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1521 htmlAutoClose(ctxt, BAD_CAST"p");
1522 htmlCheckImplied(ctxt, BAD_CAST"p");
1523 htmlnamePush(ctxt, BAD_CAST"p");
1524 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1525 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1528 if (!htmlOmittedDefaultValue)
/* Imply a p when tag equals one of the names in htmlNoContentElements. */
1530 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1531 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1532 htmlAutoClose(ctxt, BAD_CAST"p");
1533 htmlCheckImplied(ctxt, BAD_CAST"p");
1534 htmlnamePush(ctxt, BAD_CAST"p");
1535 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1536 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1544 * htmlIsScriptAttribute:
1545 * @name: an attribute name
1547 * Check if an attribute is of content type Script
1549 * Returns 1 if the attribute is a script, 0 otherwise
/*
 * Tells whether name is one of the entries of htmlScriptAttributes.
 * The first two characters of name are tested for "on" before the table is
 * compared entry by entry; the scan is bounded by the array size.
 * NOTE(review): the declaration of i, the loop initialiser and increment,
 * the return statements, and any NULL check on name are not visible in this
 * listing -- confirm against the complete file.
 */
1552 htmlIsScriptAttribute(const xmlChar *name) {
1558 * all script attributes start with 'on'
1560 if ((name[0] != 'o') || (name[1] != 'n'))
1563 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1565 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1571 /************************************************************************
1573 * The list of HTML predefined entities *
1575 ************************************************************************/
1578 static const htmlEntityDesc html40EntitiesTable[] = {
1580 * the 4 absolute ones, plus apostrophe.
1582 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1583 { 38, "amp", "ampersand, U+0026 ISOnum" },
1584 { 39, "apos", "single quote" },
1585 { 60, "lt", "less-than sign, U+003C ISOnum" },
1586 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1589 * A bunch still in the 128-255 range
1590 * Replacing them depend really on the charset used.
1592 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1593 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1594 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1595 { 163, "pound","pound sign, U+00A3 ISOnum" },
1596 { 164, "curren","currency sign, U+00A4 ISOnum" },
1597 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1598 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1599 { 167, "sect", "section sign, U+00A7 ISOnum" },
1600 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1601 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1602 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1603 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1604 { 172, "not", "not sign, U+00AC ISOnum" },
1605 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1606 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1607 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1608 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1609 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1610 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1611 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1612 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1613 { 181, "micro","micro sign, U+00B5 ISOnum" },
1614 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1615 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1616 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1617 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1618 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1619 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1620 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1621 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1622 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1623 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1624 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1625 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1626 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1627 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1628 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1629 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1630 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1631 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1632 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1633 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1634 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1635 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1636 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1637 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1638 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1639 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1640 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1641 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1642 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1643 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1644 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1645 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1646 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1647 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1648 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1649 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1650 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1651 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1652 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1653 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1654 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1655 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1656 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1657 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1658 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1659 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1660 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1661 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1662 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1663 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1664 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1665 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1666 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1667 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1668 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1669 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1670 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1671 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1672 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1673 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1674 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1675 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1676 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1677 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1678 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1679 { 247, "divide","division sign, U+00F7 ISOnum" },
1680 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1681 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1682 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1683 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1684 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1685 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1686 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1687 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1689 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1690 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1691 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1692 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1693 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1696 * Anything below should really be kept as entities references
1698 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1700 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1701 { 732, "tilde","small tilde, U+02DC ISOdia" },
1703 { 913, "Alpha","greek capital letter alpha, U+0391" },
1704 { 914, "Beta", "greek capital letter beta, U+0392" },
1705 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1706 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1707 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1708 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1709 { 919, "Eta", "greek capital letter eta, U+0397" },
1710 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1711 { 921, "Iota", "greek capital letter iota, U+0399" },
1712 { 922, "Kappa","greek capital letter kappa, U+039A" },
1713 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1714 { 924, "Mu", "greek capital letter mu, U+039C" },
1715 { 925, "Nu", "greek capital letter nu, U+039D" },
1716 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1717 { 927, "Omicron","greek capital letter omicron, U+039F" },
1718 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1719 { 929, "Rho", "greek capital letter rho, U+03A1" },
1720 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1721 { 932, "Tau", "greek capital letter tau, U+03A4" },
1722 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1723 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1724 { 935, "Chi", "greek capital letter chi, U+03A7" },
1725 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1726 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1728 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1729 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1730 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1731 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1732 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1733 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1734 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1735 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1736 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1737 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1738 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1739 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1740 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1741 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1742 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1743 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1744 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1745 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1746 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1747 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1748 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1749 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1750 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1751 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1752 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1753 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1754 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1755 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1757 { 8194, "ensp", "en space, U+2002 ISOpub" },
1758 { 8195, "emsp", "em space, U+2003 ISOpub" },
1759 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1760 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1761 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1762 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1763 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1764 { 8211, "ndash","en dash, U+2013 ISOpub" },
1765 { 8212, "mdash","em dash, U+2014 ISOpub" },
1766 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1767 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1768 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1769 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1770 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1771 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1772 { 8224, "dagger","dagger, U+2020 ISOpub" },
1773 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1775 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1776 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1778 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1780 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1781 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1783 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1784 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1786 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1787 { 8260, "frasl","fraction slash, U+2044 NEW" },
1789 { 8364, "euro", "euro sign, U+20AC NEW" },
1791 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1792 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1793 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1794 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1795 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1796 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1797 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1798 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1799 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1800 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1801 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1802 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1803 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1804 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1805 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1806 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1808 { 8704, "forall","for all, U+2200 ISOtech" },
1809 { 8706, "part", "partial differential, U+2202 ISOtech" },
1810 { 8707, "exist","there exists, U+2203 ISOtech" },
1811 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1812 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1813 { 8712, "isin", "element of, U+2208 ISOtech" },
1814 { 8713, "notin","not an element of, U+2209 ISOtech" },
1815 { 8715, "ni", "contains as member, U+220B ISOtech" },
1816 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1817 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1818 { 8722, "minus","minus sign, U+2212 ISOtech" },
1819 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1820 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1821 { 8733, "prop", "proportional to, U+221D ISOtech" },
1822 { 8734, "infin","infinity, U+221E ISOtech" },
1823 { 8736, "ang", "angle, U+2220 ISOamso" },
1824 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1825 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1826 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1827 { 8746, "cup", "union = cup, U+222A ISOtech" },
1828 { 8747, "int", "integral, U+222B ISOtech" },
1829 { 8756, "there4","therefore, U+2234 ISOtech" },
1830 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1831 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1832 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1833 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1834 { 8801, "equiv","identical to, U+2261 ISOtech" },
1835 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1836 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1837 { 8834, "sub", "subset of, U+2282 ISOtech" },
1838 { 8835, "sup", "superset of, U+2283 ISOtech" },
1839 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1840 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1841 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1842 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1843 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1844 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1845 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1846 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1847 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1848 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1849 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1850 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1851 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1852 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1854 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1855 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1856 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1857 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1861 /************************************************************************
1863 * Commodity functions to handle entities *
1865 ************************************************************************/
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> accordingly.  When the
 * reallocation fails the error is reported through htmlErrMemory(), the
 * old buffer is released and the *enclosing function* returns NULL, so
 * the macro may only be used where `ctxt` and `<buffer>_size` are in
 * scope and the enclosing function returns a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer,				\
                                   buffer##_size * sizeof(xmlChar));	\
    if (grown == NULL) {						\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = grown;							\
}
1884 * @name: the entity name
1886 * Lookup the given entity in EntitiesTable
1888 * TODO: the linear scan is really ugly, an hash table is really needed.
1890 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1892 const htmlEntityDesc *
1893 htmlEntityLookup(const xmlChar *name) {
1896 for (i = 0;i < (sizeof(html40EntitiesTable)/
1897 sizeof(html40EntitiesTable[0]));i++) {
1898 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1899 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1906 * htmlEntityValueLookup:
1907 * @value: the entity's unicode value
1909 * Lookup the given entity in EntitiesTable
1911 * TODO: the linear scan is really ugly, an hash table is really needed.
1913 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1915 const htmlEntityDesc *
1916 htmlEntityValueLookup(unsigned int value) {
1919 for (i = 0;i < (sizeof(html40EntitiesTable)/
1920 sizeof(html40EntitiesTable[0]));i++) {
1921 if (html40EntitiesTable[i].value >= value) {
1922 if (html40EntitiesTable[i].value > value)
1924 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1932 * @out: a pointer to an array of bytes to store the result
1933 * @outlen: the length of @out
1934 * @in: a pointer to an array of UTF-8 chars
1935 * @inlen: the length of @in
1937 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1938 * plus HTML entities block of chars out.
1940 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1941 * The value of @inlen after return is the number of octets consumed
1942 * as the return value is positive, else unpredictable.
1943 * The value of @outlen after return is the number of octets consumed.
1946 UTF8ToHtml(unsigned char* out, int *outlen,
1947 const unsigned char* in, int *inlen) {
1948 const unsigned char* processed = in;
1949 const unsigned char* outend;
1950 const unsigned char* outstart = out;
1951 const unsigned char* instart = in;
1952 const unsigned char* inend;
1956 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1959 * initialization nothing to do
1965 inend = in + (*inlen);
1966 outend = out + (*outlen);
1967 while (in < inend) {
1969 if (d < 0x80) { c= d; trailing= 0; }
1970 else if (d < 0xC0) {
1971 /* trailing byte in leading position */
1972 *outlen = out - outstart;
1973 *inlen = processed - instart;
1975 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1976 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1977 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1979 /* no chance for this in Ascii */
1980 *outlen = out - outstart;
1981 *inlen = processed - instart;
1985 if (inend - in < trailing) {
1989 for ( ; trailing; trailing--) {
1990 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1996 /* assertion: c is a single UTF-4 value */
1998 if (out + 1 >= outend)
2003 const htmlEntityDesc * ent;
2008 * Try to lookup a predefined HTML entity for it
2011 ent = htmlEntityValueLookup(c);
2013 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2019 if (out + 2 + len >= outend)
2022 memcpy(out, cp, len);
2028 *outlen = out - outstart;
2029 *inlen = processed - instart;
2034 * htmlEncodeEntities:
2035 * @out: a pointer to an array of bytes to store the result
2036 * @outlen: the length of @out
2037 * @in: a pointer to an array of UTF-8 chars
2038 * @inlen: the length of @in
2039 * @quoteChar: the quote character to escape (' or ") or zero.
2041 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2042 * plus HTML entities block of chars out.
2044 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2045 * The value of @inlen after return is the number of octets consumed
2046 * as the return value is positive, else unpredictable.
2047 * The value of @outlen after return is the number of octets consumed.
2050 htmlEncodeEntities(unsigned char* out, int *outlen,
2051 const unsigned char* in, int *inlen, int quoteChar) {
2052 const unsigned char* processed = in;
2053 const unsigned char* outend;
2054 const unsigned char* outstart = out;
2055 const unsigned char* instart = in;
2056 const unsigned char* inend;
2060 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2062 outend = out + (*outlen);
2063 inend = in + (*inlen);
2064 while (in < inend) {
2066 if (d < 0x80) { c= d; trailing= 0; }
2067 else if (d < 0xC0) {
2068 /* trailing byte in leading position */
2069 *outlen = out - outstart;
2070 *inlen = processed - instart;
2072 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2073 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2074 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2076 /* no chance for this in Ascii */
2077 *outlen = out - outstart;
2078 *inlen = processed - instart;
2082 if (inend - in < trailing)
2085 while (trailing--) {
2086 if (((d= *in++) & 0xC0) != 0x80) {
2087 *outlen = out - outstart;
2088 *inlen = processed - instart;
2095 /* assertion: c is a single UTF-4 value */
2096 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2097 (c != '&') && (c != '<') && (c != '>')) {
2102 const htmlEntityDesc * ent;
2108 * Try to lookup a predefined HTML entity for it
2110 ent = htmlEntityValueLookup(c);
2112 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2118 if (out + 2 + len > outend)
2121 memcpy(out, cp, len);
2127 *outlen = out - outstart;
2128 *inlen = processed - instart;
2132 /************************************************************************
2134 * Commodity functions to handle streams *
2136 ************************************************************************/
2139 * htmlNewInputStream:
2140 * @ctxt: an HTML parser context
2142 * Create a new input stream structure
2143 * Returns the new input stream or NULL
2145 static htmlParserInputPtr
2146 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2147 htmlParserInputPtr input;
2149 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2150 if (input == NULL) {
2151 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2154 memset(input, 0, sizeof(htmlParserInput));
2155 input->filename = NULL;
2156 input->directory = NULL;
2164 input->version = NULL;
2165 input->consumed = 0;
2171 /************************************************************************
2173 * Commodity functions, cleanup needed ? *
2175 ************************************************************************/
/*
 * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2193 * @ctxt: an HTML parser context
2195 * @len: the size of @str
2197 * Is this a sequence of blank chars that one can ignore ?
2199 * Returns 1 if ignorable 0 otherwise.
2202 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2205 xmlNodePtr lastChild;
2208 for (j = 0;j < len;j++)
2209 if (!(IS_BLANK_CH(str[j]))) return(0);
2211 if (CUR == 0) return(1);
2212 if (CUR != '<') return(0);
2213 if (ctxt->name == NULL)
2215 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2217 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2220 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2221 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2222 dtd = xmlGetIntSubset(ctxt->myDoc);
2223 if (dtd != NULL && dtd->ExternalID != NULL) {
2224 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2225 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2230 if (ctxt->node == NULL) return(0);
2231 lastChild = xmlGetLastChild(ctxt->node);
2232 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2233 lastChild = lastChild->prev;
2234 if (lastChild == NULL) {
2235 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2236 (ctxt->node->content != NULL)) return(0);
2237 /* keep ws in constructs like ...<b> </b>...
2238 for all tags "b" allowing PCDATA */
2239 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2240 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2244 } else if (xmlNodeIsText(lastChild)) {
2247 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2248 for all tags "p" allowing PCDATA */
2249 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2250 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2260 * @URI: URI for the dtd, or NULL
2261 * @ExternalID: the external ID of the DTD, or NULL
2263 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2266 * Returns a new document, do not initialize the DTD if not provided
2269 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2273 * Allocate a new document and fill the fields.
2275 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2277 htmlErrMemory(NULL, "HTML document creation failed\n");
2280 memset(cur, 0, sizeof(xmlDoc));
2282 cur->type = XML_HTML_DOCUMENT_NODE;
2283 cur->version = NULL;
2284 cur->intSubset = NULL;
2287 cur->children = NULL;
2288 cur->extSubset = NULL;
2290 cur->encoding = NULL;
2291 cur->standalone = 1;
2292 cur->compression = 0;
2295 cur->_private = NULL;
2296 cur->charset = XML_CHAR_ENCODING_UTF8;
2297 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2298 if ((ExternalID != NULL) ||
2300 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2306 * @URI: URI for the dtd, or NULL
2307 * @ExternalID: the external ID of the DTD, or NULL
2309 * Creates a new HTML document
2311 * Returns a new document
2314 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2315 if ((URI == NULL) && (ExternalID == NULL))
2316 return(htmlNewDocNoDtD(
2317 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2318 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2320 return(htmlNewDocNoDtD(URI, ExternalID));
2324 /************************************************************************
2326 * The parser itself *
2327 * Relates to http://www.w3.org/TR/html40 *
2329 ************************************************************************/
2331 /************************************************************************
2333 * The parser itself *
2335 ************************************************************************/
2337 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2340 * htmlParseHTMLName:
2341 * @ctxt: an HTML parser context
2343 * parse an HTML tag or attribute name, note that we convert it to lowercase
2344 * since HTML names are not case-sensitive.
2346 * Returns the Tag Name parsed or NULL
2349 static const xmlChar *
2350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2352 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2354 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2355 (CUR != ':') && (CUR != '.')) return(NULL);
2357 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2358 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2359 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2361 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2368 return(xmlDictLookup(ctxt->dict, loc, i));
2373 * htmlParseHTMLName_nonInvasive:
2374 * @ctxt: an HTML parser context
2376 * parse an HTML tag or attribute name, note that we convert it to lowercase
2377 * since HTML names are not case-sensitive, this doesn't consume the data
2378 * from the stream, it's a look-ahead
2380 * Returns the Tag Name parsed or NULL
2383 static const xmlChar *
2384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2386 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2388 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2389 (NXT(1) != ':')) return(NULL);
2391 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2392 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2393 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2394 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2395 else loc[i] = NXT(1+i);
2399 return(xmlDictLookup(ctxt->dict, loc, i));
2405 * @ctxt: an HTML parser context
2407 * parse an HTML name, this routine is case sensitive.
2409 * Returns the Name parsed or NULL
2412 static const xmlChar *
2413 htmlParseName(htmlParserCtxtPtr ctxt) {
2421 * Accelerator for simple ASCII names
2423 in = ctxt->input->cur;
2424 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2425 ((*in >= 0x41) && (*in <= 0x5A)) ||
2426 (*in == '_') || (*in == ':')) {
2428 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2429 ((*in >= 0x41) && (*in <= 0x5A)) ||
2430 ((*in >= 0x30) && (*in <= 0x39)) ||
2431 (*in == '_') || (*in == '-') ||
2432 (*in == ':') || (*in == '.'))
2434 if ((*in > 0) && (*in < 0x80)) {
2435 count = in - ctxt->input->cur;
2436 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2437 ctxt->input->cur = in;
2438 ctxt->nbChars += count;
2439 ctxt->input->col += count;
2443 return(htmlParseNameComplex(ctxt));
2446 static const xmlChar *
2447 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2453 * Handler for more complex cases
2457 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2458 (!IS_LETTER(c) && (c != '_') &&
2463 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2464 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2465 (c == '.') || (c == '-') ||
2466 (c == '_') || (c == ':') ||
2467 (IS_COMBINING(c)) ||
2468 (IS_EXTENDER(c)))) {
2469 if (count++ > 100) {
2477 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2482 * htmlParseHTMLAttribute:
2483 * @ctxt: an HTML parser context
2484 * @stop: a char stop value
2486 * parse an HTML attribute value till the stop (quote), if
2487 * stop is 0 then it stops at the first space or '>'
2489 * Returns the attribute parsed or NULL
2493 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
/*
 * Builds the attribute value in a heap buffer (buffer, written through
 * out) while consuming input until CUR is 0 or equals stop.  Character
 * references are resolved with htmlParseCharRef() and entity references
 * with htmlParseEntityRef(); code points are stored as UTF-8.
 * NOTE(review): this excerpt lacks several lines (closing braces, the
 * call that actually grows the buffer, the final return) - confirm the
 * details against the complete file.
 */
2494 xmlChar *buffer = NULL;
2495 int buffer_size = 0;
2496 xmlChar *out = NULL;
2497 const xmlChar *name = NULL;
2498 const xmlChar *cur = NULL;
2499 const htmlEntityDesc * ent;
2502 * allocate a translation buffer.
2504 buffer_size = HTML_PARSER_BUFFER_SIZE;
2505 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2506 if (buffer == NULL) {
2507 htmlErrMemory(ctxt, "buffer allocation failed\n");
2513 * Ok loop until we reach one of the ending chars
2515 while ((CUR != 0) && (CUR != stop)) {
/* An unquoted value (stop == 0) also ends at '>' or at a blank. */
2516 if ((stop == 0) && (CUR == '>')) break;
2517 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2519 if (NXT(1) == '#') {
2523 c = htmlParseCharRef(ctxt);
/*
 * UTF-8 encoding of c: the lead byte is written first and bits is set so
 * that the loop below emits 0, 1, 2 or 3 continuation bytes (bits = -6,
 * 0, 6 or 12 respectively).
 */
2525 { *out++ = c; bits= -6; }
2527 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2528 else if (c < 0x10000)
2529 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2531 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2533 for ( ; bits >= 0; bits-= 6) {
2534 *out++ = ((c >> bits) & 0x3F) | 0x80;
/*
 * Fewer than 100 bytes left: the write offset is saved so that out can be
 * recomputed from buffer afterwards (presumably because buffer is
 * reallocated on a line not shown here - TODO confirm).
 */
2537 if (out - buffer > buffer_size - 100) {
2538 int indx = out - buffer;
2541 out = &buffer[indx];
/* Named entity reference: name receives the parsed name, ent its entry. */
2544 ent = htmlParseEntityRef(ctxt, &name);
2547 if (out - buffer > buffer_size - 100) {
2548 int indx = out - buffer;
2551 out = &buffer[indx];
2553 } else if (ent == NULL) {
2557 if (out - buffer > buffer_size - 100) {
2558 int indx = out - buffer;
2561 out = &buffer[indx];
2569 if (out - buffer > buffer_size - 100) {
2570 int indx = out - buffer;
2573 out = &buffer[indx];
/* Same UTF-8 encoding sequence as for character references above. */
2577 { *out++ = c; bits= -6; }
2579 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2580 else if (c < 0x10000)
2581 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2583 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2585 for ( ; bits >= 0; bits-= 6) {
2586 *out++ = ((c >> bits) & 0x3F) | 0x80;
2594 if (out - buffer > buffer_size - 100) {
2595 int indx = out - buffer;
2598 out = &buffer[indx];
/* Third copy of the UTF-8 encoding sequence. */
2602 { *out++ = c; bits= -6; }
2604 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2605 else if (c < 0x10000)
2606 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2608 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2610 for ( ; bits >= 0; bits-= 6) {
2611 *out++ = ((c >> bits) & 0x3F) | 0x80;
2621 * htmlParseEntityRef:
2622 * @ctxt: an HTML parser context
2623 * @str: location to store the entity name
2625 * parse an HTML ENTITY references
2627 * [68] EntityRef ::= '&' Name ';'
2629 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2630 * if non-NULL *str will have to be freed by the caller.
2632 const htmlEntityDesc *
2633 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2634 const xmlChar *name;
2635 const htmlEntityDesc * ent = NULL;
2637 if (str != NULL) *str = NULL;
2638 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2642 name = htmlParseName(ctxt);
2644 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2645 "htmlParseEntityRef: no name\n", NULL, NULL);
2653 * Lookup the entity in the table.
2655 ent = htmlEntityLookup(name);
2656 if (ent != NULL) /* OK that's ugly !!! */
2659 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2660 "htmlParseEntityRef: expecting ';'\n",
2671 * htmlParseAttValue:
2672 * @ctxt: an HTML parser context
2674 * parse a value for an attribute
2675 * Note: the parser won't do substitution of entities here, this
2676 * will be handled later in xmlStringGetNodeList, unless it was
2677 * asked for ctxt->replaceEntities != 0
2679 * Returns the AttValue parsed or NULL.
2683 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2684 xmlChar *ret = NULL;
2688 ret = htmlParseHTMLAttribute(ctxt, '"');
2690 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2691 "AttValue: \" expected\n", NULL, NULL);
2694 } else if (CUR == '\'') {
2696 ret = htmlParseHTMLAttribute(ctxt, '\'');
2698 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2699 "AttValue: ' expected\n", NULL, NULL);
2704 * That's an HTMLism, the attribute value may not be quoted
2706 ret = htmlParseHTMLAttribute(ctxt, 0);
2708 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2709 "AttValue: no value found\n", NULL, NULL);
2716 * htmlParseSystemLiteral:
2717 * @ctxt: an HTML parser context
2719 * parse an HTML Literal
2721 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2723 * Returns the SystemLiteral parsed or NULL
2727 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2729 xmlChar *ret = NULL;
2734 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2736 if (!IS_CHAR_CH(CUR)) {
2737 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2738 "Unfinished SystemLiteral\n", NULL, NULL);
2740 ret = xmlStrndup(q, CUR_PTR - q);
2743 } else if (CUR == '\'') {
2746 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2748 if (!IS_CHAR_CH(CUR)) {
2749 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2750 "Unfinished SystemLiteral\n", NULL, NULL);
2752 ret = xmlStrndup(q, CUR_PTR - q);
2756 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2757 " or ' expected\n", NULL, NULL);
2764 * htmlParsePubidLiteral:
2765 * @ctxt: an HTML parser context
2767 * parse an HTML public literal
2769 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2771 * Returns the PubidLiteral parsed or NULL.
2775 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2777 xmlChar *ret = NULL;
2779 * Name ::= (Letter | '_') (NameChar)*
2784 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2786 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2787 "Unfinished PubidLiteral\n", NULL, NULL);
2789 ret = xmlStrndup(q, CUR_PTR - q);
2792 } else if (CUR == '\'') {
2795 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2798 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2799 "Unfinished PubidLiteral\n", NULL, NULL);
2801 ret = xmlStrndup(q, CUR_PTR - q);
2805 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2806 "PubidLiteral \" or ' expected\n", NULL, NULL);
2814 * @ctxt: an HTML parser context
2816 * parse the content of an HTML SCRIPT or STYLE element
2817 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2818 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2819 * http://www.w3.org/TR/html4/types.html#type-script
2820 * http://www.w3.org/TR/html4/types.html#h-6.15
2821 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2823 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2824 * element and the value of intrinsic event attributes. User agents must
2825 * not evaluate script data as HTML markup but instead must pass it on as
2826 * data to a script engine.
2828 * - The content is passed like CDATA
2829 * - the attributes for style and scripting "onXXX" are also described
2830 * as CDATA but SGML allows entities references in attributes so their
2831 * processing is identical as other attributes
2834 htmlParseScript(htmlParserCtxtPtr ctxt) {
/*
 * Copies the raw content of the current element into the stack buffer
 * buf.  Each time nbchar reaches HTML_PARSER_BIG_BUFFER_SIZE the buffer
 * is handed to the SAX cdataBlock callback, or to characters when
 * cdataBlock is not set; what remains at the end is delivered the same
 * way.
 * NOTE(review): several lines (declarations, braces, break statements)
 * are missing from this excerpt - confirm details against the full file.
 */
2835 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2841 while (IS_CHAR_CH(cur)) {
2842 if ((cur == '<') && (NXT(1) == '/')) {
2844 * One should break here, the specification is clear:
2845 * Authors should therefore escape "</" within the content.
2846 * Escape mechanisms are specific to each scripting or
2847 * style sheet language.
2849 * In recovery mode, only break if end tag match the
2850 * current tag, effectively ignoring all tags inside the
2851 * script/style block and treating the entire block as
/*
 * Recovery mode: the text following "</" is compared, ignoring case and
 * over xmlStrlen(ctxt->name) characters, with the current element name.
 */
2854 if (ctxt->recovery) {
2855 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2856 xmlStrlen(ctxt->name)) == 0)
2860 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2861 "Element %s embeds close tag\n",
/* Outside recovery mode: test whether "</" is followed by an ASCII letter. */
2865 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2866 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2872 COPY_BUF(l,buf,nbchar,cur);
2873 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2874 if (ctxt->sax->cdataBlock!= NULL) {
2876 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2878 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2879 } else if (ctxt->sax->characters != NULL) {
2880 ctxt->sax->characters(ctxt->userData, buf, nbchar);
/*
 * The loop stopped on a non-char: report it, except for a zero byte met
 * while ctxt->progressive is set.
 */
2889 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2890 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2891 "Invalid char in CDATA 0x%X\n", cur);
2892 if (ctxt->input->cur < ctxt->input->end) {
/* Deliver whatever is still buffered, unless SAX is absent or disabled. */
2897 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2898 if (ctxt->sax->cdataBlock!= NULL) {
2900 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2902 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2903 } else if (ctxt->sax->characters != NULL) {
2904 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2911 * htmlParseCharData:
2912 * @ctxt: an HTML parser context
2914 * parse a CharData section.
2915 * if we are within a CDATA section ']]>' marks an end of section.
2917 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2921 htmlParseCharData(htmlParserCtxtPtr ctxt) {
/*
 * Accumulates character data in the stack buffer buf and delivers it
 * through SAX: areBlanks() selects the ignorableWhitespace callback,
 * otherwise htmlCheckParagraph() is called and the data goes to
 * characters.  A '<' or '&' ends the run unless the same character is
 * pending in ctxt->token.
 * NOTE(review): declarations, braces and part of the loop condition are
 * missing from this excerpt - confirm details against the full file.
 */
2922 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2929 while (((cur != '<') || (ctxt->token == '<')) &&
2930 ((cur != '&') || (ctxt->token == '&')) &&
2932 if (!(IS_CHAR(cur))) {
2933 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2934 "Invalid char in CDATA 0x%X\n", cur);
2936 COPY_BUF(l,buf,nbchar,cur);
/* Buffer full: flush it before reading further. */
2938 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2940 * Ok the segment is to be consumed as chars.
2942 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2943 if (areBlanks(ctxt, buf, nbchar)) {
2944 if (ctxt->sax->ignorableWhitespace != NULL)
2945 ctxt->sax->ignorableWhitespace(ctxt->userData,
2948 htmlCheckParagraph(ctxt);
2949 if (ctxt->sax->characters != NULL)
2950 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2957 if (chunk > HTML_PARSER_BUFFER_SIZE) {
/* Final flush, same dispatch as inside the loop. */
2973 * Ok the segment is to be consumed as chars.
2975 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2976 if (areBlanks(ctxt, buf, nbchar)) {
2977 if (ctxt->sax->ignorableWhitespace != NULL)
2978 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2980 htmlCheckParagraph(ctxt);
2981 if (ctxt->sax->characters != NULL)
2982 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2990 ctxt->instate = XML_PARSER_EOF;
2995 * htmlParseExternalID:
2996 * @ctxt: an HTML parser context
2997 * @publicID: a xmlChar** receiving PubidLiteral
2999 * Parse an External ID or a Public ID
3001 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3002 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3004 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3006 * Returns the function returns SystemLiteral and in the second
3007 * case publicID receives PubidLiteral, is strict is off
3008 * it is possible to return NULL and have publicID set.
3012 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3013 xmlChar *URI = NULL;
3015 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3016 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3017 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3019 if (!IS_BLANK_CH(CUR)) {
3020 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3021 "Space required after 'SYSTEM'\n", NULL, NULL);
3024 URI = htmlParseSystemLiteral(ctxt);
3026 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3027 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3029 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3030 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3031 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3033 if (!IS_BLANK_CH(CUR)) {
3034 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3035 "Space required after 'PUBLIC'\n", NULL, NULL);
3038 *publicID = htmlParsePubidLiteral(ctxt);
3039 if (*publicID == NULL) {
3040 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3041 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3045 if ((CUR == '"') || (CUR == '\'')) {
3046 URI = htmlParseSystemLiteral(ctxt);
3054 * @ctxt: an HTML parser context
3056 * parse an XML Processing Instruction.
3058 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3061 htmlParsePI(htmlParserCtxtPtr ctxt) {
/*
 * Parses a processing instruction starting at "<?": the target name is
 * read with htmlParseName(), the following data is collected in a heap
 * buffer until '>' and the result is reported through the SAX
 * processingInstruction callback.  ctxt->instate is set to
 * XML_PARSER_PI for the duration and restored from state afterwards.
 * NOTE(review): declarations, braces and several statements are missing
 * from this excerpt - confirm details against the full file.
 */
3062 xmlChar *buf = NULL;
3064 int size = HTML_PARSER_BUFFER_SIZE;
3066 const xmlChar *target;
3067 xmlParserInputState state;
3070 if ((RAW == '<') && (NXT(1) == '?')) {
3071 state = ctxt->instate;
3072 ctxt->instate = XML_PARSER_PI;
3074 * this is a Processing Instruction.
3080 * Parse the target name and check for special support like
3083 target = htmlParseName(ctxt);
3084 if (target != NULL) {
3091 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3092 (ctxt->sax->processingInstruction != NULL))
3093 ctxt->sax->processingInstruction(ctxt->userData,
3095 ctxt->instate = state;
3098 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3100 htmlErrMemory(ctxt, NULL);
3101 ctxt->instate = state;
3105 if (!IS_BLANK(cur)) {
3106 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3107 "ParsePI: PI %s space expected\n", target, NULL);
/* The data runs up to the first '>' (or the first invalid char). */
3111 while (IS_CHAR(cur) && (cur != '>')) {
/* Grow the buffer while fewer than 5 spare bytes remain. */
3112 if (len + 5 >= size) {
3116 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3118 htmlErrMemory(ctxt, NULL);
3120 ctxt->instate = state;
3130 COPY_BUF(l,buf,len,cur);
3141 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3142 "ParsePI: PI %s never end ...\n", target, NULL);
3149 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3150 (ctxt->sax->processingInstruction != NULL))
3151 ctxt->sax->processingInstruction(ctxt->userData,
3156 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3157 "PI is not started correctly", NULL, NULL);
3159 ctxt->instate = state;
3165 * @ctxt: an HTML parser context
3167 * Parse an XML (SGML) comment <!-- .... -->
3169 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3172 htmlParseComment(htmlParserCtxtPtr ctxt) {
/*
 * Parses a comment starting at "<!--": its text is collected in a heap
 * buffer and passed to the SAX comment callback.  ctxt->instate is set
 * to XML_PARSER_COMMENT for the duration and restored from state
 * afterwards.  Returns immediately when the input is not at "<!--".
 * NOTE(review): declarations, braces and several statements are missing
 * from this excerpt - confirm details against the full file.
 */
3173 xmlChar *buf = NULL;
3175 int size = HTML_PARSER_BUFFER_SIZE;
3179 xmlParserInputState state;
3182 * Check that there is a comment right here.
3184 if ((RAW != '<') || (NXT(1) != '!') ||
3185 (NXT(2) != '-') || (NXT(3) != '-')) return;
3187 state = ctxt->instate;
3188 ctxt->instate = XML_PARSER_COMMENT;
3191 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3193 htmlErrMemory(ctxt, "buffer allocation failed\n");
3194 ctxt->instate = state;
/*
 * q, r and cur look like a three-character sliding window; q is the one
 * copied into buf below (TODO confirm against the missing lines).
 */
3203 while (IS_CHAR(cur) &&
3205 (r != '-') || (q != '-'))) {
/* Grow the buffer while fewer than 5 spare bytes remain. */
3206 if (len + 5 >= size) {
3210 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3213 htmlErrMemory(ctxt, "growing buffer failed\n");
3214 ctxt->instate = state;
3219 COPY_BUF(ql,buf,len,q);
/* The loop stopped on an invalid char: the comment was never closed. */
3233 if (!IS_CHAR(cur)) {
3234 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3235 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3239 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3240 (!ctxt->disableSAX))
3241 ctxt->sax->comment(ctxt->userData, buf);
3244 ctxt->instate = state;
3249 * @ctxt: an HTML parser context
3251 * parse Reference declarations
3253 * [66] CharRef ::= '&#' [0-9]+ ';' |
3254 * '&#x' [0-9a-fA-F]+ ';'
3256 * Returns the value parsed (as an int)
3259 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3262 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3263 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3264 "htmlParseCharRef: context error\n",
3268 if ((CUR == '&') && (NXT(1) == '#') &&
3269 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3271 while (CUR != ';') {
3272 if ((CUR >= '0') && (CUR <= '9'))
3273 val = val * 16 + (CUR - '0');
3274 else if ((CUR >= 'a') && (CUR <= 'f'))
3275 val = val * 16 + (CUR - 'a') + 10;
3276 else if ((CUR >= 'A') && (CUR <= 'F'))
3277 val = val * 16 + (CUR - 'A') + 10;
3279 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3280 "htmlParseCharRef: missing semicolon\n",
3288 } else if ((CUR == '&') && (NXT(1) == '#')) {
3290 while (CUR != ';') {
3291 if ((CUR >= '0') && (CUR <= '9'))
3292 val = val * 10 + (CUR - '0');
3294 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3295 "htmlParseCharRef: missing semicolon\n",
3304 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3305 "htmlParseCharRef: invalid value\n", NULL, NULL);
3308 * Check the value IS_CHAR ...
3313 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3314 "htmlParseCharRef: invalid xmlChar value %d\n",
/*
 * htmlParseDocTypeDecl: parse a "<!DOCTYPE ...>" declaration.
 *
 * The caller has already recognised the "<!DOCTYPE" prefix.  The name is
 * read with htmlParseName(); the external identifier is read with
 * htmlParseExternalID(), which returns the system URI and stores the
 * public identifier through its second argument.  A missing name raises
 * XML_ERR_NAME_REQUIRED and an unterminated declaration raises
 * XML_ERR_DOCTYPE_NOT_FINISHED; neither attempts to resynchronise.
 *
 * The result is forwarded to ctxt->sax->internalSubset() unless SAX
 * callbacks are disabled.  URI and ExternalID are heap strings owned by
 * this function and are released before returning; `name` is not freed
 * here.
 */
3322 * htmlParseDocTypeDecl:
3323 * @ctxt: an HTML parser context
3325 * parse a DOCTYPE declaration
3327 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3328 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3332 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3333 const xmlChar *name;
3334 xmlChar *ExternalID = NULL;
3335 xmlChar *URI = NULL;
3338 * We know that '<!DOCTYPE' has been detected.
3345 * Parse the DOCTYPE name.
3347 name = htmlParseName(ctxt);
3349 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3350 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3354 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3360 * Check for SystemID and ExternalID
3362 URI = htmlParseExternalID(ctxt, &ExternalID);
3366 * We should be at the end of the DOCTYPE declaration.
3369 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3370 "DOCTYPE improperly terminated\n", NULL, NULL);
3371 /* We shouldn't try to resynchronize ... */
3376 * Create or update the document accordingly to the DOCTYPE
3378 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3379 (!ctxt->disableSAX))
3380 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3383 * Cleanup, since we don't use all those identifiers
3385 if (URI != NULL) xmlFree(URI);
3386 if (ExternalID != NULL) xmlFree(ExternalID);
/*
 * htmlParseAttribute: parse one attribute of a start tag.
 *
 * The attribute name comes from htmlParseHTMLName(); failure to read one
 * is reported as XML_ERR_NAME_REQUIRED.  The value, when there is one, is
 * produced by htmlParseAttValue() and handed back through @value.
 * NOTE(review): the caller in htmlParseStartTag releases the value with
 * xmlFree(), so the value is presumably heap-allocated and owned by the
 * caller -- confirm against htmlParseAttValue().
 */
3390 * htmlParseAttribute:
3391 * @ctxt: an HTML parser context
3392 * @value: a xmlChar ** used to store the value of the attribute
3394 * parse an attribute
3396 * [41] Attribute ::= Name Eq AttValue
3398 * [25] Eq ::= S? '=' S?
3402 * [NS 11] Attribute ::= QName Eq AttValue
3404 * Also the case QName == xmlns:??? is handled independently as a namespace
3407 * Returns the attribute name, and the value in *value.
3410 static const xmlChar *
3411 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3412 const xmlChar *name;
3413 xmlChar *val = NULL;
3416 name = htmlParseHTMLName(ctxt);
3418 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3419 "error parsing attribute name\n", NULL, NULL);
3430 val = htmlParseAttValue(ctxt);
/*
 * htmlCheckEncodingDirect: switch the parser input to the encoding named
 * by @encoding.
 *
 * Nothing is attempted when @ctxt or @encoding is NULL, or when the
 * HTML_PARSE_IGNORE_ENC option is set.  An encoding already recorded on
 * the input is also tested up front ("do not change encoding").
 *
 * Leading spaces and tabs of the name are skipped and a copy of it is
 * stored in ctxt->input->encoding.  The name is looked up with
 * xmlParseCharEncoding(); a UTF-16 or UCS-4 result while the input buffer
 * has no encoder is reported as XML_ERR_INVALID_ENCODING.  Names outside
 * the built-in set are looked up with xmlFindCharEncodingHandler(), and
 * XML_ERR_UNSUPPORTED_ENCODING is raised, presumably when no handler is
 * found.  ctxt->charset is set to XML_CHAR_ENCODING_UTF8 after a switch.
 *
 * When the input buffer has an encoder plus both raw and decoded buffers,
 * the bytes already consumed are removed from the decoded buffer,
 * xmlCharEncInFunc() converts pending raw bytes into it, and the input
 * cursor is reset to the start of the decoded buffer.
 */
3438 * htmlCheckEncodingDirect:
3439 * @ctxt: an HTML parser context
3440 * @attvalue: the attribute value
3442 * Checks an attribute value to detect
3444 * If a new encoding is detected the parser is switched to decode
3448 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3450 if ((ctxt == NULL) || (encoding == NULL) ||
3451 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3454 /* do not change encoding */
3455 if (ctxt->input->encoding != NULL)
/* NOTE(review): encoding was already tested against NULL by the guard above. */
3458 if (encoding != NULL) {
3459 xmlCharEncoding enc;
3460 xmlCharEncodingHandlerPtr handler;
3462 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3464 if (ctxt->input->encoding != NULL)
3465 xmlFree((xmlChar *) ctxt->input->encoding);
3466 ctxt->input->encoding = xmlStrdup(encoding);
3468 enc = xmlParseCharEncoding((const char *) encoding);
3470 * registered set of known encodings
3472 if (enc != XML_CHAR_ENCODING_ERROR) {
/* A 16/32-bit encoding announced from inside byte-oriented text is bogus. */
3473 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3474 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3475 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3476 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3477 (ctxt->input->buf != NULL) &&
3478 (ctxt->input->buf->encoder == NULL)) {
3479 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3480 "htmlCheckEncoding: wrong encoding meta\n",
3483 xmlSwitchEncoding(ctxt, enc);
3485 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3488 * fallback for unknown encodings
3490 handler = xmlFindCharEncodingHandler((const char *) encoding);
3491 if (handler != NULL) {
3492 xmlSwitchToEncoding(ctxt, handler);
3493 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3495 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3496 "htmlCheckEncoding: unknown encoding %s\n",
3501 if ((ctxt->input->buf != NULL) &&
3502 (ctxt->input->buf->encoder != NULL) &&
3503 (ctxt->input->buf->raw != NULL) &&
3504 (ctxt->input->buf->buffer != NULL)) {
3509 * convert as much as possible to the parser reading buffer.
/* Drop what the parser has already consumed, then refill from raw. */
3511 processed = ctxt->input->cur - ctxt->input->base;
3512 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3513 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3514 ctxt->input->buf->buffer,
3515 ctxt->input->buf->raw);
3517 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3518 "htmlCheckEncoding: encoder error\n",
3522 ctxt->input->cur = ctxt->input->buf->buffer->content;
3524 &ctxt->input->base[ctxt->input->buf->buffer->use];
/*
 * htmlCheckEncoding: extract the charset from the content of an
 * http-equiv Content-Type meta attribute and apply it.
 *
 * @attvalue is searched case-insensitively for "charset".  If the text
 * that follows starts with a blank, the '=' sign is searched for again
 * from the beginning of @attvalue.  Only when an '=' is found is the
 * remainder passed to htmlCheckEncodingDirect().
 * NOTE(review): the pointer adjustments between these steps are on lines
 * not visible here; presumably they step over "charset" and the '='.
 */
3530 * htmlCheckEncoding:
3531 * @ctxt: an HTML parser context
3532 * @attvalue: the attribute value
3534 * Checks an http-equiv attribute from a Meta tag to detect
3536 * If a new encoding is detected the parser is switched to decode
3540 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3541 const xmlChar *encoding;
3546 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3547 if (encoding != NULL) {
3553 if (encoding && IS_BLANK_CH(*encoding))
3554 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3555 if (encoding && *encoding == '=') {
3557 htmlCheckEncodingDirect(ctxt, encoding);
/*
 * htmlCheckMeta: inspect the attributes of a <meta> element for an
 * encoding declaration.
 *
 * @atts is walked as name/value pairs until a NULL name is met.  All
 * comparisons are case-insensitive:
 *   - http-equiv="Content-Type" marks the element as a content-type meta;
 *   - charset=...  is applied at once via htmlCheckEncodingDirect();
 *   - content=...  is remembered.
 * After the walk, a remembered content value is passed to
 * htmlCheckEncoding() only if the http-equiv marker was also seen.
 * Nothing is done when @ctxt or @atts is NULL.
 */
3563 * @ctxt: an HTML parser context
3564 * @atts: the attributes values
3566 * Checks an attributes from a Meta tag
3569 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3571 const xmlChar *att, *value;
3573 const xmlChar *content = NULL;
3575 if ((ctxt == NULL) || (atts == NULL))
3580 while (att != NULL) {
3582 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3583 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3585 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3586 htmlCheckEncodingDirect(ctxt, value);
3587 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
/* The content value only matters together with http-equiv=Content-Type. */
3591 if ((http) && (content != NULL))
3592 htmlCheckEncoding(ctxt, content);
3597 * htmlParseStartTag:
3598 * @ctxt: an HTML parser context
3600 * parse a start of tag either for rule element or
3601 * EmptyElement. In both case we don't parse the tag closing chars.
3603 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3605 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3609 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3611 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3613 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3617 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3618 const xmlChar *name;
3619 const xmlChar *attname;
3621 const xmlChar **atts;
3628 if (ctxt->instate == XML_PARSER_EOF)
3630 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3631 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3632 "htmlParseStartTag: context error\n", NULL, NULL);
3635 if (CUR != '<') return -1;
3639 maxatts = ctxt->maxatts;
3642 name = htmlParseHTMLName(ctxt);
3644 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3645 "htmlParseStartTag: invalid element name\n",
3647 /* Dump the bogus tag like browsers do */
3648 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3649 (ctxt->instate != XML_PARSER_EOF))
3653 if (xmlStrEqual(name, BAD_CAST"meta"))
3657 * Check for auto-closure of HTML elements.
3659 htmlAutoClose(ctxt, name);
3662 * Check for implied HTML elements.
3664 htmlCheckImplied(ctxt, name);
3667 * Avoid html at any level > 0, head at any level != 1
3668 * or any attempt to recurse body
3670 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3671 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3672 "htmlParseStartTag: misplaced <html> tag\n",
3677 if ((ctxt->nameNr != 1) &&
3678 (xmlStrEqual(name, BAD_CAST"head"))) {
3679 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3680 "htmlParseStartTag: misplaced <head> tag\n",
3685 if (xmlStrEqual(name, BAD_CAST"body")) {
3687 for (indx = 0;indx < ctxt->nameNr;indx++) {
3688 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3689 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3690 "htmlParseStartTag: misplaced <body> tag\n",
3699 * Now parse the attributes, it ends up with the ending
3704 while ((IS_CHAR_CH(CUR)) &&
3706 ((CUR != '/') || (NXT(1) != '>'))) {
3707 long cons = ctxt->nbChars;
3710 attname = htmlParseAttribute(ctxt, &attvalue);
3711 if (attname != NULL) {
3714 * Well formedness requires at most one declaration of an attribute
3716 for (i = 0; i < nbatts;i += 2) {
3717 if (xmlStrEqual(atts[i], attname)) {
3718 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3719 "Attribute %s redefined\n", attname, NULL);
3720 if (attvalue != NULL)
3727 * Add the pair to atts
3730 maxatts = 22; /* allow for 10 attrs by default */
3731 atts = (const xmlChar **)
3732 xmlMalloc(maxatts * sizeof(xmlChar *));
3734 htmlErrMemory(ctxt, NULL);
3735 if (attvalue != NULL)
3740 ctxt->maxatts = maxatts;
3741 } else if (nbatts + 4 > maxatts) {
3745 n = (const xmlChar **) xmlRealloc((void *) atts,
3746 maxatts * sizeof(const xmlChar *));
3748 htmlErrMemory(ctxt, NULL);
3749 if (attvalue != NULL)
3755 ctxt->maxatts = maxatts;
3757 atts[nbatts++] = attname;
3758 atts[nbatts++] = attvalue;
3759 atts[nbatts] = NULL;
3760 atts[nbatts + 1] = NULL;
3763 if (attvalue != NULL)
3765 /* Dump the bogus attribute string up to the next blank or
3766 * the end of the tag. */
3767 while ((IS_CHAR_CH(CUR)) &&
3768 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3769 ((CUR != '/') || (NXT(1) != '>')))
3775 if (cons == ctxt->nbChars) {
3776 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3777 "htmlParseStartTag: problem parsing attributes\n",
3784 * Handle specific association to the META tag
3786 if (meta && (nbatts != 0))
3787 htmlCheckMeta(ctxt, atts);
3790 * SAX: Start of Element !
3793 htmlnamePush(ctxt, name);
3794 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3796 ctxt->sax->startElement(ctxt->userData, name, atts);
3798 ctxt->sax->startElement(ctxt->userData, name, NULL);
3803 for (i = 1;i < nbatts;i += 2) {
3804 if (atts[i] != NULL)
3805 xmlFree((xmlChar *) atts[i]);
/*
 * htmlParseEndTag: parse a closing tag and unwind the element stack.
 *
 * Steps visible in this function:
 *   1. Require "</", otherwise raise XML_ERR_LTSLASH_REQUIRED.
 *   2. Read the tag name with htmlParseHTMLName().
 *   3. Require '>'; otherwise raise XML_ERR_GT_REQUIRED and, when
 *      ctxt->recovery is set, skip forward to the next '>' or NUL.
 *   4. An html/body/head end tag met while ctxt->depth > 0 belongs to a
 *      start tag that htmlParseStartTag discarded and is given special
 *      treatment (presumably it is not popped).
 *   5. Search the name stack from the top; a name that is not on the
 *      stack is reported as "Unexpected end tag".
 *   6. Run htmlAutoCloseOnClose(), report a mismatch if the top of the
 *      stack still differs from the name, and finally fire
 *      sax->endElement() and htmlNodeInfoPop() when the top matches.
 */
3814 * @ctxt: an HTML parser context
3816 * parse an end of tag
3818 * [42] ETag ::= '</' Name S? '>'
3822 * [NS 9] ETag ::= '</' QName S? '>'
3824 * Returns 1 if the current level should be closed.
3828 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3830 const xmlChar *name;
3831 const xmlChar *oldname;
3834 if ((CUR != '<') || (NXT(1) != '/')) {
3835 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3836 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3841 name = htmlParseHTMLName(ctxt);
3845 * We should definitely be at the ending "S? '>'" part
3848 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3849 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3850 "End tag : expected '>'\n", NULL, NULL);
3851 if (ctxt->recovery) {
3853 * We're not at the ending > !!
3854 * Error, unless in recover mode where we search forwards
3857 while (CUR != '\0' && CUR != '>') NEXT;
3864 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3867 if ((ctxt->depth > 0) &&
3868 (xmlStrEqual(name, BAD_CAST "html") ||
3869 xmlStrEqual(name, BAD_CAST "body") ||
3870 xmlStrEqual(name, BAD_CAST "head"))) {
3876 * If the name read is not one of the element in the parsing stack
3877 * then return, it's just an error.
3879 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3880 if (xmlStrEqual(name, ctxt->nameTab[i]))
3884 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3885 "Unexpected end tag : %s\n", name, NULL);
3891 * Check for auto-closure of HTML elements.
3894 htmlAutoCloseOnClose(ctxt, name);
3897 * Well formedness constraints, opening and closing must match.
3898 * With the exception that the autoclose may have popped stuff out
3901 if (!xmlStrEqual(name, ctxt->name)) {
3902 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3903 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3904 "Opening and ending tag mismatch: %s and %s\n",
3912 oldname = ctxt->name;
3913 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3914 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3915 ctxt->sax->endElement(ctxt->userData, name);
3916 htmlNodeInfoPop(ctxt);
/*
 * htmlParseReference: handle a "&..." reference met in element content
 * and deliver the resulting text through sax->characters().
 *
 * Returns at once if the current character is not '&'.
 *
 *   "&#..."  The code point from htmlParseCharRef() is encoded as UTF-8
 *            into the local `out` buffer (1 to 4 bytes: a lead byte,
 *            then 6-bit continuation groups) and emitted.
 *   "&name"  htmlParseEntityRef() is consulted.  In one branch a bare
 *            "&" is emitted (presumably when no name could be read).
 *            If the entity is unknown or has no positive value, "&"
 *            followed by the name is emitted verbatim; the trailing ';'
 *            is deliberately not re-emitted (that call is commented
 *            out).  Otherwise the entity value is UTF-8 encoded and
 *            emitted in the same way as a character reference.
 *
 * htmlCheckParagraph() is invoked before every emission.
 */
3928 * htmlParseReference:
3929 * @ctxt: an HTML parser context
3931 * parse and handle entity references in content,
3932 * this will end-up in a call to character() since this is either a
3933 * CharRef, or a predefined entity.
3936 htmlParseReference(htmlParserCtxtPtr ctxt) {
3937 const htmlEntityDesc * ent;
3939 const xmlChar *name;
3940 if (CUR != '&') return;
3942 if (NXT(1) == '#') {
3946 c = htmlParseCharRef(ctxt);
/* UTF-8 lead byte; `bits` is the shift of the first continuation group. */
3950 if (c < 0x80) { out[i++]= c; bits= -6; }
3951 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3952 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3953 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3955 for ( ; bits >= 0; bits-= 6) {
3956 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3960 htmlCheckParagraph(ctxt);
3961 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3962 ctxt->sax->characters(ctxt->userData, out, i);
3964 ent = htmlParseEntityRef(ctxt, &name);
3966 htmlCheckParagraph(ctxt);
3967 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3968 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
/* Unknown entity: pass the reference through as literal text. */
3971 if ((ent == NULL) || !(ent->value > 0)) {
3972 htmlCheckParagraph(ctxt);
3973 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3974 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3975 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3976 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3984 { out[i++]= c; bits= -6; }
3986 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3987 else if (c < 0x10000)
3988 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3990 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3992 for ( ; bits >= 0; bits-= 6) {
3993 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3997 htmlCheckParagraph(ctxt);
3998 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3999 ctxt->sax->characters(ctxt->userData, out, i);
/*
 * htmlParseContent: legacy content loop, used together with the
 * recursive htmlParseElement().
 *
 * On entry a private copy of the current element name (ctxt->name) and
 * the current name-stack depth (ctxt->nameNr) are taken; the copy is
 * released with xmlFree() on the paths that leave the function.  The
 * parser state is tested against XML_PARSER_EOF on every pass.  Each
 * pass dispatches on the upcoming input:
 *
 *   "</"                  htmlParseEndTag()
 *   "<" + letter, _ or :  the tag name is peeked with
 *                         htmlParseHTMLName_nonInvasive(); when
 *                         htmlCheckAutoClose() returns 1 for it,
 *                         htmlAutoClose() is run
 *   inside script/style   htmlParseScript()
 *   "<!DOCTYPE"           reported as misplaced, then parsed
 *   "<!--"                htmlParseComment()
 *   other "<"             htmlParseElement()
 *   "&"                   htmlParseReference()
 *   NUL                   htmlAutoCloseOnEnd()
 *   anything else         htmlParseCharData()
 *
 * Between the name peek and the dispatch, the saved name and depth are
 * compared with the live stack to detect that the element this call was
 * started for has been popped.  A pass that leaves ctxt->nbChars
 * unchanged while ctxt->node is set raises XML_ERR_INTERNAL_ERROR.
 */
4006 * @ctxt: an HTML parser context
4008 * Parse a content: comment, sub-element, reference or text.
4009 * Kept for compatibility with old code
4013 htmlParseContent(htmlParserCtxtPtr ctxt) {
4014 xmlChar *currentNode;
4016 const xmlChar *name;
4018 currentNode = xmlStrdup(ctxt->name);
4019 depth = ctxt->nameNr;
4021 long cons = ctxt->nbChars;
4025 if (ctxt->instate == XML_PARSER_EOF)
4029 * Our tag or one of it's parent or children is ending.
4031 if ((CUR == '<') && (NXT(1) == '/')) {
4032 if (htmlParseEndTag(ctxt) &&
4033 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4034 if (currentNode != NULL)
4035 xmlFree(currentNode);
4038 continue; /* while */
4041 else if ((CUR == '<') &&
4042 ((IS_ASCII_LETTER(NXT(1))) ||
4043 (NXT(1) == '_') || (NXT(1) == ':'))) {
4044 name = htmlParseHTMLName_nonInvasive(ctxt);
4046 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4047 "htmlParseStartTag: invalid element name\n",
4049 /* Dump the bogus tag like browsers do */
4050 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4053 if (currentNode != NULL)
4054 xmlFree(currentNode);
4058 if (ctxt->name != NULL) {
4059 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4060 htmlAutoClose(ctxt, name);
4067 * Has this node been popped out during parsing of
4070 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4071 (!xmlStrEqual(currentNode, ctxt->name)))
4073 if (currentNode != NULL) xmlFree(currentNode);
4077 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4078 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4080 * Handle SCRIPT/STYLE separately
4082 htmlParseScript(ctxt);
4085 * Sometimes DOCTYPE arrives in the middle of the document
4087 if ((CUR == '<') && (NXT(1) == '!') &&
4088 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4089 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4090 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4092 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4093 "Misplaced DOCTYPE declaration\n",
4094 BAD_CAST "DOCTYPE" , NULL);
4095 htmlParseDocTypeDecl(ctxt);
4099 * First case : a comment
4101 if ((CUR == '<') && (NXT(1) == '!') &&
4102 (NXT(2) == '-') && (NXT(3) == '-')) {
4103 htmlParseComment(ctxt);
4107 * Second case : a Processing Instruction.
4109 else if ((CUR == '<') && (NXT(1) == '?')) {
4114 * Third case : a sub-element.
4116 else if (CUR == '<') {
4117 htmlParseElement(ctxt);
4121 * Fourth case : a reference. If if has not been resolved,
4122 * parsing returns it's Name, create the node
4124 else if (CUR == '&') {
4125 htmlParseReference(ctxt);
4129 * Fifth case : end of the resource
4131 else if (CUR == 0) {
4132 htmlAutoCloseOnEnd(ctxt);
4137 * Last case, text. Note that References are handled directly.
4140 htmlParseCharData(ctxt);
4143 if (cons == ctxt->nbChars) {
4144 if (ctxt->node != NULL) {
4145 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4146 "detected an error in element content\n",
4154 if (currentNode != NULL) xmlFree(currentNode);
/*
 * htmlParseElement: legacy, recursive element parser.
 *
 * Flow visible in this function:
 *   - reject a NULL context or input (XML_ERR_INTERNAL_ERROR) and test
 *     the parser state against XML_PARSER_EOF;
 *   - when ctxt->record_info is set, note the start offset and line in a
 *     local htmlParserNodeInfo;
 *   - parse the start tag with htmlParseStartTag(); a -1 result or a
 *     NULL name is treated as failure;
 *   - look the element up with htmlTagLookup() and report
 *     XML_HTML_UNKNOWN_TAG ("Tag %s invalid"), presumably when the
 *     lookup fails;
 *   - "/>" closes the element immediately through sax->endElement();
 *   - a missing '>' raises XML_ERR_GT_REQUIRED; if the element is then
 *     still on top of the name stack its node info is recorded;
 *   - elements whose descriptor is flagged `empty` are closed at once;
 *   - otherwise htmlParseContent() is called repeatedly until it stops
 *     consuming input or the name stack becomes shallower than it was
 *     at entry;
 *   - finally end position info is recorded and, when the input is
 *     exhausted, htmlAutoCloseOnEnd() runs.
 */
4159 * @ctxt: an HTML parser context
4161 * parse an HTML element, this is highly recursive
4162 * this is kept for compatibility with previous code versions
4164 * [39] element ::= EmptyElemTag | STag content ETag
4166 * [41] Attribute ::= Name Eq AttValue
4170 htmlParseElement(htmlParserCtxtPtr ctxt) {
4171 const xmlChar *name;
4172 xmlChar *currentNode = NULL;
4173 const htmlElemDesc * info;
4174 htmlParserNodeInfo node_info;
4177 const xmlChar *oldptr;
4179 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4180 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4181 "htmlParseElement: context error\n", NULL, NULL);
4185 if (ctxt->instate == XML_PARSER_EOF)
4188 /* Capture start position */
4189 if (ctxt->record_info) {
4190 node_info.begin_pos = ctxt->input->consumed +
4191 (CUR_PTR - ctxt->input->base);
4192 node_info.begin_line = ctxt->input->line;
4195 failed = htmlParseStartTag(ctxt);
4197 if ((failed == -1) || (name == NULL)) {
4204 * Lookup the info for that element.
4206 info = htmlTagLookup(name);
4208 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4209 "Tag %s invalid\n", name, NULL);
4213 * Check for an Empty Element labeled the XML/SGML way
4215 if ((CUR == '/') && (NXT(1) == '>')) {
4217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4218 ctxt->sax->endElement(ctxt->userData, name);
4226 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4227 "Couldn't find end of Start Tag %s\n", name, NULL);
4230 * end of parsing of this node.
4232 if (xmlStrEqual(name, ctxt->name)) {
4238 * Capture end position and add node
4240 if (ctxt->record_info) {
4241 node_info.end_pos = ctxt->input->consumed +
4242 (CUR_PTR - ctxt->input->base);
4243 node_info.end_line = ctxt->input->line;
4244 node_info.node = ctxt->node;
4245 xmlParserAddNodeInfo(ctxt, &node_info);
4251 * Check for an Empty Element from DTD definition
4253 if ((info != NULL) && (info->empty)) {
4254 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4255 ctxt->sax->endElement(ctxt->userData, name);
4261 * Parse the content of the element:
4263 currentNode = xmlStrdup(ctxt->name);
4264 depth = ctxt->nameNr;
4265 while (IS_CHAR_CH(CUR)) {
4266 oldptr = ctxt->input->cur;
4267 htmlParseContent(ctxt);
4268 if (oldptr==ctxt->input->cur) break;
4269 if (ctxt->nameNr < depth) break;
4273 * Capture end position and add node
4275 if ( currentNode != NULL && ctxt->record_info ) {
4276 node_info.end_pos = ctxt->input->consumed +
4277 (CUR_PTR - ctxt->input->base);
4278 node_info.end_line = ctxt->input->line;
4279 node_info.node = ctxt->node;
4280 xmlParserAddNodeInfo(ctxt, &node_info);
4282 if (!IS_CHAR_CH(CUR)) {
4283 htmlAutoCloseOnEnd(ctxt);
4286 if (currentNode != NULL)
4287 xmlFree(currentNode);
/*
 * htmlParserFinishElementParsing: close the bookkeeping for the element
 * that just ended (helper of the non-recursive parser).
 *
 * When a current node exists and ctxt->record_info is set, the pending
 * entry ctxt->nodeInfo is completed with the current input offset, line
 * and node, stored with xmlParserAddNodeInfo(), and popped with
 * htmlNodeInfoPop().  If the input has no further valid character,
 * htmlAutoCloseOnEnd() is run.
 */
4291 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4293 * Capture end position and add node
4295 if ( ctxt->node != NULL && ctxt->record_info ) {
4296 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4297 (CUR_PTR - ctxt->input->base);
4298 ctxt->nodeInfo->end_line = ctxt->input->line;
4299 ctxt->nodeInfo->node = ctxt->node;
4300 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4301 htmlNodeInfoPop(ctxt);
4303 if (!IS_CHAR_CH(CUR)) {
4304 htmlAutoCloseOnEnd(ctxt);
/*
 * htmlParseElementInternal: non-recursive counterpart of
 * htmlParseElement().
 *
 * Only the opening of an element is handled here: context validation,
 * optional capture of the start position, htmlParseStartTag(), element
 * lookup with htmlTagLookup() (XML_HTML_UNKNOWN_TAG is reported for
 * "Tag %s invalid"), the "/>" short form, the missing-'>' error, and
 * elements whose descriptor is flagged `empty`.  No content loop is
 * visible here; htmlParseContentInternal() is the function that calls
 * this one for each sub-element.
 *
 * When ctxt->record_info is set the captured start information is pushed
 * with htmlNodeInfoPush().  On the missing-'>' path the push is followed
 * by htmlParserFinishElementParsing().
 */
4309 * htmlParseElementInternal:
4310 * @ctxt: an HTML parser context
4312 * parse an HTML element, new version, non recursive
4314 * [39] element ::= EmptyElemTag | STag content ETag
4316 * [41] Attribute ::= Name Eq AttValue
4320 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4321 const xmlChar *name;
4322 const htmlElemDesc * info;
4323 htmlParserNodeInfo node_info;
4326 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4327 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4328 "htmlParseElementInternal: context error\n", NULL, NULL);
4332 if (ctxt->instate == XML_PARSER_EOF)
4335 /* Capture start position */
4336 if (ctxt->record_info) {
4337 node_info.begin_pos = ctxt->input->consumed +
4338 (CUR_PTR - ctxt->input->base);
4339 node_info.begin_line = ctxt->input->line;
4342 failed = htmlParseStartTag(ctxt);
4344 if ((failed == -1) || (name == NULL)) {
4351 * Lookup the info for that element.
4353 info = htmlTagLookup(name);
4355 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4356 "Tag %s invalid\n", name, NULL);
4360 * Check for an Empty Element labeled the XML/SGML way
4362 if ((CUR == '/') && (NXT(1) == '>')) {
4364 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4365 ctxt->sax->endElement(ctxt->userData, name);
4373 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4374 "Couldn't find end of Start Tag %s\n", name, NULL);
4377 * end of parsing of this node.
4379 if (xmlStrEqual(name, ctxt->name)) {
4384 if (ctxt->record_info)
4385 htmlNodeInfoPush(ctxt, &node_info);
4386 htmlParserFinishElementParsing(ctxt);
4391 * Check for an Empty Element from DTD definition
4393 if ((info != NULL) && (info->empty)) {
4394 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4395 ctxt->sax->endElement(ctxt->userData, name);
4400 if (ctxt->record_info)
4401 htmlNodeInfoPush(ctxt, &node_info);
/*
 * htmlParseContentInternal: content loop of the non-recursive parser.
 *
 * It has the same dispatch as htmlParseContent() (end tag, start tag
 * with auto-close check, script/style, misplaced DOCTYPE, comment,
 * processing instruction, sub-element, reference, end of input,
 * character data), but it does not recurse.  Instead it tracks the
 * element it is currently inside in two locals:
 *
 *   currentNode   private copy of ctxt->name
 *   depth         ctxt->nameNr at the time of the copy
 *
 * Whenever the element stack may have changed (after an end tag, after a
 * bogus tag is skipped, after an auto-close popped the current element,
 * and after htmlParseElementInternal() opened a sub-element) the old
 * copy is freed and both locals are refreshed from the context.  Where
 * an element is finished, htmlParserFinishElementParsing() is called
 * first.
 *
 * A pass that leaves ctxt->nbChars unchanged while ctxt->node is set
 * raises XML_ERR_INTERNAL_ERROR.  The last copy of the name is freed
 * before returning.
 */
4405 * htmlParseContentInternal:
4406 * @ctxt: an HTML parser context
4408 * Parse a content: comment, sub-element, reference or text.
4409 * New version for non recursive htmlParseElementInternal
4413 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4414 xmlChar *currentNode;
4416 const xmlChar *name;
4418 currentNode = xmlStrdup(ctxt->name);
4419 depth = ctxt->nameNr;
4421 long cons = ctxt->nbChars;
4425 if (ctxt->instate == XML_PARSER_EOF)
4429 * Our tag or one of it's parent or children is ending.
4431 if ((CUR == '<') && (NXT(1) == '/')) {
4432 if (htmlParseEndTag(ctxt) &&
4433 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4434 if (currentNode != NULL)
4435 xmlFree(currentNode);
4437 currentNode = xmlStrdup(ctxt->name);
4438 depth = ctxt->nameNr;
4440 continue; /* while */
4443 else if ((CUR == '<') &&
4444 ((IS_ASCII_LETTER(NXT(1))) ||
4445 (NXT(1) == '_') || (NXT(1) == ':'))) {
4446 name = htmlParseHTMLName_nonInvasive(ctxt);
4448 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4449 "htmlParseStartTag: invalid element name\n",
4451 /* Dump the bogus tag like browsers do */
4452 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4455 htmlParserFinishElementParsing(ctxt);
4456 if (currentNode != NULL)
4457 xmlFree(currentNode);
4459 currentNode = xmlStrdup(ctxt->name);
4460 depth = ctxt->nameNr;
4464 if (ctxt->name != NULL) {
4465 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4466 htmlAutoClose(ctxt, name);
4473 * Has this node been popped out during parsing of
4476 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4477 (!xmlStrEqual(currentNode, ctxt->name)))
4479 htmlParserFinishElementParsing(ctxt);
4480 if (currentNode != NULL) xmlFree(currentNode);
4482 currentNode = xmlStrdup(ctxt->name);
4483 depth = ctxt->nameNr;
4487 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4488 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4490 * Handle SCRIPT/STYLE separately
4492 htmlParseScript(ctxt);
4495 * Sometimes DOCTYPE arrives in the middle of the document
4497 if ((CUR == '<') && (NXT(1) == '!') &&
4498 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4499 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4500 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4502 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4503 "Misplaced DOCTYPE declaration\n",
4504 BAD_CAST "DOCTYPE" , NULL);
4505 htmlParseDocTypeDecl(ctxt);
4509 * First case : a comment
4511 if ((CUR == '<') && (NXT(1) == '!') &&
4512 (NXT(2) == '-') && (NXT(3) == '-')) {
4513 htmlParseComment(ctxt);
4517 * Second case : a Processing Instruction.
4519 else if ((CUR == '<') && (NXT(1) == '?')) {
4524 * Third case : a sub-element.
4526 else if (CUR == '<') {
4527 htmlParseElementInternal(ctxt);
4528 if (currentNode != NULL) xmlFree(currentNode);
4530 currentNode = xmlStrdup(ctxt->name);
4531 depth = ctxt->nameNr;
4535 * Fourth case : a reference. If if has not been resolved,
4536 * parsing returns it's Name, create the node
4538 else if (CUR == '&') {
4539 htmlParseReference(ctxt);
4543 * Fifth case : end of the resource
4545 else if (CUR == 0) {
4546 htmlAutoCloseOnEnd(ctxt);
4551 * Last case, text. Note that References are handled directly.
4554 htmlParseCharData(ctxt);
4557 if (cons == ctxt->nbChars) {
4558 if (ctxt->node != NULL) {
4559 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4560 "detected an error in element content\n",
4568 if (currentNode != NULL) xmlFree(currentNode);
/*
 * __htmlParseContent: untyped entry point for callers outside this file.
 *
 * Takes the context as a void pointer, casts it to htmlParserCtxtPtr and
 * forwards to htmlParseContentInternal().
 */
4573 * @ctxt: an HTML parser context
4575 * Parse a content: comment, sub-element, reference or text.
4576 * This is the entry point when called from parser.c
4580 __htmlParseContent(void *ctxt) {
4582 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
/*
 * htmlParseDocument: parse a complete HTML document from @ctxt.
 *
 * Sequence visible in this function:
 *   1. htmlDefaultSAXHandlerInit(), then context validation.
 *   2. Line numbering is switched on and the SAX document locator is set.
 *   3. If no encoding is known yet and at least four bytes are
 *      available, xmlDetectCharEncoding() is applied to the first four
 *      bytes and the parser is switched when a hint is found.
 *   4. An empty input is reported as XML_ERR_DOCUMENT_EMPTY.
 *   5. sax->startDocument() is fired unless SAX is disabled.
 *   6. Leading comments / processing instructions are consumed, then an
 *      optional DOCTYPE, then comments / processing instructions again.
 *   7. htmlParseContentInternal() parses the body, after which
 *      htmlAutoCloseOnEnd() closes whatever is still open.
 *   8. sax->endDocument() is fired.
 *   9. Unless HTML_PARSE_NODEFDTD is set, a document without an internal
 *      subset (per xmlGetIntSubset()) gets the HTML 4.0 Transitional
 *      DTD attached.
 *
 * Returns -1 when the document was not well formed.
 * NOTE(review): the invalid-context path returns XML_ERR_INTERNAL_ERROR,
 * not the -1 that the description promises; callers testing "< 0" will
 * not catch it.
 */
4586 * htmlParseDocument:
4587 * @ctxt: an HTML parser context
4589 * parse an HTML document (and build a tree if using the standard SAX
4592 * Returns 0, -1 in case of error. the parser context is augmented
4593 * as a result of the parsing.
4597 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4599 xmlCharEncoding enc;
4604 htmlDefaultSAXHandlerInit();
4606 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4607 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4608 "htmlParseDocument: context error\n", NULL, NULL);
4609 return(XML_ERR_INTERNAL_ERROR);
4612 ctxt->linenumbers = 1;
4615 * SAX: beginning of the document processing.
4617 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4618 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
/* XML_CHAR_ENCODING_NONE cast to a pointer: a test for "no encoding set". */
4620 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4621 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4623 * Get the 4 first bytes and decode the charset
4624 * if enc != XML_CHAR_ENCODING_NONE
4625 * plug some encoding conversion routines.
4631 enc = xmlDetectCharEncoding(&start[0], 4);
4632 if (enc != XML_CHAR_ENCODING_NONE) {
4633 xmlSwitchEncoding(ctxt, enc);
4638 * Wipe out everything which is before the first '<'
4642 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4643 "Document is empty\n", NULL, NULL);
4646 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4647 ctxt->sax->startDocument(ctxt->userData);
4651 * Parse possible comments and PIs before any content
4653 while (((CUR == '<') && (NXT(1) == '!') &&
4654 (NXT(2) == '-') && (NXT(3) == '-')) ||
4655 ((CUR == '<') && (NXT(1) == '?'))) {
4656 htmlParseComment(ctxt);
4663 * Then possibly doc type declaration(s) and more Misc
4664 * (doctypedecl Misc*)?
4666 if ((CUR == '<') && (NXT(1) == '!') &&
4667 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4668 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4669 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4671 htmlParseDocTypeDecl(ctxt);
4676 * Parse possible comments and PIs before any content
4678 while (((CUR == '<') && (NXT(1) == '!') &&
4679 (NXT(2) == '-') && (NXT(3) == '-')) ||
4680 ((CUR == '<') && (NXT(1) == '?'))) {
4681 htmlParseComment(ctxt);
4687 * Time to start parsing the tree itself
4689 htmlParseContentInternal(ctxt);
4695 htmlAutoCloseOnEnd(ctxt);
4699 * SAX: end of the document processing.
4701 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4702 ctxt->sax->endDocument(ctxt->userData);
/* Give DTD-less documents the default HTML 4.0 Transitional subset. */
4704 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4705 dtd = xmlGetIntSubset(ctxt->myDoc);
4707 ctxt->myDoc->intSubset =
4708 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4709 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4710 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4712 if (! ctxt->wellFormed) return(-1);
4717 /************************************************************************
4719 * Parser contexts handling *
4721 ************************************************************************/
4724 * htmlInitParserCtxt:
4725 * @ctxt: an HTML parser context
4727 * Initialize a parser context
4729 * Returns 0 in case of success and -1 in case of error
4733 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4735 htmlSAXHandler *sax;
4737 if (ctxt == NULL) return(-1);
4738 memset(ctxt, 0, sizeof(htmlParserCtxt));
4740 ctxt->dict = xmlDictCreate();
4741 if (ctxt->dict == NULL) {
4742 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4745 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4747 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4751 memset(sax, 0, sizeof(htmlSAXHandler));
4753 /* Allocate the Input stack */
4754 ctxt->inputTab = (htmlParserInputPtr *)
4755 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4756 if (ctxt->inputTab == NULL) {
4757 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4766 ctxt->version = NULL;
4767 ctxt->encoding = NULL;
4768 ctxt->standalone = -1;
4769 ctxt->instate = XML_PARSER_START;
4771 /* Allocate the Node stack */
4772 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4773 if (ctxt->nodeTab == NULL) {
4774 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4787 /* Allocate the Name stack */
4788 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4789 if (ctxt->nameTab == NULL) {
4790 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4806 ctxt->nodeInfoTab = NULL;
4807 ctxt->nodeInfoNr = 0;
4808 ctxt->nodeInfoMax = 0;
4810 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4813 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4815 ctxt->userData = ctxt;
4817 ctxt->wellFormed = 1;
4818 ctxt->replaceEntities = 0;
4819 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4821 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4822 ctxt->vctxt.userData = ctxt;
4823 ctxt->vctxt.error = xmlParserValidityError;
4824 ctxt->vctxt.warning = xmlParserValidityWarning;
4825 ctxt->record_info = 0;
4828 ctxt->checkIndex = 0;
4829 ctxt->catalogs = NULL;
4830 xmlInitNodeInfoSeq(&ctxt->node_seq);
/*
 * htmlFreeParserCtxt: release an HTML parser context.
 *
 * Thin wrapper: the work is delegated unchanged to xmlFreeParserCtxt().
 */
4835 * htmlFreeParserCtxt:
4836 * @ctxt: an HTML parser context
4838 * Free all the memory used by a parser context. However the parsed
4839 * document in ctxt->myDoc is not freed.
4843 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4845 xmlFreeParserCtxt(ctxt);
/*
 * htmlNewParserCtxt: allocate a parser context and set it up for HTML.
 *
 * The structure is obtained with xmlMalloc(), zero-filled, and then
 * initialised by htmlInitParserCtxt().  If initialisation fails the
 * partially built context is released with htmlFreeParserCtxt().
 * An allocation failure is reported through htmlErrMemory().
 */
4849 * htmlNewParserCtxt:
4851 * Allocate and initialize a new parser context.
4853 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4857 htmlNewParserCtxt(void)
4859 xmlParserCtxtPtr ctxt;
4861 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4863 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4866 memset(ctxt, 0, sizeof(xmlParserCtxt));
/* Initialisation failed part-way: free whatever was attached so far. */
4867 if (htmlInitParserCtxt(ctxt) < 0) {
4868 htmlFreeParserCtxt(ctxt);
4875 * htmlCreateMemoryParserCtxt:
4876 * @buffer: a pointer to a char array
4877 * @size: the size of the array
4879 * Create a parser context for an HTML in-memory document.
4881 * Returns the new parser context or NULL
4884 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
/* Builds a new HTML parser context, wraps @buffer/@size in a parser input
 * buffer, and pushes an input stream positioned at the start of that buffer. */
4885 xmlParserCtxtPtr ctxt;
4886 xmlParserInputPtr input;
4887 xmlParserInputBufferPtr buf;
4894 ctxt = htmlNewParserCtxt();
/* No encoding conversion is requested for the in-memory data
 * (XML_CHAR_ENCODING_NONE). */
4898 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
/* NOTE(review): ctxt was already allocated above and is not released on this
 * early return -- looks like a leak; compare the xmlFreeParserCtxt() call on
 * the input == NULL path just below. Confirm and fix. */
4899 if (buf == NULL) return(NULL);
4901 input = xmlNewInputStream(ctxt);
4902 if (input == NULL) {
4903 xmlFreeParserCtxt(ctxt);
4907 input->filename = NULL;
/* base and cur both start at the first byte of the buffer content; end points
 * just past the bytes currently in use. */
4909 input->base = input->buf->buffer->content;
4910 input->cur = input->buf->buffer->content;
4911 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4913 inputPush(ctxt, input);
4918 * htmlCreateDocParserCtxt:
4919 * @cur: a pointer to an array of xmlChar
4920 * @encoding: a free form C string describing the HTML document encoding, or NULL
4922 * Create a parser context for an HTML document.
4924 * TODO: check the need to add encoding handling there
4926 * Returns the new parser context or NULL
4928 static htmlParserCtxtPtr
4929 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
/* Creates a memory parser context over the xmlStrlen(cur) bytes of @cur and,
 * when @encoding is given, records it on the input and switches the input
 * decoder accordingly. */
4931 htmlParserCtxtPtr ctxt;
4935 len = xmlStrlen(cur);
4936 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4940 if (encoding != NULL) {
4941 xmlCharEncoding enc;
4942 xmlCharEncodingHandlerPtr handler;
/* Replace any encoding string already stored on the input with a private
 * copy of @encoding. */
4944 if (ctxt->input->encoding != NULL)
4945 xmlFree((xmlChar *) ctxt->input->encoding);
4946 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4948 enc = xmlParseCharEncoding(encoding);
4950 * registered set of known encodings
4952 if (enc != XML_CHAR_ENCODING_ERROR) {
/* The name mapped to a known encoding value: switch by that value, then
 * report if the switch left XML_ERR_UNSUPPORTED_ENCODING in errNo. */
4953 xmlSwitchEncoding(ctxt, enc);
4954 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4955 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4956 "Unsupported encoding %s\n",
4957 (const xmlChar *) encoding, NULL);
4961 * fallback for unknown encodings
4963 handler = xmlFindCharEncodingHandler((const char *) encoding);
/* A handler found by name is installed directly; the same "Unsupported
 * encoding" error is raised below (presumably when no handler was found --
 * the else line is not visible in this listing). */
4964 if (handler != NULL) {
4965 xmlSwitchToEncoding(ctxt, handler);
4967 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4968 "Unsupported encoding %s\n",
4969 (const xmlChar *) encoding, NULL);
4976 #ifdef LIBXML_PUSH_ENABLED
4977 /************************************************************************
4979 * Progressive parsing interfaces *
4981 ************************************************************************/
4984 * htmlParseLookupSequence:
4985 * @ctxt: an HTML parser context
4986 * @first: the first char to lookup
4987 * @next: the next char to lookup or zero
4988 * @third: the next char to lookup or zero
4989 * @iscomment: flag to force checking inside comments
4991 * Try to find if a sequence (first, next, third) or just (first next) or
4992 * (first) is available in the input stream.
4993 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4994 * to avoid rescanning sequences of bytes, it DOES change the state of the
4995 * parser, do not use liberally.
4996 * This is basically similar to xmlParseLookupSequence()
4998 * Returns the index to the current parsing point if the full sequence
4999 * is available, -1 otherwise.
5002 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5003 xmlChar next, xmlChar third, int iscomment,
5007 htmlParserInputPtr in;
/* Remembers the quote character that opened the attribute value currently
 * being skipped (see the ignoreattrval handling below). */
5011 char valdellim = 0x0;
/* Scanning starts at the current read offset, or at ctxt->checkIndex when a
 * previous failed lookup already scanned further than that. */
5017 base = in->cur - in->base;
5021 if (ctxt->checkIndex > base)
5022 base = ctxt->checkIndex;
5024 if (in->buf == NULL) {
5028 buf = in->buf->buffer->content;
5029 len = in->buf->buffer->use;
5032 /* take into account the sequence length */
5037 for (; base < len; base++) {
/* Outside a comment, and only when the caller did not pass iscomment, look
 * for a "<!--" opener at this position. */
5038 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5039 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5040 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5042 /* do not increment past <! - some people use <!--> */
5046 if (ignoreattrval) {
/* Quote characters are tracked through valdellim: a quote equal to the stored
 * delimiter is handled differently from a fresh one, which is recorded. */
5047 if (buf[base] == '"' || buf[base] == '\'') {
5049 if (buf[base] == valdellim) {
5054 valdellim = buf[base];
5058 } else if (invalue) {
/* Test for the "-->" sequence (presumably evaluated while inside a comment;
 * the guarding condition is not visible in this listing). */
5065 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5066 (buf[base + 2] == '>')) {
/* Candidate match on the first character; next/third are compared only when
 * they take part in the requested sequence. */
5072 if (buf[base] == first) {
5074 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5076 } else if (next != 0) {
5077 if (buf[base + 1] != next)
/* Full match: clear the resume index and return the offset of the match
 * relative to the current read position. */
5080 ctxt->checkIndex = 0;
5083 xmlGenericError(xmlGenericErrorContext,
5084 "HPP: lookup '%c' found at %d\n",
5086 else if (third == 0)
5087 xmlGenericError(xmlGenericErrorContext,
5088 "HPP: lookup '%c%c' found at %d\n",
5091 xmlGenericError(xmlGenericErrorContext,
5092 "HPP: lookup '%c%c%c' found at %d\n",
5093 first, next, third, base);
5095 return (base - (in->cur - in->base));
/* No match: remember how far the scan got so the next call can resume there,
 * but only if the scan did not stop inside a comment or a quoted value. */
5098 if ((!incomment) && (!invalue))
5099 ctxt->checkIndex = base;
5102 xmlGenericError(xmlGenericErrorContext,
5103 "HPP: lookup '%c' failed\n", first);
5104 else if (third == 0)
5105 xmlGenericError(xmlGenericErrorContext,
5106 "HPP: lookup '%c%c' failed\n", first, next);
5108 xmlGenericError(xmlGenericErrorContext,
5109 "HPP: lookup '%c%c%c' failed\n", first, next,
5116 * htmlParseLookupChars:
5117 * @ctxt: an HTML parser context
5118 * @stop: Array of chars, which stop the lookup.
5119 * @stopLen: Length of stop-Array
5121 * Try to find if any char of the stop-Array is available in the input
5123 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5124 * to avoid rescanning sequences of bytes, it DOES change the state of the
5125 * parser, do not use liberally.
5127 * Returns the index to the current parsing point if a stopChar
5128 * is available, -1 otherwise.
5131 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5135 htmlParserInputPtr in;
/* Scanning starts at the current read offset, or at ctxt->checkIndex when a
 * previous failed lookup already scanned further than that. */
5144 base = in->cur - in->base;
5148 if (ctxt->checkIndex > base)
5149 base = ctxt->checkIndex;
5151 if (in->buf == NULL) {
5155 buf = in->buf->buffer->content;
5156 len = in->buf->buffer->use;
5159 for (; base < len; base++) {
/* Outside a comment, look for a "<!--" opener at this position. */
5160 if (!incomment && (base + 4 < len)) {
5161 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5162 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5164 /* do not increment past <! - some people use <!--> */
/* Test for the "-->" sequence (presumably evaluated while inside a comment;
 * the guarding condition is not visible in this listing). */
5171 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5172 (buf[base + 2] == '>')) {
/* Compare the current byte against every stop character; on the first hit the
 * resume index is cleared and the offset relative to in->cur is returned. */
5178 for (i = 0; i < stopLen; ++i) {
5179 if (buf[base] == stop[i]) {
5180 ctxt->checkIndex = 0;
5181 return (base - (in->cur - in->base));
/* Nothing found: remember how far the scan got for the next call. */
5185 ctxt->checkIndex = base;
5190 * htmlParseTryOrFinish:
5191 * @ctxt: an HTML parser context
5192 * @terminate: last chunk indicator
5194 * Try to progress on parsing
5196 * Returns zero if no parsing was possible
5199 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
/* Push-parser engine: a state machine keyed on ctxt->instate that consumes as
 * much of the buffered input as can be parsed. @terminate marks the final
 * chunk. NOTE(review): this listing omits many original lines (see the gaps in
 * the embedded line numbers); comments below describe only what is visible. */
5201 htmlParserInputPtr in;
5205 htmlParserNodeInfo node_info;
/* Trace of the state the parser is in on entry (presumably compiled only when
 * DEBUG_PUSH is defined; the guard is not visible in this listing). */
5208 switch (ctxt->instate) {
5209 case XML_PARSER_EOF:
5210 xmlGenericError(xmlGenericErrorContext,
5211 "HPP: try EOF\n"); break;
5212 case XML_PARSER_START:
5213 xmlGenericError(xmlGenericErrorContext,
5214 "HPP: try START\n"); break;
5215 case XML_PARSER_MISC:
5216 xmlGenericError(xmlGenericErrorContext,
5217 "HPP: try MISC\n");break;
5218 case XML_PARSER_COMMENT:
5219 xmlGenericError(xmlGenericErrorContext,
5220 "HPP: try COMMENT\n");break;
5221 case XML_PARSER_PROLOG:
5222 xmlGenericError(xmlGenericErrorContext,
5223 "HPP: try PROLOG\n");break;
5224 case XML_PARSER_START_TAG:
5225 xmlGenericError(xmlGenericErrorContext,
5226 "HPP: try START_TAG\n");break;
5227 case XML_PARSER_CONTENT:
5228 xmlGenericError(xmlGenericErrorContext,
5229 "HPP: try CONTENT\n");break;
5230 case XML_PARSER_CDATA_SECTION:
5231 xmlGenericError(xmlGenericErrorContext,
5232 "HPP: try CDATA_SECTION\n");break;
5233 case XML_PARSER_END_TAG:
5234 xmlGenericError(xmlGenericErrorContext,
5235 "HPP: try END_TAG\n");break;
5236 case XML_PARSER_ENTITY_DECL:
5237 xmlGenericError(xmlGenericErrorContext,
5238 "HPP: try ENTITY_DECL\n");break;
5239 case XML_PARSER_ENTITY_VALUE:
5240 xmlGenericError(xmlGenericErrorContext,
5241 "HPP: try ENTITY_VALUE\n");break;
5242 case XML_PARSER_ATTRIBUTE_VALUE:
5243 xmlGenericError(xmlGenericErrorContext,
5244 "HPP: try ATTRIBUTE_VALUE\n");break;
5245 case XML_PARSER_DTD:
5246 xmlGenericError(xmlGenericErrorContext,
5247 "HPP: try DTD\n");break;
5248 case XML_PARSER_EPILOG:
5249 xmlGenericError(xmlGenericErrorContext,
5250 "HPP: try EPILOG\n");break;
5252 xmlGenericError(xmlGenericErrorContext,
5253 "HPP: try PI\n");break;
5254 case XML_PARSER_SYSTEM_LITERAL:
5255 xmlGenericError(xmlGenericErrorContext,
5256 "HPP: try SYSTEM_LITERAL\n");break;
5263 if (in == NULL) break;
/* avail = number of bytes not yet consumed: taken from in->length when the
 * input has no I/O buffer, otherwise from the buffer's use count. */
5264 if (in->buf == NULL)
5265 avail = in->length - (in->cur - in->base);
5267 avail = in->buf->buffer->use - (in->cur - in->base);
/* Input exhausted on the final chunk: auto-close open elements and, if none
 * remain and EOF was not already reached, switch to EOF and emit endDocument. */
5268 if ((avail == 0) && (terminate)) {
5269 htmlAutoCloseOnEnd(ctxt);
5270 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5272 * SAX: end of the document processing.
5274 ctxt->instate = XML_PARSER_EOF;
5275 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5276 ctxt->sax->endDocument(ctxt->userData);
5287 switch (ctxt->instate) {
5288 case XML_PARSER_EOF:
5290 * Document parsing is done !
5293 case XML_PARSER_START:
5295 * Very first chars read from the document flow.
5298 if (IS_BLANK_CH(cur)) {
5300 if (in->buf == NULL)
5301 avail = in->length - (in->cur - in->base);
5303 avail = in->buf->buffer->use - (in->cur - in->base);
/* Hand the document locator to SAX, then signal startDocument unless SAX
 * callbacks are disabled. */
5305 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5306 ctxt->sax->setDocumentLocator(ctxt->userData,
5307 &xmlDefaultSAXLocator);
5308 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5309 (!ctxt->disableSAX))
5310 ctxt->sax->startDocument(ctxt->userData);
/* A leading "<!DOCTYP..." (case-insensitive via UPP) is parsed as a doctype
 * declaration and leads to PROLOG; the '>' lookup result is tested first
 * (negative means the terminator is not buffered yet). MISC is entered on the
 * other visible branch. */
5314 if ((cur == '<') && (next == '!') &&
5315 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5316 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5317 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5320 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5323 xmlGenericError(xmlGenericErrorContext,
5324 "HPP: Parsing internal subset\n");
5326 htmlParseDocTypeDecl(ctxt);
5327 ctxt->instate = XML_PARSER_PROLOG;
5329 xmlGenericError(xmlGenericErrorContext,
5330 "HPP: entering PROLOG\n");
5333 ctxt->instate = XML_PARSER_MISC;
5335 xmlGenericError(xmlGenericErrorContext,
5336 "HPP: entering MISC\n");
5340 case XML_PARSER_MISC:
5342 if (in->buf == NULL)
5343 avail = in->length - (in->cur - in->base);
5345 avail = in->buf->buffer->use - (in->cur - in->base);
5347 * no chars in buffer
5352 * not enouth chars in buffer
/* MISC dispatch: "<!--" -> comment (state stays MISC), "<?" -> PI (state
 * stays MISC), "<!DOCTYP..." -> doctype then PROLOG; START_TAG is entered on
 * the remaining visible branch. */
5363 if ((cur == '<') && (next == '!') &&
5364 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5366 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5369 xmlGenericError(xmlGenericErrorContext,
5370 "HPP: Parsing Comment\n");
5372 htmlParseComment(ctxt);
5373 ctxt->instate = XML_PARSER_MISC;
5374 } else if ((cur == '<') && (next == '?')) {
5376 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5379 xmlGenericError(xmlGenericErrorContext,
5380 "HPP: Parsing PI\n");
5383 ctxt->instate = XML_PARSER_MISC;
5384 } else if ((cur == '<') && (next == '!') &&
5385 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5386 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5387 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5390 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5393 xmlGenericError(xmlGenericErrorContext,
5394 "HPP: Parsing internal subset\n");
5396 htmlParseDocTypeDecl(ctxt);
5397 ctxt->instate = XML_PARSER_PROLOG;
5399 xmlGenericError(xmlGenericErrorContext,
5400 "HPP: entering PROLOG\n");
5402 } else if ((cur == '<') && (next == '!') &&
5406 ctxt->instate = XML_PARSER_START_TAG;
5408 xmlGenericError(xmlGenericErrorContext,
5409 "HPP: entering START_TAG\n");
5413 case XML_PARSER_PROLOG:
5415 if (in->buf == NULL)
5416 avail = in->length - (in->cur - in->base);
5418 avail = in->buf->buffer->use - (in->cur - in->base);
/* PROLOG dispatch: comments and PIs keep the state at PROLOG; START_TAG is
 * entered on the remaining visible branch. */
5423 if ((cur == '<') && (next == '!') &&
5424 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5426 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5429 xmlGenericError(xmlGenericErrorContext,
5430 "HPP: Parsing Comment\n");
5432 htmlParseComment(ctxt);
5433 ctxt->instate = XML_PARSER_PROLOG;
5434 } else if ((cur == '<') && (next == '?')) {
5436 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5439 xmlGenericError(xmlGenericErrorContext,
5440 "HPP: Parsing PI\n");
5443 ctxt->instate = XML_PARSER_PROLOG;
5444 } else if ((cur == '<') && (next == '!') &&
5448 ctxt->instate = XML_PARSER_START_TAG;
5450 xmlGenericError(xmlGenericErrorContext,
5451 "HPP: entering START_TAG\n");
5455 case XML_PARSER_EPILOG:
5456 if (in->buf == NULL)
5457 avail = in->length - (in->cur - in->base);
5459 avail = in->buf->buffer->use - (in->cur - in->base);
5463 if (IS_BLANK_CH(cur)) {
5464 htmlParseCharData(ctxt);
/* EPILOG dispatch: comments and PIs keep the state at EPILOG. The last
 * visible branch records XML_ERR_DOCUMENT_END, clears wellFormed, switches to
 * EOF and emits endDocument (its exact condition is not visible here). */
5470 if ((cur == '<') && (next == '!') &&
5471 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5473 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5476 xmlGenericError(xmlGenericErrorContext,
5477 "HPP: Parsing Comment\n");
5479 htmlParseComment(ctxt);
5480 ctxt->instate = XML_PARSER_EPILOG;
5481 } else if ((cur == '<') && (next == '?')) {
5483 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5486 xmlGenericError(xmlGenericErrorContext,
5487 "HPP: Parsing PI\n");
5490 ctxt->instate = XML_PARSER_EPILOG;
5491 } else if ((cur == '<') && (next == '!') &&
5495 ctxt->errNo = XML_ERR_DOCUMENT_END;
5496 ctxt->wellFormed = 0;
5497 ctxt->instate = XML_PARSER_EOF;
5499 xmlGenericError(xmlGenericErrorContext,
5500 "HPP: entering EOF\n");
5502 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5503 ctxt->sax->endDocument(ctxt->userData);
5507 case XML_PARSER_START_TAG: {
5508 const xmlChar *name;
5510 const htmlElemDesc * info;
5513 * no chars in buffer
5518 * not enouth chars in buffer
5530 ctxt->instate = XML_PARSER_CONTENT;
5532 xmlGenericError(xmlGenericErrorContext,
5533 "HPP: entering CONTENT\n");
5538 ctxt->instate = XML_PARSER_END_TAG;
5539 ctxt->checkIndex = 0;
5541 xmlGenericError(xmlGenericErrorContext,
5542 "HPP: entering END_TAG\n");
5547 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5550 /* Capture start position */
5551 if (ctxt->record_info) {
5552 node_info.begin_pos = ctxt->input->consumed +
5553 (CUR_PTR - ctxt->input->base);
5554 node_info.begin_line = ctxt->input->line;
5558 failed = htmlParseStartTag(ctxt);
5560 if ((failed == -1) ||
5568 * Lookup the info for that element.
5570 info = htmlTagLookup(name);
5572 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5573 "Tag %s invalid\n", name, NULL);
5577 * Check for an Empty Element labeled the XML/SGML way
5579 if ((CUR == '/') && (NXT(1) == '>')) {
5581 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5582 ctxt->sax->endElement(ctxt->userData, name);
5584 ctxt->instate = XML_PARSER_CONTENT;
5586 xmlGenericError(xmlGenericErrorContext,
5587 "HPP: entering CONTENT\n");
5595 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5596 "Couldn't find end of Start Tag %s\n",
5600 * end of parsing of this node.
5602 if (xmlStrEqual(name, ctxt->name)) {
5607 if (ctxt->record_info)
5608 htmlNodeInfoPush(ctxt, &node_info);
5610 ctxt->instate = XML_PARSER_CONTENT;
5612 xmlGenericError(xmlGenericErrorContext,
5613 "HPP: entering CONTENT\n");
5619 * Check for an Empty Element from DTD definition
5621 if ((info != NULL) && (info->empty)) {
5622 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5623 ctxt->sax->endElement(ctxt->userData, name);
5627 if (ctxt->record_info)
5628 htmlNodeInfoPush(ctxt, &node_info);
5630 ctxt->instate = XML_PARSER_CONTENT;
5632 xmlGenericError(xmlGenericErrorContext,
5633 "HPP: entering CONTENT\n");
5637 case XML_PARSER_CONTENT: {
5640 * Handle preparsed entities and charRef
5642 if (ctxt->token != 0) {
5643 xmlChar chr[2] = { 0 , 0 } ;
5645 chr[0] = (xmlChar) ctxt->token;
5646 htmlCheckParagraph(ctxt);
5647 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5648 ctxt->sax->characters(ctxt->userData, chr, 1);
5650 ctxt->checkIndex = 0;
/* A single remaining byte on the final chunk: unless it is '<' or '&' it is
 * delivered directly, as ignorable whitespace when blank and as character
 * data otherwise. */
5652 if ((avail == 1) && (terminate)) {
5654 if ((cur != '<') && (cur != '&')) {
5655 if (ctxt->sax != NULL) {
5656 if (IS_BLANK_CH(cur)) {
5657 if (ctxt->sax->ignorableWhitespace != NULL)
5658 ctxt->sax->ignorableWhitespace(
5659 ctxt->userData, &cur, 1);
5661 htmlCheckParagraph(ctxt);
5662 if (ctxt->sax->characters != NULL)
5663 ctxt->sax->characters(
5664 ctxt->userData, &cur, 1);
5668 ctxt->checkIndex = 0;
/* cons samples nbChars so that lack of progress can be detected after this
 * round of content parsing (see the cons == ctxt->nbChars test further down). */
5677 cons = ctxt->nbChars;
5678 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5679 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5681 * Handle SCRIPT/STYLE separately
5687 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5690 val = in->cur[idx + 2];
5691 if (val == 0) /* bad cut of input */
5694 htmlParseScript(ctxt);
5695 if ((cur == '<') && (next == '/')) {
5696 ctxt->instate = XML_PARSER_END_TAG;
5697 ctxt->checkIndex = 0;
5699 xmlGenericError(xmlGenericErrorContext,
5700 "HPP: entering END_TAG\n");
5706 * Sometimes DOCTYPE arrives in the middle of the document
5708 if ((cur == '<') && (next == '!') &&
5709 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5710 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5711 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5714 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5716 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5717 "Misplaced DOCTYPE declaration\n",
5718 BAD_CAST "DOCTYPE" , NULL);
5719 htmlParseDocTypeDecl(ctxt);
5720 } else if ((cur == '<') && (next == '!') &&
5721 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5723 (htmlParseLookupSequence(
5724 ctxt, '-', '-', '>', 1, 1) < 0))
5727 xmlGenericError(xmlGenericErrorContext,
5728 "HPP: Parsing Comment\n");
5730 htmlParseComment(ctxt);
5731 ctxt->instate = XML_PARSER_CONTENT;
5732 } else if ((cur == '<') && (next == '?')) {
5734 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5737 xmlGenericError(xmlGenericErrorContext,
5738 "HPP: Parsing PI\n");
5741 ctxt->instate = XML_PARSER_CONTENT;
5742 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5744 } else if ((cur == '<') && (next == '/')) {
5745 ctxt->instate = XML_PARSER_END_TAG;
5746 ctxt->checkIndex = 0;
5748 xmlGenericError(xmlGenericErrorContext,
5749 "HPP: entering END_TAG\n");
5752 } else if (cur == '<') {
5753 ctxt->instate = XML_PARSER_START_TAG;
5754 ctxt->checkIndex = 0;
5756 xmlGenericError(xmlGenericErrorContext,
5757 "HPP: entering START_TAG\n");
5760 } else if (cur == '&') {
5762 (htmlParseLookupChars(ctxt,
5763 BAD_CAST "; >/", 4) < 0))
5766 xmlGenericError(xmlGenericErrorContext,
5767 "HPP: Parsing Reference\n");
5769 /* TODO: check generation of subtrees if noent !!! */
5770 htmlParseReference(ctxt);
5773 * check that the text sequence is complete
5774 * before handing out the data to the parser
5775 * to avoid problems with erroneous end of
5779 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5781 ctxt->checkIndex = 0;
5783 xmlGenericError(xmlGenericErrorContext,
5784 "HPP: Parsing char data\n");
5786 htmlParseCharData(ctxt);
/* nbChars unchanged since cons was sampled means nothing was consumed; with
 * an open node this is reported as an internal error. */
5789 if (cons == ctxt->nbChars) {
5790 if (ctxt->node != NULL) {
5791 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5792 "detected an error in element content\n",
5801 case XML_PARSER_END_TAG:
5805 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5807 htmlParseEndTag(ctxt);
5808 if (ctxt->nameNr == 0) {
5809 ctxt->instate = XML_PARSER_EPILOG;
5811 ctxt->instate = XML_PARSER_CONTENT;
5813 ctxt->checkIndex = 0;
5815 xmlGenericError(xmlGenericErrorContext,
5816 "HPP: entering CONTENT\n");
/* The remaining states are treated as internal errors: each one reports
 * XML_ERR_INTERNAL_ERROR, clears checkIndex and falls back to CONTENT
 * (START_TAG in the ATTRIBUTE_VALUE case). */
5819 case XML_PARSER_CDATA_SECTION:
5820 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5821 "HPP: internal error, state == CDATA\n",
5823 ctxt->instate = XML_PARSER_CONTENT;
5824 ctxt->checkIndex = 0;
5826 xmlGenericError(xmlGenericErrorContext,
5827 "HPP: entering CONTENT\n");
5830 case XML_PARSER_DTD:
5831 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5832 "HPP: internal error, state == DTD\n",
5834 ctxt->instate = XML_PARSER_CONTENT;
5835 ctxt->checkIndex = 0;
5837 xmlGenericError(xmlGenericErrorContext,
5838 "HPP: entering CONTENT\n");
5841 case XML_PARSER_COMMENT:
5842 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5843 "HPP: internal error, state == COMMENT\n",
5845 ctxt->instate = XML_PARSER_CONTENT;
5846 ctxt->checkIndex = 0;
5848 xmlGenericError(xmlGenericErrorContext,
5849 "HPP: entering CONTENT\n");
5853 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5854 "HPP: internal error, state == PI\n",
5856 ctxt->instate = XML_PARSER_CONTENT;
5857 ctxt->checkIndex = 0;
5859 xmlGenericError(xmlGenericErrorContext,
5860 "HPP: entering CONTENT\n");
5863 case XML_PARSER_ENTITY_DECL:
5864 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5865 "HPP: internal error, state == ENTITY_DECL\n",
5867 ctxt->instate = XML_PARSER_CONTENT;
5868 ctxt->checkIndex = 0;
5870 xmlGenericError(xmlGenericErrorContext,
5871 "HPP: entering CONTENT\n");
5874 case XML_PARSER_ENTITY_VALUE:
5875 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5876 "HPP: internal error, state == ENTITY_VALUE\n",
5878 ctxt->instate = XML_PARSER_CONTENT;
5879 ctxt->checkIndex = 0;
/* NOTE(review): the trace text below says "entering DTD" although the state
 * just assigned is XML_PARSER_CONTENT -- looks like a stale debug message. */
5881 xmlGenericError(xmlGenericErrorContext,
5882 "HPP: entering DTD\n");
5885 case XML_PARSER_ATTRIBUTE_VALUE:
5886 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5887 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5889 ctxt->instate = XML_PARSER_START_TAG;
5890 ctxt->checkIndex = 0;
5892 xmlGenericError(xmlGenericErrorContext,
5893 "HPP: entering START_TAG\n");
5896 case XML_PARSER_SYSTEM_LITERAL:
5897 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5898 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5900 ctxt->instate = XML_PARSER_CONTENT;
5901 ctxt->checkIndex = 0;
5903 xmlGenericError(xmlGenericErrorContext,
5904 "HPP: entering CONTENT\n");
5907 case XML_PARSER_IGNORE:
5908 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5909 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5911 ctxt->instate = XML_PARSER_CONTENT;
5912 ctxt->checkIndex = 0;
5914 xmlGenericError(xmlGenericErrorContext,
5915 "HPP: entering CONTENT\n");
5918 case XML_PARSER_PUBLIC_LITERAL:
/* NOTE(review): the message names XML_PARSER_LITERAL while the case label is
 * XML_PARSER_PUBLIC_LITERAL -- confirm whether the text should match. */
5919 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5920 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5922 ctxt->instate = XML_PARSER_CONTENT;
5923 ctxt->checkIndex = 0;
5925 xmlGenericError(xmlGenericErrorContext,
5926 "HPP: entering CONTENT\n");
/* End-of-input handling, same steps as earlier in this function: auto-close,
 * then EOF plus endDocument once no element remains open. */
5933 if ((avail == 0) && (terminate)) {
5934 htmlAutoCloseOnEnd(ctxt);
5935 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5937 * SAX: end of the document processing.
5939 ctxt->instate = XML_PARSER_EOF;
5940 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5941 ctxt->sax->endDocument(ctxt->userData);
/* Once a document exists and parsing is finishing (terminate, EOF or EPILOG),
 * the internal subset is looked up and an HTML 4.0 Transitional one is
 * created (presumably only when none exists; that test is not visible here). */
5944 if ((ctxt->myDoc != NULL) &&
5945 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5946 (ctxt->instate == XML_PARSER_EPILOG))) {
5948 dtd = xmlGetIntSubset(ctxt->myDoc);
5950 ctxt->myDoc->intSubset =
5951 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5952 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5953 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5956 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5963 * @ctxt: an HTML parser context
5964 * @chunk: a char array
5965 * @size: the size in bytes of the chunk
5966 * @terminate: last chunk indicator
5968 * Parse a Chunk of memory
5970 * Returns zero if no error, the xmlParserErrors otherwise.
5973 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
/* A context without an input stream cannot accept data: report an internal
 * error and return its code. NOTE(review): htmlParseErr() receives ctxt even
 * when ctxt itself is NULL -- confirm it tolerates a NULL context. */
5975 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5976 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5977 "htmlParseChunk: context error\n", NULL, NULL);
5978 return(XML_ERR_INTERNAL_ERROR);
5980 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5981 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
/* base/cur are saved as offsets before the push and the pointers are rebuilt
 * from buffer->content afterwards (presumably because the push may move the
 * buffer storage). */
5982 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5983 int cur = ctxt->input->cur - ctxt->input->base;
5986 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
/* NOTE(review): XML_PARSER_EOF is used elsewhere in this file as an instate
 * value, yet here it is stored in errNo and returned as the error code --
 * confirm this is intended rather than an error constant. */
5988 ctxt->errNo = XML_PARSER_EOF;
5989 ctxt->disableSAX = 1;
5990 return (XML_PARSER_EOF);
5992 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5993 ctxt->input->cur = ctxt->input->base + cur;
5995 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5997 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
/* Parsing is attempted only on the final chunk or once more than 80 bytes
 * are buffered. */
6001 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6002 htmlParseTryOrFinish(ctxt, terminate);
6004 } else if (ctxt->instate != XML_PARSER_EOF) {
/* No new data: when an encoder and both raw/converted buffers are present,
 * run the input conversion once more before parsing. */
6005 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6006 xmlParserInputBufferPtr in = ctxt->input->buf;
6007 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6008 (in->raw != NULL)) {
6011 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
6013 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6014 "encoder error\n", NULL, NULL);
6015 return(XML_ERR_INVALID_ENCODING);
6020 htmlParseTryOrFinish(ctxt, terminate);
/* Finishing in any state other than EOF, EPILOG or MISC is recorded as
 * XML_ERR_DOCUMENT_END and clears wellFormed; endDocument is emitted if EOF
 * had not been reached, and the state is then forced to EOF. (The enclosing
 * condition is not visible in this listing.) */
6022 if ((ctxt->instate != XML_PARSER_EOF) &&
6023 (ctxt->instate != XML_PARSER_EPILOG) &&
6024 (ctxt->instate != XML_PARSER_MISC)) {
6025 ctxt->errNo = XML_ERR_DOCUMENT_END;
6026 ctxt->wellFormed = 0;
6028 if (ctxt->instate != XML_PARSER_EOF) {
6029 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6030 ctxt->sax->endDocument(ctxt->userData);
6032 ctxt->instate = XML_PARSER_EOF;
/* The result is whatever error number the context currently holds. */
6034 return((xmlParserErrors) ctxt->errNo);
6037 /************************************************************************
6039 * User entry points *
6041 ************************************************************************/
6044 * htmlCreatePushParserCtxt:
6045 * @sax: a SAX handler
6046 * @user_data: The user data returned on SAX callbacks
6047 * @chunk: a pointer to an array of chars
6048 * @size: number of chars in the array
6049 * @filename: an optional file name or URI
6050 * @enc: an optional encoding
6052 * Create a parser context for using the HTML parser in push mode
6053 * The value of @filename is used for fetching external entities
6054 * and error/warning reports.
6056 * Returns the new parser context or NULL
6059 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6060 const char *chunk, int size, const char *filename,
6061 xmlCharEncoding enc) {
/* Builds a push-mode HTML parser context: allocates an input buffer for @enc,
 * installs an optional private copy of @sax, attaches an input stream and
 * optionally pre-loads the first @chunk. */
6062 htmlParserCtxtPtr ctxt;
6063 htmlParserInputPtr inputStream;
6064 xmlParserInputBufferPtr buf;
6068 buf = xmlAllocParserInputBuffer(enc);
6069 if (buf == NULL) return(NULL);
6071 ctxt = htmlNewParserCtxt();
6073 xmlFreeParserInputBuffer(buf);
/* The context charset is set to UTF-8 when the caller asked for UTF-8 or the
 * buffer has an encoder attached. */
6076 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6077 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6079 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
/* The caller's handler table is copied into storage owned by the context, and
 * a non-NULL user_data replaces the default userData. */
6081 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6082 if (ctxt->sax == NULL) {
6087 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6088 if (user_data != NULL)
6089 ctxt->userData = user_data;
6091 if (filename == NULL) {
6092 ctxt->directory = NULL;
6094 ctxt->directory = xmlParserGetDirectory(filename);
6097 inputStream = htmlNewInputStream(ctxt);
/* NOTE(review): buf is only attached to the stream further down
 * (inputStream->buf = buf), so on this failure path it does not appear to be
 * released along with ctxt -- confirm against the omitted lines. */
6098 if (inputStream == NULL) {
6099 xmlFreeParserCtxt(ctxt);
6104 if (filename == NULL)
6105 inputStream->filename = NULL;
6107 inputStream->filename = (char *)
6108 xmlCanonicPath((const xmlChar *) filename);
6109 inputStream->buf = buf;
6110 inputStream->base = inputStream->buf->buffer->content;
6111 inputStream->cur = inputStream->buf->buffer->content;
6113 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
6115 inputPush(ctxt, inputStream);
/* Optional initial data: offsets are saved before the push and the stream
 * pointers are rebuilt from buffer->content afterwards. */
6117 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6118 (ctxt->input->buf != NULL)) {
6119 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
6120 int cur = ctxt->input->cur - ctxt->input->base;
/* NOTE(review): the push result is ignored here, whereas htmlParseChunk()
 * stores it in res -- confirm whether a failure should be handled. */
6122 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6124 ctxt->input->base = ctxt->input->buf->buffer->content + base;
6125 ctxt->input->cur = ctxt->input->base + cur;
6127 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
6129 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
/* Marks the context as being driven in progressive (push) mode. */
6132 ctxt->progressive = 1;
6136 #endif /* LIBXML_PUSH_ENABLED */
6140 * @cur: a pointer to an array of xmlChar
6141 * @encoding: a free form C string describing the HTML document encoding, or NULL
6142 * @sax: the SAX handler block
6143 * @userData: if using SAX, this pointer will be provided on callbacks.
6145 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6146 * to handle parse events. If sax is NULL, fall back to the default DOM
6147 * behavior and return a tree.
6149 * Returns the resulting document tree unless SAX is NULL or the document is
6154 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
/* Parses the in-memory document @cur through a temporary parser context that
 * is freed before returning. */
6156 htmlParserCtxtPtr ctxt;
6160 if (cur == NULL) return(NULL);
6163 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6164 if (ctxt == NULL) return(NULL);
/* The handler allocated with the context is freed here (presumably so that
 * the caller's @sax can be installed; that assignment and its guard are not
 * visible in this listing). */
6166 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6168 ctxt->userData = userData;
6171 htmlParseDocument(ctxt);
/* userData is cleared before the context is released. */
6175 ctxt->userData = NULL;
6177 htmlFreeParserCtxt(ctxt);
6184 * @cur: a pointer to an array of xmlChar
6185 * @encoding: a free form C string describing the HTML document encoding, or NULL
6187 * parse an HTML in-memory document and build a tree.
6189 * Returns the resulting document tree
6193 htmlParseDoc(xmlChar *cur, const char *encoding) {
/* Convenience wrapper: htmlSAXParseDoc() with no SAX handler and no user data. */
6194 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6199 * htmlCreateFileParserCtxt:
6200 * @filename: the filename
6201 * @encoding: a free form C string describing the HTML document encoding, or NULL
6203 * Create a parser context for a file content.
6204 * Automatic support for ZLIB/Compress compressed document is provided
6205 * by default if found at compile-time.
6207 * Returns the new parser context or NULL
6210 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
/* Creates a parser context whose input is loaded from @filename (after
 * canonicalising the path) and, when an encoding is supplied, feeds a
 * synthetic "charset=<encoding>" string to htmlCheckEncoding(). */
6212 htmlParserCtxtPtr ctxt;
6213 htmlParserInputPtr inputStream;
6214 char *canonicFilename;
6215 /* htmlCharEncoding enc; */
6216 xmlChar *content, *content_line = (xmlChar *) "charset=";
6218 if (filename == NULL)
6221 ctxt = htmlNewParserCtxt();
6225 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
/* Path canonicalisation failed: report through the default SAX error handler
 * (SAX1 builds only) and release the context. */
6226 if (canonicFilename == NULL) {
6227 #ifdef LIBXML_SAX1_ENABLED
6228 if (xmlDefaultSAXHandler.error != NULL) {
6229 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6232 xmlFreeParserCtxt(ctxt);
/* The canonical name is only needed to open the input and is freed right
 * after the load attempt, whether or not it succeeded. */
6236 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6237 xmlFree(canonicFilename);
6238 if (inputStream == NULL) {
6239 xmlFreeParserCtxt(ctxt);
6243 inputPush(ctxt, inputStream);
/* Buffer sized for "charset=" + encoding + terminating NUL.
 * NOTE(review): strlen(encoding) needs a non-NULL encoding and the copy below
 * needs a non-NULL content; the guards are not visible in this listing --
 * confirm they exist. */
6247 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
6249 strcpy ((char *)content, (char *)content_line);
6250 strcat ((char *)content, (char *)encoding);
6251 htmlCheckEncoding (ctxt, content);
6261 * @filename: the filename
6262 * @encoding: a free form C string describing the HTML document encoding, or NULL
6263 * @sax: the SAX handler block
6264 * @userData: if using SAX, this pointer will be provided on callbacks.
6266 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6267 * compressed document is provided by default if found at compile-time.
6268 * It uses the given SAX function block to handle the parsing callbacks.
6269 * If sax is NULL, fall back to the default DOM tree building routines.
6271 * Returns the resulting document tree unless SAX is NULL or the document is
6276 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
/* Parses @filename through a temporary file parser context that is freed
 * before returning. */
6279 htmlParserCtxtPtr ctxt;
/* Presumably holds the context's own handler while @sax is installed; the
 * swap itself is not visible in this listing. */
6280 htmlSAXHandlerPtr oldsax = NULL;
6284 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6285 if (ctxt == NULL) return(NULL);
6289 ctxt->userData = userData;
6292 htmlParseDocument(ctxt);
/* userData is cleared before the context is released. */
6297 ctxt->userData = NULL;
6299 htmlFreeParserCtxt(ctxt);
6306 * @filename: the filename
6307 * @encoding: a free form C string describing the HTML document encoding, or NULL
6309 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6310 * compressed document is provided by default if found at compile-time.
6312 * Returns the resulting document tree
6316 htmlParseFile(const char *filename, const char *encoding) {
/* Convenience wrapper: htmlSAXParseFile() with no SAX handler and no user data. */
6317 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6321 * htmlHandleOmittedElem:
6324 * Set and return the previous value for handling HTML omitted tags.
6326 * Returns the previous value: 0 for no handling, 1 for auto insertion.
6330 htmlHandleOmittedElem(int val) {
/* Snapshot the file-level htmlOmittedDefaultValue setting before it is
 * overwritten with @val. */
6331 int old = htmlOmittedDefaultValue;
6333 htmlOmittedDefaultValue = val;
6338 * htmlElementAllowedHere:
6339 * @parent: HTML parent element
6340 * @elt: HTML element
6342 * Checks whether an HTML element may be a direct child of a parent element.
6343 * Note - doesn't check for deprecated elements
6345 * Returns 1 if allowed; 0 otherwise.
6348 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
/* Missing arguments, or a parent without a sub-element list, are tested
 * first (the result returned for that case is not visible in this listing). */
6351 if ( ! elt || ! parent || ! parent->subelts )
/* Walk the parent's NULL-terminated list of sub-element names looking for an
 * exact (xmlStrcmp() == 0) match with @elt. */
6354 for ( p = parent->subelts; *p; ++p )
6355 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6361 * htmlElementStatusHere:
6362 * @parent: HTML parent element
6363 * @elt: HTML element
6365 * Checks whether an HTML element may be a direct child of a parent element.
6366 * and if so whether it is valid or deprecated.
6368 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
/*
 * htmlElementStatusHere: classify element `elt` as a child of `parent`.
 * Yields HTML_INVALID when either description is NULL or when
 * htmlElementAllowedHere() rejects the element name; otherwise HTML_VALID
 * if elt->dtd is 0 and HTML_DEPRECATED for any other dtd value.
 */
6371 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
/* Missing descriptions cannot be validated. */
6372 if ( ! parent || ! elt )
6373 return HTML_INVALID ;
/* The element must appear in the parent's list of permitted children. */
6374 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6375 return HTML_INVALID ;
/* Allowed: a dtd field of 0 means valid, anything else is reported as
 * deprecated. */
6377 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6381 * @elt: HTML element
6382 * @attr: HTML attribute
6383 * @legacy: whether to allow deprecated attributes
6385 * Checks whether an attribute is valid for an element
6386 * Has full knowledge of Required and Deprecated attributes
6388 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
/*
 * htmlAttrAllowed: classify attribute name `attr` for element description
 * `elt` by searching its attribute lists in order: required, optional,
 * then (only when `legacy` is non-zero) deprecated. Falls through to
 * HTML_INVALID when no list contains the name.
 * The declaration of `p` and the result returned for an optional-list hit
 * are not visible in this view.
 */
6391 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
/* Guard: both the element description and the attribute name are needed. */
6394 if ( !elt || ! attr )
6395 return HTML_INVALID ;
/* Required attributes take precedence over the other lists. */
6397 if ( elt->attrs_req )
6398 for ( p = elt->attrs_req; *p; ++p)
6399 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6400 return HTML_REQUIRED ;
/* Optional attributes (the statement executed on a match is not shown
 * here). */
6402 if ( elt->attrs_opt )
6403 for ( p = elt->attrs_opt; *p; ++p)
6404 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
/* Deprecated attributes are consulted only when the caller allows legacy
 * markup. */
6407 if ( legacy && elt->attrs_depr )
6408 for ( p = elt->attrs_depr; *p; ++p)
6409 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6410 return HTML_DEPRECATED ;
/* Not found in any list that was searched. */
6412 return HTML_INVALID ;
6416 * @node: an htmlNodePtr in a tree
6417 * @legacy: whether to allow deprecated elements (YES is faster here
6418 * for Element nodes)
6420 * Checks whether the tree node is valid. Experimental (the author
6421 * only uses the HTML enhancements in a SAX parser)
6423 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6424 * legacy allowed) or htmlElementStatusHere (otherwise).
6425 * for Attribute nodes, a return from htmlAttrAllowed
6426 * for other nodes, HTML_NA (no checks performed)
/*
 * htmlNodeStatus: dispatch on node->type to the element or attribute
 * checker; node types other than element and attribute yield HTML_NA.
 * The condition guarding the first HTML_INVALID return and the head of the
 * element-case expression are not visible in this view.
 * NOTE(review): node->parent is dereferenced in both cases with no NULL
 * check visible here -- confirm callers never pass a parentless node.
 */
6429 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6431 return HTML_INVALID ;
6433 switch ( node->type ) {
6434 case XML_ELEMENT_NODE:
/* First branch: plain allowed/not-allowed check against the parent's
 * description, mapped onto HTML_VALID / HTML_INVALID. */
6436 ? ( htmlElementAllowedHere (
6437 htmlTagLookup(node->parent->name) , node->name
6438 ) ? HTML_VALID : HTML_INVALID )
/* Second branch: full status check, which needs the element's own
 * description as well. */
6439 : htmlElementStatusHere(
6440 htmlTagLookup(node->parent->name) ,
6441 htmlTagLookup(node->name) )
6443 case XML_ATTRIBUTE_NODE:
/* The attribute is checked against the description looked up from its
 * parent's name. */
6444 return htmlAttrAllowed(
6445 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6446 default: return HTML_NA ;
6449 /************************************************************************
6451 * New set (2.6.0) of simpler and more flexible APIs *
6453 ************************************************************************/
6458 * Free a string if it is not owned by the "dict" dictionary in the
/*
 * DICT_FREE(str): call xmlFree() on str only when str is non-NULL and
 * either `dict` is NULL or xmlDictOwns() reports 0 for it. The macro reads
 * a variable named `dict` from the expansion site rather than taking it as
 * an argument.
 * NOTE(review): the expansion is a bare `if` that already ends in ';', so
 * using it as the body of an unbraced if/else would misparse -- confirm
 * all call sites are plain statements.
 */
6461 #define DICT_FREE(str) \
6462 if ((str) && ((!dict) || \
6463 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \
6464 xmlFree((char *)(str));
6468 * @ctxt: an HTML parser context
6470 * Reset a parser context
/*
 * htmlCtxtReset: release the inputs and per-document data held by `ctxt`
 * and put its state fields back to their starting values.
 * Several lines of this definition (braces, some declarations and
 * assignments, including the `dict` variable that DICT_FREE reads) are not
 * visible in this view.
 */
6473 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6475 xmlParserInputPtr input;
/* Pop every remaining input off the context and free it. */
6484 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6485 xmlFreeInputStream(input);
/* Reinitialise the first slot of the space table and point `space` at it. */
6491 if (ctxt->spaceTab != NULL) {
6492 ctxt->spaceTab[0] = -1;
6493 ctxt->space = &ctxt->spaceTab[0];
/* Free each string via DICT_FREE (skipped for dictionary-owned strings),
 * then clear the pointer. */
6505 DICT_FREE(ctxt->version);
6506 ctxt->version = NULL;
6507 DICT_FREE(ctxt->encoding);
6508 ctxt->encoding = NULL;
6509 DICT_FREE(ctxt->directory);
6510 ctxt->directory = NULL;
6511 DICT_FREE(ctxt->extSubURI);
6512 ctxt->extSubURI = NULL;
6513 DICT_FREE(ctxt->extSubSystem);
6514 ctxt->extSubSystem = NULL;
/* Free any document still attached to the context; presumably the pointer
 * is cleared on a following line that is not visible here. */
6515 if (ctxt->myDoc != NULL)
6516 xmlFreeDoc(ctxt->myDoc);
/* Document-level flags. */
6519 ctxt->standalone = -1;
6520 ctxt->hasExternalSubset = 0;
6521 ctxt->hasPErefs = 0;
/* Parser state machine back to its start state. */
6524 ctxt->instate = XML_PARSER_START;
/* Assume a well-formed document and re-enable SAX callbacks. */
6527 ctxt->wellFormed = 1;
6528 ctxt->nsWellFormed = 1;
6529 ctxt->disableSAX = 0;
/* Validation context: report through the standard parser validity
 * handlers with the parser context as user data. */
6531 ctxt->vctxt.userData = ctxt;
6532 ctxt->vctxt.error = xmlParserValidityError;
6533 ctxt->vctxt.warning = xmlParserValidityWarning;
6534 ctxt->record_info = 0;
6536 ctxt->checkIndex = 0;
6538 ctxt->errNo = XML_ERR_OK;
6540 ctxt->charset = XML_CHAR_ENCODING_NONE;
6541 ctxt->catalogs = NULL;
6542 xmlInitNodeInfoSeq(&ctxt->node_seq);
/* Default-attribute table: entries are released with xmlFree. */
6544 if (ctxt->attsDefault != NULL) {
6545 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6546 ctxt->attsDefault = NULL;
/* Special-attribute table: freed with no per-entry deallocator. */
6548 if (ctxt->attsSpecial != NULL) {
6549 xmlHashFree(ctxt->attsSpecial, NULL);
6550 ctxt->attsSpecial = NULL;
6555 * htmlCtxtUseOptions:
6556 * @ctxt: an HTML parser context
6557 * @options: a combination of htmlParserOption(s)
6559 * Applies the options to the parser context
6561 * Returns 0 in case of success, the set of unknown or unimplemented options
/*
 * htmlCtxtUseOptions: apply the bits in `options` to `ctxt`. Each
 * recognised bit is acted on and then subtracted from `options`, so only
 * unhandled bits remain in that variable afterwards.
 * Braces, some statements and the return are not visible in this view.
 * NOTE(review): the first three tests use HTML_PARSE_* names but clear and
 * record the XML_PARSE_* names; this only works if each pair has the same
 * numeric value -- confirm against the option enums.
 */
6565 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
/* Silence warnings in both the SAX block and the validation context. */
6570 if (options & HTML_PARSE_NOWARNING) {
6571 ctxt->sax->warning = NULL;
6572 ctxt->vctxt.warning = NULL;
6573 options -= XML_PARSE_NOWARNING;
6574 ctxt->options |= XML_PARSE_NOWARNING;
/* Silence errors, including fatal errors. */
6576 if (options & HTML_PARSE_NOERROR) {
6577 ctxt->sax->error = NULL;
6578 ctxt->vctxt.error = NULL;
6579 ctxt->sax->fatalError = NULL;
6580 options -= XML_PARSE_NOERROR;
6581 ctxt->options |= XML_PARSE_NOERROR;
/* Pedantic mode (the statement that sets the matching context field is
 * not visible here). */
6583 if (options & HTML_PARSE_PEDANTIC) {
6585 options -= XML_PARSE_PEDANTIC;
6586 ctxt->options |= XML_PARSE_PEDANTIC;
/* Blank-node removal: stop keeping blanks and route ignorable whitespace
 * to the SAX2 handler. */
6589 if (options & XML_PARSE_NOBLANKS) {
6590 ctxt->keepBlanks = 0;
6591 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6592 options -= XML_PARSE_NOBLANKS;
6593 ctxt->options |= XML_PARSE_NOBLANKS;
/* Presumably the else-branch of the test above (the `else` line is not
 * visible): blanks are kept. */
6595 ctxt->keepBlanks = 1;
6596 if (options & HTML_PARSE_RECOVER) {
6598 options -= HTML_PARSE_RECOVER;
/* The remaining options are only recorded in ctxt->options. */
6601 if (options & HTML_PARSE_COMPACT) {
6602 ctxt->options |= HTML_PARSE_COMPACT;
6603 options -= HTML_PARSE_COMPACT;
6605 if (options & XML_PARSE_HUGE) {
6606 ctxt->options |= XML_PARSE_HUGE;
6607 options -= XML_PARSE_HUGE;
6609 if (options & HTML_PARSE_NODEFDTD) {
6610 ctxt->options |= HTML_PARSE_NODEFDTD;
6611 options -= HTML_PARSE_NODEFDTD;
6613 if (options & HTML_PARSE_IGNORE_ENC) {
6614 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6615 options -= HTML_PARSE_IGNORE_ENC;
6617 if (options & HTML_PARSE_NOIMPLIED) {
6618 ctxt->options |= HTML_PARSE_NOIMPLIED;
6619 options -= HTML_PARSE_NOIMPLIED;
/* Set unconditionally, whatever options were passed. */
6621 ctxt->dictNames = 0;
6627 * @ctxt: an HTML parser context
6628 * @URL: the base URL to use for the document
6629 * @encoding: the document encoding, or NULL
6630 * @options: a combination of htmlParserOption(s)
6631 * @reuse: keep the context for reuse
6633 * Common front-end for the htmlRead functions
6635 * Returns the resulting document tree or NULL
/*
 * htmlDoRead: shared tail of the htmlRead and htmlCtxtRead entry points.
 * Applies `options`, optionally switches the input encoding, records `URL`
 * as the input file name when none is set, then runs the parser.
 * The declaration of `ret`, the handling of `reuse` and the return are not
 * visible in this view.
 */
6638 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6639 int options, int reuse)
6643 htmlCtxtUseOptions(ctxt, options);
/* An explicit encoding overrides whatever the input currently has. */
6645 if (encoding != NULL) {
6646 xmlCharEncodingHandlerPtr hdlr;
6648 hdlr = xmlFindCharEncodingHandler(encoding);
/* Presumably guarded by a check that a handler was found -- that line is
 * not visible here. */
6650 xmlSwitchToEncoding(ctxt, hdlr);
/* Replace the stored encoding name with a private copy of the new one. */
6651 if (ctxt->input->encoding != NULL)
6652 xmlFree((xmlChar *) ctxt->input->encoding);
6653 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
/* Use URL as the input's file name only when the input has none yet. */
6656 if ((URL != NULL) && (ctxt->input != NULL) &&
6657 (ctxt->input->filename == NULL))
6658 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6659 htmlParseDocument(ctxt);
/* Dictionary check involving the resulting document; part of the
 * condition and the statement it guards are not visible here. */
6663 if ((ctxt->dictNames) &&
6665 (ret->dict == ctxt->dict))
/* Presumably reached only when the context is not being reused -- the
 * test on `reuse` is not visible here. */
6667 xmlFreeParserCtxt(ctxt);
6674 * @cur: a pointer to a zero terminated string
6675 * @URL: the base URL to use for the document
6676 * @encoding: the document encoding, or NULL
6677 * @options: a combination of htmlParserOption(s)
6679 * parse an HTML in-memory document and build a tree.
6681 * Returns the resulting document tree
/*
 * htmlReadDoc: build a one-shot parser context over the string `cur` and
 * hand it to htmlDoRead().
 * Argument checks and the NULL-context check are not visible in this view.
 */
6684 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6686 htmlParserCtxtPtr ctxt;
/* No encoding is passed at creation; htmlDoRead() receives `encoding`
 * below instead. */
6692 ctxt = htmlCreateDocParserCtxt(cur, NULL);
/* Final argument is htmlDoRead()'s `reuse` flag: 0 here. */
6695 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6700 * @filename: a file or URL
6701 * @encoding: the document encoding, or NULL
6702 * @options: a combination of htmlParserOption(s)
6704 * parse an HTML file from the filesystem or the network.
6706 * Returns the resulting document tree
/*
 * htmlReadFile: build a one-shot file parser context for `filename` and
 * hand it to htmlDoRead().
 * The NULL-context check is not visible in this view.
 */
6709 htmlReadFile(const char *filename, const char *encoding, int options)
6711 htmlParserCtxtPtr ctxt;
/* File name and encoding are both consumed when the context is created. */
6714 ctxt = htmlCreateFileParserCtxt(filename, encoding);
/* URL and encoding are therefore passed as NULL here; `reuse` is 0. */
6717 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6722 * @buffer: a pointer to a char array
6723 * @size: the size of the array
6724 * @URL: the base URL to use for the document
6725 * @encoding: the document encoding, or NULL
6726 * @options: a combination of htmlParserOption(s)
6728 * parse an HTML in-memory document and build a tree.
6730 * Returns the resulting document tree
/*
 * htmlReadMemory: build a one-shot parser context over the `size` bytes at
 * `buffer`, install the HTML default SAX handlers in it, and hand it to
 * htmlDoRead().
 * The NULL-context check is not visible in this view.
 */
6733 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6735 htmlParserCtxtPtr ctxt;
/* The context comes from the generic XML memory constructor ... */
6738 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
/* ... so its SAX block is overwritten with the HTML defaults. Only
 * sizeof(xmlSAXHandlerV1) bytes are copied. */
6741 htmlDefaultSAXHandlerInit();
6742 if (ctxt->sax != NULL)
6743 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
/* Final argument is htmlDoRead()'s `reuse` flag: 0 here. */
6744 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6749 * @fd: an open file descriptor
6750 * @URL: the base URL to use for the document
6751 * @encoding: the document encoding, or NULL
6752 * @options: a combination of htmlParserOption(s)
6754 * parse an HTML document from a file descriptor and build a tree.
6756 * Returns the resulting document tree
/*
 * htmlReadFd: wrap file descriptor `fd` in an input buffer and stream,
 * push it onto a fresh parser context, and hand that to htmlDoRead().
 * Argument checks, some NULL checks and the early returns are not visible
 * in this view.
 * NOTE(review): the context is created with xmlNewParserCtxt(), whereas
 * htmlReadIO in this file uses htmlNewParserCtxt(), and no HTML SAX
 * defaults are installed on any visible line -- confirm this is intended.
 */
6759 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6761 htmlParserCtxtPtr ctxt;
6762 xmlParserInputBufferPtr input;
6763 xmlParserInputPtr stream;
/* No encoding is declared at buffer-creation time. */
6769 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6772 ctxt = xmlNewParserCtxt();
/* Presumably the failure path for context creation: the buffer is
 * released. */
6774 xmlFreeParserInputBuffer(input);
6777 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
/* Stream creation failed: release both the buffer and the context. */
6778 if (stream == NULL) {
6779 xmlFreeParserInputBuffer(input);
6780 xmlFreeParserCtxt(ctxt);
6783 inputPush(ctxt, stream);
/* Final argument is htmlDoRead()'s `reuse` flag: 0 here. */
6784 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6789 * @ioread: an I/O read function
6790 * @ioclose: an I/O close function
6791 * @ioctx: an I/O handler
6792 * @URL: the base URL to use for the document
6793 * @encoding: the document encoding, or NULL
6794 * @options: a combination of htmlParserOption(s)
6796 * parse an HTML document from I/O functions and source and build a tree.
6798 * Returns the resulting document tree
/*
 * htmlReadIO: wrap the caller's read/close callbacks in an input buffer
 * and stream, push it onto a fresh HTML parser context, and hand that to
 * htmlDoRead().
 * Argument checks, some NULL checks and the early returns are not visible
 * in this view.
 */
6801 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6802 void *ioctx, const char *URL, const char *encoding, int options)
6804 htmlParserCtxtPtr ctxt;
6805 xmlParserInputBufferPtr input;
6806 xmlParserInputPtr stream;
/* No encoding is declared at buffer-creation time. */
6812 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6813 XML_CHAR_ENCODING_NONE);
/* Buffer creation failed: the close callback, when provided, is involved
 * in cleanup (the statement under this test is not visible here). */
6814 if (input == NULL) {
6815 if (ioclose != NULL)
6819 ctxt = htmlNewParserCtxt();
/* Presumably the failure path for context creation: the buffer is
 * released. */
6821 xmlFreeParserInputBuffer(input);
6824 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
/* Stream creation failed: release both the buffer and the context. */
6825 if (stream == NULL) {
6826 xmlFreeParserInputBuffer(input);
6827 xmlFreeParserCtxt(ctxt);
6830 inputPush(ctxt, stream);
/* Final argument is htmlDoRead()'s `reuse` flag: 0 here. */
6831 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6836 * @ctxt: an HTML parser context
6837 * @cur: a pointer to a zero terminated string
6838 * @URL: the base URL to use for the document
6839 * @encoding: the document encoding, or NULL
6840 * @options: a combination of htmlParserOption(s)
6842 * parse an HTML in-memory document and build a tree.
6843 * This reuses the existing @ctxt parser context
6845 * Returns the resulting document tree
/*
 * htmlCtxtReadDoc: parse the string `cur` using the caller's existing
 * context. The context is reset, a string input stream is pushed on it,
 * and the parse is delegated to htmlDoRead().
 * Argument checks and the stream-failure return are not visible in this
 * view.
 */
6848 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6849 const char *URL, const char *encoding, int options)
6851 xmlParserInputPtr stream;
/* Clear whatever a previous parse left in the context. */
6858 htmlCtxtReset(ctxt);
6860 stream = xmlNewStringInputStream(ctxt, cur);
6861 if (stream == NULL) {
6864 inputPush(ctxt, stream);
/* Final argument is htmlDoRead()'s `reuse` flag: 1 here, since the context
 * belongs to the caller. */
6865 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6870 * @ctxt: an HTML parser context
6871 * @filename: a file or URL
6872 * @encoding: the document encoding, or NULL
6873 * @options: a combination of htmlParserOption(s)
6875 * parse an HTML file from the filesystem or the network.
6876 * This reuses the existing @ctxt parser context
6878 * Returns the resulting document tree
/*
 * htmlCtxtReadFile: parse `filename` using the caller's existing context.
 * The context is reset, the file is loaded as an input stream and pushed
 * on it, and the parse is delegated to htmlDoRead().
 * The statements under the NULL checks are not visible in this view.
 */
6881 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6882 const char *encoding, int options)
6884 xmlParserInputPtr stream;
6886 if (filename == NULL)
/* Clear whatever a previous parse left in the context. */
6891 htmlCtxtReset(ctxt);
/* The file is opened through the external-entity loader, with no second
 * (ID) argument. */
6893 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6894 if (stream == NULL) {
6897 inputPush(ctxt, stream);
/* URL is passed as NULL; `reuse` is 1 since the context belongs to the
 * caller. */
6898 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6902 * htmlCtxtReadMemory:
6903 * @ctxt: an HTML parser context
6904 * @buffer: a pointer to a char array
6905 * @size: the size of the array
6906 * @URL: the base URL to use for the document
6907 * @encoding: the document encoding, or NULL
6908 * @options: a combination of htmlParserOption(s)
6910 * parse an HTML in-memory document and build a tree.
6911 * This reuses the existing @ctxt parser context
6913 * Returns the resulting document tree
/*
 * htmlCtxtReadMemory: parse the `size` bytes at `buffer` using the
 * caller's existing context. The context is reset, the memory is wrapped
 * in an input buffer and stream, and the parse is delegated to
 * htmlDoRead().
 * Argument checks and the failure returns are not visible in this view.
 */
6916 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6917 const char *URL, const char *encoding, int options)
6919 xmlParserInputBufferPtr input;
6920 xmlParserInputPtr stream;
/* Clear whatever a previous parse left in the context. */
6927 htmlCtxtReset(ctxt);
/* No encoding is declared at buffer-creation time. */
6929 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6930 if (input == NULL) {
6934 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
/* Stream creation failed: only the buffer is released; no free of the
 * caller-owned context appears on this path. */
6935 if (stream == NULL) {
6936 xmlFreeParserInputBuffer(input);
6940 inputPush(ctxt, stream);
/* Final argument is htmlDoRead()'s `reuse` flag: 1 here. */
6941 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6946 * @ctxt: an HTML parser context
6947 * @fd: an open file descriptor
6948 * @URL: the base URL to use for the document
6949 * @encoding: the document encoding, or NULL
6950 * @options: a combination of htmlParserOption(s)
6952 * parse an HTML document from a file descriptor and build a tree.
6953 * This reuses the existing @ctxt parser context
6955 * Returns the resulting document tree
/*
 * htmlCtxtReadFd: parse from file descriptor `fd` using the caller's
 * existing context. The context is reset, the descriptor is wrapped in an
 * input buffer and stream, and the parse is delegated to htmlDoRead().
 * Argument checks and the failure returns are not visible in this view.
 */
6958 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6959 const char *URL, const char *encoding, int options)
6961 xmlParserInputBufferPtr input;
6962 xmlParserInputPtr stream;
/* Clear whatever a previous parse left in the context. */
6969 htmlCtxtReset(ctxt);
/* No encoding is declared at buffer-creation time. */
6972 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6975 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
/* Stream creation failed: only the buffer is released; no free of the
 * caller-owned context appears on this path. */
6976 if (stream == NULL) {
6977 xmlFreeParserInputBuffer(input);
6980 inputPush(ctxt, stream);
/* Final argument is htmlDoRead()'s `reuse` flag: 1 here. */
6981 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6986 * @ctxt: an HTML parser context
6987 * @ioread: an I/O read function
6988 * @ioclose: an I/O close function
6989 * @ioctx: an I/O handler
6990 * @URL: the base URL to use for the document
6991 * @encoding: the document encoding, or NULL
6992 * @options: a combination of htmlParserOption(s)
6994 * parse an HTML document from I/O functions and source and build a tree.
6995 * This reuses the existing @ctxt parser context
6997 * Returns the resulting document tree
/*
 * htmlCtxtReadIO: parse from the caller's read/close callbacks using the
 * caller's existing context. The context is reset, the callbacks are
 * wrapped in an input buffer and stream, and the parse is delegated to
 * htmlDoRead().
 * One parameter line, the argument checks and the failure returns are not
 * visible in this view.
 */
7000 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7001 xmlInputCloseCallback ioclose, void *ioctx,
7003 const char *encoding, int options)
7005 xmlParserInputBufferPtr input;
7006 xmlParserInputPtr stream;
/* Clear whatever a previous parse left in the context. */
7013 htmlCtxtReset(ctxt);
/* No encoding is declared at buffer-creation time. */
7015 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7016 XML_CHAR_ENCODING_NONE);
/* Buffer creation failed: the close callback, when provided, is involved
 * in cleanup (the statement under this test is not visible here). */
7017 if (input == NULL) {
7018 if (ioclose != NULL)
7022 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
/* Stream creation failed: only the buffer is released; no free of the
 * caller-owned context appears on this path. */
7023 if (stream == NULL) {
7024 xmlFreeParserInputBuffer(input);
7027 inputPush(ctxt, stream);
/* Final argument is htmlDoRead()'s `reuse` flag: 1 here. */
7028 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7031 #define bottom_HTMLparser
7032 #include "elfgcchack.h"
7033 #endif /* LIBXML_HTML_ENABLED */