2 * HTMLparser.c : an HTML 4.0 non-verifying parser
4 * See Copyright for the status of this software.
11 #ifdef LIBXML_HTML_ENABLED
20 #ifdef HAVE_SYS_STAT_H
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
/* Numeric limits for the HTML parser (judging by their names: a name length
 * cap and two buffer sizes).  NOTE(review): the code that consumes them lies
 * outside this excerpt. */
47 #define HTML_MAX_NAMELEN 1000
48 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
49 #define HTML_PARSER_BUFFER_SIZE 100
52 /* #define DEBUG_PUSH */
/* File-local flag, initialised to 1.  NOTE(review): its readers are not
 * visible in this excerpt. */
54 static int htmlOmittedDefaultValue = 1;
/* Forward declarations: htmlDecodeEntities is declared with external linkage,
 * htmlParseComment is file-local (static). */
56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
58 static void htmlParseComment(htmlParserCtxtPtr ctxt);
60 /************************************************************************
62 * Some factorized error routines *
64 ************************************************************************/
68 * @ctxt: an HTML parser context
69 * @extra: extra information
71 * Handle an out of memory condition
/* htmlErrMemory: reports a memory allocation failure for @ctxt through
 * __xmlRaiseError, using XML_ERR_NO_MEMORY at XML_ERR_FATAL level.
 * NOTE(review): this listing omits several lines of the body (braces and the
 * statements guarded by the tests); the comments describe visible lines only. */
74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
/* Guard: SAX is already disabled and the parser is already in the EOF state.
 * The guarded statement is not visible here - presumably an early return;
 * confirm against the full file. */
76 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
/* Record the failure on the context and move the parser to the EOF state. */
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
/* Report variant that passes @extra along and formats it into the message. */
85 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
88 "Memory allocation failed : %s\n", extra);
/* Report variant with the fixed message and no extra string; presumably
 * chosen when @extra is NULL (the selecting test is not visible). */
90 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string info
101 * @str2: string info
103 * Handle a parser error (raised at XML_ERR_ERROR level) and flag the document as not well formed
/* htmlParseErr: raises @error for @ctxt in the XML_FROM_HTML domain at
 * XML_ERR_ERROR level, handing @str1 and @str2 to __xmlRaiseError, then
 * clears ctxt->wellFormed.
 * NOTE(review): part of the body (including the tail of the raise call) is
 * missing from this listing. */
106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
/* Guard: SAX already disabled and parser already in the EOF state; the
 * guarded statement is not visible - presumably an early return. */
109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
/* The document is no longer considered well formed. */
120 ctxt->wellFormed = 0;
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
130 * Handle a parser error carrying an integer value (raised at XML_ERR_ERROR level) and flag the document as not well formed
/* htmlParseErrInt: integer flavour of the parser error reporter.  Raises
 * @error in the XML_FROM_HTML domain at XML_ERR_ERROR level and clears
 * ctxt->wellFormed.
 * NOTE(review): some body lines are missing from this listing. */
133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
/* Guard: SAX already disabled and parser already in the EOF state; the
 * guarded statement is not visible - presumably an early return. */
136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
/* @val is passed twice: once in the fixed argument list and once after @msg,
 * where it is available to the format string. */
141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
/* The document is no longer considered well formed. */
145 ctxt->wellFormed = 0;
148 /************************************************************************
150 * Parser stacks related functions and macros *
152 ************************************************************************/
156 * @ctxt: an HTML parser context
157 * @value: the element name
159 * Pushes a new element name on top of the name stack
161 * Returns 0 in case of error, the index in the stack otherwise (the first successful push also yields index 0)
/* htmlnamePush: stores @value in the slot at index ctxt->nameNr of
 * ctxt->nameTab and returns that index, incrementing nameNr afterwards.
 * NOTE(review): several body lines are missing from this listing (among them
 * part of the new-size expression and the failure return). */
164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
/* Stack full: the table is reallocated before the new entry is stored. */
166 if (ctxt->nameNr >= ctxt->nameMax) {
/* NOTE(review): the xmlRealloc result is assigned straight back to
 * ctxt->nameTab, so if it returns NULL the previous table pointer is no
 * longer reachable from the context. */
168 ctxt->nameTab = (const xmlChar * *)
169 xmlRealloc((xmlChar * *)ctxt->nameTab,
171 sizeof(ctxt->nameTab[0]));
/* Allocation failure is reported through htmlErrMemory. */
172 if (ctxt->nameTab == NULL) {
173 htmlErrMemory(ctxt, NULL);
177 ctxt->nameTab[ctxt->nameNr] = value;
/* Post-increment: the value returned is the index just filled, which is 0
 * for the first push. */
179 return (ctxt->nameNr++);
183 * @ctxt: an HTML parser context
185 * Pops the top element name from the name stack
187 * Returns the name just removed
/* htmlnamePop: takes the entry at index ctxt->nameNr out of ctxt->nameTab
 * (the slot is reset to NULL) and re-points ctxt->name at the entry below it
 * when one exists.
 * NOTE(review): the statements guarded by the first two tests, and whatever
 * changes nameNr between them (presumably a decrement), are missing from
 * this listing. */
189 static const xmlChar *
190 htmlnamePop(htmlParserCtxtPtr ctxt)
/* Empty-stack test; the guarded statement is not visible. */
194 if (ctxt->nameNr <= 0)
/* Only reachable with a negative count if nameNr was modified after the
 * previous test; the guarded statement is not visible. */
197 if (ctxt->nameNr < 0)
/* Entries remain: the current name becomes the one at nameNr - 1. */
199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
/* Fetch the entry at nameNr and clear its slot. */
203 ret = ctxt->nameTab[ctxt->nameNr];
204 ctxt->nameTab[ctxt->nameNr] = NULL;
209 * Macros for accessing the content. Those should be used only by the parser,
212 * Dirty macros, i.e. one needs to make assumptions on the context to use them
214 * CUR_PTR return the current pointer to the xmlChar to be parsed.
215 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
216 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
217 * in UNICODE mode. This should be used internally by the parser
218 * only to compare to ASCII values otherwise it would break when
219 * running with UTF-8 encoding.
220 * NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
221 * to compare on ASCII based substring.
222 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
223 * it should be used only to compare on ASCII based substring.
224 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
225 * strings without newlines within the parser.
227 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
229 * CURRENT Returns the current char value, with the full decoding of
230 * UTF-8 if we are using this mode. It returns an int.
231 * NEXT Skip to the next character, this does the proper decoding
232 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
233 * NEXTL(l) Skip the current unicode character of l xmlChars long.
234 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
/* All of these macros expect a variable named ctxt to be in scope. */
/* UPPER: the byte at the input cursor, passed through toupper(). */
237 #define UPPER (toupper(*ctxt->input->cur))
/* SKIP: advances the character count, the cursor and the column by val.
 * Expands to an unparenthesised comma expression. */
239 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
/* NXT: the byte val positions after the cursor. */
241 #define NXT(val) ctxt->input->cur[(val)]
/* UPP: same as NXT, passed through toupper(). */
243 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
245 #define CUR_PTR ctxt->input->cur
/* SHRINK: calls xmlParserInputShrink when more than 2 * INPUT_CHUNK bytes lie
 * behind the cursor and fewer than 2 * INPUT_CHUNK lie ahead.  Expands to a
 * bare if statement, so beware of a following else at the call site. */
247 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
248 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
249 xmlParserInputShrink(ctxt->input)
/* GROW: when ctxt->progressive is 0 and fewer than INPUT_CHUNK bytes lie
 * ahead of the cursor, asks for INPUT_CHUNK more.  Also a bare if. */
251 #define GROW if ((ctxt->progressive == 0) && \
252 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
253 xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
/* CURRENT: the byte at the cursor as an int (no multi-byte decoding here). */
255 #define CURRENT ((int) (*ctxt->input->cur))
257 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
259 /* Imported from XML */
261 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
/* CUR has the same expansion as CURRENT. */
262 #define CUR ((int) (*ctxt->input->cur))
263 #define NEXT xmlNextChar(ctxt)
/* RAW: -1 while ctxt->token is non-zero, otherwise the byte at the cursor. */
265 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
/* NXT and CUR_PTR are defined a second time here with the same replacement
 * text as above. */
266 #define NXT(val) ctxt->input->cur[(val)]
267 #define CUR_PTR ctxt->input->cur
/* NEXTL: updates line/col (a newline starts a new line at column 1), clears
 * ctxt->token, moves the cursor by l and counts one character.
 * NOTE(review): the end of this macro is missing from the listing, and the
 * two lines numbered 279-280 appear to belong to a definition whose head is
 * not visible. */
270 #define NEXTL(l) do { \
271 if (*(ctxt->input->cur) == '\n') { \
272 ctxt->input->line++; ctxt->input->col = 1; \
273 } else ctxt->input->col++; \
274 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
279 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
280 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
/* CUR_CHAR / CUR_SCHAR: take the address of l, so l must be an lvalue. */
283 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
284 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
/* COPY_BUF: stores v at b[i] as a single xmlChar when l is 1, otherwise lets
 * xmlCopyChar write it and adds its return value to i.  Expands to a bare
 * if/else with unparenthesised arguments. */
286 #define COPY_BUF(l,b,i,v) \
287 if (l == 1) b[i++] = (xmlChar) v; \
288 else i += xmlCopyChar(l,&b[i],v)
292 * @ctxt: the HTML parser context
293 * @len: pointer to the length of the char read
295 * The current char value, if using UTF-8 this may actually span multiple
296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
301 * Returns the current char value and its length
/* htmlCurrentChar: decodes the character at the input cursor.  In UTF-8 mode
 * multi-byte sequences are validated and assembled; otherwise ASCII bytes are
 * returned as-is and a byte >= 0x80 triggers a switch to ISO-8859-1 input.
 * NOTE(review): this listing omits many lines (declarations of c, val and
 * buffer, the *len assignments, braces, gotos/labels and several returns);
 * the comments below describe the visible statements only. */
305 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306 if (ctxt->instate == XML_PARSER_EOF)
309 if (ctxt->token != 0) {
313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
315 * We are supposed to handle UTF8, check it's valid
316 * From rfc2044: encoding of the Unicode values on UTF-8:
318 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
319 * 0000 0000-0000 007F 0xxxxxxx
320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
323 * Check for the 0x110000 limit too
325 const unsigned char *cur = ctxt->input->cur;
/* Second byte must be a continuation byte (10xxxxxx). */
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 if ((cur[1] & 0xc0) != 0x80)
/* Lead byte 111xxxxx: at least three bytes, so check the third one too. */
335 if ((c & 0xe0) == 0xe0) {
338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339 if ((cur[2] & 0xc0) != 0x80)
/* Lead byte 1111xxxx: four bytes; the lead must be exactly 11110xxx and the
 * fourth byte a continuation byte. */
341 if ((c & 0xf0) == 0xf0) {
343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344 if (((c & 0xf8) != 0xf0) ||
345 ((cur[3] & 0xc0) != 0x80))
/* 4-byte form: 3 + 6 + 6 + 6 payload bits. */
349 val = (cur[0] & 0x7) << 18;
350 val |= (cur[1] & 0x3f) << 12;
351 val |= (cur[2] & 0x3f) << 6;
352 val |= cur[3] & 0x3f;
/* 3-byte form: 4 + 6 + 6 payload bits. */
356 val = (cur[0] & 0xf) << 12;
357 val |= (cur[1] & 0x3f) << 6;
358 val |= cur[2] & 0x3f;
/* 2-byte form: 5 + 6 payload bits. */
363 val = (cur[0] & 0x1f) << 6;
364 val |= cur[1] & 0x3f;
/* Decoded value rejected; the test that selects this report is not visible. */
367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368 "Char 0x%X out of allowed range\n", val);
/* Single-byte case: the byte itself is the character. */
374 return((int) *ctxt->input->cur);
378 * Assume it's a fixed length encoding (1) with
379 * a compatible encoding for the ASCII set, since
380 * XML constructs only use < 128 chars
383 if ((int) *ctxt->input->cur < 0x80)
384 return((int) *ctxt->input->cur);
387 * Humm this is bad, do an automatic flow conversion
/* Non-ASCII byte while the charset is not UTF-8: plug in an ISO-8859-1 input
 * converter, mark the context as UTF-8 and delegate to xmlCurrentChar. */
389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390 ctxt->charset = XML_CHAR_ENCODING_UTF8;
391 return(xmlCurrentChar(ctxt, len));
395 * If we detect an UTF8 error that probably mean that the
396 * input encoding didn't get properly advertized in the
397 * declaration header. Report the error and switch the encoding
398 * to ISO-Latin-1 (if you don't like this policy, just declare the
/* Dump the four bytes starting at the cursor into the error message.
 * NOTE(review): cur[1..3] are read without a visible bounds check. */
404 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
405 ctxt->input->cur[0], ctxt->input->cur[1],
406 ctxt->input->cur[2], ctxt->input->cur[3]);
407 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
408 "Input is not proper UTF-8, indicate encoding !\n",
409 BAD_CAST buffer, NULL);
/* Fall back to ISO-8859-1 and hand back the raw byte. */
412 ctxt->charset = XML_CHAR_ENCODING_8859_1;
414 return((int) *ctxt->input->cur);
418 * htmlSkipBlankChars:
419 * @ctxt: the HTML parser context
421 * Skip all blank characters found at that point in the input stream.
423 * Returns the number of space chars skipped
/* htmlSkipBlankChars: walks over the bytes for which IS_BLANK_CH is true,
 * keeping ctxt->input->line and ->col up to date.
 * NOTE(review): the counter, the cursor advance and the return statement are
 * on lines missing from this listing. */
427 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
430 while (IS_BLANK_CH(*(ctxt->input->cur))) {
/* NUL byte at the cursor and nothing more could be read; the guarded
 * statements are not visible. */
431 if ((*ctxt->input->cur == 0) &&
432 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
/* A newline starts a new line at column 1; any other blank moves one column. */
435 if (*(ctxt->input->cur) == '\n') {
436 ctxt->input->line++; ctxt->input->col = 1;
437 } else ctxt->input->col++;
/* Ask for more input when the byte at the cursor is NUL. */
440 if (*ctxt->input->cur == 0)
441 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
450 /************************************************************************
452 * The list of HTML elements and their properties *
454 ************************************************************************/
457 * Start Tag: 1 means the start tag can be omitted
458 * End Tag: 1 means the end tag can be omitted
459 * 2 means it's forbidden (empty elements)
460 * 3 means the tag is stylistic and should be closed easily
461 * Depr: this element is deprecated
462 * DTD: 1 means that this element is valid only in the Loose DTD
463 * 2 means that this element is valid only in the Frameset DTD
465 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
466 , subElements , impliedsubelt , Attributes, userdata
469 /* Definitions and a couple of vars for HTML Elements */
471 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
472 #define NB_FONTSTYLE 8
473 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
475 #define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
476 #define NB_SPECIAL 15
/*
 * INLINE: names of the inline-level elements, as a comma-separated list of
 * string literals meant to be dropped into an array initializer.
 *
 * The sub-lists must be joined with commas.  Without them the last literal
 * of one list and the first of the next are adjacent and the compiler
 * concatenates them ("small" "em" becomes "smallem"), replacing two valid
 * element names with one bogus name at every junction.
 *
 * NOTE(review): PCDATA is defined outside this excerpt and is assumed to
 * expand to nothing, hence no comma after it - confirm.
 */
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
478 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
/*
 * BLOCK: names of the block-level elements, as a comma-separated list of
 * string literals: the HEADING names, the LIST names, then the fourteen
 * names spelled out here.
 *
 * A comma is required after LIST; otherwise its last literal is concatenated
 * with "pre" ("menu" "pre" becomes "menupre") and both names disappear from
 * every list built on BLOCK.
 */
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
480 #define NB_BLOCK NB_HEADING + NB_LIST + 14
481 #define FORMCTRL "input", "select", "textarea", "label", "button"
482 #define NB_FORMCTRL 5
485 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
487 #define LIST "ul", "ol", "dir", "menu"
490 #define NB_MODIFIER 0
491 #define FLOW BLOCK,INLINE
492 #define NB_FLOW NB_BLOCK + NB_INLINE
496 static const char* const html_flow[] = { FLOW, NULL } ;
497 static const char* const html_inline[] = { INLINE, NULL } ;
499 /* placeholders: elts with content but no subelements */
500 static const char* const html_pcdata[] = { NULL } ;
501 #define html_cdata html_pcdata
504 /* ... and for HTML Attributes */
506 #define COREATTRS "id", "class", "style", "title"
507 #define NB_COREATTRS 4
508 #define I18N "lang", "dir"
510 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
512 #define ATTRS COREATTRS,I18N,EVENTS
/*
 * NB_ATTRS: number of names in ATTRS (COREATTRS + I18N + EVENTS).
 * The first term used to read NB_NB_COREATTRS, an identifier that is defined
 * nowhere, so any use of NB_ATTRS would have failed to compile.  The sum is
 * parenthesised so it can be used safely inside larger expressions.
 */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
514 #define CELLHALIGN "align", "char", "charoff"
515 #define NB_CELLHALIGN 3
516 #define CELLVALIGN "valign"
517 #define NB_CELLVALIGN 1
519 static const char* const html_attrs[] = { ATTRS, NULL } ;
520 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
521 static const char* const core_attrs[] = { COREATTRS, NULL } ;
522 static const char* const i18n_attrs[] = { I18N, NULL } ;
525 /* Other declarations that should go inline ... */
526 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
527 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
528 "tabindex", "onfocus", "onblur", NULL } ;
529 static const char* const target_attr[] = { "target", NULL } ;
530 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
531 static const char* const alt_attr[] = { "alt", NULL } ;
532 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
533 static const char* const href_attrs[] = { "href", NULL } ;
534 static const char* const clear_attrs[] = { "clear", NULL } ;
535 static const char* const inline_p[] = { INLINE, "p", NULL } ;
537 static const char* const flow_param[] = { FLOW, "param", NULL } ;
538 static const char* const applet_attrs[] = { COREATTRS , "codebase",
539 "archive", "alt", "name", "height", "width", "align",
540 "hspace", "vspace", NULL } ;
541 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
542 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
543 static const char* const basefont_attrs[] =
544 { "id", "size", "color", "face", NULL } ;
545 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
546 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
547 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
548 static const char* const body_depr[] = { "background", "bgcolor", "text",
549 "link", "vlink", "alink", NULL } ;
550 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
551 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
554 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
555 static const char* const col_elt[] = { "col", NULL } ;
556 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
557 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
558 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
559 static const char* const compact_attr[] = { "compact", NULL } ;
560 static const char* const label_attr[] = { "label", NULL } ;
561 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
562 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
563 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
564 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
565 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
566 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
567 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
568 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
569 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
570 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
571 static const char* const version_attr[] = { "version", NULL } ;
572 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
573 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
574 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
575 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
576 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
577 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
578 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
579 static const char* const align_attr[] = { "align", NULL } ;
580 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
581 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
582 static const char* const name_attr[] = { "name", NULL } ;
583 static const char* const action_attr[] = { "action", NULL } ;
584 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
585 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
586 static const char* const content_attr[] = { "content", NULL } ;
587 static const char* const type_attr[] = { "type", NULL } ;
588 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
589 static const char* const object_contents[] = { FLOW, "param", NULL } ;
590 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
591 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
592 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
593 static const char* const option_elt[] = { "option", NULL } ;
594 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
595 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
596 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
597 static const char* const width_attr[] = { "width", NULL } ;
598 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
599 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
600 static const char* const language_attr[] = { "language", NULL } ;
601 static const char* const select_content[] = { "optgroup", "option", NULL } ;
602 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
603 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
604 static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
605 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
606 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
607 static const char* const tr_elt[] = { "tr", NULL } ;
608 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
609 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
610 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
611 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
612 static const char* const tr_contents[] = { "th", "td", NULL } ;
613 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
614 static const char* const li_elt[] = { "li", NULL } ;
615 static const char* const ul_depr[] = { "type", "compact", NULL} ;
616 static const char* const dir_attr[] = { "dir", NULL} ;
618 #define DECL (const char**)
620 static const htmlElemDesc
621 html40ElementTable[] = {
622 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
623 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
625 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
626 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
628 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
629 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
631 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
632 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
634 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
635 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
637 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
638 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
640 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
641 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
643 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
644 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
646 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
647 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
649 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
650 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
652 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
653 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
655 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
656 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
658 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
659 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
661 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
662 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
664 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
665 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
667 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
668 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
670 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
671 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
673 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
674 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
676 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
677 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
679 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
680 EMPTY , NULL , DECL col_attrs , NULL, NULL
682 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
683 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
685 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
686 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
688 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
689 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
691 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
692 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
694 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
695 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
697 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
698 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
700 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
701 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
703 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
704 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
706 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
707 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
709 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
710 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
712 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
713 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
715 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
716 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
718 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
719 EMPTY, NULL, NULL, DECL frame_attrs, NULL
721 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
722 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
724 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
725 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
727 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
728 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
730 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
731 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
733 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
734 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
736 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
737 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
739 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
740 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
742 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
743 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
745 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
746 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
748 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
749 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
751 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
752 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
754 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
755 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
757 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
758 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
760 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
761 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
763 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
764 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
766 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
767 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
769 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
770 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
772 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
773 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
775 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
776 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
778 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
779 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
781 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
782 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
784 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
785 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
787 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
788 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
790 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
791 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
793 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
794 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
796 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
797 DECL html_flow, "div", DECL html_attrs, NULL, NULL
799 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
800 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
802 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
803 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
805 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
806 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
808 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
809 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
811 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
812 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
814 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
815 EMPTY, NULL, DECL param_attrs, NULL, name_attr
817 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
818 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
820 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
821 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
823 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
824 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
826 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
827 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
829 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
830 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
832 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
833 DECL select_content, NULL, DECL select_attrs, NULL, NULL
835 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
836 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
838 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
839 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
841 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
842 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
844 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
845 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
847 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
848 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
850 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
851 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
853 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
854 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
856 { "table", 0, 0, 0, 0, 0, 0, 0, "",
857 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
859 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
860 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
862 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
863 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
865 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
866 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
868 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
869 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
871 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
872 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
874 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
875 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
877 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
878 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
880 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
881 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
883 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
884 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
886 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
887 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
889 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
890 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
892 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
893 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
898 * start tags that imply the end of current element
900 static const char * const htmlStartClose[] = {
/*
 * Layout: a flat sequence of NULL-terminated groups of tag names.
 * htmlInitAutoClose() records the address of the first string of each
 * group, and htmlCheckAutoClose() compares that first string with the new
 * start tag and the other strings of the group with the currently open
 * tag.
 */
901 "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
902 "dl", "ul", "ol", "menu", "dir", "address", "pre",
903 "listing", "xmp", "head", NULL,
906 "body", "head", "style", "link", "title", "p", NULL,
907 "frameset", "head", "style", "link", "title", "p", NULL,
908 "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
909 "pre", "listing", "xmp", "head", "li", NULL,
910 "hr", "p", "head", NULL,
911 "h1", "p", "head", NULL,
912 "h2", "p", "head", NULL,
913 "h3", "p", "head", NULL,
914 "h4", "p", "head", NULL,
915 "h5", "p", "head", NULL,
916 "h6", "p", "head", NULL,
917 "dir", "p", "head", NULL,
918 "address", "p", "head", "ul", NULL,
919 "pre", "p", "head", "ul", NULL,
920 "listing", "p", "head", NULL,
921 "xmp", "p", "head", NULL,
922 "blockquote", "p", "head", NULL,
923 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
925 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
927 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
929 "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
930 "listing", "xmp", NULL,
931 "ol", "p", "head", "ul", NULL,
932 "menu", "p", "head", "ul", NULL,
933 "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
934 "div", "p", "head", NULL,
935 "noscript", "p", "head", NULL,
936 "center", "font", "b", "i", "p", "head", NULL,
938 "caption", "p", NULL,
939 "colgroup", "caption", "colgroup", "col", "p", NULL,
940 "col", "caption", "col", "p", NULL,
941 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
942 "listing", "xmp", "a", NULL,
943 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
944 "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
945 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
946 "thead", "caption", "col", "colgroup", NULL,
947 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
949 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
950 "tfoot", "tbody", "p", NULL,
951 "optgroup", "option", NULL,
952 "option", "option", NULL,
953 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
954 "pre", "listing", "xmp", "a", NULL,
959 * The list of HTML elements which are supposed not to have
960 * CDATA content and where a p element will be implied
962 * TODO: extend that list by reading the HTML SGML DTD on
965 static const char *const htmlNoContentElements[] = {
/* htmlCheckParagraph() walks this list until it reaches a NULL entry, so the list must end with NULL. */
972 * The list of HTML attributes which are of content %Script;
973 * NOTE: when adding ones, check htmlIsScriptAttribute() since
974 * it assumes the name starts with 'on'
976 static const char *const htmlScriptAttributes[] = {
/* htmlIsScriptAttribute() bounds its scan with sizeof(array)/sizeof(element), so every slot is compared. */
998 * This table is used by the htmlparser to know what to do with
999 * broken html pages. By assigning different priorities to different
1000 * elements the parser can decide how to handle extra endtags.
1001 * Endtags are only allowed to close elements with lower or equal
1010 static const elementPriority htmlEndPriority[] = {
/*
 * htmlGetEndPriority() scans the entries in order and stops at the first
 * entry whose name matches or at the NULL-named entry; the priority of the
 * entry it stopped on is returned, so the last row acts as the default.
 */
1022 {NULL, 100} /* Default priority */
/* Pointers to the first string of each group in htmlStartClose; htmlInitAutoClose() sets every slot to NULL before filling it. */
1025 static const char** htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index has been built; checked there and in htmlCheckAutoClose(). */
1026 static int htmlStartCloseIndexinitialized = 0;
1028 /************************************************************************
1030 * functions to handle HTML specific data *
1032 ************************************************************************/
1035 * htmlInitAutoClose:
1037 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1038 * This is not reentrant. Call xmlInitParser() once before processing in
1039 * case of use in multithreaded programs.
1042 htmlInitAutoClose(void) {
/* Builds htmlStartCloseIndex from htmlStartClose; does nothing when the initialized flag is already set. */
1045 if (htmlStartCloseIndexinitialized) return;
/* Reset all 100 index slots to NULL before filling them. */
1047 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
/*
 * Store the address of the first string of each group. The
 * `indx < 100 - 1` bound means slot 99 is never written by this loop.
 */
1049 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1050 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
/* Advance i to the NULL that ends the current group. */
1051 while (htmlStartClose[i] != NULL) i++;
/* Mark the index as built so later calls return early. */
1054 htmlStartCloseIndexinitialized = 1;
1059 * @tag: The tag name in lowercase
1061 * Lookup the HTML tag in the ElementTable
1063 * Returns the related htmlElemDescPtr or NULL if not found.
1065 const htmlElemDesc *
1066 htmlTagLookup(const xmlChar *tag) {
/*
 * Linear scan over every entry of html40ElementTable. Names are compared
 * with xmlStrcasecmp() (presumably case-insensitive -- confirm), and a
 * zero result is treated as a match.
 */
1069 for (i = 0; i < (sizeof(html40ElementTable) /
1070 sizeof(html40ElementTable[0]));i++) {
1071 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
/* Return the address of the matching table entry itself, not a copy. */
1072 return((htmlElemDescPtr) &html40ElementTable[i]);
1078 * htmlGetEndPriority:
1079 * @name: The name of the element to look up the priority for.
1081 * Return value: The "endtag" priority.
1084 htmlGetEndPriority (const xmlChar *name) {
/* Scan htmlEndPriority while the entry has a non-NULL name that differs from @name. */
1087 while ((htmlEndPriority[i].name != NULL) &&
1088 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
/* The scan stopped on a matching entry or on the NULL-named entry; return that entry's priority. */
1091 return(htmlEndPriority[i].priority);
1096 * htmlCheckAutoClose:
1097 * @newtag: The new tag name
1098 * @oldtag: The old tag name
1100 * Checks whether the new tag is one of the registered valid tags for
1102 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1104 * Returns 0 if no, 1 if yes.
1107 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
/* Looks up @newtag among the group heads of htmlStartClose, then searches that group for @oldtag. */
1110 const char **closed = NULL;
/* Build the lookup index on first use. */
1112 if (htmlStartCloseIndexinitialized == 0)
1113 htmlInitAutoClose();
1115 /* inefficient, but not a big deal */
1116 for (indx = 0; indx < 100; indx++) {
1117 closed = htmlStartCloseIndex[indx];
/* *closed is the first string of a group; it is compared with @newtag. */
1120 if (xmlStrEqual(BAD_CAST * closed, newtag))
/* Turn the group pointer back into an offset inside htmlStartClose. */
1124 i = closed - htmlStartClose;
/* Compare the group's strings with @oldtag until the terminating NULL. */
1126 while (htmlStartClose[i] != NULL) {
1127 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1136 * htmlAutoCloseOnClose:
1137 * @ctxt: an HTML parser context
1138 * @newtag: The new tag name
1139 * @force: force the tag closure
1141 * The HTML DTD allows an ending tag to implicitly close other tags.
1144 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
/*
 * Processes end tag @newtag against the names recorded in ctxt->nameTab.
 * NOTE(review): the header comment documents an @force parameter, but the
 * signature takes only @ctxt and @newtag.
 */
1146 const htmlElemDesc *info;
/* Priority of the end tag; compared below with each recorded element's priority. */
1149 priority = htmlGetEndPriority(newtag);
/* Walk ctxt->nameTab from index nameNr - 1 down to 0, looking for an entry equal to @newtag. */
1151 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1153 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1156 * A missplaced endtag can only close elements with lower
1157 * or equal priority, so if we find an element with higher
1158 * priority before we find an element with
1159 * matching name, we just ignore this endtag
1161 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
/* Repeat until the current element name (ctxt->name) equals @newtag. */
1167 while (!xmlStrEqual(newtag, ctxt->name)) {
1168 info = htmlTagLookup(ctxt->name);
/* The mismatch error is raised only for known elements whose descriptor has endTag == 3. */
1169 if ((info != NULL) && (info->endTag == 3)) {
1170 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1171 "Opening and ending tag mismatch: %s and %s\n",
1172 newtag, ctxt->name);
/* Report the end of the current element to the SAX handler when one is installed. */
1174 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1175 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1181 * htmlAutoCloseOnEnd:
1182 * @ctxt: an HTML parser context
1184 * Close all remaining tags at the end of the stream
1187 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
/* Reports endElement for the elements still counted in ctxt->nameNr. */
1191 if (ctxt->nameNr == 0)
/*
 * One iteration per recorded element, from index nameNr - 1 down to 0.
 * Each iteration passes ctxt->name (not nameTab[i]) to the SAX endElement
 * callback when one is installed.
 */
1193 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1194 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1202 * @ctxt: an HTML parser context
1203 * @newtag: The new tag name or NULL
1205 * The HTML DTD allows a tag to implicitly close other tags.
1206 * The list is kept in htmlStartClose array. This function is
1207 * called when a new tag has been detected and generates the
1208 * appropriate closes if possible/needed.
1209 * If newtag is NULL this means we are at the end of the resource
1210 * and we should check
1213 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
/* Non-NULL @newtag: keep ending the current element for as long as htmlCheckAutoClose() says @newtag closes it. */
1215 while ((newtag != NULL) && (ctxt->name != NULL) &&
1216 (htmlCheckAutoClose(newtag, ctxt->name))) {
1217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1218 ctxt->sax->endElement(ctxt->userData, ctxt->name);
/* NULL @newtag: hand over to htmlAutoCloseOnEnd(). */
1221 if (newtag == NULL) {
1222 htmlAutoCloseOnEnd(ctxt);
/* Still with NULL @newtag: end the current element while it is named head, body or html. */
1225 while ((newtag == NULL) && (ctxt->name != NULL) &&
1226 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1227 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1228 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1229 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1230 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1237 * @doc: the HTML document
1238 * @name: The tag name
1239 * @elem: the HTML element
1241 * The HTML DTD allows a tag to implicitly close other tags.
1242 * The list is kept in htmlStartClose array. This function checks
1243 * if the element or one of its children would autoclose the
1246 * Returns 1 if autoclose, 0 otherwise
1249 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
/*
 * Recursive test over @elem and its descendants. In the lines visible
 * here @doc is only forwarded to the recursive call.
 */
/* A NULL element yields 1. */
1252 if (elem == NULL) return(1);
/* An element with the same name as @name yields 0 and is not descended into. */
1253 if (xmlStrEqual(name, elem->name)) return(0);
/* Ask whether @elem's name, used as the new tag, closes @name used as the old tag. */
1254 if (htmlCheckAutoClose(elem->name, name)) return(1);
/* Otherwise apply the same test to each child in turn, stopping at the first 1. */
1255 child = elem->children;
1256 while (child != NULL) {
1257 if (htmlAutoCloseTag(doc, name, child)) return(1);
1258 child = child->next;
1265 * @doc: the HTML document
1266 * @elem: the HTML element
1268 * The HTML DTD allows a tag to implicitly close other tags.
1269 * The list is kept in htmlStartClose array. This function checks
1270 * if a tag is autoclosed by one of its children
1272 * Returns 1 if autoclosed, 0 otherwise
1275 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
/* A NULL element yields 1. */
1278 if (elem == NULL) return(1);
/* Run htmlAutoCloseTag() with @elem's own name on each child; the first child that reports 1 decides the result. */
1279 child = elem->children;
1280 while (child != NULL) {
1281 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1282 child = child->next;
1289 * @ctxt: an HTML parser context
1290 * @newtag: The new tag name
1292 * The HTML DTD allows a tag to exist only implicitly. This function is
1293 * called when a new tag has been detected and generates the
1294 * appropriate implicit tags if missing
1297 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
/* Pushes and reports missing html, head or body start tags before @newtag is processed. */
/* Tested first: the file-level switch htmlOmittedDefaultValue. */
1298 if (!htmlOmittedDefaultValue)
/* Tested next: @newtag being "html" itself. */
1300 if (xmlStrEqual(newtag, BAD_CAST"html"))
/* No element recorded yet: push "html" and report its start to the SAX handler. */
1302 if (ctxt->nameNr <= 0) {
1303 htmlnamePush(ctxt, BAD_CAST"html");
1304 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1305 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
/* Tested next: @newtag being "body" or "head". */
1307 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
/* With at most one element recorded, script/style/meta/link/title/base lead to a pushed "head". */
1309 if ((ctxt->nameNr <= 1) &&
1310 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1311 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1312 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1313 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1314 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1315 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1317 * dropped OBJECT ... i you put it first BODY will be
1320 htmlnamePush(ctxt, BAD_CAST"head");
1321 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1322 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
/* Any tag other than noframes, frame and frameset takes the "body" branch. */
1323 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1324 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1325 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
/* Look through the recorded names for an existing "body" or "head". */
1327 for (i = 0;i < ctxt->nameNr;i++) {
1328 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1331 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
/* Push "body" and report its start to the SAX handler. */
1336 htmlnamePush(ctxt, BAD_CAST"body");
1337 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1338 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1343 * htmlCheckParagraph
1344 * @ctxt: an HTML parser context
1346 * Check whether a p element needs to be implied before inserting
1347 * characters in the current element.
1349 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1354 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
/*
 * Implied-paragraph sequence: close what a "p" start tag closes, insert
 * any implied ancestors, push "p", then report its start to the SAX
 * handler when one is installed.
 */
1362 htmlAutoClose(ctxt, BAD_CAST"p");
1363 htmlCheckImplied(ctxt, BAD_CAST"p");
1364 htmlnamePush(ctxt, BAD_CAST"p");
1365 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1366 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
/* Tested before the list scan: the file-level switch htmlOmittedDefaultValue. */
1369 if (!htmlOmittedDefaultValue)
/* Run the same sequence when tag equals an entry of the NULL-terminated htmlNoContentElements list. */
1371 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1372 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1373 htmlAutoClose(ctxt, BAD_CAST"p");
1374 htmlCheckImplied(ctxt, BAD_CAST"p");
1375 htmlnamePush(ctxt, BAD_CAST"p");
1376 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1377 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1385 * htmlIsScriptAttribute:
1386 * @name: an attribute name
1388 * Check if an attribute is of content type Script
1390 * Returns 1 if the attribute is a script, 0 otherwise
1393 htmlIsScriptAttribute(const xmlChar *name) {
/* Tests @name against the htmlScriptAttributes table after a two-character prefix check. */
1399 * all script attributes start with 'on'
1401 if ((name[0] != 'o') || (name[1] != 'n'))
/* Exact, case-sensitive comparison (xmlStrEqual) against every table slot. */
1404 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1406 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1412 /************************************************************************
1414 * The list of HTML predefined entities *
1416 ************************************************************************/
1419 static const htmlEntityDesc html40EntitiesTable[] = {
/*
 * Each row is { numeric character value, entity name, description text }.
 * htmlEntityLookup() searches rows by name. htmlEntityValueLookup()
 * searches by value and tests `>= value` on each row, so it appears to
 * rely on rows being in ascending value order -- confirm before inserting
 * new rows out of order.
 */
1421 * the 4 absolute ones, plus apostrophe.
1423 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1424 { 38, "amp", "ampersand, U+0026 ISOnum" },
1425 { 39, "apos", "single quote" },
1426 { 60, "lt", "less-than sign, U+003C ISOnum" },
1427 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1430 * A bunch still in the 128-255 range
1431 * Replacing them depend really on the charset used.
1433 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1434 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1435 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1436 { 163, "pound","pound sign, U+00A3 ISOnum" },
1437 { 164, "curren","currency sign, U+00A4 ISOnum" },
1438 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1439 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1440 { 167, "sect", "section sign, U+00A7 ISOnum" },
1441 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1442 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1443 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1444 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1445 { 172, "not", "not sign, U+00AC ISOnum" },
1446 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1447 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1448 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1449 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1450 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1451 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1452 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1453 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1454 { 181, "micro","micro sign, U+00B5 ISOnum" },
1455 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1456 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1457 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1458 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1459 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1460 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1461 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1462 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1463 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1464 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1465 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1466 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1467 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1468 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1469 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1470 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1471 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1472 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1473 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1474 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1475 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1476 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1477 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1478 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1479 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1480 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1481 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1482 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1483 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1484 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1485 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1486 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1487 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1488 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1489 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1490 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1491 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1492 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1493 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1494 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1495 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1496 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1497 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1498 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1499 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1500 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1501 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1502 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1503 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1504 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1505 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1506 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1507 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1508 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1509 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1510 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1511 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1512 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1513 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1514 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1515 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1516 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1517 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1518 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1519 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1520 { 247, "divide","division sign, U+00F7 ISOnum" },
1521 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1522 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1523 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1524 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1525 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1526 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1527 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1528 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1530 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1531 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1532 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1533 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1534 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1537 * Anything below should really be kept as entities references
1539 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1541 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1542 { 732, "tilde","small tilde, U+02DC ISOdia" },
1544 { 913, "Alpha","greek capital letter alpha, U+0391" },
1545 { 914, "Beta", "greek capital letter beta, U+0392" },
1546 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1547 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1548 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1549 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1550 { 919, "Eta", "greek capital letter eta, U+0397" },
1551 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1552 { 921, "Iota", "greek capital letter iota, U+0399" },
1553 { 922, "Kappa","greek capital letter kappa, U+039A" },
1554 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1555 { 924, "Mu", "greek capital letter mu, U+039C" },
1556 { 925, "Nu", "greek capital letter nu, U+039D" },
1557 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1558 { 927, "Omicron","greek capital letter omicron, U+039F" },
1559 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1560 { 929, "Rho", "greek capital letter rho, U+03A1" },
1561 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1562 { 932, "Tau", "greek capital letter tau, U+03A4" },
1563 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1564 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1565 { 935, "Chi", "greek capital letter chi, U+03A7" },
1566 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1567 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1569 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1570 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1571 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1572 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1573 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1574 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1575 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1576 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1577 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1578 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1579 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1580 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1581 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1582 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1583 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1584 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1585 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1586 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1587 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1588 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1589 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1590 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1591 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1592 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1593 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1594 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1595 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1596 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1598 { 8194, "ensp", "en space, U+2002 ISOpub" },
1599 { 8195, "emsp", "em space, U+2003 ISOpub" },
1600 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1601 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1602 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1603 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1604 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1605 { 8211, "ndash","en dash, U+2013 ISOpub" },
1606 { 8212, "mdash","em dash, U+2014 ISOpub" },
1607 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1608 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1609 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1610 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1611 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1612 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1613 { 8224, "dagger","dagger, U+2020 ISOpub" },
1614 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1616 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1617 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1619 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1621 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1622 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1624 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1625 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1627 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1628 { 8260, "frasl","fraction slash, U+2044 NEW" },
1630 { 8364, "euro", "euro sign, U+20AC NEW" },
1632 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1633 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1634 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1635 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1636 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1637 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1638 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1639 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1640 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1641 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1642 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1643 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1644 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1645 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1646 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1647 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1649 { 8704, "forall","for all, U+2200 ISOtech" },
1650 { 8706, "part", "partial differential, U+2202 ISOtech" },
1651 { 8707, "exist","there exists, U+2203 ISOtech" },
1652 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1653 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1654 { 8712, "isin", "element of, U+2208 ISOtech" },
1655 { 8713, "notin","not an element of, U+2209 ISOtech" },
1656 { 8715, "ni", "contains as member, U+220B ISOtech" },
1657 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1658 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1659 { 8722, "minus","minus sign, U+2212 ISOtech" },
1660 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1661 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1662 { 8733, "prop", "proportional to, U+221D ISOtech" },
1663 { 8734, "infin","infinity, U+221E ISOtech" },
1664 { 8736, "ang", "angle, U+2220 ISOamso" },
1665 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1666 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1667 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1668 { 8746, "cup", "union = cup, U+222A ISOtech" },
1669 { 8747, "int", "integral, U+222B ISOtech" },
1670 { 8756, "there4","therefore, U+2234 ISOtech" },
1671 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1672 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1673 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1674 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1675 { 8801, "equiv","identical to, U+2261 ISOtech" },
1676 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1677 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1678 { 8834, "sub", "subset of, U+2282 ISOtech" },
1679 { 8835, "sup", "superset of, U+2283 ISOtech" },
1680 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1681 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1682 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1683 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1684 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1685 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1686 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1687 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1688 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1689 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1690 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1691 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1692 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1693 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1695 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1696 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1697 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1698 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1702 /************************************************************************
1704 * Commodity functions to handle entities *
1706 ************************************************************************/
1709 * Macro used to grow the current buffer.
/*
 * growBuffer(buffer): doubles the variable named <buffer>_size and calls
 * xmlRealloc() on @buffer with the new size; the result is stored in tmp
 * and checked for NULL, in which case htmlErrMemory() is called. The
 * expansion refers to `ctxt`, so a variable of that name must be in scope
 * where the macro is used.
 */
1711 #define growBuffer(buffer) { \
1713 buffer##_size *= 2; \
1714 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
1715 if (tmp == NULL) { \
1716 htmlErrMemory(ctxt, "growing buffer\n"); \
1725 * @name: the entity name
1727 * Lookup the given entity in EntitiesTable
1729 * TODO: the linear scan is really ugly, a hash table is really needed.
1731 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1733 const htmlEntityDesc *
1734 htmlEntityLookup(const xmlChar *name) {
/* Linear scan over every row of html40EntitiesTable; names are compared exactly with xmlStrEqual(). */
1737 for (i = 0;i < (sizeof(html40EntitiesTable)/
1738 sizeof(html40EntitiesTable[0]));i++) {
1739 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
/* Return the address of the matching table row itself, not a copy. */
1740 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1747 * htmlEntityValueLookup:
1748 * @value: the entity's unicode value
1750 * Lookup the given entity in EntitiesTable
1752 * TODO: the linear scan is really ugly, a hash table is really needed.
1754 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1756 const htmlEntityDesc *
1757 htmlEntityValueLookup(unsigned int value) {
/* Scan html40EntitiesTable in table order, looking at the first row whose value is >= @value. */
1760 for (i = 0;i < (sizeof(html40EntitiesTable)/
1761 sizeof(html40EntitiesTable[0]));i++) {
1762 if (html40EntitiesTable[i].value >= value) {
/* A strictly larger value is tested separately from equality; presumably the scan gives up here -- confirm. */
1763 if (html40EntitiesTable[i].value > value)
/* Otherwise the row's value equals @value: return the address of that row. */
1765 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1773 * @out: a pointer to an array of bytes to store the result
1774 * @outlen: the length of @out
1775 * @in: a pointer to an array of UTF-8 chars
1776 * @inlen: the length of @in
1778 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1779 * plus HTML entities block of chars out.
1781 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1782 * The value of @inlen after return is the number of octets consumed
1783 * as the return value is positive, else unpredictable.
1784 * The value of @outlen after return is the number of octets consumed.
1787 UTF8ToHtml(unsigned char* out, int *outlen,
1788 const unsigned char* in, int *inlen) {
1789 const unsigned char* processed = in;
1790 const unsigned char* outend;
1791 const unsigned char* outstart = out;
1792 const unsigned char* instart = in;
1793 const unsigned char* inend;
1797 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1800 * initialization nothing to do
1806 inend = in + (*inlen);
1807 outend = out + (*outlen);
1808 while (in < inend) {
1810 if (d < 0x80) { c= d; trailing= 0; }
1811 else if (d < 0xC0) {
1812 /* trailing byte in leading position */
1813 *outlen = out - outstart;
1814 *inlen = processed - instart;
1816 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1817 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1818 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1820 /* no chance for this in Ascii */
1821 *outlen = out - outstart;
1822 *inlen = processed - instart;
1826 if (inend - in < trailing) {
1830 for ( ; trailing; trailing--) {
1831 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1837 /* assertion: c is a single UTF-4 value */
1839 if (out + 1 >= outend)
1844 const htmlEntityDesc * ent;
1847 * Try to lookup a predefined HTML entity for it
1850 ent = htmlEntityValueLookup(c);
1852 /* no chance for this in Ascii */
1853 *outlen = out - outstart;
1854 *inlen = processed - instart;
1857 len = strlen(ent->name);
1858 if (out + 2 + len >= outend)
1861 memcpy(out, ent->name, len);
1867 *outlen = out - outstart;
1868 *inlen = processed - instart;
1873 * htmlEncodeEntities:
1874 * @out: a pointer to an array of bytes to store the result
1875 * @outlen: the length of @out
1876 * @in: a pointer to an array of UTF-8 chars
1877 * @inlen: the length of @in
1878 * @quoteChar: the quote character to escape (' or ") or zero.
1880 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1881 * plus HTML entities block of chars out.
1883 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1884 * The value of @inlen after return is the number of octets consumed
1885 * as the return value is positive, else unpredictable.
1886 * The value of @outlen after return is the number of octets consumed.
1889 htmlEncodeEntities(unsigned char* out, int *outlen,
1890 const unsigned char* in, int *inlen, int quoteChar) {
1891 const unsigned char* processed = in;
1892 const unsigned char* outend;
1893 const unsigned char* outstart = out;
1894 const unsigned char* instart = in;
1895 const unsigned char* inend;
1899 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1901 outend = out + (*outlen);
1902 inend = in + (*inlen);
1903 while (in < inend) {
1905 if (d < 0x80) { c= d; trailing= 0; }
1906 else if (d < 0xC0) {
1907 /* trailing byte in leading position */
1908 *outlen = out - outstart;
1909 *inlen = processed - instart;
1911 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1912 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1913 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1915 /* no chance for this in Ascii */
1916 *outlen = out - outstart;
1917 *inlen = processed - instart;
1921 if (inend - in < trailing)
1924 while (trailing--) {
1925 if (((d= *in++) & 0xC0) != 0x80) {
1926 *outlen = out - outstart;
1927 *inlen = processed - instart;
1934 /* assertion: c is a single UTF-4 value */
1935 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1936 (c != '&') && (c != '<') && (c != '>')) {
1941 const htmlEntityDesc * ent;
1947 * Try to lookup a predefined HTML entity for it
1949 ent = htmlEntityValueLookup(c);
1951 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1957 if (out + 2 + len > outend)
1960 memcpy(out, cp, len);
1966 *outlen = out - outstart;
1967 *inlen = processed - instart;
1971 /************************************************************************
1973 * Commodity functions to handle streams *
1975 ************************************************************************/
1978 * htmlNewInputStream:
1979 * @ctxt: an HTML parser context
1981 * Create a new input stream structure
1982 * Returns the new input stream or NULL
1984 static htmlParserInputPtr
1985 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1986 htmlParserInputPtr input;
1988 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1989 if (input == NULL) {
1990 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
1993 memset(input, 0, sizeof(htmlParserInput));
1994 input->filename = NULL;
1995 input->directory = NULL;
2003 input->version = NULL;
2004 input->consumed = 0;
2010 /************************************************************************
2012 * Commodity functions, cleanup needed ? *
2014 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2032 * @ctxt: an HTML parser context
2034 * @len: the size of @str
2036 * Is this a sequence of blank chars that one can ignore ?
2038 * Returns 1 if ignorable 0 otherwise.
2041 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2044 xmlNodePtr lastChild;
2047 for (j = 0;j < len;j++)
2048 if (!(IS_BLANK_CH(str[j]))) return(0);
2050 if (CUR == 0) return(1);
2051 if (CUR != '<') return(0);
2052 if (ctxt->name == NULL)
2054 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2056 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2059 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2060 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2061 dtd = xmlGetIntSubset(ctxt->myDoc);
2062 if (dtd != NULL && dtd->ExternalID != NULL) {
2063 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2064 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2069 if (ctxt->node == NULL) return(0);
2070 lastChild = xmlGetLastChild(ctxt->node);
2071 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2072 lastChild = lastChild->prev;
2073 if (lastChild == NULL) {
2074 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2075 (ctxt->node->content != NULL)) return(0);
2076 /* keep ws in constructs like ...<b> </b>...
2077 for all tags "b" allowing PCDATA */
2078 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2079 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2083 } else if (xmlNodeIsText(lastChild)) {
2086 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2087 for all tags "p" allowing PCDATA */
2088 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2099 * @URI: URI for the dtd, or NULL
2100 * @ExternalID: the external ID of the DTD, or NULL
2102 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2105 * Returns a new document, do not initialize the DTD if not provided
2108 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2112 * Allocate a new document and fill the fields.
2114 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2116 htmlErrMemory(NULL, "HTML document creation failed\n");
2119 memset(cur, 0, sizeof(xmlDoc));
2121 cur->type = XML_HTML_DOCUMENT_NODE;
2122 cur->version = NULL;
2123 cur->intSubset = NULL;
2126 cur->children = NULL;
2127 cur->extSubset = NULL;
2129 cur->encoding = NULL;
2130 cur->standalone = 1;
2131 cur->compression = 0;
2134 cur->_private = NULL;
2135 cur->charset = XML_CHAR_ENCODING_UTF8;
2136 if ((ExternalID != NULL) ||
2138 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2144 * @URI: URI for the dtd, or NULL
2145 * @ExternalID: the external ID of the DTD, or NULL
2147 * Creates a new HTML document
2149 * Returns a new document
2152 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2153 if ((URI == NULL) && (ExternalID == NULL))
2154 return(htmlNewDocNoDtD(
2155 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2156 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2158 return(htmlNewDocNoDtD(URI, ExternalID));
2162 /************************************************************************
2164 * The parser itself *
2165 * Relates to http://www.w3.org/TR/html40 *
2167 ************************************************************************/
2169 /************************************************************************
2171 * The parser itself *
2173 ************************************************************************/
2175 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2178 * htmlParseHTMLName:
2179 * @ctxt: an HTML parser context
2181 * parse an HTML tag or attribute name, note that we convert it to lowercase
2182 * since HTML names are not case-sensitive.
2184 * Returns the Tag Name parsed or NULL
2187 static const xmlChar *
2188 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2190 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2192 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2193 (CUR != ':')) return(NULL);
2195 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2196 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2197 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2198 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2205 return(xmlDictLookup(ctxt->dict, loc, i));
2210 * @ctxt: an HTML parser context
2212 * parse an HTML name, this routine is case sensitive.
2214 * Returns the Name parsed or NULL
2217 static const xmlChar *
2218 htmlParseName(htmlParserCtxtPtr ctxt) {
2226 * Accelerator for simple ASCII names
2228 in = ctxt->input->cur;
2229 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2230 ((*in >= 0x41) && (*in <= 0x5A)) ||
2231 (*in == '_') || (*in == ':')) {
2233 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2234 ((*in >= 0x41) && (*in <= 0x5A)) ||
2235 ((*in >= 0x30) && (*in <= 0x39)) ||
2236 (*in == '_') || (*in == '-') ||
2237 (*in == ':') || (*in == '.'))
2239 if ((*in > 0) && (*in < 0x80)) {
2240 count = in - ctxt->input->cur;
2241 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2242 ctxt->input->cur = in;
2243 ctxt->nbChars += count;
2244 ctxt->input->col += count;
2248 return(htmlParseNameComplex(ctxt));
2251 static const xmlChar *
2252 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2258 * Handler for more complex cases
2262 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2263 (!IS_LETTER(c) && (c != '_') &&
2268 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2269 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2270 (c == '.') || (c == '-') ||
2271 (c == '_') || (c == ':') ||
2272 (IS_COMBINING(c)) ||
2273 (IS_EXTENDER(c)))) {
2274 if (count++ > 100) {
2282 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2287 * htmlParseHTMLAttribute:
2288 * @ctxt: an HTML parser context
2289 * @stop: a char stop value
2291 * parse an HTML attribute value till the stop (quote), if
2292 * stop is 0 then it stops at the first space
2294 * Returns the attribute parsed or NULL
2298 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2299 xmlChar *buffer = NULL;
2300 int buffer_size = 0;
2301 xmlChar *out = NULL;
2302 const xmlChar *name = NULL;
2303 const xmlChar *cur = NULL;
2304 const htmlEntityDesc * ent;
2307 * allocate a translation buffer.
2309 buffer_size = HTML_PARSER_BUFFER_SIZE;
2310 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2311 if (buffer == NULL) {
2312 htmlErrMemory(ctxt, "buffer allocation failed\n");
2318 * Ok loop until we reach one of the ending chars
2320 while ((CUR != 0) && (CUR != stop)) {
2321 if ((stop == 0) && (CUR == '>')) break;
2322 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2324 if (NXT(1) == '#') {
2328 c = htmlParseCharRef(ctxt);
2330 { *out++ = c; bits= -6; }
2332 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2333 else if (c < 0x10000)
2334 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2336 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2338 for ( ; bits >= 0; bits-= 6) {
2339 *out++ = ((c >> bits) & 0x3F) | 0x80;
2342 if (out - buffer > buffer_size - 100) {
2343 int indx = out - buffer;
2346 out = &buffer[indx];
2349 ent = htmlParseEntityRef(ctxt, &name);
2352 if (out - buffer > buffer_size - 100) {
2353 int indx = out - buffer;
2356 out = &buffer[indx];
2358 } else if (ent == NULL) {
2362 if (out - buffer > buffer_size - 100) {
2363 int indx = out - buffer;
2366 out = &buffer[indx];
2374 if (out - buffer > buffer_size - 100) {
2375 int indx = out - buffer;
2378 out = &buffer[indx];
2382 { *out++ = c; bits= -6; }
2384 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2385 else if (c < 0x10000)
2386 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2388 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2390 for ( ; bits >= 0; bits-= 6) {
2391 *out++ = ((c >> bits) & 0x3F) | 0x80;
2399 if (out - buffer > buffer_size - 100) {
2400 int indx = out - buffer;
2403 out = &buffer[indx];
2407 { *out++ = c; bits= -6; }
2409 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2410 else if (c < 0x10000)
2411 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2413 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2415 for ( ; bits >= 0; bits-= 6) {
2416 *out++ = ((c >> bits) & 0x3F) | 0x80;
2426 * htmlParseEntityRef:
2427 * @ctxt: an HTML parser context
2428 * @str: location to store the entity name
2430 * parse an HTML ENTITY references
2432 * [68] EntityRef ::= '&' Name ';'
2434 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2435 * if non-NULL *str will have to be freed by the caller.
2437 const htmlEntityDesc *
2438 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2439 const xmlChar *name;
2440 const htmlEntityDesc * ent = NULL;
2442 if (str != NULL) *str = NULL;
2443 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2447 name = htmlParseName(ctxt);
2449 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2450 "htmlParseEntityRef: no name\n", NULL, NULL);
2458 * Lookup the entity in the table.
2460 ent = htmlEntityLookup(name);
2461 if (ent != NULL) /* OK that's ugly !!! */
2464 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2465 "htmlParseEntityRef: expecting ';'\n",
2476 * htmlParseAttValue:
2477 * @ctxt: an HTML parser context
2479 * parse a value for an attribute
2480 * Note: the parser won't do substitution of entities here, this
2481 * will be handled later in xmlStringGetNodeList, unless it was
2482 * asked for ctxt->replaceEntities != 0
2484 * Returns the AttValue parsed or NULL.
2488 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2489 xmlChar *ret = NULL;
2493 ret = htmlParseHTMLAttribute(ctxt, '"');
2495 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2496 "AttValue: \" expected\n", NULL, NULL);
2499 } else if (CUR == '\'') {
2501 ret = htmlParseHTMLAttribute(ctxt, '\'');
2503 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2504 "AttValue: ' expected\n", NULL, NULL);
2509 * That's an HTMLism, the attribute value may not be quoted
2511 ret = htmlParseHTMLAttribute(ctxt, 0);
2513 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2514 "AttValue: no value found\n", NULL, NULL);
2521 * htmlParseSystemLiteral:
2522 * @ctxt: an HTML parser context
2524 * parse an HTML Literal
2526 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2528 * Returns the SystemLiteral parsed or NULL
2532 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2534 xmlChar *ret = NULL;
2539 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2541 if (!IS_CHAR_CH(CUR)) {
2542 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2543 "Unfinished SystemLiteral\n", NULL, NULL);
2545 ret = xmlStrndup(q, CUR_PTR - q);
2548 } else if (CUR == '\'') {
2551 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2553 if (!IS_CHAR_CH(CUR)) {
2554 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2555 "Unfinished SystemLiteral\n", NULL, NULL);
2557 ret = xmlStrndup(q, CUR_PTR - q);
2561 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2562 " or ' expected\n", NULL, NULL);
2569 * htmlParsePubidLiteral:
2570 * @ctxt: an HTML parser context
2572 * parse an HTML public literal
2574 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2576 * Returns the PubidLiteral parsed or NULL.
2580 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2582 xmlChar *ret = NULL;
2584 * Name ::= (Letter | '_') (NameChar)*
2589 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2591 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2592 "Unfinished PubidLiteral\n", NULL, NULL);
2594 ret = xmlStrndup(q, CUR_PTR - q);
2597 } else if (CUR == '\'') {
2600 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2603 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2604 "Unfinished PubidLiteral\n", NULL, NULL);
2606 ret = xmlStrndup(q, CUR_PTR - q);
2610 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2611 "PubidLiteral \" or ' expected\n", NULL, NULL);
2619 * @ctxt: an HTML parser context
2621 * parse the content of an HTML SCRIPT or STYLE element
2622 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2623 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2624 * http://www.w3.org/TR/html4/types.html#type-script
2625 * http://www.w3.org/TR/html4/types.html#h-6.15
2626 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2628 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2629 * element and the value of intrinsic event attributes. User agents must
2630 * not evaluate script data as HTML markup but instead must pass it on as
2631 * data to a script engine.
2633 * - The content is passed like CDATA
2634 * - the attributes for style and scripting "onXXX" are also described
2635 * as CDATA but SGML allows entities references in attributes so their
2636 * processing is identical as other attributes
2639 htmlParseScript(htmlParserCtxtPtr ctxt) {
2640 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2646 while (IS_CHAR_CH(cur)) {
2647 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2649 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2650 if (ctxt->sax->cdataBlock!= NULL) {
2652 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2654 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2655 } else if (ctxt->sax->characters != NULL) {
2656 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2660 htmlParseComment(ctxt);
2663 } else if ((cur == '<') && (NXT(1) == '/')) {
2665 * One should break here, the specification is clear:
2666 * Authors should therefore escape "</" within the content.
2667 * Escape mechanisms are specific to each scripting or
2668 * style sheet language.
2670 * In recovery mode, only break if end tag match the
2671 * current tag, effectively ignoring all tags inside the
2672 * script/style block and treating the entire block as
2675 if (ctxt->recovery) {
2676 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2677 xmlStrlen(ctxt->name)) == 0)
2681 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2682 "Element %s embeds close tag\n",
2686 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2687 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2693 COPY_BUF(l,buf,nbchar,cur);
2694 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2695 if (ctxt->sax->cdataBlock!= NULL) {
2697 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2699 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2700 } else if (ctxt->sax->characters != NULL) {
2701 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2710 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2711 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2712 "Invalid char in CDATA 0x%X\n", cur);
2716 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2717 if (ctxt->sax->cdataBlock!= NULL) {
2719 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2721 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2722 } else if (ctxt->sax->characters != NULL) {
2723 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2730 * htmlParseCharData:
2731 * @ctxt: an HTML parser context
2733 * parse a CharData section.
2734 * if we are within a CDATA section ']]>' marks an end of section.
2736 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2740 htmlParseCharData(htmlParserCtxtPtr ctxt) {
2741 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2747 while (((cur != '<') || (ctxt->token == '<')) &&
2748 ((cur != '&') || (ctxt->token == '&')) &&
2750 COPY_BUF(l,buf,nbchar,cur);
2751 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2753 * Ok the segment is to be consumed as chars.
2755 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2756 if (areBlanks(ctxt, buf, nbchar)) {
2757 if (ctxt->sax->ignorableWhitespace != NULL)
2758 ctxt->sax->ignorableWhitespace(ctxt->userData,
2761 htmlCheckParagraph(ctxt);
2762 if (ctxt->sax->characters != NULL)
2763 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2780 * Ok the segment is to be consumed as chars.
2782 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2783 if (areBlanks(ctxt, buf, nbchar)) {
2784 if (ctxt->sax->ignorableWhitespace != NULL)
2785 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2787 htmlCheckParagraph(ctxt);
2788 if (ctxt->sax->characters != NULL)
2789 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2797 ctxt->instate = XML_PARSER_EOF;
2802 * htmlParseExternalID:
2803 * @ctxt: an HTML parser context
2804 * @publicID: a xmlChar** receiving PubidLiteral
2806 * Parse an External ID or a Public ID
2808 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2809 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2811 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2813 * Returns the function returns SystemLiteral and in the second
2814 * case publicID receives PubidLiteral, is strict is off
2815 * it is possible to return NULL and have publicID set.
2819 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2820 xmlChar *URI = NULL;
2822 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2823 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2824 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2826 if (!IS_BLANK_CH(CUR)) {
2827 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2828 "Space required after 'SYSTEM'\n", NULL, NULL);
2831 URI = htmlParseSystemLiteral(ctxt);
2833 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2834 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2836 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2837 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2838 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2840 if (!IS_BLANK_CH(CUR)) {
2841 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2842 "Space required after 'PUBLIC'\n", NULL, NULL);
2845 *publicID = htmlParsePubidLiteral(ctxt);
2846 if (*publicID == NULL) {
2847 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2848 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2852 if ((CUR == '"') || (CUR == '\'')) {
2853 URI = htmlParseSystemLiteral(ctxt);
2861 * @ctxt: an XML parser context
2863 * parse an XML Processing Instruction.
2865 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2868 htmlParsePI(htmlParserCtxtPtr ctxt) {
2869 xmlChar *buf = NULL;
2871 int size = HTML_PARSER_BUFFER_SIZE;
2873 const xmlChar *target;
2874 xmlParserInputState state;
2877 if ((RAW == '<') && (NXT(1) == '?')) {
2878 state = ctxt->instate;
2879 ctxt->instate = XML_PARSER_PI;
2881 * this is a Processing Instruction.
2887 * Parse the target name and check for special support like
2890 target = htmlParseName(ctxt);
2891 if (target != NULL) {
2898 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2899 (ctxt->sax->processingInstruction != NULL))
2900 ctxt->sax->processingInstruction(ctxt->userData,
2902 ctxt->instate = state;
2905 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2907 htmlErrMemory(ctxt, NULL);
2908 ctxt->instate = state;
2912 if (!IS_BLANK(cur)) {
2913 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2914 "ParsePI: PI %s space expected\n", target, NULL);
2918 while (IS_CHAR(cur) && (cur != '>')) {
2919 if (len + 5 >= size) {
2923 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2925 htmlErrMemory(ctxt, NULL);
2927 ctxt->instate = state;
2937 COPY_BUF(l,buf,len,cur);
2948 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2949 "ParsePI: PI %s never end ...\n", target, NULL);
2956 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2957 (ctxt->sax->processingInstruction != NULL))
2958 ctxt->sax->processingInstruction(ctxt->userData,
2963 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2964 "PI is not started correctly", NULL, NULL);
2966 ctxt->instate = state;
2972 * @ctxt: an HTML parser context
2974 * Parse an XML (SGML) comment <!-- .... -->
2976 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2979 htmlParseComment(htmlParserCtxtPtr ctxt) {
2980 xmlChar *buf = NULL;
2982 int size = HTML_PARSER_BUFFER_SIZE;
2986 xmlParserInputState state;
2989 * Check that there is a comment right here.
2991 if ((RAW != '<') || (NXT(1) != '!') ||
2992 (NXT(2) != '-') || (NXT(3) != '-')) return;
2994 state = ctxt->instate;
2995 ctxt->instate = XML_PARSER_COMMENT;
2998 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3000 htmlErrMemory(ctxt, "buffer allocation failed\n");
3001 ctxt->instate = state;
3010 while (IS_CHAR(cur) &&
3012 (r != '-') || (q != '-'))) {
3013 if (len + 5 >= size) {
3017 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3020 htmlErrMemory(ctxt, "growing buffer failed\n");
3021 ctxt->instate = state;
3026 COPY_BUF(ql,buf,len,q);
3040 if (!IS_CHAR(cur)) {
3041 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3042 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3046 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3047 (!ctxt->disableSAX))
3048 ctxt->sax->comment(ctxt->userData, buf);
3051 ctxt->instate = state;
3056 * @ctxt: an HTML parser context
3058 * parse Reference declarations
3060 * [66] CharRef ::= '&#' [0-9]+ ';' |
3061 * '&#x' [0-9a-fA-F]+ ';'
3063 * Returns the value parsed (as an int)
3066 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3069 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3070 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3071 "htmlParseCharRef: context error\n",
3075 if ((CUR == '&') && (NXT(1) == '#') &&
3076 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3078 while (CUR != ';') {
3079 if ((CUR >= '0') && (CUR <= '9'))
3080 val = val * 16 + (CUR - '0');
3081 else if ((CUR >= 'a') && (CUR <= 'f'))
3082 val = val * 16 + (CUR - 'a') + 10;
3083 else if ((CUR >= 'A') && (CUR <= 'F'))
3084 val = val * 16 + (CUR - 'A') + 10;
3086 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3087 "htmlParseCharRef: invalid hexadecimal value\n",
3095 } else if ((CUR == '&') && (NXT(1) == '#')) {
3097 while (CUR != ';') {
3098 if ((CUR >= '0') && (CUR <= '9'))
3099 val = val * 10 + (CUR - '0');
3101 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3102 "htmlParseCharRef: invalid decimal value\n",
3111 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3112 "htmlParseCharRef: invalid value\n", NULL, NULL);
3115 * Check the value IS_CHAR ...
3120 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3121 "htmlParseCharRef: invalid xmlChar value %d\n",
3129 * htmlParseDocTypeDecl:
3130 * @ctxt: an HTML parser context
3132 * parse a DOCTYPE declaration
3134 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3135 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3139 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3140 const xmlChar *name;
3141 xmlChar *ExternalID = NULL;
3142 xmlChar *URI = NULL;
3145 * We know that '<!DOCTYPE' has been detected.
3152 * Parse the DOCTYPE name.
3154 name = htmlParseName(ctxt);
3156 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3157 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3161 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3167 * Check for SystemID and ExternalID
3169 URI = htmlParseExternalID(ctxt, &ExternalID);
3173 * We should be at the end of the DOCTYPE declaration.
3176 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3177 "DOCTYPE improperly terminated\n", NULL, NULL);
3178 /* We shouldn't try to resynchronize ... */
3183 * Create or update the document accordingly to the DOCTYPE
3185 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3186 (!ctxt->disableSAX))
3187 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3190 * Cleanup, since we don't use all those identifiers
3192 if (URI != NULL) xmlFree(URI);
3193 if (ExternalID != NULL) xmlFree(ExternalID);
3197 * htmlParseAttribute:
3198 * @ctxt: an HTML parser context
3199 * @value: a xmlChar ** used to store the value of the attribute
3201 * parse an attribute
3203 * [41] Attribute ::= Name Eq AttValue
3205 * [25] Eq ::= S? '=' S?
3209 * [NS 11] Attribute ::= QName Eq AttValue
3211 * Also the case QName == xmlns:??? is handled independently as a namespace
3214 * Returns the attribute name, and the value in *value.
3217 static const xmlChar *
3218 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3219 const xmlChar *name;
3220 xmlChar *val = NULL;
3223 name = htmlParseHTMLName(ctxt);
3225 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3226 "error parsing attribute name\n", NULL, NULL);
3237 val = htmlParseAttValue(ctxt);
3238 } else if (htmlIsBooleanAttr(name)) {
3240 * assume a minimized attribute
3242 val = xmlStrdup(name);
/* htmlCheckEncoding: looks for a "charset=" (or "charset =") parameter in
 * @attvalue, ignoring case. When one is found, the encoding name is stored on
 * ctxt->input and the parser input is switched to that encoding.
 * NOTE(review): this listing has gaps in its line numbering; the comments
 * added here describe only the statements that are visible. */
3250 * htmlCheckEncoding:
3251 * @ctxt: an HTML parser context
3252 * @attvalue: the attribute value
3254 * Checks an http-equiv attribute from a Meta tag to detect
3256 * If a new encoding is detected the parser is switched to decode
3260 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3261 const xmlChar *encoding;
/* Argument guard; the guarded statement is not visible (presumably an early
 * return - confirm). */
3263 if ((ctxt == NULL) || (attvalue == NULL))
3266 /* do not change encoding */
3267 if (ctxt->input->encoding != NULL)
/* Case-insensitive search for the charset parameter; the spelling with a
 * blank before '=' is tried as well. */
3270 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3271 if (encoding != NULL) {
3274 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3275 if (encoding != NULL)
3278 if (encoding != NULL) {
3279 xmlCharEncoding enc;
3280 xmlCharEncodingHandlerPtr handler;
/* Skip spaces and tabs in front of the encoding name. */
3282 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
/* Store a private copy of the encoding name on the input, releasing any
 * previous one. NOTE(review): ctxt->input->encoding was already tested for
 * non-NULL above; if that test returns, this re-check is redundant - confirm. */
3284 if (ctxt->input->encoding != NULL)
3285 xmlFree((xmlChar *) ctxt->input->encoding);
3286 ctxt->input->encoding = xmlStrdup(encoding);
/* First try to map the name onto the xmlCharEncoding enumeration. */
3288 enc = xmlParseCharEncoding((const char *) encoding);
3290 * registered set of known encodings
3292 if (enc != XML_CHAR_ENCODING_ERROR) {
/* A meta tag announcing UTF-16 or UCS-4 while the input buffer has no encoder
 * attached is reported as XML_ERR_INVALID_ENCODING. */
3293 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3294 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3295 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3296 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3297 (ctxt->input->buf != NULL) &&
3298 (ctxt->input->buf->encoder == NULL)) {
3299 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3300 "htmlCheckEncoding: wrong encoding meta\n",
3303 xmlSwitchEncoding(ctxt, enc);
3305 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3308 * fallback for unknown encodings
/* Look the encoding up by name among the registered handlers; when none is
 * found the error code is set to XML_ERR_UNSUPPORTED_ENCODING. */
3310 handler = xmlFindCharEncodingHandler((const char *) encoding);
3311 if (handler != NULL) {
3312 xmlSwitchToEncoding(ctxt, handler);
3313 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3315 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
/* Only when an encoder plus both the raw and the decoded buffers exist is the
 * remaining input re-converted. */
3319 if ((ctxt->input->buf != NULL) &&
3320 (ctxt->input->buf->encoder != NULL) &&
3321 (ctxt->input->buf->raw != NULL) &&
3322 (ctxt->input->buf->buffer != NULL)) {
3327 * convert as much as possible to the parser reading buffer.
/* processed = number of bytes between base and cur; that many bytes are
 * dropped from the decoded buffer before the encoder is run from raw into it. */
3329 processed = ctxt->input->cur - ctxt->input->base;
3330 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3331 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3332 ctxt->input->buf->buffer,
3333 ctxt->input->buf->raw);
3335 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3336 "htmlCheckEncoding: encoder error\n",
/* Point the read cursor at the start of the refreshed buffer content. */
3340 ctxt->input->cur = ctxt->input->buf->buffer->content;
/* htmlCheckMeta: walks the attribute array of a META tag and, when an
 * http-equiv="Content-Type" attribute and a "content" attribute are both
 * seen, passes the content value to htmlCheckEncoding().
 * NOTE(review): lines are missing from this listing (the assignments to att,
 * value, http and content are not visible). */
3347 * @ctxt: an HTML parser context
3348 * @atts: the attributes values
3350 * Checks an attributes from a Meta tag
3353 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3355 const xmlChar *att, *value;
3357 const xmlChar *content = NULL;
/* Argument guard; the guarded statement is not visible. */
3359 if ((ctxt == NULL) || (atts == NULL))
/* Loop until a NULL attribute name is reached. */
3364 while (att != NULL) {
/* Attribute names and the Content-Type value are compared case-insensitively. */
3366 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3367 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3369 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
/* Both pieces of information are needed before the encoding is examined. */
3373 if ((http) && (content != NULL))
3374 htmlCheckEncoding(ctxt, content);
/* htmlParseStartTag: parses "<name attr=value ...", stopping before the
 * closing '>' or "/>". It runs the auto-close and implied-element checks,
 * collects the attributes in a name/value array, pushes the element name and
 * fires the SAX startElement callback.
 * NOTE(review): this listing omits lines (numbering gaps); comments added
 * here describe visible statements only. */
3379 * htmlParseStartTag:
3380 * @ctxt: an HTML parser context
3382 * parse a start of tag either for rule element or
3383 * EmptyElement. In both case we don't parse the tag closing chars.
3385 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3387 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3391 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3393 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3395 * Returns 0 in case of success and -1 in case of error.
3399 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3400 const xmlChar *name;
3401 const xmlChar *attname;
3403 const xmlChar **atts;
/* A context without an input is reported as an internal error. */
3409 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3410 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3411 "htmlParseStartTag: context error\n", NULL, NULL);
/* Not positioned on a tag opener: fail. */
3414 if (CUR != '<') return -1;
/* Work on a local copy of the attribute array capacity kept in the context. */
3418 maxatts = ctxt->maxatts;
3421 name = htmlParseHTMLName(ctxt);
3423 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3424 "htmlParseStartTag: invalid element name\n",
3426 /* Dump the bogus tag like browsers do */
3427 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
/* META tags are singled out; htmlCheckMeta() is called on the collected
 * attributes further down. */
3431 if (xmlStrEqual(name, BAD_CAST"meta"))
3435 * Check for auto-closure of HTML elements.
3437 htmlAutoClose(ctxt, name);
3440 * Check for implied HTML elements.
3442 htmlCheckImplied(ctxt, name);
3445 * Avoid html at any level > 0, head at any level != 1
3446 * or any attempt to recurse body
3448 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3449 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3450 "htmlParseStartTag: misplaced <html> tag\n",
3454 if ((ctxt->nameNr != 1) &&
3455 (xmlStrEqual(name, BAD_CAST"head"))) {
3456 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3457 "htmlParseStartTag: misplaced <head> tag\n",
/* For <body>, the whole stack of open element names is searched for an
 * already open body. */
3461 if (xmlStrEqual(name, BAD_CAST"body")) {
3463 for (indx = 0;indx < ctxt->nameNr;indx++) {
3464 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3465 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3466 "htmlParseStartTag: misplaced <body> tag\n",
3468 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3476 * Now parse the attributes, it ends up with the ending
/* Attribute loop: stops on an invalid character or on "/>" (further stop
 * conditions may sit on an elided line). */
3481 while ((IS_CHAR_CH(CUR)) &&
3483 ((CUR != '/') || (NXT(1) != '>'))) {
/* Remember the character count so lack of progress can be detected below. */
3484 long cons = ctxt->nbChars;
3487 attname = htmlParseAttribute(ctxt, &attvalue);
3488 if (attname != NULL) {
3491 * Well formedness requires at most one declaration of an attribute
/* Names sit at even indexes of atts, values at odd indexes. */
3493 for (i = 0; i < nbatts;i += 2) {
3494 if (xmlStrEqual(atts[i], attname)) {
3495 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3496 "Attribute %s redefined\n", attname, NULL);
3497 if (attvalue != NULL)
3504 * Add the pair to atts
/* First allocation of the attribute array. */
3507 maxatts = 22; /* allow for 10 attrs by default */
3508 atts = (const xmlChar **)
3509 xmlMalloc(maxatts * sizeof(xmlChar *));
3511 htmlErrMemory(ctxt, NULL);
3512 if (attvalue != NULL)
3517 ctxt->maxatts = maxatts;
/* Grow the array when the new pair plus the two NULL terminators would not fit. */
3518 } else if (nbatts + 4 > maxatts) {
/* The result goes into a temporary (n), not directly into atts. */
3522 n = (const xmlChar **) xmlRealloc((void *) atts,
3523 maxatts * sizeof(const xmlChar *));
3525 htmlErrMemory(ctxt, NULL);
3526 if (attvalue != NULL)
3532 ctxt->maxatts = maxatts;
/* Append the pair and keep the array terminated by two NULL entries. */
3534 atts[nbatts++] = attname;
3535 atts[nbatts++] = attvalue;
3536 atts[nbatts] = NULL;
3537 atts[nbatts + 1] = NULL;
3540 if (attvalue != NULL)
3542 /* Dump the bogus attribute string up to the next blank or
3543 * the end of the tag. */
3544 while ((IS_CHAR_CH(CUR)) &&
3545 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3546 ((CUR != '/') || (NXT(1) != '>')))
/* No character consumed during this iteration: report an internal error. */
3552 if (cons == ctxt->nbChars) {
3553 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3554 "htmlParseStartTag: problem parsing attributes\n",
3561 * Handle specific association to the META tag
3564 htmlCheckMeta(ctxt, atts);
3567 * SAX: Start of Element !
/* The name is pushed on the element stack before the callback is invoked;
 * the callback receives either the attribute array or NULL. */
3569 htmlnamePush(ctxt, name);
3570 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3572 ctxt->sax->startElement(ctxt->userData, name, atts);
3574 ctxt->sax->startElement(ctxt->userData, name, NULL);
/* Only the values (odd indexes) are freed here; the names are not. */
3578 for (i = 1;i < nbatts;i += 2) {
3579 if (atts[i] != NULL)
3580 xmlFree((xmlChar *) atts[i]);
/* htmlParseEndTag: parses "</name>", checks the name against the stack of
 * open elements, runs the auto-close-on-close logic and fires the SAX
 * endElement callback when the name matches the current element.
 * NOTE(review): lines are missing from this listing; the return statements
 * are not visible. */
3589 * @ctxt: an HTML parser context
3591 * parse an end of tag
3593 * [42] ETag ::= '</' Name S? '>'
3597 * [NS 9] ETag ::= '</' QName S? '>'
3599 * Returns 1 if the current level should be closed.
3603 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3605 const xmlChar *name;
3606 const xmlChar *oldname;
/* The input must be positioned on "</". */
3609 if ((CUR != '<') || (NXT(1) != '/')) {
3610 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3611 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3616 name = htmlParseHTMLName(ctxt);
3621 * We should definitely be at the ending "S? '>'" part
3624 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3625 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3626 "End tag : expected '>'\n", NULL, NULL);
3627 if (ctxt->recovery) {
3629 * We're not at the ending > !!
3630 * Error, unless in recover mode where we search forwards
/* Recovery mode: skip ahead to the next '>' or to the end of the input. */
3633 while (CUR != '\0' && CUR != '>') NEXT;
3640 * If the name read is not one of the element in the parsing stack
3641 * then return, it's just an error.
/* Search the stack of open element names from the innermost one outwards. */
3643 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3644 if (xmlStrEqual(name, ctxt->nameTab[i]))
3648 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3649 "Unexpected end tag : %s\n", name, NULL);
3655 * Check for auto-closure of HTML elements.
3658 htmlAutoCloseOnClose(ctxt, name);
3661 * Well formedness constraints, opening and closing must match.
3662 * With the exception that the autoclose may have popped stuff out
3665 if (!xmlStrEqual(name, ctxt->name)) {
3666 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3667 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3668 "Opening and ending tag mismatch: %s and %s\n",
/* The callback fires only if the current element name equals the parsed name. */
3676 oldname = ctxt->name;
3677 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3678 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3679 ctxt->sax->endElement(ctxt->userData, name);
/* htmlParseReference: handles "&..." in content. A character reference or a
 * known entity is converted to UTF-8 bytes and delivered through the SAX
 * characters callback; an unknown entity is delivered as literal text
 * ("&" followed by the name).
 * NOTE(review): this listing omits lines (numbering gaps). */
3691 * htmlParseReference:
3692 * @ctxt: an HTML parser context
3694 * parse and handle entity references in content,
3695 * this will end-up in a call to character() since this is either a
3696 * CharRef, or a predefined entity.
3699 htmlParseReference(htmlParserCtxtPtr ctxt) {
3700 const htmlEntityDesc * ent;
3702 const xmlChar *name;
3703 if (CUR != '&') return;
/* "&#" introduces a numeric character reference. */
3705 if (NXT(1) == '#') {
3709 c = htmlParseCharRef(ctxt);
/* UTF-8 encode c: emit the lead byte for the 1/2/3/4-byte form and set bits
 * to the shift of the first continuation byte (-6 means none). */
3713 if (c < 0x80) { out[i++]= c; bits= -6; }
3714 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3715 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3716 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
/* Continuation bytes: six payload bits each, tagged with 0x80. */
3718 for ( ; bits >= 0; bits-= 6) {
3719 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3723 htmlCheckParagraph(ctxt);
3724 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3725 ctxt->sax->characters(ctxt->userData, out, i);
/* Named entity reference. */
3727 ent = htmlParseEntityRef(ctxt, &name);
/* A bare "&" is delivered as text (the guarding condition is on an elided line). */
3729 htmlCheckParagraph(ctxt);
3730 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3731 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
/* Unknown entity, or one without a positive value: deliver "&" and the name
 * as plain text; the trailing ";" is deliberately not re-emitted (see the
 * commented-out call). */
3734 if ((ent == NULL) || !(ent->value > 0)) {
3735 htmlCheckParagraph(ctxt);
3736 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3737 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3738 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3739 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
/* Known entity: same UTF-8 encoding scheme as for character references above. */
3747 { out[i++]= c; bits= -6; }
3749 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3750 else if (c < 0x10000)
3751 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3753 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3755 for ( ; bits >= 0; bits-= 6) {
3756 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3760 htmlCheckParagraph(ctxt);
3761 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3762 ctxt->sax->characters(ctxt->userData, out, i);
/* htmlParseContent: content loop for the current element. Depending on what
 * the input starts with, it dispatches to the end-tag, script/style, DOCTYPE,
 * comment, sub-element, reference or character-data parsers.
 * NOTE(review): lines are missing from this listing (the loop header and the
 * return/break statements are not visible). */
3769 * @ctxt: an HTML parser context
3771 * Parse a content: comment, sub-element, reference or text.
3775 htmlParseContent(htmlParserCtxtPtr ctxt) {
3776 xmlChar *currentNode;
/* Keep a private copy of the current element name and the stack depth at
 * entry; both are compared against the context later on. */
3779 currentNode = xmlStrdup(ctxt->name);
3780 depth = ctxt->nameNr;
/* Character count at the start of the iteration, used to detect no progress. */
3782 long cons = ctxt->nbChars;
3786 * Our tag or one of it's parent or children is ending.
3788 if ((CUR == '<') && (NXT(1) == '/')) {
3789 if (htmlParseEndTag(ctxt) &&
3790 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3791 if (currentNode != NULL)
3792 xmlFree(currentNode);
3795 continue; /* while */
3799 * Has this node been popped out during parsing of
/* The stack is not deeper than at entry and its top name differs from ours:
 * release the name copy (the statement that follows is not visible). */
3802 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3803 (!xmlStrEqual(currentNode, ctxt->name)))
3805 if (currentNode != NULL) xmlFree(currentNode);
3809 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3810 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
3812 * Handle SCRIPT/STYLE separately
3814 htmlParseScript(ctxt);
3817 * Sometimes DOCTYPE arrives in the middle of the document
/* UPP() is used for the letters of "DOCTYP", presumably an upper-casing
 * lookahead (macro defined elsewhere - confirm). */
3819 if ((CUR == '<') && (NXT(1) == '!') &&
3820 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3821 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3822 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3824 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3825 "Misplaced DOCTYPE declaration\n",
3826 BAD_CAST "DOCTYPE" , NULL);
3827 htmlParseDocTypeDecl(ctxt);
3831 * First case : a comment
3833 if ((CUR == '<') && (NXT(1) == '!') &&
3834 (NXT(2) == '-') && (NXT(3) == '-')) {
3835 htmlParseComment(ctxt);
3839 * Second case : a Processing Instruction.
3841 else if ((CUR == '<') && (NXT(1) == '?')) {
3846 * Third case : a sub-element.
3848 else if (CUR == '<') {
3849 htmlParseElement(ctxt);
3853 * Fourth case : a reference. If if has not been resolved,
3854 * parsing returns it's Name, create the node
3856 else if (CUR == '&') {
3857 htmlParseReference(ctxt);
3861 * Fifth case : end of the resource
3863 else if (CUR == 0) {
3864 htmlAutoCloseOnEnd(ctxt);
3869 * Last case, text. Note that References are handled directly.
3872 htmlParseCharData(ctxt);
/* Nothing was consumed during this iteration: report an internal error when
 * a current node exists. */
3875 if (cons == ctxt->nbChars) {
3876 if (ctxt->node != NULL) {
3877 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3878 "detected an error in element content\n",
3886 if (currentNode != NULL) xmlFree(currentNode);
/* __htmlParseContent: wrapper taking an untyped pointer; it casts the
 * argument to htmlParserCtxtPtr and forwards it to htmlParseContent(). */
3891 * @ctxt: an HTML parser context
3893 * Parse a content: comment, sub-element, reference or text.
3897 __htmlParseContent(void *ctxt) {
3899 htmlParseContent((htmlParserCtxtPtr) ctxt);
/* htmlParseElement: parses one element: start tag, then either an immediate
 * end (XML-style "/>" or an element flagged empty in its description) or the
 * element content. When ctxt->record_info is set, begin/end positions are
 * recorded through xmlParserAddNodeInfo().
 * NOTE(review): lines are missing from this listing (numbering gaps). */
3904 * @ctxt: an HTML parser context
3906 * parse an HTML element, this is highly recursive
3908 * [39] element ::= EmptyElemTag | STag content ETag
3910 * [41] Attribute ::= Name Eq AttValue
3914 htmlParseElement(htmlParserCtxtPtr ctxt) {
3915 const xmlChar *name;
3916 xmlChar *currentNode = NULL;
3917 const htmlElemDesc * info;
3918 htmlParserNodeInfo node_info;
3921 const xmlChar *oldptr;
3923 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3924 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3925 "htmlParseElement: context error\n", NULL, NULL);
3928 /* Capture start position */
/* Position = bytes already consumed from the input plus the offset of the
 * cursor inside the current buffer. */
3929 if (ctxt->record_info) {
3930 node_info.begin_pos = ctxt->input->consumed +
3931 (CUR_PTR - ctxt->input->base);
3932 node_info.begin_line = ctxt->input->line;
3935 failed = htmlParseStartTag(ctxt);
/* The assignment of name falls in a numbering gap. */
3937 if (failed || (name == NULL)) {
3944 * Lookup the info for that element.
3946 info = htmlTagLookup(name);
3948 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3949 "Tag %s invalid\n", name, NULL);
3953 * Check for an Empty Element labeled the XML/SGML way
3955 if ((CUR == '/') && (NXT(1) == '>')) {
3957 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3958 ctxt->sax->endElement(ctxt->userData, name);
3966 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3967 "Couldn't find end of Start Tag %s\n", name, NULL);
3970 * end of parsing of this node.
3972 if (xmlStrEqual(name, ctxt->name)) {
3978 * Capture end position and add node
3980 if (ctxt->record_info) {
3981 node_info.end_pos = ctxt->input->consumed +
3982 (CUR_PTR - ctxt->input->base);
3983 node_info.end_line = ctxt->input->line;
3984 node_info.node = ctxt->node;
3985 xmlParserAddNodeInfo(ctxt, &node_info);
3991 * Check for an Empty Element from DTD definition
3993 if ((info != NULL) && (info->empty)) {
3994 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3995 ctxt->sax->endElement(ctxt->userData, name);
4001 * Parse the content of the element:
/* Content loop: stops when htmlParseContent() makes no progress (cursor
 * unchanged) or when the element stack becomes shallower than at entry. */
4003 currentNode = xmlStrdup(ctxt->name);
4004 depth = ctxt->nameNr;
4005 while (IS_CHAR_CH(CUR)) {
4006 oldptr = ctxt->input->cur;
4007 htmlParseContent(ctxt);
4008 if (oldptr==ctxt->input->cur) break;
4009 if (ctxt->nameNr < depth) break;
4013 * Capture end position and add node
4015 if ( currentNode != NULL && ctxt->record_info ) {
4016 node_info.end_pos = ctxt->input->consumed +
4017 (CUR_PTR - ctxt->input->base);
4018 node_info.end_line = ctxt->input->line;
4019 node_info.node = ctxt->node;
4020 xmlParserAddNodeInfo(ctxt, &node_info);
/* An invalid character at the cursor triggers the end-of-input auto-close. */
4022 if (!IS_CHAR_CH(CUR)) {
4023 htmlAutoCloseOnEnd(ctxt);
4026 if (currentNode != NULL)
4027 xmlFree(currentNode);
/* htmlParseDocument: top-level driver. It fires the SAX document callbacks,
 * parses leading comments/PIs and an optional DOCTYPE, parses the content,
 * auto-closes open elements and, when a document was built without an
 * internal subset, attaches the HTML 4.0 Transitional one.
 * NOTE(review): this listing omits lines (numbering gaps). */
4031 * htmlParseDocument:
4032 * @ctxt: an HTML parser context
4034 * parse an HTML document (and build a tree if using the standard SAX
4037 * Returns 0, -1 in case of error. the parser context is augmented
4038 * as a result of the parsing.
4042 htmlParseDocument(htmlParserCtxtPtr ctxt) {
/* The default SAX handler is initialised before the context is validated. */
4047 htmlDefaultSAXHandlerInit();
/* NOTE(review): this path returns XML_ERR_INTERNAL_ERROR, while the header
 * above documents 0 / -1 as the return values. */
4049 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4050 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4051 "htmlParseDocument: context error\n", NULL, NULL);
4052 return(XML_ERR_INTERNAL_ERROR);
4057 * SAX: beginning of the document processing.
4059 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4060 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4063 * Wipe out everything which is before the first '<'
4067 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4068 "Document is empty\n", NULL, NULL);
/* startDocument is skipped when SAX callbacks are disabled. */
4071 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4072 ctxt->sax->startDocument(ctxt->userData);
4076 * Parse possible comments and PIs before any content
4078 while (((CUR == '<') && (NXT(1) == '!') &&
4079 (NXT(2) == '-') && (NXT(3) == '-')) ||
4080 ((CUR == '<') && (NXT(1) == '?'))) {
4081 htmlParseComment(ctxt);
4088 * Then possibly doc type declaration(s) and more Misc
4089 * (doctypedecl Misc*)?
4091 if ((CUR == '<') && (NXT(1) == '!') &&
4092 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4093 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4094 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4096 htmlParseDocTypeDecl(ctxt);
4101 * Parse possible comments and PIs before any content
4103 while (((CUR == '<') && (NXT(1) == '!') &&
4104 (NXT(2) == '-') && (NXT(3) == '-')) ||
4105 ((CUR == '<') && (NXT(1) == '?'))) {
4106 htmlParseComment(ctxt);
4112 * Time to start parsing the tree itself
4114 htmlParseContent(ctxt);
4120 htmlAutoCloseOnEnd(ctxt);
4124 * SAX: end of the document processing.
4126 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4127 ctxt->sax->endDocument(ctxt->userData);
/* Supply a default internal subset (the test on dtd is on an elided line). */
4129 if (ctxt->myDoc != NULL) {
4130 dtd = xmlGetIntSubset(ctxt->myDoc);
4132 ctxt->myDoc->intSubset =
4133 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4134 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4135 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
/* A document flagged as not well-formed yields -1. */
4137 if (! ctxt->wellFormed) return(-1);
4142 /************************************************************************
4144 * Parser contexts handling *
4146 ************************************************************************/
/* htmlInitParserCtxt: zeroes the context, creates its dictionary, allocates a
 * SAX handler and the input/node/name stacks, and sets the default parser
 * flags and validation callbacks.
 * NOTE(review): lines are missing from this listing; the clean-up done on
 * each allocation failure is not visible. */
4149 * htmlInitParserCtxt:
4150 * @ctxt: an HTML parser context
4152 * Initialize a parser context
4154 * Returns 0 in case of success and -1 in case of error
4158 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4160 htmlSAXHandler *sax;
4162 if (ctxt == NULL) return(-1);
/* Start from an all-zero context. */
4163 memset(ctxt, 0, sizeof(htmlParserCtxt));
4165 ctxt->dict = xmlDictCreate();
/* Allocation failures here are reported with a NULL context argument. */
4166 if (ctxt->dict == NULL) {
4167 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4170 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4172 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4176 memset(sax, 0, sizeof(htmlSAXHandler));
4178 /* Allocate the Input stack */
/* Room for 5 input pointers. */
4179 ctxt->inputTab = (htmlParserInputPtr *)
4180 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4181 if (ctxt->inputTab == NULL) {
4182 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4191 ctxt->version = NULL;
4192 ctxt->encoding = NULL;
4193 ctxt->standalone = -1;
4194 ctxt->instate = XML_PARSER_START;
4196 /* Allocate the Node stack */
/* Room for 10 node pointers. */
4197 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4198 if (ctxt->nodeTab == NULL) {
4199 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4212 /* Allocate the Name stack */
/* Room for 10 name pointers. */
4213 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4214 if (ctxt->nameTab == NULL) {
4215 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
/* Without a private handler the shared default handler is used; otherwise the
 * private one receives the first sizeof(xmlSAXHandlerV1) bytes of the default
 * handler. */
4231 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4234 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
/* The context itself is the SAX user data. */
4236 ctxt->userData = ctxt;
4238 ctxt->wellFormed = 1;
4239 ctxt->replaceEntities = 0;
4240 ctxt->linenumbers = xmlLineNumbersDefaultValue;
/* Validation context: errors and warnings go to the parser validity
 * callbacks, with the parser context as user data. */
4242 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4243 ctxt->vctxt.userData = ctxt;
4244 ctxt->vctxt.error = xmlParserValidityError;
4245 ctxt->vctxt.warning = xmlParserValidityWarning;
4246 ctxt->record_info = 0;
4249 ctxt->checkIndex = 0;
4250 ctxt->catalogs = NULL;
4251 xmlInitNodeInfoSeq(&ctxt->node_seq);
/* htmlFreeParserCtxt: releases an HTML parser context by delegating to
 * xmlFreeParserCtxt(). */
4256 * htmlFreeParserCtxt:
4257 * @ctxt: an HTML parser context
4259 * Free all the memory used by a parser context. However the parsed
4260 * document in ctxt->myDoc is not freed.
4264 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4266 xmlFreeParserCtxt(ctxt);
/* htmlNewParserCtxt: allocates a parser context, zeroes it and initialises it
 * with htmlInitParserCtxt(); the context is freed again if initialisation
 * fails.
 * NOTE(review): lines are missing from this listing; the return statements
 * are not visible. */
4270 * htmlNewParserCtxt:
4272 * Allocate and initialize a new parser context.
4274 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4278 htmlNewParserCtxt(void)
4280 xmlParserCtxtPtr ctxt;
4282 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4284 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
/* Zero the fresh block before handing it to the initialiser. */
4287 memset(ctxt, 0, sizeof(xmlParserCtxt));
/* A negative result means initialisation failed: release the context. */
4288 if (htmlInitParserCtxt(ctxt) < 0) {
4289 htmlFreeParserCtxt(ctxt);
/* htmlCreateMemoryParserCtxt: builds a parser context whose single input
 * stream reads from an in-memory buffer.
 * NOTE(review): lines are missing from this listing (argument checks, the
 * assignment of input->buf and the final return are not visible). */
4296 * htmlCreateMemoryParserCtxt:
4297 * @buffer: a pointer to a char array
4298 * @size: the size of the array
4300 * Create a parser context for an HTML in-memory document.
4302 * Returns the new parser context or NULL
4305 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4306 xmlParserCtxtPtr ctxt;
4307 xmlParserInputPtr input;
4308 xmlParserInputBufferPtr buf;
4315 ctxt = htmlNewParserCtxt();
/* Wrap the caller's memory in a parser input buffer, with no encoding set. */
4319 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
/* NOTE(review): this return does not release ctxt created above - looks like
 * a leak on this error path; confirm against the full source. */
4320 if (buf == NULL) return(NULL);
4322 input = xmlNewInputStream(ctxt);
/* NOTE(review): only ctxt is freed in the visible lines; whether buf is
 * released on this path cannot be told from here. */
4323 if (input == NULL) {
4324 xmlFreeParserCtxt(ctxt);
4328 input->filename = NULL;
/* base and cur point at the start of the buffer content, end just past the
 * bytes currently in use. */
4330 input->base = input->buf->buffer->content;
4331 input->cur = input->buf->buffer->content;
4332 input->end = &input->buf->buffer->content[input->buf->buffer->use];
/* Make the stream the current input of the context. */
4334 inputPush(ctxt, input);
/* htmlCreateDocParserCtxt: creates a memory parser context over the xmlChar
 * string @cur and, when @encoding is given, records it on the input and
 * switches the input to that encoding.
 * NOTE(review): this listing omits lines (numbering gaps). */
4339 * htmlCreateDocParserCtxt:
4340 * @cur: a pointer to an array of xmlChar
4341 * @encoding: a free form C string describing the HTML document encoding, or NULL
4343 * Create a parser context for an HTML document.
4345 * TODO: check the need to add encoding handling there
4347 * Returns the new parser context or NULL
4349 static htmlParserCtxtPtr
/* NOTE(review): encoding is marked ATTRIBUTE_UNUSED although it is used below. */
4350 htmlCreateDocParserCtxt(const xmlChar *cur,
4351 const char *encoding ATTRIBUTE_UNUSED) {
4353 htmlParserCtxtPtr ctxt;
4357 len = xmlStrlen(cur);
4358 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
/* NOTE(review): ctxt is dereferenced in this branch and no NULL check on it
 * is visible after the call above - confirm that a failed creation cannot
 * reach this point. */
4360 if (encoding != NULL) {
4361 xmlCharEncoding enc;
4362 xmlCharEncodingHandlerPtr handler;
/* Store a private copy of the encoding name, releasing any previous one. */
4364 if (ctxt->input->encoding != NULL)
4365 xmlFree((xmlChar *) ctxt->input->encoding);
4366 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4368 enc = xmlParseCharEncoding(encoding);
4370 * registered set of known encodings
4372 if (enc != XML_CHAR_ENCODING_ERROR) {
4373 xmlSwitchEncoding(ctxt, enc);
/* The switch reports failure through ctxt->errNo. */
4374 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4375 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4376 "Unsupported encoding %s\n",
4377 (const xmlChar *) encoding, NULL);
4381 * fallback for unknown encodings
/* Look the encoding up by name among the registered handlers. */
4383 handler = xmlFindCharEncodingHandler((const char *) encoding);
4384 if (handler != NULL) {
4385 xmlSwitchToEncoding(ctxt, handler);
4387 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4388 "Unsupported encoding %s\n",
4389 (const xmlChar *) encoding, NULL);
4396 #ifdef LIBXML_PUSH_ENABLED
4397 /************************************************************************
4399 * Progressive parsing interfaces *
4401 ************************************************************************/
/* htmlParseLookupSequence: scans the buffered input for a one-, two- or
 * three-byte sequence, resuming from ctxt->checkIndex so that bytes already
 * examined are not rescanned.
 * NOTE(review): the header below documents a parameter named @comment while
 * the signature names it iscomment. Lines are also missing from this listing
 * (numbering gaps). */
4404 * htmlParseLookupSequence:
4405 * @ctxt: an HTML parser context
4406 * @first: the first char to lookup
4407 * @next: the next char to lookup or zero
4408 * @third: the next char to lookup or zero
4409 * @comment: flag to force checking inside comments
4411 * Try to find if a sequence (first, next, third) or just (first next) or
4412 * (first) is available in the input stream.
4413 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4414 * to avoid rescanning sequences of bytes, it DOES change the state of the
4415 * parser, do not use liberally.
4416 * This is basically similar to xmlParseLookupSequence()
4418 * Returns the index to the current parsing point if the full sequence
4419 * is available, -1 otherwise.
4422 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4423 xmlChar next, xmlChar third, int iscomment) {
4425 htmlParserInputPtr in;
4430 if (in == NULL) return(-1);
/* Start at the cursor offset, or further on if an earlier call already
 * scanned past it. */
4431 base = in->cur - in->base;
4432 if (base < 0) return(-1);
4433 if (ctxt->checkIndex > base)
4434 base = ctxt->checkIndex;
4435 if (in->buf == NULL) {
4439 buf = in->buf->buffer->content;
4440 len = in->buf->buffer->use;
4442 /* take into account the sequence length */
4443 if (third) len -= 2;
4444 else if (next) len --;
4445 for (;base < len;base++) {
/* Unless iscomment is set, look for a "<!--" opener while not already inside
 * a comment. */
4446 if (!incomment && (base + 4 < len) && !iscomment) {
4447 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4448 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4450 /* do not increment past <! - some people use <!--> */
/* Test for the "-->" comment terminator. */
4457 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4458 (buf[base + 2] == '>')) {
/* Candidate match: check the second and third bytes as required. */
4464 if (buf[base] == first) {
4466 if ((buf[base + 1] != next) ||
4467 (buf[base + 2] != third)) continue;
4468 } else if (next != 0) {
4469 if (buf[base + 1] != next) continue;
/* Found: reset the resume index and return the distance from the cursor. */
4471 ctxt->checkIndex = 0;
4474 xmlGenericError(xmlGenericErrorContext,
4475 "HPP: lookup '%c' found at %d\n",
4477 else if (third == 0)
4478 xmlGenericError(xmlGenericErrorContext,
4479 "HPP: lookup '%c%c' found at %d\n",
4482 xmlGenericError(xmlGenericErrorContext,
4483 "HPP: lookup '%c%c%c' found at %d\n",
4484 first, next, third, base);
4486 return(base - (in->cur - in->base));
/* Not found: remember how far the scan went for the next call. */
4489 ctxt->checkIndex = base;
4492 xmlGenericError(xmlGenericErrorContext,
4493 "HPP: lookup '%c' failed\n", first);
4494 else if (third == 0)
4495 xmlGenericError(xmlGenericErrorContext,
4496 "HPP: lookup '%c%c' failed\n", first, next);
4498 xmlGenericError(xmlGenericErrorContext,
4499 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4505 * htmlParseTryOrFinish:
4506 * @ctxt: an HTML parser context
4507 * @terminate: last chunk indicator
4509 * Try to progress on parsing
4511 * Returns zero if no parsing was possible
4514 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4516 htmlParserInputPtr in;
4521 switch (ctxt->instate) {
4522 case XML_PARSER_EOF:
4523 xmlGenericError(xmlGenericErrorContext,
4524 "HPP: try EOF\n"); break;
4525 case XML_PARSER_START:
4526 xmlGenericError(xmlGenericErrorContext,
4527 "HPP: try START\n"); break;
4528 case XML_PARSER_MISC:
4529 xmlGenericError(xmlGenericErrorContext,
4530 "HPP: try MISC\n");break;
4531 case XML_PARSER_COMMENT:
4532 xmlGenericError(xmlGenericErrorContext,
4533 "HPP: try COMMENT\n");break;
4534 case XML_PARSER_PROLOG:
4535 xmlGenericError(xmlGenericErrorContext,
4536 "HPP: try PROLOG\n");break;
4537 case XML_PARSER_START_TAG:
4538 xmlGenericError(xmlGenericErrorContext,
4539 "HPP: try START_TAG\n");break;
4540 case XML_PARSER_CONTENT:
4541 xmlGenericError(xmlGenericErrorContext,
4542 "HPP: try CONTENT\n");break;
4543 case XML_PARSER_CDATA_SECTION:
4544 xmlGenericError(xmlGenericErrorContext,
4545 "HPP: try CDATA_SECTION\n");break;
4546 case XML_PARSER_END_TAG:
4547 xmlGenericError(xmlGenericErrorContext,
4548 "HPP: try END_TAG\n");break;
4549 case XML_PARSER_ENTITY_DECL:
4550 xmlGenericError(xmlGenericErrorContext,
4551 "HPP: try ENTITY_DECL\n");break;
4552 case XML_PARSER_ENTITY_VALUE:
4553 xmlGenericError(xmlGenericErrorContext,
4554 "HPP: try ENTITY_VALUE\n");break;
4555 case XML_PARSER_ATTRIBUTE_VALUE:
4556 xmlGenericError(xmlGenericErrorContext,
4557 "HPP: try ATTRIBUTE_VALUE\n");break;
4558 case XML_PARSER_DTD:
4559 xmlGenericError(xmlGenericErrorContext,
4560 "HPP: try DTD\n");break;
4561 case XML_PARSER_EPILOG:
4562 xmlGenericError(xmlGenericErrorContext,
4563 "HPP: try EPILOG\n");break;
4565 xmlGenericError(xmlGenericErrorContext,
4566 "HPP: try PI\n");break;
4567 case XML_PARSER_SYSTEM_LITERAL:
4568 xmlGenericError(xmlGenericErrorContext,
4569 "HPP: try SYSTEM_LITERAL\n");break;
4576 if (in == NULL) break;
4577 if (in->buf == NULL)
4578 avail = in->length - (in->cur - in->base);
4580 avail = in->buf->buffer->use - (in->cur - in->base);
4581 if ((avail == 0) && (terminate)) {
4582 htmlAutoCloseOnEnd(ctxt);
4583 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4585 * SAX: end of the document processing.
4587 ctxt->instate = XML_PARSER_EOF;
4588 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4589 ctxt->sax->endDocument(ctxt->userData);
4600 switch (ctxt->instate) {
4601 case XML_PARSER_EOF:
4603 * Document parsing is done !
4606 case XML_PARSER_START:
4608 * Very first chars read from the document flow.
4611 if (IS_BLANK_CH(cur)) {
4613 if (in->buf == NULL)
4614 avail = in->length - (in->cur - in->base);
4616 avail = in->buf->buffer->use - (in->cur - in->base);
4618 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4619 ctxt->sax->setDocumentLocator(ctxt->userData,
4620 &xmlDefaultSAXLocator);
4621 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4622 (!ctxt->disableSAX))
4623 ctxt->sax->startDocument(ctxt->userData);
4627 if ((cur == '<') && (next == '!') &&
4628 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4629 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4630 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4633 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4636 xmlGenericError(xmlGenericErrorContext,
4637 "HPP: Parsing internal subset\n");
4639 htmlParseDocTypeDecl(ctxt);
4640 ctxt->instate = XML_PARSER_PROLOG;
4642 xmlGenericError(xmlGenericErrorContext,
4643 "HPP: entering PROLOG\n");
4646 ctxt->instate = XML_PARSER_MISC;
4648 xmlGenericError(xmlGenericErrorContext,
4649 "HPP: entering MISC\n");
4653 case XML_PARSER_MISC:
4655 if (in->buf == NULL)
4656 avail = in->length - (in->cur - in->base);
4658 avail = in->buf->buffer->use - (in->cur - in->base);
4663 if ((cur == '<') && (next == '!') &&
4664 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4666 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4669 xmlGenericError(xmlGenericErrorContext,
4670 "HPP: Parsing Comment\n");
4672 htmlParseComment(ctxt);
4673 ctxt->instate = XML_PARSER_MISC;
4674 } else if ((cur == '<') && (next == '?')) {
4676 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4679 xmlGenericError(xmlGenericErrorContext,
4680 "HPP: Parsing PI\n");
4683 ctxt->instate = XML_PARSER_MISC;
4684 } else if ((cur == '<') && (next == '!') &&
4685 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4686 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4687 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4690 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4693 xmlGenericError(xmlGenericErrorContext,
4694 "HPP: Parsing internal subset\n");
4696 htmlParseDocTypeDecl(ctxt);
4697 ctxt->instate = XML_PARSER_PROLOG;
4699 xmlGenericError(xmlGenericErrorContext,
4700 "HPP: entering PROLOG\n");
4702 } else if ((cur == '<') && (next == '!') &&
4706 ctxt->instate = XML_PARSER_START_TAG;
4708 xmlGenericError(xmlGenericErrorContext,
4709 "HPP: entering START_TAG\n");
4713 case XML_PARSER_PROLOG:
4715 if (in->buf == NULL)
4716 avail = in->length - (in->cur - in->base);
4718 avail = in->buf->buffer->use - (in->cur - in->base);
4723 if ((cur == '<') && (next == '!') &&
4724 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4726 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4729 xmlGenericError(xmlGenericErrorContext,
4730 "HPP: Parsing Comment\n");
4732 htmlParseComment(ctxt);
4733 ctxt->instate = XML_PARSER_PROLOG;
4734 } else if ((cur == '<') && (next == '?')) {
4736 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4739 xmlGenericError(xmlGenericErrorContext,
4740 "HPP: Parsing PI\n");
4743 ctxt->instate = XML_PARSER_PROLOG;
4744 } else if ((cur == '<') && (next == '!') &&
4748 ctxt->instate = XML_PARSER_START_TAG;
4750 xmlGenericError(xmlGenericErrorContext,
4751 "HPP: entering START_TAG\n");
4755 case XML_PARSER_EPILOG:
4756 if (in->buf == NULL)
4757 avail = in->length - (in->cur - in->base);
4759 avail = in->buf->buffer->use - (in->cur - in->base);
4763 if (IS_BLANK_CH(cur)) {
4764 htmlParseCharData(ctxt);
4770 if ((cur == '<') && (next == '!') &&
4771 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4773 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4776 xmlGenericError(xmlGenericErrorContext,
4777 "HPP: Parsing Comment\n");
4779 htmlParseComment(ctxt);
4780 ctxt->instate = XML_PARSER_EPILOG;
4781 } else if ((cur == '<') && (next == '?')) {
4783 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4786 xmlGenericError(xmlGenericErrorContext,
4787 "HPP: Parsing PI\n");
4790 ctxt->instate = XML_PARSER_EPILOG;
4791 } else if ((cur == '<') && (next == '!') &&
4795 ctxt->errNo = XML_ERR_DOCUMENT_END;
4796 ctxt->wellFormed = 0;
4797 ctxt->instate = XML_PARSER_EOF;
4799 xmlGenericError(xmlGenericErrorContext,
4800 "HPP: entering EOF\n");
4802 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4803 ctxt->sax->endDocument(ctxt->userData);
4807 case XML_PARSER_START_TAG: {
4808 const xmlChar *name;
4810 const htmlElemDesc * info;
4816 ctxt->instate = XML_PARSER_CONTENT;
4818 xmlGenericError(xmlGenericErrorContext,
4819 "HPP: entering CONTENT\n");
4823 if (in->cur[1] == '/') {
4824 ctxt->instate = XML_PARSER_END_TAG;
4825 ctxt->checkIndex = 0;
4827 xmlGenericError(xmlGenericErrorContext,
4828 "HPP: entering END_TAG\n");
4833 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4836 failed = htmlParseStartTag(ctxt);
4846 * Lookup the info for that element.
4848 info = htmlTagLookup(name);
4850 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4851 "Tag %s invalid\n", name, NULL);
4855 * Check for an Empty Element labeled the XML/SGML way
4857 if ((CUR == '/') && (NXT(1) == '>')) {
4859 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4860 ctxt->sax->endElement(ctxt->userData, name);
4862 ctxt->instate = XML_PARSER_CONTENT;
4864 xmlGenericError(xmlGenericErrorContext,
4865 "HPP: entering CONTENT\n");
4873 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4874 "Couldn't find end of Start Tag %s\n",
4878 * end of parsing of this node.
4880 if (xmlStrEqual(name, ctxt->name)) {
4885 ctxt->instate = XML_PARSER_CONTENT;
4887 xmlGenericError(xmlGenericErrorContext,
4888 "HPP: entering CONTENT\n");
4894 * Check for an Empty Element from DTD definition
4896 if ((info != NULL) && (info->empty)) {
4897 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4898 ctxt->sax->endElement(ctxt->userData, name);
4901 ctxt->instate = XML_PARSER_CONTENT;
4903 xmlGenericError(xmlGenericErrorContext,
4904 "HPP: entering CONTENT\n");
4908 case XML_PARSER_CONTENT: {
4911 * Handle preparsed entities and charRef
4913 if (ctxt->token != 0) {
4914 xmlChar chr[2] = { 0 , 0 } ;
4916 chr[0] = (xmlChar) ctxt->token;
4917 htmlCheckParagraph(ctxt);
4918 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4919 ctxt->sax->characters(ctxt->userData, chr, 1);
4921 ctxt->checkIndex = 0;
4923 if ((avail == 1) && (terminate)) {
4925 if ((cur != '<') && (cur != '&')) {
4926 if (ctxt->sax != NULL) {
4927 if (IS_BLANK_CH(cur)) {
4928 if (ctxt->sax->ignorableWhitespace != NULL)
4929 ctxt->sax->ignorableWhitespace(
4930 ctxt->userData, &cur, 1);
4932 htmlCheckParagraph(ctxt);
4933 if (ctxt->sax->characters != NULL)
4934 ctxt->sax->characters(
4935 ctxt->userData, &cur, 1);
4939 ctxt->checkIndex = 0;
4948 cons = ctxt->nbChars;
4949 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4950 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4952 * Handle SCRIPT/STYLE separately
4958 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
4961 val = in->cur[idx + 2];
4962 if (val == 0) /* bad cut of input */
4965 htmlParseScript(ctxt);
4966 if ((cur == '<') && (next == '/')) {
4967 ctxt->instate = XML_PARSER_END_TAG;
4968 ctxt->checkIndex = 0;
4970 xmlGenericError(xmlGenericErrorContext,
4971 "HPP: entering END_TAG\n");
4977 * Sometimes DOCTYPE arrives in the middle of the document
4979 if ((cur == '<') && (next == '!') &&
4980 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4981 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4982 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4985 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4987 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4988 "Misplaced DOCTYPE declaration\n",
4989 BAD_CAST "DOCTYPE" , NULL);
4990 htmlParseDocTypeDecl(ctxt);
4991 } else if ((cur == '<') && (next == '!') &&
4992 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4994 (htmlParseLookupSequence(
4995 ctxt, '-', '-', '>', 1) < 0))
4998 xmlGenericError(xmlGenericErrorContext,
4999 "HPP: Parsing Comment\n");
5001 htmlParseComment(ctxt);
5002 ctxt->instate = XML_PARSER_CONTENT;
5003 } else if ((cur == '<') && (next == '?')) {
5005 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5008 xmlGenericError(xmlGenericErrorContext,
5009 "HPP: Parsing PI\n");
5012 ctxt->instate = XML_PARSER_CONTENT;
5013 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5015 } else if ((cur == '<') && (next == '/')) {
5016 ctxt->instate = XML_PARSER_END_TAG;
5017 ctxt->checkIndex = 0;
5019 xmlGenericError(xmlGenericErrorContext,
5020 "HPP: entering END_TAG\n");
5023 } else if (cur == '<') {
5024 ctxt->instate = XML_PARSER_START_TAG;
5025 ctxt->checkIndex = 0;
5027 xmlGenericError(xmlGenericErrorContext,
5028 "HPP: entering START_TAG\n");
5031 } else if (cur == '&') {
5033 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
5036 xmlGenericError(xmlGenericErrorContext,
5037 "HPP: Parsing Reference\n");
5039 /* TODO: check generation of subtrees if noent !!! */
5040 htmlParseReference(ctxt);
5043 * check that the text sequence is complete
5044 * before handing out the data to the parser
5045 * to avoid problems with erroneous end of
5049 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5051 ctxt->checkIndex = 0;
5053 xmlGenericError(xmlGenericErrorContext,
5054 "HPP: Parsing char data\n");
5056 htmlParseCharData(ctxt);
5059 if (cons == ctxt->nbChars) {
5060 if (ctxt->node != NULL) {
5061 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5062 "detected an error in element content\n",
5071 case XML_PARSER_END_TAG:
5075 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5077 htmlParseEndTag(ctxt);
5078 if (ctxt->nameNr == 0) {
5079 ctxt->instate = XML_PARSER_EPILOG;
5081 ctxt->instate = XML_PARSER_CONTENT;
5083 ctxt->checkIndex = 0;
5085 xmlGenericError(xmlGenericErrorContext,
5086 "HPP: entering CONTENT\n");
5089 case XML_PARSER_CDATA_SECTION:
5090 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5091 "HPP: internal error, state == CDATA\n",
5093 ctxt->instate = XML_PARSER_CONTENT;
5094 ctxt->checkIndex = 0;
5096 xmlGenericError(xmlGenericErrorContext,
5097 "HPP: entering CONTENT\n");
5100 case XML_PARSER_DTD:
5101 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5102 "HPP: internal error, state == DTD\n",
5104 ctxt->instate = XML_PARSER_CONTENT;
5105 ctxt->checkIndex = 0;
5107 xmlGenericError(xmlGenericErrorContext,
5108 "HPP: entering CONTENT\n");
5111 case XML_PARSER_COMMENT:
5112 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5113 "HPP: internal error, state == COMMENT\n",
5115 ctxt->instate = XML_PARSER_CONTENT;
5116 ctxt->checkIndex = 0;
5118 xmlGenericError(xmlGenericErrorContext,
5119 "HPP: entering CONTENT\n");
5123 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5124 "HPP: internal error, state == PI\n",
5126 ctxt->instate = XML_PARSER_CONTENT;
5127 ctxt->checkIndex = 0;
5129 xmlGenericError(xmlGenericErrorContext,
5130 "HPP: entering CONTENT\n");
5133 case XML_PARSER_ENTITY_DECL:
5134 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5135 "HPP: internal error, state == ENTITY_DECL\n",
5137 ctxt->instate = XML_PARSER_CONTENT;
5138 ctxt->checkIndex = 0;
5140 xmlGenericError(xmlGenericErrorContext,
5141 "HPP: entering CONTENT\n");
5144 case XML_PARSER_ENTITY_VALUE:
5145 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5146 "HPP: internal error, state == ENTITY_VALUE\n",
5148 ctxt->instate = XML_PARSER_CONTENT;
5149 ctxt->checkIndex = 0;
5151 xmlGenericError(xmlGenericErrorContext,
5152 "HPP: entering DTD\n");
5155 case XML_PARSER_ATTRIBUTE_VALUE:
5156 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5157 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5159 ctxt->instate = XML_PARSER_START_TAG;
5160 ctxt->checkIndex = 0;
5162 xmlGenericError(xmlGenericErrorContext,
5163 "HPP: entering START_TAG\n");
5166 case XML_PARSER_SYSTEM_LITERAL:
5167 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5168 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5170 ctxt->instate = XML_PARSER_CONTENT;
5171 ctxt->checkIndex = 0;
5173 xmlGenericError(xmlGenericErrorContext,
5174 "HPP: entering CONTENT\n");
5177 case XML_PARSER_IGNORE:
5178 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5179 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5181 ctxt->instate = XML_PARSER_CONTENT;
5182 ctxt->checkIndex = 0;
5184 xmlGenericError(xmlGenericErrorContext,
5185 "HPP: entering CONTENT\n");
5188 case XML_PARSER_PUBLIC_LITERAL:
5189 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5190 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5192 ctxt->instate = XML_PARSER_CONTENT;
5193 ctxt->checkIndex = 0;
5195 xmlGenericError(xmlGenericErrorContext,
5196 "HPP: entering CONTENT\n");
5203 if ((avail == 0) && (terminate)) {
5204 htmlAutoCloseOnEnd(ctxt);
5205 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5207 * SAX: end of the document processing.
5209 ctxt->instate = XML_PARSER_EOF;
5210 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5211 ctxt->sax->endDocument(ctxt->userData);
5214 if ((ctxt->myDoc != NULL) &&
5215 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5216 (ctxt->instate == XML_PARSER_EPILOG))) {
5218 dtd = xmlGetIntSubset(ctxt->myDoc);
5220 ctxt->myDoc->intSubset =
5221 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5222 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5223 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5226 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5233 * @ctxt: an HTML parser context
5234 * @chunk: an char array
5235 * @size: the size in byte of the chunk
5236 * @terminate: last chunk indicator
5238 * Parse a Chunk of memory
5240 * Returns zero if no error, the xmlParserErrors otherwise.
5243 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5245 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5246 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5247 "htmlParseChunk: context error\n", NULL, NULL);
5248 return(XML_ERR_INTERNAL_ERROR);
5250 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5251 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5252 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5253 int cur = ctxt->input->cur - ctxt->input->base;
5256 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5258 ctxt->errNo = XML_PARSER_EOF;
5259 ctxt->disableSAX = 1;
5260 return (XML_PARSER_EOF);
5262 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5263 ctxt->input->cur = ctxt->input->base + cur;
5265 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5267 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5271 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5272 htmlParseTryOrFinish(ctxt, terminate);
5274 } else if (ctxt->instate != XML_PARSER_EOF) {
5275 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5276 xmlParserInputBufferPtr in = ctxt->input->buf;
5277 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5278 (in->raw != NULL)) {
5281 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5283 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5284 "encoder error\n", NULL, NULL);
5285 return(XML_ERR_INVALID_ENCODING);
5290 htmlParseTryOrFinish(ctxt, terminate);
5292 if ((ctxt->instate != XML_PARSER_EOF) &&
5293 (ctxt->instate != XML_PARSER_EPILOG) &&
5294 (ctxt->instate != XML_PARSER_MISC)) {
5295 ctxt->errNo = XML_ERR_DOCUMENT_END;
5296 ctxt->wellFormed = 0;
5298 if (ctxt->instate != XML_PARSER_EOF) {
5299 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5300 ctxt->sax->endDocument(ctxt->userData);
5302 ctxt->instate = XML_PARSER_EOF;
5304 return((xmlParserErrors) ctxt->errNo);
5307 /************************************************************************
5309 * User entry points *
5311 ************************************************************************/
5314 * htmlCreatePushParserCtxt:
5315 * @sax: a SAX handler
5316 * @user_data: The user data returned on SAX callbacks
5317 * @chunk: a pointer to an array of chars
5318 * @size: number of chars in the array
5319 * @filename: an optional file name or URI
5320 * @enc: an optional encoding
5322 * Create a parser context for using the HTML parser in push mode
5323 * The value of @filename is used for fetching external entities
5324 * and error/warning reports.
5326 * Returns the new parser context or NULL
5329 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5330 const char *chunk, int size, const char *filename,
5331 xmlCharEncoding enc) {
5332 htmlParserCtxtPtr ctxt;
5333 htmlParserInputPtr inputStream;
5334 xmlParserInputBufferPtr buf;
5338 buf = xmlAllocParserInputBuffer(enc);
5339 if (buf == NULL) return(NULL);
5341 ctxt = htmlNewParserCtxt();
5343 xmlFreeParserInputBuffer(buf);
5346 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5347 ctxt->charset=XML_CHAR_ENCODING_UTF8;
5349 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
5351 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5352 if (ctxt->sax == NULL) {
5357 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5358 if (user_data != NULL)
5359 ctxt->userData = user_data;
5361 if (filename == NULL) {
5362 ctxt->directory = NULL;
5364 ctxt->directory = xmlParserGetDirectory(filename);
5367 inputStream = htmlNewInputStream(ctxt);
5368 if (inputStream == NULL) {
5369 xmlFreeParserCtxt(ctxt);
5374 if (filename == NULL)
5375 inputStream->filename = NULL;
5377 inputStream->filename = (char *)
5378 xmlCanonicPath((const xmlChar *) filename);
5379 inputStream->buf = buf;
5380 inputStream->base = inputStream->buf->buffer->content;
5381 inputStream->cur = inputStream->buf->buffer->content;
5383 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5385 inputPush(ctxt, inputStream);
5387 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5388 (ctxt->input->buf != NULL)) {
5389 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5390 int cur = ctxt->input->cur - ctxt->input->base;
5392 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5394 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5395 ctxt->input->cur = ctxt->input->base + cur;
5397 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5399 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5402 ctxt->progressive = 1;
5406 #endif /* LIBXML_PUSH_ENABLED */
5410 * @cur: a pointer to an array of xmlChar
5411 * @encoding: a free form C string describing the HTML document encoding, or NULL
5412 * @sax: the SAX handler block
5413 * @userData: if using SAX, this pointer will be provided on callbacks.
5415 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5416 * to handle parse events. If sax is NULL, fallback to the default DOM
5417 * behavior and return a tree.
5419 * Returns the resulting document tree unless SAX is NULL or the document is
5424 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5426 htmlParserCtxtPtr ctxt;
5430 if (cur == NULL) return(NULL);
5433 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5434 if (ctxt == NULL) return(NULL);
5436 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5438 ctxt->userData = userData;
5441 htmlParseDocument(ctxt);
5445 ctxt->userData = NULL;
5447 htmlFreeParserCtxt(ctxt);
5454 * @cur: a pointer to an array of xmlChar
5455 * @encoding: a free form C string describing the HTML document encoding, or NULL
5457 * parse an HTML in-memory document and build a tree.
5459 * Returns the resulting document tree
5463 htmlParseDoc(xmlChar *cur, const char *encoding) {
5464 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5469 * htmlCreateFileParserCtxt:
5470 * @filename: the filename
5471 * @encoding: a free form C string describing the HTML document encoding, or NULL
5473 * Create a parser context for a file content.
5474 * Automatic support for ZLIB/Compress compressed document is provided
5475 * by default if found at compile-time.
5477 * Returns the new parser context or NULL
5480 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5482 htmlParserCtxtPtr ctxt;
5483 htmlParserInputPtr inputStream;
5484 char *canonicFilename;
5485 /* htmlCharEncoding enc; */
5486 xmlChar *content, *content_line = (xmlChar *) "charset=";
5488 if (filename == NULL)
5491 ctxt = htmlNewParserCtxt();
5495 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5496 if (canonicFilename == NULL) {
5497 #ifdef LIBXML_SAX1_ENABLED
5498 if (xmlDefaultSAXHandler.error != NULL) {
5499 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5502 xmlFreeParserCtxt(ctxt);
5506 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5507 xmlFree(canonicFilename);
5508 if (inputStream == NULL) {
5509 xmlFreeParserCtxt(ctxt);
5513 inputPush(ctxt, inputStream);
5517 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5519 strcpy ((char *)content, (char *)content_line);
5520 strcat ((char *)content, (char *)encoding);
5521 htmlCheckEncoding (ctxt, content);
5531 * @filename: the filename
5532 * @encoding: a free form C string describing the HTML document encoding, or NULL
5533 * @sax: the SAX handler block
5534 * @userData: if using SAX, this pointer will be provided on callbacks.
5536 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5537 * compressed document is provided by default if found at compile-time.
5538 * It use the given SAX function block to handle the parsing callback.
5539 * If sax is NULL, fallback to the default DOM tree building routines.
5541 * Returns the resulting document tree unless SAX is NULL or the document is
5546 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5549 htmlParserCtxtPtr ctxt;
5550 htmlSAXHandlerPtr oldsax = NULL;
5554 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5555 if (ctxt == NULL) return(NULL);
5559 ctxt->userData = userData;
5562 htmlParseDocument(ctxt);
5567 ctxt->userData = NULL;
5569 htmlFreeParserCtxt(ctxt);
5576 * @filename: the filename
5577 * @encoding: a free form C string describing the HTML document encoding, or NULL
5579 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5580 * compressed document is provided by default if found at compile-time.
5582 * Returns the resulting document tree
5586 htmlParseFile(const char *filename, const char *encoding) {
5587 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5591 * htmlHandleOmittedElem:
5594 * Set and return the previous value for handling HTML omitted tags.
5596 * Returns the last value for 0 for no handling, 1 for auto insertion.
5600 htmlHandleOmittedElem(int val) {
5601 int old = htmlOmittedDefaultValue;
5603 htmlOmittedDefaultValue = val;
5608 * htmlElementAllowedHere:
5609 * @parent: HTML parent element
5610 * @elt: HTML element
5612 * Checks whether an HTML element may be a direct child of a parent element.
5613 * Note - doesn't check for deprecated elements
5615 * Returns 1 if allowed; 0 otherwise.
5618 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5621 if ( ! elt || ! parent || ! parent->subelts )
5624 for ( p = parent->subelts; *p; ++p )
5625 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5631 * htmlElementStatusHere:
5632 * @parent: HTML parent element
5633 * @elt: HTML element
5635 * Checks whether an HTML element may be a direct child of a parent element.
5636 * and if so whether it is valid or deprecated.
5638 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5641 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5642 if ( ! parent || ! elt )
5643 return HTML_INVALID ;
5644 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5645 return HTML_INVALID ;
5647 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5651 * @elt: HTML element
5652 * @attr: HTML attribute
5653 * @legacy: whether to allow deprecated attributes
5655 * Checks whether an attribute is valid for an element
5656 * Has full knowledge of Required and Deprecated attributes
5658 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5661 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5664 if ( !elt || ! attr )
5665 return HTML_INVALID ;
5667 if ( elt->attrs_req )
5668 for ( p = elt->attrs_req; *p; ++p)
5669 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5670 return HTML_REQUIRED ;
5672 if ( elt->attrs_opt )
5673 for ( p = elt->attrs_opt; *p; ++p)
5674 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5677 if ( legacy && elt->attrs_depr )
5678 for ( p = elt->attrs_depr; *p; ++p)
5679 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5680 return HTML_DEPRECATED ;
5682 return HTML_INVALID ;
5686 * @node: an htmlNodePtr in a tree
5687 * @legacy: whether to allow deprecated elements (YES is faster here
5688 * for Element nodes)
5690 * Checks whether the tree node is valid. Experimental (the author
5691 * only uses the HTML enhancements in a SAX parser)
5693 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5694 * legacy allowed) or htmlElementStatusHere (otherwise).
5695 * for Attribute nodes, a return from htmlAttrAllowed
5696 * for other nodes, HTML_NA (no checks performed)
5699 htmlNodeStatus(const htmlNodePtr node, int legacy) {
5701 return HTML_INVALID ;
5703 switch ( node->type ) {
5704 case XML_ELEMENT_NODE:
5706 ? ( htmlElementAllowedHere (
5707 htmlTagLookup(node->parent->name) , node->name
5708 ) ? HTML_VALID : HTML_INVALID )
5709 : htmlElementStatusHere(
5710 htmlTagLookup(node->parent->name) ,
5711 htmlTagLookup(node->name) )
5713 case XML_ATTRIBUTE_NODE:
5714 return htmlAttrAllowed(
5715 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5716 default: return HTML_NA ;
5719 /************************************************************************
5721 * New set (2.6.0) of simpler and more flexible APIs *
5723 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  Expects a variable named dict to be visible at the
 * point of use.  Wrapped in do { } while (0) so that it behaves as a
 * single statement, e.g. inside an unbraced if/else.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
	        (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	        xmlFree((char *)(str));				\
	} while (0)
5738 * @ctxt: an HTML parser context
5740 * Reset a parser context
5743 htmlCtxtReset(htmlParserCtxtPtr ctxt)
5745 xmlParserInputPtr input;
5754 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5755 xmlFreeInputStream(input);
5761 if (ctxt->spaceTab != NULL) {
5762 ctxt->spaceTab[0] = -1;
5763 ctxt->space = &ctxt->spaceTab[0];
5775 DICT_FREE(ctxt->version);
5776 ctxt->version = NULL;
5777 DICT_FREE(ctxt->encoding);
5778 ctxt->encoding = NULL;
5779 DICT_FREE(ctxt->directory);
5780 ctxt->directory = NULL;
5781 DICT_FREE(ctxt->extSubURI);
5782 ctxt->extSubURI = NULL;
5783 DICT_FREE(ctxt->extSubSystem);
5784 ctxt->extSubSystem = NULL;
5785 if (ctxt->myDoc != NULL)
5786 xmlFreeDoc(ctxt->myDoc);
5789 ctxt->standalone = -1;
5790 ctxt->hasExternalSubset = 0;
5791 ctxt->hasPErefs = 0;
5794 ctxt->instate = XML_PARSER_START;
5797 ctxt->wellFormed = 1;
5798 ctxt->nsWellFormed = 1;
5800 ctxt->vctxt.userData = ctxt;
5801 ctxt->vctxt.error = xmlParserValidityError;
5802 ctxt->vctxt.warning = xmlParserValidityWarning;
5803 ctxt->record_info = 0;
5805 ctxt->checkIndex = 0;
5807 ctxt->errNo = XML_ERR_OK;
5809 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5810 ctxt->catalogs = NULL;
5811 xmlInitNodeInfoSeq(&ctxt->node_seq);
5813 if (ctxt->attsDefault != NULL) {
5814 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5815 ctxt->attsDefault = NULL;
5817 if (ctxt->attsSpecial != NULL) {
5818 xmlHashFree(ctxt->attsSpecial, NULL);
5819 ctxt->attsSpecial = NULL;
5824 * htmlCtxtUseOptions:
5825 * @ctxt: an HTML parser context
5826 * @options: a combination of htmlParserOption(s)
5828 * Applies the options to the parser context
5830 * Returns 0 in case of success, the set of unknown or unimplemented options
5834 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5839 if (options & HTML_PARSE_NOWARNING) {
5840 ctxt->sax->warning = NULL;
5841 ctxt->vctxt.warning = NULL;
5842 options -= XML_PARSE_NOWARNING;
5843 ctxt->options |= XML_PARSE_NOWARNING;
5845 if (options & HTML_PARSE_NOERROR) {
5846 ctxt->sax->error = NULL;
5847 ctxt->vctxt.error = NULL;
5848 ctxt->sax->fatalError = NULL;
5849 options -= XML_PARSE_NOERROR;
5850 ctxt->options |= XML_PARSE_NOERROR;
5852 if (options & HTML_PARSE_PEDANTIC) {
5854 options -= XML_PARSE_PEDANTIC;
5855 ctxt->options |= XML_PARSE_PEDANTIC;
5858 if (options & XML_PARSE_NOBLANKS) {
5859 ctxt->keepBlanks = 0;
5860 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5861 options -= XML_PARSE_NOBLANKS;
5862 ctxt->options |= XML_PARSE_NOBLANKS;
5864 ctxt->keepBlanks = 1;
5865 if (options & HTML_PARSE_RECOVER) {
5867 options -= HTML_PARSE_RECOVER;
5870 if (options & HTML_PARSE_COMPACT) {
5871 ctxt->options |= HTML_PARSE_COMPACT;
5872 options -= HTML_PARSE_COMPACT;
5874 ctxt->dictNames = 0;
5880 * @ctxt: an HTML parser context
5881 * @URL: the base URL to use for the document
5882 * @encoding: the document encoding, or NULL
5883 * @options: a combination of htmlParserOption(s)
5884 * @reuse: keep the context for reuse
5886 * Common front-end for the htmlRead functions
5888 * Returns the resulting document tree or NULL
5891 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5892 int options, int reuse)
5896 htmlCtxtUseOptions(ctxt, options);
5898 if (encoding != NULL) {
5899 xmlCharEncodingHandlerPtr hdlr;
5901 hdlr = xmlFindCharEncodingHandler(encoding);
5903 xmlSwitchToEncoding(ctxt, hdlr);
5905 if ((URL != NULL) && (ctxt->input != NULL) &&
5906 (ctxt->input->filename == NULL))
5907 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5908 htmlParseDocument(ctxt);
5912 if ((ctxt->dictNames) &&
5914 (ret->dict == ctxt->dict))
5916 xmlFreeParserCtxt(ctxt);
5923 * @cur: a pointer to a zero terminated string
5924 * @URL: the base URL to use for the document
5925 * @encoding: the document encoding, or NULL
5926 * @options: a combination of htmlParserOption(s)
5928 * parse an XML in-memory document and build a tree.
5930 * Returns the resulting document tree
5933 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5935 htmlParserCtxtPtr ctxt;
5941 ctxt = htmlCreateDocParserCtxt(cur, NULL);
5944 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5949 * @filename: a file or URL
5950 * @encoding: the document encoding, or NULL
5951 * @options: a combination of htmlParserOption(s)
5953 * parse an XML file from the filesystem or the network.
5955 * Returns the resulting document tree
5958 htmlReadFile(const char *filename, const char *encoding, int options)
5960 htmlParserCtxtPtr ctxt;
5963 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5966 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5971 * @buffer: a pointer to a char array
5972 * @size: the size of the array
5973 * @URL: the base URL to use for the document
5974 * @encoding: the document encoding, or NULL
5975 * @options: a combination of htmlParserOption(s)
5977 * parse an XML in-memory document and build a tree.
5979 * Returns the resulting document tree
5982 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5984 htmlParserCtxtPtr ctxt;
5987 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5990 htmlDefaultSAXHandlerInit();
5991 if (ctxt->sax != NULL)
5992 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5993 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5998 * @fd: an open file descriptor
5999 * @URL: the base URL to use for the document
6000 * @encoding: the document encoding, or NULL
6001 * @options: a combination of htmlParserOption(s)
6003 * parse an XML from a file descriptor and build a tree.
6005 * Returns the resulting document tree
6008 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6010 htmlParserCtxtPtr ctxt;
6011 xmlParserInputBufferPtr input;
6012 xmlParserInputPtr stream;
6018 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6021 ctxt = xmlNewParserCtxt();
6023 xmlFreeParserInputBuffer(input);
6026 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6027 if (stream == NULL) {
6028 xmlFreeParserInputBuffer(input);
6029 xmlFreeParserCtxt(ctxt);
6032 inputPush(ctxt, stream);
6033 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6038 * @ioread: an I/O read function
6039 * @ioclose: an I/O close function
6040 * @ioctx: an I/O handler
6041 * @URL: the base URL to use for the document
6042 * @encoding: the document encoding, or NULL
6043 * @options: a combination of htmlParserOption(s)
6045 * parse an HTML document from I/O functions and source and build a tree.
6047 * Returns the resulting document tree
6050 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6051 void *ioctx, const char *URL, const char *encoding, int options)
6053 htmlParserCtxtPtr ctxt;
6054 xmlParserInputBufferPtr input;
6055 xmlParserInputPtr stream;
6061 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6062 XML_CHAR_ENCODING_NONE);
6065 ctxt = htmlNewParserCtxt();
6067 xmlFreeParserInputBuffer(input);
6070 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6071 if (stream == NULL) {
6072 xmlFreeParserInputBuffer(input);
6073 xmlFreeParserCtxt(ctxt);
6076 inputPush(ctxt, stream);
6077 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6082 * @ctxt: an HTML parser context
6083 * @cur: a pointer to a zero terminated string
6084 * @URL: the base URL to use for the document
6085 * @encoding: the document encoding, or NULL
6086 * @options: a combination of htmlParserOption(s)
6088 * parse an XML in-memory document and build a tree.
6089 * This reuses the existing @ctxt parser context
6091 * Returns the resulting document tree
6094 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6095 const char *URL, const char *encoding, int options)
6097 xmlParserInputPtr stream;
6104 htmlCtxtReset(ctxt);
6106 stream = xmlNewStringInputStream(ctxt, cur);
6107 if (stream == NULL) {
6110 inputPush(ctxt, stream);
6111 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6116 * @ctxt: an HTML parser context
6117 * @filename: a file or URL
6118 * @encoding: the document encoding, or NULL
6119 * @options: a combination of htmlParserOption(s)
6121 * parse an XML file from the filesystem or the network.
6122 * This reuses the existing @ctxt parser context
6124 * Returns the resulting document tree
6127 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6128 const char *encoding, int options)
6130 xmlParserInputPtr stream;
6132 if (filename == NULL)
6137 htmlCtxtReset(ctxt);
6139 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6140 if (stream == NULL) {
6143 inputPush(ctxt, stream);
6144 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6148 * htmlCtxtReadMemory:
6149 * @ctxt: an HTML parser context
6150 * @buffer: a pointer to a char array
6151 * @size: the size of the array
6152 * @URL: the base URL to use for the document
6153 * @encoding: the document encoding, or NULL
6154 * @options: a combination of htmlParserOption(s)
6156 * parse an XML in-memory document and build a tree.
6157 * This reuses the existing @ctxt parser context
6159 * Returns the resulting document tree
6162 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6163 const char *URL, const char *encoding, int options)
6165 xmlParserInputBufferPtr input;
6166 xmlParserInputPtr stream;
6173 htmlCtxtReset(ctxt);
6175 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6176 if (input == NULL) {
6180 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6181 if (stream == NULL) {
6182 xmlFreeParserInputBuffer(input);
6186 inputPush(ctxt, stream);
6187 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6192 * @ctxt: an HTML parser context
6193 * @fd: an open file descriptor
6194 * @URL: the base URL to use for the document
6195 * @encoding: the document encoding, or NULL
6196 * @options: a combination of htmlParserOption(s)
6198 * parse an XML from a file descriptor and build a tree.
6199 * This reuses the existing @ctxt parser context
6201 * Returns the resulting document tree
6204 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6205 const char *URL, const char *encoding, int options)
6207 xmlParserInputBufferPtr input;
6208 xmlParserInputPtr stream;
6215 htmlCtxtReset(ctxt);
6218 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6221 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6222 if (stream == NULL) {
6223 xmlFreeParserInputBuffer(input);
6226 inputPush(ctxt, stream);
6227 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6232 * @ctxt: an HTML parser context
6233 * @ioread: an I/O read function
6234 * @ioclose: an I/O close function
6235 * @ioctx: an I/O handler
6236 * @URL: the base URL to use for the document
6237 * @encoding: the document encoding, or NULL
6238 * @options: a combination of htmlParserOption(s)
6240 * parse an HTML document from I/O functions and source and build a tree.
6241 * This reuses the existing @ctxt parser context
6243 * Returns the resulting document tree
6246 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6247 xmlInputCloseCallback ioclose, void *ioctx,
6249 const char *encoding, int options)
6251 xmlParserInputBufferPtr input;
6252 xmlParserInputPtr stream;
6259 htmlCtxtReset(ctxt);
6261 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6262 XML_CHAR_ENCODING_NONE);
6265 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6266 if (stream == NULL) {
6267 xmlFreeParserInputBuffer(input);
6270 inputPush(ctxt, stream);
6271 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6274 #define bottom_HTMLparser
6275 #include "elfgcchack.h"
6276 #endif /* LIBXML_HTML_ENABLED */