gettext-tools/gnulib-lib/libxml/HTMLparser.c

   1 /*
   2  * HTMLparser.c : an HTML 4.0 non-verifying parser
   3  *
   4  * See Copyright for the status of this software.
   5  *
   6  * daniel@veillard.com
   7  */
   8
   9 #define IN_LIBXML
  10 #include "libxml.h"
  11 #ifdef LIBXML_HTML_ENABLED
  12
  13 #include <string.h>
  14 #ifdef HAVE_CTYPE_H
  15 #include <ctype.h>
  16 #endif
  17 #ifdef HAVE_STDLIB_H
  18 #include <stdlib.h>
  19 #endif
  20 #ifdef HAVE_SYS_STAT_H
  21 #include <sys/stat.h>
  22 #endif
  23 #ifdef HAVE_FCNTL_H
  24 #include <fcntl.h>
  25 #endif
  26 #ifdef HAVE_UNISTD_H
  27 #include <unistd.h>
  28 #endif
  29 #ifdef HAVE_ZLIB_H
  30 #include <zlib.h>
  31 #endif
  32
  33 #include <libxml/xmlmemory.h>
  34 #include <libxml/tree.h>
  35 #include <libxml/parser.h>
  36 #include <libxml/parserInternals.h>
  37 #include <libxml/xmlerror.h>
  38 #include <libxml/HTMLparser.h>
  39 #include <libxml/HTMLtree.h>
  40 #include <libxml/entities.h>
  41 #include <libxml/encoding.h>
  42 #include <libxml/valid.h>
  43 #include <libxml/xmlIO.h>
  44 #include <libxml/globals.h>
  45 #include <libxml/uri.h>
  46
  47 #define HTML_MAX_NAMELEN 1000
  48 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
  49 #define HTML_PARSER_BUFFER_SIZE 100
  50
  51 /* #define DEBUG */
  52 /* #define DEBUG_PUSH */
  53
  54 static int htmlOmittedDefaultValue = 1;
  55
  56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
  57                              xmlChar end, xmlChar  end2, xmlChar end3);
  58 static void htmlParseComment(htmlParserCtxtPtr ctxt);
  59
  60 /************************************************************************
  61  *                                                                      *
  62  *              Some factorized error routines                          *
  63  *                                                                      *
  64  ************************************************************************/
  65
  66 /**
  67  * htmlErrMemory:
  68  * @ctxt:  an HTML parser context
  69  * @extra:  extra informations
  70  *
  71  * Handle a redefinition of attribute error
  72  */
  73 static void
  74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  75 {
  76     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  77         (ctxt->instate == XML_PARSER_EOF))
  78         return;
  79     if (ctxt != NULL) {
  80         ctxt->errNo = XML_ERR_NO_MEMORY;
  81         ctxt->instate = XML_PARSER_EOF;
  82         ctxt->disableSAX = 1;
  83     }
  84     if (extra)
  85         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  86                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  87                         NULL, NULL, 0, 0,
  88                         "Memory allocation failed : %s\n", extra);
  89     else
  90         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  91                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  92                         NULL, NULL, 0, 0, "Memory allocation failed\n");
  93 }
  94
  95 /**
  96  * htmlParseErr:
  97  * @ctxt:  an HTML parser context
  98  * @error:  the error number
  99  * @msg:  the error message
 100  * @str1:  string infor
 101  * @str2:  string infor
 102  *
 103  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 104  */
 105 static void
 106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 107              const char *msg, const xmlChar *str1, const xmlChar *str2)
 108 {
 109     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 110         (ctxt->instate == XML_PARSER_EOF))
 111         return;
 112     if (ctxt != NULL)
 113         ctxt->errNo = error;
 114     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 115                     XML_ERR_ERROR, NULL, 0,
 116                     (const char *) str1, (const char *) str2,
 117                     NULL, 0, 0,
 118                     msg, str1, str2);
 119     if (ctxt != NULL)
 120         ctxt->wellFormed = 0;
 121 }
 122
 123 /**
 124  * htmlParseErrInt:
 125  * @ctxt:  an HTML parser context
 126  * @error:  the error number
 127  * @msg:  the error message
 128  * @val:  integer info
 129  *
 130  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 131  */
 132 static void
 133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 134              const char *msg, int val)
 135 {
 136     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 137         (ctxt->instate == XML_PARSER_EOF))
 138         return;
 139     if (ctxt != NULL)
 140         ctxt->errNo = error;
 141     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 142                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
 143                     NULL, val, 0, msg, val);
 144     if (ctxt != NULL)
 145         ctxt->wellFormed = 0;
 146 }
 147
 148 /************************************************************************
 149  *                                                                      *
 150  *              Parser stacks related functions and macros              *
 151  *                                                                      *
 152  ************************************************************************/
 153
 154 /**
 155  * htmlnamePush:
 156  * @ctxt:  an HTML parser context
 157  * @value:  the element name
 158  *
 159  * Pushes a new element name on top of the name stack
 160  *
 161  * Returns 0 in case of error, the index in the stack otherwise
 162  */
 163 static int
 164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
 165 {
 166     if (ctxt->nameNr >= ctxt->nameMax) {
 167         ctxt->nameMax *= 2;
 168         ctxt->nameTab = (const xmlChar * *)
 169                          xmlRealloc((xmlChar * *)ctxt->nameTab,
 170                                     ctxt->nameMax *
 171                                     sizeof(ctxt->nameTab[0]));
 172         if (ctxt->nameTab == NULL) {
 173             htmlErrMemory(ctxt, NULL);
 174             return (0);
 175         }
 176     }
 177     ctxt->nameTab[ctxt->nameNr] = value;
 178     ctxt->name = value;
 179     return (ctxt->nameNr++);
 180 }
 181 /**
 182  * htmlnamePop:
 183  * @ctxt: an HTML parser context
 184  *
 185  * Pops the top element name from the name stack
 186  *
 187  * Returns the name just removed
 188  */
 189 static const xmlChar *
 190 htmlnamePop(htmlParserCtxtPtr ctxt)
 191 {
 192     const xmlChar *ret;
 193
 194     if (ctxt->nameNr <= 0)
 195         return (NULL);
 196     ctxt->nameNr--;
 197     if (ctxt->nameNr < 0)
 198         return (NULL);
 199     if (ctxt->nameNr > 0)
 200         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
 201     else
 202         ctxt->name = NULL;
 203     ret = ctxt->nameTab[ctxt->nameNr];
 204     ctxt->nameTab[ctxt->nameNr] = NULL;
 205     return (ret);
 206 }
 207
 208 /*
 209  * Macros for accessing the content. Those should be used only by the parser,
 210  * and not exported.
 211  *
 212  * Dirty macros, i.e. one need to make assumption on the context to use them
 213  *
 214  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 215  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 216  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 217  *           in UNICODE mode. This should be used internally by the parser
 218  *           only to compare to ASCII values otherwise it would break when
 219  *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare against ASCII-based substrings.
 222  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 223  *           it should be used only to compare on ASCII based substring.
 224  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 225  *           strings without newlines within the parser.
 226  *
 227  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 228  *
 229  *   CURRENT Returns the current char value, with the full decoding of
 230  *           UTF-8 if we are using this mode. It returns an int.
 231  *   NEXT    Skip to the next character, this does the proper decoding
 232  *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 233  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 234  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 235  */
 236
/*
 * All macros below expect a variable named `ctxt` (the parser context)
 * to be in scope at the point of use.
 */

/* Current input byte passed through toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance the input by val bytes, adding val to nbChars and to the
 * column counter.  The line counter is not touched.
 * NOTE(review): expands to an unparenthesized comma expression.
 */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Input byte at offset val from the current position. */
#define NXT(val) ctxt->input->cur[(val)]

/* Input byte at offset val, passed through toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer itself. */
#define CUR_PTR ctxt->input->cur

/*
 * Call xmlParserInputShrink() when more than 2 * INPUT_CHUNK bytes lie
 * between base and cur and fewer than 2 * INPUT_CHUNK remain before end.
 * NOTE(review): expands to a bare `if`; an `else` written after a use of
 * this macro would attach to it.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input)

/*
 * Call xmlParserInputGrow() when ctxt->progressive is 0 and fewer than
 * INPUT_CHUNK bytes remain before end.
 * NOTE(review): expands to a bare `if`, same caveat as SHRINK.
 */
#define GROW if ((ctxt->progressive == 0) &&                            \
                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))   \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int (no multi-byte decoding). */
#define CURRENT ((int) (*ctxt->input->cur))

/* Skip blank characters at the current position. */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
/* Current input byte as an int; same expansion as CURRENT. */
#define CUR ((int) (*ctxt->input->cur))
/* Advance to the next character via xmlNextChar(). */
#define NEXT xmlNextChar(ctxt)

/* -1 while ctxt->token is non-zero, otherwise the current input byte. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
/* Repeated definitions of NXT and CUR_PTR, identical to the ones above. */
#define NXT(val) ctxt->input->cur[(val)]
#define CUR_PTR ctxt->input->cur


/*
 * Advance past one character occupying l bytes: update line/col from
 * the current byte, clear ctxt->token, add l to cur and 1 to nbChars.
 */
#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;            \
  } while (0)

/* Disabled continuation lines, kept commented out: */
/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Decode the character at the current position; its length goes in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
/* Decode the character at string position s; its length goes in l. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append character v (l bytes long) to buffer b at index i, advancing i.
 * A length of 1 is stored directly; otherwise xmlCopyChar() is used.
 * NOTE(review): expands to a bare if/else, so it is only safe as a
 * complete statement; arguments are evaluated more than once.
 */
#define COPY_BUF(l,b,i,v)                                               \
    if (l == 1) b[i++] = (xmlChar) v;                                   \
    else i += xmlCopyChar(l,&b[i],v)
 289
 290 /**
 291  * htmlCurrentChar:
 292  * @ctxt:  the HTML parser context
 293  * @len:  pointer to the length of the char read
 294  *
 295  * The current char value, if using UTF-8 this may actually span multiple
 296  * bytes in the input buffer. Implement the end of line normalization:
 297  * 2.11 End-of-Line Handling
 298  * If the encoding is unspecified, in the case we find an ISO-Latin-1
 299  * char, then the encoding converter is plugged in automatically.
 300  *
 301  * Returns the current char value and its length
 302  */
 303
 304 static int
 305 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
 306     if (ctxt->instate == XML_PARSER_EOF)
 307         return(0);
 308
 309     if (ctxt->token != 0) {
 310         *len = 0;
 311         return(ctxt->token);
 312     }
 313     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
 314         /*
 315          * We are supposed to handle UTF8, check it's valid
 316          * From rfc2044: encoding of the Unicode values on UTF-8:
 317          *
 318          * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 319          * 0000 0000-0000 007F   0xxxxxxx
 320          * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 321          * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 322          *
 323          * Check for the 0x110000 limit too
 324          */
 325         const unsigned char *cur = ctxt->input->cur;
 326         unsigned char c;
 327         unsigned int val;
 328
 329         c = *cur;
 330         if (c & 0x80) {
 331             if (cur[1] == 0)
 332                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 333             if ((cur[1] & 0xc0) != 0x80)
 334                 goto encoding_error;
 335             if ((c & 0xe0) == 0xe0) {
 336
 337                 if (cur[2] == 0)
 338                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 339                 if ((cur[2] & 0xc0) != 0x80)
 340                     goto encoding_error;
 341                 if ((c & 0xf0) == 0xf0) {
 342                     if (cur[3] == 0)
 343                         xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 344                     if (((c & 0xf8) != 0xf0) ||
 345                         ((cur[3] & 0xc0) != 0x80))
 346                         goto encoding_error;
 347                     /* 4-byte code */
 348                     *len = 4;
 349                     val = (cur[0] & 0x7) << 18;
 350                     val |= (cur[1] & 0x3f) << 12;
 351                     val |= (cur[2] & 0x3f) << 6;
 352                     val |= cur[3] & 0x3f;
 353                 } else {
 354                   /* 3-byte code */
 355                     *len = 3;
 356                     val = (cur[0] & 0xf) << 12;
 357                     val |= (cur[1] & 0x3f) << 6;
 358                     val |= cur[2] & 0x3f;
 359                 }
 360             } else {
 361               /* 2-byte code */
 362                 *len = 2;
 363                 val = (cur[0] & 0x1f) << 6;
 364                 val |= cur[1] & 0x3f;
 365             }
 366             if (!IS_CHAR(val)) {
 367                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 368                                 "Char 0x%X out of allowed range\n", val);
 369             }
 370             return(val);
 371         } else {
 372             /* 1-byte code */
 373             *len = 1;
 374             return((int) *ctxt->input->cur);
 375         }
 376     }
 377     /*
 378      * Assume it's a fixed length encoding (1) with
 379      * a compatible encoding for the ASCII set, since
 380      * XML constructs only use < 128 chars
 381      */
 382     *len = 1;
 383     if ((int) *ctxt->input->cur < 0x80)
 384         return((int) *ctxt->input->cur);
 385
 386     /*
 387      * Humm this is bad, do an automatic flow conversion
 388      */
 389     xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
 390     ctxt->charset = XML_CHAR_ENCODING_UTF8;
 391     return(xmlCurrentChar(ctxt, len));
 392
 393 encoding_error:
 394     /*
 395      * If we detect an UTF8 error that probably mean that the
 396      * input encoding didn't get properly advertized in the
 397      * declaration header. Report the error and switch the encoding
 398      * to ISO-Latin-1 (if you don't like this policy, just declare the
 399      * encoding !)
 400      */
 401     {
 402         char buffer[150];
 403
 404         snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
 405                         ctxt->input->cur[0], ctxt->input->cur[1],
 406                         ctxt->input->cur[2], ctxt->input->cur[3]);
 407         htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 408                      "Input is not proper UTF-8, indicate encoding !\n",
 409                      BAD_CAST buffer, NULL);
 410     }
 411
 412     ctxt->charset = XML_CHAR_ENCODING_8859_1;
 413     *len = 1;
 414     return((int) *ctxt->input->cur);
 415 }
 416
 417 /**
 418  * htmlSkipBlankChars:
 419  * @ctxt:  the HTML parser context
 420  *
 421  * skip all blanks character found at that point in the input streams.
 422  *
 423  * Returns the number of space chars skipped
 424  */
 425
 426 static int
 427 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
 428     int res = 0;
 429
 430     while (IS_BLANK_CH(*(ctxt->input->cur))) {
 431         if ((*ctxt->input->cur == 0) &&
 432             (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
 433                 xmlPopInput(ctxt);
 434         } else {
 435             if (*(ctxt->input->cur) == '\n') {
 436                 ctxt->input->line++; ctxt->input->col = 1;
 437             } else ctxt->input->col++;
 438             ctxt->input->cur++;
 439             ctxt->nbChars++;
 440             if (*ctxt->input->cur == 0)
 441                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 442         }
 443         res++;
 444     }
 445     return(res);
 446 }
 447
 448
 449
 450 /************************************************************************
 451  *                                                                      *
 452  *              The list of HTML elements and their properties          *
 453  *                                                                      *
 454  ************************************************************************/
 455
 456 /*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 459  *             2 means it's forbidden (empty elements)
 460  *             3 means the tag is stylistic and should be closed easily
 461  *  Depr:      this element is deprecated
 462  *  DTD:       1 means that this element is valid only in the Loose DTD
 463  *             2 means that this element is valid only in the Frameset DTD
 464  *
 465  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 466         , subElements , impliedsubelt , Attributes, userdata
 467  */
 468
 469 /* Definitions and a couple of vars for HTML Elements */
 470
 471 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
 472 #define NB_FONTSTYLE 8
 473 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
 474 #define NB_PHRASE 10
 475 #define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
 476 #define NB_SPECIAL 15
 477 #define INLINE PCDATA FONTSTYLE PHRASE SPECIAL FORMCTRL
 478 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
 479 #define BLOCK HEADING, LIST "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
 480 #define NB_BLOCK NB_HEADING + NB_LIST + 14
 481 #define FORMCTRL "input", "select", "textarea", "label", "button"
 482 #define NB_FORMCTRL 5
 483 #define PCDATA
 484 #define NB_PCDATA 0
 485 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
 486 #define NB_HEADING 6
 487 #define LIST "ul", "ol", "dir", "menu"
 488 #define NB_LIST 4
 489 #define MODIFIER
 490 #define NB_MODIFIER 0
 491 #define FLOW BLOCK,INLINE
 492 #define NB_FLOW NB_BLOCK + NB_INLINE
 493 #define EMPTY NULL
 494
 495
 496 static const char* const html_flow[] = { FLOW, NULL } ;
 497 static const char* const html_inline[] = { INLINE, NULL } ;
 498
 499 /* placeholders: elts with content but no subelements */
 500 static const char* const html_pcdata[] = { NULL } ;
 501 #define html_cdata html_pcdata
 502
 503
 504 /* ... and for HTML Attributes */
 505
 506 #define COREATTRS "id", "class", "style", "title"
 507 #define NB_COREATTRS 4
 508 #define I18N "lang", "dir"
 509 #define NB_I18N 2
 510 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
 511 #define NB_EVENTS 9
 512 #define ATTRS COREATTRS,I18N,EVENTS
 513 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
 514 #define CELLHALIGN "align", "char", "charoff"
 515 #define NB_CELLHALIGN 3
 516 #define CELLVALIGN "valign"
 517 #define NB_CELLVALIGN 1
 518
 519 static const char* const html_attrs[] = { ATTRS, NULL } ;
 520 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
 521 static const char* const core_attrs[] = { COREATTRS, NULL } ;
 522 static const char* const i18n_attrs[] = { I18N, NULL } ;
 523
 524
 525 /* Other declarations that should go inline ... */
 526 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
 527         "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
 528         "tabindex", "onfocus", "onblur", NULL } ;
 529 static const char* const target_attr[] = { "target", NULL } ;
 530 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
 531 static const char* const alt_attr[] = { "alt", NULL } ;
 532 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
 533 static const char* const href_attrs[] = { "href", NULL } ;
 534 static const char* const clear_attrs[] = { "clear", NULL } ;
 535 static const char* const inline_p[] = { INLINE, "p", NULL } ;
 536
 537 static const char* const flow_param[] = { FLOW, "param", NULL } ;
 538 static const char* const applet_attrs[] = { COREATTRS , "codebase",
 539                 "archive", "alt", "name", "height", "width", "align",
 540                 "hspace", "vspace", NULL } ;
 541 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
 542         "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 543 static const char* const basefont_attrs[] =
 544         { "id", "size", "color", "face", NULL } ;
 545 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
 546 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
 547 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
 548 static const char* const body_depr[] = { "background", "bgcolor", "text",
 549         "link", "vlink", "alink", NULL } ;
 550 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
 551         "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 552
 553
 554 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
 555 static const char* const col_elt[] = { "col", NULL } ;
 556 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
 557 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
 558 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
 559 static const char* const compact_attr[] = { "compact", NULL } ;
 560 static const char* const label_attr[] = { "label", NULL } ;
 561 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
 562 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
 563 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
 564 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
 565 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
 566 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
 567 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
 568 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
 569 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
 570 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
 571 static const char* const version_attr[] = { "version", NULL } ;
 572 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
 573 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
 574 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
 575 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
 576 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
 577 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
 578 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
 579 static const char* const align_attr[] = { "align", NULL } ;
 580 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
 581 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
 582 static const char* const name_attr[] = { "name", NULL } ;
 583 static const char* const action_attr[] = { "action", NULL } ;
 584 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
 585 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
 586 static const char* const content_attr[] = { "content", NULL } ;
 587 static const char* const type_attr[] = { "type", NULL } ;
 588 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
 589 static const char* const object_contents[] = { FLOW, "param", NULL } ;
 590 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
 591 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
 592 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
 593 static const char* const option_elt[] = { "option", NULL } ;
 594 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
 595 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
 596 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
 597 static const char* const width_attr[] = { "width", NULL } ;
 598 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
 599 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
 600 static const char* const language_attr[] = { "language", NULL } ;
 601 static const char* const select_content[] = { "optgroup", "option", NULL } ;
 602 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
 603 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
 604 static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
 605 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
 606 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
 607 static const char* const tr_elt[] = { "tr", NULL } ;
 608 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
 609 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
 610 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
 611 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
 612 static const char* const tr_contents[] = { "th", "td", NULL } ;
 613 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
 614 static const char* const li_elt[] = { "li", NULL } ;
 615 static const char* const ul_depr[] = { "type", "compact", NULL} ;
 616 static const char* const dir_attr[] = { "dir", NULL} ;
 617
/*
 * Cast placed in front of the `const char* const` string tables above
 * when they are used in the html40ElementTable initializers; it drops
 * the const qualifier on the array elements.
 */
#define DECL (const char**)
 619
 620 static const htmlElemDesc
 621 html40ElementTable[] = {
 622 { "a",          0, 0, 0, 0, 0, 0, 1, "anchor ",
 623         DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
 624 },
 625 { "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form",
 626         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 627 },
 628 { "acronym",    0, 0, 0, 0, 0, 0, 1, "",
 629         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 630 },
 631 { "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
 632         DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
 633 },
 634 { "applet",     0, 0, 0, 0, 1, 1, 2, "java applet ",
 635         DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
 636 },
 637 { "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
 638         EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
 639 },
 640 { "b",          0, 3, 0, 0, 0, 0, 1, "bold text style",
 641         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 642 },
 643 { "base",       0, 2, 2, 1, 0, 0, 0, "document base uri ",
 644         EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
 645 },
 646 { "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
 647         EMPTY , NULL , NULL, DECL basefont_attrs, NULL
 648 },
 649 { "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
 650         DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
 651 },
 652 { "big",        0, 3, 0, 0, 0, 0, 1, "large text style",
 653         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 654 },
 655 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
 656         DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
 657 },
 658 { "body",       1, 1, 0, 0, 0, 0, 0, "document body ",
 659         DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
 660 },
 661 { "br",         0, 2, 2, 1, 0, 0, 1, "forced line break ",
 662         EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
 663 },
 664 { "button",     0, 0, 0, 0, 0, 0, 2, "push button ",
 665         DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
 666 },
 667 { "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
 668         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 669 },
 670 { "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
 671         DECL html_flow , NULL , NULL, DECL html_attrs, NULL
 672 },
 673 { "cite",       0, 0, 0, 0, 0, 0, 1, "citation",
 674         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 675 },
 676 { "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment",
 677         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 678 },
 679 { "col",        0, 2, 2, 1, 0, 0, 0, "table column ",
 680         EMPTY , NULL , DECL col_attrs , NULL, NULL
 681 },
 682 { "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
 683         DECL col_elt , "col" , DECL col_attrs , NULL, NULL
 684 },
 685 { "dd",         0, 1, 0, 0, 0, 0, 0, "definition description ",
 686         DECL html_flow , NULL , DECL html_attrs, NULL, NULL
 687 },
 688 { "del",        0, 0, 0, 0, 0, 0, 2, "deleted text ",
 689         DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
 690 },
 691 { "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition",
 692         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 693 },
 694 { "dir",        0, 0, 0, 0, 1, 1, 0, "directory list",
 695         DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
 696 },
 697 { "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container",
 698         DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
 699 },
 700 { "dl",         0, 0, 0, 0, 0, 0, 0, "definition list ",
 701         DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
 702 },
 703 { "dt",         0, 1, 0, 0, 0, 0, 0, "definition term ",
 704         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 705 },
 706 { "em",         0, 3, 0, 0, 0, 0, 1, "emphasis",
 707         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 708 },
 709 { "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
 710         DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
 711 },
 712 { "font",       0, 3, 0, 0, 1, 1, 1, "local change to font ",
 713         DECL html_inline, NULL, NULL, DECL font_attrs, NULL
 714 },
 715 { "form",       0, 0, 0, 0, 0, 0, 0, "interactive form ",
 716         DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
 717 },
 718 { "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " ,
 719         EMPTY, NULL, NULL, DECL frame_attrs, NULL
 720 },
 721 { "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
 722         DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
 723 },
 724 { "h1",         0, 0, 0, 0, 0, 0, 0, "heading ",
 725         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 726 },
 727 { "h2",         0, 0, 0, 0, 0, 0, 0, "heading ",
 728         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 729 },
 730 { "h3",         0, 0, 0, 0, 0, 0, 0, "heading ",
 731         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 732 },
 733 { "h4",         0, 0, 0, 0, 0, 0, 0, "heading ",
 734         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 735 },
 736 { "h5",         0, 0, 0, 0, 0, 0, 0, "heading ",
 737         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 738 },
 739 { "h6",         0, 0, 0, 0, 0, 0, 0, "heading ",
 740         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 741 },
 742 { "head",       1, 1, 0, 0, 0, 0, 0, "document head ",
 743         DECL head_contents, NULL, DECL head_attrs, NULL, NULL
 744 },
 745 { "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
 746         EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
 747 },
 748 { "html",       1, 1, 0, 0, 0, 0, 0, "document root element ",
 749         DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
 750 },
 751 { "i",          0, 3, 0, 0, 0, 0, 1, "italic text style",
 752         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 753 },
 754 { "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
 755         DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
 756 },
 757 { "img",        0, 2, 2, 1, 0, 0, 1, "embedded image ",
 758         EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
 759 },
 760 { "input",      0, 2, 2, 1, 0, 0, 1, "form control ",
 761         EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
 762 },
 763 { "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text",
 764         DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
 765 },
 766 { "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
 767         EMPTY, NULL, NULL, DECL prompt_attrs, NULL
 768 },
 769 { "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
 770         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 771 },
 772 { "label",      0, 0, 0, 0, 0, 0, 1, "form field label text ",
 773         DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
 774 },
 775 { "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
 776         DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
 777 },
 778 { "li",         0, 1, 1, 0, 0, 0, 0, "list item ",
 779         DECL html_flow, NULL, DECL html_attrs, NULL, NULL
 780 },
 781 { "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
 782         EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
 783 },
 784 { "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map ",
 785         DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
 786 },
 787 { "menu",       0, 0, 0, 0, 1, 1, 0, "menu list ",
 788         DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
 789 },
 790 { "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
 791         EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
 792 },
 793 { "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
 794         DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
 795 },
 796 { "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
 797         DECL html_flow, "div", DECL html_attrs, NULL, NULL
 798 },
 799 { "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
 800         DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
 801 },
 802 { "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list ",
 803         DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
 804 },
 805 { "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
 806         option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
 807 },
 808 { "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
 809         DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
 810 },
 811 { "p",          0, 1, 0, 0, 0, 0, 0, "paragraph ",
 812         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 813 },
 814 { "param",      0, 2, 2, 1, 0, 0, 0, "named property value ",
 815         EMPTY, NULL, DECL param_attrs, NULL, name_attr
 816 },
 817 { "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text ",
 818         DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
 819 },
 820 { "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
 821         DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
 822 },
 823 { "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style",
 824         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
 825 },
 826 { "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
 827         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 828 },
 829 { "script",     0, 0, 0, 0, 0, 0, 2, "script statements ",
 830         DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
 831 },
 832 { "select",     0, 0, 0, 0, 0, 0, 1, "option selector ",
 833         DECL select_content, NULL, DECL select_attrs, NULL, NULL
 834 },
 835 { "small",      0, 3, 0, 0, 0, 0, 1, "small text style",
 836         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 837 },
 838 { "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
 839         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 840 },
 841 { "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text",
 842         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
 843 },
 844 { "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis",
 845         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 846 },
 847 { "style",      0, 0, 0, 0, 0, 0, 0, "style info ",
 848         DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
 849 },
 850 { "sub",        0, 3, 0, 0, 0, 0, 1, "subscript",
 851         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 852 },
 853 { "sup",        0, 3, 0, 0, 0, 0, 1, "superscript ",
 854         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 855 },
 856 { "table",      0, 0, 0, 0, 0, 0, 0, "",
 857         DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
 858 },
 859 { "tbody",      1, 0, 0, 0, 0, 0, 0, "table body ",
 860         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
 861 },
 862 { "td",         0, 0, 0, 0, 0, 0, 0, "table data cell",
 863         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
 864 },
 865 { "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
 866         DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
 867 },
 868 { "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer ",
 869         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
 870 },
 871 { "th",         0, 1, 0, 0, 0, 0, 0, "table header cell",
 872         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
 873 },
 874 { "thead",      0, 1, 0, 0, 0, 0, 0, "table header ",
 875         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
 876 },
 877 { "title",      0, 0, 0, 0, 0, 0, 0, "document title ",
 878         DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
 879 },
 880 { "tr",         0, 0, 0, 0, 0, 0, 0, "table row ",
 881         DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
 882 },
 883 { "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
 884         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 885 },
 886 { "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style",
 887         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
 888 },
 889 { "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list ",
 890         DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
 891 },
 892 { "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
 893         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 894 }
 895 };
 896
 897 /*
 898  * start tags that imply the end of current element
 899  */
 900 static const char * const htmlStartClose[] = {
 901 "form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
 902                 "dl", "ul", "ol", "menu", "dir", "address", "pre",
 903                 "listing", "xmp", "head", NULL,
 904 "head",         "p", NULL,
 905 "title",        "p", NULL,
 906 "body",         "head", "style", "link", "title", "p", NULL,
 907 "frameset",     "head", "style", "link", "title", "p", NULL,
 908 "li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
 909                 "pre", "listing", "xmp", "head", "li", NULL,
 910 "hr",           "p", "head", NULL,
 911 "h1",           "p", "head", NULL,
 912 "h2",           "p", "head", NULL,
 913 "h3",           "p", "head", NULL,
 914 "h4",           "p", "head", NULL,
 915 "h5",           "p", "head", NULL,
 916 "h6",           "p", "head", NULL,
 917 "dir",          "p", "head", NULL,
 918 "address",      "p", "head", "ul", NULL,
 919 "pre",          "p", "head", "ul", NULL,
 920 "listing",      "p", "head", NULL,
 921 "xmp",          "p", "head", NULL,
 922 "blockquote",   "p", "head", NULL,
 923 "dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
 924                 "xmp", "head", NULL,
 925 "dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
 926                 "head", "dd", NULL,
 927 "dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
 928                 "head", "dt", NULL,
 929 "ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
 930                 "listing", "xmp", NULL,
 931 "ol",           "p", "head", "ul", NULL,
 932 "menu",         "p", "head", "ul", NULL,
 933 "p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
 934 "div",          "p", "head", NULL,
 935 "noscript",     "p", "head", NULL,
 936 "center",       "font", "b", "i", "p", "head", NULL,
 937 "a",            "a", NULL,
 938 "caption",      "p", NULL,
 939 "colgroup",     "caption", "colgroup", "col", "p", NULL,
 940 "col",          "caption", "col", "p", NULL,
 941 "table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
 942                 "listing", "xmp", "a", NULL,
 943 "th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
 944 "td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
 945 "tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
 946 "thead",        "caption", "col", "colgroup", NULL,
 947 "tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
 948                 "tbody", "p", NULL,
 949 "tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
 950                 "tfoot", "tbody", "p", NULL,
 951 "optgroup",     "option", NULL,
 952 "option",       "option", NULL,
 953 "fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
 954                 "pre", "listing", "xmp", "a", NULL,
 955 NULL
 956 };
 957
 958 /*
 959  * The list of HTML elements which are supposed not to have
 960  * CDATA content and where a p element will be implied
 961  *
 962  * TODO: extend that list by reading the HTML SGML DTD on
 963  *       implied paragraph
 964  */
 965 static const char *const htmlNoContentElements[] = {
 966     "html",
 967     "head",
 968     NULL
 969 };
 970
 971 /*
 972  * The list of HTML attributes which are of content %Script;
 973  * NOTE: when adding ones, check htmlIsScriptAttribute() since
 974  *       it assumes the name starts with 'on'
 975  */
 976 static const char *const htmlScriptAttributes[] = {
 977     "onclick",
 978     "ondblclick",
 979     "onmousedown",
 980     "onmouseup",
 981     "onmouseover",
 982     "onmousemove",
 983     "onmouseout",
 984     "onkeypress",
 985     "onkeydown",
 986     "onkeyup",
 987     "onload",
 988     "onunload",
 989     "onfocus",
 990     "onblur",
 991     "onsubmit",
 992     "onrest",
 993     "onchange",
 994     "onselect"
 995 };
 996
 997 /*
 998  * This table is used by the htmlparser to know what to do with
 999  * broken html pages. By assigning different priorities to different
1000  * elements the parser can decide how to handle extra endtags.
1001  * Endtags are only allowed to close elements with lower or equal
1002  * priority.
1003  */
1004
1005 typedef struct {
1006     const char *name;
1007     int priority;
1008 } elementPriority;
1009
1010 static const elementPriority htmlEndPriority[] = {
1011     {"div",   150},
1012     {"td",    160},
1013     {"th",    160},
1014     {"tr",    170},
1015     {"thead", 180},
1016     {"tbody", 180},
1017     {"tfoot", 180},
1018     {"table", 190},
1019     {"head",  200},
1020     {"body",  200},
1021     {"html",  220},
1022     {NULL,    100} /* Default priority */
1023 };
1024
/*
 * Index over htmlStartClose: once htmlInitAutoClose() has run, each used
 * slot points at the first string of one group of that table and the
 * unused slots are NULL.
 */
static const char **htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index above has been built. */
static int htmlStartCloseIndexinitialized = 0;
1027
1028 /************************************************************************
1029  *                                                                      *
1030  *              functions to handle HTML specific data                  *
1031  *                                                                      *
1032  ************************************************************************/
1033
1034 /**
1035  * htmlInitAutoClose:
1036  *
1037  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1038  * This is not reentrant. Call xmlInitParser() once before processing in
1039  * case of use in multithreaded programs.
1040  */
1041 void
1042 htmlInitAutoClose(void) {
1043     int indx, i = 0;
1044
1045     if (htmlStartCloseIndexinitialized) return;
1046
1047     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1048     indx = 0;
1049     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1050         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1051         while (htmlStartClose[i] != NULL) i++;
1052         i++;
1053     }
1054     htmlStartCloseIndexinitialized = 1;
1055 }
1056
1057 /**
1058  * htmlTagLookup:
1059  * @tag:  The tag name in lowercase
1060  *
1061  * Lookup the HTML tag in the ElementTable
1062  *
1063  * Returns the related htmlElemDescPtr or NULL if not found.
1064  */
1065 const htmlElemDesc *
1066 htmlTagLookup(const xmlChar *tag) {
1067     unsigned int i;
1068
1069     for (i = 0; i < (sizeof(html40ElementTable) /
1070                      sizeof(html40ElementTable[0]));i++) {
1071         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1072             return((htmlElemDescPtr) &html40ElementTable[i]);
1073     }
1074     return(NULL);
1075 }
1076
1077 /**
1078  * htmlGetEndPriority:
1079  * @name: The name of the element to look up the priority for.
1080  *
1081  * Return value: The "endtag" priority.
1082  **/
1083 static int
1084 htmlGetEndPriority (const xmlChar *name) {
1085     int i = 0;
1086
1087     while ((htmlEndPriority[i].name != NULL) &&
1088            (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1089         i++;
1090
1091     return(htmlEndPriority[i].priority);
1092 }
1093
1094
1095 /**
1096  * htmlCheckAutoClose:
1097  * @newtag:  The new tag name
1098  * @oldtag:  The old tag name
1099  *
1100  * Checks whether the new tag is one of the registered valid tags for
1101  * closing old.
1102  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1103  *
1104  * Returns 0 if no, 1 if yes.
1105  */
1106 static int
1107 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1108 {
1109     int i, indx;
1110     const char **closed = NULL;
1111
1112     if (htmlStartCloseIndexinitialized == 0)
1113         htmlInitAutoClose();
1114
1115     /* inefficient, but not a big deal */
1116     for (indx = 0; indx < 100; indx++) {
1117         closed = htmlStartCloseIndex[indx];
1118         if (closed == NULL)
1119             return (0);
1120         if (xmlStrEqual(BAD_CAST * closed, newtag))
1121             break;
1122     }
1123
1124     i = closed - htmlStartClose;
1125     i++;
1126     while (htmlStartClose[i] != NULL) {
1127         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1128             return (1);
1129         }
1130         i++;
1131     }
1132     return (0);
1133 }
1134
1135 /**
1136  * htmlAutoCloseOnClose:
1137  * @ctxt:  an HTML parser context
1138  * @newtag:  The new tag name
1139  * @force:  force the tag closure
1140  *
1141  * The HTML DTD allows an ending tag to implicitly close other tags.
1142  */
1143 static void
1144 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1145 {
1146     const htmlElemDesc *info;
1147     int i, priority;
1148
1149     priority = htmlGetEndPriority(newtag);
1150
1151     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1152
1153         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1154             break;
1155         /*
1156          * A missplaced endtag can only close elements with lower
1157          * or equal priority, so if we find an element with higher
1158          * priority before we find an element with
1159          * matching name, we just ignore this endtag
1160          */
1161         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1162             return;
1163     }
1164     if (i < 0)
1165         return;
1166
1167     while (!xmlStrEqual(newtag, ctxt->name)) {
1168         info = htmlTagLookup(ctxt->name);
1169         if ((info != NULL) && (info->endTag == 3)) {
1170             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1171                          "Opening and ending tag mismatch: %s and %s\n",
1172                          newtag, ctxt->name);
1173         }
1174         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1175             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1176         htmlnamePop(ctxt);
1177     }
1178 }
1179
1180 /**
1181  * htmlAutoCloseOnEnd:
1182  * @ctxt:  an HTML parser context
1183  *
1184  * Close all remaining tags at the end of the stream
1185  */
1186 static void
1187 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1188 {
1189     int i;
1190
1191     if (ctxt->nameNr == 0)
1192         return;
1193     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1194         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1196         htmlnamePop(ctxt);
1197     }
1198 }
1199
1200 /**
1201  * htmlAutoClose:
1202  * @ctxt:  an HTML parser context
1203  * @newtag:  The new tag name or NULL
1204  *
1205  * The HTML DTD allows a tag to implicitly close other tags.
1206  * The list is kept in htmlStartClose array. This function is
1207  * called when a new tag has been detected and generates the
1208  * appropriates closes if possible/needed.
1209  * If newtag is NULL this mean we are at the end of the resource
1210  * and we should check
1211  */
1212 static void
1213 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1214 {
1215     while ((newtag != NULL) && (ctxt->name != NULL) &&
1216            (htmlCheckAutoClose(newtag, ctxt->name))) {
1217         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1218             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1219         htmlnamePop(ctxt);
1220     }
1221     if (newtag == NULL) {
1222         htmlAutoCloseOnEnd(ctxt);
1223         return;
1224     }
1225     while ((newtag == NULL) && (ctxt->name != NULL) &&
1226            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1227             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1228             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1229         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1230             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1231         htmlnamePop(ctxt);
1232     }
1233 }
1234
1235 /**
1236  * htmlAutoCloseTag:
1237  * @doc:  the HTML document
1238  * @name:  The tag name
1239  * @elem:  the HTML element
1240  *
1241  * The HTML DTD allows a tag to implicitly close other tags.
1242  * The list is kept in htmlStartClose array. This function checks
1243  * if the element or one of it's children would autoclose the
1244  * given tag.
1245  *
1246  * Returns 1 if autoclose, 0 otherwise
1247  */
1248 int
1249 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1250     htmlNodePtr child;
1251
1252     if (elem == NULL) return(1);
1253     if (xmlStrEqual(name, elem->name)) return(0);
1254     if (htmlCheckAutoClose(elem->name, name)) return(1);
1255     child = elem->children;
1256     while (child != NULL) {
1257         if (htmlAutoCloseTag(doc, name, child)) return(1);
1258         child = child->next;
1259     }
1260     return(0);
1261 }
1262
1263 /**
1264  * htmlIsAutoClosed:
1265  * @doc:  the HTML document
1266  * @elem:  the HTML element
1267  *
1268  * The HTML DTD allows a tag to implicitly close other tags.
1269  * The list is kept in htmlStartClose array. This function checks
1270  * if a tag is autoclosed by one of it's child
1271  *
1272  * Returns 1 if autoclosed, 0 otherwise
1273  */
1274 int
1275 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1276     htmlNodePtr child;
1277
1278     if (elem == NULL) return(1);
1279     child = elem->children;
1280     while (child != NULL) {
1281         if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1282         child = child->next;
1283     }
1284     return(0);
1285 }
1286
1287 /**
1288  * htmlCheckImplied:
1289  * @ctxt:  an HTML parser context
1290  * @newtag:  The new tag name
1291  *
1292  * The HTML DTD allows a tag to exists only implicitly
1293  * called when a new tag has been detected and generates the
1294  * appropriates implicit tags if missing
1295  */
1296 static void
1297 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1298     if (!htmlOmittedDefaultValue)
1299         return;
1300     if (xmlStrEqual(newtag, BAD_CAST"html"))
1301         return;
1302     if (ctxt->nameNr <= 0) {
1303         htmlnamePush(ctxt, BAD_CAST"html");
1304         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1305             ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1306     }
1307     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1308         return;
1309     if ((ctxt->nameNr <= 1) &&
1310         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1311          (xmlStrEqual(newtag, BAD_CAST"style")) ||
1312          (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1313          (xmlStrEqual(newtag, BAD_CAST"link")) ||
1314          (xmlStrEqual(newtag, BAD_CAST"title")) ||
1315          (xmlStrEqual(newtag, BAD_CAST"base")))) {
1316             /*
1317              * dropped OBJECT ... i you put it first BODY will be
1318              * assumed !
1319              */
1320             htmlnamePush(ctxt, BAD_CAST"head");
1321             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1322                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1323     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1324                (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1325                (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1326         int i;
1327         for (i = 0;i < ctxt->nameNr;i++) {
1328             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1329                 return;
1330             }
1331             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1332                 return;
1333             }
1334         }
1335
1336         htmlnamePush(ctxt, BAD_CAST"body");
1337         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1338             ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1339     }
1340 }
1341
1342 /**
1343  * htmlCheckParagraph
1344  * @ctxt:  an HTML parser context
1345  *
1346  * Check whether a p element need to be implied before inserting
1347  * characters in the current element.
1348  *
1349  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1350  *         in case of error.
1351  */
1352
1353 static int
1354 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1355     const xmlChar *tag;
1356     int i;
1357
1358     if (ctxt == NULL)
1359         return(-1);
1360     tag = ctxt->name;
1361     if (tag == NULL) {
1362         htmlAutoClose(ctxt, BAD_CAST"p");
1363         htmlCheckImplied(ctxt, BAD_CAST"p");
1364         htmlnamePush(ctxt, BAD_CAST"p");
1365         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1366             ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1367         return(1);
1368     }
1369     if (!htmlOmittedDefaultValue)
1370         return(0);
1371     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1372         if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1373             htmlAutoClose(ctxt, BAD_CAST"p");
1374             htmlCheckImplied(ctxt, BAD_CAST"p");
1375             htmlnamePush(ctxt, BAD_CAST"p");
1376             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1377                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1378             return(1);
1379         }
1380     }
1381     return(0);
1382 }
1383
1384 /**
1385  * htmlIsScriptAttribute:
1386  * @name:  an attribute name
1387  *
1388  * Check if an attribute is of content type Script
1389  *
1390  * Returns 1 is the attribute is a script 0 otherwise
1391  */
1392 int
1393 htmlIsScriptAttribute(const xmlChar *name) {
1394     unsigned int i;
1395
1396     if (name == NULL)
1397         return(0);
1398     /*
1399      * all script attributes start with 'on'
1400      */
1401     if ((name[0] != 'o') || (name[1] != 'n'))
1402         return(0);
1403     for (i = 0;
1404          i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1405          i++) {
1406         if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1407             return(1);
1408     }
1409     return(0);
1410 }
1411
1412 /************************************************************************
1413  *                                                                      *
1414  *              The list of HTML predefined entities                    *
1415  *                                                                      *
1416  ************************************************************************/
1417
1418
1419 static const htmlEntityDesc  html40EntitiesTable[] = {
1420 /*
1421  * the 4 absolute ones, plus apostrophe.
1422  */
1423 { 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1424 { 38,   "amp",  "ampersand, U+0026 ISOnum" },
1425 { 39,   "apos", "single quote" },
1426 { 60,   "lt",   "less-than sign, U+003C ISOnum" },
1427 { 62,   "gt",   "greater-than sign, U+003E ISOnum" },
1428
1429 /*
1430  * A bunch still in the 128-255 range
1431  * Replacing them depend really on the charset used.
1432  */
1433 { 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1434 { 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1435 { 162,  "cent", "cent sign, U+00A2 ISOnum" },
1436 { 163,  "pound","pound sign, U+00A3 ISOnum" },
1437 { 164,  "curren","currency sign, U+00A4 ISOnum" },
1438 { 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
1439 { 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1440 { 167,  "sect", "section sign, U+00A7 ISOnum" },
1441 { 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1442 { 169,  "copy", "copyright sign, U+00A9 ISOnum" },
1443 { 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1444 { 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1445 { 172,  "not",  "not sign, U+00AC ISOnum" },
1446 { 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1447 { 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
1448 { 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1449 { 176,  "deg",  "degree sign, U+00B0 ISOnum" },
1450 { 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1451 { 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1452 { 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1453 { 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1454 { 181,  "micro","micro sign, U+00B5 ISOnum" },
1455 { 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1456 { 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1457 { 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1458 { 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1459 { 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1460 { 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1461 { 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1462 { 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1463 { 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1464 { 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1465 { 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1466 { 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1467 { 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1468 { 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1469 { 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1470 { 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1471 { 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1472 { 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1473 { 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1474 { 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1475 { 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1476 { 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1477 { 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1478 { 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1479 { 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1480 { 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1481 { 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
1482 { 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1483 { 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1484 { 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1485 { 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1486 { 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1487 { 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1488 { 215,  "times","multiplication sign, U+00D7 ISOnum" },
1489 { 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1490 { 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1491 { 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1492 { 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1493 { 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1494 { 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1495 { 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1496 { 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1497 { 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1498 { 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1499 { 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1500 { 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1501 { 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1502 { 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1503 { 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1504 { 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1505 { 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1506 { 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1507 { 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1508 { 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1509 { 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1510 { 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1511 { 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1512 { 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1513 { 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
1514 { 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1515 { 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1516 { 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1517 { 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1518 { 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1519 { 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1520 { 247,  "divide","division sign, U+00F7 ISOnum" },
1521 { 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1522 { 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1523 { 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1524 { 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1525 { 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1526 { 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1527 { 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1528 { 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1529
1530 { 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1531 { 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
1532 { 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1533 { 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1534 { 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1535
1536 /*
1537  * Anything below should really be kept as entities references
1538  */
1539 { 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1540
1541 { 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1542 { 732,  "tilde","small tilde, U+02DC ISOdia" },
1543
1544 { 913,  "Alpha","greek capital letter alpha, U+0391" },
1545 { 914,  "Beta", "greek capital letter beta, U+0392" },
1546 { 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1547 { 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1548 { 917,  "Epsilon","greek capital letter epsilon, U+0395" },
1549 { 918,  "Zeta", "greek capital letter zeta, U+0396" },
1550 { 919,  "Eta",  "greek capital letter eta, U+0397" },
1551 { 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1552 { 921,  "Iota", "greek capital letter iota, U+0399" },
1553 { 922,  "Kappa","greek capital letter kappa, U+039A" },
1554 { 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1555 { 924,  "Mu",   "greek capital letter mu, U+039C" },
1556 { 925,  "Nu",   "greek capital letter nu, U+039D" },
1557 { 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
1558 { 927,  "Omicron","greek capital letter omicron, U+039F" },
1559 { 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
1560 { 929,  "Rho",  "greek capital letter rho, U+03A1" },
1561 { 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1562 { 932,  "Tau",  "greek capital letter tau, U+03A4" },
1563 { 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1564 { 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
1565 { 935,  "Chi",  "greek capital letter chi, U+03A7" },
1566 { 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
1567 { 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1568
1569 { 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1570 { 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1571 { 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1572 { 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
1573 { 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1574 { 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1575 { 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
1576 { 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
1577 { 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1578 { 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1579 { 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1580 { 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
1581 { 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
1582 { 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
1583 { 959,  "omicron","greek small letter omicron, U+03BF NEW" },
1584 { 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
1585 { 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
1586 { 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1587 { 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1588 { 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
1589 { 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1590 { 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
1591 { 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
1592 { 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
1593 { 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
1594 { 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1595 { 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1596 { 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
1597
1598 { 8194, "ensp", "en space, U+2002 ISOpub" },
1599 { 8195, "emsp", "em space, U+2003 ISOpub" },
1600 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1601 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1602 { 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
1603 { 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
1604 { 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
1605 { 8211, "ndash","en dash, U+2013 ISOpub" },
1606 { 8212, "mdash","em dash, U+2014 ISOpub" },
1607 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1608 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1609 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1610 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1611 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1612 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1613 { 8224, "dagger","dagger, U+2020 ISOpub" },
1614 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1615
1616 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1617 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1618
1619 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1620
1621 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1622 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1623
1624 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1625 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1626
1627 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1628 { 8260, "frasl","fraction slash, U+2044 NEW" },
1629
1630 { 8364, "euro", "euro sign, U+20AC NEW" },
1631
1632 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1633 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1634 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1635 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1636 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1637 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1638 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1639 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1640 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1641 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1642 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1643 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1644 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1645 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1646 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1647 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1648
1649 { 8704, "forall","for all, U+2200 ISOtech" },
1650 { 8706, "part", "partial differential, U+2202 ISOtech" },
1651 { 8707, "exist","there exists, U+2203 ISOtech" },
1652 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1653 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1654 { 8712, "isin", "element of, U+2208 ISOtech" },
1655 { 8713, "notin","not an element of, U+2209 ISOtech" },
1656 { 8715, "ni",   "contains as member, U+220B ISOtech" },
1657 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1658 { 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
1659 { 8722, "minus","minus sign, U+2212 ISOtech" },
1660 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1661 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1662 { 8733, "prop", "proportional to, U+221D ISOtech" },
1663 { 8734, "infin","infinity, U+221E ISOtech" },
1664 { 8736, "ang",  "angle, U+2220 ISOamso" },
1665 { 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
1666 { 8744, "or",   "logical or = vee, U+2228 ISOtech" },
1667 { 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
1668 { 8746, "cup",  "union = cup, U+222A ISOtech" },
1669 { 8747, "int",  "integral, U+222B ISOtech" },
1670 { 8756, "there4","therefore, U+2234 ISOtech" },
1671 { 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
1672 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1673 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1674 { 8800, "ne",   "not equal to, U+2260 ISOtech" },
1675 { 8801, "equiv","identical to, U+2261 ISOtech" },
1676 { 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
1677 { 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
1678 { 8834, "sub",  "subset of, U+2282 ISOtech" },
1679 { 8835, "sup",  "superset of, U+2283 ISOtech" },
1680 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1681 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1682 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1683 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1684 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1685 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1686 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1687 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1688 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1689 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1690 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1691 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1692 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1693 { 9674, "loz",  "lozenge, U+25CA ISOpub" },
1694
1695 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1696 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1697 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1698 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1699
1700 };
1701
1702 /************************************************************************
1703  *                                                                      *
1704  *              Commodity functions to handle entities                  *
1705  *                                                                      *
1706  ************************************************************************/
1707
/*
 * growBuffer:
 * @buffer: name of the xmlChar * variable to enlarge
 *
 * Doubles the companion variable @buffer_size and reallocates @buffer to
 * hold that many xmlChar.
 *
 * The macro expands to a brace block that uses a variable named `ctxt`
 * (handed to htmlErrMemory) and executes `return(NULL)` when the
 * reallocation fails, after freeing the old buffer. It can therefore only
 * be used inside a function that has `ctxt` in scope and returns a pointer.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *grown;                                                     \
    buffer##_size *= 2;                                                 \
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (grown != NULL) {                                                \
        buffer = grown;                                                 \
    } else {                                                            \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
}
1722
1723 /**
1724  * htmlEntityLookup:
1725  * @name: the entity name
1726  *
1727  * Lookup the given entity in EntitiesTable
1728  *
1729  * TODO: the linear scan is really ugly, an hash table is really needed.
1730  *
1731  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1732  */
1733 const htmlEntityDesc *
1734 htmlEntityLookup(const xmlChar *name) {
1735     unsigned int i;
1736
1737     for (i = 0;i < (sizeof(html40EntitiesTable)/
1738                     sizeof(html40EntitiesTable[0]));i++) {
1739         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1740             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1741         }
1742     }
1743     return(NULL);
1744 }
1745
1746 /**
1747  * htmlEntityValueLookup:
1748  * @value: the entity's unicode value
1749  *
1750  * Lookup the given entity in EntitiesTable
1751  *
1752  * TODO: the linear scan is really ugly, an hash table is really needed.
1753  *
1754  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1755  */
1756 const htmlEntityDesc *
1757 htmlEntityValueLookup(unsigned int value) {
1758     unsigned int i;
1759
1760     for (i = 0;i < (sizeof(html40EntitiesTable)/
1761                     sizeof(html40EntitiesTable[0]));i++) {
1762         if (html40EntitiesTable[i].value >= value) {
1763             if (html40EntitiesTable[i].value > value)
1764                 break;
1765             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1766         }
1767     }
1768     return(NULL);
1769 }
1770
1771 /**
1772  * UTF8ToHtml:
1773  * @out:  a pointer to an array of bytes to store the result
1774  * @outlen:  the length of @out
1775  * @in:  a pointer to an array of UTF-8 chars
1776  * @inlen:  the length of @in
1777  *
1778  * Take a block of UTF-8 chars in and try to convert it to an ASCII
1779  * plus HTML entities block of chars out.
1780  *
1781  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1782  * The value of @inlen after return is the number of octets consumed
1783  *     as the return value is positive, else unpredictable.
1784  * The value of @outlen after return is the number of octets consumed.
1785  */
1786 int
1787 UTF8ToHtml(unsigned char* out, int *outlen,
1788               const unsigned char* in, int *inlen) {
1789     const unsigned char* processed = in;
1790     const unsigned char* outend;
1791     const unsigned char* outstart = out;
1792     const unsigned char* instart = in;
1793     const unsigned char* inend;
1794     unsigned int c, d;
1795     int trailing;
1796
1797     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1798     if (in == NULL) {
1799         /*
1800          * initialization nothing to do
1801          */
1802         *outlen = 0;
1803         *inlen = 0;
1804         return(0);
1805     }
1806     inend = in + (*inlen);
1807     outend = out + (*outlen);
1808     while (in < inend) {
1809         d = *in++;
1810         if      (d < 0x80)  { c= d; trailing= 0; }
1811         else if (d < 0xC0) {
1812             /* trailing byte in leading position */
1813             *outlen = out - outstart;
1814             *inlen = processed - instart;
1815             return(-2);
1816         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1817         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1818         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1819         else {
1820             /* no chance for this in Ascii */
1821             *outlen = out - outstart;
1822             *inlen = processed - instart;
1823             return(-2);
1824         }
1825
1826         if (inend - in < trailing) {
1827             break;
1828         }
1829
1830         for ( ; trailing; trailing--) {
1831             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1832                 break;
1833             c <<= 6;
1834             c |= d & 0x3F;
1835         }
1836
1837         /* assertion: c is a single UTF-4 value */
1838         if (c < 0x80) {
1839             if (out + 1 >= outend)
1840                 break;
1841             *out++ = c;
1842         } else {
1843             int len;
1844             const htmlEntityDesc * ent;
1845
1846             /*
1847              * Try to lookup a predefined HTML entity for it
1848              */
1849
1850             ent = htmlEntityValueLookup(c);
1851             if (ent == NULL) {
1852                 /* no chance for this in Ascii */
1853                 *outlen = out - outstart;
1854                 *inlen = processed - instart;
1855                 return(-2);
1856             }
1857             len = strlen(ent->name);
1858             if (out + 2 + len >= outend)
1859                 break;
1860             *out++ = '&';
1861             memcpy(out, ent->name, len);
1862             out += len;
1863             *out++ = ';';
1864         }
1865         processed = in;
1866     }
1867     *outlen = out - outstart;
1868     *inlen = processed - instart;
1869     return(0);
1870 }
1871
1872 /**
1873  * htmlEncodeEntities:
1874  * @out:  a pointer to an array of bytes to store the result
1875  * @outlen:  the length of @out
1876  * @in:  a pointer to an array of UTF-8 chars
1877  * @inlen:  the length of @in
1878  * @quoteChar: the quote character to escape (' or ") or zero.
1879  *
1880  * Take a block of UTF-8 chars in and try to convert it to an ASCII
1881  * plus HTML entities block of chars out.
1882  *
1883  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1884  * The value of @inlen after return is the number of octets consumed
1885  *     as the return value is positive, else unpredictable.
1886  * The value of @outlen after return is the number of octets consumed.
1887  */
1888 int
1889 htmlEncodeEntities(unsigned char* out, int *outlen,
1890                    const unsigned char* in, int *inlen, int quoteChar) {
1891     const unsigned char* processed = in;
1892     const unsigned char* outend;
1893     const unsigned char* outstart = out;
1894     const unsigned char* instart = in;
1895     const unsigned char* inend;
1896     unsigned int c, d;
1897     int trailing;
1898
1899     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1900         return(-1);
1901     outend = out + (*outlen);
1902     inend = in + (*inlen);
1903     while (in < inend) {
1904         d = *in++;
1905         if      (d < 0x80)  { c= d; trailing= 0; }
1906         else if (d < 0xC0) {
1907             /* trailing byte in leading position */
1908             *outlen = out - outstart;
1909             *inlen = processed - instart;
1910             return(-2);
1911         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1912         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1913         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1914         else {
1915             /* no chance for this in Ascii */
1916             *outlen = out - outstart;
1917             *inlen = processed - instart;
1918             return(-2);
1919         }
1920
1921         if (inend - in < trailing)
1922             break;
1923
1924         while (trailing--) {
1925             if (((d= *in++) & 0xC0) != 0x80) {
1926                 *outlen = out - outstart;
1927                 *inlen = processed - instart;
1928                 return(-2);
1929             }
1930             c <<= 6;
1931             c |= d & 0x3F;
1932         }
1933
1934         /* assertion: c is a single UTF-4 value */
1935         if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1936             (c != '&') && (c != '<') && (c != '>')) {
1937             if (out >= outend)
1938                 break;
1939             *out++ = c;
1940         } else {
1941             const htmlEntityDesc * ent;
1942             const char *cp;
1943             char nbuf[16];
1944             int len;
1945
1946             /*
1947              * Try to lookup a predefined HTML entity for it
1948              */
1949             ent = htmlEntityValueLookup(c);
1950             if (ent == NULL) {
1951                 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1952                 cp = nbuf;
1953             }
1954             else
1955                 cp = ent->name;
1956             len = strlen(cp);
1957             if (out + 2 + len > outend)
1958                 break;
1959             *out++ = '&';
1960             memcpy(out, cp, len);
1961             out += len;
1962             *out++ = ';';
1963         }
1964         processed = in;
1965     }
1966     *outlen = out - outstart;
1967     *inlen = processed - instart;
1968     return(0);
1969 }
1970
1971 /************************************************************************
1972  *                                                                      *
1973  *              Commodity functions to handle streams                   *
1974  *                                                                      *
1975  ************************************************************************/
1976
1977 /**
1978  * htmlNewInputStream:
1979  * @ctxt:  an HTML parser context
1980  *
1981  * Create a new input stream structure
1982  * Returns the new input stream or NULL
1983  */
1984 static htmlParserInputPtr
1985 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1986     htmlParserInputPtr input;
1987
1988     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1989     if (input == NULL) {
1990         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
1991         return(NULL);
1992     }
1993     memset(input, 0, sizeof(htmlParserInput));
1994     input->filename = NULL;
1995     input->directory = NULL;
1996     input->base = NULL;
1997     input->cur = NULL;
1998     input->buf = NULL;
1999     input->line = 1;
2000     input->col = 1;
2001     input->buf = NULL;
2002     input->free = NULL;
2003     input->version = NULL;
2004     input->consumed = 0;
2005     input->length = 0;
2006     return(input);
2007 }
2008
2009
2010 /************************************************************************
2011  *                                                                      *
2012  *              Commodity functions, cleanup needed ?                   *
2013  *                                                                      *
2014  ************************************************************************/
/*
 * Names of all the tags allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but that would risk a binary
 * incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2029
2030 /**
2031  * areBlanks:
2032  * @ctxt:  an HTML parser context
2033  * @str:  a xmlChar *
2034  * @len:  the size of @str
2035  *
2036  * Is this a sequence of blank chars that one can ignore ?
2037  *
2038  * Returns 1 if ignorable 0 otherwise.
2039  */
2040
2041 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2042     unsigned int i;
2043     int j;
2044     xmlNodePtr lastChild;
2045     xmlDtdPtr dtd;
2046
2047     for (j = 0;j < len;j++)
2048         if (!(IS_BLANK_CH(str[j]))) return(0);
2049
2050     if (CUR == 0) return(1);
2051     if (CUR != '<') return(0);
2052     if (ctxt->name == NULL)
2053         return(1);
2054     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2055         return(1);
2056     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2057         return(1);
2058
2059     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2060     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2061         dtd = xmlGetIntSubset(ctxt->myDoc);
2062         if (dtd != NULL && dtd->ExternalID != NULL) {
2063             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2064                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2065                 return(1);
2066         }
2067     }
2068
2069     if (ctxt->node == NULL) return(0);
2070     lastChild = xmlGetLastChild(ctxt->node);
2071     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2072         lastChild = lastChild->prev;
2073     if (lastChild == NULL) {
2074         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2075             (ctxt->node->content != NULL)) return(0);
2076         /* keep ws in constructs like ...<b> </b>...
2077            for all tags "b" allowing PCDATA */
2078         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2079             if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2080                 return(0);
2081             }
2082         }
2083     } else if (xmlNodeIsText(lastChild)) {
2084         return(0);
2085     } else {
2086         /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2087            for all tags "p" allowing PCDATA */
2088         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089             if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2090                 return(0);
2091             }
2092         }
2093     }
2094     return(1);
2095 }
2096
2097 /**
2098  * htmlNewDocNoDtD:
2099  * @URI:  URI for the dtd, or NULL
2100  * @ExternalID:  the external ID of the DTD, or NULL
2101  *
2102  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2103  * are NULL
2104  *
2105  * Returns a new document, do not initialize the DTD if not provided
2106  */
2107 htmlDocPtr
2108 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2109     xmlDocPtr cur;
2110
2111     /*
2112      * Allocate a new document and fill the fields.
2113      */
2114     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2115     if (cur == NULL) {
2116         htmlErrMemory(NULL, "HTML document creation failed\n");
2117         return(NULL);
2118     }
2119     memset(cur, 0, sizeof(xmlDoc));
2120
2121     cur->type = XML_HTML_DOCUMENT_NODE;
2122     cur->version = NULL;
2123     cur->intSubset = NULL;
2124     cur->doc = cur;
2125     cur->name = NULL;
2126     cur->children = NULL;
2127     cur->extSubset = NULL;
2128     cur->oldNs = NULL;
2129     cur->encoding = NULL;
2130     cur->standalone = 1;
2131     cur->compression = 0;
2132     cur->ids = NULL;
2133     cur->refs = NULL;
2134     cur->_private = NULL;
2135     cur->charset = XML_CHAR_ENCODING_UTF8;
2136     if ((ExternalID != NULL) ||
2137         (URI != NULL))
2138         xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2139     return(cur);
2140 }
2141
2142 /**
2143  * htmlNewDoc:
2144  * @URI:  URI for the dtd, or NULL
2145  * @ExternalID:  the external ID of the DTD, or NULL
2146  *
2147  * Creates a new HTML document
2148  *
2149  * Returns a new document
2150  */
2151 htmlDocPtr
2152 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2153     if ((URI == NULL) && (ExternalID == NULL))
2154         return(htmlNewDocNoDtD(
2155                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2156                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2157
2158     return(htmlNewDocNoDtD(URI, ExternalID));
2159 }
2160
2161
2162 /************************************************************************
2163  *                                                                      *
2164  *                      The parser itself                               *
2165  *      Relates to http://www.w3.org/TR/html40                          *
2166  *                                                                      *
2167  ************************************************************************/
2168
2169 /************************************************************************
2170  *                                                                      *
2171  *                      The parser itself                               *
2172  *                                                                      *
2173  ************************************************************************/
2174
2175 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2176
2177 /**
2178  * htmlParseHTMLName:
2179  * @ctxt:  an HTML parser context
2180  *
2181  * parse an HTML tag or attribute name, note that we convert it to lowercase
2182  * since HTML names are not case-sensitive.
2183  *
2184  * Returns the Tag Name parsed or NULL
2185  */
2186
2187 static const xmlChar *
2188 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2189     int i = 0;
2190     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2191
2192     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2193         (CUR != ':')) return(NULL);
2194
2195     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2196            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2197            (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2198         if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2199         else loc[i] = CUR;
2200         i++;
2201
2202         NEXT;
2203     }
2204
2205     return(xmlDictLookup(ctxt->dict, loc, i));
2206 }
2207
2208 /**
2209  * htmlParseName:
2210  * @ctxt:  an HTML parser context
2211  *
2212  * parse an HTML name, this routine is case sensitive.
2213  *
2214  * Returns the Name parsed or NULL
2215  */
2216
2217 static const xmlChar *
2218 htmlParseName(htmlParserCtxtPtr ctxt) {
2219     const xmlChar *in;
2220     const xmlChar *ret;
2221     int count = 0;
2222
2223     GROW;
2224
2225     /*
2226      * Accelerator for simple ASCII names
2227      */
2228     in = ctxt->input->cur;
2229     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2230         ((*in >= 0x41) && (*in <= 0x5A)) ||
2231         (*in == '_') || (*in == ':')) {
2232         in++;
2233         while (((*in >= 0x61) && (*in <= 0x7A)) ||
2234                ((*in >= 0x41) && (*in <= 0x5A)) ||
2235                ((*in >= 0x30) && (*in <= 0x39)) ||
2236                (*in == '_') || (*in == '-') ||
2237                (*in == ':') || (*in == '.'))
2238             in++;
2239         if ((*in > 0) && (*in < 0x80)) {
2240             count = in - ctxt->input->cur;
2241             ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2242             ctxt->input->cur = in;
2243             ctxt->nbChars += count;
2244             ctxt->input->col += count;
2245             return(ret);
2246         }
2247     }
2248     return(htmlParseNameComplex(ctxt));
2249 }
2250
2251 static const xmlChar *
2252 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2253     int len = 0, l;
2254     int c;
2255     int count = 0;
2256
2257     /*
2258      * Handler for more complex cases
2259      */
2260     GROW;
2261     c = CUR_CHAR(l);
2262     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2263         (!IS_LETTER(c) && (c != '_') &&
2264          (c != ':'))) {
2265         return(NULL);
2266     }
2267
2268     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2269            ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2270             (c == '.') || (c == '-') ||
2271             (c == '_') || (c == ':') ||
2272             (IS_COMBINING(c)) ||
2273             (IS_EXTENDER(c)))) {
2274         if (count++ > 100) {
2275             count = 0;
2276             GROW;
2277         }
2278         len += l;
2279         NEXTL(l);
2280         c = CUR_CHAR(l);
2281     }
2282     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2283 }
2284
2285
2286 /**
2287  * htmlParseHTMLAttribute:
2288  * @ctxt:  an HTML parser context
2289  * @stop:  a char stop value
2290  *
2291  * parse an HTML attribute value till the stop (quote), if
2292  * stop is 0 then it stops at the first space
2293  *
2294  * Returns the attribute parsed or NULL
2295  */
2296
2297 static xmlChar *
2298 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2299     xmlChar *buffer = NULL;
2300     int buffer_size = 0;
2301     xmlChar *out = NULL;
2302     const xmlChar *name = NULL;
2303     const xmlChar *cur = NULL;
2304     const htmlEntityDesc * ent;
2305
2306     /*
2307      * allocate a translation buffer.
2308      */
2309     buffer_size = HTML_PARSER_BUFFER_SIZE;
2310     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2311     if (buffer == NULL) {
2312         htmlErrMemory(ctxt, "buffer allocation failed\n");
2313         return(NULL);
2314     }
2315     out = buffer;
2316
2317     /*
2318      * Ok loop until we reach one of the ending chars
2319      */
2320     while ((CUR != 0) && (CUR != stop)) {
2321         if ((stop == 0) && (CUR == '>')) break;
2322         if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2323         if (CUR == '&') {
2324             if (NXT(1) == '#') {
2325                 unsigned int c;
2326                 int bits;
2327
2328                 c = htmlParseCharRef(ctxt);
2329                 if      (c <    0x80)
2330                         { *out++  = c;                bits= -6; }
2331                 else if (c <   0x800)
2332                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2333                 else if (c < 0x10000)
2334                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2335                 else
2336                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2337
2338                 for ( ; bits >= 0; bits-= 6) {
2339                     *out++  = ((c >> bits) & 0x3F) | 0x80;
2340                 }
2341
2342                 if (out - buffer > buffer_size - 100) {
2343                         int indx = out - buffer;
2344
2345                         growBuffer(buffer);
2346                         out = &buffer[indx];
2347                 }
2348             } else {
2349                 ent = htmlParseEntityRef(ctxt, &name);
2350                 if (name == NULL) {
2351                     *out++ = '&';
2352                     if (out - buffer > buffer_size - 100) {
2353                         int indx = out - buffer;
2354
2355                         growBuffer(buffer);
2356                         out = &buffer[indx];
2357                     }
2358                 } else if (ent == NULL) {
2359                     *out++ = '&';
2360                     cur = name;
2361                     while (*cur != 0) {
2362                         if (out - buffer > buffer_size - 100) {
2363                             int indx = out - buffer;
2364
2365                             growBuffer(buffer);
2366                             out = &buffer[indx];
2367                         }
2368                         *out++ = *cur++;
2369                     }
2370                 } else {
2371                     unsigned int c;
2372                     int bits;
2373
2374                     if (out - buffer > buffer_size - 100) {
2375                         int indx = out - buffer;
2376
2377                         growBuffer(buffer);
2378                         out = &buffer[indx];
2379                     }
2380                     c = ent->value;
2381                     if      (c <    0x80)
2382                         { *out++  = c;                bits= -6; }
2383                     else if (c <   0x800)
2384                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2385                     else if (c < 0x10000)
2386                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2387                     else
2388                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2389
2390                     for ( ; bits >= 0; bits-= 6) {
2391                         *out++  = ((c >> bits) & 0x3F) | 0x80;
2392                     }
2393                 }
2394             }
2395         } else {
2396             unsigned int c;
2397             int bits, l;
2398
2399             if (out - buffer > buffer_size - 100) {
2400                 int indx = out - buffer;
2401
2402                 growBuffer(buffer);
2403                 out = &buffer[indx];
2404             }
2405             c = CUR_CHAR(l);
2406             if      (c <    0x80)
2407                     { *out++  = c;                bits= -6; }
2408             else if (c <   0x800)
2409                     { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2410             else if (c < 0x10000)
2411                     { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2412             else
2413                     { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2414
2415             for ( ; bits >= 0; bits-= 6) {
2416                 *out++  = ((c >> bits) & 0x3F) | 0x80;
2417             }
2418             NEXT;
2419         }
2420     }
2421     *out++ = 0;
2422     return(buffer);
2423 }
2424
2425 /**
2426  * htmlParseEntityRef:
2427  * @ctxt:  an HTML parser context
2428  * @str:  location to store the entity name
2429  *
2430  * parse an HTML ENTITY references
2431  *
2432  * [68] EntityRef ::= '&' Name ';'
2433  *
2434  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2435  *         if non-NULL *str will have to be freed by the caller.
2436  */
2437 const htmlEntityDesc *
2438 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2439     const xmlChar *name;
2440     const htmlEntityDesc * ent = NULL;
2441
2442     if (str != NULL) *str = NULL;
2443     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2444
2445     if (CUR == '&') {
2446         NEXT;
2447         name = htmlParseName(ctxt);
2448         if (name == NULL) {
2449             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2450                          "htmlParseEntityRef: no name\n", NULL, NULL);
2451         } else {
2452             GROW;
2453             if (CUR == ';') {
2454                 if (str != NULL)
2455                     *str = name;
2456
2457                 /*
2458                  * Lookup the entity in the table.
2459                  */
2460                 ent = htmlEntityLookup(name);
2461                 if (ent != NULL) /* OK that's ugly !!! */
2462                     NEXT;
2463             } else {
2464                 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2465                              "htmlParseEntityRef: expecting ';'\n",
2466                              NULL, NULL);
2467                 if (str != NULL)
2468                     *str = name;
2469             }
2470         }
2471     }
2472     return(ent);
2473 }
2474
2475 /**
2476  * htmlParseAttValue:
2477  * @ctxt:  an HTML parser context
2478  *
2479  * parse a value for an attribute
2480  * Note: the parser won't do substitution of entities here, this
2481  * will be handled later in xmlStringGetNodeList, unless it was
2482  * asked for ctxt->replaceEntities != 0
2483  *
2484  * Returns the AttValue parsed or NULL.
2485  */
2486
2487 static xmlChar *
2488 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2489     xmlChar *ret = NULL;
2490
2491     if (CUR == '"') {
2492         NEXT;
2493         ret = htmlParseHTMLAttribute(ctxt, '"');
2494         if (CUR != '"') {
2495             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2496                          "AttValue: \" expected\n", NULL, NULL);
2497         } else
2498             NEXT;
2499     } else if (CUR == '\'') {
2500         NEXT;
2501         ret = htmlParseHTMLAttribute(ctxt, '\'');
2502         if (CUR != '\'') {
2503             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2504                          "AttValue: ' expected\n", NULL, NULL);
2505         } else
2506             NEXT;
2507     } else {
2508         /*
2509          * That's an HTMLism, the attribute value may not be quoted
2510          */
2511         ret = htmlParseHTMLAttribute(ctxt, 0);
2512         if (ret == NULL) {
2513             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2514                          "AttValue: no value found\n", NULL, NULL);
2515         }
2516     }
2517     return(ret);
2518 }
2519
2520 /**
2521  * htmlParseSystemLiteral:
2522  * @ctxt:  an HTML parser context
2523  *
2524  * parse an HTML Literal
2525  *
2526  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2527  *
2528  * Returns the SystemLiteral parsed or NULL
2529  */
2530
2531 static xmlChar *
2532 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2533     const xmlChar *q;
2534     xmlChar *ret = NULL;
2535
2536     if (CUR == '"') {
2537         NEXT;
2538         q = CUR_PTR;
2539         while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2540             NEXT;
2541         if (!IS_CHAR_CH(CUR)) {
2542             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2543                          "Unfinished SystemLiteral\n", NULL, NULL);
2544         } else {
2545             ret = xmlStrndup(q, CUR_PTR - q);
2546             NEXT;
2547         }
2548     } else if (CUR == '\'') {
2549         NEXT;
2550         q = CUR_PTR;
2551         while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2552             NEXT;
2553         if (!IS_CHAR_CH(CUR)) {
2554             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2555                          "Unfinished SystemLiteral\n", NULL, NULL);
2556         } else {
2557             ret = xmlStrndup(q, CUR_PTR - q);
2558             NEXT;
2559         }
2560     } else {
2561         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2562                      " or ' expected\n", NULL, NULL);
2563     }
2564
2565     return(ret);
2566 }
2567
2568 /**
2569  * htmlParsePubidLiteral:
2570  * @ctxt:  an HTML parser context
2571  *
2572  * parse an HTML public literal
2573  *
2574  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2575  *
2576  * Returns the PubidLiteral parsed or NULL.
2577  */
2578
2579 static xmlChar *
2580 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2581     const xmlChar *q;
2582     xmlChar *ret = NULL;
2583     /*
2584      * Name ::= (Letter | '_') (NameChar)*
2585      */
2586     if (CUR == '"') {
2587         NEXT;
2588         q = CUR_PTR;
2589         while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2590         if (CUR != '"') {
2591             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2592                          "Unfinished PubidLiteral\n", NULL, NULL);
2593         } else {
2594             ret = xmlStrndup(q, CUR_PTR - q);
2595             NEXT;
2596         }
2597     } else if (CUR == '\'') {
2598         NEXT;
2599         q = CUR_PTR;
2600         while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2601             NEXT;
2602         if (CUR != '\'') {
2603             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2604                          "Unfinished PubidLiteral\n", NULL, NULL);
2605         } else {
2606             ret = xmlStrndup(q, CUR_PTR - q);
2607             NEXT;
2608         }
2609     } else {
2610         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2611                      "PubidLiteral \" or ' expected\n", NULL, NULL);
2612     }
2613
2614     return(ret);
2615 }
2616
2617 /**
2618  * htmlParseScript:
2619  * @ctxt:  an HTML parser context
2620  *
2621  * parse the content of an HTML SCRIPT or STYLE element
2622  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2623  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2624  * http://www.w3.org/TR/html4/types.html#type-script
2625  * http://www.w3.org/TR/html4/types.html#h-6.15
2626  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2627  *
2628  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2629  * element and the value of intrinsic event attributes. User agents must
2630  * not evaluate script data as HTML markup but instead must pass it on as
2631  * data to a script engine.
2632  * NOTES:
2633  * - The content is passed like CDATA
2634  * - the attributes for style and scripting "onXXX" are also described
2635  *   as CDATA but SGML allows entities references in attributes so their
2636  *   processing is identical as other attributes
2637  */
2638 static void
2639 htmlParseScript(htmlParserCtxtPtr ctxt) {
2640     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2641     int nbchar = 0;
2642     int cur,l;
2643
2644     SHRINK;
2645     cur = CUR_CHAR(l);
2646     while (IS_CHAR_CH(cur)) {
2647         if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2648             (NXT(3) == '-')) {
2649             if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2650                 if (ctxt->sax->cdataBlock!= NULL) {
2651                     /*
2652                      * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2653                      */
2654                     ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2655                 } else if (ctxt->sax->characters != NULL) {
2656                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
2657                 }
2658             }
2659             nbchar = 0;
2660             htmlParseComment(ctxt);
2661             cur = CUR_CHAR(l);
2662             continue;
2663         } else if ((cur == '<') && (NXT(1) == '/')) {
2664             /*
2665              * One should break here, the specification is clear:
2666              * Authors should therefore escape "</" within the content.
2667              * Escape mechanisms are specific to each scripting or
2668              * style sheet language.
2669              *
2670              * In recovery mode, only break if end tag match the
2671              * current tag, effectively ignoring all tags inside the
2672              * script/style block and treating the entire block as
2673              * CDATA.
2674              */
2675             if (ctxt->recovery) {
2676                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2677                                    xmlStrlen(ctxt->name)) == 0)
2678                 {
2679                     break; /* while */
2680                 } else {
2681                     htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2682                                  "Element %s embeds close tag\n",
2683                                  ctxt->name, NULL);
2684                 }
2685             } else {
2686                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2687                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2688                 {
2689                     break; /* while */
2690                 }
2691             }
2692         }
2693         COPY_BUF(l,buf,nbchar,cur);
2694         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2695             if (ctxt->sax->cdataBlock!= NULL) {
2696                 /*
2697                  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2698                  */
2699                 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2700             } else if (ctxt->sax->characters != NULL) {
2701                 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2702             }
2703             nbchar = 0;
2704         }
2705         GROW;
2706         NEXTL(l);
2707         cur = CUR_CHAR(l);
2708     }
2709
2710     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2711         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2712                         "Invalid char in CDATA 0x%X\n", cur);
2713         NEXT;
2714     }
2715
2716     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2717         if (ctxt->sax->cdataBlock!= NULL) {
2718             /*
2719              * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2720              */
2721             ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2722         } else if (ctxt->sax->characters != NULL) {
2723             ctxt->sax->characters(ctxt->userData, buf, nbchar);
2724         }
2725     }
2726 }
2727
2728
2729 /**
2730  * htmlParseCharData:
2731  * @ctxt:  an HTML parser context
2732  *
2733  * parse a CharData section.
2734  * if we are within a CDATA section ']]>' marks an end of section.
2735  *
2736  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2737  */
2738
2739 static void
2740 htmlParseCharData(htmlParserCtxtPtr ctxt) {
2741     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2742     int nbchar = 0;
2743     int cur, l;
2744
2745     SHRINK;
2746     cur = CUR_CHAR(l);
2747     while (((cur != '<') || (ctxt->token == '<')) &&
2748            ((cur != '&') || (ctxt->token == '&')) &&
2749            (IS_CHAR(cur))) {
2750         COPY_BUF(l,buf,nbchar,cur);
2751         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2752             /*
2753              * Ok the segment is to be consumed as chars.
2754              */
2755             if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2756                 if (areBlanks(ctxt, buf, nbchar)) {
2757                     if (ctxt->sax->ignorableWhitespace != NULL)
2758                         ctxt->sax->ignorableWhitespace(ctxt->userData,
2759                                                        buf, nbchar);
2760                 } else {
2761                     htmlCheckParagraph(ctxt);
2762                     if (ctxt->sax->characters != NULL)
2763                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
2764                 }
2765             }
2766             nbchar = 0;
2767         }
2768         NEXTL(l);
2769         cur = CUR_CHAR(l);
2770         if (cur == 0) {
2771             SHRINK;
2772             GROW;
2773             cur = CUR_CHAR(l);
2774         }
2775     }
2776     if (nbchar != 0) {
2777         buf[nbchar] = 0;
2778
2779         /*
2780          * Ok the segment is to be consumed as chars.
2781          */
2782         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2783             if (areBlanks(ctxt, buf, nbchar)) {
2784                 if (ctxt->sax->ignorableWhitespace != NULL)
2785                     ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2786             } else {
2787                 htmlCheckParagraph(ctxt);
2788                 if (ctxt->sax->characters != NULL)
2789                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
2790             }
2791         }
2792     } else {
2793         /*
2794          * Loop detection
2795          */
2796         if (cur == 0)
2797             ctxt->instate = XML_PARSER_EOF;
2798     }
2799 }
2800
2801 /**
2802  * htmlParseExternalID:
2803  * @ctxt:  an HTML parser context
2804  * @publicID:  a xmlChar** receiving PubidLiteral
2805  *
2806  * Parse an External ID or a Public ID
2807  *
2808  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2809  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
2810  *
2811  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2812  *
2813  * Returns the function returns SystemLiteral and in the second
2814  *                case publicID receives PubidLiteral, is strict is off
2815  *                it is possible to return NULL and have publicID set.
2816  */
2817
2818 static xmlChar *
2819 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2820     xmlChar *URI = NULL;
2821
2822     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2823          (UPP(2) == 'S') && (UPP(3) == 'T') &&
2824          (UPP(4) == 'E') && (UPP(5) == 'M')) {
2825         SKIP(6);
2826         if (!IS_BLANK_CH(CUR)) {
2827             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2828                          "Space required after 'SYSTEM'\n", NULL, NULL);
2829         }
2830         SKIP_BLANKS;
2831         URI = htmlParseSystemLiteral(ctxt);
2832         if (URI == NULL) {
2833             htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2834                          "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2835         }
2836     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2837                (UPP(2) == 'B') && (UPP(3) == 'L') &&
2838                (UPP(4) == 'I') && (UPP(5) == 'C')) {
2839         SKIP(6);
2840         if (!IS_BLANK_CH(CUR)) {
2841             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2842                          "Space required after 'PUBLIC'\n", NULL, NULL);
2843         }
2844         SKIP_BLANKS;
2845         *publicID = htmlParsePubidLiteral(ctxt);
2846         if (*publicID == NULL) {
2847             htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2848                          "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2849                          NULL, NULL);
2850         }
2851         SKIP_BLANKS;
2852         if ((CUR == '"') || (CUR == '\'')) {
2853             URI = htmlParseSystemLiteral(ctxt);
2854         }
2855     }
2856     return(URI);
2857 }
2858
2859 /**
2860  * xmlParsePI:
2861  * @ctxt:  an XML parser context
2862  *
2863  * parse an XML Processing Instruction.
2864  *
2865  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2866  */
2867 static void
2868 htmlParsePI(htmlParserCtxtPtr ctxt) {
2869     xmlChar *buf = NULL;
2870     int len = 0;
2871     int size = HTML_PARSER_BUFFER_SIZE;
2872     int cur, l;
2873     const xmlChar *target;
2874     xmlParserInputState state;
2875     int count = 0;
2876
2877     if ((RAW == '<') && (NXT(1) == '?')) {
2878         state = ctxt->instate;
2879         ctxt->instate = XML_PARSER_PI;
2880         /*
2881          * this is a Processing Instruction.
2882          */
2883         SKIP(2);
2884         SHRINK;
2885
2886         /*
2887          * Parse the target name and check for special support like
2888          * namespace.
2889          */
2890         target = htmlParseName(ctxt);
2891         if (target != NULL) {
2892             if (RAW == '>') {
2893                 SKIP(1);
2894
2895                 /*
2896                  * SAX: PI detected.
2897                  */
2898                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2899                     (ctxt->sax->processingInstruction != NULL))
2900                     ctxt->sax->processingInstruction(ctxt->userData,
2901                                                      target, NULL);
2902                 ctxt->instate = state;
2903                 return;
2904             }
2905             buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2906             if (buf == NULL) {
2907                 htmlErrMemory(ctxt, NULL);
2908                 ctxt->instate = state;
2909                 return;
2910             }
2911             cur = CUR;
2912             if (!IS_BLANK(cur)) {
2913                 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2914                           "ParsePI: PI %s space expected\n", target, NULL);
2915             }
2916             SKIP_BLANKS;
2917             cur = CUR_CHAR(l);
2918             while (IS_CHAR(cur) && (cur != '>')) {
2919                 if (len + 5 >= size) {
2920                     xmlChar *tmp;
2921
2922                     size *= 2;
2923                     tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2924                     if (tmp == NULL) {
2925                         htmlErrMemory(ctxt, NULL);
2926                         xmlFree(buf);
2927                         ctxt->instate = state;
2928                         return;
2929                     }
2930                     buf = tmp;
2931                 }
2932                 count++;
2933                 if (count > 50) {
2934                     GROW;
2935                     count = 0;
2936                 }
2937                 COPY_BUF(l,buf,len,cur);
2938                 NEXTL(l);
2939                 cur = CUR_CHAR(l);
2940                 if (cur == 0) {
2941                     SHRINK;
2942                     GROW;
2943                     cur = CUR_CHAR(l);
2944                 }
2945             }
2946             buf[len] = 0;
2947             if (cur != '>') {
2948                 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2949                       "ParsePI: PI %s never end ...\n", target, NULL);
2950             } else {
2951                 SKIP(1);
2952
2953                 /*
2954                  * SAX: PI detected.
2955                  */
2956                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2957                     (ctxt->sax->processingInstruction != NULL))
2958                     ctxt->sax->processingInstruction(ctxt->userData,
2959                                                      target, buf);
2960             }
2961             xmlFree(buf);
2962         } else {
2963             htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2964                          "PI is not started correctly", NULL, NULL);
2965         }
2966         ctxt->instate = state;
2967     }
2968 }
2969
2970 /**
2971  * htmlParseComment:
2972  * @ctxt:  an HTML parser context
2973  *
2974  * Parse an XML (SGML) comment <!-- .... -->
2975  *
2976  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2977  */
2978 static void
2979 htmlParseComment(htmlParserCtxtPtr ctxt) {
2980     xmlChar *buf = NULL;
2981     int len;
2982     int size = HTML_PARSER_BUFFER_SIZE;
2983     int q, ql;
2984     int r, rl;
2985     int cur, l;
2986     xmlParserInputState state;
2987
2988     /*
2989      * Check that there is a comment right here.
2990      */
2991     if ((RAW != '<') || (NXT(1) != '!') ||
2992         (NXT(2) != '-') || (NXT(3) != '-')) return;
2993
2994     state = ctxt->instate;
2995     ctxt->instate = XML_PARSER_COMMENT;
2996     SHRINK;
2997     SKIP(4);
2998     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2999     if (buf == NULL) {
3000         htmlErrMemory(ctxt, "buffer allocation failed\n");
3001         ctxt->instate = state;
3002         return;
3003     }
3004     q = CUR_CHAR(ql);
3005     NEXTL(ql);
3006     r = CUR_CHAR(rl);
3007     NEXTL(rl);
3008     cur = CUR_CHAR(l);
3009     len = 0;
3010     while (IS_CHAR(cur) &&
3011            ((cur != '>') ||
3012             (r != '-') || (q != '-'))) {
3013         if (len + 5 >= size) {
3014             xmlChar *tmp;
3015
3016             size *= 2;
3017             tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3018             if (tmp == NULL) {
3019                 xmlFree(buf);
3020                 htmlErrMemory(ctxt, "growing buffer failed\n");
3021                 ctxt->instate = state;
3022                 return;
3023             }
3024             buf = tmp;
3025         }
3026         COPY_BUF(ql,buf,len,q);
3027         q = r;
3028         ql = rl;
3029         r = cur;
3030         rl = l;
3031         NEXTL(l);
3032         cur = CUR_CHAR(l);
3033         if (cur == 0) {
3034             SHRINK;
3035             GROW;
3036             cur = CUR_CHAR(l);
3037         }
3038     }
3039     buf[len] = 0;
3040     if (!IS_CHAR(cur)) {
3041         htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3042                      "Comment not terminated \n<!--%.50s\n", buf, NULL);
3043         xmlFree(buf);
3044     } else {
3045         NEXT;
3046         if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3047             (!ctxt->disableSAX))
3048             ctxt->sax->comment(ctxt->userData, buf);
3049         xmlFree(buf);
3050     }
3051     ctxt->instate = state;
3052 }
3053
3054 /**
3055  * htmlParseCharRef:
3056  * @ctxt:  an HTML parser context
3057  *
3058  * parse Reference declarations. The accumulated value stops growing once
3059  * it is past 0x10FFFF, so overlong digit runs cannot overflow the int.
3060  * [66] CharRef ::= '&#' [0-9]+ ';' |
3061  *                  '&#x' [0-9a-fA-F]+ ';'
3062  *
3063  * Returns the value parsed (as an int), or 0 on error / non-XML character
3064  */
3065 int
3066 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3067     int val = 0;
3068     int digit;   /* value of the hexadecimal digit under the cursor */
3069
3070     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3071         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3072                      "htmlParseCharRef: context error\n", NULL, NULL);
3073         return(0);
3074     }
3075     if ((CUR == '&') && (NXT(1) == '#') &&
3076         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3077         SKIP(3);
3078         while (CUR != ';') {
3079             if ((CUR >= '0') && (CUR <= '9'))
3080                 digit = CUR - '0';
3081             else if ((CUR >= 'a') && (CUR <= 'f'))
3082                 digit = (CUR - 'a') + 10;
3083             else if ((CUR >= 'A') && (CUR <= 'F'))
3084                 digit = (CUR - 'A') + 10;
3085             else {
3086                 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3087                              "htmlParseCharRef: invalid hexadecimal value\n",
3088                              NULL, NULL);
3089                 return(0);
3090             }
3091             if (val < 0x110000) /* saturate: no signed overflow possible */
3092                 val = val * 16 + digit;
3093             NEXT;
3094         }
3095         NEXT; /* the loop is only left on ';': step over it */
3096     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3097         SKIP(2);
3098         while (CUR != ';') {
3099             if ((CUR < '0') || (CUR > '9')) {
3100                 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3101                              "htmlParseCharRef: invalid decimal value\n",
3102                              NULL, NULL);
3103                 return(0);
3104             }
3105             if (val < 0x110000) /* saturate: no signed overflow possible */
3106                 val = val * 10 + (CUR - '0');
3107             NEXT;
3108         }
3109         NEXT; /* the loop is only left on ';': step over it */
3110     } else {
3111         htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3112                      "htmlParseCharRef: invalid value\n", NULL, NULL);
3113     }
3114     /*
3115      * Check the value IS_CHAR; a saturated value always fails this test
3116      */
3117     if (IS_CHAR(val)) {
3118         return(val);
3119     } else {
3120         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3121                         "htmlParseCharRef: invalid xmlChar value %d\n",
3122                         val);
3123     }
3124     return(0);
3125 }
3126
3127
3128 /**
3129  * htmlParseDocTypeDecl:
3130  * @ctxt:  an HTML parser context
3131  *
3132  * Parse a DOCTYPE declaration and report it through the SAX
3133  * internalSubset callback.
3134  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3135  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3136  */
3137
3138 static void
3139 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3140     const xmlChar *docName;
3141     xmlChar *publicId = NULL;   /* set by htmlParseExternalID() */
3142     xmlChar *systemId;
3143
3144     /*
3145      * The caller has already recognised '<!DOCTYPE': step over it
3146      * and over any blanks that follow.
3147      */
3148     SKIP(9);
3149     SKIP_BLANKS;
3150
3151     /*
3152      * The DOCTYPE name comes first; a missing name is reported but
3153      * parsing carries on.
3154      */
3155     docName = htmlParseName(ctxt);
3156     if (docName == NULL)
3157         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3158                      "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3159                      NULL, NULL);
3160     /*
3161      * Note: the name is not compared against "HTML".
3162      */
3163
3164     SKIP_BLANKS;
3165
3166     /*
3167      * Optional external identifier (public and/or system part)
3168      */
3169     systemId = htmlParseExternalID(ctxt, &publicId);
3170     SKIP_BLANKS;
3171
3172     /*
3173      * The declaration is expected to stop here.
3174      */
3175     if (CUR != '>')
3176         /* no resynchronisation is attempted after this error */
3177         htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3178                      "DOCTYPE improperly terminated\n", NULL, NULL);
3179
3180     NEXT;
3181
3182     /*
3183      * Hand the declaration over to the SAX layer, unless SAX is disabled
3184      */
3185     if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
3186         (ctxt->sax->internalSubset != NULL))
3187         ctxt->sax->internalSubset(ctxt->userData, docName, publicId, systemId);
3188
3189     /*
3190      * Release the identifiers returned by htmlParseExternalID()
3191      */
3192     if (publicId != NULL) xmlFree(publicId);
3193     if (systemId != NULL) xmlFree(systemId);
3194 }
3195
3196 /**
3197  * htmlParseAttribute:
3198  * @ctxt:  an HTML parser context
3199  * @value:  a xmlChar ** used to store the value of the attribute
3200  *
3201  * parse an attribute
3202  *
3203  * [41] Attribute ::= Name Eq AttValue
3204  *
3205  * [25] Eq ::= S? '=' S?
3206  *
3207  * With namespace:
3208  *
3209  * [NS 11] Attribute ::= QName Eq AttValue
3210  *
3211  * A name that is not followed by '=' gets its own name as value when
3212  * htmlIsBooleanAttr() accepts it, and a NULL value otherwise.
3213  *
3214  * Returns the attribute name, and the value in *value.
3215  */
3216
3217 static const xmlChar *
3218 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3219     const xmlChar *attrName;
3220
3221     /* Nothing is handed back until a value has really been read. */
3222     *value = NULL;
3223
3224     attrName = htmlParseHTMLName(ctxt);
3225     if (attrName == NULL) {
3226         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3227                      "error parsing attribute name\n", NULL, NULL);
3228         return(NULL);
3229     }
3230
3231     SKIP_BLANKS;
3232     if (CUR != '=') {
3233         /* No explicit value: a boolean attribute is taken as minimized
3234          * and receives its own name as value. */
3235         if (htmlIsBooleanAttr(attrName))
3236             *value = xmlStrdup(attrName);
3237         return(attrName);
3238     }
3239
3240     /*
3241      * Explicit value: step over '=' and optional blanks, then read it.
3242      */
3243     NEXT;
3244     SKIP_BLANKS;
3245     *value = htmlParseAttValue(ctxt);
3246     return(attrName);
3247 }
3248
3249 /**
3250  * htmlCheckEncoding:
3251  * @ctxt:  an HTML parser context
3252  * @attvalue: the attribute value
3253  *
3254  * Looks for a "charset=" (or "charset =") token in the content value of
3255  * an http-equiv Meta tag. Nothing is done when the input already has an
3256  * encoding; otherwise the parser is switched to the declared encoding
3257  * and the data pending in the raw input buffer is converted.
3258  */
3259 static void
3260 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3261     const xmlChar *encoding;
3262
3263     if ((ctxt == NULL) || (attvalue == NULL))
3264         return;
3265
3266     /* do not change encoding: an encoding already set on the input wins */
3267     if (ctxt->input->encoding != NULL)
3268         return;
3269
3270     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3271     if (encoding != NULL) {
3272         encoding += 8; /* step over "charset=" */
3273     } else {
3274         encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3275         if (encoding != NULL)
3276             encoding += 9; /* step over "charset =" */
3277     }
3278     if (encoding != NULL) {
3279         xmlCharEncoding enc;
3280         xmlCharEncodingHandlerPtr handler;
3281
3282         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3283
3284         if (ctxt->input->encoding != NULL) /* NOTE(review): looks unreachable after the early return above */
3285             xmlFree((xmlChar *) ctxt->input->encoding);
3286         ctxt->input->encoding = xmlStrdup(encoding); /* remember the declared name */
3287
3288         enc = xmlParseCharEncoding((const char *) encoding);
3289         /*
3290          * registered set of known encodings
3291          */
3292         if (enc != XML_CHAR_ENCODING_ERROR) {
3293             if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3294                  (enc == XML_CHAR_ENCODING_UTF16BE) ||
3295                  (enc == XML_CHAR_ENCODING_UCS4LE) ||
3296                  (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3297                 (ctxt->input->buf != NULL) &&
3298                 (ctxt->input->buf->encoder == NULL)) {
3299                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, /* UTF-16/UCS-4 declared while no encoder is set */
3300                              "htmlCheckEncoding: wrong encoding meta\n",
3301                              NULL, NULL);
3302             } else {
3303                 xmlSwitchEncoding(ctxt, enc);
3304             }
3305             ctxt->charset = XML_CHAR_ENCODING_UTF8; /* set on both branches above */
3306         } else {
3307             /*
3308              * fallback for unknown encodings: look a handler up by name
3309              */
3310             handler = xmlFindCharEncodingHandler((const char *) encoding);
3311             if (handler != NULL) {
3312                 xmlSwitchToEncoding(ctxt, handler);
3313                 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3314             } else {
3315                 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; /* recorded only, no error callback here */
3316             }
3317         }
3318
3319         if ((ctxt->input->buf != NULL) &&
3320             (ctxt->input->buf->encoder != NULL) &&
3321             (ctxt->input->buf->raw != NULL) &&
3322             (ctxt->input->buf->buffer != NULL)) {
3323             int nbchars;
3324             int processed;
3325
3326             /*
3327              * convert as much as possible to the parser reading buffer.
3328              */
3329             processed = ctxt->input->cur - ctxt->input->base; /* bytes already consumed */
3330             xmlBufferShrink(ctxt->input->buf->buffer, processed); /* drop them from the buffer */
3331             nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3332                                        ctxt->input->buf->buffer,
3333                                        ctxt->input->buf->raw);
3334             if (nbchars < 0) {
3335                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3336                              "htmlCheckEncoding: encoder error\n",
3337                              NULL, NULL);
3338             }
3339             ctxt->input->base = /* both pointers restart at the buffer content */
3340             ctxt->input->cur = ctxt->input->buf->buffer->content;
3341         }
3342     }
3343 }
3344
3345 /**
3346  * htmlCheckMeta:
3347  * @ctxt:  an HTML parser context
3348  * @atts:  NULL terminated array of attribute name/value pairs
3349  *
3350  * Runs htmlCheckEncoding() on the content of a Content-Type http-equiv Meta
3351  */
3352 static void
3353 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3354     const xmlChar *metaContent = NULL;
3355     const xmlChar *attName, *attVal;
3356     int isContentType = 0;
3357     int idx;
3358
3359     if ((ctxt == NULL) || (atts == NULL))
3360         return;
3361
3362     /* Names sit at even slots, their values right after them. */
3363     for (idx = 0; atts[idx] != NULL; idx += 2) {
3364         attName = atts[idx];
3365         attVal = atts[idx + 1];
3366         if (attVal == NULL)
3367             continue;
3368         if ((!xmlStrcasecmp(attName, BAD_CAST"http-equiv")) &&
3369             (!xmlStrcasecmp(attVal, BAD_CAST"Content-Type")))
3370             isContentType = 1;
3371         else if (!xmlStrcasecmp(attName, BAD_CAST"content"))
3372             metaContent = attVal;
3373     }
3374     if (isContentType && (metaContent != NULL))
3375         htmlCheckEncoding(ctxt, metaContent);
3376 }
3377
3378 /**
3379  * htmlParseStartTag:
3380  * @ctxt:  an HTML parser context
3381  *
3382  * parse a start of tag either for rule element or
3383  * EmptyElement. In both case we don't parse the tag closing chars.
3384  *
3385  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3386  *
3387  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3388  *
3389  * With namespace:
3390  *
3391  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3392  *
3393  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3394  * Misplaced html/head/body tags are reported but still return 0.
3395  * Returns 0 in case of success and -1 in case of error.
3396  */
3397
3398 static int
3399 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3400     const xmlChar *name;
3401     const xmlChar *attname;
3402     xmlChar *attvalue;
3403     const xmlChar **atts;
3404     int nbatts = 0;    /* slots of atts filled while parsing this tag */
3405     int maxatts;       /* slots allocated in atts */
3406     int meta = 0;
3407     int i;
3408
3409     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3410         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3411                      "htmlParseStartTag: context error\n", NULL, NULL);
3412         return -1;
3413     }
3414     if (CUR != '<') return -1;
3415     NEXT;
3416
3417     atts = ctxt->atts; /* the array is kept in the context between tags */
3418     maxatts = ctxt->maxatts;
3419
3420     GROW;
3421     name = htmlParseHTMLName(ctxt);
3422     if (name == NULL) {
3423         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3424                      "htmlParseStartTag: invalid element name\n",
3425                      NULL, NULL);
3426         /* Dump the bogus tag like browsers do */
3427         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3428             NEXT;
3429         return -1;
3430     }
3431     if (xmlStrEqual(name, BAD_CAST"meta"))
3432         meta = 1;
3433
3434     /*
3435      * Check for auto-closure of HTML elements.
3436      */
3437     htmlAutoClose(ctxt, name);
3438
3439     /*
3440      * Check for implied HTML elements.
3441      */
3442     htmlCheckImplied(ctxt, name);
3443
3444     /*
3445      * Avoid html at any level > 0, head at any level != 1
3446      * or any attempt to recurse body
3447      */
3448     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3449         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3450                      "htmlParseStartTag: misplaced <html> tag\n",
3451                      name, NULL);
3452         return 0;
3453     }
3454     if ((ctxt->nameNr != 1) &&
3455         (xmlStrEqual(name, BAD_CAST"head"))) {
3456         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3457                      "htmlParseStartTag: misplaced <head> tag\n",
3458                      name, NULL);
3459         return 0;
3460     }
3461     if (xmlStrEqual(name, BAD_CAST"body")) {
3462         int indx;
3463         for (indx = 0;indx < ctxt->nameNr;indx++) {
3464             if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3465                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3466                              "htmlParseStartTag: misplaced <body> tag\n",
3467                              name, NULL);
3468                 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3469                     NEXT;
3470                 return 0;
3471             }
3472         }
3473     }
3474
3475     /*
3476      * Now parse the attributes, it ends up with the ending
3477      *
3478      * (S Attribute)* S?
3479      */
3480     SKIP_BLANKS;
3481     while ((IS_CHAR_CH(CUR)) &&
3482            (CUR != '>') &&
3483            ((CUR != '/') || (NXT(1) != '>'))) {
3484         long cons = ctxt->nbChars; /* used below to detect lack of progress */
3485
3486         GROW;
3487         attname = htmlParseAttribute(ctxt, &attvalue);
3488         if (attname != NULL) {
3489
3490             /*
3491              * Well formedness requires at most one declaration of an attribute
3492              */
3493             for (i = 0; i < nbatts;i += 2) {
3494                 if (xmlStrEqual(atts[i], attname)) {
3495                     htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3496                                  "Attribute %s redefined\n", attname, NULL);
3497                     if (attvalue != NULL)
3498                         xmlFree(attvalue);
3499                     goto failed;
3500                 }
3501             }
3502
3503             /*
3504              * Add the pair to atts
3505              */
3506             if (atts == NULL) {
3507                 maxatts = 22; /* allow for 10 attrs by default */
3508                 atts = (const xmlChar **)
3509                        xmlMalloc(maxatts * sizeof(xmlChar *));
3510                 if (atts == NULL) {
3511                     htmlErrMemory(ctxt, NULL);
3512                     if (attvalue != NULL)
3513                         xmlFree(attvalue);
3514                     goto failed;
3515                 }
3516                 ctxt->atts = atts;
3517                 ctxt->maxatts = maxatts;
3518             } else if (nbatts + 4 > maxatts) {
3519                 const xmlChar **n;
3520                 int newmax = maxatts * 2;
3521                 n = (const xmlChar **) xmlRealloc((void *) atts,
3522                                              newmax * sizeof(const xmlChar *));
3523                 if (n == NULL) {
3524                     htmlErrMemory(ctxt, NULL);
3525                     if (attvalue != NULL)
3526                         xmlFree(attvalue);
3527                     goto failed;
3528                 }
3529                 maxatts = newmax; /* commit the size only once realloc worked */
3530                 atts = n;
3531                 ctxt->atts = atts;
3532                 ctxt->maxatts = maxatts;
3533             }
3534             atts[nbatts++] = attname;
3535             atts[nbatts++] = attvalue;
3536             atts[nbatts] = NULL;      /* keep the array NULL terminated */
3537             atts[nbatts + 1] = NULL;
3538         }
3539         else {
3540             if (attvalue != NULL)
3541                 xmlFree(attvalue);
3542             /* Dump the bogus attribute string up to the next blank or
3543              * the end of the tag. */
3544             while ((IS_CHAR_CH(CUR)) &&
3545                    !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3546                    ((CUR != '/') || (NXT(1) != '>')))
3547                 NEXT;
3548         }
3549
3550 failed:
3551         SKIP_BLANKS;
3552         if (cons == ctxt->nbChars) {
3553             htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3554                          "htmlParseStartTag: problem parsing attributes\n",
3555                          NULL, NULL);
3556             break;
3557         }
3558     }
3559
3560     /*
3561      * META handling; atts holds stale entries unless this tag filled it
3562      */
3563     if (meta && (nbatts != 0))
3564         htmlCheckMeta(ctxt, atts);
3565
3566     /*
3567      * SAX: Start of Element !
3568      */
3569     htmlnamePush(ctxt, name);
3570     if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3571         if (nbatts != 0)
3572             ctxt->sax->startElement(ctxt->userData, name, atts);
3573         else
3574             ctxt->sax->startElement(ctxt->userData, name, NULL);
3575     }
3576
3577     if (atts != NULL) {
3578         for (i = 1;i < nbatts;i += 2) { /* odd slots hold the values */
3579             if (atts[i] != NULL)
3580                 xmlFree((xmlChar *) atts[i]);
3581         }
3582     }
3583
3584     return 0;
3585 }
3586
3587 /**
3588  * htmlParseEndTag:
3589  * @ctxt:  an HTML parser context
3590  *
3591  * parse an end of tag; when it matches the element on top of the name
3592  * stack, SAX endElement is called and that name is popped.
3593  * [42] ETag ::= '</' Name S? '>'
3594  *
3595  * With namespace
3596  *
3597  * [NS 9] ETag ::= '</' QName S? '>'
3598  *
3599  * Returns 1 if the current level should be closed, 0 otherwise.
3600  */
3601
3602 static int
3603 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3604 {
3605     const xmlChar *name;
3606     const xmlChar *oldname;
3607     int i, ret;
3608
3609     if ((CUR != '<') || (NXT(1) != '/')) {
3610         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3611                      "htmlParseEndTag: '</' not found\n", NULL, NULL);
3612         return (0);
3613     }
3614     SKIP(2); /* step over "</" */
3615
3616     name = htmlParseHTMLName(ctxt);
3617     if (name == NULL)
3618         return (0);
3619
3620     /*
3621      * We should definitely be at the ending "S? '>'" part
3622      */
3623     SKIP_BLANKS;
3624     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3625         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3626                      "End tag : expected '>'\n", NULL, NULL);
3627         if (ctxt->recovery) {
3628             /*
3629              * We're not at the ending > !!
3630              * Error, unless in recover mode where we search forwards
3631              * until we find a >
3632              */
3633             while (CUR != '\0' && CUR != '>') NEXT;
3634             NEXT; /* executed whether the scan stopped on '>' or on 0 */
3635         }
3636     } else
3637         NEXT;
3638
3639     /*
3640      * If the name read is not one of the element in the parsing stack
3641      * then return, it's just an error.
3642      */
3643     for (i = (ctxt->nameNr - 1); i >= 0; i--) { /* search from the top of the stack */
3644         if (xmlStrEqual(name, ctxt->nameTab[i]))
3645             break;
3646     }
3647     if (i < 0) {
3648         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3649                      "Unexpected end tag : %s\n", name, NULL);
3650         return (0);
3651     }
3652
3653
3654     /*
3655      * Check for auto-closure of HTML elements.
3656      */
3657
3658     htmlAutoCloseOnClose(ctxt, name);
3659
3660     /*
3661      * Well formedness constraints, opening and closing must match.
3662      * With the exception that the autoclose may have popped stuff out
3663      * of the stack.
3664      */
3665     if (!xmlStrEqual(name, ctxt->name)) {
3666         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { /* repeats the comparison of the outer test */
3667             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3668                          "Opening and ending tag mismatch: %s and %s\n",
3669                          name, ctxt->name);
3670         }
3671     }
3672
3673     /*
3674      * SAX: End of Tag, only when the current element carries that name
3675      */
3676     oldname = ctxt->name;
3677     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3678         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3679             ctxt->sax->endElement(ctxt->userData, name);
3680         htmlnamePop(ctxt);
3681         ret = 1;
3682     } else {
3683         ret = 0; /* nothing was popped here */
3684     }
3685
3686     return (ret);
3687 }
3688
3689
3690 /**
3691  * htmlRefCharToUTF8:
3692  * @c:  the code point to encode
3693  * @out:  destination buffer, at least 5 bytes
3694  *
3695  * Writes the 1 to 4 byte UTF-8 form of @c to @out, zero terminated.
3696  * Returns the number of bytes written, not counting the terminator.
3697  */
3698 static int
3699 htmlRefCharToUTF8(unsigned int c, xmlChar *out) {
3700     int len = 0;
3701     int shift;
3702
3703     /* Lead byte first; shift is where the first trailing byte starts. */
3704     if (c < 0x80) {
3705         out[len++] = c;
3706         shift = -6;
3707     } else if (c < 0x800) {
3708         out[len++] = ((c >> 6) & 0x1F) | 0xC0;
3709         shift = 0;
3710     } else if (c < 0x10000) {
3711         out[len++] = ((c >> 12) & 0x0F) | 0xE0;
3712         shift = 6;
3713     } else {
3714         out[len++] = ((c >> 18) & 0x07) | 0xF0;
3715         shift = 12;
3716     }
3717     while (shift >= 0) {
3718         out[len++] = ((c >> shift) & 0x3F) | 0x80;
3719         shift -= 6;
3720     }
3721     out[len] = 0;
3722     return(len);
3723 }
3724
3725 /**
3726  * htmlParseReference:
3727  * @ctxt:  an HTML parser context
3728  *
3729  * Parse a reference found in content and report it through the SAX
3730  * characters() callback: the UTF-8 form of a CharRef or of a known
3731  * entity, or the raw "&" (plus the name, if any) when unresolved.
3732  */
3733 static void
3734 htmlParseReference(htmlParserCtxtPtr ctxt) {
3735     const htmlEntityDesc *desc;
3736     const xmlChar *entName;
3737     xmlChar utf8[6];
3738     unsigned int code;
3739     int nbytes;
3740
3741     if (CUR != '&') return;
3742
3743     if (NXT(1) == '#') {
3744         code = htmlParseCharRef(ctxt);
3745         if (code == 0)
3746             return;
3747     } else {
3748         desc = htmlParseEntityRef(ctxt, &entName);
3749         if ((entName == NULL) || (desc == NULL) || !(desc->value > 0)) {
3750             htmlCheckParagraph(ctxt);
3751             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3752                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3753                 if (entName != NULL)
3754                     ctxt->sax->characters(ctxt->userData, entName,
3755                                           xmlStrlen(entName));
3756             }
3757             return;
3758         }
3759         code = desc->value;
3760     }
3761     nbytes = htmlRefCharToUTF8(code, utf8);
3762     htmlCheckParagraph(ctxt);
3763     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3764         ctxt->sax->characters(ctxt->userData, utf8, nbytes);
3765 }
3766
3767 /**
3768  * htmlParseContent:
3769  * @ctxt:  an HTML parser context
3770  * Parse a content: comment, sub-element, reference or text, repeatedly,
3771  * until an end tag closes a level, the entry node is popped or input ends.
3772  */
3773
3774 static void
3775 htmlParseContent(htmlParserCtxtPtr ctxt) {
3776     xmlChar *currentNode;
3777     int depth;
3778
3779     currentNode = xmlStrdup(ctxt->name); /* private copy of the element name at entry */
3780     depth = ctxt->nameNr;                /* name stack depth at entry */
3781     while (1) {
3782         long cons = ctxt->nbChars; /* used below to detect lack of progress */
3783
3784         GROW;
3785         /*
3786          * Our tag or one of it's parent or children is ending.
3787          */
3788         if ((CUR == '<') && (NXT(1) == '/')) {
3789             if (htmlParseEndTag(ctxt) &&
3790                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3791                 if (currentNode != NULL)
3792                     xmlFree(currentNode);
3793                 return;
3794             }
3795             continue; /* while */
3796         }
3797
3798         /*
3799          * Has this node been popped out during parsing of
3800          * the next element
3801          */
3802         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3803             (!xmlStrEqual(currentNode, ctxt->name)))
3804              {
3805             if (currentNode != NULL) xmlFree(currentNode);
3806             return;
3807         }
3808
3809         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3810             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
3811             /*
3812              * Handle SCRIPT/STYLE separately
3813              */
3814             htmlParseScript(ctxt);
3815         } else {
3816             /*
3817              * Sometimes DOCTYPE arrives in the middle of the document
3818              */
3819             if ((CUR == '<') && (NXT(1) == '!') &&
3820                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3821                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3822                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3823                 (UPP(8) == 'E')) {
3824                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3825                              "Misplaced DOCTYPE declaration\n",
3826                              BAD_CAST "DOCTYPE" , NULL);
3827                 htmlParseDocTypeDecl(ctxt); /* reported above, parsed anyway */
3828             }
3829
3830             /*
3831              * First case :  a comment
3832              */
3833             if ((CUR == '<') && (NXT(1) == '!') &&
3834                 (NXT(2) == '-') && (NXT(3) == '-')) {
3835                 htmlParseComment(ctxt);
3836             }
3837
3838             /*
3839              * Second case : a Processing Instruction.
3840              */
3841             else if ((CUR == '<') && (NXT(1) == '?')) {
3842                 htmlParsePI(ctxt);
3843             }
3844
3845             /*
3846              * Third case :  a sub-element.
3847              */
3848             else if (CUR == '<') {
3849                 htmlParseElement(ctxt);
3850             }
3851
3852             /*
3853              * Fourth case : a reference. If it has not been resolved,
3854              *    parsing returns its Name, create the node
3855              */
3856             else if (CUR == '&') {
3857                 htmlParseReference(ctxt);
3858             }
3859
3860             /*
3861              * Fifth case : end of the resource
3862              */
3863             else if (CUR == 0) {
3864                 htmlAutoCloseOnEnd(ctxt);
3865                 break;
3866             }
3867
3868             /*
3869              * Last case, text. Note that References are handled directly.
3870              */
3871             else {
3872                 htmlParseCharData(ctxt);
3873             }
3874
3875             if (cons == ctxt->nbChars) { /* nothing was consumed by this pass */
3876                 if (ctxt->node != NULL) {
3877                     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3878                                  "detected an error in element content\n",
3879                                  NULL, NULL);
3880                 }
3881                 break;
3882             }
3883         }
3884         GROW;
3885     }
3886     if (currentNode != NULL) xmlFree(currentNode); /* reached through the break statements */
3887 }
3888
3889 /**
3890  * __htmlParseContent:
3891  * @ctxt:  an HTML parser context, passed as an untyped pointer
3892  *
3893  * Non-static wrapper around htmlParseContent(); a NULL @ctxt is ignored.
3894  */
3895
3896 void
3897 __htmlParseContent(void *ctxt) {
3898     if (ctxt == NULL) return;
3899     htmlParseContent((htmlParserCtxtPtr) ctxt);
3900 }
3901
3902 /**
3903  * htmlParseElement:
3904  * @ctxt:  an HTML parser context
3905  *
3906  * parse an HTML element, this is highly recursive: the content is read
3907  * by htmlParseContent(). Node positions are recorded if record_info is set.
3908  * [39] element ::= EmptyElemTag | STag content ETag
3909  *
3910  * [41] Attribute ::= Name Eq AttValue
3911  */
3912
3913 void
3914 htmlParseElement(htmlParserCtxtPtr ctxt) {
3915     const xmlChar *name;
3916     xmlChar *currentNode = NULL;
3917     const htmlElemDesc * info;
3918     htmlParserNodeInfo node_info; /* only filled and used when ctxt->record_info is set */
3919     int failed;
3920     int depth;
3921     const xmlChar *oldptr;
3922
3923     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3924         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3925                      "htmlParseElement: context error\n", NULL, NULL);
3926         return;
3927     }
3928     /* Capture start position */
3929     if (ctxt->record_info) {
3930         node_info.begin_pos = ctxt->input->consumed +
3931                           (CUR_PTR - ctxt->input->base);
3932         node_info.begin_line = ctxt->input->line;
3933     }
3934
3935     failed = htmlParseStartTag(ctxt);
3936     name = ctxt->name; /* read from the context after the start tag was parsed */
3937     if (failed || (name == NULL)) {
3938         if (CUR == '>')
3939             NEXT;
3940         return;
3941     }
3942
3943     /*
3944      * Lookup the info for that element; an unknown tag is only reported.
3945      */
3946     info = htmlTagLookup(name);
3947     if (info == NULL) {
3948         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3949                      "Tag %s invalid\n", name, NULL);
3950     }
3951
3952     /*
3953      * Check for an Empty Element labeled the XML/SGML way
3954      */
3955     if ((CUR == '/') && (NXT(1) == '>')) {
3956         SKIP(2);
3957         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3958             ctxt->sax->endElement(ctxt->userData, name);
3959         htmlnamePop(ctxt);
3960         return;
3961     }
3962
3963     if (CUR == '>') {
3964         NEXT;
3965     } else {
3966         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3967                      "Couldn't find end of Start Tag %s\n", name, NULL);
3968
3969         /*
3970          * end of parsing of this node.
3971          */
3972         if (xmlStrEqual(name, ctxt->name)) {
3973             nodePop(ctxt);
3974             htmlnamePop(ctxt);
3975         }
3976
3977         /*
3978          * Capture end position and add node
3979          */
3980         if (ctxt->record_info) {
3981            node_info.end_pos = ctxt->input->consumed +
3982                               (CUR_PTR - ctxt->input->base);
3983            node_info.end_line = ctxt->input->line;
3984            node_info.node = ctxt->node;
3985            xmlParserAddNodeInfo(ctxt, &node_info);
3986         }
3987         return;
3988     }
3989
3990     /*
3991      * Check for an Empty Element from DTD definition
3992      */
3993     if ((info != NULL) && (info->empty)) {
3994         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3995             ctxt->sax->endElement(ctxt->userData, name);
3996         htmlnamePop(ctxt);
3997         return;
3998     }
3999
4000     /*
4001      * Parse the content of the element:
4002      */
4003     currentNode = xmlStrdup(ctxt->name);
4004     depth = ctxt->nameNr;
4005     while (IS_CHAR_CH(CUR)) {
4006         oldptr = ctxt->input->cur;
4007         htmlParseContent(ctxt);
4008         if (oldptr==ctxt->input->cur) break; /* no input consumed: stop */
4009         if (ctxt->nameNr < depth) break;     /* name stack is below the depth at entry */
4010     }
4011
4012     /*
4013      * Capture end position and add node
4014      */
4015     if ( currentNode != NULL && ctxt->record_info ) {
4016        node_info.end_pos = ctxt->input->consumed +
4017                           (CUR_PTR - ctxt->input->base);
4018        node_info.end_line = ctxt->input->line;
4019        node_info.node = ctxt->node;
4020        xmlParserAddNodeInfo(ctxt, &node_info);
4021     }
4022     if (!IS_CHAR_CH(CUR)) {
4023         htmlAutoCloseOnEnd(ctxt);
4024     }
4025
4026     if (currentNode != NULL)
4027         xmlFree(currentNode);
4028 }
4029
4030 /**
4031  * htmlParseDocument:
4032  * @ctxt:  an HTML parser context
4033  *
4034  * parse an HTML document (and build a tree if using the standard SAX
4035  * interface).
4036  *
4037  * Returns 0, -1 in case of error. the parser context is augmented
4038  *                as a result of the parsing.
4039  */
4040
4041 int
4042 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4043     xmlDtdPtr dtd;
4044
4045     xmlInitParser();
4046
4047     htmlDefaultSAXHandlerInit();
4048
4049     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4050         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4051                      "htmlParseDocument: context error\n", NULL, NULL);
4052         return(XML_ERR_INTERNAL_ERROR);
4053     }
4054     ctxt->html = 1;
4055     GROW;
4056     /*
4057      * SAX: beginning of the document processing.
4058      */
4059     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4060         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4061
4062     /*
4063      * Wipe out everything which is before the first '<'
4064      */
4065     SKIP_BLANKS;
4066     if (CUR == 0) {
4067         htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4068                      "Document is empty\n", NULL, NULL);
4069     }
4070
4071     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4072         ctxt->sax->startDocument(ctxt->userData);
4073
4074
4075     /*
4076      * Parse possible comments and PIs before any content
4077      */
4078     while (((CUR == '<') && (NXT(1) == '!') &&
4079             (NXT(2) == '-') && (NXT(3) == '-')) ||
4080            ((CUR == '<') && (NXT(1) == '?'))) {
4081         htmlParseComment(ctxt);
4082         htmlParsePI(ctxt);
4083         SKIP_BLANKS;
4084     }
4085
4086
4087     /*
4088      * Then possibly doc type declaration(s) and more Misc
4089      * (doctypedecl Misc*)?
4090      */
4091     if ((CUR == '<') && (NXT(1) == '!') &&
4092         (UPP(2) == 'D') && (UPP(3) == 'O') &&
4093         (UPP(4) == 'C') && (UPP(5) == 'T') &&
4094         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4095         (UPP(8) == 'E')) {
4096         htmlParseDocTypeDecl(ctxt);
4097     }
4098     SKIP_BLANKS;
4099
4100     /*
4101      * Parse possible comments and PIs before any content
4102      */
4103     while (((CUR == '<') && (NXT(1) == '!') &&
4104             (NXT(2) == '-') && (NXT(3) == '-')) ||
4105            ((CUR == '<') && (NXT(1) == '?'))) {
4106         htmlParseComment(ctxt);
4107         htmlParsePI(ctxt);
4108         SKIP_BLANKS;
4109     }
4110
4111     /*
4112      * Time to start parsing the tree itself
4113      */
4114     htmlParseContent(ctxt);
4115
4116     /*
4117      * autoclose
4118      */
4119     if (CUR == 0)
4120         htmlAutoCloseOnEnd(ctxt);
4121
4122
4123     /*
4124      * SAX: end of the document processing.
4125      */
4126     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4127         ctxt->sax->endDocument(ctxt->userData);
4128
4129     if (ctxt->myDoc != NULL) {
4130         dtd = xmlGetIntSubset(ctxt->myDoc);
4131         if (dtd == NULL)
4132             ctxt->myDoc->intSubset =
4133                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4134                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4135                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4136     }
4137     if (! ctxt->wellFormed) return(-1);
4138     return(0);
4139 }
4140
4141
4142 /************************************************************************
4143  *                                                                      *
4144  *                      Parser contexts handling                        *
4145  *                                                                      *
4146  ************************************************************************/
4147
4148 /**
4149  * htmlInitParserCtxt:
4150  * @ctxt:  an HTML parser context
4151  *
4152  * Initialize a parser context
4153  *
4154  * Returns 0 in case of success and -1 in case of error
4155  */
4156
4157 static int
4158 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4159 {
4160     htmlSAXHandler *sax;
4161
4162     if (ctxt == NULL) return(-1);
4163     memset(ctxt, 0, sizeof(htmlParserCtxt));
4164
4165     ctxt->dict = xmlDictCreate();
4166     if (ctxt->dict == NULL) {
4167         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4168         return(-1);
4169     }
4170     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4171     if (sax == NULL) {
4172         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4173         return(-1);
4174     }
4175     else
4176         memset(sax, 0, sizeof(htmlSAXHandler));
4177
4178     /* Allocate the Input stack */
4179     ctxt->inputTab = (htmlParserInputPtr *)
4180                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
4181     if (ctxt->inputTab == NULL) {
4182         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4183         ctxt->inputNr = 0;
4184         ctxt->inputMax = 0;
4185         ctxt->input = NULL;
4186         return(-1);
4187     }
4188     ctxt->inputNr = 0;
4189     ctxt->inputMax = 5;
4190     ctxt->input = NULL;
4191     ctxt->version = NULL;
4192     ctxt->encoding = NULL;
4193     ctxt->standalone = -1;
4194     ctxt->instate = XML_PARSER_START;
4195
4196     /* Allocate the Node stack */
4197     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4198     if (ctxt->nodeTab == NULL) {
4199         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4200         ctxt->nodeNr = 0;
4201         ctxt->nodeMax = 0;
4202         ctxt->node = NULL;
4203         ctxt->inputNr = 0;
4204         ctxt->inputMax = 0;
4205         ctxt->input = NULL;
4206         return(-1);
4207     }
4208     ctxt->nodeNr = 0;
4209     ctxt->nodeMax = 10;
4210     ctxt->node = NULL;
4211
4212     /* Allocate the Name stack */
4213     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4214     if (ctxt->nameTab == NULL) {
4215         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4216         ctxt->nameNr = 0;
4217         ctxt->nameMax = 10;
4218         ctxt->name = NULL;
4219         ctxt->nodeNr = 0;
4220         ctxt->nodeMax = 0;
4221         ctxt->node = NULL;
4222         ctxt->inputNr = 0;
4223         ctxt->inputMax = 0;
4224         ctxt->input = NULL;
4225         return(-1);
4226     }
4227     ctxt->nameNr = 0;
4228     ctxt->nameMax = 10;
4229     ctxt->name = NULL;
4230
4231     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4232     else {
4233         ctxt->sax = sax;
4234         memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4235     }
4236     ctxt->userData = ctxt;
4237     ctxt->myDoc = NULL;
4238     ctxt->wellFormed = 1;
4239     ctxt->replaceEntities = 0;
4240     ctxt->linenumbers = xmlLineNumbersDefaultValue;
4241     ctxt->html = 1;
4242     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4243     ctxt->vctxt.userData = ctxt;
4244     ctxt->vctxt.error = xmlParserValidityError;
4245     ctxt->vctxt.warning = xmlParserValidityWarning;
4246     ctxt->record_info = 0;
4247     ctxt->validate = 0;
4248     ctxt->nbChars = 0;
4249     ctxt->checkIndex = 0;
4250     ctxt->catalogs = NULL;
4251     xmlInitNodeInfoSeq(&ctxt->node_seq);
4252     return(0);
4253 }
4254
4255 /**
4256  * htmlFreeParserCtxt:
4257  * @ctxt:  an HTML parser context
4258  *
4259  * Free all the memory used by a parser context. However the parsed
4260  * document in ctxt->myDoc is not freed.
4261  */
4262
4263 void
4264 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4265 {
4266     xmlFreeParserCtxt(ctxt);
4267 }
4268
4269 /**
4270  * htmlNewParserCtxt:
4271  *
4272  * Allocate and initialize a new parser context.
4273  *
4274  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4275  */
4276
4277 htmlParserCtxtPtr
4278 htmlNewParserCtxt(void)
4279 {
4280     xmlParserCtxtPtr ctxt;
4281
4282     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4283     if (ctxt == NULL) {
4284         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4285         return(NULL);
4286     }
4287     memset(ctxt, 0, sizeof(xmlParserCtxt));
4288     if (htmlInitParserCtxt(ctxt) < 0) {
4289         htmlFreeParserCtxt(ctxt);
4290         return(NULL);
4291     }
4292     return(ctxt);
4293 }
4294
4295 /**
4296  * htmlCreateMemoryParserCtxt:
4297  * @buffer:  a pointer to a char array
4298  * @size:  the size of the array
4299  *
4300  * Create a parser context for an HTML in-memory document.
4301  *
4302  * Returns the new parser context or NULL
4303  */
4304 htmlParserCtxtPtr
4305 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4306     xmlParserCtxtPtr ctxt;
4307     xmlParserInputPtr input;
4308     xmlParserInputBufferPtr buf;
4309
4310     if (buffer == NULL)
4311         return(NULL);
4312     if (size <= 0)
4313         return(NULL);
4314
4315     ctxt = htmlNewParserCtxt();
4316     if (ctxt == NULL)
4317         return(NULL);
4318
4319     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4320     if (buf == NULL) return(NULL);
4321
4322     input = xmlNewInputStream(ctxt);
4323     if (input == NULL) {
4324         xmlFreeParserCtxt(ctxt);
4325         return(NULL);
4326     }
4327
4328     input->filename = NULL;
4329     input->buf = buf;
4330     input->base = input->buf->buffer->content;
4331     input->cur = input->buf->buffer->content;
4332     input->end = &input->buf->buffer->content[input->buf->buffer->use];
4333
4334     inputPush(ctxt, input);
4335     return(ctxt);
4336 }
4337
4338 /**
4339  * htmlCreateDocParserCtxt:
4340  * @cur:  a pointer to an array of xmlChar
4341  * @encoding:  a free form C string describing the HTML document encoding, or NULL
4342  *
4343  * Create a parser context for an HTML document.
4344  *
4345  * TODO: check the need to add encoding handling there
4346  *
4347  * Returns the new parser context or NULL
4348  */
4349 static htmlParserCtxtPtr
4350 htmlCreateDocParserCtxt(const xmlChar *cur,
4351                         const char *encoding ATTRIBUTE_UNUSED) {
4352     int len;
4353     htmlParserCtxtPtr ctxt;
4354
4355     if (cur == NULL)
4356         return(NULL);
4357     len = xmlStrlen(cur);
4358     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4359
4360     if (encoding != NULL) {
4361         xmlCharEncoding enc;
4362         xmlCharEncodingHandlerPtr handler;
4363
4364         if (ctxt->input->encoding != NULL)
4365             xmlFree((xmlChar *) ctxt->input->encoding);
4366         ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4367
4368         enc = xmlParseCharEncoding(encoding);
4369         /*
4370          * registered set of known encodings
4371          */
4372         if (enc != XML_CHAR_ENCODING_ERROR) {
4373             xmlSwitchEncoding(ctxt, enc);
4374             if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4375                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4376                              "Unsupported encoding %s\n",
4377                              (const xmlChar *) encoding, NULL);
4378             }
4379         } else {
4380             /*
4381              * fallback for unknown encodings
4382              */
4383             handler = xmlFindCharEncodingHandler((const char *) encoding);
4384             if (handler != NULL) {
4385                 xmlSwitchToEncoding(ctxt, handler);
4386             } else {
4387                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4388                              "Unsupported encoding %s\n",
4389                              (const xmlChar *) encoding, NULL);
4390             }
4391         }
4392     }
4393     return(ctxt);
4394 }
4395
4396 #ifdef LIBXML_PUSH_ENABLED
4397 /************************************************************************
4398  *                                                                      *
4399  *              Progressive parsing interfaces                          *
4400  *                                                                      *
4401  ************************************************************************/
4402
4403 /**
4404  * htmlParseLookupSequence:
4405  * @ctxt:  an HTML parser context
4406  * @first:  the first char to lookup
4407  * @next:  the next char to lookup or zero
4408  * @third:  the next char to lookup or zero
4409  * @comment: flag to force checking inside comments
4410  *
4411  * Try to find if a sequence (first, next, third) or  just (first next) or
4412  * (first) is available in the input stream.
4413  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4414  * to avoid rescanning sequences of bytes, it DOES change the state of the
4415  * parser, do not use liberally.
4416  * This is basically similar to xmlParseLookupSequence()
4417  *
4418  * Returns the index to the current parsing point if the full sequence
4419  *      is available, -1 otherwise.
4420  */
4421 static int
4422 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4423                         xmlChar next, xmlChar third, int iscomment) {
4424     int base, len;
4425     htmlParserInputPtr in;
4426     const xmlChar *buf;
4427     int incomment = 0;
4428
4429     in = ctxt->input;
4430     if (in == NULL) return(-1);
4431     base = in->cur - in->base;
4432     if (base < 0) return(-1);
4433     if (ctxt->checkIndex > base)
4434         base = ctxt->checkIndex;
4435     if (in->buf == NULL) {
4436         buf = in->base;
4437         len = in->length;
4438     } else {
4439         buf = in->buf->buffer->content;
4440         len = in->buf->buffer->use;
4441     }
4442     /* take into account the sequence length */
4443     if (third) len -= 2;
4444     else if (next) len --;
4445     for (;base < len;base++) {
4446         if (!incomment && (base + 4 < len) && !iscomment) {
4447             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4448                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4449                 incomment = 1;
4450                 /* do not increment past <! - some people use <!--> */
4451                 base += 2;
4452             }
4453         }
4454         if (incomment) {
4455             if (base + 3 > len)
4456                 return(-1);
4457             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4458                 (buf[base + 2] == '>')) {
4459                 incomment = 0;
4460                 base += 2;
4461             }
4462             continue;
4463         }
4464         if (buf[base] == first) {
4465             if (third != 0) {
4466                 if ((buf[base + 1] != next) ||
4467                     (buf[base + 2] != third)) continue;
4468             } else if (next != 0) {
4469                 if (buf[base + 1] != next) continue;
4470             }
4471             ctxt->checkIndex = 0;
4472 #ifdef DEBUG_PUSH
4473             if (next == 0)
4474                 xmlGenericError(xmlGenericErrorContext,
4475                         "HPP: lookup '%c' found at %d\n",
4476                         first, base);
4477             else if (third == 0)
4478                 xmlGenericError(xmlGenericErrorContext,
4479                         "HPP: lookup '%c%c' found at %d\n",
4480                         first, next, base);
4481             else
4482                 xmlGenericError(xmlGenericErrorContext,
4483                         "HPP: lookup '%c%c%c' found at %d\n",
4484                         first, next, third, base);
4485 #endif
4486             return(base - (in->cur - in->base));
4487         }
4488     }
4489     ctxt->checkIndex = base;
4490 #ifdef DEBUG_PUSH
4491     if (next == 0)
4492         xmlGenericError(xmlGenericErrorContext,
4493                 "HPP: lookup '%c' failed\n", first);
4494     else if (third == 0)
4495         xmlGenericError(xmlGenericErrorContext,
4496                 "HPP: lookup '%c%c' failed\n", first, next);
4497     else
4498         xmlGenericError(xmlGenericErrorContext,
4499                 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4500 #endif
4501     return(-1);
4502 }
4503
4504 /**
4505  * htmlParseTryOrFinish:
4506  * @ctxt:  an HTML parser context
4507  * @terminate:  last chunk indicator
4508  *
4509  * Try to progress on parsing
4510  *
4511  * Returns zero if no parsing was possible
4512  */
4513 static int
4514 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4515     int ret = 0;
4516     htmlParserInputPtr in;
4517     int avail = 0;
4518     xmlChar cur, next;
4519
4520 #ifdef DEBUG_PUSH
4521     switch (ctxt->instate) {
4522         case XML_PARSER_EOF:
4523             xmlGenericError(xmlGenericErrorContext,
4524                     "HPP: try EOF\n"); break;
4525         case XML_PARSER_START:
4526             xmlGenericError(xmlGenericErrorContext,
4527                     "HPP: try START\n"); break;
4528         case XML_PARSER_MISC:
4529             xmlGenericError(xmlGenericErrorContext,
4530                     "HPP: try MISC\n");break;
4531         case XML_PARSER_COMMENT:
4532             xmlGenericError(xmlGenericErrorContext,
4533                     "HPP: try COMMENT\n");break;
4534         case XML_PARSER_PROLOG:
4535             xmlGenericError(xmlGenericErrorContext,
4536                     "HPP: try PROLOG\n");break;
4537         case XML_PARSER_START_TAG:
4538             xmlGenericError(xmlGenericErrorContext,
4539                     "HPP: try START_TAG\n");break;
4540         case XML_PARSER_CONTENT:
4541             xmlGenericError(xmlGenericErrorContext,
4542                     "HPP: try CONTENT\n");break;
4543         case XML_PARSER_CDATA_SECTION:
4544             xmlGenericError(xmlGenericErrorContext,
4545                     "HPP: try CDATA_SECTION\n");break;
4546         case XML_PARSER_END_TAG:
4547             xmlGenericError(xmlGenericErrorContext,
4548                     "HPP: try END_TAG\n");break;
4549         case XML_PARSER_ENTITY_DECL:
4550             xmlGenericError(xmlGenericErrorContext,
4551                     "HPP: try ENTITY_DECL\n");break;
4552         case XML_PARSER_ENTITY_VALUE:
4553             xmlGenericError(xmlGenericErrorContext,
4554                     "HPP: try ENTITY_VALUE\n");break;
4555         case XML_PARSER_ATTRIBUTE_VALUE:
4556             xmlGenericError(xmlGenericErrorContext,
4557                     "HPP: try ATTRIBUTE_VALUE\n");break;
4558         case XML_PARSER_DTD:
4559             xmlGenericError(xmlGenericErrorContext,
4560                     "HPP: try DTD\n");break;
4561         case XML_PARSER_EPILOG:
4562             xmlGenericError(xmlGenericErrorContext,
4563                     "HPP: try EPILOG\n");break;
4564         case XML_PARSER_PI:
4565             xmlGenericError(xmlGenericErrorContext,
4566                     "HPP: try PI\n");break;
4567         case XML_PARSER_SYSTEM_LITERAL:
4568             xmlGenericError(xmlGenericErrorContext,
4569                     "HPP: try SYSTEM_LITERAL\n");break;
4570     }
4571 #endif
4572
4573     while (1) {
4574
4575         in = ctxt->input;
4576         if (in == NULL) break;
4577         if (in->buf == NULL)
4578             avail = in->length - (in->cur - in->base);
4579         else
4580             avail = in->buf->buffer->use - (in->cur - in->base);
4581         if ((avail == 0) && (terminate)) {
4582             htmlAutoCloseOnEnd(ctxt);
4583             if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4584                 /*
4585                  * SAX: end of the document processing.
4586                  */
4587                 ctxt->instate = XML_PARSER_EOF;
4588                 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4589                     ctxt->sax->endDocument(ctxt->userData);
4590             }
4591         }
4592         if (avail < 1)
4593             goto done;
4594         cur = in->cur[0];
4595         if (cur == 0) {
4596             SKIP(1);
4597             continue;
4598         }
4599
4600         switch (ctxt->instate) {
4601             case XML_PARSER_EOF:
4602                 /*
4603                  * Document parsing is done !
4604                  */
4605                 goto done;
4606             case XML_PARSER_START:
4607                 /*
4608                  * Very first chars read from the document flow.
4609                  */
4610                 cur = in->cur[0];
4611                 if (IS_BLANK_CH(cur)) {
4612                     SKIP_BLANKS;
4613                     if (in->buf == NULL)
4614                         avail = in->length - (in->cur - in->base);
4615                     else
4616                         avail = in->buf->buffer->use - (in->cur - in->base);
4617                 }
4618                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4619                     ctxt->sax->setDocumentLocator(ctxt->userData,
4620                                                   &xmlDefaultSAXLocator);
4621                 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4622                     (!ctxt->disableSAX))
4623                     ctxt->sax->startDocument(ctxt->userData);
4624
4625                 cur = in->cur[0];
4626                 next = in->cur[1];
4627                 if ((cur == '<') && (next == '!') &&
4628                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
4629                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
4630                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4631                     (UPP(8) == 'E')) {
4632                     if ((!terminate) &&
4633                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4634                         goto done;
4635 #ifdef DEBUG_PUSH
4636                     xmlGenericError(xmlGenericErrorContext,
4637                             "HPP: Parsing internal subset\n");
4638 #endif
4639                     htmlParseDocTypeDecl(ctxt);
4640                     ctxt->instate = XML_PARSER_PROLOG;
4641 #ifdef DEBUG_PUSH
4642                     xmlGenericError(xmlGenericErrorContext,
4643                             "HPP: entering PROLOG\n");
4644 #endif
4645                 } else {
4646                     ctxt->instate = XML_PARSER_MISC;
4647 #ifdef DEBUG_PUSH
4648                     xmlGenericError(xmlGenericErrorContext,
4649                             "HPP: entering MISC\n");
4650 #endif
4651                 }
4652                 break;
4653             case XML_PARSER_MISC:
4654                 SKIP_BLANKS;
4655                 if (in->buf == NULL)
4656                     avail = in->length - (in->cur - in->base);
4657                 else
4658                     avail = in->buf->buffer->use - (in->cur - in->base);
4659                 if (avail < 2)
4660                     goto done;
4661                 cur = in->cur[0];
4662                 next = in->cur[1];
4663                 if ((cur == '<') && (next == '!') &&
4664                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
4665                     if ((!terminate) &&
4666                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4667                         goto done;
4668 #ifdef DEBUG_PUSH
4669                     xmlGenericError(xmlGenericErrorContext,
4670                             "HPP: Parsing Comment\n");
4671 #endif
4672                     htmlParseComment(ctxt);
4673                     ctxt->instate = XML_PARSER_MISC;
4674                 } else if ((cur == '<') && (next == '?')) {
4675                     if ((!terminate) &&
4676                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4677                         goto done;
4678 #ifdef DEBUG_PUSH
4679                     xmlGenericError(xmlGenericErrorContext,
4680                             "HPP: Parsing PI\n");
4681 #endif
4682                     htmlParsePI(ctxt);
4683                     ctxt->instate = XML_PARSER_MISC;
4684                 } else if ((cur == '<') && (next == '!') &&
4685                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
4686                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
4687                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4688                     (UPP(8) == 'E')) {
4689                     if ((!terminate) &&
4690                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4691                         goto done;
4692 #ifdef DEBUG_PUSH
4693                     xmlGenericError(xmlGenericErrorContext,
4694                             "HPP: Parsing internal subset\n");
4695 #endif
4696                     htmlParseDocTypeDecl(ctxt);
4697                     ctxt->instate = XML_PARSER_PROLOG;
4698 #ifdef DEBUG_PUSH
4699                     xmlGenericError(xmlGenericErrorContext,
4700                             "HPP: entering PROLOG\n");
4701 #endif
4702                 } else if ((cur == '<') && (next == '!') &&
4703                            (avail < 9)) {
4704                     goto done;
4705                 } else {
4706                     ctxt->instate = XML_PARSER_START_TAG;
4707 #ifdef DEBUG_PUSH
4708                     xmlGenericError(xmlGenericErrorContext,
4709                             "HPP: entering START_TAG\n");
4710 #endif
4711                 }
4712                 break;
4713             case XML_PARSER_PROLOG:
4714                 SKIP_BLANKS;
4715                 if (in->buf == NULL)
4716                     avail = in->length - (in->cur - in->base);
4717                 else
4718                     avail = in->buf->buffer->use - (in->cur - in->base);
4719                 if (avail < 2)
4720                     goto done;
4721                 cur = in->cur[0];
4722                 next = in->cur[1];
4723                 if ((cur == '<') && (next == '!') &&
4724                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
4725                     if ((!terminate) &&
4726                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4727                         goto done;
4728 #ifdef DEBUG_PUSH
4729                     xmlGenericError(xmlGenericErrorContext,
4730                             "HPP: Parsing Comment\n");
4731 #endif
4732                     htmlParseComment(ctxt);
4733                     ctxt->instate = XML_PARSER_PROLOG;
4734                 } else if ((cur == '<') && (next == '?')) {
4735                     if ((!terminate) &&
4736                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4737                         goto done;
4738 #ifdef DEBUG_PUSH
4739                     xmlGenericError(xmlGenericErrorContext,
4740                             "HPP: Parsing PI\n");
4741 #endif
4742                     htmlParsePI(ctxt);
4743                     ctxt->instate = XML_PARSER_PROLOG;
4744                 } else if ((cur == '<') && (next == '!') &&
4745                            (avail < 4)) {
4746                     goto done;
4747                 } else {
4748                     ctxt->instate = XML_PARSER_START_TAG;
4749 #ifdef DEBUG_PUSH
4750                     xmlGenericError(xmlGenericErrorContext,
4751                             "HPP: entering START_TAG\n");
4752 #endif
4753                 }
4754                 break;
4755             case XML_PARSER_EPILOG:
4756                 if (in->buf == NULL)
4757                     avail = in->length - (in->cur - in->base);
4758                 else
4759                     avail = in->buf->buffer->use - (in->cur - in->base);
4760                 if (avail < 1)
4761                     goto done;
4762                 cur = in->cur[0];
4763                 if (IS_BLANK_CH(cur)) {
4764                     htmlParseCharData(ctxt);
4765                     goto done;
4766                 }
4767                 if (avail < 2)
4768                     goto done;
4769                 next = in->cur[1];
4770                 if ((cur == '<') && (next == '!') &&
4771                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
4772                     if ((!terminate) &&
4773                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4774                         goto done;
4775 #ifdef DEBUG_PUSH
4776                     xmlGenericError(xmlGenericErrorContext,
4777                             "HPP: Parsing Comment\n");
4778 #endif
4779                     htmlParseComment(ctxt);
4780                     ctxt->instate = XML_PARSER_EPILOG;
4781                 } else if ((cur == '<') && (next == '?')) {
4782                     if ((!terminate) &&
4783                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4784                         goto done;
4785 #ifdef DEBUG_PUSH
4786                     xmlGenericError(xmlGenericErrorContext,
4787                             "HPP: Parsing PI\n");
4788 #endif
4789                     htmlParsePI(ctxt);
4790                     ctxt->instate = XML_PARSER_EPILOG;
4791                 } else if ((cur == '<') && (next == '!') &&
4792                            (avail < 4)) {
4793                     goto done;
4794                 } else {
4795                     ctxt->errNo = XML_ERR_DOCUMENT_END;
4796                     ctxt->wellFormed = 0;
4797                     ctxt->instate = XML_PARSER_EOF;
4798 #ifdef DEBUG_PUSH
4799                     xmlGenericError(xmlGenericErrorContext,
4800                             "HPP: entering EOF\n");
4801 #endif
4802                     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4803                         ctxt->sax->endDocument(ctxt->userData);
4804                     goto done;
4805                 }
4806                 break;
4807             case XML_PARSER_START_TAG: {
4808                 const xmlChar *name;
4809                 int failed;
4810                 const htmlElemDesc * info;
4811
4812                 if (avail < 2)
4813                     goto done;
4814                 cur = in->cur[0];
4815                 if (cur != '<') {
4816                     ctxt->instate = XML_PARSER_CONTENT;
4817 #ifdef DEBUG_PUSH
4818                     xmlGenericError(xmlGenericErrorContext,
4819                             "HPP: entering CONTENT\n");
4820 #endif
4821                     break;
4822                 }
4823                 if (in->cur[1] == '/') {
4824                     ctxt->instate = XML_PARSER_END_TAG;
4825                     ctxt->checkIndex = 0;
4826 #ifdef DEBUG_PUSH
4827                     xmlGenericError(xmlGenericErrorContext,
4828                             "HPP: entering END_TAG\n");
4829 #endif
4830                     break;
4831                 }
4832                 if ((!terminate) &&
4833                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4834                     goto done;
4835
4836                 failed = htmlParseStartTag(ctxt);
4837                 name = ctxt->name;
4838                 if (failed ||
4839                     (name == NULL)) {
4840                     if (CUR == '>')
4841                         NEXT;
4842                     break;
4843                 }
4844
4845                 /*
4846                  * Lookup the info for that element.
4847                  */
4848                 info = htmlTagLookup(name);
4849                 if (info == NULL) {
4850                     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4851                                  "Tag %s invalid\n", name, NULL);
4852                 }
4853
4854                 /*
4855                  * Check for an Empty Element labeled the XML/SGML way
4856                  */
4857                 if ((CUR == '/') && (NXT(1) == '>')) {
4858                     SKIP(2);
4859                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4860                         ctxt->sax->endElement(ctxt->userData, name);
4861                     htmlnamePop(ctxt);
4862                     ctxt->instate = XML_PARSER_CONTENT;
4863 #ifdef DEBUG_PUSH
4864                     xmlGenericError(xmlGenericErrorContext,
4865                             "HPP: entering CONTENT\n");
4866 #endif
4867                     break;
4868                 }
4869
4870                 if (CUR == '>') {
4871                     NEXT;
4872                 } else {
4873                     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4874                                  "Couldn't find end of Start Tag %s\n",
4875                                  name, NULL);
4876
4877                     /*
4878                      * end of parsing of this node.
4879                      */
4880                     if (xmlStrEqual(name, ctxt->name)) {
4881                         nodePop(ctxt);
4882                         htmlnamePop(ctxt);
4883                     }
4884
4885                     ctxt->instate = XML_PARSER_CONTENT;
4886 #ifdef DEBUG_PUSH
4887                     xmlGenericError(xmlGenericErrorContext,
4888                             "HPP: entering CONTENT\n");
4889 #endif
4890                     break;
4891                 }
4892
4893                 /*
4894                  * Check for an Empty Element from DTD definition
4895                  */
4896                 if ((info != NULL) && (info->empty)) {
4897                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4898                         ctxt->sax->endElement(ctxt->userData, name);
4899                     htmlnamePop(ctxt);
4900                 }
4901                 ctxt->instate = XML_PARSER_CONTENT;
4902 #ifdef DEBUG_PUSH
4903                 xmlGenericError(xmlGenericErrorContext,
4904                         "HPP: entering CONTENT\n");
4905 #endif
4906                 break;
4907             }
4908             case XML_PARSER_CONTENT: {
4909                 long cons;
4910                 /*
4911                  * Handle preparsed entities and charRef
4912                  */
4913                 if (ctxt->token != 0) {
4914                     xmlChar chr[2] = { 0 , 0 } ;
4915
4916                     chr[0] = (xmlChar) ctxt->token;
4917                     htmlCheckParagraph(ctxt);
4918                     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4919                         ctxt->sax->characters(ctxt->userData, chr, 1);
4920                     ctxt->token = 0;
4921                     ctxt->checkIndex = 0;
4922                 }
4923                 if ((avail == 1) && (terminate)) {
4924                     cur = in->cur[0];
4925                     if ((cur != '<') && (cur != '&')) {
4926                         if (ctxt->sax != NULL) {
4927                             if (IS_BLANK_CH(cur)) {
4928                                 if (ctxt->sax->ignorableWhitespace != NULL)
4929                                     ctxt->sax->ignorableWhitespace(
4930                                             ctxt->userData, &cur, 1);
4931                             } else {
4932                                 htmlCheckParagraph(ctxt);
4933                                 if (ctxt->sax->characters != NULL)
4934                                     ctxt->sax->characters(
4935                                             ctxt->userData, &cur, 1);
4936                             }
4937                         }
4938                         ctxt->token = 0;
4939                         ctxt->checkIndex = 0;
4940                         in->cur++;
4941                         break;
4942                     }
4943                 }
4944                 if (avail < 2)
4945                     goto done;
4946                 cur = in->cur[0];
4947                 next = in->cur[1];
4948                 cons = ctxt->nbChars;
4949                 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4950                     (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4951                     /*
4952                      * Handle SCRIPT/STYLE separately
4953                      */
4954                     if (!terminate) {
4955                         int idx;
4956                         xmlChar val;
4957
4958                         idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
4959                         if (idx < 0)
4960                             goto done;
4961                         val = in->cur[idx + 2];
4962                         if (val == 0) /* bad cut of input */
4963                             goto done;
4964                     }
4965                     htmlParseScript(ctxt);
4966                     if ((cur == '<') && (next == '/')) {
4967                         ctxt->instate = XML_PARSER_END_TAG;
4968                         ctxt->checkIndex = 0;
4969 #ifdef DEBUG_PUSH
4970                         xmlGenericError(xmlGenericErrorContext,
4971                                 "HPP: entering END_TAG\n");
4972 #endif
4973                         break;
4974                     }
4975                 } else {
4976                     /*
4977                      * Sometimes DOCTYPE arrives in the middle of the document
4978                      */
4979                     if ((cur == '<') && (next == '!') &&
4980                         (UPP(2) == 'D') && (UPP(3) == 'O') &&
4981                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
4982                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4983                         (UPP(8) == 'E')) {
4984                         if ((!terminate) &&
4985                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4986                             goto done;
4987                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4988                                      "Misplaced DOCTYPE declaration\n",
4989                                      BAD_CAST "DOCTYPE" , NULL);
4990                         htmlParseDocTypeDecl(ctxt);
4991                     } else if ((cur == '<') && (next == '!') &&
4992                         (in->cur[2] == '-') && (in->cur[3] == '-')) {
4993                         if ((!terminate) &&
4994                             (htmlParseLookupSequence(
4995                                         ctxt, '-', '-', '>', 1) < 0))
4996                             goto done;
4997 #ifdef DEBUG_PUSH
4998                         xmlGenericError(xmlGenericErrorContext,
4999                                 "HPP: Parsing Comment\n");
5000 #endif
5001                         htmlParseComment(ctxt);
5002                         ctxt->instate = XML_PARSER_CONTENT;
5003                     } else if ((cur == '<') && (next == '?')) {
5004                         if ((!terminate) &&
5005                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5006                             goto done;
5007 #ifdef DEBUG_PUSH
5008                         xmlGenericError(xmlGenericErrorContext,
5009                                 "HPP: Parsing PI\n");
5010 #endif
5011                         htmlParsePI(ctxt);
5012                         ctxt->instate = XML_PARSER_CONTENT;
5013                     } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5014                         goto done;
5015                     } else if ((cur == '<') && (next == '/')) {
5016                         ctxt->instate = XML_PARSER_END_TAG;
5017                         ctxt->checkIndex = 0;
5018 #ifdef DEBUG_PUSH
5019                         xmlGenericError(xmlGenericErrorContext,
5020                                 "HPP: entering END_TAG\n");
5021 #endif
5022                         break;
5023                     } else if (cur == '<') {
5024                         ctxt->instate = XML_PARSER_START_TAG;
5025                         ctxt->checkIndex = 0;
5026 #ifdef DEBUG_PUSH
5027                         xmlGenericError(xmlGenericErrorContext,
5028                                 "HPP: entering START_TAG\n");
5029 #endif
5030                         break;
5031                     } else if (cur == '&') {
5032                         if ((!terminate) &&
5033                             (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
5034                             goto done;
5035 #ifdef DEBUG_PUSH
5036                         xmlGenericError(xmlGenericErrorContext,
5037                                 "HPP: Parsing Reference\n");
5038 #endif
5039                         /* TODO: check generation of subtrees if noent !!! */
5040                         htmlParseReference(ctxt);
5041                     } else {
5042                         /*
5043                          * check that the text sequence is complete
5044                          * before handing out the data to the parser
5045                          * to avoid problems with erroneous end of
5046                          * data detection.
5047                          */
5048                         if ((!terminate) &&
5049                             (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5050                             goto done;
5051                         ctxt->checkIndex = 0;
5052 #ifdef DEBUG_PUSH
5053                         xmlGenericError(xmlGenericErrorContext,
5054                                 "HPP: Parsing char data\n");
5055 #endif
5056                         htmlParseCharData(ctxt);
5057                     }
5058                 }
5059                 if (cons == ctxt->nbChars) {
5060                     if (ctxt->node != NULL) {
5061                         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5062                                      "detected an error in element content\n",
5063                                      NULL, NULL);
5064                     }
5065                     NEXT;
5066                     break;
5067                 }
5068
5069                 break;
5070             }
5071             case XML_PARSER_END_TAG:
5072                 if (avail < 2)
5073                     goto done;
5074                 if ((!terminate) &&
5075                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5076                     goto done;
5077                 htmlParseEndTag(ctxt);
5078                 if (ctxt->nameNr == 0) {
5079                     ctxt->instate = XML_PARSER_EPILOG;
5080                 } else {
5081                     ctxt->instate = XML_PARSER_CONTENT;
5082                 }
5083                 ctxt->checkIndex = 0;
5084 #ifdef DEBUG_PUSH
5085                 xmlGenericError(xmlGenericErrorContext,
5086                         "HPP: entering CONTENT\n");
5087 #endif
5088                 break;
5089             case XML_PARSER_CDATA_SECTION:
5090                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5091                         "HPP: internal error, state == CDATA\n",
5092                              NULL, NULL);
5093                 ctxt->instate = XML_PARSER_CONTENT;
5094                 ctxt->checkIndex = 0;
5095 #ifdef DEBUG_PUSH
5096                 xmlGenericError(xmlGenericErrorContext,
5097                         "HPP: entering CONTENT\n");
5098 #endif
5099                 break;
5100             case XML_PARSER_DTD:
5101                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5102                         "HPP: internal error, state == DTD\n",
5103                              NULL, NULL);
5104                 ctxt->instate = XML_PARSER_CONTENT;
5105                 ctxt->checkIndex = 0;
5106 #ifdef DEBUG_PUSH
5107                 xmlGenericError(xmlGenericErrorContext,
5108                         "HPP: entering CONTENT\n");
5109 #endif
5110                 break;
5111             case XML_PARSER_COMMENT:
5112                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5113                         "HPP: internal error, state == COMMENT\n",
5114                              NULL, NULL);
5115                 ctxt->instate = XML_PARSER_CONTENT;
5116                 ctxt->checkIndex = 0;
5117 #ifdef DEBUG_PUSH
5118                 xmlGenericError(xmlGenericErrorContext,
5119                         "HPP: entering CONTENT\n");
5120 #endif
5121                 break;
5122             case XML_PARSER_PI:
5123                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5124                         "HPP: internal error, state == PI\n",
5125                              NULL, NULL);
5126                 ctxt->instate = XML_PARSER_CONTENT;
5127                 ctxt->checkIndex = 0;
5128 #ifdef DEBUG_PUSH
5129                 xmlGenericError(xmlGenericErrorContext,
5130                         "HPP: entering CONTENT\n");
5131 #endif
5132                 break;
5133             case XML_PARSER_ENTITY_DECL:
5134                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5135                         "HPP: internal error, state == ENTITY_DECL\n",
5136                              NULL, NULL);
5137                 ctxt->instate = XML_PARSER_CONTENT;
5138                 ctxt->checkIndex = 0;
5139 #ifdef DEBUG_PUSH
5140                 xmlGenericError(xmlGenericErrorContext,
5141                         "HPP: entering CONTENT\n");
5142 #endif
5143                 break;
5144             case XML_PARSER_ENTITY_VALUE:
5145                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5146                         "HPP: internal error, state == ENTITY_VALUE\n",
5147                              NULL, NULL);
5148                 ctxt->instate = XML_PARSER_CONTENT;
5149                 ctxt->checkIndex = 0;
5150 #ifdef DEBUG_PUSH
5151                 xmlGenericError(xmlGenericErrorContext,
5152                         "HPP: entering DTD\n");
5153 #endif
5154                 break;
5155             case XML_PARSER_ATTRIBUTE_VALUE:
5156                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5157                         "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5158                              NULL, NULL);
5159                 ctxt->instate = XML_PARSER_START_TAG;
5160                 ctxt->checkIndex = 0;
5161 #ifdef DEBUG_PUSH
5162                 xmlGenericError(xmlGenericErrorContext,
5163                         "HPP: entering START_TAG\n");
5164 #endif
5165                 break;
5166             case XML_PARSER_SYSTEM_LITERAL:
5167                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5168                     "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5169                              NULL, NULL);
5170                 ctxt->instate = XML_PARSER_CONTENT;
5171                 ctxt->checkIndex = 0;
5172 #ifdef DEBUG_PUSH
5173                 xmlGenericError(xmlGenericErrorContext,
5174                         "HPP: entering CONTENT\n");
5175 #endif
5176                 break;
5177             case XML_PARSER_IGNORE:
5178                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5179                         "HPP: internal error, state == XML_PARSER_IGNORE\n",
5180                              NULL, NULL);
5181                 ctxt->instate = XML_PARSER_CONTENT;
5182                 ctxt->checkIndex = 0;
5183 #ifdef DEBUG_PUSH
5184                 xmlGenericError(xmlGenericErrorContext,
5185                         "HPP: entering CONTENT\n");
5186 #endif
5187                 break;
5188             case XML_PARSER_PUBLIC_LITERAL:
5189                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5190                         "HPP: internal error, state == XML_PARSER_LITERAL\n",
5191                              NULL, NULL);
5192                 ctxt->instate = XML_PARSER_CONTENT;
5193                 ctxt->checkIndex = 0;
5194 #ifdef DEBUG_PUSH
5195                 xmlGenericError(xmlGenericErrorContext,
5196                         "HPP: entering CONTENT\n");
5197 #endif
5198                 break;
5199
5200         }
5201     }
5202 done:
5203     if ((avail == 0) && (terminate)) {
5204         htmlAutoCloseOnEnd(ctxt);
5205         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5206             /*
5207              * SAX: end of the document processing.
5208              */
5209             ctxt->instate = XML_PARSER_EOF;
5210             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5211                 ctxt->sax->endDocument(ctxt->userData);
5212         }
5213     }
5214     if ((ctxt->myDoc != NULL) &&
5215         ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5216          (ctxt->instate == XML_PARSER_EPILOG))) {
5217         xmlDtdPtr dtd;
5218         dtd = xmlGetIntSubset(ctxt->myDoc);
5219         if (dtd == NULL)
5220             ctxt->myDoc->intSubset =
5221                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5222                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5223                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5224     }
5225 #ifdef DEBUG_PUSH
5226     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5227 #endif
5228     return(ret);
5229 }
5230
5231 /**
5232  * htmlParseChunk:
5233  * @ctxt:  an HTML parser context
5234  * @chunk:  an char array
5235  * @size:  the size in byte of the chunk
5236  * @terminate:  last chunk indicator
5237  *
5238  * Parse a Chunk of memory
5239  *
5240  * Returns zero if no error, the xmlParserErrors otherwise.
5241  */
5242 int
5243 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5244               int terminate) {
5245     if ((ctxt == NULL) || (ctxt->input == NULL)) {
5246         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5247                      "htmlParseChunk: context error\n", NULL, NULL);
5248         return(XML_ERR_INTERNAL_ERROR);
5249     }
5250     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5251         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5252         int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5253         int cur = ctxt->input->cur - ctxt->input->base;
5254         int res;
5255
5256         res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5257         if (res < 0) {
5258             ctxt->errNo = XML_PARSER_EOF;
5259             ctxt->disableSAX = 1;
5260             return (XML_PARSER_EOF);
5261         }
5262         ctxt->input->base = ctxt->input->buf->buffer->content + base;
5263         ctxt->input->cur = ctxt->input->base + cur;
5264         ctxt->input->end =
5265           &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5266 #ifdef DEBUG_PUSH
5267         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5268 #endif
5269
5270 #if 0
5271         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5272             htmlParseTryOrFinish(ctxt, terminate);
5273 #endif
5274     } else if (ctxt->instate != XML_PARSER_EOF) {
5275         if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5276             xmlParserInputBufferPtr in = ctxt->input->buf;
5277             if ((in->encoder != NULL) && (in->buffer != NULL) &&
5278                     (in->raw != NULL)) {
5279                 int nbchars;
5280
5281                 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5282                 if (nbchars < 0) {
5283                     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5284                                  "encoder error\n", NULL, NULL);
5285                     return(XML_ERR_INVALID_ENCODING);
5286                 }
5287             }
5288         }
5289     }
5290     htmlParseTryOrFinish(ctxt, terminate);
5291     if (terminate) {
5292         if ((ctxt->instate != XML_PARSER_EOF) &&
5293             (ctxt->instate != XML_PARSER_EPILOG) &&
5294             (ctxt->instate != XML_PARSER_MISC)) {
5295             ctxt->errNo = XML_ERR_DOCUMENT_END;
5296             ctxt->wellFormed = 0;
5297         }
5298         if (ctxt->instate != XML_PARSER_EOF) {
5299             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5300                 ctxt->sax->endDocument(ctxt->userData);
5301         }
5302         ctxt->instate = XML_PARSER_EOF;
5303     }
5304     return((xmlParserErrors) ctxt->errNo);
5305 }
5306
5307 /************************************************************************
5308  *                                                                      *
5309  *                      User entry points                               *
5310  *                                                                      *
5311  ************************************************************************/
5312
5313 /**
5314  * htmlCreatePushParserCtxt:
5315  * @sax:  a SAX handler
5316  * @user_data:  The user data returned on SAX callbacks
5317  * @chunk:  a pointer to an array of chars
5318  * @size:  number of chars in the array
5319  * @filename:  an optional file name or URI
5320  * @enc:  an optional encoding
5321  *
5322  * Create a parser context for using the HTML parser in push mode
5323  * The value of @filename is used for fetching external entities
5324  * and error/warning reports.
5325  *
5326  * Returns the new parser context or NULL
5327  */
5328 htmlParserCtxtPtr
5329 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5330                          const char *chunk, int size, const char *filename,
5331                          xmlCharEncoding enc) {
5332     htmlParserCtxtPtr ctxt;
5333     htmlParserInputPtr inputStream;
5334     xmlParserInputBufferPtr buf;
5335
5336     xmlInitParser();
5337
5338     buf = xmlAllocParserInputBuffer(enc);
5339     if (buf == NULL) return(NULL);
5340
5341     ctxt = htmlNewParserCtxt();
5342     if (ctxt == NULL) {
5343         xmlFreeParserInputBuffer(buf);
5344         return(NULL);
5345     }
5346     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5347         ctxt->charset=XML_CHAR_ENCODING_UTF8;
5348     if (sax != NULL) {
5349         if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
5350             xmlFree(ctxt->sax);
5351         ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5352         if (ctxt->sax == NULL) {
5353             xmlFree(buf);
5354             xmlFree(ctxt);
5355             return(NULL);
5356         }
5357         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5358         if (user_data != NULL)
5359             ctxt->userData = user_data;
5360     }
5361     if (filename == NULL) {
5362         ctxt->directory = NULL;
5363     } else {
5364         ctxt->directory = xmlParserGetDirectory(filename);
5365     }
5366
5367     inputStream = htmlNewInputStream(ctxt);
5368     if (inputStream == NULL) {
5369         xmlFreeParserCtxt(ctxt);
5370         xmlFree(buf);
5371         return(NULL);
5372     }
5373
5374     if (filename == NULL)
5375         inputStream->filename = NULL;
5376     else
5377         inputStream->filename = (char *)
5378             xmlCanonicPath((const xmlChar *) filename);
5379     inputStream->buf = buf;
5380     inputStream->base = inputStream->buf->buffer->content;
5381     inputStream->cur = inputStream->buf->buffer->content;
5382     inputStream->end =
5383         &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5384
5385     inputPush(ctxt, inputStream);
5386
5387     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5388         (ctxt->input->buf != NULL))  {
5389         int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5390         int cur = ctxt->input->cur - ctxt->input->base;
5391
5392         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5393
5394         ctxt->input->base = ctxt->input->buf->buffer->content + base;
5395         ctxt->input->cur = ctxt->input->base + cur;
5396         ctxt->input->end =
5397             &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5398 #ifdef DEBUG_PUSH
5399         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5400 #endif
5401     }
5402     ctxt->progressive = 1;
5403
5404     return(ctxt);
5405 }
5406 #endif /* LIBXML_PUSH_ENABLED */
5407
5408 /**
5409  * htmlSAXParseDoc:
5410  * @cur:  a pointer to an array of xmlChar
5411  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5412  * @sax:  the SAX handler block
5413  * @userData: if using SAX, this pointer will be provided on callbacks.
5414  *
5415  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5416  * to handle parse events. If sax is NULL, fallback to the default DOM
5417  * behavior and return a tree.
5418  *
5419  * Returns the resulting document tree unless SAX is NULL or the document is
5420  *     not well formed.
5421  */
5422
5423 htmlDocPtr
5424 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5425     htmlDocPtr ret;
5426     htmlParserCtxtPtr ctxt;
5427
5428     xmlInitParser();
5429
5430     if (cur == NULL) return(NULL);
5431
5432
5433     ctxt = htmlCreateDocParserCtxt(cur, encoding);
5434     if (ctxt == NULL) return(NULL);
5435     if (sax != NULL) {
5436         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5437         ctxt->sax = sax;
5438         ctxt->userData = userData;
5439     }
5440
5441     htmlParseDocument(ctxt);
5442     ret = ctxt->myDoc;
5443     if (sax != NULL) {
5444         ctxt->sax = NULL;
5445         ctxt->userData = NULL;
5446     }
5447     htmlFreeParserCtxt(ctxt);
5448
5449     return(ret);
5450 }
5451
5452 /**
5453  * htmlParseDoc:
5454  * @cur:  a pointer to an array of xmlChar
5455  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5456  *
5457  * parse an HTML in-memory document and build a tree.
5458  *
5459  * Returns the resulting document tree
5460  */
5461
5462 htmlDocPtr
5463 htmlParseDoc(xmlChar *cur, const char *encoding) {
5464     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5465 }
5466
5467
5468 /**
5469  * htmlCreateFileParserCtxt:
5470  * @filename:  the filename
5471  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5472  *
5473  * Create a parser context for a file content.
5474  * Automatic support for ZLIB/Compress compressed document is provided
5475  * by default if found at compile-time.
5476  *
5477  * Returns the new parser context or NULL
5478  */
5479 htmlParserCtxtPtr
5480 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5481 {
5482     htmlParserCtxtPtr ctxt;
5483     htmlParserInputPtr inputStream;
5484     char *canonicFilename;
5485     /* htmlCharEncoding enc; */
5486     xmlChar *content, *content_line = (xmlChar *) "charset=";
5487
5488     if (filename == NULL)
5489         return(NULL);
5490
5491     ctxt = htmlNewParserCtxt();
5492     if (ctxt == NULL) {
5493         return(NULL);
5494     }
5495     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5496     if (canonicFilename == NULL) {
5497 #ifdef LIBXML_SAX1_ENABLED
5498         if (xmlDefaultSAXHandler.error != NULL) {
5499             xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5500         }
5501 #endif
5502         xmlFreeParserCtxt(ctxt);
5503         return(NULL);
5504     }
5505
5506     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5507     xmlFree(canonicFilename);
5508     if (inputStream == NULL) {
5509         xmlFreeParserCtxt(ctxt);
5510         return(NULL);
5511     }
5512
5513     inputPush(ctxt, inputStream);
5514
5515     /* set encoding */
5516     if (encoding) {
5517         content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5518         if (content) {
5519             strcpy ((char *)content, (char *)content_line);
5520             strcat ((char *)content, (char *)encoding);
5521             htmlCheckEncoding (ctxt, content);
5522             xmlFree (content);
5523         }
5524     }
5525
5526     return(ctxt);
5527 }
5528
5529 /**
5530  * htmlSAXParseFile:
5531  * @filename:  the filename
5532  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5533  * @sax:  the SAX handler block
5534  * @userData: if using SAX, this pointer will be provided on callbacks.
5535  *
5536  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5537  * compressed document is provided by default if found at compile-time.
5538  * It use the given SAX function block to handle the parsing callback.
5539  * If sax is NULL, fallback to the default DOM tree building routines.
5540  *
5541  * Returns the resulting document tree unless SAX is NULL or the document is
5542  *     not well formed.
5543  */
5544
5545 htmlDocPtr
5546 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5547                  void *userData) {
5548     htmlDocPtr ret;
5549     htmlParserCtxtPtr ctxt;
5550     htmlSAXHandlerPtr oldsax = NULL;
5551
5552     xmlInitParser();
5553
5554     ctxt = htmlCreateFileParserCtxt(filename, encoding);
5555     if (ctxt == NULL) return(NULL);
5556     if (sax != NULL) {
5557         oldsax = ctxt->sax;
5558         ctxt->sax = sax;
5559         ctxt->userData = userData;
5560     }
5561
5562     htmlParseDocument(ctxt);
5563
5564     ret = ctxt->myDoc;
5565     if (sax != NULL) {
5566         ctxt->sax = oldsax;
5567         ctxt->userData = NULL;
5568     }
5569     htmlFreeParserCtxt(ctxt);
5570
5571     return(ret);
5572 }
5573
5574 /**
5575  * htmlParseFile:
5576  * @filename:  the filename
5577  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5578  *
5579  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5580  * compressed document is provided by default if found at compile-time.
5581  *
5582  * Returns the resulting document tree
5583  */
5584
5585 htmlDocPtr
5586 htmlParseFile(const char *filename, const char *encoding) {
5587     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5588 }
5589
5590 /**
5591  * htmlHandleOmittedElem:
5592  * @val:  int 0 or 1
5593  *
5594  * Set and return the previous value for handling HTML omitted tags.
5595  *
5596  * Returns the last value for 0 for no handling, 1 for auto insertion.
5597  */
5598
5599 int
5600 htmlHandleOmittedElem(int val) {
5601     int old = htmlOmittedDefaultValue;
5602
5603     htmlOmittedDefaultValue = val;
5604     return(old);
5605 }
5606
5607 /**
5608  * htmlElementAllowedHere:
5609  * @parent: HTML parent element
5610  * @elt: HTML element
5611  *
5612  * Checks whether an HTML element may be a direct child of a parent element.
5613  * Note - doesn't check for deprecated elements
5614  *
5615  * Returns 1 if allowed; 0 otherwise.
5616  */
5617 int
5618 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5619   const char** p ;
5620
5621   if ( ! elt || ! parent || ! parent->subelts )
5622         return 0 ;
5623
5624   for ( p = parent->subelts; *p; ++p )
5625     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5626       return 1 ;
5627
5628   return 0 ;
5629 }
5630 /**
5631  * htmlElementStatusHere:
5632  * @parent: HTML parent element
5633  * @elt: HTML element
5634  *
5635  * Checks whether an HTML element may be a direct child of a parent element.
5636  * and if so whether it is valid or deprecated.
5637  *
5638  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5639  */
5640 htmlStatus
5641 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5642   if ( ! parent || ! elt )
5643     return HTML_INVALID ;
5644   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5645     return HTML_INVALID ;
5646
5647   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5648 }
5649 /**
5650  * htmlAttrAllowed:
5651  * @elt: HTML element
5652  * @attr: HTML attribute
5653  * @legacy: whether to allow deprecated attributes
5654  *
5655  * Checks whether an attribute is valid for an element
5656  * Has full knowledge of Required and Deprecated attributes
5657  *
5658  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5659  */
5660 htmlStatus
5661 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5662   const char** p ;
5663
5664   if ( !elt || ! attr )
5665         return HTML_INVALID ;
5666
5667   if ( elt->attrs_req )
5668     for ( p = elt->attrs_req; *p; ++p)
5669       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5670         return HTML_REQUIRED ;
5671
5672   if ( elt->attrs_opt )
5673     for ( p = elt->attrs_opt; *p; ++p)
5674       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5675         return HTML_VALID ;
5676
5677   if ( legacy && elt->attrs_depr )
5678     for ( p = elt->attrs_depr; *p; ++p)
5679       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5680         return HTML_DEPRECATED ;
5681
5682   return HTML_INVALID ;
5683 }
5684 /**
5685  * htmlNodeStatus:
5686  * @node: an htmlNodePtr in a tree
5687  * @legacy: whether to allow deprecated elements (YES is faster here
5688  *      for Element nodes)
5689  *
5690  * Checks whether the tree node is valid.  Experimental (the author
5691  *     only uses the HTML enhancements in a SAX parser)
5692  *
5693  * Return: for Element nodes, a return from htmlElementAllowedHere (if
5694  *      legacy allowed) or htmlElementStatusHere (otherwise).
5695  *      for Attribute nodes, a return from htmlAttrAllowed
5696  *      for other nodes, HTML_NA (no checks performed)
5697  */
5698 htmlStatus
5699 htmlNodeStatus(const htmlNodePtr node, int legacy) {
5700   if ( ! node )
5701     return HTML_INVALID ;
5702
5703   switch ( node->type ) {
5704     case XML_ELEMENT_NODE:
5705       return legacy
5706         ? ( htmlElementAllowedHere (
5707                 htmlTagLookup(node->parent->name) , node->name
5708                 ) ? HTML_VALID : HTML_INVALID )
5709         : htmlElementStatusHere(
5710                 htmlTagLookup(node->parent->name) ,
5711                 htmlTagLookup(node->name) )
5712         ;
5713     case XML_ATTRIBUTE_NODE:
5714       return htmlAttrAllowed(
5715         htmlTagLookup(node->parent->name) , node->name, legacy) ;
5716     default: return HTML_NA ;
5717   }
5718 }
5719 /************************************************************************
5720  *                                                                      *
5721  *      New set (2.6.0) of simpler and more flexible APIs               *
5722  *                                                                      *
5723  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A variable named dict must be visible where the
 * macro is used.  Nothing is done when @str is NULL.
 *
 * The expansion is a single statement (do/while(0)), so the macro
 * must be followed by a semicolon and is safe inside if/else.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
		(xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \
		xmlFree((char *)(str));				\
	} while (0)
5735
5736 /**
5737  * htmlCtxtReset:
5738  * @ctxt: an HTML parser context
5739  *
5740  * Reset a parser context
5741  */
5742 void
5743 htmlCtxtReset(htmlParserCtxtPtr ctxt)
5744 {
5745     xmlParserInputPtr input;
5746     xmlDictPtr dict;
5747
5748     if (ctxt == NULL)
5749         return;
5750
5751     xmlInitParser();
5752     dict = ctxt->dict;
5753
5754     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5755         xmlFreeInputStream(input);
5756     }
5757     ctxt->inputNr = 0;
5758     ctxt->input = NULL;
5759
5760     ctxt->spaceNr = 0;
5761     if (ctxt->spaceTab != NULL) {
5762         ctxt->spaceTab[0] = -1;
5763         ctxt->space = &ctxt->spaceTab[0];
5764     } else {
5765         ctxt->space = NULL;
5766     }
5767
5768
5769     ctxt->nodeNr = 0;
5770     ctxt->node = NULL;
5771
5772     ctxt->nameNr = 0;
5773     ctxt->name = NULL;
5774
5775     DICT_FREE(ctxt->version);
5776     ctxt->version = NULL;
5777     DICT_FREE(ctxt->encoding);
5778     ctxt->encoding = NULL;
5779     DICT_FREE(ctxt->directory);
5780     ctxt->directory = NULL;
5781     DICT_FREE(ctxt->extSubURI);
5782     ctxt->extSubURI = NULL;
5783     DICT_FREE(ctxt->extSubSystem);
5784     ctxt->extSubSystem = NULL;
5785     if (ctxt->myDoc != NULL)
5786         xmlFreeDoc(ctxt->myDoc);
5787     ctxt->myDoc = NULL;
5788
5789     ctxt->standalone = -1;
5790     ctxt->hasExternalSubset = 0;
5791     ctxt->hasPErefs = 0;
5792     ctxt->html = 1;
5793     ctxt->external = 0;
5794     ctxt->instate = XML_PARSER_START;
5795     ctxt->token = 0;
5796
5797     ctxt->wellFormed = 1;
5798     ctxt->nsWellFormed = 1;
5799     ctxt->valid = 1;
5800     ctxt->vctxt.userData = ctxt;
5801     ctxt->vctxt.error = xmlParserValidityError;
5802     ctxt->vctxt.warning = xmlParserValidityWarning;
5803     ctxt->record_info = 0;
5804     ctxt->nbChars = 0;
5805     ctxt->checkIndex = 0;
5806     ctxt->inSubset = 0;
5807     ctxt->errNo = XML_ERR_OK;
5808     ctxt->depth = 0;
5809     ctxt->charset = XML_CHAR_ENCODING_UTF8;
5810     ctxt->catalogs = NULL;
5811     xmlInitNodeInfoSeq(&ctxt->node_seq);
5812
5813     if (ctxt->attsDefault != NULL) {
5814         xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5815         ctxt->attsDefault = NULL;
5816     }
5817     if (ctxt->attsSpecial != NULL) {
5818         xmlHashFree(ctxt->attsSpecial, NULL);
5819         ctxt->attsSpecial = NULL;
5820     }
5821 }
5822
5823 /**
5824  * htmlCtxtUseOptions:
5825  * @ctxt: an HTML parser context
5826  * @options:  a combination of htmlParserOption(s)
5827  *
5828  * Applies the options to the parser context
5829  *
5830  * Returns 0 in case of success, the set of unknown or unimplemented options
5831  *         in case of error.
5832  */
5833 int
5834 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5835 {
5836     if (ctxt == NULL)
5837         return(-1);
5838
5839     if (options & HTML_PARSE_NOWARNING) {
5840         ctxt->sax->warning = NULL;
5841         ctxt->vctxt.warning = NULL;
5842         options -= XML_PARSE_NOWARNING;
5843         ctxt->options |= XML_PARSE_NOWARNING;
5844     }
5845     if (options & HTML_PARSE_NOERROR) {
5846         ctxt->sax->error = NULL;
5847         ctxt->vctxt.error = NULL;
5848         ctxt->sax->fatalError = NULL;
5849         options -= XML_PARSE_NOERROR;
5850         ctxt->options |= XML_PARSE_NOERROR;
5851     }
5852     if (options & HTML_PARSE_PEDANTIC) {
5853         ctxt->pedantic = 1;
5854         options -= XML_PARSE_PEDANTIC;
5855         ctxt->options |= XML_PARSE_PEDANTIC;
5856     } else
5857         ctxt->pedantic = 0;
5858     if (options & XML_PARSE_NOBLANKS) {
5859         ctxt->keepBlanks = 0;
5860         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5861         options -= XML_PARSE_NOBLANKS;
5862         ctxt->options |= XML_PARSE_NOBLANKS;
5863     } else
5864         ctxt->keepBlanks = 1;
5865     if (options & HTML_PARSE_RECOVER) {
5866         ctxt->recovery = 1;
5867         options -= HTML_PARSE_RECOVER;
5868     } else
5869         ctxt->recovery = 0;
5870     if (options & HTML_PARSE_COMPACT) {
5871         ctxt->options |= HTML_PARSE_COMPACT;
5872         options -= HTML_PARSE_COMPACT;
5873     }
5874     ctxt->dictNames = 0;
5875     return (options);
5876 }
5877
5878 /**
5879  * htmlDoRead:
5880  * @ctxt:  an HTML parser context
5881  * @URL:  the base URL to use for the document
5882  * @encoding:  the document encoding, or NULL
5883  * @options:  a combination of htmlParserOption(s)
5884  * @reuse:  keep the context for reuse
5885  *
5886  * Common front-end for the htmlRead functions
5887  *
5888  * Returns the resulting document tree or NULL
5889  */
5890 static htmlDocPtr
5891 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5892           int options, int reuse)
5893 {
5894     htmlDocPtr ret;
5895
5896     htmlCtxtUseOptions(ctxt, options);
5897     ctxt->html = 1;
5898     if (encoding != NULL) {
5899         xmlCharEncodingHandlerPtr hdlr;
5900
5901         hdlr = xmlFindCharEncodingHandler(encoding);
5902         if (hdlr != NULL)
5903             xmlSwitchToEncoding(ctxt, hdlr);
5904     }
5905     if ((URL != NULL) && (ctxt->input != NULL) &&
5906         (ctxt->input->filename == NULL))
5907         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5908     htmlParseDocument(ctxt);
5909     ret = ctxt->myDoc;
5910     ctxt->myDoc = NULL;
5911     if (!reuse) {
5912         if ((ctxt->dictNames) &&
5913             (ret != NULL) &&
5914             (ret->dict == ctxt->dict))
5915             ctxt->dict = NULL;
5916         xmlFreeParserCtxt(ctxt);
5917     }
5918     return (ret);
5919 }
5920
5921 /**
5922  * htmlReadDoc:
5923  * @cur:  a pointer to a zero terminated string
5924  * @URL:  the base URL to use for the document
5925  * @encoding:  the document encoding, or NULL
5926  * @options:  a combination of htmlParserOption(s)
5927  *
5928  * parse an XML in-memory document and build a tree.
5929  *
5930  * Returns the resulting document tree
5931  */
5932 htmlDocPtr
5933 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5934 {
5935     htmlParserCtxtPtr ctxt;
5936
5937     if (cur == NULL)
5938         return (NULL);
5939
5940     xmlInitParser();
5941     ctxt = htmlCreateDocParserCtxt(cur, NULL);
5942     if (ctxt == NULL)
5943         return (NULL);
5944     return (htmlDoRead(ctxt, URL, encoding, options, 0));
5945 }
5946
5947 /**
5948  * htmlReadFile:
5949  * @filename:  a file or URL
5950  * @encoding:  the document encoding, or NULL
5951  * @options:  a combination of htmlParserOption(s)
5952  *
5953  * parse an XML file from the filesystem or the network.
5954  *
5955  * Returns the resulting document tree
5956  */
5957 htmlDocPtr
5958 htmlReadFile(const char *filename, const char *encoding, int options)
5959 {
5960     htmlParserCtxtPtr ctxt;
5961
5962     xmlInitParser();
5963     ctxt = htmlCreateFileParserCtxt(filename, encoding);
5964     if (ctxt == NULL)
5965         return (NULL);
5966     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5967 }
5968
5969 /**
5970  * htmlReadMemory:
5971  * @buffer:  a pointer to a char array
5972  * @size:  the size of the array
5973  * @URL:  the base URL to use for the document
5974  * @encoding:  the document encoding, or NULL
5975  * @options:  a combination of htmlParserOption(s)
5976  *
5977  * parse an XML in-memory document and build a tree.
5978  *
5979  * Returns the resulting document tree
5980  */
5981 htmlDocPtr
5982 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5983 {
5984     htmlParserCtxtPtr ctxt;
5985
5986     xmlInitParser();
5987     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5988     if (ctxt == NULL)
5989         return (NULL);
5990     htmlDefaultSAXHandlerInit();
5991     if (ctxt->sax != NULL)
5992         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5993     return (htmlDoRead(ctxt, URL, encoding, options, 0));
5994 }
5995
5996 /**
5997  * htmlReadFd:
5998  * @fd:  an open file descriptor
5999  * @URL:  the base URL to use for the document
6000  * @encoding:  the document encoding, or NULL
6001  * @options:  a combination of htmlParserOption(s)
6002  *
6003  * parse an XML from a file descriptor and build a tree.
6004  *
6005  * Returns the resulting document tree
6006  */
6007 htmlDocPtr
6008 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6009 {
6010     htmlParserCtxtPtr ctxt;
6011     xmlParserInputBufferPtr input;
6012     xmlParserInputPtr stream;
6013
6014     if (fd < 0)
6015         return (NULL);
6016
6017     xmlInitParser();
6018     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6019     if (input == NULL)
6020         return (NULL);
6021     ctxt = xmlNewParserCtxt();
6022     if (ctxt == NULL) {
6023         xmlFreeParserInputBuffer(input);
6024         return (NULL);
6025     }
6026     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6027     if (stream == NULL) {
6028         xmlFreeParserInputBuffer(input);
6029         xmlFreeParserCtxt(ctxt);
6030         return (NULL);
6031     }
6032     inputPush(ctxt, stream);
6033     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6034 }
6035
6036 /**
6037  * htmlReadIO:
6038  * @ioread:  an I/O read function
6039  * @ioclose:  an I/O close function
6040  * @ioctx:  an I/O handler
6041  * @URL:  the base URL to use for the document
6042  * @encoding:  the document encoding, or NULL
6043  * @options:  a combination of htmlParserOption(s)
6044  *
6045  * parse an HTML document from I/O functions and source and build a tree.
6046  *
6047  * Returns the resulting document tree
6048  */
6049 htmlDocPtr
6050 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6051           void *ioctx, const char *URL, const char *encoding, int options)
6052 {
6053     htmlParserCtxtPtr ctxt;
6054     xmlParserInputBufferPtr input;
6055     xmlParserInputPtr stream;
6056
6057     if (ioread == NULL)
6058         return (NULL);
6059     xmlInitParser();
6060
6061     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6062                                          XML_CHAR_ENCODING_NONE);
6063     if (input == NULL)
6064         return (NULL);
6065     ctxt = htmlNewParserCtxt();
6066     if (ctxt == NULL) {
6067         xmlFreeParserInputBuffer(input);
6068         return (NULL);
6069     }
6070     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6071     if (stream == NULL) {
6072         xmlFreeParserInputBuffer(input);
6073         xmlFreeParserCtxt(ctxt);
6074         return (NULL);
6075     }
6076     inputPush(ctxt, stream);
6077     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6078 }
6079
6080 /**
6081  * htmlCtxtReadDoc:
6082  * @ctxt:  an HTML parser context
6083  * @cur:  a pointer to a zero terminated string
6084  * @URL:  the base URL to use for the document
6085  * @encoding:  the document encoding, or NULL
6086  * @options:  a combination of htmlParserOption(s)
6087  *
6088  * parse an XML in-memory document and build a tree.
6089  * This reuses the existing @ctxt parser context
6090  *
6091  * Returns the resulting document tree
6092  */
6093 htmlDocPtr
6094 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6095                const char *URL, const char *encoding, int options)
6096 {
6097     xmlParserInputPtr stream;
6098
6099     if (cur == NULL)
6100         return (NULL);
6101     if (ctxt == NULL)
6102         return (NULL);
6103
6104     htmlCtxtReset(ctxt);
6105
6106     stream = xmlNewStringInputStream(ctxt, cur);
6107     if (stream == NULL) {
6108         return (NULL);
6109     }
6110     inputPush(ctxt, stream);
6111     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6112 }
6113
6114 /**
6115  * htmlCtxtReadFile:
6116  * @ctxt:  an HTML parser context
6117  * @filename:  a file or URL
6118  * @encoding:  the document encoding, or NULL
6119  * @options:  a combination of htmlParserOption(s)
6120  *
6121  * parse an XML file from the filesystem or the network.
6122  * This reuses the existing @ctxt parser context
6123  *
6124  * Returns the resulting document tree
6125  */
6126 htmlDocPtr
6127 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6128                 const char *encoding, int options)
6129 {
6130     xmlParserInputPtr stream;
6131
6132     if (filename == NULL)
6133         return (NULL);
6134     if (ctxt == NULL)
6135         return (NULL);
6136
6137     htmlCtxtReset(ctxt);
6138
6139     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6140     if (stream == NULL) {
6141         return (NULL);
6142     }
6143     inputPush(ctxt, stream);
6144     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6145 }
6146
6147 /**
6148  * htmlCtxtReadMemory:
6149  * @ctxt:  an HTML parser context
6150  * @buffer:  a pointer to a char array
6151  * @size:  the size of the array
6152  * @URL:  the base URL to use for the document
6153  * @encoding:  the document encoding, or NULL
6154  * @options:  a combination of htmlParserOption(s)
6155  *
6156  * parse an XML in-memory document and build a tree.
6157  * This reuses the existing @ctxt parser context
6158  *
6159  * Returns the resulting document tree
6160  */
6161 htmlDocPtr
6162 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6163                   const char *URL, const char *encoding, int options)
6164 {
6165     xmlParserInputBufferPtr input;
6166     xmlParserInputPtr stream;
6167
6168     if (ctxt == NULL)
6169         return (NULL);
6170     if (buffer == NULL)
6171         return (NULL);
6172
6173     htmlCtxtReset(ctxt);
6174
6175     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6176     if (input == NULL) {
6177         return(NULL);
6178     }
6179
6180     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6181     if (stream == NULL) {
6182         xmlFreeParserInputBuffer(input);
6183         return(NULL);
6184     }
6185
6186     inputPush(ctxt, stream);
6187     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6188 }
6189
6190 /**
6191  * htmlCtxtReadFd:
6192  * @ctxt:  an HTML parser context
6193  * @fd:  an open file descriptor
6194  * @URL:  the base URL to use for the document
6195  * @encoding:  the document encoding, or NULL
6196  * @options:  a combination of htmlParserOption(s)
6197  *
6198  * parse an XML from a file descriptor and build a tree.
6199  * This reuses the existing @ctxt parser context
6200  *
6201  * Returns the resulting document tree
6202  */
6203 htmlDocPtr
6204 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6205               const char *URL, const char *encoding, int options)
6206 {
6207     xmlParserInputBufferPtr input;
6208     xmlParserInputPtr stream;
6209
6210     if (fd < 0)
6211         return (NULL);
6212     if (ctxt == NULL)
6213         return (NULL);
6214
6215     htmlCtxtReset(ctxt);
6216
6217
6218     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6219     if (input == NULL)
6220         return (NULL);
6221     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6222     if (stream == NULL) {
6223         xmlFreeParserInputBuffer(input);
6224         return (NULL);
6225     }
6226     inputPush(ctxt, stream);
6227     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6228 }
6229
6230 /**
6231  * htmlCtxtReadIO:
6232  * @ctxt:  an HTML parser context
6233  * @ioread:  an I/O read function
6234  * @ioclose:  an I/O close function
6235  * @ioctx:  an I/O handler
6236  * @URL:  the base URL to use for the document
6237  * @encoding:  the document encoding, or NULL
6238  * @options:  a combination of htmlParserOption(s)
6239  *
6240  * parse an HTML document from I/O functions and source and build a tree.
6241  * This reuses the existing @ctxt parser context
6242  *
6243  * Returns the resulting document tree
6244  */
6245 htmlDocPtr
6246 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6247               xmlInputCloseCallback ioclose, void *ioctx,
6248               const char *URL,
6249               const char *encoding, int options)
6250 {
6251     xmlParserInputBufferPtr input;
6252     xmlParserInputPtr stream;
6253
6254     if (ioread == NULL)
6255         return (NULL);
6256     if (ctxt == NULL)
6257         return (NULL);
6258
6259     htmlCtxtReset(ctxt);
6260
6261     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6262                                          XML_CHAR_ENCODING_NONE);
6263     if (input == NULL)
6264         return (NULL);
6265     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6266     if (stream == NULL) {
6267         xmlFreeParserInputBuffer(input);
6268         return (NULL);
6269     }
6270     inputPush(ctxt, stream);
6271     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6272 }
6273
6274 #define bottom_HTMLparser
6275 #include "elfgcchack.h"
6276 #endif /* LIBXML_HTML_ENABLED */