HTMLparser.c

   1 /*
   2  * HTMLparser.c : an HTML 4.0 non-verifying parser
   3  *
   4  * See Copyright for the status of this software.
   5  *
   6  * daniel@veillard.com
   7  */
   8
   9 #define IN_LIBXML
  10 #include "libxml.h"
  11 #ifdef LIBXML_HTML_ENABLED
  12
  13 #include <string.h>
  14 #ifdef HAVE_CTYPE_H
  15 #include <ctype.h>
  16 #endif
  17 #ifdef HAVE_STDLIB_H
  18 #include <stdlib.h>
  19 #endif
  20 #ifdef HAVE_SYS_STAT_H
  21 #include <sys/stat.h>
  22 #endif
  23 #ifdef HAVE_FCNTL_H
  24 #include <fcntl.h>
  25 #endif
  26 #ifdef HAVE_UNISTD_H
  27 #include <unistd.h>
  28 #endif
  29 #ifdef HAVE_ZLIB_H
  30 #include <zlib.h>
  31 #endif
  32
  33 #include <libxml/xmlmemory.h>
  34 #include <libxml/tree.h>
  35 #include <libxml/parser.h>
  36 #include <libxml/parserInternals.h>
  37 #include <libxml/xmlerror.h>
  38 #include <libxml/HTMLparser.h>
  39 #include <libxml/HTMLtree.h>
  40 #include <libxml/entities.h>
  41 #include <libxml/encoding.h>
  42 #include <libxml/valid.h>
  43 #include <libxml/xmlIO.h>
  44 #include <libxml/globals.h>
  45 #include <libxml/uri.h>
  46
  47 #define HTML_MAX_NAMELEN 1000
  48 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
  49 #define HTML_PARSER_BUFFER_SIZE 100
  50
  51 /* #define DEBUG */
  52 /* #define DEBUG_PUSH */
  53
  54 static int htmlOmittedDefaultValue = 1;
  55
  56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
  57                              xmlChar end, xmlChar  end2, xmlChar end3);
  58 static void htmlParseComment(htmlParserCtxtPtr ctxt);
  59
  60 /************************************************************************
  61  *                                                                      *
  62  *              Some factorized error routines                          *
  63  *                                                                      *
  64  ************************************************************************/
  65
  66 /**
  67  * htmlErrMemory:
  68  * @ctxt:  an HTML parser context
  69  * @extra:  extra informations
  70  *
  71  * Handle a redefinition of attribute error
  72  */
  73 static void
  74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  75 {
  76     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  77         (ctxt->instate == XML_PARSER_EOF))
  78         return;
  79     if (ctxt != NULL) {
  80         ctxt->errNo = XML_ERR_NO_MEMORY;
  81         ctxt->instate = XML_PARSER_EOF;
  82         ctxt->disableSAX = 1;
  83     }
  84     if (extra)
  85         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  86                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  87                         NULL, NULL, 0, 0,
  88                         "Memory allocation failed : %s\n", extra);
  89     else
  90         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  91                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  92                         NULL, NULL, 0, 0, "Memory allocation failed\n");
  93 }
  94
  95 /**
  96  * htmlParseErr:
  97  * @ctxt:  an HTML parser context
  98  * @error:  the error number
  99  * @msg:  the error message
 100  * @str1:  string infor
 101  * @str2:  string infor
 102  *
 103  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 104  */
 105 static void
 106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 107              const char *msg, const xmlChar *str1, const xmlChar *str2)
 108 {
 109     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 110         (ctxt->instate == XML_PARSER_EOF))
 111         return;
 112     if (ctxt != NULL)
 113         ctxt->errNo = error;
 114     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 115                     XML_ERR_ERROR, NULL, 0,
 116                     (const char *) str1, (const char *) str2,
 117                     NULL, 0, 0,
 118                     msg, str1, str2);
 119     if (ctxt != NULL)
 120         ctxt->wellFormed = 0;
 121 }
 122
 123 /**
 124  * htmlParseErrInt:
 125  * @ctxt:  an HTML parser context
 126  * @error:  the error number
 127  * @msg:  the error message
 128  * @val:  integer info
 129  *
 130  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 131  */
 132 static void
 133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 134              const char *msg, int val)
 135 {
 136     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 137         (ctxt->instate == XML_PARSER_EOF))
 138         return;
 139     if (ctxt != NULL)
 140         ctxt->errNo = error;
 141     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 142                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
 143                     NULL, val, 0, msg, val);
 144     if (ctxt != NULL)
 145         ctxt->wellFormed = 0;
 146 }
 147
 148 /************************************************************************
 149  *                                                                      *
 150  *      Parser stacks related functions and macros              *
 151  *                                                                      *
 152  ************************************************************************/
 153
 154 /**
 155  * htmlnamePush:
 156  * @ctxt:  an HTML parser context
 157  * @value:  the element name
 158  *
 159  * Pushes a new element name on top of the name stack
 160  *
 161  * Returns 0 in case of error, the index in the stack otherwise
 162  */
 163 static int
 164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
 165 {
 166     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
 167         ctxt->html = 3;
 168     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
 169         ctxt->html = 10;
 170     if (ctxt->nameNr >= ctxt->nameMax) {
 171         ctxt->nameMax *= 2;
 172         ctxt->nameTab = (const xmlChar * *)
 173                          xmlRealloc((xmlChar * *)ctxt->nameTab,
 174                                     ctxt->nameMax *
 175                                     sizeof(ctxt->nameTab[0]));
 176         if (ctxt->nameTab == NULL) {
 177             htmlErrMemory(ctxt, NULL);
 178             return (0);
 179         }
 180     }
 181     ctxt->nameTab[ctxt->nameNr] = value;
 182     ctxt->name = value;
 183     return (ctxt->nameNr++);
 184 }
 185 /**
 186  * htmlnamePop:
 187  * @ctxt: an HTML parser context
 188  *
 189  * Pops the top element name from the name stack
 190  *
 191  * Returns the name just removed
 192  */
 193 static const xmlChar *
 194 htmlnamePop(htmlParserCtxtPtr ctxt)
 195 {
 196     const xmlChar *ret;
 197
 198     if (ctxt->nameNr <= 0)
 199         return (NULL);
 200     ctxt->nameNr--;
 201     if (ctxt->nameNr < 0)
 202         return (NULL);
 203     if (ctxt->nameNr > 0)
 204         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
 205     else
 206         ctxt->name = NULL;
 207     ret = ctxt->nameTab[ctxt->nameNr];
 208     ctxt->nameTab[ctxt->nameNr] = NULL;
 209     return (ret);
 210 }
 211
 212 /**
 213  * htmlNodeInfoPush:
 214  * @ctxt:  an HTML parser context
 215  * @value:  the node info
 216  *
 217  * Pushes a new element name on top of the node info stack
 218  *
 219  * Returns 0 in case of error, the index in the stack otherwise
 220  */
 221 static int
 222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
 223 {
 224     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
 225         if (ctxt->nodeInfoMax == 0)
 226                 ctxt->nodeInfoMax = 5;
 227         ctxt->nodeInfoMax *= 2;
 228         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
 229                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
 230                                     ctxt->nodeInfoMax *
 231                                     sizeof(ctxt->nodeInfoTab[0]));
 232         if (ctxt->nodeInfoTab == NULL) {
 233             htmlErrMemory(ctxt, NULL);
 234             return (0);
 235         }
 236     }
 237     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
 238     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 239     return (ctxt->nodeInfoNr++);
 240 }
 241
 242 /**
 243  * htmlNodeInfoPop:
 244  * @ctxt:  an HTML parser context
 245  *
 246  * Pops the top element name from the node info stack
 247  *
 248  * Returns 0 in case of error, the pointer to NodeInfo otherwise
 249  */
 250 static htmlParserNodeInfo *
 251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
 252 {
 253     if (ctxt->nodeInfoNr <= 0)
 254         return (NULL);
 255     ctxt->nodeInfoNr--;
 256     if (ctxt->nodeInfoNr < 0)
 257         return (NULL);
 258     if (ctxt->nodeInfoNr > 0)
 259         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
 260     else
 261         ctxt->nodeInfo = NULL;
 262     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 263 }
 264
 265 /*
 266  * Macros for accessing the content. Those should be used only by the parser,
 267  * and not exported.
 268  *
 269  * Dirty macros, i.e. one need to make assumption on the context to use them
 270  *
 271  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 272  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 273  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 274  *           in UNICODE mode. This should be used internally by the parser
 275  *           only to compare to ASCII values otherwise it would break when
 276  *           running with UTF-8 encoding.
 277  *   NXT(n)  returns the n'th next xmlChar. Same as CUR is should be used only
 278  *           to compare on ASCII based substring.
 279  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 280  *           it should be used only to compare on ASCII based substring.
 281  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 282  *           strings without newlines within the parser.
 283  *
 284  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 285  *
 286  *   CURRENT Returns the current char value, with the full decoding of
 287  *           UTF-8 if we are using this mode. It returns an int.
 288  *   NEXT    Skip to the next character, this does the proper decoding
 289  *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 290  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 291  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 292  */
 293
 294 #define UPPER (toupper(*ctxt->input->cur))
 295
 296 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
 297
 298 #define NXT(val) ctxt->input->cur[(val)]
 299
 300 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
 301
 302 #define CUR_PTR ctxt->input->cur
 303
 304 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
 305                    (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
 306         xmlParserInputShrink(ctxt->input)
 307
 308 #define GROW if ((ctxt->progressive == 0) &&                            \
 309                  (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))   \
 310         xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
 311
 312 #define CURRENT ((int) (*ctxt->input->cur))
 313
 314 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
 315
 316 /* Inported from XML */
 317
 318 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
 319 #define CUR ((int) (*ctxt->input->cur))
 320 #define NEXT xmlNextChar(ctxt)
 321
 322 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
 323
 324
 325 #define NEXTL(l) do {                                                   \
 326     if (*(ctxt->input->cur) == '\n') {                                  \
 327         ctxt->input->line++; ctxt->input->col = 1;                      \
 328     } else ctxt->input->col++;                                          \
 329     ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;            \
 330   } while (0)
 331
 332 /************
 333     \
 334     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
 335     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 336  ************/
 337
 338 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
 339 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
 340
 341 #define COPY_BUF(l,b,i,v)                                               \
 342     if (l == 1) b[i++] = (xmlChar) v;                                   \
 343     else i += xmlCopyChar(l,&b[i],v)
 344
 345 /**
 346  * htmlFindEncoding:
 347  * @the HTML parser context
 348  *
 349  * Ty to find and encoding in the current data available in the input
 350  * buffer this is needed to try to switch to the proper encoding when
 351  * one face a character error.
 352  * That's an heuristic, since it's operating outside of parsing it could
 353  * try to use a meta which had been commented out, that's the reason it
 354  * should only be used in case of error, not as a default.
 355  *
 356  * Returns an encoding string or NULL if not found, the string need to
 357  *   be freed
 358  */
 359 static xmlChar *
 360 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
 361     const xmlChar *start, *cur, *end;
 362
 363     if ((ctxt == NULL) || (ctxt->input == NULL) ||
 364         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
 365         (ctxt->input->buf->encoder != NULL))
 366         return(NULL);
 367     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
 368         return(NULL);
 369
 370     start = ctxt->input->cur;
 371     end = ctxt->input->end;
 372     /* we also expect the input buffer to be zero terminated */
 373     if (*end != 0)
 374         return(NULL);
 375
 376     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
 377     if (cur == NULL)
 378         return(NULL);
 379     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
 380     if (cur == NULL)
 381         return(NULL);
 382     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
 383     if (cur == NULL)
 384         return(NULL);
 385     cur += 8;
 386     start = cur;
 387     while (((*cur >= 'A') && (*cur <= 'Z')) ||
 388            ((*cur >= 'a') && (*cur <= 'z')) ||
 389            ((*cur >= '0') && (*cur <= '9')) ||
 390            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
 391            cur++;
 392     if (cur == start)
 393         return(NULL);
 394     return(xmlStrndup(start, cur - start));
 395 }
 396
 397 /**
 398  * htmlCurrentChar:
 399  * @ctxt:  the HTML parser context
 400  * @len:  pointer to the length of the char read
 401  *
 402  * The current char value, if using UTF-8 this may actually span multiple
 403  * bytes in the input buffer. Implement the end of line normalization:
 404  * 2.11 End-of-Line Handling
 405  * If the encoding is unspecified, in the case we find an ISO-Latin-1
 406  * char, then the encoding converter is plugged in automatically.
 407  *
 408  * Returns the current char value and its length
 409  */
 410
 411 static int
 412 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
 413     if (ctxt->instate == XML_PARSER_EOF)
 414         return(0);
 415
 416     if (ctxt->token != 0) {
 417         *len = 0;
 418         return(ctxt->token);
 419     }
 420     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
 421         /*
 422          * We are supposed to handle UTF8, check it's valid
 423          * From rfc2044: encoding of the Unicode values on UTF-8:
 424          *
 425          * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 426          * 0000 0000-0000 007F   0xxxxxxx
 427          * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 428          * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 429          *
 430          * Check for the 0x110000 limit too
 431          */
 432         const unsigned char *cur = ctxt->input->cur;
 433         unsigned char c;
 434         unsigned int val;
 435
 436         c = *cur;
 437         if (c & 0x80) {
 438             if (cur[1] == 0) {
 439                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 440                 cur = ctxt->input->cur;
 441             }
 442             if ((cur[1] & 0xc0) != 0x80)
 443                 goto encoding_error;
 444             if ((c & 0xe0) == 0xe0) {
 445
 446                 if (cur[2] == 0) {
 447                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 448                     cur = ctxt->input->cur;
 449                 }
 450                 if ((cur[2] & 0xc0) != 0x80)
 451                     goto encoding_error;
 452                 if ((c & 0xf0) == 0xf0) {
 453                     if (cur[3] == 0) {
 454                         xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 455                         cur = ctxt->input->cur;
 456                     }
 457                     if (((c & 0xf8) != 0xf0) ||
 458                         ((cur[3] & 0xc0) != 0x80))
 459                         goto encoding_error;
 460                     /* 4-byte code */
 461                     *len = 4;
 462                     val = (cur[0] & 0x7) << 18;
 463                     val |= (cur[1] & 0x3f) << 12;
 464                     val |= (cur[2] & 0x3f) << 6;
 465                     val |= cur[3] & 0x3f;
 466                 } else {
 467                   /* 3-byte code */
 468                     *len = 3;
 469                     val = (cur[0] & 0xf) << 12;
 470                     val |= (cur[1] & 0x3f) << 6;
 471                     val |= cur[2] & 0x3f;
 472                 }
 473             } else {
 474               /* 2-byte code */
 475                 *len = 2;
 476                 val = (cur[0] & 0x1f) << 6;
 477                 val |= cur[1] & 0x3f;
 478             }
 479             if (!IS_CHAR(val)) {
 480                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 481                                 "Char 0x%X out of allowed range\n", val);
 482             }
 483             return(val);
 484         } else {
 485             if ((*ctxt->input->cur == 0) &&
 486                 (ctxt->input->cur < ctxt->input->end)) {
 487                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 488                                 "Char 0x%X out of allowed range\n", 0);
 489                 *len = 1;
 490                 return(' ');
 491             }
 492             /* 1-byte code */
 493             *len = 1;
 494             return((int) *ctxt->input->cur);
 495         }
 496     }
 497     /*
 498      * Assume it's a fixed length encoding (1) with
 499      * a compatible encoding for the ASCII set, since
 500      * XML constructs only use < 128 chars
 501      */
 502     *len = 1;
 503     if ((int) *ctxt->input->cur < 0x80)
 504         return((int) *ctxt->input->cur);
 505
 506     /*
 507      * Humm this is bad, do an automatic flow conversion
 508      */
 509     {
 510         xmlChar * guess;
 511         xmlCharEncodingHandlerPtr handler;
 512
 513         guess = htmlFindEncoding(ctxt);
 514         if (guess == NULL) {
 515             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
 516         } else {
 517             if (ctxt->input->encoding != NULL)
 518                 xmlFree((xmlChar *) ctxt->input->encoding);
 519             ctxt->input->encoding = guess;
 520             handler = xmlFindCharEncodingHandler((const char *) guess);
 521             if (handler != NULL) {
 522                 xmlSwitchToEncoding(ctxt, handler);
 523             } else {
 524                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 525                              "Unsupported encoding %s", guess, NULL);
 526             }
 527         }
 528         ctxt->charset = XML_CHAR_ENCODING_UTF8;
 529     }
 530
 531     return(xmlCurrentChar(ctxt, len));
 532
 533 encoding_error:
 534     /*
 535      * If we detect an UTF8 error that probably mean that the
 536      * input encoding didn't get properly advertized in the
 537      * declaration header. Report the error and switch the encoding
 538      * to ISO-Latin-1 (if you don't like this policy, just declare the
 539      * encoding !)
 540      */
 541     {
 542         char buffer[150];
 543
 544         if (ctxt->input->end - ctxt->input->cur >= 4) {
 545             snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
 546                             ctxt->input->cur[0], ctxt->input->cur[1],
 547                             ctxt->input->cur[2], ctxt->input->cur[3]);
 548         } else {
 549             snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
 550         }
 551         htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 552                      "Input is not proper UTF-8, indicate encoding !\n",
 553                      BAD_CAST buffer, NULL);
 554     }
 555
 556     ctxt->charset = XML_CHAR_ENCODING_8859_1;
 557     *len = 1;
 558     return((int) *ctxt->input->cur);
 559 }
 560
 561 /**
 562  * htmlSkipBlankChars:
 563  * @ctxt:  the HTML parser context
 564  *
 565  * skip all blanks character found at that point in the input streams.
 566  *
 567  * Returns the number of space chars skipped
 568  */
 569
 570 static int
 571 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
 572     int res = 0;
 573
 574     while (IS_BLANK_CH(*(ctxt->input->cur))) {
 575         if ((*ctxt->input->cur == 0) &&
 576             (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
 577                 xmlPopInput(ctxt);
 578         } else {
 579             if (*(ctxt->input->cur) == '\n') {
 580                 ctxt->input->line++; ctxt->input->col = 1;
 581             } else ctxt->input->col++;
 582             ctxt->input->cur++;
 583             ctxt->nbChars++;
 584             if (*ctxt->input->cur == 0)
 585                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 586         }
 587         res++;
 588     }
 589     return(res);
 590 }
 591
 592
 593
 594 /************************************************************************
 595  *                                                                      *
 596  *      The list of HTML elements and their properties          *
 597  *                                                                      *
 598  ************************************************************************/
 599
 600 /*
 601  *  Start Tag: 1 means the start tag can be ommited
 602  *  End Tag:   1 means the end tag can be ommited
 603  *             2 means it's forbidden (empty elements)
 604  *             3 means the tag is stylistic and should be closed easily
 605  *  Depr:      this element is deprecated
 606  *  DTD:       1 means that this element is valid only in the Loose DTD
 607  *             2 means that this element is valid only in the Frameset DTD
 608  *
 609  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 610         , subElements , impliedsubelt , Attributes, userdata
 611  */
 612
 613 /* Definitions and a couple of vars for HTML Elements */
 614
 615 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
 616 #define NB_FONTSTYLE 8
 617 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
 618 #define NB_PHRASE 10
 619 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
 620 #define NB_SPECIAL 16
 621 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
 622 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
 623 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
 624 #define NB_BLOCK NB_HEADING + NB_LIST + 14
 625 #define FORMCTRL "input", "select", "textarea", "label", "button"
 626 #define NB_FORMCTRL 5
 627 #define PCDATA
 628 #define NB_PCDATA 0
 629 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
 630 #define NB_HEADING 6
 631 #define LIST "ul", "ol", "dir", "menu"
 632 #define NB_LIST 4
 633 #define MODIFIER
 634 #define NB_MODIFIER 0
 635 #define FLOW BLOCK,INLINE
 636 #define NB_FLOW NB_BLOCK + NB_INLINE
 637 #define EMPTY NULL
 638
 639
 640 static const char* const html_flow[] = { FLOW, NULL } ;
 641 static const char* const html_inline[] = { INLINE, NULL } ;
 642
 643 /* placeholders: elts with content but no subelements */
 644 static const char* const html_pcdata[] = { NULL } ;
 645 #define html_cdata html_pcdata
 646
 647
 648 /* ... and for HTML Attributes */
 649
 650 #define COREATTRS "id", "class", "style", "title"
 651 #define NB_COREATTRS 4
 652 #define I18N "lang", "dir"
 653 #define NB_I18N 2
 654 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
 655 #define NB_EVENTS 9
 656 #define ATTRS COREATTRS,I18N,EVENTS
 657 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
 658 #define CELLHALIGN "align", "char", "charoff"
 659 #define NB_CELLHALIGN 3
 660 #define CELLVALIGN "valign"
 661 #define NB_CELLVALIGN 1
 662
 663 static const char* const html_attrs[] = { ATTRS, NULL } ;
 664 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
 665 static const char* const core_attrs[] = { COREATTRS, NULL } ;
 666 static const char* const i18n_attrs[] = { I18N, NULL } ;
 667
 668
 669 /* Other declarations that should go inline ... */
 670 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
 671         "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
 672         "tabindex", "onfocus", "onblur", NULL } ;
 673 static const char* const target_attr[] = { "target", NULL } ;
 674 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
 675 static const char* const alt_attr[] = { "alt", NULL } ;
 676 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
 677 static const char* const href_attrs[] = { "href", NULL } ;
 678 static const char* const clear_attrs[] = { "clear", NULL } ;
 679 static const char* const inline_p[] = { INLINE, "p", NULL } ;
 680
 681 static const char* const flow_param[] = { FLOW, "param", NULL } ;
 682 static const char* const applet_attrs[] = { COREATTRS , "codebase",
 683                 "archive", "alt", "name", "height", "width", "align",
 684                 "hspace", "vspace", NULL } ;
 685 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
 686         "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 687 static const char* const basefont_attrs[] =
 688         { "id", "size", "color", "face", NULL } ;
 689 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
 690 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
 691 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
 692 static const char* const body_depr[] = { "background", "bgcolor", "text",
 693         "link", "vlink", "alink", NULL } ;
 694 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
 695         "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 696
 697
 698 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
 699 static const char* const col_elt[] = { "col", NULL } ;
 700 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
 701 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
 702 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
 703 static const char* const compact_attr[] = { "compact", NULL } ;
 704 static const char* const label_attr[] = { "label", NULL } ;
 705 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
 706 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
 707 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
 708 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
 709 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
 710 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
 711 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
 712 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
 713 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
 714 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
 715 static const char* const version_attr[] = { "version", NULL } ;
 716 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
 717 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
 718 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
 719 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
 720 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
 /*
  * NULL-terminated lists of attribute names (*_attr, *_attrs, *_depr) and
  * of allowed child element names (*_content, *_contents, *_elt).  Many of
  * them are referenced, through the DECL cast, by the entries of
  * html40ElementTable below.  The upper-case names (COREATTRS, I18N, ATTRS,
  * BLOCK, FLOW, PHRASE, CELLHALIGN, CELLVALIGN, MODIFIER) are macros defined
  * earlier in this file; they are expected to expand to comma-separated
  * string lists.
  */
 721 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
 722 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
 723 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
 724 static const char* const align_attr[] = { "align", NULL } ;
 725 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
 726 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
 727 static const char* const name_attr[] = { "name", NULL } ;
 728 static const char* const action_attr[] = { "action", NULL } ;
 729 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
 730 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
 731 static const char* const content_attr[] = { "content", NULL } ;
 732 static const char* const type_attr[] = { "type", NULL } ;
     /* NOTE(review): MODIFIER follows FLOW without a comma, so it presumably
      * expands to nothing - confirm against its definition. */
 733 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
 734 static const char* const object_contents[] = { FLOW, "param", NULL } ;
 735 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
 736 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
 737 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
 738 static const char* const option_elt[] = { "option", NULL } ;
 739 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
 740 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
 741 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
 742 static const char* const width_attr[] = { "width", NULL } ;
 743 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
 744 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
 745 static const char* const language_attr[] = { "language", NULL } ;
 746 static const char* const select_content[] = { "optgroup", "option", NULL } ;
 747 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
 748 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
     /* Lists used by the table-related elements (table, tbody, tr, th, td, ...). */
 749 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
 750 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
 751 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
 752 static const char* const tr_elt[] = { "tr", NULL } ;
 753 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
 754 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
 755 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
 756 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
 757 static const char* const tr_contents[] = { "th", "td", NULL } ;
 758 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
 759 static const char* const li_elt[] = { "li", NULL } ;
 760 static const char* const ul_depr[] = { "type", "compact", NULL} ;
 761 static const char* const dir_attr[] = { "dir", NULL} ;
 762
 /*
  * DECL casts one of the name lists declared above to `const char **`.
  * Those lists are declared `const char* const []`, so the cast drops the
  * const qualifier on the array elements; presumably this matches how the
  * pointer fields of htmlElemDesc are declared - confirm in HTMLparser.h.
  */
 763 #define DECL (const char**)
 764
 /*
  * Static description of the HTML 4.0 elements.  htmlTagLookup() scans it
  * linearly with a case-insensitive comparison on the first field (name).
  * Entries are listed in alphabetical order of the element name.
  *
  * Each entry holds, in order: the element name, seven integer flags, a
  * short description string, then five pointers - a list of allowed
  * sub-elements (or EMPTY), the name of a default sub-element (or NULL),
  * and three attribute-name lists.
  *
  * NOTE(review): the exact meaning of each flag and of the three attribute
  * lists (presumably optional / deprecated / required) is fixed by the
  * htmlElemDesc declaration in libxml/HTMLparser.h, which is not visible
  * here - confirm before editing an entry.  One of the flags is `endTag`;
  * htmlAutoCloseOnClose() reports a tag mismatch when it equals 3.
  */
 765 static const htmlElemDesc
 766 html40ElementTable[] = {
 767 { "a",          0, 0, 0, 0, 0, 0, 1, "anchor ",
 768         DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
 769 },
 770 { "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form",
 771         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 772 },
 773 { "acronym",    0, 0, 0, 0, 0, 0, 1, "",
 774         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 775 },
 776 { "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
 777         DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
 778 },
 779 { "applet",     0, 0, 0, 0, 1, 1, 2, "java applet ",
 780         DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
 781 },
 782 { "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
 783         EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
 784 },
 785 { "b",          0, 3, 0, 0, 0, 0, 1, "bold text style",
 786         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 787 },
 788 { "base",       0, 2, 2, 1, 0, 0, 0, "document base uri ",
 789         EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
 790 },
 791 { "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
 792         EMPTY , NULL , NULL, DECL basefont_attrs, NULL
 793 },
 794 { "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
 795         DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
 796 },
 797 { "big",        0, 3, 0, 0, 0, 0, 1, "large text style",
 798         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 799 },
 800 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
 801         DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
 802 },
 803 { "body",       1, 1, 0, 0, 0, 0, 0, "document body ",
 804         DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
 805 },
 806 { "br",         0, 2, 2, 1, 0, 0, 1, "forced line break ",
 807         EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
 808 },
 809 { "button",     0, 0, 0, 0, 0, 0, 2, "push button ",
 810         DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
 811 },
 812 { "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
 813         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 814 },
 815 { "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
 816         DECL html_flow , NULL , NULL, DECL html_attrs, NULL
 817 },
 818 { "cite",       0, 0, 0, 0, 0, 0, 1, "citation",
 819         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 820 },
 821 { "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment",
 822         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 823 },
 824 { "col",        0, 2, 2, 1, 0, 0, 0, "table column ",
 825         EMPTY , NULL , DECL col_attrs , NULL, NULL
 826 },
 827 { "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
 828         DECL col_elt , "col" , DECL col_attrs , NULL, NULL
 829 },
 830 { "dd",         0, 1, 0, 0, 0, 0, 0, "definition description ",
 831         DECL html_flow , NULL , DECL html_attrs, NULL, NULL
 832 },
 833 { "del",        0, 0, 0, 0, 0, 0, 2, "deleted text ",
 834         DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
 835 },
 836 { "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition",
 837         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
 838 },
 839 { "dir",        0, 0, 0, 0, 1, 1, 0, "directory list",
 840         DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
 841 },
 842 { "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container",
 843         DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
 844 },
 845 { "dl",         0, 0, 0, 0, 0, 0, 0, "definition list ",
 846         DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
 847 },
 848 { "dt",         0, 1, 0, 0, 0, 0, 0, "definition term ",
 849         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 850 },
 851 { "em",         0, 3, 0, 0, 0, 0, 1, "emphasis",
 852         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 853 },
 854 { "embed",      0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
 855         EMPTY, NULL, DECL embed_attrs, NULL, NULL
 856 },
 857 { "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
 858         DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
 859 },
 860 { "font",       0, 3, 0, 0, 1, 1, 1, "local change to font ",
 861         DECL html_inline, NULL, NULL, DECL font_attrs, NULL
 862 },
 863 { "form",       0, 0, 0, 0, 0, 0, 0, "interactive form ",
 864         DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
 865 },
 866 { "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " ,
 867         EMPTY, NULL, NULL, DECL frame_attrs, NULL
 868 },
 869 { "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
 870         DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
 871 },
 872 { "h1",         0, 0, 0, 0, 0, 0, 0, "heading ",
 873         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 874 },
 875 { "h2",         0, 0, 0, 0, 0, 0, 0, "heading ",
 876         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 877 },
 878 { "h3",         0, 0, 0, 0, 0, 0, 0, "heading ",
 879         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 880 },
 881 { "h4",         0, 0, 0, 0, 0, 0, 0, "heading ",
 882         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 883 },
 884 { "h5",         0, 0, 0, 0, 0, 0, 0, "heading ",
 885         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 886 },
 887 { "h6",         0, 0, 0, 0, 0, 0, 0, "heading ",
 888         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 889 },
 890 { "head",       1, 1, 0, 0, 0, 0, 0, "document head ",
 891         DECL head_contents, NULL, DECL head_attrs, NULL, NULL
 892 },
 893 { "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
 894         EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
 895 },
 896 { "html",       1, 1, 0, 0, 0, 0, 0, "document root element ",
 897         DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
 898 },
 899 { "i",          0, 3, 0, 0, 0, 0, 1, "italic text style",
 900         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 901 },
 902 { "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
 903         DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
 904 },
 905 { "img",        0, 2, 2, 1, 0, 0, 1, "embedded image ",
 906         EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
 907 },
 908 { "input",      0, 2, 2, 1, 0, 0, 1, "form control ",
 909         EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
 910 },
 911 { "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text",
 912         DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
 913 },
 914 { "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
 915         EMPTY, NULL, NULL, DECL prompt_attrs, NULL
 916 },
 917 { "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
 918         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 919 },
 920 { "label",      0, 0, 0, 0, 0, 0, 1, "form field label text ",
 921         DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
 922 },
 923 { "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
 924         DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
 925 },
 926 { "li",         0, 1, 1, 0, 0, 0, 0, "list item ",
 927         DECL html_flow, NULL, DECL html_attrs, NULL, NULL
 928 },
 929 { "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
 930         EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
 931 },
 932 { "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map ",
 933         DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
 934 },
 935 { "menu",       0, 0, 0, 0, 1, 1, 0, "menu list ",
 936         DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
 937 },
 938 { "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
 939         EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
 940 },
 941 { "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
 942         DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
 943 },
 944 { "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
 945         DECL html_flow, "div", DECL html_attrs, NULL, NULL
 946 },
 947 { "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
 948         DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
 949 },
 950 { "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list ",
 951         DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
 952 },
 953 { "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
 954         DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
 955 },
 956 { "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
 957         DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
 958 },
 959 { "p",          0, 1, 0, 0, 0, 0, 0, "paragraph ",
 960         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
 961 },
 962 { "param",      0, 2, 2, 1, 0, 0, 0, "named property value ",
 963         EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
 964 },
 965 { "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text ",
 966         DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
 967 },
 968 { "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
 969         DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
 970 },
 971 { "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style",
 972         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
 973 },
 974 { "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
 975         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 976 },
 977 { "script",     0, 0, 0, 0, 0, 0, 2, "script statements ",
 978         DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
 979 },
 980 { "select",     0, 0, 0, 0, 0, 0, 1, "option selector ",
 981         DECL select_content, NULL, DECL select_attrs, NULL, NULL
 982 },
 983 { "small",      0, 3, 0, 0, 0, 0, 1, "small text style",
 984         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 985 },
 986 { "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
 987         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 988 },
 989 { "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text",
 990         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
 991 },
 992 { "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis",
 993         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
 994 },
 995 { "style",      0, 0, 0, 0, 0, 0, 0, "style info ",
 996         DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
 997 },
 998 { "sub",        0, 3, 0, 0, 0, 0, 1, "subscript",
 999         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1000 },
1001 { "sup",        0, 3, 0, 0, 0, 0, 1, "superscript ",
1002         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1003 },
1004 { "table",      0, 0, 0, 0, 0, 0, 0, "",
1005         DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1006 },
1007 { "tbody",      1, 0, 0, 0, 0, 0, 0, "table body ",
1008         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1009 },
1010 { "td",         0, 0, 0, 0, 0, 0, 0, "table data cell",
1011         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1012 },
1013 { "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1014         DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1015 },
1016 { "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer ",
1017         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1018 },
1019 { "th",         0, 1, 0, 0, 0, 0, 0, "table header cell",
1020         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1021 },
1022 { "thead",      0, 1, 0, 0, 0, 0, 0, "table header ",
1023         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1024 },
1025 { "title",      0, 0, 0, 0, 0, 0, 0, "document title ",
1026         DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1027 },
1028 { "tr",         0, 0, 0, 0, 0, 0, 0, "table row ",
1029         DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1030 },
1031 { "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1032         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033 },
1034 { "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style",
1035         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1036 },
1037 { "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list ",
1038         DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1039 },
1040 { "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1041         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1042 }
1043 };
1044
1045 /*
1046  * start tags that imply the end of current element
      *
      * Layout: a flat sequence of NULL-terminated groups, ended by one extra
      * NULL.  The first string of a group is a start tag; the strings after
      * it are the currently open elements which that start tag closes.
      * htmlInitAutoClose() records the address of each group's first string
      * in htmlStartCloseIndex, and htmlCheckAutoClose() scans the group of
      * the new tag for the name of the open element.
      *
      * FONTSTYLE (in the "p" group) is a macro defined earlier in this file.
1047  */
1048 static const char * const htmlStartClose[] = {
1049 "form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1050                 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1051                 "listing", "xmp", "head", NULL,
1052 "head",         "p", NULL,
1053 "title",        "p", NULL,
1054 "body",         "head", "style", "link", "title", "p", NULL,
1055 "frameset",     "head", "style", "link", "title", "p", NULL,
1056 "li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1057                 "pre", "listing", "xmp", "head", "li", NULL,
1058 "hr",           "p", "head", NULL,
1059 "h1",           "p", "head", NULL,
1060 "h2",           "p", "head", NULL,
1061 "h3",           "p", "head", NULL,
1062 "h4",           "p", "head", NULL,
1063 "h5",           "p", "head", NULL,
1064 "h6",           "p", "head", NULL,
1065 "dir",          "p", "head", NULL,
1066 "address",      "p", "head", "ul", NULL,
1067 "pre",          "p", "head", "ul", NULL,
1068 "listing",      "p", "head", NULL,
1069 "xmp",          "p", "head", NULL,
1070 "blockquote",   "p", "head", NULL,
1071 "dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
1072                 "xmp", "head", NULL,
1073 "dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
1074                 "head", "dd", NULL,
1075 "dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
1076                 "head", "dt", NULL,
1077 "ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
1078                 "listing", "xmp", NULL,
1079 "ol",           "p", "head", "ul", NULL,
1080 "menu",         "p", "head", "ul", NULL,
1081 "p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
1082 "div",          "p", "head", NULL,
1083 "noscript",     "p", "head", NULL,
1084 "center",       "font", "b", "i", "p", "head", NULL,
1085 "a",            "a", NULL,
1086 "caption",      "p", NULL,
1087 "colgroup",     "caption", "colgroup", "col", "p", NULL,
1088 "col",          "caption", "col", "p", NULL,
1089 "table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1090                 "listing", "xmp", "a", NULL,
1091 "th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1092 "td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1093 "tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1094 "thead",        "caption", "col", "colgroup", NULL,
1095 "tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
1096                 "tbody", "p", NULL,
1097 "tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
1098                 "tfoot", "tbody", "p", NULL,
1099 "optgroup",     "option", NULL,
1100 "option",       "option", NULL,
1101 "fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1102                 "pre", "listing", "xmp", "a", NULL,
     /* extra NULL: an empty group marks the end of the whole table */
1103 NULL
1104 };
1105
/*
 * NULL-terminated list of elements that are supposed not to have
 * CDATA content: when one of them is the current element,
 * htmlCheckParagraph() implies a p element before character data
 * is inserted.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1118
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'.
 * NOTE: the array has no NULL terminator; htmlIsScriptAttribute()
 *       computes the number of entries with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
1144
/*
 * End-tag priorities, used by the parser to cope with broken HTML
 * pages.  Every element gets a priority; a misplaced end tag is only
 * allowed to close open elements whose priority is lower than or
 * equal to its own (see htmlAutoCloseOnClose()).
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    /* table cells, rows, row groups and the table itself */
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    /* document structure */
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    /* terminator: its priority is the default for any other element */
    { NULL,    100 }
};
1172
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * string of one group of that table, unused slots are NULL.  It is
 * filled by htmlInitAutoClose() and read by htmlCheckAutoClose().
 */
static const char **htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index above has been built. */
static int htmlStartCloseIndexinitialized = 0;
1175
1176 /************************************************************************
1177  *                                                                      *
1178  *      functions to handle HTML specific data                  *
1179  *                                                                      *
1180  ************************************************************************/
1181
1182 /**
1183  * htmlInitAutoClose:
1184  *
1185  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1186  * This is not reentrant. Call xmlInitParser() once before processing in
1187  * case of use in multithreaded programs.
1188  */
1189 void
1190 htmlInitAutoClose(void) {
1191     int indx, i = 0;
1192
1193     if (htmlStartCloseIndexinitialized) return;
1194
1195     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1196     indx = 0;
1197     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1198         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1199         while (htmlStartClose[i] != NULL) i++;
1200         i++;
1201     }
1202     htmlStartCloseIndexinitialized = 1;
1203 }
1204
1205 /**
1206  * htmlTagLookup:
1207  * @tag:  The tag name in lowercase
1208  *
1209  * Lookup the HTML tag in the ElementTable
1210  *
1211  * Returns the related htmlElemDescPtr or NULL if not found.
1212  */
1213 const htmlElemDesc *
1214 htmlTagLookup(const xmlChar *tag) {
1215     unsigned int i;
1216
1217     for (i = 0; i < (sizeof(html40ElementTable) /
1218                      sizeof(html40ElementTable[0]));i++) {
1219         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1220             return((htmlElemDescPtr) &html40ElementTable[i]);
1221     }
1222     return(NULL);
1223 }
1224
1225 /**
1226  * htmlGetEndPriority:
1227  * @name: The name of the element to look up the priority for.
1228  *
1229  * Return value: The "endtag" priority.
1230  **/
1231 static int
1232 htmlGetEndPriority (const xmlChar *name) {
1233     int i = 0;
1234
1235     while ((htmlEndPriority[i].name != NULL) &&
1236            (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1237         i++;
1238
1239     return(htmlEndPriority[i].priority);
1240 }
1241
1242
1243 /**
1244  * htmlCheckAutoClose:
1245  * @newtag:  The new tag name
1246  * @oldtag:  The old tag name
1247  *
1248  * Checks whether the new tag is one of the registered valid tags for
1249  * closing old.
1250  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1251  *
1252  * Returns 0 if no, 1 if yes.
1253  */
1254 static int
1255 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1256 {
1257     int i, indx;
1258     const char **closed = NULL;
1259
1260     if (htmlStartCloseIndexinitialized == 0)
1261         htmlInitAutoClose();
1262
1263     /* inefficient, but not a big deal */
1264     for (indx = 0; indx < 100; indx++) {
1265         closed = htmlStartCloseIndex[indx];
1266         if (closed == NULL)
1267             return (0);
1268         if (xmlStrEqual(BAD_CAST * closed, newtag))
1269             break;
1270     }
1271
1272     i = closed - htmlStartClose;
1273     i++;
1274     while (htmlStartClose[i] != NULL) {
1275         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1276             return (1);
1277         }
1278         i++;
1279     }
1280     return (0);
1281 }
1282
1283 /**
1284  * htmlAutoCloseOnClose:
1285  * @ctxt:  an HTML parser context
1286  * @newtag:  The new tag name
1287  * @force:  force the tag closure
1288  *
1289  * The HTML DTD allows an ending tag to implicitly close other tags.
1290  */
1291 static void
1292 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1293 {
1294     const htmlElemDesc *info;
1295     int i, priority;
1296
1297     priority = htmlGetEndPriority(newtag);
1298
1299     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1300
1301         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1302             break;
1303         /*
1304          * A missplaced endtag can only close elements with lower
1305          * or equal priority, so if we find an element with higher
1306          * priority before we find an element with
1307          * matching name, we just ignore this endtag
1308          */
1309         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1310             return;
1311     }
1312     if (i < 0)
1313         return;
1314
1315     while (!xmlStrEqual(newtag, ctxt->name)) {
1316         info = htmlTagLookup(ctxt->name);
1317         if ((info != NULL) && (info->endTag == 3)) {
1318             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1319                          "Opening and ending tag mismatch: %s and %s\n",
1320                          newtag, ctxt->name);
1321         }
1322         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1323             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1324         htmlnamePop(ctxt);
1325     }
1326 }
1327
1328 /**
1329  * htmlAutoCloseOnEnd:
1330  * @ctxt:  an HTML parser context
1331  *
1332  * Close all remaining tags at the end of the stream
1333  */
1334 static void
1335 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1336 {
1337     int i;
1338
1339     if (ctxt->nameNr == 0)
1340         return;
1341     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1342         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1343             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1344         htmlnamePop(ctxt);
1345     }
1346 }
1347
1348 /**
1349  * htmlAutoClose:
1350  * @ctxt:  an HTML parser context
1351  * @newtag:  The new tag name or NULL
1352  *
1353  * The HTML DTD allows a tag to implicitly close other tags.
1354  * The list is kept in htmlStartClose array. This function is
1355  * called when a new tag has been detected and generates the
1356  * appropriates closes if possible/needed.
1357  * If newtag is NULL this mean we are at the end of the resource
1358  * and we should check
1359  */
1360 static void
1361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1362 {
1363     while ((newtag != NULL) && (ctxt->name != NULL) &&
1364            (htmlCheckAutoClose(newtag, ctxt->name))) {
1365         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1366             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1367         htmlnamePop(ctxt);
1368     }
1369     if (newtag == NULL) {
1370         htmlAutoCloseOnEnd(ctxt);
1371         return;
1372     }
1373     while ((newtag == NULL) && (ctxt->name != NULL) &&
1374            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1375             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1376             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1377         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1378             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1379         htmlnamePop(ctxt);
1380     }
1381 }
1382
1383 /**
1384  * htmlAutoCloseTag:
1385  * @doc:  the HTML document
1386  * @name:  The tag name
1387  * @elem:  the HTML element
1388  *
1389  * The HTML DTD allows a tag to implicitly close other tags.
1390  * The list is kept in htmlStartClose array. This function checks
1391  * if the element or one of it's children would autoclose the
1392  * given tag.
1393  *
1394  * Returns 1 if autoclose, 0 otherwise
1395  */
1396 int
1397 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1398     htmlNodePtr child;
1399
1400     if (elem == NULL) return(1);
1401     if (xmlStrEqual(name, elem->name)) return(0);
1402     if (htmlCheckAutoClose(elem->name, name)) return(1);
1403     child = elem->children;
1404     while (child != NULL) {
1405         if (htmlAutoCloseTag(doc, name, child)) return(1);
1406         child = child->next;
1407     }
1408     return(0);
1409 }
1410
1411 /**
1412  * htmlIsAutoClosed:
1413  * @doc:  the HTML document
1414  * @elem:  the HTML element
1415  *
1416  * The HTML DTD allows a tag to implicitly close other tags.
1417  * The list is kept in htmlStartClose array. This function checks
1418  * if a tag is autoclosed by one of it's child
1419  *
1420  * Returns 1 if autoclosed, 0 otherwise
1421  */
1422 int
1423 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1424     htmlNodePtr child;
1425
1426     if (elem == NULL) return(1);
1427     child = elem->children;
1428     while (child != NULL) {
1429         if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1430         child = child->next;
1431     }
1432     return(0);
1433 }
1434
1435 /**
1436  * htmlCheckImplied:
1437  * @ctxt:  an HTML parser context
1438  * @newtag:  The new tag name
1439  *
1440  * The HTML DTD allows a tag to exists only implicitly
1441  * called when a new tag has been detected and generates the
1442  * appropriates implicit tags if missing
1443  */
1444 static void
1445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1446     int i;
1447
1448     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1449         return;
1450     if (!htmlOmittedDefaultValue)
1451         return;
1452     if (xmlStrEqual(newtag, BAD_CAST"html"))
1453         return;
1454     if (ctxt->nameNr <= 0) {
1455         htmlnamePush(ctxt, BAD_CAST"html");
1456         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457             ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1458     }
1459     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1460         return;
1461     if ((ctxt->nameNr <= 1) &&
1462         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1463          (xmlStrEqual(newtag, BAD_CAST"style")) ||
1464          (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1465          (xmlStrEqual(newtag, BAD_CAST"link")) ||
1466          (xmlStrEqual(newtag, BAD_CAST"title")) ||
1467          (xmlStrEqual(newtag, BAD_CAST"base")))) {
1468         if (ctxt->html >= 3) {
1469             /* we already saw or generated an <head> before */
1470             return;
1471         }
1472         /*
1473          * dropped OBJECT ... i you put it first BODY will be
1474          * assumed !
1475          */
1476         htmlnamePush(ctxt, BAD_CAST"head");
1477         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1478             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1479     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1480                (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1481                (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1482         if (ctxt->html >= 10) {
1483             /* we already saw or generated a <body> before */
1484             return;
1485         }
1486         for (i = 0;i < ctxt->nameNr;i++) {
1487             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1488                 return;
1489             }
1490             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1491                 return;
1492             }
1493         }
1494
1495         htmlnamePush(ctxt, BAD_CAST"body");
1496         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497             ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1498     }
1499 }
1500
1501 /**
1502  * htmlCheckParagraph
1503  * @ctxt:  an HTML parser context
1504  *
1505  * Check whether a p element need to be implied before inserting
1506  * characters in the current element.
1507  *
1508  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1509  *         in case of error.
1510  */
1511
1512 static int
1513 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1514     const xmlChar *tag;
1515     int i;
1516
1517     if (ctxt == NULL)
1518         return(-1);
1519     tag = ctxt->name;
1520     if (tag == NULL) {
1521         htmlAutoClose(ctxt, BAD_CAST"p");
1522         htmlCheckImplied(ctxt, BAD_CAST"p");
1523         htmlnamePush(ctxt, BAD_CAST"p");
1524         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1525             ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1526         return(1);
1527     }
1528     if (!htmlOmittedDefaultValue)
1529         return(0);
1530     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1531         if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1532             htmlAutoClose(ctxt, BAD_CAST"p");
1533             htmlCheckImplied(ctxt, BAD_CAST"p");
1534             htmlnamePush(ctxt, BAD_CAST"p");
1535             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1536                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1537             return(1);
1538         }
1539     }
1540     return(0);
1541 }
1542
1543 /**
1544  * htmlIsScriptAttribute:
1545  * @name:  an attribute name
1546  *
1547  * Check if an attribute is of content type Script
1548  *
1549  * Returns 1 is the attribute is a script 0 otherwise
1550  */
1551 int
1552 htmlIsScriptAttribute(const xmlChar *name) {
1553     unsigned int i;
1554
1555     if (name == NULL)
1556       return(0);
1557     /*
1558      * all script attributes start with 'on'
1559      */
1560     if ((name[0] != 'o') || (name[1] != 'n'))
1561       return(0);
1562     for (i = 0;
1563          i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1564          i++) {
1565         if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1566             return(1);
1567     }
1568     return(0);
1569 }
1570
1571 /************************************************************************
1572  *                                                                      *
1573  *      The list of HTML predefined entities                    *
1574  *                                                                      *
1575  ************************************************************************/
1576
1577
1578 static const htmlEntityDesc  html40EntitiesTable[] = {
1579 /*
1580  * the 4 absolute ones, plus apostrophe.
1581  */
1582 { 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1583 { 38,   "amp",  "ampersand, U+0026 ISOnum" },
1584 { 39,   "apos", "single quote" },
1585 { 60,   "lt",   "less-than sign, U+003C ISOnum" },
1586 { 62,   "gt",   "greater-than sign, U+003E ISOnum" },
1587
1588 /*
1589  * A bunch still in the 128-255 range
1590  * Replacing them depend really on the charset used.
1591  */
1592 { 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1593 { 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1594 { 162,  "cent", "cent sign, U+00A2 ISOnum" },
1595 { 163,  "pound","pound sign, U+00A3 ISOnum" },
1596 { 164,  "curren","currency sign, U+00A4 ISOnum" },
1597 { 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
1598 { 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1599 { 167,  "sect", "section sign, U+00A7 ISOnum" },
1600 { 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1601 { 169,  "copy", "copyright sign, U+00A9 ISOnum" },
1602 { 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1603 { 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1604 { 172,  "not",  "not sign, U+00AC ISOnum" },
1605 { 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1606 { 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
1607 { 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1608 { 176,  "deg",  "degree sign, U+00B0 ISOnum" },
1609 { 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1610 { 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1611 { 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1612 { 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1613 { 181,  "micro","micro sign, U+00B5 ISOnum" },
1614 { 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1615 { 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1616 { 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1617 { 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1618 { 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1619 { 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1620 { 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1621 { 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1622 { 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1623 { 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1624 { 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1625 { 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1626 { 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1627 { 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1628 { 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1629 { 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1630 { 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1631 { 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1632 { 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1633 { 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1634 { 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1635 { 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1636 { 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1637 { 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1638 { 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1639 { 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1640 { 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
1641 { 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1642 { 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1643 { 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1644 { 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1645 { 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1646 { 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1647 { 215,  "times","multiplication sign, U+00D7 ISOnum" },
1648 { 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1649 { 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1650 { 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1651 { 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1652 { 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1653 { 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1654 { 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1655 { 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1656 { 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1657 { 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1658 { 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1659 { 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1660 { 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1661 { 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1662 { 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1663 { 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1664 { 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1665 { 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1666 { 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1667 { 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1668 { 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1669 { 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1670 { 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1671 { 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1672 { 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
1673 { 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1674 { 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1675 { 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1676 { 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1677 { 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1678 { 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1679 { 247,  "divide","division sign, U+00F7 ISOnum" },
1680 { 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1681 { 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1682 { 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1683 { 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1684 { 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1685 { 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1686 { 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1687 { 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1688
1689 { 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1690 { 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
1691 { 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1692 { 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1693 { 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1694
1695 /*
1696  * Anything below should really be kept as entities references
1697  */
1698 { 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1699
1700 { 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1701 { 732,  "tilde","small tilde, U+02DC ISOdia" },
1702
1703 { 913,  "Alpha","greek capital letter alpha, U+0391" },
1704 { 914,  "Beta", "greek capital letter beta, U+0392" },
1705 { 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1706 { 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1707 { 917,  "Epsilon","greek capital letter epsilon, U+0395" },
1708 { 918,  "Zeta", "greek capital letter zeta, U+0396" },
1709 { 919,  "Eta",  "greek capital letter eta, U+0397" },
1710 { 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1711 { 921,  "Iota", "greek capital letter iota, U+0399" },
1712 { 922,  "Kappa","greek capital letter kappa, U+039A" },
1713 { 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1714 { 924,  "Mu",   "greek capital letter mu, U+039C" },
1715 { 925,  "Nu",   "greek capital letter nu, U+039D" },
1716 { 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
1717 { 927,  "Omicron","greek capital letter omicron, U+039F" },
1718 { 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
1719 { 929,  "Rho",  "greek capital letter rho, U+03A1" },
1720 { 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1721 { 932,  "Tau",  "greek capital letter tau, U+03A4" },
1722 { 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1723 { 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
1724 { 935,  "Chi",  "greek capital letter chi, U+03A7" },
1725 { 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
1726 { 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1727
1728 { 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1729 { 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1730 { 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1731 { 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
1732 { 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1733 { 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1734 { 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
1735 { 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
1736 { 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1737 { 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1738 { 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1739 { 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
1740 { 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
1741 { 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
1742 { 959,  "omicron","greek small letter omicron, U+03BF NEW" },
1743 { 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
1744 { 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
1745 { 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1746 { 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1747 { 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
1748 { 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1749 { 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
1750 { 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
1751 { 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
1752 { 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
1753 { 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1754 { 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1755 { 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
1756
1757 { 8194, "ensp", "en space, U+2002 ISOpub" },
1758 { 8195, "emsp", "em space, U+2003 ISOpub" },
1759 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1760 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1761 { 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
1762 { 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
1763 { 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
1764 { 8211, "ndash","en dash, U+2013 ISOpub" },
1765 { 8212, "mdash","em dash, U+2014 ISOpub" },
1766 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1767 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1768 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1769 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1770 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1771 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1772 { 8224, "dagger","dagger, U+2020 ISOpub" },
1773 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1774
1775 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1776 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1777
1778 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1779
1780 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1781 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1782
1783 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1784 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1785
1786 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1787 { 8260, "frasl","fraction slash, U+2044 NEW" },
1788
1789 { 8364, "euro", "euro sign, U+20AC NEW" },
1790
1791 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1792 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1793 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1794 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1795 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1796 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1797 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1798 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1799 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1800 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1801 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1802 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1803 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1804 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1805 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1806 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1807
1808 { 8704, "forall","for all, U+2200 ISOtech" },
1809 { 8706, "part", "partial differential, U+2202 ISOtech" },
1810 { 8707, "exist","there exists, U+2203 ISOtech" },
1811 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1812 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1813 { 8712, "isin", "element of, U+2208 ISOtech" },
1814 { 8713, "notin","not an element of, U+2209 ISOtech" },
1815 { 8715, "ni",   "contains as member, U+220B ISOtech" },
1816 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1817 { 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
1818 { 8722, "minus","minus sign, U+2212 ISOtech" },
1819 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1820 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1821 { 8733, "prop", "proportional to, U+221D ISOtech" },
1822 { 8734, "infin","infinity, U+221E ISOtech" },
1823 { 8736, "ang",  "angle, U+2220 ISOamso" },
1824 { 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
1825 { 8744, "or",   "logical or = vee, U+2228 ISOtech" },
1826 { 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
1827 { 8746, "cup",  "union = cup, U+222A ISOtech" },
1828 { 8747, "int",  "integral, U+222B ISOtech" },
1829 { 8756, "there4","therefore, U+2234 ISOtech" },
1830 { 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
1831 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1832 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1833 { 8800, "ne",   "not equal to, U+2260 ISOtech" },
1834 { 8801, "equiv","identical to, U+2261 ISOtech" },
1835 { 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
1836 { 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
1837 { 8834, "sub",  "subset of, U+2282 ISOtech" },
1838 { 8835, "sup",  "superset of, U+2283 ISOtech" },
1839 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1840 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1841 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1842 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1843 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1844 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1845 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1846 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1847 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1848 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1849 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1850 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1851 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1852 { 9674, "loz",  "lozenge, U+25CA ISOpub" },
1853
1854 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1855 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1856 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1857 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1858
1859 };
1860
1861 /************************************************************************
1862  *                                                                      *
1863  *              Commodity functions to handle entities                  *
1864  *                                                                      *
1865  ************************************************************************/
1866
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable to enlarge; its current size
 *           is read from and written back to the variable <buffer>_size
 *
 * Macro doubling the size of @buffer with xmlRealloc().  If the
 * reallocation fails, the failure is reported through htmlErrMemory()
 * on the variable "ctxt" of the expansion site, the old buffer is
 * freed and the enclosing function returns NULL.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer,                                \
                                 buffer##_size * sizeof(xmlChar));      \
    if (tmp != NULL) {                                                  \
        buffer = tmp;                                                   \
    } else {                                                            \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
}
1881
1882 /**
1883  * htmlEntityLookup:
1884  * @name: the entity name
1885  *
1886  * Lookup the given entity in EntitiesTable
1887  *
1888  * TODO: the linear scan is really ugly, an hash table is really needed.
1889  *
1890  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1891  */
1892 const htmlEntityDesc *
1893 htmlEntityLookup(const xmlChar *name) {
1894     unsigned int i;
1895
1896     for (i = 0;i < (sizeof(html40EntitiesTable)/
1897                     sizeof(html40EntitiesTable[0]));i++) {
1898         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1899             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1900         }
1901     }
1902     return(NULL);
1903 }
1904
1905 /**
1906  * htmlEntityValueLookup:
1907  * @value: the entity's unicode value
1908  *
1909  * Lookup the given entity in EntitiesTable
1910  *
1911  * TODO: the linear scan is really ugly, an hash table is really needed.
1912  *
1913  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1914  */
1915 const htmlEntityDesc *
1916 htmlEntityValueLookup(unsigned int value) {
1917     unsigned int i;
1918
1919     for (i = 0;i < (sizeof(html40EntitiesTable)/
1920                     sizeof(html40EntitiesTable[0]));i++) {
1921         if (html40EntitiesTable[i].value >= value) {
1922             if (html40EntitiesTable[i].value > value)
1923                 break;
1924             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1925         }
1926     }
1927     return(NULL);
1928 }
1929
1930 /**
1931  * UTF8ToHtml:
1932  * @out:  a pointer to an array of bytes to store the result
1933  * @outlen:  the length of @out
1934  * @in:  a pointer to an array of UTF-8 chars
1935  * @inlen:  the length of @in
1936  *
1937  * Take a block of UTF-8 chars in and try to convert it to an ASCII
1938  * plus HTML entities block of chars out.
1939  *
1940  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1941  * The value of @inlen after return is the number of octets consumed
1942  *     as the return value is positive, else unpredictable.
1943  * The value of @outlen after return is the number of octets consumed.
1944  */
1945 int
1946 UTF8ToHtml(unsigned char* out, int *outlen,
1947               const unsigned char* in, int *inlen) {
1948     const unsigned char* processed = in;
1949     const unsigned char* outend;
1950     const unsigned char* outstart = out;
1951     const unsigned char* instart = in;
1952     const unsigned char* inend;
1953     unsigned int c, d;
1954     int trailing;
1955
1956     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1957     if (in == NULL) {
1958         /*
1959          * initialization nothing to do
1960          */
1961         *outlen = 0;
1962         *inlen = 0;
1963         return(0);
1964     }
1965     inend = in + (*inlen);
1966     outend = out + (*outlen);
1967     while (in < inend) {
1968         d = *in++;
1969         if      (d < 0x80)  { c= d; trailing= 0; }
1970         else if (d < 0xC0) {
1971             /* trailing byte in leading position */
1972             *outlen = out - outstart;
1973             *inlen = processed - instart;
1974             return(-2);
1975         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1976         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1977         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1978         else {
1979             /* no chance for this in Ascii */
1980             *outlen = out - outstart;
1981             *inlen = processed - instart;
1982             return(-2);
1983         }
1984
1985         if (inend - in < trailing) {
1986             break;
1987         }
1988
1989         for ( ; trailing; trailing--) {
1990             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1991                 break;
1992             c <<= 6;
1993             c |= d & 0x3F;
1994         }
1995
1996         /* assertion: c is a single UTF-4 value */
1997         if (c < 0x80) {
1998             if (out + 1 >= outend)
1999                 break;
2000             *out++ = c;
2001         } else {
2002             int len;
2003             const htmlEntityDesc * ent;
2004             const char *cp;
2005             char nbuf[16];
2006
2007             /*
2008              * Try to lookup a predefined HTML entity for it
2009              */
2010
2011             ent = htmlEntityValueLookup(c);
2012             if (ent == NULL) {
2013               snprintf(nbuf, sizeof(nbuf), "#%u", c);
2014               cp = nbuf;
2015             }
2016             else
2017               cp = ent->name;
2018             len = strlen(cp);
2019             if (out + 2 + len >= outend)
2020                 break;
2021             *out++ = '&';
2022             memcpy(out, cp, len);
2023             out += len;
2024             *out++ = ';';
2025         }
2026         processed = in;
2027     }
2028     *outlen = out - outstart;
2029     *inlen = processed - instart;
2030     return(0);
2031 }
2032
2033 /**
2034  * htmlEncodeEntities:
2035  * @out:  a pointer to an array of bytes to store the result
2036  * @outlen:  the length of @out
2037  * @in:  a pointer to an array of UTF-8 chars
2038  * @inlen:  the length of @in
2039  * @quoteChar: the quote character to escape (' or ") or zero.
2040  *
2041  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2042  * plus HTML entities block of chars out.
2043  *
2044  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2045  * The value of @inlen after return is the number of octets consumed
2046  *     as the return value is positive, else unpredictable.
2047  * The value of @outlen after return is the number of octets consumed.
2048  */
2049 int
2050 htmlEncodeEntities(unsigned char* out, int *outlen,
2051                    const unsigned char* in, int *inlen, int quoteChar) {
2052     const unsigned char* processed = in;
2053     const unsigned char* outend;
2054     const unsigned char* outstart = out;
2055     const unsigned char* instart = in;
2056     const unsigned char* inend;
2057     unsigned int c, d;
2058     int trailing;
2059
2060     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2061         return(-1);
2062     outend = out + (*outlen);
2063     inend = in + (*inlen);
2064     while (in < inend) {
2065         d = *in++;
2066         if      (d < 0x80)  { c= d; trailing= 0; }
2067         else if (d < 0xC0) {
2068             /* trailing byte in leading position */
2069             *outlen = out - outstart;
2070             *inlen = processed - instart;
2071             return(-2);
2072         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2073         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2074         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2075         else {
2076             /* no chance for this in Ascii */
2077             *outlen = out - outstart;
2078             *inlen = processed - instart;
2079             return(-2);
2080         }
2081
2082         if (inend - in < trailing)
2083             break;
2084
2085         while (trailing--) {
2086             if (((d= *in++) & 0xC0) != 0x80) {
2087                 *outlen = out - outstart;
2088                 *inlen = processed - instart;
2089                 return(-2);
2090             }
2091             c <<= 6;
2092             c |= d & 0x3F;
2093         }
2094
2095         /* assertion: c is a single UTF-4 value */
2096         if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2097             (c != '&') && (c != '<') && (c != '>')) {
2098             if (out >= outend)
2099                 break;
2100             *out++ = c;
2101         } else {
2102             const htmlEntityDesc * ent;
2103             const char *cp;
2104             char nbuf[16];
2105             int len;
2106
2107             /*
2108              * Try to lookup a predefined HTML entity for it
2109              */
2110             ent = htmlEntityValueLookup(c);
2111             if (ent == NULL) {
2112                 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2113                 cp = nbuf;
2114             }
2115             else
2116                 cp = ent->name;
2117             len = strlen(cp);
2118             if (out + 2 + len > outend)
2119                 break;
2120             *out++ = '&';
2121             memcpy(out, cp, len);
2122             out += len;
2123             *out++ = ';';
2124         }
2125         processed = in;
2126     }
2127     *outlen = out - outstart;
2128     *inlen = processed - instart;
2129     return(0);
2130 }
2131
2132 /************************************************************************
2133  *                                                                      *
2134  *              Commodity functions to handle streams                   *
2135  *                                                                      *
2136  ************************************************************************/
2137
2138 /**
2139  * htmlNewInputStream:
2140  * @ctxt:  an HTML parser context
2141  *
2142  * Create a new input stream structure
2143  * Returns the new input stream or NULL
2144  */
2145 static htmlParserInputPtr
2146 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2147     htmlParserInputPtr input;
2148
2149     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2150     if (input == NULL) {
2151         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2152         return(NULL);
2153     }
2154     memset(input, 0, sizeof(htmlParserInput));
2155     input->filename = NULL;
2156     input->directory = NULL;
2157     input->base = NULL;
2158     input->cur = NULL;
2159     input->buf = NULL;
2160     input->line = 1;
2161     input->col = 1;
2162     input->buf = NULL;
2163     input->free = NULL;
2164     input->version = NULL;
2165     input->consumed = 0;
2166     input->length = 0;
2167     return(input);
2168 }
2169
2170
2171 /************************************************************************
2172  *                                                                      *
2173  *              Commodity functions, cleanup needed ?                   *
2174  *                                                                      *
2175  ************************************************************************/
/*
 * All the tags allowing PCDATA content in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but it is kept apart to avoid
 * any risk of binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2190
2191 /**
2192  * areBlanks:
2193  * @ctxt:  an HTML parser context
2194  * @str:  a xmlChar *
2195  * @len:  the size of @str
2196  *
2197  * Is this a sequence of blank chars that one can ignore ?
2198  *
2199  * Returns 1 if ignorable 0 otherwise.
2200  */
2201
2202 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2203     unsigned int i;
2204     int j;
2205     xmlNodePtr lastChild;
2206     xmlDtdPtr dtd;
2207
2208     for (j = 0;j < len;j++)
2209         if (!(IS_BLANK_CH(str[j]))) return(0);
2210
2211     if (CUR == 0) return(1);
2212     if (CUR != '<') return(0);
2213     if (ctxt->name == NULL)
2214         return(1);
2215     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2216         return(1);
2217     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2218         return(1);
2219
2220     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2221     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2222         dtd = xmlGetIntSubset(ctxt->myDoc);
2223         if (dtd != NULL && dtd->ExternalID != NULL) {
2224             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2225                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2226                 return(1);
2227         }
2228     }
2229
2230     if (ctxt->node == NULL) return(0);
2231     lastChild = xmlGetLastChild(ctxt->node);
2232     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2233         lastChild = lastChild->prev;
2234     if (lastChild == NULL) {
2235         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2236             (ctxt->node->content != NULL)) return(0);
2237         /* keep ws in constructs like ...<b> </b>...
2238            for all tags "b" allowing PCDATA */
2239         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2240             if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2241                 return(0);
2242             }
2243         }
2244     } else if (xmlNodeIsText(lastChild)) {
2245         return(0);
2246     } else {
2247         /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2248            for all tags "p" allowing PCDATA */
2249         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2250             if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2251                 return(0);
2252             }
2253         }
2254     }
2255     return(1);
2256 }
2257
2258 /**
2259  * htmlNewDocNoDtD:
2260  * @URI:  URI for the dtd, or NULL
2261  * @ExternalID:  the external ID of the DTD, or NULL
2262  *
2263  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2264  * are NULL
2265  *
2266  * Returns a new document, do not initialize the DTD if not provided
2267  */
2268 htmlDocPtr
2269 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2270     xmlDocPtr cur;
2271
2272     /*
2273      * Allocate a new document and fill the fields.
2274      */
2275     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2276     if (cur == NULL) {
2277         htmlErrMemory(NULL, "HTML document creation failed\n");
2278         return(NULL);
2279     }
2280     memset(cur, 0, sizeof(xmlDoc));
2281
2282     cur->type = XML_HTML_DOCUMENT_NODE;
2283     cur->version = NULL;
2284     cur->intSubset = NULL;
2285     cur->doc = cur;
2286     cur->name = NULL;
2287     cur->children = NULL;
2288     cur->extSubset = NULL;
2289     cur->oldNs = NULL;
2290     cur->encoding = NULL;
2291     cur->standalone = 1;
2292     cur->compression = 0;
2293     cur->ids = NULL;
2294     cur->refs = NULL;
2295     cur->_private = NULL;
2296     cur->charset = XML_CHAR_ENCODING_UTF8;
2297     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2298     if ((ExternalID != NULL) ||
2299         (URI != NULL))
2300         xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2301     return(cur);
2302 }
2303
2304 /**
2305  * htmlNewDoc:
2306  * @URI:  URI for the dtd, or NULL
2307  * @ExternalID:  the external ID of the DTD, or NULL
2308  *
2309  * Creates a new HTML document
2310  *
2311  * Returns a new document
2312  */
2313 htmlDocPtr
2314 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2315     if ((URI == NULL) && (ExternalID == NULL))
2316         return(htmlNewDocNoDtD(
2317                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2318                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2319
2320     return(htmlNewDocNoDtD(URI, ExternalID));
2321 }
2322
2323
2324 /************************************************************************
2325  *                                                                      *
2326  *                      The parser itself                               *
2327  *      Relates to http://www.w3.org/TR/html40                          *
2328  *                                                                      *
2329  ************************************************************************/
2330
2331 /************************************************************************
2332  *                                                                      *
2333  *                      The parser itself                               *
2334  *                                                                      *
2335  ************************************************************************/
2336
2337 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2338
2339 /**
2340  * htmlParseHTMLName:
2341  * @ctxt:  an HTML parser context
2342  *
2343  * parse an HTML tag or attribute name, note that we convert it to lowercase
2344  * since HTML names are not case-sensitive.
2345  *
2346  * Returns the Tag Name parsed or NULL
2347  */
2348
2349 static const xmlChar *
2350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2351     int i = 0;
2352     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2353
2354     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2355         (CUR != ':') && (CUR != '.')) return(NULL);
2356
2357     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2358            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2359            (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2360            (CUR == '.'))) {
2361         if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2362         else loc[i] = CUR;
2363         i++;
2364
2365         NEXT;
2366     }
2367
2368     return(xmlDictLookup(ctxt->dict, loc, i));
2369 }
2370
2371
2372 /**
2373  * htmlParseHTMLName_nonInvasive:
2374  * @ctxt:  an HTML parser context
2375  *
2376  * parse an HTML tag or attribute name, note that we convert it to lowercase
2377  * since HTML names are not case-sensitive, this doesn't consume the data
2378  * from the stream, it's a look-ahead
2379  *
2380  * Returns the Tag Name parsed or NULL
2381  */
2382
2383 static const xmlChar *
2384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2385     int i = 0;
2386     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2387
2388     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2389         (NXT(1) != ':')) return(NULL);
2390
2391     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2392            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2393            (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2394         if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2395         else loc[i] = NXT(1+i);
2396         i++;
2397     }
2398
2399     return(xmlDictLookup(ctxt->dict, loc, i));
2400 }
2401
2402
2403 /**
2404  * htmlParseName:
2405  * @ctxt:  an HTML parser context
2406  *
2407  * parse an HTML name, this routine is case sensitive.
2408  *
2409  * Returns the Name parsed or NULL
2410  */
2411
2412 static const xmlChar *
2413 htmlParseName(htmlParserCtxtPtr ctxt) {
2414     const xmlChar *in;
2415     const xmlChar *ret;
2416     int count = 0;
2417
2418     GROW;
2419
2420     /*
2421      * Accelerator for simple ASCII names
2422      */
2423     in = ctxt->input->cur;
2424     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2425         ((*in >= 0x41) && (*in <= 0x5A)) ||
2426         (*in == '_') || (*in == ':')) {
2427         in++;
2428         while (((*in >= 0x61) && (*in <= 0x7A)) ||
2429                ((*in >= 0x41) && (*in <= 0x5A)) ||
2430                ((*in >= 0x30) && (*in <= 0x39)) ||
2431                (*in == '_') || (*in == '-') ||
2432                (*in == ':') || (*in == '.'))
2433             in++;
2434         if ((*in > 0) && (*in < 0x80)) {
2435             count = in - ctxt->input->cur;
2436             ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2437             ctxt->input->cur = in;
2438             ctxt->nbChars += count;
2439             ctxt->input->col += count;
2440             return(ret);
2441         }
2442     }
2443     return(htmlParseNameComplex(ctxt));
2444 }
2445
2446 static const xmlChar *
2447 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2448     int len = 0, l;
2449     int c;
2450     int count = 0;
2451
2452     /*
2453      * Handler for more complex cases
2454      */
2455     GROW;
2456     c = CUR_CHAR(l);
2457     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2458         (!IS_LETTER(c) && (c != '_') &&
2459          (c != ':'))) {
2460         return(NULL);
2461     }
2462
2463     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2464            ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2465             (c == '.') || (c == '-') ||
2466             (c == '_') || (c == ':') ||
2467             (IS_COMBINING(c)) ||
2468             (IS_EXTENDER(c)))) {
2469         if (count++ > 100) {
2470             count = 0;
2471             GROW;
2472         }
2473         len += l;
2474         NEXTL(l);
2475         c = CUR_CHAR(l);
2476     }
2477     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2478 }
2479
2480
2481 /**
2482  * htmlParseHTMLAttribute:
2483  * @ctxt:  an HTML parser context
2484  * @stop:  a char stop value
2485  *
2486  * parse an HTML attribute value till the stop (quote), if
2487  * stop is 0 then it stops at the first space
2488  *
2489  * Returns the attribute parsed or NULL
2490  */
2491
2492 static xmlChar *
2493 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2494     xmlChar *buffer = NULL;
2495     int buffer_size = 0;
2496     xmlChar *out = NULL;
2497     const xmlChar *name = NULL;
2498     const xmlChar *cur = NULL;
2499     const htmlEntityDesc * ent;
2500
2501     /*
2502      * allocate a translation buffer.
2503      */
2504     buffer_size = HTML_PARSER_BUFFER_SIZE;
2505     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2506     if (buffer == NULL) {
2507         htmlErrMemory(ctxt, "buffer allocation failed\n");
2508         return(NULL);
2509     }
2510     out = buffer;
2511
2512     /*
2513      * Ok loop until we reach one of the ending chars
2514      */
2515     while ((CUR != 0) && (CUR != stop)) {
2516         if ((stop == 0) && (CUR == '>')) break;
2517         if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2518         if (CUR == '&') {
2519             if (NXT(1) == '#') {
2520                 unsigned int c;
2521                 int bits;
2522
2523                 c = htmlParseCharRef(ctxt);
2524                 if      (c <    0x80)
2525                         { *out++  = c;                bits= -6; }
2526                 else if (c <   0x800)
2527                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2528                 else if (c < 0x10000)
2529                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2530                 else
2531                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2532
2533                 for ( ; bits >= 0; bits-= 6) {
2534                     *out++  = ((c >> bits) & 0x3F) | 0x80;
2535                 }
2536
2537                 if (out - buffer > buffer_size - 100) {
2538                         int indx = out - buffer;
2539
2540                         growBuffer(buffer);
2541                         out = &buffer[indx];
2542                 }
2543             } else {
2544                 ent = htmlParseEntityRef(ctxt, &name);
2545                 if (name == NULL) {
2546                     *out++ = '&';
2547                     if (out - buffer > buffer_size - 100) {
2548                         int indx = out - buffer;
2549
2550                         growBuffer(buffer);
2551                         out = &buffer[indx];
2552                     }
2553                 } else if (ent == NULL) {
2554                     *out++ = '&';
2555                     cur = name;
2556                     while (*cur != 0) {
2557                         if (out - buffer > buffer_size - 100) {
2558                             int indx = out - buffer;
2559
2560                             growBuffer(buffer);
2561                             out = &buffer[indx];
2562                         }
2563                         *out++ = *cur++;
2564                     }
2565                 } else {
2566                     unsigned int c;
2567                     int bits;
2568
2569                     if (out - buffer > buffer_size - 100) {
2570                         int indx = out - buffer;
2571
2572                         growBuffer(buffer);
2573                         out = &buffer[indx];
2574                     }
2575                     c = ent->value;
2576                     if      (c <    0x80)
2577                         { *out++  = c;                bits= -6; }
2578                     else if (c <   0x800)
2579                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2580                     else if (c < 0x10000)
2581                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2582                     else
2583                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2584
2585                     for ( ; bits >= 0; bits-= 6) {
2586                         *out++  = ((c >> bits) & 0x3F) | 0x80;
2587                     }
2588                 }
2589             }
2590         } else {
2591             unsigned int c;
2592             int bits, l;
2593
2594             if (out - buffer > buffer_size - 100) {
2595                 int indx = out - buffer;
2596
2597                 growBuffer(buffer);
2598                 out = &buffer[indx];
2599             }
2600             c = CUR_CHAR(l);
2601             if      (c <    0x80)
2602                     { *out++  = c;                bits= -6; }
2603             else if (c <   0x800)
2604                     { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2605             else if (c < 0x10000)
2606                     { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2607             else
2608                     { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2609
2610             for ( ; bits >= 0; bits-= 6) {
2611                 *out++  = ((c >> bits) & 0x3F) | 0x80;
2612             }
2613             NEXT;
2614         }
2615     }
2616     *out = 0;
2617     return(buffer);
2618 }
2619
2620 /**
2621  * htmlParseEntityRef:
2622  * @ctxt:  an HTML parser context
2623  * @str:  location to store the entity name
2624  *
2625  * parse an HTML ENTITY references
2626  *
2627  * [68] EntityRef ::= '&' Name ';'
2628  *
2629  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2630  *         if non-NULL *str will have to be freed by the caller.
2631  */
2632 const htmlEntityDesc *
2633 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2634     const xmlChar *name;
2635     const htmlEntityDesc * ent = NULL;
2636
2637     if (str != NULL) *str = NULL;
2638     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2639
2640     if (CUR == '&') {
2641         NEXT;
2642         name = htmlParseName(ctxt);
2643         if (name == NULL) {
2644             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2645                          "htmlParseEntityRef: no name\n", NULL, NULL);
2646         } else {
2647             GROW;
2648             if (CUR == ';') {
2649                 if (str != NULL)
2650                     *str = name;
2651
2652                 /*
2653                  * Lookup the entity in the table.
2654                  */
2655                 ent = htmlEntityLookup(name);
2656                 if (ent != NULL) /* OK that's ugly !!! */
2657                     NEXT;
2658             } else {
2659                 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2660                              "htmlParseEntityRef: expecting ';'\n",
2661                              NULL, NULL);
2662                 if (str != NULL)
2663                     *str = name;
2664             }
2665         }
2666     }
2667     return(ent);
2668 }
2669
2670 /**
2671  * htmlParseAttValue:
2672  * @ctxt:  an HTML parser context
2673  *
2674  * parse a value for an attribute
2675  * Note: the parser won't do substitution of entities here, this
2676  * will be handled later in xmlStringGetNodeList, unless it was
2677  * asked for ctxt->replaceEntities != 0
2678  *
2679  * Returns the AttValue parsed or NULL.
2680  */
2681
2682 static xmlChar *
2683 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2684     xmlChar *ret = NULL;
2685
2686     if (CUR == '"') {
2687         NEXT;
2688         ret = htmlParseHTMLAttribute(ctxt, '"');
2689         if (CUR != '"') {
2690             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2691                          "AttValue: \" expected\n", NULL, NULL);
2692         } else
2693             NEXT;
2694     } else if (CUR == '\'') {
2695         NEXT;
2696         ret = htmlParseHTMLAttribute(ctxt, '\'');
2697         if (CUR != '\'') {
2698             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2699                          "AttValue: ' expected\n", NULL, NULL);
2700         } else
2701             NEXT;
2702     } else {
2703         /*
2704          * That's an HTMLism, the attribute value may not be quoted
2705          */
2706         ret = htmlParseHTMLAttribute(ctxt, 0);
2707         if (ret == NULL) {
2708             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2709                          "AttValue: no value found\n", NULL, NULL);
2710         }
2711     }
2712     return(ret);
2713 }
2714
2715 /**
2716  * htmlParseSystemLiteral:
2717  * @ctxt:  an HTML parser context
2718  *
2719  * parse an HTML Literal
2720  *
2721  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2722  *
2723  * Returns the SystemLiteral parsed or NULL
2724  */
2725
2726 static xmlChar *
2727 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2728     const xmlChar *q;
2729     xmlChar *ret = NULL;
2730
2731     if (CUR == '"') {
2732         NEXT;
2733         q = CUR_PTR;
2734         while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2735             NEXT;
2736         if (!IS_CHAR_CH(CUR)) {
2737             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2738                          "Unfinished SystemLiteral\n", NULL, NULL);
2739         } else {
2740             ret = xmlStrndup(q, CUR_PTR - q);
2741             NEXT;
2742         }
2743     } else if (CUR == '\'') {
2744         NEXT;
2745         q = CUR_PTR;
2746         while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2747             NEXT;
2748         if (!IS_CHAR_CH(CUR)) {
2749             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2750                          "Unfinished SystemLiteral\n", NULL, NULL);
2751         } else {
2752             ret = xmlStrndup(q, CUR_PTR - q);
2753             NEXT;
2754         }
2755     } else {
2756         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2757                      " or ' expected\n", NULL, NULL);
2758     }
2759
2760     return(ret);
2761 }
2762
2763 /**
2764  * htmlParsePubidLiteral:
2765  * @ctxt:  an HTML parser context
2766  *
2767  * parse an HTML public literal
2768  *
2769  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2770  *
2771  * Returns the PubidLiteral parsed or NULL.
2772  */
2773
2774 static xmlChar *
2775 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2776     const xmlChar *q;
2777     xmlChar *ret = NULL;
2778     /*
2779      * Name ::= (Letter | '_') (NameChar)*
2780      */
2781     if (CUR == '"') {
2782         NEXT;
2783         q = CUR_PTR;
2784         while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2785         if (CUR != '"') {
2786             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2787                          "Unfinished PubidLiteral\n", NULL, NULL);
2788         } else {
2789             ret = xmlStrndup(q, CUR_PTR - q);
2790             NEXT;
2791         }
2792     } else if (CUR == '\'') {
2793         NEXT;
2794         q = CUR_PTR;
2795         while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2796             NEXT;
2797         if (CUR != '\'') {
2798             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2799                          "Unfinished PubidLiteral\n", NULL, NULL);
2800         } else {
2801             ret = xmlStrndup(q, CUR_PTR - q);
2802             NEXT;
2803         }
2804     } else {
2805         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2806                      "PubidLiteral \" or ' expected\n", NULL, NULL);
2807     }
2808
2809     return(ret);
2810 }
2811
2812 /**
2813  * htmlParseScript:
2814  * @ctxt:  an HTML parser context
2815  *
2816  * parse the content of an HTML SCRIPT or STYLE element
2817  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2818  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2819  * http://www.w3.org/TR/html4/types.html#type-script
2820  * http://www.w3.org/TR/html4/types.html#h-6.15
2821  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2822  *
2823  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2824  * element and the value of intrinsic event attributes. User agents must
2825  * not evaluate script data as HTML markup but instead must pass it on as
2826  * data to a script engine.
2827  * NOTES:
2828  * - The content is passed like CDATA
2829  * - the attributes for style and scripting "onXXX" are also described
2830  *   as CDATA but SGML allows entities references in attributes so their
2831  *   processing is identical as other attributes
2832  */
2833 static void
2834 htmlParseScript(htmlParserCtxtPtr ctxt) {
2835     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2836     int nbchar = 0;
2837     int cur,l;
2838
2839     SHRINK;
2840     cur = CUR_CHAR(l);
2841     while (IS_CHAR_CH(cur)) {
2842         if ((cur == '<') && (NXT(1) == '/')) {
2843             /*
2844              * One should break here, the specification is clear:
2845              * Authors should therefore escape "</" within the content.
2846              * Escape mechanisms are specific to each scripting or
2847              * style sheet language.
2848              *
2849              * In recovery mode, only break if end tag match the
2850              * current tag, effectively ignoring all tags inside the
2851              * script/style block and treating the entire block as
2852              * CDATA.
2853              */
2854             if (ctxt->recovery) {
2855                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2856                                    xmlStrlen(ctxt->name)) == 0)
2857                 {
2858                     break; /* while */
2859                 } else {
2860                     htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2861                                  "Element %s embeds close tag\n",
2862                                  ctxt->name, NULL);
2863                 }
2864             } else {
2865                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2866                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2867                 {
2868                     break; /* while */
2869                 }
2870             }
2871         }
2872         COPY_BUF(l,buf,nbchar,cur);
2873         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2874             if (ctxt->sax->cdataBlock!= NULL) {
2875                 /*
2876                  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2877                  */
2878                 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2879             } else if (ctxt->sax->characters != NULL) {
2880                 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2881             }
2882             nbchar = 0;
2883         }
2884         GROW;
2885         NEXTL(l);
2886         cur = CUR_CHAR(l);
2887     }
2888
2889     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2890         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2891                     "Invalid char in CDATA 0x%X\n", cur);
2892         if (ctxt->input->cur < ctxt->input->end) {
2893             NEXT;
2894         }
2895     }
2896
2897     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2898         if (ctxt->sax->cdataBlock!= NULL) {
2899             /*
2900              * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2901              */
2902             ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2903         } else if (ctxt->sax->characters != NULL) {
2904             ctxt->sax->characters(ctxt->userData, buf, nbchar);
2905         }
2906     }
2907 }
2908
2909
2910 /**
2911  * htmlParseCharData:
2912  * @ctxt:  an HTML parser context
2913  *
2914  * parse a CharData section.
2915  * if we are within a CDATA section ']]>' marks an end of section.
2916  *
2917  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2918  */
2919
2920 static void
2921 htmlParseCharData(htmlParserCtxtPtr ctxt) {
2922     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2923     int nbchar = 0;
2924     int cur, l;
2925     int chunk = 0;
2926
2927     SHRINK;
2928     cur = CUR_CHAR(l);
2929     while (((cur != '<') || (ctxt->token == '<')) &&
2930            ((cur != '&') || (ctxt->token == '&')) &&
2931            (cur != 0)) {
2932         if (!(IS_CHAR(cur))) {
2933             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2934                         "Invalid char in CDATA 0x%X\n", cur);
2935         } else {
2936             COPY_BUF(l,buf,nbchar,cur);
2937         }
2938         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2939             /*
2940              * Ok the segment is to be consumed as chars.
2941              */
2942             if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2943                 if (areBlanks(ctxt, buf, nbchar)) {
2944                     if (ctxt->sax->ignorableWhitespace != NULL)
2945                         ctxt->sax->ignorableWhitespace(ctxt->userData,
2946                                                        buf, nbchar);
2947                 } else {
2948                     htmlCheckParagraph(ctxt);
2949                     if (ctxt->sax->characters != NULL)
2950                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
2951                 }
2952             }
2953             nbchar = 0;
2954         }
2955         NEXTL(l);
2956         chunk++;
2957         if (chunk > HTML_PARSER_BUFFER_SIZE) {
2958             chunk = 0;
2959             SHRINK;
2960             GROW;
2961         }
2962         cur = CUR_CHAR(l);
2963         if (cur == 0) {
2964             SHRINK;
2965             GROW;
2966             cur = CUR_CHAR(l);
2967         }
2968     }
2969     if (nbchar != 0) {
2970         buf[nbchar] = 0;
2971
2972         /*
2973          * Ok the segment is to be consumed as chars.
2974          */
2975         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2976             if (areBlanks(ctxt, buf, nbchar)) {
2977                 if (ctxt->sax->ignorableWhitespace != NULL)
2978                     ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2979             } else {
2980                 htmlCheckParagraph(ctxt);
2981                 if (ctxt->sax->characters != NULL)
2982                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
2983             }
2984         }
2985     } else {
2986         /*
2987          * Loop detection
2988          */
2989         if (cur == 0)
2990             ctxt->instate = XML_PARSER_EOF;
2991     }
2992 }
2993
2994 /**
2995  * htmlParseExternalID:
2996  * @ctxt:  an HTML parser context
2997  * @publicID:  a xmlChar** receiving PubidLiteral
2998  *
2999  * Parse an External ID or a Public ID
3000  *
3001  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3002  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3003  *
3004  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3005  *
3006  * Returns the function returns SystemLiteral and in the second
3007  *                case publicID receives PubidLiteral, is strict is off
3008  *                it is possible to return NULL and have publicID set.
3009  */
3010
3011 static xmlChar *
3012 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3013     xmlChar *URI = NULL;
3014
3015     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3016          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3017          (UPP(4) == 'E') && (UPP(5) == 'M')) {
3018         SKIP(6);
3019         if (!IS_BLANK_CH(CUR)) {
3020             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3021                          "Space required after 'SYSTEM'\n", NULL, NULL);
3022         }
3023         SKIP_BLANKS;
3024         URI = htmlParseSystemLiteral(ctxt);
3025         if (URI == NULL) {
3026             htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3027                          "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3028         }
3029     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3030                (UPP(2) == 'B') && (UPP(3) == 'L') &&
3031                (UPP(4) == 'I') && (UPP(5) == 'C')) {
3032         SKIP(6);
3033         if (!IS_BLANK_CH(CUR)) {
3034             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3035                          "Space required after 'PUBLIC'\n", NULL, NULL);
3036         }
3037         SKIP_BLANKS;
3038         *publicID = htmlParsePubidLiteral(ctxt);
3039         if (*publicID == NULL) {
3040             htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3041                          "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3042                          NULL, NULL);
3043         }
3044         SKIP_BLANKS;
3045         if ((CUR == '"') || (CUR == '\'')) {
3046             URI = htmlParseSystemLiteral(ctxt);
3047         }
3048     }
3049     return(URI);
3050 }
3051
3052 /**
3053  * xmlParsePI:
3054  * @ctxt:  an XML parser context
3055  *
3056  * parse an XML Processing Instruction.
3057  *
3058  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3059  */
3060 static void
3061 htmlParsePI(htmlParserCtxtPtr ctxt) {
3062     xmlChar *buf = NULL;
3063     int len = 0;
3064     int size = HTML_PARSER_BUFFER_SIZE;
3065     int cur, l;
3066     const xmlChar *target;
3067     xmlParserInputState state;
3068     int count = 0;
3069
3070     if ((RAW == '<') && (NXT(1) == '?')) {
3071         state = ctxt->instate;
3072         ctxt->instate = XML_PARSER_PI;
3073         /*
3074          * this is a Processing Instruction.
3075          */
3076         SKIP(2);
3077         SHRINK;
3078
3079         /*
3080          * Parse the target name and check for special support like
3081          * namespace.
3082          */
3083         target = htmlParseName(ctxt);
3084         if (target != NULL) {
3085             if (RAW == '>') {
3086                 SKIP(1);
3087
3088                 /*
3089                  * SAX: PI detected.
3090                  */
3091                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3092                     (ctxt->sax->processingInstruction != NULL))
3093                     ctxt->sax->processingInstruction(ctxt->userData,
3094                                                      target, NULL);
3095                 ctxt->instate = state;
3096                 return;
3097             }
3098             buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3099             if (buf == NULL) {
3100                 htmlErrMemory(ctxt, NULL);
3101                 ctxt->instate = state;
3102                 return;
3103             }
3104             cur = CUR;
3105             if (!IS_BLANK(cur)) {
3106                 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3107                           "ParsePI: PI %s space expected\n", target, NULL);
3108             }
3109             SKIP_BLANKS;
3110             cur = CUR_CHAR(l);
3111             while (IS_CHAR(cur) && (cur != '>')) {
3112                 if (len + 5 >= size) {
3113                     xmlChar *tmp;
3114
3115                     size *= 2;
3116                     tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3117                     if (tmp == NULL) {
3118                         htmlErrMemory(ctxt, NULL);
3119                         xmlFree(buf);
3120                         ctxt->instate = state;
3121                         return;
3122                     }
3123                     buf = tmp;
3124                 }
3125                 count++;
3126                 if (count > 50) {
3127                     GROW;
3128                     count = 0;
3129                 }
3130                 COPY_BUF(l,buf,len,cur);
3131                 NEXTL(l);
3132                 cur = CUR_CHAR(l);
3133                 if (cur == 0) {
3134                     SHRINK;
3135                     GROW;
3136                     cur = CUR_CHAR(l);
3137                 }
3138             }
3139             buf[len] = 0;
3140             if (cur != '>') {
3141                 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3142                       "ParsePI: PI %s never end ...\n", target, NULL);
3143             } else {
3144                 SKIP(1);
3145
3146                 /*
3147                  * SAX: PI detected.
3148                  */
3149                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3150                     (ctxt->sax->processingInstruction != NULL))
3151                     ctxt->sax->processingInstruction(ctxt->userData,
3152                                                      target, buf);
3153             }
3154             xmlFree(buf);
3155         } else {
3156             htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3157                          "PI is not started correctly", NULL, NULL);
3158         }
3159         ctxt->instate = state;
3160     }
3161 }
3162
3163 /**
3164  * htmlParseComment:
3165  * @ctxt:  an HTML parser context
3166  *
3167  * Parse an XML (SGML) comment <!-- .... -->
3168  *
3169  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3170  */
3171 static void
3172 htmlParseComment(htmlParserCtxtPtr ctxt) {
3173     xmlChar *buf = NULL;
3174     int len;
3175     int size = HTML_PARSER_BUFFER_SIZE;
3176     int q, ql;
3177     int r, rl;
3178     int cur, l;
3179     xmlParserInputState state;
3180
3181     /*
3182      * Check that there is a comment right here.
3183      */
3184     if ((RAW != '<') || (NXT(1) != '!') ||
3185         (NXT(2) != '-') || (NXT(3) != '-')) return;
3186
3187     state = ctxt->instate;
3188     ctxt->instate = XML_PARSER_COMMENT;
3189     SHRINK;
3190     SKIP(4);
3191     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3192     if (buf == NULL) {
3193         htmlErrMemory(ctxt, "buffer allocation failed\n");
3194         ctxt->instate = state;
3195         return;
3196     }
3197     q = CUR_CHAR(ql);
3198     NEXTL(ql);
3199     r = CUR_CHAR(rl);
3200     NEXTL(rl);
3201     cur = CUR_CHAR(l);
3202     len = 0;
3203     while (IS_CHAR(cur) &&
3204            ((cur != '>') ||
3205             (r != '-') || (q != '-'))) {
3206         if (len + 5 >= size) {
3207             xmlChar *tmp;
3208
3209             size *= 2;
3210             tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3211             if (tmp == NULL) {
3212                 xmlFree(buf);
3213                 htmlErrMemory(ctxt, "growing buffer failed\n");
3214                 ctxt->instate = state;
3215                 return;
3216             }
3217             buf = tmp;
3218         }
3219         COPY_BUF(ql,buf,len,q);
3220         q = r;
3221         ql = rl;
3222         r = cur;
3223         rl = l;
3224         NEXTL(l);
3225         cur = CUR_CHAR(l);
3226         if (cur == 0) {
3227             SHRINK;
3228             GROW;
3229             cur = CUR_CHAR(l);
3230         }
3231     }
3232     buf[len] = 0;
3233     if (!IS_CHAR(cur)) {
3234         htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3235                      "Comment not terminated \n<!--%.50s\n", buf, NULL);
3236         xmlFree(buf);
3237     } else {
3238         NEXT;
3239         if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3240             (!ctxt->disableSAX))
3241             ctxt->sax->comment(ctxt->userData, buf);
3242         xmlFree(buf);
3243     }
3244     ctxt->instate = state;
3245 }
3246
3247 /**
3248  * htmlParseCharRef:
3249  * @ctxt:  an HTML parser context
3250  *
3251  * parse Reference declarations
3252  *
3253  * [66] CharRef ::= '&#' [0-9]+ ';' |
3254  *                  '&#x' [0-9a-fA-F]+ ';'
3255  *
3256  * Returns the value parsed (as an int)
3257  */
3258 int
3259 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3260     int val = 0;
3261
3262     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3263         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3264                      "htmlParseCharRef: context error\n",
3265                      NULL, NULL);
3266         return(0);
3267     }
3268     if ((CUR == '&') && (NXT(1) == '#') &&
3269         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3270         SKIP(3);
3271         while (CUR != ';') {
3272             if ((CUR >= '0') && (CUR <= '9'))
3273                 val = val * 16 + (CUR - '0');
3274             else if ((CUR >= 'a') && (CUR <= 'f'))
3275                 val = val * 16 + (CUR - 'a') + 10;
3276             else if ((CUR >= 'A') && (CUR <= 'F'))
3277                 val = val * 16 + (CUR - 'A') + 10;
3278             else {
3279                 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3280                              "htmlParseCharRef: missing semicolon\n",
3281                              NULL, NULL);
3282                 break;
3283             }
3284             NEXT;
3285         }
3286         if (CUR == ';')
3287             NEXT;
3288     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3289         SKIP(2);
3290         while (CUR != ';') {
3291             if ((CUR >= '0') && (CUR <= '9'))
3292                 val = val * 10 + (CUR - '0');
3293             else {
3294                 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3295                              "htmlParseCharRef: missing semicolon\n",
3296                              NULL, NULL);
3297                 break;
3298             }
3299             NEXT;
3300         }
3301         if (CUR == ';')
3302             NEXT;
3303     } else {
3304         htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3305                      "htmlParseCharRef: invalid value\n", NULL, NULL);
3306     }
3307     /*
3308      * Check the value IS_CHAR ...
3309      */
3310     if (IS_CHAR(val)) {
3311         return(val);
3312     } else {
3313         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3314                         "htmlParseCharRef: invalid xmlChar value %d\n",
3315                         val);
3316     }
3317     return(0);
3318 }
3319
3320
3321 /**
3322  * htmlParseDocTypeDecl:
3323  * @ctxt:  an HTML parser context
3324  *
3325  * parse a DOCTYPE declaration
3326  *
3327  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3328  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3329  */
3330
3331 static void
3332 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3333     const xmlChar *name;
3334     xmlChar *ExternalID = NULL;
3335     xmlChar *URI = NULL;
3336
3337     /*
3338      * We know that '<!DOCTYPE' has been detected.
3339      */
3340     SKIP(9);
3341
3342     SKIP_BLANKS;
3343
3344     /*
3345      * Parse the DOCTYPE name.
3346      */
3347     name = htmlParseName(ctxt);
3348     if (name == NULL) {
3349         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3350                      "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3351                      NULL, NULL);
3352     }
3353     /*
3354      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3355      */
3356
3357     SKIP_BLANKS;
3358
3359     /*
3360      * Check for SystemID and ExternalID
3361      */
3362     URI = htmlParseExternalID(ctxt, &ExternalID);
3363     SKIP_BLANKS;
3364
3365     /*
3366      * We should be at the end of the DOCTYPE declaration.
3367      */
3368     if (CUR != '>') {
3369         htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3370                      "DOCTYPE improperly terminated\n", NULL, NULL);
3371         /* We shouldn't try to resynchronize ... */
3372     }
3373     NEXT;
3374
3375     /*
3376      * Create or update the document accordingly to the DOCTYPE
3377      */
3378     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3379         (!ctxt->disableSAX))
3380         ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3381
3382     /*
3383      * Cleanup, since we don't use all those identifiers
3384      */
3385     if (URI != NULL) xmlFree(URI);
3386     if (ExternalID != NULL) xmlFree(ExternalID);
3387 }
3388
3389 /**
3390  * htmlParseAttribute:
3391  * @ctxt:  an HTML parser context
3392  * @value:  a xmlChar ** used to store the value of the attribute
3393  *
3394  * parse an attribute
3395  *
3396  * [41] Attribute ::= Name Eq AttValue
3397  *
3398  * [25] Eq ::= S? '=' S?
3399  *
3400  * With namespace:
3401  *
3402  * [NS 11] Attribute ::= QName Eq AttValue
3403  *
3404  * Also the case QName == xmlns:??? is handled independently as a namespace
3405  * definition.
3406  *
3407  * Returns the attribute name, and the value in *value.
3408  */
3409
3410 static const xmlChar *
3411 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3412     const xmlChar *name;
3413     xmlChar *val = NULL;
3414
3415     *value = NULL;
3416     name = htmlParseHTMLName(ctxt);
3417     if (name == NULL) {
3418         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3419                      "error parsing attribute name\n", NULL, NULL);
3420         return(NULL);
3421     }
3422
3423     /*
3424      * read the value
3425      */
3426     SKIP_BLANKS;
3427     if (CUR == '=') {
3428         NEXT;
3429         SKIP_BLANKS;
3430         val = htmlParseAttValue(ctxt);
3431     }
3432
3433     *value = val;
3434     return(name);
3435 }
3436
3437 /**
3438  * htmlCheckEncoding:
3439  * @ctxt:  an HTML parser context
3440  * @attvalue: the attribute value
3441  *
3442  * Checks an http-equiv attribute from a Meta tag to detect
3443  * the encoding
3444  * If a new encoding is detected the parser is switched to decode
3445  * it and pass UTF8
3446  */
3447 static void
3448 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3449     const xmlChar *encoding;
3450
3451     if ((ctxt == NULL) || (attvalue == NULL))
3452         return;
3453
3454     /* do not change encoding */
3455     if (ctxt->input->encoding != NULL)
3456         return;
3457
3458     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3459     if (encoding != NULL) {
3460         encoding += 8;
3461     } else {
3462         encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3463         if (encoding != NULL)
3464             encoding += 9;
3465     }
3466     if (encoding != NULL) {
3467         xmlCharEncoding enc;
3468         xmlCharEncodingHandlerPtr handler;
3469
3470         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3471
3472         if (ctxt->input->encoding != NULL)
3473             xmlFree((xmlChar *) ctxt->input->encoding);
3474         ctxt->input->encoding = xmlStrdup(encoding);
3475
3476         enc = xmlParseCharEncoding((const char *) encoding);
3477         /*
3478          * registered set of known encodings
3479          */
3480         if (enc != XML_CHAR_ENCODING_ERROR) {
3481             if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3482                  (enc == XML_CHAR_ENCODING_UTF16BE) ||
3483                  (enc == XML_CHAR_ENCODING_UCS4LE) ||
3484                  (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3485                 (ctxt->input->buf != NULL) &&
3486                 (ctxt->input->buf->encoder == NULL)) {
3487                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3488                              "htmlCheckEncoding: wrong encoding meta\n",
3489                              NULL, NULL);
3490             } else {
3491                 xmlSwitchEncoding(ctxt, enc);
3492             }
3493             ctxt->charset = XML_CHAR_ENCODING_UTF8;
3494         } else {
3495             /*
3496              * fallback for unknown encodings
3497              */
3498             handler = xmlFindCharEncodingHandler((const char *) encoding);
3499             if (handler != NULL) {
3500                 xmlSwitchToEncoding(ctxt, handler);
3501                 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3502             } else {
3503                 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3504             }
3505         }
3506
3507         if ((ctxt->input->buf != NULL) &&
3508             (ctxt->input->buf->encoder != NULL) &&
3509             (ctxt->input->buf->raw != NULL) &&
3510             (ctxt->input->buf->buffer != NULL)) {
3511             int nbchars;
3512             int processed;
3513
3514             /*
3515              * convert as much as possible to the parser reading buffer.
3516              */
3517             processed = ctxt->input->cur - ctxt->input->base;
3518             xmlBufferShrink(ctxt->input->buf->buffer, processed);
3519             nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3520                                        ctxt->input->buf->buffer,
3521                                        ctxt->input->buf->raw);
3522             if (nbchars < 0) {
3523                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3524                              "htmlCheckEncoding: encoder error\n",
3525                              NULL, NULL);
3526             }
3527             ctxt->input->base =
3528             ctxt->input->cur = ctxt->input->buf->buffer->content;
3529             ctxt->input->end =
3530                           &ctxt->input->base[ctxt->input->buf->buffer->use];
3531         }
3532     }
3533 }
3534
3535 /**
3536  * htmlCheckMeta:
3537  * @ctxt:  an HTML parser context
3538  * @atts:  the attributes values
3539  *
3540  * Checks an attributes from a Meta tag
3541  */
3542 static void
3543 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3544     int i;
3545     const xmlChar *att, *value;
3546     int http = 0;
3547     const xmlChar *content = NULL;
3548
3549     if ((ctxt == NULL) || (atts == NULL))
3550         return;
3551
3552     i = 0;
3553     att = atts[i++];
3554     while (att != NULL) {
3555         value = atts[i++];
3556         if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3557          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3558             http = 1;
3559         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3560             content = value;
3561         att = atts[i++];
3562     }
3563     if ((http) && (content != NULL))
3564         htmlCheckEncoding(ctxt, content);
3565
3566 }
3567
3568 /**
3569  * htmlParseStartTag:
3570  * @ctxt:  an HTML parser context
3571  *
3572  * parse a start of tag either for rule element or
3573  * EmptyElement. In both case we don't parse the tag closing chars.
3574  *
3575  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3576  *
3577  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3578  *
3579  * With namespace:
3580  *
3581  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3582  *
3583  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3584  *
3585  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3586  */
3587
3588 static int
3589 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3590     const xmlChar *name;
3591     const xmlChar *attname;
3592     xmlChar *attvalue;
3593     const xmlChar **atts;
3594     int nbatts = 0;
3595     int maxatts;
3596     int meta = 0;
3597     int i;
3598     int discardtag = 0;
3599
3600     if (ctxt->instate == XML_PARSER_EOF)
3601         return(-1);
3602     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3603         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3604                      "htmlParseStartTag: context error\n", NULL, NULL);
3605         return -1;
3606     }
3607     if (CUR != '<') return -1;
3608     NEXT;
3609
3610     atts = ctxt->atts;
3611     maxatts = ctxt->maxatts;
3612
3613     GROW;
3614     name = htmlParseHTMLName(ctxt);
3615     if (name == NULL) {
3616         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3617                      "htmlParseStartTag: invalid element name\n",
3618                      NULL, NULL);
3619         /* Dump the bogus tag like browsers do */
3620         while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3621                (ctxt->instate != XML_PARSER_EOF))
3622             NEXT;
3623         return -1;
3624     }
3625     if (xmlStrEqual(name, BAD_CAST"meta"))
3626         meta = 1;
3627
3628     /*
3629      * Check for auto-closure of HTML elements.
3630      */
3631     htmlAutoClose(ctxt, name);
3632
3633     /*
3634      * Check for implied HTML elements.
3635      */
3636     htmlCheckImplied(ctxt, name);
3637
3638     /*
3639      * Avoid html at any level > 0, head at any level != 1
3640      * or any attempt to recurse body
3641      */
3642     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3643         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3644                      "htmlParseStartTag: misplaced <html> tag\n",
3645                      name, NULL);
3646         discardtag = 1;
3647         ctxt->depth++;
3648     }
3649     if ((ctxt->nameNr != 1) &&
3650         (xmlStrEqual(name, BAD_CAST"head"))) {
3651         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3652                      "htmlParseStartTag: misplaced <head> tag\n",
3653                      name, NULL);
3654         discardtag = 1;
3655         ctxt->depth++;
3656     }
3657     if (xmlStrEqual(name, BAD_CAST"body")) {
3658         int indx;
3659         for (indx = 0;indx < ctxt->nameNr;indx++) {
3660             if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3661                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3662                              "htmlParseStartTag: misplaced <body> tag\n",
3663                              name, NULL);
3664                 discardtag = 1;
3665                 ctxt->depth++;
3666             }
3667         }
3668     }
3669
3670     /*
3671      * Now parse the attributes, it ends up with the ending
3672      *
3673      * (S Attribute)* S?
3674      */
3675     SKIP_BLANKS;
3676     while ((IS_CHAR_CH(CUR)) &&
3677            (CUR != '>') &&
3678            ((CUR != '/') || (NXT(1) != '>'))) {
3679         long cons = ctxt->nbChars;
3680
3681         GROW;
3682         attname = htmlParseAttribute(ctxt, &attvalue);
3683         if (attname != NULL) {
3684
3685             /*
3686              * Well formedness requires at most one declaration of an attribute
3687              */
3688             for (i = 0; i < nbatts;i += 2) {
3689                 if (xmlStrEqual(atts[i], attname)) {
3690                     htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3691                                  "Attribute %s redefined\n", attname, NULL);
3692                     if (attvalue != NULL)
3693                         xmlFree(attvalue);
3694                     goto failed;
3695                 }
3696             }
3697
3698             /*
3699              * Add the pair to atts
3700              */
3701             if (atts == NULL) {
3702                 maxatts = 22; /* allow for 10 attrs by default */
3703                 atts = (const xmlChar **)
3704                        xmlMalloc(maxatts * sizeof(xmlChar *));
3705                 if (atts == NULL) {
3706                     htmlErrMemory(ctxt, NULL);
3707                     if (attvalue != NULL)
3708                         xmlFree(attvalue);
3709                     goto failed;
3710                 }
3711                 ctxt->atts = atts;
3712                 ctxt->maxatts = maxatts;
3713             } else if (nbatts + 4 > maxatts) {
3714                 const xmlChar **n;
3715
3716                 maxatts *= 2;
3717                 n = (const xmlChar **) xmlRealloc((void *) atts,
3718                                              maxatts * sizeof(const xmlChar *));
3719                 if (n == NULL) {
3720                     htmlErrMemory(ctxt, NULL);
3721                     if (attvalue != NULL)
3722                         xmlFree(attvalue);
3723                     goto failed;
3724                 }
3725                 atts = n;
3726                 ctxt->atts = atts;
3727                 ctxt->maxatts = maxatts;
3728             }
3729             atts[nbatts++] = attname;
3730             atts[nbatts++] = attvalue;
3731             atts[nbatts] = NULL;
3732             atts[nbatts + 1] = NULL;
3733         }
3734         else {
3735             if (attvalue != NULL)
3736                 xmlFree(attvalue);
3737             /* Dump the bogus attribute string up to the next blank or
3738              * the end of the tag. */
3739             while ((IS_CHAR_CH(CUR)) &&
3740                    !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3741                    ((CUR != '/') || (NXT(1) != '>')))
3742                 NEXT;
3743         }
3744
3745 failed:
3746         SKIP_BLANKS;
3747         if (cons == ctxt->nbChars) {
3748             htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3749                          "htmlParseStartTag: problem parsing attributes\n",
3750                          NULL, NULL);
3751             break;
3752         }
3753     }
3754
3755     /*
3756      * Handle specific association to the META tag
3757      */
3758     if (meta && (nbatts != 0))
3759         htmlCheckMeta(ctxt, atts);
3760
3761     /*
3762      * SAX: Start of Element !
3763      */
3764     if (!discardtag) {
3765         htmlnamePush(ctxt, name);
3766         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3767             if (nbatts != 0)
3768                 ctxt->sax->startElement(ctxt->userData, name, atts);
3769             else
3770                 ctxt->sax->startElement(ctxt->userData, name, NULL);
3771         }
3772     }
3773
3774     if (atts != NULL) {
3775         for (i = 1;i < nbatts;i += 2) {
3776             if (atts[i] != NULL)
3777                 xmlFree((xmlChar *) atts[i]);
3778         }
3779     }
3780
3781     return(discardtag);
3782 }
3783
3784 /**
3785  * htmlParseEndTag:
3786  * @ctxt:  an HTML parser context
3787  *
3788  * parse an end of tag
3789  *
3790  * [42] ETag ::= '</' Name S? '>'
3791  *
3792  * With namespace
3793  *
3794  * [NS 9] ETag ::= '</' QName S? '>'
3795  *
3796  * Returns 1 if the current level should be closed.
3797  */
3798
3799 static int
3800 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3801 {
3802     const xmlChar *name;
3803     const xmlChar *oldname;
3804     int i, ret;
3805
3806     if ((CUR != '<') || (NXT(1) != '/')) {
3807         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3808                      "htmlParseEndTag: '</' not found\n", NULL, NULL);
3809         return (0);
3810     }
3811     SKIP(2);
3812
3813     name = htmlParseHTMLName(ctxt);
3814     if (name == NULL)
3815         return (0);
3816     /*
3817      * We should definitely be at the ending "S? '>'" part
3818      */
3819     SKIP_BLANKS;
3820     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3821         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3822                      "End tag : expected '>'\n", NULL, NULL);
3823         if (ctxt->recovery) {
3824             /*
3825              * We're not at the ending > !!
3826              * Error, unless in recover mode where we search forwards
3827              * until we find a >
3828              */
3829             while (CUR != '\0' && CUR != '>') NEXT;
3830             NEXT;
3831         }
3832     } else
3833         NEXT;
3834
3835     /*
3836      * if we ignored misplaced tags in htmlParseStartTag don't pop them
3837      * out now.
3838      */
3839     if ((ctxt->depth > 0) &&
3840         (xmlStrEqual(name, BAD_CAST "html") ||
3841          xmlStrEqual(name, BAD_CAST "body") ||
3842          xmlStrEqual(name, BAD_CAST "head"))) {
3843         ctxt->depth--;
3844         return (0);
3845     }
3846
3847     /*
3848      * If the name read is not one of the element in the parsing stack
3849      * then return, it's just an error.
3850      */
3851     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3852         if (xmlStrEqual(name, ctxt->nameTab[i]))
3853             break;
3854     }
3855     if (i < 0) {
3856         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3857                      "Unexpected end tag : %s\n", name, NULL);
3858         return (0);
3859     }
3860
3861
3862     /*
3863      * Check for auto-closure of HTML elements.
3864      */
3865
3866     htmlAutoCloseOnClose(ctxt, name);
3867
3868     /*
3869      * Well formedness constraints, opening and closing must match.
3870      * With the exception that the autoclose may have popped stuff out
3871      * of the stack.
3872      */
3873     if (!xmlStrEqual(name, ctxt->name)) {
3874         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3875             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3876                          "Opening and ending tag mismatch: %s and %s\n",
3877                          name, ctxt->name);
3878         }
3879     }
3880
3881     /*
3882      * SAX: End of Tag
3883      */
3884     oldname = ctxt->name;
3885     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3886         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3887             ctxt->sax->endElement(ctxt->userData, name);
3888         htmlnamePop(ctxt);
3889         ret = 1;
3890     } else {
3891         ret = 0;
3892     }
3893
3894     return (ret);
3895 }
3896
3897
3898 /**
3899  * htmlParseReference:
3900  * @ctxt:  an HTML parser context
3901  *
3902  * parse and handle entity references in content,
3903  * this will end-up in a call to character() since this is either a
3904  * CharRef, or a predefined entity.
3905  */
3906 static void
3907 htmlParseReference(htmlParserCtxtPtr ctxt) {
3908     const htmlEntityDesc * ent;
3909     xmlChar out[6];
3910     const xmlChar *name;
3911     if (CUR != '&') return;
3912
3913     if (NXT(1) == '#') {
3914         unsigned int c;
3915         int bits, i = 0;
3916
3917         c = htmlParseCharRef(ctxt);
3918         if (c == 0)
3919             return;
3920
3921         if      (c <    0x80) { out[i++]= c;                bits= -6; }
3922         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3923         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3924         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3925
3926         for ( ; bits >= 0; bits-= 6) {
3927             out[i++]= ((c >> bits) & 0x3F) | 0x80;
3928         }
3929         out[i] = 0;
3930
3931         htmlCheckParagraph(ctxt);
3932         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3933             ctxt->sax->characters(ctxt->userData, out, i);
3934     } else {
3935         ent = htmlParseEntityRef(ctxt, &name);
3936         if (name == NULL) {
3937             htmlCheckParagraph(ctxt);
3938             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3939                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3940             return;
3941         }
3942         if ((ent == NULL) || !(ent->value > 0)) {
3943             htmlCheckParagraph(ctxt);
3944             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3945                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3946                 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3947                 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3948             }
3949         } else {
3950             unsigned int c;
3951             int bits, i = 0;
3952
3953             c = ent->value;
3954             if      (c <    0x80)
3955                     { out[i++]= c;                bits= -6; }
3956             else if (c <   0x800)
3957                     { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3958             else if (c < 0x10000)
3959                     { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3960             else
3961                     { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3962
3963             for ( ; bits >= 0; bits-= 6) {
3964                 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3965             }
3966             out[i] = 0;
3967
3968             htmlCheckParagraph(ctxt);
3969             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3970                 ctxt->sax->characters(ctxt->userData, out, i);
3971         }
3972     }
3973 }
3974
3975 /**
3976  * htmlParseContent:
3977  * @ctxt:  an HTML parser context
3978  *
3979  * Parse a content: comment, sub-element, reference or text.
3980  * Kept for compatibility with old code
3981  */
3982
3983 static void
3984 htmlParseContent(htmlParserCtxtPtr ctxt) {
3985     xmlChar *currentNode;
3986     int depth;
3987     const xmlChar *name;
3988
3989     currentNode = xmlStrdup(ctxt->name);
3990     depth = ctxt->nameNr;
3991     while (1) {
3992         long cons = ctxt->nbChars;
3993
3994         GROW;
3995
3996         if (ctxt->instate == XML_PARSER_EOF)
3997             break;
3998
3999         /*
4000          * Our tag or one of it's parent or children is ending.
4001          */
4002         if ((CUR == '<') && (NXT(1) == '/')) {
4003             if (htmlParseEndTag(ctxt) &&
4004                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4005                 if (currentNode != NULL)
4006                     xmlFree(currentNode);
4007                 return;
4008             }
4009             continue; /* while */
4010         }
4011
4012         else if ((CUR == '<') &&
4013                  ((IS_ASCII_LETTER(NXT(1))) ||
4014                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4015             name = htmlParseHTMLName_nonInvasive(ctxt);
4016             if (name == NULL) {
4017                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4018                          "htmlParseStartTag: invalid element name\n",
4019                          NULL, NULL);
4020                 /* Dump the bogus tag like browsers do */
4021         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4022                     NEXT;
4023
4024                 if (currentNode != NULL)
4025                     xmlFree(currentNode);
4026                 return;
4027             }
4028
4029             if (ctxt->name != NULL) {
4030                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4031                     htmlAutoClose(ctxt, name);
4032                     continue;
4033                 }
4034             }
4035         }
4036
4037         /*
4038          * Has this node been popped out during parsing of
4039          * the next element
4040          */
4041         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4042             (!xmlStrEqual(currentNode, ctxt->name)))
4043              {
4044             if (currentNode != NULL) xmlFree(currentNode);
4045             return;
4046         }
4047
4048         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4049             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4050             /*
4051              * Handle SCRIPT/STYLE separately
4052              */
4053             htmlParseScript(ctxt);
4054         } else {
4055             /*
4056              * Sometimes DOCTYPE arrives in the middle of the document
4057              */
4058             if ((CUR == '<') && (NXT(1) == '!') &&
4059                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4060                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4061                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4062                 (UPP(8) == 'E')) {
4063                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4064                              "Misplaced DOCTYPE declaration\n",
4065                              BAD_CAST "DOCTYPE" , NULL);
4066                 htmlParseDocTypeDecl(ctxt);
4067             }
4068
4069             /*
4070              * First case :  a comment
4071              */
4072             if ((CUR == '<') && (NXT(1) == '!') &&
4073                 (NXT(2) == '-') && (NXT(3) == '-')) {
4074                 htmlParseComment(ctxt);
4075             }
4076
4077             /*
4078              * Second case : a Processing Instruction.
4079              */
4080             else if ((CUR == '<') && (NXT(1) == '?')) {
4081                 htmlParsePI(ctxt);
4082             }
4083
4084             /*
4085              * Third case :  a sub-element.
4086              */
4087             else if (CUR == '<') {
4088                 htmlParseElement(ctxt);
4089             }
4090
4091             /*
4092              * Fourth case : a reference. If if has not been resolved,
4093              *    parsing returns it's Name, create the node
4094              */
4095             else if (CUR == '&') {
4096                 htmlParseReference(ctxt);
4097             }
4098
4099             /*
4100              * Fifth case : end of the resource
4101              */
4102             else if (CUR == 0) {
4103                 htmlAutoCloseOnEnd(ctxt);
4104                 break;
4105             }
4106
4107             /*
4108              * Last case, text. Note that References are handled directly.
4109              */
4110             else {
4111                 htmlParseCharData(ctxt);
4112             }
4113
4114             if (cons == ctxt->nbChars) {
4115                 if (ctxt->node != NULL) {
4116                     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4117                                  "detected an error in element content\n",
4118                                  NULL, NULL);
4119                 }
4120                 break;
4121             }
4122         }
4123         GROW;
4124     }
4125     if (currentNode != NULL) xmlFree(currentNode);
4126 }
4127
4128 /**
4129  * htmlParseElement:
4130  * @ctxt:  an HTML parser context
4131  *
4132  * parse an HTML element, this is highly recursive
4133  * this is kept for compatibility with previous code versions
4134  *
4135  * [39] element ::= EmptyElemTag | STag content ETag
4136  *
4137  * [41] Attribute ::= Name Eq AttValue
4138  */
4139
4140 void
4141 htmlParseElement(htmlParserCtxtPtr ctxt) {
4142     const xmlChar *name;
4143     xmlChar *currentNode = NULL;
4144     const htmlElemDesc * info;
4145     htmlParserNodeInfo node_info;
4146     int failed;
4147     int depth;
4148     const xmlChar *oldptr;
4149
4150     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4151         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4152                      "htmlParseElement: context error\n", NULL, NULL);
4153         return;
4154     }
4155
4156     if (ctxt->instate == XML_PARSER_EOF)
4157         return;
4158
4159     /* Capture start position */
4160     if (ctxt->record_info) {
4161         node_info.begin_pos = ctxt->input->consumed +
4162                           (CUR_PTR - ctxt->input->base);
4163         node_info.begin_line = ctxt->input->line;
4164     }
4165
4166     failed = htmlParseStartTag(ctxt);
4167     name = ctxt->name;
4168     if ((failed == -1) || (name == NULL)) {
4169         if (CUR == '>')
4170             NEXT;
4171         return;
4172     }
4173
4174     /*
4175      * Lookup the info for that element.
4176      */
4177     info = htmlTagLookup(name);
4178     if (info == NULL) {
4179         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4180                      "Tag %s invalid\n", name, NULL);
4181     }
4182
4183     /*
4184      * Check for an Empty Element labeled the XML/SGML way
4185      */
4186     if ((CUR == '/') && (NXT(1) == '>')) {
4187         SKIP(2);
4188         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4189             ctxt->sax->endElement(ctxt->userData, name);
4190         htmlnamePop(ctxt);
4191         return;
4192     }
4193
4194     if (CUR == '>') {
4195         NEXT;
4196     } else {
4197         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4198                      "Couldn't find end of Start Tag %s\n", name, NULL);
4199
4200         /*
4201          * end of parsing of this node.
4202          */
4203         if (xmlStrEqual(name, ctxt->name)) {
4204             nodePop(ctxt);
4205             htmlnamePop(ctxt);
4206         }
4207
4208         /*
4209          * Capture end position and add node
4210          */
4211         if (ctxt->record_info) {
4212            node_info.end_pos = ctxt->input->consumed +
4213                               (CUR_PTR - ctxt->input->base);
4214            node_info.end_line = ctxt->input->line;
4215            node_info.node = ctxt->node;
4216            xmlParserAddNodeInfo(ctxt, &node_info);
4217         }
4218         return;
4219     }
4220
4221     /*
4222      * Check for an Empty Element from DTD definition
4223      */
4224     if ((info != NULL) && (info->empty)) {
4225         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4226             ctxt->sax->endElement(ctxt->userData, name);
4227         htmlnamePop(ctxt);
4228         return;
4229     }
4230
4231     /*
4232      * Parse the content of the element:
4233      */
4234     currentNode = xmlStrdup(ctxt->name);
4235     depth = ctxt->nameNr;
4236     while (IS_CHAR_CH(CUR)) {
4237         oldptr = ctxt->input->cur;
4238         htmlParseContent(ctxt);
4239         if (oldptr==ctxt->input->cur) break;
4240         if (ctxt->nameNr < depth) break;
4241     }
4242
4243     /*
4244      * Capture end position and add node
4245      */
4246     if ( currentNode != NULL && ctxt->record_info ) {
4247        node_info.end_pos = ctxt->input->consumed +
4248                           (CUR_PTR - ctxt->input->base);
4249        node_info.end_line = ctxt->input->line;
4250        node_info.node = ctxt->node;
4251        xmlParserAddNodeInfo(ctxt, &node_info);
4252     }
4253     if (!IS_CHAR_CH(CUR)) {
4254         htmlAutoCloseOnEnd(ctxt);
4255     }
4256
4257     if (currentNode != NULL)
4258         xmlFree(currentNode);
4259 }
4260
4261 static void
4262 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4263     /*
4264      * Capture end position and add node
4265      */
4266     if ( ctxt->node != NULL && ctxt->record_info ) {
4267        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4268                                 (CUR_PTR - ctxt->input->base);
4269        ctxt->nodeInfo->end_line = ctxt->input->line;
4270        ctxt->nodeInfo->node = ctxt->node;
4271        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4272        htmlNodeInfoPop(ctxt);
4273     }
4274     if (!IS_CHAR_CH(CUR)) {
4275        htmlAutoCloseOnEnd(ctxt);
4276     }
4277 }
4278
4279 /**
4280  * htmlParseElementInternal:
4281  * @ctxt:  an HTML parser context
4282  *
4283  * parse an HTML element, new version, non recursive
4284  *
4285  * [39] element ::= EmptyElemTag | STag content ETag
4286  *
4287  * [41] Attribute ::= Name Eq AttValue
4288  */
4289
4290 static void
4291 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4292     const xmlChar *name;
4293     const htmlElemDesc * info;
4294     htmlParserNodeInfo node_info;
4295     int failed;
4296
4297     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4298         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4299                      "htmlParseElementInternal: context error\n", NULL, NULL);
4300         return;
4301     }
4302
4303     if (ctxt->instate == XML_PARSER_EOF)
4304         return;
4305
4306     /* Capture start position */
4307     if (ctxt->record_info) {
4308         node_info.begin_pos = ctxt->input->consumed +
4309                           (CUR_PTR - ctxt->input->base);
4310         node_info.begin_line = ctxt->input->line;
4311     }
4312
4313     failed = htmlParseStartTag(ctxt);
4314     name = ctxt->name;
4315     if ((failed == -1) || (name == NULL)) {
4316         if (CUR == '>')
4317             NEXT;
4318         return;
4319     }
4320
4321     /*
4322      * Lookup the info for that element.
4323      */
4324     info = htmlTagLookup(name);
4325     if (info == NULL) {
4326         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4327                      "Tag %s invalid\n", name, NULL);
4328     }
4329
4330     /*
4331      * Check for an Empty Element labeled the XML/SGML way
4332      */
4333     if ((CUR == '/') && (NXT(1) == '>')) {
4334         SKIP(2);
4335         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4336             ctxt->sax->endElement(ctxt->userData, name);
4337         htmlnamePop(ctxt);
4338         return;
4339     }
4340
4341     if (CUR == '>') {
4342         NEXT;
4343     } else {
4344         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4345                      "Couldn't find end of Start Tag %s\n", name, NULL);
4346
4347         /*
4348          * end of parsing of this node.
4349          */
4350         if (xmlStrEqual(name, ctxt->name)) {
4351             nodePop(ctxt);
4352             htmlnamePop(ctxt);
4353         }
4354
4355         if (ctxt->record_info)
4356             htmlNodeInfoPush(ctxt, &node_info);
4357         htmlParserFinishElementParsing(ctxt);
4358         return;
4359     }
4360
4361     /*
4362      * Check for an Empty Element from DTD definition
4363      */
4364     if ((info != NULL) && (info->empty)) {
4365         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4366             ctxt->sax->endElement(ctxt->userData, name);
4367         htmlnamePop(ctxt);
4368         return;
4369     }
4370
4371     if (ctxt->record_info)
4372         htmlNodeInfoPush(ctxt, &node_info);
4373 }
4374
4375 /**
4376  * htmlParseContentInternal:
4377  * @ctxt:  an HTML parser context
4378  *
4379  * Parse a content: comment, sub-element, reference or text.
4380  * New version for non recursive htmlParseElementInternal
4381  */
4382
4383 static void
4384 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4385     xmlChar *currentNode;
4386     int depth;
4387     const xmlChar *name;
4388
4389     currentNode = xmlStrdup(ctxt->name);
4390     depth = ctxt->nameNr;
4391     while (1) {
4392         long cons = ctxt->nbChars;
4393
4394         GROW;
4395
4396         if (ctxt->instate == XML_PARSER_EOF)
4397             break;
4398
4399         /*
4400          * Our tag or one of it's parent or children is ending.
4401          */
4402         if ((CUR == '<') && (NXT(1) == '/')) {
4403             if (htmlParseEndTag(ctxt) &&
4404                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4405                 if (currentNode != NULL)
4406                     xmlFree(currentNode);
4407
4408                 currentNode = xmlStrdup(ctxt->name);
4409                 depth = ctxt->nameNr;
4410             }
4411             continue; /* while */
4412         }
4413
4414         else if ((CUR == '<') &&
4415                  ((IS_ASCII_LETTER(NXT(1))) ||
4416                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4417             name = htmlParseHTMLName_nonInvasive(ctxt);
4418             if (name == NULL) {
4419                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4420                          "htmlParseStartTag: invalid element name\n",
4421                          NULL, NULL);
4422                 /* Dump the bogus tag like browsers do */
4423                 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4424                     NEXT;
4425
4426                 htmlParserFinishElementParsing(ctxt);
4427                 if (currentNode != NULL)
4428                     xmlFree(currentNode);
4429
4430                 currentNode = xmlStrdup(ctxt->name);
4431                 depth = ctxt->nameNr;
4432                 continue;
4433             }
4434
4435             if (ctxt->name != NULL) {
4436                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4437                     htmlAutoClose(ctxt, name);
4438                     continue;
4439                 }
4440             }
4441         }
4442
4443         /*
4444          * Has this node been popped out during parsing of
4445          * the next element
4446          */
4447         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4448             (!xmlStrEqual(currentNode, ctxt->name)))
4449              {
4450             htmlParserFinishElementParsing(ctxt);
4451             if (currentNode != NULL) xmlFree(currentNode);
4452
4453             currentNode = xmlStrdup(ctxt->name);
4454             depth = ctxt->nameNr;
4455             continue;
4456         }
4457
4458         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4459             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4460             /*
4461              * Handle SCRIPT/STYLE separately
4462              */
4463             htmlParseScript(ctxt);
4464         } else {
4465             /*
4466              * Sometimes DOCTYPE arrives in the middle of the document
4467              */
4468             if ((CUR == '<') && (NXT(1) == '!') &&
4469                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4470                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4471                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4472                 (UPP(8) == 'E')) {
4473                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4474                              "Misplaced DOCTYPE declaration\n",
4475                              BAD_CAST "DOCTYPE" , NULL);
4476                 htmlParseDocTypeDecl(ctxt);
4477             }
4478
4479             /*
4480              * First case :  a comment
4481              */
4482             if ((CUR == '<') && (NXT(1) == '!') &&
4483                 (NXT(2) == '-') && (NXT(3) == '-')) {
4484                 htmlParseComment(ctxt);
4485             }
4486
4487             /*
4488              * Second case : a Processing Instruction.
4489              */
4490             else if ((CUR == '<') && (NXT(1) == '?')) {
4491                 htmlParsePI(ctxt);
4492             }
4493
4494             /*
4495              * Third case :  a sub-element.
4496              */
4497             else if (CUR == '<') {
4498                 htmlParseElementInternal(ctxt);
4499                 if (currentNode != NULL) xmlFree(currentNode);
4500
4501                 currentNode = xmlStrdup(ctxt->name);
4502                 depth = ctxt->nameNr;
4503             }
4504
4505             /*
4506              * Fourth case : a reference. If if has not been resolved,
4507              *    parsing returns it's Name, create the node
4508              */
4509             else if (CUR == '&') {
4510                 htmlParseReference(ctxt);
4511             }
4512
4513             /*
4514              * Fifth case : end of the resource
4515              */
4516             else if (CUR == 0) {
4517                 htmlAutoCloseOnEnd(ctxt);
4518                 break;
4519             }
4520
4521             /*
4522              * Last case, text. Note that References are handled directly.
4523              */
4524             else {
4525                 htmlParseCharData(ctxt);
4526             }
4527
4528             if (cons == ctxt->nbChars) {
4529                 if (ctxt->node != NULL) {
4530                     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4531                                  "detected an error in element content\n",
4532                                  NULL, NULL);
4533                 }
4534                 break;
4535             }
4536         }
4537         GROW;
4538     }
4539     if (currentNode != NULL) xmlFree(currentNode);
4540 }
4541
4542 /**
4543  * htmlParseContent:
4544  * @ctxt:  an HTML parser context
4545  *
4546  * Parse a content: comment, sub-element, reference or text.
4547  * This is the entry point when called from parser.c
4548  */
4549
4550 void
4551 __htmlParseContent(void *ctxt) {
4552     if (ctxt != NULL)
4553         htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4554 }
4555
4556 /**
4557  * htmlParseDocument:
4558  * @ctxt:  an HTML parser context
4559  *
4560  * parse an HTML document (and build a tree if using the standard SAX
4561  * interface).
4562  *
4563  * Returns 0, -1 in case of error. the parser context is augmented
4564  *                as a result of the parsing.
4565  */
4566
4567 int
4568 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4569     xmlChar start[4];
4570     xmlCharEncoding enc;
4571     xmlDtdPtr dtd;
4572
4573     xmlInitParser();
4574
4575     htmlDefaultSAXHandlerInit();
4576
4577     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4578         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4579                      "htmlParseDocument: context error\n", NULL, NULL);
4580         return(XML_ERR_INTERNAL_ERROR);
4581     }
4582     ctxt->html = 1;
4583     ctxt->linenumbers = 1;
4584     GROW;
4585     /*
4586      * SAX: beginning of the document processing.
4587      */
4588     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4589         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4590
4591     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4592         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4593         /*
4594          * Get the 4 first bytes and decode the charset
4595          * if enc != XML_CHAR_ENCODING_NONE
4596          * plug some encoding conversion routines.
4597          */
4598         start[0] = RAW;
4599         start[1] = NXT(1);
4600         start[2] = NXT(2);
4601         start[3] = NXT(3);
4602         enc = xmlDetectCharEncoding(&start[0], 4);
4603         if (enc != XML_CHAR_ENCODING_NONE) {
4604             xmlSwitchEncoding(ctxt, enc);
4605         }
4606     }
4607
4608     /*
4609      * Wipe out everything which is before the first '<'
4610      */
4611     SKIP_BLANKS;
4612     if (CUR == 0) {
4613         htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4614                      "Document is empty\n", NULL, NULL);
4615     }
4616
4617     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4618         ctxt->sax->startDocument(ctxt->userData);
4619
4620
4621     /*
4622      * Parse possible comments and PIs before any content
4623      */
4624     while (((CUR == '<') && (NXT(1) == '!') &&
4625             (NXT(2) == '-') && (NXT(3) == '-')) ||
4626            ((CUR == '<') && (NXT(1) == '?'))) {
4627         htmlParseComment(ctxt);
4628         htmlParsePI(ctxt);
4629         SKIP_BLANKS;
4630     }
4631
4632
4633     /*
4634      * Then possibly doc type declaration(s) and more Misc
4635      * (doctypedecl Misc*)?
4636      */
4637     if ((CUR == '<') && (NXT(1) == '!') &&
4638         (UPP(2) == 'D') && (UPP(3) == 'O') &&
4639         (UPP(4) == 'C') && (UPP(5) == 'T') &&
4640         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4641         (UPP(8) == 'E')) {
4642         htmlParseDocTypeDecl(ctxt);
4643     }
4644     SKIP_BLANKS;
4645
4646     /*
4647      * Parse possible comments and PIs before any content
4648      */
4649     while (((CUR == '<') && (NXT(1) == '!') &&
4650             (NXT(2) == '-') && (NXT(3) == '-')) ||
4651            ((CUR == '<') && (NXT(1) == '?'))) {
4652         htmlParseComment(ctxt);
4653         htmlParsePI(ctxt);
4654         SKIP_BLANKS;
4655     }
4656
4657     /*
4658      * Time to start parsing the tree itself
4659      */
4660     htmlParseContentInternal(ctxt);
4661
4662     /*
4663      * autoclose
4664      */
4665     if (CUR == 0)
4666         htmlAutoCloseOnEnd(ctxt);
4667
4668
4669     /*
4670      * SAX: end of the document processing.
4671      */
4672     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4673         ctxt->sax->endDocument(ctxt->userData);
4674
4675     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4676         dtd = xmlGetIntSubset(ctxt->myDoc);
4677         if (dtd == NULL)
4678             ctxt->myDoc->intSubset =
4679                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4680                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4681                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4682     }
4683     if (! ctxt->wellFormed) return(-1);
4684     return(0);
4685 }
4686
4687
4688 /************************************************************************
4689  *                                                                      *
4690  *                      Parser contexts handling                        *
4691  *                                                                      *
4692  ************************************************************************/
4693
4694 /**
4695  * htmlInitParserCtxt:
4696  * @ctxt:  an HTML parser context
4697  *
4698  * Initialize a parser context
4699  *
4700  * Returns 0 in case of success and -1 in case of error
4701  */
4702
4703 static int
4704 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4705 {
4706     htmlSAXHandler *sax;
4707
4708     if (ctxt == NULL) return(-1);
4709     memset(ctxt, 0, sizeof(htmlParserCtxt));
4710
4711     ctxt->dict = xmlDictCreate();
4712     if (ctxt->dict == NULL) {
4713         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4714         return(-1);
4715     }
4716     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4717     if (sax == NULL) {
4718         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4719         return(-1);
4720     }
4721     else
4722         memset(sax, 0, sizeof(htmlSAXHandler));
4723
4724     /* Allocate the Input stack */
4725     ctxt->inputTab = (htmlParserInputPtr *)
4726                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
4727     if (ctxt->inputTab == NULL) {
4728         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4729         ctxt->inputNr = 0;
4730         ctxt->inputMax = 0;
4731         ctxt->input = NULL;
4732         return(-1);
4733     }
4734     ctxt->inputNr = 0;
4735     ctxt->inputMax = 5;
4736     ctxt->input = NULL;
4737     ctxt->version = NULL;
4738     ctxt->encoding = NULL;
4739     ctxt->standalone = -1;
4740     ctxt->instate = XML_PARSER_START;
4741
4742     /* Allocate the Node stack */
4743     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4744     if (ctxt->nodeTab == NULL) {
4745         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4746         ctxt->nodeNr = 0;
4747         ctxt->nodeMax = 0;
4748         ctxt->node = NULL;
4749         ctxt->inputNr = 0;
4750         ctxt->inputMax = 0;
4751         ctxt->input = NULL;
4752         return(-1);
4753     }
4754     ctxt->nodeNr = 0;
4755     ctxt->nodeMax = 10;
4756     ctxt->node = NULL;
4757
4758     /* Allocate the Name stack */
4759     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4760     if (ctxt->nameTab == NULL) {
4761         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4762         ctxt->nameNr = 0;
4763         ctxt->nameMax = 0;
4764         ctxt->name = NULL;
4765         ctxt->nodeNr = 0;
4766         ctxt->nodeMax = 0;
4767         ctxt->node = NULL;
4768         ctxt->inputNr = 0;
4769         ctxt->inputMax = 0;
4770         ctxt->input = NULL;
4771         return(-1);
4772     }
4773     ctxt->nameNr = 0;
4774     ctxt->nameMax = 10;
4775     ctxt->name = NULL;
4776
4777     ctxt->nodeInfoTab = NULL;
4778     ctxt->nodeInfoNr  = 0;
4779     ctxt->nodeInfoMax = 0;
4780
4781     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4782     else {
4783         ctxt->sax = sax;
4784         memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4785     }
4786     ctxt->userData = ctxt;
4787     ctxt->myDoc = NULL;
4788     ctxt->wellFormed = 1;
4789     ctxt->replaceEntities = 0;
4790     ctxt->linenumbers = xmlLineNumbersDefaultValue;
4791     ctxt->html = 1;
4792     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4793     ctxt->vctxt.userData = ctxt;
4794     ctxt->vctxt.error = xmlParserValidityError;
4795     ctxt->vctxt.warning = xmlParserValidityWarning;
4796     ctxt->record_info = 0;
4797     ctxt->validate = 0;
4798     ctxt->nbChars = 0;
4799     ctxt->checkIndex = 0;
4800     ctxt->catalogs = NULL;
4801     xmlInitNodeInfoSeq(&ctxt->node_seq);
4802     return(0);
4803 }
4804
4805 /**
4806  * htmlFreeParserCtxt:
4807  * @ctxt:  an HTML parser context
4808  *
4809  * Free all the memory used by a parser context. However the parsed
4810  * document in ctxt->myDoc is not freed.
4811  */
4812
4813 void
4814 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4815 {
4816     xmlFreeParserCtxt(ctxt);
4817 }
4818
4819 /**
4820  * htmlNewParserCtxt:
4821  *
4822  * Allocate and initialize a new parser context.
4823  *
4824  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4825  */
4826
4827 htmlParserCtxtPtr
4828 htmlNewParserCtxt(void)
4829 {
4830     xmlParserCtxtPtr ctxt;
4831
4832     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4833     if (ctxt == NULL) {
4834         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4835         return(NULL);
4836     }
4837     memset(ctxt, 0, sizeof(xmlParserCtxt));
4838     if (htmlInitParserCtxt(ctxt) < 0) {
4839         htmlFreeParserCtxt(ctxt);
4840         return(NULL);
4841     }
4842     return(ctxt);
4843 }
4844
4845 /**
4846  * htmlCreateMemoryParserCtxt:
4847  * @buffer:  a pointer to a char array
4848  * @size:  the size of the array
4849  *
4850  * Create a parser context for an HTML in-memory document.
4851  *
4852  * Returns the new parser context or NULL
4853  */
4854 htmlParserCtxtPtr
4855 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4856     xmlParserCtxtPtr ctxt;
4857     xmlParserInputPtr input;
4858     xmlParserInputBufferPtr buf;
4859
4860     if (buffer == NULL)
4861         return(NULL);
4862     if (size <= 0)
4863         return(NULL);
4864
4865     ctxt = htmlNewParserCtxt();
4866     if (ctxt == NULL)
4867         return(NULL);
4868
4869     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4870     if (buf == NULL) return(NULL);
4871
4872     input = xmlNewInputStream(ctxt);
4873     if (input == NULL) {
4874         xmlFreeParserCtxt(ctxt);
4875         return(NULL);
4876     }
4877
4878     input->filename = NULL;
4879     input->buf = buf;
4880     input->base = input->buf->buffer->content;
4881     input->cur = input->buf->buffer->content;
4882     input->end = &input->buf->buffer->content[input->buf->buffer->use];
4883
4884     inputPush(ctxt, input);
4885     return(ctxt);
4886 }
4887
4888 /**
4889  * htmlCreateDocParserCtxt:
4890  * @cur:  a pointer to an array of xmlChar
4891  * @encoding:  a free form C string describing the HTML document encoding, or NULL
4892  *
4893  * Create a parser context for an HTML document.
4894  *
4895  * TODO: check the need to add encoding handling there
4896  *
4897  * Returns the new parser context or NULL
4898  */
4899 static htmlParserCtxtPtr
4900 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
4901     int len;
4902     htmlParserCtxtPtr ctxt;
4903
4904     if (cur == NULL)
4905         return(NULL);
4906     len = xmlStrlen(cur);
4907     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4908     if (ctxt == NULL)
4909         return(NULL);
4910
4911     if (encoding != NULL) {
4912         xmlCharEncoding enc;
4913         xmlCharEncodingHandlerPtr handler;
4914
4915         if (ctxt->input->encoding != NULL)
4916             xmlFree((xmlChar *) ctxt->input->encoding);
4917         ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4918
4919         enc = xmlParseCharEncoding(encoding);
4920         /*
4921          * registered set of known encodings
4922          */
4923         if (enc != XML_CHAR_ENCODING_ERROR) {
4924             xmlSwitchEncoding(ctxt, enc);
4925             if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4926                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4927                              "Unsupported encoding %s\n",
4928                              (const xmlChar *) encoding, NULL);
4929             }
4930         } else {
4931             /*
4932              * fallback for unknown encodings
4933              */
4934             handler = xmlFindCharEncodingHandler((const char *) encoding);
4935             if (handler != NULL) {
4936                 xmlSwitchToEncoding(ctxt, handler);
4937             } else {
4938                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4939                              "Unsupported encoding %s\n",
4940                              (const xmlChar *) encoding, NULL);
4941             }
4942         }
4943     }
4944     return(ctxt);
4945 }
4946
4947 #ifdef LIBXML_PUSH_ENABLED
4948 /************************************************************************
4949  *                                                                      *
4950  *      Progressive parsing interfaces                          *
4951  *                                                                      *
4952  ************************************************************************/
4953
4954 /**
4955  * htmlParseLookupSequence:
4956  * @ctxt:  an HTML parser context
4957  * @first:  the first char to lookup
4958  * @next:  the next char to lookup or zero
4959  * @third:  the next char to lookup or zero
4960  * @iscomment: when zero, <!-- ... --> spans are skipped during the scan
4961  * @ignoreattrval: when non-zero, text inside '...' or "..." is skipped
4962  *
4963  * Try to find if a sequence (first, next, third) or just (first next) or
4964  * (first) is available in the input stream.
4965  * As a side effect this may update ctxt->checkIndex so that a later call
4966  * does not rescan the same bytes; it DOES change the parser state, do not
4967  * use liberally. Basically similar to xmlParseLookupSequence().
4968  *
4969  * Returns the offset of the match from the current parsing point if the
4970  *      full sequence is available, -1 otherwise.
4971  */
4972 static int
4973 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4974                         xmlChar next, xmlChar third, int iscomment,
4975                         int ignoreattrval)
4976 {
4977     int base, len;
4978     htmlParserInputPtr in;
4979     const xmlChar *buf;
4980     int incomment = 0; /* set while the scan is inside <!-- ... --> */
4981     int invalue = 0; /* set while the scan is inside a quoted value */
4982     char valdellim = 0x0; /* quote char that opened the current value */
4983
4984     in = ctxt->input;
4985     if (in == NULL)
4986         return (-1);
4987
4988     base = in->cur - in->base; /* offset of the parsing point */
4989     if (base < 0)
4990         return (-1);
4991
4992     if (ctxt->checkIndex > base) /* resume after bytes already scanned */
4993         base = ctxt->checkIndex;
4994
4995     if (in->buf == NULL) {
4996         buf = in->base;
4997         len = in->length;
4998     } else {
4999         buf = in->buf->buffer->content; /* NOTE(review): indexed with in->base offsets - confirm both coincide */
5000         len = in->buf->buffer->use;
5001     }
5002
5003     /* take into account the sequence length */
5004     if (third)
5005         len -= 2; /* keeps buf[base + 2] below the original length */
5006     else if (next)
5007         len--; /* keeps buf[base + 1] below the original length */
5008     for (; base < len; base++) {
5009         if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5010             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5011                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5012                 incomment = 1; /* comment opener found */
5013                 /* do not increment past <! - some people use <!--> */
5014                 base += 2;
5015             }
5016         }
5017         if (ignoreattrval) {
5018             if (buf[base] == '"' || buf[base] == '\'') {
5019                 if (invalue) {
5020                     if (buf[base] == valdellim) { /* matching closing quote */
5021                         invalue = 0;
5022                         continue;
5023                     }
5024                     /* the other quote kind inside a value falls through */
5025                 } else {
5026                     valdellim = buf[base]; /* opening quote */
5027                     invalue = 1;
5028                     continue;
5029                 }
5030             } else if (invalue) {
5031                 continue; /* any other byte inside a quoted value is skipped */
5032             }
5033         }
5034         if (incomment) {
5035             if (base + 3 > len) /* no room left for "-->"; checkIndex untouched */
5036                 return (-1);
5037             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5038                 (buf[base + 2] == '>')) {
5039                 incomment = 0;
5040                 base += 2; /* with the loop's base++ this steps past "-->" */
5041             }
5042             continue;
5043         }
5044         if (buf[base] == first) {
5045             if (third != 0) {
5046                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5047                     continue; /* rest of the 3-char sequence differs */
5048             } else if (next != 0) {
5049                 if (buf[base + 1] != next)
5050                     continue; /* second char differs */
5051             }
5052             ctxt->checkIndex = 0; /* found: forget the saved scan position */
5053 #ifdef DEBUG_PUSH
5054             if (next == 0)
5055                 xmlGenericError(xmlGenericErrorContext,
5056                                 "HPP: lookup '%c' found at %d\n",
5057                                 first, base);
5058             else if (third == 0)
5059                 xmlGenericError(xmlGenericErrorContext,
5060                                 "HPP: lookup '%c%c' found at %d\n",
5061                                 first, next, base);
5062             else
5063                 xmlGenericError(xmlGenericErrorContext,
5064                                 "HPP: lookup '%c%c%c' found at %d\n",
5065                                 first, next, third, base);
5066 #endif
5067             return (base - (in->cur - in->base)); /* relative to in->cur */
5068         }
5069     }
5070     if ((!incomment) && (!invalue)) /* checkpoint only outside comment/value */
5071         ctxt->checkIndex = base;
5072 #ifdef DEBUG_PUSH
5073     if (next == 0)
5074         xmlGenericError(xmlGenericErrorContext,
5075                         "HPP: lookup '%c' failed\n", first);
5076     else if (third == 0)
5077         xmlGenericError(xmlGenericErrorContext,
5078                         "HPP: lookup '%c%c' failed\n", first, next);
5079     else
5080         xmlGenericError(xmlGenericErrorContext,
5081                         "HPP: lookup '%c%c%c' failed\n", first, next,
5082                         third);
5083 #endif
5084     return (-1);
5085 }
5085
5086 /**
5087  * htmlParseLookupChars:
5088  * @ctxt: an HTML parser context
5089  * @stop: array of chars, any of which stops the lookup
5090  * @stopLen: number of entries in the @stop array
5091  *
5092  * Try to find if any char of the @stop array is available in the input
5093  * stream; bytes inside <!-- ... --> are not considered.
5094  * As a side effect this may update ctxt->checkIndex so that a later call
5095  * does not rescan the same bytes; it DOES change the parser state, do not
5096  * use liberally.
5097  *
5098  * Returns the offset of the first stop char from the current parsing
5099  *      point if one is available, -1 otherwise.
5100  */
5101 static int
5102 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5103                      int stopLen)
5104 {
5105     int base, len;
5106     htmlParserInputPtr in;
5107     const xmlChar *buf;
5108     int incomment = 0; /* set while the scan is inside <!-- ... --> */
5109     int i;
5110
5111     in = ctxt->input;
5112     if (in == NULL)
5113         return (-1);
5114
5115     base = in->cur - in->base; /* offset of the parsing point */
5116     if (base < 0)
5117         return (-1);
5118
5119     if (ctxt->checkIndex > base) /* resume after bytes already scanned */
5120         base = ctxt->checkIndex;
5121
5122     if (in->buf == NULL) {
5123         buf = in->base;
5124         len = in->length;
5125     } else {
5126         buf = in->buf->buffer->content; /* NOTE(review): indexed with in->base offsets - confirm both coincide */
5127         len = in->buf->buffer->use;
5128     }
5129
5130     for (; base < len; base++) {
5131         if (!incomment && (base + 4 < len)) {
5132             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5133                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5134                 incomment = 1; /* comment opener found */
5135                 /* do not increment past <! - some people use <!--> */
5136                 base += 2;
5137             }
5138         }
5139         if (incomment) {
5140             if (base + 3 > len) /* no room left for "-->"; checkIndex untouched */
5141                 return (-1);
5142             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5143                 (buf[base + 2] == '>')) {
5144                 incomment = 0;
5145                 base += 2; /* with the loop's base++ this steps past "-->" */
5146             }
5147             continue;
5148         }
5149         for (i = 0; i < stopLen; ++i) {
5150             if (buf[base] == stop[i]) {
5151                 ctxt->checkIndex = 0; /* found: forget the saved scan position */
5152                 return (base - (in->cur - in->base)); /* relative to in->cur */
5153             }
5154         }
5155     }
5156     ctxt->checkIndex = base; /* no stop char yet: remember where the scan ended */
5157     return (-1);
5158 }
5159
5160 /**
5161  * htmlParseTryOrFinish:
5162  * @ctxt:  an HTML parser context
5163  * @terminate:  last chunk indicator
5164  *
5165  * Try to progress on parsing
5166  *
5167  * Returns zero if no parsing was possible
5168  */
5169 static int
5170 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5171     int ret = 0;
5172     htmlParserInputPtr in;
5173     int avail = 0;
5174     xmlChar cur, next;
5175
5176 #ifdef DEBUG_PUSH
5177     switch (ctxt->instate) {
5178         case XML_PARSER_EOF:
5179             xmlGenericError(xmlGenericErrorContext,
5180                     "HPP: try EOF\n"); break;
5181         case XML_PARSER_START:
5182             xmlGenericError(xmlGenericErrorContext,
5183                     "HPP: try START\n"); break;
5184         case XML_PARSER_MISC:
5185             xmlGenericError(xmlGenericErrorContext,
5186                     "HPP: try MISC\n");break;
5187         case XML_PARSER_COMMENT:
5188             xmlGenericError(xmlGenericErrorContext,
5189                     "HPP: try COMMENT\n");break;
5190         case XML_PARSER_PROLOG:
5191             xmlGenericError(xmlGenericErrorContext,
5192                     "HPP: try PROLOG\n");break;
5193         case XML_PARSER_START_TAG:
5194             xmlGenericError(xmlGenericErrorContext,
5195                     "HPP: try START_TAG\n");break;
5196         case XML_PARSER_CONTENT:
5197             xmlGenericError(xmlGenericErrorContext,
5198                     "HPP: try CONTENT\n");break;
5199         case XML_PARSER_CDATA_SECTION:
5200             xmlGenericError(xmlGenericErrorContext,
5201                     "HPP: try CDATA_SECTION\n");break;
5202         case XML_PARSER_END_TAG:
5203             xmlGenericError(xmlGenericErrorContext,
5204                     "HPP: try END_TAG\n");break;
5205         case XML_PARSER_ENTITY_DECL:
5206             xmlGenericError(xmlGenericErrorContext,
5207                     "HPP: try ENTITY_DECL\n");break;
5208         case XML_PARSER_ENTITY_VALUE:
5209             xmlGenericError(xmlGenericErrorContext,
5210                     "HPP: try ENTITY_VALUE\n");break;
5211         case XML_PARSER_ATTRIBUTE_VALUE:
5212             xmlGenericError(xmlGenericErrorContext,
5213                     "HPP: try ATTRIBUTE_VALUE\n");break;
5214         case XML_PARSER_DTD:
5215             xmlGenericError(xmlGenericErrorContext,
5216                     "HPP: try DTD\n");break;
5217         case XML_PARSER_EPILOG:
5218             xmlGenericError(xmlGenericErrorContext,
5219                     "HPP: try EPILOG\n");break;
5220         case XML_PARSER_PI:
5221             xmlGenericError(xmlGenericErrorContext,
5222                     "HPP: try PI\n");break;
5223         case XML_PARSER_SYSTEM_LITERAL:
5224             xmlGenericError(xmlGenericErrorContext,
5225                     "HPP: try SYSTEM_LITERAL\n");break;
5226     }
5227 #endif
5228
5229     while (1) {
5230
5231         in = ctxt->input;
5232         if (in == NULL) break;
5233         if (in->buf == NULL)
5234             avail = in->length - (in->cur - in->base);
5235         else
5236             avail = in->buf->buffer->use - (in->cur - in->base);
5237         if ((avail == 0) && (terminate)) {
5238             htmlAutoCloseOnEnd(ctxt);
5239             if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5240                 /*
5241                  * SAX: end of the document processing.
5242                  */
5243                 ctxt->instate = XML_PARSER_EOF;
5244                 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5245                     ctxt->sax->endDocument(ctxt->userData);
5246             }
5247         }
5248         if (avail < 1)
5249             goto done;
5250         cur = in->cur[0];
5251         if (cur == 0) {
5252             SKIP(1);
5253             continue;
5254         }
5255
5256         switch (ctxt->instate) {
5257             case XML_PARSER_EOF:
5258                 /*
5259                  * Document parsing is done !
5260                  */
5261                 goto done;
5262             case XML_PARSER_START:
5263                 /*
5264                  * Very first chars read from the document flow.
5265                  */
5266                 cur = in->cur[0];
5267                 if (IS_BLANK_CH(cur)) {
5268                     SKIP_BLANKS;
5269                     if (in->buf == NULL)
5270                         avail = in->length - (in->cur - in->base);
5271                     else
5272                         avail = in->buf->buffer->use - (in->cur - in->base);
5273                 }
5274                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5275                     ctxt->sax->setDocumentLocator(ctxt->userData,
5276                                                   &xmlDefaultSAXLocator);
5277                 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5278                     (!ctxt->disableSAX))
5279                     ctxt->sax->startDocument(ctxt->userData);
5280
5281                 cur = in->cur[0];
5282                 next = in->cur[1];
5283                 if ((cur == '<') && (next == '!') &&
5284                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5285                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5286                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5287                     (UPP(8) == 'E')) {
5288                     if ((!terminate) &&
5289                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5290                         goto done;
5291 #ifdef DEBUG_PUSH
5292                     xmlGenericError(xmlGenericErrorContext,
5293                             "HPP: Parsing internal subset\n");
5294 #endif
5295                     htmlParseDocTypeDecl(ctxt);
5296                     ctxt->instate = XML_PARSER_PROLOG;
5297 #ifdef DEBUG_PUSH
5298                     xmlGenericError(xmlGenericErrorContext,
5299                             "HPP: entering PROLOG\n");
5300 #endif
5301                 } else {
5302                     ctxt->instate = XML_PARSER_MISC;
5303 #ifdef DEBUG_PUSH
5304                     xmlGenericError(xmlGenericErrorContext,
5305                             "HPP: entering MISC\n");
5306 #endif
5307                 }
5308                 break;
5309             case XML_PARSER_MISC:
5310                 SKIP_BLANKS;
5311                 if (in->buf == NULL)
5312                     avail = in->length - (in->cur - in->base);
5313                 else
5314                     avail = in->buf->buffer->use - (in->cur - in->base);
5315                 if (avail < 2)
5316                     goto done;
5317                 cur = in->cur[0];
5318                 next = in->cur[1];
5319                 if ((cur == '<') && (next == '!') &&
5320                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5321                     if ((!terminate) &&
5322                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5323                         goto done;
5324 #ifdef DEBUG_PUSH
5325                     xmlGenericError(xmlGenericErrorContext,
5326                             "HPP: Parsing Comment\n");
5327 #endif
5328                     htmlParseComment(ctxt);
5329                     ctxt->instate = XML_PARSER_MISC;
5330                 } else if ((cur == '<') && (next == '?')) {
5331                     if ((!terminate) &&
5332                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5333                         goto done;
5334 #ifdef DEBUG_PUSH
5335                     xmlGenericError(xmlGenericErrorContext,
5336                             "HPP: Parsing PI\n");
5337 #endif
5338                     htmlParsePI(ctxt);
5339                     ctxt->instate = XML_PARSER_MISC;
5340                 } else if ((cur == '<') && (next == '!') &&
5341                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5342                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5343                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5344                     (UPP(8) == 'E')) {
5345                     if ((!terminate) &&
5346                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5347                         goto done;
5348 #ifdef DEBUG_PUSH
5349                     xmlGenericError(xmlGenericErrorContext,
5350                             "HPP: Parsing internal subset\n");
5351 #endif
5352                     htmlParseDocTypeDecl(ctxt);
5353                     ctxt->instate = XML_PARSER_PROLOG;
5354 #ifdef DEBUG_PUSH
5355                     xmlGenericError(xmlGenericErrorContext,
5356                             "HPP: entering PROLOG\n");
5357 #endif
5358                 } else if ((cur == '<') && (next == '!') &&
5359                            (avail < 9)) {
5360                     goto done;
5361                 } else {
5362                     ctxt->instate = XML_PARSER_START_TAG;
5363 #ifdef DEBUG_PUSH
5364                     xmlGenericError(xmlGenericErrorContext,
5365                             "HPP: entering START_TAG\n");
5366 #endif
5367                 }
5368                 break;
5369             case XML_PARSER_PROLOG:
5370                 SKIP_BLANKS;
5371                 if (in->buf == NULL)
5372                     avail = in->length - (in->cur - in->base);
5373                 else
5374                     avail = in->buf->buffer->use - (in->cur - in->base);
5375                 if (avail < 2)
5376                     goto done;
5377                 cur = in->cur[0];
5378                 next = in->cur[1];
5379                 if ((cur == '<') && (next == '!') &&
5380                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5381                     if ((!terminate) &&
5382                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5383                         goto done;
5384 #ifdef DEBUG_PUSH
5385                     xmlGenericError(xmlGenericErrorContext,
5386                             "HPP: Parsing Comment\n");
5387 #endif
5388                     htmlParseComment(ctxt);
5389                     ctxt->instate = XML_PARSER_PROLOG;
5390                 } else if ((cur == '<') && (next == '?')) {
5391                     if ((!terminate) &&
5392                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5393                         goto done;
5394 #ifdef DEBUG_PUSH
5395                     xmlGenericError(xmlGenericErrorContext,
5396                             "HPP: Parsing PI\n");
5397 #endif
5398                     htmlParsePI(ctxt);
5399                     ctxt->instate = XML_PARSER_PROLOG;
5400                 } else if ((cur == '<') && (next == '!') &&
5401                            (avail < 4)) {
5402                     goto done;
5403                 } else {
5404                     ctxt->instate = XML_PARSER_START_TAG;
5405 #ifdef DEBUG_PUSH
5406                     xmlGenericError(xmlGenericErrorContext,
5407                             "HPP: entering START_TAG\n");
5408 #endif
5409                 }
5410                 break;
5411             case XML_PARSER_EPILOG:
5412                 if (in->buf == NULL)
5413                     avail = in->length - (in->cur - in->base);
5414                 else
5415                     avail = in->buf->buffer->use - (in->cur - in->base);
5416                 if (avail < 1)
5417                     goto done;
5418                 cur = in->cur[0];
5419                 if (IS_BLANK_CH(cur)) {
5420                     htmlParseCharData(ctxt);
5421                     goto done;
5422                 }
5423                 if (avail < 2)
5424                     goto done;
5425                 next = in->cur[1];
5426                 if ((cur == '<') && (next == '!') &&
5427                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5428                     if ((!terminate) &&
5429                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5430                         goto done;
5431 #ifdef DEBUG_PUSH
5432                     xmlGenericError(xmlGenericErrorContext,
5433                             "HPP: Parsing Comment\n");
5434 #endif
5435                     htmlParseComment(ctxt);
5436                     ctxt->instate = XML_PARSER_EPILOG;
5437                 } else if ((cur == '<') && (next == '?')) {
5438                     if ((!terminate) &&
5439                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5440                         goto done;
5441 #ifdef DEBUG_PUSH
5442                     xmlGenericError(xmlGenericErrorContext,
5443                             "HPP: Parsing PI\n");
5444 #endif
5445                     htmlParsePI(ctxt);
5446                     ctxt->instate = XML_PARSER_EPILOG;
5447                 } else if ((cur == '<') && (next == '!') &&
5448                            (avail < 4)) {
5449                     goto done;
5450                 } else {
5451                     ctxt->errNo = XML_ERR_DOCUMENT_END;
5452                     ctxt->wellFormed = 0;
5453                     ctxt->instate = XML_PARSER_EOF;
5454 #ifdef DEBUG_PUSH
5455                     xmlGenericError(xmlGenericErrorContext,
5456                             "HPP: entering EOF\n");
5457 #endif
5458                     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5459                         ctxt->sax->endDocument(ctxt->userData);
5460                     goto done;
5461                 }
5462                 break;
5463             case XML_PARSER_START_TAG: {
5464                 const xmlChar *name;
5465                 int failed;
5466                 const htmlElemDesc * info;
5467
5468                 if (avail < 2)
5469                     goto done;
5470                 cur = in->cur[0];
5471                 if (cur != '<') {
5472                     ctxt->instate = XML_PARSER_CONTENT;
5473 #ifdef DEBUG_PUSH
5474                     xmlGenericError(xmlGenericErrorContext,
5475                             "HPP: entering CONTENT\n");
5476 #endif
5477                     break;
5478                 }
5479                 if (in->cur[1] == '/') {
5480                     ctxt->instate = XML_PARSER_END_TAG;
5481                     ctxt->checkIndex = 0;
5482 #ifdef DEBUG_PUSH
5483                     xmlGenericError(xmlGenericErrorContext,
5484                             "HPP: entering END_TAG\n");
5485 #endif
5486                     break;
5487                 }
5488                 if ((!terminate) &&
5489                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5490                     goto done;
5491
5492                 failed = htmlParseStartTag(ctxt);
5493                 name = ctxt->name;
5494                 if ((failed == -1) ||
5495                     (name == NULL)) {
5496                     if (CUR == '>')
5497                         NEXT;
5498                     break;
5499                 }
5500
5501                 /*
5502                  * Lookup the info for that element.
5503                  */
5504                 info = htmlTagLookup(name);
5505                 if (info == NULL) {
5506                     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5507                                  "Tag %s invalid\n", name, NULL);
5508                 }
5509
5510                 /*
5511                  * Check for an Empty Element labeled the XML/SGML way
5512                  */
5513                 if ((CUR == '/') && (NXT(1) == '>')) {
5514                     SKIP(2);
5515                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5516                         ctxt->sax->endElement(ctxt->userData, name);
5517                     htmlnamePop(ctxt);
5518                     ctxt->instate = XML_PARSER_CONTENT;
5519 #ifdef DEBUG_PUSH
5520                     xmlGenericError(xmlGenericErrorContext,
5521                             "HPP: entering CONTENT\n");
5522 #endif
5523                     break;
5524                 }
5525
5526                 if (CUR == '>') {
5527                     NEXT;
5528                 } else {
5529                     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5530                                  "Couldn't find end of Start Tag %s\n",
5531                                  name, NULL);
5532
5533                     /*
5534                      * end of parsing of this node.
5535                      */
5536                     if (xmlStrEqual(name, ctxt->name)) {
5537                         nodePop(ctxt);
5538                         htmlnamePop(ctxt);
5539                     }
5540
5541                     ctxt->instate = XML_PARSER_CONTENT;
5542 #ifdef DEBUG_PUSH
5543                     xmlGenericError(xmlGenericErrorContext,
5544                             "HPP: entering CONTENT\n");
5545 #endif
5546                     break;
5547                 }
5548
5549                 /*
5550                  * Check for an Empty Element from DTD definition
5551                  */
5552                 if ((info != NULL) && (info->empty)) {
5553                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5554                         ctxt->sax->endElement(ctxt->userData, name);
5555                     htmlnamePop(ctxt);
5556                 }
5557                 ctxt->instate = XML_PARSER_CONTENT;
5558 #ifdef DEBUG_PUSH
5559                 xmlGenericError(xmlGenericErrorContext,
5560                         "HPP: entering CONTENT\n");
5561 #endif
5562                 break;
5563             }
5564             case XML_PARSER_CONTENT: {
5565                 long cons;
5566                 /*
5567                  * Handle preparsed entities and charRef
5568                  */
5569                 if (ctxt->token != 0) {
5570                     xmlChar chr[2] = { 0 , 0 } ;
5571
5572                     chr[0] = (xmlChar) ctxt->token;
5573                     htmlCheckParagraph(ctxt);
5574                     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5575                         ctxt->sax->characters(ctxt->userData, chr, 1);
5576                     ctxt->token = 0;
5577                     ctxt->checkIndex = 0;
5578                 }
5579                 if ((avail == 1) && (terminate)) {
5580                     cur = in->cur[0];
5581                     if ((cur != '<') && (cur != '&')) {
5582                         if (ctxt->sax != NULL) {
5583                             if (IS_BLANK_CH(cur)) {
5584                                 if (ctxt->sax->ignorableWhitespace != NULL)
5585                                     ctxt->sax->ignorableWhitespace(
5586                                             ctxt->userData, &cur, 1);
5587                             } else {
5588                                 htmlCheckParagraph(ctxt);
5589                                 if (ctxt->sax->characters != NULL)
5590                                     ctxt->sax->characters(
5591                                             ctxt->userData, &cur, 1);
5592                             }
5593                         }
5594                         ctxt->token = 0;
5595                         ctxt->checkIndex = 0;
5596                         in->cur++;
5597                         break;
5598                     }
5599                 }
5600                 if (avail < 2)
5601                     goto done;
5602                 cur = in->cur[0];
5603                 next = in->cur[1];
5604                 cons = ctxt->nbChars;
5605                 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5606                     (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5607                     /*
5608                      * Handle SCRIPT/STYLE separately
5609                      */
5610                     if (!terminate) {
5611                         int idx;
5612                         xmlChar val;
5613
5614                         idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5615                         if (idx < 0)
5616                             goto done;
5617                         val = in->cur[idx + 2];
5618                         if (val == 0) /* bad cut of input */
5619                             goto done;
5620                     }
5621                     htmlParseScript(ctxt);
5622                     if ((cur == '<') && (next == '/')) {
5623                         ctxt->instate = XML_PARSER_END_TAG;
5624                         ctxt->checkIndex = 0;
5625 #ifdef DEBUG_PUSH
5626                         xmlGenericError(xmlGenericErrorContext,
5627                                 "HPP: entering END_TAG\n");
5628 #endif
5629                         break;
5630                     }
5631                 } else {
5632                     /*
5633                      * Sometimes DOCTYPE arrives in the middle of the document
5634                      */
5635                     if ((cur == '<') && (next == '!') &&
5636                         (UPP(2) == 'D') && (UPP(3) == 'O') &&
5637                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
5638                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5639                         (UPP(8) == 'E')) {
5640                         if ((!terminate) &&
5641                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5642                             goto done;
5643                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5644                                      "Misplaced DOCTYPE declaration\n",
5645                                      BAD_CAST "DOCTYPE" , NULL);
5646                         htmlParseDocTypeDecl(ctxt);
5647                     } else if ((cur == '<') && (next == '!') &&
5648                         (in->cur[2] == '-') && (in->cur[3] == '-')) {
5649                         if ((!terminate) &&
5650                             (htmlParseLookupSequence(
5651                                 ctxt, '-', '-', '>', 1, 1) < 0))
5652                             goto done;
5653 #ifdef DEBUG_PUSH
5654                         xmlGenericError(xmlGenericErrorContext,
5655                                 "HPP: Parsing Comment\n");
5656 #endif
5657                         htmlParseComment(ctxt);
5658                         ctxt->instate = XML_PARSER_CONTENT;
5659                     } else if ((cur == '<') && (next == '?')) {
5660                         if ((!terminate) &&
5661                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5662                             goto done;
5663 #ifdef DEBUG_PUSH
5664                         xmlGenericError(xmlGenericErrorContext,
5665                                 "HPP: Parsing PI\n");
5666 #endif
5667                         htmlParsePI(ctxt);
5668                         ctxt->instate = XML_PARSER_CONTENT;
5669                     } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5670                         goto done;
5671                     } else if ((cur == '<') && (next == '/')) {
5672                         ctxt->instate = XML_PARSER_END_TAG;
5673                         ctxt->checkIndex = 0;
5674 #ifdef DEBUG_PUSH
5675                         xmlGenericError(xmlGenericErrorContext,
5676                                 "HPP: entering END_TAG\n");
5677 #endif
5678                         break;
5679                     } else if (cur == '<') {
5680                         ctxt->instate = XML_PARSER_START_TAG;
5681                         ctxt->checkIndex = 0;
5682 #ifdef DEBUG_PUSH
5683                         xmlGenericError(xmlGenericErrorContext,
5684                                 "HPP: entering START_TAG\n");
5685 #endif
5686                         break;
5687                     } else if (cur == '&') {
5688                         if ((!terminate) &&
5689                             (htmlParseLookupChars(ctxt,
5690                                                   BAD_CAST "; >/", 4) < 0))
5691                             goto done;
5692 #ifdef DEBUG_PUSH
5693                         xmlGenericError(xmlGenericErrorContext,
5694                                 "HPP: Parsing Reference\n");
5695 #endif
5696                         /* TODO: check generation of subtrees if noent !!! */
5697                         htmlParseReference(ctxt);
5698                     } else {
5699                         /*
5700                          * check that the text sequence is complete
5701                          * before handing out the data to the parser
5702                          * to avoid problems with erroneous end of
5703                          * data detection.
5704                          */
5705                         if ((!terminate) &&
5706                             (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5707                             goto done;
5708                         ctxt->checkIndex = 0;
5709 #ifdef DEBUG_PUSH
5710                         xmlGenericError(xmlGenericErrorContext,
5711                                 "HPP: Parsing char data\n");
5712 #endif
5713                         htmlParseCharData(ctxt);
5714                     }
5715                 }
5716                 if (cons == ctxt->nbChars) {
5717                     if (ctxt->node != NULL) {
5718                         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5719                                      "detected an error in element content\n",
5720                                      NULL, NULL);
5721                     }
5722                     NEXT;
5723                     break;
5724                 }
5725
5726                 break;
5727             }
5728             case XML_PARSER_END_TAG:
5729                 if (avail < 2)
5730                     goto done;
5731                 if ((!terminate) &&
5732                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5733                     goto done;
5734                 htmlParseEndTag(ctxt);
5735                 if (ctxt->nameNr == 0) {
5736                     ctxt->instate = XML_PARSER_EPILOG;
5737                 } else {
5738                     ctxt->instate = XML_PARSER_CONTENT;
5739                 }
5740                 ctxt->checkIndex = 0;
5741 #ifdef DEBUG_PUSH
5742                 xmlGenericError(xmlGenericErrorContext,
5743                         "HPP: entering CONTENT\n");
5744 #endif
5745                 break;
5746             case XML_PARSER_CDATA_SECTION:
5747                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5748                         "HPP: internal error, state == CDATA\n",
5749                              NULL, NULL);
5750                 ctxt->instate = XML_PARSER_CONTENT;
5751                 ctxt->checkIndex = 0;
5752 #ifdef DEBUG_PUSH
5753                 xmlGenericError(xmlGenericErrorContext,
5754                         "HPP: entering CONTENT\n");
5755 #endif
5756                 break;
5757             case XML_PARSER_DTD:
5758                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5759                         "HPP: internal error, state == DTD\n",
5760                              NULL, NULL);
5761                 ctxt->instate = XML_PARSER_CONTENT;
5762                 ctxt->checkIndex = 0;
5763 #ifdef DEBUG_PUSH
5764                 xmlGenericError(xmlGenericErrorContext,
5765                         "HPP: entering CONTENT\n");
5766 #endif
5767                 break;
5768             case XML_PARSER_COMMENT:
5769                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5770                         "HPP: internal error, state == COMMENT\n",
5771                              NULL, NULL);
5772                 ctxt->instate = XML_PARSER_CONTENT;
5773                 ctxt->checkIndex = 0;
5774 #ifdef DEBUG_PUSH
5775                 xmlGenericError(xmlGenericErrorContext,
5776                         "HPP: entering CONTENT\n");
5777 #endif
5778                 break;
5779             case XML_PARSER_PI:
5780                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5781                         "HPP: internal error, state == PI\n",
5782                              NULL, NULL);
5783                 ctxt->instate = XML_PARSER_CONTENT;
5784                 ctxt->checkIndex = 0;
5785 #ifdef DEBUG_PUSH
5786                 xmlGenericError(xmlGenericErrorContext,
5787                         "HPP: entering CONTENT\n");
5788 #endif
5789                 break;
5790             case XML_PARSER_ENTITY_DECL:
5791                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5792                         "HPP: internal error, state == ENTITY_DECL\n",
5793                              NULL, NULL);
5794                 ctxt->instate = XML_PARSER_CONTENT;
5795                 ctxt->checkIndex = 0;
5796 #ifdef DEBUG_PUSH
5797                 xmlGenericError(xmlGenericErrorContext,
5798                         "HPP: entering CONTENT\n");
5799 #endif
5800                 break;
5801             case XML_PARSER_ENTITY_VALUE:
5802                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5803                         "HPP: internal error, state == ENTITY_VALUE\n",
5804                              NULL, NULL);
5805                 ctxt->instate = XML_PARSER_CONTENT;
5806                 ctxt->checkIndex = 0;
5807 #ifdef DEBUG_PUSH
5808                 xmlGenericError(xmlGenericErrorContext,
5809                         "HPP: entering DTD\n");
5810 #endif
5811                 break;
5812             case XML_PARSER_ATTRIBUTE_VALUE:
5813                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5814                         "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5815                              NULL, NULL);
5816                 ctxt->instate = XML_PARSER_START_TAG;
5817                 ctxt->checkIndex = 0;
5818 #ifdef DEBUG_PUSH
5819                 xmlGenericError(xmlGenericErrorContext,
5820                         "HPP: entering START_TAG\n");
5821 #endif
5822                 break;
5823             case XML_PARSER_SYSTEM_LITERAL:
5824                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5825                     "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5826                              NULL, NULL);
5827                 ctxt->instate = XML_PARSER_CONTENT;
5828                 ctxt->checkIndex = 0;
5829 #ifdef DEBUG_PUSH
5830                 xmlGenericError(xmlGenericErrorContext,
5831                         "HPP: entering CONTENT\n");
5832 #endif
5833                 break;
5834             case XML_PARSER_IGNORE:
5835                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5836                         "HPP: internal error, state == XML_PARSER_IGNORE\n",
5837                              NULL, NULL);
5838                 ctxt->instate = XML_PARSER_CONTENT;
5839                 ctxt->checkIndex = 0;
5840 #ifdef DEBUG_PUSH
5841                 xmlGenericError(xmlGenericErrorContext,
5842                         "HPP: entering CONTENT\n");
5843 #endif
5844                 break;
5845             case XML_PARSER_PUBLIC_LITERAL:
5846                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5847                         "HPP: internal error, state == XML_PARSER_LITERAL\n",
5848                              NULL, NULL);
5849                 ctxt->instate = XML_PARSER_CONTENT;
5850                 ctxt->checkIndex = 0;
5851 #ifdef DEBUG_PUSH
5852                 xmlGenericError(xmlGenericErrorContext,
5853                         "HPP: entering CONTENT\n");
5854 #endif
5855                 break;
5856
5857         }
5858     }
5859 done:
5860     if ((avail == 0) && (terminate)) {
5861         htmlAutoCloseOnEnd(ctxt);
5862         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5863             /*
5864              * SAX: end of the document processing.
5865              */
5866             ctxt->instate = XML_PARSER_EOF;
5867             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5868                 ctxt->sax->endDocument(ctxt->userData);
5869         }
5870     }
5871     if ((ctxt->myDoc != NULL) &&
5872         ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5873          (ctxt->instate == XML_PARSER_EPILOG))) {
5874         xmlDtdPtr dtd;
5875         dtd = xmlGetIntSubset(ctxt->myDoc);
5876         if (dtd == NULL)
5877             ctxt->myDoc->intSubset =
5878                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5879                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5880                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5881     }
5882 #ifdef DEBUG_PUSH
5883     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5884 #endif
5885     return(ret);
5886 }
5887
5888 /**
5889  * htmlParseChunk:
5890  * @ctxt:  an HTML parser context
5891  * @chunk:  an char array
5892  * @size:  the size in byte of the chunk
5893  * @terminate:  last chunk indicator
5894  *
5895  * Parse a Chunk of memory
5896  *
5897  * Returns zero if no error, the xmlParserErrors otherwise.
5898  */
5899 int
5900 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5901               int terminate) {
5902     if ((ctxt == NULL) || (ctxt->input == NULL)) {
5903         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5904                      "htmlParseChunk: context error\n", NULL, NULL);
5905         return(XML_ERR_INTERNAL_ERROR);
5906     }
5907     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5908         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5909         int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5910         int cur = ctxt->input->cur - ctxt->input->base;
5911         int res;
5912
5913         res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5914         if (res < 0) {
5915             ctxt->errNo = XML_PARSER_EOF;
5916             ctxt->disableSAX = 1;
5917             return (XML_PARSER_EOF);
5918         }
5919         ctxt->input->base = ctxt->input->buf->buffer->content + base;
5920         ctxt->input->cur = ctxt->input->base + cur;
5921         ctxt->input->end =
5922           &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5923 #ifdef DEBUG_PUSH
5924         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5925 #endif
5926
5927 #if 0
5928         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5929             htmlParseTryOrFinish(ctxt, terminate);
5930 #endif
5931     } else if (ctxt->instate != XML_PARSER_EOF) {
5932         if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5933             xmlParserInputBufferPtr in = ctxt->input->buf;
5934             if ((in->encoder != NULL) && (in->buffer != NULL) &&
5935                     (in->raw != NULL)) {
5936                 int nbchars;
5937
5938                 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5939                 if (nbchars < 0) {
5940                     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5941                                  "encoder error\n", NULL, NULL);
5942                     return(XML_ERR_INVALID_ENCODING);
5943                 }
5944             }
5945         }
5946     }
5947     htmlParseTryOrFinish(ctxt, terminate);
5948     if (terminate) {
5949         if ((ctxt->instate != XML_PARSER_EOF) &&
5950             (ctxt->instate != XML_PARSER_EPILOG) &&
5951             (ctxt->instate != XML_PARSER_MISC)) {
5952             ctxt->errNo = XML_ERR_DOCUMENT_END;
5953             ctxt->wellFormed = 0;
5954         }
5955         if (ctxt->instate != XML_PARSER_EOF) {
5956             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5957                 ctxt->sax->endDocument(ctxt->userData);
5958         }
5959         ctxt->instate = XML_PARSER_EOF;
5960     }
5961     return((xmlParserErrors) ctxt->errNo);
5962 }
5963
5964 /************************************************************************
5965  *                                                                      *
5966  *                      User entry points                               *
5967  *                                                                      *
5968  ************************************************************************/
5969
5970 /**
5971  * htmlCreatePushParserCtxt:
5972  * @sax:  a SAX handler
5973  * @user_data:  The user data returned on SAX callbacks
5974  * @chunk:  a pointer to an array of chars
5975  * @size:  number of chars in the array
5976  * @filename:  an optional file name or URI
5977  * @enc:  an optional encoding
5978  *
5979  * Create a parser context for using the HTML parser in push mode
5980  * The value of @filename is used for fetching external entities
5981  * and error/warning reports.
5982  *
5983  * Returns the new parser context or NULL
5984  */
5985 htmlParserCtxtPtr
5986 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5987                          const char *chunk, int size, const char *filename,
5988                          xmlCharEncoding enc) {
5989     htmlParserCtxtPtr ctxt;
5990     htmlParserInputPtr inputStream;
5991     xmlParserInputBufferPtr buf;
5992
5993     xmlInitParser();
5994
5995     buf = xmlAllocParserInputBuffer(enc);
5996     if (buf == NULL) return(NULL);
5997
5998     ctxt = htmlNewParserCtxt();
5999     if (ctxt == NULL) {
6000         xmlFreeParserInputBuffer(buf);
6001         return(NULL);
6002     }
6003     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6004         ctxt->charset=XML_CHAR_ENCODING_UTF8;
6005     if (sax != NULL) {
6006         if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6007             xmlFree(ctxt->sax);
6008         ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6009         if (ctxt->sax == NULL) {
6010             xmlFree(buf);
6011             xmlFree(ctxt);
6012             return(NULL);
6013         }
6014         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6015         if (user_data != NULL)
6016             ctxt->userData = user_data;
6017     }
6018     if (filename == NULL) {
6019         ctxt->directory = NULL;
6020     } else {
6021         ctxt->directory = xmlParserGetDirectory(filename);
6022     }
6023
6024     inputStream = htmlNewInputStream(ctxt);
6025     if (inputStream == NULL) {
6026         xmlFreeParserCtxt(ctxt);
6027         xmlFree(buf);
6028         return(NULL);
6029     }
6030
6031     if (filename == NULL)
6032         inputStream->filename = NULL;
6033     else
6034         inputStream->filename = (char *)
6035             xmlCanonicPath((const xmlChar *) filename);
6036     inputStream->buf = buf;
6037     inputStream->base = inputStream->buf->buffer->content;
6038     inputStream->cur = inputStream->buf->buffer->content;
6039     inputStream->end =
6040         &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
6041
6042     inputPush(ctxt, inputStream);
6043
6044     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6045         (ctxt->input->buf != NULL))  {
6046         int base = ctxt->input->base - ctxt->input->buf->buffer->content;
6047         int cur = ctxt->input->cur - ctxt->input->base;
6048
6049         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6050
6051         ctxt->input->base = ctxt->input->buf->buffer->content + base;
6052         ctxt->input->cur = ctxt->input->base + cur;
6053         ctxt->input->end =
6054             &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
6055 #ifdef DEBUG_PUSH
6056         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6057 #endif
6058     }
6059     ctxt->progressive = 1;
6060
6061     return(ctxt);
6062 }
6063 #endif /* LIBXML_PUSH_ENABLED */
6064
6065 /**
6066  * htmlSAXParseDoc:
6067  * @cur:  a pointer to an array of xmlChar
6068  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6069  * @sax:  the SAX handler block
6070  * @userData: if using SAX, this pointer will be provided on callbacks.
6071  *
6072  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6073  * to handle parse events. If sax is NULL, fallback to the default DOM
6074  * behavior and return a tree.
6075  *
6076  * Returns the resulting document tree unless SAX is NULL or the document is
6077  *     not well formed.
6078  */
6079
6080 htmlDocPtr
6081 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6082     htmlDocPtr ret;
6083     htmlParserCtxtPtr ctxt;
6084
6085     xmlInitParser();
6086
6087     if (cur == NULL) return(NULL);
6088
6089
6090     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6091     if (ctxt == NULL) return(NULL);
6092     if (sax != NULL) {
6093         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6094         ctxt->sax = sax;
6095         ctxt->userData = userData;
6096     }
6097
6098     htmlParseDocument(ctxt);
6099     ret = ctxt->myDoc;
6100     if (sax != NULL) {
6101         ctxt->sax = NULL;
6102         ctxt->userData = NULL;
6103     }
6104     htmlFreeParserCtxt(ctxt);
6105
6106     return(ret);
6107 }
6108
6109 /**
6110  * htmlParseDoc:
6111  * @cur:  a pointer to an array of xmlChar
6112  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6113  *
6114  * parse an HTML in-memory document and build a tree.
6115  *
6116  * Returns the resulting document tree
6117  */
6118
6119 htmlDocPtr
6120 htmlParseDoc(xmlChar *cur, const char *encoding) {
6121     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6122 }
6123
6124
6125 /**
6126  * htmlCreateFileParserCtxt:
6127  * @filename:  the filename
6128  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6129  *
6130  * Create a parser context for a file content.
6131  * Automatic support for ZLIB/Compress compressed document is provided
6132  * by default if found at compile-time.
6133  *
6134  * Returns the new parser context or NULL
6135  */
6136 htmlParserCtxtPtr
6137 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6138 {
6139     htmlParserCtxtPtr ctxt;
6140     htmlParserInputPtr inputStream;
6141     char *canonicFilename;
6142     /* htmlCharEncoding enc; */
6143     xmlChar *content, *content_line = (xmlChar *) "charset=";
6144
6145     if (filename == NULL)
6146         return(NULL);
6147
6148     ctxt = htmlNewParserCtxt();
6149     if (ctxt == NULL) {
6150         return(NULL);
6151     }
6152     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6153     if (canonicFilename == NULL) {
6154 #ifdef LIBXML_SAX1_ENABLED
6155         if (xmlDefaultSAXHandler.error != NULL) {
6156             xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6157         }
6158 #endif
6159         xmlFreeParserCtxt(ctxt);
6160         return(NULL);
6161     }
6162
6163     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6164     xmlFree(canonicFilename);
6165     if (inputStream == NULL) {
6166         xmlFreeParserCtxt(ctxt);
6167         return(NULL);
6168     }
6169
6170     inputPush(ctxt, inputStream);
6171
6172     /* set encoding */
6173     if (encoding) {
6174         content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
6175         if (content) {
6176             strcpy ((char *)content, (char *)content_line);
6177             strcat ((char *)content, (char *)encoding);
6178             htmlCheckEncoding (ctxt, content);
6179             xmlFree (content);
6180         }
6181     }
6182
6183     return(ctxt);
6184 }
6185
6186 /**
6187  * htmlSAXParseFile:
6188  * @filename:  the filename
6189  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6190  * @sax:  the SAX handler block
6191  * @userData: if using SAX, this pointer will be provided on callbacks.
6192  *
6193  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6194  * compressed document is provided by default if found at compile-time.
6195  * It use the given SAX function block to handle the parsing callback.
6196  * If sax is NULL, fallback to the default DOM tree building routines.
6197  *
6198  * Returns the resulting document tree unless SAX is NULL or the document is
6199  *     not well formed.
6200  */
6201
6202 htmlDocPtr
6203 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6204                  void *userData) {
6205     htmlDocPtr ret;
6206     htmlParserCtxtPtr ctxt;
6207     htmlSAXHandlerPtr oldsax = NULL;
6208
6209     xmlInitParser();
6210
6211     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6212     if (ctxt == NULL) return(NULL);
6213     if (sax != NULL) {
6214         oldsax = ctxt->sax;
6215         ctxt->sax = sax;
6216         ctxt->userData = userData;
6217     }
6218
6219     htmlParseDocument(ctxt);
6220
6221     ret = ctxt->myDoc;
6222     if (sax != NULL) {
6223         ctxt->sax = oldsax;
6224         ctxt->userData = NULL;
6225     }
6226     htmlFreeParserCtxt(ctxt);
6227
6228     return(ret);
6229 }
6230
6231 /**
6232  * htmlParseFile:
6233  * @filename:  the filename
6234  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6235  *
6236  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6237  * compressed document is provided by default if found at compile-time.
6238  *
6239  * Returns the resulting document tree
6240  */
6241
6242 htmlDocPtr
6243 htmlParseFile(const char *filename, const char *encoding) {
6244     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6245 }
6246
6247 /**
6248  * htmlHandleOmittedElem:
6249  * @val:  int 0 or 1
6250  *
6251  * Set and return the previous value for handling HTML omitted tags.
6252  *
6253  * Returns the last value for 0 for no handling, 1 for auto insertion.
6254  */
6255
6256 int
6257 htmlHandleOmittedElem(int val) {
6258     int old = htmlOmittedDefaultValue;
6259
6260     htmlOmittedDefaultValue = val;
6261     return(old);
6262 }
6263
6264 /**
6265  * htmlElementAllowedHere:
6266  * @parent: HTML parent element
6267  * @elt: HTML element
6268  *
6269  * Checks whether an HTML element may be a direct child of a parent element.
6270  * Note - doesn't check for deprecated elements
6271  *
6272  * Returns 1 if allowed; 0 otherwise.
6273  */
6274 int
6275 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6276   const char** p ;
6277
6278   if ( ! elt || ! parent || ! parent->subelts )
6279         return 0 ;
6280
6281   for ( p = parent->subelts; *p; ++p )
6282     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6283       return 1 ;
6284
6285   return 0 ;
6286 }
6287 /**
6288  * htmlElementStatusHere:
6289  * @parent: HTML parent element
6290  * @elt: HTML element
6291  *
6292  * Checks whether an HTML element may be a direct child of a parent element.
6293  * and if so whether it is valid or deprecated.
6294  *
6295  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6296  */
6297 htmlStatus
6298 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6299   if ( ! parent || ! elt )
6300     return HTML_INVALID ;
6301   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6302     return HTML_INVALID ;
6303
6304   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6305 }
6306 /**
6307  * htmlAttrAllowed:
6308  * @elt: HTML element
6309  * @attr: HTML attribute
6310  * @legacy: whether to allow deprecated attributes
6311  *
6312  * Checks whether an attribute is valid for an element
6313  * Has full knowledge of Required and Deprecated attributes
6314  *
6315  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6316  */
6317 htmlStatus
6318 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6319   const char** p ;
6320
6321   if ( !elt || ! attr )
6322         return HTML_INVALID ;
6323
6324   if ( elt->attrs_req )
6325     for ( p = elt->attrs_req; *p; ++p)
6326       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6327         return HTML_REQUIRED ;
6328
6329   if ( elt->attrs_opt )
6330     for ( p = elt->attrs_opt; *p; ++p)
6331       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6332         return HTML_VALID ;
6333
6334   if ( legacy && elt->attrs_depr )
6335     for ( p = elt->attrs_depr; *p; ++p)
6336       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6337         return HTML_DEPRECATED ;
6338
6339   return HTML_INVALID ;
6340 }
6341 /**
6342  * htmlNodeStatus:
6343  * @node: an htmlNodePtr in a tree
6344  * @legacy: whether to allow deprecated elements (YES is faster here
6345  *      for Element nodes)
6346  *
6347  * Checks whether the tree node is valid.  Experimental (the author
6348  *     only uses the HTML enhancements in a SAX parser)
6349  *
6350  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6351  *      legacy allowed) or htmlElementStatusHere (otherwise).
6352  *      for Attribute nodes, a return from htmlAttrAllowed
6353  *      for other nodes, HTML_NA (no checks performed)
6354  */
6355 htmlStatus
6356 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6357   if ( ! node )
6358     return HTML_INVALID ;
6359
6360   switch ( node->type ) {
6361     case XML_ELEMENT_NODE:
6362       return legacy
6363         ? ( htmlElementAllowedHere (
6364                 htmlTagLookup(node->parent->name) , node->name
6365                 ) ? HTML_VALID : HTML_INVALID )
6366         : htmlElementStatusHere(
6367                 htmlTagLookup(node->parent->name) ,
6368                 htmlTagLookup(node->name) )
6369         ;
6370     case XML_ATTRIBUTE_NODE:
6371       return htmlAttrAllowed(
6372         htmlTagLookup(node->parent->name) , node->name, legacy) ;
6373     default: return HTML_NA ;
6374   }
6375 }
6376 /************************************************************************
6377  *                                                                      *
6378  *      New set (2.6.0) of simpler and more flexible APIs               *
6379  *                                                                      *
6380  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used, and @str is evaluated more than once.
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * a single statement; it must be followed by a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6392
6393 /**
6394  * htmlCtxtReset:
6395  * @ctxt: an HTML parser context
6396  *
6397  * Reset a parser context
6398  */
6399 void
6400 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6401 {
6402     xmlParserInputPtr input;
6403     xmlDictPtr dict;
6404
6405     if (ctxt == NULL)
6406         return;
6407
6408     xmlInitParser();
6409     dict = ctxt->dict;
6410
6411     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6412         xmlFreeInputStream(input);
6413     }
6414     ctxt->inputNr = 0;
6415     ctxt->input = NULL;
6416
6417     ctxt->spaceNr = 0;
6418     if (ctxt->spaceTab != NULL) {
6419         ctxt->spaceTab[0] = -1;
6420         ctxt->space = &ctxt->spaceTab[0];
6421     } else {
6422         ctxt->space = NULL;
6423     }
6424
6425
6426     ctxt->nodeNr = 0;
6427     ctxt->node = NULL;
6428
6429     ctxt->nameNr = 0;
6430     ctxt->name = NULL;
6431
6432     DICT_FREE(ctxt->version);
6433     ctxt->version = NULL;
6434     DICT_FREE(ctxt->encoding);
6435     ctxt->encoding = NULL;
6436     DICT_FREE(ctxt->directory);
6437     ctxt->directory = NULL;
6438     DICT_FREE(ctxt->extSubURI);
6439     ctxt->extSubURI = NULL;
6440     DICT_FREE(ctxt->extSubSystem);
6441     ctxt->extSubSystem = NULL;
6442     if (ctxt->myDoc != NULL)
6443         xmlFreeDoc(ctxt->myDoc);
6444     ctxt->myDoc = NULL;
6445
6446     ctxt->standalone = -1;
6447     ctxt->hasExternalSubset = 0;
6448     ctxt->hasPErefs = 0;
6449     ctxt->html = 1;
6450     ctxt->external = 0;
6451     ctxt->instate = XML_PARSER_START;
6452     ctxt->token = 0;
6453
6454     ctxt->wellFormed = 1;
6455     ctxt->nsWellFormed = 1;
6456     ctxt->disableSAX = 0;
6457     ctxt->valid = 1;
6458     ctxt->vctxt.userData = ctxt;
6459     ctxt->vctxt.error = xmlParserValidityError;
6460     ctxt->vctxt.warning = xmlParserValidityWarning;
6461     ctxt->record_info = 0;
6462     ctxt->nbChars = 0;
6463     ctxt->checkIndex = 0;
6464     ctxt->inSubset = 0;
6465     ctxt->errNo = XML_ERR_OK;
6466     ctxt->depth = 0;
6467     ctxt->charset = XML_CHAR_ENCODING_NONE;
6468     ctxt->catalogs = NULL;
6469     xmlInitNodeInfoSeq(&ctxt->node_seq);
6470
6471     if (ctxt->attsDefault != NULL) {
6472         xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6473         ctxt->attsDefault = NULL;
6474     }
6475     if (ctxt->attsSpecial != NULL) {
6476         xmlHashFree(ctxt->attsSpecial, NULL);
6477         ctxt->attsSpecial = NULL;
6478     }
6479 }
6480
6481 /**
6482  * htmlCtxtUseOptions:
6483  * @ctxt: an HTML parser context
6484  * @options:  a combination of htmlParserOption(s)
6485  *
6486  * Applies the options to the parser context
6487  *
6488  * Returns 0 in case of success, the set of unknown or unimplemented options
6489  *         in case of error.
6490  */
6491 int
6492 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6493 {
6494     if (ctxt == NULL)
6495         return(-1);
6496
6497     if (options & HTML_PARSE_NOWARNING) {
6498         ctxt->sax->warning = NULL;
6499         ctxt->vctxt.warning = NULL;
6500         options -= XML_PARSE_NOWARNING;
6501         ctxt->options |= XML_PARSE_NOWARNING;
6502     }
6503     if (options & HTML_PARSE_NOERROR) {
6504         ctxt->sax->error = NULL;
6505         ctxt->vctxt.error = NULL;
6506         ctxt->sax->fatalError = NULL;
6507         options -= XML_PARSE_NOERROR;
6508         ctxt->options |= XML_PARSE_NOERROR;
6509     }
6510     if (options & HTML_PARSE_PEDANTIC) {
6511         ctxt->pedantic = 1;
6512         options -= XML_PARSE_PEDANTIC;
6513         ctxt->options |= XML_PARSE_PEDANTIC;
6514     } else
6515         ctxt->pedantic = 0;
6516     if (options & XML_PARSE_NOBLANKS) {
6517         ctxt->keepBlanks = 0;
6518         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6519         options -= XML_PARSE_NOBLANKS;
6520         ctxt->options |= XML_PARSE_NOBLANKS;
6521     } else
6522         ctxt->keepBlanks = 1;
6523     if (options & HTML_PARSE_RECOVER) {
6524         ctxt->recovery = 1;
6525         options -= HTML_PARSE_RECOVER;
6526     } else
6527         ctxt->recovery = 0;
6528     if (options & HTML_PARSE_COMPACT) {
6529         ctxt->options |= HTML_PARSE_COMPACT;
6530         options -= HTML_PARSE_COMPACT;
6531     }
6532     if (options & XML_PARSE_HUGE) {
6533         ctxt->options |= XML_PARSE_HUGE;
6534         options -= XML_PARSE_HUGE;
6535     }
6536     if (options & HTML_PARSE_NODEFDTD) {
6537         ctxt->options |= HTML_PARSE_NODEFDTD;
6538         options -= HTML_PARSE_NODEFDTD;
6539     }
6540     ctxt->dictNames = 0;
6541     return (options);
6542 }
6543
6544 /**
6545  * htmlDoRead:
6546  * @ctxt:  an HTML parser context
6547  * @URL:  the base URL to use for the document
6548  * @encoding:  the document encoding, or NULL
6549  * @options:  a combination of htmlParserOption(s)
6550  * @reuse:  keep the context for reuse
6551  *
6552  * Common front-end for the htmlRead functions
6553  *
6554  * Returns the resulting document tree or NULL
6555  */
6556 static htmlDocPtr
6557 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6558           int options, int reuse)
6559 {
6560     htmlDocPtr ret;
6561
6562     htmlCtxtUseOptions(ctxt, options);
6563     ctxt->html = 1;
6564     if (encoding != NULL) {
6565         xmlCharEncodingHandlerPtr hdlr;
6566
6567         hdlr = xmlFindCharEncodingHandler(encoding);
6568         if (hdlr != NULL) {
6569             xmlSwitchToEncoding(ctxt, hdlr);
6570             if (ctxt->input->encoding != NULL)
6571               xmlFree((xmlChar *) ctxt->input->encoding);
6572             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6573         }
6574     }
6575     if ((URL != NULL) && (ctxt->input != NULL) &&
6576         (ctxt->input->filename == NULL))
6577         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6578     htmlParseDocument(ctxt);
6579     ret = ctxt->myDoc;
6580     ctxt->myDoc = NULL;
6581     if (!reuse) {
6582         if ((ctxt->dictNames) &&
6583             (ret != NULL) &&
6584             (ret->dict == ctxt->dict))
6585             ctxt->dict = NULL;
6586         xmlFreeParserCtxt(ctxt);
6587     }
6588     return (ret);
6589 }
6590
6591 /**
6592  * htmlReadDoc:
6593  * @cur:  a pointer to a zero terminated string
6594  * @URL:  the base URL to use for the document
6595  * @encoding:  the document encoding, or NULL
6596  * @options:  a combination of htmlParserOption(s)
6597  *
6598  * parse an XML in-memory document and build a tree.
6599  *
6600  * Returns the resulting document tree
6601  */
6602 htmlDocPtr
6603 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6604 {
6605     htmlParserCtxtPtr ctxt;
6606
6607     if (cur == NULL)
6608         return (NULL);
6609
6610     xmlInitParser();
6611     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6612     if (ctxt == NULL)
6613         return (NULL);
6614     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6615 }
6616
6617 /**
6618  * htmlReadFile:
6619  * @filename:  a file or URL
6620  * @encoding:  the document encoding, or NULL
6621  * @options:  a combination of htmlParserOption(s)
6622  *
6623  * parse an XML file from the filesystem or the network.
6624  *
6625  * Returns the resulting document tree
6626  */
6627 htmlDocPtr
6628 htmlReadFile(const char *filename, const char *encoding, int options)
6629 {
6630     htmlParserCtxtPtr ctxt;
6631
6632     xmlInitParser();
6633     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6634     if (ctxt == NULL)
6635         return (NULL);
6636     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6637 }
6638
6639 /**
6640  * htmlReadMemory:
6641  * @buffer:  a pointer to a char array
6642  * @size:  the size of the array
6643  * @URL:  the base URL to use for the document
6644  * @encoding:  the document encoding, or NULL
6645  * @options:  a combination of htmlParserOption(s)
6646  *
6647  * parse an XML in-memory document and build a tree.
6648  *
6649  * Returns the resulting document tree
6650  */
6651 htmlDocPtr
6652 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6653 {
6654     htmlParserCtxtPtr ctxt;
6655
6656     xmlInitParser();
6657     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6658     if (ctxt == NULL)
6659         return (NULL);
6660     htmlDefaultSAXHandlerInit();
6661     if (ctxt->sax != NULL)
6662         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6663     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6664 }
6665
6666 /**
6667  * htmlReadFd:
6668  * @fd:  an open file descriptor
6669  * @URL:  the base URL to use for the document
6670  * @encoding:  the document encoding, or NULL
6671  * @options:  a combination of htmlParserOption(s)
6672  *
6673  * parse an XML from a file descriptor and build a tree.
6674  *
6675  * Returns the resulting document tree
6676  */
6677 htmlDocPtr
6678 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6679 {
6680     htmlParserCtxtPtr ctxt;
6681     xmlParserInputBufferPtr input;
6682     xmlParserInputPtr stream;
6683
6684     if (fd < 0)
6685         return (NULL);
6686
6687     xmlInitParser();
6688     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6689     if (input == NULL)
6690         return (NULL);
6691     ctxt = xmlNewParserCtxt();
6692     if (ctxt == NULL) {
6693         xmlFreeParserInputBuffer(input);
6694         return (NULL);
6695     }
6696     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6697     if (stream == NULL) {
6698         xmlFreeParserInputBuffer(input);
6699         xmlFreeParserCtxt(ctxt);
6700         return (NULL);
6701     }
6702     inputPush(ctxt, stream);
6703     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6704 }
6705
6706 /**
6707  * htmlReadIO:
6708  * @ioread:  an I/O read function
6709  * @ioclose:  an I/O close function
6710  * @ioctx:  an I/O handler
6711  * @URL:  the base URL to use for the document
6712  * @encoding:  the document encoding, or NULL
6713  * @options:  a combination of htmlParserOption(s)
6714  *
6715  * parse an HTML document from I/O functions and source and build a tree.
6716  *
6717  * Returns the resulting document tree
6718  */
6719 htmlDocPtr
6720 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6721           void *ioctx, const char *URL, const char *encoding, int options)
6722 {
6723     htmlParserCtxtPtr ctxt;
6724     xmlParserInputBufferPtr input;
6725     xmlParserInputPtr stream;
6726
6727     if (ioread == NULL)
6728         return (NULL);
6729     xmlInitParser();
6730
6731     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6732                                          XML_CHAR_ENCODING_NONE);
6733     if (input == NULL)
6734         return (NULL);
6735     ctxt = htmlNewParserCtxt();
6736     if (ctxt == NULL) {
6737         xmlFreeParserInputBuffer(input);
6738         return (NULL);
6739     }
6740     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6741     if (stream == NULL) {
6742         xmlFreeParserInputBuffer(input);
6743         xmlFreeParserCtxt(ctxt);
6744         return (NULL);
6745     }
6746     inputPush(ctxt, stream);
6747     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6748 }
6749
6750 /**
6751  * htmlCtxtReadDoc:
6752  * @ctxt:  an HTML parser context
6753  * @cur:  a pointer to a zero terminated string
6754  * @URL:  the base URL to use for the document
6755  * @encoding:  the document encoding, or NULL
6756  * @options:  a combination of htmlParserOption(s)
6757  *
6758  * parse an XML in-memory document and build a tree.
6759  * This reuses the existing @ctxt parser context
6760  *
6761  * Returns the resulting document tree
6762  */
6763 htmlDocPtr
6764 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6765                const char *URL, const char *encoding, int options)
6766 {
6767     xmlParserInputPtr stream;
6768
6769     if (cur == NULL)
6770         return (NULL);
6771     if (ctxt == NULL)
6772         return (NULL);
6773
6774     htmlCtxtReset(ctxt);
6775
6776     stream = xmlNewStringInputStream(ctxt, cur);
6777     if (stream == NULL) {
6778         return (NULL);
6779     }
6780     inputPush(ctxt, stream);
6781     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6782 }
6783
6784 /**
6785  * htmlCtxtReadFile:
6786  * @ctxt:  an HTML parser context
6787  * @filename:  a file or URL
6788  * @encoding:  the document encoding, or NULL
6789  * @options:  a combination of htmlParserOption(s)
6790  *
6791  * parse an XML file from the filesystem or the network.
6792  * This reuses the existing @ctxt parser context
6793  *
6794  * Returns the resulting document tree
6795  */
6796 htmlDocPtr
6797 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6798                 const char *encoding, int options)
6799 {
6800     xmlParserInputPtr stream;
6801
6802     if (filename == NULL)
6803         return (NULL);
6804     if (ctxt == NULL)
6805         return (NULL);
6806
6807     htmlCtxtReset(ctxt);
6808
6809     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6810     if (stream == NULL) {
6811         return (NULL);
6812     }
6813     inputPush(ctxt, stream);
6814     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6815 }
6816
6817 /**
6818  * htmlCtxtReadMemory:
6819  * @ctxt:  an HTML parser context
6820  * @buffer:  a pointer to a char array
6821  * @size:  the size of the array
6822  * @URL:  the base URL to use for the document
6823  * @encoding:  the document encoding, or NULL
6824  * @options:  a combination of htmlParserOption(s)
6825  *
6826  * parse an XML in-memory document and build a tree.
6827  * This reuses the existing @ctxt parser context
6828  *
6829  * Returns the resulting document tree
6830  */
6831 htmlDocPtr
6832 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6833                   const char *URL, const char *encoding, int options)
6834 {
6835     xmlParserInputBufferPtr input;
6836     xmlParserInputPtr stream;
6837
6838     if (ctxt == NULL)
6839         return (NULL);
6840     if (buffer == NULL)
6841         return (NULL);
6842
6843     htmlCtxtReset(ctxt);
6844
6845     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6846     if (input == NULL) {
6847         return(NULL);
6848     }
6849
6850     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6851     if (stream == NULL) {
6852         xmlFreeParserInputBuffer(input);
6853         return(NULL);
6854     }
6855
6856     inputPush(ctxt, stream);
6857     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6858 }
6859
6860 /**
6861  * htmlCtxtReadFd:
6862  * @ctxt:  an HTML parser context
6863  * @fd:  an open file descriptor
6864  * @URL:  the base URL to use for the document
6865  * @encoding:  the document encoding, or NULL
6866  * @options:  a combination of htmlParserOption(s)
6867  *
6868  * parse an XML from a file descriptor and build a tree.
6869  * This reuses the existing @ctxt parser context
6870  *
6871  * Returns the resulting document tree
6872  */
6873 htmlDocPtr
6874 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6875               const char *URL, const char *encoding, int options)
6876 {
6877     xmlParserInputBufferPtr input;
6878     xmlParserInputPtr stream;
6879
6880     if (fd < 0)
6881         return (NULL);
6882     if (ctxt == NULL)
6883         return (NULL);
6884
6885     htmlCtxtReset(ctxt);
6886
6887
6888     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6889     if (input == NULL)
6890         return (NULL);
6891     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6892     if (stream == NULL) {
6893         xmlFreeParserInputBuffer(input);
6894         return (NULL);
6895     }
6896     inputPush(ctxt, stream);
6897     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6898 }
6899
6900 /**
6901  * htmlCtxtReadIO:
6902  * @ctxt:  an HTML parser context
6903  * @ioread:  an I/O read function
6904  * @ioclose:  an I/O close function
6905  * @ioctx:  an I/O handler
6906  * @URL:  the base URL to use for the document
6907  * @encoding:  the document encoding, or NULL
6908  * @options:  a combination of htmlParserOption(s)
6909  *
6910  * parse an HTML document from I/O functions and source and build a tree.
6911  * This reuses the existing @ctxt parser context
6912  *
6913  * Returns the resulting document tree
6914  */
6915 htmlDocPtr
6916 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6917               xmlInputCloseCallback ioclose, void *ioctx,
6918               const char *URL,
6919               const char *encoding, int options)
6920 {
6921     xmlParserInputBufferPtr input;
6922     xmlParserInputPtr stream;
6923
6924     if (ioread == NULL)
6925         return (NULL);
6926     if (ctxt == NULL)
6927         return (NULL);
6928
6929     htmlCtxtReset(ctxt);
6930
6931     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6932                                          XML_CHAR_ENCODING_NONE);
6933     if (input == NULL)
6934         return (NULL);
6935     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6936     if (stream == NULL) {
6937         xmlFreeParserInputBuffer(input);
6938         return (NULL);
6939     }
6940     inputPush(ctxt, stream);
6941     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6942 }
6943
6944 #define bottom_HTMLparser
6945 #include "elfgcchack.h"
6946 #endif /* LIBXML_HTML_ENABLED */