HTMLparser.c

   1 /*
   2  * HTMLparser.c : an HTML 4.0 non-verifying parser
   3  *
   4  * See Copyright for the status of this software.
   5  *
   6  * daniel@veillard.com
   7  */
   8
   9 #define IN_LIBXML
  10 #include "libxml.h"
  11 #ifdef LIBXML_HTML_ENABLED
  12
  13 #include <string.h>
  14 #ifdef HAVE_CTYPE_H
  15 #include <ctype.h>
  16 #endif
  17 #ifdef HAVE_STDLIB_H
  18 #include <stdlib.h>
  19 #endif
  20 #ifdef HAVE_SYS_STAT_H
  21 #include <sys/stat.h>
  22 #endif
  23 #ifdef HAVE_FCNTL_H
  24 #include <fcntl.h>
  25 #endif
  26 #ifdef HAVE_UNISTD_H
  27 #include <unistd.h>
  28 #endif
  29 #ifdef HAVE_ZLIB_H
  30 #include <zlib.h>
  31 #endif
  32
  33 #include <libxml/xmlmemory.h>
  34 #include <libxml/tree.h>
  35 #include <libxml/parser.h>
  36 #include <libxml/parserInternals.h>
  37 #include <libxml/xmlerror.h>
  38 #include <libxml/HTMLparser.h>
  39 #include <libxml/HTMLtree.h>
  40 #include <libxml/entities.h>
  41 #include <libxml/encoding.h>
  42 #include <libxml/valid.h>
  43 #include <libxml/xmlIO.h>
  44 #include <libxml/globals.h>
  45 #include <libxml/uri.h>
  46
  47 #include "buf.h"
  48 #include "enc.h"
  49
  50 #define HTML_MAX_NAMELEN 1000 /* NOTE(review): presumably an upper bound on parsed name length; its users are outside this excerpt */
  51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000 /* NOTE(review): presumably a working buffer size; confirm at the use sites */
  52 #define HTML_PARSER_BUFFER_SIZE 100 /* NOTE(review): presumably a smaller working buffer size; confirm at the use sites */
  53
  54 /* #define DEBUG */
  55 /* #define DEBUG_PUSH */
  56
     /*
      * File-local flag, initialised to 1.  NOTE(review): it is neither read
      * nor written in this excerpt; presumably it controls the automatic
      * insertion of omitted elements - confirm against its users.
      */
  57 static int htmlOmittedDefaultValue = 1;
  58
     /*
      * Forward declarations.  The definitions are not part of this excerpt;
      * note that htmlDecodeEntities is declared without `static`.
      */
  59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
  60                              xmlChar end, xmlChar  end2, xmlChar end3);
  61 static void htmlParseComment(htmlParserCtxtPtr ctxt);
  62
  63 /************************************************************************
  64  *                                                                      *
  65  *              Some factorized error routines                          *
  66  *                                                                      *
  67  ************************************************************************/
  68
  69 /**
  70  * htmlErrMemory:
  71  * @ctxt:  an HTML parser context
  72  * @extra:  extra informations
  73  *
  74  * Handle a redefinition of attribute error
  75  */
  76 static void
  77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  78 {
  79     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  80         (ctxt->instate == XML_PARSER_EOF))
  81         return;
  82     if (ctxt != NULL) {
  83         ctxt->errNo = XML_ERR_NO_MEMORY;
  84         ctxt->instate = XML_PARSER_EOF;
  85         ctxt->disableSAX = 1;
  86     }
  87     if (extra)
  88         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  89                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  90                         NULL, NULL, 0, 0,
  91                         "Memory allocation failed : %s\n", extra);
  92     else
  93         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  94                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  95                         NULL, NULL, 0, 0, "Memory allocation failed\n");
  96 }
  97
  98 /**
  99  * htmlParseErr:
 100  * @ctxt:  an HTML parser context
 101  * @error:  the error number
 102  * @msg:  the error message
 103  * @str1:  string infor
 104  * @str2:  string infor
 105  *
 106  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 107  */
 108 static void LIBXML_ATTR_FORMAT(3,0)
 109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 110              const char *msg, const xmlChar *str1, const xmlChar *str2)
 111 {
 112     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 113         (ctxt->instate == XML_PARSER_EOF))
 114         return;
 115     if (ctxt != NULL)
 116         ctxt->errNo = error;
 117     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 118                     XML_ERR_ERROR, NULL, 0,
 119                     (const char *) str1, (const char *) str2,
 120                     NULL, 0, 0,
 121                     msg, str1, str2);
 122     if (ctxt != NULL)
 123         ctxt->wellFormed = 0;
 124 }
 125
 126 /**
 127  * htmlParseErrInt:
 128  * @ctxt:  an HTML parser context
 129  * @error:  the error number
 130  * @msg:  the error message
 131  * @val:  integer info
 132  *
 133  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 134  */
 135 static void LIBXML_ATTR_FORMAT(3,0)
 136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 137              const char *msg, int val)
 138 {
 139     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 140         (ctxt->instate == XML_PARSER_EOF))
 141         return;
 142     if (ctxt != NULL)
 143         ctxt->errNo = error;
 144     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 145                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
 146                     NULL, val, 0, msg, val);
 147     if (ctxt != NULL)
 148         ctxt->wellFormed = 0;
 149 }
 150
 151 /************************************************************************
 152  *                                                                      *
 153  *      Parser stacks related functions and macros              *
 154  *                                                                      *
 155  ************************************************************************/
 156
 157 /**
 158  * htmlnamePush:
 159  * @ctxt:  an HTML parser context
 160  * @value:  the element name
 161  *
 162  * Pushes a new element name on top of the name stack
 163  *
 164  * Returns 0 in case of error, the index in the stack otherwise
 165  */
 166 static int
 167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
 168 {
 169     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
 170         ctxt->html = 3;
 171     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
 172         ctxt->html = 10;
 173     if (ctxt->nameNr >= ctxt->nameMax) {
 174         ctxt->nameMax *= 2;
 175         ctxt->nameTab = (const xmlChar * *)
 176                          xmlRealloc((xmlChar * *)ctxt->nameTab,
 177                                     ctxt->nameMax *
 178                                     sizeof(ctxt->nameTab[0]));
 179         if (ctxt->nameTab == NULL) {
 180             htmlErrMemory(ctxt, NULL);
 181             return (0);
 182         }
 183     }
 184     ctxt->nameTab[ctxt->nameNr] = value;
 185     ctxt->name = value;
 186     return (ctxt->nameNr++);
 187 }
 188 /**
 189  * htmlnamePop:
 190  * @ctxt: an HTML parser context
 191  *
 192  * Pops the top element name from the name stack
 193  *
 194  * Returns the name just removed
 195  */
 196 static const xmlChar *
 197 htmlnamePop(htmlParserCtxtPtr ctxt)
 198 {
 199     const xmlChar *ret;
 200
 201     if (ctxt->nameNr <= 0)
 202         return (NULL);
 203     ctxt->nameNr--;
 204     if (ctxt->nameNr < 0)
 205         return (NULL);
 206     if (ctxt->nameNr > 0)
 207         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
 208     else
 209         ctxt->name = NULL;
 210     ret = ctxt->nameTab[ctxt->nameNr];
 211     ctxt->nameTab[ctxt->nameNr] = NULL;
 212     return (ret);
 213 }
 214
 215 /**
 216  * htmlNodeInfoPush:
 217  * @ctxt:  an HTML parser context
 218  * @value:  the node info
 219  *
 220  * Pushes a new element name on top of the node info stack
 221  *
 222  * Returns 0 in case of error, the index in the stack otherwise
 223  */
 224 static int
 225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
 226 {
 227     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
 228         if (ctxt->nodeInfoMax == 0)
 229                 ctxt->nodeInfoMax = 5;
 230         ctxt->nodeInfoMax *= 2;
 231         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
 232                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
 233                                     ctxt->nodeInfoMax *
 234                                     sizeof(ctxt->nodeInfoTab[0]));
 235         if (ctxt->nodeInfoTab == NULL) {
 236             htmlErrMemory(ctxt, NULL);
 237             return (0);
 238         }
 239     }
 240     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
 241     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 242     return (ctxt->nodeInfoNr++);
 243 }
 244
 245 /**
 246  * htmlNodeInfoPop:
 247  * @ctxt:  an HTML parser context
 248  *
 249  * Pops the top element name from the node info stack
 250  *
 251  * Returns 0 in case of error, the pointer to NodeInfo otherwise
 252  */
 253 static htmlParserNodeInfo *
 254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
 255 {
 256     if (ctxt->nodeInfoNr <= 0)
 257         return (NULL);
 258     ctxt->nodeInfoNr--;
 259     if (ctxt->nodeInfoNr < 0)
 260         return (NULL);
 261     if (ctxt->nodeInfoNr > 0)
 262         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
 263     else
 264         ctxt->nodeInfo = NULL;
 265     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 266 }
 267
 268 /*
 269  * Macros for accessing the content. Those should be used only by the parser,
 270  * and not exported.
 271  *
 272  * Dirty macros, i.e. one needs to make assumptions on the context to use
 273  * them. They all work on the raw bytes at ctxt->input->cur:
 274  *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 275  *   BASE_PTR the base pointer of the current input (ctxt->input->base).
 276  *   CUR      the byte at the current position, converted to int. It should
 277  *            only be compared to ASCII values, otherwise it would break
 278  *            when running with UTF-8 encoding.
 279  *   CURRENT  same expansion as CUR (raw byte, no UTF-8 decoding).
 280  *   NXT(n)   the n'th next xmlChar. Same as CUR, it should be used only
 281  *            to compare on ASCII based substrings.
 282  *   UPPER    the current byte passed through toupper().
 283  *   UPP(n)   the n'th next xmlChar passed through toupper(). Same as CUR,
 284  *            it should be used only to compare on ASCII based substrings.
 285  *   SKIP(n)  advance cur, col and nbChars by n. The line counter is not
 286  *            touched, so it must only skip ASCII strings without newlines.
 287  *   RAW      -1 while ctxt->token is non-zero, else the current byte.
 288  *
 289  * Character level macros:
 290  *   CUR_CHAR(l) the current character as returned by htmlCurrentChar(),
 291  *            which stores the character's length in l.
 292  *   NEXT     advance by calling xmlNextChar(ctxt).
 293  *   NEXTL(l) skip the current character of l xmlChars, updating line/col.
 294  *   COPY_BUF(l,b,i,v) store character v (l xmlChars long) at b[i], advance i.
 295  */
 296
 297 #define UPPER (toupper(*ctxt->input->cur))
 298
     /* NOTE(review): expands to an unparenthesized comma expression. */
 299 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
 300
 301 #define NXT(val) ctxt->input->cur[(val)]
 302
 303 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
 304
 305 #define CUR_PTR ctxt->input->cur
 306 #define BASE_PTR ctxt->input->base
 307
     /*
      * SHRINK: once more than 2 * INPUT_CHUNK bytes lie between base and cur
      * and fewer than 2 * INPUT_CHUNK remain between cur and end, call
      * xmlParserInputShrink().
      * NOTE(review): expands to a bare `if`; an `else` written after the use
      * site would attach to it.
      */
 308 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
 309                    (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
 310         xmlParserInputShrink(ctxt->input)
 311
     /*
      * GROW: when ctxt->progressive is 0 and fewer than INPUT_CHUNK bytes
      * remain between cur and end, call xmlParserInputGrow().
      * NOTE(review): expands to a bare `if`, same caveat as SHRINK.
      */
 312 #define GROW if ((ctxt->progressive == 0) &&                            \
 313                  (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))   \
 314         xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
 315
 316 #define CURRENT ((int) (*ctxt->input->cur))
 317
 318 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
 319
 320 /* Imported from XML */
 321
 322 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
 323 #define CUR ((int) (*ctxt->input->cur))
 324 #define NEXT xmlNextChar(ctxt)
 325
 326 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
 327
 328
     /*
      * NEXTL(l): if the current byte is a newline, increment line and reset
      * col to 1, otherwise increment col; then clear ctxt->token, move cur
      * forward by l and count one more character in nbChars.
      */
 329 #define NEXTL(l) do {                                                   \
 330     if (*(ctxt->input->cur) == '\n') {                                  \
 331         ctxt->input->line++; ctxt->input->col = 1;                      \
 332     } else ctxt->input->col++;                                          \
 333     ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;            \
 334   } while (0)
 335
 336 /************
 337     \
 338     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
 339     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 340  ************/
 341
 342 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
 343 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
 344
     /*
      * COPY_BUF(l,b,i,v): a character of length 1 is stored directly at
      * b[i++]; longer ones go through xmlCopyChar(), whose return value is
      * added to i.
      * NOTE(review): expands to a bare if/else rather than a
      * do { } while (0) block.
      */
 345 #define COPY_BUF(l,b,i,v)                                               \
 346     if (l == 1) b[i++] = (xmlChar) v;                                   \
 347     else i += xmlCopyChar(l,&b[i],v)
 348
 349 /**
 350  * htmlFindEncoding:
 351  * @the HTML parser context
 352  *
 353  * Ty to find and encoding in the current data available in the input
 354  * buffer this is needed to try to switch to the proper encoding when
 355  * one face a character error.
 356  * That's an heuristic, since it's operating outside of parsing it could
 357  * try to use a meta which had been commented out, that's the reason it
 358  * should only be used in case of error, not as a default.
 359  *
 360  * Returns an encoding string or NULL if not found, the string need to
 361  *   be freed
 362  */
 363 static xmlChar *
 364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
 365     const xmlChar *start, *cur, *end;
 366
 367     if ((ctxt == NULL) || (ctxt->input == NULL) ||
 368         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
 369         (ctxt->input->buf->encoder != NULL))
 370         return(NULL);
 371     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
 372         return(NULL);
 373
 374     start = ctxt->input->cur;
 375     end = ctxt->input->end;
 376     /* we also expect the input buffer to be zero terminated */
 377     if (*end != 0)
 378         return(NULL);
 379
 380     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
 381     if (cur == NULL)
 382         return(NULL);
 383     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
 384     if (cur == NULL)
 385         return(NULL);
 386     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
 387     if (cur == NULL)
 388         return(NULL);
 389     cur += 8;
 390     start = cur;
 391     while (((*cur >= 'A') && (*cur <= 'Z')) ||
 392            ((*cur >= 'a') && (*cur <= 'z')) ||
 393            ((*cur >= '0') && (*cur <= '9')) ||
 394            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
 395            cur++;
 396     if (cur == start)
 397         return(NULL);
 398     return(xmlStrndup(start, cur - start));
 399 }
 400
 401 /**
 402  * htmlCurrentChar:
 403  * @ctxt:  the HTML parser context
 404  * @len:  pointer to the length of the char read
 405  *
 406  * The current char value, if using UTF-8 this may actually span multiple
 407  * bytes in the input buffer. Implement the end of line normalization:
 408  * 2.11 End-of-Line Handling
 409  * If the encoding is unspecified, in the case we find an ISO-Latin-1
 410  * char, then the encoding converter is plugged in automatically.
 411  *
 412  * Returns the current char value and its length
 413  */
 414
 415 static int
 416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
 417     if (ctxt->instate == XML_PARSER_EOF)
 418         return(0);
 419
 420     if (ctxt->token != 0) {
 421         *len = 0;
 422         return(ctxt->token);
 423     }
 424     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
 425         /*
 426          * We are supposed to handle UTF8, check it's valid
 427          * From rfc2044: encoding of the Unicode values on UTF-8:
 428          *
 429          * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 430          * 0000 0000-0000 007F   0xxxxxxx
 431          * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 432          * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 433          *
 434          * Check for the 0x110000 limit too
 435          */
 436         const unsigned char *cur = ctxt->input->cur;
 437         unsigned char c;
 438         unsigned int val;
 439
 440         c = *cur;
 441         if (c & 0x80) {
 442             if (cur[1] == 0) {
 443                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 444                 cur = ctxt->input->cur;
 445             }
 446             if ((cur[1] & 0xc0) != 0x80)
 447                 goto encoding_error;
 448             if ((c & 0xe0) == 0xe0) {
 449
 450                 if (cur[2] == 0) {
 451                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 452                     cur = ctxt->input->cur;
 453                 }
 454                 if ((cur[2] & 0xc0) != 0x80)
 455                     goto encoding_error;
 456                 if ((c & 0xf0) == 0xf0) {
 457                     if (cur[3] == 0) {
 458                         xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 459                         cur = ctxt->input->cur;
 460                     }
 461                     if (((c & 0xf8) != 0xf0) ||
 462                         ((cur[3] & 0xc0) != 0x80))
 463                         goto encoding_error;
 464                     /* 4-byte code */
 465                     *len = 4;
 466                     val = (cur[0] & 0x7) << 18;
 467                     val |= (cur[1] & 0x3f) << 12;
 468                     val |= (cur[2] & 0x3f) << 6;
 469                     val |= cur[3] & 0x3f;
 470                 } else {
 471                   /* 3-byte code */
 472                     *len = 3;
 473                     val = (cur[0] & 0xf) << 12;
 474                     val |= (cur[1] & 0x3f) << 6;
 475                     val |= cur[2] & 0x3f;
 476                 }
 477             } else {
 478               /* 2-byte code */
 479                 *len = 2;
 480                 val = (cur[0] & 0x1f) << 6;
 481                 val |= cur[1] & 0x3f;
 482             }
 483             if (!IS_CHAR(val)) {
 484                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 485                                 "Char 0x%X out of allowed range\n", val);
 486             }
 487             return(val);
 488         } else {
 489             if ((*ctxt->input->cur == 0) &&
 490                 (ctxt->input->cur < ctxt->input->end)) {
 491                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 492                                 "Char 0x%X out of allowed range\n", 0);
 493                 *len = 1;
 494                 return(' ');
 495             }
 496             /* 1-byte code */
 497             *len = 1;
 498             return((int) *ctxt->input->cur);
 499         }
 500     }
 501     /*
 502      * Assume it's a fixed length encoding (1) with
 503      * a compatible encoding for the ASCII set, since
 504      * XML constructs only use < 128 chars
 505      */
 506     *len = 1;
 507     if ((int) *ctxt->input->cur < 0x80)
 508         return((int) *ctxt->input->cur);
 509
 510     /*
 511      * Humm this is bad, do an automatic flow conversion
 512      */
 513     {
 514         xmlChar * guess;
 515         xmlCharEncodingHandlerPtr handler;
 516
 517         guess = htmlFindEncoding(ctxt);
 518         if (guess == NULL) {
 519             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
 520         } else {
 521             if (ctxt->input->encoding != NULL)
 522                 xmlFree((xmlChar *) ctxt->input->encoding);
 523             ctxt->input->encoding = guess;
 524             handler = xmlFindCharEncodingHandler((const char *) guess);
 525             if (handler != NULL) {
 526                 xmlSwitchToEncoding(ctxt, handler);
 527             } else {
 528                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 529                              "Unsupported encoding %s", guess, NULL);
 530             }
 531         }
 532         ctxt->charset = XML_CHAR_ENCODING_UTF8;
 533     }
 534
 535     return(xmlCurrentChar(ctxt, len));
 536
 537 encoding_error:
 538     /*
 539      * If we detect an UTF8 error that probably mean that the
 540      * input encoding didn't get properly advertized in the
 541      * declaration header. Report the error and switch the encoding
 542      * to ISO-Latin-1 (if you don't like this policy, just declare the
 543      * encoding !)
 544      */
 545     {
 546         char buffer[150];
 547
 548         if (ctxt->input->end - ctxt->input->cur >= 4) {
 549             snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
 550                             ctxt->input->cur[0], ctxt->input->cur[1],
 551                             ctxt->input->cur[2], ctxt->input->cur[3]);
 552         } else {
 553             snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
 554         }
 555         htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 556                      "Input is not proper UTF-8, indicate encoding !\n",
 557                      BAD_CAST buffer, NULL);
 558     }
 559
 560     ctxt->charset = XML_CHAR_ENCODING_8859_1;
 561     *len = 1;
 562     return((int) *ctxt->input->cur);
 563 }
 564
 565 /**
 566  * htmlSkipBlankChars:
 567  * @ctxt:  the HTML parser context
 568  *
 569  * skip all blanks character found at that point in the input streams.
 570  *
 571  * Returns the number of space chars skipped
 572  */
 573
 574 static int
 575 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
 576     int res = 0;
 577
 578     while (IS_BLANK_CH(*(ctxt->input->cur))) {
 579         if ((*ctxt->input->cur == 0) &&
 580             (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
 581                 xmlPopInput(ctxt);
 582         } else {
 583             if (*(ctxt->input->cur) == '\n') {
 584                 ctxt->input->line++; ctxt->input->col = 1;
 585             } else ctxt->input->col++;
 586             ctxt->input->cur++;
 587             ctxt->nbChars++;
 588             if (*ctxt->input->cur == 0)
 589                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 590         }
 591         res++;
 592     }
 593     return(res);
 594 }
 595
 596
 597
 598 /************************************************************************
 599  *                                                                      *
 600  *      The list of HTML elements and their properties          *
 601  *                                                                      *
 602  ************************************************************************/
 603
 604 /*
 605  *  Start Tag: 1 means the start tag can be omitted
 606  *  End Tag:   1 means the end tag can be omitted
 607  *             2 means it's forbidden (empty elements)
 608  *             3 means the tag is stylistic and should be closed easily
 609  *  Depr:      this element is deprecated
 610  *  DTD:       1 means that this element is valid only in the Loose DTD
 611  *             2 means that this element is valid only in the Frameset DTD
 612  *
 613  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 614         , subElements , impliedsubelt , Attributes, userdata
 615  */
 616
 617 /* Definitions and a couple of vars for HTML Elements */
 618
 619 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
 620 #define NB_FONTSTYLE 8
 621 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
 622 #define NB_PHRASE 10
 623 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
 624 #define NB_SPECIAL 16
 625 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
 626 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
 627 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
 628 #define NB_BLOCK NB_HEADING + NB_LIST + 14
 629 #define FORMCTRL "input", "select", "textarea", "label", "button"
 630 #define NB_FORMCTRL 5
 631 #define PCDATA
 632 #define NB_PCDATA 0
 633 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
 634 #define NB_HEADING 6
 635 #define LIST "ul", "ol", "dir", "menu"
 636 #define NB_LIST 4
 637 #define MODIFIER
 638 #define NB_MODIFIER 0
 639 #define FLOW BLOCK,INLINE
 640 #define NB_FLOW NB_BLOCK + NB_INLINE
 641 #define EMPTY NULL
 642
 643
 644 static const char* const html_flow[] = { FLOW, NULL } ;
 645 static const char* const html_inline[] = { INLINE, NULL } ;
 646
 647 /* placeholders: elts with content but no subelements */
 648 static const char* const html_pcdata[] = { NULL } ;
 649 #define html_cdata html_pcdata
 650
 651
 652 /* ... and for HTML Attributes */
 653
 654 #define COREATTRS "id", "class", "style", "title"
 655 #define NB_COREATTRS 4
 656 #define I18N "lang", "dir"
 657 #define NB_I18N 2
 658 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
 659 #define NB_EVENTS 9
 660 #define ATTRS COREATTRS,I18N,EVENTS
 661 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
 662 #define CELLHALIGN "align", "char", "charoff"
 663 #define NB_CELLHALIGN 3
 664 #define CELLVALIGN "valign"
 665 #define NB_CELLVALIGN 1
 666
 667 static const char* const html_attrs[] = { ATTRS, NULL } ;
 668 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
 669 static const char* const core_attrs[] = { COREATTRS, NULL } ;
 670 static const char* const i18n_attrs[] = { I18N, NULL } ;
 671
 672
 673 /* Other declarations that should go inline ... */
 674 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
 675         "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
 676         "tabindex", "onfocus", "onblur", NULL } ;
 677 static const char* const target_attr[] = { "target", NULL } ;
 678 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
 679 static const char* const alt_attr[] = { "alt", NULL } ;
 680 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
 681 static const char* const href_attrs[] = { "href", NULL } ;
 682 static const char* const clear_attrs[] = { "clear", NULL } ;
 683 static const char* const inline_p[] = { INLINE, "p", NULL } ;
 684
 685 static const char* const flow_param[] = { FLOW, "param", NULL } ;
 686 static const char* const applet_attrs[] = { COREATTRS , "codebase",
 687                 "archive", "alt", "name", "height", "width", "align",
 688                 "hspace", "vspace", NULL } ;
 689 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
 690         "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 691 static const char* const basefont_attrs[] =
 692         { "id", "size", "color", "face", NULL } ;
 693 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
 694 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
 695 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
 696 static const char* const body_depr[] = { "background", "bgcolor", "text",
 697         "link", "vlink", "alink", NULL } ;
 698 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
 699         "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 700
 701
 702 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
 703 static const char* const col_elt[] = { "col", NULL } ;
 704 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
 705 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
 706 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
 707 static const char* const compact_attr[] = { "compact", NULL } ;
 708 static const char* const label_attr[] = { "label", NULL } ;
 709 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
 710 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
 711 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
 712 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
 713 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
 714 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
 715 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
 716 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
 717 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
 718 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
 719 static const char* const version_attr[] = { "version", NULL } ;
 720 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
 721 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
 722 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
 723 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
/*
 * NULL-terminated name lists referenced (through DECL) by the rows of
 * html40ElementTable.  By naming convention *_attrs / *_attr hold
 * attribute names, *_depr hold deprecated attribute names, and
 * *_content(s) / *_elt hold allowed child element names.
 * ATTRS, COREATTRS, I18N, BLOCK, FLOW, PHRASE, MODIFIER, CELLHALIGN and
 * CELLVALIGN are macros defined earlier in this file (not visible from
 * here); presumably each expands to a comma-separated run of string
 * literals - confirm against their definitions before editing.
 */
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
/* NOTE(review): FLOW and MODIFIER are juxtaposed without a comma; this
 * relies on the macro definitions supplying the separator - confirm. */
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
/* table-related lists */
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;
 766
/*
 * DECL casts one of the `const char* const` name lists to `const char**`,
 * dropping the inner const, so it can initialize the list members of an
 * htmlElemDesc row.
 */
#define DECL (const char**)

/*
 * Description of the HTML 4.0 elements, one positional initializer per tag.
 * Field order follows the htmlElemDesc declaration in <libxml/HTMLparser.h>
 * (verify there when editing):
 *   name, startTag, endTag, saveEndTag, empty, depr, dtd, isinline,
 *   desc, subelts, defaultsubelt, attrs_opt, attrs_depr, attrs_req
 * Rows are in alphabetical order of name.  htmlTagLookup() scans the
 * table linearly with a case-insensitive compare.
 * EMPTY is a macro defined earlier in the file (not visible from here).
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a",          0, 0, 0, 0, 0, 0, 1, "anchor ",
        DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym",    0, 0, 0, 0, 0, 0, 1, "",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
        DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
},
{ "applet",     0, 0, 0, 0, 1, 1, 2, "java applet ",
        DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
        EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b",          0, 3, 0, 0, 0, 0, 1, "bold text style",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base",       0, 2, 2, 1, 0, 0, 0, "document base uri ",
        EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
        EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
        DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big",        0, 3, 0, 0, 0, 0, 1, "large text style",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
        DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body",       1, 1, 0, 0, 0, 0, 0, "document body ",
        DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br",         0, 2, 2, 1, 0, 0, 1, "forced line break ",
        EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button",     0, 0, 0, 0, 0, 0, 2, "push button ",
        DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
        DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite",       0, 0, 0, 0, 0, 0, 1, "citation",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col",        0, 2, 2, 1, 0, 0, 0, "table column ",
        EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
        DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd",         0, 1, 0, 0, 0, 0, 0, "definition description ",
        DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del",        0, 0, 0, 0, 0, 0, 2, "deleted text ",
        DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir",        0, 0, 0, 0, 1, 1, 0, "directory list",
        DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container",
        DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl",         0, 0, 0, 0, 0, 0, 0, "definition list ",
        DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt",         0, 1, 0, 0, 0, 0, 0, "definition term ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em",         0, 3, 0, 0, 0, 0, 1, "emphasis",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed",      0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
        EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
{ "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
        DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font",       0, 3, 0, 0, 1, 1, 1, "local change to font ",
        DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form",       0, 0, 0, 0, 0, 0, 0, "interactive form ",
        DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " ,
        EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
        DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head",       1, 1, 0, 0, 0, 0, 0, "document head ",
        DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
        EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html",       1, 1, 0, 0, 0, 0, 0, "document root element ",
        DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i",          0, 3, 0, 0, 0, 0, 1, "italic text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
        DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img",        0, 2, 2, 1, 0, 0, 1, "embedded image ",
        EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input",      0, 2, 2, 1, 0, 0, 1, "form control ",
        EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text",
        DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
        EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label",      0, 0, 0, 0, 0, 0, 1, "form field label text ",
        DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
        DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li",         0, 1, 1, 0, 0, 0, 0, "list item ",
        DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
        EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map ",
        DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu",       0, 0, 0, 0, 1, 1, 0, "menu list ",
        DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
        EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
        DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
        DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
        DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list ",
        DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
        DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
        DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p",          0, 1, 0, 0, 0, 0, 0, "paragraph ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param",      0, 2, 2, 1, 0, 0, 0, "named property value ",
        EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text ",
        DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
        DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script",     0, 0, 0, 0, 0, 0, 2, "script statements ",
        DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select",     0, 0, 0, 0, 0, 0, 1, "option selector ",
        DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small",      0, 3, 0, 0, 0, 0, 1, "small text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "style",      0, 0, 0, 0, 0, 0, 0, "style info ",
        DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
},
{ "sub",        0, 3, 0, 0, 0, 0, 1, "subscript",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "sup",        0, 3, 0, 0, 0, 0, 1, "superscript ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "table",      0, 0, 0, 0, 0, 0, 0, "",
        DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
},
{ "tbody",      1, 0, 0, 0, 0, 0, 0, "table body ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "td",         0, 0, 0, 0, 0, 0, 0, "table data cell",
        DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
        DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
},
{ "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "th",         0, 1, 0, 0, 0, 0, 0, "table header cell",
        DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "thead",      0, 1, 0, 0, 0, 0, 0, "table header ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "title",      0, 0, 0, 0, 0, 0, 0, "document title ",
        DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
},
{ "tr",         0, 0, 0, 0, 0, 0, 0, "table row ",
        DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list ",
        DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};
1048
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups.  In each group
 * the first string is a start tag and the strings after it, up to the
 * NULL, are the open elements which that start tag auto-closes (this is
 * how htmlInitAutoClose() and htmlCheckAutoClose() read it).  One more
 * NULL, directly after the last group's terminator, ends the table.
 * htmlInitAutoClose() indexes at most 99 groups.
 * FONTSTYLE (used in the "p" group) is a macro defined earlier in the
 * file and not visible from here.
 */
static const char * const htmlStartClose[] = {
"form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                "dl", "ul", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", "head", NULL,
"head",         "p", NULL,
"title",        "p", NULL,
"body",         "head", "style", "link", "title", "p", NULL,
"frameset",     "head", "style", "link", "title", "p", NULL,
"li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                "pre", "listing", "xmp", "head", "li", NULL,
"hr",           "p", "head", NULL,
"h1",           "p", "head", NULL,
"h2",           "p", "head", NULL,
"h3",           "p", "head", NULL,
"h4",           "p", "head", NULL,
"h5",           "p", "head", NULL,
"h6",           "p", "head", NULL,
"dir",          "p", "head", NULL,
"address",      "p", "head", "ul", NULL,
"pre",          "p", "head", "ul", NULL,
"listing",      "p", "head", NULL,
"xmp",          "p", "head", NULL,
"blockquote",   "p", "head", NULL,
"dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                "xmp", "head", NULL,
"dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", NULL,
"ol",           "p", "head", "ul", NULL,
"menu",         "p", "head", "ul", NULL,
"p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div",          "p", "head", NULL,
"noscript",     "p", NULL,
"center",       "font", "b", "i", "p", "head", NULL,
"a",            "a", "head", NULL,
"caption",      "p", NULL,
"colgroup",     "caption", "colgroup", "col", "p", NULL,
"col",          "caption", "col", "p", NULL,
"table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                "listing", "xmp", "a", NULL,
"th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",        "caption", "col", "colgroup", NULL,
"tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tbody", "p", NULL,
"tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tfoot", "tbody", "p", NULL,
"optgroup",     "option", NULL,
"option",       "option", NULL,
"fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                "pre", "listing", "xmp", "a", NULL,
/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
"tt",           "head", NULL,
"i",            "head", NULL,
"b",            "head", NULL,
"u",            "head", NULL,
"s",            "head", NULL,
"strike",       "head", NULL,
"big",          "head", NULL,
"small",        "head", NULL,

"em",           "head", NULL,
"strong",       "head", NULL,
"dfn",          "head", NULL,
"code",         "head", NULL,
"samp",         "head", NULL,
"kbd",          "head", NULL,
"var",          "head", NULL,
"cite",         "head", NULL,
"abbr",         "head", NULL,
"acronym",      "head", NULL,

/* "a" */
"img",          "head", NULL,
/* "applet" */
/* "embed" */
/* "object" */
"font",         "head", NULL,
/* "basefont" */
"br",           "head", NULL,
/* "script" */
"map",          "head", NULL,
"q",            "head", NULL,
"sub",          "head", NULL,
"sup",          "head", NULL,
"span",         "head", NULL,
"bdo",          "head", NULL,
"iframe",       "head", NULL,
NULL
};
1146
/*
 * The list of HTML elements which are not supposed to have
 * CDATA content and in which a p element will be implied
 * (htmlCheckParagraph() walks this NULL-terminated list).
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
1159
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 * NOTE: unlike the other string lists around it, this array has no
 *       NULL terminator; presumably it is walked by element count
 *       (sizeof based) - confirm in htmlIsScriptAttribute().
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
1185
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags.
 * End tags are only allowed to close elements with lower or equal
 * priority (see htmlAutoCloseOnClose()).
 */

/* One (element name, end-tag priority) pair. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The row whose name is NULL ends the table and also carries the
 * priority htmlGetEndPriority() returns for every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
1213
/*
 * Index over htmlStartClose, built by htmlInitAutoClose(): each used slot
 * points at the first string of one group.  The builder fills at most 99
 * of the 100 slots, so a NULL slot always marks the end of the index.
 */
static const char** htmlStartCloseIndex[100];
/* non-zero once htmlInitAutoClose() has filled htmlStartCloseIndex */
static int htmlStartCloseIndexinitialized = 0;
1216
1217 /************************************************************************
1218  *                                                                      *
1219  *      functions to handle HTML specific data                  *
1220  *                                                                      *
1221  ************************************************************************/
1222
1223 /**
1224  * htmlInitAutoClose:
1225  *
1226  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1227  * This is not reentrant. Call xmlInitParser() once before processing in
1228  * case of use in multithreaded programs.
1229  */
1230 void
1231 htmlInitAutoClose(void) {
1232     int indx, i = 0;
1233
1234     if (htmlStartCloseIndexinitialized) return;
1235
1236     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1237     indx = 0;
1238     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1239         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1240         while (htmlStartClose[i] != NULL) i++;
1241         i++;
1242     }
1243     htmlStartCloseIndexinitialized = 1;
1244 }
1245
1246 /**
1247  * htmlTagLookup:
1248  * @tag:  The tag name in lowercase
1249  *
1250  * Lookup the HTML tag in the ElementTable
1251  *
1252  * Returns the related htmlElemDescPtr or NULL if not found.
1253  */
1254 const htmlElemDesc *
1255 htmlTagLookup(const xmlChar *tag) {
1256     unsigned int i;
1257
1258     for (i = 0; i < (sizeof(html40ElementTable) /
1259                      sizeof(html40ElementTable[0]));i++) {
1260         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1261             return((htmlElemDescPtr) &html40ElementTable[i]);
1262     }
1263     return(NULL);
1264 }
1265
1266 /**
1267  * htmlGetEndPriority:
1268  * @name: The name of the element to look up the priority for.
1269  *
1270  * Return value: The "endtag" priority.
1271  **/
1272 static int
1273 htmlGetEndPriority (const xmlChar *name) {
1274     int i = 0;
1275
1276     while ((htmlEndPriority[i].name != NULL) &&
1277            (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1278         i++;
1279
1280     return(htmlEndPriority[i].priority);
1281 }
1282
1283
1284 /**
1285  * htmlCheckAutoClose:
1286  * @newtag:  The new tag name
1287  * @oldtag:  The old tag name
1288  *
1289  * Checks whether the new tag is one of the registered valid tags for
1290  * closing old.
1291  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1292  *
1293  * Returns 0 if no, 1 if yes.
1294  */
1295 static int
1296 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1297 {
1298     int i, indx;
1299     const char **closed = NULL;
1300
1301     if (htmlStartCloseIndexinitialized == 0)
1302         htmlInitAutoClose();
1303
1304     /* inefficient, but not a big deal */
1305     for (indx = 0; indx < 100; indx++) {
1306         closed = htmlStartCloseIndex[indx];
1307         if (closed == NULL)
1308             return (0);
1309         if (xmlStrEqual(BAD_CAST * closed, newtag))
1310             break;
1311     }
1312
1313     i = closed - htmlStartClose;
1314     i++;
1315     while (htmlStartClose[i] != NULL) {
1316         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1317             return (1);
1318         }
1319         i++;
1320     }
1321     return (0);
1322 }
1323
1324 /**
1325  * htmlAutoCloseOnClose:
1326  * @ctxt:  an HTML parser context
1327  * @newtag:  The new tag name
1328  * @force:  force the tag closure
1329  *
1330  * The HTML DTD allows an ending tag to implicitly close other tags.
1331  */
1332 static void
1333 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1334 {
1335     const htmlElemDesc *info;
1336     int i, priority;
1337
1338     priority = htmlGetEndPriority(newtag);
1339
1340     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1341
1342         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1343             break;
1344         /*
1345          * A missplaced endtag can only close elements with lower
1346          * or equal priority, so if we find an element with higher
1347          * priority before we find an element with
1348          * matching name, we just ignore this endtag
1349          */
1350         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1351             return;
1352     }
1353     if (i < 0)
1354         return;
1355
1356     while (!xmlStrEqual(newtag, ctxt->name)) {
1357         info = htmlTagLookup(ctxt->name);
1358         if ((info != NULL) && (info->endTag == 3)) {
1359             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1360                          "Opening and ending tag mismatch: %s and %s\n",
1361                          newtag, ctxt->name);
1362         }
1363         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1364             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1365         htmlnamePop(ctxt);
1366     }
1367 }
1368
1369 /**
1370  * htmlAutoCloseOnEnd:
1371  * @ctxt:  an HTML parser context
1372  *
1373  * Close all remaining tags at the end of the stream
1374  */
1375 static void
1376 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1377 {
1378     int i;
1379
1380     if (ctxt->nameNr == 0)
1381         return;
1382     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1383         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1384             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1385         htmlnamePop(ctxt);
1386     }
1387 }
1388
1389 /**
1390  * htmlAutoClose:
1391  * @ctxt:  an HTML parser context
1392  * @newtag:  The new tag name or NULL
1393  *
1394  * The HTML DTD allows a tag to implicitly close other tags.
1395  * The list is kept in htmlStartClose array. This function is
1396  * called when a new tag has been detected and generates the
1397  * appropriates closes if possible/needed.
1398  * If newtag is NULL this mean we are at the end of the resource
1399  * and we should check
1400  */
1401 static void
1402 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1403 {
1404     while ((newtag != NULL) && (ctxt->name != NULL) &&
1405            (htmlCheckAutoClose(newtag, ctxt->name))) {
1406         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1407             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1408         htmlnamePop(ctxt);
1409     }
1410     if (newtag == NULL) {
1411         htmlAutoCloseOnEnd(ctxt);
1412         return;
1413     }
1414     while ((newtag == NULL) && (ctxt->name != NULL) &&
1415            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1416             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1417             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1418         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1420         htmlnamePop(ctxt);
1421     }
1422 }
1423
1424 /**
1425  * htmlAutoCloseTag:
1426  * @doc:  the HTML document
1427  * @name:  The tag name
1428  * @elem:  the HTML element
1429  *
1430  * The HTML DTD allows a tag to implicitly close other tags.
1431  * The list is kept in htmlStartClose array. This function checks
1432  * if the element or one of it's children would autoclose the
1433  * given tag.
1434  *
1435  * Returns 1 if autoclose, 0 otherwise
1436  */
1437 int
1438 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1439     htmlNodePtr child;
1440
1441     if (elem == NULL) return(1);
1442     if (xmlStrEqual(name, elem->name)) return(0);
1443     if (htmlCheckAutoClose(elem->name, name)) return(1);
1444     child = elem->children;
1445     while (child != NULL) {
1446         if (htmlAutoCloseTag(doc, name, child)) return(1);
1447         child = child->next;
1448     }
1449     return(0);
1450 }
1451
1452 /**
1453  * htmlIsAutoClosed:
1454  * @doc:  the HTML document
1455  * @elem:  the HTML element
1456  *
1457  * The HTML DTD allows a tag to implicitly close other tags.
1458  * The list is kept in htmlStartClose array. This function checks
1459  * if a tag is autoclosed by one of it's child
1460  *
1461  * Returns 1 if autoclosed, 0 otherwise
1462  */
1463 int
1464 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1465     htmlNodePtr child;
1466
1467     if (elem == NULL) return(1);
1468     child = elem->children;
1469     while (child != NULL) {
1470         if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1471         child = child->next;
1472     }
1473     return(0);
1474 }
1475
1476 /**
1477  * htmlCheckImplied:
1478  * @ctxt:  an HTML parser context
1479  * @newtag:  The new tag name
1480  *
1481  * The HTML DTD allows a tag to exists only implicitly
1482  * called when a new tag has been detected and generates the
1483  * appropriates implicit tags if missing
1484  */
1485 static void
1486 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1487     int i;
1488
1489     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1490         return;
1491     if (!htmlOmittedDefaultValue)
1492         return;
1493     if (xmlStrEqual(newtag, BAD_CAST"html"))
1494         return;
1495     if (ctxt->nameNr <= 0) {
1496         htmlnamePush(ctxt, BAD_CAST"html");
1497         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1498             ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1499     }
1500     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1501         return;
1502     if ((ctxt->nameNr <= 1) &&
1503         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1504          (xmlStrEqual(newtag, BAD_CAST"style")) ||
1505          (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1506          (xmlStrEqual(newtag, BAD_CAST"link")) ||
1507          (xmlStrEqual(newtag, BAD_CAST"title")) ||
1508          (xmlStrEqual(newtag, BAD_CAST"base")))) {
1509         if (ctxt->html >= 3) {
1510             /* we already saw or generated an <head> before */
1511             return;
1512         }
1513         /*
1514          * dropped OBJECT ... i you put it first BODY will be
1515          * assumed !
1516          */
1517         htmlnamePush(ctxt, BAD_CAST"head");
1518         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1519             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1520     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1521                (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1522                (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1523         if (ctxt->html >= 10) {
1524             /* we already saw or generated a <body> before */
1525             return;
1526         }
1527         for (i = 0;i < ctxt->nameNr;i++) {
1528             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1529                 return;
1530             }
1531             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1532                 return;
1533             }
1534         }
1535
1536         htmlnamePush(ctxt, BAD_CAST"body");
1537         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1538             ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1539     }
1540 }
1541
1542 /**
1543  * htmlCheckParagraph
1544  * @ctxt:  an HTML parser context
1545  *
1546  * Check whether a p element need to be implied before inserting
1547  * characters in the current element.
1548  *
1549  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1550  *         in case of error.
1551  */
1552
1553 static int
1554 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1555     const xmlChar *tag;
1556     int i;
1557
1558     if (ctxt == NULL)
1559         return(-1);
1560     tag = ctxt->name;
1561     if (tag == NULL) {
1562         htmlAutoClose(ctxt, BAD_CAST"p");
1563         htmlCheckImplied(ctxt, BAD_CAST"p");
1564         htmlnamePush(ctxt, BAD_CAST"p");
1565         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566             ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1567         return(1);
1568     }
1569     if (!htmlOmittedDefaultValue)
1570         return(0);
1571     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1572         if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1573             htmlAutoClose(ctxt, BAD_CAST"p");
1574             htmlCheckImplied(ctxt, BAD_CAST"p");
1575             htmlnamePush(ctxt, BAD_CAST"p");
1576             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1577                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1578             return(1);
1579         }
1580     }
1581     return(0);
1582 }
1583
1584 /**
1585  * htmlIsScriptAttribute:
1586  * @name:  an attribute name
1587  *
1588  * Check if an attribute is of content type Script
1589  *
1590  * Returns 1 is the attribute is a script 0 otherwise
1591  */
1592 int
1593 htmlIsScriptAttribute(const xmlChar *name) {
1594     unsigned int i;
1595
1596     if (name == NULL)
1597       return(0);
1598     /*
1599      * all script attributes start with 'on'
1600      */
1601     if ((name[0] != 'o') || (name[1] != 'n'))
1602       return(0);
1603     for (i = 0;
1604          i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1605          i++) {
1606         if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1607             return(1);
1608     }
1609     return(0);
1610 }
1611
1612 /************************************************************************
1613  *                                                                      *
1614  *      The list of HTML predefined entities                    *
1615  *                                                                      *
1616  ************************************************************************/
1617
1618
1619 static const htmlEntityDesc  html40EntitiesTable[] = {
1620 /*
1621  * the 4 absolute ones, plus apostrophe.
      *
      * Each row is { numeric value, entity name, description text }.
      * NOTE: rows are listed in ascending order of the numeric value;
      * htmlEntityValueLookup() relies on that ordering, so any new row
      * must be inserted at its sorted position.
1622  */
1623 { 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1624 { 38,   "amp",  "ampersand, U+0026 ISOnum" },
1625 { 39,   "apos", "single quote" },
1626 { 60,   "lt",   "less-than sign, U+003C ISOnum" },
1627 { 62,   "gt",   "greater-than sign, U+003E ISOnum" },
1628
1629 /*
1630  * A bunch still in the 128-255 range.
1631  * Replacing them really depends on the charset used.
1632  */
1633 { 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1634 { 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1635 { 162,  "cent", "cent sign, U+00A2 ISOnum" },
1636 { 163,  "pound","pound sign, U+00A3 ISOnum" },
1637 { 164,  "curren","currency sign, U+00A4 ISOnum" },
1638 { 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
1639 { 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1640 { 167,  "sect", "section sign, U+00A7 ISOnum" },
1641 { 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1642 { 169,  "copy", "copyright sign, U+00A9 ISOnum" },
1643 { 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1644 { 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1645 { 172,  "not",  "not sign, U+00AC ISOnum" },
1646 { 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1647 { 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
1648 { 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1649 { 176,  "deg",  "degree sign, U+00B0 ISOnum" },
1650 { 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1651 { 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1652 { 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1653 { 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1654 { 181,  "micro","micro sign, U+00B5 ISOnum" },
1655 { 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1656 { 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1657 { 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1658 { 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1659 { 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1660 { 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1661 { 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1662 { 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1663 { 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1664 { 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1665 { 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1666 { 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1667 { 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1668 { 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1669 { 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1670 { 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1671 { 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1672 { 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1673 { 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1674 { 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1675 { 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1676 { 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1677 { 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1678 { 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1679 { 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1680 { 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1681 { 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
1682 { 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1683 { 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1684 { 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1685 { 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1686 { 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1687 { 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1688 { 215,  "times","multiplication sign, U+00D7 ISOnum" },
1689 { 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1690 { 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1691 { 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1692 { 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1693 { 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1694 { 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1695 { 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1696 { 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1697 { 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1698 { 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1699 { 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1700 { 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1701 { 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1702 { 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1703 { 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1704 { 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1705 { 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1706 { 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1707 { 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1708 { 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1709 { 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1710 { 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1711 { 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1712 { 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1713 { 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
1714 { 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1715 { 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1716 { 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1717 { 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1718 { 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1719 { 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1720 { 247,  "divide","division sign, U+00F7 ISOnum" },
1721 { 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1722 { 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1723 { 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1724 { 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1725 { 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1726 { 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1727 { 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1728 { 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1729
1730 { 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1731 { 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
1732 { 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1733 { 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1734 { 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1735
1736 /*
1737  * Anything below should really be kept as entity references
1738  */
1739 { 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1740
1741 { 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1742 { 732,  "tilde","small tilde, U+02DC ISOdia" },
1743
1744 { 913,  "Alpha","greek capital letter alpha, U+0391" },
1745 { 914,  "Beta", "greek capital letter beta, U+0392" },
1746 { 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1747 { 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1748 { 917,  "Epsilon","greek capital letter epsilon, U+0395" },
1749 { 918,  "Zeta", "greek capital letter zeta, U+0396" },
1750 { 919,  "Eta",  "greek capital letter eta, U+0397" },
1751 { 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1752 { 921,  "Iota", "greek capital letter iota, U+0399" },
1753 { 922,  "Kappa","greek capital letter kappa, U+039A" },
1754 { 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1755 { 924,  "Mu",   "greek capital letter mu, U+039C" },
1756 { 925,  "Nu",   "greek capital letter nu, U+039D" },
1757 { 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
1758 { 927,  "Omicron","greek capital letter omicron, U+039F" },
1759 { 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
1760 { 929,  "Rho",  "greek capital letter rho, U+03A1" },
1761 { 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1762 { 932,  "Tau",  "greek capital letter tau, U+03A4" },
1763 { 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1764 { 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
1765 { 935,  "Chi",  "greek capital letter chi, U+03A7" },
1766 { 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
1767 { 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1768
1769 { 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1770 { 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1771 { 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1772 { 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
1773 { 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1774 { 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1775 { 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
1776 { 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
1777 { 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1778 { 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1779 { 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1780 { 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
1781 { 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
1782 { 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
1783 { 959,  "omicron","greek small letter omicron, U+03BF NEW" },
1784 { 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
1785 { 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
1786 { 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1787 { 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1788 { 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
1789 { 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1790 { 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
1791 { 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
1792 { 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
1793 { 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
1794 { 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1795 { 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1796 { 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
1797
1798 { 8194, "ensp", "en space, U+2002 ISOpub" },
1799 { 8195, "emsp", "em space, U+2003 ISOpub" },
1800 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1801 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1802 { 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
1803 { 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
1804 { 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
1805 { 8211, "ndash","en dash, U+2013 ISOpub" },
1806 { 8212, "mdash","em dash, U+2014 ISOpub" },
1807 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1808 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1809 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1810 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1811 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1812 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1813 { 8224, "dagger","dagger, U+2020 ISOpub" },
1814 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1815
1816 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1817 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1818
1819 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1820
1821 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1822 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1823
1824 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1825 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1826
1827 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1828 { 8260, "frasl","fraction slash, U+2044 NEW" },
1829
1830 { 8364, "euro", "euro sign, U+20AC NEW" },
1831
1832 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1833 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1834 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1835 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1836 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1837 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1838 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1839 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1840 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1841 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1842 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1843 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1844 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1845 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1846 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1847 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1848
1849 { 8704, "forall","for all, U+2200 ISOtech" },
1850 { 8706, "part", "partial differential, U+2202 ISOtech" },
1851 { 8707, "exist","there exists, U+2203 ISOtech" },
1852 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1853 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1854 { 8712, "isin", "element of, U+2208 ISOtech" },
1855 { 8713, "notin","not an element of, U+2209 ISOtech" },
1856 { 8715, "ni",   "contains as member, U+220B ISOtech" },
1857 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1858 { 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
1859 { 8722, "minus","minus sign, U+2212 ISOtech" },
1860 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1861 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1862 { 8733, "prop", "proportional to, U+221D ISOtech" },
1863 { 8734, "infin","infinity, U+221E ISOtech" },
1864 { 8736, "ang",  "angle, U+2220 ISOamso" },
1865 { 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
1866 { 8744, "or",   "logical or = vee, U+2228 ISOtech" },
1867 { 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
1868 { 8746, "cup",  "union = cup, U+222A ISOtech" },
1869 { 8747, "int",  "integral, U+222B ISOtech" },
1870 { 8756, "there4","therefore, U+2234 ISOtech" },
1871 { 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
1872 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1873 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1874 { 8800, "ne",   "not equal to, U+2260 ISOtech" },
1875 { 8801, "equiv","identical to, U+2261 ISOtech" },
1876 { 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
1877 { 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
1878 { 8834, "sub",  "subset of, U+2282 ISOtech" },
1879 { 8835, "sup",  "superset of, U+2283 ISOtech" },
1880 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1881 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1882 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1883 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1884 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1885 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1886 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1887 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1888 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1889 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1890 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1891 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1892 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1893 { 9674, "loz",  "lozenge, U+25CA ISOpub" },
1894
1895 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1896 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1897 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1898 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1899
1900 };
1901
1902 /************************************************************************
1903  *                                                                      *
1904  *              Commodity functions to handle entities                  *
1905  *                                                                      *
1906  ************************************************************************/
1907
/*
 * growBuffer:
 * Statement macro doubling the capacity of a heap buffer. For an argument
 * named X it expects a companion size variable X_size and a parser
 * context called ctxt in the enclosing function. X_size is doubled first,
 * then X is reallocated to X_size xmlChar. If the reallocation fails, the
 * error is reported, the old buffer is freed and the ENCLOSING function
 * returns NULL.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {                                                  \
        buffer = tmp;                                                   \
    } else {                                                            \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
}
1922
1923 /**
1924  * htmlEntityLookup:
1925  * @name: the entity name
1926  *
1927  * Lookup the given entity in EntitiesTable
1928  *
1929  * TODO: the linear scan is really ugly, an hash table is really needed.
1930  *
1931  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1932  */
1933 const htmlEntityDesc *
1934 htmlEntityLookup(const xmlChar *name) {
1935     unsigned int i;
1936
1937     for (i = 0;i < (sizeof(html40EntitiesTable)/
1938                     sizeof(html40EntitiesTable[0]));i++) {
1939         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1940             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1941         }
1942     }
1943     return(NULL);
1944 }
1945
1946 /**
1947  * htmlEntityValueLookup:
1948  * @value: the entity's unicode value
1949  *
1950  * Lookup the given entity in EntitiesTable
1951  *
1952  * TODO: the linear scan is really ugly, an hash table is really needed.
1953  *
1954  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1955  */
1956 const htmlEntityDesc *
1957 htmlEntityValueLookup(unsigned int value) {
1958     unsigned int i;
1959
1960     for (i = 0;i < (sizeof(html40EntitiesTable)/
1961                     sizeof(html40EntitiesTable[0]));i++) {
1962         if (html40EntitiesTable[i].value >= value) {
1963             if (html40EntitiesTable[i].value > value)
1964                 break;
1965             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1966         }
1967     }
1968     return(NULL);
1969 }
1970
1971 /**
1972  * UTF8ToHtml:
1973  * @out:  a pointer to an array of bytes to store the result
1974  * @outlen:  the length of @out
1975  * @in:  a pointer to an array of UTF-8 chars
1976  * @inlen:  the length of @in
1977  *
1978  * Take a block of UTF-8 chars in and try to convert it to an ASCII
1979  * plus HTML entities block of chars out.
1980  *
1981  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1982  * The value of @inlen after return is the number of octets consumed
1983  *     as the return value is positive, else unpredictable.
1984  * The value of @outlen after return is the number of octets consumed.
1985  */
1986 int
1987 UTF8ToHtml(unsigned char* out, int *outlen,
1988               const unsigned char* in, int *inlen) {
1989     const unsigned char* processed = in;
1990     const unsigned char* outend;
1991     const unsigned char* outstart = out;
1992     const unsigned char* instart = in;
1993     const unsigned char* inend;
1994     unsigned int c, d;
1995     int trailing;
1996
1997     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1998     if (in == NULL) {
1999         /*
2000          * initialization nothing to do
2001          */
2002         *outlen = 0;
2003         *inlen = 0;
2004         return(0);
2005     }
2006     inend = in + (*inlen);
2007     outend = out + (*outlen);
2008     while (in < inend) {
2009         d = *in++;
2010         if      (d < 0x80)  { c= d; trailing= 0; }
2011         else if (d < 0xC0) {
2012             /* trailing byte in leading position */
2013             *outlen = out - outstart;
2014             *inlen = processed - instart;
2015             return(-2);
2016         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2017         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2018         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2019         else {
2020             /* no chance for this in Ascii */
2021             *outlen = out - outstart;
2022             *inlen = processed - instart;
2023             return(-2);
2024         }
2025
2026         if (inend - in < trailing) {
2027             break;
2028         }
2029
2030         for ( ; trailing; trailing--) {
2031             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2032                 break;
2033             c <<= 6;
2034             c |= d & 0x3F;
2035         }
2036
2037         /* assertion: c is a single UTF-4 value */
2038         if (c < 0x80) {
2039             if (out + 1 >= outend)
2040                 break;
2041             *out++ = c;
2042         } else {
2043             int len;
2044             const htmlEntityDesc * ent;
2045             const char *cp;
2046             char nbuf[16];
2047
2048             /*
2049              * Try to lookup a predefined HTML entity for it
2050              */
2051
2052             ent = htmlEntityValueLookup(c);
2053             if (ent == NULL) {
2054               snprintf(nbuf, sizeof(nbuf), "#%u", c);
2055               cp = nbuf;
2056             }
2057             else
2058               cp = ent->name;
2059             len = strlen(cp);
2060             if (out + 2 + len >= outend)
2061                 break;
2062             *out++ = '&';
2063             memcpy(out, cp, len);
2064             out += len;
2065             *out++ = ';';
2066         }
2067         processed = in;
2068     }
2069     *outlen = out - outstart;
2070     *inlen = processed - instart;
2071     return(0);
2072 }
2073
2074 /**
2075  * htmlEncodeEntities:
2076  * @out:  a pointer to an array of bytes to store the result
2077  * @outlen:  the length of @out
2078  * @in:  a pointer to an array of UTF-8 chars
2079  * @inlen:  the length of @in
2080  * @quoteChar: the quote character to escape (' or ") or zero.
2081  *
2082  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2083  * plus HTML entities block of chars out.
2084  *
2085  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2086  * The value of @inlen after return is the number of octets consumed
2087  *     as the return value is positive, else unpredictable.
2088  * The value of @outlen after return is the number of octets consumed.
2089  */
2090 int
2091 htmlEncodeEntities(unsigned char* out, int *outlen,
2092                    const unsigned char* in, int *inlen, int quoteChar) {
2093     const unsigned char* processed = in;
2094     const unsigned char* outend;
2095     const unsigned char* outstart = out;
2096     const unsigned char* instart = in;
2097     const unsigned char* inend;
2098     unsigned int c, d;
2099     int trailing;
2100
2101     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2102         return(-1);
2103     outend = out + (*outlen);
2104     inend = in + (*inlen);
2105     while (in < inend) {
2106         d = *in++;
2107         if      (d < 0x80)  { c= d; trailing= 0; }
2108         else if (d < 0xC0) {
2109             /* trailing byte in leading position */
2110             *outlen = out - outstart;
2111             *inlen = processed - instart;
2112             return(-2);
2113         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2114         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2115         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2116         else {
2117             /* no chance for this in Ascii */
2118             *outlen = out - outstart;
2119             *inlen = processed - instart;
2120             return(-2);
2121         }
2122
2123         if (inend - in < trailing)
2124             break;
2125
2126         while (trailing--) {
2127             if (((d= *in++) & 0xC0) != 0x80) {
2128                 *outlen = out - outstart;
2129                 *inlen = processed - instart;
2130                 return(-2);
2131             }
2132             c <<= 6;
2133             c |= d & 0x3F;
2134         }
2135
2136         /* assertion: c is a single UTF-4 value */
2137         if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2138             (c != '&') && (c != '<') && (c != '>')) {
2139             if (out >= outend)
2140                 break;
2141             *out++ = c;
2142         } else {
2143             const htmlEntityDesc * ent;
2144             const char *cp;
2145             char nbuf[16];
2146             int len;
2147
2148             /*
2149              * Try to lookup a predefined HTML entity for it
2150              */
2151             ent = htmlEntityValueLookup(c);
2152             if (ent == NULL) {
2153                 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2154                 cp = nbuf;
2155             }
2156             else
2157                 cp = ent->name;
2158             len = strlen(cp);
2159             if (out + 2 + len > outend)
2160                 break;
2161             *out++ = '&';
2162             memcpy(out, cp, len);
2163             out += len;
2164             *out++ = ';';
2165         }
2166         processed = in;
2167     }
2168     *outlen = out - outstart;
2169     *inlen = processed - instart;
2170     return(0);
2171 }
2172
2173 /************************************************************************
2174  *                                                                      *
2175  *              Commodity functions to handle streams                   *
2176  *                                                                      *
2177  ************************************************************************/
2178
2179 /**
2180  * htmlNewInputStream:
2181  * @ctxt:  an HTML parser context
2182  *
2183  * Create a new input stream structure
2184  * Returns the new input stream or NULL
2185  */
2186 static htmlParserInputPtr
2187 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2188     htmlParserInputPtr input;
2189
2190     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2191     if (input == NULL) {
2192         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2193         return(NULL);
2194     }
2195     memset(input, 0, sizeof(htmlParserInput));
2196     input->filename = NULL;
2197     input->directory = NULL;
2198     input->base = NULL;
2199     input->cur = NULL;
2200     input->buf = NULL;
2201     input->line = 1;
2202     input->col = 1;
2203     input->buf = NULL;
2204     input->free = NULL;
2205     input->version = NULL;
2206     input->consumed = 0;
2207     input->length = 0;
2208     return(input);
2209 }
2210
2211
2212 /************************************************************************
2213  *                                                                      *
2214  *              Commodity functions, cleanup needed ?                   *
2215  *                                                                      *
2216  ************************************************************************/
/*
 * All tags allowing PCDATA according to the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but that is avoided here so as
 * not to risk any binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2231
2232 /**
2233  * areBlanks:
2234  * @ctxt:  an HTML parser context
2235  * @str:  a xmlChar *
2236  * @len:  the size of @str
2237  *
2238  * Is this a sequence of blank chars that one can ignore ?
2239  *
2240  * Returns 1 if ignorable 0 otherwise.
2241  */
2242
2243 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2244     unsigned int i;
2245     int j;
2246     xmlNodePtr lastChild;
2247     xmlDtdPtr dtd;
2248
2249     for (j = 0;j < len;j++)
2250         if (!(IS_BLANK_CH(str[j]))) return(0);
2251
2252     if (CUR == 0) return(1);
2253     if (CUR != '<') return(0);
2254     if (ctxt->name == NULL)
2255         return(1);
2256     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2257         return(1);
2258     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2259         return(1);
2260
2261     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2262     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2263         dtd = xmlGetIntSubset(ctxt->myDoc);
2264         if (dtd != NULL && dtd->ExternalID != NULL) {
2265             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2266                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2267                 return(1);
2268         }
2269     }
2270
2271     if (ctxt->node == NULL) return(0);
2272     lastChild = xmlGetLastChild(ctxt->node);
2273     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2274         lastChild = lastChild->prev;
2275     if (lastChild == NULL) {
2276         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2277             (ctxt->node->content != NULL)) return(0);
2278         /* keep ws in constructs like ...<b> </b>...
2279            for all tags "b" allowing PCDATA */
2280         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2281             if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2282                 return(0);
2283             }
2284         }
2285     } else if (xmlNodeIsText(lastChild)) {
2286         return(0);
2287     } else {
2288         /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2289            for all tags "p" allowing PCDATA */
2290         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2291             if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2292                 return(0);
2293             }
2294         }
2295     }
2296     return(1);
2297 }
2298
2299 /**
2300  * htmlNewDocNoDtD:
2301  * @URI:  URI for the dtd, or NULL
2302  * @ExternalID:  the external ID of the DTD, or NULL
2303  *
2304  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2305  * are NULL
2306  *
2307  * Returns a new document, do not initialize the DTD if not provided
2308  */
2309 htmlDocPtr
2310 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2311     xmlDocPtr cur;
2312
2313     /*
2314      * Allocate a new document and fill the fields.
2315      */
2316     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2317     if (cur == NULL) {
2318         htmlErrMemory(NULL, "HTML document creation failed\n");
2319         return(NULL);
2320     }
2321     memset(cur, 0, sizeof(xmlDoc));
2322
2323     cur->type = XML_HTML_DOCUMENT_NODE;
2324     cur->version = NULL;
2325     cur->intSubset = NULL;
2326     cur->doc = cur;
2327     cur->name = NULL;
2328     cur->children = NULL;
2329     cur->extSubset = NULL;
2330     cur->oldNs = NULL;
2331     cur->encoding = NULL;
2332     cur->standalone = 1;
2333     cur->compression = 0;
2334     cur->ids = NULL;
2335     cur->refs = NULL;
2336     cur->_private = NULL;
2337     cur->charset = XML_CHAR_ENCODING_UTF8;
2338     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2339     if ((ExternalID != NULL) ||
2340         (URI != NULL))
2341         xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2342     return(cur);
2343 }
2344
2345 /**
2346  * htmlNewDoc:
2347  * @URI:  URI for the dtd, or NULL
2348  * @ExternalID:  the external ID of the DTD, or NULL
2349  *
2350  * Creates a new HTML document
2351  *
2352  * Returns a new document
2353  */
2354 htmlDocPtr
2355 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2356     if ((URI == NULL) && (ExternalID == NULL))
2357         return(htmlNewDocNoDtD(
2358                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2359                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2360
2361     return(htmlNewDocNoDtD(URI, ExternalID));
2362 }
2363
2364
2365 /************************************************************************
2366  *                                                                      *
2367  *                      The parser itself                               *
2368  *      Relates to http://www.w3.org/TR/html40                          *
2369  *                                                                      *
2370  ************************************************************************/
2371
2372 /************************************************************************
2373  *                                                                      *
2374  *                      The parser itself                               *
2375  *                                                                      *
2376  ************************************************************************/
2377
/*
 * Forward declaration: htmlParseName() falls back to this routine, which
 * is defined after it.
 */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2379
2380 /**
2381  * htmlParseHTMLName:
2382  * @ctxt:  an HTML parser context
2383  *
2384  * parse an HTML tag or attribute name, note that we convert it to lowercase
2385  * since HTML names are not case-sensitive.
2386  *
2387  * Returns the Tag Name parsed or NULL
2388  */
2389
2390 static const xmlChar *
2391 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2392     int i = 0;
2393     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2394
2395     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2396         (CUR != ':') && (CUR != '.')) return(NULL);
2397
2398     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2399            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2400            (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2401            (CUR == '.'))) {
2402         if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2403         else loc[i] = CUR;
2404         i++;
2405
2406         NEXT;
2407     }
2408
2409     return(xmlDictLookup(ctxt->dict, loc, i));
2410 }
2411
2412
2413 /**
2414  * htmlParseHTMLName_nonInvasive:
2415  * @ctxt:  an HTML parser context
2416  *
2417  * parse an HTML tag or attribute name, note that we convert it to lowercase
2418  * since HTML names are not case-sensitive, this doesn't consume the data
2419  * from the stream, it's a look-ahead
2420  *
2421  * Returns the Tag Name parsed or NULL
2422  */
2423
2424 static const xmlChar *
2425 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2426     int i = 0;
2427     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2428
2429     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2430         (NXT(1) != ':')) return(NULL);
2431
2432     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2433            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2434            (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2435         if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2436         else loc[i] = NXT(1+i);
2437         i++;
2438     }
2439
2440     return(xmlDictLookup(ctxt->dict, loc, i));
2441 }
2442
2443
2444 /**
2445  * htmlParseName:
2446  * @ctxt:  an HTML parser context
2447  *
2448  * parse an HTML name, this routine is case sensitive.
2449  *
2450  * Returns the Name parsed or NULL
2451  */
2452
2453 static const xmlChar *
2454 htmlParseName(htmlParserCtxtPtr ctxt) {
2455     const xmlChar *in;
2456     const xmlChar *ret;
2457     int count = 0;
2458
2459     GROW;
2460
2461     /*
2462      * Accelerator for simple ASCII names
2463      */
2464     in = ctxt->input->cur;
2465     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2466         ((*in >= 0x41) && (*in <= 0x5A)) ||
2467         (*in == '_') || (*in == ':')) {
2468         in++;
2469         while (((*in >= 0x61) && (*in <= 0x7A)) ||
2470                ((*in >= 0x41) && (*in <= 0x5A)) ||
2471                ((*in >= 0x30) && (*in <= 0x39)) ||
2472                (*in == '_') || (*in == '-') ||
2473                (*in == ':') || (*in == '.'))
2474             in++;
2475
2476         if (in == ctxt->input->end)
2477             return(NULL);
2478
2479         if ((*in > 0) && (*in < 0x80)) {
2480             count = in - ctxt->input->cur;
2481             ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2482             ctxt->input->cur = in;
2483             ctxt->nbChars += count;
2484             ctxt->input->col += count;
2485             return(ret);
2486         }
2487     }
2488     return(htmlParseNameComplex(ctxt));
2489 }
2490
2491 static const xmlChar *
2492 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2493     int len = 0, l;
2494     int c;
2495     int count = 0;
2496     const xmlChar *base = ctxt->input->base;
2497
2498     /*
2499      * Handler for more complex cases
2500      */
2501     GROW;
2502     c = CUR_CHAR(l);
2503     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2504         (!IS_LETTER(c) && (c != '_') &&
2505          (c != ':'))) {
2506         return(NULL);
2507     }
2508
2509     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2510            ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2511             (c == '.') || (c == '-') ||
2512             (c == '_') || (c == ':') ||
2513             (IS_COMBINING(c)) ||
2514             (IS_EXTENDER(c)))) {
2515         if (count++ > 100) {
2516             count = 0;
2517             GROW;
2518         }
2519         len += l;
2520         NEXTL(l);
2521         c = CUR_CHAR(l);
2522         if (ctxt->input->base != base) {
2523             /*
2524              * We changed encoding from an unknown encoding
2525              * Input buffer changed location, so we better start again
2526              */
2527             return(htmlParseNameComplex(ctxt));
2528         }
2529     }
2530
2531     if (ctxt->input->cur - ctxt->input->base < len) {
2532         /* Sanity check */
2533         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2534                      "unexpected change of input buffer", NULL, NULL);
2535         return (NULL);
2536     }
2537
2538     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2539 }
2540
2541
2542 /**
2543  * htmlParseHTMLAttribute:
2544  * @ctxt:  an HTML parser context
2545  * @stop:  a char stop value
2546  *
2547  * parse an HTML attribute value till the stop (quote), if
2548  * stop is 0 then it stops at the first space
2549  *
2550  * Returns the attribute parsed or NULL
2551  */
2552
2553 static xmlChar *
2554 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2555     xmlChar *buffer = NULL;
2556     int buffer_size = 0;
2557     xmlChar *out = NULL;
2558     const xmlChar *name = NULL;
2559     const xmlChar *cur = NULL;
2560     const htmlEntityDesc * ent;
2561
2562     /*
2563      * allocate a translation buffer.
2564      */
2565     buffer_size = HTML_PARSER_BUFFER_SIZE;
2566     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2567     if (buffer == NULL) {
2568         htmlErrMemory(ctxt, "buffer allocation failed\n");
2569         return(NULL);
2570     }
2571     out = buffer;
2572
2573     /*
2574      * Ok loop until we reach one of the ending chars
2575      */
2576     while ((CUR != 0) && (CUR != stop)) {
2577         if ((stop == 0) && (CUR == '>')) break;
2578         if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2579         if (CUR == '&') {
2580             if (NXT(1) == '#') {
2581                 unsigned int c;
2582                 int bits;
2583
2584                 c = htmlParseCharRef(ctxt);
2585                 if      (c <    0x80)
2586                         { *out++  = c;                bits= -6; }
2587                 else if (c <   0x800)
2588                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2589                 else if (c < 0x10000)
2590                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2591                 else
2592                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2593
2594                 for ( ; bits >= 0; bits-= 6) {
2595                     *out++  = ((c >> bits) & 0x3F) | 0x80;
2596                 }
2597
2598                 if (out - buffer > buffer_size - 100) {
2599                         int indx = out - buffer;
2600
2601                         growBuffer(buffer);
2602                         out = &buffer[indx];
2603                 }
2604             } else {
2605                 ent = htmlParseEntityRef(ctxt, &name);
2606                 if (name == NULL) {
2607                     *out++ = '&';
2608                     if (out - buffer > buffer_size - 100) {
2609                         int indx = out - buffer;
2610
2611                         growBuffer(buffer);
2612                         out = &buffer[indx];
2613                     }
2614                 } else if (ent == NULL) {
2615                     *out++ = '&';
2616                     cur = name;
2617                     while (*cur != 0) {
2618                         if (out - buffer > buffer_size - 100) {
2619                             int indx = out - buffer;
2620
2621                             growBuffer(buffer);
2622                             out = &buffer[indx];
2623                         }
2624                         *out++ = *cur++;
2625                     }
2626                 } else {
2627                     unsigned int c;
2628                     int bits;
2629
2630                     if (out - buffer > buffer_size - 100) {
2631                         int indx = out - buffer;
2632
2633                         growBuffer(buffer);
2634                         out = &buffer[indx];
2635                     }
2636                     c = ent->value;
2637                     if      (c <    0x80)
2638                         { *out++  = c;                bits= -6; }
2639                     else if (c <   0x800)
2640                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2641                     else if (c < 0x10000)
2642                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2643                     else
2644                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2645
2646                     for ( ; bits >= 0; bits-= 6) {
2647                         *out++  = ((c >> bits) & 0x3F) | 0x80;
2648                     }
2649                 }
2650             }
2651         } else {
2652             unsigned int c;
2653             int bits, l;
2654
2655             if (out - buffer > buffer_size - 100) {
2656                 int indx = out - buffer;
2657
2658                 growBuffer(buffer);
2659                 out = &buffer[indx];
2660             }
2661             c = CUR_CHAR(l);
2662             if      (c <    0x80)
2663                     { *out++  = c;                bits= -6; }
2664             else if (c <   0x800)
2665                     { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2666             else if (c < 0x10000)
2667                     { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2668             else
2669                     { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2670
2671             for ( ; bits >= 0; bits-= 6) {
2672                 *out++  = ((c >> bits) & 0x3F) | 0x80;
2673             }
2674             NEXT;
2675         }
2676     }
2677     *out = 0;
2678     return(buffer);
2679 }
2680
2681 /**
2682  * htmlParseEntityRef:
2683  * @ctxt:  an HTML parser context
2684  * @str:  location to store the entity name
2685  *
2686  * parse an HTML ENTITY references
2687  *
2688  * [68] EntityRef ::= '&' Name ';'
2689  *
2690  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2691  *         if non-NULL *str will have to be freed by the caller.
2692  */
2693 const htmlEntityDesc *
2694 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2695     const xmlChar *name;
2696     const htmlEntityDesc * ent = NULL;
2697
2698     if (str != NULL) *str = NULL;
2699     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2700
2701     if (CUR == '&') {
2702         NEXT;
2703         name = htmlParseName(ctxt);
2704         if (name == NULL) {
2705             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2706                          "htmlParseEntityRef: no name\n", NULL, NULL);
2707         } else {
2708             GROW;
2709             if (CUR == ';') {
2710                 if (str != NULL)
2711                     *str = name;
2712
2713                 /*
2714                  * Lookup the entity in the table.
2715                  */
2716                 ent = htmlEntityLookup(name);
2717                 if (ent != NULL) /* OK that's ugly !!! */
2718                     NEXT;
2719             } else {
2720                 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2721                              "htmlParseEntityRef: expecting ';'\n",
2722                              NULL, NULL);
2723                 if (str != NULL)
2724                     *str = name;
2725             }
2726         }
2727     }
2728     return(ent);
2729 }
2730
2731 /**
2732  * htmlParseAttValue:
2733  * @ctxt:  an HTML parser context
2734  *
2735  * parse a value for an attribute
2736  * Note: the parser won't do substitution of entities here, this
2737  * will be handled later in xmlStringGetNodeList, unless it was
2738  * asked for ctxt->replaceEntities != 0
2739  *
2740  * Returns the AttValue parsed or NULL.
2741  */
2742
2743 static xmlChar *
2744 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2745     xmlChar *ret = NULL;
2746
2747     if (CUR == '"') {
2748         NEXT;
2749         ret = htmlParseHTMLAttribute(ctxt, '"');
2750         if (CUR != '"') {
2751             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2752                          "AttValue: \" expected\n", NULL, NULL);
2753         } else
2754             NEXT;
2755     } else if (CUR == '\'') {
2756         NEXT;
2757         ret = htmlParseHTMLAttribute(ctxt, '\'');
2758         if (CUR != '\'') {
2759             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2760                          "AttValue: ' expected\n", NULL, NULL);
2761         } else
2762             NEXT;
2763     } else {
2764         /*
2765          * That's an HTMLism, the attribute value may not be quoted
2766          */
2767         ret = htmlParseHTMLAttribute(ctxt, 0);
2768         if (ret == NULL) {
2769             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2770                          "AttValue: no value found\n", NULL, NULL);
2771         }
2772     }
2773     return(ret);
2774 }
2775
2776 /**
2777  * htmlParseSystemLiteral:
2778  * @ctxt:  an HTML parser context
2779  *
2780  * parse an HTML Literal
2781  *
2782  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2783  *
2784  * Returns the SystemLiteral parsed or NULL
2785  */
2786
2787 static xmlChar *
2788 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2789     size_t len = 0, startPosition = 0;
2790     xmlChar *ret = NULL;
2791
2792     if (CUR == '"') {
2793         NEXT;
2794
2795         if (CUR_PTR < BASE_PTR)
2796             return(ret);
2797         startPosition = CUR_PTR - BASE_PTR;
2798
2799         while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
2800             NEXT;
2801             len++;
2802         }
2803         if (!IS_CHAR_CH(CUR)) {
2804             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2805                          "Unfinished SystemLiteral\n", NULL, NULL);
2806         } else {
2807             ret = xmlStrndup((BASE_PTR+startPosition), len);
2808             NEXT;
2809         }
2810     } else if (CUR == '\'') {
2811         NEXT;
2812
2813         if (CUR_PTR < BASE_PTR)
2814             return(ret);
2815         startPosition = CUR_PTR - BASE_PTR;
2816
2817         while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
2818             NEXT;
2819             len++;
2820         }
2821         if (!IS_CHAR_CH(CUR)) {
2822             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2823                          "Unfinished SystemLiteral\n", NULL, NULL);
2824         } else {
2825             ret = xmlStrndup((BASE_PTR+startPosition), len);
2826             NEXT;
2827         }
2828     } else {
2829         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2830                      " or ' expected\n", NULL, NULL);
2831     }
2832
2833     return(ret);
2834 }
2835
2836 /**
2837  * htmlParsePubidLiteral:
2838  * @ctxt:  an HTML parser context
2839  *
2840  * parse an HTML public literal
2841  *
2842  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2843  *
2844  * Returns the PubidLiteral parsed or NULL.
2845  */
2846
2847 static xmlChar *
2848 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2849     size_t len = 0, startPosition = 0;
2850     xmlChar *ret = NULL;
2851     /*
2852      * Name ::= (Letter | '_') (NameChar)*
2853      */
2854     if (CUR == '"') {
2855         NEXT;
2856
2857         if (CUR_PTR < BASE_PTR)
2858             return(ret);
2859         startPosition = CUR_PTR - BASE_PTR;
2860
2861         while (IS_PUBIDCHAR_CH(CUR)) {
2862             len++;
2863             NEXT;
2864         }
2865
2866         if (CUR != '"') {
2867             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2868                          "Unfinished PubidLiteral\n", NULL, NULL);
2869         } else {
2870             ret = xmlStrndup((BASE_PTR + startPosition), len);
2871             NEXT;
2872         }
2873     } else if (CUR == '\'') {
2874         NEXT;
2875
2876         if (CUR_PTR < BASE_PTR)
2877             return(ret);
2878         startPosition = CUR_PTR - BASE_PTR;
2879
2880         while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
2881             len++;
2882             NEXT;
2883         }
2884
2885         if (CUR != '\'') {
2886             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2887                          "Unfinished PubidLiteral\n", NULL, NULL);
2888         } else {
2889             ret = xmlStrndup((BASE_PTR + startPosition), len);
2890             NEXT;
2891         }
2892     } else {
2893         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2894                      "PubidLiteral \" or ' expected\n", NULL, NULL);
2895     }
2896
2897     return(ret);
2898 }
2899
2900 /**
2901  * htmlParseScript:
2902  * @ctxt:  an HTML parser context
2903  *
2904  * parse the content of an HTML SCRIPT or STYLE element
2905  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2906  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2907  * http://www.w3.org/TR/html4/types.html#type-script
2908  * http://www.w3.org/TR/html4/types.html#h-6.15
2909  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2910  *
2911  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2912  * element and the value of intrinsic event attributes. User agents must
2913  * not evaluate script data as HTML markup but instead must pass it on as
2914  * data to a script engine.
2915  * NOTES:
2916  * - The content is passed like CDATA
2917  * - the attributes for style and scripting "onXXX" are also described
2918  *   as CDATA but SGML allows entities references in attributes so their
2919  *   processing is identical as other attributes
2920  */
2921 static void
2922 htmlParseScript(htmlParserCtxtPtr ctxt) {
2923     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2924     int nbchar = 0;
2925     int cur,l;
2926
2927     SHRINK;
2928     cur = CUR_CHAR(l);
2929     while (IS_CHAR_CH(cur)) {
2930         if ((cur == '<') && (NXT(1) == '/')) {
2931             /*
2932              * One should break here, the specification is clear:
2933              * Authors should therefore escape "</" within the content.
2934              * Escape mechanisms are specific to each scripting or
2935              * style sheet language.
2936              *
2937              * In recovery mode, only break if end tag match the
2938              * current tag, effectively ignoring all tags inside the
2939              * script/style block and treating the entire block as
2940              * CDATA.
2941              */
2942             if (ctxt->recovery) {
2943                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2944                                    xmlStrlen(ctxt->name)) == 0)
2945                 {
2946                     break; /* while */
2947                 } else {
2948                     htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2949                                  "Element %s embeds close tag\n",
2950                                  ctxt->name, NULL);
2951                 }
2952             } else {
2953                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2954                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2955                 {
2956                     break; /* while */
2957                 }
2958             }
2959         }
2960         COPY_BUF(l,buf,nbchar,cur);
2961         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2962             if (ctxt->sax->cdataBlock!= NULL) {
2963                 /*
2964                  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2965                  */
2966                 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2967             } else if (ctxt->sax->characters != NULL) {
2968                 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2969             }
2970             nbchar = 0;
2971         }
2972         GROW;
2973         NEXTL(l);
2974         cur = CUR_CHAR(l);
2975     }
2976
2977     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2978         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2979                     "Invalid char in CDATA 0x%X\n", cur);
2980         if (ctxt->input->cur < ctxt->input->end) {
2981             NEXT;
2982         }
2983     }
2984
2985     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2986         if (ctxt->sax->cdataBlock!= NULL) {
2987             /*
2988              * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2989              */
2990             ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2991         } else if (ctxt->sax->characters != NULL) {
2992             ctxt->sax->characters(ctxt->userData, buf, nbchar);
2993         }
2994     }
2995 }
2996
2997
2998 /**
2999  * htmlParseCharDataInternal:
3000  * @ctxt:  an HTML parser context
3001  * @readahead: optional read ahead character in ascii range
3002  *
3003  * parse a CharData section.
3004  * if we are within a CDATA section ']]>' marks an end of section.
3005  *
3006  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3007  */
3008
3009 static void
3010 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3011     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3012     int nbchar = 0;
3013     int cur, l;
3014     int chunk = 0;
3015
3016     if (readahead)
3017         buf[nbchar++] = readahead;
3018
3019     SHRINK;
3020     cur = CUR_CHAR(l);
3021     while (((cur != '<') || (ctxt->token == '<')) &&
3022            ((cur != '&') || (ctxt->token == '&')) &&
3023            (cur != 0)) {
3024         if (!(IS_CHAR(cur))) {
3025             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3026                         "Invalid char in CDATA 0x%X\n", cur);
3027         } else {
3028             COPY_BUF(l,buf,nbchar,cur);
3029         }
3030         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3031             /*
3032              * Ok the segment is to be consumed as chars.
3033              */
3034             if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3035                 if (areBlanks(ctxt, buf, nbchar)) {
3036                     if (ctxt->keepBlanks) {
3037                         if (ctxt->sax->characters != NULL)
3038                             ctxt->sax->characters(ctxt->userData, buf, nbchar);
3039                     } else {
3040                         if (ctxt->sax->ignorableWhitespace != NULL)
3041                             ctxt->sax->ignorableWhitespace(ctxt->userData,
3042                                                            buf, nbchar);
3043                     }
3044                 } else {
3045                     htmlCheckParagraph(ctxt);
3046                     if (ctxt->sax->characters != NULL)
3047                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
3048                 }
3049             }
3050             nbchar = 0;
3051         }
3052         NEXTL(l);
3053         chunk++;
3054         if (chunk > HTML_PARSER_BUFFER_SIZE) {
3055             chunk = 0;
3056             SHRINK;
3057             GROW;
3058         }
3059         cur = CUR_CHAR(l);
3060         if (cur == 0) {
3061             SHRINK;
3062             GROW;
3063             cur = CUR_CHAR(l);
3064         }
3065     }
3066     if (nbchar != 0) {
3067         buf[nbchar] = 0;
3068
3069         /*
3070          * Ok the segment is to be consumed as chars.
3071          */
3072         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3073             if (areBlanks(ctxt, buf, nbchar)) {
3074                 if (ctxt->keepBlanks) {
3075                     if (ctxt->sax->characters != NULL)
3076                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
3077                 } else {
3078                     if (ctxt->sax->ignorableWhitespace != NULL)
3079                         ctxt->sax->ignorableWhitespace(ctxt->userData,
3080                                                        buf, nbchar);
3081                 }
3082             } else {
3083                 htmlCheckParagraph(ctxt);
3084                 if (ctxt->sax->characters != NULL)
3085                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
3086             }
3087         }
3088     } else {
3089         /*
3090          * Loop detection
3091          */
3092         if (cur == 0)
3093             ctxt->instate = XML_PARSER_EOF;
3094     }
3095 }
3096
3097 /**
3098  * htmlParseCharData:
3099  * @ctxt:  an HTML parser context
3100  *
3101  * parse a CharData section.
3102  * if we are within a CDATA section ']]>' marks an end of section.
3103  *
3104  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3105  */
3106
3107 static void
3108 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3109     htmlParseCharDataInternal(ctxt, 0);
3110 }
3111
3112 /**
3113  * htmlParseExternalID:
3114  * @ctxt:  an HTML parser context
3115  * @publicID:  a xmlChar** receiving PubidLiteral
3116  *
3117  * Parse an External ID or a Public ID
3118  *
3119  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3120  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3121  *
3122  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3123  *
3124  * Returns the function returns SystemLiteral and in the second
3125  *                case publicID receives PubidLiteral, is strict is off
3126  *                it is possible to return NULL and have publicID set.
3127  */
3128
3129 static xmlChar *
3130 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3131     xmlChar *URI = NULL;
3132
3133     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3134          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3135          (UPP(4) == 'E') && (UPP(5) == 'M')) {
3136         SKIP(6);
3137         if (!IS_BLANK_CH(CUR)) {
3138             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3139                          "Space required after 'SYSTEM'\n", NULL, NULL);
3140         }
3141         SKIP_BLANKS;
3142         URI = htmlParseSystemLiteral(ctxt);
3143         if (URI == NULL) {
3144             htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3145                          "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3146         }
3147     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3148                (UPP(2) == 'B') && (UPP(3) == 'L') &&
3149                (UPP(4) == 'I') && (UPP(5) == 'C')) {
3150         SKIP(6);
3151         if (!IS_BLANK_CH(CUR)) {
3152             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3153                          "Space required after 'PUBLIC'\n", NULL, NULL);
3154         }
3155         SKIP_BLANKS;
3156         *publicID = htmlParsePubidLiteral(ctxt);
3157         if (*publicID == NULL) {
3158             htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3159                          "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3160                          NULL, NULL);
3161         }
3162         SKIP_BLANKS;
3163         if ((CUR == '"') || (CUR == '\'')) {
3164             URI = htmlParseSystemLiteral(ctxt);
3165         }
3166     }
3167     return(URI);
3168 }
3169
3170 /**
3171  * xmlParsePI:
3172  * @ctxt:  an XML parser context
3173  *
3174  * parse an XML Processing Instruction.
3175  *
3176  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3177  */
3178 static void
3179 htmlParsePI(htmlParserCtxtPtr ctxt) {
3180     xmlChar *buf = NULL;
3181     int len = 0;
3182     int size = HTML_PARSER_BUFFER_SIZE;
3183     int cur, l;
3184     const xmlChar *target;
3185     xmlParserInputState state;
3186     int count = 0;
3187
3188     if ((RAW == '<') && (NXT(1) == '?')) {
3189         state = ctxt->instate;
3190         ctxt->instate = XML_PARSER_PI;
3191         /*
3192          * this is a Processing Instruction.
3193          */
3194         SKIP(2);
3195         SHRINK;
3196
3197         /*
3198          * Parse the target name and check for special support like
3199          * namespace.
3200          */
3201         target = htmlParseName(ctxt);
3202         if (target != NULL) {
3203             if (RAW == '>') {
3204                 SKIP(1);
3205
3206                 /*
3207                  * SAX: PI detected.
3208                  */
3209                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3210                     (ctxt->sax->processingInstruction != NULL))
3211                     ctxt->sax->processingInstruction(ctxt->userData,
3212                                                      target, NULL);
3213                 ctxt->instate = state;
3214                 return;
3215             }
3216             buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3217             if (buf == NULL) {
3218                 htmlErrMemory(ctxt, NULL);
3219                 ctxt->instate = state;
3220                 return;
3221             }
3222             cur = CUR;
3223             if (!IS_BLANK(cur)) {
3224                 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3225                           "ParsePI: PI %s space expected\n", target, NULL);
3226             }
3227             SKIP_BLANKS;
3228             cur = CUR_CHAR(l);
3229             while (IS_CHAR(cur) && (cur != '>')) {
3230                 if (len + 5 >= size) {
3231                     xmlChar *tmp;
3232
3233                     size *= 2;
3234                     tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3235                     if (tmp == NULL) {
3236                         htmlErrMemory(ctxt, NULL);
3237                         xmlFree(buf);
3238                         ctxt->instate = state;
3239                         return;
3240                     }
3241                     buf = tmp;
3242                 }
3243                 count++;
3244                 if (count > 50) {
3245                     GROW;
3246                     count = 0;
3247                 }
3248                 COPY_BUF(l,buf,len,cur);
3249                 NEXTL(l);
3250                 cur = CUR_CHAR(l);
3251                 if (cur == 0) {
3252                     SHRINK;
3253                     GROW;
3254                     cur = CUR_CHAR(l);
3255                 }
3256             }
3257             buf[len] = 0;
3258             if (cur != '>') {
3259                 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3260                       "ParsePI: PI %s never end ...\n", target, NULL);
3261             } else {
3262                 SKIP(1);
3263
3264                 /*
3265                  * SAX: PI detected.
3266                  */
3267                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3268                     (ctxt->sax->processingInstruction != NULL))
3269                     ctxt->sax->processingInstruction(ctxt->userData,
3270                                                      target, buf);
3271             }
3272             xmlFree(buf);
3273         } else {
3274             htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3275                          "PI is not started correctly", NULL, NULL);
3276         }
3277         ctxt->instate = state;
3278     }
3279 }
3280
3281 /**
3282  * htmlParseComment:
3283  * @ctxt:  an HTML parser context
3284  *
3285  * Parse an XML (SGML) comment <!-- .... -->
3286  *
3287  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3288  */
3289 static void
3290 htmlParseComment(htmlParserCtxtPtr ctxt) {
3291     xmlChar *buf = NULL;
3292     int len;
3293     int size = HTML_PARSER_BUFFER_SIZE;
3294     int q, ql;
3295     int r, rl;
3296     int cur, l;
3297     xmlParserInputState state;
3298
3299     /*
3300      * Check that there is a comment right here.
3301      */
3302     if ((RAW != '<') || (NXT(1) != '!') ||
3303         (NXT(2) != '-') || (NXT(3) != '-')) return;
3304
3305     state = ctxt->instate;
3306     ctxt->instate = XML_PARSER_COMMENT;
3307     SHRINK;
3308     SKIP(4);
3309     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3310     if (buf == NULL) {
3311         htmlErrMemory(ctxt, "buffer allocation failed\n");
3312         ctxt->instate = state;
3313         return;
3314     }
3315     len = 0;
3316     buf[len] = 0;
3317     q = CUR_CHAR(ql);
3318     if (!IS_CHAR(q))
3319         goto unfinished;
3320     NEXTL(ql);
3321     r = CUR_CHAR(rl);
3322     if (!IS_CHAR(r))
3323         goto unfinished;
3324     NEXTL(rl);
3325     cur = CUR_CHAR(l);
3326     while (IS_CHAR(cur) &&
3327            ((cur != '>') ||
3328             (r != '-') || (q != '-'))) {
3329         if (len + 5 >= size) {
3330             xmlChar *tmp;
3331
3332             size *= 2;
3333             tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3334             if (tmp == NULL) {
3335                 xmlFree(buf);
3336                 htmlErrMemory(ctxt, "growing buffer failed\n");
3337                 ctxt->instate = state;
3338                 return;
3339             }
3340             buf = tmp;
3341         }
3342         COPY_BUF(ql,buf,len,q);
3343         q = r;
3344         ql = rl;
3345         r = cur;
3346         rl = l;
3347         NEXTL(l);
3348         cur = CUR_CHAR(l);
3349         if (cur == 0) {
3350             SHRINK;
3351             GROW;
3352             cur = CUR_CHAR(l);
3353         }
3354     }
3355     buf[len] = 0;
3356     if (IS_CHAR(cur)) {
3357         NEXT;
3358         if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3359             (!ctxt->disableSAX))
3360             ctxt->sax->comment(ctxt->userData, buf);
3361         xmlFree(buf);
3362         ctxt->instate = state;
3363         return;
3364     }
3365
3366 unfinished:
3367     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3368                  "Comment not terminated \n<!--%.50s\n", buf, NULL);
3369     xmlFree(buf);
3370 }
3371
3372 /**
3373  * htmlParseCharRef:
3374  * @ctxt:  an HTML parser context
3375  *
3376  * parse Reference declarations
3377  *
3378  * [66] CharRef ::= '&#' [0-9]+ ';' |
3379  *                  '&#x' [0-9a-fA-F]+ ';'
3380  *
3381  * Returns the value parsed (as an int)
3382  */
3383 int
3384 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3385     int val = 0;
3386
3387     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3388         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3389                      "htmlParseCharRef: context error\n",
3390                      NULL, NULL);
3391         return(0);
3392     }
3393     if ((CUR == '&') && (NXT(1) == '#') &&
3394         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3395         SKIP(3);
3396         while (CUR != ';') {
3397             if ((CUR >= '0') && (CUR <= '9'))
3398                 val = val * 16 + (CUR - '0');
3399             else if ((CUR >= 'a') && (CUR <= 'f'))
3400                 val = val * 16 + (CUR - 'a') + 10;
3401             else if ((CUR >= 'A') && (CUR <= 'F'))
3402                 val = val * 16 + (CUR - 'A') + 10;
3403             else {
3404                 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3405                              "htmlParseCharRef: missing semicolon\n",
3406                              NULL, NULL);
3407                 break;
3408             }
3409             NEXT;
3410         }
3411         if (CUR == ';')
3412             NEXT;
3413     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3414         SKIP(2);
3415         while (CUR != ';') {
3416             if ((CUR >= '0') && (CUR <= '9'))
3417                 val = val * 10 + (CUR - '0');
3418             else {
3419                 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3420                              "htmlParseCharRef: missing semicolon\n",
3421                              NULL, NULL);
3422                 break;
3423             }
3424             NEXT;
3425         }
3426         if (CUR == ';')
3427             NEXT;
3428     } else {
3429         htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3430                      "htmlParseCharRef: invalid value\n", NULL, NULL);
3431     }
3432     /*
3433      * Check the value IS_CHAR ...
3434      */
3435     if (IS_CHAR(val)) {
3436         return(val);
3437     } else {
3438         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3439                         "htmlParseCharRef: invalid xmlChar value %d\n",
3440                         val);
3441     }
3442     return(0);
3443 }
3444
3445
3446 /**
3447  * htmlParseDocTypeDecl:
3448  * @ctxt:  an HTML parser context
3449  *
3450  * parse a DOCTYPE declaration
3451  *
3452  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3453  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3454  */
3455
3456 static void
3457 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3458     const xmlChar *name;
3459     xmlChar *ExternalID = NULL;
3460     xmlChar *URI = NULL;
3461
3462     /*
3463      * We know that '<!DOCTYPE' has been detected.
3464      */
3465     SKIP(9);
3466
3467     SKIP_BLANKS;
3468
3469     /*
3470      * Parse the DOCTYPE name.
3471      */
3472     name = htmlParseName(ctxt);
3473     if (name == NULL) {
3474         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3475                      "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3476                      NULL, NULL);
3477     }
3478     /*
3479      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3480      */
3481
3482     SKIP_BLANKS;
3483
3484     /*
3485      * Check for SystemID and ExternalID
3486      */
3487     URI = htmlParseExternalID(ctxt, &ExternalID);
3488     SKIP_BLANKS;
3489
3490     /*
3491      * We should be at the end of the DOCTYPE declaration.
3492      */
3493     if (CUR != '>') {
3494         htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3495                      "DOCTYPE improperly terminated\n", NULL, NULL);
3496         /* We shouldn't try to resynchronize ... */
3497     }
3498     NEXT;
3499
3500     /*
3501      * Create or update the document accordingly to the DOCTYPE
3502      */
3503     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3504         (!ctxt->disableSAX))
3505         ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3506
3507     /*
3508      * Cleanup, since we don't use all those identifiers
3509      */
3510     if (URI != NULL) xmlFree(URI);
3511     if (ExternalID != NULL) xmlFree(ExternalID);
3512 }
3513
3514 /**
3515  * htmlParseAttribute:
3516  * @ctxt:  an HTML parser context
3517  * @value:  a xmlChar ** used to store the value of the attribute
3518  *
3519  * parse an attribute
3520  *
3521  * [41] Attribute ::= Name Eq AttValue
3522  *
3523  * [25] Eq ::= S? '=' S?
3524  *
3525  * With namespace:
3526  *
3527  * [NS 11] Attribute ::= QName Eq AttValue
3528  *
3529  * Also the case QName == xmlns:??? is handled independently as a namespace
3530  * definition.
3531  *
3532  * Returns the attribute name, and the value in *value.
3533  */
3534
3535 static const xmlChar *
3536 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3537     const xmlChar *name;
3538     xmlChar *val = NULL;
3539
3540     *value = NULL;
3541     name = htmlParseHTMLName(ctxt);
3542     if (name == NULL) {
3543         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3544                      "error parsing attribute name\n", NULL, NULL);
3545         return(NULL);
3546     }
3547
3548     /*
3549      * read the value
3550      */
3551     SKIP_BLANKS;
3552     if (CUR == '=') {
3553         NEXT;
3554         SKIP_BLANKS;
3555         val = htmlParseAttValue(ctxt);
3556     }
3557
3558     *value = val;
3559     return(name);
3560 }
3561
3562 /**
3563  * htmlCheckEncodingDirect:
3564  * @ctxt:  an HTML parser context
3565  * @attvalue: the attribute value
3566  *
3567  * Checks an attribute value to detect
3568  * the encoding
3569  * If a new encoding is detected the parser is switched to decode
3570  * it and pass UTF8
3571  */
3572 static void
3573 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3574
3575     if ((ctxt == NULL) || (encoding == NULL) ||
3576         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3577         return;
3578
3579     /* do not change encoding */
3580     if (ctxt->input->encoding != NULL)
3581         return;
3582
3583     if (encoding != NULL) {
3584         xmlCharEncoding enc;
3585         xmlCharEncodingHandlerPtr handler;
3586
3587         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3588
3589         if (ctxt->input->encoding != NULL)
3590             xmlFree((xmlChar *) ctxt->input->encoding);
3591         ctxt->input->encoding = xmlStrdup(encoding);
3592
3593         enc = xmlParseCharEncoding((const char *) encoding);
3594         /*
3595          * registered set of known encodings
3596          */
3597         if (enc != XML_CHAR_ENCODING_ERROR) {
3598             if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3599                  (enc == XML_CHAR_ENCODING_UTF16BE) ||
3600                  (enc == XML_CHAR_ENCODING_UCS4LE) ||
3601                  (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3602                 (ctxt->input->buf != NULL) &&
3603                 (ctxt->input->buf->encoder == NULL)) {
3604                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3605                              "htmlCheckEncoding: wrong encoding meta\n",
3606                              NULL, NULL);
3607             } else {
3608                 xmlSwitchEncoding(ctxt, enc);
3609             }
3610             ctxt->charset = XML_CHAR_ENCODING_UTF8;
3611         } else {
3612             /*
3613              * fallback for unknown encodings
3614              */
3615             handler = xmlFindCharEncodingHandler((const char *) encoding);
3616             if (handler != NULL) {
3617                 xmlSwitchToEncoding(ctxt, handler);
3618                 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3619             } else {
3620                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3621                              "htmlCheckEncoding: unknown encoding %s\n",
3622                              encoding, NULL);
3623             }
3624         }
3625
3626         if ((ctxt->input->buf != NULL) &&
3627             (ctxt->input->buf->encoder != NULL) &&
3628             (ctxt->input->buf->raw != NULL) &&
3629             (ctxt->input->buf->buffer != NULL)) {
3630             int nbchars;
3631             int processed;
3632
3633             /*
3634              * convert as much as possible to the parser reading buffer.
3635              */
3636             processed = ctxt->input->cur - ctxt->input->base;
3637             xmlBufShrink(ctxt->input->buf->buffer, processed);
3638             nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3639             if (nbchars < 0) {
3640                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3641                              "htmlCheckEncoding: encoder error\n",
3642                              NULL, NULL);
3643             }
3644             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3645         }
3646     }
3647 }
3648
3649 /**
3650  * htmlCheckEncoding:
3651  * @ctxt:  an HTML parser context
3652  * @attvalue: the attribute value
3653  *
3654  * Checks an http-equiv attribute from a Meta tag to detect
3655  * the encoding
3656  * If a new encoding is detected the parser is switched to decode
3657  * it and pass UTF8
3658  */
3659 static void
3660 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3661     const xmlChar *encoding;
3662
3663     if (!attvalue)
3664         return;
3665
3666     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3667     if (encoding != NULL) {
3668         encoding += 7;
3669     }
3670     /*
3671      * skip blank
3672      */
3673     if (encoding && IS_BLANK_CH(*encoding))
3674         encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3675     if (encoding && *encoding == '=') {
3676         encoding ++;
3677         htmlCheckEncodingDirect(ctxt, encoding);
3678     }
3679 }
3680
3681 /**
3682  * htmlCheckMeta:
3683  * @ctxt:  an HTML parser context
3684  * @atts:  the attributes values
3685  *
3686  * Checks an attributes from a Meta tag
3687  */
3688 static void
3689 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3690     int i;
3691     const xmlChar *att, *value;
3692     int http = 0;
3693     const xmlChar *content = NULL;
3694
3695     if ((ctxt == NULL) || (atts == NULL))
3696         return;
3697
3698     i = 0;
3699     att = atts[i++];
3700     while (att != NULL) {
3701         value = atts[i++];
3702         if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3703          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3704             http = 1;
3705         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3706             htmlCheckEncodingDirect(ctxt, value);
3707         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3708             content = value;
3709         att = atts[i++];
3710     }
3711     if ((http) && (content != NULL))
3712         htmlCheckEncoding(ctxt, content);
3713
3714 }
3715
3716 /**
3717  * htmlParseStartTag:
3718  * @ctxt:  an HTML parser context
3719  *
3720  * parse a start of tag either for rule element or
3721  * EmptyElement. In both case we don't parse the tag closing chars.
3722  *
3723  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3724  *
3725  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3726  *
3727  * With namespace:
3728  *
3729  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3730  *
3731  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3732  *
3733  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3734  */
3735
3736 static int
3737 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3738     const xmlChar *name;
3739     const xmlChar *attname;
3740     xmlChar *attvalue;
3741     const xmlChar **atts;
3742     int nbatts = 0;
3743     int maxatts;
3744     int meta = 0;
3745     int i;
3746     int discardtag = 0;
3747
3748     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3749         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3750                      "htmlParseStartTag: context error\n", NULL, NULL);
3751         return -1;
3752     }
3753     if (ctxt->instate == XML_PARSER_EOF)
3754         return(-1);
3755     if (CUR != '<') return -1;
3756     NEXT;
3757
3758     atts = ctxt->atts;
3759     maxatts = ctxt->maxatts;
3760
3761     GROW;
3762     name = htmlParseHTMLName(ctxt);
3763     if (name == NULL) {
3764         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3765                      "htmlParseStartTag: invalid element name\n",
3766                      NULL, NULL);
3767         /* if recover preserve text on classic misconstructs */
3768         if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3769             (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3770             htmlParseCharDataInternal(ctxt, '<');
3771             return(-1);
3772         }
3773
3774
3775         /* Dump the bogus tag like browsers do */
3776         while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3777                (ctxt->instate != XML_PARSER_EOF))
3778             NEXT;
3779         return -1;
3780     }
3781     if (xmlStrEqual(name, BAD_CAST"meta"))
3782         meta = 1;
3783
3784     /*
3785      * Check for auto-closure of HTML elements.
3786      */
3787     htmlAutoClose(ctxt, name);
3788
3789     /*
3790      * Check for implied HTML elements.
3791      */
3792     htmlCheckImplied(ctxt, name);
3793
3794     /*
3795      * Avoid html at any level > 0, head at any level != 1
3796      * or any attempt to recurse body
3797      */
3798     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3799         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3800                      "htmlParseStartTag: misplaced <html> tag\n",
3801                      name, NULL);
3802         discardtag = 1;
3803         ctxt->depth++;
3804     }
3805     if ((ctxt->nameNr != 1) &&
3806         (xmlStrEqual(name, BAD_CAST"head"))) {
3807         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3808                      "htmlParseStartTag: misplaced <head> tag\n",
3809                      name, NULL);
3810         discardtag = 1;
3811         ctxt->depth++;
3812     }
3813     if (xmlStrEqual(name, BAD_CAST"body")) {
3814         int indx;
3815         for (indx = 0;indx < ctxt->nameNr;indx++) {
3816             if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3817                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3818                              "htmlParseStartTag: misplaced <body> tag\n",
3819                              name, NULL);
3820                 discardtag = 1;
3821                 ctxt->depth++;
3822             }
3823         }
3824     }
3825
3826     /*
3827      * Now parse the attributes, it ends up with the ending
3828      *
3829      * (S Attribute)* S?
3830      */
3831     SKIP_BLANKS;
3832     while ((IS_CHAR_CH(CUR)) &&
3833            (CUR != '>') &&
3834            ((CUR != '/') || (NXT(1) != '>'))) {
3835         long cons = ctxt->nbChars;
3836
3837         GROW;
3838         attname = htmlParseAttribute(ctxt, &attvalue);
3839         if (attname != NULL) {
3840
3841             /*
3842              * Well formedness requires at most one declaration of an attribute
3843              */
3844             for (i = 0; i < nbatts;i += 2) {
3845                 if (xmlStrEqual(atts[i], attname)) {
3846                     htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3847                                  "Attribute %s redefined\n", attname, NULL);
3848                     if (attvalue != NULL)
3849                         xmlFree(attvalue);
3850                     goto failed;
3851                 }
3852             }
3853
3854             /*
3855              * Add the pair to atts
3856              */
3857             if (atts == NULL) {
3858                 maxatts = 22; /* allow for 10 attrs by default */
3859                 atts = (const xmlChar **)
3860                        xmlMalloc(maxatts * sizeof(xmlChar *));
3861                 if (atts == NULL) {
3862                     htmlErrMemory(ctxt, NULL);
3863                     if (attvalue != NULL)
3864                         xmlFree(attvalue);
3865                     goto failed;
3866                 }
3867                 ctxt->atts = atts;
3868                 ctxt->maxatts = maxatts;
3869             } else if (nbatts + 4 > maxatts) {
3870                 const xmlChar **n;
3871
3872                 maxatts *= 2;
3873                 n = (const xmlChar **) xmlRealloc((void *) atts,
3874                                              maxatts * sizeof(const xmlChar *));
3875                 if (n == NULL) {
3876                     htmlErrMemory(ctxt, NULL);
3877                     if (attvalue != NULL)
3878                         xmlFree(attvalue);
3879                     goto failed;
3880                 }
3881                 atts = n;
3882                 ctxt->atts = atts;
3883                 ctxt->maxatts = maxatts;
3884             }
3885             atts[nbatts++] = attname;
3886             atts[nbatts++] = attvalue;
3887             atts[nbatts] = NULL;
3888             atts[nbatts + 1] = NULL;
3889         }
3890         else {
3891             if (attvalue != NULL)
3892                 xmlFree(attvalue);
3893             /* Dump the bogus attribute string up to the next blank or
3894              * the end of the tag. */
3895             while ((IS_CHAR_CH(CUR)) &&
3896                    !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3897                    ((CUR != '/') || (NXT(1) != '>')))
3898                 NEXT;
3899         }
3900
3901 failed:
3902         SKIP_BLANKS;
3903         if (cons == ctxt->nbChars) {
3904             htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3905                          "htmlParseStartTag: problem parsing attributes\n",
3906                          NULL, NULL);
3907             break;
3908         }
3909     }
3910
3911     /*
3912      * Handle specific association to the META tag
3913      */
3914     if (meta && (nbatts != 0))
3915         htmlCheckMeta(ctxt, atts);
3916
3917     /*
3918      * SAX: Start of Element !
3919      */
3920     if (!discardtag) {
3921         htmlnamePush(ctxt, name);
3922         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3923             if (nbatts != 0)
3924                 ctxt->sax->startElement(ctxt->userData, name, atts);
3925             else
3926                 ctxt->sax->startElement(ctxt->userData, name, NULL);
3927         }
3928     }
3929
3930     if (atts != NULL) {
3931         for (i = 1;i < nbatts;i += 2) {
3932             if (atts[i] != NULL)
3933                 xmlFree((xmlChar *) atts[i]);
3934         }
3935     }
3936
3937     return(discardtag);
3938 }
3939
3940 /**
3941  * htmlParseEndTag:
3942  * @ctxt:  an HTML parser context
3943  *
3944  * parse an end of tag
3945  *
3946  * [42] ETag ::= '</' Name S? '>'
3947  *
3948  * With namespace
3949  *
3950  * [NS 9] ETag ::= '</' QName S? '>'
3951  *
3952  * Returns 1 if the current level should be closed.
3953  */
3954
3955 static int
3956 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3957 {
3958     const xmlChar *name;
3959     const xmlChar *oldname;
3960     int i, ret;
3961
3962     if ((CUR != '<') || (NXT(1) != '/')) {
3963         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3964                      "htmlParseEndTag: '</' not found\n", NULL, NULL);
3965         return (0);
3966     }
3967     SKIP(2);
3968
3969     name = htmlParseHTMLName(ctxt);
3970     if (name == NULL)
3971         return (0);
3972     /*
3973      * We should definitely be at the ending "S? '>'" part
3974      */
3975     SKIP_BLANKS;
3976     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3977         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3978                      "End tag : expected '>'\n", NULL, NULL);
3979         if (ctxt->recovery) {
3980             /*
3981              * We're not at the ending > !!
3982              * Error, unless in recover mode where we search forwards
3983              * until we find a >
3984              */
3985             while (CUR != '\0' && CUR != '>') NEXT;
3986             NEXT;
3987         }
3988     } else
3989         NEXT;
3990
3991     /*
3992      * if we ignored misplaced tags in htmlParseStartTag don't pop them
3993      * out now.
3994      */
3995     if ((ctxt->depth > 0) &&
3996         (xmlStrEqual(name, BAD_CAST "html") ||
3997          xmlStrEqual(name, BAD_CAST "body") ||
3998          xmlStrEqual(name, BAD_CAST "head"))) {
3999         ctxt->depth--;
4000         return (0);
4001     }
4002
4003     /*
4004      * If the name read is not one of the element in the parsing stack
4005      * then return, it's just an error.
4006      */
4007     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4008         if (xmlStrEqual(name, ctxt->nameTab[i]))
4009             break;
4010     }
4011     if (i < 0) {
4012         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4013                      "Unexpected end tag : %s\n", name, NULL);
4014         return (0);
4015     }
4016
4017
4018     /*
4019      * Check for auto-closure of HTML elements.
4020      */
4021
4022     htmlAutoCloseOnClose(ctxt, name);
4023
4024     /*
4025      * Well formedness constraints, opening and closing must match.
4026      * With the exception that the autoclose may have popped stuff out
4027      * of the stack.
4028      */
4029     if (!xmlStrEqual(name, ctxt->name)) {
4030         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4031             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4032                          "Opening and ending tag mismatch: %s and %s\n",
4033                          name, ctxt->name);
4034         }
4035     }
4036
4037     /*
4038      * SAX: End of Tag
4039      */
4040     oldname = ctxt->name;
4041     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4042         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4043             ctxt->sax->endElement(ctxt->userData, name);
4044         htmlNodeInfoPop(ctxt);
4045         htmlnamePop(ctxt);
4046         ret = 1;
4047     } else {
4048         ret = 0;
4049     }
4050
4051     return (ret);
4052 }
4053
4054
4055 /**
4056  * htmlParseReference:
4057  * @ctxt:  an HTML parser context
4058  *
4059  * parse and handle entity references in content,
4060  * this will end-up in a call to character() since this is either a
4061  * CharRef, or a predefined entity.
4062  */
4063 static void
4064 htmlParseReference(htmlParserCtxtPtr ctxt) {
4065     const htmlEntityDesc * ent;
4066     xmlChar out[6];
4067     const xmlChar *name;
4068     if (CUR != '&') return;
4069
4070     if (NXT(1) == '#') {
4071         unsigned int c;
4072         int bits, i = 0;
4073
4074         c = htmlParseCharRef(ctxt);
4075         if (c == 0)
4076             return;
4077
4078         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4079         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4080         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4081         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4082
4083         for ( ; bits >= 0; bits-= 6) {
4084             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4085         }
4086         out[i] = 0;
4087
4088         htmlCheckParagraph(ctxt);
4089         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4090             ctxt->sax->characters(ctxt->userData, out, i);
4091     } else {
4092         ent = htmlParseEntityRef(ctxt, &name);
4093         if (name == NULL) {
4094             htmlCheckParagraph(ctxt);
4095             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4096                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4097             return;
4098         }
4099         if ((ent == NULL) || !(ent->value > 0)) {
4100             htmlCheckParagraph(ctxt);
4101             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4102                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4103                 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4104                 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4105             }
4106         } else {
4107             unsigned int c;
4108             int bits, i = 0;
4109
4110             c = ent->value;
4111             if      (c <    0x80)
4112                     { out[i++]= c;                bits= -6; }
4113             else if (c <   0x800)
4114                     { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4115             else if (c < 0x10000)
4116                     { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4117             else
4118                     { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4119
4120             for ( ; bits >= 0; bits-= 6) {
4121                 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4122             }
4123             out[i] = 0;
4124
4125             htmlCheckParagraph(ctxt);
4126             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4127                 ctxt->sax->characters(ctxt->userData, out, i);
4128         }
4129     }
4130 }
4131
4132 /**
4133  * htmlParseContent:
4134  * @ctxt:  an HTML parser context
4135  *
4136  * Parse a content: comment, sub-element, reference or text.
4137  * Kept for compatibility with old code
4138  */
4139
4140 static void
4141 htmlParseContent(htmlParserCtxtPtr ctxt) {
4142     xmlChar *currentNode;
4143     int depth;
4144     const xmlChar *name;
4145
4146     currentNode = xmlStrdup(ctxt->name);
4147     depth = ctxt->nameNr;
4148     while (1) {
4149         long cons = ctxt->nbChars;
4150
4151         GROW;
4152
4153         if (ctxt->instate == XML_PARSER_EOF)
4154             break;
4155
4156         /*
4157          * Our tag or one of it's parent or children is ending.
4158          */
4159         if ((CUR == '<') && (NXT(1) == '/')) {
4160             if (htmlParseEndTag(ctxt) &&
4161                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4162                 if (currentNode != NULL)
4163                     xmlFree(currentNode);
4164                 return;
4165             }
4166             continue; /* while */
4167         }
4168
4169         else if ((CUR == '<') &&
4170                  ((IS_ASCII_LETTER(NXT(1))) ||
4171                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4172             name = htmlParseHTMLName_nonInvasive(ctxt);
4173             if (name == NULL) {
4174                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4175                          "htmlParseStartTag: invalid element name\n",
4176                          NULL, NULL);
4177                 /* Dump the bogus tag like browsers do */
4178         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4179                     NEXT;
4180
4181                 if (currentNode != NULL)
4182                     xmlFree(currentNode);
4183                 return;
4184             }
4185
4186             if (ctxt->name != NULL) {
4187                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4188                     htmlAutoClose(ctxt, name);
4189                     continue;
4190                 }
4191             }
4192         }
4193
4194         /*
4195          * Has this node been popped out during parsing of
4196          * the next element
4197          */
4198         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4199             (!xmlStrEqual(currentNode, ctxt->name)))
4200              {
4201             if (currentNode != NULL) xmlFree(currentNode);
4202             return;
4203         }
4204
4205         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4206             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4207             /*
4208              * Handle SCRIPT/STYLE separately
4209              */
4210             htmlParseScript(ctxt);
4211         } else {
4212             /*
4213              * Sometimes DOCTYPE arrives in the middle of the document
4214              */
4215             if ((CUR == '<') && (NXT(1) == '!') &&
4216                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4217                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4218                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4219                 (UPP(8) == 'E')) {
4220                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4221                              "Misplaced DOCTYPE declaration\n",
4222                              BAD_CAST "DOCTYPE" , NULL);
4223                 htmlParseDocTypeDecl(ctxt);
4224             }
4225
4226             /*
4227              * First case :  a comment
4228              */
4229             if ((CUR == '<') && (NXT(1) == '!') &&
4230                 (NXT(2) == '-') && (NXT(3) == '-')) {
4231                 htmlParseComment(ctxt);
4232             }
4233
4234             /*
4235              * Second case : a Processing Instruction.
4236              */
4237             else if ((CUR == '<') && (NXT(1) == '?')) {
4238                 htmlParsePI(ctxt);
4239             }
4240
4241             /*
4242              * Third case :  a sub-element.
4243              */
4244             else if (CUR == '<') {
4245                 htmlParseElement(ctxt);
4246             }
4247
4248             /*
4249              * Fourth case : a reference. If if has not been resolved,
4250              *    parsing returns it's Name, create the node
4251              */
4252             else if (CUR == '&') {
4253                 htmlParseReference(ctxt);
4254             }
4255
4256             /*
4257              * Fifth case : end of the resource
4258              */
4259             else if (CUR == 0) {
4260                 htmlAutoCloseOnEnd(ctxt);
4261                 break;
4262             }
4263
4264             /*
4265              * Last case, text. Note that References are handled directly.
4266              */
4267             else {
4268                 htmlParseCharData(ctxt);
4269             }
4270
4271             if (cons == ctxt->nbChars) {
4272                 if (ctxt->node != NULL) {
4273                     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4274                                  "detected an error in element content\n",
4275                                  NULL, NULL);
4276                 }
4277                 break;
4278             }
4279         }
4280         GROW;
4281     }
4282     if (currentNode != NULL) xmlFree(currentNode);
4283 }
4284
4285 /**
4286  * htmlParseElement:
4287  * @ctxt:  an HTML parser context
4288  *
4289  * parse an HTML element, this is highly recursive
4290  * this is kept for compatibility with previous code versions
4291  *
4292  * [39] element ::= EmptyElemTag | STag content ETag
4293  *
4294  * [41] Attribute ::= Name Eq AttValue
4295  */
4296
4297 void
4298 htmlParseElement(htmlParserCtxtPtr ctxt) {
4299     const xmlChar *name;
4300     xmlChar *currentNode = NULL;
4301     const htmlElemDesc * info;
4302     htmlParserNodeInfo node_info;
4303     int failed;
4304     int depth;
4305     const xmlChar *oldptr;
4306
4307     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4308         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4309                      "htmlParseElement: context error\n", NULL, NULL);
4310         return;
4311     }
4312
4313     if (ctxt->instate == XML_PARSER_EOF)
4314         return;
4315
4316     /* Capture start position */
4317     if (ctxt->record_info) {
4318         node_info.begin_pos = ctxt->input->consumed +
4319                           (CUR_PTR - ctxt->input->base);
4320         node_info.begin_line = ctxt->input->line;
4321     }
4322
4323     failed = htmlParseStartTag(ctxt);
4324     name = ctxt->name;
4325     if ((failed == -1) || (name == NULL)) {
4326         if (CUR == '>')
4327             NEXT;
4328         return;
4329     }
4330
4331     /*
4332      * Lookup the info for that element.
4333      */
4334     info = htmlTagLookup(name);
4335     if (info == NULL) {
4336         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4337                      "Tag %s invalid\n", name, NULL);
4338     }
4339
4340     /*
4341      * Check for an Empty Element labeled the XML/SGML way
4342      */
4343     if ((CUR == '/') && (NXT(1) == '>')) {
4344         SKIP(2);
4345         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4346             ctxt->sax->endElement(ctxt->userData, name);
4347         htmlnamePop(ctxt);
4348         return;
4349     }
4350
4351     if (CUR == '>') {
4352         NEXT;
4353     } else {
4354         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4355                      "Couldn't find end of Start Tag %s\n", name, NULL);
4356
4357         /*
4358          * end of parsing of this node.
4359          */
4360         if (xmlStrEqual(name, ctxt->name)) {
4361             nodePop(ctxt);
4362             htmlnamePop(ctxt);
4363         }
4364
4365         /*
4366          * Capture end position and add node
4367          */
4368         if (ctxt->record_info) {
4369            node_info.end_pos = ctxt->input->consumed +
4370                               (CUR_PTR - ctxt->input->base);
4371            node_info.end_line = ctxt->input->line;
4372            node_info.node = ctxt->node;
4373            xmlParserAddNodeInfo(ctxt, &node_info);
4374         }
4375         return;
4376     }
4377
4378     /*
4379      * Check for an Empty Element from DTD definition
4380      */
4381     if ((info != NULL) && (info->empty)) {
4382         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4383             ctxt->sax->endElement(ctxt->userData, name);
4384         htmlnamePop(ctxt);
4385         return;
4386     }
4387
4388     /*
4389      * Parse the content of the element:
4390      */
4391     currentNode = xmlStrdup(ctxt->name);
4392     depth = ctxt->nameNr;
4393     while (IS_CHAR_CH(CUR)) {
4394         oldptr = ctxt->input->cur;
4395         htmlParseContent(ctxt);
4396         if (oldptr==ctxt->input->cur) break;
4397         if (ctxt->nameNr < depth) break;
4398     }
4399
4400     /*
4401      * Capture end position and add node
4402      */
4403     if ( currentNode != NULL && ctxt->record_info ) {
4404        node_info.end_pos = ctxt->input->consumed +
4405                           (CUR_PTR - ctxt->input->base);
4406        node_info.end_line = ctxt->input->line;
4407        node_info.node = ctxt->node;
4408        xmlParserAddNodeInfo(ctxt, &node_info);
4409     }
4410     if (!IS_CHAR_CH(CUR)) {
4411         htmlAutoCloseOnEnd(ctxt);
4412     }
4413
4414     if (currentNode != NULL)
4415         xmlFree(currentNode);
4416 }
4417
4418 static void
4419 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4420     /*
4421      * Capture end position and add node
4422      */
4423     if ( ctxt->node != NULL && ctxt->record_info ) {
4424        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4425                                 (CUR_PTR - ctxt->input->base);
4426        ctxt->nodeInfo->end_line = ctxt->input->line;
4427        ctxt->nodeInfo->node = ctxt->node;
4428        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4429        htmlNodeInfoPop(ctxt);
4430     }
4431     if (!IS_CHAR_CH(CUR)) {
4432        htmlAutoCloseOnEnd(ctxt);
4433     }
4434 }
4435
4436 /**
4437  * htmlParseElementInternal:
4438  * @ctxt:  an HTML parser context
4439  *
4440  * parse an HTML element, new version, non recursive
4441  *
4442  * [39] element ::= EmptyElemTag | STag content ETag
4443  *
4444  * [41] Attribute ::= Name Eq AttValue
4445  */
4446
4447 static void
4448 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4449     const xmlChar *name;
4450     const htmlElemDesc * info;
4451     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4452     int failed;
4453
4454     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4455         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4456                      "htmlParseElementInternal: context error\n", NULL, NULL);
4457         return;
4458     }
4459
4460     if (ctxt->instate == XML_PARSER_EOF)
4461         return;
4462
4463     /* Capture start position */
4464     if (ctxt->record_info) {
4465         node_info.begin_pos = ctxt->input->consumed +
4466                           (CUR_PTR - ctxt->input->base);
4467         node_info.begin_line = ctxt->input->line;
4468     }
4469
4470     failed = htmlParseStartTag(ctxt);
4471     name = ctxt->name;
4472     if ((failed == -1) || (name == NULL)) {
4473         if (CUR == '>')
4474             NEXT;
4475         return;
4476     }
4477
4478     /*
4479      * Lookup the info for that element.
4480      */
4481     info = htmlTagLookup(name);
4482     if (info == NULL) {
4483         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4484                      "Tag %s invalid\n", name, NULL);
4485     }
4486
4487     /*
4488      * Check for an Empty Element labeled the XML/SGML way
4489      */
4490     if ((CUR == '/') && (NXT(1) == '>')) {
4491         SKIP(2);
4492         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4493             ctxt->sax->endElement(ctxt->userData, name);
4494         htmlnamePop(ctxt);
4495         return;
4496     }
4497
4498     if (CUR == '>') {
4499         NEXT;
4500     } else {
4501         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4502                      "Couldn't find end of Start Tag %s\n", name, NULL);
4503
4504         /*
4505          * end of parsing of this node.
4506          */
4507         if (xmlStrEqual(name, ctxt->name)) {
4508             nodePop(ctxt);
4509             htmlnamePop(ctxt);
4510         }
4511
4512         if (ctxt->record_info)
4513             htmlNodeInfoPush(ctxt, &node_info);
4514         htmlParserFinishElementParsing(ctxt);
4515         return;
4516     }
4517
4518     /*
4519      * Check for an Empty Element from DTD definition
4520      */
4521     if ((info != NULL) && (info->empty)) {
4522         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4523             ctxt->sax->endElement(ctxt->userData, name);
4524         htmlnamePop(ctxt);
4525         return;
4526     }
4527
4528     if (ctxt->record_info)
4529         htmlNodeInfoPush(ctxt, &node_info);
4530 }
4531
4532 /**
4533  * htmlParseContentInternal:
4534  * @ctxt:  an HTML parser context
4535  *
4536  * Parse a content: comment, sub-element, reference or text.
4537  * New version for non recursive htmlParseElementInternal
4538  */
4539
4540 static void
4541 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4542     xmlChar *currentNode;
4543     int depth;
4544     const xmlChar *name;
4545
4546     currentNode = xmlStrdup(ctxt->name);
4547     depth = ctxt->nameNr;
4548     while (1) {
4549         long cons = ctxt->nbChars;
4550
4551         GROW;
4552
4553         if (ctxt->instate == XML_PARSER_EOF)
4554             break;
4555
4556         /*
4557          * Our tag or one of it's parent or children is ending.
4558          */
4559         if ((CUR == '<') && (NXT(1) == '/')) {
4560             if (htmlParseEndTag(ctxt) &&
4561                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4562                 if (currentNode != NULL)
4563                     xmlFree(currentNode);
4564
4565                 currentNode = xmlStrdup(ctxt->name);
4566                 depth = ctxt->nameNr;
4567             }
4568             continue; /* while */
4569         }
4570
4571         else if ((CUR == '<') &&
4572                  ((IS_ASCII_LETTER(NXT(1))) ||
4573                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4574             name = htmlParseHTMLName_nonInvasive(ctxt);
4575             if (name == NULL) {
4576                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4577                          "htmlParseStartTag: invalid element name\n",
4578                          NULL, NULL);
4579                 /* Dump the bogus tag like browsers do */
4580                 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4581                     NEXT;
4582
4583                 htmlParserFinishElementParsing(ctxt);
4584                 if (currentNode != NULL)
4585                     xmlFree(currentNode);
4586
4587                 currentNode = xmlStrdup(ctxt->name);
4588                 depth = ctxt->nameNr;
4589                 continue;
4590             }
4591
4592             if (ctxt->name != NULL) {
4593                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4594                     htmlAutoClose(ctxt, name);
4595                     continue;
4596                 }
4597             }
4598         }
4599
4600         /*
4601          * Has this node been popped out during parsing of
4602          * the next element
4603          */
4604         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4605             (!xmlStrEqual(currentNode, ctxt->name)))
4606              {
4607             htmlParserFinishElementParsing(ctxt);
4608             if (currentNode != NULL) xmlFree(currentNode);
4609
4610             currentNode = xmlStrdup(ctxt->name);
4611             depth = ctxt->nameNr;
4612             continue;
4613         }
4614
4615         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4616             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4617             /*
4618              * Handle SCRIPT/STYLE separately
4619              */
4620             htmlParseScript(ctxt);
4621         } else {
4622             /*
4623              * Sometimes DOCTYPE arrives in the middle of the document
4624              */
4625             if ((CUR == '<') && (NXT(1) == '!') &&
4626                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4627                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4628                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4629                 (UPP(8) == 'E')) {
4630                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4631                              "Misplaced DOCTYPE declaration\n",
4632                              BAD_CAST "DOCTYPE" , NULL);
4633                 htmlParseDocTypeDecl(ctxt);
4634             }
4635
4636             /*
4637              * First case :  a comment
4638              */
4639             if ((CUR == '<') && (NXT(1) == '!') &&
4640                 (NXT(2) == '-') && (NXT(3) == '-')) {
4641                 htmlParseComment(ctxt);
4642             }
4643
4644             /*
4645              * Second case : a Processing Instruction.
4646              */
4647             else if ((CUR == '<') && (NXT(1) == '?')) {
4648                 htmlParsePI(ctxt);
4649             }
4650
4651             /*
4652              * Third case :  a sub-element.
4653              */
4654             else if (CUR == '<') {
4655                 htmlParseElementInternal(ctxt);
4656                 if (currentNode != NULL) xmlFree(currentNode);
4657
4658                 currentNode = xmlStrdup(ctxt->name);
4659                 depth = ctxt->nameNr;
4660             }
4661
4662             /*
4663              * Fourth case : a reference. If if has not been resolved,
4664              *    parsing returns it's Name, create the node
4665              */
4666             else if (CUR == '&') {
4667                 htmlParseReference(ctxt);
4668             }
4669
4670             /*
4671              * Fifth case : end of the resource
4672              */
4673             else if (CUR == 0) {
4674                 htmlAutoCloseOnEnd(ctxt);
4675                 break;
4676             }
4677
4678             /*
4679              * Last case, text. Note that References are handled directly.
4680              */
4681             else {
4682                 htmlParseCharData(ctxt);
4683             }
4684
4685             if (cons == ctxt->nbChars) {
4686                 if (ctxt->node != NULL) {
4687                     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4688                                  "detected an error in element content\n",
4689                                  NULL, NULL);
4690                 }
4691                 break;
4692             }
4693         }
4694         GROW;
4695     }
4696     if (currentNode != NULL) xmlFree(currentNode);
4697 }
4698
4699 /**
4700  * htmlParseContent:
4701  * @ctxt:  an HTML parser context
4702  *
4703  * Parse a content: comment, sub-element, reference or text.
4704  * This is the entry point when called from parser.c
4705  */
4706
4707 void
4708 __htmlParseContent(void *ctxt) {
4709     if (ctxt != NULL)
4710         htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4711 }
4712
4713 /**
4714  * htmlParseDocument:
4715  * @ctxt:  an HTML parser context
4716  *
4717  * parse an HTML document (and build a tree if using the standard SAX
4718  * interface).
4719  *
4720  * Returns 0, -1 in case of error. the parser context is augmented
4721  *                as a result of the parsing.
4722  */
4723
4724 int
4725 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4726     xmlChar start[4];
4727     xmlCharEncoding enc;
4728     xmlDtdPtr dtd;
4729
4730     xmlInitParser();
4731
4732     htmlDefaultSAXHandlerInit();
4733
4734     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4735         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4736                      "htmlParseDocument: context error\n", NULL, NULL);
4737         return(XML_ERR_INTERNAL_ERROR);
4738     }
4739     ctxt->html = 1;
4740     ctxt->linenumbers = 1;
4741     GROW;
4742     /*
4743      * SAX: beginning of the document processing.
4744      */
4745     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4746         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4747
4748     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4749         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4750         /*
4751          * Get the 4 first bytes and decode the charset
4752          * if enc != XML_CHAR_ENCODING_NONE
4753          * plug some encoding conversion routines.
4754          */
4755         start[0] = RAW;
4756         start[1] = NXT(1);
4757         start[2] = NXT(2);
4758         start[3] = NXT(3);
4759         enc = xmlDetectCharEncoding(&start[0], 4);
4760         if (enc != XML_CHAR_ENCODING_NONE) {
4761             xmlSwitchEncoding(ctxt, enc);
4762         }
4763     }
4764
4765     /*
4766      * Wipe out everything which is before the first '<'
4767      */
4768     SKIP_BLANKS;
4769     if (CUR == 0) {
4770         htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4771                      "Document is empty\n", NULL, NULL);
4772     }
4773
4774     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4775         ctxt->sax->startDocument(ctxt->userData);
4776
4777
4778     /*
4779      * Parse possible comments and PIs before any content
4780      */
4781     while (((CUR == '<') && (NXT(1) == '!') &&
4782             (NXT(2) == '-') && (NXT(3) == '-')) ||
4783            ((CUR == '<') && (NXT(1) == '?'))) {
4784         htmlParseComment(ctxt);
4785         htmlParsePI(ctxt);
4786         SKIP_BLANKS;
4787     }
4788
4789
4790     /*
4791      * Then possibly doc type declaration(s) and more Misc
4792      * (doctypedecl Misc*)?
4793      */
4794     if ((CUR == '<') && (NXT(1) == '!') &&
4795         (UPP(2) == 'D') && (UPP(3) == 'O') &&
4796         (UPP(4) == 'C') && (UPP(5) == 'T') &&
4797         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4798         (UPP(8) == 'E')) {
4799         htmlParseDocTypeDecl(ctxt);
4800     }
4801     SKIP_BLANKS;
4802
4803     /*
4804      * Parse possible comments and PIs before any content
4805      */
4806     while (((CUR == '<') && (NXT(1) == '!') &&
4807             (NXT(2) == '-') && (NXT(3) == '-')) ||
4808            ((CUR == '<') && (NXT(1) == '?'))) {
4809         htmlParseComment(ctxt);
4810         htmlParsePI(ctxt);
4811         SKIP_BLANKS;
4812     }
4813
4814     /*
4815      * Time to start parsing the tree itself
4816      */
4817     htmlParseContentInternal(ctxt);
4818
4819     /*
4820      * autoclose
4821      */
4822     if (CUR == 0)
4823         htmlAutoCloseOnEnd(ctxt);
4824
4825
4826     /*
4827      * SAX: end of the document processing.
4828      */
4829     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4830         ctxt->sax->endDocument(ctxt->userData);
4831
4832     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4833         dtd = xmlGetIntSubset(ctxt->myDoc);
4834         if (dtd == NULL)
4835             ctxt->myDoc->intSubset =
4836                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4837                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4838                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4839     }
4840     if (! ctxt->wellFormed) return(-1);
4841     return(0);
4842 }
4843
4844
4845 /************************************************************************
4846  *                                                                      *
4847  *                      Parser contexts handling                        *
4848  *                                                                      *
4849  ************************************************************************/
4850
4851 /**
4852  * htmlInitParserCtxt:
4853  * @ctxt:  an HTML parser context
4854  *
4855  * Initialize a parser context
4856  *
4857  * Returns 0 in case of success and -1 in case of error
4858  */
4859
4860 static int
4861 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4862 {
4863     htmlSAXHandler *sax;
4864
4865     if (ctxt == NULL) return(-1);
4866     memset(ctxt, 0, sizeof(htmlParserCtxt));
4867
4868     ctxt->dict = xmlDictCreate();
4869     if (ctxt->dict == NULL) {
4870         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4871         return(-1);
4872     }
4873     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4874     if (sax == NULL) {
4875         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4876         return(-1);
4877     }
4878     else
4879         memset(sax, 0, sizeof(htmlSAXHandler));
4880
4881     /* Allocate the Input stack */
4882     ctxt->inputTab = (htmlParserInputPtr *)
4883                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
4884     if (ctxt->inputTab == NULL) {
4885         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4886         ctxt->inputNr = 0;
4887         ctxt->inputMax = 0;
4888         ctxt->input = NULL;
4889         return(-1);
4890     }
4891     ctxt->inputNr = 0;
4892     ctxt->inputMax = 5;
4893     ctxt->input = NULL;
4894     ctxt->version = NULL;
4895     ctxt->encoding = NULL;
4896     ctxt->standalone = -1;
4897     ctxt->instate = XML_PARSER_START;
4898
4899     /* Allocate the Node stack */
4900     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4901     if (ctxt->nodeTab == NULL) {
4902         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4903         ctxt->nodeNr = 0;
4904         ctxt->nodeMax = 0;
4905         ctxt->node = NULL;
4906         ctxt->inputNr = 0;
4907         ctxt->inputMax = 0;
4908         ctxt->input = NULL;
4909         return(-1);
4910     }
4911     ctxt->nodeNr = 0;
4912     ctxt->nodeMax = 10;
4913     ctxt->node = NULL;
4914
4915     /* Allocate the Name stack */
4916     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4917     if (ctxt->nameTab == NULL) {
4918         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4919         ctxt->nameNr = 0;
4920         ctxt->nameMax = 0;
4921         ctxt->name = NULL;
4922         ctxt->nodeNr = 0;
4923         ctxt->nodeMax = 0;
4924         ctxt->node = NULL;
4925         ctxt->inputNr = 0;
4926         ctxt->inputMax = 0;
4927         ctxt->input = NULL;
4928         return(-1);
4929     }
4930     ctxt->nameNr = 0;
4931     ctxt->nameMax = 10;
4932     ctxt->name = NULL;
4933
4934     ctxt->nodeInfoTab = NULL;
4935     ctxt->nodeInfoNr  = 0;
4936     ctxt->nodeInfoMax = 0;
4937
4938     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4939     else {
4940         ctxt->sax = sax;
4941         memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4942     }
4943     ctxt->userData = ctxt;
4944     ctxt->myDoc = NULL;
4945     ctxt->wellFormed = 1;
4946     ctxt->replaceEntities = 0;
4947     ctxt->linenumbers = xmlLineNumbersDefaultValue;
4948     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4949     ctxt->html = 1;
4950     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4951     ctxt->vctxt.userData = ctxt;
4952     ctxt->vctxt.error = xmlParserValidityError;
4953     ctxt->vctxt.warning = xmlParserValidityWarning;
4954     ctxt->record_info = 0;
4955     ctxt->validate = 0;
4956     ctxt->nbChars = 0;
4957     ctxt->checkIndex = 0;
4958     ctxt->catalogs = NULL;
4959     xmlInitNodeInfoSeq(&ctxt->node_seq);
4960     return(0);
4961 }
4962
4963 /**
4964  * htmlFreeParserCtxt:
4965  * @ctxt:  an HTML parser context
4966  *
4967  * Free all the memory used by a parser context. However the parsed
4968  * document in ctxt->myDoc is not freed.
4969  */
4970
4971 void
4972 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4973 {
4974     xmlFreeParserCtxt(ctxt);
4975 }
4976
4977 /**
4978  * htmlNewParserCtxt:
4979  *
4980  * Allocate and initialize a new parser context.
4981  *
4982  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4983  */
4984
4985 htmlParserCtxtPtr
4986 htmlNewParserCtxt(void)
4987 {
4988     xmlParserCtxtPtr ctxt;
4989
4990     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4991     if (ctxt == NULL) {
4992         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4993         return(NULL);
4994     }
4995     memset(ctxt, 0, sizeof(xmlParserCtxt));
4996     if (htmlInitParserCtxt(ctxt) < 0) {
4997         htmlFreeParserCtxt(ctxt);
4998         return(NULL);
4999     }
5000     return(ctxt);
5001 }
5002
5003 /**
5004  * htmlCreateMemoryParserCtxt:
5005  * @buffer:  a pointer to a char array
5006  * @size:  the size of the array
5007  *
5008  * Create a parser context for an HTML in-memory document.
5009  *
5010  * Returns the new parser context or NULL
5011  */
5012 htmlParserCtxtPtr
5013 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5014     xmlParserCtxtPtr ctxt;
5015     xmlParserInputPtr input;
5016     xmlParserInputBufferPtr buf;
5017
5018     if (buffer == NULL)
5019         return(NULL);
5020     if (size <= 0)
5021         return(NULL);
5022
5023     ctxt = htmlNewParserCtxt();
5024     if (ctxt == NULL)
5025         return(NULL);
5026
5027     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5028     if (buf == NULL) return(NULL);
5029
5030     input = xmlNewInputStream(ctxt);
5031     if (input == NULL) {
5032         xmlFreeParserCtxt(ctxt);
5033         return(NULL);
5034     }
5035
5036     input->filename = NULL;
5037     input->buf = buf;
5038     xmlBufResetInput(buf->buffer, input);
5039
5040     inputPush(ctxt, input);
5041     return(ctxt);
5042 }
5043
5044 /**
5045  * htmlCreateDocParserCtxt:
5046  * @cur:  a pointer to an array of xmlChar
5047  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5048  *
5049  * Create a parser context for an HTML document.
5050  *
5051  * TODO: check the need to add encoding handling there
5052  *
5053  * Returns the new parser context or NULL
5054  */
5055 static htmlParserCtxtPtr
5056 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5057     int len;
5058     htmlParserCtxtPtr ctxt;
5059
5060     if (cur == NULL)
5061         return(NULL);
5062     len = xmlStrlen(cur);
5063     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5064     if (ctxt == NULL)
5065         return(NULL);
5066
5067     if (encoding != NULL) {
5068         xmlCharEncoding enc;
5069         xmlCharEncodingHandlerPtr handler;
5070
5071         if (ctxt->input->encoding != NULL)
5072             xmlFree((xmlChar *) ctxt->input->encoding);
5073         ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5074
5075         enc = xmlParseCharEncoding(encoding);
5076         /*
5077          * registered set of known encodings
5078          */
5079         if (enc != XML_CHAR_ENCODING_ERROR) {
5080             xmlSwitchEncoding(ctxt, enc);
5081             if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5082                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5083                              "Unsupported encoding %s\n",
5084                              (const xmlChar *) encoding, NULL);
5085             }
5086         } else {
5087             /*
5088              * fallback for unknown encodings
5089              */
5090             handler = xmlFindCharEncodingHandler((const char *) encoding);
5091             if (handler != NULL) {
5092                 xmlSwitchToEncoding(ctxt, handler);
5093             } else {
5094                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5095                              "Unsupported encoding %s\n",
5096                              (const xmlChar *) encoding, NULL);
5097             }
5098         }
5099     }
5100     return(ctxt);
5101 }
5102
5103 #ifdef LIBXML_PUSH_ENABLED
5104 /************************************************************************
5105  *                                                                      *
5106  *      Progressive parsing interfaces                          *
5107  *                                                                      *
5108  ************************************************************************/
5109
5110 /**
5111  * htmlParseLookupSequence:
5112  * @ctxt:  an HTML parser context
5113  * @first:  the first char to lookup
5114  * @next:  the next char to lookup or zero
5115  * @third:  the next char to lookup or zero
5116  * @comment: flag to force checking inside comments
5117  *
5118  * Try to find if a sequence (first, next, third) or  just (first next) or
5119  * (first) is available in the input stream.
5120  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5121  * to avoid rescanning sequences of bytes, it DOES change the state of the
5122  * parser, do not use liberally.
5123  * This is basically similar to xmlParseLookupSequence()
5124  *
5125  * Returns the index to the current parsing point if the full sequence
5126  *      is available, -1 otherwise.
5127  */
5128 static int
5129 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5130                         xmlChar next, xmlChar third, int iscomment,
5131                         int ignoreattrval)
5132 {
5133     int base, len;
5134     htmlParserInputPtr in;
5135     const xmlChar *buf;
5136     int incomment = 0;
5137     int invalue = 0;
5138     char valdellim = 0x0;
5139
5140     in = ctxt->input;
5141     if (in == NULL)
5142         return (-1);
5143
5144     base = in->cur - in->base;
5145     if (base < 0)
5146         return (-1);
5147
5148     if (ctxt->checkIndex > base)
5149         base = ctxt->checkIndex;
5150
5151     if (in->buf == NULL) {
5152         buf = in->base;
5153         len = in->length;
5154     } else {
5155         buf = xmlBufContent(in->buf->buffer);
5156         len = xmlBufUse(in->buf->buffer);
5157     }
5158
5159     /* take into account the sequence length */
5160     if (third)
5161         len -= 2;
5162     else if (next)
5163         len--;
5164     for (; base < len; base++) {
5165         if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5166             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5167                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5168                 incomment = 1;
5169                 /* do not increment past <! - some people use <!--> */
5170                 base += 2;
5171             }
5172         }
5173         if (ignoreattrval) {
5174             if (buf[base] == '"' || buf[base] == '\'') {
5175                 if (invalue) {
5176                     if (buf[base] == valdellim) {
5177                         invalue = 0;
5178                         continue;
5179                     }
5180                 } else {
5181                     valdellim = buf[base];
5182                     invalue = 1;
5183                     continue;
5184                 }
5185             } else if (invalue) {
5186                 continue;
5187             }
5188         }
5189         if (incomment) {
5190             if (base + 3 > len)
5191                 return (-1);
5192             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5193                 (buf[base + 2] == '>')) {
5194                 incomment = 0;
5195                 base += 2;
5196             }
5197             continue;
5198         }
5199         if (buf[base] == first) {
5200             if (third != 0) {
5201                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5202                     continue;
5203             } else if (next != 0) {
5204                 if (buf[base + 1] != next)
5205                     continue;
5206             }
5207             ctxt->checkIndex = 0;
5208 #ifdef DEBUG_PUSH
5209             if (next == 0)
5210                 xmlGenericError(xmlGenericErrorContext,
5211                                 "HPP: lookup '%c' found at %d\n",
5212                                 first, base);
5213             else if (third == 0)
5214                 xmlGenericError(xmlGenericErrorContext,
5215                                 "HPP: lookup '%c%c' found at %d\n",
5216                                 first, next, base);
5217             else
5218                 xmlGenericError(xmlGenericErrorContext,
5219                                 "HPP: lookup '%c%c%c' found at %d\n",
5220                                 first, next, third, base);
5221 #endif
5222             return (base - (in->cur - in->base));
5223         }
5224     }
5225     if ((!incomment) && (!invalue))
5226         ctxt->checkIndex = base;
5227 #ifdef DEBUG_PUSH
5228     if (next == 0)
5229         xmlGenericError(xmlGenericErrorContext,
5230                         "HPP: lookup '%c' failed\n", first);
5231     else if (third == 0)
5232         xmlGenericError(xmlGenericErrorContext,
5233                         "HPP: lookup '%c%c' failed\n", first, next);
5234     else
5235         xmlGenericError(xmlGenericErrorContext,
5236                         "HPP: lookup '%c%c%c' failed\n", first, next,
5237                         third);
5238 #endif
5239     return (-1);
5240 }
5241
5242 /**
5243  * htmlParseLookupChars:
5244  * @ctxt: an HTML parser context
5245  * @stop: Array of chars, which stop the lookup.
5246  * @stopLen: Length of stop-Array
5247  *
5248  * Try to find if any char of the stop-Array is available in the input
5249  * stream.
5250  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5251  * to avoid rescanning sequences of bytes, it DOES change the state of the
5252  * parser, do not use liberally.
5253  *
5254  * Returns the index to the current parsing point if a stopChar
5255  *      is available, -1 otherwise.
5256  */
5257 static int
5258 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5259                      int stopLen)
5260 {
5261     int base, len;
5262     htmlParserInputPtr in;
5263     const xmlChar *buf;
5264     int incomment = 0;
5265     int i;
5266
5267     in = ctxt->input;
5268     if (in == NULL)
5269         return (-1);
5270
5271     base = in->cur - in->base;
5272     if (base < 0)
5273         return (-1);
5274
5275     if (ctxt->checkIndex > base)
5276         base = ctxt->checkIndex;
5277
5278     if (in->buf == NULL) {
5279         buf = in->base;
5280         len = in->length;
5281     } else {
5282         buf = xmlBufContent(in->buf->buffer);
5283         len = xmlBufUse(in->buf->buffer);
5284     }
5285
5286     for (; base < len; base++) {
5287         if (!incomment && (base + 4 < len)) {
5288             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5289                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5290                 incomment = 1;
5291                 /* do not increment past <! - some people use <!--> */
5292                 base += 2;
5293             }
5294         }
5295         if (incomment) {
5296             if (base + 3 > len)
5297                 return (-1);
5298             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5299                 (buf[base + 2] == '>')) {
5300                 incomment = 0;
5301                 base += 2;
5302             }
5303             continue;
5304         }
5305         for (i = 0; i < stopLen; ++i) {
5306             if (buf[base] == stop[i]) {
5307                 ctxt->checkIndex = 0;
5308                 return (base - (in->cur - in->base));
5309             }
5310         }
5311     }
5312     ctxt->checkIndex = base;
5313     return (-1);
5314 }
5315
5316 /**
5317  * htmlParseTryOrFinish:
5318  * @ctxt:  an HTML parser context
5319  * @terminate:  last chunk indicator
5320  *
5321  * Try to progress on parsing
5322  *
5323  * Returns zero if no parsing was possible
5324  */
5325 static int
5326 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5327     int ret = 0;
5328     htmlParserInputPtr in;
5329     int avail = 0;
5330     xmlChar cur, next;
5331
5332     htmlParserNodeInfo node_info;
5333
5334 #ifdef DEBUG_PUSH
5335     switch (ctxt->instate) {
5336         case XML_PARSER_EOF:
5337             xmlGenericError(xmlGenericErrorContext,
5338                     "HPP: try EOF\n"); break;
5339         case XML_PARSER_START:
5340             xmlGenericError(xmlGenericErrorContext,
5341                     "HPP: try START\n"); break;
5342         case XML_PARSER_MISC:
5343             xmlGenericError(xmlGenericErrorContext,
5344                     "HPP: try MISC\n");break;
5345         case XML_PARSER_COMMENT:
5346             xmlGenericError(xmlGenericErrorContext,
5347                     "HPP: try COMMENT\n");break;
5348         case XML_PARSER_PROLOG:
5349             xmlGenericError(xmlGenericErrorContext,
5350                     "HPP: try PROLOG\n");break;
5351         case XML_PARSER_START_TAG:
5352             xmlGenericError(xmlGenericErrorContext,
5353                     "HPP: try START_TAG\n");break;
5354         case XML_PARSER_CONTENT:
5355             xmlGenericError(xmlGenericErrorContext,
5356                     "HPP: try CONTENT\n");break;
5357         case XML_PARSER_CDATA_SECTION:
5358             xmlGenericError(xmlGenericErrorContext,
5359                     "HPP: try CDATA_SECTION\n");break;
5360         case XML_PARSER_END_TAG:
5361             xmlGenericError(xmlGenericErrorContext,
5362                     "HPP: try END_TAG\n");break;
5363         case XML_PARSER_ENTITY_DECL:
5364             xmlGenericError(xmlGenericErrorContext,
5365                     "HPP: try ENTITY_DECL\n");break;
5366         case XML_PARSER_ENTITY_VALUE:
5367             xmlGenericError(xmlGenericErrorContext,
5368                     "HPP: try ENTITY_VALUE\n");break;
5369         case XML_PARSER_ATTRIBUTE_VALUE:
5370             xmlGenericError(xmlGenericErrorContext,
5371                     "HPP: try ATTRIBUTE_VALUE\n");break;
5372         case XML_PARSER_DTD:
5373             xmlGenericError(xmlGenericErrorContext,
5374                     "HPP: try DTD\n");break;
5375         case XML_PARSER_EPILOG:
5376             xmlGenericError(xmlGenericErrorContext,
5377                     "HPP: try EPILOG\n");break;
5378         case XML_PARSER_PI:
5379             xmlGenericError(xmlGenericErrorContext,
5380                     "HPP: try PI\n");break;
5381         case XML_PARSER_SYSTEM_LITERAL:
5382             xmlGenericError(xmlGenericErrorContext,
5383                     "HPP: try SYSTEM_LITERAL\n");break;
5384     }
5385 #endif
5386
5387     while (1) {
5388
5389         in = ctxt->input;
5390         if (in == NULL) break;
5391         if (in->buf == NULL)
5392             avail = in->length - (in->cur - in->base);
5393         else
5394             avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5395         if ((avail == 0) && (terminate)) {
5396             htmlAutoCloseOnEnd(ctxt);
5397             if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5398                 /*
5399                  * SAX: end of the document processing.
5400                  */
5401                 ctxt->instate = XML_PARSER_EOF;
5402                 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5403                     ctxt->sax->endDocument(ctxt->userData);
5404             }
5405         }
5406         if (avail < 1)
5407             goto done;
5408         cur = in->cur[0];
5409         if (cur == 0) {
5410             SKIP(1);
5411             continue;
5412         }
5413
5414         switch (ctxt->instate) {
5415             case XML_PARSER_EOF:
5416                 /*
5417                  * Document parsing is done !
5418                  */
5419                 goto done;
5420             case XML_PARSER_START:
5421                 /*
5422                  * Very first chars read from the document flow.
5423                  */
5424                 cur = in->cur[0];
5425                 if (IS_BLANK_CH(cur)) {
5426                     SKIP_BLANKS;
5427                     if (in->buf == NULL)
5428                         avail = in->length - (in->cur - in->base);
5429                     else
5430                         avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5431                 }
5432                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5433                     ctxt->sax->setDocumentLocator(ctxt->userData,
5434                                                   &xmlDefaultSAXLocator);
5435                 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5436                     (!ctxt->disableSAX))
5437                     ctxt->sax->startDocument(ctxt->userData);
5438
5439                 cur = in->cur[0];
5440                 next = in->cur[1];
5441                 if ((cur == '<') && (next == '!') &&
5442                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5443                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5444                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5445                     (UPP(8) == 'E')) {
5446                     if ((!terminate) &&
5447                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5448                         goto done;
5449 #ifdef DEBUG_PUSH
5450                     xmlGenericError(xmlGenericErrorContext,
5451                             "HPP: Parsing internal subset\n");
5452 #endif
5453                     htmlParseDocTypeDecl(ctxt);
5454                     ctxt->instate = XML_PARSER_PROLOG;
5455 #ifdef DEBUG_PUSH
5456                     xmlGenericError(xmlGenericErrorContext,
5457                             "HPP: entering PROLOG\n");
5458 #endif
5459                 } else {
5460                     ctxt->instate = XML_PARSER_MISC;
5461 #ifdef DEBUG_PUSH
5462                     xmlGenericError(xmlGenericErrorContext,
5463                             "HPP: entering MISC\n");
5464 #endif
5465                 }
5466                 break;
5467             case XML_PARSER_MISC:
5468                 SKIP_BLANKS;
5469                 if (in->buf == NULL)
5470                     avail = in->length - (in->cur - in->base);
5471                 else
5472                     avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5473                 /*
5474                  * no chars in buffer
5475                  */
5476                 if (avail < 1)
5477                     goto done;
5478                 /*
5479                  * not enouth chars in buffer
5480                  */
5481                 if (avail < 2) {
5482                     if (!terminate)
5483                         goto done;
5484                     else
5485                         next = ' ';
5486                 } else {
5487                     next = in->cur[1];
5488                 }
5489                 cur = in->cur[0];
5490                 if ((cur == '<') && (next == '!') &&
5491                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5492                     if ((!terminate) &&
5493                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5494                         goto done;
5495 #ifdef DEBUG_PUSH
5496                     xmlGenericError(xmlGenericErrorContext,
5497                             "HPP: Parsing Comment\n");
5498 #endif
5499                     htmlParseComment(ctxt);
5500                     ctxt->instate = XML_PARSER_MISC;
5501                 } else if ((cur == '<') && (next == '?')) {
5502                     if ((!terminate) &&
5503                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5504                         goto done;
5505 #ifdef DEBUG_PUSH
5506                     xmlGenericError(xmlGenericErrorContext,
5507                             "HPP: Parsing PI\n");
5508 #endif
5509                     htmlParsePI(ctxt);
5510                     ctxt->instate = XML_PARSER_MISC;
5511                 } else if ((cur == '<') && (next == '!') &&
5512                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5513                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5514                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5515                     (UPP(8) == 'E')) {
5516                     if ((!terminate) &&
5517                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5518                         goto done;
5519 #ifdef DEBUG_PUSH
5520                     xmlGenericError(xmlGenericErrorContext,
5521                             "HPP: Parsing internal subset\n");
5522 #endif
5523                     htmlParseDocTypeDecl(ctxt);
5524                     ctxt->instate = XML_PARSER_PROLOG;
5525 #ifdef DEBUG_PUSH
5526                     xmlGenericError(xmlGenericErrorContext,
5527                             "HPP: entering PROLOG\n");
5528 #endif
5529                 } else if ((cur == '<') && (next == '!') &&
5530                            (avail < 9)) {
5531                     goto done;
5532                 } else {
5533                     ctxt->instate = XML_PARSER_START_TAG;
5534 #ifdef DEBUG_PUSH
5535                     xmlGenericError(xmlGenericErrorContext,
5536                             "HPP: entering START_TAG\n");
5537 #endif
5538                 }
5539                 break;
5540             case XML_PARSER_PROLOG:
5541                 SKIP_BLANKS;
5542                 if (in->buf == NULL)
5543                     avail = in->length - (in->cur - in->base);
5544                 else
5545                     avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5546                 if (avail < 2)
5547                     goto done;
5548                 cur = in->cur[0];
5549                 next = in->cur[1];
5550                 if ((cur == '<') && (next == '!') &&
5551                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5552                     if ((!terminate) &&
5553                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5554                         goto done;
5555 #ifdef DEBUG_PUSH
5556                     xmlGenericError(xmlGenericErrorContext,
5557                             "HPP: Parsing Comment\n");
5558 #endif
5559                     htmlParseComment(ctxt);
5560                     ctxt->instate = XML_PARSER_PROLOG;
5561                 } else if ((cur == '<') && (next == '?')) {
5562                     if ((!terminate) &&
5563                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5564                         goto done;
5565 #ifdef DEBUG_PUSH
5566                     xmlGenericError(xmlGenericErrorContext,
5567                             "HPP: Parsing PI\n");
5568 #endif
5569                     htmlParsePI(ctxt);
5570                     ctxt->instate = XML_PARSER_PROLOG;
5571                 } else if ((cur == '<') && (next == '!') &&
5572                            (avail < 4)) {
5573                     goto done;
5574                 } else {
5575                     ctxt->instate = XML_PARSER_START_TAG;
5576 #ifdef DEBUG_PUSH
5577                     xmlGenericError(xmlGenericErrorContext,
5578                             "HPP: entering START_TAG\n");
5579 #endif
5580                 }
5581                 break;
5582             case XML_PARSER_EPILOG:
5583                 if (in->buf == NULL)
5584                     avail = in->length - (in->cur - in->base);
5585                 else
5586                     avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5587                 if (avail < 1)
5588                     goto done;
5589                 cur = in->cur[0];
5590                 if (IS_BLANK_CH(cur)) {
5591                     htmlParseCharData(ctxt);
5592                     goto done;
5593                 }
5594                 if (avail < 2)
5595                     goto done;
5596                 next = in->cur[1];
5597                 if ((cur == '<') && (next == '!') &&
5598                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5599                     if ((!terminate) &&
5600                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5601                         goto done;
5602 #ifdef DEBUG_PUSH
5603                     xmlGenericError(xmlGenericErrorContext,
5604                             "HPP: Parsing Comment\n");
5605 #endif
5606                     htmlParseComment(ctxt);
5607                     ctxt->instate = XML_PARSER_EPILOG;
5608                 } else if ((cur == '<') && (next == '?')) {
5609                     if ((!terminate) &&
5610                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5611                         goto done;
5612 #ifdef DEBUG_PUSH
5613                     xmlGenericError(xmlGenericErrorContext,
5614                             "HPP: Parsing PI\n");
5615 #endif
5616                     htmlParsePI(ctxt);
5617                     ctxt->instate = XML_PARSER_EPILOG;
5618                 } else if ((cur == '<') && (next == '!') &&
5619                            (avail < 4)) {
5620                     goto done;
5621                 } else {
5622                     ctxt->errNo = XML_ERR_DOCUMENT_END;
5623                     ctxt->wellFormed = 0;
5624                     ctxt->instate = XML_PARSER_EOF;
5625 #ifdef DEBUG_PUSH
5626                     xmlGenericError(xmlGenericErrorContext,
5627                             "HPP: entering EOF\n");
5628 #endif
5629                     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5630                         ctxt->sax->endDocument(ctxt->userData);
5631                     goto done;
5632                 }
5633                 break;
5634             case XML_PARSER_START_TAG: {
5635                 const xmlChar *name;
5636                 int failed;
5637                 const htmlElemDesc * info;
5638
5639                 /*
5640                  * no chars in buffer
5641                  */
5642                 if (avail < 1)
5643                     goto done;
5644                 /*
5645                  * not enouth chars in buffer
5646                  */
5647                 if (avail < 2) {
5648                     if (!terminate)
5649                         goto done;
5650                     else
5651                         next = ' ';
5652                 } else {
5653                     next = in->cur[1];
5654                 }
5655                 cur = in->cur[0];
5656                 if (cur != '<') {
5657                     ctxt->instate = XML_PARSER_CONTENT;
5658 #ifdef DEBUG_PUSH
5659                     xmlGenericError(xmlGenericErrorContext,
5660                             "HPP: entering CONTENT\n");
5661 #endif
5662                     break;
5663                 }
5664                 if (next == '/') {
5665                     ctxt->instate = XML_PARSER_END_TAG;
5666                     ctxt->checkIndex = 0;
5667 #ifdef DEBUG_PUSH
5668                     xmlGenericError(xmlGenericErrorContext,
5669                             "HPP: entering END_TAG\n");
5670 #endif
5671                     break;
5672                 }
5673                 if ((!terminate) &&
5674                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5675                     goto done;
5676
5677                 /* Capture start position */
5678                 if (ctxt->record_info) {
5679                      node_info.begin_pos = ctxt->input->consumed +
5680                                         (CUR_PTR - ctxt->input->base);
5681                      node_info.begin_line = ctxt->input->line;
5682                 }
5683
5684
5685                 failed = htmlParseStartTag(ctxt);
5686                 name = ctxt->name;
5687                 if ((failed == -1) ||
5688                     (name == NULL)) {
5689                     if (CUR == '>')
5690                         NEXT;
5691                     break;
5692                 }
5693
5694                 /*
5695                  * Lookup the info for that element.
5696                  */
5697                 info = htmlTagLookup(name);
5698                 if (info == NULL) {
5699                     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5700                                  "Tag %s invalid\n", name, NULL);
5701                 }
5702
5703                 /*
5704                  * Check for an Empty Element labeled the XML/SGML way
5705                  */
5706                 if ((CUR == '/') && (NXT(1) == '>')) {
5707                     SKIP(2);
5708                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5709                         ctxt->sax->endElement(ctxt->userData, name);
5710                     htmlnamePop(ctxt);
5711                     ctxt->instate = XML_PARSER_CONTENT;
5712 #ifdef DEBUG_PUSH
5713                     xmlGenericError(xmlGenericErrorContext,
5714                             "HPP: entering CONTENT\n");
5715 #endif
5716                     break;
5717                 }
5718
5719                 if (CUR == '>') {
5720                     NEXT;
5721                 } else {
5722                     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5723                                  "Couldn't find end of Start Tag %s\n",
5724                                  name, NULL);
5725
5726                     /*
5727                      * end of parsing of this node.
5728                      */
5729                     if (xmlStrEqual(name, ctxt->name)) {
5730                         nodePop(ctxt);
5731                         htmlnamePop(ctxt);
5732                     }
5733
5734                     if (ctxt->record_info)
5735                         htmlNodeInfoPush(ctxt, &node_info);
5736
5737                     ctxt->instate = XML_PARSER_CONTENT;
5738 #ifdef DEBUG_PUSH
5739                     xmlGenericError(xmlGenericErrorContext,
5740                             "HPP: entering CONTENT\n");
5741 #endif
5742                     break;
5743                 }
5744
5745                 /*
5746                  * Check for an Empty Element from DTD definition
5747                  */
5748                 if ((info != NULL) && (info->empty)) {
5749                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5750                         ctxt->sax->endElement(ctxt->userData, name);
5751                     htmlnamePop(ctxt);
5752                 }
5753
5754                 if (ctxt->record_info)
5755                     htmlNodeInfoPush(ctxt, &node_info);
5756
5757                 ctxt->instate = XML_PARSER_CONTENT;
5758 #ifdef DEBUG_PUSH
5759                 xmlGenericError(xmlGenericErrorContext,
5760                         "HPP: entering CONTENT\n");
5761 #endif
5762                 break;
5763             }
5764             case XML_PARSER_CONTENT: {
5765                 long cons;
5766                 /*
5767                  * Handle preparsed entities and charRef
5768                  */
5769                 if (ctxt->token != 0) {
5770                     xmlChar chr[2] = { 0 , 0 } ;
5771
5772                     chr[0] = (xmlChar) ctxt->token;
5773                     htmlCheckParagraph(ctxt);
5774                     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5775                         ctxt->sax->characters(ctxt->userData, chr, 1);
5776                     ctxt->token = 0;
5777                     ctxt->checkIndex = 0;
5778                 }
5779                 if ((avail == 1) && (terminate)) {
5780                     cur = in->cur[0];
5781                     if ((cur != '<') && (cur != '&')) {
5782                         if (ctxt->sax != NULL) {
5783                             if (IS_BLANK_CH(cur)) {
5784                                 if (ctxt->keepBlanks) {
5785                                     if (ctxt->sax->characters != NULL)
5786                                         ctxt->sax->characters(
5787                                                 ctxt->userData, &in->cur[0], 1);
5788                                 } else {
5789                                     if (ctxt->sax->ignorableWhitespace != NULL)
5790                                         ctxt->sax->ignorableWhitespace(
5791                                                 ctxt->userData, &in->cur[0], 1);
5792                                 }
5793                             } else {
5794                                 htmlCheckParagraph(ctxt);
5795                                 if (ctxt->sax->characters != NULL)
5796                                     ctxt->sax->characters(
5797                                             ctxt->userData, &in->cur[0], 1);
5798                             }
5799                         }
5800                         ctxt->token = 0;
5801                         ctxt->checkIndex = 0;
5802                         in->cur++;
5803                         break;
5804                     }
5805                 }
5806                 if (avail < 2)
5807                     goto done;
5808                 cur = in->cur[0];
5809                 next = in->cur[1];
5810                 cons = ctxt->nbChars;
5811                 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5812                     (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5813                     /*
5814                      * Handle SCRIPT/STYLE separately
5815                      */
5816                     if (!terminate) {
5817                         int idx;
5818                         xmlChar val;
5819
5820                         idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5821                         if (idx < 0)
5822                             goto done;
5823                         val = in->cur[idx + 2];
5824                         if (val == 0) /* bad cut of input */
5825                             goto done;
5826                     }
5827                     htmlParseScript(ctxt);
5828                     if ((cur == '<') && (next == '/')) {
5829                         ctxt->instate = XML_PARSER_END_TAG;
5830                         ctxt->checkIndex = 0;
5831 #ifdef DEBUG_PUSH
5832                         xmlGenericError(xmlGenericErrorContext,
5833                                 "HPP: entering END_TAG\n");
5834 #endif
5835                         break;
5836                     }
5837                 } else {
5838                     /*
5839                      * Sometimes DOCTYPE arrives in the middle of the document
5840                      */
5841                     if ((cur == '<') && (next == '!') &&
5842                         (UPP(2) == 'D') && (UPP(3) == 'O') &&
5843                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
5844                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5845                         (UPP(8) == 'E')) {
5846                         if ((!terminate) &&
5847                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5848                             goto done;
5849                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5850                                      "Misplaced DOCTYPE declaration\n",
5851                                      BAD_CAST "DOCTYPE" , NULL);
5852                         htmlParseDocTypeDecl(ctxt);
5853                     } else if ((cur == '<') && (next == '!') &&
5854                         (in->cur[2] == '-') && (in->cur[3] == '-')) {
5855                         if ((!terminate) &&
5856                             (htmlParseLookupSequence(
5857                                 ctxt, '-', '-', '>', 1, 1) < 0))
5858                             goto done;
5859 #ifdef DEBUG_PUSH
5860                         xmlGenericError(xmlGenericErrorContext,
5861                                 "HPP: Parsing Comment\n");
5862 #endif
5863                         htmlParseComment(ctxt);
5864                         ctxt->instate = XML_PARSER_CONTENT;
5865                     } else if ((cur == '<') && (next == '?')) {
5866                         if ((!terminate) &&
5867                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5868                             goto done;
5869 #ifdef DEBUG_PUSH
5870                         xmlGenericError(xmlGenericErrorContext,
5871                                 "HPP: Parsing PI\n");
5872 #endif
5873                         htmlParsePI(ctxt);
5874                         ctxt->instate = XML_PARSER_CONTENT;
5875                     } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5876                         goto done;
5877                     } else if ((cur == '<') && (next == '/')) {
5878                         ctxt->instate = XML_PARSER_END_TAG;
5879                         ctxt->checkIndex = 0;
5880 #ifdef DEBUG_PUSH
5881                         xmlGenericError(xmlGenericErrorContext,
5882                                 "HPP: entering END_TAG\n");
5883 #endif
5884                         break;
5885                     } else if (cur == '<') {
5886                         ctxt->instate = XML_PARSER_START_TAG;
5887                         ctxt->checkIndex = 0;
5888 #ifdef DEBUG_PUSH
5889                         xmlGenericError(xmlGenericErrorContext,
5890                                 "HPP: entering START_TAG\n");
5891 #endif
5892                         break;
5893                     } else if (cur == '&') {
5894                         if ((!terminate) &&
5895                             (htmlParseLookupChars(ctxt,
5896                                                   BAD_CAST "; >/", 4) < 0))
5897                             goto done;
5898 #ifdef DEBUG_PUSH
5899                         xmlGenericError(xmlGenericErrorContext,
5900                                 "HPP: Parsing Reference\n");
5901 #endif
5902                         /* TODO: check generation of subtrees if noent !!! */
5903                         htmlParseReference(ctxt);
5904                     } else {
5905                         /*
5906                          * check that the text sequence is complete
5907                          * before handing out the data to the parser
5908                          * to avoid problems with erroneous end of
5909                          * data detection.
5910                          */
5911                         if ((!terminate) &&
5912                             (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5913                             goto done;
5914                         ctxt->checkIndex = 0;
5915 #ifdef DEBUG_PUSH
5916                         xmlGenericError(xmlGenericErrorContext,
5917                                 "HPP: Parsing char data\n");
5918 #endif
5919                         htmlParseCharData(ctxt);
5920                     }
5921                 }
5922                 if (cons == ctxt->nbChars) {
5923                     if (ctxt->node != NULL) {
5924                         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5925                                      "detected an error in element content\n",
5926                                      NULL, NULL);
5927                     }
5928                     NEXT;
5929                     break;
5930                 }
5931
5932                 break;
5933             }
5934             case XML_PARSER_END_TAG:
5935                 if (avail < 2)
5936                     goto done;
5937                 if ((!terminate) &&
5938                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5939                     goto done;
5940                 htmlParseEndTag(ctxt);
5941                 if (ctxt->nameNr == 0) {
5942                     ctxt->instate = XML_PARSER_EPILOG;
5943                 } else {
5944                     ctxt->instate = XML_PARSER_CONTENT;
5945                 }
5946                 ctxt->checkIndex = 0;
5947 #ifdef DEBUG_PUSH
5948                 xmlGenericError(xmlGenericErrorContext,
5949                         "HPP: entering CONTENT\n");
5950 #endif
5951                 break;
5952             case XML_PARSER_CDATA_SECTION:
5953                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5954                         "HPP: internal error, state == CDATA\n",
5955                              NULL, NULL);
5956                 ctxt->instate = XML_PARSER_CONTENT;
5957                 ctxt->checkIndex = 0;
5958 #ifdef DEBUG_PUSH
5959                 xmlGenericError(xmlGenericErrorContext,
5960                         "HPP: entering CONTENT\n");
5961 #endif
5962                 break;
5963             case XML_PARSER_DTD:
5964                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5965                         "HPP: internal error, state == DTD\n",
5966                              NULL, NULL);
5967                 ctxt->instate = XML_PARSER_CONTENT;
5968                 ctxt->checkIndex = 0;
5969 #ifdef DEBUG_PUSH
5970                 xmlGenericError(xmlGenericErrorContext,
5971                         "HPP: entering CONTENT\n");
5972 #endif
5973                 break;
5974             case XML_PARSER_COMMENT:
5975                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5976                         "HPP: internal error, state == COMMENT\n",
5977                              NULL, NULL);
5978                 ctxt->instate = XML_PARSER_CONTENT;
5979                 ctxt->checkIndex = 0;
5980 #ifdef DEBUG_PUSH
5981                 xmlGenericError(xmlGenericErrorContext,
5982                         "HPP: entering CONTENT\n");
5983 #endif
5984                 break;
5985             case XML_PARSER_PI:
5986                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5987                         "HPP: internal error, state == PI\n",
5988                              NULL, NULL);
5989                 ctxt->instate = XML_PARSER_CONTENT;
5990                 ctxt->checkIndex = 0;
5991 #ifdef DEBUG_PUSH
5992                 xmlGenericError(xmlGenericErrorContext,
5993                         "HPP: entering CONTENT\n");
5994 #endif
5995                 break;
5996             case XML_PARSER_ENTITY_DECL:
5997                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5998                         "HPP: internal error, state == ENTITY_DECL\n",
5999                              NULL, NULL);
6000                 ctxt->instate = XML_PARSER_CONTENT;
6001                 ctxt->checkIndex = 0;
6002 #ifdef DEBUG_PUSH
6003                 xmlGenericError(xmlGenericErrorContext,
6004                         "HPP: entering CONTENT\n");
6005 #endif
6006                 break;
6007             case XML_PARSER_ENTITY_VALUE:
6008                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6009                         "HPP: internal error, state == ENTITY_VALUE\n",
6010                              NULL, NULL);
6011                 ctxt->instate = XML_PARSER_CONTENT;
6012                 ctxt->checkIndex = 0;
6013 #ifdef DEBUG_PUSH
6014                 xmlGenericError(xmlGenericErrorContext,
6015                         "HPP: entering DTD\n");
6016 #endif
6017                 break;
6018             case XML_PARSER_ATTRIBUTE_VALUE:
6019                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6020                         "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6021                              NULL, NULL);
6022                 ctxt->instate = XML_PARSER_START_TAG;
6023                 ctxt->checkIndex = 0;
6024 #ifdef DEBUG_PUSH
6025                 xmlGenericError(xmlGenericErrorContext,
6026                         "HPP: entering START_TAG\n");
6027 #endif
6028                 break;
6029             case XML_PARSER_SYSTEM_LITERAL:
6030                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6031                     "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6032                              NULL, NULL);
6033                 ctxt->instate = XML_PARSER_CONTENT;
6034                 ctxt->checkIndex = 0;
6035 #ifdef DEBUG_PUSH
6036                 xmlGenericError(xmlGenericErrorContext,
6037                         "HPP: entering CONTENT\n");
6038 #endif
6039                 break;
6040             case XML_PARSER_IGNORE:
6041                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6042                         "HPP: internal error, state == XML_PARSER_IGNORE\n",
6043                              NULL, NULL);
6044                 ctxt->instate = XML_PARSER_CONTENT;
6045                 ctxt->checkIndex = 0;
6046 #ifdef DEBUG_PUSH
6047                 xmlGenericError(xmlGenericErrorContext,
6048                         "HPP: entering CONTENT\n");
6049 #endif
6050                 break;
6051             case XML_PARSER_PUBLIC_LITERAL:
6052                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6053                         "HPP: internal error, state == XML_PARSER_LITERAL\n",
6054                              NULL, NULL);
6055                 ctxt->instate = XML_PARSER_CONTENT;
6056                 ctxt->checkIndex = 0;
6057 #ifdef DEBUG_PUSH
6058                 xmlGenericError(xmlGenericErrorContext,
6059                         "HPP: entering CONTENT\n");
6060 #endif
6061                 break;
6062
6063         }
6064     }
6065 done:
6066     if ((avail == 0) && (terminate)) {
6067         htmlAutoCloseOnEnd(ctxt);
6068         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6069             /*
6070              * SAX: end of the document processing.
6071              */
6072             ctxt->instate = XML_PARSER_EOF;
6073             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6074                 ctxt->sax->endDocument(ctxt->userData);
6075         }
6076     }
6077     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6078         ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6079          (ctxt->instate == XML_PARSER_EPILOG))) {
6080         xmlDtdPtr dtd;
6081         dtd = xmlGetIntSubset(ctxt->myDoc);
6082         if (dtd == NULL)
6083             ctxt->myDoc->intSubset =
6084                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6085                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6086                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6087     }
6088 #ifdef DEBUG_PUSH
6089     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6090 #endif
6091     return(ret);
6092 }
6093
6094 /**
6095  * htmlParseChunk:
6096  * @ctxt:  an HTML parser context
6097  * @chunk:  an char array
6098  * @size:  the size in byte of the chunk
6099  * @terminate:  last chunk indicator
6100  *
6101  * Parse a Chunk of memory
6102  *
6103  * Returns zero if no error, the xmlParserErrors otherwise.
6104  */
6105 int
6106 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6107               int terminate) {
6108     if ((ctxt == NULL) || (ctxt->input == NULL)) {
6109         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6110                      "htmlParseChunk: context error\n", NULL, NULL);
6111         return(XML_ERR_INTERNAL_ERROR);
6112     }
6113     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6114         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6115         size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6116         size_t cur = ctxt->input->cur - ctxt->input->base;
6117         int res;
6118
6119         res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6120         if (res < 0) {
6121             ctxt->errNo = XML_PARSER_EOF;
6122             ctxt->disableSAX = 1;
6123             return (XML_PARSER_EOF);
6124         }
6125         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6126 #ifdef DEBUG_PUSH
6127         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6128 #endif
6129
6130 #if 0
6131         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6132             htmlParseTryOrFinish(ctxt, terminate);
6133 #endif
6134     } else if (ctxt->instate != XML_PARSER_EOF) {
6135         if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6136             xmlParserInputBufferPtr in = ctxt->input->buf;
6137             if ((in->encoder != NULL) && (in->buffer != NULL) &&
6138                     (in->raw != NULL)) {
6139                 int nbchars;
6140                 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6141                 size_t current = ctxt->input->cur - ctxt->input->base;
6142
6143                 nbchars = xmlCharEncInput(in, terminate);
6144                 if (nbchars < 0) {
6145                     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6146                                  "encoder error\n", NULL, NULL);
6147                     return(XML_ERR_INVALID_ENCODING);
6148                 }
6149                 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6150             }
6151         }
6152     }
6153     htmlParseTryOrFinish(ctxt, terminate);
6154     if (terminate) {
6155         if ((ctxt->instate != XML_PARSER_EOF) &&
6156             (ctxt->instate != XML_PARSER_EPILOG) &&
6157             (ctxt->instate != XML_PARSER_MISC)) {
6158             ctxt->errNo = XML_ERR_DOCUMENT_END;
6159             ctxt->wellFormed = 0;
6160         }
6161         if (ctxt->instate != XML_PARSER_EOF) {
6162             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6163                 ctxt->sax->endDocument(ctxt->userData);
6164         }
6165         ctxt->instate = XML_PARSER_EOF;
6166     }
6167     return((xmlParserErrors) ctxt->errNo);
6168 }
6169
6170 /************************************************************************
6171  *                                                                      *
6172  *                      User entry points                               *
6173  *                                                                      *
6174  ************************************************************************/
6175
6176 /**
6177  * htmlCreatePushParserCtxt:
6178  * @sax:  a SAX handler
6179  * @user_data:  The user data returned on SAX callbacks
6180  * @chunk:  a pointer to an array of chars
6181  * @size:  number of chars in the array
6182  * @filename:  an optional file name or URI
6183  * @enc:  an optional encoding
6184  *
6185  * Create a parser context for using the HTML parser in push mode
6186  * The value of @filename is used for fetching external entities
6187  * and error/warning reports.
6188  *
6189  * Returns the new parser context or NULL
6190  */
6191 htmlParserCtxtPtr
6192 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6193                          const char *chunk, int size, const char *filename,
6194                          xmlCharEncoding enc) {
6195     htmlParserCtxtPtr ctxt;
6196     htmlParserInputPtr inputStream;
6197     xmlParserInputBufferPtr buf;
6198
6199     xmlInitParser();
6200
6201     buf = xmlAllocParserInputBuffer(enc);
6202     if (buf == NULL) return(NULL);
6203
6204     ctxt = htmlNewParserCtxt();
6205     if (ctxt == NULL) {
6206         xmlFreeParserInputBuffer(buf);
6207         return(NULL);
6208     }
6209     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6210         ctxt->charset=XML_CHAR_ENCODING_UTF8;
6211     if (sax != NULL) {
6212         if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6213             xmlFree(ctxt->sax);
6214         ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6215         if (ctxt->sax == NULL) {
6216             xmlFree(buf);
6217             xmlFree(ctxt);
6218             return(NULL);
6219         }
6220         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6221         if (user_data != NULL)
6222             ctxt->userData = user_data;
6223     }
6224     if (filename == NULL) {
6225         ctxt->directory = NULL;
6226     } else {
6227         ctxt->directory = xmlParserGetDirectory(filename);
6228     }
6229
6230     inputStream = htmlNewInputStream(ctxt);
6231     if (inputStream == NULL) {
6232         xmlFreeParserCtxt(ctxt);
6233         xmlFree(buf);
6234         return(NULL);
6235     }
6236
6237     if (filename == NULL)
6238         inputStream->filename = NULL;
6239     else
6240         inputStream->filename = (char *)
6241             xmlCanonicPath((const xmlChar *) filename);
6242     inputStream->buf = buf;
6243     xmlBufResetInput(buf->buffer, inputStream);
6244
6245     inputPush(ctxt, inputStream);
6246
6247     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6248         (ctxt->input->buf != NULL))  {
6249         size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6250         size_t cur = ctxt->input->cur - ctxt->input->base;
6251
6252         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6253
6254         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6255 #ifdef DEBUG_PUSH
6256         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6257 #endif
6258     }
6259     ctxt->progressive = 1;
6260
6261     return(ctxt);
6262 }
6263 #endif /* LIBXML_PUSH_ENABLED */
6264
6265 /**
6266  * htmlSAXParseDoc:
6267  * @cur:  a pointer to an array of xmlChar
6268  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6269  * @sax:  the SAX handler block
6270  * @userData: if using SAX, this pointer will be provided on callbacks.
6271  *
6272  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6273  * to handle parse events. If sax is NULL, fallback to the default DOM
6274  * behavior and return a tree.
6275  *
6276  * Returns the resulting document tree unless SAX is NULL or the document is
6277  *     not well formed.
6278  */
6279
6280 htmlDocPtr
6281 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6282                 htmlSAXHandlerPtr sax, void *userData) {
6283     htmlDocPtr ret;
6284     htmlParserCtxtPtr ctxt;
6285
6286     xmlInitParser();
6287
6288     if (cur == NULL) return(NULL);
6289
6290
6291     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6292     if (ctxt == NULL) return(NULL);
6293     if (sax != NULL) {
6294         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6295         ctxt->sax = sax;
6296         ctxt->userData = userData;
6297     }
6298
6299     htmlParseDocument(ctxt);
6300     ret = ctxt->myDoc;
6301     if (sax != NULL) {
6302         ctxt->sax = NULL;
6303         ctxt->userData = NULL;
6304     }
6305     htmlFreeParserCtxt(ctxt);
6306
6307     return(ret);
6308 }
6309
6310 /**
6311  * htmlParseDoc:
6312  * @cur:  a pointer to an array of xmlChar
6313  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6314  *
6315  * parse an HTML in-memory document and build a tree.
6316  *
6317  * Returns the resulting document tree
6318  */
6319
6320 htmlDocPtr
6321 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6322     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6323 }
6324
6325
6326 /**
6327  * htmlCreateFileParserCtxt:
6328  * @filename:  the filename
6329  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6330  *
6331  * Create a parser context for a file content.
6332  * Automatic support for ZLIB/Compress compressed document is provided
6333  * by default if found at compile-time.
6334  *
6335  * Returns the new parser context or NULL
6336  */
6337 htmlParserCtxtPtr
6338 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6339 {
6340     htmlParserCtxtPtr ctxt;
6341     htmlParserInputPtr inputStream;
6342     char *canonicFilename;
6343     /* htmlCharEncoding enc; */
6344     xmlChar *content, *content_line = (xmlChar *) "charset=";
6345
6346     if (filename == NULL)
6347         return(NULL);
6348
6349     ctxt = htmlNewParserCtxt();
6350     if (ctxt == NULL) {
6351         return(NULL);
6352     }
6353     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6354     if (canonicFilename == NULL) {
6355 #ifdef LIBXML_SAX1_ENABLED
6356         if (xmlDefaultSAXHandler.error != NULL) {
6357             xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6358         }
6359 #endif
6360         xmlFreeParserCtxt(ctxt);
6361         return(NULL);
6362     }
6363
6364     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6365     xmlFree(canonicFilename);
6366     if (inputStream == NULL) {
6367         xmlFreeParserCtxt(ctxt);
6368         return(NULL);
6369     }
6370
6371     inputPush(ctxt, inputStream);
6372
6373     /* set encoding */
6374     if (encoding) {
6375         size_t l = strlen(encoding);
6376
6377         if (l < 1000) {
6378             content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6379             if (content) {
6380                 strcpy ((char *)content, (char *)content_line);
6381                 strcat ((char *)content, (char *)encoding);
6382                 htmlCheckEncoding (ctxt, content);
6383                 xmlFree (content);
6384             }
6385         }
6386     }
6387
6388     return(ctxt);
6389 }
6390
6391 /**
6392  * htmlSAXParseFile:
6393  * @filename:  the filename
6394  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6395  * @sax:  the SAX handler block
6396  * @userData: if using SAX, this pointer will be provided on callbacks.
6397  *
6398  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6399  * compressed document is provided by default if found at compile-time.
6400  * It use the given SAX function block to handle the parsing callback.
6401  * If sax is NULL, fallback to the default DOM tree building routines.
6402  *
6403  * Returns the resulting document tree unless SAX is NULL or the document is
6404  *     not well formed.
6405  */
6406
6407 htmlDocPtr
6408 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6409                  void *userData) {
6410     htmlDocPtr ret;
6411     htmlParserCtxtPtr ctxt;
6412     htmlSAXHandlerPtr oldsax = NULL;
6413
6414     xmlInitParser();
6415
6416     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6417     if (ctxt == NULL) return(NULL);
6418     if (sax != NULL) {
6419         oldsax = ctxt->sax;
6420         ctxt->sax = sax;
6421         ctxt->userData = userData;
6422     }
6423
6424     htmlParseDocument(ctxt);
6425
6426     ret = ctxt->myDoc;
6427     if (sax != NULL) {
6428         ctxt->sax = oldsax;
6429         ctxt->userData = NULL;
6430     }
6431     htmlFreeParserCtxt(ctxt);
6432
6433     return(ret);
6434 }
6435
6436 /**
6437  * htmlParseFile:
6438  * @filename:  the filename
6439  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6440  *
6441  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6442  * compressed document is provided by default if found at compile-time.
6443  *
6444  * Returns the resulting document tree
6445  */
6446
6447 htmlDocPtr
6448 htmlParseFile(const char *filename, const char *encoding) {
6449     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6450 }
6451
6452 /**
6453  * htmlHandleOmittedElem:
6454  * @val:  int 0 or 1
6455  *
6456  * Set and return the previous value for handling HTML omitted tags.
6457  *
6458  * Returns the last value for 0 for no handling, 1 for auto insertion.
6459  */
6460
6461 int
6462 htmlHandleOmittedElem(int val) {
6463     int old = htmlOmittedDefaultValue;
6464
6465     htmlOmittedDefaultValue = val;
6466     return(old);
6467 }
6468
6469 /**
6470  * htmlElementAllowedHere:
6471  * @parent: HTML parent element
6472  * @elt: HTML element
6473  *
6474  * Checks whether an HTML element may be a direct child of a parent element.
6475  * Note - doesn't check for deprecated elements
6476  *
6477  * Returns 1 if allowed; 0 otherwise.
6478  */
6479 int
6480 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6481   const char** p ;
6482
6483   if ( ! elt || ! parent || ! parent->subelts )
6484         return 0 ;
6485
6486   for ( p = parent->subelts; *p; ++p )
6487     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6488       return 1 ;
6489
6490   return 0 ;
6491 }
6492 /**
6493  * htmlElementStatusHere:
6494  * @parent: HTML parent element
6495  * @elt: HTML element
6496  *
6497  * Checks whether an HTML element may be a direct child of a parent element.
6498  * and if so whether it is valid or deprecated.
6499  *
6500  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6501  */
6502 htmlStatus
6503 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6504   if ( ! parent || ! elt )
6505     return HTML_INVALID ;
6506   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6507     return HTML_INVALID ;
6508
6509   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6510 }
6511 /**
6512  * htmlAttrAllowed:
6513  * @elt: HTML element
6514  * @attr: HTML attribute
6515  * @legacy: whether to allow deprecated attributes
6516  *
6517  * Checks whether an attribute is valid for an element
6518  * Has full knowledge of Required and Deprecated attributes
6519  *
6520  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6521  */
6522 htmlStatus
6523 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6524   const char** p ;
6525
6526   if ( !elt || ! attr )
6527         return HTML_INVALID ;
6528
6529   if ( elt->attrs_req )
6530     for ( p = elt->attrs_req; *p; ++p)
6531       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6532         return HTML_REQUIRED ;
6533
6534   if ( elt->attrs_opt )
6535     for ( p = elt->attrs_opt; *p; ++p)
6536       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6537         return HTML_VALID ;
6538
6539   if ( legacy && elt->attrs_depr )
6540     for ( p = elt->attrs_depr; *p; ++p)
6541       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6542         return HTML_DEPRECATED ;
6543
6544   return HTML_INVALID ;
6545 }
6546 /**
6547  * htmlNodeStatus:
6548  * @node: an htmlNodePtr in a tree
6549  * @legacy: whether to allow deprecated elements (YES is faster here
6550  *      for Element nodes)
6551  *
6552  * Checks whether the tree node is valid.  Experimental (the author
6553  *     only uses the HTML enhancements in a SAX parser)
6554  *
6555  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6556  *      legacy allowed) or htmlElementStatusHere (otherwise).
6557  *      for Attribute nodes, a return from htmlAttrAllowed
6558  *      for other nodes, HTML_NA (no checks performed)
6559  */
6560 htmlStatus
6561 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6562   if ( ! node )
6563     return HTML_INVALID ;
6564
6565   switch ( node->type ) {
6566     case XML_ELEMENT_NODE:
6567       return legacy
6568         ? ( htmlElementAllowedHere (
6569                 htmlTagLookup(node->parent->name) , node->name
6570                 ) ? HTML_VALID : HTML_INVALID )
6571         : htmlElementStatusHere(
6572                 htmlTagLookup(node->parent->name) ,
6573                 htmlTagLookup(node->name) )
6574         ;
6575     case XML_ATTRIBUTE_NODE:
6576       return htmlAttrAllowed(
6577         htmlTagLookup(node->parent->name) , node->name, legacy) ;
6578     default: return HTML_NA ;
6579   }
6580 }
6581 /************************************************************************
6582  *                                                                      *
6583  *      New set (2.6.0) of simpler and more flexible APIs               *
6584  *                                                                      *
6585  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Release @str with xmlFree() unless it is NULL or owned by the
 * dictionary held in the variable "dict" of the scope where the macro
 * is expanded (a NULL "dict" owns nothing).
 *
 * The expansion is a bare "if" statement that already ends with a
 * semicolon, and @str is evaluated more than once.
 */
#define DICT_FREE(str)                                          \
        if (((str) != NULL) &&                                  \
            ((dict == NULL) ||                                  \
             (!xmlDictOwns(dict, (const xmlChar *)(str)))))     \
            xmlFree((char *)(str));
6597
6598 /**
6599  * htmlCtxtReset:
6600  * @ctxt: an HTML parser context
6601  *
6602  * Reset a parser context
6603  */
6604 void
6605 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6606 {
6607     xmlParserInputPtr input;
6608     xmlDictPtr dict;
6609
6610     if (ctxt == NULL)
6611         return;
6612
6613     xmlInitParser();
6614     dict = ctxt->dict;
6615
6616     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6617         xmlFreeInputStream(input);
6618     }
6619     ctxt->inputNr = 0;
6620     ctxt->input = NULL;
6621
6622     ctxt->spaceNr = 0;
6623     if (ctxt->spaceTab != NULL) {
6624         ctxt->spaceTab[0] = -1;
6625         ctxt->space = &ctxt->spaceTab[0];
6626     } else {
6627         ctxt->space = NULL;
6628     }
6629
6630
6631     ctxt->nodeNr = 0;
6632     ctxt->node = NULL;
6633
6634     ctxt->nameNr = 0;
6635     ctxt->name = NULL;
6636
6637     DICT_FREE(ctxt->version);
6638     ctxt->version = NULL;
6639     DICT_FREE(ctxt->encoding);
6640     ctxt->encoding = NULL;
6641     DICT_FREE(ctxt->directory);
6642     ctxt->directory = NULL;
6643     DICT_FREE(ctxt->extSubURI);
6644     ctxt->extSubURI = NULL;
6645     DICT_FREE(ctxt->extSubSystem);
6646     ctxt->extSubSystem = NULL;
6647     if (ctxt->myDoc != NULL)
6648         xmlFreeDoc(ctxt->myDoc);
6649     ctxt->myDoc = NULL;
6650
6651     ctxt->standalone = -1;
6652     ctxt->hasExternalSubset = 0;
6653     ctxt->hasPErefs = 0;
6654     ctxt->html = 1;
6655     ctxt->external = 0;
6656     ctxt->instate = XML_PARSER_START;
6657     ctxt->token = 0;
6658
6659     ctxt->wellFormed = 1;
6660     ctxt->nsWellFormed = 1;
6661     ctxt->disableSAX = 0;
6662     ctxt->valid = 1;
6663     ctxt->vctxt.userData = ctxt;
6664     ctxt->vctxt.error = xmlParserValidityError;
6665     ctxt->vctxt.warning = xmlParserValidityWarning;
6666     ctxt->record_info = 0;
6667     ctxt->nbChars = 0;
6668     ctxt->checkIndex = 0;
6669     ctxt->inSubset = 0;
6670     ctxt->errNo = XML_ERR_OK;
6671     ctxt->depth = 0;
6672     ctxt->charset = XML_CHAR_ENCODING_NONE;
6673     ctxt->catalogs = NULL;
6674     xmlInitNodeInfoSeq(&ctxt->node_seq);
6675
6676     if (ctxt->attsDefault != NULL) {
6677         xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6678         ctxt->attsDefault = NULL;
6679     }
6680     if (ctxt->attsSpecial != NULL) {
6681         xmlHashFree(ctxt->attsSpecial, NULL);
6682         ctxt->attsSpecial = NULL;
6683     }
6684 }
6685
6686 /**
6687  * htmlCtxtUseOptions:
6688  * @ctxt: an HTML parser context
6689  * @options:  a combination of htmlParserOption(s)
6690  *
6691  * Applies the options to the parser context
6692  *
6693  * Returns 0 in case of success, the set of unknown or unimplemented options
6694  *         in case of error.
6695  */
6696 int
6697 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6698 {
6699     if (ctxt == NULL)
6700         return(-1);
6701
6702     if (options & HTML_PARSE_NOWARNING) {
6703         ctxt->sax->warning = NULL;
6704         ctxt->vctxt.warning = NULL;
6705         options -= XML_PARSE_NOWARNING;
6706         ctxt->options |= XML_PARSE_NOWARNING;
6707     }
6708     if (options & HTML_PARSE_NOERROR) {
6709         ctxt->sax->error = NULL;
6710         ctxt->vctxt.error = NULL;
6711         ctxt->sax->fatalError = NULL;
6712         options -= XML_PARSE_NOERROR;
6713         ctxt->options |= XML_PARSE_NOERROR;
6714     }
6715     if (options & HTML_PARSE_PEDANTIC) {
6716         ctxt->pedantic = 1;
6717         options -= XML_PARSE_PEDANTIC;
6718         ctxt->options |= XML_PARSE_PEDANTIC;
6719     } else
6720         ctxt->pedantic = 0;
6721     if (options & XML_PARSE_NOBLANKS) {
6722         ctxt->keepBlanks = 0;
6723         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6724         options -= XML_PARSE_NOBLANKS;
6725         ctxt->options |= XML_PARSE_NOBLANKS;
6726     } else
6727         ctxt->keepBlanks = 1;
6728     if (options & HTML_PARSE_RECOVER) {
6729         ctxt->recovery = 1;
6730         options -= HTML_PARSE_RECOVER;
6731     } else
6732         ctxt->recovery = 0;
6733     if (options & HTML_PARSE_COMPACT) {
6734         ctxt->options |= HTML_PARSE_COMPACT;
6735         options -= HTML_PARSE_COMPACT;
6736     }
6737     if (options & XML_PARSE_HUGE) {
6738         ctxt->options |= XML_PARSE_HUGE;
6739         options -= XML_PARSE_HUGE;
6740     }
6741     if (options & HTML_PARSE_NODEFDTD) {
6742         ctxt->options |= HTML_PARSE_NODEFDTD;
6743         options -= HTML_PARSE_NODEFDTD;
6744     }
6745     if (options & HTML_PARSE_IGNORE_ENC) {
6746         ctxt->options |= HTML_PARSE_IGNORE_ENC;
6747         options -= HTML_PARSE_IGNORE_ENC;
6748     }
6749     if (options & HTML_PARSE_NOIMPLIED) {
6750         ctxt->options |= HTML_PARSE_NOIMPLIED;
6751         options -= HTML_PARSE_NOIMPLIED;
6752     }
6753     ctxt->dictNames = 0;
6754     return (options);
6755 }
6756
6757 /**
6758  * htmlDoRead:
6759  * @ctxt:  an HTML parser context
6760  * @URL:  the base URL to use for the document
6761  * @encoding:  the document encoding, or NULL
6762  * @options:  a combination of htmlParserOption(s)
6763  * @reuse:  keep the context for reuse
6764  *
6765  * Common front-end for the htmlRead functions
6766  *
6767  * Returns the resulting document tree or NULL
6768  */
6769 static htmlDocPtr
6770 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6771           int options, int reuse)
6772 {
6773     htmlDocPtr ret;
6774
6775     htmlCtxtUseOptions(ctxt, options);
6776     ctxt->html = 1;
6777     if (encoding != NULL) {
6778         xmlCharEncodingHandlerPtr hdlr;
6779
6780         hdlr = xmlFindCharEncodingHandler(encoding);
6781         if (hdlr != NULL) {
6782             xmlSwitchToEncoding(ctxt, hdlr);
6783             if (ctxt->input->encoding != NULL)
6784               xmlFree((xmlChar *) ctxt->input->encoding);
6785             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6786         }
6787     }
6788     if ((URL != NULL) && (ctxt->input != NULL) &&
6789         (ctxt->input->filename == NULL))
6790         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6791     htmlParseDocument(ctxt);
6792     ret = ctxt->myDoc;
6793     ctxt->myDoc = NULL;
6794     if (!reuse) {
6795         if ((ctxt->dictNames) &&
6796             (ret != NULL) &&
6797             (ret->dict == ctxt->dict))
6798             ctxt->dict = NULL;
6799         xmlFreeParserCtxt(ctxt);
6800     }
6801     return (ret);
6802 }
6803
6804 /**
6805  * htmlReadDoc:
6806  * @cur:  a pointer to a zero terminated string
6807  * @URL:  the base URL to use for the document
6808  * @encoding:  the document encoding, or NULL
6809  * @options:  a combination of htmlParserOption(s)
6810  *
6811  * parse an XML in-memory document and build a tree.
6812  *
6813  * Returns the resulting document tree
6814  */
6815 htmlDocPtr
6816 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6817 {
6818     htmlParserCtxtPtr ctxt;
6819
6820     if (cur == NULL)
6821         return (NULL);
6822
6823     xmlInitParser();
6824     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6825     if (ctxt == NULL)
6826         return (NULL);
6827     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6828 }
6829
6830 /**
6831  * htmlReadFile:
6832  * @filename:  a file or URL
6833  * @encoding:  the document encoding, or NULL
6834  * @options:  a combination of htmlParserOption(s)
6835  *
6836  * parse an XML file from the filesystem or the network.
6837  *
6838  * Returns the resulting document tree
6839  */
6840 htmlDocPtr
6841 htmlReadFile(const char *filename, const char *encoding, int options)
6842 {
6843     htmlParserCtxtPtr ctxt;
6844
6845     xmlInitParser();
6846     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6847     if (ctxt == NULL)
6848         return (NULL);
6849     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6850 }
6851
6852 /**
6853  * htmlReadMemory:
6854  * @buffer:  a pointer to a char array
6855  * @size:  the size of the array
6856  * @URL:  the base URL to use for the document
6857  * @encoding:  the document encoding, or NULL
6858  * @options:  a combination of htmlParserOption(s)
6859  *
6860  * parse an XML in-memory document and build a tree.
6861  *
6862  * Returns the resulting document tree
6863  */
6864 htmlDocPtr
6865 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6866 {
6867     htmlParserCtxtPtr ctxt;
6868
6869     xmlInitParser();
6870     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6871     if (ctxt == NULL)
6872         return (NULL);
6873     htmlDefaultSAXHandlerInit();
6874     if (ctxt->sax != NULL)
6875         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6876     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6877 }
6878
6879 /**
6880  * htmlReadFd:
6881  * @fd:  an open file descriptor
6882  * @URL:  the base URL to use for the document
6883  * @encoding:  the document encoding, or NULL
6884  * @options:  a combination of htmlParserOption(s)
6885  *
6886  * parse an XML from a file descriptor and build a tree.
6887  *
6888  * Returns the resulting document tree
6889  */
6890 htmlDocPtr
6891 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6892 {
6893     htmlParserCtxtPtr ctxt;
6894     xmlParserInputBufferPtr input;
6895     xmlParserInputPtr stream;
6896
6897     if (fd < 0)
6898         return (NULL);
6899     xmlInitParser();
6900
6901     xmlInitParser();
6902     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6903     if (input == NULL)
6904         return (NULL);
6905     ctxt = xmlNewParserCtxt();
6906     if (ctxt == NULL) {
6907         xmlFreeParserInputBuffer(input);
6908         return (NULL);
6909     }
6910     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6911     if (stream == NULL) {
6912         xmlFreeParserInputBuffer(input);
6913         xmlFreeParserCtxt(ctxt);
6914         return (NULL);
6915     }
6916     inputPush(ctxt, stream);
6917     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6918 }
6919
6920 /**
6921  * htmlReadIO:
6922  * @ioread:  an I/O read function
6923  * @ioclose:  an I/O close function
6924  * @ioctx:  an I/O handler
6925  * @URL:  the base URL to use for the document
6926  * @encoding:  the document encoding, or NULL
6927  * @options:  a combination of htmlParserOption(s)
6928  *
6929  * parse an HTML document from I/O functions and source and build a tree.
6930  *
6931  * Returns the resulting document tree
6932  */
6933 htmlDocPtr
6934 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6935           void *ioctx, const char *URL, const char *encoding, int options)
6936 {
6937     htmlParserCtxtPtr ctxt;
6938     xmlParserInputBufferPtr input;
6939     xmlParserInputPtr stream;
6940
6941     if (ioread == NULL)
6942         return (NULL);
6943     xmlInitParser();
6944
6945     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6946                                          XML_CHAR_ENCODING_NONE);
6947     if (input == NULL) {
6948         if (ioclose != NULL)
6949             ioclose(ioctx);
6950         return (NULL);
6951     }
6952     ctxt = htmlNewParserCtxt();
6953     if (ctxt == NULL) {
6954         xmlFreeParserInputBuffer(input);
6955         return (NULL);
6956     }
6957     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6958     if (stream == NULL) {
6959         xmlFreeParserInputBuffer(input);
6960         xmlFreeParserCtxt(ctxt);
6961         return (NULL);
6962     }
6963     inputPush(ctxt, stream);
6964     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6965 }
6966
6967 /**
6968  * htmlCtxtReadDoc:
6969  * @ctxt:  an HTML parser context
6970  * @cur:  a pointer to a zero terminated string
6971  * @URL:  the base URL to use for the document
6972  * @encoding:  the document encoding, or NULL
6973  * @options:  a combination of htmlParserOption(s)
6974  *
6975  * parse an XML in-memory document and build a tree.
6976  * This reuses the existing @ctxt parser context
6977  *
6978  * Returns the resulting document tree
6979  */
6980 htmlDocPtr
6981 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6982                const char *URL, const char *encoding, int options)
6983 {
6984     xmlParserInputPtr stream;
6985
6986     if (cur == NULL)
6987         return (NULL);
6988     if (ctxt == NULL)
6989         return (NULL);
6990     xmlInitParser();
6991
6992     htmlCtxtReset(ctxt);
6993
6994     stream = xmlNewStringInputStream(ctxt, cur);
6995     if (stream == NULL) {
6996         return (NULL);
6997     }
6998     inputPush(ctxt, stream);
6999     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7000 }
7001
7002 /**
7003  * htmlCtxtReadFile:
7004  * @ctxt:  an HTML parser context
7005  * @filename:  a file or URL
7006  * @encoding:  the document encoding, or NULL
7007  * @options:  a combination of htmlParserOption(s)
7008  *
7009  * parse an XML file from the filesystem or the network.
7010  * This reuses the existing @ctxt parser context
7011  *
7012  * Returns the resulting document tree
7013  */
7014 htmlDocPtr
7015 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7016                 const char *encoding, int options)
7017 {
7018     xmlParserInputPtr stream;
7019
7020     if (filename == NULL)
7021         return (NULL);
7022     if (ctxt == NULL)
7023         return (NULL);
7024     xmlInitParser();
7025
7026     htmlCtxtReset(ctxt);
7027
7028     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7029     if (stream == NULL) {
7030         return (NULL);
7031     }
7032     inputPush(ctxt, stream);
7033     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7034 }
7035
7036 /**
7037  * htmlCtxtReadMemory:
7038  * @ctxt:  an HTML parser context
7039  * @buffer:  a pointer to a char array
7040  * @size:  the size of the array
7041  * @URL:  the base URL to use for the document
7042  * @encoding:  the document encoding, or NULL
7043  * @options:  a combination of htmlParserOption(s)
7044  *
7045  * parse an XML in-memory document and build a tree.
7046  * This reuses the existing @ctxt parser context
7047  *
7048  * Returns the resulting document tree
7049  */
7050 htmlDocPtr
7051 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7052                   const char *URL, const char *encoding, int options)
7053 {
7054     xmlParserInputBufferPtr input;
7055     xmlParserInputPtr stream;
7056
7057     if (ctxt == NULL)
7058         return (NULL);
7059     if (buffer == NULL)
7060         return (NULL);
7061     xmlInitParser();
7062
7063     htmlCtxtReset(ctxt);
7064
7065     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7066     if (input == NULL) {
7067         return(NULL);
7068     }
7069
7070     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7071     if (stream == NULL) {
7072         xmlFreeParserInputBuffer(input);
7073         return(NULL);
7074     }
7075
7076     inputPush(ctxt, stream);
7077     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7078 }
7079
7080 /**
7081  * htmlCtxtReadFd:
7082  * @ctxt:  an HTML parser context
7083  * @fd:  an open file descriptor
7084  * @URL:  the base URL to use for the document
7085  * @encoding:  the document encoding, or NULL
7086  * @options:  a combination of htmlParserOption(s)
7087  *
7088  * parse an XML from a file descriptor and build a tree.
7089  * This reuses the existing @ctxt parser context
7090  *
7091  * Returns the resulting document tree
7092  */
7093 htmlDocPtr
7094 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7095               const char *URL, const char *encoding, int options)
7096 {
7097     xmlParserInputBufferPtr input;
7098     xmlParserInputPtr stream;
7099
7100     if (fd < 0)
7101         return (NULL);
7102     if (ctxt == NULL)
7103         return (NULL);
7104     xmlInitParser();
7105
7106     htmlCtxtReset(ctxt);
7107
7108
7109     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7110     if (input == NULL)
7111         return (NULL);
7112     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7113     if (stream == NULL) {
7114         xmlFreeParserInputBuffer(input);
7115         return (NULL);
7116     }
7117     inputPush(ctxt, stream);
7118     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7119 }
7120
7121 /**
7122  * htmlCtxtReadIO:
7123  * @ctxt:  an HTML parser context
7124  * @ioread:  an I/O read function
7125  * @ioclose:  an I/O close function
7126  * @ioctx:  an I/O handler
7127  * @URL:  the base URL to use for the document
7128  * @encoding:  the document encoding, or NULL
7129  * @options:  a combination of htmlParserOption(s)
7130  *
7131  * parse an HTML document from I/O functions and source and build a tree.
7132  * This reuses the existing @ctxt parser context
7133  *
7134  * Returns the resulting document tree
7135  */
7136 htmlDocPtr
7137 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7138               xmlInputCloseCallback ioclose, void *ioctx,
7139               const char *URL,
7140               const char *encoding, int options)
7141 {
7142     xmlParserInputBufferPtr input;
7143     xmlParserInputPtr stream;
7144
7145     if (ioread == NULL)
7146         return (NULL);
7147     if (ctxt == NULL)
7148         return (NULL);
7149     xmlInitParser();
7150
7151     htmlCtxtReset(ctxt);
7152
7153     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7154                                          XML_CHAR_ENCODING_NONE);
7155     if (input == NULL) {
7156         if (ioclose != NULL)
7157             ioclose(ioctx);
7158         return (NULL);
7159     }
7160     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7161     if (stream == NULL) {
7162         xmlFreeParserInputBuffer(input);
7163         return (NULL);
7164     }
7165     inputPush(ctxt, stream);
7166     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7167 }
7168
7169 #define bottom_HTMLparser
7170 #include "elfgcchack.h"
7171 #endif /* LIBXML_HTML_ENABLED */