#include <libxml/globals.h>
#include <libxml/uri.h>
+#include "buf.h"
+#include "enc.h"
+
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100
*
* Handle a fatal parser error, i.e. violating Well-Formedness constraints
*/
-static void
+static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
const char *msg, const xmlChar *str1, const xmlChar *str2)
{
*
* Handle a fatal parser error, i.e. violating Well-Formedness constraints
*/
-static void
+static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
const char *msg, int val)
{
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
#define CUR_PTR ctxt->input->cur
+#define BASE_PTR ctxt->input->base
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
"div", "p", "head", NULL,
"noscript", "p", NULL,
"center", "font", "b", "i", "p", "head", NULL,
-"a", "a", NULL,
+"a", "a", "head", NULL,
"caption", "p", NULL,
"colgroup", "caption", "colgroup", "col", "p", NULL,
"col", "caption", "col", "p", NULL,
"option", "option", NULL,
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
"pre", "listing", "xmp", "a", NULL,
+/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
+"tt", "head", NULL,
+"i", "head", NULL,
+"b", "head", NULL,
+"u", "head", NULL,
+"s", "head", NULL,
+"strike", "head", NULL,
+"big", "head", NULL,
+"small", "head", NULL,
+
+"em", "head", NULL,
+"strong", "head", NULL,
+"dfn", "head", NULL,
+"code", "head", NULL,
+"samp", "head", NULL,
+"kbd", "head", NULL,
+"var", "head", NULL,
+"cite", "head", NULL,
+"abbr", "head", NULL,
+"acronym", "head", NULL,
+
+/* "a" */
+"img", "head", NULL,
+/* "applet" */
+/* "embed" */
+/* "object" */
+"font", "head", NULL,
+/* "basefont" */
+"br", "head", NULL,
+/* "script" */
+"map", "head", NULL,
+"q", "head", NULL,
+"sub", "head", NULL,
+"sup", "head", NULL,
+"span", "head", NULL,
+"bdo", "head", NULL,
+"iframe", "head", NULL,
NULL
};
"onfocus",
"onblur",
"onsubmit",
- "onrest",
+ "onreset",
"onchange",
"onselect"
};
(*in == '_') || (*in == '-') ||
(*in == ':') || (*in == '.'))
in++;
+
+ if (in == ctxt->input->end)
+ return(NULL);
+
if ((*in > 0) && (*in < 0x80)) {
count = in - ctxt->input->cur;
ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
int len = 0, l;
int c;
int count = 0;
+ const xmlChar *base = ctxt->input->base;
/*
* Handler for more complex cases
len += l;
NEXTL(l);
c = CUR_CHAR(l);
+ if (ctxt->input->base != base) {
+ /*
+ * We changed encoding from an unknown encoding
+ * Input buffer changed location, so we better start again
+ */
+ return(htmlParseNameComplex(ctxt));
+ }
}
+
+ if (ctxt->input->base > ctxt->input->cur - len)
+ return(NULL);
+
return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
}
static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
- const xmlChar *q;
+ size_t len = 0, startPosition = 0;
xmlChar *ret = NULL;
if (CUR == '"') {
NEXT;
- q = CUR_PTR;
- while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
+
+ if (CUR_PTR < BASE_PTR)
+ return(ret);
+ startPosition = CUR_PTR - BASE_PTR;
+
+ while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
NEXT;
+ len++;
+ }
if (!IS_CHAR_CH(CUR)) {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
"Unfinished SystemLiteral\n", NULL, NULL);
} else {
- ret = xmlStrndup(q, CUR_PTR - q);
+ ret = xmlStrndup((BASE_PTR+startPosition), len);
NEXT;
}
} else if (CUR == '\'') {
NEXT;
- q = CUR_PTR;
- while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
+
+ if (CUR_PTR < BASE_PTR)
+ return(ret);
+ startPosition = CUR_PTR - BASE_PTR;
+
+ while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
NEXT;
+ len++;
+ }
if (!IS_CHAR_CH(CUR)) {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
"Unfinished SystemLiteral\n", NULL, NULL);
} else {
- ret = xmlStrndup(q, CUR_PTR - q);
+ ret = xmlStrndup((BASE_PTR+startPosition), len);
NEXT;
}
} else {
static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
- const xmlChar *q;
+ size_t len = 0, startPosition = 0;
xmlChar *ret = NULL;
/*
* Name ::= (Letter | '_') (NameChar)*
*/
if (CUR == '"') {
NEXT;
- q = CUR_PTR;
- while (IS_PUBIDCHAR_CH(CUR)) NEXT;
+
+ if (CUR_PTR < BASE_PTR)
+ return(ret);
+ startPosition = CUR_PTR - BASE_PTR;
+
+ while (IS_PUBIDCHAR_CH(CUR)) {
+ len++;
+ NEXT;
+ }
+
if (CUR != '"') {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
"Unfinished PubidLiteral\n", NULL, NULL);
} else {
- ret = xmlStrndup(q, CUR_PTR - q);
+ ret = xmlStrndup((BASE_PTR + startPosition), len);
NEXT;
}
} else if (CUR == '\'') {
NEXT;
- q = CUR_PTR;
- while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
- NEXT;
+
+ if (CUR_PTR < BASE_PTR)
+ return(ret);
+ startPosition = CUR_PTR - BASE_PTR;
+
+ while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
+ len++;
+ NEXT;
+ }
+
if (CUR != '\'') {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
"Unfinished PubidLiteral\n", NULL, NULL);
} else {
- ret = xmlStrndup(q, CUR_PTR - q);
+ ret = xmlStrndup((BASE_PTR + startPosition), len);
NEXT;
}
} else {
/**
- * htmlParseCharData:
+ * htmlParseCharDataInternal:
* @ctxt: an HTML parser context
+ * @readahead: optional read ahead character in ascii range
*
* parse a CharData section.
* if we are within a CDATA section ']]>' marks an end of section.
*/
static void
-htmlParseCharData(htmlParserCtxtPtr ctxt) {
- xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
+htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
+ xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
int nbchar = 0;
int cur, l;
int chunk = 0;
+ if (readahead)
+ buf[nbchar++] = readahead;
+
SHRINK;
cur = CUR_CHAR(l);
while (((cur != '<') || (ctxt->token == '<')) &&
*/
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (areBlanks(ctxt, buf, nbchar)) {
- if (ctxt->sax->ignorableWhitespace != NULL)
- ctxt->sax->ignorableWhitespace(ctxt->userData,
- buf, nbchar);
+ if (ctxt->keepBlanks) {
+ if (ctxt->sax->characters != NULL)
+ ctxt->sax->characters(ctxt->userData, buf, nbchar);
+ } else {
+ if (ctxt->sax->ignorableWhitespace != NULL)
+ ctxt->sax->ignorableWhitespace(ctxt->userData,
+ buf, nbchar);
+ }
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
*/
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (areBlanks(ctxt, buf, nbchar)) {
- if (ctxt->sax->ignorableWhitespace != NULL)
- ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
+ if (ctxt->keepBlanks) {
+ if (ctxt->sax->characters != NULL)
+ ctxt->sax->characters(ctxt->userData, buf, nbchar);
+ } else {
+ if (ctxt->sax->ignorableWhitespace != NULL)
+ ctxt->sax->ignorableWhitespace(ctxt->userData,
+ buf, nbchar);
+ }
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
}
/**
+ * htmlParseCharData:
+ * @ctxt: an HTML parser context
+ *
+ * parse a CharData section.
+ * if we are within a CDATA section ']]>' marks an end of section.
+ *
+ * Thin wrapper: forwards @ctxt to htmlParseCharDataInternal() with a
+ * readahead argument of 0, i.e. no read-ahead character is supplied.
+ *
+ * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+ */
+
+static void
+htmlParseCharData(htmlParserCtxtPtr ctxt) {
+ htmlParseCharDataInternal(ctxt, 0);
+}
+
+/**
* htmlParseExternalID:
* @ctxt: an HTML parser context
* @publicID: a xmlChar** receiving PubidLiteral
ctxt->instate = state;
return;
}
+ len = 0;
+ buf[len] = 0;
q = CUR_CHAR(ql);
+ if (!IS_CHAR(q))
+ goto unfinished;
NEXTL(ql);
r = CUR_CHAR(rl);
+ if (!IS_CHAR(r))
+ goto unfinished;
NEXTL(rl);
cur = CUR_CHAR(l);
- len = 0;
while (IS_CHAR(cur) &&
((cur != '>') ||
(r != '-') || (q != '-'))) {
}
}
buf[len] = 0;
- if (!IS_CHAR(cur)) {
- htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
- "Comment not terminated \n<!--%.50s\n", buf, NULL);
- xmlFree(buf);
- } else {
+ if (IS_CHAR(cur)) {
NEXT;
if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
(!ctxt->disableSAX))
ctxt->sax->comment(ctxt->userData, buf);
xmlFree(buf);
+ ctxt->instate = state;
+ return;
}
- ctxt->instate = state;
+
+unfinished:
+ htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
+ "Comment not terminated \n<!--%.50s\n", buf, NULL);
+ xmlFree(buf);
}
/**
* convert as much as possible to the parser reading buffer.
*/
processed = ctxt->input->cur - ctxt->input->base;
- xmlBufferShrink(ctxt->input->buf->buffer, processed);
- nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
- ctxt->input->buf->buffer,
- ctxt->input->buf->raw);
+ xmlBufShrink(ctxt->input->buf->buffer, processed);
+ nbchars = xmlCharEncInput(ctxt->input->buf, 1);
if (nbchars < 0) {
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
"htmlCheckEncoding: encoder error\n",
NULL, NULL);
}
- ctxt->input->base =
- ctxt->input->cur = ctxt->input->buf->buffer->content;
- ctxt->input->end =
- &ctxt->input->base[ctxt->input->buf->buffer->use];
+ xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
}
}
}
int i;
int discardtag = 0;
- if (ctxt->instate == XML_PARSER_EOF)
- return(-1);
if ((ctxt == NULL) || (ctxt->input == NULL)) {
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
"htmlParseStartTag: context error\n", NULL, NULL);
return -1;
}
+ if (ctxt->instate == XML_PARSER_EOF)
+ return(-1);
if (CUR != '<') return -1;
NEXT;
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
+ /* in recovery mode, preserve the text of classic misconstructs */
+ if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
+ (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
+ htmlParseCharDataInternal(ctxt, '<');
+ return(-1);
+ }
+
+
/* Dump the bogus tag like browsers do */
while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
(ctxt->instate != XML_PARSER_EOF))
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
const xmlChar *name;
const htmlElemDesc * info;
- htmlParserNodeInfo node_info;
+ htmlParserNodeInfo node_info = { 0, };
int failed;
if ((ctxt == NULL) || (ctxt->input == NULL)) {
input->filename = NULL;
input->buf = buf;
- input->base = input->buf->buffer->content;
- input->cur = input->buf->buffer->content;
- input->end = &input->buf->buffer->content[input->buf->buffer->use];
+ xmlBufResetInput(buf->buffer, input);
inputPush(ctxt, input);
return(ctxt);
buf = in->base;
len = in->length;
} else {
- buf = in->buf->buffer->content;
- len = in->buf->buffer->use;
+ buf = xmlBufContent(in->buf->buffer);
+ len = xmlBufUse(in->buf->buffer);
}
/* take into account the sequence length */
* @stop: Array of chars, which stop the lookup.
* @stopLen: Length of stop-Array
*
- * Try to find if any char of the stop-Array is available in the input
+ * Try to find if any char of the stop-Array is available in the input
* stream.
* This function has a side effect of (possibly) incrementing ctxt->checkIndex
* to avoid rescanning sequences of bytes, it DOES change the state of the
* parser, do not use liberally.
*
- * Returns the index to the current parsing point if a stopChar
+ * Returns the index to the current parsing point if a stopChar
* is available, -1 otherwise.
*/
static int
buf = in->base;
len = in->length;
} else {
- buf = in->buf->buffer->content;
- len = in->buf->buffer->use;
+ buf = xmlBufContent(in->buf->buffer);
+ len = xmlBufUse(in->buf->buffer);
}
for (; base < len; base++) {
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = in->buf->buffer->use - (in->cur - in->base);
+ avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
if ((avail == 0) && (terminate)) {
htmlAutoCloseOnEnd(ctxt);
if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = in->buf->buffer->use - (in->cur - in->base);
+ avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
}
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
ctxt->sax->setDocumentLocator(ctxt->userData,
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = in->buf->buffer->use - (in->cur - in->base);
+ avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
/*
* no chars in buffer
*/
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = in->buf->buffer->use - (in->cur - in->base);
+ avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
if (avail < 2)
goto done;
cur = in->cur[0];
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = in->buf->buffer->use - (in->cur - in->base);
+ avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
if (avail < 1)
goto done;
cur = in->cur[0];
if ((cur != '<') && (cur != '&')) {
if (ctxt->sax != NULL) {
if (IS_BLANK_CH(cur)) {
- if (ctxt->sax->ignorableWhitespace != NULL)
- ctxt->sax->ignorableWhitespace(
- ctxt->userData, &cur, 1);
+ if (ctxt->keepBlanks) {
+ if (ctxt->sax->characters != NULL)
+ ctxt->sax->characters(
+ ctxt->userData, &in->cur[0], 1);
+ } else {
+ if (ctxt->sax->ignorableWhitespace != NULL)
+ ctxt->sax->ignorableWhitespace(
+ ctxt->userData, &in->cur[0], 1);
+ }
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(
- ctxt->userData, &cur, 1);
+ ctxt->userData, &in->cur[0], 1);
}
}
ctxt->token = 0;
ctxt->sax->endDocument(ctxt->userData);
}
}
- if ((ctxt->myDoc != NULL) &&
+ if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
(ctxt->instate == XML_PARSER_EPILOG))) {
xmlDtdPtr dtd;
}
if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
- int base = ctxt->input->base - ctxt->input->buf->buffer->content;
- int cur = ctxt->input->cur - ctxt->input->base;
+ size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
+ size_t cur = ctxt->input->cur - ctxt->input->base;
int res;
res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
ctxt->disableSAX = 1;
return (XML_PARSER_EOF);
}
- ctxt->input->base = ctxt->input->buf->buffer->content + base;
- ctxt->input->cur = ctxt->input->base + cur;
- ctxt->input->end =
- &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
+ xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
#endif
if ((in->encoder != NULL) && (in->buffer != NULL) &&
(in->raw != NULL)) {
int nbchars;
+ size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
+ size_t current = ctxt->input->cur - ctxt->input->base;
- nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
+ nbchars = xmlCharEncInput(in, terminate);
if (nbchars < 0) {
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
"encoder error\n", NULL, NULL);
return(XML_ERR_INVALID_ENCODING);
}
+ xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
}
}
}
inputStream->filename = (char *)
xmlCanonicPath((const xmlChar *) filename);
inputStream->buf = buf;
- inputStream->base = inputStream->buf->buffer->content;
- inputStream->cur = inputStream->buf->buffer->content;
- inputStream->end =
- &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
+ xmlBufResetInput(buf->buffer, inputStream);
inputPush(ctxt, inputStream);
if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
(ctxt->input->buf != NULL)) {
- int base = ctxt->input->base - ctxt->input->buf->buffer->content;
- int cur = ctxt->input->cur - ctxt->input->base;
+ size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
+ size_t cur = ctxt->input->cur - ctxt->input->base;
xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
- ctxt->input->base = ctxt->input->buf->buffer->content + base;
- ctxt->input->cur = ctxt->input->base + cur;
- ctxt->input->end =
- &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
+ xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
#endif
/* set encoding */
if (encoding) {
- content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
- if (content) {
- strcpy ((char *)content, (char *)content_line);
- strcat ((char *)content, (char *)encoding);
- htmlCheckEncoding (ctxt, content);
- xmlFree (content);
+ size_t l = strlen(encoding);
+
+ if (l < 1000) {
+ content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
+ if (content) {
+ strcpy ((char *)content, (char *)content_line);
+ strcat ((char *)content, (char *)encoding);
+ htmlCheckEncoding (ctxt, content);
+ xmlFree (content);
+ }
}
}
* DICT_FREE:
* @str: a string
*
- * Free a string if it is not owned by the "dict" dictionnary in the
+ * Free a string if it is not owned by the "dict" dictionary in the
* current scope
*/
#define DICT_FREE(str) \
if (fd < 0)
return (NULL);
+ xmlInitParser();
xmlInitParser();
input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
return (NULL);
if (ctxt == NULL)
return (NULL);
+ xmlInitParser();
htmlCtxtReset(ctxt);
return (NULL);
if (ctxt == NULL)
return (NULL);
+ xmlInitParser();
htmlCtxtReset(ctxt);
return (NULL);
if (buffer == NULL)
return (NULL);
+ xmlInitParser();
htmlCtxtReset(ctxt);
return (NULL);
if (ctxt == NULL)
return (NULL);
+ xmlInitParser();
htmlCtxtReset(ctxt);
return (NULL);
if (ctxt == NULL)
return (NULL);
+ xmlInitParser();
htmlCtxtReset(ctxt);