2 * testHTML.c : a small tester program for HTML input.
4 * See Copyright for the status of this software.
11 #ifdef LIBXML_HTML_ENABLED
17 #ifdef HAVE_SYS_TYPES_H
18 #include <sys/types.h>
20 #ifdef HAVE_SYS_STAT_H
33 #include <libxml/xmlmemory.h>
34 #include <libxml/HTMLparser.h>
35 #include <libxml/HTMLtree.h>
36 #include <libxml/debugXML.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/globals.h>
40 #ifdef LIBXML_DEBUG_ENABLED
45 static int repeat = 0;
47 #ifdef LIBXML_PUSH_ENABLED
49 #endif /* LIBXML_PUSH_ENABLED */
50 static char *encoding = NULL;
51 static int options = 0;
53 static xmlSAXHandler emptySAXHandlerStruct = {
54 NULL, /* internalSubset */
55 NULL, /* isStandalone */
56 NULL, /* hasInternalSubset */
57 NULL, /* hasExternalSubset */
58 NULL, /* resolveEntity */
60 NULL, /* entityDecl */
61 NULL, /* notationDecl */
62 NULL, /* attributeDecl */
63 NULL, /* elementDecl */
64 NULL, /* unparsedEntityDecl */
65 NULL, /* setDocumentLocator */
66 NULL, /* startDocument */
67 NULL, /* endDocument */
68 NULL, /* startElement */
69 NULL, /* endElement */
71 NULL, /* characters */
72 NULL, /* ignorableWhitespace */
73 NULL, /* processingInstruction */
75 NULL, /* xmlParserWarning */
76 NULL, /* xmlParserError */
77 NULL, /* fatalError */
78 NULL, /* getParameterEntity */
79 NULL, /* cdataBlock */
80 NULL, /* externalSubset */
83 NULL, /* startElementNsSAX2Func */
84 NULL, /* endElementNsSAX2Func */
85 NULL /* xmlStructuredErrorFunc */
88 static xmlSAXHandlerPtr emptySAXHandler = &emptySAXHandlerStruct;
89 extern xmlSAXHandlerPtr debugSAXHandler;
91 /************************************************************************
95 ************************************************************************/
99 * @ctxt: An XML parser context
101 * Is this document tagged standalone ?
106 isStandaloneDebug(void *ctx ATTRIBUTE_UNUSED)
108 fprintf(stdout, "SAX.isStandalone()\n");
113 * hasInternalSubsetDebug:
114 * @ctxt: An XML parser context
116 * Does this document have an internal subset?
121 hasInternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED)
123 fprintf(stdout, "SAX.hasInternalSubset()\n");
128 * hasExternalSubsetDebug:
129 * @ctxt: An XML parser context
131 * Does this document have an external subset?
136 hasExternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED)
138 fprintf(stdout, "SAX.hasExternalSubset()\n");
143 * internalSubsetDebug:
144 * @ctxt: An XML parser context
146 * Reports the internal subset declaration (name, external ID, system ID).
149 internalSubsetDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name,
150 const xmlChar *ExternalID, const xmlChar *SystemID)
152 fprintf(stdout, "SAX.internalSubset(%s,", name);
153 if (ExternalID == NULL)
154 fprintf(stdout, " ,");
156 fprintf(stdout, " %s,", ExternalID);
157 if (SystemID == NULL)
158 fprintf(stdout, " )\n");
160 fprintf(stdout, " %s)\n", SystemID);
164 * resolveEntityDebug:
165 * @ctxt: An XML parser context
166 * @publicId: The public ID of the entity
167 * @systemId: The system ID of the entity
169 * Special entity resolver, better left to the parser, which has
170 * more context than the application layer.
171 * The default behaviour is to NOT resolve the entities, in that case
172 * the ENTITY_REF nodes are built in the structure (and the parameter
175 * Returns the xmlParserInputPtr if inlined or NULL for DOM behaviour.
177 static xmlParserInputPtr
178 resolveEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *publicId, const xmlChar *systemId)
180 /* xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr) ctx; */
183 fprintf(stdout, "SAX.resolveEntity(");
184 if (publicId != NULL)
185 fprintf(stdout, "%s", (char *)publicId);
187 fprintf(stdout, " ");
188 if (systemId != NULL)
189 fprintf(stdout, ", %s)\n", (char *)systemId);
191 fprintf(stdout, ", )\n");
193 if (systemId != NULL) {
194 return(xmlNewInputFromFile(ctxt, (char *) systemId));
202 * @ctxt: An XML parser context
203 * @name: The entity name
205 * Get an entity by name
207 * Returns an xmlEntityPtr for the entity, or NULL if none is provided.
210 getEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
212 fprintf(stdout, "SAX.getEntity(%s)\n", name);
217 * getParameterEntityDebug:
218 * @ctxt: An XML parser context
219 * @name: The entity name
221 * Get a parameter entity by name
223 * Returns an xmlEntityPtr for the parameter entity, or NULL if none is provided.
226 getParameterEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
228 fprintf(stdout, "SAX.getParameterEntity(%s)\n", name);
235 * @ctxt: An XML parser context
236 * @name: the entity name
237 * @type: the entity type
238 * @publicId: The public ID of the entity
239 * @systemId: The system ID of the entity
240 * @content: the entity value (without processing).
242 * An entity definition has been parsed
245 entityDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, int type,
246 const xmlChar *publicId, const xmlChar *systemId, xmlChar *content)
248 fprintf(stdout, "SAX.entityDecl(%s, %d, %s, %s, %s)\n",
249 name, type, publicId, systemId, content);
253 * attributeDeclDebug:
254 * @ctxt: An XML parser context
255 * @name: the attribute name
256 * @type: the attribute type
258 * An attribute definition has been parsed
261 attributeDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *elem, const xmlChar *name,
262 int type, int def, const xmlChar *defaultValue,
263 xmlEnumerationPtr tree ATTRIBUTE_UNUSED)
265 fprintf(stdout, "SAX.attributeDecl(%s, %s, %d, %d, %s, ...)\n",
266 elem, name, type, def, defaultValue);
271 * @ctxt: An XML parser context
272 * @name: the element name
273 * @type: the element type
274 * @content: the element value (without processing).
276 * An element definition has been parsed
279 elementDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, int type,
280 xmlElementContentPtr content ATTRIBUTE_UNUSED)
282 fprintf(stdout, "SAX.elementDecl(%s, %d, ...)\n",
288 * @ctxt: An XML parser context
289 * @name: The name of the notation
290 * @publicId: The public ID of the entity
291 * @systemId: The system ID of the entity
293 * What to do when a notation declaration has been parsed.
296 notationDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name,
297 const xmlChar *publicId, const xmlChar *systemId)
299 fprintf(stdout, "SAX.notationDecl(%s, %s, %s)\n",
300 (char *) name, (char *) publicId, (char *) systemId);
304 * unparsedEntityDeclDebug:
305 * @ctxt: An XML parser context
306 * @name: The name of the entity
307 * @publicId: The public ID of the entity
308 * @systemId: The system ID of the entity
309 * @notationName: the name of the notation
311 * What to do when an unparsed entity declaration is parsed
314 unparsedEntityDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name,
315 const xmlChar *publicId, const xmlChar *systemId,
316 const xmlChar *notationName)
318 fprintf(stdout, "SAX.unparsedEntityDecl(%s, %s, %s, %s)\n",
319 (char *) name, (char *) publicId, (char *) systemId,
320 (char *) notationName);
324 * setDocumentLocatorDebug:
325 * @ctxt: An XML parser context
326 * @loc: A SAX Locator
328 * Receive the document locator at startup (actually xmlDefaultSAXLocator).
329 * Everything is available on the context, so this is useless in our case.
332 setDocumentLocatorDebug(void *ctx ATTRIBUTE_UNUSED, xmlSAXLocatorPtr loc ATTRIBUTE_UNUSED)
334 fprintf(stdout, "SAX.setDocumentLocator()\n");
338 * startDocumentDebug:
339 * @ctxt: An XML parser context
341 * Called when the document starts being processed.
344 startDocumentDebug(void *ctx ATTRIBUTE_UNUSED)
346 fprintf(stdout, "SAX.startDocument()\n");
351 * @ctxt: An XML parser context
353 * called when the document end has been detected.
356 endDocumentDebug(void *ctx ATTRIBUTE_UNUSED)
358 fprintf(stdout, "SAX.endDocument()\n");
363 * @ctxt: An XML parser context
364 * @name: The element name
366 * called when an opening tag has been processed.
369 startElementDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, const xmlChar **atts)
373 fprintf(stdout, "SAX.startElement(%s", (char *) name);
375 for (i = 0;(atts[i] != NULL);i++) {
376 fprintf(stdout, ", %s", atts[i++]);
377 if (atts[i] != NULL) {
378 unsigned char output[40];
379 const unsigned char *att = atts[i];
381 fprintf(stdout, "='");
382 while ((attlen = strlen((char*)att)) > 0) {
383 outlen = sizeof output - 1;
384 htmlEncodeEntities(output, &outlen, att, &attlen, '\'');
386 fprintf(stdout, "%s", (char *) output);
389 fprintf(stdout, "'");
393 fprintf(stdout, ")\n");
398 * @ctxt: An XML parser context
399 * @name: The element name
401 * called when the end of an element has been detected.
404 endElementDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
406 fprintf(stdout, "SAX.endElement(%s)\n", (char *) name);
411 * @ctxt: An XML parser context
412 * @ch: a xmlChar string
413 * @len: the number of xmlChar
415 * receiving some chars from the parser.
416 * Question: how much at a time ???
419 charactersDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
421 unsigned char output[40];
422 int inlen = len, outlen = 30;
424 htmlEncodeEntities(output, &outlen, ch, &inlen, 0);
427 fprintf(stdout, "SAX.characters(%s, %d)\n", output, len);
432 * @ctxt: An XML parser context
433 * @ch: a xmlChar string
434 * @len: the number of xmlChar
436 * receiving some cdata chars from the parser.
437 * Question: how much at a time ???
440 cdataDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
442 unsigned char output[40];
443 int inlen = len, outlen = 30;
445 htmlEncodeEntities(output, &outlen, ch, &inlen, 0);
448 fprintf(stdout, "SAX.cdata(%s, %d)\n", output, len);
453 * @ctxt: An XML parser context
454 * @name: The entity name
456 * called when an entity reference is detected.
459 referenceDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
461 fprintf(stdout, "SAX.reference(%s)\n", name);
465 * ignorableWhitespaceDebug:
466 * @ctxt: An XML parser context
467 * @ch: a xmlChar string
468 * @start: the first char in the string
469 * @len: the number of xmlChar
471 * Receiving some ignorable whitespace from the parser.
472 * Question: how much at a time ???
475 ignorableWhitespaceDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
480 for (i = 0;(i<len) && (i < 30);i++)
484 fprintf(stdout, "SAX.ignorableWhitespace(%s, %d)\n", output, len);
488 * processingInstructionDebug:
489 * @ctxt: An XML parser context
490 * @target: the target name
491 * @data: the PI data
492 * @len: the number of xmlChar
494 * A processing instruction has been parsed.
497 processingInstructionDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *target,
500 fprintf(stdout, "SAX.processingInstruction(%s, %s)\n",
501 (char *) target, (char *) data);
506 * @ctxt: An XML parser context
507 * @value: the comment content
509 * A comment has been parsed.
512 commentDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *value)
514 fprintf(stdout, "SAX.comment(%s)\n", value);
519 * @ctxt: An XML parser context
520 * @msg: the message to display/transmit
521 * @...: extra parameters for the message display
523 * Display and format a warning message; gives file, line, position and
527 warningDebug(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...)
532 fprintf(stdout, "SAX.warning: ");
533 vfprintf(stdout, msg, args);
539 * @ctxt: An XML parser context
540 * @msg: the message to display/transmit
541 * @...: extra parameters for the message display
543 * Display and format an error message; gives file, line, position and
547 errorDebug(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...)
552 fprintf(stdout, "SAX.error: ");
553 vfprintf(stdout, msg, args);
559 * @ctxt: An XML parser context
560 * @msg: the message to display/transmit
561 * @...: extra parameters for the message display
563 * Display and format a fatalError message; gives file, line, position and
567 fatalErrorDebug(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...)
572 fprintf(stdout, "SAX.fatalError: ");
573 vfprintf(stdout, msg, args);
577 static xmlSAXHandler debugSAXHandlerStruct = {
580 hasInternalSubsetDebug,
581 hasExternalSubsetDebug,
588 unparsedEntityDeclDebug,
589 setDocumentLocatorDebug,
596 ignorableWhitespaceDebug,
597 processingInstructionDebug,
602 getParameterEntityDebug,
612 xmlSAXHandlerPtr debugSAXHandler = &debugSAXHandlerStruct;
613 /************************************************************************
617 ************************************************************************/
620 parseSAXFile(char *filename) {
621 htmlDocPtr doc = NULL;
624 * Empty callbacks for checking
626 #ifdef LIBXML_PUSH_ENABLED
630 #if defined(_WIN32) || defined (__DJGPP__) && !defined (__CYGWIN__)
631 f = fopen(filename, "rb");
633 f = fopen(filename, "r");
638 htmlParserCtxtPtr ctxt;
642 res = fread(chars, 1, 4, f);
644 ctxt = htmlCreatePushParserCtxt(emptySAXHandler, NULL,
645 chars, res, filename, XML_CHAR_ENCODING_NONE);
646 while ((res = fread(chars, 1, size, f)) > 0) {
647 htmlParseChunk(ctxt, chars, res, 0);
649 htmlParseChunk(ctxt, chars, 0, 1);
651 htmlFreeParserCtxt(ctxt);
654 fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
660 #if defined(_WIN32) || defined (__DJGPP__) && !defined (__CYGWIN__)
661 f = fopen(filename, "rb");
663 f = fopen(filename, "r");
668 htmlParserCtxtPtr ctxt;
672 res = fread(chars, 1, 4, f);
674 ctxt = htmlCreatePushParserCtxt(debugSAXHandler, NULL,
675 chars, res, filename, XML_CHAR_ENCODING_NONE);
676 while ((res = fread(chars, 1, size, f)) > 0) {
677 htmlParseChunk(ctxt, chars, res, 0);
679 htmlParseChunk(ctxt, chars, 0, 1);
681 htmlFreeParserCtxt(ctxt);
684 fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
691 #endif /* LIBXML_PUSH_ENABLED */
692 doc = htmlSAXParseFile(filename, NULL, emptySAXHandler, NULL);
694 fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
702 doc = htmlSAXParseFile(filename, NULL, debugSAXHandler, NULL);
704 fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
708 #ifdef LIBXML_PUSH_ENABLED
710 #endif /* LIBXML_PUSH_ENABLED */
714 parseAndPrintFile(char *filename) {
715 htmlDocPtr doc = NULL;
718 * build an HTML tree from a string;
720 #ifdef LIBXML_PUSH_ENABLED
724 #if defined(_WIN32) || defined (__DJGPP__) && !defined (__CYGWIN__)
725 f = fopen(filename, "rb");
727 f = fopen(filename, "r");
732 htmlParserCtxtPtr ctxt;
736 res = fread(chars, 1, 4, f);
738 ctxt = htmlCreatePushParserCtxt(NULL, NULL,
739 chars, res, filename, XML_CHAR_ENCODING_NONE);
740 while ((res = fread(chars, 1, size, f)) > 0) {
741 htmlParseChunk(ctxt, chars, res, 0);
743 htmlParseChunk(ctxt, chars, 0, 1);
745 htmlFreeParserCtxt(ctxt);
750 doc = htmlReadFile(filename, NULL, options);
753 doc = htmlReadFile(filename,NULL,options);
756 xmlGenericError(xmlGenericErrorContext,
757 "Could not parse %s\n", filename);
760 #ifdef LIBXML_TREE_ENABLED
762 * test intermediate copy if needed.
768 doc = xmlCopyDoc(doc, 1);
773 #ifdef LIBXML_OUTPUT_ENABLED
778 #ifdef LIBXML_DEBUG_ENABLED
781 htmlSaveFileEnc("-", doc, encoding);
783 htmlDocDump(stdout, doc);
785 xmlDebugDumpDocument(stdout, doc);
788 htmlSaveFileEnc("-", doc, encoding);
790 htmlDocDump(stdout, doc);
793 #endif /* LIBXML_OUTPUT_ENABLED */
801 int main(int argc, char **argv) {
805 for (i = 1; i < argc ; i++) {
806 #ifdef LIBXML_DEBUG_ENABLED
807 if ((!strcmp(argv[i], "-debug")) || (!strcmp(argv[i], "--debug")))
811 if ((!strcmp(argv[i], "-copy")) || (!strcmp(argv[i], "--copy")))
813 #ifdef LIBXML_PUSH_ENABLED
814 else if ((!strcmp(argv[i], "-push")) || (!strcmp(argv[i], "--push")))
816 #endif /* LIBXML_PUSH_ENABLED */
817 else if ((!strcmp(argv[i], "-sax")) || (!strcmp(argv[i], "--sax")))
819 else if ((!strcmp(argv[i], "-noout")) || (!strcmp(argv[i], "--noout")))
821 else if ((!strcmp(argv[i], "-repeat")) ||
822 (!strcmp(argv[i], "--repeat")))
824 else if ((!strcmp(argv[i], "-encode")) ||
825 (!strcmp(argv[i], "--encode"))) {
830 for (i = 1; i < argc ; i++) {
831 if ((!strcmp(argv[i], "-encode")) ||
832 (!strcmp(argv[i], "--encode"))) {
836 if (argv[i][0] != '-') {
838 for (count = 0;count < 100 * repeat;count++) {
840 parseSAXFile(argv[i]);
842 parseAndPrintFile(argv[i]);
846 parseSAXFile(argv[i]);
848 parseAndPrintFile(argv[i]);
854 printf("Usage : %s [--debug] [--copy] [--copy] HTMLfiles ...\n",
856 printf("\tParse the HTML files and output the result of the parsing\n");
857 #ifdef LIBXML_DEBUG_ENABLED
858 printf("\t--debug : dump a debug tree of the in-memory document\n");
860 printf("\t--copy : used to test the internal copy implementation\n");
861 printf("\t--sax : debug the sequence of SAX callbacks\n");
862 printf("\t--repeat : parse the file 100 times, for timing\n");
863 printf("\t--noout : do not print the result\n");
864 #ifdef LIBXML_PUSH_ENABLED
865 printf("\t--push : use the push mode parser\n");
866 #endif /* LIBXML_PUSH_ENABLED */
867 printf("\t--encode encoding : output in the given encoding\n");
874 #else /* !LIBXML_HTML_ENABLED */
876 int main(int argc ATTRIBUTE_UNUSED, char **argv ATTRIBUTE_UNUSED) {
877 printf("%s : HTML support not compiled in\n", argv[0]);