2 * Test the UTF-8 decoding routines
4 * author: Daniel Veillard
5 * copy: see Copyright for the status of this software.
10 #include <libxml/parser.h>
11 #include <libxml/parserInternals.h>
15 static void errorHandler(void *unused, xmlErrorPtr err) {
16 if ((unused == NULL) && (err != NULL) && (lastError == 0)) {
17 lastError = err->code;
21 char document1[100] = "<doc>XXXX</doc>";
22 char document2[100] = "<doc foo='XXXX'/>";
24 static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
25 int len, char *data, int forbid1, int forbid2) {
29 for (i = 0;i <= 0xFF;i++) {
35 res = xmlReadMemory(document, len, "test", NULL, 0);
37 if ((i == forbid1) || (i == forbid2)) {
38 if ((lastError == 0) || (res != NULL))
40 "Failed to detect invalid char for Byte 0x%02X: %c\n",
44 else if ((i == '<') || (i == '&')) {
45 if ((lastError == 0) || (res != NULL))
47 "Failed to detect illegal char %c for Byte 0x%02X\n", i, i);
49 else if (((i < 0x20) || (i >= 0x80)) &&
50 (i != 0x9) && (i != 0xA) && (i != 0xD)) {
51 if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL))
53 "Failed to detect invalid char for Byte 0x%02X\n", i);
55 else if (res == NULL) {
57 "Failed to parse valid char for Byte 0x%02X : %c\n", i, i);
64 static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
65 int len, char *data) {
69 for (i = 0x80;i <= 0xFF;i++) {
70 for (j = 0;j <= 0xFF;j++) {
77 res = xmlReadMemory(document, len, "test", NULL, 0);
79 /* if first bit of first char is set, then second bit must be set too */
80 if ((i & 0x80) && ((i & 0x40) == 0)) {
81 if ((lastError == 0) || (res != NULL))
83 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
88 * if first bit of first char is set, then second char first
91 else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
92 if ((lastError == 0) || (res != NULL))
94 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
99 * if using a 2 byte encoding then the value must be at least
100 * 0x80, i.e. one of bits 4 to 1 of i (mask 0x1E) must be set
102 else if ((i & 0x80) && ((i & 0x1E) == 0)) {
103 if ((lastError == 0) || (res != NULL))
105 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
110 * if third bit of first char is set, then the sequence would need
111 * at least 3 bytes, but we give only 2 !
113 else if ((i & 0xE0) == 0xE0) {
114 if ((lastError == 0) || (res != NULL))
116 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
121 * We should see no error in remaining cases
123 else if ((lastError != 0) || (res == NULL)) {
125 "Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j);
134 * testDocumentRanges:
136 * Test the correct UTF8 character parsing in the context of XML documents.
137 * These are in-context injection tests checking the parser behaviour on
138 * edge case values at different points in content, beginning and end of
139 * CDATA in text or in attribute values.
142 static void testDocumentRanges(void) {
143 xmlParserCtxtPtr ctxt;
147 * Set up a parsing context using the first document as
148 * the current input source.
150 ctxt = xmlNewParserCtxt();
152 fprintf(stderr, "Failed to allocate parser context\n");
156 printf("testing 1 byte char in document: 1");
158 data = &document1[5];
163 /* test 1 byte injection at beginning of area */
164 testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
172 /* test 1 byte injection at end of area */
173 testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
178 data = &document2[10];
183 /* test 1 byte injection at beginning of area */
184 testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
192 /* test 1 byte injection at end of area */
193 testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
197 printf("testing 2 byte char in document: 1");
199 data = &document1[5];
204 /* test 2 byte injection at beginning of area */
205 testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
213 /* test 2 byte injection at end of area */
214 testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
219 data = &document2[10];
224 /* test 2 byte injection at beginning of area */
225 testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
233 /* test 2 byte injection at end of area */
234 testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
238 xmlFreeParserCtxt(ctxt);
241 static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) {
248 for (i = 0;i <= 0xFF;i++) {
250 ctxt->charset = XML_CHAR_ENCODING_UTF8;
253 c = xmlCurrentChar(ctxt, &len);
254 if ((i == 0) || (i >= 0x80)) {
255 /* we must see an error there */
256 if (lastError != XML_ERR_INVALID_CHAR)
258 "Failed to detect invalid char for Byte 0x%02X\n", i);
259 } else if (i == 0xD) {
260 if ((c != 0xA) || (len != 1))
261 fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", i);
262 } else if ((c != i) || (len != 1)) {
263 fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", i);
268 static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) {
274 for (i = 0x80;i <= 0xFF;i++) {
275 for (j = 0;j <= 0xFF;j++) {
278 ctxt->charset = XML_CHAR_ENCODING_UTF8;
281 c = xmlCurrentChar(ctxt, &len);
283 /* if first bit of first char is set, then second bit must be set too */
284 if ((i & 0x80) && ((i & 0x40) == 0)) {
285 if (lastError != XML_ERR_INVALID_CHAR)
287 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
292 * if first bit of first char is set, then second char first
295 else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
296 if (lastError != XML_ERR_INVALID_CHAR)
298 "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
303 * if using a 2 byte encoding then the value must be at least
304 * 0x80, i.e. one of bits 4 to 1 of i (mask 0x1E) must be set
306 else if ((i & 0x80) && ((i & 0x1E) == 0)) {
307 if (lastError != XML_ERR_INVALID_CHAR)
309 "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
314 * if third bit of first char is set, then the sequence would need
315 * at least 3 bytes, but we give only 2 !
317 else if ((i & 0xE0) == 0xE0) {
318 if (lastError != XML_ERR_INVALID_CHAR)
320 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
325 * We should see no error in remaining cases
327 else if ((lastError != 0) || (len != 2)) {
329 "Failed to parse char for Bytes 0x%02X 0x%02X\n", i, j);
333 * Finally check the value is right
335 else if (c != (j & 0x3F) + ((i & 0x1F) << 6)) {
337 "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
338 i, j, ((j & 0x3F) + ((i & 0x1F) << 6)), c);
344 static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) {
347 unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
351 for (i = 0xE0;i <= 0xFF;i++) {
352 for (j = 0;j <= 0xFF;j++) {
353 for (k = 0;k < 6;k++) {
358 value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
359 ctxt->charset = XML_CHAR_ENCODING_UTF8;
362 c = xmlCurrentChar(ctxt, &len);
365 * if fourth bit of first char is set, then the sequence would need
366 * at least 4 bytes, but we give only 3 !
368 if ((i & 0xF0) == 0xF0) {
369 if (lastError != XML_ERR_INVALID_CHAR)
371 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
376 * The second and the third bytes must start with 10
378 else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
379 if (lastError != XML_ERR_INVALID_CHAR)
381 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
386 * if using a 3 byte encoding then the value must be at least
387 * 0x800, i.e. one of bits 3 to 0 of i (mask 0xF) must be set or
388 * the 6th bit (mask 0x20) of data[1] must be set
390 else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
391 if (lastError != XML_ERR_INVALID_CHAR)
393 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
398 * There are values in that range that are not allowed in XML-1.0
400 else if (((value > 0xD7FF) && (value <0xE000)) ||
401 ((value > 0xFFFD) && (value <0x10000))) {
402 if (lastError != XML_ERR_INVALID_CHAR)
404 "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
409 * We should see no error in remaining cases
411 else if ((lastError != 0) || (len != 3)) {
413 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
418 * Finally check the value is right
420 else if (c != value) {
422 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
423 i, j, data[2], value, c);
430 static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) {
431 int i, j, k, K, l, L;
433 unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
437 for (i = 0xF0;i <= 0xFF;i++) {
438 for (j = 0;j <= 0xFF;j++) {
439 for (k = 0;k < 6;k++) {
440 for (l = 0;l < 6;l++) {
447 value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) +
449 ctxt->charset = XML_CHAR_ENCODING_UTF8;
452 c = xmlCurrentChar(ctxt, &len);
455 * if fifth bit of first char is set, then the sequence would need
456 * at least 5 bytes, but we give only 4 !
458 if ((i & 0xF8) == 0xF8) {
459 if (lastError != XML_ERR_INVALID_CHAR)
461 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
466 * The second, third and fourth bytes must start with 10
468 else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
469 ((L & 0xC0) != 0x80)) {
470 if (lastError != XML_ERR_INVALID_CHAR)
472 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
477 * if using a 4 byte encoding then the value must be at least
478 * 0x10000, i.e. one of bits 2 to 0 of i (mask 0x7) must be set or
479 * the 6th or 5th bit (mask 0x30) of j must be set
481 else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
482 if (lastError != XML_ERR_INVALID_CHAR)
484 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
489 * There are values in that range that are not allowed in XML-1.0
491 else if (((value > 0xD7FF) && (value <0xE000)) ||
492 ((value > 0xFFFD) && (value <0x10000)) ||
493 (value > 0x10FFFF)) {
494 if (lastError != XML_ERR_INVALID_CHAR)
496 "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
501 * We should see no error in remaining cases
503 else if ((lastError != 0) || (len != 4)) {
505 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
510 * Finally check the value is right
512 else if (c != value) {
514 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
515 i, j, data[2], value, c);
526 * Test the correct UTF8 character parsing in isolation, i.e.
527 * not when parsing a full document; this is less expensive and we can
528 * cover the full range of UTF-8 chars accepted by XML-1.0
531 static void testCharRanges(void) {
533 xmlParserCtxtPtr ctxt;
534 xmlParserInputBufferPtr buf;
535 xmlParserInputPtr input;
540 * Set up a parsing context using the above data buffer as
541 * the current input source.
543 ctxt = xmlNewParserCtxt();
545 fprintf(stderr, "Failed to allocate parser context\n");
548 buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
549 XML_CHAR_ENCODING_NONE);
551 fprintf(stderr, "Failed to allocate input buffer\n");
554 input = xmlNewInputStream(ctxt);
556 xmlFreeParserInputBuffer(buf);
559 input->filename = NULL;
561 input->base = input->buf->buffer->content;
562 input->cur = input->buf->buffer->content;
563 input->end = &input->buf->buffer->content[4];
564 inputPush(ctxt, input);
566 printf("testing char range: 1");
568 testCharRangeByte1(ctxt, data);
571 testCharRangeByte2(ctxt, data);
574 testCharRangeByte3(ctxt, data);
577 testCharRangeByte4(ctxt, data);
582 xmlFreeParserCtxt(ctxt);
588 * this initializes the library and checks potential ABI mismatches
589 * between the version it was compiled for and the actual shared
595 * Catch errors separately
598 xmlSetStructuredErrorFunc(NULL, errorHandler);
604 testDocumentRanges();
607 * Cleanup function for the XML library.
611 * this is to debug memory for regression tests