/*
 * string.c : an XML string utilities module
 *
 * This module provides various utility functions for manipulating
 * the xmlChar* type. All functions named xmlStr* have been moved here
 * from the parser.c file (their original home).
 *
 * See Copyright for the status of this software.
 *
 * UTF8 string routines from:
 * William Brack <wbrack@mmm.com.hk>
 */
21 #include <libxml/xmlmemory.h>
22 #include <libxml/parserInternals.h>
23 #include <libxml/xmlstring.h>
/************************************************************************
 *                                                                      *
 *              Commodity functions to handle xmlChars                  *
 *                                                                      *
 ************************************************************************/
33 * @cur: the input xmlChar *
34 * @len: the len of @cur
36 * a strndup for array of xmlChar's
38 * Returns a new xmlChar * or NULL
41 xmlStrndup(const xmlChar *cur, int len) {
44 if ((cur == NULL) || (len < 0)) return(NULL);
45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
47 xmlErrMemory(NULL, NULL);
50 memcpy(ret, cur, len * sizeof(xmlChar));
57 * @cur: the input xmlChar *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
63 * Returns a new xmlChar * or NULL
66 xmlStrdup(const xmlChar *cur) {
67 const xmlChar *p = cur;
69 if (cur == NULL) return(NULL);
70 while (*p != 0) p++; /* non input consuming */
71 return(xmlStrndup(cur, p - cur));
76 * @cur: the input char *
77 * @len: the len of @cur
79 * a strndup for char's to xmlChar's
81 * Returns a new xmlChar * or NULL
85 xmlCharStrndup(const char *cur, int len) {
89 if ((cur == NULL) || (len < 0)) return(NULL);
90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
92 xmlErrMemory(NULL, NULL);
95 for (i = 0;i < len;i++) {
96 ret[i] = (xmlChar) cur[i];
97 if (ret[i] == 0) return(ret);
105 * @cur: the input char *
107 * a strdup for char's to xmlChar's
109 * Returns a new xmlChar * or NULL
113 xmlCharStrdup(const char *cur) {
116 if (cur == NULL) return(NULL);
117 while (*p != '\0') p++; /* non input consuming */
118 return(xmlCharStrndup(cur, p - cur));
123 * @str1: the first xmlChar *
124 * @str2: the second xmlChar *
126 * a strcmp for xmlChar's
128 * Returns the integer result of the comparison
132 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
135 if (str1 == str2) return(0);
136 if (str1 == NULL) return(-1);
137 if (str2 == NULL) return(1);
139 tmp = *str1++ - *str2;
140 if (tmp != 0) return(tmp);
141 } while (*str2++ != 0);
147 * @str1: the first xmlChar *
148 * @str2: the second xmlChar *
150 * Check if both strings are equal of have same content.
151 * Should be a bit more readable and faster than xmlStrcmp()
153 * Returns 1 if they are equal, 0 if they are different
157 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158 if (str1 == str2) return(1);
159 if (str1 == NULL) return(0);
160 if (str2 == NULL) return(0);
162 if (*str1++ != *str2) return(0);
169 * @pref: the prefix of the QName
170 * @name: the localname of the QName
171 * @str: the second xmlChar *
173 * Check if a QName is Equal to a given string
175 * Returns 1 if they are equal, 0 if they are different
179 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
180 if (pref == NULL) return(xmlStrEqual(name, str));
181 if (name == NULL) return(0);
182 if (str == NULL) return(0);
185 if (*pref++ != *str) return(0);
186 } while ((*str++) && (*pref));
187 if (*str++ != ':') return(0);
189 if (*name++ != *str) return(0);
196 * @str1: the first xmlChar *
197 * @str2: the second xmlChar *
198 * @len: the max comparison length
200 * a strncmp for xmlChar's
202 * Returns the integer result of the comparison
206 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
209 if (len <= 0) return(0);
210 if (str1 == str2) return(0);
211 if (str1 == NULL) return(-1);
212 if (str2 == NULL) return(1);
214 tmp = strncmp((const char *)str1, (const char *)str2, len);
218 tmp = *str1++ - *str2;
219 if (tmp != 0 || --len == 0) return(tmp);
220 } while (*str2++ != 0);
225 static const xmlChar casemap[256] = {
226 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
227 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
228 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
229 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
230 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
231 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
232 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
233 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
234 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
235 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
236 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
237 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
238 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
239 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
240 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
241 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
242 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
243 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
244 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
245 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
246 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
247 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
248 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
249 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
250 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
251 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
252 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
253 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
254 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
255 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
256 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
257 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
262 * @str1: the first xmlChar *
263 * @str2: the second xmlChar *
265 * a strcasecmp for xmlChar's
267 * Returns the integer result of the comparison
271 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
274 if (str1 == str2) return(0);
275 if (str1 == NULL) return(-1);
276 if (str2 == NULL) return(1);
278 tmp = casemap[*str1++] - casemap[*str2];
279 if (tmp != 0) return(tmp);
280 } while (*str2++ != 0);
286 * @str1: the first xmlChar *
287 * @str2: the second xmlChar *
288 * @len: the max comparison length
290 * a strncasecmp for xmlChar's
292 * Returns the integer result of the comparison
296 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
299 if (len <= 0) return(0);
300 if (str1 == str2) return(0);
301 if (str1 == NULL) return(-1);
302 if (str2 == NULL) return(1);
304 tmp = casemap[*str1++] - casemap[*str2];
305 if (tmp != 0 || --len == 0) return(tmp);
306 } while (*str2++ != 0);
312 * @str: the xmlChar * array
313 * @val: the xmlChar to search
315 * a strchr for xmlChar's
317 * Returns the xmlChar * for the first occurrence or NULL.
321 xmlStrchr(const xmlChar *str, xmlChar val) {
322 if (str == NULL) return(NULL);
323 while (*str != 0) { /* non input consuming */
324 if (*str == val) return((xmlChar *) str);
332 * @str: the xmlChar * array (haystack)
333 * @val: the xmlChar to search (needle)
335 * a strstr for xmlChar's
337 * Returns the xmlChar * for the first occurrence or NULL.
341 xmlStrstr(const xmlChar *str, const xmlChar *val) {
344 if (str == NULL) return(NULL);
345 if (val == NULL) return(NULL);
348 if (n == 0) return(str);
349 while (*str != 0) { /* non input consuming */
351 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
360 * @str: the xmlChar * array (haystack)
361 * @val: the xmlChar to search (needle)
363 * a case-ignoring strstr for xmlChar's
365 * Returns the xmlChar * for the first occurrence or NULL.
369 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
372 if (str == NULL) return(NULL);
373 if (val == NULL) return(NULL);
376 if (n == 0) return(str);
377 while (*str != 0) { /* non input consuming */
378 if (casemap[*str] == casemap[*val])
379 if (!xmlStrncasecmp(str, val, n)) return(str);
387 * @str: the xmlChar * array (haystack)
388 * @start: the index of the first char (zero based)
389 * @len: the length of the substring
391 * Extract a substring of a given string
393 * Returns the xmlChar * for the first occurrence or NULL.
397 xmlStrsub(const xmlChar *str, int start, int len) {
400 if (str == NULL) return(NULL);
401 if (start < 0) return(NULL);
402 if (len < 0) return(NULL);
404 for (i = 0;i < start;i++) {
405 if (*str == 0) return(NULL);
408 if (*str == 0) return(NULL);
409 return(xmlStrndup(str, len));
414 * @str: the xmlChar * array
416 * length of a xmlChar's string
418 * Returns the number of xmlChar contained in the ARRAY.
422 xmlStrlen(const xmlChar *str) {
425 if (str == NULL) return(0);
426 while (*str != 0) { /* non input consuming */
435 * @cur: the original xmlChar * array
436 * @add: the xmlChar * array added
437 * @len: the length of @add
439 * a strncat for array of xmlChar's, it will extend @cur with the len
440 * first bytes of @add. Note that if @len < 0 then this is an API error
441 * and NULL will be returned.
443 * Returns a new xmlChar *, the original @cur is reallocated if needed
444 * and should not be freed
448 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
452 if ((add == NULL) || (len == 0))
457 return(xmlStrndup(add, len));
459 size = xmlStrlen(cur);
462 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
464 xmlErrMemory(NULL, NULL);
467 memcpy(&ret[size], add, len * sizeof(xmlChar));
474 * @str1: first xmlChar string
475 * @str2: second xmlChar string
476 * @len: the len of @str2 or < 0
478 * same as xmlStrncat, but creates a new string. The original
479 * two strings are not freed. If @len is < 0 then the length
480 * will be calculated automatically.
482 * Returns a new xmlChar * or NULL
485 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
490 len = xmlStrlen(str2);
494 if ((str2 == NULL) || (len == 0))
495 return(xmlStrdup(str1));
497 return(xmlStrndup(str2, len));
499 size = xmlStrlen(str1);
502 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
504 xmlErrMemory(NULL, NULL);
505 return(xmlStrndup(str1, size));
507 memcpy(ret, str1, size * sizeof(xmlChar));
508 memcpy(&ret[size], str2, len * sizeof(xmlChar));
515 * @cur: the original xmlChar * array
516 * @add: the xmlChar * array added
518 * a strcat for array of xmlChar's. Since they are supposed to be
519 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
520 * a termination mark of '0'.
522 * Returns a new xmlChar * containing the concatenated string.
525 xmlStrcat(xmlChar *cur, const xmlChar *add) {
526 const xmlChar *p = add;
528 if (add == NULL) return(cur);
530 return(xmlStrdup(add));
532 while (*p != 0) p++; /* non input consuming */
533 return(xmlStrncat(cur, add, p - add));
538 * @buf: the result buffer.
539 * @len: the result buffer length.
540 * @msg: the message with printf formatting.
541 * @...: extra parameters for the message.
543 * Formats @msg and places result into @buf.
545 * Returns the number of characters written to @buf or -1 if an error occurs.
548 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
552 if((buf == NULL) || (msg == NULL)) {
557 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
559 buf[len - 1] = 0; /* be safe ! */
566 * @buf: the result buffer.
567 * @len: the result buffer length.
568 * @msg: the message with printf formatting.
569 * @ap: extra parameters for the message.
571 * Formats @msg and places result into @buf.
573 * Returns the number of characters written to @buf or -1 if an error occurs.
576 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
579 if((buf == NULL) || (msg == NULL)) {
583 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
584 buf[len - 1] = 0; /* be safe ! */
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 * (Note: the routines below nevertheless accept the 4-byte form        *
 *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx used for values above 0xFFFF.)  *
 *                                                                      *
 ************************************************************************/
607 * @utf: pointer to the UTF8 character
609 * calculates the internal size of a UTF8 character
611 * returns the numbers of bytes in the character, -1 on format error
614 xmlUTF8Size(const xmlChar *utf) {
622 /* check valid UTF8 character */
625 /* determine number of bytes in char */
627 for (mask=0x20; mask != 0; mask>>=1) {
637 * @utf1: pointer to first UTF8 char
638 * @utf2: pointer to second UTF8 char
640 * compares the two UCS4 values
642 * returns result of the compare as with xmlStrncmp
645 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
652 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
657 * @utf: a sequence of UTF-8 encoded bytes
659 * compute the length of an UTF8 string, it doesn't do a full UTF8
660 * checking of the content of the string.
662 * Returns the number of characters in the string or -1 in case of error
665 xmlUTF8Strlen(const xmlChar *utf) {
673 if ((utf[1] & 0xc0) != 0x80)
675 if ((utf[0] & 0xe0) == 0xe0) {
676 if ((utf[2] & 0xc0) != 0x80)
678 if ((utf[0] & 0xf0) == 0xf0) {
679 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Sequences of 1 to 4 bytes are
 * decoded. A first byte of the form 10xxxxxx (a continuation byte) is
 * rejected; overlong encodings are not checked for.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if ((utf == NULL) || (len == NULL))
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if ((c & 0x80) == 0) {
        /* 1-byte code */
        *len = 1;
        return((int) c);
    }

    /*
     * A lead byte must start with 11. The previous code only tested the
     * top bit, so a stray continuation byte (10xxxxxx) followed by another
     * one was silently decoded as a 2-byte character.
     */
    if ((c & 0xc0) != 0xc0)
        goto error;

    if (*len < 2)
        goto error;
    if ((utf[1] & 0xc0) != 0x80)
        goto error;
    if ((c & 0xe0) != 0xe0) {
        /* 2-byte code */
        *len = 2;
        c = (utf[0] & 0x1f) << 6;
        c |= utf[1] & 0x3f;
        return((int) c);
    }

    if (*len < 3)
        goto error;
    if ((utf[2] & 0xc0) != 0x80)
        goto error;
    if ((c & 0xf0) != 0xf0) {
        /* 3-byte code */
        *len = 3;
        c = (utf[0] & 0xf) << 12;
        c |= (utf[1] & 0x3f) << 6;
        c |= utf[2] & 0x3f;
        return((int) c);
    }

    if (*len < 4)
        goto error;
    if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
        goto error;
    /* 4-byte code */
    *len = 4;
    c = (utf[0] & 0x7) << 18;
    c |= (utf[1] & 0x3f) << 12;
    c |= (utf[2] & 0x3f) << 6;
    c |= utf[3] & 0x3f;
    return((int) c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid (0 for a NULL @utf).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *p;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     *
     * A pointer is walked instead of an int index so that very long
     * inputs cannot overflow the counter. The || chains stop at the
     * first bad trailing byte, so a truncated sequence never makes the
     * checks read beyond the terminating 0.
     */
    for (p = utf; (c = *p) != 0;) {     /* string is 0-terminated */
        if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 0 */
            p++;
        } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
            if ((p[1] & 0xc0) != 0x80)
                return(0);
            p += 2;
        } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
            if (((p[1] & 0xc0) != 0x80) ||
                ((p[2] & 0xc0) != 0x80))
                return(0);
            p += 3;
        } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
            if (((p[1] & 0xc0) != 0x80) ||
                ((p[2] & 0xc0) != 0x80) ||
                ((p[3] & 0xc0) != 0x80))
                return(0);
            p += 4;
        } else {                        /* unknown encoding */
            return(0);
        }
    }
    return(1);
}
821 * @utf: a sequence of UTF-8 encoded bytes
822 * @len: the number of characters in the array
824 * storage size of an UTF8 string
825 * the behaviour is not garanteed if the input string is not UTF-8
827 * Returns the storage size of
828 * the first 'len' characters of ARRAY
832 xmlUTF8Strsize(const xmlChar *utf, int len) {
833 const xmlChar *ptr=utf;
845 if ( (ch = *ptr++) & 0x80)
846 while ((ch<<=1) & 0x80 ) {
847 if (*ptr == 0) break;
857 * @utf: the input UTF8 *
858 * @len: the len of @utf (in chars)
860 * a strndup for array of UTF8's
862 * Returns a new UTF8 * or NULL
865 xmlUTF8Strndup(const xmlChar *utf, int len) {
869 if ((utf == NULL) || (len < 0)) return(NULL);
870 i = xmlUTF8Strsize(utf, len);
871 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
873 xmlGenericError(xmlGenericErrorContext,
874 "malloc of %ld byte failed\n",
875 (len + 1) * (long)sizeof(xmlChar));
878 memcpy(ret, utf, i * sizeof(xmlChar));
885 * @utf: the input UTF8 *
886 * @pos: the position of the desired UTF8 char (in chars)
888 * a function to provide the equivalent of fetching a
889 * character from a string array
891 * Returns a pointer to the UTF8 character or NULL
894 xmlUTF8Strpos(const xmlChar *utf, int pos) {
897 if (utf == NULL) return(NULL);
901 if ((ch=*utf++) == 0) return(NULL);
903 /* if not simple ascii, verify proper format */
904 if ( (ch & 0xc0) != 0xc0 )
906 /* then skip over remaining bytes for this char */
907 while ( (ch <<= 1) & 0x80 )
908 if ( (*utf++ & 0xc0) != 0x80 )
912 return((xmlChar *)utf);
917 * @utf: the input UTF8 *
918 * @utfchar: the UTF8 character to be found
920 * a function to provide the relative location of a UTF8 char
922 * Returns the relative character position of the desired char
926 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
930 if (utf==NULL || utfchar==NULL) return -1;
931 size = xmlUTF8Strsize(utfchar, 1);
932 for(i=0; (ch=*utf) != 0; i++) {
933 if (xmlStrncmp(utf, utfchar, size)==0)
937 /* if not simple ascii, verify proper format */
938 if ( (ch & 0xc0) != 0xc0 )
940 /* then skip over remaining bytes for this char */
941 while ( (ch <<= 1) & 0x80 )
942 if ( (*utf++ & 0xc0) != 0x80 )
951 * @utf: a sequence of UTF-8 encoded bytes
952 * @start: relative pos of first char
953 * @len: total number to copy
955 * Create a substring from a given UTF-8 string
956 * Note: positions are given in units of UTF-8 chars
958 * Returns a pointer to a newly created string
959 * or NULL if any problem
963 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
967 if (utf == NULL) return(NULL);
968 if (start < 0) return(NULL);
969 if (len < 0) return(NULL);
972 * Skip over any leading chars
974 for (i = 0;i < start;i++) {
975 if ((ch=*utf++) == 0) return(NULL);
977 /* if not simple ascii, verify proper format */
978 if ( (ch & 0xc0) != 0xc0 )
980 /* then skip over remaining bytes for this char */
981 while ( (ch <<= 1) & 0x80 )
982 if ( (*utf++ & 0xc0) != 0x80 )
987 return(xmlUTF8Strndup(utf, len));
991 * xmlEscapeFormatString:
992 * @msg: a pointer to the string in which to escape '%' characters.
993 * Must be a heap-allocated buffer created by libxml2 that may be
994 * returned, or that may be freed and replaced.
996 * Replaces the string pointed to by 'msg' with an escaped string.
997 * Returns the same string with all '%' characters escaped.
1000 xmlEscapeFormatString(xmlChar **msg)
1002 xmlChar *msgPtr = NULL;
1003 xmlChar *result = NULL;
1004 xmlChar *resultPtr = NULL;
1007 size_t resultLen = 0;
1012 for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1021 resultLen = msgLen + count + 1;
1022 result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1023 if (result == NULL) {
1024 /* Clear *msg to prevent format string vulnerabilities in
1025 out-of-memory situations. */
1028 xmlErrMemory(NULL, NULL);
1032 for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1033 *resultPtr = *msgPtr;
1035 *(++resultPtr) = '%';
1037 result[resultLen - 1] = '\0';
1045 #define bottom_xmlstring
1046 #include "elfgcchack.h"