1 /* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
4 * This file is part of The Croco Library
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2.1 of the GNU Lesser General Public
8 * License as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * Author: Dodji Seketeli
21 * See COPYRIGHTS file for copyright information.
26 #include "cr-string.h"
30 *Some misc utility functions used
32 *Note that throughout this file I will
33 *refer to the CSS SPECIFICATIONS DOCUMENTATION
34 *written by the w3c guys. You can find that document
35 *at http://www.w3.org/TR/REC-CSS2/ .
38 /****************************
39 *Encoding transformations and
41 ****************************/
44 *Here is the correspondence between the ucs-4 character codes
45 *and their matching utf-8 encoding pattern as described by RFC 2279:
47 *UCS-4 range (hex.) UTF-8 octet sequence (binary)
48 *------------------ -----------------------------
49 *0000 0000-0000 007F 0xxxxxxx
50 *0000 0080-0000 07FF 110xxxxx 10xxxxxx
51 *0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
52 *0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
53 *0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
54 *0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx
58 *Given an utf8 string buffer, calculates
59 *the length of this string if it was encoded
61 *@param a_in_start a pointer to the begining of
62 *the input utf8 string.
63 *@param a_in_end a pointre to the end of the input
64 *utf8 string (points to the last byte of the buffer)
65 *@param a_len out parameter the calculated length.
66 *@return CR_OK upon succesfull completion, an error code
70 cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
71 const guchar * a_in_end, gulong * a_len)
73 guchar *byte_ptr = NULL;
77 *to store the final decoded
82 g_return_val_if_fail (a_in_start && a_in_end && a_len,
86 for (byte_ptr = (guchar *) a_in_start;
87 byte_ptr <= a_in_end; byte_ptr++) {
88 gint nb_bytes_2_decode = 0;
90 if (*byte_ptr <= 0x7F) {
97 nb_bytes_2_decode = 1;
99 } else if ((*byte_ptr & 0xE0) == 0xC0) {
101 *up to 11 bits long char.
102 *encoded over 2 bytes:
105 c = *byte_ptr & 0x1F;
106 nb_bytes_2_decode = 2;
108 } else if ((*byte_ptr & 0xF0) == 0xE0) {
110 *up to 16 bit long char
111 *encoded over 3 bytes:
112 *1110 xxxx 10xx xxxx 10xx xxxx
114 c = *byte_ptr & 0x0F;
115 nb_bytes_2_decode = 3;
117 } else if ((*byte_ptr & 0xF8) == 0xF0) {
119 *up to 21 bits long char
120 *encoded over 4 bytes:
121 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
124 nb_bytes_2_decode = 4;
126 } else if ((*byte_ptr & 0xFC) == 0xF8) {
128 *up to 26 bits long char
129 *encoded over 5 bytes.
130 *1111 10xx 10xx xxxx 10xx xxxx
134 nb_bytes_2_decode = 5;
136 } else if ((*byte_ptr & 0xFE) == 0xFC) {
138 *up to 31 bits long char
139 *encoded over 6 bytes:
140 *1111 110x 10xx xxxx 10xx xxxx
141 *10xx xxxx 10xx xxxx 10xx xxxx
144 nb_bytes_2_decode = 6;
150 return CR_ENCODING_ERROR;
154 *Go and decode the remaining byte(s)
155 *(if any) to get the current character.
157 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
158 /*decode the next byte */
161 /*byte pattern must be: 10xx xxxx */
162 if ((*byte_ptr & 0xC0) != 0x80) {
163 return CR_ENCODING_ERROR;
166 c = (c << 6) | (*byte_ptr & 0x3F);
178 *Given an ucs4 string, this function
179 *returns the size (in bytes) this string
180 *would have occupied if it was encoded in utf-8.
181 *@param a_in_start a pointer to the beginning of the input
183 *@param a_in_end a pointer to the end of the input buffer.
184 *@param a_len out parameter. The computed length.
185 *@return CR_OK upon successfull completion, an error code otherwise.
188 cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
189 const guint32 * a_in_end, gulong * a_len)
192 guint32 *char_ptr = NULL;
194 g_return_val_if_fail (a_in_start && a_in_end && a_len,
197 for (char_ptr = (guint32 *) a_in_start;
198 char_ptr <= a_in_end; char_ptr++) {
199 if (*char_ptr <= 0x7F) {
200 /*the utf-8 char would take 1 byte */
202 } else if (*char_ptr <= 0x7FF) {
203 /*the utf-8 char would take 2 bytes */
205 } else if (*char_ptr <= 0xFFFF) {
207 } else if (*char_ptr <= 0x1FFFFF) {
209 } else if (*char_ptr <= 0x3FFFFFF) {
211 } else if (*char_ptr <= 0x7FFFFFFF) {
221 *Given an ucsA string, this function
222 *returns the size (in bytes) this string
223 *would have occupied if it was encoded in utf-8.
224 *@param a_in_start a pointer to the beginning of the input
226 *@param a_in_end a pointer to the end of the input buffer.
227 *@param a_len out parameter. The computed length.
228 *@return CR_OK upon successfull completion, an error code otherwise.
231 cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
232 const guchar * a_in_end, gulong * a_len)
235 guchar *char_ptr = NULL;
237 g_return_val_if_fail (a_in_start && a_in_end && a_len,
240 for (char_ptr = (guchar *) a_in_start;
241 char_ptr <= a_in_end; char_ptr++) {
242 if (*char_ptr <= 0x7F) {
243 /*the utf-8 char would take 1 byte */
246 /*the utf-8 char would take 2 bytes */
256 *Converts an utf8 buffer into an ucs4 buffer.
258 *@param a_in the input utf8 buffer to convert.
259 *@param a_in_len in/out parameter. The size of the
260 *input buffer to convert. After return, this parameter contains
261 *the actual number of bytes consumed.
262 *@param a_out the output converted ucs4 buffer. Must be allocated by
264 *@param a_out_len in/out parameter. The size of the output buffer.
265 *If this size is actually smaller than the real needed size, the function
266 *just converts what it can and returns a success status. After return,
267 *this param points to the actual number of characters decoded.
268 *@return CR_OK upon successfull completion, an error code otherwise.
271 cr_utils_utf8_to_ucs4 (const guchar * a_in,
272 gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
278 enum CRStatus status = CR_OK;
281 *to store the final decoded
286 g_return_val_if_fail (a_in && a_in_len
287 && a_out && a_out_len, CR_BAD_PARAM_ERROR);
295 out_len = *a_out_len;
297 for (in_index = 0, out_index = 0;
298 (in_index < in_len) && (out_index < out_len);
299 in_index++, out_index++) {
300 gint nb_bytes_2_decode = 0;
302 if (a_in[in_index] <= 0x7F) {
305 *encoded over 1 byte:
309 nb_bytes_2_decode = 1;
311 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
313 *up to 11 bits long char.
314 *encoded over 2 bytes:
317 c = a_in[in_index] & 0x1F;
318 nb_bytes_2_decode = 2;
320 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
322 *up to 16 bit long char
323 *encoded over 3 bytes:
324 *1110 xxxx 10xx xxxx 10xx xxxx
326 c = a_in[in_index] & 0x0F;
327 nb_bytes_2_decode = 3;
329 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
331 *up to 21 bits long char
332 *encoded over 4 bytes:
333 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
335 c = a_in[in_index] & 0x7;
336 nb_bytes_2_decode = 4;
338 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
340 *up to 26 bits long char
341 *encoded over 5 bytes.
342 *1111 10xx 10xx xxxx 10xx xxxx
345 c = a_in[in_index] & 3;
346 nb_bytes_2_decode = 5;
348 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
350 *up to 31 bits long char
351 *encoded over 6 bytes:
352 *1111 110x 10xx xxxx 10xx xxxx
353 *10xx xxxx 10xx xxxx 10xx xxxx
355 c = a_in[in_index] & 1;
356 nb_bytes_2_decode = 6;
364 *Go and decode the remaining byte(s)
365 *(if any) to get the current character.
367 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
368 /*decode the next byte */
371 /*byte pattern must be: 10xx xxxx */
372 if ((a_in[in_index] & 0xC0) != 0x80) {
376 c = (c << 6) | (a_in[in_index] & 0x3F);
380 *The decoded ucs4 char is now
384 /************************
386 ***********************/
388 /*be sure c is a char */
389 if (c == 0xFFFF || c == 0xFFFE)
392 /*be sure c is inferior to the max ucs4 char value */
397 *c must be less than UTF16 "lower surrogate begin"
398 *or higher than UTF16 "High surrogate end"
400 if (c >= 0xD800 && c <= 0xDFFF)
403 /*Avoid characters that equals zero */
407 a_out[out_index] = c;
411 *a_out_len = out_index + 1;
412 *a_in_len = in_index + 1;
418 *Reads a character from an utf8 buffer.
419 *Actually decode the next character code (unicode character code)
421 *@param a_in the starting address of the utf8 buffer.
422 *@param a_in_len the length of the utf8 buffer.
423 *@param a_out output parameter. The resulting read char.
424 *@param a_consumed the number of the bytes consumed to
425 *decode the returned character code.
426 *@return CR_OK upon successfull completion, an error code otherwise.
429 cr_utils_read_char_from_utf8_buf (const guchar * a_in,
431 guint32 * a_out, gulong * a_consumed)
435 nb_bytes_2_decode = 0;
436 enum CRStatus status = CR_OK;
439 *to store the final decoded
444 g_return_val_if_fail (a_in && a_out && a_out
445 && a_consumed, CR_BAD_PARAM_ERROR);
457 *encoded over 1 byte:
461 nb_bytes_2_decode = 1;
463 } else if ((*a_in & 0xE0) == 0xC0) {
465 *up to 11 bits long char.
466 *encoded over 2 bytes:
470 nb_bytes_2_decode = 2;
472 } else if ((*a_in & 0xF0) == 0xE0) {
474 *up to 16 bit long char
475 *encoded over 3 bytes:
476 *1110 xxxx 10xx xxxx 10xx xxxx
479 nb_bytes_2_decode = 3;
481 } else if ((*a_in & 0xF8) == 0xF0) {
483 *up to 21 bits long char
484 *encoded over 4 bytes:
485 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
488 nb_bytes_2_decode = 4;
490 } else if ((*a_in & 0xFC) == 0xF8) {
492 *up to 26 bits long char
493 *encoded over 5 bytes.
494 *1111 10xx 10xx xxxx 10xx xxxx
498 nb_bytes_2_decode = 5;
500 } else if ((*a_in & 0xFE) == 0xFC) {
502 *up to 31 bits long char
503 *encoded over 6 bytes:
504 *1111 110x 10xx xxxx 10xx xxxx
505 *10xx xxxx 10xx xxxx 10xx xxxx
508 nb_bytes_2_decode = 6;
515 if (nb_bytes_2_decode > a_in_len) {
516 status = CR_END_OF_INPUT_ERROR;
521 *Go and decode the remaining byte(s)
522 *(if any) to get the current character.
524 for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
525 /*byte pattern must be: 10xx xxxx */
526 if ((a_in[in_index] & 0xC0) != 0x80) {
530 c = (c << 6) | (a_in[in_index] & 0x3F);
534 *The decoded ucs4 char is now
538 /************************
540 ***********************/
542 /*be sure c is a char */
543 if (c == 0xFFFF || c == 0xFFFE)
546 /*be sure c is inferior to the max ucs4 char value */
551 *c must be less than UTF16 "lower surrogate begin"
552 *or higher than UTF16 "High surrogate end"
554 if (c >= 0xD800 && c <= 0xDFFF)
557 /*Avoid characters that equals zero */
564 *a_consumed = nb_bytes_2_decode;
573 cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
574 const guchar * a_in_end, gulong * a_len)
577 *Note: this function can be made shorter
578 *but it considers all the cases of the utf8 encoding
579 *to ease further extensions ...
582 guchar *byte_ptr = NULL;
586 *to store the final decoded
591 g_return_val_if_fail (a_in_start && a_in_end && a_len,
595 for (byte_ptr = (guchar *) a_in_start;
596 byte_ptr <= a_in_end; byte_ptr++) {
597 gint nb_bytes_2_decode = 0;
599 if (*byte_ptr <= 0x7F) {
602 *encoded over 1 byte:
606 nb_bytes_2_decode = 1;
608 } else if ((*byte_ptr & 0xE0) == 0xC0) {
610 *up to 11 bits long char.
611 *encoded over 2 bytes:
614 c = *byte_ptr & 0x1F;
615 nb_bytes_2_decode = 2;
617 } else if ((*byte_ptr & 0xF0) == 0xE0) {
619 *up to 16 bit long char
620 *encoded over 3 bytes:
621 *1110 xxxx 10xx xxxx 10xx xxxx
623 c = *byte_ptr & 0x0F;
624 nb_bytes_2_decode = 3;
626 } else if ((*byte_ptr & 0xF8) == 0xF0) {
628 *up to 21 bits long char
629 *encoded over 4 bytes:
630 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
633 nb_bytes_2_decode = 4;
635 } else if ((*byte_ptr & 0xFC) == 0xF8) {
637 *up to 26 bits long char
638 *encoded over 5 bytes.
639 *1111 10xx 10xx xxxx 10xx xxxx
643 nb_bytes_2_decode = 5;
645 } else if ((*byte_ptr & 0xFE) == 0xFC) {
647 *up to 31 bits long char
648 *encoded over 6 bytes:
649 *1111 110x 10xx xxxx 10xx xxxx
650 *10xx xxxx 10xx xxxx 10xx xxxx
653 nb_bytes_2_decode = 6;
659 return CR_ENCODING_ERROR;
663 *Go and decode the remaining byte(s)
664 *(if any) to get the current character.
666 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
667 /*decode the next byte */
670 /*byte pattern must be: 10xx xxxx */
671 if ((*byte_ptr & 0xC0) != 0x80) {
672 return CR_ENCODING_ERROR;
675 c = (c << 6) | (*byte_ptr & 0x3F);
679 *The decoded ucs4 char is now
683 if (c <= 0xFF) { /*Add other conditions to support
684 *other char sets (ucs2, ucs3, ucs4).
688 /*the char is too long to fit
689 *into the supposed charset len.
691 return CR_ENCODING_ERROR;
701 *Converts an utf8 string into an ucs4 string.
702 *@param a_in the input string to convert.
703 *@param a_in_len in/out parameter. The length of the input
704 *string. After return, points to the actual number of bytes
705 *consumed. This can be usefull to debug the input stream in case
707 *@param a_out out parameter. Points to the output string. It is allocated
708 *by this function and must be freed by the caller.
709 *@param a_out_len out parameter. The length of the output string.
710 *@return CR_OK upon successfull completion, an error code otherwise.
714 cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
716 guint32 ** a_out, gulong * a_out_len)
718 enum CRStatus status = CR_OK;
720 g_return_val_if_fail (a_in && a_in_len
721 && a_out && a_out_len, CR_BAD_PARAM_ERROR);
723 status = cr_utils_utf8_str_len_as_ucs4 (a_in,
724 &a_in[*a_in_len - 1],
727 g_return_val_if_fail (status == CR_OK, status);
729 *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
731 status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
737 *Converts an ucs4 buffer into an utf8 buffer.
739 *@param a_in the input ucs4 buffer to convert.
740 *@param a_in_len in/out parameter. The size of the
741 *input buffer to convert. After return, this parameter contains
742 *the actual number of characters consumed.
743 *@param a_out the output converted utf8 buffer. Must be allocated by
745 *@param a_out_len in/out parameter. The size of the output buffer.
746 *If this size is actually smaller than the real needed size, the function
747 *just converts what it can and returns a success status. After return,
748 *this param points to the actual number of bytes in the buffer.
749 *@return CR_OK upon successfull completion, an error code otherwise.
752 cr_utils_ucs4_to_utf8 (const guint32 * a_in,
753 gulong * a_in_len, guchar * a_out, gulong * a_out_len)
758 enum CRStatus status = CR_OK;
760 g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
770 for (in_index = 0; in_index < in_len; in_index++) {
772 *FIXME: return whenever we encounter forbidden char values.
775 if (a_in[in_index] <= 0x7F) {
776 a_out[out_index] = a_in[in_index];
778 } else if (a_in[in_index] <= 0x7FF) {
779 a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
780 a_out[out_index + 1] =
781 (0x80 | (a_in[in_index] & 0x3F));
783 } else if (a_in[in_index] <= 0xFFFF) {
784 a_out[out_index] = (0xE0 | (a_in[in_index] >> 12));
785 a_out[out_index + 1] =
786 (0x80 | ((a_in[in_index] >> 6) & 0x3F));
787 a_out[out_index + 2] =
788 (0x80 | (a_in[in_index] & 0x3F));
790 } else if (a_in[in_index] <= 0x1FFFFF) {
791 a_out[out_index] = (0xF0 | (a_in[in_index] >> 18));
793 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
795 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
797 = (0x80 | (a_in[in_index] & 0x3F));
799 } else if (a_in[in_index] <= 0x3FFFFFF) {
800 a_out[out_index] = (0xF8 | (a_in[in_index] >> 24));
801 a_out[out_index + 1] =
802 (0x80 | (a_in[in_index] >> 18));
804 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
806 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
808 = (0x80 | (a_in[in_index] & 0x3F));
810 } else if (a_in[in_index] <= 0x7FFFFFFF) {
811 a_out[out_index] = (0xFC | (a_in[in_index] >> 30));
812 a_out[out_index + 1] =
813 (0x80 | (a_in[in_index] >> 24));
815 = (0x80 | ((a_in[in_index] >> 18) & 0x3F));
817 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
819 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
821 = (0x80 | (a_in[in_index] & 0x3F));
824 status = CR_ENCODING_ERROR;
830 *a_in_len = in_index + 1;
831 *a_out_len = out_index + 1;
837 *Converts an ucs4 string into an utf8 string.
838 *@param a_in the input string to convert.
839 *@param a_in_len in/out parameter. The length of the input
840 *string. After return, points to the actual number of characters
841 *consumed. This can be usefull to debug the input string in case
843 *@param a_out out parameter. Points to the output string. It is allocated
844 *by this function and must be freed by the caller.
845 *@param a_out_len out parameter. The length (in bytes) of the output string.
846 *@return CR_OK upon successfull completion, an error code otherwise.
849 cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
851 guchar ** a_out, gulong * a_out_len)
853 enum CRStatus status = CR_OK;
855 g_return_val_if_fail (a_in && a_in_len && a_out
856 && a_out_len, CR_BAD_PARAM_ERROR);
858 status = cr_utils_ucs4_str_len_as_utf8 (a_in,
859 &a_in[*a_out_len - 1],
862 g_return_val_if_fail (status == CR_OK, status);
864 status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
870 *Converts an ucs1 buffer into an utf8 buffer.
871 *The caller must know the size of the resulting buffer and
872 *allocate it prior to calling this function.
874 *@param a_in the input ucs1 buffer.
876 *@param a_in_len in/out parameter. The length of the input buffer.
877 *After return, points to the number of bytes actually consumed even
878 *in case of encoding error.
880 *@param a_out out parameter. The output utf8 converted buffer.
882 *@param a_out_len in/out parameter. The size of the output buffer.
883 *If the output buffer size is shorter than the actual needed size,
884 *this function just convert what it can.
886 *@return CR_OK upon successfull completion, an error code otherwise.
890 cr_utils_ucs1_to_utf8 (const guchar * a_in,
891 gulong * a_in_len, guchar * a_out, gulong * a_out_len)
893 gulong out_index = 0,
897 enum CRStatus status = CR_OK;
899 g_return_val_if_fail (a_in && a_in_len
903 if (*a_in_len == 0) {
907 g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;
915 out_len = *a_out_len;
917 for (in_index = 0, out_index = 0;
918 (in_index < in_len) && (out_index < out_len); in_index++) {
920 *FIXME: return whenever we encounter forbidden char values.
923 if (a_in[in_index] <= 0x7F) {
924 a_out[out_index] = a_in[in_index];
927 a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
928 a_out[out_index + 1] =
929 (0x80 | (a_in[in_index] & 0x3F));
935 *a_in_len = in_index;
936 *a_out_len = out_index;
942 *Converts an ucs1 string into an utf8 string.
943 *@param a_in_start the beginning of the input string to convert.
944 *@param a_in_end the end of the input string to convert.
945 *@param a_out out parameter. The converted string.
946 *@param a_out out parameter. The length of the converted string.
947 *@return CR_OK upon successfull completion, an error code otherwise.
951 cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
953 guchar ** a_out, gulong * a_out_len)
957 enum CRStatus status = CR_OK;
959 g_return_val_if_fail (a_in && a_in_len && a_out
960 && a_out_len, CR_BAD_PARAM_ERROR);
968 status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
971 g_return_val_if_fail (status == CR_OK, status);
975 *a_out = g_malloc0 (out_len);
977 status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len);
979 *a_out_len = out_len;
985 *Converts an utf8 buffer into an ucs1 buffer.
986 *The caller must know the size of the resulting
987 *converted buffer, and allocated it prior to calling this
990 *@param a_in the input utf8 buffer to convert.
992 *@param a_in_len in/out parameter. The size of the input utf8 buffer.
993 *After return, points to the number of bytes consumed
994 *by the function even in case of encoding error.
996 *@param a_out out parameter. Points to the resulting buffer.
997 *Must be allocated by the caller. If the size of a_out is shorter
998 *than its required size, this function converts what it can and return
999 *a successfull status.
1001 *@param a_out_len in/out parameter. The size of the output buffer.
1002 *After return, points to the number of bytes consumed even in case of
1005 *@return CR_OK upon successfull completion, an error code otherwise.
1008 cr_utils_utf8_to_ucs1 (const guchar * a_in,
1009 gulong * a_in_len, guchar * a_out, gulong * a_out_len)
1011 gulong in_index = 0,
1015 enum CRStatus status = CR_OK;
1018 *to store the final decoded
1023 g_return_val_if_fail (a_in && a_in_len
1024 && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1026 if (*a_in_len < 1) {
1032 out_len = *a_out_len;
1034 for (in_index = 0, out_index = 0;
1035 (in_index < in_len) && (out_index < out_len);
1036 in_index++, out_index++) {
1037 gint nb_bytes_2_decode = 0;
1039 if (a_in[in_index] <= 0x7F) {
1042 *encoded over 1 byte:
1046 nb_bytes_2_decode = 1;
1048 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
1050 *up to 11 bits long char.
1051 *encoded over 2 bytes:
1052 *110x xxxx 10xx xxxx
1054 c = a_in[in_index] & 0x1F;
1055 nb_bytes_2_decode = 2;
1057 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
1059 *up to 16 bit long char
1060 *encoded over 3 bytes:
1061 *1110 xxxx 10xx xxxx 10xx xxxx
1063 c = a_in[in_index] & 0x0F;
1064 nb_bytes_2_decode = 3;
1066 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
1068 *up to 21 bits long char
1069 *encoded over 4 bytes:
1070 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
1072 c = a_in[in_index] & 0x7;
1073 nb_bytes_2_decode = 4;
1075 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
1077 *up to 26 bits long char
1078 *encoded over 5 bytes.
1079 *1111 10xx 10xx xxxx 10xx xxxx
1080 *10xx xxxx 10xx xxxx
1082 c = a_in[in_index] & 3;
1083 nb_bytes_2_decode = 5;
1085 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
1087 *up to 31 bits long char
1088 *encoded over 6 bytes:
1089 *1111 110x 10xx xxxx 10xx xxxx
1090 *10xx xxxx 10xx xxxx 10xx xxxx
1092 c = a_in[in_index] & 1;
1093 nb_bytes_2_decode = 6;
1097 status = CR_ENCODING_ERROR;
1102 *Go and decode the remaining byte(s)
1103 *(if any) to get the current character.
1105 if (in_index + nb_bytes_2_decode - 1 >= in_len) {
1110 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
1111 /*decode the next byte */
1114 /*byte pattern must be: 10xx xxxx */
1115 if ((a_in[in_index] & 0xC0) != 0x80) {
1116 status = CR_ENCODING_ERROR;
1120 c = (c << 6) | (a_in[in_index] & 0x3F);
1124 *The decoded ucs4 char is now
1129 status = CR_ENCODING_ERROR;
1133 a_out[out_index] = c;
1137 *a_out_len = out_index;
1138 *a_in_len = in_index;
1144 *Converts an utf8 buffer into an
1146 *@param a_in_start the start of the input buffer.
1147 *@param a_in_end the end of the input buffer.
1148 *@param a_out out parameter. The resulting converted ucs4 buffer.
1149 *Must be freed by the caller.
1150 *@param a_out_len out parameter. The length of the converted buffer.
1151 *@return CR_OK upon successfull completion, an error code otherwise.
1152 *Note that out parameters are valid if and only if this function
1156 cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
1158 guchar ** a_out, gulong * a_out_len)
1160 enum CRStatus status = CR_OK;
1162 g_return_val_if_fail (a_in && a_in_len
1163 && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1165 if (*a_in_len < 1) {
1171 status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
1174 g_return_val_if_fail (status == CR_OK, status);
1176 *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
1178 status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
1182 /*****************************************
1183 *CSS basic types identification utilities
1184 *****************************************/
1187 *Returns TRUE if a_char is a white space as
1188 *defined in the css spec in chap 4.1.1.
1190 *white-space ::= ' '| \t|\r|\n|\f
1192 *@param a_char the character to test.
1193 *return TRUE if is a white space, false otherwise.
1196 cr_utils_is_white_space (guint32 a_char)
1212 *Returns true if the character is a newline
1213 *as defined in the css spec in the chap 4.1.1.
1215 *nl ::= \n|\r\n|\r|\f
1217 *@param a_char the character to test.
1218 *@return TRUE if the character is a newline, FALSE otherwise.
1221 cr_utils_is_newline (guint32 a_char)
1235 *returns TRUE if the char is part of an hexa num char:
1236 *i.e hexa_char ::= [0-9A-F]
1239 cr_utils_is_hexa_char (guint32 a_char)
1241 if ((a_char >= '0' && a_char <= '9')
1242 || (a_char >= 'A' && a_char <= 'F')) {
1249 *Returns true if the character is a nonascii
1250 *character (as defined in the css spec chap 4.1.1):
1252 *nonascii ::= [^\0-\177]
1254 *@param a_char the character to test.
1255 *@return TRUE if the character is a nonascii char,
1259 cr_utils_is_nonascii (guint32 a_char)
1261 if (a_char <= 177) {
1269 *Dumps a character a_nb times on a file.
1270 *@param a_char the char to dump
1271 *@param a_fp the destination file pointer
1272 *@param a_nb the number of times a_char is to be dumped.
1275 cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
1279 for (i = 0; i < a_nb; i++) {
1280 fprintf (a_fp, "%c", a_char);
1285 cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
1289 g_return_if_fail (a_string);
1291 for (i = 0; i < a_nb; i++) {
1292 g_string_append_printf (a_string, "%c", a_char);
1297 cr_utils_n_to_0_dot_n (glong a_n, glong decimal_places)
1299 gdouble result = a_n;
1301 while (decimal_places > 0) {
1302 result = result / 10;
1310 *Duplicates a list of GString instances.
1311 *@return the duplicated list of GString instances or NULL if
1312 *something bad happened.
1313 *@param a_list_of_strings the list of strings to be duplicated.
1316 cr_utils_dup_glist_of_string (GList * a_list_of_strings)
1321 g_return_val_if_fail (a_list_of_strings, NULL);
1323 for (cur = a_list_of_strings; cur; cur = cur->next) {
1324 GString *str = NULL;
1326 str = g_string_new_len (((GString *) cur->data)->str,
1327 ((GString *) cur->data)->len);
1329 result = g_list_append (result, str);
1336 *Duplicate a GList where the GList::data is a CRString.
1337 *@param a_list_of_strings the list to duplicate
1338 *@return the duplicated list, or NULL if something bad
1342 cr_utils_dup_glist_of_cr_string (GList * a_list_of_strings)
1344 GList *cur = NULL, *result = NULL;
1346 g_return_val_if_fail (a_list_of_strings, NULL);
1348 for (cur = a_list_of_strings; cur; cur = cur->next) {
1349 CRString *str = NULL;
1351 str = cr_string_dup ((CRString *) cur->data) ;
1353 result = g_list_append (result, str);