gnulib-local/lib/libcroco/cr-utils.c

   1 /* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
   2
   3 /*
   4  * This file is part of The Croco Library
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of version 2.1 of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  18  * USA
  19  *
  20  * Author: Dodji Seketeli
  21  * See COPYRIGHTS file for copyright information.
  22  */
  23
  24 #include <config.h>
  25 #include "cr-utils.h"
  26 #include "cr-string.h"
  27
  28 /**
  29  *@file:
  30  *Some misc utility functions used
  31  *in the libcroco.
   32  *Note that throughout this file I will
   33  *refer to the CSS SPECIFICATIONS DOCUMENTATION
   34  *written by the w3c guys. You can find that document
   35  *at http://www.w3.org/TR/REC-CSS2/ .
  36  */
  37
  38 /****************************
  39  *Encoding transformations and
  40  *encoding helpers
  41  ****************************/
  42
  43 /*
   44  *Here is the correspondence between the ucs-4 character codes
   45  *and their matching utf-8 encoding patterns as described by RFC 2279:
  46  *
  47  *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
  48  *------------------    -----------------------------
  49  *0000 0000-0000 007F   0xxxxxxx
  50  *0000 0080-0000 07FF   110xxxxx 10xxxxxx
  51  *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
  52  *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  53  *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  54  *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
  55  */
  56
  57 /**
  58  *Given an utf8 string buffer, calculates
  59  *the length of this string if it was encoded
  60  *in ucs4.
  61  *@param a_in_start a pointer to the begining of
  62  *the input utf8 string.
  63  *@param a_in_end a pointre to the end of the input
  64  *utf8 string (points to the last byte of the buffer)
  65  *@param a_len out parameter the calculated length.
  66  *@return CR_OK upon succesfull completion, an error code
  67  *otherwise.
  68  */
  69 enum CRStatus
  70 cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
  71                                const guchar * a_in_end, gulong * a_len)
  72 {
  73         guchar *byte_ptr = NULL;
  74         gint len = 0;
  75
  76         /*
  77          *to store the final decoded
  78          *unicode char
  79          */
  80         guint c = 0;
  81
  82         g_return_val_if_fail (a_in_start && a_in_end && a_len,
  83                               CR_BAD_PARAM_ERROR);
  84         *a_len = 0;
  85
  86         for (byte_ptr = (guchar *) a_in_start;
  87              byte_ptr <= a_in_end; byte_ptr++) {
  88                 gint nb_bytes_2_decode = 0;
  89
  90                 if (*byte_ptr <= 0x7F) {
  91                         /*
  92                          *7 bits long char
  93                          *encoded over 1 byte:
  94                          * 0xxx xxxx
  95                          */
  96                         c = *byte_ptr;
  97                         nb_bytes_2_decode = 1;
  98
  99                 } else if ((*byte_ptr & 0xE0) == 0xC0) {
 100                         /*
 101                          *up to 11 bits long char.
 102                          *encoded over 2 bytes:
 103                          *110x xxxx  10xx xxxx
 104                          */
 105                         c = *byte_ptr & 0x1F;
 106                         nb_bytes_2_decode = 2;
 107
 108                 } else if ((*byte_ptr & 0xF0) == 0xE0) {
 109                         /*
 110                          *up to 16 bit long char
 111                          *encoded over 3 bytes:
 112                          *1110 xxxx  10xx xxxx  10xx xxxx
 113                          */
 114                         c = *byte_ptr & 0x0F;
 115                         nb_bytes_2_decode = 3;
 116
 117                 } else if ((*byte_ptr & 0xF8) == 0xF0) {
 118                         /*
 119                          *up to 21 bits long char
 120                          *encoded over 4 bytes:
 121                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
 122                          */
 123                         c = *byte_ptr & 0x7;
 124                         nb_bytes_2_decode = 4;
 125
 126                 } else if ((*byte_ptr & 0xFC) == 0xF8) {
 127                         /*
 128                          *up to 26 bits long char
 129                          *encoded over 5 bytes.
 130                          *1111 10xx  10xx xxxx  10xx xxxx
 131                          *10xx xxxx  10xx xxxx
 132                          */
 133                         c = *byte_ptr & 3;
 134                         nb_bytes_2_decode = 5;
 135
 136                 } else if ((*byte_ptr & 0xFE) == 0xFC) {
 137                         /*
 138                          *up to 31 bits long char
 139                          *encoded over 6 bytes:
 140                          *1111 110x  10xx xxxx  10xx xxxx
 141                          *10xx xxxx  10xx xxxx  10xx xxxx
 142                          */
 143                         c = *byte_ptr & 1;
 144                         nb_bytes_2_decode = 6;
 145
 146                 } else {
 147                         /*
 148                          *BAD ENCODING
 149                          */
 150                         return CR_ENCODING_ERROR;
 151                 }
 152
 153                 /*
 154                  *Go and decode the remaining byte(s)
 155                  *(if any) to get the current character.
 156                  */
 157                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
 158                         /*decode the next byte */
 159                         byte_ptr++;
 160
 161                         /*byte pattern must be: 10xx xxxx */
 162                         if ((*byte_ptr & 0xC0) != 0x80) {
 163                                 return CR_ENCODING_ERROR;
 164                         }
 165
 166                         c = (c << 6) | (*byte_ptr & 0x3F);
 167                 }
 168
 169                 len++;
 170         }
 171
 172         *a_len = len;
 173
 174         return CR_OK;
 175 }
 176
 177 /**
 178  *Given an ucs4 string, this function
 179  *returns the size (in bytes) this string
 180  *would have occupied if it was encoded in utf-8.
 181  *@param a_in_start a pointer to the beginning of the input
 182  *buffer.
 183  *@param a_in_end a pointer to the end of the input buffer.
 184  *@param a_len out parameter. The computed length.
 185  *@return CR_OK upon successfull completion, an error code otherwise.
 186  */
 187 enum CRStatus
 188 cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
 189                                const guint32 * a_in_end, gulong * a_len)
 190 {
 191         gint len = 0;
 192         guint32 *char_ptr = NULL;
 193
 194         g_return_val_if_fail (a_in_start && a_in_end && a_len,
 195                               CR_BAD_PARAM_ERROR);
 196
 197         for (char_ptr = (guint32 *) a_in_start;
 198              char_ptr <= a_in_end; char_ptr++) {
 199                 if (*char_ptr <= 0x7F) {
 200                         /*the utf-8 char would take 1 byte */
 201                         len += 1;
 202                 } else if (*char_ptr <= 0x7FF) {
 203                         /*the utf-8 char would take 2 bytes */
 204                         len += 2;
 205                 } else if (*char_ptr <= 0xFFFF) {
 206                         len += 3;
 207                 } else if (*char_ptr <= 0x1FFFFF) {
 208                         len += 4;
 209                 } else if (*char_ptr <= 0x3FFFFFF) {
 210                         len += 5;
 211                 } else if (*char_ptr <= 0x7FFFFFFF) {
 212                         len += 6;
 213                 }
 214         }
 215
 216         *a_len = len;
 217         return CR_OK;
 218 }
 219
 220 /**
 221  *Given an ucsA string, this function
 222  *returns the size (in bytes) this string
 223  *would have occupied if it was encoded in utf-8.
 224  *@param a_in_start a pointer to the beginning of the input
 225  *buffer.
 226  *@param a_in_end a pointer to the end of the input buffer.
 227  *@param a_len out parameter. The computed length.
 228  *@return CR_OK upon successfull completion, an error code otherwise.
 229  */
 230 enum CRStatus
 231 cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
 232                                const guchar * a_in_end, gulong * a_len)
 233 {
 234         gint len = 0;
 235         guchar *char_ptr = NULL;
 236
 237         g_return_val_if_fail (a_in_start && a_in_end && a_len,
 238                               CR_BAD_PARAM_ERROR);
 239
 240         for (char_ptr = (guchar *) a_in_start;
 241              char_ptr <= a_in_end; char_ptr++) {
 242                 if (*char_ptr <= 0x7F) {
 243                         /*the utf-8 char would take 1 byte */
 244                         len += 1;
 245                 } else {
 246                         /*the utf-8 char would take 2 bytes */
 247                         len += 2;
 248                 }
 249         }
 250
 251         *a_len = len;
 252         return CR_OK;
 253 }
 254
 255 /**
 256  *Converts an utf8 buffer into an ucs4 buffer.
 257  *
 258  *@param a_in the input utf8 buffer to convert.
 259  *@param a_in_len in/out parameter. The size of the
 260  *input buffer to convert. After return, this parameter contains
 261  *the actual number of bytes consumed.
 262  *@param a_out the output converted ucs4 buffer. Must be allocated by
 263  *the caller.
 264  *@param a_out_len in/out parameter. The size of the output buffer.
 265  *If this size is actually smaller than the real needed size, the function
 266  *just converts what it can and returns a success status. After return,
 267  *this param points to the actual number of characters decoded.
 268  *@return CR_OK upon successfull completion, an error code otherwise.
 269  */
 270 enum CRStatus
 271 cr_utils_utf8_to_ucs4 (const guchar * a_in,
 272                        gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
 273 {
 274         gulong in_len = 0,
 275                 out_len = 0,
 276                 in_index = 0,
 277                 out_index = 0;
 278         enum CRStatus status = CR_OK;
 279
 280         /*
 281          *to store the final decoded
 282          *unicode char
 283          */
 284         guint c = 0;
 285
 286         g_return_val_if_fail (a_in && a_in_len
 287                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
 288
 289         if (*a_in_len < 1) {
 290                 status = CR_OK;
 291                 goto end;
 292         }
 293
 294         in_len = *a_in_len;
 295         out_len = *a_out_len;
 296
 297         for (in_index = 0, out_index = 0;
 298              (in_index < in_len) && (out_index < out_len);
 299              in_index++, out_index++) {
 300                 gint nb_bytes_2_decode = 0;
 301
 302                 if (a_in[in_index] <= 0x7F) {
 303                         /*
 304                          *7 bits long char
 305                          *encoded over 1 byte:
 306                          * 0xxx xxxx
 307                          */
 308                         c = a_in[in_index];
 309                         nb_bytes_2_decode = 1;
 310
 311                 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
 312                         /*
 313                          *up to 11 bits long char.
 314                          *encoded over 2 bytes:
 315                          *110x xxxx  10xx xxxx
 316                          */
 317                         c = a_in[in_index] & 0x1F;
 318                         nb_bytes_2_decode = 2;
 319
 320                 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
 321                         /*
 322                          *up to 16 bit long char
 323                          *encoded over 3 bytes:
 324                          *1110 xxxx  10xx xxxx  10xx xxxx
 325                          */
 326                         c = a_in[in_index] & 0x0F;
 327                         nb_bytes_2_decode = 3;
 328
 329                 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
 330                         /*
 331                          *up to 21 bits long char
 332                          *encoded over 4 bytes:
 333                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
 334                          */
 335                         c = a_in[in_index] & 0x7;
 336                         nb_bytes_2_decode = 4;
 337
 338                 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
 339                         /*
 340                          *up to 26 bits long char
 341                          *encoded over 5 bytes.
 342                          *1111 10xx  10xx xxxx  10xx xxxx
 343                          *10xx xxxx  10xx xxxx
 344                          */
 345                         c = a_in[in_index] & 3;
 346                         nb_bytes_2_decode = 5;
 347
 348                 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
 349                         /*
 350                          *up to 31 bits long char
 351                          *encoded over 6 bytes:
 352                          *1111 110x  10xx xxxx  10xx xxxx
 353                          *10xx xxxx  10xx xxxx  10xx xxxx
 354                          */
 355                         c = a_in[in_index] & 1;
 356                         nb_bytes_2_decode = 6;
 357
 358                 } else {
 359                         /*BAD ENCODING */
 360                         goto end;
 361                 }
 362
 363                 /*
 364                  *Go and decode the remaining byte(s)
 365                  *(if any) to get the current character.
 366                  */
 367                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
 368                         /*decode the next byte */
 369                         in_index++;
 370
 371                         /*byte pattern must be: 10xx xxxx */
 372                         if ((a_in[in_index] & 0xC0) != 0x80) {
 373                                 goto end;
 374                         }
 375
 376                         c = (c << 6) | (a_in[in_index] & 0x3F);
 377                 }
 378
 379                 /*
 380                  *The decoded ucs4 char is now
 381                  *in c.
 382                  */
 383
 384                 /************************
 385                  *Some security tests
 386                  ***********************/
 387
 388                 /*be sure c is a char */
 389                 if (c == 0xFFFF || c == 0xFFFE)
 390                         goto end;
 391
 392                 /*be sure c is inferior to the max ucs4 char value */
 393                 if (c > 0x10FFFF)
 394                         goto end;
 395
 396                 /*
 397                  *c must be less than UTF16 "lower surrogate begin"
 398                  *or higher than UTF16 "High surrogate end"
 399                  */
 400                 if (c >= 0xD800 && c <= 0xDFFF)
 401                         goto end;
 402
 403                 /*Avoid characters that equals zero */
 404                 if (c == 0)
 405                         goto end;
 406
 407                 a_out[out_index] = c;
 408         }
 409
 410       end:
 411         *a_out_len = out_index + 1;
 412         *a_in_len = in_index + 1;
 413
 414         return status;
 415 }
 416
 417 /**
 418  *Reads a character from an utf8 buffer.
 419  *Actually decode the next character code (unicode character code)
 420  *and returns it.
 421  *@param a_in the starting address of the utf8 buffer.
 422  *@param a_in_len the length of the utf8 buffer.
 423  *@param a_out output parameter. The resulting read char.
 424  *@param a_consumed the number of the bytes consumed to
 425  *decode the returned character code.
 426  *@return CR_OK upon successfull completion, an error code otherwise.
 427  */
 428 enum CRStatus
 429 cr_utils_read_char_from_utf8_buf (const guchar * a_in,
 430                                   gulong a_in_len,
 431                                   guint32 * a_out, gulong * a_consumed)
 432 {
 433         gulong in_len = 0,
 434                 in_index = 0,
 435                 nb_bytes_2_decode = 0;
 436         enum CRStatus status = CR_OK;
 437
 438         /*
 439          *to store the final decoded
 440          *unicode char
 441          */
 442         guint32 c = 0;
 443
 444         g_return_val_if_fail (a_in && a_out && a_out
 445                               && a_consumed, CR_BAD_PARAM_ERROR);
 446
 447         if (a_in_len < 1) {
 448                 status = CR_OK;
 449                 goto end;
 450         }
 451
 452         in_len = a_in_len;
 453
 454         if (*a_in <= 0x7F) {
 455                 /*
 456                  *7 bits long char
 457                  *encoded over 1 byte:
 458                  * 0xxx xxxx
 459                  */
 460                 c = *a_in;
 461                 nb_bytes_2_decode = 1;
 462
 463         } else if ((*a_in & 0xE0) == 0xC0) {
 464                 /*
 465                  *up to 11 bits long char.
 466                  *encoded over 2 bytes:
 467                  *110x xxxx  10xx xxxx
 468                  */
 469                 c = *a_in & 0x1F;
 470                 nb_bytes_2_decode = 2;
 471
 472         } else if ((*a_in & 0xF0) == 0xE0) {
 473                 /*
 474                  *up to 16 bit long char
 475                  *encoded over 3 bytes:
 476                  *1110 xxxx  10xx xxxx  10xx xxxx
 477                  */
 478                 c = *a_in & 0x0F;
 479                 nb_bytes_2_decode = 3;
 480
 481         } else if ((*a_in & 0xF8) == 0xF0) {
 482                 /*
 483                  *up to 21 bits long char
 484                  *encoded over 4 bytes:
 485                  *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
 486                  */
 487                 c = *a_in & 0x7;
 488                 nb_bytes_2_decode = 4;
 489
 490         } else if ((*a_in & 0xFC) == 0xF8) {
 491                 /*
 492                  *up to 26 bits long char
 493                  *encoded over 5 bytes.
 494                  *1111 10xx  10xx xxxx  10xx xxxx
 495                  *10xx xxxx  10xx xxxx
 496                  */
 497                 c = *a_in & 3;
 498                 nb_bytes_2_decode = 5;
 499
 500         } else if ((*a_in & 0xFE) == 0xFC) {
 501                 /*
 502                  *up to 31 bits long char
 503                  *encoded over 6 bytes:
 504                  *1111 110x  10xx xxxx  10xx xxxx
 505                  *10xx xxxx  10xx xxxx  10xx xxxx
 506                  */
 507                 c = *a_in & 1;
 508                 nb_bytes_2_decode = 6;
 509
 510         } else {
 511                 /*BAD ENCODING */
 512                 goto end;
 513         }
 514
 515         if (nb_bytes_2_decode > a_in_len) {
 516                 status = CR_END_OF_INPUT_ERROR;
 517                 goto end;
 518         }
 519
 520         /*
 521          *Go and decode the remaining byte(s)
 522          *(if any) to get the current character.
 523          */
 524         for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
 525                 /*byte pattern must be: 10xx xxxx */
 526                 if ((a_in[in_index] & 0xC0) != 0x80) {
 527                         goto end;
 528                 }
 529
 530                 c = (c << 6) | (a_in[in_index] & 0x3F);
 531         }
 532
 533         /*
 534          *The decoded ucs4 char is now
 535          *in c.
 536          */
 537
 538     /************************
 539      *Some security tests
 540      ***********************/
 541
 542         /*be sure c is a char */
 543         if (c == 0xFFFF || c == 0xFFFE)
 544                 goto end;
 545
 546         /*be sure c is inferior to the max ucs4 char value */
 547         if (c > 0x10FFFF)
 548                 goto end;
 549
 550         /*
 551          *c must be less than UTF16 "lower surrogate begin"
 552          *or higher than UTF16 "High surrogate end"
 553          */
 554         if (c >= 0xD800 && c <= 0xDFFF)
 555                 goto end;
 556
 557         /*Avoid characters that equals zero */
 558         if (c == 0)
 559                 goto end;
 560
 561         *a_out = c;
 562
 563       end:
 564         *a_consumed = nb_bytes_2_decode;
 565
 566         return status;
 567 }
 568
 569 /**
 570  *
 571  */
 572 enum CRStatus
 573 cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
 574                                const guchar * a_in_end, gulong * a_len)
 575 {
 576         /*
 577          *Note: this function can be made shorter
 578          *but it considers all the cases of the utf8 encoding
 579          *to ease further extensions ...
 580          */
 581
 582         guchar *byte_ptr = NULL;
 583         gint len = 0;
 584
 585         /*
 586          *to store the final decoded
 587          *unicode char
 588          */
 589         guint c = 0;
 590
 591         g_return_val_if_fail (a_in_start && a_in_end && a_len,
 592                               CR_BAD_PARAM_ERROR);
 593         *a_len = 0;
 594
 595         for (byte_ptr = (guchar *) a_in_start;
 596              byte_ptr <= a_in_end; byte_ptr++) {
 597                 gint nb_bytes_2_decode = 0;
 598
 599                 if (*byte_ptr <= 0x7F) {
 600                         /*
 601                          *7 bits long char
 602                          *encoded over 1 byte:
 603                          * 0xxx xxxx
 604                          */
 605                         c = *byte_ptr;
 606                         nb_bytes_2_decode = 1;
 607
 608                 } else if ((*byte_ptr & 0xE0) == 0xC0) {
 609                         /*
 610                          *up to 11 bits long char.
 611                          *encoded over 2 bytes:
 612                          *110x xxxx  10xx xxxx
 613                          */
 614                         c = *byte_ptr & 0x1F;
 615                         nb_bytes_2_decode = 2;
 616
 617                 } else if ((*byte_ptr & 0xF0) == 0xE0) {
 618                         /*
 619                          *up to 16 bit long char
 620                          *encoded over 3 bytes:
 621                          *1110 xxxx  10xx xxxx  10xx xxxx
 622                          */
 623                         c = *byte_ptr & 0x0F;
 624                         nb_bytes_2_decode = 3;
 625
 626                 } else if ((*byte_ptr & 0xF8) == 0xF0) {
 627                         /*
 628                          *up to 21 bits long char
 629                          *encoded over 4 bytes:
 630                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
 631                          */
 632                         c = *byte_ptr & 0x7;
 633                         nb_bytes_2_decode = 4;
 634
 635                 } else if ((*byte_ptr & 0xFC) == 0xF8) {
 636                         /*
 637                          *up to 26 bits long char
 638                          *encoded over 5 bytes.
 639                          *1111 10xx  10xx xxxx  10xx xxxx
 640                          *10xx xxxx  10xx xxxx
 641                          */
 642                         c = *byte_ptr & 3;
 643                         nb_bytes_2_decode = 5;
 644
 645                 } else if ((*byte_ptr & 0xFE) == 0xFC) {
 646                         /*
 647                          *up to 31 bits long char
 648                          *encoded over 6 bytes:
 649                          *1111 110x  10xx xxxx  10xx xxxx
 650                          *10xx xxxx  10xx xxxx  10xx xxxx
 651                          */
 652                         c = *byte_ptr & 1;
 653                         nb_bytes_2_decode = 6;
 654
 655                 } else {
 656                         /*
 657                          *BAD ENCODING
 658                          */
 659                         return CR_ENCODING_ERROR;
 660                 }
 661
 662                 /*
 663                  *Go and decode the remaining byte(s)
 664                  *(if any) to get the current character.
 665                  */
 666                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
 667                         /*decode the next byte */
 668                         byte_ptr++;
 669
 670                         /*byte pattern must be: 10xx xxxx */
 671                         if ((*byte_ptr & 0xC0) != 0x80) {
 672                                 return CR_ENCODING_ERROR;
 673                         }
 674
 675                         c = (c << 6) | (*byte_ptr & 0x3F);
 676                 }
 677
 678                 /*
 679                  *The decoded ucs4 char is now
 680                  *in c.
 681                  */
 682
 683                 if (c <= 0xFF) { /*Add other conditions to support
 684                                   *other char sets (ucs2, ucs3, ucs4).
 685                                   */
 686                         len++;
 687                 } else {
 688                         /*the char is too long to fit
 689                          *into the supposed charset len.
 690                          */
 691                         return CR_ENCODING_ERROR;
 692                 }
 693         }
 694
 695         *a_len = len;
 696
 697         return CR_OK;
 698 }
 699
 700 /**
 701  *Converts an utf8 string into an ucs4 string.
 702  *@param a_in the input string to convert.
 703  *@param a_in_len in/out parameter. The length of the input
 704  *string. After return, points to the actual number of bytes
 705  *consumed. This can be usefull to debug the input stream in case
 706  *of encoding error.
 707  *@param a_out out parameter. Points to the output string. It is allocated
 708  *by this function and must be freed by the caller.
 709  *@param a_out_len out parameter. The length of the output string.
 710  *@return CR_OK upon successfull completion, an error code otherwise.
 711  *
 712  */
 713 enum CRStatus
 714 cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
 715                            gulong * a_in_len,
 716                            guint32 ** a_out, gulong * a_out_len)
 717 {
 718         enum CRStatus status = CR_OK;
 719
 720         g_return_val_if_fail (a_in && a_in_len
 721                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
 722
 723         status = cr_utils_utf8_str_len_as_ucs4 (a_in,
 724                                                 &a_in[*a_in_len - 1],
 725                                                 a_out_len);
 726
 727         g_return_val_if_fail (status == CR_OK, status);
 728
 729         *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
 730
 731         status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
 732
 733         return status;
 734 }
 735
 736 /**
 737  *Converts an ucs4 buffer into an utf8 buffer.
 738  *
 739  *@param a_in the input ucs4 buffer to convert.
 740  *@param a_in_len in/out parameter. The size of the
 741  *input buffer to convert. After return, this parameter contains
 742  *the actual number of characters consumed.
 743  *@param a_out the output converted utf8 buffer. Must be allocated by
 744  *the caller.
 745  *@param a_out_len in/out parameter. The size of the output buffer.
 746  *If this size is actually smaller than the real needed size, the function
 747  *just converts what it can and returns a success status. After return,
 748  *this param points to the actual number of bytes in the buffer.
 749  *@return CR_OK upon successfull completion, an error code otherwise.
 750  */
 751 enum CRStatus
 752 cr_utils_ucs4_to_utf8 (const guint32 * a_in,
 753                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
 754 {
 755         gulong in_len = 0,
 756                 in_index = 0,
 757                 out_index = 0;
 758         enum CRStatus status = CR_OK;
 759
 760         g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
 761                               CR_BAD_PARAM_ERROR);
 762
 763         if (*a_in_len < 1) {
 764                 status = CR_OK;
 765                 goto end;
 766         }
 767
 768         in_len = *a_in_len;
 769
 770         for (in_index = 0; in_index < in_len; in_index++) {
 771                 /*
 772                  *FIXME: return whenever we encounter forbidden char values.
 773                  */
 774
 775                 if (a_in[in_index] <= 0x7F) {
 776                         a_out[out_index] = a_in[in_index];
 777                         out_index++;
 778                 } else if (a_in[in_index] <= 0x7FF) {
 779                         a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
 780                         a_out[out_index + 1] =
 781                                 (0x80 | (a_in[in_index] & 0x3F));
 782                         out_index += 2;
 783                 } else if (a_in[in_index] <= 0xFFFF) {
 784                         a_out[out_index] = (0xE0 | (a_in[in_index] >> 12));
 785                         a_out[out_index + 1] =
 786                                 (0x80 | ((a_in[in_index] >> 6) & 0x3F));
 787                         a_out[out_index + 2] =
 788                                 (0x80 | (a_in[in_index] & 0x3F));
 789                         out_index += 3;
 790                 } else if (a_in[in_index] <= 0x1FFFFF) {
 791                         a_out[out_index] = (0xF0 | (a_in[in_index] >> 18));
 792                         a_out[out_index + 1]
 793                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
 794                         a_out[out_index + 2]
 795                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
 796                         a_out[out_index + 3]
 797                                 = (0x80 | (a_in[in_index] & 0x3F));
 798                         out_index += 4;
 799                 } else if (a_in[in_index] <= 0x3FFFFFF) {
 800                         a_out[out_index] = (0xF8 | (a_in[in_index] >> 24));
 801                         a_out[out_index + 1] =
 802                                 (0x80 | (a_in[in_index] >> 18));
 803                         a_out[out_index + 2]
 804                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
 805                         a_out[out_index + 3]
 806                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
 807                         a_out[out_index + 4]
 808                                 = (0x80 | (a_in[in_index] & 0x3F));
 809                         out_index += 5;
 810                 } else if (a_in[in_index] <= 0x7FFFFFFF) {
 811                         a_out[out_index] = (0xFC | (a_in[in_index] >> 30));
 812                         a_out[out_index + 1] =
 813                                 (0x80 | (a_in[in_index] >> 24));
 814                         a_out[out_index + 2]
 815                                 = (0x80 | ((a_in[in_index] >> 18) & 0x3F));
 816                         a_out[out_index + 3]
 817                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
 818                         a_out[out_index + 4]
 819                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
 820                         a_out[out_index + 4]
 821                                 = (0x80 | (a_in[in_index] & 0x3F));
 822                         out_index += 6;
 823                 } else {
 824                         status = CR_ENCODING_ERROR;
 825                         goto end;
 826                 }
 827         }                       /*end for */
 828
 829       end:
 830         *a_in_len = in_index + 1;
 831         *a_out_len = out_index + 1;
 832
 833         return status;
 834 }
 835
 836 /**
 837  *Converts an ucs4 string into an utf8 string.
 838  *@param a_in the input string to convert.
 839  *@param a_in_len in/out parameter. The length of the input
 840  *string. After return, points to the actual number of characters
 841  *consumed. This can be usefull to debug the input string in case
 842  *of encoding error.
 843  *@param a_out out parameter. Points to the output string. It is allocated
 844  *by this function and must be freed by the caller.
 845  *@param a_out_len out parameter. The length (in bytes) of the output string.
 846  *@return CR_OK upon successfull completion, an error code otherwise.
 847  */
 848 enum CRStatus
 849 cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
 850                            gulong * a_in_len,
 851                            guchar ** a_out, gulong * a_out_len)
 852 {
 853         enum CRStatus status = CR_OK;
 854
 855         g_return_val_if_fail (a_in && a_in_len && a_out
 856                               && a_out_len, CR_BAD_PARAM_ERROR);
 857
 858         status = cr_utils_ucs4_str_len_as_utf8 (a_in,
 859                                                 &a_in[*a_out_len - 1],
 860                                                 a_out_len);
 861
 862         g_return_val_if_fail (status == CR_OK, status);
 863
 864         status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
 865
 866         return status;
 867 }
 868
 869 /**
 870  *Converts an ucs1 buffer into an utf8 buffer.
 871  *The caller must know the size of the resulting buffer and
 872  *allocate it prior to calling this function.
 873  *
 874  *@param a_in the input ucs1 buffer.
 875  *
 876  *@param a_in_len in/out parameter. The length of the input buffer.
 877  *After return, points to the number of bytes actually consumed even
 878  *in case of encoding error.
 879  *
 880  *@param a_out out parameter. The output utf8 converted buffer.
 881  *
 882  *@param a_out_len in/out parameter. The size of the output buffer.
 883  *If the output buffer size is shorter than the actual needed size,
 884  *this function just convert what it can.
 885  *
 886  *@return CR_OK upon successfull completion, an error code otherwise.
 887  *
 888  */
 889 enum CRStatus
 890 cr_utils_ucs1_to_utf8 (const guchar * a_in,
 891                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
 892 {
 893         gulong out_index = 0,
 894                 in_index = 0,
 895                 in_len = 0,
 896                 out_len = 0;
 897         enum CRStatus status = CR_OK;
 898
 899         g_return_val_if_fail (a_in && a_in_len
 900                               && a_out_len,
 901                               CR_BAD_PARAM_ERROR);
 902
 903         if (*a_in_len == 0) {
 904                 *a_out_len = 0 ;
 905                 return CR_OK ;
 906         }
 907         g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;
 908
 909         if (*a_in_len < 1) {
 910                 status = CR_OK;
 911                 goto end;
 912         }
 913
 914         in_len = *a_in_len;
 915         out_len = *a_out_len;
 916
 917         for (in_index = 0, out_index = 0;
 918              (in_index < in_len) && (out_index < out_len); in_index++) {
 919                 /*
 920                  *FIXME: return whenever we encounter forbidden char values.
 921                  */
 922
 923                 if (a_in[in_index] <= 0x7F) {
 924                         a_out[out_index] = a_in[in_index];
 925                         out_index++;
 926                 } else {
 927                         a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
 928                         a_out[out_index + 1] =
 929                                 (0x80 | (a_in[in_index] & 0x3F));
 930                         out_index += 2;
 931                 }
 932         }                       /*end for */
 933
 934       end:
 935         *a_in_len = in_index;
 936         *a_out_len = out_index;
 937
 938         return CR_OK;
 939 }
 940
 941 /**
 942  *Converts an ucs1 string into an utf8 string.
 943  *@param a_in_start the beginning of the input string to convert.
 944  *@param a_in_end the end of the input string to convert.
 945  *@param a_out out parameter. The converted string.
 946  *@param a_out out parameter. The length of the converted string.
 947  *@return CR_OK upon successfull completion, an error code otherwise.
 948  *
 949  */
 950 enum CRStatus
 951 cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
 952                            gulong * a_in_len,
 953                            guchar ** a_out, gulong * a_out_len)
 954 {
 955         gulong in_len = 0,
 956                 out_len = 0;
 957         enum CRStatus status = CR_OK;
 958
 959         g_return_val_if_fail (a_in && a_in_len && a_out
 960                               && a_out_len, CR_BAD_PARAM_ERROR);
 961
 962         if (*a_in_len < 1) {
 963                 *a_out_len = 0;
 964                 *a_out = NULL;
 965                 return CR_OK;
 966         }
 967
 968         status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
 969                                                 &out_len);
 970
 971         g_return_val_if_fail (status == CR_OK, status);
 972
 973         in_len = *a_in_len;
 974
 975         *a_out = g_malloc0 (out_len);
 976
 977         status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len);
 978
 979         *a_out_len = out_len;
 980
 981         return status;
 982 }
 983
 984 /**
 985  *Converts an utf8 buffer into an ucs1 buffer.
 986  *The caller must know the size of the resulting
 987  *converted buffer, and allocated it prior to calling this
 988  *function.
 989  *
 990  *@param a_in the input utf8 buffer to convert.
 991  *
 992  *@param a_in_len in/out parameter. The size of the input utf8 buffer.
 993  *After return, points to the number of bytes consumed
 994  *by the function even in case of encoding error.
 995  *
 996  *@param a_out out parameter. Points to the resulting buffer.
 997  *Must be allocated by the caller. If the size of a_out is shorter
 998  *than its required size, this function converts what it can and return
 999  *a successfull status.
1000  *
1001  *@param a_out_len in/out parameter. The size of the output buffer.
1002  *After return, points to the number of bytes consumed even in case of
1003  *encoding error.
1004  *
1005  *@return CR_OK upon successfull completion, an error code otherwise.
1006  */
1007 enum CRStatus
1008 cr_utils_utf8_to_ucs1 (const guchar * a_in,
1009                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
1010 {
1011         gulong in_index = 0,
1012                 out_index = 0,
1013                 in_len = 0,
1014                 out_len = 0;
1015         enum CRStatus status = CR_OK;
1016
1017         /*
1018          *to store the final decoded
1019          *unicode char
1020          */
1021         guint32 c = 0;
1022
1023         g_return_val_if_fail (a_in && a_in_len
1024                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1025
1026         if (*a_in_len < 1) {
1027                 status = CR_OK;
1028                 goto end;
1029         }
1030
1031         in_len = *a_in_len;
1032         out_len = *a_out_len;
1033
1034         for (in_index = 0, out_index = 0;
1035              (in_index < in_len) && (out_index < out_len);
1036              in_index++, out_index++) {
1037                 gint nb_bytes_2_decode = 0;
1038
1039                 if (a_in[in_index] <= 0x7F) {
1040                         /*
1041                          *7 bits long char
1042                          *encoded over 1 byte:
1043                          * 0xxx xxxx
1044                          */
1045                         c = a_in[in_index];
1046                         nb_bytes_2_decode = 1;
1047
1048                 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
1049                         /*
1050                          *up to 11 bits long char.
1051                          *encoded over 2 bytes:
1052                          *110x xxxx  10xx xxxx
1053                          */
1054                         c = a_in[in_index] & 0x1F;
1055                         nb_bytes_2_decode = 2;
1056
1057                 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
1058                         /*
1059                          *up to 16 bit long char
1060                          *encoded over 3 bytes:
1061                          *1110 xxxx  10xx xxxx  10xx xxxx
1062                          */
1063                         c = a_in[in_index] & 0x0F;
1064                         nb_bytes_2_decode = 3;
1065
1066                 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
1067                         /*
1068                          *up to 21 bits long char
1069                          *encoded over 4 bytes:
1070                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
1071                          */
1072                         c = a_in[in_index] & 0x7;
1073                         nb_bytes_2_decode = 4;
1074
1075                 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
1076                         /*
1077                          *up to 26 bits long char
1078                          *encoded over 5 bytes.
1079                          *1111 10xx  10xx xxxx  10xx xxxx
1080                          *10xx xxxx  10xx xxxx
1081                          */
1082                         c = a_in[in_index] & 3;
1083                         nb_bytes_2_decode = 5;
1084
1085                 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
1086                         /*
1087                          *up to 31 bits long char
1088                          *encoded over 6 bytes:
1089                          *1111 110x  10xx xxxx  10xx xxxx
1090                          *10xx xxxx  10xx xxxx  10xx xxxx
1091                          */
1092                         c = a_in[in_index] & 1;
1093                         nb_bytes_2_decode = 6;
1094
1095                 } else {
1096                         /*BAD ENCODING */
1097                         status = CR_ENCODING_ERROR;
1098                         goto end;
1099                 }
1100
1101                 /*
1102                  *Go and decode the remaining byte(s)
1103                  *(if any) to get the current character.
1104                  */
1105                 if (in_index + nb_bytes_2_decode - 1 >= in_len) {
1106                         status = CR_OK;
1107                         goto end;
1108                 }
1109
1110                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
1111                         /*decode the next byte */
1112                         in_index++;
1113
1114                         /*byte pattern must be: 10xx xxxx */
1115                         if ((a_in[in_index] & 0xC0) != 0x80) {
1116                                 status = CR_ENCODING_ERROR;
1117                                 goto end;
1118                         }
1119
1120                         c = (c << 6) | (a_in[in_index] & 0x3F);
1121                 }
1122
1123                 /*
1124                  *The decoded ucs4 char is now
1125                  *in c.
1126                  */
1127
1128                 if (c > 0xFF) {
1129                         status = CR_ENCODING_ERROR;
1130                         goto end;
1131                 }
1132
1133                 a_out[out_index] = c;
1134         }
1135
1136       end:
1137         *a_out_len = out_index;
1138         *a_in_len = in_index;
1139
1140         return CR_OK;
1141 }
1142
1143 /**
1144  *Converts an utf8 buffer into an
1145  *ucs1 buffer.
1146  *@param a_in_start the start of the input buffer.
1147  *@param a_in_end the end of the input buffer.
1148  *@param a_out out parameter. The resulting converted ucs4 buffer.
1149  *Must be freed by the caller.
1150  *@param a_out_len out parameter. The length of the converted buffer.
1151  *@return CR_OK upon successfull completion, an error code otherwise.
1152  *Note that out parameters are valid if and only if this function
1153  *returns CR_OK.
1154  */
1155 enum CRStatus
1156 cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
1157                            gulong * a_in_len,
1158                            guchar ** a_out, gulong * a_out_len)
1159 {
1160         enum CRStatus status = CR_OK;
1161
1162         g_return_val_if_fail (a_in && a_in_len
1163                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1164
1165         if (*a_in_len < 1) {
1166                 *a_out_len = 0;
1167                 *a_out = NULL;
1168                 return CR_OK;
1169         }
1170
1171         status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
1172                                                 a_out_len);
1173
1174         g_return_val_if_fail (status == CR_OK, status);
1175
1176         *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
1177
1178         status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
1179         return status;
1180 }
1181
1182 /*****************************************
1183  *CSS basic types identification utilities
1184  *****************************************/
1185
1186 /**
1187  *Returns TRUE if a_char is a white space as
1188  *defined in the css spec in chap 4.1.1.
1189  *
1190  *white-space ::= ' '| \t|\r|\n|\f
1191  *
1192  *@param a_char the character to test.
1193  *return TRUE if is a white space, false otherwise.
1194  */
1195 gboolean
1196 cr_utils_is_white_space (guint32 a_char)
1197 {
1198         switch (a_char) {
1199         case ' ':
1200         case '\t':
1201         case '\r':
1202         case '\n':
1203         case '\f':
1204                 return TRUE;
1205                 break;
1206         default:
1207                 return FALSE;
1208         }
1209 }
1210
1211 /**
1212  *Returns true if the character is a newline
1213  *as defined in the css spec in the chap 4.1.1.
1214  *
1215  *nl ::= \n|\r\n|\r|\f
1216  *
1217  *@param a_char the character to test.
1218  *@return TRUE if the character is a newline, FALSE otherwise.
1219  */
1220 gboolean
1221 cr_utils_is_newline (guint32 a_char)
1222 {
1223         switch (a_char) {
1224         case '\n':
1225         case '\r':
1226         case '\f':
1227                 return TRUE;
1228                 break;
1229         default:
1230                 return FALSE;
1231         }
1232 }
1233
1234 /**
1235  *returns TRUE if the char is part of an hexa num char:
1236  *i.e hexa_char ::= [0-9A-F]
1237  */
1238 gboolean
1239 cr_utils_is_hexa_char (guint32 a_char)
1240 {
1241         if ((a_char >= '0' && a_char <= '9')
1242             || (a_char >= 'A' && a_char <= 'F')) {
1243                 return TRUE;
1244         }
1245         return FALSE;
1246 }
1247
1248 /**
1249  *Returns true if the character is a nonascii
1250  *character (as defined in the css spec chap 4.1.1):
1251  *
1252  *nonascii ::= [^\0-\177]
1253  *
1254  *@param a_char the character to test.
1255  *@return TRUE if the character is a nonascii char,
1256  *FALSE otherwise.
1257  */
1258 gboolean
1259 cr_utils_is_nonascii (guint32 a_char)
1260 {
1261         if (a_char <= 177) {
1262                 return FALSE;
1263         }
1264
1265         return TRUE;
1266 }
1267
1268 /**
1269  *Dumps a character a_nb times on a file.
1270  *@param a_char the char to dump
1271  *@param a_fp the destination file pointer
1272  *@param a_nb the number of times a_char is to be dumped.
1273  */
1274 void
1275 cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
1276 {
1277         glong i = 0;
1278
1279         for (i = 0; i < a_nb; i++) {
1280                 fprintf (a_fp, "%c", a_char);
1281         }
1282 }
1283
1284 void
1285 cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
1286 {
1287         glong i = 0;
1288
1289         g_return_if_fail (a_string);
1290
1291         for (i = 0; i < a_nb; i++) {
1292                 g_string_append_printf (a_string, "%c", a_char);
1293         }
1294 }
1295
1296 gdouble
1297 cr_utils_n_to_0_dot_n (glong a_n, glong decimal_places)
1298 {
1299         gdouble result = a_n;
1300
1301         while (decimal_places > 0) {
1302                 result = result / 10;
1303                 decimal_places--;
1304         }
1305
1306         return result;
1307 }
1308
1309 /**
1310  *Duplicates a list of GString instances.
1311  *@return the duplicated list of GString instances or NULL if
1312  *something bad happened.
1313  *@param a_list_of_strings the list of strings to be duplicated.
1314  */
1315 GList *
1316 cr_utils_dup_glist_of_string (GList * a_list_of_strings)
1317 {
1318         GList *cur = NULL,
1319                 *result = NULL;
1320
1321         g_return_val_if_fail (a_list_of_strings, NULL);
1322
1323         for (cur = a_list_of_strings; cur; cur = cur->next) {
1324                 GString *str = NULL;
1325
1326                 str = g_string_new_len (((GString *) cur->data)->str,
1327                                         ((GString *) cur->data)->len);
1328                 if (str)
1329                         result = g_list_append (result, str);
1330         }
1331
1332         return result;
1333 }
1334
1335 /**
1336  *Duplicate a GList where the GList::data is a CRString.
1337  *@param a_list_of_strings the list to duplicate
1338  *@return the duplicated list, or NULL if something bad
1339  *happened.
1340  */
1341 GList *
1342 cr_utils_dup_glist_of_cr_string (GList * a_list_of_strings)
1343 {
1344         GList *cur = NULL, *result = NULL;
1345
1346         g_return_val_if_fail (a_list_of_strings, NULL);
1347
1348         for (cur = a_list_of_strings; cur; cur = cur->next) {
1349                 CRString *str = NULL;
1350
1351                 str = cr_string_dup ((CRString *) cur->data) ;
1352                 if (str)
1353                         result = g_list_append (result, str);
1354         }
1355
1356         return result;
1357 }