docs/reference/glib/tmpl/unicode.sgml

   1 <!-- ##### SECTION Title ##### -->
   2 Unicode Manipulation
   3
   4 <!-- ##### SECTION Short_Description ##### -->
   5 functions operating on Unicode characters and UTF-8 strings.
   6
   7 <!-- ##### SECTION Long_Description ##### -->
   8 <para>
   9 This section describes a number of functions for dealing with
  10 Unicode characters and strings.  There are analogues of the
  11 traditional <filename>ctype.h</filename> character classification
  12 and case conversion functions, UTF-8 analogues of some string utility
  13 functions, functions to perform normalization, case conversion and
  14 collation on UTF-8 strings and finally functions to convert between
  15 the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  16 </para>
  17
  18 <para>
  19 The implementations of the Unicode functions in GLib are based
  20 on the Unicode Character Data tables, which are available from
  21 <ulink url="http://www.unicode.org">www.unicode.org</ulink>.
  22 GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1, and
  23 GLib 2.12 supports Unicode 5.0.
  24 </para>
  25
  26 <!-- ##### SECTION See_Also ##### -->
  27 <para>
  28 <variablelist>
  29
  30 <varlistentry>
  31 <term>g_locale_to_utf8(), g_locale_from_utf8()</term>
  32 <listitem><para>
  33 Convenience functions for converting between UTF-8 and the locale encoding.
  34 </para></listitem>
  35 </varlistentry>
  36
  37 </variablelist>
  38 </para>
  39
  40 <!-- ##### SECTION Stability_Level ##### -->
  41
  42
  43 <!-- ##### TYPEDEF gunichar ##### -->
  44 <para>
  45 A type which can hold any UCS-4 character code.
  46 </para>
  47
  48
  49 <!-- ##### TYPEDEF gunichar2 ##### -->
  50 <para>
  51 A type which can hold any UTF-16 code
  52 point<footnote id="utf16_surrogate_pairs">UTF-16 also has so-called
  53 <firstterm>surrogate pairs</firstterm> to encode characters beyond the
  54 BMP as pairs of 16-bit numbers. Surrogate pairs cannot be stored in a
  55 single gunichar2 field, but all GLib functions accepting gunichar2 arrays
  56 will correctly interpret surrogate pairs.</footnote>.
  57 </para>
  58
  59
  60 <!-- ##### FUNCTION g_unichar_validate ##### -->
  61 <para>
  62
  63 </para>
  64
  65 @ch:
  66 @Returns:
  67
  68
  69 <!-- ##### FUNCTION g_unichar_isalnum ##### -->
  70 <para>
  71
  72 </para>
  73
  74 @c:
  75 @Returns:
  76
  77
  78 <!-- ##### FUNCTION g_unichar_isalpha ##### -->
  79 <para>
  80
  81 </para>
  82
  83 @c:
  84 @Returns:
  85
  86
  87 <!-- ##### FUNCTION g_unichar_iscntrl ##### -->
  88 <para>
  89
  90 </para>
  91
  92 @c:
  93 @Returns:
  94
  95
  96 <!-- ##### FUNCTION g_unichar_isdigit ##### -->
  97 <para>
  98
  99 </para>
 100
 101 @c:
 102 @Returns:
 103
 104
 105 <!-- ##### FUNCTION g_unichar_isgraph ##### -->
 106 <para>
 107
 108 </para>
 109
 110 @c:
 111 @Returns:
 112
 113
 114 <!-- ##### FUNCTION g_unichar_islower ##### -->
 115 <para>
 116
 117 </para>
 118
 119 @c:
 120 @Returns:
 121
 122
 123 <!-- ##### FUNCTION g_unichar_isprint ##### -->
 124 <para>
 125
 126 </para>
 127
 128 @c:
 129 @Returns:
 130
 131
 132 <!-- ##### FUNCTION g_unichar_ispunct ##### -->
 133 <para>
 134
 135 </para>
 136
 137 @c:
 138 @Returns:
 139
 140
 141 <!-- ##### FUNCTION g_unichar_isspace ##### -->
 142 <para>
 143
 144 </para>
 145
 146 @c:
 147 @Returns:
 148
 149
 150 <!-- ##### FUNCTION g_unichar_isupper ##### -->
 151 <para>
 152
 153 </para>
 154
 155 @c:
 156 @Returns:
 157
 158
 159 <!-- ##### FUNCTION g_unichar_isxdigit ##### -->
 160 <para>
 161
 162 </para>
 163
 164 @c:
 165 @Returns:
 166
 167
 168 <!-- ##### FUNCTION g_unichar_istitle ##### -->
 169 <para>
 170
 171 </para>
 172
 173 @c:
 174 @Returns:
 175
 176
 177 <!-- ##### FUNCTION g_unichar_isdefined ##### -->
 178 <para>
 179
 180 </para>
 181
 182 @c:
 183 @Returns:
 184
 185
 186 <!-- ##### FUNCTION g_unichar_iswide ##### -->
 187 <para>
 188
 189 </para>
 190
 191 @c:
 192 @Returns:
 193
 194
 195 <!-- ##### FUNCTION g_unichar_toupper ##### -->
 196 <para>
 197
 198 </para>
 199
 200 @c:
 201 @Returns:
 202
 203
 204 <!-- ##### FUNCTION g_unichar_tolower ##### -->
 205 <para>
 206
 207 </para>
 208
 209 @c:
 210 @Returns:
 211
 212
 213 <!-- ##### FUNCTION g_unichar_totitle ##### -->
 214 <para>
 215
 216 </para>
 217
 218 @c:
 219 @Returns:
 220
 221
 222 <!-- ##### FUNCTION g_unichar_digit_value ##### -->
 223 <para>
 224
 225 </para>
 226
 227 @c:
 228 @Returns:
 229
 230
 231 <!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
 232 <para>
 233
 234 </para>
 235
 236 @c:
 237 @Returns:
 238
 239
 240 <!-- ##### ENUM GUnicodeType ##### -->
 241 <para>
 242 These are the possible character classifications.
 243 See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
 244 >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
 245 </para>
 246
 247 @G_UNICODE_CONTROL:
 248 @G_UNICODE_FORMAT:
 249 @G_UNICODE_UNASSIGNED:
 250 @G_UNICODE_PRIVATE_USE:
 251 @G_UNICODE_SURROGATE:
 252 @G_UNICODE_LOWERCASE_LETTER:
 253 @G_UNICODE_MODIFIER_LETTER:
 254 @G_UNICODE_OTHER_LETTER:
 255 @G_UNICODE_TITLECASE_LETTER:
 256 @G_UNICODE_UPPERCASE_LETTER:
 257 @G_UNICODE_COMBINING_MARK:
 258 @G_UNICODE_ENCLOSING_MARK:
 259 @G_UNICODE_NON_SPACING_MARK:
 260 @G_UNICODE_DECIMAL_NUMBER:
 261 @G_UNICODE_LETTER_NUMBER:
 262 @G_UNICODE_OTHER_NUMBER:
 263 @G_UNICODE_CONNECT_PUNCTUATION:
 264 @G_UNICODE_DASH_PUNCTUATION:
 265 @G_UNICODE_CLOSE_PUNCTUATION:
 266 @G_UNICODE_FINAL_PUNCTUATION:
 267 @G_UNICODE_INITIAL_PUNCTUATION:
 268 @G_UNICODE_OTHER_PUNCTUATION:
 269 @G_UNICODE_OPEN_PUNCTUATION:
 270 @G_UNICODE_CURRENCY_SYMBOL:
 271 @G_UNICODE_MODIFIER_SYMBOL:
 272 @G_UNICODE_MATH_SYMBOL:
 273 @G_UNICODE_OTHER_SYMBOL:
 274 @G_UNICODE_LINE_SEPARATOR:
 275 @G_UNICODE_PARAGRAPH_SEPARATOR:
 276 @G_UNICODE_SPACE_SEPARATOR:
 277
 278 <!-- ##### FUNCTION g_unichar_type ##### -->
 279 <para>
 280
 281 </para>
 282
 283 @c:
 284 @Returns:
 285
 286
 287 <!-- ##### ENUM GUnicodeBreakType ##### -->
 288 <para>
 289 These are the possible line break classifications.
 290 The five Hangul types were added in Unicode 4.1, and were therefore
 291 introduced in GLib 2.10.  Note that new types may be added in the future.
 292 Applications should be ready to handle unknown values.
 293 They may be regarded as %G_UNICODE_BREAK_UNKNOWN.
 294 See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
 295 >http://www.unicode.org/unicode/reports/tr14/</ulink>.
 296
 297 </para>
 298
 299 @G_UNICODE_BREAK_MANDATORY:
 300 @G_UNICODE_BREAK_CARRIAGE_RETURN:
 301 @G_UNICODE_BREAK_LINE_FEED:
 302 @G_UNICODE_BREAK_COMBINING_MARK:
 303 @G_UNICODE_BREAK_SURROGATE:
 304 @G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
 305 @G_UNICODE_BREAK_INSEPARABLE:
 306 @G_UNICODE_BREAK_NON_BREAKING_GLUE:
 307 @G_UNICODE_BREAK_CONTINGENT:
 308 @G_UNICODE_BREAK_SPACE:
 309 @G_UNICODE_BREAK_AFTER:
 310 @G_UNICODE_BREAK_BEFORE:
 311 @G_UNICODE_BREAK_BEFORE_AND_AFTER:
 312 @G_UNICODE_BREAK_HYPHEN:
 313 @G_UNICODE_BREAK_NON_STARTER:
 314 @G_UNICODE_BREAK_OPEN_PUNCTUATION:
 315 @G_UNICODE_BREAK_CLOSE_PUNCTUATION:
 316 @G_UNICODE_BREAK_QUOTATION:
 317 @G_UNICODE_BREAK_EXCLAMATION:
 318 @G_UNICODE_BREAK_IDEOGRAPHIC:
 319 @G_UNICODE_BREAK_NUMERIC:
 320 @G_UNICODE_BREAK_INFIX_SEPARATOR:
 321 @G_UNICODE_BREAK_SYMBOL:
 322 @G_UNICODE_BREAK_ALPHABETIC:
 323 @G_UNICODE_BREAK_PREFIX:
 324 @G_UNICODE_BREAK_POSTFIX:
 325 @G_UNICODE_BREAK_COMPLEX_CONTEXT:
 326 @G_UNICODE_BREAK_AMBIGUOUS:
 327 @G_UNICODE_BREAK_UNKNOWN:
 328 @G_UNICODE_BREAK_NEXT_LINE:
 329 @G_UNICODE_BREAK_WORD_JOINER:
 330 @G_UNICODE_BREAK_HANGUL_L_JAMO:
 331 @G_UNICODE_BREAK_HANGUL_V_JAMO:
 332 @G_UNICODE_BREAK_HANGUL_T_JAMO:
 333 @G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
 334 @G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
 335
 336 <!-- ##### FUNCTION g_unichar_break_type ##### -->
 337 <para>
 338
 339 </para>
 340
 341 @c:
 342 @Returns:
 343
 344
 345 <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
 346 <para>
 347
 348 </para>
 349
 350 @string:
 351 @len:
 352
 353
 354 <!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
 355 <para>
 356
 357 </para>
 358
 359 @ch:
 360 @result_len:
 361 @Returns:
 362
 363
 364 <!-- ##### FUNCTION g_unichar_get_mirror_char ##### -->
 365 <para>
 366
 367 </para>
 368
 369 @ch:
 370 @mirrored_ch:
 371 @Returns:
 372
 373
 374 <!-- ##### MACRO g_utf8_next_char ##### -->
 375 <para>
 376 Skips to the next character in a UTF-8 string. The string must be
 377 valid; this macro is as fast as possible, and has no error-checking.
 378 You would use this macro to iterate over a string character by
 379 character. The macro returns the start of the next UTF-8 character.
 380 Before using this macro, use g_utf8_validate() to validate strings
 381 that may contain invalid UTF-8.
 382 </para>
 383
 384 @p: Pointer to the start of a valid UTF-8 character.
 385
 386
 387 <!-- ##### FUNCTION g_utf8_get_char ##### -->
 388 <para>
 389
 390 </para>
 391
 392 @p:
 393 @Returns:
 394
 395
 396 <!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
 397 <para>
 398
 399 </para>
 400
 401 @p:
 402 @max_len:
 403 @Returns:
 404
 405
 406 <!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
 407 <para>
 408
 409 </para>
 410
 411 @str:
 412 @offset:
 413 @Returns:
 414
 415
 416 <!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
 417 <para>
 418
 419 </para>
 420
 421 @str:
 422 @pos:
 423 @Returns:
 424
 425
 426 <!-- ##### FUNCTION g_utf8_prev_char ##### -->
 427 <para>
 428
 429 </para>
 430
 431 @p:
 432 @Returns:
 433
 434
 435 <!-- ##### FUNCTION g_utf8_find_next_char ##### -->
 436 <para>
 437
 438 </para>
 439
 440 @p:
 441 @end:
 442 @Returns:
 443 <!-- # Unused Parameters # -->
 444 @bound:
 445
 446
 447 <!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
 448 <para>
 449
 450 </para>
 451
 452 @str:
 453 @p:
 454 @Returns:
 455
 456
 457 <!-- ##### FUNCTION g_utf8_strlen ##### -->
 458 <para>
 459
 460 </para>
 461
 462 @p:
 463 @max:
 464 @Returns:
 465
 466
 467 <!-- ##### FUNCTION g_utf8_strncpy ##### -->
 468 <para>
 469
 470 </para>
 471
 472 @dest:
 473 @src:
 474 @n:
 475 @Returns:
 476
 477
 478 <!-- ##### FUNCTION g_utf8_strchr ##### -->
 479 <para>
 480
 481 </para>
 482
 483 @p:
 484 @len:
 485 @c:
 486 @Returns:
 487 <!-- # Unused Parameters # -->
 488 @ch:
 489
 490
 491 <!-- ##### FUNCTION g_utf8_strrchr ##### -->
 492 <para>
 493
 494 </para>
 495
 496 @p:
 497 @len:
 498 @c:
 499 @Returns:
 500 <!-- # Unused Parameters # -->
 501 @ch:
 502
 503
 504 <!-- ##### FUNCTION g_utf8_strreverse ##### -->
 505 <para>
 506
 507 </para>
 508
 509 @str:
 510 @len:
 511 @Returns:
 512
 513
 514 <!-- ##### FUNCTION g_utf8_validate ##### -->
 515 <para>
 516
 517 </para>
 518
 519 @str:
 520 @max_len:
 521 @end:
 522 @Returns:
 523
 524
 525 <!-- ##### FUNCTION g_utf8_strup ##### -->
 526 <para>
 527
 528 </para>
 529
 530 @str:
 531 @len:
 532 @Returns:
 533
 534
 535 <!-- ##### FUNCTION g_utf8_strdown ##### -->
 536 <para>
 537
 538 </para>
 539
 540 @str:
 541 @len:
 542 @Returns:
 543
 544
 545 <!-- ##### FUNCTION g_utf8_casefold ##### -->
 546 <para>
 547
 548 </para>
 549
 550 @str:
 551 @len:
 552 @Returns:
 553
 554
 555 <!-- ##### FUNCTION g_utf8_normalize ##### -->
 556 <para>
 557
 558 </para>
 559
 560 @str:
 561 @len:
 562 @mode:
 563 @Returns:
 564
 565
 566 <!-- ##### ENUM GNormalizeMode ##### -->
 567 <para>
 568 Defines how a Unicode string is transformed into a canonical
 569 form, standardizing such issues as whether a character with an accent is
 570 represented as a base character and combining accent or as a single precomposed
 571 character. Unicode strings should generally be normalized before comparing them.
 572 </para>
 573
 574 @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
 575   text content, such as the above-mentioned accent representation.
 576 @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
 577 @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
 578   forms rather than a maximally decomposed form.
 579 @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
 580 @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT, also standardize the
 581   "compatibility" characters in Unicode, such as SUPERSCRIPT THREE, to their
 582   standard forms (in this case DIGIT THREE). Formatting information may be
 583   lost, but for most text operations such characters should be considered the
 584   same.
 585 @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
 586 @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
 587   forms rather than a maximally decomposed form.
 588 @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
 589
 590 <!-- ##### FUNCTION g_utf8_collate ##### -->
 591 <para>
 592
 593 </para>
 594
 595 @str1:
 596 @str2:
 597 @Returns:
 598
 599
 600 <!-- ##### FUNCTION g_utf8_collate_key ##### -->
 601 <para>
 602
 603 </para>
 604
 605 @str:
 606 @len:
 607 @Returns:
 608
 609
 610 <!-- ##### FUNCTION g_utf8_collate_key_for_filename ##### -->
 611 <para>
 612
 613 </para>
 614
 615 @str:
 616 @len:
 617 @Returns:
 618
 619
 620 <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
 621 <para>
 622
 623 </para>
 624
 625 @str:
 626 @len:
 627 @items_read:
 628 @items_written:
 629 @error:
 630 @Returns:
 631
 632
 633 <!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
 634 <para>
 635
 636 </para>
 637
 638 @str:
 639 @len:
 640 @items_read:
 641 @items_written:
 642 @error:
 643 @Returns:
 644
 645
 646 <!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
 647 <para>
 648
 649 </para>
 650
 651 @str:
 652 @len:
 653 @items_written:
 654 @Returns:
 655
 656
 657 <!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
 658 <para>
 659
 660 </para>
 661
 662 @str:
 663 @len:
 664 @items_read:
 665 @items_written:
 666 @error:
 667 @Returns:
 668
 669
 670 <!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
 671 <para>
 672
 673 </para>
 674
 675 @str:
 676 @len:
 677 @items_read:
 678 @items_written:
 679 @error:
 680 @Returns:
 681
 682
 683 <!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
 684 <para>
 685
 686 </para>
 687
 688 @str:
 689 @len:
 690 @items_read:
 691 @items_written:
 692 @error:
 693 @Returns:
 694
 695
 696 <!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
 697 <para>
 698
 699 </para>
 700
 701 @str:
 702 @len:
 703 @items_read:
 704 @items_written:
 705 @error:
 706 @Returns:
 707
 708
 709 <!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
 710 <para>
 711
 712 </para>
 713
 714 @c:
 715 @outbuf:
 716 @Returns:
 717
 718