docs/reference/glib/tmpl/unicode.sgml

   1 <!-- ##### SECTION Title ##### -->
   2 Unicode Manipulation
   3
   4 <!-- ##### SECTION Short_Description ##### -->
   5 functions operating on Unicode characters and UTF-8 strings.
   6
   7 <!-- ##### SECTION Long_Description ##### -->
   8 <para>
   9 This section describes a number of functions for dealing with
  10 Unicode characters and strings.  There are analogues of the
  11 traditional <filename>ctype.h</filename> character classification
  12 and case conversion functions, UTF-8 analogues of some string utility
  13 functions, functions to perform normalization, case conversion and
  14 collation on UTF-8 strings and finally functions to convert between
  15 the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  16 </para>
  17
  18 <para>
  19 The implementations of the Unicode functions in GLib are based
  20 on the Unicode Character Data tables, which are available from
  21 <ulink url="http://www.unicode.org">www.unicode.org</ulink>.
  22 GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1,
  23 and GLib 2.12 supports Unicode 5.0.
  24 </para>
  25
  26 <!-- ##### SECTION See_Also ##### -->
  27 <para>
  28 <variablelist>
  29
  30 <varlistentry>
  31 <term>g_locale_to_utf8(), g_locale_from_utf8()</term>
  32 <listitem><para>
  33 Convenience functions for converting between UTF-8 and the locale encoding.
  34 </para></listitem>
  35 </varlistentry>
  36
  37 </variablelist>
  38 </para>
  39
  40 <!-- ##### SECTION Stability_Level ##### -->
  41
  42
  43 <!-- ##### TYPEDEF gunichar ##### -->
  44 <para>
  45 A type which can hold any UCS-4 character code.
  46 </para>
  47
  48
  49 <!-- ##### TYPEDEF gunichar2 ##### -->
  50 <para>
  51 A type which can hold any UTF-16 code
  52 point<footnote id="utf16_surrogate_pairs">UTF-16 also has so-called
  53 <firstterm>surrogate pairs</firstterm> to encode characters beyond the
  54 BMP as pairs of 16-bit numbers. Surrogate pairs cannot be stored in a
  55 single gunichar2 field, but all GLib functions accepting gunichar2 arrays
  56 will correctly interpret surrogate pairs.</footnote>.
  57 </para>
  58
  59
  60 <!-- ##### FUNCTION g_unichar_validate ##### -->
  61 <para>
  62
  63 </para>
  64
  65 @ch:
  66 @Returns:
  67
  68
  69 <!-- ##### FUNCTION g_unichar_isalnum ##### -->
  70 <para>
  71
  72 </para>
  73
  74 @c:
  75 @Returns:
  76
  77
  78 <!-- ##### FUNCTION g_unichar_isalpha ##### -->
  79 <para>
  80
  81 </para>
  82
  83 @c:
  84 @Returns:
  85
  86
  87 <!-- ##### FUNCTION g_unichar_iscntrl ##### -->
  88 <para>
  89
  90 </para>
  91
  92 @c:
  93 @Returns:
  94
  95
  96 <!-- ##### FUNCTION g_unichar_isdigit ##### -->
  97 <para>
  98
  99 </para>
 100
 101 @c:
 102 @Returns:
 103
 104
 105 <!-- ##### FUNCTION g_unichar_isgraph ##### -->
 106 <para>
 107
 108 </para>
 109
 110 @c:
 111 @Returns:
 112
 113
 114 <!-- ##### FUNCTION g_unichar_islower ##### -->
 115 <para>
 116
 117 </para>
 118
 119 @c:
 120 @Returns:
 121
 122
 123 <!-- ##### FUNCTION g_unichar_isprint ##### -->
 124 <para>
 125
 126 </para>
 127
 128 @c:
 129 @Returns:
 130
 131
 132 <!-- ##### FUNCTION g_unichar_ispunct ##### -->
 133 <para>
 134
 135 </para>
 136
 137 @c:
 138 @Returns:
 139
 140
 141 <!-- ##### FUNCTION g_unichar_isspace ##### -->
 142 <para>
 143
 144 </para>
 145
 146 @c:
 147 @Returns:
 148
 149
 150 <!-- ##### FUNCTION g_unichar_isupper ##### -->
 151 <para>
 152
 153 </para>
 154
 155 @c:
 156 @Returns:
 157
 158
 159 <!-- ##### FUNCTION g_unichar_isxdigit ##### -->
 160 <para>
 161
 162 </para>
 163
 164 @c:
 165 @Returns:
 166
 167
 168 <!-- ##### FUNCTION g_unichar_istitle ##### -->
 169 <para>
 170
 171 </para>
 172
 173 @c:
 174 @Returns:
 175
 176
 177 <!-- ##### FUNCTION g_unichar_isdefined ##### -->
 178 <para>
 179
 180 </para>
 181
 182 @c:
 183 @Returns:
 184
 185
 186 <!-- ##### FUNCTION g_unichar_iswide ##### -->
 187 <para>
 188
 189 </para>
 190
 191 @c:
 192 @Returns:
 193
 194
 195 <!-- ##### FUNCTION g_unichar_iswide_cjk ##### -->
 196 <para>
 197
 198 </para>
 199
 200 @c:
 201 @Returns:
 202
 203
 204 <!-- ##### FUNCTION g_unichar_toupper ##### -->
 205 <para>
 206
 207 </para>
 208
 209 @c:
 210 @Returns:
 211
 212
 213 <!-- ##### FUNCTION g_unichar_tolower ##### -->
 214 <para>
 215
 216 </para>
 217
 218 @c:
 219 @Returns:
 220
 221
 222 <!-- ##### FUNCTION g_unichar_totitle ##### -->
 223 <para>
 224
 225 </para>
 226
 227 @c:
 228 @Returns:
 229
 230
 231 <!-- ##### FUNCTION g_unichar_digit_value ##### -->
 232 <para>
 233
 234 </para>
 235
 236 @c:
 237 @Returns:
 238
 239
 240 <!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
 241 <para>
 242
 243 </para>
 244
 245 @c:
 246 @Returns:
 247
 248
 249 <!-- ##### ENUM GUnicodeType ##### -->
 250 <para>
 251 These are the possible character classifications.
 252 See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
 253 >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
 254 </para>
 255
 256 @G_UNICODE_CONTROL:
 257 @G_UNICODE_FORMAT:
 258 @G_UNICODE_UNASSIGNED:
 259 @G_UNICODE_PRIVATE_USE:
 260 @G_UNICODE_SURROGATE:
 261 @G_UNICODE_LOWERCASE_LETTER:
 262 @G_UNICODE_MODIFIER_LETTER:
 263 @G_UNICODE_OTHER_LETTER:
 264 @G_UNICODE_TITLECASE_LETTER:
 265 @G_UNICODE_UPPERCASE_LETTER:
 266 @G_UNICODE_COMBINING_MARK:
 267 @G_UNICODE_ENCLOSING_MARK:
 268 @G_UNICODE_NON_SPACING_MARK:
 269 @G_UNICODE_DECIMAL_NUMBER:
 270 @G_UNICODE_LETTER_NUMBER:
 271 @G_UNICODE_OTHER_NUMBER:
 272 @G_UNICODE_CONNECT_PUNCTUATION:
 273 @G_UNICODE_DASH_PUNCTUATION:
 274 @G_UNICODE_CLOSE_PUNCTUATION:
 275 @G_UNICODE_FINAL_PUNCTUATION:
 276 @G_UNICODE_INITIAL_PUNCTUATION:
 277 @G_UNICODE_OTHER_PUNCTUATION:
 278 @G_UNICODE_OPEN_PUNCTUATION:
 279 @G_UNICODE_CURRENCY_SYMBOL:
 280 @G_UNICODE_MODIFIER_SYMBOL:
 281 @G_UNICODE_MATH_SYMBOL:
 282 @G_UNICODE_OTHER_SYMBOL:
 283 @G_UNICODE_LINE_SEPARATOR:
 284 @G_UNICODE_PARAGRAPH_SEPARATOR:
 285 @G_UNICODE_SPACE_SEPARATOR:
 286
 287 <!-- ##### FUNCTION g_unichar_type ##### -->
 288 <para>
 289
 290 </para>
 291
 292 @c:
 293 @Returns:
 294
 295
 296 <!-- ##### ENUM GUnicodeBreakType ##### -->
 297 <para>
 298 These are the possible line break classifications.
 299 The five Hangul types were added in Unicode 4.1, so they were
 300 introduced in GLib 2.10.  Note that new types may be added in the future.
 301 Applications should be ready to handle unknown values.
 302 They may be regarded as %G_UNICODE_BREAK_UNKNOWN.
 303 See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
 304 >http://www.unicode.org/unicode/reports/tr14/</ulink>.
 305
 306 </para>
 307
 308 @G_UNICODE_BREAK_MANDATORY:
 309 @G_UNICODE_BREAK_CARRIAGE_RETURN:
 310 @G_UNICODE_BREAK_LINE_FEED:
 311 @G_UNICODE_BREAK_COMBINING_MARK:
 312 @G_UNICODE_BREAK_SURROGATE:
 313 @G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
 314 @G_UNICODE_BREAK_INSEPARABLE:
 315 @G_UNICODE_BREAK_NON_BREAKING_GLUE:
 316 @G_UNICODE_BREAK_CONTINGENT:
 317 @G_UNICODE_BREAK_SPACE:
 318 @G_UNICODE_BREAK_AFTER:
 319 @G_UNICODE_BREAK_BEFORE:
 320 @G_UNICODE_BREAK_BEFORE_AND_AFTER:
 321 @G_UNICODE_BREAK_HYPHEN:
 322 @G_UNICODE_BREAK_NON_STARTER:
 323 @G_UNICODE_BREAK_OPEN_PUNCTUATION:
 324 @G_UNICODE_BREAK_CLOSE_PUNCTUATION:
 325 @G_UNICODE_BREAK_QUOTATION:
 326 @G_UNICODE_BREAK_EXCLAMATION:
 327 @G_UNICODE_BREAK_IDEOGRAPHIC:
 328 @G_UNICODE_BREAK_NUMERIC:
 329 @G_UNICODE_BREAK_INFIX_SEPARATOR:
 330 @G_UNICODE_BREAK_SYMBOL:
 331 @G_UNICODE_BREAK_ALPHABETIC:
 332 @G_UNICODE_BREAK_PREFIX:
 333 @G_UNICODE_BREAK_POSTFIX:
 334 @G_UNICODE_BREAK_COMPLEX_CONTEXT:
 335 @G_UNICODE_BREAK_AMBIGUOUS:
 336 @G_UNICODE_BREAK_UNKNOWN:
 337 @G_UNICODE_BREAK_NEXT_LINE:
 338 @G_UNICODE_BREAK_WORD_JOINER:
 339 @G_UNICODE_BREAK_HANGUL_L_JAMO:
 340 @G_UNICODE_BREAK_HANGUL_V_JAMO:
 341 @G_UNICODE_BREAK_HANGUL_T_JAMO:
 342 @G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
 343 @G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
 344
 345 <!-- ##### FUNCTION g_unichar_break_type ##### -->
 346 <para>
 347
 348 </para>
 349
 350 @c:
 351 @Returns:
 352
 353
 354 <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
 355 <para>
 356
 357 </para>
 358
 359 @string:
 360 @len:
 361
 362
 363 <!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
 364 <para>
 365
 366 </para>
 367
 368 @ch:
 369 @result_len:
 370 @Returns:
 371
 372
 373 <!-- ##### FUNCTION g_unichar_get_mirror_char ##### -->
 374 <para>
 375
 376 </para>
 377
 378 @ch:
 379 @mirrored_ch:
 380 @Returns:
 381
 382
 383 <!-- ##### MACRO g_utf8_next_char ##### -->
 384 <para>
 385 Skips to the next character in a UTF-8 string. The string must be
 386 valid; this macro is as fast as possible, and has no error-checking.
 387 You would use this macro to iterate over a string character by
 388 character. The macro returns the start of the next UTF-8 character.
 389 Before using this macro, use g_utf8_validate() to validate strings
 390 that may contain invalid UTF-8.
 391 </para>
 392
 393 @p: Pointer to the start of a valid UTF-8 character.
 394
 395
 396 <!-- ##### FUNCTION g_utf8_get_char ##### -->
 397 <para>
 398
 399 </para>
 400
 401 @p:
 402 @Returns:
 403
 404
 405 <!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
 406 <para>
 407
 408 </para>
 409
 410 @p:
 411 @max_len:
 412 @Returns:
 413
 414
 415 <!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
 416 <para>
 417
 418 </para>
 419
 420 @str:
 421 @offset:
 422 @Returns:
 423
 424
 425 <!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
 426 <para>
 427
 428 </para>
 429
 430 @str:
 431 @pos:
 432 @Returns:
 433
 434
 435 <!-- ##### FUNCTION g_utf8_prev_char ##### -->
 436 <para>
 437
 438 </para>
 439
 440 @p:
 441 @Returns:
 442
 443
 444 <!-- ##### FUNCTION g_utf8_find_next_char ##### -->
 445 <para>
 446
 447 </para>
 448
 449 @p:
 450 @end:
 451 @Returns:
 452
 453
 454 <!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
 455 <para>
 456
 457 </para>
 458
 459 @str:
 460 @p:
 461 @Returns:
 462
 463
 464 <!-- ##### FUNCTION g_utf8_strlen ##### -->
 465 <para>
 466
 467 </para>
 468
 469 @p:
 470 @max:
 471 @Returns:
 472
 473
 474 <!-- ##### FUNCTION g_utf8_strncpy ##### -->
 475 <para>
 476
 477 </para>
 478
 479 @dest:
 480 @src:
 481 @n:
 482 @Returns:
 483
 484
 485 <!-- ##### FUNCTION g_utf8_strchr ##### -->
 486 <para>
 487
 488 </para>
 489
 490 @p:
 491 @len:
 492 @c:
 493 @Returns:
 494
 495
 496 <!-- ##### FUNCTION g_utf8_strrchr ##### -->
 497 <para>
 498
 499 </para>
 500
 501 @p:
 502 @len:
 503 @c:
 504 @Returns:
 505
 506
 507 <!-- ##### FUNCTION g_utf8_strreverse ##### -->
 508 <para>
 509
 510 </para>
 511
 512 @str:
 513 @len:
 514 @Returns:
 515
 516
 517 <!-- ##### FUNCTION g_utf8_validate ##### -->
 518 <para>
 519
 520 </para>
 521
 522 @str:
 523 @max_len:
 524 @end:
 525 @Returns:
 526
 527
 528 <!-- ##### FUNCTION g_utf8_strup ##### -->
 529 <para>
 530
 531 </para>
 532
 533 @str:
 534 @len:
 535 @Returns:
 536
 537
 538 <!-- ##### FUNCTION g_utf8_strdown ##### -->
 539 <para>
 540
 541 </para>
 542
 543 @str:
 544 @len:
 545 @Returns:
 546
 547
 548 <!-- ##### FUNCTION g_utf8_casefold ##### -->
 549 <para>
 550
 551 </para>
 552
 553 @str:
 554 @len:
 555 @Returns:
 556
 557
 558 <!-- ##### FUNCTION g_utf8_normalize ##### -->
 559 <para>
 560
 561 </para>
 562
 563 @str:
 564 @len:
 565 @mode:
 566 @Returns:
 567
 568
 569 <!-- ##### ENUM GNormalizeMode ##### -->
 570 <para>
 571 Defines how a Unicode string is transformed into a canonical
 572 form, standardizing such issues as whether a character with an accent is
 573 represented as a base character and combining accent or as a single precomposed
 574 character. Unicode strings should generally be normalized before comparing them.
 575 </para>
 576
 577 @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
 578   text content, such as the above-mentioned accent representation.
 579 @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
 580 @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
 581   forms rather than a maximally decomposed form.
 582 @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
 583 @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT, also standardize the
 584   "compatibility" characters in Unicode, such as SUPERSCRIPT THREE, to the
 585   standard forms (in this case DIGIT THREE). Formatting information may be
 586   lost, but for most text operations such characters should be considered the
 587   same.
 588 @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
 589 @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
 590   forms rather than a maximally decomposed form.
 591 @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
 592
 593 <!-- ##### FUNCTION g_utf8_collate ##### -->
 594 <para>
 595
 596 </para>
 597
 598 @str1:
 599 @str2:
 600 @Returns:
 601
 602
 603 <!-- ##### FUNCTION g_utf8_collate_key ##### -->
 604 <para>
 605
 606 </para>
 607
 608 @str:
 609 @len:
 610 @Returns:
 611
 612
 613 <!-- ##### FUNCTION g_utf8_collate_key_for_filename ##### -->
 614 <para>
 615
 616 </para>
 617
 618 @str:
 619 @len:
 620 @Returns:
 621
 622
 623 <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
 624 <para>
 625
 626 </para>
 627
 628 @str:
 629 @len:
 630 @items_read:
 631 @items_written:
 632 @error:
 633 @Returns:
 634
 635
 636 <!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
 637 <para>
 638
 639 </para>
 640
 641 @str:
 642 @len:
 643 @items_read:
 644 @items_written:
 645 @error:
 646 @Returns:
 647
 648
 649 <!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
 650 <para>
 651
 652 </para>
 653
 654 @str:
 655 @len:
 656 @items_written:
 657 @Returns:
 658
 659
 660 <!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
 661 <para>
 662
 663 </para>
 664
 665 @str:
 666 @len:
 667 @items_read:
 668 @items_written:
 669 @error:
 670 @Returns:
 671
 672
 673 <!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
 674 <para>
 675
 676 </para>
 677
 678 @str:
 679 @len:
 680 @items_read:
 681 @items_written:
 682 @error:
 683 @Returns:
 684
 685
 686 <!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
 687 <para>
 688
 689 </para>
 690
 691 @str:
 692 @len:
 693 @items_read:
 694 @items_written:
 695 @error:
 696 @Returns:
 697
 698
 699 <!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
 700 <para>
 701
 702 </para>
 703
 704 @str:
 705 @len:
 706 @items_read:
 707 @items_written:
 708 @error:
 709 @Returns:
 710
 711
 712 <!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
 713 <para>
 714
 715 </para>
 716
 717 @c:
 718 @outbuf:
 719 @Returns:
 720
 721