docs/reference/glib/tmpl/unicode.sgml

   1 <!-- ##### SECTION Title ##### -->
   2 Unicode Manipulation
   3
   4 <!-- ##### SECTION Short_Description ##### -->
   5 functions operating on Unicode characters and UTF-8 strings.
   6
   7 <!-- ##### SECTION Long_Description ##### -->
   8 <para>
   9 This section describes a number of functions for dealing with
  10 Unicode characters and strings.  There are analogues of the
  11 traditional <filename>ctype.h</filename> character classification
  12 and case conversion functions, UTF-8 analogues of some string utility
  13 functions, functions to perform normalization, case conversion and
  14 collation on UTF-8 strings and finally functions to convert between
  15 the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  16 </para>
  17
  18 <!-- ##### SECTION See_Also ##### -->
  19 <para>
  20 <variablelist>
  21
  22 <varlistentry>
  23 <term>g_locale_to_utf8(), g_locale_from_utf8()</term>
  24 <listitem><para>
  25 Convenience functions for converting between UTF-8 and the locale encoding.
  26 </para></listitem>
  27 </varlistentry>
  28
  29 </variablelist>
  30 </para>
  31
  32 <!-- ##### SECTION Stability_Level ##### -->
  33
  34
  35 <!-- ##### TYPEDEF gunichar ##### -->
  36 <para>
  37 A type which can hold any UCS-4 character code.
  38 </para>
  39
  40
  41 <!-- ##### TYPEDEF gunichar2 ##### -->
  42 <para>
  43 A type which can hold any UTF-16 code
  44 point<footnote id="utf16_surrogate_pairs">UTF-16 also has so-called
  45 <firstterm>surrogate pairs</firstterm> to encode characters beyond the
  46 BMP as pairs of 16-bit numbers. Surrogate pairs cannot be stored in a
  47 single gunichar2 field, but all GLib functions accepting gunichar2 arrays
  48 will correctly interpret surrogate pairs.</footnote>.
  49 </para>
  50
  51
  52 <!-- ##### FUNCTION g_unichar_validate ##### -->
  53 <para>
  54
  55 </para>
  56
  57 @ch:
  58 @Returns:
  59
  60
  61 <!-- ##### FUNCTION g_unichar_isalnum ##### -->
  62 <para>
  63
  64 </para>
  65
  66 @c:
  67 @Returns:
  68
  69
  70 <!-- ##### FUNCTION g_unichar_isalpha ##### -->
  71 <para>
  72
  73 </para>
  74
  75 @c:
  76 @Returns:
  77
  78
  79 <!-- ##### FUNCTION g_unichar_iscntrl ##### -->
  80 <para>
  81
  82 </para>
  83
  84 @c:
  85 @Returns:
  86
  87
  88 <!-- ##### FUNCTION g_unichar_isdigit ##### -->
  89 <para>
  90
  91 </para>
  92
  93 @c:
  94 @Returns:
  95
  96
  97 <!-- ##### FUNCTION g_unichar_isgraph ##### -->
  98 <para>
  99
 100 </para>
 101
 102 @c:
 103 @Returns:
 104
 105
 106 <!-- ##### FUNCTION g_unichar_islower ##### -->
 107 <para>
 108
 109 </para>
 110
 111 @c:
 112 @Returns:
 113
 114
 115 <!-- ##### FUNCTION g_unichar_isprint ##### -->
 116 <para>
 117
 118 </para>
 119
 120 @c:
 121 @Returns:
 122
 123
 124 <!-- ##### FUNCTION g_unichar_ispunct ##### -->
 125 <para>
 126
 127 </para>
 128
 129 @c:
 130 @Returns:
 131
 132
 133 <!-- ##### FUNCTION g_unichar_isspace ##### -->
 134 <para>
 135
 136 </para>
 137
 138 @c:
 139 @Returns:
 140
 141
 142 <!-- ##### FUNCTION g_unichar_isupper ##### -->
 143 <para>
 144
 145 </para>
 146
 147 @c:
 148 @Returns:
 149
 150
 151 <!-- ##### FUNCTION g_unichar_isxdigit ##### -->
 152 <para>
 153
 154 </para>
 155
 156 @c:
 157 @Returns:
 158
 159
 160 <!-- ##### FUNCTION g_unichar_istitle ##### -->
 161 <para>
 162
 163 </para>
 164
 165 @c:
 166 @Returns:
 167
 168
 169 <!-- ##### FUNCTION g_unichar_isdefined ##### -->
 170 <para>
 171
 172 </para>
 173
 174 @c:
 175 @Returns:
 176
 177
 178 <!-- ##### FUNCTION g_unichar_iswide ##### -->
 179 <para>
 180
 181 </para>
 182
 183 @c:
 184 @Returns:
 185
 186
 187 <!-- ##### FUNCTION g_unichar_toupper ##### -->
 188 <para>
 189
 190 </para>
 191
 192 @c:
 193 @Returns:
 194
 195
 196 <!-- ##### FUNCTION g_unichar_tolower ##### -->
 197 <para>
 198
 199 </para>
 200
 201 @c:
 202 @Returns:
 203
 204
 205 <!-- ##### FUNCTION g_unichar_totitle ##### -->
 206 <para>
 207
 208 </para>
 209
 210 @c:
 211 @Returns:
 212
 213
 214 <!-- ##### FUNCTION g_unichar_digit_value ##### -->
 215 <para>
 216
 217 </para>
 218
 219 @c:
 220 @Returns:
 221
 222
 223 <!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
 224 <para>
 225
 226 </para>
 227
 228 @c:
 229 @Returns:
 230
 231
 232 <!-- ##### ENUM GUnicodeType ##### -->
 233 <para>
 234 These are the possible character classifications.
 235 See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
 236 >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
 237 </para>
 238
 239 @G_UNICODE_CONTROL:
 240 @G_UNICODE_FORMAT:
 241 @G_UNICODE_UNASSIGNED:
 242 @G_UNICODE_PRIVATE_USE:
 243 @G_UNICODE_SURROGATE:
 244 @G_UNICODE_LOWERCASE_LETTER:
 245 @G_UNICODE_MODIFIER_LETTER:
 246 @G_UNICODE_OTHER_LETTER:
 247 @G_UNICODE_TITLECASE_LETTER:
 248 @G_UNICODE_UPPERCASE_LETTER:
 249 @G_UNICODE_COMBINING_MARK:
 250 @G_UNICODE_ENCLOSING_MARK:
 251 @G_UNICODE_NON_SPACING_MARK:
 252 @G_UNICODE_DECIMAL_NUMBER:
 253 @G_UNICODE_LETTER_NUMBER:
 254 @G_UNICODE_OTHER_NUMBER:
 255 @G_UNICODE_CONNECT_PUNCTUATION:
 256 @G_UNICODE_DASH_PUNCTUATION:
 257 @G_UNICODE_CLOSE_PUNCTUATION:
 258 @G_UNICODE_FINAL_PUNCTUATION:
 259 @G_UNICODE_INITIAL_PUNCTUATION:
 260 @G_UNICODE_OTHER_PUNCTUATION:
 261 @G_UNICODE_OPEN_PUNCTUATION:
 262 @G_UNICODE_CURRENCY_SYMBOL:
 263 @G_UNICODE_MODIFIER_SYMBOL:
 264 @G_UNICODE_MATH_SYMBOL:
 265 @G_UNICODE_OTHER_SYMBOL:
 266 @G_UNICODE_LINE_SEPARATOR:
 267 @G_UNICODE_PARAGRAPH_SEPARATOR:
 268 @G_UNICODE_SPACE_SEPARATOR:
 269
 270 <!-- ##### FUNCTION g_unichar_type ##### -->
 271 <para>
 272
 273 </para>
 274
 275 @c:
 276 @Returns:
 277
 278
 279 <!-- ##### ENUM GUnicodeBreakType ##### -->
 280 <para>
 281 These are the possible line break classifications.
 282 GLib 2.8 supports Unicode 4.0; GLib 2.10 supports Unicode 4.1.
 283 The five Hangul types were added in Unicode 4.1, so they were
 284 introduced in GLib 2.10.  Note that new types may be added in the future.
 285 Applications should be ready to handle unknown values.
 286 They may be regarded as %G_UNICODE_BREAK_UNKNOWN.
 287 See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
 288 >http://www.unicode.org/unicode/reports/tr14/</ulink>.
 289
 290 </para>
 291
 292 @G_UNICODE_BREAK_MANDATORY:
 293 @G_UNICODE_BREAK_CARRIAGE_RETURN:
 294 @G_UNICODE_BREAK_LINE_FEED:
 295 @G_UNICODE_BREAK_COMBINING_MARK:
 296 @G_UNICODE_BREAK_SURROGATE:
 297 @G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
 298 @G_UNICODE_BREAK_INSEPARABLE:
 299 @G_UNICODE_BREAK_NON_BREAKING_GLUE:
 300 @G_UNICODE_BREAK_CONTINGENT:
 301 @G_UNICODE_BREAK_SPACE:
 302 @G_UNICODE_BREAK_AFTER:
 303 @G_UNICODE_BREAK_BEFORE:
 304 @G_UNICODE_BREAK_BEFORE_AND_AFTER:
 305 @G_UNICODE_BREAK_HYPHEN:
 306 @G_UNICODE_BREAK_NON_STARTER:
 307 @G_UNICODE_BREAK_OPEN_PUNCTUATION:
 308 @G_UNICODE_BREAK_CLOSE_PUNCTUATION:
 309 @G_UNICODE_BREAK_QUOTATION:
 310 @G_UNICODE_BREAK_EXCLAMATION:
 311 @G_UNICODE_BREAK_IDEOGRAPHIC:
 312 @G_UNICODE_BREAK_NUMERIC:
 313 @G_UNICODE_BREAK_INFIX_SEPARATOR:
 314 @G_UNICODE_BREAK_SYMBOL:
 315 @G_UNICODE_BREAK_ALPHABETIC:
 316 @G_UNICODE_BREAK_PREFIX:
 317 @G_UNICODE_BREAK_POSTFIX:
 318 @G_UNICODE_BREAK_COMPLEX_CONTEXT:
 319 @G_UNICODE_BREAK_AMBIGUOUS:
 320 @G_UNICODE_BREAK_UNKNOWN:
 321 @G_UNICODE_BREAK_NEXT_LINE:
 322 @G_UNICODE_BREAK_WORD_JOINER:
 323 @G_UNICODE_BREAK_HANGUL_L_JAMO:
 324 @G_UNICODE_BREAK_HANGUL_V_JAMO:
 325 @G_UNICODE_BREAK_HANGUL_T_JAMO:
 326 @G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
 327 @G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
 328
 329
 330 <!-- ##### FUNCTION g_unichar_break_type ##### -->
 331 <para>
 332
 333 </para>
 334
 335 @c:
 336 @Returns:
 337
 338
 339 <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
 340 <para>
 341
 342 </para>
 343
 344 @string:
 345 @len:
 346
 347
 348 <!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
 349 <para>
 350
 351 </para>
 352
 353 @ch:
 354 @result_len:
 355 @Returns:
 356
 357
 358 <!-- ##### FUNCTION g_unichar_get_mirror_char ##### -->
 359 <para>
 360
 361 </para>
 362
 363 @ch:
 364 @mirrored_ch:
 365 @Returns:
 366
 367
 368 <!-- ##### MACRO g_utf8_next_char ##### -->
 369 <para>
 370 Skips to the next character in a UTF-8 string. The string must be
 371 valid; this macro is as fast as possible, and has no error-checking.
 372 You would use this macro to iterate over a string character by
 373 character. The macro returns the start of the next UTF-8 character.
 374 Before using this macro, use g_utf8_validate() to validate strings
 375 that may contain invalid UTF-8.
 376 </para>
 377
 378 @p: Pointer to the start of a valid UTF-8 character.
 379
 380
 381 <!-- ##### FUNCTION g_utf8_get_char ##### -->
 382 <para>
 383
 384 </para>
 385
 386 @p:
 387 @Returns:
 388
 389
 390 <!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
 391 <para>
 392
 393 </para>
 394
 395 @p:
 396 @max_len:
 397 @Returns:
 398
 399
 400 <!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
 401 <para>
 402
 403 </para>
 404
 405 @str:
 406 @offset:
 407 @Returns:
 408
 409
 410 <!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
 411 <para>
 412
 413 </para>
 414
 415 @str:
 416 @pos:
 417 @Returns:
 418
 419
 420 <!-- ##### FUNCTION g_utf8_prev_char ##### -->
 421 <para>
 422
 423 </para>
 424
 425 @p:
 426 @Returns:
 427
 428
 429 <!-- ##### FUNCTION g_utf8_find_next_char ##### -->
 430 <para>
 431
 432 </para>
 433
 434 @p:
 435 @end:
 436 @Returns:
 437 <!-- # Unused Parameters # -->
 438 @bound:
 439
 440
 441 <!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
 442 <para>
 443
 444 </para>
 445
 446 @str:
 447 @p:
 448 @Returns:
 449
 450
 451 <!-- ##### FUNCTION g_utf8_strlen ##### -->
 452 <para>
 453
 454 </para>
 455
 456 @p:
 457 @max:
 458 @Returns:
 459
 460
 461 <!-- ##### FUNCTION g_utf8_strncpy ##### -->
 462 <para>
 463
 464 </para>
 465
 466 @dest:
 467 @src:
 468 @n:
 469 @Returns:
 470
 471
 472 <!-- ##### FUNCTION g_utf8_strchr ##### -->
 473 <para>
 474
 475 </para>
 476
 477 @p:
 478 @len:
 479 @c:
 480 @Returns:
 481 <!-- # Unused Parameters # -->
 482 @ch:
 483
 484
 485 <!-- ##### FUNCTION g_utf8_strrchr ##### -->
 486 <para>
 487
 488 </para>
 489
 490 @p:
 491 @len:
 492 @c:
 493 @Returns:
 494 <!-- # Unused Parameters # -->
 495 @ch:
 496
 497
 498 <!-- ##### FUNCTION g_utf8_strreverse ##### -->
 499 <para>
 500
 501 </para>
 502
 503 @str:
 504 @len:
 505 @Returns:
 506
 507
 508 <!-- ##### FUNCTION g_utf8_validate ##### -->
 509 <para>
 510
 511 </para>
 512
 513 @str:
 514 @max_len:
 515 @end:
 516 @Returns:
 517
 518
 519 <!-- ##### FUNCTION g_utf8_strup ##### -->
 520 <para>
 521
 522 </para>
 523
 524 @str:
 525 @len:
 526 @Returns:
 527
 528
 529 <!-- ##### FUNCTION g_utf8_strdown ##### -->
 530 <para>
 531
 532 </para>
 533
 534 @str:
 535 @len:
 536 @Returns:
 537
 538
 539 <!-- ##### FUNCTION g_utf8_casefold ##### -->
 540 <para>
 541
 542 </para>
 543
 544 @str:
 545 @len:
 546 @Returns:
 547
 548
 549 <!-- ##### FUNCTION g_utf8_normalize ##### -->
 550 <para>
 551
 552 </para>
 553
 554 @str:
 555 @len:
 556 @mode:
 557 @Returns:
 558
 559
 560 <!-- ##### ENUM GNormalizeMode ##### -->
 561 <para>
 562 Defines how a Unicode string is transformed into a canonical
 563 form, standardizing such issues as whether a character with an accent is
 564 represented as a base character and combining accent or as a single precomposed
 565 character. Unicode strings should generally be normalized before comparing them.
 566 </para>
 567
 568 @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
 569   text content, such as the above-mentioned accent representation.
 570 @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
 571 @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
 572   forms rather than a maximally decomposed form.
 573 @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
 574 @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT also standardize the
 575   "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to the
 576   standard forms (in this case DIGIT THREE). Formatting information may be
 577   lost, but for most text operations such characters should be considered the
 578   same.
 579 @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
 580 @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
 581   forms rather than a maximally decomposed form.
 582 @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
 583
 584 <!-- ##### FUNCTION g_utf8_collate ##### -->
 585 <para>
 586
 587 </para>
 588
 589 @str1:
 590 @str2:
 591 @Returns:
 592
 593
 594 <!-- ##### FUNCTION g_utf8_collate_key ##### -->
 595 <para>
 596
 597 </para>
 598
 599 @str:
 600 @len:
 601 @Returns:
 602
 603
 604 <!-- ##### FUNCTION g_utf8_collate_key_for_filename ##### -->
 605 <para>
 606
 607 </para>
 608
 609 @str:
 610 @len:
 611 @Returns:
 612
 613
 614 <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
 615 <para>
 616
 617 </para>
 618
 619 @str:
 620 @len:
 621 @items_read:
 622 @items_written:
 623 @error:
 624 @Returns:
 625
 626
 627 <!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
 628 <para>
 629
 630 </para>
 631
 632 @str:
 633 @len:
 634 @items_read:
 635 @items_written:
 636 @error:
 637 @Returns:
 638
 639
 640 <!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
 641 <para>
 642
 643 </para>
 644
 645 @str:
 646 @len:
 647 @items_written:
 648 @Returns:
 649
 650
 651 <!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
 652 <para>
 653
 654 </para>
 655
 656 @str:
 657 @len:
 658 @items_read:
 659 @items_written:
 660 @error:
 661 @Returns:
 662
 663
 664 <!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
 665 <para>
 666
 667 </para>
 668
 669 @str:
 670 @len:
 671 @items_read:
 672 @items_written:
 673 @error:
 674 @Returns:
 675
 676
 677 <!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
 678 <para>
 679
 680 </para>
 681
 682 @str:
 683 @len:
 684 @items_read:
 685 @items_written:
 686 @error:
 687 @Returns:
 688
 689
 690 <!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
 691 <para>
 692
 693 </para>
 694
 695 @str:
 696 @len:
 697 @items_read:
 698 @items_written:
 699 @error:
 700 @Returns:
 701
 702
 703 <!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
 704 <para>
 705
 706 </para>
 707
 708 @c:
 709 @outbuf:
 710 @Returns:
 711
 712