docs/reference/glib/tmpl/unicode.sgml

   1 <!-- ##### SECTION Title ##### -->
   2 Unicode Manipulation
   3
   4 <!-- ##### SECTION Short_Description ##### -->
   5 functions operating on Unicode characters and UTF-8 strings.
   6
   7 <!-- ##### SECTION Long_Description ##### -->
   8 <para>
   9 This section describes a number of functions for dealing with
  10 Unicode characters and strings.  There are analogues of the
  11 traditional <filename>ctype.h</filename> character classification
  12 and case conversion functions, UTF-8 analogues of some string utility
  13 functions, functions to perform normalization, case conversion and
  14 collation on UTF-8 strings and finally functions to convert between
  15 the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  16 </para>
  17
  18 <!-- ##### SECTION See_Also ##### -->
  19 <para>
  20 <variablelist>
  21
  22 <varlistentry>
  23 <term>g_locale_to_utf8(), g_locale_from_utf8()</term>
  24 <listitem><para>
  25 Convenience functions for converting between UTF-8 and the locale encoding.
  26 </para></listitem>
  27 </varlistentry>
  28
  29 </variablelist>
  30 </para>
  31
  32 <!-- ##### SECTION Stability_Level ##### -->
  33
  34
  35 <!-- ##### TYPEDEF gunichar ##### -->
  36 <para>
  37 A type which can hold any UCS-4 character code.
  38 </para>
  39
  40
  41 <!-- ##### TYPEDEF gunichar2 ##### -->
  42 <para>
  43 A type which can hold any UTF-16 code
  44 point<footnote id="utf16_surrogate_pairs"><para>UTF-16 also has so-called
  45 <firstterm>surrogate pairs</firstterm> to encode characters beyond the
  46 BMP as pairs of 16-bit numbers. Surrogate pairs cannot be stored in a
  47 single gunichar2 field, but all GLib functions accepting gunichar2 arrays
  48 will correctly interpret surrogate pairs.</para></footnote>.
  49 </para>
  50
  51
  52 <!-- ##### FUNCTION g_unichar_validate ##### -->
  53 <para>
  54
  55 </para>
  56
  57 @ch:
  58 @Returns:
  59
  60
  61 <!-- ##### FUNCTION g_unichar_isalnum ##### -->
  62 <para>
  63
  64 </para>
  65
  66 @c:
  67 @Returns:
  68
  69
  70 <!-- ##### FUNCTION g_unichar_isalpha ##### -->
  71 <para>
  72
  73 </para>
  74
  75 @c:
  76 @Returns:
  77
  78
  79 <!-- ##### FUNCTION g_unichar_iscntrl ##### -->
  80 <para>
  81
  82 </para>
  83
  84 @c:
  85 @Returns:
  86
  87
  88 <!-- ##### FUNCTION g_unichar_isdigit ##### -->
  89 <para>
  90
  91 </para>
  92
  93 @c:
  94 @Returns:
  95
  96
  97 <!-- ##### FUNCTION g_unichar_isgraph ##### -->
  98 <para>
  99
 100 </para>
 101
 102 @c:
 103 @Returns:
 104
 105
 106 <!-- ##### FUNCTION g_unichar_islower ##### -->
 107 <para>
 108
 109 </para>
 110
 111 @c:
 112 @Returns:
 113
 114
 115 <!-- ##### FUNCTION g_unichar_isprint ##### -->
 116 <para>
 117
 118 </para>
 119
 120 @c:
 121 @Returns:
 122
 123
 124 <!-- ##### FUNCTION g_unichar_ispunct ##### -->
 125 <para>
 126
 127 </para>
 128
 129 @c:
 130 @Returns:
 131
 132
 133 <!-- ##### FUNCTION g_unichar_isspace ##### -->
 134 <para>
 135
 136 </para>
 137
 138 @c:
 139 @Returns:
 140
 141
 142 <!-- ##### FUNCTION g_unichar_isupper ##### -->
 143 <para>
 144
 145 </para>
 146
 147 @c:
 148 @Returns:
 149
 150
 151 <!-- ##### FUNCTION g_unichar_isxdigit ##### -->
 152 <para>
 153
 154 </para>
 155
 156 @c:
 157 @Returns:
 158
 159
 160 <!-- ##### FUNCTION g_unichar_istitle ##### -->
 161 <para>
 162
 163 </para>
 164
 165 @c:
 166 @Returns:
 167
 168
 169 <!-- ##### FUNCTION g_unichar_isdefined ##### -->
 170 <para>
 171
 172 </para>
 173
 174 @c:
 175 @Returns:
 176
 177
 178 <!-- ##### FUNCTION g_unichar_iswide ##### -->
 179 <para>
 180
 181 </para>
 182
 183 @c:
 184 @Returns:
 185
 186
 187 <!-- ##### FUNCTION g_unichar_toupper ##### -->
 188 <para>
 189
 190 </para>
 191
 192 @c:
 193 @Returns:
 194
 195
 196 <!-- ##### FUNCTION g_unichar_tolower ##### -->
 197 <para>
 198
 199 </para>
 200
 201 @c:
 202 @Returns:
 203
 204
 205 <!-- ##### FUNCTION g_unichar_totitle ##### -->
 206 <para>
 207
 208 </para>
 209
 210 @c:
 211 @Returns:
 212
 213
 214 <!-- ##### FUNCTION g_unichar_digit_value ##### -->
 215 <para>
 216
 217 </para>
 218
 219 @c:
 220 @Returns:
 221
 222
 223 <!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
 224 <para>
 225
 226 </para>
 227
 228 @c:
 229 @Returns:
 230
 231
 232 <!-- ##### ENUM GUnicodeType ##### -->
 233 <para>
 234 These are the possible character classifications.
 235 See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
 236 >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
 237 </para>
 238
 239 @G_UNICODE_CONTROL:
 240 @G_UNICODE_FORMAT:
 241 @G_UNICODE_UNASSIGNED:
 242 @G_UNICODE_PRIVATE_USE:
 243 @G_UNICODE_SURROGATE:
 244 @G_UNICODE_LOWERCASE_LETTER:
 245 @G_UNICODE_MODIFIER_LETTER:
 246 @G_UNICODE_OTHER_LETTER:
 247 @G_UNICODE_TITLECASE_LETTER:
 248 @G_UNICODE_UPPERCASE_LETTER:
 249 @G_UNICODE_COMBINING_MARK:
 250 @G_UNICODE_ENCLOSING_MARK:
 251 @G_UNICODE_NON_SPACING_MARK:
 252 @G_UNICODE_DECIMAL_NUMBER:
 253 @G_UNICODE_LETTER_NUMBER:
 254 @G_UNICODE_OTHER_NUMBER:
 255 @G_UNICODE_CONNECT_PUNCTUATION:
 256 @G_UNICODE_DASH_PUNCTUATION:
 257 @G_UNICODE_CLOSE_PUNCTUATION:
 258 @G_UNICODE_FINAL_PUNCTUATION:
 259 @G_UNICODE_INITIAL_PUNCTUATION:
 260 @G_UNICODE_OTHER_PUNCTUATION:
 261 @G_UNICODE_OPEN_PUNCTUATION:
 262 @G_UNICODE_CURRENCY_SYMBOL:
 263 @G_UNICODE_MODIFIER_SYMBOL:
 264 @G_UNICODE_MATH_SYMBOL:
 265 @G_UNICODE_OTHER_SYMBOL:
 266 @G_UNICODE_LINE_SEPARATOR:
 267 @G_UNICODE_PARAGRAPH_SEPARATOR:
 268 @G_UNICODE_SPACE_SEPARATOR:
 269
 270 <!-- ##### FUNCTION g_unichar_type ##### -->
 271 <para>
 272
 273 </para>
 274
 275 @c:
 276 @Returns:
 277
 278
 279 <!-- ##### ENUM GUnicodeBreakType ##### -->
 280 <para>
 281 These are the possible line break classifications.
 282 GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1.
 283 The five Hangul types were added in Unicode 4.1, so they were
 284 introduced in GLib 2.10.  Note that new types may be added in the future.
 285 Applications should be ready to handle unknown values.
 286 They may be regarded as %G_UNICODE_BREAK_UNKNOWN.
 287 See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
 288 >http://www.unicode.org/unicode/reports/tr14/</ulink>.
 289
 290 </para>
 291
 292 @G_UNICODE_BREAK_MANDATORY:
 293 @G_UNICODE_BREAK_CARRIAGE_RETURN:
 294 @G_UNICODE_BREAK_LINE_FEED:
 295 @G_UNICODE_BREAK_COMBINING_MARK:
 296 @G_UNICODE_BREAK_SURROGATE:
 297 @G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
 298 @G_UNICODE_BREAK_INSEPARABLE:
 299 @G_UNICODE_BREAK_NON_BREAKING_GLUE:
 300 @G_UNICODE_BREAK_CONTINGENT:
 301 @G_UNICODE_BREAK_SPACE:
 302 @G_UNICODE_BREAK_AFTER:
 303 @G_UNICODE_BREAK_BEFORE:
 304 @G_UNICODE_BREAK_BEFORE_AND_AFTER:
 305 @G_UNICODE_BREAK_HYPHEN:
 306 @G_UNICODE_BREAK_NON_STARTER:
 307 @G_UNICODE_BREAK_OPEN_PUNCTUATION:
 308 @G_UNICODE_BREAK_CLOSE_PUNCTUATION:
 309 @G_UNICODE_BREAK_QUOTATION:
 310 @G_UNICODE_BREAK_EXCLAMATION:
 311 @G_UNICODE_BREAK_IDEOGRAPHIC:
 312 @G_UNICODE_BREAK_NUMERIC:
 313 @G_UNICODE_BREAK_INFIX_SEPARATOR:
 314 @G_UNICODE_BREAK_SYMBOL:
 315 @G_UNICODE_BREAK_ALPHABETIC:
 316 @G_UNICODE_BREAK_PREFIX:
 317 @G_UNICODE_BREAK_POSTFIX:
 318 @G_UNICODE_BREAK_COMPLEX_CONTEXT:
 319 @G_UNICODE_BREAK_AMBIGUOUS:
 320 @G_UNICODE_BREAK_UNKNOWN:
 321 @G_UNICODE_BREAK_NEXT_LINE:
 322 @G_UNICODE_BREAK_WORD_JOINER:
 323 @G_UNICODE_BREAK_HANGUL_L_JAMO:
 324 @G_UNICODE_BREAK_HANGUL_V_JAMO:
 325 @G_UNICODE_BREAK_HANGUL_T_JAMO:
 326 @G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
 327 @G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
 328
 329 <!-- ##### FUNCTION g_unichar_break_type ##### -->
 330 <para>
 331
 332 </para>
 333
 334 @c:
 335 @Returns:
 336
 337
 338 <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
 339 <para>
 340
 341 </para>
 342
 343 @string:
 344 @len:
 345
 346
 347 <!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
 348 <para>
 349
 350 </para>
 351
 352 @ch:
 353 @result_len:
 354 @Returns:
 355
 356
 357 <!-- ##### FUNCTION g_unichar_get_mirror_char ##### -->
 358 <para>
 359
 360 </para>
 361
 362 @ch:
 363 @mirrored_ch:
 364 @Returns:
 365
 366
 367 <!-- ##### MACRO g_utf8_next_char ##### -->
 368 <para>
 369 Skips to the next character in a UTF-8 string. The string must be
 370 valid; this macro is as fast as possible, and has no error-checking.
 371 You would use this macro to iterate over a string character by
 372 character. The macro returns the start of the next UTF-8 character.
 373 Before using this macro, use g_utf8_validate() to validate strings
 374 that may contain invalid UTF-8.
 375 </para>
 376
 377 @p: Pointer to the start of a valid UTF-8 character.
 378
 379
 380 <!-- ##### FUNCTION g_utf8_get_char ##### -->
 381 <para>
 382
 383 </para>
 384
 385 @p:
 386 @Returns:
 387
 388
 389 <!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
 390 <para>
 391
 392 </para>
 393
 394 @p:
 395 @max_len:
 396 @Returns:
 397
 398
 399 <!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
 400 <para>
 401
 402 </para>
 403
 404 @str:
 405 @offset:
 406 @Returns:
 407
 408
 409 <!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
 410 <para>
 411
 412 </para>
 413
 414 @str:
 415 @pos:
 416 @Returns:
 417
 418
 419 <!-- ##### FUNCTION g_utf8_prev_char ##### -->
 420 <para>
 421
 422 </para>
 423
 424 @p:
 425 @Returns:
 426
 427
 428 <!-- ##### FUNCTION g_utf8_find_next_char ##### -->
 429 <para>
 430
 431 </para>
 432
 433 @p:
 434 @end:
 435 @Returns:
 436 <!-- # Unused Parameters # -->
 437 @bound:
 438
 439
 440 <!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
 441 <para>
 442
 443 </para>
 444
 445 @str:
 446 @p:
 447 @Returns:
 448
 449
 450 <!-- ##### FUNCTION g_utf8_strlen ##### -->
 451 <para>
 452
 453 </para>
 454
 455 @p:
 456 @max:
 457 @Returns:
 458
 459
 460 <!-- ##### FUNCTION g_utf8_strncpy ##### -->
 461 <para>
 462
 463 </para>
 464
 465 @dest:
 466 @src:
 467 @n:
 468 @Returns:
 469
 470
 471 <!-- ##### FUNCTION g_utf8_strchr ##### -->
 472 <para>
 473
 474 </para>
 475
 476 @p:
 477 @len:
 478 @c:
 479 @Returns:
 480 <!-- # Unused Parameters # -->
 481 @ch:
 482
 483
 484 <!-- ##### FUNCTION g_utf8_strrchr ##### -->
 485 <para>
 486
 487 </para>
 488
 489 @p:
 490 @len:
 491 @c:
 492 @Returns:
 493 <!-- # Unused Parameters # -->
 494 @ch:
 495
 496
 497 <!-- ##### FUNCTION g_utf8_strreverse ##### -->
 498 <para>
 499
 500 </para>
 501
 502 @str:
 503 @len:
 504 @Returns:
 505
 506
 507 <!-- ##### FUNCTION g_utf8_validate ##### -->
 508 <para>
 509
 510 </para>
 511
 512 @str:
 513 @max_len:
 514 @end:
 515 @Returns:
 516
 517
 518 <!-- ##### FUNCTION g_utf8_strup ##### -->
 519 <para>
 520
 521 </para>
 522
 523 @str:
 524 @len:
 525 @Returns:
 526
 527
 528 <!-- ##### FUNCTION g_utf8_strdown ##### -->
 529 <para>
 530
 531 </para>
 532
 533 @str:
 534 @len:
 535 @Returns:
 536
 537
 538 <!-- ##### FUNCTION g_utf8_casefold ##### -->
 539 <para>
 540
 541 </para>
 542
 543 @str:
 544 @len:
 545 @Returns:
 546
 547
 548 <!-- ##### FUNCTION g_utf8_normalize ##### -->
 549 <para>
 550
 551 </para>
 552
 553 @str:
 554 @len:
 555 @mode:
 556 @Returns:
 557
 558
 559 <!-- ##### ENUM GNormalizeMode ##### -->
 560 <para>
 561 Defines how a Unicode string is transformed into a canonical
 562 form, standardizing such issues as whether a character with an accent is
 563 represented as a base character and combining accent or as a single precomposed
 564 character. Unicode strings should generally be normalized before comparing them.
 565 </para>
 566
 567 @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
 568   text content, such as the above-mentioned accent representation.
 569 @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
 570 @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
 571   forms rather than a maximally decomposed form.
 572 @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
 573 @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT also standardize the
 574   "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to the
 575   standard forms (in this case DIGIT THREE). Formatting information may be
 576   lost but for most text operations such characters should be considered the
 577   same.
 578 @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
 579 @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
 580   forms rather than a maximally decomposed form.
 581 @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
 582
 583 <!-- ##### FUNCTION g_utf8_collate ##### -->
 584 <para>
 585
 586 </para>
 587
 588 @str1:
 589 @str2:
 590 @Returns:
 591
 592
 593 <!-- ##### FUNCTION g_utf8_collate_key ##### -->
 594 <para>
 595
 596 </para>
 597
 598 @str:
 599 @len:
 600 @Returns:
 601
 602
 603 <!-- ##### FUNCTION g_utf8_collate_key_for_filename ##### -->
 604 <para>
 605
 606 </para>
 607
 608 @str:
 609 @len:
 610 @Returns:
 611
 612
 613 <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
 614 <para>
 615
 616 </para>
 617
 618 @str:
 619 @len:
 620 @items_read:
 621 @items_written:
 622 @error:
 623 @Returns:
 624
 625
 626 <!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
 627 <para>
 628
 629 </para>
 630
 631 @str:
 632 @len:
 633 @items_read:
 634 @items_written:
 635 @error:
 636 @Returns:
 637
 638
 639 <!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
 640 <para>
 641
 642 </para>
 643
 644 @str:
 645 @len:
 646 @items_written:
 647 @Returns:
 648
 649
 650 <!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
 651 <para>
 652
 653 </para>
 654
 655 @str:
 656 @len:
 657 @items_read:
 658 @items_written:
 659 @error:
 660 @Returns:
 661
 662
 663 <!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
 664 <para>
 665
 666 </para>
 667
 668 @str:
 669 @len:
 670 @items_read:
 671 @items_written:
 672 @error:
 673 @Returns:
 674
 675
 676 <!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
 677 <para>
 678
 679 </para>
 680
 681 @str:
 682 @len:
 683 @items_read:
 684 @items_written:
 685 @error:
 686 @Returns:
 687
 688
 689 <!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
 690 <para>
 691
 692 </para>
 693
 694 @str:
 695 @len:
 696 @items_read:
 697 @items_written:
 698 @error:
 699 @Returns:
 700
 701
 702 <!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
 703 <para>
 704
 705 </para>
 706
 707 @c:
 708 @outbuf:
 709 @Returns:
 710
 711