docs/reference/glib/tmpl/unicode.sgml

   1 <!-- ##### SECTION Title ##### -->
   2 Unicode Manipulation
   3
   4 <!-- ##### SECTION Short_Description ##### -->
   5 functions operating on Unicode characters and UTF-8 strings.
   6
   7 <!-- ##### SECTION Long_Description ##### -->
   8 <para>
   9 This section describes a number of functions for dealing with
  10 Unicode characters and strings.  There are analogues of the
  11 traditional <filename>ctype.h</filename> character classification
  12 and case conversion functions, UTF-8 analogues of some string utility
  13 functions, functions to perform normalization, case conversion and
  14 collation on UTF-8 strings and finally functions to convert between
  15 the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  16 </para>
  17
  18 <!-- ##### SECTION See_Also ##### -->
  19 <para>
  20 <variablelist>
  21
  22 <varlistentry>
  23 <term>g_locale_to_utf8(), g_locale_from_utf8()</term>
  24 <listitem><para>
  25 Convenience functions for converting between UTF-8 and the locale encoding.
  26 </para></listitem>
  27 </varlistentry>
  28
  29 </variablelist>
  30 </para>
  31
  32 <!-- ##### SECTION Stability_Level ##### -->
  33
  34
  35 <!-- ##### TYPEDEF gunichar ##### -->
  36 <para>
  37 A type which can hold any UCS-4 character code.
  38 </para>
  39
  40
  41 <!-- ##### TYPEDEF gunichar2 ##### -->
  42 <para>
  43 A type which can hold any UTF-16 code unit.
  44 </para>
  45
  46
  47 <!-- ##### FUNCTION g_unichar_validate ##### -->
  48 <para>
  49
  50 </para>
  51
  52 @ch:
  53 @Returns:
  54
  55
  56 <!-- ##### FUNCTION g_unichar_isalnum ##### -->
  57 <para>
  58
  59 </para>
  60
  61 @c:
  62 @Returns:
  63
  64
  65 <!-- ##### FUNCTION g_unichar_isalpha ##### -->
  66 <para>
  67
  68 </para>
  69
  70 @c:
  71 @Returns:
  72
  73
  74 <!-- ##### FUNCTION g_unichar_iscntrl ##### -->
  75 <para>
  76
  77 </para>
  78
  79 @c:
  80 @Returns:
  81
  82
  83 <!-- ##### FUNCTION g_unichar_isdigit ##### -->
  84 <para>
  85
  86 </para>
  87
  88 @c:
  89 @Returns:
  90
  91
  92 <!-- ##### FUNCTION g_unichar_isgraph ##### -->
  93 <para>
  94
  95 </para>
  96
  97 @c:
  98 @Returns:
  99
 100
 101 <!-- ##### FUNCTION g_unichar_islower ##### -->
 102 <para>
 103
 104 </para>
 105
 106 @c:
 107 @Returns:
 108
 109
 110 <!-- ##### FUNCTION g_unichar_isprint ##### -->
 111 <para>
 112
 113 </para>
 114
 115 @c:
 116 @Returns:
 117
 118
 119 <!-- ##### FUNCTION g_unichar_ispunct ##### -->
 120 <para>
 121
 122 </para>
 123
 124 @c:
 125 @Returns:
 126
 127
 128 <!-- ##### FUNCTION g_unichar_isspace ##### -->
 129 <para>
 130
 131 </para>
 132
 133 @c:
 134 @Returns:
 135
 136
 137 <!-- ##### FUNCTION g_unichar_isupper ##### -->
 138 <para>
 139
 140 </para>
 141
 142 @c:
 143 @Returns:
 144
 145
 146 <!-- ##### FUNCTION g_unichar_isxdigit ##### -->
 147 <para>
 148
 149 </para>
 150
 151 @c:
 152 @Returns:
 153
 154
 155 <!-- ##### FUNCTION g_unichar_istitle ##### -->
 156 <para>
 157
 158 </para>
 159
 160 @c:
 161 @Returns:
 162
 163
 164 <!-- ##### FUNCTION g_unichar_isdefined ##### -->
 165 <para>
 166
 167 </para>
 168
 169 @c:
 170 @Returns:
 171
 172
 173 <!-- ##### FUNCTION g_unichar_iswide ##### -->
 174 <para>
 175
 176 </para>
 177
 178 @c:
 179 @Returns:
 180
 181
 182 <!-- ##### FUNCTION g_unichar_toupper ##### -->
 183 <para>
 184
 185 </para>
 186
 187 @c:
 188 @Returns:
 189
 190
 191 <!-- ##### FUNCTION g_unichar_tolower ##### -->
 192 <para>
 193
 194 </para>
 195
 196 @c:
 197 @Returns:
 198
 199
 200 <!-- ##### FUNCTION g_unichar_totitle ##### -->
 201 <para>
 202
 203 </para>
 204
 205 @c:
 206 @Returns:
 207
 208
 209 <!-- ##### FUNCTION g_unichar_digit_value ##### -->
 210 <para>
 211
 212 </para>
 213
 214 @c:
 215 @Returns:
 216
 217
 218 <!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
 219 <para>
 220
 221 </para>
 222
 223 @c:
 224 @Returns:
 225
 226
 227 <!-- ##### ENUM GUnicodeType ##### -->
 228 <para>
 229 These are the possible character classifications.
 230 See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
 231 >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
 232 </para>
 233
 234 @G_UNICODE_CONTROL:
 235 @G_UNICODE_FORMAT:
 236 @G_UNICODE_UNASSIGNED:
 237 @G_UNICODE_PRIVATE_USE:
 238 @G_UNICODE_SURROGATE:
 239 @G_UNICODE_LOWERCASE_LETTER:
 240 @G_UNICODE_MODIFIER_LETTER:
 241 @G_UNICODE_OTHER_LETTER:
 242 @G_UNICODE_TITLECASE_LETTER:
 243 @G_UNICODE_UPPERCASE_LETTER:
 244 @G_UNICODE_COMBINING_MARK:
 245 @G_UNICODE_ENCLOSING_MARK:
 246 @G_UNICODE_NON_SPACING_MARK:
 247 @G_UNICODE_DECIMAL_NUMBER:
 248 @G_UNICODE_LETTER_NUMBER:
 249 @G_UNICODE_OTHER_NUMBER:
 250 @G_UNICODE_CONNECT_PUNCTUATION:
 251 @G_UNICODE_DASH_PUNCTUATION:
 252 @G_UNICODE_CLOSE_PUNCTUATION:
 253 @G_UNICODE_FINAL_PUNCTUATION:
 254 @G_UNICODE_INITIAL_PUNCTUATION:
 255 @G_UNICODE_OTHER_PUNCTUATION:
 256 @G_UNICODE_OPEN_PUNCTUATION:
 257 @G_UNICODE_CURRENCY_SYMBOL:
 258 @G_UNICODE_MODIFIER_SYMBOL:
 259 @G_UNICODE_MATH_SYMBOL:
 260 @G_UNICODE_OTHER_SYMBOL:
 261 @G_UNICODE_LINE_SEPARATOR:
 262 @G_UNICODE_PARAGRAPH_SEPARATOR:
 263 @G_UNICODE_SPACE_SEPARATOR:
 264
 265 <!-- ##### FUNCTION g_unichar_type ##### -->
 266 <para>
 267
 268 </para>
 269
 270 @c:
 271 @Returns:
 272
 273
 274 <!-- ##### ENUM GUnicodeBreakType ##### -->
 275 <para>
 276 These are the possible line break classifications.
 277 GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1.
 278 The five Hangul types were added in Unicode 4.1, so they were
 279 introduced in GLib 2.10.  Note that new types may be added in the future.
 280 Applications should be ready to handle unknown values.
 281 They may be regarded as %G_UNICODE_BREAK_UNKNOWN.
 282 See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
 283 >http://www.unicode.org/unicode/reports/tr14/</ulink>.
 284
 285 </para>
 286
 287 @G_UNICODE_BREAK_MANDATORY:
 288 @G_UNICODE_BREAK_CARRIAGE_RETURN:
 289 @G_UNICODE_BREAK_LINE_FEED:
 290 @G_UNICODE_BREAK_COMBINING_MARK:
 291 @G_UNICODE_BREAK_SURROGATE:
 292 @G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
 293 @G_UNICODE_BREAK_INSEPARABLE:
 294 @G_UNICODE_BREAK_NON_BREAKING_GLUE:
 295 @G_UNICODE_BREAK_CONTINGENT:
 296 @G_UNICODE_BREAK_SPACE:
 297 @G_UNICODE_BREAK_AFTER:
 298 @G_UNICODE_BREAK_BEFORE:
 299 @G_UNICODE_BREAK_BEFORE_AND_AFTER:
 300 @G_UNICODE_BREAK_HYPHEN:
 301 @G_UNICODE_BREAK_NON_STARTER:
 302 @G_UNICODE_BREAK_OPEN_PUNCTUATION:
 303 @G_UNICODE_BREAK_CLOSE_PUNCTUATION:
 304 @G_UNICODE_BREAK_QUOTATION:
 305 @G_UNICODE_BREAK_EXCLAMATION:
 306 @G_UNICODE_BREAK_IDEOGRAPHIC:
 307 @G_UNICODE_BREAK_NUMERIC:
 308 @G_UNICODE_BREAK_INFIX_SEPARATOR:
 309 @G_UNICODE_BREAK_SYMBOL:
 310 @G_UNICODE_BREAK_ALPHABETIC:
 311 @G_UNICODE_BREAK_PREFIX:
 312 @G_UNICODE_BREAK_POSTFIX:
 313 @G_UNICODE_BREAK_COMPLEX_CONTEXT:
 314 @G_UNICODE_BREAK_AMBIGUOUS:
 315 @G_UNICODE_BREAK_UNKNOWN:
 316 @G_UNICODE_BREAK_NEXT_LINE:
 317 @G_UNICODE_BREAK_WORD_JOINER:
 318 @G_UNICODE_BREAK_HANGUL_L_JAMO:
 319 @G_UNICODE_BREAK_HANGUL_V_JAMO:
 320 @G_UNICODE_BREAK_HANGUL_T_JAMO:
 321 @G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
 322 @G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
 323
 324
 325 <!-- ##### FUNCTION g_unichar_break_type ##### -->
 326 <para>
 327
 328 </para>
 329
 330 @c:
 331 @Returns:
 332
 333
 334 <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
 335 <para>
 336
 337 </para>
 338
 339 @string:
 340 @len:
 341
 342
 343 <!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
 344 <para>
 345
 346 </para>
 347
 348 @ch:
 349 @result_len:
 350 @Returns:
 351
 352
 353 <!-- ##### FUNCTION g_unichar_get_mirror_char ##### -->
 354 <para>
 355
 356 </para>
 357
 358 @ch:
 359 @mirrored_ch:
 360 @Returns:
 361
 362
 363 <!-- ##### MACRO g_utf8_next_char ##### -->
 364 <para>
 365 Skips to the next character in a UTF-8 string. The string must be
 366 valid; this macro is as fast as possible, and has no error-checking.
 367 You would use this macro to iterate over a string character by
 368 character. The macro returns the start of the next UTF-8 character.
 369 Before using this macro, use g_utf8_validate() to validate strings
 370 that may contain invalid UTF-8.
 371 </para>
 372
 373 @p: Pointer to the start of a valid UTF-8 character.
 374
 375
 376 <!-- ##### FUNCTION g_utf8_get_char ##### -->
 377 <para>
 378
 379 </para>
 380
 381 @p:
 382 @Returns:
 383
 384
 385 <!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
 386 <para>
 387
 388 </para>
 389
 390 @p:
 391 @max_len:
 392 @Returns:
 393
 394
 395 <!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
 396 <para>
 397
 398 </para>
 399
 400 @str:
 401 @offset:
 402 @Returns:
 403
 404
 405 <!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
 406 <para>
 407
 408 </para>
 409
 410 @str:
 411 @pos:
 412 @Returns:
 413
 414
 415 <!-- ##### FUNCTION g_utf8_prev_char ##### -->
 416 <para>
 417
 418 </para>
 419
 420 @p:
 421 @Returns:
 422
 423
 424 <!-- ##### FUNCTION g_utf8_find_next_char ##### -->
 425 <para>
 426
 427 </para>
 428
 429 @p:
 430 @end:
 431 @Returns:
 432 <!-- # Unused Parameters # -->
 433 @bound:
 434
 435
 436 <!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
 437 <para>
 438
 439 </para>
 440
 441 @str:
 442 @p:
 443 @Returns:
 444
 445
 446 <!-- ##### FUNCTION g_utf8_strlen ##### -->
 447 <para>
 448
 449 </para>
 450
 451 @p:
 452 @max:
 453 @Returns:
 454
 455
 456 <!-- ##### FUNCTION g_utf8_strncpy ##### -->
 457 <para>
 458
 459 </para>
 460
 461 @dest:
 462 @src:
 463 @n:
 464 @Returns:
 465
 466
 467 <!-- ##### FUNCTION g_utf8_strchr ##### -->
 468 <para>
 469
 470 </para>
 471
 472 @p:
 473 @len:
 474 @c:
 475 @Returns:
 476 <!-- # Unused Parameters # -->
 477 @ch:
 478
 479
 480 <!-- ##### FUNCTION g_utf8_strrchr ##### -->
 481 <para>
 482
 483 </para>
 484
 485 @p:
 486 @len:
 487 @c:
 488 @Returns:
 489 <!-- # Unused Parameters # -->
 490 @ch:
 491
 492
 493 <!-- ##### FUNCTION g_utf8_strreverse ##### -->
 494 <para>
 495
 496 </para>
 497
 498 @str:
 499 @len:
 500 @Returns:
 501
 502
 503 <!-- ##### FUNCTION g_utf8_validate ##### -->
 504 <para>
 505
 506 </para>
 507
 508 @str:
 509 @max_len:
 510 @end:
 511 @Returns:
 512
 513
 514 <!-- ##### FUNCTION g_utf8_strup ##### -->
 515 <para>
 516
 517 </para>
 518
 519 @str:
 520 @len:
 521 @Returns:
 522
 523
 524 <!-- ##### FUNCTION g_utf8_strdown ##### -->
 525 <para>
 526
 527 </para>
 528
 529 @str:
 530 @len:
 531 @Returns:
 532
 533
 534 <!-- ##### FUNCTION g_utf8_casefold ##### -->
 535 <para>
 536
 537 </para>
 538
 539 @str:
 540 @len:
 541 @Returns:
 542
 543
 544 <!-- ##### FUNCTION g_utf8_normalize ##### -->
 545 <para>
 546
 547 </para>
 548
 549 @str:
 550 @len:
 551 @mode:
 552 @Returns:
 553
 554
 555 <!-- ##### ENUM GNormalizeMode ##### -->
 556 <para>
 557 Defines how a Unicode string is transformed into a canonical
 558 form, standardizing such issues as whether a character with an accent is
 559 represented as a base character and combining accent or as a single precomposed
 560 character. Unicode strings should generally be normalized before comparing them.
 561 </para>
 562
 563 @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
 564   text content, such as the above-mentioned accent representation.
 565 @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
 566 @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
 567   forms rather than a maximally decomposed form.
 568 @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
 569 @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT also standardize the
 570   "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to the
 571   standard forms (in this case DIGIT THREE). Formatting information may be
 572   lost but for most text operations such characters should be considered the
 573   same.
 574 @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
 575 @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
 576   forms rather than a maximally decomposed form.
 577 @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
 578
 579 <!-- ##### FUNCTION g_utf8_collate ##### -->
 580 <para>
 581
 582 </para>
 583
 584 @str1:
 585 @str2:
 586 @Returns:
 587
 588
 589 <!-- ##### FUNCTION g_utf8_collate_key ##### -->
 590 <para>
 591
 592 </para>
 593
 594 @str:
 595 @len:
 596 @Returns:
 597
 598
 599 <!-- ##### FUNCTION g_utf8_collate_key_for_filename ##### -->
 600 <para>
 601
 602 </para>
 603
 604 @str:
 605 @len:
 606 @Returns:
 607
 608
 609 <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
 610 <para>
 611
 612 </para>
 613
 614 @str:
 615 @len:
 616 @items_read:
 617 @items_written:
 618 @error:
 619 @Returns:
 620
 621
 622 <!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
 623 <para>
 624
 625 </para>
 626
 627 @str:
 628 @len:
 629 @items_read:
 630 @items_written:
 631 @error:
 632 @Returns:
 633
 634
 635 <!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
 636 <para>
 637
 638 </para>
 639
 640 @str:
 641 @len:
 642 @items_written:
 643 @Returns:
 644
 645
 646 <!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
 647 <para>
 648
 649 </para>
 650
 651 @str:
 652 @len:
 653 @items_read:
 654 @items_written:
 655 @error:
 656 @Returns:
 657
 658
 659 <!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
 660 <para>
 661
 662 </para>
 663
 664 @str:
 665 @len:
 666 @items_read:
 667 @items_written:
 668 @error:
 669 @Returns:
 670
 671
 672 <!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
 673 <para>
 674
 675 </para>
 676
 677 @str:
 678 @len:
 679 @items_read:
 680 @items_written:
 681 @error:
 682 @Returns:
 683
 684
 685 <!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
 686 <para>
 687
 688 </para>
 689
 690 @str:
 691 @len:
 692 @items_read:
 693 @items_written:
 694 @error:
 695 @Returns:
 696
 697
 698 <!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
 699 <para>
 700
 701 </para>
 702
 703 @c:
 704 @outbuf:
 705 @Returns:
 706
 707