docs/reference/glib/tmpl/unicode.sgml

   1 <!-- ##### SECTION Title ##### -->
   2 Unicode Manipulation
   3
   4 <!-- ##### SECTION Short_Description ##### -->
   5 functions operating on Unicode characters and UTF-8 strings.
   6
   7 <!-- ##### SECTION Long_Description ##### -->
   8 <para>
   9 This section describes a number of functions for dealing with
  10 Unicode characters and strings.  There are analogues of the
  11 traditional <filename>ctype.h</filename> character classification
  12 and case conversion functions, UTF-8 analogues of some string utility
  13 functions, functions to perform normalization, case conversion and
  14 collation on UTF-8 strings, and finally functions to convert between
  15 the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  16 </para>
  17
  18 <!-- ##### SECTION See_Also ##### -->
  19 <para>
  20 <variablelist>
  21
  22 <varlistentry>
  23 <term>g_locale_to_utf8(), g_locale_from_utf8()</term>
  24 <listitem><para>
  25 Convenience functions for converting between UTF-8 and the locale encoding.
  26 </para></listitem>
  27 </varlistentry>
  28
  29 </variablelist>
  30 </para>
  31
  32 <!-- ##### SECTION Stability_Level ##### -->
  33
  34
  35 <!-- ##### TYPEDEF gunichar ##### -->
  36 <para>
  37 A type which can hold any UCS-4 character code.
  38 </para>
  39
  40
  41 <!-- ##### TYPEDEF gunichar2 ##### -->
  42 <para>
  43 A type which can hold any UTF-16 code unit. Characters outside the Basic Multilingual Plane are encoded as a surrogate pair of two such values.
  44 </para>
  45
  46
  47 <!-- ##### FUNCTION g_unichar_validate ##### -->
  48 <para>
  49
  50 </para>
  51
  52 @ch:
  53 @Returns:
  54
  55
  56 <!-- ##### FUNCTION g_unichar_isalnum ##### -->
  57 <para>
  58
  59 </para>
  60
  61 @c:
  62 @Returns:
  63
  64
  65 <!-- ##### FUNCTION g_unichar_isalpha ##### -->
  66 <para>
  67
  68 </para>
  69
  70 @c:
  71 @Returns:
  72
  73
  74 <!-- ##### FUNCTION g_unichar_iscntrl ##### -->
  75 <para>
  76
  77 </para>
  78
  79 @c:
  80 @Returns:
  81
  82
  83 <!-- ##### FUNCTION g_unichar_isdigit ##### -->
  84 <para>
  85
  86 </para>
  87
  88 @c:
  89 @Returns:
  90
  91
  92 <!-- ##### FUNCTION g_unichar_isgraph ##### -->
  93 <para>
  94
  95 </para>
  96
  97 @c:
  98 @Returns:
  99
 100
 101 <!-- ##### FUNCTION g_unichar_islower ##### -->
 102 <para>
 103
 104 </para>
 105
 106 @c:
 107 @Returns:
 108
 109
 110 <!-- ##### FUNCTION g_unichar_isprint ##### -->
 111 <para>
 112
 113 </para>
 114
 115 @c:
 116 @Returns:
 117
 118
 119 <!-- ##### FUNCTION g_unichar_ispunct ##### -->
 120 <para>
 121
 122 </para>
 123
 124 @c:
 125 @Returns:
 126
 127
 128 <!-- ##### FUNCTION g_unichar_isspace ##### -->
 129 <para>
 130
 131 </para>
 132
 133 @c:
 134 @Returns:
 135
 136
 137 <!-- ##### FUNCTION g_unichar_isupper ##### -->
 138 <para>
 139
 140 </para>
 141
 142 @c:
 143 @Returns:
 144
 145
 146 <!-- ##### FUNCTION g_unichar_isxdigit ##### -->
 147 <para>
 148
 149 </para>
 150
 151 @c:
 152 @Returns:
 153
 154
 155 <!-- ##### FUNCTION g_unichar_istitle ##### -->
 156 <para>
 157
 158 </para>
 159
 160 @c:
 161 @Returns:
 162
 163
 164 <!-- ##### FUNCTION g_unichar_isdefined ##### -->
 165 <para>
 166
 167 </para>
 168
 169 @c:
 170 @Returns:
 171
 172
 173 <!-- ##### FUNCTION g_unichar_iswide ##### -->
 174 <para>
 175
 176 </para>
 177
 178 @c:
 179 @Returns:
 180
 181
 182 <!-- ##### FUNCTION g_unichar_toupper ##### -->
 183 <para>
 184
 185 </para>
 186
 187 @c:
 188 @Returns:
 189
 190
 191 <!-- ##### FUNCTION g_unichar_tolower ##### -->
 192 <para>
 193
 194 </para>
 195
 196 @c:
 197 @Returns:
 198
 199
 200 <!-- ##### FUNCTION g_unichar_totitle ##### -->
 201 <para>
 202
 203 </para>
 204
 205 @c:
 206 @Returns:
 207
 208
 209 <!-- ##### FUNCTION g_unichar_digit_value ##### -->
 210 <para>
 211
 212 </para>
 213
 214 @c:
 215 @Returns:
 216
 217
 218 <!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
 219 <para>
 220
 221 </para>
 222
 223 @c:
 224 @Returns:
 225
 226
 227 <!-- ##### ENUM GUnicodeType ##### -->
 228 <para>
 229 These are the possible character classifications.
 230 See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
 231 >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
 232 </para>
 233
 234 @G_UNICODE_CONTROL:
 235 @G_UNICODE_FORMAT:
 236 @G_UNICODE_UNASSIGNED:
 237 @G_UNICODE_PRIVATE_USE:
 238 @G_UNICODE_SURROGATE:
 239 @G_UNICODE_LOWERCASE_LETTER:
 240 @G_UNICODE_MODIFIER_LETTER:
 241 @G_UNICODE_OTHER_LETTER:
 242 @G_UNICODE_TITLECASE_LETTER:
 243 @G_UNICODE_UPPERCASE_LETTER:
 244 @G_UNICODE_COMBINING_MARK:
 245 @G_UNICODE_ENCLOSING_MARK:
 246 @G_UNICODE_NON_SPACING_MARK:
 247 @G_UNICODE_DECIMAL_NUMBER:
 248 @G_UNICODE_LETTER_NUMBER:
 249 @G_UNICODE_OTHER_NUMBER:
 250 @G_UNICODE_CONNECT_PUNCTUATION:
 251 @G_UNICODE_DASH_PUNCTUATION:
 252 @G_UNICODE_CLOSE_PUNCTUATION:
 253 @G_UNICODE_FINAL_PUNCTUATION:
 254 @G_UNICODE_INITIAL_PUNCTUATION:
 255 @G_UNICODE_OTHER_PUNCTUATION:
 256 @G_UNICODE_OPEN_PUNCTUATION:
 257 @G_UNICODE_CURRENCY_SYMBOL:
 258 @G_UNICODE_MODIFIER_SYMBOL:
 259 @G_UNICODE_MATH_SYMBOL:
 260 @G_UNICODE_OTHER_SYMBOL:
 261 @G_UNICODE_LINE_SEPARATOR:
 262 @G_UNICODE_PARAGRAPH_SEPARATOR:
 263 @G_UNICODE_SPACE_SEPARATOR:
 264
 265 <!-- ##### FUNCTION g_unichar_type ##### -->
 266 <para>
 267
 268 </para>
 269
 270 @c:
 271 @Returns:
 272
 273
 274 <!-- ##### ENUM GUnicodeBreakType ##### -->
 275 <para>
 276 These are the possible line break classifications.
 277 See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
 278 >http://www.unicode.org/unicode/reports/tr14/</ulink>.
 279 </para>
 280
 281 @G_UNICODE_BREAK_MANDATORY:
 282 @G_UNICODE_BREAK_CARRIAGE_RETURN:
 283 @G_UNICODE_BREAK_LINE_FEED:
 284 @G_UNICODE_BREAK_COMBINING_MARK:
 285 @G_UNICODE_BREAK_SURROGATE:
 286 @G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
 287 @G_UNICODE_BREAK_INSEPARABLE:
 288 @G_UNICODE_BREAK_NON_BREAKING_GLUE:
 289 @G_UNICODE_BREAK_CONTINGENT:
 290 @G_UNICODE_BREAK_SPACE:
 291 @G_UNICODE_BREAK_AFTER:
 292 @G_UNICODE_BREAK_BEFORE:
 293 @G_UNICODE_BREAK_BEFORE_AND_AFTER:
 294 @G_UNICODE_BREAK_HYPHEN:
 295 @G_UNICODE_BREAK_NON_STARTER:
 296 @G_UNICODE_BREAK_OPEN_PUNCTUATION:
 297 @G_UNICODE_BREAK_CLOSE_PUNCTUATION:
 298 @G_UNICODE_BREAK_QUOTATION:
 299 @G_UNICODE_BREAK_EXCLAMATION:
 300 @G_UNICODE_BREAK_IDEOGRAPHIC:
 301 @G_UNICODE_BREAK_NUMERIC:
 302 @G_UNICODE_BREAK_INFIX_SEPARATOR:
 303 @G_UNICODE_BREAK_SYMBOL:
 304 @G_UNICODE_BREAK_ALPHABETIC:
 305 @G_UNICODE_BREAK_PREFIX:
 306 @G_UNICODE_BREAK_POSTFIX:
 307 @G_UNICODE_BREAK_COMPLEX_CONTEXT:
 308 @G_UNICODE_BREAK_AMBIGUOUS:
 309 @G_UNICODE_BREAK_UNKNOWN:
 310 @G_UNICODE_BREAK_NEXT_LINE:
 311 @G_UNICODE_BREAK_WORD_JOINER:
 312
 313 <!-- ##### FUNCTION g_unichar_break_type ##### -->
 314 <para>
 315
 316 </para>
 317
 318 @c:
 319 @Returns:
 320
 321
 322 <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
 323 <para>
 324
 325 </para>
 326
 327 @string:
 328 @len:
 329
 330
 331 <!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
 332 <para>
 333
 334 </para>
 335
 336 @ch:
 337 @result_len:
 338 @Returns:
 339
 340
 341 <!-- ##### FUNCTION g_unichar_get_mirror_char ##### -->
 342 <para>
 343
 344 </para>
 345
 346 @ch:
 347 @mirrored_ch:
 348 @Returns:
 349
 350
 351 <!-- ##### MACRO g_utf8_next_char ##### -->
 352 <para>
 353 Skips to the next character in a UTF-8 string. The string must be
 354 valid; this macro is as fast as possible, and has no error-checking.
 355 You would use this macro to iterate over a string character by
 356 character. The macro returns the start of the next UTF-8 character.
 357 Before using this macro, use g_utf8_validate() to validate strings
 358 that may contain invalid UTF-8.
 359 </para>
 360
 361 @p: Pointer to the start of a valid UTF-8 character.
 362
 363
 364 <!-- ##### FUNCTION g_utf8_get_char ##### -->
 365 <para>
 366
 367 </para>
 368
 369 @p:
 370 @Returns:
 371
 372
 373 <!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
 374 <para>
 375
 376 </para>
 377
 378 @p:
 379 @max_len:
 380 @Returns:
 381
 382
 383 <!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
 384 <para>
 385
 386 </para>
 387
 388 @str:
 389 @offset:
 390 @Returns:
 391
 392
 393 <!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
 394 <para>
 395
 396 </para>
 397
 398 @str:
 399 @pos:
 400 @Returns:
 401
 402
 403 <!-- ##### FUNCTION g_utf8_prev_char ##### -->
 404 <para>
 405
 406 </para>
 407
 408 @p:
 409 @Returns:
 410
 411
 412 <!-- ##### FUNCTION g_utf8_find_next_char ##### -->
 413 <para>
 414
 415 </para>
 416
 417 @p:
 418 @end:
 419 @Returns:
 420 <!-- # Unused Parameters # -->
 421 @bound:
 422
 423
 424 <!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
 425 <para>
 426
 427 </para>
 428
 429 @str:
 430 @p:
 431 @Returns:
 432
 433
 434 <!-- ##### FUNCTION g_utf8_strlen ##### -->
 435 <para>
 436
 437 </para>
 438
 439 @p:
 440 @max:
 441 @Returns:
 442
 443
 444 <!-- ##### FUNCTION g_utf8_strncpy ##### -->
 445 <para>
 446
 447 </para>
 448
 449 @dest:
 450 @src:
 451 @n:
 452 @Returns:
 453
 454
 455 <!-- ##### FUNCTION g_utf8_strchr ##### -->
 456 <para>
 457
 458 </para>
 459
 460 @p:
 461 @len:
 462 @c:
 463 @Returns:
 464 <!-- # Unused Parameters # -->
 465 @ch:
 466
 467
 468 <!-- ##### FUNCTION g_utf8_strrchr ##### -->
 469 <para>
 470
 471 </para>
 472
 473 @p:
 474 @len:
 475 @c:
 476 @Returns:
 477 <!-- # Unused Parameters # -->
 478 @ch:
 479
 480
 481 <!-- ##### FUNCTION g_utf8_strreverse ##### -->
 482 <para>
 483
 484 </para>
 485
 486 @str:
 487 @len:
 488 @Returns:
 489
 490
 491 <!-- ##### FUNCTION g_utf8_validate ##### -->
 492 <para>
 493
 494 </para>
 495
 496 @str:
 497 @max_len:
 498 @end:
 499 @Returns:
 500
 501
 502 <!-- ##### FUNCTION g_utf8_strup ##### -->
 503 <para>
 504
 505 </para>
 506
 507 @str:
 508 @len:
 509 @Returns:
 510
 511
 512 <!-- ##### FUNCTION g_utf8_strdown ##### -->
 513 <para>
 514
 515 </para>
 516
 517 @str:
 518 @len:
 519 @Returns:
 520
 521
 522 <!-- ##### FUNCTION g_utf8_casefold ##### -->
 523 <para>
 524
 525 </para>
 526
 527 @str:
 528 @len:
 529 @Returns:
 530
 531
 532 <!-- ##### FUNCTION g_utf8_normalize ##### -->
 533 <para>
 534
 535 </para>
 536
 537 @str:
 538 @len:
 539 @mode:
 540 @Returns:
 541
 542
 543 <!-- ##### ENUM GNormalizeMode ##### -->
 544 <para>
 545 Defines how a Unicode string is transformed into a canonical
 546 form, standardizing such issues as whether a character with an accent is
 547 represented as a base character and combining accent or as a single precomposed
 548 character. Unicode strings should generally be normalized before comparing them.
 549 </para>
 550
 551 @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
 552   text content, such as the above-mentioned accent representation.
 553 @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
 554 @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
 555   forms rather than a maximally decomposed form.
 556 @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
 557 @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT also standardize the
 558   "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to the
 559   standard forms (in this case DIGIT THREE). Formatting information may be
 560   lost but for most text operations such characters should be considered the
 561   same.
 562 @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
 563 @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
 564   forms rather than a maximally decomposed form.
 565 @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
 566
 567 <!-- ##### FUNCTION g_utf8_collate ##### -->
 568 <para>
 569
 570 </para>
 571
 572 @str1:
 573 @str2:
 574 @Returns:
 575
 576
 577 <!-- ##### FUNCTION g_utf8_collate_key ##### -->
 578 <para>
 579
 580 </para>
 581
 582 @str:
 583 @len:
 584 @Returns:
 585
 586
 587 <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
 588 <para>
 589
 590 </para>
 591
 592 @str:
 593 @len:
 594 @items_read:
 595 @items_written:
 596 @error:
 597 @Returns:
 598
 599
 600 <!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
 601 <para>
 602
 603 </para>
 604
 605 @str:
 606 @len:
 607 @items_read:
 608 @items_written:
 609 @error:
 610 @Returns:
 611
 612
 613 <!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
 614 <para>
 615
 616 </para>
 617
 618 @str:
 619 @len:
 620 @items_written:
 621 @Returns:
 622
 623
 624 <!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
 625 <para>
 626
 627 </para>
 628
 629 @str:
 630 @len:
 631 @items_read:
 632 @items_written:
 633 @error:
 634 @Returns:
 635
 636
 637 <!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
 638 <para>
 639
 640 </para>
 641
 642 @str:
 643 @len:
 644 @items_read:
 645 @items_written:
 646 @error:
 647 @Returns:
 648
 649
 650 <!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
 651 <para>
 652
 653 </para>
 654
 655 @str:
 656 @len:
 657 @items_read:
 658 @items_written:
 659 @error:
 660 @Returns:
 661
 662
 663 <!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
 664 <para>
 665
 666 </para>
 667
 668 @str:
 669 @len:
 670 @items_read:
 671 @items_written:
 672 @error:
 673 @Returns:
 674
 675
 676 <!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
 677 <para>
 678
 679 </para>
 680
 681 @c:
 682 @outbuf:
 683 @Returns:
 684
 685