docs/reference/glib/tmpl/unicode.sgml

   1 <!-- ##### SECTION Title ##### -->
   2 Unicode Manipulation
   3
   4 <!-- ##### SECTION Short_Description ##### -->
   5 functions operating on Unicode characters and UTF-8 strings.
   6
   7 <!-- ##### SECTION Long_Description ##### -->
   8 <para>
   9 This section describes a number of functions for dealing with
  10 Unicode characters and strings.  There are analogues of the
  11 traditional <filename>ctype.h</filename> character classification
  12 and case conversion functions, UTF-8 analogues of some string utility
  13 functions, functions to perform normalization, case conversion and
  14 collation on UTF-8 strings and finally functions to convert between
  15 the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  16 </para>
  17
  18 <!-- ##### SECTION See_Also ##### -->
  19 <para>
  20 <variablelist>
  21
  22 <varlistentry>
  23 <term>g_locale_to_utf8(), g_locale_from_utf8()</term>
  24 <listitem><para>
  25 Convenience functions for converting between UTF-8 and the locale encoding.
  26 </para></listitem>
  27 </varlistentry>
  28
  29 </variablelist>
  30 </para>
  31
  32 <!-- ##### TYPEDEF gunichar ##### -->
  33 <para>
  34 A type which can hold any UCS-4 character code.
  35 </para>
  36
  37
  38 <!-- ##### TYPEDEF gunichar2 ##### -->
  39 <para>
  40 A type which can hold any UTF-16 code unit (a character may need two, as a surrogate pair).
  41 </para>
  42
  43
  44 <!-- ##### FUNCTION g_unichar_validate ##### -->
  45 <para>
  46
  47 </para>
  48
  49 @ch:
  50 @Returns:
  51
  52
  53 <!-- ##### FUNCTION g_unichar_isalnum ##### -->
  54 <para>
  55
  56 </para>
  57
  58 @c:
  59 @Returns:
  60
  61
  62 <!-- ##### FUNCTION g_unichar_isalpha ##### -->
  63 <para>
  64
  65 </para>
  66
  67 @c:
  68 @Returns:
  69
  70
  71 <!-- ##### FUNCTION g_unichar_iscntrl ##### -->
  72 <para>
  73
  74 </para>
  75
  76 @c:
  77 @Returns:
  78
  79
  80 <!-- ##### FUNCTION g_unichar_isdigit ##### -->
  81 <para>
  82
  83 </para>
  84
  85 @c:
  86 @Returns:
  87
  88
  89 <!-- ##### FUNCTION g_unichar_isgraph ##### -->
  90 <para>
  91
  92 </para>
  93
  94 @c:
  95 @Returns:
  96
  97
  98 <!-- ##### FUNCTION g_unichar_islower ##### -->
  99 <para>
 100
 101 </para>
 102
 103 @c:
 104 @Returns:
 105
 106
 107 <!-- ##### FUNCTION g_unichar_isprint ##### -->
 108 <para>
 109
 110 </para>
 111
 112 @c:
 113 @Returns:
 114
 115
 116 <!-- ##### FUNCTION g_unichar_ispunct ##### -->
 117 <para>
 118
 119 </para>
 120
 121 @c:
 122 @Returns:
 123
 124
 125 <!-- ##### FUNCTION g_unichar_isspace ##### -->
 126 <para>
 127
 128 </para>
 129
 130 @c:
 131 @Returns:
 132
 133
 134 <!-- ##### FUNCTION g_unichar_isupper ##### -->
 135 <para>
 136
 137 </para>
 138
 139 @c:
 140 @Returns:
 141
 142
 143 <!-- ##### FUNCTION g_unichar_isxdigit ##### -->
 144 <para>
 145
 146 </para>
 147
 148 @c:
 149 @Returns:
 150
 151
 152 <!-- ##### FUNCTION g_unichar_istitle ##### -->
 153 <para>
 154
 155 </para>
 156
 157 @c:
 158 @Returns:
 159
 160
 161 <!-- ##### FUNCTION g_unichar_isdefined ##### -->
 162 <para>
 163
 164 </para>
 165
 166 @c:
 167 @Returns:
 168
 169
 170 <!-- ##### FUNCTION g_unichar_iswide ##### -->
 171 <para>
 172
 173 </para>
 174
 175 @c:
 176 @Returns:
 177
 178
 179 <!-- ##### FUNCTION g_unichar_toupper ##### -->
 180 <para>
 181
 182 </para>
 183
 184 @c:
 185 @Returns:
 186
 187
 188 <!-- ##### FUNCTION g_unichar_tolower ##### -->
 189 <para>
 190
 191 </para>
 192
 193 @c:
 194 @Returns:
 195
 196
 197 <!-- ##### FUNCTION g_unichar_totitle ##### -->
 198 <para>
 199
 200 </para>
 201
 202 @c:
 203 @Returns:
 204
 205
 206 <!-- ##### FUNCTION g_unichar_digit_value ##### -->
 207 <para>
 208
 209 </para>
 210
 211 @c:
 212 @Returns:
 213
 214
 215 <!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
 216 <para>
 217
 218 </para>
 219
 220 @c:
 221 @Returns:
 222
 223
 224 <!-- ##### ENUM GUnicodeType ##### -->
 225 <para>
 226 These are the possible character classifications.
 227 See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
 228 >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
 229 </para>
 230
 231 @G_UNICODE_CONTROL:
 232 @G_UNICODE_FORMAT:
 233 @G_UNICODE_UNASSIGNED:
 234 @G_UNICODE_PRIVATE_USE:
 235 @G_UNICODE_SURROGATE:
 236 @G_UNICODE_LOWERCASE_LETTER:
 237 @G_UNICODE_MODIFIER_LETTER:
 238 @G_UNICODE_OTHER_LETTER:
 239 @G_UNICODE_TITLECASE_LETTER:
 240 @G_UNICODE_UPPERCASE_LETTER:
 241 @G_UNICODE_COMBINING_MARK:
 242 @G_UNICODE_ENCLOSING_MARK:
 243 @G_UNICODE_NON_SPACING_MARK:
 244 @G_UNICODE_DECIMAL_NUMBER:
 245 @G_UNICODE_LETTER_NUMBER:
 246 @G_UNICODE_OTHER_NUMBER:
 247 @G_UNICODE_CONNECT_PUNCTUATION:
 248 @G_UNICODE_DASH_PUNCTUATION:
 249 @G_UNICODE_CLOSE_PUNCTUATION:
 250 @G_UNICODE_FINAL_PUNCTUATION:
 251 @G_UNICODE_INITIAL_PUNCTUATION:
 252 @G_UNICODE_OTHER_PUNCTUATION:
 253 @G_UNICODE_OPEN_PUNCTUATION:
 254 @G_UNICODE_CURRENCY_SYMBOL:
 255 @G_UNICODE_MODIFIER_SYMBOL:
 256 @G_UNICODE_MATH_SYMBOL:
 257 @G_UNICODE_OTHER_SYMBOL:
 258 @G_UNICODE_LINE_SEPARATOR:
 259 @G_UNICODE_PARAGRAPH_SEPARATOR:
 260 @G_UNICODE_SPACE_SEPARATOR:
 261
 262 <!-- ##### FUNCTION g_unichar_type ##### -->
 263 <para>
 264
 265 </para>
 266
 267 @c:
 268 @Returns:
 269
 270
 271 <!-- ##### ENUM GUnicodeBreakType ##### -->
 272 <para>
 273 These are the possible line break classifications.
 274 See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
 275 >http://www.unicode.org/unicode/reports/tr14/</ulink>.
 276 </para>
 277
 278 @G_UNICODE_BREAK_MANDATORY:
 279 @G_UNICODE_BREAK_CARRIAGE_RETURN:
 280 @G_UNICODE_BREAK_LINE_FEED:
 281 @G_UNICODE_BREAK_COMBINING_MARK:
 282 @G_UNICODE_BREAK_SURROGATE:
 283 @G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
 284 @G_UNICODE_BREAK_INSEPARABLE:
 285 @G_UNICODE_BREAK_NON_BREAKING_GLUE:
 286 @G_UNICODE_BREAK_CONTINGENT:
 287 @G_UNICODE_BREAK_SPACE:
 288 @G_UNICODE_BREAK_AFTER:
 289 @G_UNICODE_BREAK_BEFORE:
 290 @G_UNICODE_BREAK_BEFORE_AND_AFTER:
 291 @G_UNICODE_BREAK_HYPHEN:
 292 @G_UNICODE_BREAK_NON_STARTER:
 293 @G_UNICODE_BREAK_OPEN_PUNCTUATION:
 294 @G_UNICODE_BREAK_CLOSE_PUNCTUATION:
 295 @G_UNICODE_BREAK_QUOTATION:
 296 @G_UNICODE_BREAK_EXCLAMATION:
 297 @G_UNICODE_BREAK_IDEOGRAPHIC:
 298 @G_UNICODE_BREAK_NUMERIC:
 299 @G_UNICODE_BREAK_INFIX_SEPARATOR:
 300 @G_UNICODE_BREAK_SYMBOL:
 301 @G_UNICODE_BREAK_ALPHABETIC:
 302 @G_UNICODE_BREAK_PREFIX:
 303 @G_UNICODE_BREAK_POSTFIX:
 304 @G_UNICODE_BREAK_COMPLEX_CONTEXT:
 305 @G_UNICODE_BREAK_AMBIGUOUS:
 306 @G_UNICODE_BREAK_UNKNOWN:
 307 @G_UNICODE_BREAK_NEXT_LINE:
 308 @G_UNICODE_BREAK_WORD_JOINER:
 309
 310 <!-- ##### FUNCTION g_unichar_break_type ##### -->
 311 <para>
 312
 313 </para>
 314
 315 @c:
 316 @Returns:
 317
 318
 319 <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
 320 <para>
 321
 322 </para>
 323
 324 @string:
 325 @len:
 326
 327
 328 <!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
 329 <para>
 330
 331 </para>
 332
 333 @ch:
 334 @result_len:
 335 @Returns:
 336
 337
 338 <!-- ##### FUNCTION g_unichar_get_mirror_char ##### -->
 339 <para>
 340
 341 </para>
 342
 343 @ch:
 344 @mirrored_ch:
 345 @Returns:
 346
 347
 348 <!-- ##### MACRO g_utf8_next_char ##### -->
 349 <para>
 350 Skips to the next character in a UTF-8 string. The string must be
 351 valid; this macro is as fast as possible, and has no error-checking.
 352 You would use this macro to iterate over a string character by
 353 character. The macro returns the start of the next UTF-8 character.
 354 Before using this macro, use g_utf8_validate() to validate strings
 355 that may contain invalid UTF-8.
 356 </para>
 357
 358 @p: Pointer to the start of a valid UTF-8 character.
 359
 360
 361 <!-- ##### FUNCTION g_utf8_get_char ##### -->
 362 <para>
 363
 364 </para>
 365
 366 @p:
 367 @Returns:
 368
 369
 370 <!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
 371 <para>
 372
 373 </para>
 374
 375 @p:
 376 @max_len:
 377 @Returns:
 378
 379
 380 <!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
 381 <para>
 382
 383 </para>
 384
 385 @str:
 386 @offset:
 387 @Returns:
 388
 389
 390 <!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
 391 <para>
 392
 393 </para>
 394
 395 @str:
 396 @pos:
 397 @Returns:
 398
 399
 400 <!-- ##### FUNCTION g_utf8_prev_char ##### -->
 401 <para>
 402
 403 </para>
 404
 405 @p:
 406 @Returns:
 407
 408
 409 <!-- ##### FUNCTION g_utf8_find_next_char ##### -->
 410 <para>
 411
 412 </para>
 413
 414 @p:
 415 @end:
 416 @Returns:
 417 <!-- # Unused Parameters # -->
 418 @bound:
 419
 420
 421 <!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
 422 <para>
 423
 424 </para>
 425
 426 @str:
 427 @p:
 428 @Returns:
 429
 430
 431 <!-- ##### FUNCTION g_utf8_strlen ##### -->
 432 <para>
 433
 434 </para>
 435
 436 @p:
 437 @max:
 438 @Returns:
 439
 440
 441 <!-- ##### FUNCTION g_utf8_strncpy ##### -->
 442 <para>
 443
 444 </para>
 445
 446 @dest:
 447 @src:
 448 @n:
 449 @Returns:
 450
 451
 452 <!-- ##### FUNCTION g_utf8_strchr ##### -->
 453 <para>
 454
 455 </para>
 456
 457 @p:
 458 @len:
 459 @c:
 460 @Returns:
 461 <!-- # Unused Parameters # -->
 462 @ch:
 463
 464
 465 <!-- ##### FUNCTION g_utf8_strrchr ##### -->
 466 <para>
 467
 468 </para>
 469
 470 @p:
 471 @len:
 472 @c:
 473 @Returns:
 474 <!-- # Unused Parameters # -->
 475 @ch:
 476
 477
 478 <!-- ##### FUNCTION g_utf8_strreverse ##### -->
 479 <para>
 480
 481 </para>
 482
 483 @str:
 484 @len:
 485 @Returns:
 486
 487
 488 <!-- ##### FUNCTION g_utf8_validate ##### -->
 489 <para>
 490
 491 </para>
 492
 493 @str:
 494 @max_len:
 495 @end:
 496 @Returns:
 497
 498
 499 <!-- ##### FUNCTION g_utf8_strup ##### -->
 500 <para>
 501
 502 </para>
 503
 504 @str:
 505 @len:
 506 @Returns:
 507
 508
 509 <!-- ##### FUNCTION g_utf8_strdown ##### -->
 510 <para>
 511
 512 </para>
 513
 514 @str:
 515 @len:
 516 @Returns:
 517
 518
 519 <!-- ##### FUNCTION g_utf8_casefold ##### -->
 520 <para>
 521
 522 </para>
 523
 524 @str:
 525 @len:
 526 @Returns:
 527
 528
 529 <!-- ##### FUNCTION g_utf8_normalize ##### -->
 530 <para>
 531
 532 </para>
 533
 534 @str:
 535 @len:
 536 @mode:
 537 @Returns:
 538
 539
 540 <!-- ##### ENUM GNormalizeMode ##### -->
 541 <para>
 542 Defines how a Unicode string is transformed into a canonical
 543 form, standardizing such issues as whether a character with an accent is
 544 represented as a base character and combining accent or as a single precomposed
 545 character. Unicode strings should generally be normalized before comparing them.
 546 </para>
 547
 548 @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
 549   text content, such as the above-mentioned accent representation.
 550 @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
 551 @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
 552   forms rather than a maximally decomposed form.
 553 @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
 554 @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT, also standardize the
 555   "compatibility" characters in Unicode, such as mapping SUPERSCRIPT THREE
 556   to its standard form (in this case DIGIT THREE). Formatting information may
 557   be lost, but for most text operations such characters should be considered
 558   the same.
 559 @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
 560 @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
 561   forms rather than a maximally decomposed form.
 562 @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
 563
 564 <!-- ##### FUNCTION g_utf8_collate ##### -->
 565 <para>
 566
 567 </para>
 568
 569 @str1:
 570 @str2:
 571 @Returns:
 572
 573
 574 <!-- ##### FUNCTION g_utf8_collate_key ##### -->
 575 <para>
 576
 577 </para>
 578
 579 @str:
 580 @len:
 581 @Returns:
 582
 583
 584 <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
 585 <para>
 586
 587 </para>
 588
 589 @str:
 590 @len:
 591 @items_read:
 592 @items_written:
 593 @error:
 594 @Returns:
 595
 596
 597 <!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
 598 <para>
 599
 600 </para>
 601
 602 @str:
 603 @len:
 604 @items_read:
 605 @items_written:
 606 @error:
 607 @Returns:
 608
 609
 610 <!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
 611 <para>
 612
 613 </para>
 614
 615 @str:
 616 @len:
 617 @items_written:
 618 @Returns:
 619
 620
 621 <!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
 622 <para>
 623
 624 </para>
 625
 626 @str:
 627 @len:
 628 @items_read:
 629 @items_written:
 630 @error:
 631 @Returns:
 632
 633
 634 <!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
 635 <para>
 636
 637 </para>
 638
 639 @str:
 640 @len:
 641 @items_read:
 642 @items_written:
 643 @error:
 644 @Returns:
 645
 646
 647 <!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
 648 <para>
 649
 650 </para>
 651
 652 @str:
 653 @len:
 654 @items_read:
 655 @items_written:
 656 @error:
 657 @Returns:
 658
 659
 660 <!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
 661 <para>
 662
 663 </para>
 664
 665 @str:
 666 @len:
 667 @items_read:
 668 @items_written:
 669 @error:
 670 @Returns:
 671
 672
 673 <!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
 674 <para>
 675
 676 </para>
 677
 678 @c:
 679 @outbuf:
 680 @Returns:
 681
 682