docs/reference/glib/tmpl/unicode.sgml

   1 <!-- ##### SECTION Title ##### -->
   2 Unicode Manipulation
   3
   4 <!-- ##### SECTION Short_Description ##### -->
   5 functions operating on Unicode characters and UTF-8 strings.
   6
   7 <!-- ##### SECTION Long_Description ##### -->
   8 <para>
   9 This section describes a number of functions for dealing with
  10 Unicode characters and strings.  There are analogues of the
  11 traditional <filename>ctype.h</filename> character classification
  12 and case conversion functions, UTF-8 analogues of some string utility
  13 functions, functions to perform normalization, case conversion and
  14 collation on UTF-8 strings, and finally functions to convert between
  15 the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  16 </para>
  17
  18 <!-- ##### SECTION See_Also ##### -->
  19 <para>
  20 <variablelist>
  21
  22 <varlistentry>
  23 <term>g_locale_to_utf8(), g_locale_from_utf8()</term>
  24 <listitem><para>
  25 Convenience functions for converting between UTF-8 and the locale encoding.
  26 </para></listitem>
  27 </varlistentry>
  28
  29 </variablelist>
  30 </para>
  31
  32 <!-- ##### TYPEDEF gunichar ##### -->
  33 <para>
  34 A type which can hold any UCS-4 character code.
  35 </para>
  36
  37
  38 <!-- ##### TYPEDEF gunichar2 ##### -->
  39 <para>
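  40 A type which can hold any UTF-16 code unit. Characters outside the Basic Multilingual Plane are represented in UTF-16 by a pair of such units (a surrogate pair).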
  41 </para>
  42
  43
  44 <!-- ##### FUNCTION g_unichar_validate ##### -->
  45 <para>
  46
  47 </para>
  48
  49 @ch:
  50 @Returns:
  51
  52
  53 <!-- ##### FUNCTION g_unichar_isalnum ##### -->
  54 <para>
  55
  56 </para>
  57
  58 @c:
  59 @Returns:
  60
  61
  62 <!-- ##### FUNCTION g_unichar_isalpha ##### -->
  63 <para>
  64
  65 </para>
  66
  67 @c:
  68 @Returns:
  69
  70
  71 <!-- ##### FUNCTION g_unichar_iscntrl ##### -->
  72 <para>
  73
  74 </para>
  75
  76 @c:
  77 @Returns:
  78
  79
  80 <!-- ##### FUNCTION g_unichar_isdigit ##### -->
  81 <para>
  82
  83 </para>
  84
  85 @c:
  86 @Returns:
  87
  88
  89 <!-- ##### FUNCTION g_unichar_isgraph ##### -->
  90 <para>
  91
  92 </para>
  93
  94 @c:
  95 @Returns:
  96
  97
  98 <!-- ##### FUNCTION g_unichar_islower ##### -->
  99 <para>
 100
 101 </para>
 102
 103 @c:
 104 @Returns:
 105
 106
 107 <!-- ##### FUNCTION g_unichar_isprint ##### -->
 108 <para>
 109
 110 </para>
 111
 112 @c:
 113 @Returns:
 114
 115
 116 <!-- ##### FUNCTION g_unichar_ispunct ##### -->
 117 <para>
 118
 119 </para>
 120
 121 @c:
 122 @Returns:
 123
 124
 125 <!-- ##### FUNCTION g_unichar_isspace ##### -->
 126 <para>
 127
 128 </para>
 129
 130 @c:
 131 @Returns:
 132
 133
 134 <!-- ##### FUNCTION g_unichar_isupper ##### -->
 135 <para>
 136
 137 </para>
 138
 139 @c:
 140 @Returns:
 141
 142
 143 <!-- ##### FUNCTION g_unichar_isxdigit ##### -->
 144 <para>
 145
 146 </para>
 147
 148 @c:
 149 @Returns:
 150
 151
 152 <!-- ##### FUNCTION g_unichar_istitle ##### -->
 153 <para>
 154
 155 </para>
 156
 157 @c:
 158 @Returns:
 159
 160
 161 <!-- ##### FUNCTION g_unichar_isdefined ##### -->
 162 <para>
 163
 164 </para>
 165
 166 @c:
 167 @Returns:
 168
 169
 170 <!-- ##### FUNCTION g_unichar_iswide ##### -->
 171 <para>
 172
 173 </para>
 174
 175 @c:
 176 @Returns:
 177
 178
 179 <!-- ##### FUNCTION g_unichar_toupper ##### -->
 180 <para>
 181
 182 </para>
 183
 184 @c:
 185 @Returns:
 186
 187
 188 <!-- ##### FUNCTION g_unichar_tolower ##### -->
 189 <para>
 190
 191 </para>
 192
 193 @c:
 194 @Returns:
 195
 196
 197 <!-- ##### FUNCTION g_unichar_totitle ##### -->
 198 <para>
 199
 200 </para>
 201
 202 @c:
 203 @Returns:
 204
 205
 206 <!-- ##### FUNCTION g_unichar_digit_value ##### -->
 207 <para>
 208
 209 </para>
 210
 211 @c:
 212 @Returns:
 213
 214
 215 <!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
 216 <para>
 217
 218 </para>
 219
 220 @c:
 221 @Returns:
 222
 223
 224 <!-- ##### ENUM GUnicodeType ##### -->
 225 <para>
 226 These are the possible character classifications.
 227 See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
 228 >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
 229 </para>
 230
 231 @G_UNICODE_CONTROL:
 232 @G_UNICODE_FORMAT:
 233 @G_UNICODE_UNASSIGNED:
 234 @G_UNICODE_PRIVATE_USE:
 235 @G_UNICODE_SURROGATE:
 236 @G_UNICODE_LOWERCASE_LETTER:
 237 @G_UNICODE_MODIFIER_LETTER:
 238 @G_UNICODE_OTHER_LETTER:
 239 @G_UNICODE_TITLECASE_LETTER:
 240 @G_UNICODE_UPPERCASE_LETTER:
 241 @G_UNICODE_COMBINING_MARK:
 242 @G_UNICODE_ENCLOSING_MARK:
 243 @G_UNICODE_NON_SPACING_MARK:
 244 @G_UNICODE_DECIMAL_NUMBER:
 245 @G_UNICODE_LETTER_NUMBER:
 246 @G_UNICODE_OTHER_NUMBER:
 247 @G_UNICODE_CONNECT_PUNCTUATION:
 248 @G_UNICODE_DASH_PUNCTUATION:
 249 @G_UNICODE_CLOSE_PUNCTUATION:
 250 @G_UNICODE_FINAL_PUNCTUATION:
 251 @G_UNICODE_INITIAL_PUNCTUATION:
 252 @G_UNICODE_OTHER_PUNCTUATION:
 253 @G_UNICODE_OPEN_PUNCTUATION:
 254 @G_UNICODE_CURRENCY_SYMBOL:
 255 @G_UNICODE_MODIFIER_SYMBOL:
 256 @G_UNICODE_MATH_SYMBOL:
 257 @G_UNICODE_OTHER_SYMBOL:
 258 @G_UNICODE_LINE_SEPARATOR:
 259 @G_UNICODE_PARAGRAPH_SEPARATOR:
 260 @G_UNICODE_SPACE_SEPARATOR:
 261
 262 <!-- ##### FUNCTION g_unichar_type ##### -->
 263 <para>
 264
 265 </para>
 266
 267 @c:
 268 @Returns:
 269
 270
 271 <!-- ##### ENUM GUnicodeBreakType ##### -->
 272 <para>
 273 These are the possible line break classifications.
 274 See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
 275 >http://www.unicode.org/unicode/reports/tr14/</ulink>.
 276 </para>
 277
 278 @G_UNICODE_BREAK_MANDATORY:
 279 @G_UNICODE_BREAK_CARRIAGE_RETURN:
 280 @G_UNICODE_BREAK_LINE_FEED:
 281 @G_UNICODE_BREAK_COMBINING_MARK:
 282 @G_UNICODE_BREAK_SURROGATE:
 283 @G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
 284 @G_UNICODE_BREAK_INSEPARABLE:
 285 @G_UNICODE_BREAK_NON_BREAKING_GLUE:
 286 @G_UNICODE_BREAK_CONTINGENT:
 287 @G_UNICODE_BREAK_SPACE:
 288 @G_UNICODE_BREAK_AFTER:
 289 @G_UNICODE_BREAK_BEFORE:
 290 @G_UNICODE_BREAK_BEFORE_AND_AFTER:
 291 @G_UNICODE_BREAK_HYPHEN:
 292 @G_UNICODE_BREAK_NON_STARTER:
 293 @G_UNICODE_BREAK_OPEN_PUNCTUATION:
 294 @G_UNICODE_BREAK_CLOSE_PUNCTUATION:
 295 @G_UNICODE_BREAK_QUOTATION:
 296 @G_UNICODE_BREAK_EXCLAMATION:
 297 @G_UNICODE_BREAK_IDEOGRAPHIC:
 298 @G_UNICODE_BREAK_NUMERIC:
 299 @G_UNICODE_BREAK_INFIX_SEPARATOR:
 300 @G_UNICODE_BREAK_SYMBOL:
 301 @G_UNICODE_BREAK_ALPHABETIC:
 302 @G_UNICODE_BREAK_PREFIX:
 303 @G_UNICODE_BREAK_POSTFIX:
 304 @G_UNICODE_BREAK_COMPLEX_CONTEXT:
 305 @G_UNICODE_BREAK_AMBIGUOUS:
 306 @G_UNICODE_BREAK_UNKNOWN:
 307 @G_UNICODE_BREAK_NEXT_LINE:
 308 @G_UNICODE_BREAK_WORD_JOINER:
 309
 310 <!-- ##### FUNCTION g_unichar_break_type ##### -->
 311 <para>
 312
 313 </para>
 314
 315 @c:
 316 @Returns:
 317
 318
 319 <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
 320 <para>
 321
 322 </para>
 323
 324 @string:
 325 @len:
 326
 327
 328 <!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
 329 <para>
 330
 331 </para>
 332
 333 @ch:
 334 @result_len:
 335 @Returns:
 336
 337
 338 <!-- ##### MACRO g_utf8_next_char ##### -->
 339 <para>
 340 Skips to the next character in a UTF-8 string. The string must be
 341 valid; this macro is as fast as possible, and has no error-checking.
 342 You would use this macro to iterate over a string character by
 343 character. The macro returns the start of the next UTF-8 character.
 344 Before using this macro, use g_utf8_validate() to validate strings
 345 that may contain invalid UTF-8.
 346 </para>
 347
 348 @p: Pointer to the start of a valid UTF-8 character.
 349
 350
 351 <!-- ##### FUNCTION g_utf8_get_char ##### -->
 352 <para>
 353
 354 </para>
 355
 356 @p:
 357 @Returns:
 358
 359
 360 <!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
 361 <para>
 362
 363 </para>
 364
 365 @p:
 366 @max_len:
 367 @Returns:
 368
 369
 370 <!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
 371 <para>
 372
 373 </para>
 374
 375 @str:
 376 @offset:
 377 @Returns:
 378
 379
 380 <!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
 381 <para>
 382
 383 </para>
 384
 385 @str:
 386 @pos:
 387 @Returns:
 388
 389
 390 <!-- ##### FUNCTION g_utf8_prev_char ##### -->
 391 <para>
 392
 393 </para>
 394
 395 @p:
 396 @Returns:
 397
 398
 399 <!-- ##### FUNCTION g_utf8_find_next_char ##### -->
 400 <para>
 401
 402 </para>
 403
 404 @p:
 405 @end:
 406 @Returns:
 407 <!-- # Unused Parameters # -->
 408 @bound:
 409
 410
 411 <!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
 412 <para>
 413
 414 </para>
 415
 416 @str:
 417 @p:
 418 @Returns:
 419
 420
 421 <!-- ##### FUNCTION g_utf8_strlen ##### -->
 422 <para>
 423
 424 </para>
 425
 426 @p:
 427 @max:
 428 @Returns:
 429
 430
 431 <!-- ##### FUNCTION g_utf8_strncpy ##### -->
 432 <para>
 433
 434 </para>
 435
 436 @dest:
 437 @src:
 438 @n:
 439 @Returns:
 440
 441
 442 <!-- ##### FUNCTION g_utf8_strchr ##### -->
 443 <para>
 444
 445 </para>
 446
 447 @p:
 448 @len:
 449 @c:
 450 @Returns:
 451 <!-- # Unused Parameters # -->
 452 @ch:
 453
 454
 455 <!-- ##### FUNCTION g_utf8_strrchr ##### -->
 456 <para>
 457
 458 </para>
 459
 460 @p:
 461 @len:
 462 @c:
 463 @Returns:
 464 <!-- # Unused Parameters # -->
 465 @ch:
 466
 467
 468 <!-- ##### FUNCTION g_utf8_strreverse ##### -->
 469 <para>
 470
 471 </para>
 472
 473 @str:
 474 @len:
 475 @Returns:
 476
 477
 478 <!-- ##### FUNCTION g_utf8_validate ##### -->
 479 <para>
 480
 481 </para>
 482
 483 @str:
 484 @max_len:
 485 @end:
 486 @Returns:
 487
 488
 489 <!-- ##### FUNCTION g_utf8_strup ##### -->
 490 <para>
 491
 492 </para>
 493
 494 @str:
 495 @len:
 496 @Returns:
 497
 498
 499 <!-- ##### FUNCTION g_utf8_strdown ##### -->
 500 <para>
 501
 502 </para>
 503
 504 @str:
 505 @len:
 506 @Returns:
 507
 508
 509 <!-- ##### FUNCTION g_utf8_casefold ##### -->
 510 <para>
 511
 512 </para>
 513
 514 @str:
 515 @len:
 516 @Returns:
 517
 518
 519 <!-- ##### FUNCTION g_utf8_normalize ##### -->
 520 <para>
 521
 522 </para>
 523
 524 @str:
 525 @len:
 526 @mode:
 527 @Returns:
 528
 529
 530 <!-- ##### ENUM GNormalizeMode ##### -->
 531 <para>
 532 Defines how a Unicode string is transformed into a canonical
 533 form, standardizing such issues as whether a character with an accent is
 534 represented as a base character and combining accent or as a single precomposed
 535 character. Unicode strings should generally be normalized before comparing them.
 536 </para>
 537
 538 @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
 539   text content, such as the above-mentioned accent representation.
 540 @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
 541 @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
 542   forms rather than a maximally decomposed form.
 543 @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
 544 @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT, also standardize the
 545   "compatibility" characters in Unicode, such as SUPERSCRIPT THREE, to the
 546   standard forms (in this case DIGIT THREE). Formatting information may be
 547   lost, but for most text operations such characters should be considered the
 548   same.
 549 @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
 550 @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
 551   forms rather than a maximally decomposed form.
 552 @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
 553
 554 <!-- ##### FUNCTION g_utf8_collate ##### -->
 555 <para>
 556
 557 </para>
 558
 559 @str1:
 560 @str2:
 561 @Returns:
 562
 563
 564 <!-- ##### FUNCTION g_utf8_collate_key ##### -->
 565 <para>
 566
 567 </para>
 568
 569 @str:
 570 @len:
 571 @Returns:
 572
 573
 574 <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
 575 <para>
 576
 577 </para>
 578
 579 @str:
 580 @len:
 581 @items_read:
 582 @items_written:
 583 @error:
 584 @Returns:
 585
 586
 587 <!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
 588 <para>
 589
 590 </para>
 591
 592 @str:
 593 @len:
 594 @items_read:
 595 @items_written:
 596 @error:
 597 @Returns:
 598
 599
 600 <!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
 601 <para>
 602
 603 </para>
 604
 605 @str:
 606 @len:
 607 @items_written:
 608 @Returns:
 609
 610
 611 <!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
 612 <para>
 613
 614 </para>
 615
 616 @str:
 617 @len:
 618 @items_read:
 619 @items_written:
 620 @error:
 621 @Returns:
 622
 623
 624 <!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
 625 <para>
 626
 627 </para>
 628
 629 @str:
 630 @len:
 631 @items_read:
 632 @items_written:
 633 @error:
 634 @Returns:
 635
 636
 637 <!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
 638 <para>
 639
 640 </para>
 641
 642 @str:
 643 @len:
 644 @items_read:
 645 @items_written:
 646 @error:
 647 @Returns:
 648
 649
 650 <!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
 651 <para>
 652
 653 </para>
 654
 655 @str:
 656 @len:
 657 @items_read:
 658 @items_written:
 659 @error:
 660 @Returns:
 661
 662
 663 <!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
 664 <para>
 665
 666 </para>
 667
 668 @c:
 669 @outbuf:
 670 @Returns:
 671
 672