docs/reference/glib/tmpl/unicode.sgml

   1 <!-- ##### SECTION Title ##### -->
   2 Unicode Manipulation
   3
   4 <!-- ##### SECTION Short_Description ##### -->
   5 functions operating on Unicode characters and UTF-8 strings.
   6
   7 <!-- ##### SECTION Long_Description ##### -->
   8 <para>
   9 This section describes a number of functions for dealing with
  10 Unicode characters and strings.  There are analogues of the
  11 traditional <filename>ctype.h</filename> character classification
  12 and case conversion functions, UTF-8 analogues of some string utility
  13 functions, functions to perform normalization, case conversion and
  14 collation on UTF-8 strings and finally functions to convert between
  15 the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  16 </para>
  17
  18 <!-- ##### SECTION See_Also ##### -->
  19 <para>
  20 <variablelist>
  21
  22 <varlistentry>
  23 <term>g_locale_to_utf8(), g_locale_from_utf8()</term>
  24 <listitem><para>
  25 Convenience functions for converting between UTF-8 and the locale encoding.
  26 </para></listitem>
  27 </varlistentry>
  28
  29 </variablelist>
  30 </para>
  31
  32 <!-- ##### TYPEDEF gunichar ##### -->
  33 <para>
  34 A type which can hold any UCS-4 character code.
  35 </para>
  36
  37
  38 <!-- ##### TYPEDEF gunichar2 ##### -->
  39 <para>
  40 A type which can hold any UTF-16 code unit.
  41 </para>
  42
  43
  44 <!-- ##### FUNCTION g_get_charset ##### -->
  45 <para>
  46
  47 </para>
  48
  49 @charset:
  50 @Returns:
  51
  52
  53 <!-- ##### FUNCTION g_unichar_validate ##### -->
  54 <para>
  55
  56 </para>
  57
  58 @ch:
  59 @Returns:
  60
  61
  62 <!-- ##### FUNCTION g_unichar_isalnum ##### -->
  63 <para>
  64
  65 </para>
  66
  67 @c:
  68 @Returns:
  69
  70
  71 <!-- ##### FUNCTION g_unichar_isalpha ##### -->
  72 <para>
  73
  74 </para>
  75
  76 @c:
  77 @Returns:
  78
  79
  80 <!-- ##### FUNCTION g_unichar_iscntrl ##### -->
  81 <para>
  82
  83 </para>
  84
  85 @c:
  86 @Returns:
  87
  88
  89 <!-- ##### FUNCTION g_unichar_isdigit ##### -->
  90 <para>
  91
  92 </para>
  93
  94 @c:
  95 @Returns:
  96
  97
  98 <!-- ##### FUNCTION g_unichar_isgraph ##### -->
  99 <para>
 100
 101 </para>
 102
 103 @c:
 104 @Returns:
 105
 106
 107 <!-- ##### FUNCTION g_unichar_islower ##### -->
 108 <para>
 109
 110 </para>
 111
 112 @c:
 113 @Returns:
 114
 115
 116 <!-- ##### FUNCTION g_unichar_isprint ##### -->
 117 <para>
 118
 119 </para>
 120
 121 @c:
 122 @Returns:
 123
 124
 125 <!-- ##### FUNCTION g_unichar_ispunct ##### -->
 126 <para>
 127
 128 </para>
 129
 130 @c:
 131 @Returns:
 132
 133
 134 <!-- ##### FUNCTION g_unichar_isspace ##### -->
 135 <para>
 136
 137 </para>
 138
 139 @c:
 140 @Returns:
 141
 142
 143 <!-- ##### FUNCTION g_unichar_isupper ##### -->
 144 <para>
 145
 146 </para>
 147
 148 @c:
 149 @Returns:
 150
 151
 152 <!-- ##### FUNCTION g_unichar_isxdigit ##### -->
 153 <para>
 154
 155 </para>
 156
 157 @c:
 158 @Returns:
 159
 160
 161 <!-- ##### FUNCTION g_unichar_istitle ##### -->
 162 <para>
 163
 164 </para>
 165
 166 @c:
 167 @Returns:
 168
 169
 170 <!-- ##### FUNCTION g_unichar_isdefined ##### -->
 171 <para>
 172
 173 </para>
 174
 175 @c:
 176 @Returns:
 177
 178
 179 <!-- ##### FUNCTION g_unichar_iswide ##### -->
 180 <para>
 181
 182 </para>
 183
 184 @c:
 185 @Returns:
 186
 187
 188 <!-- ##### FUNCTION g_unichar_toupper ##### -->
 189 <para>
 190
 191 </para>
 192
 193 @c:
 194 @Returns:
 195
 196
 197 <!-- ##### FUNCTION g_unichar_tolower ##### -->
 198 <para>
 199
 200 </para>
 201
 202 @c:
 203 @Returns:
 204
 205
 206 <!-- ##### FUNCTION g_unichar_totitle ##### -->
 207 <para>
 208
 209 </para>
 210
 211 @c:
 212 @Returns:
 213
 214
 215 <!-- ##### FUNCTION g_unichar_digit_value ##### -->
 216 <para>
 217
 218 </para>
 219
 220 @c:
 221 @Returns:
 222
 223
 224 <!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
 225 <para>
 226
 227 </para>
 228
 229 @c:
 230 @Returns:
 231
 232
 233 <!-- ##### ENUM GUnicodeType ##### -->
 234 <para>
 235 These are the possible character classifications.
 236 See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
 237 >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
 238 </para>
 239
 240 @G_UNICODE_CONTROL:
 241 @G_UNICODE_FORMAT:
 242 @G_UNICODE_UNASSIGNED:
 243 @G_UNICODE_PRIVATE_USE:
 244 @G_UNICODE_SURROGATE:
 245 @G_UNICODE_LOWERCASE_LETTER:
 246 @G_UNICODE_MODIFIER_LETTER:
 247 @G_UNICODE_OTHER_LETTER:
 248 @G_UNICODE_TITLECASE_LETTER:
 249 @G_UNICODE_UPPERCASE_LETTER:
 250 @G_UNICODE_COMBINING_MARK:
 251 @G_UNICODE_ENCLOSING_MARK:
 252 @G_UNICODE_NON_SPACING_MARK:
 253 @G_UNICODE_DECIMAL_NUMBER:
 254 @G_UNICODE_LETTER_NUMBER:
 255 @G_UNICODE_OTHER_NUMBER:
 256 @G_UNICODE_CONNECT_PUNCTUATION:
 257 @G_UNICODE_DASH_PUNCTUATION:
 258 @G_UNICODE_CLOSE_PUNCTUATION:
 259 @G_UNICODE_FINAL_PUNCTUATION:
 260 @G_UNICODE_INITIAL_PUNCTUATION:
 261 @G_UNICODE_OTHER_PUNCTUATION:
 262 @G_UNICODE_OPEN_PUNCTUATION:
 263 @G_UNICODE_CURRENCY_SYMBOL:
 264 @G_UNICODE_MODIFIER_SYMBOL:
 265 @G_UNICODE_MATH_SYMBOL:
 266 @G_UNICODE_OTHER_SYMBOL:
 267 @G_UNICODE_LINE_SEPARATOR:
 268 @G_UNICODE_PARAGRAPH_SEPARATOR:
 269 @G_UNICODE_SPACE_SEPARATOR:
 270
 271 <!-- ##### FUNCTION g_unichar_type ##### -->
 272 <para>
 273
 274 </para>
 275
 276 @c:
 277 @Returns:
 278
 279
 280 <!-- ##### ENUM GUnicodeBreakType ##### -->
 281 <para>
 282 These are the possible line break classifications.
 283 See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
 284 >http://www.unicode.org/unicode/reports/tr14/</ulink>.
 285 </para>
 286
 287 @G_UNICODE_BREAK_MANDATORY:
 288 @G_UNICODE_BREAK_CARRIAGE_RETURN:
 289 @G_UNICODE_BREAK_LINE_FEED:
 290 @G_UNICODE_BREAK_COMBINING_MARK:
 291 @G_UNICODE_BREAK_SURROGATE:
 292 @G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
 293 @G_UNICODE_BREAK_INSEPARABLE:
 294 @G_UNICODE_BREAK_NON_BREAKING_GLUE:
 295 @G_UNICODE_BREAK_CONTINGENT:
 296 @G_UNICODE_BREAK_SPACE:
 297 @G_UNICODE_BREAK_AFTER:
 298 @G_UNICODE_BREAK_BEFORE:
 299 @G_UNICODE_BREAK_BEFORE_AND_AFTER:
 300 @G_UNICODE_BREAK_HYPHEN:
 301 @G_UNICODE_BREAK_NON_STARTER:
 302 @G_UNICODE_BREAK_OPEN_PUNCTUATION:
 303 @G_UNICODE_BREAK_CLOSE_PUNCTUATION:
 304 @G_UNICODE_BREAK_QUOTATION:
 305 @G_UNICODE_BREAK_EXCLAMATION:
 306 @G_UNICODE_BREAK_IDEOGRAPHIC:
 307 @G_UNICODE_BREAK_NUMERIC:
 308 @G_UNICODE_BREAK_INFIX_SEPARATOR:
 309 @G_UNICODE_BREAK_SYMBOL:
 310 @G_UNICODE_BREAK_ALPHABETIC:
 311 @G_UNICODE_BREAK_PREFIX:
 312 @G_UNICODE_BREAK_POSTFIX:
 313 @G_UNICODE_BREAK_COMPLEX_CONTEXT:
 314 @G_UNICODE_BREAK_AMBIGUOUS:
 315 @G_UNICODE_BREAK_UNKNOWN:
 316
 317 <!-- ##### FUNCTION g_unichar_break_type ##### -->
 318 <para>
 319
 320 </para>
 321
 322 @c:
 323 @Returns:
 324
 325
 326 <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
 327 <para>
 328
 329 </para>
 330
 331 @string:
 332 @len:
 333
 334
 335 <!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
 336 <para>
 337
 338 </para>
 339
 340 @ch:
 341 @result_len:
 342 @Returns:
 343
 344
 345 <!-- ##### MACRO g_utf8_next_char ##### -->
 346 <para>
 347 Skips to the next character in a UTF-8 string. The string must be
 348 valid; this macro is as fast as possible, and has no error-checking.
 349 You would use this macro to iterate over a string character by
 350 character. The macro returns the start of the next UTF-8 character.
 351 Before using this macro, use g_utf8_validate() to validate strings
 352 that may contain invalid UTF-8.
 353 </para>
 354
 355 @p: Pointer to the start of a valid UTF-8 character.
 356
 357
 358 <!-- ##### FUNCTION g_utf8_get_char ##### -->
 359 <para>
 360
 361 </para>
 362
 363 @p:
 364 @Returns:
 365
 366
 367 <!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
 368 <para>
 369
 370 </para>
 371
 372 @p:
 373 @max_len:
 374 @Returns:
 375
 376
 377 <!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
 378 <para>
 379
 380 </para>
 381
 382 @str:
 383 @offset:
 384 @Returns:
 385
 386
 387 <!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
 388 <para>
 389
 390 </para>
 391
 392 @str:
 393 @pos:
 394 @Returns:
 395
 396
 397 <!-- ##### FUNCTION g_utf8_prev_char ##### -->
 398 <para>
 399
 400 </para>
 401
 402 @p:
 403 @Returns:
 404
 405
 406 <!-- ##### FUNCTION g_utf8_find_next_char ##### -->
 407 <para>
 408
 409 </para>
 410
 411 @p:
 412 @end:
 413 @Returns:
 414 <!-- # Unused Parameters # -->
 415 @bound:
 416
 417
 418 <!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
 419 <para>
 420
 421 </para>
 422
 423 @str:
 424 @p:
 425 @Returns:
 426
 427
 428 <!-- ##### FUNCTION g_utf8_strlen ##### -->
 429 <para>
 430
 431 </para>
 432
 433 @p:
 434 @max:
 435 @Returns:
 436
 437
 438 <!-- ##### FUNCTION g_utf8_strncpy ##### -->
 439 <para>
 440
 441 </para>
 442
 443 @dest:
 444 @src:
 445 @n:
 446 @Returns:
 447
 448
 449 <!-- ##### FUNCTION g_utf8_strchr ##### -->
 450 <para>
 451
 452 </para>
 453
 454 @p:
 455 @len:
 456 @c:
 457 @Returns:
 458 <!-- # Unused Parameters # -->
 459 @ch:
 460
 461
 462 <!-- ##### FUNCTION g_utf8_strrchr ##### -->
 463 <para>
 464
 465 </para>
 466
 467 @p:
 468 @len:
 469 @c:
 470 @Returns:
 471 <!-- # Unused Parameters # -->
 472 @ch:
 473
 474
 475 <!-- ##### FUNCTION g_utf8_validate ##### -->
 476 <para>
 477
 478 </para>
 479
 480 @str:
 481 @max_len:
 482 @end:
 483 @Returns:
 484
 485
 486 <!-- ##### FUNCTION g_utf8_strup ##### -->
 487 <para>
 488
 489 </para>
 490
 491 @str:
 492 @len:
 493 @Returns:
 494
 495
 496 <!-- ##### FUNCTION g_utf8_strdown ##### -->
 497 <para>
 498
 499 </para>
 500
 501 @str:
 502 @len:
 503 @Returns:
 504
 505
 506 <!-- ##### FUNCTION g_utf8_casefold ##### -->
 507 <para>
 508
 509 </para>
 510
 511 @str:
 512 @len:
 513 @Returns:
 514
 515
 516 <!-- ##### FUNCTION g_utf8_normalize ##### -->
 517 <para>
 518
 519 </para>
 520
 521 @str:
 522 @len:
 523 @mode:
 524 @Returns:
 525
 526
 527 <!-- ##### ENUM GNormalizeMode ##### -->
 528 <para>
 529 Defines how a Unicode string is transformed into a canonical
 530 form, standardizing such issues as whether a character with an accent is
 531 represented as a base character and combining accent or as a single precomposed
 532 character. Unicode strings should generally be normalized before comparing them.
 533 </para>
 534
 535 @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
 536   text content, such as the above-mentioned accent representation.
 537 @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
 538 @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
 539   forms rather than a maximally decomposed form.
 540 @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
 541 @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT, also standardize the
 542   "compatibility" characters in Unicode, such as SUPERSCRIPT THREE, to their
 543   standard forms (in this case DIGIT THREE). Formatting information may be
 544   lost, but for most text operations such characters should be considered the
 545   same.
 546 @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
 547 @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
 548   forms rather than a maximally decomposed form.
 549 @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
 550
 551 <!-- ##### FUNCTION g_utf8_collate ##### -->
 552 <para>
 553
 554 </para>
 555
 556 @str1:
 557 @str2:
 558 @Returns:
 559
 560
 561 <!-- ##### FUNCTION g_utf8_collate_key ##### -->
 562 <para>
 563
 564 </para>
 565
 566 @str:
 567 @len:
 568 @Returns:
 569
 570
 571 <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
 572 <para>
 573
 574 </para>
 575
 576 @str:
 577 @len:
 578 @items_read:
 579 @items_written:
 580 @error:
 581 @Returns:
 582
 583
 584 <!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
 585 <para>
 586
 587 </para>
 588
 589 @str:
 590 @len:
 591 @items_read:
 592 @items_written:
 593 @error:
 594 @Returns:
 595
 596
 597 <!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
 598 <para>
 599
 600 </para>
 601
 602 @str:
 603 @len:
 604 @items_written:
 605 @Returns:
 606
 607
 608 <!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
 609 <para>
 610
 611 </para>
 612
 613 @str:
 614 @len:
 615 @items_read:
 616 @items_written:
 617 @error:
 618 @Returns:
 619
 620
 621 <!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
 622 <para>
 623
 624 </para>
 625
 626 @str:
 627 @len:
 628 @items_read:
 629 @items_written:
 630 @error:
 631 @Returns:
 632
 633
 634 <!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
 635 <para>
 636
 637 </para>
 638
 639 @str:
 640 @len:
 641 @items_read:
 642 @items_written:
 643 @error:
 644 @Returns:
 645
 646
 647 <!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
 648 <para>
 649
 650 </para>
 651
 652 @str:
 653 @len:
 654 @items_read:
 655 @items_written:
 656 @error:
 657 @Returns:
 658
 659
 660 <!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
 661 <para>
 662
 663 </para>
 664
 665 @c:
 666 @outbuf:
 667 @Returns:
 668
 669