docs/reference/glib/tmpl/unicode.sgml

   1 <!-- ##### SECTION Title ##### -->
   2 Unicode Manipulation
   3
   4 <!-- ##### SECTION Short_Description ##### -->
   5 functions operating on Unicode characters and UTF-8 strings.
   6
   7 <!-- ##### SECTION Long_Description ##### -->
   8 <para>
   9 This section describes a number of functions for dealing with
  10 Unicode characters and strings.  There are analogues of the
  11 traditional <filename>ctype.h</filename> character classification
  12 and case conversion functions, UTF-8 analogues of some string utility
  13 functions, functions to perform normalization, case conversion and
  14 collation on UTF-8 strings, and finally functions to convert between
  15 the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  16 </para>
  17
  18 <!-- ##### SECTION See_Also ##### -->
  19 <para>
  20 <variablelist>
  21
  22 <varlistentry>
  23 <term>g_locale_to_utf8(), g_locale_from_utf8()</term>
  24 <listitem><para>
  25 Convenience functions for converting between UTF-8 and the locale encoding.
  26 </para></listitem>
  27 </varlistentry>
  28
  29 </variablelist>
  30 </para>
  31
  32 <!-- ##### TYPEDEF gunichar ##### -->
  33 <para>
  34 A type which can hold any UCS-4 character code.
  35 </para>
  36
  37
  38 <!-- ##### TYPEDEF gunichar2 ##### -->
  39 <para>
  40 A type which can hold any UTF-16 code unit.
  41 </para>
  42
  43
  44 <!-- ##### FUNCTION g_unichar_validate ##### -->
  45 <para>
  46
  47 </para>
  48
  49 @ch:
  50 @Returns:
  51
  52
  53 <!-- ##### FUNCTION g_unichar_isalnum ##### -->
  54 <para>
  55
  56 </para>
  57
  58 @c:
  59 @Returns:
  60
  61
  62 <!-- ##### FUNCTION g_unichar_isalpha ##### -->
  63 <para>
  64
  65 </para>
  66
  67 @c:
  68 @Returns:
  69
  70
  71 <!-- ##### FUNCTION g_unichar_iscntrl ##### -->
  72 <para>
  73
  74 </para>
  75
  76 @c:
  77 @Returns:
  78
  79
  80 <!-- ##### FUNCTION g_unichar_isdigit ##### -->
  81 <para>
  82
  83 </para>
  84
  85 @c:
  86 @Returns:
  87
  88
  89 <!-- ##### FUNCTION g_unichar_isgraph ##### -->
  90 <para>
  91
  92 </para>
  93
  94 @c:
  95 @Returns:
  96
  97
  98 <!-- ##### FUNCTION g_unichar_islower ##### -->
  99 <para>
 100
 101 </para>
 102
 103 @c:
 104 @Returns:
 105
 106
 107 <!-- ##### FUNCTION g_unichar_isprint ##### -->
 108 <para>
 109
 110 </para>
 111
 112 @c:
 113 @Returns:
 114
 115
 116 <!-- ##### FUNCTION g_unichar_ispunct ##### -->
 117 <para>
 118
 119 </para>
 120
 121 @c:
 122 @Returns:
 123
 124
 125 <!-- ##### FUNCTION g_unichar_isspace ##### -->
 126 <para>
 127
 128 </para>
 129
 130 @c:
 131 @Returns:
 132
 133
 134 <!-- ##### FUNCTION g_unichar_isupper ##### -->
 135 <para>
 136
 137 </para>
 138
 139 @c:
 140 @Returns:
 141
 142
 143 <!-- ##### FUNCTION g_unichar_isxdigit ##### -->
 144 <para>
 145
 146 </para>
 147
 148 @c:
 149 @Returns:
 150
 151
 152 <!-- ##### FUNCTION g_unichar_istitle ##### -->
 153 <para>
 154
 155 </para>
 156
 157 @c:
 158 @Returns:
 159
 160
 161 <!-- ##### FUNCTION g_unichar_isdefined ##### -->
 162 <para>
 163
 164 </para>
 165
 166 @c:
 167 @Returns:
 168
 169
 170 <!-- ##### FUNCTION g_unichar_iswide ##### -->
 171 <para>
 172
 173 </para>
 174
 175 @c:
 176 @Returns:
 177
 178
 179 <!-- ##### FUNCTION g_unichar_toupper ##### -->
 180 <para>
 181
 182 </para>
 183
 184 @c:
 185 @Returns:
 186
 187
 188 <!-- ##### FUNCTION g_unichar_tolower ##### -->
 189 <para>
 190
 191 </para>
 192
 193 @c:
 194 @Returns:
 195
 196
 197 <!-- ##### FUNCTION g_unichar_totitle ##### -->
 198 <para>
 199
 200 </para>
 201
 202 @c:
 203 @Returns:
 204
 205
 206 <!-- ##### FUNCTION g_unichar_digit_value ##### -->
 207 <para>
 208
 209 </para>
 210
 211 @c:
 212 @Returns:
 213
 214
 215 <!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
 216 <para>
 217
 218 </para>
 219
 220 @c:
 221 @Returns:
 222
 223
 224 <!-- ##### ENUM GUnicodeType ##### -->
 225 <para>
 226 These are the possible character classifications.
 227 See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
 228 >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
 229 </para>
 230
 231 @G_UNICODE_CONTROL:
 232 @G_UNICODE_FORMAT:
 233 @G_UNICODE_UNASSIGNED:
 234 @G_UNICODE_PRIVATE_USE:
 235 @G_UNICODE_SURROGATE:
 236 @G_UNICODE_LOWERCASE_LETTER:
 237 @G_UNICODE_MODIFIER_LETTER:
 238 @G_UNICODE_OTHER_LETTER:
 239 @G_UNICODE_TITLECASE_LETTER:
 240 @G_UNICODE_UPPERCASE_LETTER:
 241 @G_UNICODE_COMBINING_MARK:
 242 @G_UNICODE_ENCLOSING_MARK:
 243 @G_UNICODE_NON_SPACING_MARK:
 244 @G_UNICODE_DECIMAL_NUMBER:
 245 @G_UNICODE_LETTER_NUMBER:
 246 @G_UNICODE_OTHER_NUMBER:
 247 @G_UNICODE_CONNECT_PUNCTUATION:
 248 @G_UNICODE_DASH_PUNCTUATION:
 249 @G_UNICODE_CLOSE_PUNCTUATION:
 250 @G_UNICODE_FINAL_PUNCTUATION:
 251 @G_UNICODE_INITIAL_PUNCTUATION:
 252 @G_UNICODE_OTHER_PUNCTUATION:
 253 @G_UNICODE_OPEN_PUNCTUATION:
 254 @G_UNICODE_CURRENCY_SYMBOL:
 255 @G_UNICODE_MODIFIER_SYMBOL:
 256 @G_UNICODE_MATH_SYMBOL:
 257 @G_UNICODE_OTHER_SYMBOL:
 258 @G_UNICODE_LINE_SEPARATOR:
 259 @G_UNICODE_PARAGRAPH_SEPARATOR:
 260 @G_UNICODE_SPACE_SEPARATOR:
 261
 262 <!-- ##### FUNCTION g_unichar_type ##### -->
 263 <para>
 264
 265 </para>
 266
 267 @c:
 268 @Returns:
 269
 270
 271 <!-- ##### ENUM GUnicodeBreakType ##### -->
 272 <para>
 273 These are the possible line break classifications.
 274 See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
 275 >http://www.unicode.org/unicode/reports/tr14/</ulink>.
 276 </para>
 277
 278 @G_UNICODE_BREAK_MANDATORY:
 279 @G_UNICODE_BREAK_CARRIAGE_RETURN:
 280 @G_UNICODE_BREAK_LINE_FEED:
 281 @G_UNICODE_BREAK_COMBINING_MARK:
 282 @G_UNICODE_BREAK_SURROGATE:
 283 @G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
 284 @G_UNICODE_BREAK_INSEPARABLE:
 285 @G_UNICODE_BREAK_NON_BREAKING_GLUE:
 286 @G_UNICODE_BREAK_CONTINGENT:
 287 @G_UNICODE_BREAK_SPACE:
 288 @G_UNICODE_BREAK_AFTER:
 289 @G_UNICODE_BREAK_BEFORE:
 290 @G_UNICODE_BREAK_BEFORE_AND_AFTER:
 291 @G_UNICODE_BREAK_HYPHEN:
 292 @G_UNICODE_BREAK_NON_STARTER:
 293 @G_UNICODE_BREAK_OPEN_PUNCTUATION:
 294 @G_UNICODE_BREAK_CLOSE_PUNCTUATION:
 295 @G_UNICODE_BREAK_QUOTATION:
 296 @G_UNICODE_BREAK_EXCLAMATION:
 297 @G_UNICODE_BREAK_IDEOGRAPHIC:
 298 @G_UNICODE_BREAK_NUMERIC:
 299 @G_UNICODE_BREAK_INFIX_SEPARATOR:
 300 @G_UNICODE_BREAK_SYMBOL:
 301 @G_UNICODE_BREAK_ALPHABETIC:
 302 @G_UNICODE_BREAK_PREFIX:
 303 @G_UNICODE_BREAK_POSTFIX:
 304 @G_UNICODE_BREAK_COMPLEX_CONTEXT:
 305 @G_UNICODE_BREAK_AMBIGUOUS:
 306 @G_UNICODE_BREAK_UNKNOWN:
 307
 308 <!-- ##### FUNCTION g_unichar_break_type ##### -->
 309 <para>
 310
 311 </para>
 312
 313 @c:
 314 @Returns:
 315
 316
 317 <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
 318 <para>
 319
 320 </para>
 321
 322 @string:
 323 @len:
 324
 325
 326 <!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
 327 <para>
 328
 329 </para>
 330
 331 @ch:
 332 @result_len:
 333 @Returns:
 334
 335
 336 <!-- ##### MACRO g_utf8_next_char ##### -->
 337 <para>
 338 Skips to the next character in a UTF-8 string. The string must be
 339 valid; this macro is as fast as possible, and has no error-checking.
 340 You would use this macro to iterate over a string character by
 341 character. The macro returns the start of the next UTF-8 character.
 342 Before using this macro, use g_utf8_validate() to validate strings
 343 that may contain invalid UTF-8.
 344 </para>
 345
 346 @p: Pointer to the start of a valid UTF-8 character.
 347
 348
 349 <!-- ##### FUNCTION g_utf8_get_char ##### -->
 350 <para>
 351
 352 </para>
 353
 354 @p:
 355 @Returns:
 356
 357
 358 <!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
 359 <para>
 360
 361 </para>
 362
 363 @p:
 364 @max_len:
 365 @Returns:
 366
 367
 368 <!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
 369 <para>
 370
 371 </para>
 372
 373 @str:
 374 @offset:
 375 @Returns:
 376
 377
 378 <!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
 379 <para>
 380
 381 </para>
 382
 383 @str:
 384 @pos:
 385 @Returns:
 386
 387
 388 <!-- ##### FUNCTION g_utf8_prev_char ##### -->
 389 <para>
 390
 391 </para>
 392
 393 @p:
 394 @Returns:
 395
 396
 397 <!-- ##### FUNCTION g_utf8_find_next_char ##### -->
 398 <para>
 399
 400 </para>
 401
 402 @p:
 403 @end:
 404 @Returns:
 405 <!-- # Unused Parameters # -->
 406 @bound:
 407
 408
 409 <!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
 410 <para>
 411
 412 </para>
 413
 414 @str:
 415 @p:
 416 @Returns:
 417
 418
 419 <!-- ##### FUNCTION g_utf8_strlen ##### -->
 420 <para>
 421
 422 </para>
 423
 424 @p:
 425 @max:
 426 @Returns:
 427
 428
 429 <!-- ##### FUNCTION g_utf8_strncpy ##### -->
 430 <para>
 431
 432 </para>
 433
 434 @dest:
 435 @src:
 436 @n:
 437 @Returns:
 438
 439
 440 <!-- ##### FUNCTION g_utf8_strchr ##### -->
 441 <para>
 442
 443 </para>
 444
 445 @p:
 446 @len:
 447 @c:
 448 @Returns:
 449 <!-- # Unused Parameters # -->
 450 @ch:
 451
 452
 453 <!-- ##### FUNCTION g_utf8_strrchr ##### -->
 454 <para>
 455
 456 </para>
 457
 458 @p:
 459 @len:
 460 @c:
 461 @Returns:
 462 <!-- # Unused Parameters # -->
 463 @ch:
 464
 465
 466 <!-- ##### FUNCTION g_utf8_strreverse ##### -->
 467 <para>
 468
 469 </para>
 470
 471 @str:
 472 @len:
 473 @Returns:
 474
 475
 476 <!-- ##### FUNCTION g_utf8_validate ##### -->
 477 <para>
 478
 479 </para>
 480
 481 @str:
 482 @max_len:
 483 @end:
 484 @Returns:
 485
 486
 487 <!-- ##### FUNCTION g_utf8_strup ##### -->
 488 <para>
 489
 490 </para>
 491
 492 @str:
 493 @len:
 494 @Returns:
 495
 496
 497 <!-- ##### FUNCTION g_utf8_strdown ##### -->
 498 <para>
 499
 500 </para>
 501
 502 @str:
 503 @len:
 504 @Returns:
 505
 506
 507 <!-- ##### FUNCTION g_utf8_casefold ##### -->
 508 <para>
 509
 510 </para>
 511
 512 @str:
 513 @len:
 514 @Returns:
 515
 516
 517 <!-- ##### FUNCTION g_utf8_normalize ##### -->
 518 <para>
 519
 520 </para>
 521
 522 @str:
 523 @len:
 524 @mode:
 525 @Returns:
 526
 527
 528 <!-- ##### ENUM GNormalizeMode ##### -->
 529 <para>
 530 Defines how a Unicode string is transformed into a canonical
 531 form, standardizing such issues as whether a character with an accent is
 532 represented as a base character and a combining accent or as a single precomposed
 533 character. Unicode strings should generally be normalized before comparing them.
 534 </para>
 535
 536 @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
 537   text content, such as the above-mentioned accent representation.
 538 @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
 539 @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
 540   forms rather than a maximally decomposed form.
 541 @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
 542 @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT, also standardize the
 543   "compatibility" characters in Unicode, such as mapping SUPERSCRIPT THREE to
 544   its standard form (in this case DIGIT THREE). Formatting information may be
 545   lost, but for most text operations such characters should be considered the
 546   same.
 547 @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
 548 @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
 549   forms rather than a maximally decomposed form.
 550 @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
 551
 552 <!-- ##### FUNCTION g_utf8_collate ##### -->
 553 <para>
 554
 555 </para>
 556
 557 @str1:
 558 @str2:
 559 @Returns:
 560
 561
 562 <!-- ##### FUNCTION g_utf8_collate_key ##### -->
 563 <para>
 564
 565 </para>
 566
 567 @str:
 568 @len:
 569 @Returns:
 570
 571
 572 <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
 573 <para>
 574
 575 </para>
 576
 577 @str:
 578 @len:
 579 @items_read:
 580 @items_written:
 581 @error:
 582 @Returns:
 583
 584
 585 <!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
 586 <para>
 587
 588 </para>
 589
 590 @str:
 591 @len:
 592 @items_read:
 593 @items_written:
 594 @error:
 595 @Returns:
 596
 597
 598 <!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
 599 <para>
 600
 601 </para>
 602
 603 @str:
 604 @len:
 605 @items_written:
 606 @Returns:
 607
 608
 609 <!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
 610 <para>
 611
 612 </para>
 613
 614 @str:
 615 @len:
 616 @items_read:
 617 @items_written:
 618 @error:
 619 @Returns:
 620
 621
 622 <!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
 623 <para>
 624
 625 </para>
 626
 627 @str:
 628 @len:
 629 @items_read:
 630 @items_written:
 631 @error:
 632 @Returns:
 633
 634
 635 <!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
 636 <para>
 637
 638 </para>
 639
 640 @str:
 641 @len:
 642 @items_read:
 643 @items_written:
 644 @error:
 645 @Returns:
 646
 647
 648 <!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
 649 <para>
 650
 651 </para>
 652
 653 @str:
 654 @len:
 655 @items_read:
 656 @items_written:
 657 @error:
 658 @Returns:
 659
 660
 661 <!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
 662 <para>
 663
 664 </para>
 665
 666 @c:
 667 @outbuf:
 668 @Returns:
 669
 670