docs/reference/glib/tmpl/unicode.sgml

   1 <!-- ##### SECTION Title ##### -->
   2 Unicode Manipulation
   3
   4 <!-- ##### SECTION Short_Description ##### -->
   5 functions operating on Unicode characters and UTF-8 strings.
   6
   7 <!-- ##### SECTION Long_Description ##### -->
   8 <para>
   9 This section describes a number of functions for dealing with
  10 Unicode characters and strings.  There are analogues of the
  11 traditional <filename>ctype.h</filename> character classification
  12 and case conversion functions, UTF-8 analogues of some string utility
  13 functions, functions to perform normalization, case conversion and
  14 collation on UTF-8 strings and finally functions to convert between
  15 the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  16 </para>
  17
  18 <!-- ##### SECTION See_Also ##### -->
  19 <para>
  20 <variablelist>
  21
  22 <varlistentry>
  23 <term>g_locale_to_utf8(), g_locale_from_utf8()</term>
  24 <listitem><para>
  25 Convenience functions for converting between UTF-8 and the locale encoding.
  26 </para></listitem>
  27 </varlistentry>
  28
  29 </variablelist>
  30 </para>
  31
  32 <!-- ##### TYPEDEF gunichar ##### -->
  33 <para>
  34 A type which can hold any UCS-4 character code.
  35 </para>
  36
  37
  38 <!-- ##### TYPEDEF gunichar2 ##### -->
  39 <para>
  40 A type which can hold any UTF-16 code unit.
  41 </para>
  42
  43
  44 <!-- ##### FUNCTION g_get_charset ##### -->
  45 <para>
  46
  47 </para>
  48
  49 @Returns:
  50 <!-- # Unused Parameters # -->
  51 @charset:
  52
  53
  54 <!-- ##### FUNCTION g_unichar_validate ##### -->
  55 <para>
  56
  57 </para>
  58
  59 @ch:
  60 @Returns:
  61
  62
  63 <!-- ##### FUNCTION g_unichar_isalnum ##### -->
  64 <para>
  65
  66 </para>
  67
  68 @c:
  69 @Returns:
  70
  71
  72 <!-- ##### FUNCTION g_unichar_isalpha ##### -->
  73 <para>
  74
  75 </para>
  76
  77 @c:
  78 @Returns:
  79
  80
  81 <!-- ##### FUNCTION g_unichar_iscntrl ##### -->
  82 <para>
  83
  84 </para>
  85
  86 @c:
  87 @Returns:
  88
  89
  90 <!-- ##### FUNCTION g_unichar_isdigit ##### -->
  91 <para>
  92
  93 </para>
  94
  95 @c:
  96 @Returns:
  97
  98
  99 <!-- ##### FUNCTION g_unichar_isgraph ##### -->
 100 <para>
 101
 102 </para>
 103
 104 @c:
 105 @Returns:
 106
 107
 108 <!-- ##### FUNCTION g_unichar_islower ##### -->
 109 <para>
 110
 111 </para>
 112
 113 @c:
 114 @Returns:
 115
 116
 117 <!-- ##### FUNCTION g_unichar_isprint ##### -->
 118 <para>
 119
 120 </para>
 121
 122 @c:
 123 @Returns:
 124
 125
 126 <!-- ##### FUNCTION g_unichar_ispunct ##### -->
 127 <para>
 128
 129 </para>
 130
 131 @c:
 132 @Returns:
 133
 134
 135 <!-- ##### FUNCTION g_unichar_isspace ##### -->
 136 <para>
 137
 138 </para>
 139
 140 @c:
 141 @Returns:
 142
 143
 144 <!-- ##### FUNCTION g_unichar_isupper ##### -->
 145 <para>
 146
 147 </para>
 148
 149 @c:
 150 @Returns:
 151
 152
 153 <!-- ##### FUNCTION g_unichar_isxdigit ##### -->
 154 <para>
 155
 156 </para>
 157
 158 @c:
 159 @Returns:
 160
 161
 162 <!-- ##### FUNCTION g_unichar_istitle ##### -->
 163 <para>
 164
 165 </para>
 166
 167 @c:
 168 @Returns:
 169
 170
 171 <!-- ##### FUNCTION g_unichar_isdefined ##### -->
 172 <para>
 173
 174 </para>
 175
 176 @c:
 177 @Returns:
 178
 179
 180 <!-- ##### FUNCTION g_unichar_iswide ##### -->
 181 <para>
 182
 183 </para>
 184
 185 @c:
 186 @Returns:
 187
 188
 189 <!-- ##### FUNCTION g_unichar_toupper ##### -->
 190 <para>
 191
 192 </para>
 193
 194 @c:
 195 @Returns:
 196
 197
 198 <!-- ##### FUNCTION g_unichar_tolower ##### -->
 199 <para>
 200
 201 </para>
 202
 203 @c:
 204 @Returns:
 205
 206
 207 <!-- ##### FUNCTION g_unichar_totitle ##### -->
 208 <para>
 209
 210 </para>
 211
 212 @c:
 213 @Returns:
 214
 215
 216 <!-- ##### FUNCTION g_unichar_digit_value ##### -->
 217 <para>
 218
 219 </para>
 220
 221 @c:
 222 @Returns:
 223
 224
 225 <!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
 226 <para>
 227
 228 </para>
 229
 230 @c:
 231 @Returns:
 232
 233
 234 <!-- ##### ENUM GUnicodeType ##### -->
 235 <para>
 236 These are the possible character classifications.
 237 See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
 238 >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
 239 </para>
 240
 241 @G_UNICODE_CONTROL:
 242 @G_UNICODE_FORMAT:
 243 @G_UNICODE_UNASSIGNED:
 244 @G_UNICODE_PRIVATE_USE:
 245 @G_UNICODE_SURROGATE:
 246 @G_UNICODE_LOWERCASE_LETTER:
 247 @G_UNICODE_MODIFIER_LETTER:
 248 @G_UNICODE_OTHER_LETTER:
 249 @G_UNICODE_TITLECASE_LETTER:
 250 @G_UNICODE_UPPERCASE_LETTER:
 251 @G_UNICODE_COMBINING_MARK:
 252 @G_UNICODE_ENCLOSING_MARK:
 253 @G_UNICODE_NON_SPACING_MARK:
 254 @G_UNICODE_DECIMAL_NUMBER:
 255 @G_UNICODE_LETTER_NUMBER:
 256 @G_UNICODE_OTHER_NUMBER:
 257 @G_UNICODE_CONNECT_PUNCTUATION:
 258 @G_UNICODE_DASH_PUNCTUATION:
 259 @G_UNICODE_CLOSE_PUNCTUATION:
 260 @G_UNICODE_FINAL_PUNCTUATION:
 261 @G_UNICODE_INITIAL_PUNCTUATION:
 262 @G_UNICODE_OTHER_PUNCTUATION:
 263 @G_UNICODE_OPEN_PUNCTUATION:
 264 @G_UNICODE_CURRENCY_SYMBOL:
 265 @G_UNICODE_MODIFIER_SYMBOL:
 266 @G_UNICODE_MATH_SYMBOL:
 267 @G_UNICODE_OTHER_SYMBOL:
 268 @G_UNICODE_LINE_SEPARATOR:
 269 @G_UNICODE_PARAGRAPH_SEPARATOR:
 270 @G_UNICODE_SPACE_SEPARATOR:
 271
 272 <!-- ##### FUNCTION g_unichar_type ##### -->
 273 <para>
 274
 275 </para>
 276
 277 @c:
 278 @Returns:
 279
 280
 281 <!-- ##### ENUM GUnicodeBreakType ##### -->
 282 <para>
 283 These are the possible line break classifications.
 284 See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
 285 >http://www.unicode.org/unicode/reports/tr14/</ulink>.
 286 </para>
 287
 288 @G_UNICODE_BREAK_MANDATORY:
 289 @G_UNICODE_BREAK_CARRIAGE_RETURN:
 290 @G_UNICODE_BREAK_LINE_FEED:
 291 @G_UNICODE_BREAK_COMBINING_MARK:
 292 @G_UNICODE_BREAK_SURROGATE:
 293 @G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
 294 @G_UNICODE_BREAK_INSEPARABLE:
 295 @G_UNICODE_BREAK_NON_BREAKING_GLUE:
 296 @G_UNICODE_BREAK_CONTINGENT:
 297 @G_UNICODE_BREAK_SPACE:
 298 @G_UNICODE_BREAK_AFTER:
 299 @G_UNICODE_BREAK_BEFORE:
 300 @G_UNICODE_BREAK_BEFORE_AND_AFTER:
 301 @G_UNICODE_BREAK_HYPHEN:
 302 @G_UNICODE_BREAK_NON_STARTER:
 303 @G_UNICODE_BREAK_OPEN_PUNCTUATION:
 304 @G_UNICODE_BREAK_CLOSE_PUNCTUATION:
 305 @G_UNICODE_BREAK_QUOTATION:
 306 @G_UNICODE_BREAK_EXCLAMATION:
 307 @G_UNICODE_BREAK_IDEOGRAPHIC:
 308 @G_UNICODE_BREAK_NUMERIC:
 309 @G_UNICODE_BREAK_INFIX_SEPARATOR:
 310 @G_UNICODE_BREAK_SYMBOL:
 311 @G_UNICODE_BREAK_ALPHABETIC:
 312 @G_UNICODE_BREAK_PREFIX:
 313 @G_UNICODE_BREAK_POSTFIX:
 314 @G_UNICODE_BREAK_COMPLEX_CONTEXT:
 315 @G_UNICODE_BREAK_AMBIGUOUS:
 316 @G_UNICODE_BREAK_UNKNOWN:
 317
 318 <!-- ##### FUNCTION g_unichar_break_type ##### -->
 319 <para>
 320
 321 </para>
 322
 323 @c:
 324 @Returns:
 325
 326
 327 <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
 328 <para>
 329
 330 </para>
 331
 332 @string:
 333 @len:
 334
 335
 336 <!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
 337 <para>
 338
 339 </para>
 340
 341 @ch:
 342 @result_len:
 343 @Returns:
 344
 345
 346 <!-- ##### MACRO g_utf8_next_char ##### -->
 347 <para>
 348 Skips to the next character in a UTF-8 string. The string must be
 349 valid; this macro is as fast as possible, and has zero error-checking.
 350 You would use this macro to iterate over a string character by
 351 character. The macro returns the start of the next UTF-8 character.
 352 Before using this macro, use g_utf8_validate() to validate strings
 353 that may contain invalid UTF-8.
 354 </para>
 355
 356 @p: Pointer to the start of a valid UTF-8 character.
 357
 358
 359 <!-- ##### FUNCTION g_utf8_get_char ##### -->
 360 <para>
 361
 362 </para>
 363
 364 @p:
 365 @Returns:
 366
 367
 368 <!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
 369 <para>
 370
 371 </para>
 372
 373 @p:
 374 @max_len:
 375 @Returns:
 376
 377
 378 <!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
 379 <para>
 380
 381 </para>
 382
 383 @str:
 384 @offset:
 385 @Returns:
 386
 387
 388 <!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
 389 <para>
 390
 391 </para>
 392
 393 @str:
 394 @pos:
 395 @Returns:
 396
 397
 398 <!-- ##### FUNCTION g_utf8_prev_char ##### -->
 399 <para>
 400
 401 </para>
 402
 403 @p:
 404 @Returns:
 405
 406
 407 <!-- ##### FUNCTION g_utf8_find_next_char ##### -->
 408 <para>
 409
 410 </para>
 411
 412 @p:
 413 @end:
 414 @Returns:
 415 <!-- # Unused Parameters # -->
 416 @bound:
 417
 418
 419 <!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
 420 <para>
 421
 422 </para>
 423
 424 @str:
 425 @p:
 426 @Returns:
 427
 428
 429 <!-- ##### FUNCTION g_utf8_strlen ##### -->
 430 <para>
 431
 432 </para>
 433
 434 @p:
 435 @max:
 436 @Returns:
 437
 438
 439 <!-- ##### FUNCTION g_utf8_strncpy ##### -->
 440 <para>
 441
 442 </para>
 443
 444 @dest:
 445 @src:
 446 @n:
 447 @Returns:
 448
 449
 450 <!-- ##### FUNCTION g_utf8_strchr ##### -->
 451 <para>
 452
 453 </para>
 454
 455 @p:
 456 @len:
 457 @c:
 458 @Returns:
 459 <!-- # Unused Parameters # -->
 460 @ch:
 461
 462
 463 <!-- ##### FUNCTION g_utf8_strrchr ##### -->
 464 <para>
 465
 466 </para>
 467
 468 @p:
 469 @len:
 470 @c:
 471 @Returns:
 472 <!-- # Unused Parameters # -->
 473 @ch:
 474
 475
 476 <!-- ##### FUNCTION g_utf8_validate ##### -->
 477 <para>
 478
 479 </para>
 480
 481 @str:
 482 @max_len:
 483 @end:
 484 @Returns:
 485
 486
 487 <!-- ##### FUNCTION g_utf8_strup ##### -->
 488 <para>
 489
 490 </para>
 491
 492 @str:
 493 @len:
 494 @Returns:
 495
 496
 497 <!-- ##### FUNCTION g_utf8_strdown ##### -->
 498 <para>
 499
 500 </para>
 501
 502 @str:
 503 @len:
 504 @Returns:
 505
 506
 507 <!-- ##### FUNCTION g_utf8_casefold ##### -->
 508 <para>
 509
 510 </para>
 511
 512 @str:
 513 @len:
 514 @Returns:
 515
 516
 517 <!-- ##### FUNCTION g_utf8_normalize ##### -->
 518 <para>
 519
 520 </para>
 521
 522 @str:
 523 @len:
 524 @mode:
 525 @Returns:
 526
 527
 528 <!-- ##### ENUM GNormalizeMode ##### -->
 529 <para>
 530 A #GNormalizeMode defines how a Unicode string is transformed into a canonical
 531 form, standardizing such issues as whether a character with an accent is
 532 represented as a base character and combining accent or as a single precomposed
 533 character. Unicode strings should generally be normalized before comparing them.
 534 </para>
 535
 536 @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
 537   text content, such as the above-mentioned accent representation.
 538 @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
 539 @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
 540   forms rather than a maximally decomposed form.
 541 @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
 542 @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT, also standardize the
 543   "compatibility" characters in Unicode, such as SUPERSCRIPT THREE, to their
 544   standard forms (in this case DIGIT THREE). Formatting information may be
 545   lost, but for most text operations such characters should be considered the
 546   same.
 547 @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
 548 @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
 549   forms rather than a maximally decomposed form.
 550 @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
 551
 552 <!-- ##### FUNCTION g_utf8_collate ##### -->
 553 <para>
 554
 555 </para>
 556
 557 @str1:
 558 @str2:
 559 @Returns:
 560
 561
 562 <!-- ##### FUNCTION g_utf8_collate_key ##### -->
 563 <para>
 564
 565 </para>
 566
 567 @str:
 568 @len:
 569 @Returns:
 570
 571
 572 <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
 573 <para>
 574
 575 </para>
 576
 577 @str:
 578 @len:
 579 @items_read:
 580 @items_written:
 581 @error:
 582 @Returns:
 583
 584
 585 <!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
 586 <para>
 587
 588 </para>
 589
 590 @str:
 591 @len:
 592 @items_read:
 593 @items_written:
 594 @error:
 595 @Returns:
 596
 597
 598 <!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
 599 <para>
 600
 601 </para>
 602
 603 @str:
 604 @len:
 605 @items_written:
 606 @Returns:
 607
 608
 609 <!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
 610 <para>
 611
 612 </para>
 613
 614 @str:
 615 @len:
 616 @items_read:
 617 @items_written:
 618 @error:
 619 @Returns:
 620
 621
 622 <!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
 623 <para>
 624
 625 </para>
 626
 627 @str:
 628 @len:
 629 @items_read:
 630 @items_written:
 631 @error:
 632 @Returns:
 633
 634
 635 <!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
 636 <para>
 637
 638 </para>
 639
 640 @str:
 641 @len:
 642 @items_read:
 643 @items_written:
 644 @error:
 645 @Returns:
 646
 647
 648 <!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
 649 <para>
 650
 651 </para>
 652
 653 @str:
 654 @len:
 655 @items_read:
 656 @items_written:
 657 @error:
 658 @Returns:
 659
 660
 661 <!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
 662 <para>
 663
 664 </para>
 665
 666 @c:
 667 @outbuf:
 668 @Returns:
 669
 670