docs/reference/glib/tmpl/unicode.sgml

   1 <!-- ##### SECTION Title ##### -->
   2 Unicode Manipulation
   3
   4 <!-- ##### SECTION Short_Description ##### -->
   5 functions operating on Unicode characters and UTF-8 strings.
   6
   7 <!-- ##### SECTION Long_Description ##### -->
   8 <para>
   9 This section describes a number of functions for dealing with
  10 Unicode characters and strings.  There are analogues of the
  11 traditional <filename>ctype.h</filename> character classification
  12 and case conversion functions, UTF-8 analogues of some string utility
  13 functions, functions to perform normalization, case conversion and
  14 collation on UTF-8 strings, and finally functions to convert between
  15 the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  16 </para>
  17
  18 <!-- ##### SECTION See_Also ##### -->
  19 <para>
  20 <variablelist>
  21
  22 <varlistentry>
  23 <term>g_locale_to_utf8(), g_locale_from_utf8()</term>
  24 <listitem><para>
  25 Convenience functions for converting between UTF-8 and the locale encoding.
  26 </para></listitem>
  27 </varlistentry>
  28
  29 </variablelist>
  30 </para>
  31
  32 <!-- ##### TYPEDEF gunichar ##### -->
  33 <para>
  34 A type which can hold any UCS-4 character code.
  35 </para>
  36
  37
  38 <!-- ##### TYPEDEF gunichar2 ##### -->
  39 <para>
  40 A type which can hold any UTF-16 code unit (a surrogate pair takes two).
  41 </para>
  42
  43
  44 <!-- ##### FUNCTION g_unichar_validate ##### -->
  45 <para>
  46
  47 </para>
  48
  49 @ch:
  50 @Returns:
  51
  52
  53 <!-- ##### FUNCTION g_unichar_isalnum ##### -->
  54 <para>
  55
  56 </para>
  57
  58 @c:
  59 @Returns:
  60
  61
  62 <!-- ##### FUNCTION g_unichar_isalpha ##### -->
  63 <para>
  64
  65 </para>
  66
  67 @c:
  68 @Returns:
  69
  70
  71 <!-- ##### FUNCTION g_unichar_iscntrl ##### -->
  72 <para>
  73
  74 </para>
  75
  76 @c:
  77 @Returns:
  78
  79
  80 <!-- ##### FUNCTION g_unichar_isdigit ##### -->
  81 <para>
  82
  83 </para>
  84
  85 @c:
  86 @Returns:
  87
  88
  89 <!-- ##### FUNCTION g_unichar_isgraph ##### -->
  90 <para>
  91
  92 </para>
  93
  94 @c:
  95 @Returns:
  96
  97
  98 <!-- ##### FUNCTION g_unichar_islower ##### -->
  99 <para>
 100
 101 </para>
 102
 103 @c:
 104 @Returns:
 105
 106
 107 <!-- ##### FUNCTION g_unichar_isprint ##### -->
 108 <para>
 109
 110 </para>
 111
 112 @c:
 113 @Returns:
 114
 115
 116 <!-- ##### FUNCTION g_unichar_ispunct ##### -->
 117 <para>
 118
 119 </para>
 120
 121 @c:
 122 @Returns:
 123
 124
 125 <!-- ##### FUNCTION g_unichar_isspace ##### -->
 126 <para>
 127
 128 </para>
 129
 130 @c:
 131 @Returns:
 132
 133
 134 <!-- ##### FUNCTION g_unichar_isupper ##### -->
 135 <para>
 136
 137 </para>
 138
 139 @c:
 140 @Returns:
 141
 142
 143 <!-- ##### FUNCTION g_unichar_isxdigit ##### -->
 144 <para>
 145
 146 </para>
 147
 148 @c:
 149 @Returns:
 150
 151
 152 <!-- ##### FUNCTION g_unichar_istitle ##### -->
 153 <para>
 154
 155 </para>
 156
 157 @c:
 158 @Returns:
 159
 160
 161 <!-- ##### FUNCTION g_unichar_isdefined ##### -->
 162 <para>
 163
 164 </para>
 165
 166 @c:
 167 @Returns:
 168
 169
 170 <!-- ##### FUNCTION g_unichar_iswide ##### -->
 171 <para>
 172
 173 </para>
 174
 175 @c:
 176 @Returns:
 177
 178
 179 <!-- ##### FUNCTION g_unichar_toupper ##### -->
 180 <para>
 181
 182 </para>
 183
 184 @c:
 185 @Returns:
 186
 187
 188 <!-- ##### FUNCTION g_unichar_tolower ##### -->
 189 <para>
 190
 191 </para>
 192
 193 @c:
 194 @Returns:
 195
 196
 197 <!-- ##### FUNCTION g_unichar_totitle ##### -->
 198 <para>
 199
 200 </para>
 201
 202 @c:
 203 @Returns:
 204
 205
 206 <!-- ##### FUNCTION g_unichar_digit_value ##### -->
 207 <para>
 208
 209 </para>
 210
 211 @c:
 212 @Returns:
 213
 214
 215 <!-- ##### FUNCTION g_unichar_xdigit_value ##### -->
 216 <para>
 217
 218 </para>
 219
 220 @c:
 221 @Returns:
 222
 223
 224 <!-- ##### ENUM GUnicodeType ##### -->
 225 <para>
 226 These are the possible character classifications.
 227 See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
 228 >http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
 229 </para>
 230
 231 @G_UNICODE_CONTROL:
 232 @G_UNICODE_FORMAT:
 233 @G_UNICODE_UNASSIGNED:
 234 @G_UNICODE_PRIVATE_USE:
 235 @G_UNICODE_SURROGATE:
 236 @G_UNICODE_LOWERCASE_LETTER:
 237 @G_UNICODE_MODIFIER_LETTER:
 238 @G_UNICODE_OTHER_LETTER:
 239 @G_UNICODE_TITLECASE_LETTER:
 240 @G_UNICODE_UPPERCASE_LETTER:
 241 @G_UNICODE_COMBINING_MARK:
 242 @G_UNICODE_ENCLOSING_MARK:
 243 @G_UNICODE_NON_SPACING_MARK:
 244 @G_UNICODE_DECIMAL_NUMBER:
 245 @G_UNICODE_LETTER_NUMBER:
 246 @G_UNICODE_OTHER_NUMBER:
 247 @G_UNICODE_CONNECT_PUNCTUATION:
 248 @G_UNICODE_DASH_PUNCTUATION:
 249 @G_UNICODE_CLOSE_PUNCTUATION:
 250 @G_UNICODE_FINAL_PUNCTUATION:
 251 @G_UNICODE_INITIAL_PUNCTUATION:
 252 @G_UNICODE_OTHER_PUNCTUATION:
 253 @G_UNICODE_OPEN_PUNCTUATION:
 254 @G_UNICODE_CURRENCY_SYMBOL:
 255 @G_UNICODE_MODIFIER_SYMBOL:
 256 @G_UNICODE_MATH_SYMBOL:
 257 @G_UNICODE_OTHER_SYMBOL:
 258 @G_UNICODE_LINE_SEPARATOR:
 259 @G_UNICODE_PARAGRAPH_SEPARATOR:
 260 @G_UNICODE_SPACE_SEPARATOR:
 261
 262 <!-- ##### FUNCTION g_unichar_type ##### -->
 263 <para>
 264
 265 </para>
 266
 267 @c:
 268 @Returns:
 269
 270
 271 <!-- ##### ENUM GUnicodeBreakType ##### -->
 272 <para>
 273 These are the possible line break classifications.
 274 See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
 275 >http://www.unicode.org/unicode/reports/tr14/</ulink>.
 276 </para>
 277
 278 @G_UNICODE_BREAK_MANDATORY:
 279 @G_UNICODE_BREAK_CARRIAGE_RETURN:
 280 @G_UNICODE_BREAK_LINE_FEED:
 281 @G_UNICODE_BREAK_COMBINING_MARK:
 282 @G_UNICODE_BREAK_SURROGATE:
 283 @G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
 284 @G_UNICODE_BREAK_INSEPARABLE:
 285 @G_UNICODE_BREAK_NON_BREAKING_GLUE:
 286 @G_UNICODE_BREAK_CONTINGENT:
 287 @G_UNICODE_BREAK_SPACE:
 288 @G_UNICODE_BREAK_AFTER:
 289 @G_UNICODE_BREAK_BEFORE:
 290 @G_UNICODE_BREAK_BEFORE_AND_AFTER:
 291 @G_UNICODE_BREAK_HYPHEN:
 292 @G_UNICODE_BREAK_NON_STARTER:
 293 @G_UNICODE_BREAK_OPEN_PUNCTUATION:
 294 @G_UNICODE_BREAK_CLOSE_PUNCTUATION:
 295 @G_UNICODE_BREAK_QUOTATION:
 296 @G_UNICODE_BREAK_EXCLAMATION:
 297 @G_UNICODE_BREAK_IDEOGRAPHIC:
 298 @G_UNICODE_BREAK_NUMERIC:
 299 @G_UNICODE_BREAK_INFIX_SEPARATOR:
 300 @G_UNICODE_BREAK_SYMBOL:
 301 @G_UNICODE_BREAK_ALPHABETIC:
 302 @G_UNICODE_BREAK_PREFIX:
 303 @G_UNICODE_BREAK_POSTFIX:
 304 @G_UNICODE_BREAK_COMPLEX_CONTEXT:
 305 @G_UNICODE_BREAK_AMBIGUOUS:
 306 @G_UNICODE_BREAK_UNKNOWN:
 307
 308 <!-- ##### FUNCTION g_unichar_break_type ##### -->
 309 <para>
 310
 311 </para>
 312
 313 @c:
 314 @Returns:
 315
 316
 317 <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
 318 <para>
 319
 320 </para>
 321
 322 @string:
 323 @len:
 324
 325
 326 <!-- ##### FUNCTION g_unicode_canonical_decomposition ##### -->
 327 <para>
 328
 329 </para>
 330
 331 @ch:
 332 @result_len:
 333 @Returns:
 334
 335
 336 <!-- ##### MACRO g_utf8_next_char ##### -->
 337 <para>
 338 Skips to the next character in a UTF-8 string. The string must be
 339 valid; this macro is as fast as possible, and has no error-checking.
 340 You would use this macro to iterate over a string character by
 341 character. The macro returns the start of the next UTF-8 character.
 342 Before using this macro, use g_utf8_validate() to validate strings
 343 that may contain invalid UTF-8.
 344 </para>
 345
 346 @p: Pointer to the start of a valid UTF-8 character.
 347
 348
 349 <!-- ##### FUNCTION g_utf8_get_char ##### -->
 350 <para>
 351
 352 </para>
 353
 354 @p:
 355 @Returns:
 356
 357
 358 <!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
 359 <para>
 360
 361 </para>
 362
 363 @p:
 364 @max_len:
 365 @Returns:
 366
 367
 368 <!-- ##### FUNCTION g_utf8_offset_to_pointer ##### -->
 369 <para>
 370
 371 </para>
 372
 373 @str:
 374 @offset:
 375 @Returns:
 376
 377
 378 <!-- ##### FUNCTION g_utf8_pointer_to_offset ##### -->
 379 <para>
 380
 381 </para>
 382
 383 @str:
 384 @pos:
 385 @Returns:
 386
 387
 388 <!-- ##### FUNCTION g_utf8_prev_char ##### -->
 389 <para>
 390
 391 </para>
 392
 393 @p:
 394 @Returns:
 395
 396
 397 <!-- ##### FUNCTION g_utf8_find_next_char ##### -->
 398 <para>
 399
 400 </para>
 401
 402 @p:
 403 @end:
 404 @Returns:
 405 <!-- # Unused Parameters # -->
 406 @bound:
 407
 408
 409 <!-- ##### FUNCTION g_utf8_find_prev_char ##### -->
 410 <para>
 411
 412 </para>
 413
 414 @str:
 415 @p:
 416 @Returns:
 417
 418
 419 <!-- ##### FUNCTION g_utf8_strlen ##### -->
 420 <para>
 421
 422 </para>
 423
 424 @p:
 425 @max:
 426 @Returns:
 427
 428
 429 <!-- ##### FUNCTION g_utf8_strncpy ##### -->
 430 <para>
 431
 432 </para>
 433
 434 @dest:
 435 @src:
 436 @n:
 437 @Returns:
 438
 439
 440 <!-- ##### FUNCTION g_utf8_strchr ##### -->
 441 <para>
 442
 443 </para>
 444
 445 @p:
 446 @len:
 447 @c:
 448 @Returns:
 449 <!-- # Unused Parameters # -->
 450 @ch:
 451
 452
 453 <!-- ##### FUNCTION g_utf8_strrchr ##### -->
 454 <para>
 455
 456 </para>
 457
 458 @p:
 459 @len:
 460 @c:
 461 @Returns:
 462 <!-- # Unused Parameters # -->
 463 @ch:
 464
 465
 466 <!-- ##### FUNCTION g_utf8_validate ##### -->
 467 <para>
 468
 469 </para>
 470
 471 @str:
 472 @max_len:
 473 @end:
 474 @Returns:
 475
 476
 477 <!-- ##### FUNCTION g_utf8_strup ##### -->
 478 <para>
 479
 480 </para>
 481
 482 @str:
 483 @len:
 484 @Returns:
 485
 486
 487 <!-- ##### FUNCTION g_utf8_strdown ##### -->
 488 <para>
 489
 490 </para>
 491
 492 @str:
 493 @len:
 494 @Returns:
 495
 496
 497 <!-- ##### FUNCTION g_utf8_casefold ##### -->
 498 <para>
 499
 500 </para>
 501
 502 @str:
 503 @len:
 504 @Returns:
 505
 506
 507 <!-- ##### FUNCTION g_utf8_normalize ##### -->
 508 <para>
 509
 510 </para>
 511
 512 @str:
 513 @len:
 514 @mode:
 515 @Returns:
 516
 517
 518 <!-- ##### ENUM GNormalizeMode ##### -->
 519 <para>
 520 Defines how a Unicode string is transformed into a canonical
 521 form, standardizing such issues as whether a character with an accent is
 522 represented as a base character and combining accent or as a single precomposed
 523 character. Unicode strings should generally be normalized before comparing them.
 524 </para>
 525
 526 @G_NORMALIZE_DEFAULT: standardize differences that do not affect the
 527   text content, such as the above-mentioned accent representation.
 528 @G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
 529 @G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
 530   forms rather than a maximally decomposed form.
 531 @G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
 532 @G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT, also standardize the
 533   "compatibility" characters in Unicode, such as SUPERSCRIPT THREE, to their
 534   standard forms (in this case DIGIT THREE). Formatting information may be
 535   lost, but for most text operations such characters should be considered the
 536   same.
 537 @G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
 538 @G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
 539   forms rather than a maximally decomposed form.
 540 @G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
 541
 542 <!-- ##### FUNCTION g_utf8_collate ##### -->
 543 <para>
 544
 545 </para>
 546
 547 @str1:
 548 @str2:
 549 @Returns:
 550
 551
 552 <!-- ##### FUNCTION g_utf8_collate_key ##### -->
 553 <para>
 554
 555 </para>
 556
 557 @str:
 558 @len:
 559 @Returns:
 560
 561
 562 <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
 563 <para>
 564
 565 </para>
 566
 567 @str:
 568 @len:
 569 @items_read:
 570 @items_written:
 571 @error:
 572 @Returns:
 573
 574
 575 <!-- ##### FUNCTION g_utf8_to_ucs4 ##### -->
 576 <para>
 577
 578 </para>
 579
 580 @str:
 581 @len:
 582 @items_read:
 583 @items_written:
 584 @error:
 585 @Returns:
 586
 587
 588 <!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
 589 <para>
 590
 591 </para>
 592
 593 @str:
 594 @len:
 595 @items_written:
 596 @Returns:
 597
 598
 599 <!-- ##### FUNCTION g_utf16_to_ucs4 ##### -->
 600 <para>
 601
 602 </para>
 603
 604 @str:
 605 @len:
 606 @items_read:
 607 @items_written:
 608 @error:
 609 @Returns:
 610
 611
 612 <!-- ##### FUNCTION g_utf16_to_utf8 ##### -->
 613 <para>
 614
 615 </para>
 616
 617 @str:
 618 @len:
 619 @items_read:
 620 @items_written:
 621 @error:
 622 @Returns:
 623
 624
 625 <!-- ##### FUNCTION g_ucs4_to_utf16 ##### -->
 626 <para>
 627
 628 </para>
 629
 630 @str:
 631 @len:
 632 @items_read:
 633 @items_written:
 634 @error:
 635 @Returns:
 636
 637
 638 <!-- ##### FUNCTION g_ucs4_to_utf8 ##### -->
 639 <para>
 640
 641 </para>
 642
 643 @str:
 644 @len:
 645 @items_read:
 646 @items_written:
 647 @error:
 648 @Returns:
 649
 650
 651 <!-- ##### FUNCTION g_unichar_to_utf8 ##### -->
 652 <para>
 653
 654 </para>
 655
 656 @c:
 657 @outbuf:
 658 @Returns:
 659
 660