doc/libunistring.texi

   1 \input texinfo          @c -*-texinfo-*-
   2 @comment %**start of header
   3 @setfilename libunistring.info
   4 @documentencoding UTF-8
   5 @settitle GNU libunistring
   6 @finalout
   7 @c Indices:
   8 @c   am = autoconf macro  @amindex
   9 @c   cp = concept         @cindex
  10 @c   fn = function        @findex
  11 @c   tp = type            @tindex
  12 @c Unused predefined indices:
  13 @c   ky = keystroke       @kindex
  14 @c   pg = program         @pindex
  15 @c   vr = variable        @vindex
  16 @defcodeindex am
  17 @syncodeindex am cp
  18 @syncodeindex fn cp
  19 @syncodeindex tp cp
  20 @ifclear texi2html
  21 @firstparagraphindent insert
  22 @end ifclear
  23 @c texi2html-1.76 does not support @arrow{}.
  24 @ifset texi2html
  25 @macro arrow{}
  26 →
  27 @end macro
  28 @end ifset
  29 @comment %**end of header
  30
  31 @include version.texi
  32
  33 @c Location of the POSIX specification on the web.
  34 @set POSIXURL http://pubs.opengroup.org/onlinepubs/9699919799
  35
  36 @c Macro for referencing a POSIX header.
  37 @ifinfo
  38 @macro posixheader{header}
  39 @code{<\header\>}
  40 @end macro
  41 @end ifinfo
  42 @ifnotinfo
  43 @macro posixheader{header}
  44 @uref{@value{POSIXURL}/basedefs/\header\.html,,@code{<\header\>}}
  45 @end macro
  46 @end ifnotinfo
  47
  48 @c Macro for referencing a POSIX function.
  49 @c We don't write it as func(), see section "GNU Manuals" of the
  50 @c GNU coding standards.
  51 @ifinfo
  52 @macro posixfunc{func}
  53 @code{\func\}
  54 @end macro
  55 @end ifinfo
  56 @ifnotinfo
  57 @macro posixfunc{func}
  58 @uref{@value{POSIXURL}/functions/\func\.html,,@code{\func\}}
  59 @end macro
  60 @end ifnotinfo
  61
  62 @c Macro for referencing a normal function.
  63 @c We don't write it as func(), see section "GNU Manuals" of the
  64 @c GNU coding standards.
  65 @macro func{func}
  66 @code{\func\}
  67 @end macro
  68
  69 @c Macro for an advisory ragged line break in TeX mode.
  70 @c Needed because there are long unbreakable pieces of text (such as URLs or
  71 @c formulas), TeX is too shy to move them to a new line. TeX considers only
  72 @c two choices: a line break in aligned mode (which it rejects due to aesthetic
  73 @c reasons) and writing into the margin. What we want in many cases is a line
  74 @c break without filling the first line. Like what @* delivers. But we want it
  75 @c only when needed, so that it disappears when unrelated changes in the same
  76 @c paragraph cause a line break in a nearby position. And we need it only in
  77 @c TeX mode. info and HTML modes are fine.
  78 @c This trick is from Karl Berry.
  79 @iftex
  80 @macro texnl
  81 @hfil@penalty9000@hfilneg
  82 @end macro
  83 @end iftex
  84 @ifnottex
  85 @macro texnl
  86 @end macro
  87 @end ifnottex
  88
  89 @ifinfo
  90 @dircategory Software development
  91 @direntry
  92 * GNU libunistring: (libunistring).     Unicode string library.
  93 @end direntry
  94 @end ifinfo
  95
  96 @ifinfo
  97 This manual is for GNU libunistring.
  98
  99 @ignore
 100 @c This was: @copying but it triggers a makeinfo 4.13 bug
 101 Copyright (C) 2001-2022 Free Software Foundation, Inc.
 102
 103 This manual is free documentation.  It is dually licensed under the
 104 GNU FDL and the GNU GPL.  This means that you can redistribute this
 105 manual under either of these two licenses, at your choice.
 106
 107 This manual is covered by the GNU FDL.  Permission is granted to copy,
 108 distribute and/or modify this document under the terms of the
 109 GNU Free Documentation License (FDL), either version 1.2 of the
 110 License, or (at your option) any later version published by the
 111 Free Software Foundation (FSF); with no Invariant Sections, with no
 112 Front-Cover Text, and with no Back-Cover Texts.
 113 A copy of the license is included in @ref{GNU FDL}.
 114
 115 This manual is covered by the GNU GPL.  You can redistribute it and/or
 116 modify it under the terms of the GNU General Public License (GPL), either
 117 version 3 of the License, or (at your option) any later version published
 118 by the Free Software Foundation (FSF).
 119 A copy of the license is included in @ref{GNU GPL}.
 120 @end ignore
 121 @end ifinfo
 122
 123 @titlepage
 124 @title GNU libunistring, version @value{VERSION}
 125 @subtitle updated @value{UPDATED}
 126 @author Bruno Haible
 127
 128 @ifnothtml
 129 @page
 130 @vskip 0pt plus 1filll
 131 @c @insertcopying
 132 Copyright (C) 2001-2022 Free Software Foundation, Inc.
 133
 134 This manual is free documentation.  It is dually licensed under the
 135 GNU FDL and the GNU GPL.  This means that you can redistribute this
 136 manual under either of these two licenses, at your choice.
 137
 138 This manual is covered by the GNU FDL.  Permission is granted to copy,
 139 distribute and/or modify this document under the terms of the
 140 GNU Free Documentation License (FDL), either version 1.2 of the
 141 License, or (at your option) any later version published by the
 142 Free Software Foundation (FSF); with no Invariant Sections, with no
 143 Front-Cover Text, and with no Back-Cover Texts.
 144 A copy of the license is included in @ref{GNU FDL}.
 145
 146 This manual is covered by the GNU GPL.  You can redistribute it and/or
 147 modify it under the terms of the GNU General Public License (GPL), either
 148 version 3 of the License, or (at your option) any later version published
 149 by the Free Software Foundation (FSF).
 150 A copy of the license is included in @ref{GNU GPL}.
 151 @end ifnothtml
 152 @end titlepage
 153
 154 @c Table of Contents
 155 @contents
 156
 157 @ifnottex
 158 @node Top
 159 @top GNU libunistring
 160 @end ifnottex
 161
 162 @menu
 163 * Introduction::                Who may need Unicode strings?
 164 * Conventions::                 Conventions used in this manual
 165 * unitypes.h::                  Elementary types
 166 * unistr.h::                    Elementary Unicode string functions
 167 * uniconv.h::                   Conversions between Unicode and encodings
 168 * unistdio.h::                  Output with Unicode strings
 169 * uniname.h::                   Names of Unicode characters
 170 * unictype.h::                  Unicode character classification and properties
 171 * uniwidth.h::                  Display width
 172 * unigbrk.h::                   Grapheme cluster breaking
 173 * uniwbrk.h::                   Word breaks in strings
 174 * unilbrk.h::                   Line breaking
 175 * uninorm.h::                   Normalization forms
 176 * unicase.h::                   Case mappings
 177 * uniregex.h::                  Regular expressions
 178 * Using the library::           How to link with the library and use it?
 179 * More functionality::          More advanced functionality
 180 * The wchar_t mess::            Why @code{wchar_t *} strings are useless
 181 * The char32_t problem::        Why @code{char32_t *} strings are problematic
 182 * Licenses::                    Licenses
 183
 184 * Index::                       General Index
 185
 186 @detailmenu
 187  --- The Detailed Node Listing ---
 188
 189 Introduction
 190
 191 * Unicode::                     What is Unicode?
 192 * Unicode and i18n::            Unicode and internationalization
 193 * Locale encodings::            What is a locale encoding?
 194 * In-memory representation::    How to represent strings in memory?
 195 * char * strings::              What to keep in mind with @code{char *} strings
 196 * Unicode strings::             How are Unicode strings represented?
 197
 198 unistr.h
 199
 200 * Elementary string checks::
 201 * Elementary string conversions::
 202 * Elementary string functions::
 203 * Elementary string functions with memory allocation::
 204 * Elementary string functions on NUL terminated strings::
 205
 206 Elementary string functions
 207
 208 * Iterating::
 209 * Creating Unicode strings::
 210 * Copying Unicode strings::
 211 * Comparing Unicode strings::
 212 * Searching for a character::
 213 * Counting characters::
 214
 215 Elementary string functions on NUL terminated strings
 216
 217 * Iterating over a NUL terminated Unicode string::
 218 * Length::
 219 * Copying a NUL terminated Unicode string::
 220 * Comparing NUL terminated Unicode strings::
 221 * Duplicating a NUL terminated Unicode string::
 222 * Searching for a character in a NUL terminated Unicode string::
 223 * Searching for a substring::
 224 * Tokenizing::
 225
 226 unictype.h
 227
 228 * General category::
 229 * Canonical combining class::
 230 * Bidi class::
 231 * Decimal digit value::
 232 * Digit value::
 233 * Numeric value::
 234 * Mirrored character::
 235 * Arabic shaping::
 236 * Properties::
 237 * Scripts::
 238 * Blocks::
 239 * ISO C and Java syntax::
 240 * Classifications like in ISO C::
 241
 242 General category
 243
 244 * Object oriented API::
 245 * Bit mask API::
 246
 247 Properties
 248
 249 * Properties as objects::
 250 * Properties as functions::
 251
 252 unigbrk.h
 253
 254 * Grapheme cluster breaks in a string::
 255 * Grapheme cluster break property::
 256
 257 uniwbrk.h
 258
 259 * Word breaks in a string::
 260 * Word break property::
 261
 262 uninorm.h
 263
 264 * Decomposition of characters::
 265 * Composition of characters::
 266 * Normalization of strings::
 267 * Normalizing comparisons::
 268 * Normalization of streams::
 269
 270 unicase.h
 271
 272 * Case mappings of characters::
 273 * Case mappings of strings::
 274 * Case mappings of substrings::
 275 * Case insensitive comparison::
 276 * Case detection::
 277
 278 Using the library
 279
 280 * Installation::
 281 * Compiler options::
 282 * Include files::
 283 * Autoconf macro::
 284 * Reporting problems::
 285
 286 Licenses
 287
 288 * GNU GPL::                     GNU General Public License
 289 * GNU LGPL::                    GNU Lesser General Public License
 290 * GNU FDL::                     GNU Free Documentation License
 291
 292 @end detailmenu
 293 @end menu
 294
 295 @node Introduction
 296 @chapter Introduction
 297
 298 This library provides functions for manipulating Unicode strings and
 299 for manipulating C strings according to the Unicode standard.
 300
 301 It consists of the following parts:
 302
 303 @table @code
 304 @item <unistr.h>
 305 elementary string functions
 306 @item <uniconv.h>
 307 conversion from/to legacy encodings
 308 @item <unistdio.h>
 309 formatted output to strings
 310 @item <uniname.h>
 311 character names
 312 @item <unictype.h>
 313 character classification and properties
 314 @item <uniwidth.h>
 315 string width when using nonproportional fonts
 316 @item <unigbrk.h>
 317 grapheme cluster breaks
 318 @item <uniwbrk.h>
 319 word breaks
 320 @item <unilbrk.h>
 321 line breaking algorithm
 322 @item <uninorm.h>
 323 normalization (composition and decomposition)
 324 @item <unicase.h>
 325 case folding
 326 @item <uniregex.h>
 327 regular expressions (not yet implemented)
 328 @end table
 329
 330 @cindex use cases
 331 @cindex value, of libunistring
 332 libunistring is for you if your application involves non-trivial text
 333 processing, such as upper/lower case conversions, line breaking, operations
 334 on words, or more advanced analysis of text.  Text provided by the user can,
 335 in general, contain characters of all kinds of scripts.  The text processing
 336 functions provided by this library handle all scripts and all languages.
 337
 338 libunistring is for you if your application already uses the ISO C / POSIX
 339 @posixheader{ctype.h}, @posixheader{wctype.h} functions and the text it
 340 operates on is provided by the user and can be in any language.
 341
 342 libunistring is also for you if your application uses Unicode strings as
 343 internal in-memory representation.
 344
 345 @menu
 346 * Unicode::                     What is Unicode?
 347 * Unicode and i18n::            Unicode and internationalization
 348 * Locale encodings::            What is a locale encoding?
 349 * In-memory representation::    How to represent strings in memory?
 350 * char * strings::              What to keep in mind with @code{char *} strings
 351 * Unicode strings::             How are Unicode strings represented?
 352 @end menu
 353
 354 @node Unicode
 355 @section Unicode
 356
 357 @cindex Unicode
 358 Unicode is a standardized repertoire of characters that contains characters
 359 from all scripts of the world, from Latin letters to Chinese ideographs
 360 and Babylonian cuneiform glyphs.  It also specifies how these characters
 361 are to be rendered on a screen or on paper, and how common text processing
 362 (word selection, line breaking, uppercasing of page titles etc.) is supposed
 363 to behave on Unicode text.
 364
 365 Unicode also specifies three ways of storing sequences of Unicode
 366 characters in a computer whose basic unit of data is an 8-bit byte:
 367 @cindex UTF-8
 368 @cindex UTF-16
 369 @cindex UTF-32
 370 @cindex UCS-4
 371 @table @asis
 372 @item UTF-8
 373 Every character is represented as 1 to 4 bytes.
 374 @item UTF-16
 375 Every character is represented as 1 to 2 units of 16 bits.
 376 @item UTF-32, a.k.a@. UCS-4
 377 Every character is represented as 1 unit of 32 bits.
 378 @end table
 379
 380 For encoding Unicode text in a file, UTF-8 is usually used.  For encoding
 381 Unicode strings in memory for a program, any of the three encoding forms
 382 can be reasonably used.
 383
 384 Unicode is widely used on the web.  Prior to the use of Unicode, web pages
 385 were in many different encodings (ISO-8859-1 for English, French, Spanish,
 386 ISO-8859-2 for Polish, ISO-8859-7 for Greek, KOI8-R for Russian, GB2312 or
 387 BIG5 for Chinese, ISO-2022-JP-2 or EUC-JP or Shift_JIS for Japanese, and many
 388 many others).  It was next to impossible to create a document that contained
 389 Chinese and Polish text in the same document.  Due to the many encodings for
 390 Japanese, even the processing of pure Japanese text was error prone.
 391
 392 References:
 393 @itemize @bullet
 394 @item
 395 The Unicode standard:@texnl{} @url{https://www.unicode.org/}
 396 @item
 397 Definition of UTF-8:@texnl{} @url{https://www.rfc-editor.org/rfc/rfc3629.txt}
 398 @item
 399 Definition of UTF-16:@texnl{} @url{https://www.rfc-editor.org/rfc/rfc2781.txt}
 400 @item
 401 Markus Kuhn's UTF-8 and Unicode FAQ:@texnl{}
 402 @url{https://www.cl.cam.ac.uk/~mgk25/unicode.html}
 403 @end itemize
 404
 405 @node Unicode and i18n
 406 @section Unicode and Internationalization
 407
 408 @cindex internationalization
 409 Internationalization is the process of changing the source code of a program
 410 so that it can meet the expectations of users in any culture, if culture
 411 specific data (translations, images etc.) are provided.
 412
 413 Use of Unicode is not strictly required for internationalization, but it
 414 makes internationalization much easier, because operations that need to
 415 look at specific characters (like hyphenation, spell checking, or the
 416 automatic conversion of double-quotes to opening and closing double-quote
 417 characters) don't need to consider multiple possible encodings of the text.
 418
 419 Use of Unicode also enables multilingualization: the ability of having text
 420 in multiple languages present in the same document or even in the same line
 421 of text.
 422
 423 But use of Unicode is not everything.  Internationalization usually consists
 424 of four features:
 425 @itemize @bullet
 426 @item
 427 Use of Unicode where needed for text processing.  This is what this library
 428 is for.
 429 @item
 430 Use of message catalogs for messages shown to the user.  This is what
 431 GNU gettext is about.
 432 @item
 433 Use of locale specific conventions for date and time formats, for numeric
 434 formatting, or for sorting of text.  This can be done adequately with the
 435 POSIX APIs and the implementation of locales in the GNU C library.
 436 @item
 437 In graphical user interfaces, adapting the GUI to the default text direction
 438 of the current locale (see
 439 @url{https://en.wikipedia.org/wiki/Right-to-left,right-to-left languages}).
 440 @end itemize
 441
 442 @node Locale encodings
 443 @section Locale encodings
 444
 445 @cindex locale
 446 A locale is a set of cultural conventions.  According to POSIX, for a program,
 447 at any moment, there is one locale being designated as the ``current locale''.
 448 (Actually, POSIX also supports one locale per thread, but this feature is not
 449 yet universally implemented and not widely used.)
 450 @cindex locale categories
 451 The locale is partitioned into several aspects, called the ``categories''
 452 of the locale.  The main aspects are:
 453 @itemize @bullet
 454 @item
 455 The character encoding and the character properties.  This is the
 456 @code{LC_CTYPE} category.
 457 @item
 458 The sorting rules for text.  This is the @code{LC_COLLATE} category.
 459 @item
 460 The language specific translations of messages.  This is the
 461 @code{LC_MESSAGES} category.
 462 @item
 463 The formatting rules for numbers, such as the decimal separator.  This is
 464 the @code{LC_NUMERIC} category.
 465 @item
 466 The formatting rules for amounts of money.  This is the @code{LC_MONETARY}
 467 category.
 468 @item
 469 The formatting of date and time.  This is the @code{LC_TIME} category.
 470 @end itemize
 471
 472 @cindex locale encoding
 473 In particular, the @code{LC_CTYPE} category of the current locale determines
 474 the character encoding.  This is the encoding of @samp{char *} strings.
 475 We also call it the ``locale encoding''.  GNU libunistring has a function,
 476 @func{locale_charset}, that returns a standardized (platform independent)
 477 name for this encoding.
 478
 479 All locale encodings used on glibc systems are essentially ASCII compatible:
 480 Most graphic ASCII characters have the same representation, as a single byte,
 481 in that encoding as in ASCII.
 482
 483 Among the possible locale encodings are UTF-8 and GB18030.  Both can
 484 represent any Unicode character as a sequence of bytes.  UTF-8 is used in
 485 most of the world, whereas GB18030 is used in the People's Republic of China,
 486 because it is backward compatible with the GB2312 encoding that was used in
 487 this country earlier.
 488
 489 The legacy locale encodings, ISO-8859-15 (which supplanted ISO-8859-1 in
 490 most of Europe), ISO-8859-2, KOI8-R, EUC-JP, etc., are still in use in
 491 some places, though.
 492
 493 UTF-16 and UTF-32 are not used as locale encodings, because they are not
 494 ASCII compatible.
 495
 496 @node In-memory representation
 497 @section Choice of in-memory representation of strings
 498
 499 There are three ways of representing strings in memory of a running
 500 program.
 501 @itemize @bullet
 502 @item
 503 As @samp{char *} strings.  Such strings are represented in locale encoding.
 504 This approach is employed when not much text processing is done by the
 505 program.  When some Unicode aware processing is to be done, a string is
 506 converted to Unicode on the fly and back to locale encoding afterwards.
 507 @item
 508 As UTF-8 or UTF-16 or UTF-32 strings.  This implies that conversion from
 509 locale encoding to Unicode is performed on input, and in the opposite
 510 direction on output.  This approach is employed when the program does
 511 a significant amount of text processing, or when the program has multiple
 512 threads operating on the same data but in different locales.
 513 @item
 514 As @samp{wchar_t *}, a.k.a@. ``wide strings''.  This approach is misguided,
 515 see @ref{The wchar_t mess}.
 516 @end itemize
 517
 518 Of course, a @samp{char *} string can, in some cases, be encoded in UTF-8.
 519 You will use the data type depending on what you can guarantee about how
 520 it's encoded: If a string is encoded in the locale encoding, or if you
 521 don't know how it's encoded, use @samp{char *}.  If, on the other hand,
 522 you can @emph{guarantee} that it is UTF-8 encoded, then you can use the
 523 UTF-8 string type, @code{uint8_t *}, for it.
 524
 525 The five types @code{char *}, @code{uint8_t *}, @code{uint16_t *},
 526 @code{uint32_t *}, and @code{wchar_t *} are incompatible types at the C
 527 level.  Therefore, @samp{gcc -Wall} will produce a warning if, by mistake,
 528 your code contains a mismatch between these types.  In the context of
 529 using GNU libunistring, even a warning about a mismatch between
 530 @code{char *} and @code{uint8_t *} is a sign of a bug in your code
 531 that you should not try to silence through a cast.
 532
 533 @node char * strings
 534 @section @samp{char *} strings
 535
 536 @cindex C string functions
 537 The classical C strings, with their C library support standardized by
 538 ISO C and POSIX, can be used in internationalized programs with some
 539 precautions.  The problem with this API is that many of the C library
 540 functions for strings don't work correctly on strings in locale
 541 encodings, leading to bugs that only people in some cultures of the
 542 world will experience.
 543
 544 @cindex locale, multibyte
 545 The first problem with the C library API is the support of multibyte
 546 locales.  According to the locale encoding, in general, every character
 547 is represented by one or more bytes (up to 4 bytes in practice --- but
 548 use @code{MB_LEN_MAX} instead of the number 4 in the code).
 549 When every character is represented by only 1 byte, we speak of an
 550 ``unibyte locale'', otherwise of a ``multibyte locale''.  It is important
 551 to realize that the majority of Unix installations nowadays use UTF-8
 552 or GB18030 as locale encoding; therefore, the majority of users are
 553 using multibyte locales.
 554
 555 @cindex char, type
 556 The important fact to remember is:
 557 @cartouche
 558 @emph{A @samp{char} is a byte, not a character.}
 559 @end cartouche
 560
 561 As a consequence:
 562 @itemize @bullet
 563 @item
 564 The @posixheader{ctype.h} API is useless in this context; it does not work in
 565 multibyte locales.
 566 @item
 567 The @posixfunc{strlen} function does not return the number of characters
 568 in a string.  Nor does it return the number of screen columns occupied
 569 by a string after it is output.  It merely returns the number of
 570 @emph{bytes} occupied by a string.
 571 @item
 572 Truncating a string, for example, with @posixfunc{strncpy}, can have the
 573 effect of truncating it in the middle of a multibyte character.  Such
 574 a string will, when output, have a garbled character at its end, often
 575 represented by a hollow box.
 576 @item
 577 @posixfunc{strchr} and @posixfunc{strrchr} do not work with multibyte strings
 578 if the locale encoding is GB18030 and the character to be searched is
 579 a digit.
 580 @item
 581 @posixfunc{strstr} does not work with multibyte strings if the locale encoding
 582 is different from UTF-8.
 583 @item
 584 @posixfunc{strcspn}, @posixfunc{strpbrk}, @posixfunc{strspn} cannot work
 585 correctly in multibyte locales: they assume the second argument is a list of
 586 single-byte characters.  Even in this simple case, they do not work with
 587 multibyte strings if the locale encoding is GB18030 and one of the
 588 characters to be searched is a digit.
 589 @item
 590 @posixfunc{strsep} and @posixfunc{strtok_r} do not work with multibyte strings
 591 unless all of the delimiter characters are ASCII characters < 0x30.
 592 @item
 593 The @posixfunc{strcasecmp}, @posixfunc{strncasecmp}, and @posixfunc{strcasestr}
 594 functions do not work with multibyte strings.
 595 @end itemize
 596
 597 The workarounds can be found in GNU gnulib
 598 @url{https://www.gnu.org/software/gnulib/}.
 599 @itemize @bullet
 600 @item
 601 gnulib has modules @samp{mbchar}, @samp{mbiter}, @samp{mbuiter} that
 602 represent multibyte characters and allow iterating across a multibyte
 603 string with the same ease as through a unibyte string.
 604 @item
 605 gnulib has functions @func{mbslen} and @func{mbswidth} that can be
 606 used instead of @posixfunc{strlen} when the number of characters or the
 607 number of screen columns of a string is requested.
 608 @item
 609 gnulib has functions @func{mbschr} and @func{mbsrrchr} that are
 610 like @posixfunc{strchr} and @posixfunc{strrchr}, but work in multibyte locales.
 611 @item
 612 gnulib has a function @func{mbsstr} that is like @posixfunc{strstr} but works
 613 in multibyte locales.
 614 @item
 615 gnulib has functions @func{mbscspn}, @func{mbspbrk}, @func{mbsspn}
 616 that are like @posixfunc{strcspn}, @posixfunc{strpbrk}, @posixfunc{strspn}, but
 617 work in multibyte locales.
 618 @item
 619 gnulib has functions @func{mbssep} and @func{mbstok_r} that are
 620 like @posixfunc{strsep} and @posixfunc{strtok_r} but work in multibyte locales.
 621 @item
 622 gnulib has functions @func{mbscasecmp}, @func{mbsncasecmp},
 623 @func{mbspcasecmp}, and @func{mbscasestr} that are like @posixfunc{strcasecmp},
 624 @posixfunc{strncasecmp}, and @posixfunc{strcasestr}, but
 625 work in multibyte locales.  Still, the function @code{ulc_casecmp} is
 626 preferable to these functions; see below.
 627 @end itemize
 628
 629 The second problem with the C library API is that it has some assumptions built-in that are not valid in some languages:
 630 @itemize @bullet
 631 @item
 632 It assumes that there are only two forms of every character: uppercase
 633 and lowercase.  This is not true for Croatian, where the character
 634 @sc{LETTER DZ WITH CARON} comes in three forms:
 635 @sc{LATIN CAPITAL LETTER DZ WITH CARON} (DZ),
 636 @sc{LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON} (Dz),
 637 @sc{LATIN SMALL LETTER DZ WITH CARON} (dz).
 638 @item
 639 It assumes that uppercasing of 1 character leads to 1 character.  This
 640 is not true for German, where the @sc{LATIN SMALL LETTER SHARP S}, when
 641 uppercased, becomes @samp{SS}.
 642 @item
 643 It assumes that there is a 1:1 mapping between uppercase and lowercase forms.
 644 This is not true for the Greek sigma: @sc{GREEK CAPITAL LETTER SIGMA} is
 645 the uppercase of both @sc{GREEK SMALL LETTER SIGMA} and
 646 @sc{GREEK SMALL LETTER FINAL SIGMA}.
 647 @item
 648 It assumes that the upper/lowercase mappings are position independent.
 649 This is not true for the Greek sigma and the Lithuanian i.
 650 @end itemize
 651
 652 The correct way to deal with this problem is
 653 @enumerate
 654 @item
 655 to provide functions for titlecasing, as well as for upper- and
 656 lowercasing,
 657 @item
 658 to view case transformations as functions that operate on strings,
 659 rather than on characters.
 660 @end enumerate
 661
 662 This is implemented in this library, through the functions declared in @code{<unicase.h>}, see @ref{unicase.h}.
 663
 664 @node Unicode strings
 665 @section Unicode strings
 666
 667 libunistring supports Unicode strings in three representations:
 668 @cindex UTF-8, strings
 669 @cindex UTF-16, strings
 670 @cindex UTF-32, strings
 671 @itemize @bullet
 672 @item
 673 UTF-8 strings, through the type @samp{uint8_t *}.  The units are bytes
 674 (@code{uint8_t}).
 675 @item
 676 UTF-16 strings, through the type @samp{uint16_t *}.  The units are 16-bit
 677 memory words (@code{uint16_t}).
 678 @item
 679 UTF-32 strings, through the type @samp{uint32_t *}.  The units are 32-bit
 680 memory words (@code{uint32_t}).
 681 @end itemize
 682
 683 As with C strings, there are two variants:
 684 @itemize @bullet
 685 @item
 686 Unicode strings with a terminating NUL character are represented as
 687 a pointer to the first unit of the string.  There is a unit containing
 688 a 0 value at the end.  It is considered part of the string for all
 689 memory allocation purposes, but is not considered part of the string
 690 for all other logical purposes.
 691 @item
 692 Unicode strings where embedded NUL characters are allowed.  These
 693 are represented by a pointer to the first unit and the number of units
 694 (not bytes!) of the string.  In this setting, there is no trailing
 695 zero-valued unit used as ``end marker''.
 696 @end itemize
 697
 698 @node Conventions
 699 @chapter Conventions
 700
 701 This chapter explains conventions valid throughout the libunistring library.
 702
 703 @cindex argument conventions
 704 Variables of type @code{char *} denote C strings in locale encoding.
 705 See @ref{Locale encodings}.
 706
 707 Variables of type @code{uint8_t *} denote UTF-8 strings.  Their units
 708 are bytes.
 709
 710 Variables of type @code{uint16_t *} denote UTF-16 strings, without byte
 711 order mark.  Their units are 2-byte words.
 712
 713 Variables of type @code{uint32_t *} denote UTF-32 strings, without byte
 714 order mark.  Their units are 4-byte words.
 715
 716 Argument pairs @code{(@var{s}, @var{n})} denote a string
 717 @code{@var{s}[0..@var{n}-1]} with exactly @var{n} units.
 718
 719 All functions with prefix @samp{ulc_} operate on C strings in locale
 720 encoding.
 721
 722 All functions with prefix @samp{u8_} operate on UTF-8 strings.
 723
 724 All functions with prefix @samp{u16_} operate on UTF-16 strings.
 725
 726 All functions with prefix @samp{u32_} operate on UTF-32 strings.
 727
 728 For every function with prefix @samp{u8_}, operating on UTF-8 strings,
 729 there is also a corresponding function with prefix @samp{u16_},
 730 operating on UTF-16 strings, and a corresponding function with prefix
 731 @samp{u32_}, operating on UTF-32 strings.  Their description is
 732 analogous; in this documentation we describe only the function that
 733 operates on UTF-8 strings, for brevity.
 734
 735 A declaration with a variable @var{n} denotes the three concrete
 736 declarations with @var{n} = 8, @var{n} = 16, @var{n} = 32.
 737
 738 All parameters starting with @samp{str} and the parameters of
 739 functions starting with @code{u8_str}/@code{u16_str}/@code{u32_str}
 740 denote a NUL terminated string.
 741
 742 @cindex return value conventions
 743 Error values are always returned through the @code{errno} variable,
 744 usually with a return value that indicates the presence of an error
 745 (NULL for functions that return a pointer, or -1 for functions that
 746 return an @code{int}).
 747
 748 Functions returning a string result take a
 749 @code{(@var{resultbuf}, @var{lengthp})}
 750 argument pair.  If @var{resultbuf} is not NULL and the result fits
 751 into @code{*@var{lengthp}} units, it is put in @var{resultbuf}, and
 752 @var{resultbuf} is returned.  Otherwise, a freshly allocated string
 753 is returned.  In both cases, @code{*@var{lengthp}} is set to the
 754 length (number of units) of the returned string.  In case of error,
 755 NULL is returned and @code{errno} is set.
 756
 757 @include unitypes.texi
 758 @include unistr.texi
 759 @include uniconv.texi
 760 @include unistdio.texi
 761 @include uniname.texi
 762 @include unictype.texi
 763 @include uniwidth.texi
 764 @include unigbrk.texi
 765 @include uniwbrk.texi
 766 @include unilbrk.texi
 767 @include uninorm.texi
 768 @include unicase.texi
 769 @include uniregex.texi
 770
 771 @node Using the library
 772 @chapter Using the library
 773
 774 This chapter explains some practical considerations, regarding the
 775 installation and compiler options that are needed in order to use this
 776 library.
 777
 778 @menu
 779 * Installation::
 780 * Compiler options::
 781 * Include files::
 782 * Autoconf macro::
 783 * Reporting problems::
 784 @end menu
 785
 786 @node Installation
 787 @section Installation
 788
 789 @cindex dependencies
 790 Before you can use the library, it must be installed.  First, you have to
 791 make sure all dependencies are installed.  They are listed in the file
 792 @file{DEPENDENCIES}.
 793
 794 @cindex installation
 795 Then you can proceed to build and install the library, as described in the
 796 file @file{INSTALL}.  For installation on Windows systems, please refer to
 797 the file @file{INSTALL.windows}.
 798
 799 @node Compiler options
 800 @section Compiler options
 801
 802 Let's denote as @code{LIBUNISTRING_PREFIX} the value of the @samp{--prefix}
 803 option that you passed to @code{configure} while installing this package.
 804 If you didn't pass any @samp{--prefix} option, then the package is installed
 805 in @file{/usr/local}.
 806
 807 Let's denote as @code{LIBUNISTRING_INCLUDEDIR} the directory where the
 808 include files were installed.  This is usually the same as
 809 @code{$@{LIBUNISTRING_PREFIX@}/include}, except that if you passed an
 810 @samp{--includedir} option to @code{configure}, it is the value of that
 811 option.
 812
 813 Let's further denote as @code{LIBUNISTRING_LIBDIR} the directory where
 814 the library itself was installed.  This is the value that you passed
 815 with the @samp{--libdir} option to @code{configure}, or otherwise the
 816 same as @code{$@{LIBUNISTRING_PREFIX@}/lib}.  Recall that when building
 817 in 64-bit mode on a 64-bit GNU/Linux system that supports executables
 818 in either 64-bit mode or 32-bit mode, you should have used the option
 819 @code{--libdir=$@{LIBUNISTRING_PREFIX@}/lib64}.
 820
 821 @cindex compiler options
 822 So that the compiler finds the include files, you have to pass it the
 823 option @code{-I$@{LIBUNISTRING_INCLUDEDIR@}}.
 824
 825 So that the compiler finds the library during its linking pass, you have
 826 to pass it the options @code{-L$@{LIBUNISTRING_LIBDIR@} -lunistring}.
 827 On some systems, in some configurations, you also have to pass options
 828 needed for linking with @code{libiconv}.  The autoconf macro
 829 @code{gl_LIBUNISTRING} (see @ref{Autoconf macro}) deals with this
 830 particularity.
 831
 832 @node Include files
 833 @section Include files
 834
 835 Most of the include files have been presented in the introduction, see
 836 @ref{Introduction}, and subsequent detailed chapters.
 837
 838 Another include file is @code{<unistring/version.h>}. It contains the
 839 version number of the libunistring library.
 840
 841 @deftypevr Macro int _LIBUNISTRING_VERSION
 842 This constant contains the version of libunistring that is being used
 843 at compile time.  It encodes the major and minor parts of the version
 844 number only.  These parts are encoded in the form @code{(major<<8) + minor}.
 845 @end deftypevr
 846
 847 @deftypevr Constant int _libunistring_version
 848 This constant contains the version of libunistring that is being used
 849 at run time.  It encodes the major and minor parts of the version
 850 number only.  These parts are encoded in the form @code{(major<<8) + minor}.
 851 @end deftypevr
 852
 853 It is possible that @code{_libunistring_version} is greater than
 854 @code{_LIBUNISTRING_VERSION}.  This can happen when you use
 855 @code{libunistring} as a shared library, and a newer, binary
 856 backward-compatible version has been installed after your program
 857 that uses @code{libunistring} was installed.
 858
 859 @node Autoconf macro
 860 @section Autoconf macro
 861
 862 @cindex autoconf macro
 863 GNU Gnulib provides an autoconf macro that tests for the availability
 864 of @code{libunistring}.  It is contained in the Gnulib module
 865 @samp{libunistring}, see@texnl{}
 866 @url{https://www.gnu.org/software/gnulib/MODULES.html#module=libunistring}.
 867
 868 @amindex gl_LIBUNISTRING
 869 The macro is called @code{gl_LIBUNISTRING}.  It searches for an installed
 870 libunistring.  If found, it sets and AC_SUBSTs @code{HAVE_LIBUNISTRING=yes}
 871 and the @code{LIBUNISTRING} and @code{LTLIBUNISTRING} variables and augments
 872 the @code{CPPFLAGS} variable, and defines the C macro
 873 @code{HAVE_LIBUNISTRING} to 1.  Otherwise, it sets and AC_SUBSTs
 874 @code{HAVE_LIBUNISTRING=no} and @code{LIBUNISTRING} and @code{LTLIBUNISTRING}
 875 to empty.
 876
 877 The complexities that @code{gl_LIBUNISTRING} deals with are the following:
 878
 879 @itemize @bullet
 880 @item
 881 On some operating systems, in some configurations, libunistring depends
 882 on @code{libiconv}, and the options for linking with libiconv must be
 883 mentioned explicitly on the link command line.
 884
 885 @item
 886 GNU @code{libunistring}, if installed, is not necessarily already in the
 887 search path (@code{CPPFLAGS} for the include file search path,
 888 @code{LDFLAGS} for the library search path).
 889
 890 @item
 891 GNU @code{libunistring}, if installed, is not necessarily already in the
 892 run time library search path.  To avoid the need for setting an environment
 893 variable like @code{LD_LIBRARY_PATH}, the macro adds the appropriate
 894 run time search path options to the @code{LIBUNISTRING} variable.  This works
 895 on most systems.
 896 @end itemize
 897
 898 @node Reporting problems
 899 @section Reporting problems
 900
 901 @cindex bug reports
 902 @cindex bug tracker
 903 @cindex mailing list
 904 If you encounter any problem, please don't hesitate to submit a detailed
 905 bug report either in the bug tracker at the project page
 906 @url{https://savannah.gnu.org/projects/libunistring}, or by email
 907 to the @code{bug-libunistring@@gnu.org} mailing list.
 908
 909 Please always include the version number of this library, and a short
 910 description of your operating system and compilation environment with
 911 corresponding version numbers.
 912
 913 For problems that appear while building and installing @code{libunistring},
 914 for which you don't find the remedy in the @file{INSTALL} file, please include
 915 a description of the options that you passed to the @samp{configure} script.
 916
 917 @node More functionality
 918 @chapter More advanced functionality
 919
 920 @cindex bidirectional reordering
 921 For bidirectional reordering of strings, we recommend the GNU FriBidi library:
 922 @url{http://www.fribidi.org/}.
 923
 924 @cindex rendering
 925 For the rendering of Unicode strings outside of the context of a given toolkit
 926 (KDE/Qt or GNOME/Gtk), we recommend the Pango library:
 927 @url{https://www.pango.org/}.
 928
 929 @include wchar_t.texi
 930
 931 @include char32_t.texi
 932
 933 @node Licenses
 934 @appendix Licenses
 935 @cindex Licenses
 936
 937 The files of this package are covered by the licenses indicated in each
 938 particular file or directory.  Here is a summary:
 939
 940 @itemize @bullet
 941 @item
 942 The @code{libunistring} library and its header files are dual-licensed under
 943 "the GNU LGPLv3+ or the GNU GPLv2+". This means that you can use it under either
 944 @itemize @bullet
 945 @item @minus{}
 946 the terms of the GNU Lesser General Public License (LGPL) version 3 or
 947 (at your option) any later version, or
 948 @item @minus{}
 949 the terms of the GNU General Public License (GPL) version 2 or
 950 (at your option) any later version, or
 951 @item @minus{}
 952 the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
 953 @end itemize
 954 You can find the GNU LGPL version 3 in @ref{GNU LGPL}.  This license is
 955 based on the GNU GPL version 3, see @ref{GNU GPL}.
 956 @*
 957 You can find the GNU GPL version 2 at
 958 @url{https://www.gnu.org/licenses/old-licenses/gpl-2.0.html}.
 959 @*
 960 Note: This dual license makes it possible for the @code{libunistring} library
 961 to be used by packages under GPLv2 or GPLv2+ licenses, in particular. See
 962 the table in @url{https://www.gnu.org/licenses/gpl-faq.html#AllCompatibility}.
 963
 964
 965 @item
 966 This manual is free documentation.  It is dually licensed under the
 967 GNU FDL and the GNU GPL.  This means that you can redistribute this
 968 manual under either of these two licenses, at your choice.
 969 @*
 970 This manual is covered by the GNU FDL.  Permission is granted to copy,
 971 distribute and/or modify this document under the terms of the
 972 GNU Free Documentation License (FDL), either version 1.2 of the
 973 License, or (at your option) any later version published by the
 974 Free Software Foundation (FSF); with no Invariant Sections, with no
 975 Front-Cover Text, and with no Back-Cover Texts.
 976 A copy of the license is included in @ref{GNU FDL}.
 977 @*
 978 This manual is covered by the GNU GPL.  You can redistribute it and/or
 979 modify it under the terms of the GNU General Public License (GPL), either
 980 version 3 of the License, or (at your option) any later version published
 981 by the Free Software Foundation (FSF).
 982 A copy of the license is included in @ref{GNU GPL}.
 983 @end itemize
 984
 985 @menu
 986 * GNU GPL::                     GNU General Public License
 987 * GNU LGPL::                    GNU Lesser General Public License
 988 * GNU FDL::                     GNU Free Documentation License
 989 @end menu
 990
 991 @page
 992 @node GNU GPL
 993 @appendixsec GNU GENERAL PUBLIC LICENSE
 994 @cindex GPL, GNU General Public License
 995 @cindex License, GNU GPL
 996 @include gpl.texi
 997 @page
 998 @node GNU LGPL
 999 @appendixsec GNU LESSER GENERAL PUBLIC LICENSE
1000 @cindex LGPL, GNU Lesser General Public License
1001 @cindex License, GNU LGPL
1002 @include lgpl.texi
1003 @page
1004 @node GNU FDL
1005 @appendixsec GNU Free Documentation License
1006 @cindex FDL, GNU Free Documentation License
1007 @cindex License, GNU FDL
1008 @include fdl.texi
1009
1010 @node Index
1011 @unnumbered Index
1012
1013 @printindex cp
1014
1015 @bye
1016
1017 @c Local Variables:
1018 @c indent-tabs-mode: nil
1019 @c whitespace-check-buffer-indent: nil
1020 @c End: