doc/uninorm.texi

   1 @node uninorm.h
   2 @chapter Normalization forms (composition and decomposition) @code{<uninorm.h>}
   3
   4 @cindex normal forms
   5 @cindex normalizing
   6 This include file defines functions for transforming Unicode strings to one
   7 of the four normal forms, known as NFC, NFD, NFKC, NFKD.  These
   8 transformations involve decomposition and --- for NFC and NFKC --- composition
   9 of Unicode characters.
  10
  11 @menu
  12 * Decomposition of characters::
  13 * Composition of characters::
  14 * Normalization of strings::
  15 * Normalizing comparisons::
  16 * Normalization of streams::
  17 @end menu
  18
  19 @node Decomposition of characters
  20 @section Decomposition of Unicode characters
  21
  22 @cindex decomposing
  23 The following enumerated values are the possible types of decomposition of a
  24 Unicode character.
  25
  26 @deftypevr Constant int UC_DECOMP_CANONICAL
  27 Denotes canonical decomposition.
  28 @end deftypevr
  29
  30 @deftypevr Constant int UC_DECOMP_FONT
  31 UCD marker: @code{<font>}.  Denotes a font variant (e.g.@: a blackletter form).
  32 @end deftypevr
  33
  34 @deftypevr Constant int UC_DECOMP_NOBREAK
  35 UCD marker: @code{<noBreak>}.
  36 Denotes a no-break version of a space or hyphen.
  37 @end deftypevr
  38
  39 @deftypevr Constant int UC_DECOMP_INITIAL
  40 UCD marker: @code{<initial>}.
  41 Denotes an initial presentation form (Arabic).
  42 @end deftypevr
  43
  44 @deftypevr Constant int UC_DECOMP_MEDIAL
  45 UCD marker: @code{<medial>}.
  46 Denotes a medial presentation form (Arabic).
  47 @end deftypevr
  48
  49 @deftypevr Constant int UC_DECOMP_FINAL
  50 UCD marker: @code{<final>}.
  51 Denotes a final presentation form (Arabic).
  52 @end deftypevr
  53
  54 @deftypevr Constant int UC_DECOMP_ISOLATED
  55 UCD marker: @code{<isolated>}.
  56 Denotes an isolated presentation form (Arabic).
  57 @end deftypevr
  58
  59 @deftypevr Constant int UC_DECOMP_CIRCLE
  60 UCD marker: @code{<circle>}.
  61 Denotes an encircled form.
  62 @end deftypevr
  63
  64 @deftypevr Constant int UC_DECOMP_SUPER
  65 UCD marker: @code{<super>}.
  66 Denotes a superscript form.
  67 @end deftypevr
  68
  69 @deftypevr Constant int UC_DECOMP_SUB
  70 UCD marker: @code{<sub>}.
  71 Denotes a subscript form.
  72 @end deftypevr
  73
  74 @deftypevr Constant int UC_DECOMP_VERTICAL
  75 UCD marker: @code{<vertical>}.
  76 Denotes a vertical layout presentation form.
  77 @end deftypevr
  78
  79 @deftypevr Constant int UC_DECOMP_WIDE
  80 UCD marker: @code{<wide>}.
  81 Denotes a wide (or zenkaku) compatibility character.
  82 @end deftypevr
  83
  84 @deftypevr Constant int UC_DECOMP_NARROW
  85 UCD marker: @code{<narrow>}.
  86 Denotes a narrow (or hankaku) compatibility character.
  87 @end deftypevr
  88
  89 @deftypevr Constant int UC_DECOMP_SMALL
  90 UCD marker: @code{<small>}.
  91 Denotes a small variant form (CNS compatibility).
  92 @end deftypevr
  93
  94 @deftypevr Constant int UC_DECOMP_SQUARE
  95 UCD marker: @code{<square>}.
  96 Denotes a CJK squared font variant.
  97 @end deftypevr
  98
  99 @deftypevr Constant int UC_DECOMP_FRACTION
 100 UCD marker: @code{<fraction>}.
 101 Denotes a vulgar fraction form.
 102 @end deftypevr
 103
 104 @deftypevr Constant int UC_DECOMP_COMPAT
 105 UCD marker: @code{<compat>}.
 106 Denotes an otherwise unspecified compatibility character.
 107 @end deftypevr
 108
 109 The following constant denotes the maximum size of decomposition of a single
 110 Unicode character.
 111
 112 @deftypevr Macro {unsigned int} UC_DECOMPOSITION_MAX_LENGTH
 113 This macro expands to a constant that is the required size of buffer passed to
 114 the @code{uc_decomposition} and @code{uc_canonical_decomposition} functions.
 115 @end deftypevr
 116
 117 The following functions decompose a Unicode character.
 118
 119 @deftypefun int uc_decomposition (ucs4_t @var{uc}, int *@var{decomp_tag}, ucs4_t *@var{decomposition})
 120 Returns the character decomposition mapping of the Unicode character @var{uc}.
 121 @var{decomposition} must point to an array of at least
 122 @code{UC_DECOMPOSITION_MAX_LENGTH} @code{ucs4_t} elements.
 123
 124 When a decomposition exists, @code{@var{decomposition}[0..@var{n}-1]} and
 125 @code{*@var{decomp_tag}} are filled and @var{n} is returned.  Otherwise -1 is
 126 returned.
 127 @end deftypefun
 128
 129 @deftypefun int uc_canonical_decomposition (ucs4_t @var{uc}, ucs4_t *@var{decomposition})
 130 Returns the canonical character decomposition mapping of the Unicode character
 131 @var{uc}.  @var{decomposition} must point to an array of at least
 132 @code{UC_DECOMPOSITION_MAX_LENGTH} @code{ucs4_t} elements.
 133
 134 When a decomposition exists, @code{@var{decomposition}[0..@var{n}-1]} is filled
 135 and @var{n} is returned.  Otherwise -1 is returned.
 136 @end deftypefun
 137
 138 @node Composition of characters
 139 @section Composition of Unicode characters
 140
 141 @cindex composing, Unicode characters
 142 @cindex combining, Unicode characters
 143 The following function composes a Unicode character from two Unicode
 144 characters.
 145
 146 @deftypefun ucs4_t uc_composition (ucs4_t @var{uc1}, ucs4_t @var{uc2})
 147 Attempts to combine the Unicode characters @var{uc1}, @var{uc2}.
 148 @var{uc1} is known to have canonical combining class 0.
 149
 150 Returns the combination of @var{uc1} and @var{uc2}, if it exists.
 151 Returns 0 otherwise.
 152
 153 Not all decompositions can be recombined using this function.  See the Unicode
 154 file @file{CompositionExclusions.txt} for details.
 155 @end deftypefun
 156
 157 @node Normalization of strings
 158 @section Normalization of strings
 159
 160 The Unicode standard defines four normalization forms for Unicode strings.
 161 The following type is used to denote a normalization form.
 162
 163 @deftp Type uninorm_t
 164 An object of type @code{uninorm_t} denotes a Unicode normalization form.
 165 This is a scalar type; its values can be compared with @code{==}.
 166 @end deftp
 167
 168 The following constants denote the four normalization forms.
 169
 170 @deftypevr Macro uninorm_t UNINORM_NFD
 171 Denotes Normalization form D: canonical decomposition.
 172 @end deftypevr
 173
 174 @deftypevr Macro uninorm_t UNINORM_NFC
 175 Normalization form C: canonical decomposition, then canonical composition.
 176 @end deftypevr
 177
 178 @deftypevr Macro uninorm_t UNINORM_NFKD
 179 Normalization form KD: compatibility decomposition.
 180 @end deftypevr
 181
 182 @deftypevr Macro uninorm_t UNINORM_NFKC
 183 Normalization form KC: compatibility decomposition, then canonical composition.
 184 @end deftypevr
 185
 186 The following functions operate on @code{uninorm_t} objects.
 187
 188 @deftypefun bool uninorm_is_compat_decomposing (uninorm_t @var{nf})
 189 Tests whether the normalization form @var{nf} does compatibility decomposition.
 190 @end deftypefun
 191
 192 @deftypefun bool uninorm_is_composing (uninorm_t @var{nf})
 193 Tests whether the normalization form @var{nf} includes canonical composition.
 194 @end deftypefun
 195
 196 @deftypefun uninorm_t uninorm_decomposing_form (uninorm_t @var{nf})
 197 Returns the decomposing variant of the normalization form @var{nf}.
 198 This maps NFC,NFD @arrow{} NFD and NFKC,NFKD @arrow{} NFKD.
 199 @end deftypefun
 200
 201 The following functions apply a Unicode normalization form to a Unicode string.
 202
 203 @deftypefun {uint8_t *} u8_normalize (uninorm_t @var{nf}, const uint8_t *@var{s}, size_t @var{n}, uint8_t *@var{resultbuf}, size_t *@var{lengthp})
 204 @deftypefunx {uint16_t *} u16_normalize (uninorm_t @var{nf}, const uint16_t *@var{s}, size_t @var{n}, uint16_t *@var{resultbuf}, size_t *@var{lengthp})
 205 @deftypefunx {uint32_t *} u32_normalize (uninorm_t @var{nf}, const uint32_t *@var{s}, size_t @var{n}, uint32_t *@var{resultbuf}, size_t *@var{lengthp})
 206 Returns the specified normalization form of a string.
 207 @end deftypefun
 208
 209 @node Normalizing comparisons
 210 @section Normalizing comparisons
 211
 212 @cindex comparing, ignoring normalization
 213 The following functions compare Unicode strings, ignoring differences in
 214 normalization.
 215
 216 @deftypefun int u8_normcmp (const uint8_t *@var{s1}, size_t @var{n1}, const uint8_t *@var{s2}, size_t @var{n2}, uninorm_t @var{nf}, int *@var{resultp})
 217 @deftypefunx int u16_normcmp (const uint16_t *@var{s1}, size_t @var{n1}, const uint16_t *@var{s2}, size_t @var{n2}, uninorm_t @var{nf}, int *@var{resultp})
 218 @deftypefunx int u32_normcmp (const uint32_t *@var{s1}, size_t @var{n1}, const uint32_t *@var{s2}, size_t @var{n2}, uninorm_t @var{nf}, int *@var{resultp})
 219 Compares @var{s1} and @var{s2}, ignoring differences in normalization.
 220
 221 @var{nf} must be either @code{UNINORM_NFD} or @code{UNINORM_NFKD}.
 222
 223 If successful, sets @code{*@var{resultp}} to -1 if @var{s1} < @var{s2},
 224 0 if @var{s1} = @var{s2}, 1 if @var{s1} > @var{s2}, and returns 0.
 225 Upon failure, returns -1 with @code{errno} set.
 226 @end deftypefun
 227
 228 @cindex comparing, ignoring normalization, with collation rules
 229 @cindex comparing, with collation rules, ignoring normalization
 230 @deftypefun {char *} u8_normxfrm (const uint8_t *@var{s}, size_t @var{n}, uninorm_t @var{nf}, char *@var{resultbuf}, size_t *@var{lengthp})
 231 @deftypefunx {char *} u16_normxfrm (const uint16_t *@var{s}, size_t @var{n}, uninorm_t @var{nf}, char *@var{resultbuf}, size_t *@var{lengthp})
 232 @deftypefunx {char *} u32_normxfrm (const uint32_t *@var{s}, size_t @var{n}, uninorm_t @var{nf}, char *@var{resultbuf}, size_t *@var{lengthp})
 233 Converts the string @var{s} of length @var{n} to a NUL-terminated byte
 234 sequence, in such a way that comparing @code{u8_normxfrm (@var{s1})} and
 235 @code{u8_normxfrm (@var{s2})} with the @code{u8_cmp2} function is equivalent to
 236 comparing @var{s1} and @var{s2} with the @code{u8_normcoll} function.
 237
 238 @var{nf} must be either @code{UNINORM_NFC} or @code{UNINORM_NFKC}.
 239 @end deftypefun
 240
 241 @deftypefun int u8_normcoll (const uint8_t *@var{s1}, size_t @var{n1}, const uint8_t *@var{s2}, size_t @var{n2}, uninorm_t @var{nf}, int *@var{resultp})
 242 @deftypefunx int u16_normcoll (const uint16_t *@var{s1}, size_t @var{n1}, const uint16_t *@var{s2}, size_t @var{n2}, uninorm_t @var{nf}, int *@var{resultp})
 243 @deftypefunx int u32_normcoll (const uint32_t *@var{s1}, size_t @var{n1}, const uint32_t *@var{s2}, size_t @var{n2}, uninorm_t @var{nf}, int *@var{resultp})
 244 Compares @var{s1} and @var{s2}, ignoring differences in normalization, using
 245 the collation rules of the current locale.
 246
 247 @var{nf} must be either @code{UNINORM_NFC} or @code{UNINORM_NFKC}.
 248
 249 If successful, sets @code{*@var{resultp}} to -1 if @var{s1} < @var{s2},
 250 0 if @var{s1} = @var{s2}, 1 if @var{s1} > @var{s2}, and returns 0.
 251 Upon failure, returns -1 with @code{errno} set.
 252 @end deftypefun
 253
 254 @node Normalization of streams
 255 @section Normalization of streams of Unicode characters
 256
 257 @cindex stream, normalizing a
 258 A ``stream of Unicode characters'' is essentially a function that accepts an
 259 @code{ucs4_t} argument repeatedly, optionally combined with a function that
 260 ``flushes'' the stream.
 261
 262 @deftp Type {struct uninorm_filter}
 263 This is the data type of a stream of Unicode characters that normalizes its
 264 input according to a given normalization form and passes the normalized
 265 character sequence to the encapsulated stream of Unicode characters.
 266 @end deftp
 267
 268 @deftypefun {struct uninorm_filter *} uninorm_filter_create (uninorm_t @var{nf}, int (*@var{stream_func}) (void *@var{stream_data}, ucs4_t @var{uc}), void *@var{stream_data})
 269 Creates and returns a normalization filter for Unicode characters.
 270
 271 The pair (@var{stream_func}, @var{stream_data}) is the encapsulated stream.
 272 @code{@var{stream_func} (@var{stream_data}, @var{uc})} receives the Unicode
 273 character @var{uc} and returns 0 if successful, or -1 with @code{errno} set
 274 upon failure.
 275
 276 Returns the new filter, or NULL with @code{errno} set upon failure.
 277 @end deftypefun
 278
 279 @deftypefun int uninorm_filter_write (struct uninorm_filter *@var{filter}, ucs4_t @var{uc})
 280 Stuffs a Unicode character into a normalizing filter.
 281 Returns 0 if successful, or -1 with @code{errno} set upon failure.
 282 @end deftypefun
 283
 284 @deftypefun int uninorm_filter_flush (struct uninorm_filter *@var{filter})
 285 Brings data buffered in the filter to its destination, the encapsulated stream.
 286
 287 Returns 0 if successful, or -1 with @code{errno} set upon failure.
 288
 289 Note! If after calling this function, additional characters are written
 290 into the filter, the resulting character sequence in the encapsulated stream
 291 will not necessarily be normalized.
 292 @end deftypefun
 293
 294 @deftypefun int uninorm_filter_free (struct uninorm_filter *@var{filter})
 295 Brings data buffered in the filter to its destination, the encapsulated stream,
 296 then closes and frees the filter.
 297
 298 Returns 0 if successful, or -1 with @code{errno} set upon failure.
 299 @end deftypefun