lib/uninorm.h

   1 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
   2 /* Normalization forms (composition and decomposition) of Unicode strings.
   3    Copyright (C) 2001-2002, 2009-2014 Free Software Foundation, Inc.
   4    Written by Bruno Haible <bruno@clisp.org>, 2009.
   5
   6    This program is free software: you can redistribute it and/or modify it
   7    under the terms of the GNU Lesser General Public License as published
   8    by the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifndef _UNINORM_H
  20 #define _UNINORM_H
  21
  22 /* Get LIBUNISTRING_DLL_VARIABLE.  */
  23 #include <unistring/woe32dll.h>
  24
  25 /* Get size_t.  */
  26 #include <stddef.h>
  27
  28 #include "unitypes.h"
  29
  30
  31 #ifdef __cplusplus
  32 extern "C" {
  33 #endif
  34
  35
  36 /* Conventions:
  37
  38    All functions prefixed with u8_ operate on UTF-8 encoded strings.
  39    Their unit is an uint8_t (1 byte).
  40
  41    All functions prefixed with u16_ operate on UTF-16 encoded strings.
  42    Their unit is an uint16_t (a 2-byte word).
  43
  44    All functions prefixed with u32_ operate on UCS-4 encoded strings.
  45    Their unit is an uint32_t (a 4-byte word).
  46
  47    All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
  48    n units.
  49
  50    Functions returning a string result take a (resultbuf, lengthp) argument
  51    pair.  If resultbuf is not NULL and the result fits into *lengthp units,
  52    it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
  53    allocated string is returned.  In both cases, *lengthp is set to the
  54    length (number of units) of the returned string.  In case of error,
  55    NULL is returned and errno is set.  */
  56
  57 /* Decomposition types; presumably the values that uc_decomposition stores in *DECOMP_TAG (TODO confirm).  */
  58 enum
  59 {
  60   UC_DECOMP_CANONICAL,/*            Canonical decomposition.                  */
  61   UC_DECOMP_FONT,    /*   <font>    A font variant (e.g. a blackletter form). */
  62   UC_DECOMP_NOBREAK, /* <noBreak>   A no-break version of a space or hyphen.  */
  63   UC_DECOMP_INITIAL, /* <initial>   An initial presentation form (Arabic).    */
  64   UC_DECOMP_MEDIAL,  /*  <medial>   A medial presentation form (Arabic).      */
  65   UC_DECOMP_FINAL,   /*  <final>    A final presentation form (Arabic).       */
  66   UC_DECOMP_ISOLATED,/* <isolated>  An isolated presentation form (Arabic).   */
  67   UC_DECOMP_CIRCLE,  /*  <circle>   An encircled form.                        */
  68   UC_DECOMP_SUPER,   /*  <super>    A superscript form.                       */
  69   UC_DECOMP_SUB,     /*   <sub>     A subscript form.                         */
  70   UC_DECOMP_VERTICAL,/* <vertical>  A vertical layout presentation form.      */
  71   UC_DECOMP_WIDE,    /*   <wide>    A wide (or zenkaku) compatibility character. */
  72   UC_DECOMP_NARROW,  /*  <narrow>   A narrow (or hankaku) compatibility character. */
  73   UC_DECOMP_SMALL,   /*  <small>    A small variant form (CNS compatibility). */
  74   UC_DECOMP_SQUARE,  /*  <square>   A CJK squared font variant.               */
  75   UC_DECOMP_FRACTION,/* <fraction>  A vulgar fraction form.                   */
  76   UC_DECOMP_COMPAT   /*  <compat>   Otherwise unspecified compatibility character. */
  77 };
  78
  79 /* Maximum size of decomposition of a single Unicode character, in ucs4_t elements.  */
  80 #define UC_DECOMPOSITION_MAX_LENGTH 32
  81
  82 /* Return the character decomposition mapping of a Unicode character.
  83    DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
  84    ucs4_t elements.
  85    When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
  86    filled and N is returned.  Otherwise -1 is returned.  */
  87 extern int
  88        uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
  89
  90 /* Return the canonical character decomposition mapping of a Unicode character.
  91    DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
  92    ucs4_t elements.
  93    When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
  94    returned.  Otherwise -1 is returned.  */
  95 extern int
  96        uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
  97
  98
  99 /* Attempt to combine the Unicode characters uc1, uc2.
 100    uc1 is known to have canonical combining class 0.
 101    Return the combination of uc1 and uc2, if it exists.
 102    Return 0 otherwise.
 103    Not all decompositions can be recombined using this function.  See the
 104    Unicode file CompositionExclusions.txt for details.  */
 105 extern ucs4_t
 106        uc_composition (ucs4_t uc1, ucs4_t uc2)
 107        _UC_ATTRIBUTE_CONST;
 108
 109
 110 /* An object of type uninorm_t denotes a Unicode normalization form.  */
 111 struct unicode_normalization_form;
 112 typedef const struct unicode_normalization_form *uninorm_t;
 113
 114 /* UNINORM_NFD: Normalization form D: canonical decomposition.  */
 115 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfd;
 116 #define UNINORM_NFD (&uninorm_nfd)
 117
 118 /* UNINORM_NFC: Normalization form C: canonical decomposition, then
 119    canonical composition.  */
 120 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfc;
 121 #define UNINORM_NFC (&uninorm_nfc)
 122
 123 /* UNINORM_NFKD: Normalization form KD: compatibility decomposition.  */
 124 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkd;
 125 #define UNINORM_NFKD (&uninorm_nfkd)
 126
 127 /* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
 128    canonical composition.  */
 129 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkc;
 130 #define UNINORM_NFKC (&uninorm_nfkc)
 131
 132 /* Test whether a normalization form does compatibility decomposition.  NF is evaluated once; the result is 0 or 1.  */
 133 #define uninorm_is_compat_decomposing(nf) \
 134   ((* (const unsigned int *) (nf) >> 0) & 1)  /* bit 0 of the unsigned int read at NF; NOTE(review): presumably the struct's first member - confirm */
 135
 136 /* Test whether a normalization form includes canonical composition.  NF is evaluated once; the result is 0 or 1.  */
 137 #define uninorm_is_composing(nf) \
 138   ((* (const unsigned int *) (nf) >> 1) & 1)  /* bit 1 of the unsigned int read at NF; NOTE(review): presumably the struct's first member - confirm */
 139
 140 /* Return the decomposing variant of a normalization form.
 141    This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD.  */
 142 extern uninorm_t
 143        uninorm_decomposing_form (uninorm_t nf)
 144        _UC_ATTRIBUTE_PURE;
 145
 146
 147 /* Return the normalization form NF of the string S[0..N-1].  The result is delivered through (resultbuf, lengthp) as described in the conventions above.  */
 148 extern uint8_t *
 149        u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
 150                      uint8_t *resultbuf, size_t *lengthp);
 151 extern uint16_t *
 152        u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
 153                       uint16_t *resultbuf, size_t *lengthp);
 154 extern uint32_t *
 155        u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
 156                       uint32_t *resultbuf, size_t *lengthp);
 157
 158
 159 /* Compare S1 and S2, ignoring differences in normalization.
 160    NF must be either UNINORM_NFD or UNINORM_NFKD.
 161    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
 162    return 0.  Upon failure, return -1 with errno set.  */
 163 extern int
 164        u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
 165                    uninorm_t nf, int *resultp);
 166 extern int
 167        u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
 168                     uninorm_t nf, int *resultp);
 169 extern int
 170        u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
 171                     uninorm_t nf, int *resultp);
 172
 173
 174 /* Convert the string S of length N to a NUL-terminated byte sequence, in such
 175    a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
 176    equivalent to comparing S1 and S2 with uN_normcoll().
 177    NF must be either UNINORM_NFC or UNINORM_NFKC.  The result is delivered through (resultbuf, lengthp) as described in the conventions above.  */
 178 extern char *
 179        u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
 180                     char *resultbuf, size_t *lengthp);
 181 extern char *
 182        u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
 183                      char *resultbuf, size_t *lengthp);
 184 extern char *
 185        u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
 186                      char *resultbuf, size_t *lengthp);
 187
 188
 189 /* Compare S1 and S2, ignoring differences in normalization, using the
 190    collation rules of the current locale.
 191    NF must be either UNINORM_NFC or UNINORM_NFKC.
 192    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
 193    return 0.  Upon failure, return -1 with errno set.  */
 194 extern int
 195        u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
 196                     uninorm_t nf, int *resultp);
 197 extern int
 198        u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
 199                      uninorm_t nf, int *resultp);
 200 extern int
 201        u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
 202                      uninorm_t nf, int *resultp);
 203
 204
 205 /* Normalization of a stream of Unicode characters.
 206
 207    A "stream of Unicode characters" is essentially a function that accepts an
 208    ucs4_t argument repeatedly, optionally combined with a function that
 209    "flushes" the stream.  */
 210
 211 /* Data type of a stream of Unicode characters that normalizes its input
 212    according to a given normalization form and passes the normalized character
 213    sequence to the encapsulated stream of Unicode characters.  */
 214 struct uninorm_filter;
 215
 216 /* Create and return a normalization filter for Unicode characters.
 217    The pair (stream_func, stream_data) is the encapsulated stream.
 218    stream_func (stream_data, uc) receives the Unicode character uc
 219    and returns 0 if successful, or -1 with errno set upon failure.
 220    Return the new filter, or NULL with errno set upon failure.  */
 221 extern struct uninorm_filter *
 222        uninorm_filter_create (uninorm_t nf,
 223                               int (*stream_func) (void *stream_data, ucs4_t uc),
 224                               void *stream_data);
 225
 226 /* Stuff a Unicode character into a normalizing filter.
 227    Return 0 if successful, or -1 with errno set upon failure.  */
 228 extern int
 229        uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
 230
 231 /* Bring data buffered in the filter to its destination, the encapsulated
 232    stream.
 233    Return 0 if successful, or -1 with errno set upon failure.
 234    Note! If after calling this function, additional characters are written
 235    into the filter, the resulting character sequence in the encapsulated stream
 236    will not necessarily be normalized.  */
 237 extern int
 238        uninorm_filter_flush (struct uninorm_filter *filter);
 239
 240 /* Bring data buffered in the filter to its destination, the encapsulated
 241    stream, then close and free the filter.
 242    Return 0 if successful, or -1 with errno set upon failure.  */
 243 extern int
 244        uninorm_filter_free (struct uninorm_filter *filter);
 245
 246
 247 #ifdef __cplusplus
 248 }
 249 #endif
 250
 251
 252 #endif /* _UNINORM_H */