lib/uninorm.in.h

   1 /* Normalization forms (composition and decomposition) of Unicode strings.
   2    Copyright (C) 2001-2002, 2009-2014 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2009.
   4
   5    This program is free software: you can redistribute it and/or modify it
   6    under the terms of the GNU Lesser General Public License as published
   7    by the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #ifndef _UNINORM_H
  19 #define _UNINORM_H
  20
  21 /* Get LIBUNISTRING_DLL_VARIABLE.  */
  22 #include <unistring/woe32dll.h>
  23
  24 /* Get size_t.  */
  25 #include <stddef.h>
  26
  27 #include "unitypes.h"
  28
  29
  30 #ifdef __cplusplus
  31 extern "C" {
  32 #endif
  33
  34
  35 /* Conventions:
  36
  37    All functions prefixed with u8_ operate on UTF-8 encoded strings.
  38    Their unit is a uint8_t (1 byte).
  39
  40    All functions prefixed with u16_ operate on UTF-16 encoded strings.
  41    Their unit is a uint16_t (a 2-byte word).
  42
  43    All functions prefixed with u32_ operate on UCS-4 encoded strings.
  44    Their unit is a uint32_t (a 4-byte word).
  45
  46    All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
  47    n units.
  48
  49    Functions returning a string result take a (resultbuf, lengthp) argument
  50    pair.  If resultbuf is not NULL and the result fits into *lengthp units,
  51    it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
  52    allocated string is returned.  In both cases, *lengthp is set to the
  53    length (number of units) of the returned string.  In case of error,
  54    NULL is returned and errno is set.  */
  55
  56 /* Decomposition types (presumably what uc_decomposition stores in *DECOMP_TAG).  */
  57 enum
  58 {
  59   UC_DECOMP_CANONICAL,/*            Canonical decomposition.                  */
  60   UC_DECOMP_FONT,    /*   <font>    A font variant (e.g. a blackletter form). */
  61   UC_DECOMP_NOBREAK, /* <noBreak>   A no-break version of a space or hyphen.  */
  62   UC_DECOMP_INITIAL, /* <initial>   An initial presentation form (Arabic).    */
  63   UC_DECOMP_MEDIAL,  /*  <medial>   A medial presentation form (Arabic).      */
  64   UC_DECOMP_FINAL,   /*  <final>    A final presentation form (Arabic).       */
  65   UC_DECOMP_ISOLATED,/* <isolated>  An isolated presentation form (Arabic).   */
  66   UC_DECOMP_CIRCLE,  /*  <circle>   An encircled form.                        */
  67   UC_DECOMP_SUPER,   /*  <super>    A superscript form.                       */
  68   UC_DECOMP_SUB,     /*   <sub>     A subscript form.                         */
  69   UC_DECOMP_VERTICAL,/* <vertical>  A vertical layout presentation form.      */
  70   UC_DECOMP_WIDE,    /*   <wide>    A wide (or zenkaku) compatibility character. */
  71   UC_DECOMP_NARROW,  /*  <narrow>   A narrow (or hankaku) compatibility character. */
  72   UC_DECOMP_SMALL,   /*  <small>    A small variant form (CNS compatibility). */
  73   UC_DECOMP_SQUARE,  /*  <square>   A CJK squared font variant.               */
  74   UC_DECOMP_FRACTION,/* <fraction>  A vulgar fraction form.                   */
  75   UC_DECOMP_COMPAT   /*  <compat>   Otherwise unspecified compatibility character. */
  76 };
  77
  78 /* Maximum number of ucs4_t elements in the decomposition of one character.  */
  79 #define UC_DECOMPOSITION_MAX_LENGTH 32
  80
  81 /* Return the character decomposition mapping of the Unicode character UC.
  82    DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
  83    ucs4_t elements.
  84    When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
  85    filled and the length N is returned.  Otherwise -1 is returned.  */
  86 extern int
  87        uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
  88
  89 /* Return the canonical decomposition mapping of the Unicode character UC.
  90    DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
  91    ucs4_t elements.
  92    When a decomposition exists, DECOMPOSITION[0..N-1] is filled and the
  93    length N is returned.  Otherwise -1 is returned.  */
  94 extern int
  95        uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
  96
  97
  98 /* Attempt to combine the two Unicode characters uc1 and uc2.
  99    uc1 is assumed to have canonical combining class 0.
 100    Return the character that combines uc1 and uc2, if one exists;
 101    return 0 otherwise.
 102    Not all decompositions can be recombined using this function.  See the
 103    Unicode file CompositionExclusions.txt for details.  */
 104 extern ucs4_t
 105        uc_composition (ucs4_t uc1, ucs4_t uc2)
 106        _UC_ATTRIBUTE_CONST;
 107
 108
 109 /* An object of type uninorm_t denotes a Unicode normalization form.  */
 110 struct unicode_normalization_form;  /* Left incomplete in this header.  */
 111 typedef const struct unicode_normalization_form *uninorm_t;  /* Pointer to const.  */
 112
 113 /* UNINORM_NFD: Normalization form D: canonical decomposition only.  */
 114 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfd;
 115 #define UNINORM_NFD (&uninorm_nfd)  /* Pointer usable as a uninorm_t.  */
 116
 117 /* UNINORM_NFC: Normalization form C: canonical decomposition, followed by
 118    canonical composition.  */
 119 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfc;
 120 #define UNINORM_NFC (&uninorm_nfc)  /* Pointer usable as a uninorm_t.  */
 121
 122 /* UNINORM_NFKD: Normalization form KD: compatibility decomposition only.  */
 123 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkd;
 124 #define UNINORM_NFKD (&uninorm_nfkd)  /* Pointer usable as a uninorm_t.  */
 125
 126 /* UNINORM_NFKC: Normalization form KC: compatibility decomposition, followed
 127    by canonical composition.  */
 128 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkc;
 129 #define UNINORM_NFKC (&uninorm_nfkc)  /* Pointer usable as a uninorm_t.  */
 130
 131 /* Test whether the normalization form NF does compatibility decomposition.  */
 132 #define uninorm_is_compat_decomposing(nf) \
 133   ((* (const unsigned int *) (nf) >> 0) & 1)  /* Bit 0 of the unsigned int at NF.  */
 134
 135 /* Test whether the normalization form NF includes canonical composition.  */
 136 #define uninorm_is_composing(nf) \
 137   ((* (const unsigned int *) (nf) >> 1) & 1)  /* Bit 1 of the unsigned int at NF.  */
 138
 139 /* Return the decomposing variant of the normalization form NF.
 140    This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD.  */
 141 extern uninorm_t
 142        uninorm_decomposing_form (uninorm_t nf)
 143        _UC_ATTRIBUTE_PURE;
 144
 145 /* Return the normalization form NF of the string S of length N.
 146    The result is delivered through (RESULTBUF, LENGTHP); see Conventions.  */
 147 extern uint8_t *
 148        u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
 149                      uint8_t *resultbuf, size_t *lengthp);
 150 extern uint16_t *
 151        u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
 152                       uint16_t *resultbuf, size_t *lengthp);
 153 extern uint32_t *
 154        u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
 155                       uint32_t *resultbuf, size_t *lengthp);
 156
 157
 158 /* Compare S1 (length N1) and S2 (length N2), ignoring differences in
 159    normalization.  NF must be either UNINORM_NFD or UNINORM_NFKD.
 160    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
 161    return 0.  Upon failure, return -1 with errno set.  */
 162 extern int
 163        u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
 164                    uninorm_t nf, int *resultp);
 165 extern int
 166        u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
 167                     uninorm_t nf, int *resultp);
 168 extern int
 169        u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
 170                     uninorm_t nf, int *resultp);
 171
 172 /* Convert the string S of length N to a NUL-terminated byte sequence, in such
 173    a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
 174    equivalent to comparing S1 and S2 with uN_normcoll().
 175    NF must be either UNINORM_NFC or UNINORM_NFKC.
 176    The result is delivered through (RESULTBUF, LENGTHP); see Conventions.  */
 177 extern char *
 178        u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
 179                     char *resultbuf, size_t *lengthp);
 180 extern char *
 181        u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
 182                      char *resultbuf, size_t *lengthp);
 183 extern char *
 184        u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
 185                      char *resultbuf, size_t *lengthp);
 186
 187
 188 /* Compare S1 (length N1) and S2 (length N2), ignoring differences in
 189    normalization, using the collation rules of the current locale.
 190    NF must be either UNINORM_NFC or UNINORM_NFKC.
 191    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
 192    return 0.  Upon failure, return -1 with errno set.  */
 193 extern int
 194        u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
 195                     uninorm_t nf, int *resultp);
 196 extern int
 197        u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
 198                      uninorm_t nf, int *resultp);
 199 extern int
 200        u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
 201                      uninorm_t nf, int *resultp);
 202
 203
 204 /* Normalization of a stream of Unicode characters.
 205
 206    A "stream of Unicode characters" is essentially a function that accepts a
 207    ucs4_t argument repeatedly, optionally combined with a function that
 208    "flushes" the stream.  */
 209
 210 /* Data type of a stream of Unicode characters that normalizes its input
 211    according to a given normalization form and passes the normalized character
 212    sequence to the encapsulated stream of Unicode characters.  */
 213 struct uninorm_filter;  /* Left incomplete in this header.  */
 214
 215 /* Create and return a filter that normalizes Unicode characters to form NF.
 216    The pair (stream_func, stream_data) is the encapsulated stream:
 217    stream_func (stream_data, uc) receives the Unicode character uc
 218    and returns 0 if successful, or -1 with errno set upon failure.
 219    Return the new filter, or NULL with errno set upon failure.  */
 220 extern struct uninorm_filter *
 221        uninorm_filter_create (uninorm_t nf,
 222                               int (*stream_func) (void *stream_data, ucs4_t uc),
 223                               void *stream_data);
 224
 225 /* Feed the Unicode character UC into the normalizing filter FILTER.
 226    Return 0 if successful, or -1 with errno set upon failure.  */
 227 extern int
 228        uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
 229
 230 /* Deliver the data buffered in FILTER to its destination, the encapsulated
 231    stream.
 232    Return 0 if successful, or -1 with errno set upon failure.
 233    Note: if additional characters are written into the filter after this
 234    call, the resulting character sequence in the encapsulated stream
 235    will not necessarily be normalized.  */
 236 extern int
 237        uninorm_filter_flush (struct uninorm_filter *filter);
 238
 239 /* Deliver the data buffered in FILTER to its destination, the encapsulated
 240    stream, then close and free FILTER.
 241    Return 0 if successful, or -1 with errno set upon failure.  */
 242 extern int
 243        uninorm_filter_free (struct uninorm_filter *filter);
 244
 245
 246 #ifdef __cplusplus
 247 }
 248 #endif
 249
 250
 251 #endif /* _UNINORM_H */