Towards normalization
[framework/uifw/harfbuzz.git] / src / hb-icu.cc
1 /*
2  * Copyright © 2009  Red Hat, Inc.
3  * Copyright © 2009  Keith Stribley
4  * Copyright © 2011  Google, Inc.
5  *
6  *  This is part of HarfBuzz, a text shaping library.
7  *
8  * Permission is hereby granted, without written agreement and without
9  * license or royalty fees, to use, copy, modify, and distribute this
10  * software and its documentation for any purpose, provided that the
11  * above copyright notice and the following two paragraphs appear in
12  * all copies of this software.
13  *
14  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
15  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
16  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
17  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
18  * DAMAGE.
19  *
20  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
22  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
23  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
24  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
25  *
26  * Red Hat Author(s): Behdad Esfahbod
27  * Google Author(s): Behdad Esfahbod
28  */
29
30 #include "hb-private.hh"
31
32 #include "hb-icu.h"
33
34 #include "hb-unicode-private.hh"
35
36 #include <unicode/uversion.h>
37 #include <unicode/uchar.h>
38 #include <unicode/unorm.h>
39 #include <unicode/unistr.h>
40
41 HB_BEGIN_DECLS
42
43
44 hb_script_t
45 hb_icu_script_to_script (UScriptCode script)
46 {
47   if (unlikely (script == USCRIPT_INVALID_CODE))
48     return HB_SCRIPT_INVALID;
49
50   return hb_script_from_string (uscript_getShortName (script));
51 }
52
53 UScriptCode
54 hb_icu_script_from_script (hb_script_t script)
55 {
56   if (unlikely (script == HB_SCRIPT_INVALID))
57     return USCRIPT_INVALID_CODE;
58
59   for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++)
60     if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script))
61       return (UScriptCode) i;
62
63   return USCRIPT_UNKNOWN;
64 }
65
66
67 static unsigned int
68 hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED,
69                                 hb_codepoint_t      unicode,
70                                 void               *user_data HB_UNUSED)
71
72 {
73   return u_getCombiningClass (unicode);
74 }
75
76 static unsigned int
77 hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED,
78                                 hb_codepoint_t      unicode,
79                                 void               *user_data HB_UNUSED)
80 {
81   switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH))
82   {
83   case U_EA_WIDE:
84   case U_EA_FULLWIDTH:
85     return 2;
86   case U_EA_NEUTRAL:
87   case U_EA_AMBIGUOUS:
88   case U_EA_HALFWIDTH:
89   case U_EA_NARROW:
90     return 1;
91   }
92   return 1;
93 }
94
95 static hb_unicode_general_category_t
96 hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED,
97                                  hb_codepoint_t      unicode,
98                                  void               *user_data HB_UNUSED)
99 {
100   switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY))
101   {
102   case U_UNASSIGNED:                    return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
103
104   case U_UPPERCASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER;
105   case U_LOWERCASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER;
106   case U_TITLECASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER;
107   case U_MODIFIER_LETTER:               return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER;
108   case U_OTHER_LETTER:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER;
109
110   case U_NON_SPACING_MARK:              return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK;
111   case U_ENCLOSING_MARK:                return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK;
112   case U_COMBINING_SPACING_MARK:        return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK;
113
114   case U_DECIMAL_DIGIT_NUMBER:          return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER;
115   case U_LETTER_NUMBER:                 return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER;
116   case U_OTHER_NUMBER:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER;
117
118   case U_SPACE_SEPARATOR:               return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR;
119   case U_LINE_SEPARATOR:                return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR;
120   case U_PARAGRAPH_SEPARATOR:           return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR;
121
122   case U_CONTROL_CHAR:                  return HB_UNICODE_GENERAL_CATEGORY_CONTROL;
123   case U_FORMAT_CHAR:                   return HB_UNICODE_GENERAL_CATEGORY_FORMAT;
124   case U_PRIVATE_USE_CHAR:              return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE;
125   case U_SURROGATE:                     return HB_UNICODE_GENERAL_CATEGORY_SURROGATE;
126
127
128   case U_DASH_PUNCTUATION:              return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION;
129   case U_START_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION;
130   case U_END_PUNCTUATION:               return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION;
131   case U_CONNECTOR_PUNCTUATION:         return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION;
132   case U_OTHER_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION;
133
134   case U_MATH_SYMBOL:                   return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL;
135   case U_CURRENCY_SYMBOL:               return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL;
136   case U_MODIFIER_SYMBOL:               return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL;
137   case U_OTHER_SYMBOL:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL;
138
139   case U_INITIAL_PUNCTUATION:           return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION;
140   case U_FINAL_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION;
141   }
142
143   return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
144 }
145
146 static hb_codepoint_t
147 hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED,
148                           hb_codepoint_t      unicode,
149                           void               *user_data HB_UNUSED)
150 {
151   return u_charMirror(unicode);
152 }
153
154 static hb_script_t
155 hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED,
156                        hb_codepoint_t      unicode,
157                        void               *user_data HB_UNUSED)
158 {
159   UErrorCode status = U_ZERO_ERROR;
160   UScriptCode scriptCode = uscript_getScript(unicode, &status);
161
162   if (unlikely (status != U_ZERO_ERROR))
163     return HB_SCRIPT_UNKNOWN;
164
165   return hb_icu_script_to_script (scriptCode);
166 }
167
168 static hb_bool_t
169 hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
170                         hb_codepoint_t      a,
171                         hb_codepoint_t      b,
172                         hb_codepoint_t     *ab,
173                         void               *user_data HB_UNUSED)
174 {
175   if (!a || !b)
176     return FALSE;
177
178   UChar utf16[4], normalized[5];
179   gint len;
180   hb_bool_t ret, err;
181   UErrorCode icu_err;
182
183   len = 0;
184   err = FALSE;
185   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err);
186   if (err) return FALSE;
187   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err);
188   if (err) return FALSE;
189
190   icu_err = U_ZERO_ERROR;
191   len = unorm_normalize (utf16, len, UNORM_NFC, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
192   if (icu_err)
193     return FALSE;
194   normalized[len] = 0;
195   if (u_strlen (normalized) == 1) {
196     U16_GET_UNSAFE (normalized, 0, *ab);
197     ret = TRUE;
198   } else {
199     ret = FALSE;
200   }
201
202   return ret;
203 }
204
205 static hb_bool_t
206 hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
207                           hb_codepoint_t      ab,
208                           hb_codepoint_t     *a,
209                           hb_codepoint_t     *b,
210                           void               *user_data HB_UNUSED)
211 {
212   UChar utf16[2], normalized[20];
213   gint len;
214   hb_bool_t ret, err;
215   UErrorCode icu_err;
216
217   len = 0;
218   err = FALSE;
219   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err);
220   if (err) return FALSE;
221
222   icu_err = U_ZERO_ERROR;
223   len = unorm_normalize (utf16, len, UNORM_NFD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
224   if (icu_err)
225     return FALSE;
226
227   normalized[len] = 0;
228   len = u_strlen (normalized);
229
230   if (len == 1) {
231     U16_GET_UNSAFE (normalized, 0, *a);
232     *b = 0;
233     ret = *a != ab;
234   } else if (len == 2) {
235     /* Here's the ugly part: if ab decomposes to a single character and
236      * that character decomposes again, we have to detect that and undo
237      * the second part :-(. */
238     UChar recomposed[20];
239     icu_err = U_ZERO_ERROR;
240     len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
241     if (icu_err)
242       return FALSE;
243     U16_GET_UNSAFE (recomposed, 0, *a);
244     if (*a != ab) {
245       *b = 0;
246     } else {
247       len =0;
248       U16_NEXT_UNSAFE (normalized, len, *a);
249       U16_GET_UNSAFE (normalized, len, *b);
250     }
251     ret = TRUE;
252   } else {
253     /* If decomposed to more than two characters, take the last one,
254      * and recompose the rest to get the first component. */
255     U16_PREV_UNSAFE (normalized, len, *b);
256     UChar recomposed[20];
257     icu_err = U_ZERO_ERROR;
258     len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
259     if (icu_err)
260       return FALSE;
261     /* We expect that recomposed has exactly one character now. */
262     U16_GET_UNSAFE (recomposed, 0, *a);
263     ret = TRUE;
264   }
265
266   return ret;
267 }
268
269 extern HB_INTERNAL hb_unicode_funcs_t _hb_unicode_funcs_icu;
270 hb_unicode_funcs_t _hb_icu_unicode_funcs = {
271   HB_OBJECT_HEADER_STATIC,
272
273   NULL, /* parent */
274   TRUE, /* immutable */
275   {
276 #define HB_UNICODE_FUNC_IMPLEMENT(name) hb_icu_unicode_##name,
277     HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS
278 #undef HB_UNICODE_FUNC_IMPLEMENT
279   }
280 };
281
282 hb_unicode_funcs_t *
283 hb_icu_get_unicode_funcs (void)
284 {
285   return &_hb_icu_unicode_funcs;
286 }
287
288
289 HB_END_DECLS