Always allocate new ligature id
[framework/uifw/harfbuzz.git] / src / hb-icu.c
1 /*
2  * Copyright (C) 2009  Red Hat, Inc.
3  * Copyright (C) 2009  Keith Stribley <devel@thanlwinsoft.org>
4  *
5  *  This is part of HarfBuzz, a text shaping library.
6  *
7  * Permission is hereby granted, without written agreement and without
8  * license or royalty fees, to use, copy, modify, and distribute this
9  * software and its documentation for any purpose, provided that the
10  * above copyright notice and the following two paragraphs appear in
11  * all copies of this software.
12  *
13  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
14  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
15  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
16  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
17  * DAMAGE.
18  *
19  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
20  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
21  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
22  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
23  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
24  *
25  * Red Hat Author(s): Behdad Esfahbod
26  */
27
28 #include "hb-private.h"
29
30 #include "hb-icu.h"
31
32 #include "hb-unicode-private.h"
33
34 #include <unicode/uversion.h>
35 #include <unicode/uchar.h>
36 #include <unicode/uscript.h>
37
38 HB_BEGIN_DECLS
39
40
41 static hb_codepoint_t hb_icu_get_mirroring (hb_codepoint_t unicode) { return u_charMirror(unicode); }
42 static unsigned int hb_icu_get_combining_class (hb_codepoint_t unicode) { return u_getCombiningClass (unicode); }
43
44 static unsigned int
45 hb_icu_get_eastasian_width (hb_codepoint_t unicode)
46 {
47   switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH))
48   {
49   case U_EA_WIDE:
50   case U_EA_FULLWIDTH:
51     return 2;
52   case U_EA_NEUTRAL:
53   case U_EA_AMBIGUOUS:
54   case U_EA_HALFWIDTH:
55   case U_EA_NARROW:
56     return 1;
57   }
58   return 1;
59 }
60
61 static hb_category_t
62 hb_icu_get_general_category (hb_codepoint_t unicode)
63 {
64   switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY))
65   {
66   case U_UNASSIGNED:                    return HB_CATEGORY_UNASSIGNED;
67
68   case U_UPPERCASE_LETTER:              return HB_CATEGORY_UPPERCASE_LETTER;    /* Lu */
69   case U_LOWERCASE_LETTER:              return HB_CATEGORY_LOWERCASE_LETTER;    /* Ll */
70   case U_TITLECASE_LETTER:              return HB_CATEGORY_TITLECASE_LETTER;    /* Lt */
71   case U_MODIFIER_LETTER:               return HB_CATEGORY_MODIFIER_LETTER;     /* Lm */
72   case U_OTHER_LETTER:                  return HB_CATEGORY_OTHER_LETTER;        /* Lo */
73
74   case U_NON_SPACING_MARK:              return HB_CATEGORY_NON_SPACING_MARK;    /* Mn */
75   case U_ENCLOSING_MARK:                return HB_CATEGORY_ENCLOSING_MARK;      /* Me */
76   case U_COMBINING_SPACING_MARK:        return HB_CATEGORY_COMBINING_MARK;      /* Mc */
77
78   case U_DECIMAL_DIGIT_NUMBER:          return HB_CATEGORY_DECIMAL_NUMBER;      /* Nd */
79   case U_LETTER_NUMBER:                 return HB_CATEGORY_LETTER_NUMBER;       /* Nl */
80   case U_OTHER_NUMBER:                  return HB_CATEGORY_OTHER_NUMBER;        /* No */
81
82   case U_SPACE_SEPARATOR:               return HB_CATEGORY_SPACE_SEPARATOR;     /* Zs */
83   case U_LINE_SEPARATOR:                return HB_CATEGORY_LINE_SEPARATOR;      /* Zl */
84   case U_PARAGRAPH_SEPARATOR:           return HB_CATEGORY_PARAGRAPH_SEPARATOR; /* Zp */
85
86   case U_CONTROL_CHAR:                  return HB_CATEGORY_CONTROL;             /* Cc */
87   case U_FORMAT_CHAR:                   return HB_CATEGORY_FORMAT;              /* Cf */
88   case U_PRIVATE_USE_CHAR:              return HB_CATEGORY_PRIVATE_USE;         /* Co */
89   case U_SURROGATE:                     return HB_CATEGORY_SURROGATE;           /* Cs */
90
91
92   case U_DASH_PUNCTUATION:              return HB_CATEGORY_DASH_PUNCTUATION;    /* Pd */
93   case U_START_PUNCTUATION:             return HB_CATEGORY_OPEN_PUNCTUATION;    /* Ps */
94   case U_END_PUNCTUATION:               return HB_CATEGORY_CLOSE_PUNCTUATION;   /* Pe */
95   case U_CONNECTOR_PUNCTUATION:         return HB_CATEGORY_CONNECT_PUNCTUATION; /* Pc */
96   case U_OTHER_PUNCTUATION:             return HB_CATEGORY_OTHER_PUNCTUATION;   /* Po */
97
98   case U_MATH_SYMBOL:                   return HB_CATEGORY_MATH_SYMBOL;         /* Sm */
99   case U_CURRENCY_SYMBOL:               return HB_CATEGORY_CURRENCY_SYMBOL;     /* Sc */
100   case U_MODIFIER_SYMBOL:               return HB_CATEGORY_MODIFIER_SYMBOL;     /* Sk */
101   case U_OTHER_SYMBOL:                  return HB_CATEGORY_OTHER_SYMBOL;        /* So */
102
103   case U_INITIAL_PUNCTUATION:           return HB_CATEGORY_INITIAL_PUNCTUATION; /* Pi */
104   case U_FINAL_PUNCTUATION:             return HB_CATEGORY_FINAL_PUNCTUATION;   /* Pf */
105   }
106
107   return HB_CATEGORY_UNASSIGNED;
108 }
109
110 static hb_script_t
111 hb_icu_get_script (hb_codepoint_t unicode)
112 {
113   UErrorCode status = U_ZERO_ERROR;
114   UScriptCode scriptCode = uscript_getScript(unicode, &status);
115   switch ((int) scriptCode)
116   {
117 #define CHECK_ICU_VERSION(major, minor) \
118         U_ICU_VERSION_MAJOR_NUM > (major) || (U_ICU_VERSION_MAJOR_NUM == (major) && U_ICU_VERSION_MINOR_NUM >= (minor))
119 #define MATCH_SCRIPT(C) case USCRIPT_##C: return HB_SCRIPT_##C
120 #define MATCH_SCRIPT2(C1, C2) case USCRIPT_##C1: return HB_SCRIPT_##C2
121   MATCH_SCRIPT (INVALID_CODE);
122   MATCH_SCRIPT (COMMON);             /* Zyyy */
123   MATCH_SCRIPT (INHERITED);          /* Qaai */
124   MATCH_SCRIPT (ARABIC);             /* Arab */
125   MATCH_SCRIPT (ARMENIAN);           /* Armn */
126   MATCH_SCRIPT (BENGALI);            /* Beng */
127   MATCH_SCRIPT (BOPOMOFO);           /* Bopo */
128   MATCH_SCRIPT (CHEROKEE);           /* Cher */
129   MATCH_SCRIPT (COPTIC);             /* Qaac */
130   MATCH_SCRIPT (CYRILLIC);           /* Cyrl (Cyrs) */
131   MATCH_SCRIPT (DESERET);            /* Dsrt */
132   MATCH_SCRIPT (DEVANAGARI);         /* Deva */
133   MATCH_SCRIPT (ETHIOPIC);           /* Ethi */
134   MATCH_SCRIPT (GEORGIAN);           /* Geor (Geon); Geoa) */
135   MATCH_SCRIPT (GOTHIC);             /* Goth */
136   MATCH_SCRIPT (GREEK);              /* Grek */
137   MATCH_SCRIPT (GUJARATI);           /* Gujr */
138   MATCH_SCRIPT (GURMUKHI);           /* Guru */
139   MATCH_SCRIPT (HAN);                /* Hani */
140   MATCH_SCRIPT (HANGUL);             /* Hang */
141   MATCH_SCRIPT (HEBREW);             /* Hebr */
142   MATCH_SCRIPT (HIRAGANA);           /* Hira */
143   MATCH_SCRIPT (KANNADA);            /* Knda */
144   MATCH_SCRIPT (KATAKANA);           /* Kana */
145   MATCH_SCRIPT (KHMER);              /* Khmr */
146   MATCH_SCRIPT (LAO);                /* Laoo */
147   MATCH_SCRIPT (LATIN);              /* Latn (Latf); Latg) */
148   MATCH_SCRIPT (MALAYALAM);          /* Mlym */
149   MATCH_SCRIPT (MONGOLIAN);          /* Mong */
150   MATCH_SCRIPT (MYANMAR);            /* Mymr */
151   MATCH_SCRIPT (OGHAM);              /* Ogam */
152   MATCH_SCRIPT (OLD_ITALIC);         /* Ital */
153   MATCH_SCRIPT (ORIYA);              /* Orya */
154   MATCH_SCRIPT (RUNIC);              /* Runr */
155   MATCH_SCRIPT (SINHALA);            /* Sinh */
156   MATCH_SCRIPT (SYRIAC);             /* Syrc (Syrj, Syrn); Syre) */
157   MATCH_SCRIPT (TAMIL);              /* Taml */
158   MATCH_SCRIPT (TELUGU);             /* Telu */
159   MATCH_SCRIPT (THAANA);             /* Thaa */
160   MATCH_SCRIPT (THAI);               /* Thai */
161   MATCH_SCRIPT (TIBETAN);            /* Tibt */
162   MATCH_SCRIPT (CANADIAN_ABORIGINAL);/* Cans */
163   MATCH_SCRIPT (YI);                 /* Yiii */
164   MATCH_SCRIPT (TAGALOG);            /* Tglg */
165   MATCH_SCRIPT (HANUNOO);            /* Hano */
166   MATCH_SCRIPT (BUHID);              /* Buhd */
167   MATCH_SCRIPT (TAGBANWA);           /* Tagb */
168
169   /* Unicode-4.0 additions */
170   MATCH_SCRIPT (BRAILLE);            /* Brai */
171   MATCH_SCRIPT (CYPRIOT);            /* Cprt */
172   MATCH_SCRIPT (LIMBU);              /* Limb */
173   MATCH_SCRIPT (OSMANYA);            /* Osma */
174   MATCH_SCRIPT (SHAVIAN);            /* Shaw */
175   MATCH_SCRIPT (LINEAR_B);           /* Linb */
176   MATCH_SCRIPT (TAI_LE);             /* Tale */
177   MATCH_SCRIPT (UGARITIC);           /* Ugar */
178
179   /* Unicode-4.1 additions */
180   MATCH_SCRIPT (NEW_TAI_LUE);        /* Talu */
181   MATCH_SCRIPT (BUGINESE);           /* Bugi */
182   MATCH_SCRIPT (GLAGOLITIC);         /* Glag */
183   MATCH_SCRIPT (TIFINAGH);           /* Tfng */
184   MATCH_SCRIPT (SYLOTI_NAGRI);       /* Sylo */
185   MATCH_SCRIPT (OLD_PERSIAN);        /* Xpeo */
186   MATCH_SCRIPT (KHAROSHTHI);         /* Khar */
187
188   /* Unicode-5.0 additions */
189   MATCH_SCRIPT (UNKNOWN);            /* Zzzz */
190   MATCH_SCRIPT (BALINESE);           /* Bali */
191   MATCH_SCRIPT (CUNEIFORM);          /* Xsux */
192   MATCH_SCRIPT (PHOENICIAN);         /* Phnx */
193   MATCH_SCRIPT (PHAGS_PA);           /* Phag */
194   MATCH_SCRIPT (NKO);                /* Nkoo */
195
196   /* Unicode-5.1 additions */
197   MATCH_SCRIPT (KAYAH_LI);           /* Kali */
198   MATCH_SCRIPT (LEPCHA);             /* Lepc */
199   MATCH_SCRIPT (REJANG);             /* Rjng */
200   MATCH_SCRIPT (SUNDANESE);          /* Sund */
201   MATCH_SCRIPT (SAURASHTRA);         /* Saur */
202   MATCH_SCRIPT (CHAM);               /* Cham */
203   MATCH_SCRIPT (OL_CHIKI);           /* Olck */
204   MATCH_SCRIPT (VAI);                /* Vaii */
205   MATCH_SCRIPT (CARIAN);             /* Cari */
206   MATCH_SCRIPT (LYCIAN);             /* Lyci */
207   MATCH_SCRIPT (LYDIAN);             /* Lydi */
208
209   /* Unicode-5.2 additions */
210   MATCH_SCRIPT (AVESTAN);                /* Avst */
211 #if CHECK_ICU_VERSION (4, 4)
212   MATCH_SCRIPT (BAMUM);                  /* Bamu */
213 #endif
214   MATCH_SCRIPT (EGYPTIAN_HIEROGLYPHS);   /* Egyp */
215   MATCH_SCRIPT (IMPERIAL_ARAMAIC);       /* Armi */
216   MATCH_SCRIPT (INSCRIPTIONAL_PAHLAVI);  /* Phli */
217   MATCH_SCRIPT (INSCRIPTIONAL_PARTHIAN); /* Prti */
218   MATCH_SCRIPT (JAVANESE);               /* Java */
219   MATCH_SCRIPT (KAITHI);                 /* Kthi */
220   MATCH_SCRIPT2(LANNA, TAI_THAM);        /* Lana */
221 #if CHECK_ICU_VERSION (4, 4)
222   MATCH_SCRIPT (LISU);                   /* Lisu */
223 #endif
224   MATCH_SCRIPT (MEITEI_MAYEK);           /* Mtei */
225 #if CHECK_ICU_VERSION (4, 4)
226   MATCH_SCRIPT (OLD_SOUTH_ARABIAN);      /* Sarb */
227 #endif
228   MATCH_SCRIPT2(ORKHON, OLD_TURKIC);     /* Orkh */
229   MATCH_SCRIPT (SAMARITAN);              /* Samr */
230   MATCH_SCRIPT (TAI_VIET);               /* Tavt */
231   }
232   return HB_SCRIPT_UNKNOWN;
233 }
234
235 static hb_unicode_funcs_t icu_ufuncs = {
236   HB_REFERENCE_COUNT_INVALID, /* ref_count */
237   TRUE, /* immutable */
238   {
239     hb_icu_get_general_category,
240     hb_icu_get_combining_class,
241     hb_icu_get_mirroring,
242     hb_icu_get_script,
243     hb_icu_get_eastasian_width
244   }
245 };
246
247 hb_unicode_funcs_t *
248 hb_icu_get_unicode_funcs (void)
249 {
250   return &icu_ufuncs;
251 }
252
253
254 HB_END_DECLS