2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
41 #include "pinyin_data.h"
/*
 * Initial (onset) strings, indexed by the initial id of a syllable
 * (decodeSyllable() reads initials[s.initial]).  In the encoded values used
 * throughout this file the initial id sits above bit 12, e.g. "zhang" is
 * 0x0f0a0 and "chang" is 0x100a0, so "zh" must occupy slot 0x0f and "ch"
 * slot 0x10.  Slot 0 is the empty initial used by syllables such as "an".
 *
 * The order of this array is part of the encoding: never insert or reorder
 * entries without regenerating every table that embeds these ids.
 */
static const char *initials[] = {
    "",   "b",  "p", "m", "f", "d", "t", "n",   /* 0x00 - 0x07 */
    "l",  "g",  "k", "h", "j", "q", "x", "zh",  /* 0x08 - 0x0f */
    "ch", "sh", "r", "z", "c", "s", "y", "w",   /* 0x10 - 0x17 */
};
static const unsigned num_initials = sizeof(initials) / sizeof(*initials);
/*
 * Final (rime) strings, indexed by the final id of a syllable
 * (decodeSyllable() reads finals[s.final]).  In the encoded values used in
 * this file the final id occupies bits 4..11, e.g. "jiong" is 0x0c210
 * (final 0x21) and "chong" is 0x101e0 (final 0x1e).
 *
 * "i" must occupy slot 0x0d: the fuzzy tables encode "jin" as
 * 0x0c0d0 + 'n' -> 0x0c130, and fuzzy_finals_map places "ia" at 0x0e.
 * Slot 0 is the empty final.
 *
 * The order of this array is part of the encoding: never insert or reorder
 * entries without regenerating every table that embeds these ids.
 */
static const char *finals[] = {
    "",    "a",    "o",   "e",   "ai",   "ei",   "ao",  "ou",  /* 0x00 - 0x07 */
    "an",  "en",   "ang", "eng", "er",   "i",    "ia",  "ie",  /* 0x08 - 0x0f */
    "iao", "iu",   "ian", "in",  "iang", "ing",  "u",   "ua",  /* 0x10 - 0x17 */
    "uo",  "uai",  "ui",  "uan", "un",   "uang", "ong", "v",   /* 0x18 - 0x1f */
    "ue",  "iong",                                             /* 0x20 - 0x21 */
};
static const unsigned num_finals = sizeof(finals) / sizeof(*finals);
/*
 * Compound finals that start with a medial "i" or "u".  The entries appear
 * in the same order as the rows of fuzzy_finals_map.
 */
static const char *fuzzy_finals[] = {
    "ia", "iao", "ian", "iang", "ie",   /* i-medial finals */
    "ua", "uai", "uan", "uang", "ue",   /* u-medial finals */
};

/* Number of entries in fuzzy_finals. */
static const unsigned num_fuzzy_finals =
    sizeof(fuzzy_finals) / sizeof(fuzzy_finals[0]);
/*
 * Inner-fuzzy map for finals that begin with a medial "i" or "u".
 * The array is a flat list of triples, one per row:
 *
 *   [0] id of the compound final (an index into finals[]),
 *   [1] id of the final left after dropping the medial, already shifted
 *       left by 4 so it lines up with the final field of an encoded syllable,
 *   [2] length in characters of that remaining final.
 *
 * getInnerFuzzyFinalMap() reports the number of rows (not the number of
 * unsigned elements), so this table must keep exactly one row per
 * fuzzy_finals entry, in the same order.
 */
static const unsigned fuzzy_finals_map[] = {
    0x0e, 0x10, 1,      /* ia   -> a   len 1 */
    0x10, 0x60, 2,      /* iao  -> ao  len 2 */
    0x12, 0x80, 2,      /* ian  -> an  len 2 */
    0x14, 0xa0, 3,      /* iang -> ang len 3 */
    0x0f, 0x30, 1,      /* ie   -> e   len 1 */
    0x17, 0x10, 1,      /* ua   -> a   len 1 */
    0x19, 0x40, 2,      /* uai  -> ai  len 2 */
    0x1b, 0x80, 2,      /* uan  -> an  len 2 */
    0x1d, 0xa0, 3,      /* uang -> ang len 3 */
    0x20, 0x30, 1,      /* ue   -> e   len 1 */
};
/*
 * "Pre" fuzzy-segmentation table: a flat list of triples
 *
 *   [0] an encoded syllable,
 *   [1] the letter ('n', 'g' or 'r') that may follow it,
 *   [2] the encoded syllable formed by appending that letter; this longer
 *       syllable is the one named in the row comment
 *       (e.g. "qia" + 'n' -> "qian").
 *
 * Both syllables of a row always share the same initial.
 *
 * getFuzzyPreProSyllables() exposes this array as a bare pointer with no
 * element count, so the list is closed with a single 0 sentinel.  No real
 * row starts with 0, which keeps the sentinel unambiguous.
 * NOTE(review): confirm that consumers stop on a zero first element.
 */
static const unsigned fuzzy_pre_syllables[] = {
    0x0d0e0, 'n', 0x0d120,      /* qian */
    0x09080, 'g', 0x090a0,      /* gang */
    0x080e0, 'n', 0x08120,      /* lian */
    0x15090, 'g', 0x150b0,      /* seng */
    0x04010, 'n', 0x04080,      /* fan */
    0x10030, 'n', 0x10090,      /* chen */
    0x050e0, 'n', 0x05120,      /* dian */
    0x15160, 'n', 0x151c0,      /* sun */
    0x07080, 'g', 0x070a0,      /* nang */
    0x0a160, 'n', 0x0a1c0,      /* kun */
    0x05030, 'n', 0x05090,      /* den */
    0x07090, 'g', 0x070b0,      /* neng */
    0x03030, 'n', 0x03090,      /* men */
    0x09090, 'g', 0x090b0,      /* geng */
    0x10080, 'g', 0x100a0,      /* chang */
    0x0f010, 'n', 0x0f080,      /* zhan */
    0x14010, 'n', 0x14080,      /* can */
    0x07130, 'g', 0x07150,      /* ning */
    0x17080, 'g', 0x170a0,      /* wang */
    0x01090, 'g', 0x010b0,      /* beng */
    0x0f1b0, 'g', 0x0f1d0,      /* zhuang */
    0x06010, 'n', 0x06080,      /* tan */
    0x00090, 'g', 0x000b0,      /* eng */
    0x0f080, 'g', 0x0f0a0,      /* zhang */
    0x02130, 'g', 0x02150,      /* ping */
    0x08010, 'n', 0x08080,      /* lan */
    0x0e160, 'n', 0x0e1c0,      /* xun */
    0x03010, 'n', 0x03080,      /* man */
    0x0c120, 'g', 0x0c140,      /* jiang */
    0x0a1b0, 'g', 0x0a1d0,      /* kuang */
    0x01130, 'g', 0x01150,      /* bing */
    0x13010, 'n', 0x13080,      /* zan */
    0x13030, 'n', 0x13090,      /* zen */
    0x02080, 'g', 0x020a0,      /* pang */
    0x0c0d0, 'n', 0x0c130,      /* jin */
    0x14030, 'n', 0x14090,      /* cen */
    0x05010, 'n', 0x05080,      /* dan */
    0x0f030, 'n', 0x0f090,      /* zhen */
    0x01080, 'g', 0x010a0,      /* bang */
    0x17090, 'g', 0x170b0,      /* weng */
    0x00030, 'n', 0x00090,      /* en */
    0x0a080, 'g', 0x0a0a0,      /* kang */
    0x09160, 'n', 0x091c0,      /* gun */
    0x00030, 'r', 0x000c0,      /* er */
    0x0a090, 'g', 0x0a0b0,      /* keng */
    0x15080, 'g', 0x150a0,      /* sang */
    0x12030, 'n', 0x12090,      /* ren */
    0x11160, 'n', 0x111c0,      /* shun */
    0x0d160, 'n', 0x0d1c0,      /* qun */
    0x16160, 'n', 0x161c0,      /* yun */
    0x0e120, 'g', 0x0e140,      /* xiang */
    0x12080, 'g', 0x120a0,      /* rang */
    0x09170, 'n', 0x091b0,      /* guan */
    0x16130, 'g', 0x16150,      /* ying */
    0x0a170, 'n', 0x0a1b0,      /* kuan */
    0x10010, 'n', 0x10080,      /* chan */
    0x160d0, 'n', 0x16130,      /* yin */
    0x0e0d0, 'n', 0x0e130,      /* xin */
    0x07120, 'g', 0x07140,      /* niang */
    0x0b160, 'n', 0x0b1c0,      /* hun */
    0x11170, 'n', 0x111b0,      /* shuan */
    0x05080, 'g', 0x050a0,      /* dang */
    0x00080, 'g', 0x000a0,      /* ang */
    0x15010, 'n', 0x15080,      /* san */
    0x12090, 'g', 0x120b0,      /* reng */
    0x03130, 'g', 0x03150,      /* ming */
    0x030d0, 'n', 0x03130,      /* min */
    0x07030, 'n', 0x07090,      /* nen */
    0x0a010, 'n', 0x0a080,      /* kan */
    0x16080, 'g', 0x160a0,      /* yang */
    0x05090, 'g', 0x050b0,      /* deng */
    0x101b0, 'g', 0x101d0,      /* chuang */
    0x04090, 'g', 0x040b0,      /* feng */
    0x03090, 'g', 0x030b0,      /* meng */
    0x10090, 'g', 0x100b0,      /* cheng */
    0x09030, 'n', 0x09090,      /* gen */
    0x01010, 'n', 0x01080,      /* ban */
    0x07160, 'n', 0x071c0,      /* nun */
    0x15030, 'n', 0x15090,      /* sen */
    0x04080, 'g', 0x040a0,      /* fang */
    0x08160, 'n', 0x081c0,      /* lun */
    0x0a030, 'n', 0x0a090,      /* ken */
    0x0b1b0, 'g', 0x0b1d0,      /* huang */
    0x03080, 'g', 0x030a0,      /* mang */
    0x06160, 'n', 0x061c0,      /* tun */
    0x0d0d0, 'n', 0x0d130,      /* qin */
    0x02090, 'g', 0x020b0,      /* peng */
    0x05160, 'n', 0x051c0,      /* dun */
    0x10160, 'n', 0x101c0,      /* chun */
    0x09010, 'n', 0x09080,      /* gan */
    0x13090, 'g', 0x130b0,      /* zeng */
    0x06080, 'g', 0x060a0,      /* tang */
    0x14080, 'g', 0x140a0,      /* cang */
    0x0b090, 'g', 0x0b0b0,      /* heng */
    0x0e0e0, 'n', 0x0e120,      /* xian */
    0x0f160, 'n', 0x0f1c0,      /* zhun */
    0x111b0, 'g', 0x111d0,      /* shuang */
    0x11010, 'n', 0x11080,      /* shan */
    0x02010, 'n', 0x02080,      /* pan */
    0x070d0, 'n', 0x07130,      /* nin */
    0x0b080, 'g', 0x0b0a0,      /* hang */
    0x0f170, 'n', 0x0f1b0,      /* zhuan */
    0x080d0, 'n', 0x08130,      /* lin */
    0x091b0, 'g', 0x091d0,      /* guang */
    0x0b010, 'n', 0x0b080,      /* han */
    0x14160, 'n', 0x141c0,      /* cun */
    0x010d0, 'n', 0x01130,      /* bin */
    0x11030, 'n', 0x11090,      /* shen */
    0x0e130, 'g', 0x0e150,      /* xing */
    0x0d120, 'g', 0x0d140,      /* qiang */
    0x12160, 'n', 0x121c0,      /* run */
    0x11090, 'g', 0x110b0,      /* sheng */
    0x10170, 'n', 0x101b0,      /* chuan */
    0x0d130, 'g', 0x0d150,      /* qing */
    0x0c0e0, 'n', 0x0c120,      /* jian */
    0x17010, 'n', 0x17080,      /* wan */
    0x0c130, 'g', 0x0c150,      /* jing */
    0x16010, 'n', 0x16080,      /* yan */
    0x08120, 'g', 0x08140,      /* liang */
    0x0b170, 'n', 0x0b1b0,      /* huan */
    0x0b030, 'n', 0x0b090,      /* hen */
    0x11080, 'g', 0x110a0,      /* shang */
    0x0c160, 'n', 0x0c1c0,      /* jun */
    0x08130, 'g', 0x08150,      /* ling */
    0x14090, 'g', 0x140b0,      /* ceng */
    0x020d0, 'n', 0x02130,      /* pin */
    0x00010, 'n', 0x00080,      /* an */
    0x13080, 'g', 0x130a0,      /* zang */
    0x07010, 'n', 0x07080,      /* nan */
    0x0f090, 'g', 0x0f0b0,      /* zheng */
    0x13160, 'n', 0x131c0,      /* zun */
    0x08080, 'g', 0x080a0,      /* lang */
    0x0,                        /* end-of-table sentinel */
};
/*
 * "Pro" fuzzy-segmentation table: a flat list of triples
 *
 *   [0] an encoded syllable whose initial is 'g', 'n' or 'r'
 *       (named in the row comment),
 *   [1] that initial letter,
 *   [2] the encoded syllable that remains once the initial is removed,
 *       i.e. the same final with the empty initial (e.g. "gang" -> "ang").
 *
 * getFuzzyPreProSyllables() exposes this array as a bare pointer with no
 * element count, so the list is closed with a single 0 sentinel.  No real
 * row starts with 0, which keeps the sentinel unambiguous.
 * NOTE(review): confirm that consumers stop on a zero first element.
 */
static const unsigned fuzzy_pro_syllables[] = {
    0x09030, 'g', 0x00030,      /* ge */
    0x090a0, 'g', 0x000a0,      /* gang */
    0x09010, 'g', 0x00010,      /* ga */
    0x12070, 'r', 0x00070,      /* rou */
    0x07050, 'n', 0x00050,      /* nei */
    0x070a0, 'n', 0x000a0,      /* nang */
    0x070b0, 'n', 0x000b0,      /* neng */
    0x090b0, 'g', 0x000b0,      /* geng */
    0x07070, 'n', 0x00070,      /* nou */
    0x12030, 'r', 0x00030,      /* re */
    0x12090, 'r', 0x00090,      /* ren */
    0x09070, 'g', 0x00070,      /* gou */
    0x120a0, 'r', 0x000a0,      /* rang */
    0x120b0, 'r', 0x000b0,      /* reng */
    0x12080, 'r', 0x00080,      /* ran */
    0x12060, 'r', 0x00060,      /* rao */
    0x07090, 'n', 0x00090,      /* nen */
    0x09050, 'g', 0x00050,      /* gei */
    0x09090, 'g', 0x00090,      /* gen */
    0x09060, 'g', 0x00060,      /* gao */
    0x09080, 'g', 0x00080,      /* gan */
    0x09040, 'g', 0x00040,      /* gai */
    0x07060, 'n', 0x00060,      /* nao */
    0x07010, 'n', 0x00010,      /* na */
    0x07040, 'n', 0x00040,      /* nai */
    0x07080, 'n', 0x00080,      /* nan */
    0x07030, 'n', 0x00030,      /* ne */
    0x0,                        /* end-of-table sentinel */
};
241 static const char * fuzzy_pairs[] = {
257 static const unsigned num_fuzzy_pairs = sizeof(fuzzy_pairs) /
258 sizeof(*fuzzy_pairs) / 2;
260 static const char * auto_correction_pairs[] = {
267 static const unsigned num_auto_correction_pairs =
268 sizeof(auto_correction_pairs) / sizeof(*auto_correction_pairs) / 2;
270 static const TPyTabEntry
308 { "chang", 0x100a0 },
312 { "cheng", 0x100b0 },
314 { "chong", 0x101e0 },
318 { "chuai", 0x10190 },
319 { "chuan", 0x101b0 },
320 { "chuang", 0x101d0 },
388 { "guang", 0x091d0 },
408 { "huang", 0x0b1d0 },
416 { "jiang", 0x0c140 },
421 { "jiong", 0x0c210 },
443 { "kuang", 0x0a1d0 },
459 { "liang", 0x08140 },
506 { "niang", 0x07140 },
544 { "qiang", 0x0d140 },
549 { "qiong", 0x0d210 },
583 { "shang", 0x110a0 },
588 { "sheng", 0x110b0 },
593 { "shuai", 0x11190 },
594 { "shuan", 0x111b0 },
595 { "shuang", 0x111d0 },
642 { "xiang", 0x0e140 },
647 { "xiong", 0x0e210 },
683 { "zhang", 0x0f0a0 },
688 { "zheng", 0x0f0b0 },
690 { "zhong", 0x0f1e0 },
694 { "zhuai", 0x0f190 },
695 { "zhuan", 0x0f1b0 },
696 { "zhuang", 0x0f1d0 },
711 pytab_entry_compare(const char *s, TPyTabEntry *v)
713 return strcmp(s, v->pystr);
717 CPinyinData::encodeSyllable(const char *pinyin)
719 typedef int (*bsearch_compare)(const void*, const void*);
720 TPyTabEntry *e = (TPyTabEntry*)bsearch(pinyin, pinyin_table,
721 sizeof(pinyin_table) /
722 sizeof(pinyin_table[0]),
723 sizeof(pinyin_table[0]),
724 (bsearch_compare)pytab_entry_compare);
732 CPinyinData::decodeSyllable(TSyllable s, const char **i, const char **f)
734 if (i) *i = initials[s.initial];
735 if (f) *f = finals[s.final];
737 static char buf[128];
738 snprintf(buf, sizeof(buf), "%s%s", initials[s.initial], finals[s.final]);
740 typedef int (*bsearch_compare)(const void*, const void*);
741 TPyTabEntry *e = (TPyTabEntry*)bsearch(buf, pinyin_table,
742 sizeof(pinyin_table) /
743 sizeof(pinyin_table[0]),
744 sizeof(pinyin_table[0]),
745 (bsearch_compare)pytab_entry_compare);
754 CPinyinData::getAutoCorrectionPairs(unsigned &num)
756 num = num_auto_correction_pairs;
757 return auto_correction_pairs;
761 CPinyinData::getFuzzyPairs(unsigned &num)
763 num = num_fuzzy_pairs;
768 CPinyinData::getInitials(unsigned &num)
775 CPinyinData::getFinals(unsigned &num)
782 CPinyinData::getPinyinTable(unsigned &num)
784 num = sizeof(pinyin_table) / sizeof(TPyTabEntry);
789 CPinyinData::getInnerFuzzyFinalMap(unsigned &num)
791 num = num_fuzzy_finals;
792 return fuzzy_finals_map;
796 CPinyinData::getFuzzyPreProSyllables(const unsigned **pre_syls,
797 const unsigned **pro_syls)
799 *pre_syls = fuzzy_pre_syllables;
800 *pro_syls = fuzzy_pro_syllables;