1 /* Unicode character classification and properties.
2 Copyright (C) 2002, 2005-2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
22 /* Get LIBUNISTRING_DLL_VARIABLE. */
23 #include <unistring/woe32dll.h>
26 #include <unistring/stdbool.h>
35 /* ========================================================================= */
37 /* Field 1 of Unicode Character Database: Character name. See "uniname.h". */
40 /* ========================================================================= */
42 /* Field 2 of Unicode Character Database: General category. */
44 /* Data type denoting a General category value. This is not just a bitmask,
45 but rather a bitmask and a pointer to the lookup table, so that programs
46 that use only the predefined bitmasks (i.e. don't combine bitmasks with &
47 and |) don't have a link-time dependency towards the big general table. */
50 uint32_t bitmask : 31;
51 /*bool*/ unsigned int generic : 1;
54 const void *table; /* when generic is 0 */
55 bool (*lookup_fn) (ucs4_t uc, uint32_t bitmask); /* when generic is 1 */
58 uc_general_category_t;
60 /* Bits and bit masks denoting General category values. UnicodeData-3.2.0.html
61 says a 32-bit integer will always suffice to represent them.
62 These bit masks can only be used with the uc_is_general_category_withtable function. */
66 UC_CATEGORY_MASK_L = 0x0000001f, /* Lu | Ll | Lt | Lm | Lo */
67 UC_CATEGORY_MASK_LC = 0x00000007, /* Lu | Ll | Lt */
68 UC_CATEGORY_MASK_Lu = 0x00000001,
69 UC_CATEGORY_MASK_Ll = 0x00000002,
70 UC_CATEGORY_MASK_Lt = 0x00000004,
71 UC_CATEGORY_MASK_Lm = 0x00000008,
72 UC_CATEGORY_MASK_Lo = 0x00000010,
73 UC_CATEGORY_MASK_M = 0x000000e0, /* Mn | Mc | Me */
74 UC_CATEGORY_MASK_Mn = 0x00000020,
75 UC_CATEGORY_MASK_Mc = 0x00000040,
76 UC_CATEGORY_MASK_Me = 0x00000080,
77 UC_CATEGORY_MASK_N = 0x00000700, /* Nd | Nl | No */
78 UC_CATEGORY_MASK_Nd = 0x00000100,
79 UC_CATEGORY_MASK_Nl = 0x00000200,
80 UC_CATEGORY_MASK_No = 0x00000400,
81 UC_CATEGORY_MASK_P = 0x0003f800, /* Pc | Pd | Ps | Pe | Pi | Pf | Po */
82 UC_CATEGORY_MASK_Pc = 0x00000800,
83 UC_CATEGORY_MASK_Pd = 0x00001000,
84 UC_CATEGORY_MASK_Ps = 0x00002000,
85 UC_CATEGORY_MASK_Pe = 0x00004000,
86 UC_CATEGORY_MASK_Pi = 0x00008000,
87 UC_CATEGORY_MASK_Pf = 0x00010000,
88 UC_CATEGORY_MASK_Po = 0x00020000,
89 UC_CATEGORY_MASK_S = 0x003c0000, /* Sm | Sc | Sk | So */
90 UC_CATEGORY_MASK_Sm = 0x00040000,
91 UC_CATEGORY_MASK_Sc = 0x00080000,
92 UC_CATEGORY_MASK_Sk = 0x00100000,
93 UC_CATEGORY_MASK_So = 0x00200000,
94 UC_CATEGORY_MASK_Z = 0x01c00000, /* Zs | Zl | Zp */
95 UC_CATEGORY_MASK_Zs = 0x00400000,
96 UC_CATEGORY_MASK_Zl = 0x00800000,
97 UC_CATEGORY_MASK_Zp = 0x01000000,
98 UC_CATEGORY_MASK_C = 0x3e000000, /* Cc | Cf | Cs | Co | Cn */
99 UC_CATEGORY_MASK_Cc = 0x02000000,
100 UC_CATEGORY_MASK_Cf = 0x04000000,
101 UC_CATEGORY_MASK_Cs = 0x08000000,
102 UC_CATEGORY_MASK_Co = 0x10000000,
103 UC_CATEGORY_MASK_Cn = 0x20000000 /* highest single bit in use (bit 29) */
106 /* Predefined General category values. */
107 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_L;
108 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_LC;
109 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Lu;
110 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Ll;
111 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Lt;
112 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Lm;
113 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Lo;
114 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_M;
115 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Mn;
116 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Mc;
117 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Me;
118 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_N;
119 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Nd;
120 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Nl;
121 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_No;
122 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_P;
123 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pc;
124 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pd;
125 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Ps;
126 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pe;
127 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pi;
128 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Pf;
129 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Po;
130 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_S;
131 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Sm;
132 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Sc;
133 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Sk;
134 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_So;
135 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Z;
136 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Zs;
137 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Zl;
138 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Zp;
139 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_C;
140 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Cc;
141 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Cf;
142 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Cs;
143 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Co;
144 extern LIBUNISTRING_DLL_VARIABLE const uc_general_category_t UC_CATEGORY_Cn;
146 extern const uc_general_category_t _UC_CATEGORY_NONE;
148 /* Alias names for predefined General category values. */
149 #define UC_LETTER UC_CATEGORY_L
150 #define UC_CASED_LETTER UC_CATEGORY_LC
151 #define UC_UPPERCASE_LETTER UC_CATEGORY_Lu
152 #define UC_LOWERCASE_LETTER UC_CATEGORY_Ll
153 #define UC_TITLECASE_LETTER UC_CATEGORY_Lt
154 #define UC_MODIFIER_LETTER UC_CATEGORY_Lm
155 #define UC_OTHER_LETTER UC_CATEGORY_Lo
156 #define UC_MARK UC_CATEGORY_M
157 #define UC_NON_SPACING_MARK UC_CATEGORY_Mn
158 #define UC_COMBINING_SPACING_MARK UC_CATEGORY_Mc
159 #define UC_ENCLOSING_MARK UC_CATEGORY_Me
160 #define UC_NUMBER UC_CATEGORY_N
161 #define UC_DECIMAL_DIGIT_NUMBER UC_CATEGORY_Nd
162 #define UC_LETTER_NUMBER UC_CATEGORY_Nl
163 #define UC_OTHER_NUMBER UC_CATEGORY_No
164 #define UC_PUNCTUATION UC_CATEGORY_P
165 #define UC_CONNECTOR_PUNCTUATION UC_CATEGORY_Pc
166 #define UC_DASH_PUNCTUATION UC_CATEGORY_Pd
167 #define UC_OPEN_PUNCTUATION UC_CATEGORY_Ps /* a.k.a. UC_START_PUNCTUATION */
168 #define UC_CLOSE_PUNCTUATION UC_CATEGORY_Pe /* a.k.a. UC_END_PUNCTUATION */
169 #define UC_INITIAL_QUOTE_PUNCTUATION UC_CATEGORY_Pi
170 #define UC_FINAL_QUOTE_PUNCTUATION UC_CATEGORY_Pf
171 #define UC_OTHER_PUNCTUATION UC_CATEGORY_Po
172 #define UC_SYMBOL UC_CATEGORY_S
173 #define UC_MATH_SYMBOL UC_CATEGORY_Sm
174 #define UC_CURRENCY_SYMBOL UC_CATEGORY_Sc
175 #define UC_MODIFIER_SYMBOL UC_CATEGORY_Sk
176 #define UC_OTHER_SYMBOL UC_CATEGORY_So
177 #define UC_SEPARATOR UC_CATEGORY_Z
178 #define UC_SPACE_SEPARATOR UC_CATEGORY_Zs
179 #define UC_LINE_SEPARATOR UC_CATEGORY_Zl
180 #define UC_PARAGRAPH_SEPARATOR UC_CATEGORY_Zp
181 #define UC_OTHER UC_CATEGORY_C
182 #define UC_CONTROL UC_CATEGORY_Cc
183 #define UC_FORMAT UC_CATEGORY_Cf
184 #define UC_SURROGATE UC_CATEGORY_Cs /* all of them are invalid characters */
185 #define UC_PRIVATE_USE UC_CATEGORY_Co
186 #define UC_UNASSIGNED UC_CATEGORY_Cn /* some of them are invalid characters */
188 /* Return the union of two general categories.
189 This corresponds to the unions of the two sets of characters. */
190 extern uc_general_category_t
191 uc_general_category_or (uc_general_category_t category1,
192 uc_general_category_t category2);
194 /* Return the intersection of two general categories as bit masks.
195 This *does*not* correspond to the intersection of the two sets of characters. */
197 extern uc_general_category_t
198 uc_general_category_and (uc_general_category_t category1,
199 uc_general_category_t category2);
201 /* Return the intersection of a general category with the complement of a
202 second general category, as bit masks.
203 This *does*not* correspond to the intersection with complement, when
204 viewing the categories as sets of characters. */
205 extern uc_general_category_t
206 uc_general_category_and_not (uc_general_category_t category1,
207 uc_general_category_t category2);
209 /* Return the name of a general category. */
211 uc_general_category_name (uc_general_category_t category)
214 /* Return the long name of a general category. */
216 uc_general_category_long_name (uc_general_category_t category)
219 /* Return the general category given by name, e.g. "Lu", or by long name,
220 e.g. "Uppercase Letter". */
221 extern uc_general_category_t
222 uc_general_category_byname (const char *category_name)
225 /* Return the general category of a Unicode character. */
226 extern uc_general_category_t
227 uc_general_category (ucs4_t uc)
230 /* Test whether a Unicode character belongs to a given category.
231 The CATEGORY argument can be the combination of several predefined
232 general categories. */
234 uc_is_general_category (ucs4_t uc, uc_general_category_t category)
236 /* Likewise. This function uses a big table comprising all categories. */
238 uc_is_general_category_withtable (ucs4_t uc, uint32_t bitmask)
241 /* ========================================================================= */
243 /* Field 3 of Unicode Character Database: Canonical combining class. */
245 /* The possible results of uc_combining_class (0..255) are described in
246 UCD.html. The list here is not definitive; more values can be added
247 in future versions. */
250 UC_CCC_NR = 0, /* Not Reordered */
251 UC_CCC_OV = 1, /* Overlay */
252 UC_CCC_NK = 7, /* Nukta */
253 UC_CCC_KV = 8, /* Kana Voicing */
254 UC_CCC_VR = 9, /* Virama */
255 UC_CCC_ATBL = 200, /* Attached Below Left */
256 UC_CCC_ATB = 202, /* Attached Below */
257 UC_CCC_ATA = 214, /* Attached Above */
258 UC_CCC_ATAR = 216, /* Attached Above Right */
259 UC_CCC_BL = 218, /* Below Left */
260 UC_CCC_B = 220, /* Below */
261 UC_CCC_BR = 222, /* Below Right */
262 UC_CCC_L = 224, /* Left */
263 UC_CCC_R = 226, /* Right */
264 UC_CCC_AL = 228, /* Above Left */
265 UC_CCC_A = 230, /* Above */
266 UC_CCC_AR = 232, /* Above Right */
267 UC_CCC_DB = 233, /* Double Below */
268 UC_CCC_DA = 234, /* Double Above */
269 UC_CCC_IS = 240 /* Iota Subscript */
272 /* Return the canonical combining class of a Unicode character. */
274 uc_combining_class (ucs4_t uc)
277 /* Return the name of a canonical combining class. */
279 uc_combining_class_name (int ccc)
282 /* Return the long name of a canonical combining class. */
284 uc_combining_class_long_name (int ccc)
287 /* Return the canonical combining class given by name, e.g. "BL", or by long
288 name, e.g. "Below Left". */
290 uc_combining_class_byname (const char *ccc_name)
293 /* ========================================================================= */
295 /* Field 4 of Unicode Character Database: Bidi class.
296 Before Unicode 4.0, this field was called "Bidirectional category". */
300 UC_BIDI_L, /* Left-to-Right */
301 UC_BIDI_LRE, /* Left-to-Right Embedding */
302 UC_BIDI_LRO, /* Left-to-Right Override */
303 UC_BIDI_R, /* Right-to-Left */
304 UC_BIDI_AL, /* Right-to-Left Arabic */
305 UC_BIDI_RLE, /* Right-to-Left Embedding */
306 UC_BIDI_RLO, /* Right-to-Left Override */
307 UC_BIDI_PDF, /* Pop Directional Format */
308 UC_BIDI_EN, /* European Number */
309 UC_BIDI_ES, /* European Number Separator */
310 UC_BIDI_ET, /* European Number Terminator */
311 UC_BIDI_AN, /* Arabic Number */
312 UC_BIDI_CS, /* Common Number Separator */
313 UC_BIDI_NSM, /* Non-Spacing Mark */
314 UC_BIDI_BN, /* Boundary Neutral */
315 UC_BIDI_B, /* Paragraph Separator */
316 UC_BIDI_S, /* Segment Separator */
317 UC_BIDI_WS, /* Whitespace */
318 UC_BIDI_ON /* Other Neutral */
321 /* Return the name of a bidi class. */
323 uc_bidi_class_name (int bidi_class)
325 /* Same; obsolete function name. */
327 uc_bidi_category_name (int category)
330 /* Return the long name of a bidi class. */
332 uc_bidi_class_long_name (int bidi_class)
335 /* Return the bidi class given by name, e.g. "LRE", or by long name, e.g.
336 "Left-to-Right Embedding". */
338 uc_bidi_class_byname (const char *bidi_class_name)
340 /* Same; obsolete function name. */
342 uc_bidi_category_byname (const char *category_name)
345 /* Return the bidi class of a Unicode character. */
347 uc_bidi_class (ucs4_t uc)
349 /* Same; obsolete function name. */
351 uc_bidi_category (ucs4_t uc)
354 /* Test whether a Unicode character belongs to a given bidi class. */
356 uc_is_bidi_class (ucs4_t uc, int bidi_class)
358 /* Same; obsolete function name. */
360 uc_is_bidi_category (ucs4_t uc, int category)
363 /* ========================================================================= */
365 /* Field 5 of Unicode Character Database: Character decomposition mapping. See "uninorm.h". */
368 /* ========================================================================= */
370 /* Field 6 of Unicode Character Database: Decimal digit value. */
372 /* Return the decimal digit value of a Unicode character. */
374 uc_decimal_value (ucs4_t uc)
377 /* ========================================================================= */
379 /* Field 7 of Unicode Character Database: Digit value. */
381 /* Return the digit value of a Unicode character. */
383 uc_digit_value (ucs4_t uc)
386 /* ========================================================================= */
388 /* Field 8 of Unicode Character Database: Numeric value. */
390 /* Return the numeric value of a Unicode character. */
398 uc_numeric_value (ucs4_t uc)
401 /* ========================================================================= */
403 /* Field 9 of Unicode Character Database: Mirrored. */
405 /* Return the mirrored character of a Unicode character UC in *PUC. */
407 uc_mirror_char (ucs4_t uc, ucs4_t *puc);
409 /* ========================================================================= */
411 /* Field 10 of Unicode Character Database: Unicode 1.0 Name.
412 Not available in this library. */
414 /* ========================================================================= */
416 /* Field 11 of Unicode Character Database: ISO 10646 comment.
417 Not available in this library. */
419 /* ========================================================================= */
421 /* Field 12, 13, 14 of Unicode Character Database: Uppercase mapping,
422 lowercase mapping, titlecase mapping. See "unicase.h". */
424 /* ========================================================================= */
426 /* Field 2 of the file ArabicShaping.txt in the Unicode Character Database. */
428 /* Possible joining types. */
431 UC_JOINING_TYPE_U, /* Non_Joining */
432 UC_JOINING_TYPE_T, /* Transparent */
433 UC_JOINING_TYPE_C, /* Join_Causing */
434 UC_JOINING_TYPE_L, /* Left_Joining */
435 UC_JOINING_TYPE_R, /* Right_Joining */
436 UC_JOINING_TYPE_D /* Dual_Joining */
439 /* Return the name of a joining type. */
441 uc_joining_type_name (int joining_type)
444 /* Return the long name of a joining type. */
446 uc_joining_type_long_name (int joining_type)
449 /* Return the joining type given by name, e.g. "D", or by long name, e.g. "Dual Joining". */
452 uc_joining_type_byname (const char *joining_type_name)
455 /* Return the joining type of a Unicode character. */
457 uc_joining_type (ucs4_t uc)
460 /* ========================================================================= */
462 /* Field 3 of the file ArabicShaping.txt in the Unicode Character Database. */
464 /* Possible joining groups.
465 This enumeration may be extended in the future. */
468 UC_JOINING_GROUP_NONE, /* No_Joining_Group */
469 UC_JOINING_GROUP_AIN, /* Ain */
470 UC_JOINING_GROUP_ALAPH, /* Alaph */
471 UC_JOINING_GROUP_ALEF, /* Alef */
472 UC_JOINING_GROUP_BEH, /* Beh */
473 UC_JOINING_GROUP_BETH, /* Beth */
474 UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
475 UC_JOINING_GROUP_DAL, /* Dal */
476 UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
477 UC_JOINING_GROUP_E, /* E */
478 UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
479 UC_JOINING_GROUP_FE, /* Fe */
480 UC_JOINING_GROUP_FEH, /* Feh */
481 UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
482 UC_JOINING_GROUP_GAF, /* Gaf */
483 UC_JOINING_GROUP_GAMAL, /* Gamal */
484 UC_JOINING_GROUP_HAH, /* Hah */
485 UC_JOINING_GROUP_HE, /* He */
486 UC_JOINING_GROUP_HEH, /* Heh */
487 UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
488 UC_JOINING_GROUP_HETH, /* Heth */
489 UC_JOINING_GROUP_KAF, /* Kaf */
490 UC_JOINING_GROUP_KAPH, /* Kaph */
491 UC_JOINING_GROUP_KHAPH, /* Khaph */
492 UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
493 UC_JOINING_GROUP_LAM, /* Lam */
494 UC_JOINING_GROUP_LAMADH, /* Lamadh */
495 UC_JOINING_GROUP_MEEM, /* Meem */
496 UC_JOINING_GROUP_MIM, /* Mim */
497 UC_JOINING_GROUP_NOON, /* Noon */
498 UC_JOINING_GROUP_NUN, /* Nun */
499 UC_JOINING_GROUP_NYA, /* Nya */
500 UC_JOINING_GROUP_PE, /* Pe */
501 UC_JOINING_GROUP_QAF, /* Qaf */
502 UC_JOINING_GROUP_QAPH, /* Qaph */
503 UC_JOINING_GROUP_REH, /* Reh */
504 UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
505 UC_JOINING_GROUP_SAD, /* Sad */
506 UC_JOINING_GROUP_SADHE, /* Sadhe */
507 UC_JOINING_GROUP_SEEN, /* Seen */
508 UC_JOINING_GROUP_SEMKATH, /* Semkath */
509 UC_JOINING_GROUP_SHIN, /* Shin */
510 UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
511 UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
512 UC_JOINING_GROUP_TAH, /* Tah */
513 UC_JOINING_GROUP_TAW, /* Taw */
514 UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
515 UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
516 UC_JOINING_GROUP_TETH, /* Teth */
517 UC_JOINING_GROUP_WAW, /* Waw */
518 UC_JOINING_GROUP_YEH, /* Yeh */
519 UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
520 UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
521 UC_JOINING_GROUP_YUDH, /* Yudh */
522 UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
523 UC_JOINING_GROUP_ZAIN, /* Zain */
524 UC_JOINING_GROUP_ZHAIN /* Zhain */
527 /* Return the name of a joining group. */
529 uc_joining_group_name (int joining_group)
532 /* Return the joining group given by name, e.g. "Teh_Marbuta". */
534 uc_joining_group_byname (const char *joining_group_name)
537 /* Return the joining group of a Unicode character. */
539 uc_joining_group (ucs4_t uc)
542 /* ========================================================================= */
544 /* Common API for properties. */
546 /* Data type denoting a property. This is not just a number, but rather a
547 pointer to the test functions, so that programs that use only few of the
548 properties don't have a link-time dependency towards all the tables. */
/* Predicate member of a property: takes a code point and returns bool.
   The uc_property_is_valid macro in this header tests this member
   against NULL.
   NOTE(review): the enclosing `typedef struct { ... } uc_property_t;' lines
   are not visible in this copy; confirm against the upstream file. */
551 bool (*test_fn) (ucs4_t uc);
555 /* Predefined properties. */
557 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_WHITE_SPACE;
558 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ALPHABETIC;
559 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_ALPHABETIC;
560 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_NOT_A_CHARACTER;
561 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT;
562 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
563 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DEPRECATED;
564 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_LOGICAL_ORDER_EXCEPTION;
565 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_VARIATION_SELECTOR;
566 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PRIVATE_USE;
567 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_UNASSIGNED_CODE_VALUE;
569 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_UPPERCASE;
570 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_UPPERCASE;
571 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_LOWERCASE;
572 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_LOWERCASE;
573 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_TITLECASE;
574 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_CASED;
575 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_CASE_IGNORABLE;
576 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_CHANGES_WHEN_LOWERCASED;
577 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_CHANGES_WHEN_UPPERCASED;
578 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_CHANGES_WHEN_TITLECASED;
579 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_CHANGES_WHEN_CASEFOLDED;
580 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_CHANGES_WHEN_CASEMAPPED;
581 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_SOFT_DOTTED;
583 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ID_START;
584 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_ID_START;
585 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ID_CONTINUE;
586 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_ID_CONTINUE;
587 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_XID_START;
588 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_XID_CONTINUE;
589 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PATTERN_WHITE_SPACE;
590 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PATTERN_SYNTAX;
591 /* Shaping and rendering. */
592 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_JOIN_CONTROL;
593 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_GRAPHEME_BASE;
594 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_GRAPHEME_EXTEND;
595 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_GRAPHEME_EXTEND;
596 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_GRAPHEME_LINK;
598 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_CONTROL;
599 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_LEFT_TO_RIGHT;
600 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_HEBREW_RIGHT_TO_LEFT;
601 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_ARABIC_RIGHT_TO_LEFT;
602 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_EUROPEAN_DIGIT;
603 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_SEPARATOR;
604 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_TERMINATOR;
605 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_ARABIC_DIGIT;
606 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_COMMON_SEPARATOR;
607 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_BLOCK_SEPARATOR;
608 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_SEGMENT_SEPARATOR;
609 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_WHITESPACE;
610 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_NON_SPACING_MARK;
611 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_BOUNDARY_NEUTRAL;
612 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_PDF;
613 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_EMBEDDING_OR_OVERRIDE;
614 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_BIDI_OTHER_NEUTRAL;
616 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_HEX_DIGIT;
617 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ASCII_HEX_DIGIT;
619 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_IDEOGRAPHIC;
620 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_UNIFIED_IDEOGRAPH;
621 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_RADICAL;
622 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_IDS_BINARY_OPERATOR;
623 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_IDS_TRINARY_OPERATOR;
625 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ZERO_WIDTH;
626 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_SPACE;
627 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_NON_BREAK;
628 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_ISO_CONTROL;
629 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_FORMAT_CONTROL;
630 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DASH;
631 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_HYPHEN;
632 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PUNCTUATION;
633 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_LINE_SEPARATOR;
634 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PARAGRAPH_SEPARATOR;
635 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_QUOTATION_MARK;
636 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_SENTENCE_TERMINAL;
637 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_TERMINAL_PUNCTUATION;
638 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_CURRENCY_SYMBOL;
639 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_MATH;
640 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_OTHER_MATH;
641 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_PAIRED_PUNCTUATION;
642 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_LEFT_OF_PAIR;
643 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_COMBINING;
644 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_COMPOSITE;
645 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DECIMAL_DIGIT;
646 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_NUMERIC;
647 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_DIACRITIC;
648 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_EXTENDER;
649 extern LIBUNISTRING_DLL_VARIABLE const uc_property_t UC_PROPERTY_IGNORABLE_CONTROL;
651 /* Return the property given by name, e.g. "White space". */
653 uc_property_byname (const char *property_name);
655 /* Test whether a property is valid.
   Expands to a comparison that is true exactly when the test_fn member of
   PROPERTY is not NULL.  PROPERTY is evaluated once; NULL must be in scope
   wherever the macro is used. */
656 #define uc_property_is_valid(property) ((property).test_fn != NULL)
658 /* Test whether a Unicode character has a given property. */
660 uc_is_property (ucs4_t uc, uc_property_t property);
661 extern bool uc_is_property_white_space (ucs4_t uc)
663 extern bool uc_is_property_alphabetic (ucs4_t uc)
665 extern bool uc_is_property_other_alphabetic (ucs4_t uc)
667 extern bool uc_is_property_not_a_character (ucs4_t uc)
669 extern bool uc_is_property_default_ignorable_code_point (ucs4_t uc)
671 extern bool uc_is_property_other_default_ignorable_code_point (ucs4_t uc)
673 extern bool uc_is_property_deprecated (ucs4_t uc)
675 extern bool uc_is_property_logical_order_exception (ucs4_t uc)
677 extern bool uc_is_property_variation_selector (ucs4_t uc)
679 extern bool uc_is_property_private_use (ucs4_t uc)
681 extern bool uc_is_property_unassigned_code_value (ucs4_t uc)
683 extern bool uc_is_property_uppercase (ucs4_t uc)
685 extern bool uc_is_property_other_uppercase (ucs4_t uc)
687 extern bool uc_is_property_lowercase (ucs4_t uc)
689 extern bool uc_is_property_other_lowercase (ucs4_t uc)
691 extern bool uc_is_property_titlecase (ucs4_t uc)
693 extern bool uc_is_property_cased (ucs4_t uc)
695 extern bool uc_is_property_case_ignorable (ucs4_t uc)
697 extern bool uc_is_property_changes_when_lowercased (ucs4_t uc)
699 extern bool uc_is_property_changes_when_uppercased (ucs4_t uc)
701 extern bool uc_is_property_changes_when_titlecased (ucs4_t uc)
703 extern bool uc_is_property_changes_when_casefolded (ucs4_t uc)
705 extern bool uc_is_property_changes_when_casemapped (ucs4_t uc)
707 extern bool uc_is_property_soft_dotted (ucs4_t uc)
709 extern bool uc_is_property_id_start (ucs4_t uc)
711 extern bool uc_is_property_other_id_start (ucs4_t uc)
713 extern bool uc_is_property_id_continue (ucs4_t uc)
715 extern bool uc_is_property_other_id_continue (ucs4_t uc)
717 extern bool uc_is_property_xid_start (ucs4_t uc)
719 extern bool uc_is_property_xid_continue (ucs4_t uc)
721 extern bool uc_is_property_pattern_white_space (ucs4_t uc)
723 extern bool uc_is_property_pattern_syntax (ucs4_t uc)
725 extern bool uc_is_property_join_control (ucs4_t uc)
727 extern bool uc_is_property_grapheme_base (ucs4_t uc)
729 extern bool uc_is_property_grapheme_extend (ucs4_t uc)
731 extern bool uc_is_property_other_grapheme_extend (ucs4_t uc)
733 extern bool uc_is_property_grapheme_link (ucs4_t uc)
735 extern bool uc_is_property_bidi_control (ucs4_t uc)
737 extern bool uc_is_property_bidi_left_to_right (ucs4_t uc)
739 extern bool uc_is_property_bidi_hebrew_right_to_left (ucs4_t uc)
741 extern bool uc_is_property_bidi_arabic_right_to_left (ucs4_t uc)
743 extern bool uc_is_property_bidi_european_digit (ucs4_t uc)
745 extern bool uc_is_property_bidi_eur_num_separator (ucs4_t uc)
747 extern bool uc_is_property_bidi_eur_num_terminator (ucs4_t uc)
749 extern bool uc_is_property_bidi_arabic_digit (ucs4_t uc)
751 extern bool uc_is_property_bidi_common_separator (ucs4_t uc)
753 extern bool uc_is_property_bidi_block_separator (ucs4_t uc)
755 extern bool uc_is_property_bidi_segment_separator (ucs4_t uc)
757 extern bool uc_is_property_bidi_whitespace (ucs4_t uc)
759 extern bool uc_is_property_bidi_non_spacing_mark (ucs4_t uc)
761 extern bool uc_is_property_bidi_boundary_neutral (ucs4_t uc)
763 extern bool uc_is_property_bidi_pdf (ucs4_t uc)
765 extern bool uc_is_property_bidi_embedding_or_override (ucs4_t uc)
767 extern bool uc_is_property_bidi_other_neutral (ucs4_t uc)
769 extern bool uc_is_property_hex_digit (ucs4_t uc)
771 extern bool uc_is_property_ascii_hex_digit (ucs4_t uc)
773 extern bool uc_is_property_ideographic (ucs4_t uc)
775 extern bool uc_is_property_unified_ideograph (ucs4_t uc)
777 extern bool uc_is_property_radical (ucs4_t uc)
779 extern bool uc_is_property_ids_binary_operator (ucs4_t uc)
781 extern bool uc_is_property_ids_trinary_operator (ucs4_t uc)
783 extern bool uc_is_property_zero_width (ucs4_t uc)
785 extern bool uc_is_property_space (ucs4_t uc)
787 extern bool uc_is_property_non_break (ucs4_t uc)
789 extern bool uc_is_property_iso_control (ucs4_t uc)
791 extern bool uc_is_property_format_control (ucs4_t uc)
793 extern bool uc_is_property_dash (ucs4_t uc)
795 extern bool uc_is_property_hyphen (ucs4_t uc)
797 extern bool uc_is_property_punctuation (ucs4_t uc)
799 extern bool uc_is_property_line_separator (ucs4_t uc)
801 extern bool uc_is_property_paragraph_separator (ucs4_t uc)
803 extern bool uc_is_property_quotation_mark (ucs4_t uc)
805 extern bool uc_is_property_sentence_terminal (ucs4_t uc)
807 extern bool uc_is_property_terminal_punctuation (ucs4_t uc)
809 extern bool uc_is_property_currency_symbol (ucs4_t uc)
811 extern bool uc_is_property_math (ucs4_t uc)
813 extern bool uc_is_property_other_math (ucs4_t uc)
815 extern bool uc_is_property_paired_punctuation (ucs4_t uc)
817 extern bool uc_is_property_left_of_pair (ucs4_t uc)
819 extern bool uc_is_property_combining (ucs4_t uc)
821 extern bool uc_is_property_composite (ucs4_t uc)
823 extern bool uc_is_property_decimal_digit (ucs4_t uc)
825 extern bool uc_is_property_numeric (ucs4_t uc)
827 extern bool uc_is_property_diacritic (ucs4_t uc)
829 extern bool uc_is_property_extender (ucs4_t uc)
831 extern bool uc_is_property_ignorable_control (ucs4_t uc)
834 /* ========================================================================= */
836 /* Subdivision of the Unicode characters into scripts. */
840 unsigned int code : 21;
841 unsigned int start : 1;
842 unsigned int end : 1;
847 unsigned int nintervals;
848 const uc_interval_t *intervals;
853 /* Return the script of a Unicode character. */
854 extern const uc_script_t *
855 uc_script (ucs4_t uc)
858 /* Return the script given by name, e.g. "HAN". */
859 extern const uc_script_t *
860 uc_script_byname (const char *script_name)
863 /* Test whether a Unicode character belongs to a given script. */
865 uc_is_script (ucs4_t uc, const uc_script_t *script)
868 /* Get the list of all scripts. */
870 uc_all_scripts (const uc_script_t **scripts, size_t *count);
872 /* ========================================================================= */
874 /* Subdivision of the Unicode character range into blocks. */
884 /* Return the block a character belongs to. */
885 extern const uc_block_t *
889 /* Test whether a Unicode character belongs to a given block. */
891 uc_is_block (ucs4_t uc, const uc_block_t *block)
894 /* Get the list of all blocks. */
896 uc_all_blocks (const uc_block_t **blocks, size_t *count);
898 /* ========================================================================= */
900 /* Properties taken from language standards. */
902 /* Test whether a Unicode character is considered whitespace in ISO C 99. */
904 uc_is_c_whitespace (ucs4_t uc)
907 /* Test whether a Unicode character is considered whitespace in Java. */
909 uc_is_java_whitespace (ucs4_t uc)
914 UC_IDENTIFIER_START, /* valid as first or subsequent character */
915 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
916 UC_IDENTIFIER_INVALID, /* not valid */
917 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
920 /* Return the categorization of a Unicode character w.r.t. the ISO C 99
921 identifier syntax. */
923 uc_c_ident_category (ucs4_t uc)
926 /* Return the categorization of a Unicode character w.r.t. the Java
927 identifier syntax. */
929 uc_java_ident_category (ucs4_t uc)
932 /* ========================================================================= */
934 /* Like ISO C <ctype.h> and <wctype.h>. These functions are deprecated,
935 because this set of functions was designed with ASCII in mind and cannot
936 reflect the more diverse reality of the Unicode character set. But they
937 can be a quick-and-dirty porting aid when migrating from wchar_t APIs
938 to Unicode strings. */
940 /* Test for any character for which 'uc_is_alpha' or 'uc_is_digit' is true. */
942 uc_is_alnum (ucs4_t uc)
945 /* Test for any character for which 'uc_is_upper' or 'uc_is_lower' is true,
946 or any character that is one of a locale-specific set of characters for
947 which none of 'uc_is_cntrl', 'uc_is_digit', 'uc_is_punct', or 'uc_is_space'
950 uc_is_alpha (ucs4_t uc)
953 /* Test for any control character. */
955 uc_is_cntrl (ucs4_t uc)
958 /* Test for any character that corresponds to a decimal-digit character. */
960 uc_is_digit (ucs4_t uc)
963 /* Test for any character for which 'uc_is_print' is true and 'uc_is_space'
966 uc_is_graph (ucs4_t uc)
969 /* Test for any character that corresponds to a lowercase letter or is one
970 of a locale-specific set of characters for which none of 'uc_is_cntrl',
971 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true. */
973 uc_is_lower (ucs4_t uc)
976 /* Test for any printing character. */
978 uc_is_print (ucs4_t uc)
981 /* Test for any printing character that is one of a locale-specific set of
982 characters for which neither 'uc_is_space' nor 'uc_is_alnum' is true. */
984 uc_is_punct (ucs4_t uc)
987 /* Test for any character that corresponds to a locale-specific set of
988 characters for which none of 'uc_is_alnum', 'uc_is_graph', or 'uc_is_punct'
991 uc_is_space (ucs4_t uc)
994 /* Test for any character that corresponds to an uppercase letter or is one
995 of a locale-specific set of characters for which none of 'uc_is_cntrl',
996 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true. */
998 uc_is_upper (ucs4_t uc)
1001 /* Test for any character that corresponds to a hexadecimal-digit
1004 uc_is_xdigit (ucs4_t uc)
1005 _UC_ATTRIBUTE_CONST;
1007 /* GNU extension. */
1008 /* Test for any character that corresponds to a standard blank character or
1009 a locale-specific set of characters for which 'uc_is_alnum' is false. */
1011 uc_is_blank (ucs4_t uc)
1012 _UC_ATTRIBUTE_CONST;
1014 /* ========================================================================= */
1020 #endif /* _UNICTYPE_H */