2 * Copyright (c) 2015 Samsung Electronics Co., Ltd.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include <dali/devel-api/text-abstraction/script.h>
24 namespace TextAbstraction
// Code points that the character predicates in this file compare against.

const unsigned int WHITE_SPACE_THRESHOLD = 0x21; ///< Exclusive upper bound: every character below 0x21 is considered a white space.

// Characters tested by IsNewParagraph().
const unsigned int CHAR_FL  = 0x000A; ///< LF. Line feed, new line. (The identifier reads FL; the code point is U+000A.)
const unsigned int CHAR_VT  = 0x000B; ///< Vertical tab.
const unsigned int CHAR_FF  = 0x000C; ///< NP Form feed, new page.
const unsigned int CHAR_NEL = 0x0085; ///< Next line.
const unsigned int CHAR_LS  = 0x2028; ///< Line separator.
const unsigned int CHAR_PS  = 0x2029; ///< Paragraph separator.

// Individual characters, each tested by its own Is...() predicate.
const unsigned int CHAR_ZWS  = 0x200B; ///< Zero width space.
const unsigned int CHAR_ZWNJ = 0x200C; ///< Zero width non joiner.
const unsigned int CHAR_ZWJ  = 0x200D; ///< Zero width joiner.
const unsigned int CHAR_LTRM = 0x200E; ///< Left to Right Mark.
const unsigned int CHAR_RTLM = 0x200F; ///< Right to Left Mark.
const unsigned int CHAR_TS   = 0x2009; ///< Thin Space.
45 bool IsRightToLeftScript( Script script )
47 return ( ( ARABIC == script ) ||
48 ( HEBREW == script ) );
// Maps a character (a Unicode code point) to the Script it belongs to by testing it against
// fixed code point ranges. The comment table directly below lists the ranges the function knows
// about, grouped by script; the tests further down use the same range boundaries.
//
// NOTE(review): in this view only the conditions are visible for most ranges. The statement
// guarded by each condition (presumably a `return <script>;`), the braces and any else branches
// are not shown, so the script returned for each range -- and the value returned when no range
// matches -- must be confirmed against the complete file.
51 Script GetCharacterScript( Character character )
54 // 0x0000 - 0x007f C0 Controls and Basic Latin
55 // 0x0080 - 0x00ff C1 Controls and Latin-1 Supplement
56 // 0x0100 - 0x017f Latin Extended-A
57 // 0x0180 - 0x024f Latin Extended-B
58 // 0x0250 - 0x02af IPA Extensions
59 // 0x02b0 - 0x02ff Spacing Modifier Letters
60 // 0x1d00 - 0x1d7f Phonetic Extensions
61 // 0x1d80 - 0x1dbf Phonetic Extensions Supplement
62 // 0x1e00 - 0x1eff Latin Extended Additional
63 // 0x2070 - 0x209f Superscripts and Subscripts
64 // 0x2100 - 0x214f Letterlike symbols
65 // 0x2150 - 0x218f Number Forms
66 // 0x2c60 - 0x2c7f Latin Extended-C
67 // 0xa720 - 0xa7ff Latin Extended-D
68 // 0xab30 - 0xab6f Latin Extended-E
69 // 0xfb00 - 0xfb06 Latin Alphabetic Presentation Forms
70 // 0xff00 - 0xffef Halfwidth and Fullwidth Forms
73 // 0x0900 - 0x097f Devanagari
74 // 0x0980 - 0x09ff Bengali
75 // 0x0a00 - 0x0a7f Gurmukhi
76 // 0x0a80 - 0x0aff Gujarati
77 // 0x0b00 - 0x0b7f Oriya
78 // 0x0b80 - 0x0bff Tamil
79 // 0x0c00 - 0x0c7f Telugu
80 // 0x0c80 - 0x0cff Kannada
81 // 0x0d00 - 0x0d7f Malayalam
84 // 0x0d80 - 0x0dff Sinhala
87 // 0x0600 - 0x06ff Arabic
88 // 0x0750 - 0x077f Arabic Supplement
89 // 0x08A0 - 0x08ff Arabic Extended-A
90 // 0xfb50 - 0xfdff Arabic Presentation Forms-A
91 // 0xfe70 - 0xfeff Arabic Presentation Forms-B
92 // 0x1ee00 - 0x1eeff Arabic Mathematical Alphabetic Symbols
94 // CJK (Chinese, Japanese and Korean) and Vietnamese script.
95 // 0x2e80 - 0x2eff CJK Radicals Supplement
96 // 0x2f00 - 0x2fdf Kangxi Radicals
97 // 0x3000 - 0x303f CJK Symbols and Punctuation
98 // 0x3200 - 0x32ff Enclosed CJK Letters and Months
99 // 0x3400 - 0x4dbf CJK Unified Ideographs Extension A
100 // 0x4e00 - 0x62ff CJK Unified Ideographs
101 // 0x6300 - 0x77ff CJK Unified Ideographs
102 // 0x7800 - 0x8cff CJK Unified Ideographs
103 // 0x8d00 - 0x9fff CJK Unified Ideographs
104 // 0x20000 - 0x215ff CJK Unified Ideographs Extension B
105 // 0x21600 - 0x230ff CJK Unified Ideographs Extension B
106 // 0x23100 - 0x245ff CJK Unified Ideographs Extension B
107 // 0x24600 - 0x260ff CJK Unified Ideographs Extension B
108 // 0x26100 - 0x275ff CJK Unified Ideographs Extension B
109 // 0x27600 - 0x290ff CJK Unified Ideographs Extension B
110 // 0x29100 - 0x2a6df CJK Unified Ideographs Extension B
111 // 0x2a700 - 0x2b73f CJK Unified Ideographs Extension C
112 // 0x2b740 - 0x2b81f CJK Unified Ideographs Extension D
115 // 0x3040 - 0x309f Hiragana
116 // 0x30a0 - 0x30ff Katakana
119 // 0x1100 - 0x11ff Hangul jamo
120 // 0x3130 - 0x318f Hangul Compatibility Jamo
121 // 0xa960 - 0xa97f Hangul Jamo Extended-A
122 // 0xac00 - 0xd7af Hangul Syllables
123 // 0xd7b0 - 0xd7ff Hangul Jamo Extended-B
126 // 0x1780 - 0x17ff Khmer
127 // 0x19e0 - 0x19ff Khmer Symbols
130 // 0x0e80 - 0x0eff Lao
133 // 0x0e00 - 0x0e7f Thai
136 // 0x1000 - 0x109f Myanmar
139 // 0x0591 - 0x05f4 Hebrew
140 // 0xfb1d - 0xfb4f Hebrew subset of Alphabetic Presentation Forms
142 // The Emoji which map to standardized Unicode characters
143 // 1. Emoticons ( 1F601 - 1F64F )
144 // 2. Dingbats ( 2702 - 27B0 )
145 // 3. Transport and map symbols ( 1F680 - 1F6C0 )
146 // 4. Enclosed characters ( 24C2 - 1F251 )
147 // 5. Uncategorized :-S
148 // 6. Additional Emoticons ( 1F600 - 1F636 )
149 // 6b. Additional transport and map symbols ( 1F681 - 1F6C5 )
150 // 6c. Other additional symbols ( 1F30D - 1F567 )
// Classification starts here. The one-sided `character <= 0x....` tests (0x0cff, 0x09ff, 0x077f,
// 0x0b7f, 0x2c7f, 0x1eff, 0xfdff) appear to split the code space into bands so that only the
// two-sided range tests of one band have to be evaluated -- TODO confirm the nesting against
// the complete file, it is not visible here.
152 if( character <= 0x0cff )
154 if( character <= 0x09ff )
156 if( character <= 0x077f )
// The two single code point EMOJI tests come before the `<= 0x02ff` test, whose range
// (0x0000 - 0x02ff) contains both 0x00A9 and 0x00AE.
158 if( character == 0x00A9 )
160 return EMOJI; // 5. Uncategorized: copyright sign
162 if( character == 0x00AE )
164 return EMOJI; // 5. Uncategorized: registered sign
166 if( character <= 0x02ff )
170 if( ( 0x0591 <= character ) && ( character <= 0x05f4 ) )
174 if( ( 0x0600 <= character ) && ( character <= 0x06ff ) )
178 if( ( 0x0750 <= character ) && ( character <= 0x077f ) )
185 if( ( 0x08A0 <= character ) && ( character <= 0x08ff ) )
189 if( ( 0x0900 <= character ) && ( character <= 0x097f ) )
193 if( ( 0x0980 <= character ) && ( character <= 0x09ff ) )
201 if( character <= 0x0b7f )
203 if( ( 0x0a00 <= character ) && ( character <= 0x0a7f ) )
207 if( ( 0x0a80 <= character ) && ( character <= 0x0aff ) )
211 if( ( 0x0b00 <= character ) && ( character <= 0x0b7f ) )
218 if( ( 0x0b80 <= character ) && ( character <= 0x0bff ) )
222 if( ( 0x0c00 <= character ) && ( character <= 0x0c7f ) )
226 if( ( 0x0c80 <= character ) && ( character <= 0x0cff ) )
235 if( character <= 0x2c7f )
237 if( character <= 0x1eff )
239 if( ( 0x0d00 <= character ) && ( character <= 0x0d7f ) )
243 if( ( 0x0d80 <= character ) && ( character <= 0x0dff ) )
247 if( ( 0x0e00 <= character ) && ( character <= 0x0e7f ) )
251 if( ( 0x0e80 <= character ) && ( character <= 0x0eff ) )
255 if( ( 0x1000 <= character ) && ( character <= 0x109f ) )
259 if( ( 0x1100 <= character ) && ( character <= 0x11ff ) )
263 if( ( 0x1780 <= character ) && ( character <= 0x17ff ) )
267 if( ( 0x19e0 <= character ) && ( character <= 0x19ff ) )
// A single test covers 0x1d00 - 0x1eff, i.e. it also spans the gap between the three
// Latin ranges listed for this area in the table above (0x1d00 - 0x1dbf and 0x1e00 - 0x1eff).
271 if( ( 0x1d00 <= character ) && ( character <= 0x1eff ) )
278 if( character == 0x203c )
280 return EMOJI; // 5. Uncategorized: double exclamation mark
282 if( character == 0x2049 )
284 return EMOJI; // 5. Uncategorized: exclamation question mark
286 if( ( 0x2070 <= character ) && ( character <= 0x209f ) )
290 if( character == 0x20e3 )
292 return EMOJI; // 5. Uncategorized: combining enclosing keycap
// 0x2122 and 0x2139 lie inside 0x2100 - 0x218f, so they are tested before that range.
294 if( character == 0x2122 )
296 return EMOJI; // 5. Uncategorized: trade mark sign
298 if( character == 0x2139 )
300 return EMOJI; // 5. Uncategorized: information source
302 if( ( 0x2100 <= character ) && ( character <= 0x218f ) )
// NOTE(review): one contiguous range from U+2194 to U+2B55 is tested here. It is much wider
// than the Dingbats range ( 2702 - 27B0 ) listed above and covers every code point in between;
// presumably all of them are classified as EMOJI -- confirm that this is intended.
306 // U+2194 5. Uncategorized: left right arrow
307 // U+2B55 5. Uncategorized: heavy large circle
308 if( ( 0x2194 <= character ) && ( character <= 0x2B55 ) )
312 if( ( 0x2c60 <= character ) && ( character <= 0x2c7f ) )
320 if( character <= 0xfdff )
322 if( ( 0x2e80 <= character ) && ( character <= 0x2eff ) )
326 if( ( 0x2f00 <= character ) && ( character <= 0x2fdf ) )
330 if( ( 0x3000 <= character ) && ( character <= 0x303f ) )
334 if( ( 0x3040 <= character ) && ( character <= 0x309f ) )
338 if( ( 0x30a0 <= character ) && ( character <= 0x30ff ) )
342 if( ( 0x3130 <= character ) && ( character <= 0x318f ) )
346 if( ( 0x3200 <= character ) && ( character <= 0x32ff ) )
350 if( ( 0x3400 <= character ) && ( character <= 0x4dbf ) )
354 if( ( 0x4e00 <= character ) && ( character <= 0x62ff ) )
358 if( ( 0x6300 <= character ) && ( character <= 0x77ff ) )
362 if( ( 0x7800 <= character ) && ( character <= 0x8cff ) )
366 if( ( 0x8d00 <= character ) && ( character <= 0x9fff ) )
370 if( ( 0xa720 <= character ) && ( character <= 0xa7ff ) )
374 if( ( 0xa960 <= character ) && ( character <= 0xa97f ) )
378 if( ( 0xab30 <= character ) && ( character <= 0xab6f ) )
382 if( ( 0xac00 <= character ) && ( character <= 0xd7af ) )
386 if( ( 0xd7b0 <= character ) && ( character <= 0xd7ff ) )
390 if( ( 0xfb00 <= character ) && ( character <= 0xfb06 ) )
394 if( ( 0xfb1d <= character ) && ( character <= 0xfb4f ) )
398 if( ( 0xfb50 <= character ) && ( character <= 0xfdff ) )
405 if( ( 0xfe70 <= character ) && ( character <= 0xfeff ) )
409 if( ( 0xff00 <= character ) && ( character <= 0xffef ) )
413 if( ( 0x1ee00 <= character ) && ( character <= 0x1eeff ) )
// NOTE(review): as with U+2194 - U+2B55, the emoji groups listed above that start at
// U+1F170 or later are folded into one contiguous range ending at U+1F6C5 -- confirm that
// every code point in between is meant to be classified the same way.
417 // U+1f170 4. Enclosed characters: negative squared latin capital letter A
418 // U+1f6c5 6b. Additional transport and map symbols
419 if( ( 0x1f170 <= character ) && ( character <= 0x1f6c5 ) )
423 if( ( 0x20000 <= character ) && ( character <= 0x215ff ) )
427 if( ( 0x21600 <= character ) && ( character <= 0x230ff ) )
431 if( ( 0x23100 <= character ) && ( character <= 0x245ff ) )
435 if( ( 0x24600 <= character ) && ( character <= 0x260ff ) )
439 if( ( 0x26100 <= character ) && ( character <= 0x275ff ) )
443 if( ( 0x27600 <= character ) && ( character <= 0x290ff ) )
447 if( ( 0x29100 <= character ) && ( character <= 0x2a6df ) )
451 if( ( 0x2a700 <= character ) && ( character <= 0x2b73f ) )
455 if( ( 0x2b740 <= character ) && ( character <= 0x2b81f ) )
466 bool IsWhiteSpace( Character character )
468 return character < WHITE_SPACE_THRESHOLD;
471 bool IsNewParagraph( Character character )
473 return ( ( CHAR_FL == character ) ||
474 ( CHAR_VT == character ) ||
475 ( CHAR_FF == character ) ||
476 ( CHAR_NEL == character ) ||
477 ( CHAR_LS == character ) ||
478 ( CHAR_PS == character ) );
481 bool IsZeroWidthNonJoiner( Character character )
483 return CHAR_ZWNJ == character;
486 bool IsZeroWidthJoiner( Character character )
488 return CHAR_ZWJ == character;
491 bool IsZeroWidthSpace( Character character )
493 return CHAR_ZWS == character;
496 bool IsLeftToRightMark( Character character )
498 return CHAR_LTRM == character;
501 bool IsRightToLeftMark( Character character )
503 return CHAR_RTLM == character;
506 bool IsThinSpace( Character character )
508 return CHAR_TS == character;
511 } // namespace TextAbstraction