2 * Copyright (c) 2015 Samsung Electronics Co., Ltd.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include <dali/devel-api/text-abstraction/script.h>
24 namespace TextAbstraction
// Code points used by the character classification helpers, listed in ascending order.

// Upper bound (exclusive) of the range treated as white space.
const unsigned int WHITE_SPACE_THRESHOLD = 0x21u; ///< Every code point below 0x21 counts as a white space.

// Characters that terminate a paragraph.
const unsigned int CHAR_LF  = 0x000Au; ///< Line feed (new line).
const unsigned int CHAR_VT  = 0x000Bu; ///< Vertical tab.
const unsigned int CHAR_FF  = 0x000Cu; ///< Form feed (new page).
const unsigned int CHAR_CR  = 0x000Du; ///< Carriage return.
const unsigned int CHAR_NEL = 0x0085u; ///< Next line.

// General punctuation block: spaces, joiners and directional marks.
const unsigned int CHAR_TS   = 0x2009u; ///< Thin space.
const unsigned int CHAR_ZWS  = 0x200Bu; ///< Zero width space.
const unsigned int CHAR_ZWNJ = 0x200Cu; ///< Zero width non joiner.
const unsigned int CHAR_ZWJ  = 0x200Du; ///< Zero width joiner.
const unsigned int CHAR_LTRM = 0x200Eu; ///< Left-to-right mark.
const unsigned int CHAR_RTLM = 0x200Fu; ///< Right-to-left mark.

// Explicit line and paragraph separators.
const unsigned int CHAR_LS = 0x2028u; ///< Line separator.
const unsigned int CHAR_PS = 0x2029u; ///< Paragraph separator.
46 bool IsRightToLeftScript( Script script )
48 return ( ( ARABIC == script ) ||
49 ( HEBREW == script ) );
// Classifies a single code point and returns the Script it is filed under.
// The character is first checked with IsCommonScript(); after that it is compared against
// fixed code point ranges, which are grouped under coarse upper-bound checks
// (0x0cff, 0x09ff, 0x077f, 0x0b7f, 0x2c7f, 0x1eff, 0xfdff) so that only a subset of the
// range tests needs to run for any given character.
// The comment list below records which Unicode blocks belong to each script and which
// sub-ranges of the Latin blocks are deliberately excluded from the LATIN script.
52 Script GetCharacterScript( Character character )
54 // Latin script: It contains punctuation characters and symbols which are not part of the latin script. https://en.wikipedia.org/wiki/Latin_script_in_Unicode
55 // 0x0000 - 0x007f C0 Controls and Basic Latin
57 // ASCII digits (not part of LATIN script):
60 // ASCII punctuation and symbols (not part of LATIN script):
66 // Controls (not part of LATIN script):
69 // 0x0080 - 0x00ff C1 Controls and Latin-1 Supplement
71 // Controls (not part of LATIN script):
74 // Punctuations and symbols (not part of LATIN script):
77 // Mathematical operators (not part of LATIN script):
81 // 0x0100 - 0x017f Latin Extended-A
82 // 0x0180 - 0x024f Latin Extended-B
83 // 0x0250 - 0x02af IPA Extensions
84 // 0x02b0 - 0x02ff Spacing Modifier Letters
86 // Punctuation (not part of LATIN script):
89 // 0x1d00 - 0x1d7f Phonetic Extensions
91 // Uralic Phonetic (not part of LATIN script):
94 // Subscripts and superscripts
99 // 0x1d80 - 0x1dbf Phonetic Extensions Supplement
101 // 0x1DBF (subscript or superscript. Not part of LATIN script )
103 // 0x1e00 - 0x1eff Latin Extended Additional
104 // 0x2070 - 0x209f Superscripts and Subscripts
106 // 0x2070 (not part of LATIN script)
107 // 0x2074 - 0x207E (not part of LATIN script)
109 // 0x2100 - 0x214f Letterlike symbols (not part of LATIN script)
111 // 0x212A - 0x212B (are part of LATIN script)
112 // 0x2132 (are part of LATIN script)
113 // 0x214E (are part of LATIN script)
115 // 0x2150 - 0x2189 Number Forms
117 // 0x2150 - 0x215F Fractions (not part of LATIN script)
118 // 0x2189 Fractions (not part of LATIN script)
120 // 0x2c60 - 0x2c7f Latin Extended-C
121 // 0xa720 - 0xa7ff Latin Extended-D
123 // 0xA720 - 0xA721 Uralic Phonetic (not part of LATIN script)
124 // 0xA788 (not part of LATIN script)
125 // 0xA789 - 0xA78A Budu (not part of LATIN script)
127 // 0xab30 - 0xab6f Latin Extended-E
129 // 0xfb00 - 0xfb06 Latin Alphabetic Presentation Forms
130 // 0xff00 - 0xffef Halfwidth and Fullwidth Forms
132 // 0xFF00 - 0xFF20 HWFW Symbols (not part of LATIN script)
133 // 0xFF3B - 0xFF40 HWFW Symbols (not part of LATIN script)
134 // 0xFF5B - 0xFFEF HWFW Symbols (not part of LATIN script)
137 // 0x0900 - 0x097f Devanagari
138 // 0x0980 - 0x09ff Bengali
139 // 0x0a00 - 0x0a7f Gurmukhi
140 // 0x0a80 - 0x0aff Gujarati
141 // 0x0b00 - 0x0b7f Oriya
142 // 0x0b80 - 0x0bff Tamil
143 // 0x0c00 - 0x0c7f Telugu
144 // 0x0c80 - 0x0cff Kannada
145 // 0x0d00 - 0x0d7f Malayalam
148 // 0x0d80 - 0x0dff Sinhala
151 // 0x0600 - 0x06ff Arabic
152 // 0x0750 - 0x077f Arabic Supplement
153 // 0x08A0 - 0x08ff Arabic Extended-A
154 // 0xfb50 - 0xfdff Arabic Presentation Forms-A
155 // 0xfe70 - 0xfeff Arabic Presentation Forms-B
156 // 0x1ee00 - 0x1eeff Arabic Mathematical Alphabetic Symbols
158 // CJK (Chinese, Japanese and Korean) and Vietnamese script.
159 // 0x2e80 - 0x2eff CJK Radicals Supplement
160 // 0x2f00 - 0x2fdf Kangxi Radicals
161 // 0x3000 - 0x303f CJK Symbols and Punctuation
162 // 0x3200 - 0x32ff Enclosed CJK Letters and Months
163 // 0x3400 - 0x4dbf CJK Unified Ideographs Extension A
164 // 0x4e00 - 0x62ff CJK Unified Ideographs
165 // 0x6300 - 0x77ff CJK Unified Ideographs
166 // 0x7800 - 0x8cff CJK Unified Ideographs
167 // 0x8d00 - 0x9fff CJK Unified Ideographs
168 // 0x20000 - 0x215ff CJK Unified Ideographs Extension B
169 // 0x21600 - 0x230ff CJK Unified Ideographs Extension B
170 // 0x23100 - 0x245ff CJK Unified Ideographs Extension B
171 // 0x24600 - 0x260ff CJK Unified Ideographs Extension B
172 // 0x26100 - 0x275ff CJK Unified Ideographs Extension B
173 // 0x27600 - 0x290ff CJK Unified Ideographs Extension B
174 // 0x29100 - 0x2a6df CJK Unified Ideographs Extension B
175 // 0x2a700 - 0x2b73f CJK Unified Ideographs Extension C
176 // 0x2b740 - 0x2b81f CJK Unified Ideographs Extension D
179 // 0x3040 - 0x309f Hiragana
180 // 0x30a0 - 0x30ff Katakana
183 // 0x1100 - 0x11ff Hangul jamo
184 // 0x3130 - 0x318f Hangul Compatibility Jamo
185 // 0xa960 - 0xa97f Hangul Jamo Extended-A
186 // 0xac00 - 0xd7af Hangul Syllables
187 // 0xd7b0 - 0xd7ff Hangul Jamo Extended-B
190 // 0x3100 - 0x312f Bopomofo
191 // 0x31a0 - 0x31bf Bopomofo Extended
194 // 0x1780 - 0x17ff Khmer
195 // 0x19e0 - 0x19ff Khmer Symbols
198 // 0x0e80 - 0x0eff Lao
201 // 0x0e00 - 0x0e7f Thai
204 // 0x1000 - 0x109f Myanmar
207 // 0x0591 - 0x05f4 Hebrew
208 // 0xfb1d - 0xfb4f Hebrew subset of Alphabetic Presentation Forms
211 // 0x0400 - 0x04ff Cyrillic
212 // 0x0500 - 0x052f Cyrillic supplement
213 // 0x2de0 - 0x2dff Cyrillic Extended-A
214 // 0xa640 - 0xa69f Cyrillic Extended-B
217 // 0x10a0 - 0x10ff Georgian
218 // 0x2d00 - 0x2d2f Georgian supplement
221 // 0x0370 - 0x03ff Greek & Coptic
222 // 0x1f00 - 0x1fff Greek Extended
225 // 0x0530 - 0x058f Armenian
226 // 0xfb13 - 0xfb17 Armenian subset of Alphabetic presentation forms
229 // 0xa980 - 0xa9fd Javanese
232 // 0x1b80 - 0x1bbf Sundanese
233 // 0x1cc0 - 0x1ccf Sundanese supplement
235 // Ge'ez script (Ethiopic)
236 // 0x1200 - 0x137f Ethiopic
237 // 0x1380 - 0x139f Ethiopic supplement
238 // 0x2d80 - 0x2ddf Ethiopic Extended
239 // 0xab00 - 0xab2f Ethiopic Extended-A
242 // 0x1700 - 0x171f Baybayin
245 // 0x1c50 - 0x1c7f Ol Chiki
248 // 0xabc0 - 0xabff Meetei Mayek
249 // 0xaae0 - 0xaaff Meetei Mayek Extensions
251 // The Emoji which map to standardized Unicode characters
252 // 1. Emoticons ( 1F601 - 1F64F )
253 // 2. Dingbats ( 2700 - 27BF )
254 // 3. Transport and map symbols ( 1F680 - 1F6C0 )
255 // 4. Enclosed characters ( 24C2 - 1F251 )
256 // 5. Uncategorized :-S
257 // 6. Additional Emoticons ( 1F600 - 1F636 )
258 // 6b. Additional transport and map symbols ( 1F680 - 1F6FF ): http://unicode.org/charts/PDF/U1F680.pdf
259 // 6c. Other additional symbols ( 1F30D - 1F567 )
260 // 7. Supplemental Symbols and Pictographs ( 1F900–1F9FF ): http://unicode.org/charts/PDF/U1F900.pdf
262 // Symbols. Work around for these symbols.
// White spaces, zero width characters, directional marks, the thin space and paragraph
// separators are recognised by IsCommonScript() before any range test is made.
275 if( IsCommonScript( character ) )
// Coarse upper-bound checks: each one narrows the set of range tests that follow it.
280 if( character <= 0x0cff )
282 if( character <= 0x09ff )
284 if( character <= 0x077f )
// ASCII digits.
286 if( ( 0x0030 <= character ) && ( character <= 0x0039 ) )
// ASCII punctuation and symbols, tested as four separate sub-ranges.
290 if( character <= 0x007E )
292 if( ( 0x0020 <= character ) && ( character <= 0x002F ) )
296 if( ( 0x003A <= character ) && ( character <= 0x0040 ) )
300 if( ( 0x005B <= character ) && ( character <= 0x0060 ) )
304 if( ( 0x007B <= character ) && ( character <= 0x007E ) )
309 if( ( 0x007F <= character ) && ( character <= 0x009F ) )
311 // 0x007F is actually part of C0 Controls and Basic Latin. However, it is the last and only control character of its block,
312 // and the following characters of the next block are consecutive.
// Latin-1 punctuation and symbols; the copyright and registered signs are singled out as EMOJI.
315 if( ( 0x00A0 <= character ) && ( character <= 0x00BF ) )
317 if( character == 0x00A9 )
319 return EMOJI; // 5. Uncategorized: copyright sign
321 if( character == 0x00AE )
323 return EMOJI; // 5. Uncategorized: registered sign
// Individual code points 0x00D7 and 0x00F7 inside the Latin-1 Supplement block.
328 if( character == 0x00D7 )
332 if( character == 0x00F7 )
336 if( character <= 0x02ff )
338 if( ( 0x02B9 <= character ) && ( character <= 0x02BF ) )
345 if( ( 0x0370 <= character ) && ( character <= 0x03ff ) )
349 if( ( 0x0400 <= character ) && ( character <= 0x04ff ) )
353 if( ( 0x0500 <= character ) && ( character <= 0x052f ) )
357 if( ( 0x0530 <= character ) && ( character <= 0x058f ) )
361 if( ( 0x0591 <= character ) && ( character <= 0x05f4 ) )
365 if( ( 0x0600 <= character ) && ( character <= 0x06ff ) )
369 if( ( 0x0750 <= character ) && ( character <= 0x077f ) )
376 if( ( 0x08A0 <= character ) && ( character <= 0x08ff ) )
380 if( ( 0x0900 <= character ) && ( character <= 0x097f ) )
384 if( ( 0x0980 <= character ) && ( character <= 0x09ff ) )
392 if( character <= 0x0b7f )
394 if( ( 0x0a00 <= character ) && ( character <= 0x0a7f ) )
398 if( ( 0x0a80 <= character ) && ( character <= 0x0aff ) )
402 if( ( 0x0b00 <= character ) && ( character <= 0x0b7f ) )
409 if( ( 0x0b80 <= character ) && ( character <= 0x0bff ) )
413 if( ( 0x0c00 <= character ) && ( character <= 0x0c7f ) )
417 if( ( 0x0c80 <= character ) && ( character <= 0x0cff ) )
// Second coarse group: code points up to 0x2c7f.
426 if( character <= 0x2c7f )
428 if( character <= 0x1eff )
430 if( ( 0x0d00 <= character ) && ( character <= 0x0d7f ) )
434 if( ( 0x0d80 <= character ) && ( character <= 0x0dff ) )
438 if( ( 0x0e00 <= character ) && ( character <= 0x0e7f ) )
442 if( ( 0x0e80 <= character ) && ( character <= 0x0eff ) )
446 if( ( 0x1000 <= character ) && ( character <= 0x109f ) )
450 if( ( 0x10a0 <= character ) && ( character <= 0x10ff ) )
454 if( ( 0x1100 <= character ) && ( character <= 0x11ff ) )
458 if( ( 0x1200 <= character ) && ( character <= 0x137f ) )
462 if( ( 0x1380 <= character ) && ( character <= 0x139f ) )
466 if( ( 0x1700 <= character ) && ( character <= 0x171f ) )
470 if( ( 0x1780 <= character ) && ( character <= 0x17ff ) )
474 if( ( 0x19e0 <= character ) && ( character <= 0x19ff ) )
478 if( ( 0x1b80 <= character ) && ( character <= 0x1bbf ) )
482 if( ( 0x1c50 <= character ) && ( character <= 0x1c7f ) )
486 if( ( 0x1cc0 <= character ) && ( character <= 0x1ccf ) )
// Phonetic Extensions through Latin Extended Additional, with the non-Latin sub-ranges tested individually.
490 if( ( 0x1d00 <= character ) && ( character <= 0x1eff ) )
492 if( ( 0x1D26 <= character ) && ( character <= 0x1D2B ) )
496 if( ( 0x1D5D <= character ) && ( character <= 0x1D61 ) )
500 if( ( 0x1D66 <= character ) && ( character <= 0x1D6A ) )
504 if( character == 0x1D78 )
508 if( character == 0x1DBF)
518 if( ( 0x1f00 <= character ) && ( character <= 0x1fff ) )
522 if( character == 0x203c )
524 return EMOJI; // 5. Uncategorized: double exclamation mark
526 if( character == 0x2049 )
528 return EMOJI; // 5. Uncategorized: exclamation question mark
530 if( ( 0x2070 <= character ) && ( character <= 0x209f ) )
532 if( character == 0x2070 )
536 if( ( 0x2074 <= character ) && ( character <= 0x207E ) )
543 if( character == 0x20e3 )
545 return EMOJI; // 5. Uncategorized: combining enclosing keycap
547 if( character == 0x2122 )
549 return EMOJI; // 5. Uncategorized: trade mark sign
551 if( character == 0x2139 )
553 return EMOJI; // 5. Uncategorized: information source
// Letterlike symbols and Number Forms; the code points that belong to the LATIN script are tested one by one.
555 if( ( 0x2100 <= character ) && ( character <= 0x2189 ) )
557 if( ( 0x2100 <= character ) && ( character <= 0x214f ) )
559 if( ( 0x212A <= character ) && ( character <= 0x212B ) )
563 if( character == 0x2132 )
567 if( character == 0x214E )
574 if( ( 0x2150 <= character ) && ( character <= 0x215F ) )
578 if( character == 0x2189 )
// Individual symbol code points handled as a work around (see "Symbols" in the list above).
587 if( ( 0x25cb == character ) ||
588 ( 0x25cf == character ) ||
589 ( 0x25a1 == character ) )
594 if( 0x25a0 == character )
599 if( ( 0x2664 == character ) ||
600 ( 0x2661 == character ) ||
601 ( 0x2662 == character ) ||
602 ( 0x2667 == character ) )
607 if( ( 0x2606 == character ) ||
608 ( 0x25aa == character ) )
613 if( 0x262a == character )
618 // U+2194 5. Uncategorized: left right arrow
619 // U+2B55 5. Uncategorized: heavy large circle
// The test below spans the whole interval between the two code points named above, not only its end points.
620 if( ( 0x2194 <= character ) && ( character <= 0x2B55 ) )
624 if( ( 0x2c60 <= character ) && ( character <= 0x2c7f ) )
// Third coarse group: code points up to 0xfdff.
632 if( character <= 0xfdff )
634 if( ( 0x2d00 <= character ) && ( character <= 0x2d2f ) )
638 if( ( 0x2d80 <= character ) && ( character <= 0x2ddf ) )
642 if( ( 0x2de0 <= character ) && ( character <= 0x2dff ) )
646 if( ( 0x2e80 <= character ) && ( character <= 0x2eff ) )
650 if( ( 0x2f00 <= character ) && ( character <= 0x2fdf ) )
654 if( ( 0x3000 <= character ) && ( character <= 0x303f ) )
658 if( ( 0x3040 <= character ) && ( character <= 0x309f ) )
662 if( ( 0x30a0 <= character ) && ( character <= 0x30ff ) )
666 if( ( 0x3100 <= character ) && ( character <= 0x312f ) )
670 if( ( 0x3130 <= character ) && ( character <= 0x318f ) )
674 if( ( 0x31a0 <= character ) && ( character <= 0x31bf ) )
678 if( ( 0x3200 <= character ) && ( character <= 0x32ff ) )
682 if( ( 0x3400 <= character ) && ( character <= 0x4dbf ) )
686 if( ( 0x4e00 <= character ) && ( character <= 0x62ff ) )
690 if( ( 0x6300 <= character ) && ( character <= 0x77ff ) )
694 if( ( 0x7800 <= character ) && ( character <= 0x8cff ) )
698 if( ( 0x8d00 <= character ) && ( character <= 0x9fff ) )
702 if( ( 0xa640 <= character ) && ( character <= 0xa69f ) )
// Latin Extended-D; the five code points listed as "not part of LATIN script" are tested individually.
706 if( ( 0xa720 <= character ) && ( character <= 0xa7ff ) )
708 if( character == 0xA720 )
712 if( character == 0xA721 )
716 if( character == 0xA788 )
718 return NON_LATIN_LED;
720 if( character == 0xA789 )
722 return NON_LATIN_LED;
724 if( character == 0xA78A )
726 return NON_LATIN_LED;
731 if( ( 0xa960 <= character ) && ( character <= 0xa97f ) )
735 if( ( 0xa980 <= character ) && ( character <= 0xa9fd ) )
739 if( ( 0xab00 <= character ) && ( character <= 0xab2f ) )
743 if( ( 0xab30 <= character ) && ( character <= 0xab6f ) )
747 if( ( 0xaae0 <= character ) && ( character <= 0xaaff ) )
751 if( ( 0xabc0 <= character ) && ( character <= 0xabff ) )
755 if( ( 0xac00 <= character ) && ( character <= 0xd7af ) )
759 if( ( 0xd7b0 <= character ) && ( character <= 0xd7ff ) )
763 if( ( 0xfb00 <= character ) && ( character <= 0xfb06 ) )
767 if( ( 0xfb13 <= character ) && ( character <= 0xfb17 ) )
771 if( ( 0xfb1d <= character ) && ( character <= 0xfb4f ) )
775 if( ( 0xfb50 <= character ) && ( character <= 0xfdff ) )
// Remaining range tests for code points above 0xfdff.
782 if( ( 0xfe70 <= character ) && ( character <= 0xfeff ) )
// Halfwidth and Fullwidth Forms, with the three symbol sub-ranges tested separately.
786 if( ( 0xff00 <= character ) && ( character <= 0xffef ) )
788 if( ( 0xFF00 <= character ) && ( character <= 0xFF20 ) )
792 if( ( 0xFF3B <= character ) && ( character <= 0xFF40 ) )
796 if( ( 0xFF5B <= character ) && ( character <= 0xFFEF ) )
803 if( ( 0x1ee00 <= character ) && ( character <= 0x1eeff ) )
807 // U+1f170 4. Enclosed characters: negative squared latin capital letter A
808 // U+1f6ff 6b. Additional transport and map symbols
// As above, the test covers every code point between the two named end points.
809 if( ( 0x1f170 <= character ) && ( character <= 0x1f6ff ) )
813 // 7. Supplemental Symbols and Pictographs
814 if( ( 0x1f900 <= character ) && ( character <= 0x1f9ff ) )
// CJK Unified Ideographs Extensions B, C and D, split into the same sub-ranges as the list above.
818 if( ( 0x20000 <= character ) && ( character <= 0x215ff ) )
822 if( ( 0x21600 <= character ) && ( character <= 0x230ff ) )
826 if( ( 0x23100 <= character ) && ( character <= 0x245ff ) )
830 if( ( 0x24600 <= character ) && ( character <= 0x260ff ) )
834 if( ( 0x26100 <= character ) && ( character <= 0x275ff ) )
838 if( ( 0x27600 <= character ) && ( character <= 0x290ff ) )
842 if( ( 0x29100 <= character ) && ( character <= 0x2a6df ) )
846 if( ( 0x2a700 <= character ) && ( character <= 0x2b73f ) )
850 if( ( 0x2b740 <= character ) && ( character <= 0x2b81f ) )
861 bool IsWhiteSpace( Character character )
863 return character < WHITE_SPACE_THRESHOLD;
866 bool IsNewParagraph( Character character )
868 return ( ( CHAR_LF == character ) ||
869 ( CHAR_VT == character ) ||
870 ( CHAR_FF == character ) ||
871 ( CHAR_CR == character ) ||
872 ( CHAR_NEL == character ) ||
873 ( CHAR_LS == character ) ||
874 ( CHAR_PS == character ) );
877 bool IsZeroWidthNonJoiner( Character character )
879 return CHAR_ZWNJ == character;
882 bool IsZeroWidthJoiner( Character character )
884 return CHAR_ZWJ == character;
887 bool IsZeroWidthSpace( Character character )
889 return CHAR_ZWS == character;
892 bool IsLeftToRightMark( Character character )
894 return CHAR_LTRM == character;
897 bool IsRightToLeftMark( Character character )
899 return CHAR_RTLM == character;
902 bool IsThinSpace( Character character )
904 return CHAR_TS == character;
907 bool IsCommonScript( Character character )
909 return ( IsWhiteSpace( character ) ||
910 IsZeroWidthNonJoiner( character ) ||
911 IsZeroWidthJoiner( character ) ||
912 IsZeroWidthSpace( character ) ||
913 IsLeftToRightMark( character ) ||
914 IsRightToLeftMark( character ) ||
915 IsThinSpace( character ) ||
916 IsNewParagraph( character ) );
919 bool HasLigatureMustBreak( Script script )
921 return ( ( LATIN == script ) ||
922 ( ARABIC == script ) );
925 } // namespace TextAbstraction