2 * Copyright (c) 2021 Samsung Electronics Co., Ltd.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include <dali/devel-api/text-abstraction/script.h>
23 namespace TextAbstraction
27 //TODO: Move the below defined characters to "defined-characters.h"
// White space and paragraph/line break characters.
28 constexpr unsigned int WHITE_SPACE_THRESHOLD = 0x21; ///< All characters below 0x21 are considered white spaces.
29 constexpr unsigned int CHAR_SPACE = 0x20; ///< Space.
30 constexpr unsigned int CHAR_LF = 0x000A; ///< NL Line feed, new line.
31 constexpr unsigned int CHAR_VT = 0x000B; ///< Vertical tab.
32 constexpr unsigned int CHAR_FF = 0x000C; ///< NP Form feed, new page.
33 constexpr unsigned int CHAR_CR = 0x000D; ///< Carriage return, new line.
34 constexpr unsigned int CHAR_NEL = 0x0085; ///< Next line.
35 constexpr unsigned int CHAR_LS = 0x2028; ///< Line separator.
36 constexpr unsigned int CHAR_PS = 0x2029; ///< Paragraph separator.
// Joiner controls, directional marks and the thin space.
38 constexpr unsigned int CHAR_ZWNJ = 0x200C; ///< Zero width non joiner.
39 constexpr unsigned int CHAR_ZWJ = 0x200D; ///< Zero width joiner.
40 constexpr unsigned int CHAR_LTRM = 0x200E; ///< Left to Right Mark.
41 constexpr unsigned int CHAR_RTLM = 0x200F; ///< Right to Left Mark.
42 constexpr unsigned int CHAR_TS = 0x2009; ///< Thin Space.
44 // Latin script: It contains punctuation characters and symbols which are not part of the latin script. https://en.wikipedia.org/wiki/Latin_script_in_Unicode
45 // 0x0000 - 0x007f C0 Controls and Basic Latin
47 // ASCII digits (not part of LATIN script):
50 // ASCII punctuation and symbols (not part of LATIN script):
56 // Controls (not part of LATIN script):
59 // 0x0080 - 0x00ff C1 Controls and Latin-1 Supplement
61 // Controls (not part of LATIN script):
64 // Punctuations and symbols (not part of LATIN script):
67 // Mathematical operators (not part of LATIN script):
71 // 0x0100 - 0x017f Latin Extended-A
72 // 0x0180 - 0x024f Latin Extended-B
73 // 0x0250 - 0x02af IPA Extensions
74 // 0x02b0 - 0x02ff Spacing Modifier Letters
76 // Punctuation (not part of LATIN script):
79 // 0x1d00 - 0x1d7f Phonetic Extensions
81 // Uralic Phonetic (not part of LATIN script):
84 // Subscripts and superscripts
89 // 0x1d80 - 0x1dbf Phonetic Extensions Supplement
91 // 0x1DBF (subscript or superscript. Not part of LATIN script )
93 // 0x1e00 - 0x1eff Latin Extended Additional
94 // 0x2070 - 0x209f Superscripts and Subscripts
96 // 0x2070 (not part of LATIN script)
97 // 0x2074 - 0x207E (not part of LATIN script)
99 // 0x2100 - 0x214f Letterlike symbols (not part of LATIN script)
101 // 0x212A - 0x212B (are part of LATIN script)
102 // 0x2132 (are part of LATIN script)
103 // 0x214E (are part of LATIN script)
105 // 0x2150 - 0x2189 Number Forms
107 // 0x2150 - 0x215F Fractions (not part of LATIN script)
108 // 0x2189 Fractions (not part of LATIN script)
110 // 0x2c60 - 0x2c7f Latin Extended-C
111 // 0xa720 - 0xa7ff Latin Extended-D
113 // 0xA720 - 0xA721 Uralic Phonetic (not part of LATIN script)
114 // 0xA788 (not part of LATIN script)
115 // 0xA789 - 0xA78A Budu (not part of LATIN script)
117 // 0xab30 - 0xab6f Latin Extended-E
119 // 0xfb00 - 0xfb06 Latin Alphabetic Presentation Forms
120 // 0xff00 - 0xffef Halfwidth and Fullwidth Forms
122 // 0xFF00 - 0xFF20 HWFW Symbols (not part of LATIN script)
123 // 0xFF3B - 0xFF40 HWFW Symbols (not part of LATIN script)
124 // 0xFF5B - 0xFFEF HWFW Symbols (not part of LATIN script)
127 // 0x0900 - 0x097f Devanagari
128 // 0x0980 - 0x09ff Bengali
129 // 0x0a00 - 0x0a7f Gurmukhi
130 // 0x0a80 - 0x0aff Gujarati
131 // 0x0b00 - 0x0b7f Oriya
132 // 0x0b80 - 0x0bff Tamil
133 // 0x0c00 - 0x0c7f Telugu
134 // 0x0c80 - 0x0cff Kannada
135 // 0x0d00 - 0x0d7f Malayalam
138 // 0x0d80 - 0x0dff Sinhala
141 // 0x0600 - 0x06ff Arabic
142 // 0x0750 - 0x077f Arabic Supplement
143 // 0x08A0 - 0x08ff Arabic Extended-A
144 // 0xfb50 - 0xfdff Arabic Presentation Forms-A
145 // 0xfe70 - 0xfeff Arabic Presentation Forms-B
146 // 0x1ee00 - 0x1eeff Arabic Mathematical Alphabetic Symbols
148 // CJK (Chinese, Japanese and Korean) and Vietnamese script.
149 // 0x2e80 - 0x2eff CJK Radicals Supplement
150 // 0x2f00 - 0x2fdf Kangxi Radicals
151 // 0x3000 - 0x303f CJK Symbols and Punctuation
152 // 0x3200 - 0x32ff Enclosed CJK Letters and Months
153 // 0x3400 - 0x4dbf CJK Unified Ideographs Extension A
154 // 0x4e00 - 0x62ff CJK Unified Ideographs
155 // 0x6300 - 0x77ff CJK Unified Ideographs
156 // 0x7800 - 0x8cff CJK Unified Ideographs
157 // 0x8d00 - 0x9fff CJK Unified Ideographs
158 // 0x20000 - 0x215ff CJK Unified Ideographs Extension B
159 // 0x21600 - 0x230ff CJK Unified Ideographs Extension B
160 // 0x23100 - 0x245ff CJK Unified Ideographs Extension B
161 // 0x24600 - 0x260ff CJK Unified Ideographs Extension B
162 // 0x26100 - 0x275ff CJK Unified Ideographs Extension B
163 // 0x27600 - 0x290ff CJK Unified Ideographs Extension B
164 // 0x29100 - 0x2a6df CJK Unified Ideographs Extension B
165 // 0x2a700 - 0x2b73f CJK Unified Ideographs Extension C
166 // 0x2b740 - 0x2b81f CJK Unified Ideographs Extension D
169 // 0x3040 - 0x309f Hiragana
170 // 0x30a0 - 0x30ff Katakana
173 // 0x1100 - 0x11ff Hangul jamo
174 // 0x3130 - 0x318f Hangul Compatibility Jamo
175 // 0xa960 - 0xa97f Hangul Jamo Extended-A
176 // 0xac00 - 0xd7af Hangul Syllables
177 // 0xd7b0 - 0xd7ff Hangul Jamo Extended-B
180 // 0x3100 - 0x312f Bopomofo
181 // 0x31a0 - 0x31bf Bopomofo Extended
184 // 0x1780 - 0x17ff Khmer
185 // 0x19e0 - 0x19ff Khmer Symbols
188 // 0x0e80 - 0x0eff Lao
191 // 0x0e00 - 0x0e7f Thai
194 // 0x1000 - 0x109f Myanmar
197 // 0x0591 - 0x05f4 Hebrew
198 // 0xfb1d - 0xfb4f Hebrew subset of Alphabetic Presentation Forms
201 // 0x0400 - 0x04ff Cyrillic
202 // 0x0500 - 0x052f Cyrillic supplement
203 // 0x2de0 - 0x2dff Cyrillic Extended-A
204 // 0xa640 - 0xa69f Cyrillic Extended-B
207 // 0x10a0 - 0x10ff Georgian
208 // 0x2d00 - 0x2d2f Georgian supplement
211 // 0x0370 - 0x03ff Greek & Coptic
212 // 0x1f00 - 0x1fff Greek Extended
215 // 0x0530 - 0x058f Armenian
216 // 0xfb13 - 0xfb17 Armenian subset of Alphabetic Presentation Forms
219 // 0xa980 - 0xa9fd Javanese
222 // 0x1b80 - 0x1bbf Sundanese
223 // 0x1cc0 - 0x1ccf Sundanese supplement
225 // Ge'ez script (Ethiopic)
226 // 0x1200 - 0x137f Ethiopic
227 // 0x1380 - 0x139f Ethiopic supplement
228 // 0x2d80 - 0x2ddf Ethiopic Extended
229 // 0xab00 - 0xab2f Ethiopic Extended-A
232 // 0x1700 - 0x171f Baybayin
235 // 0x1c50 - 0x1c7f Ol Chiki
238 // 0xabc0 - 0xabff Meetei Mayek
239 // 0xaae0 - 0xaaff Meetei Mayek Extensions
241 // The Emoji which map to standardized Unicode characters
242 // 1. Emoticons ( 1F601 - 1F64F )
243 // 2. Dingbats ( 2700 - 27BF )
244 // 3. Transport and map symbols ( 1F680 - 1F6C0 )
245 // 4. Enclosed characters ( 24C2 - 1F251 )
246 // 5. Uncategorized :-S
247 // 6. Additional Emoticons ( 1F600 - 1F636 )
248 // 6b. Additional transport and map symbols ( 1F680 - 1F6FF ): http://unicode.org/charts/PDF/U1F680.pdf
249 // 6c. Other additional symbols ( 1F30D - 1F567 )
250 // 7. Supplemental Symbols and Pictographs ( 1F900–1F9FF ): http://unicode.org/charts/PDF/U1F900.pdf
252 // Symbols. Work around for these symbols.
265 /// character <= 0x077f
/// @brief Classifies a character whose code point is at most 0x077f (end of Arabic Supplement).
/// @param character The character to classify.
/// The local `script` starts as UNKNOWN and the ranges are tested in ascending order.
/// NOTE(review): this listing omits some lines of the body (including several branch bodies),
/// so the comments below only name the range each visible condition tests.
266 inline Script GetScriptTillArabicSupplement(Character character)
268 Script script = UNKNOWN;
// 0x0030 - 0x0039: ASCII digits.
270 if((0x0030 <= character) && (character <= 0x0039))
272 script = ASCII_DIGITS;
// Everything else up to 0x007E. The four sub-ranges tested next are the
// ASCII punctuation and symbol runs on either side of the digits and letters.
274 else if(character <= 0x007E)
276 if((0x0020 <= character) && (character <= 0x002F))
280 else if((0x003A <= character) && (character <= 0x0040))
284 else if((0x005B <= character) && (character <= 0x0060))
288 else if((0x007B <= character) && (character <= 0x007E))
297 else if((0x007F <= character) && (character <= 0x009F))
299 // 0x007F actually belongs to the C0 Controls and Basic Latin block. However, it is the last character of that block
300 // and the characters of the next block follow it consecutively, so they are tested as a single range.
301 script = C1_CONTROLS;
// 0x00A0 - 0x00BF, inside C1 Controls and Latin-1 Supplement; two code points in it are emoji.
303 else if((0x00A0 <= character) && (character <= 0x00BF))
305 if(character == 0x00A9)
307 script = EMOJI; // 5. Uncategorized: copyright sign
309 else if(character == 0x00AE)
311 script = EMOJI; // 5. Uncategorized: registered sign
// 0x00D7 and 0x00F7 are tested individually, ahead of the 0x00C0 - 0x02ff range that contains them.
318 else if(character == 0x00D7)
322 else if(character == 0x00F7)
// 0x00C0 - 0x02ff, with 0x02B9 - 0x02BF (inside Spacing Modifier Letters) singled out.
326 else if((0x00C0 <= character) && (character <= 0x02ff))
328 if((0x02B9 <= character) && (character <= 0x02BF))
// 0x0370 - 0x03ff: Greek & Coptic.
337 else if((0x0370 <= character) && (character <= 0x03ff))
// 0x0400 - 0x04ff: Cyrillic.
341 else if((0x0400 <= character) && (character <= 0x04ff))
// 0x0500 - 0x052f: Cyrillic supplement.
345 else if((0x0500 <= character) && (character <= 0x052f))
// 0x0530 - 0x058f: Armenian.
349 else if((0x0530 <= character) && (character <= 0x058f))
// 0x0591 - 0x05f4: Hebrew.
353 else if((0x0591 <= character) && (character <= 0x05f4))
// 0x0600 - 0x06ff: Arabic.
357 else if((0x0600 <= character) && (character <= 0x06ff))
// 0x0750 - 0x077f: Arabic Supplement.
361 else if((0x0750 <= character) && (character <= 0x077f))
369 /// character <= 0x09ff
/// @brief Classifies a character whose code point is at most 0x09ff (end of Bengali).
/// @param character The character to classify.
/// Characters up to 0x077f are delegated to GetScriptTillArabicSupplement(); the remaining
/// ranges are tested here.
/// NOTE(review): the branch bodies of the range tests are not visible in this listing.
370 inline Script GetScriptTillBengali(Character character)
372 Script script = UNKNOWN;
374 if(character <= 0x077f)
376 script = GetScriptTillArabicSupplement(character);
// 0x08A0 - 0x08ff: Arabic Extended-A.
380 if((0x08A0 <= character) && (character <= 0x08ff))
// 0x0900 - 0x097f: Devanagari.
384 else if((0x0900 <= character) && (character <= 0x097f))
// 0x0980 - 0x09ff: Bengali.
388 else if((0x0980 <= character) && (character <= 0x09ff))
397 /// 0x09ff < character <= 0x0cff
/// @brief Classifies a character in the range 0x09ff < character <= 0x0cff.
/// @param character The character to classify.
/// The range is split at 0x0b7f so that at most three range tests run for any input.
/// NOTE(review): the branch bodies of the range tests are not visible in this listing.
398 inline Script GetScriptBetweenBengaliAndKannada(Character character)
400 Script script = UNKNOWN;
402 if(character <= 0x0b7f)
// 0x0a00 - 0x0a7f: Gurmukhi.
404 if((0x0a00 <= character) && (character <= 0x0a7f))
// 0x0a80 - 0x0aff: Gujarati.
408 else if((0x0a80 <= character) && (character <= 0x0aff))
// 0x0b00 - 0x0b7f: Oriya.
412 else if((0x0b00 <= character) && (character <= 0x0b7f))
// 0x0b80 - 0x0bff: Tamil.
419 if((0x0b80 <= character) && (character <= 0x0bff))
// 0x0c00 - 0x0c7f: Telugu.
423 else if((0x0c00 <= character) && (character <= 0x0c7f))
// 0x0c80 - 0x0cff: Kannada.
427 else if((0x0c80 <= character) && (character <= 0x0cff))
436 /// 0x0cff < character <= 0x1eff
/// @brief Classifies a character in the range 0x0cff < character <= 0x1eff.
/// @param character The character to classify.
/// The local `script` starts as UNKNOWN and the ranges are tested in ascending order.
/// NOTE(review): most branch bodies are not visible in this listing; only the PHONETIC_SS
/// assignments can be read directly.
437 inline Script GetScriptBetweenKannadaAndLatinExtendedAdditional(Character character)
439 Script script = UNKNOWN;
// 0x0d00 - 0x0d7f: Malayalam.
441 if((0x0d00 <= character) && (character <= 0x0d7f))
// 0x0d80 - 0x0dff: Sinhala.
445 else if((0x0d80 <= character) && (character <= 0x0dff))
// 0x0e00 - 0x0e7f: Thai.
449 else if((0x0e00 <= character) && (character <= 0x0e7f))
// 0x0e80 - 0x0eff: Lao.
453 else if((0x0e80 <= character) && (character <= 0x0eff))
// 0x1000 - 0x109f: Myanmar.
457 else if((0x1000 <= character) && (character <= 0x109f))
// 0x10a0 - 0x10ff: Georgian.
461 else if((0x10a0 <= character) && (character <= 0x10ff))
// 0x1100 - 0x11ff: Hangul jamo.
465 else if((0x1100 <= character) && (character <= 0x11ff))
// 0x1200 - 0x137f: Ethiopic.
469 else if((0x1200 <= character) && (character <= 0x137f))
// 0x1380 - 0x139f: Ethiopic supplement.
473 else if((0x1380 <= character) && (character <= 0x139f))
// 0x1700 - 0x171f: Baybayin.
477 else if((0x1700 <= character) && (character <= 0x171f))
// 0x1780 - 0x17ff: Khmer.
481 else if((0x1780 <= character) && (character <= 0x17ff))
// 0x19e0 - 0x19ff: Khmer Symbols.
485 else if((0x19e0 <= character) && (character <= 0x19ff))
// 0x1b80 - 0x1bbf: Sundanese.
489 else if((0x1b80 <= character) && (character <= 0x1bbf))
// 0x1c50 - 0x1c7f: Ol Chiki.
493 else if((0x1c50 <= character) && (character <= 0x1c7f))
// 0x1cc0 - 0x1ccf: Sundanese supplement.
497 else if((0x1cc0 <= character) && (character <= 0x1ccf))
// 0x1d00 - 0x1eff: Phonetic Extensions, Phonetic Extensions Supplement and Latin Extended Additional.
// The sub-ranges tested inside are the code points of these blocks that are singled out.
501 else if((0x1d00 <= character) && (character <= 0x1eff))
503 if((0x1D26 <= character) && (character <= 0x1D2B))
// Subscripts and superscripts inside the phonetic blocks.
507 else if((0x1D5D <= character) && (character <= 0x1D61))
509 script = PHONETIC_SS;
511 else if((0x1D66 <= character) && (character <= 0x1D6A))
513 script = PHONETIC_SS;
515 else if(character == 0x1D78)
517 script = PHONETIC_SS;
519 else if(character == 0x1DBF)
521 script = PHONETIC_SS;
532 /// 0x1eff < character <= 0x2c7f
/// @brief Classifies a character in the range 0x1eff < character <= 0x2c7f.
/// @param character The character to classify.
/// Individual emoji code points are tested before the wider ranges that contain them, so the
/// order of the conditions matters.
/// NOTE(review): many branch bodies are not visible in this listing; only the assignments
/// shown below can be read directly.
533 inline Script GetScriptBetweenLatinExtendedAdditionalAndLatinExtendedC(Character character)
535 Script script = UNKNOWN;
// 0x1f00 - 0x1fff: Greek Extended.
537 if((0x1f00 <= character) && (character <= 0x1fff))
541 else if(character == 0x203c)
543 script = EMOJI; // 5. Uncategorized: double exclamation mark
545 else if(character == 0x2049)
547 script = EMOJI; // 5. Uncategorized: exclamation question mark
// 0x2070 - 0x209f: Superscripts and Subscripts, with 0x2070 and 0x2074 - 0x207E singled out.
549 else if((0x2070 <= character) && (character <= 0x209f))
551 if(character == 0x2070)
555 else if((0x2074 <= character) && (character <= 0x207E))
564 else if(character == 0x20e3)
566 script = EMOJI; // 5. Uncategorized: combining enclosing keycap
// 0x2122 and 0x2139 lie inside the 0x2100 - 0x2189 range below and are tested before it.
568 else if(character == 0x2122)
570 script = EMOJI; // 5. Uncategorized: trade mark sign
572 else if(character == 0x2139)
574 script = EMOJI; // 5. Uncategorized: information source
// 0x2100 - 0x2189: Letterlike symbols (0x2100 - 0x214f) followed by Number Forms.
576 else if((0x2100 <= character) && (character <= 0x2189))
578 if((0x2100 <= character) && (character <= 0x214f))
// 0x212A - 0x212B, 0x2132 and 0x214E are tested separately from the rest of the Letterlike symbols.
580 if((0x212A <= character) && (character <= 0x212B))
584 else if(character == 0x2132)
588 else if(character == 0x214E)
594 script = LETTER_LIKE;
// Fractions in Number Forms.
597 else if((0x2150 <= character) && (character <= 0x215F))
599 script = FRACTIONS_NF;
601 else if(character == 0x2189)
603 script = FRACTIONS_NF;
// Individual symbol code points, tested before the wide 0x2194 - 0x2B55 range that contains them.
611 else if((0x25cb == character) ||
612 (0x25cf == character) ||
613 (0x25a1 == character))
617 else if(0x25a0 == character)
621 else if((0x2664 == character) ||
622 (0x2661 == character) ||
623 (0x2662 == character) ||
624 (0x2667 == character))
628 else if((0x2606 == character) ||
629 (0x25aa == character))
633 else if(0x262a == character)
637 // U+2194 5. Uncategorized: left right arrow
638 // U+2B55 5. Uncategorized: heavy large circle
639 else if((0x2194 <= character) && (character <= 0x2B55))
// 0x2c60 - 0x2c7f: Latin Extended-C.
643 else if((0x2c60 <= character) && (character <= 0x2c7f))
651 /// 0x0cff < character <= 0x2c7f
/// @brief Classifies a character in the range 0x0cff < character <= 0x2c7f.
/// @param character The character to classify.
/// Dispatches on 0x1eff (end of Latin Extended Additional): characters at or below it go to
/// GetScriptBetweenKannadaAndLatinExtendedAdditional().
/// NOTE(review): the `else` that presumably guards the second call is not visible in this listing.
652 inline Script GetScriptBetweenKannadaAndLatinExtendedC(Character character)
654 Script script = UNKNOWN;
656 if(character <= 0x1eff)
658 script = GetScriptBetweenKannadaAndLatinExtendedAdditional(character);
662 script = GetScriptBetweenLatinExtendedAdditionalAndLatinExtendedC(character);
668 /// 0x2c7f < character <= 0xa7ff
/// @brief Classifies a character in the range 0x2c7f < character <= 0xa7ff.
/// @param character The character to classify.
/// The local `script` starts as UNKNOWN and the ranges are tested in ascending order.
/// NOTE(review): most branch bodies are not visible in this listing; only the NON_LATIN_LED
/// assignments can be read directly.
669 inline Script GetScriptBetweenLatinExtendedCAndLatinExtendedD(Character character)
671 Script script = UNKNOWN;
// 0x2d00 - 0x2d2f: Georgian supplement.
673 if((0x2d00 <= character) && (character <= 0x2d2f))
// 0x2d80 - 0x2ddf: Ethiopic Extended.
677 else if((0x2d80 <= character) && (character <= 0x2ddf))
// 0x2de0 - 0x2dff: Cyrillic Extended-A.
681 else if((0x2de0 <= character) && (character <= 0x2dff))
// 0x2e80 - 0x2eff: CJK Radicals Supplement.
685 else if((0x2e80 <= character) && (character <= 0x2eff))
// 0x2f00 - 0x2fdf: Kangxi Radicals.
689 else if((0x2f00 <= character) && (character <= 0x2fdf))
// 0x3000 - 0x303f: CJK Symbols and Punctuation.
693 else if((0x3000 <= character) && (character <= 0x303f))
// 0x3040 - 0x309f: Hiragana.
697 else if((0x3040 <= character) && (character <= 0x309f))
// 0x30a0 - 0x30ff: Katakana.
701 else if((0x30a0 <= character) && (character <= 0x30ff))
// 0x3100 - 0x312f: Bopomofo.
705 else if((0x3100 <= character) && (character <= 0x312f))
// 0x3130 - 0x318f: Hangul Compatibility Jamo.
709 else if((0x3130 <= character) && (character <= 0x318f))
// 0x31a0 - 0x31bf: Bopomofo Extended.
713 else if((0x31a0 <= character) && (character <= 0x31bf))
// 0x3200 - 0x32ff: Enclosed CJK Letters and Months.
717 else if((0x3200 <= character) && (character <= 0x32ff))
// 0x3400 - 0x4dbf: CJK Unified Ideographs Extension A.
721 else if((0x3400 <= character) && (character <= 0x4dbf))
// 0x4e00 - 0x9fff: CJK Unified Ideographs, tested as four consecutive ranges.
725 else if((0x4e00 <= character) && (character <= 0x62ff))
729 else if((0x6300 <= character) && (character <= 0x77ff))
733 else if((0x7800 <= character) && (character <= 0x8cff))
737 else if((0x8d00 <= character) && (character <= 0x9fff))
// 0xa640 - 0xa69f: Cyrillic Extended-B.
741 else if((0xa640 <= character) && (character <= 0xa69f))
// 0xa720 - 0xa7ff: Latin Extended-D; the code points tested inside are singled out from the block.
745 else if((0xa720 <= character) && (character <= 0xa7ff))
747 if(character == 0xA720)
751 else if(character == 0xA721)
755 else if(character == 0xA788)
757 script = NON_LATIN_LED;
759 else if(character == 0xA789)
761 script = NON_LATIN_LED;
763 else if(character == 0xA78A)
765 script = NON_LATIN_LED;
776 /// 0x2c7f < character <= 0xfdff
/// @brief Classifies a character in the range 0x2c7f < character <= 0xfdff.
/// @param character The character to classify.
/// `script` is first initialised with the result of GetScriptBetweenLatinExtendedCAndLatinExtendedD();
/// the ranges above 0xa7ff are then tested here.
/// NOTE(review): the branch bodies of the range tests are not visible in this listing.
777 inline Script GetScriptBetweenLatinExtendedCAndArabicPresentationFormsA(Character character)
779 Script script = GetScriptBetweenLatinExtendedCAndLatinExtendedD(character);
// 0xa960 - 0xa97f: Hangul Jamo Extended-A.
781 if((0xa960 <= character) && (character <= 0xa97f))
// 0xa980 - 0xa9fd: Javanese.
785 else if((0xa980 <= character) && (character <= 0xa9fd))
// 0xab00 - 0xab2f: Ethiopic Extended-A.
789 else if((0xab00 <= character) && (character <= 0xab2f))
// 0xab30 - 0xab6f: Latin Extended-E.
793 else if((0xab30 <= character) && (character <= 0xab6f))
// 0xaae0 - 0xaaff: Meetei Mayek Extensions (tested out of ascending order; the ranges do not overlap).
797 else if((0xaae0 <= character) && (character <= 0xaaff))
// 0xabc0 - 0xabff: Meetei Mayek.
801 else if((0xabc0 <= character) && (character <= 0xabff))
// 0xac00 - 0xd7af: Hangul Syllables.
805 else if((0xac00 <= character) && (character <= 0xd7af))
// 0xd7b0 - 0xd7ff: Hangul Jamo Extended-B.
809 else if((0xd7b0 <= character) && (character <= 0xd7ff))
// 0xfb00 - 0xfb06: Latin Alphabetic Presentation Forms.
813 else if((0xfb00 <= character) && (character <= 0xfb06))
// 0xfb13 - 0xfb17: Armenian subset of Alphabetic Presentation Forms.
817 else if((0xfb13 <= character) && (character <= 0xfb17))
// 0xfb1d - 0xfb4f: Hebrew subset of Alphabetic Presentation Forms.
821 else if((0xfb1d <= character) && (character <= 0xfb4f))
// 0xfb50 - 0xfdff: Arabic Presentation Forms-A.
825 else if((0xfb50 <= character) && (character <= 0xfdff))
833 /// character > 0xfdff
/// @brief Classifies a character whose code point is above 0xfdff.
/// @param character The character to classify.
/// The local `script` starts as UNKNOWN and the ranges are tested in ascending order.
/// NOTE(review): the branch bodies of the range tests are not visible in this listing.
834 inline Script GetScriptAboveArabicPresentationFormsA(Character character)
836 Script script = UNKNOWN;
// 0xfe70 - 0xfeff: Arabic Presentation Forms-B.
838 if((0xfe70 <= character) && (character <= 0xfeff))
// 0xff00 - 0xffef: Halfwidth and Fullwidth Forms; the three sub-ranges tested inside are its symbol runs.
842 else if((0xff00 <= character) && (character <= 0xffef))
844 if((0xFF00 <= character) && (character <= 0xFF20))
848 else if((0xFF3B <= character) && (character <= 0xFF40))
852 else if((0xFF5B <= character) && (character <= 0xFFEF))
// 0x1ee00 - 0x1eeff: Arabic Mathematical Alphabetic Symbols.
861 else if((0x1ee00 <= character) && (character <= 0x1eeff))
865 // U+1f170 4. Enclosed characters: negative squared latin capital letter A
866 // U+1f6ff 6b. Additional transport and map symbols
867 // Exclude U+1f170 ~ U+1f189. They are SYMBOLS_NSLCL (negative squared latin capital letter)
868 else if((0x1f170 <= character) && (character <= 0x1f6ff))
872 // 7. Supplemental Symbols and Pictographs
873 else if((0x1f900 <= character) && (character <= 0x1f9ff))
// 0x20000 - 0x2a6df: CJK Unified Ideographs Extension B, tested as seven consecutive ranges.
877 else if((0x20000 <= character) && (character <= 0x215ff))
881 else if((0x21600 <= character) && (character <= 0x230ff))
885 else if((0x23100 <= character) && (character <= 0x245ff))
889 else if((0x24600 <= character) && (character <= 0x260ff))
893 else if((0x26100 <= character) && (character <= 0x275ff))
897 else if((0x27600 <= character) && (character <= 0x290ff))
901 else if((0x29100 <= character) && (character <= 0x2a6df))
// 0x2a700 - 0x2b73f: CJK Unified Ideographs Extension C.
905 else if((0x2a700 <= character) && (character <= 0x2b73f))
// 0x2b740 - 0x2b81f: CJK Unified Ideographs Extension D.
909 else if((0x2b740 <= character) && (character <= 0x2b81f))
917 /// character > 0x2c7f
/// @brief Classifies a character whose code point is above 0x2c7f.
/// @param character The character to classify.
/// Dispatches on 0xfdff (end of Arabic Presentation Forms-A): characters at or below it go to
/// GetScriptBetweenLatinExtendedCAndArabicPresentationFormsA().
/// NOTE(review): the `else` that presumably guards the second call is not visible in this listing.
918 inline Script GetScriptAboveLatinExtendedC(Character character)
920 Script script = UNKNOWN;
922 if(character <= 0xfdff)
924 script = GetScriptBetweenLatinExtendedCAndArabicPresentationFormsA(character);
928 script = GetScriptAboveArabicPresentationFormsA(character);
/// @brief Tells whether the given script is one of the right-to-left scripts.
/// @param script The script to test.
/// NOTE(review): only the ARABIC comparison is visible; the remaining operands of the `||`
/// chain are missing from this listing.
936 bool IsRightToLeftScript(Script script)
938 return ((ARABIC == script) ||
/// @brief Determines the script of a character.
/// @param character The character to classify.
/// The selector, emoji and common-script predicates are tested first, in the order shown. Only
/// when none of them matches is the character dispatched by code point to one of the range helpers.
/// NOTE(review): several branch bodies and the `else` lines of the dispatch are not visible in
/// this listing; only the assignments shown below can be read directly.
942 Script GetCharacterScript(Character character)
944 Script script = UNKNOWN;
946 if(IsTextPresentationSelector(character))
950 else if(IsEmojiPresentationSelector(character))
952 script = EMOJI_COLOR;
954 else if(IsEmojiItem(character))
// Tested before the range helpers, whose 0x1f170 - 0x1f6ff range would otherwise contain these letters.
958 else if(IsNegativeSquaredLatinCapitalLetter(character))
960 script = SYMBOLS_NSLCL;
962 else if(IsCommonScript(character))
// At or below 0x0cff (end of Kannada): split again at 0x09ff (end of Bengali).
966 else if(character <= 0x0cff)
968 if(character <= 0x09ff)
970 script = GetScriptTillBengali(character);
974 script = GetScriptBetweenBengaliAndKannada(character);
// Above 0x0cff: split at 0x2c7f (end of Latin Extended-C).
979 if(character <= 0x2c7f)
981 script = GetScriptBetweenKannadaAndLatinExtendedC(character);
985 script = GetScriptAboveLatinExtendedC(character);
/// @brief Tells whether the character is a white space.
/// @param character The character to test.
/// @return true when the character is strictly below WHITE_SPACE_THRESHOLD (0x21).
992 bool IsWhiteSpace(Character character)
994 return character < WHITE_SPACE_THRESHOLD;
/// @brief Tells whether the character is the space character.
/// @param character The character to test.
/// @return true when the character equals CHAR_SPACE (0x20).
997 bool IsSpace(Character character)
999 return CHAR_SPACE == character;
/// @brief Tells whether the character starts a new paragraph.
/// @param character The character to test.
/// @return true when the character is one of: line feed, vertical tab, form feed, carriage
/// return, next line, line separator or paragraph separator.
1002 bool IsNewParagraph(Character character)
1004 return ((CHAR_LF == character) ||
1005 (CHAR_VT == character) ||
1006 (CHAR_FF == character) ||
1007 (CHAR_CR == character) ||
1008 (CHAR_NEL == character) ||
1009 (CHAR_LS == character) ||
1010 (CHAR_PS == character));
/// @brief Tells whether the character is the zero width non joiner.
/// @param character The character to test.
/// @return true when the character equals CHAR_ZWNJ (0x200C).
1013 bool IsZeroWidthNonJoiner(Character character)
1015 return CHAR_ZWNJ == character;
/// @brief Tells whether the character is the zero width joiner.
/// @param character The character to test.
/// @return true when the character equals CHAR_ZWJ (0x200D).
1018 bool IsZeroWidthJoiner(Character character)
1020 return CHAR_ZWJ == character;
/// @brief Tells whether the character is the zero width space.
/// @param character The character to test.
/// @return true when the character equals CHAR_ZWS.
/// NOTE(review): CHAR_ZWS is not defined in the visible part of this file - presumably it is
/// declared in a header or in a line missing from this listing; verify.
1023 bool IsZeroWidthSpace(Character character)
1025 return CHAR_ZWS == character;
/// @brief Tells whether the character is the left to right mark.
/// @param character The character to test.
/// @return true when the character equals CHAR_LTRM (0x200E).
1028 bool IsLeftToRightMark(Character character)
1030 return CHAR_LTRM == character;
/// @brief Tells whether the character is the right to left mark.
/// @param character The character to test.
/// @return true when the character equals CHAR_RTLM (0x200F).
1033 bool IsRightToLeftMark(Character character)
1035 return CHAR_RTLM == character;
/// @brief Tells whether the character is the thin space.
/// @param character The character to test.
/// @return true when the character equals CHAR_TS (0x2009).
1038 bool IsThinSpace(Character character)
1040 return CHAR_TS == character;
/// @brief Tells whether the character is shared by all scripts rather than belonging to one of them.
/// @param character The character to test.
/// @return true when any of the predicates below matches: white space, zero width non joiner,
/// zero width joiner, zero width space, left to right mark, right to left mark, thin space or
/// new paragraph character.
1043 bool IsCommonScript(Character character)
1045 return (IsWhiteSpace(character) ||
1046 IsZeroWidthNonJoiner(character) ||
1047 IsZeroWidthJoiner(character) ||
1048 IsZeroWidthSpace(character) ||
1049 IsLeftToRightMark(character) ||
1050 IsRightToLeftMark(character) ||
1051 IsThinSpace(character) ||
1052 IsNewParagraph(character));
/// @brief Tells whether ligatures of the given script must be broken.
/// @param script The script to test.
/// @return true when the script is LATIN or ARABIC.
1055 bool HasLigatureMustBreak(Script script)
1057 return ((LATIN == script) ||
1058 (ARABIC == script));
/// @brief Retrieves the number of scripts.
/// @return SYMBOLS_NSLCL + 1.
/// NOTE(review): this presumably relies on SYMBOLS_NSLCL being the last enumerator of Script -
/// confirm against the enum declaration whenever a script is added.
1061 Length GetNumberOfScripts()
1063 return SYMBOLS_NSLCL + 1;
1066 } // namespace TextAbstraction