1 #ifndef DALI_TOOLKIT_TEXT_ABSTRACTION_SCRIPT_H
2 #define DALI_TOOLKIT_TEXT_ABSTRACTION_SCRIPT_H
5 * Copyright (c) 2020 Samsung Electronics Co., Ltd.
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
22 #include <dali/devel-api/text-abstraction/emoji-character-properties.h>
23 #include <dali/devel-api/text-abstraction/text-abstraction-definitions.h>
24 #include <dali/public-api/dali-adaptor-common.h>
25 #include <sys/types.h>
29 namespace TextAbstraction
/**
 * @brief Script is the writing system used by a language.
 *
 * Typically one script can be used to write different languages, although one language
 * could be written in different scripts.
 *
 * @note ScriptName lists the name of every enumerator in this same order. When a script is
 * added, ScriptName and GetNumberOfScripts must be updated as well.
 */
enum Script
{
  COMMON, ///< Valid for all scripts, e.g. white space or '\n'.

  ASCII_DIGITS, ///< ASCII digits.
  ASCII_PS,     ///< ASCII punctuation and symbols.

  C1_CONTROLS, ///< Controls of the C1 Controls and Latin-1 Supplement unicode block.
  C1_PS,       ///< Punctuation and symbols of the C1 Controls and Latin-1 Supplement unicode block.
  C1_MATH,     ///< Math symbols of the C1 Controls and Latin-1 Supplement unicode block.

  SML_P,       ///< Punctuation symbols of the Spacing Modifier Letters unicode block.
  PHONETIC_U,  ///< Uralic phonetic symbols of the Phonetic Extensions unicode block.
  PHONETIC_SS, ///< Subscripts and superscripts of the Phonetic Extensions unicode block.

  NUMERIC_SS, ///< Numeric subscripts and superscripts.

  LETTER_LIKE,   ///< Symbols of the Letterlike unicode block.
  NUMBER_FORMS,  ///< Number Forms unicode block.
  FRACTIONS_NF,  ///< Numeric fraction symbols of the Number Forms unicode block.
  NON_LATIN_LED, ///< Non latin symbols within the Latin Extended D unicode block.
  HWFW_S,        ///< Non latin symbols within the Halfwidth and fullwidth unicode block.

  CYRILLIC, ///< The Cyrillic script. Used by Russian, Bulgarian, Ukrainian, Macedonian, ...
  GREEK,    ///< The Greek script. Used by Greek.
  LATIN,    ///< The Latin script. Used by many western languages and others around the world.

  ARABIC, ///< The Arabic script. Used by Arabic and Urdu among others.
  HEBREW, ///< The Hebrew script. Used by the Hebrew, Yiddish, Ladino, and Judeo-Arabic.

  ARMENIAN, ///< The Armenian script. Used by Armenian.
  GEORGIAN, ///< The Georgian script. Used by Georgian.

  CJK,      ///< The CJK script. Used by Chinese, Japanese, Korean and Vietnamese (old writing system).
  HANGUL,   ///< The Hangul jamo script. Used by Korean.
  HIRAGANA, ///< The Hiragana script. Used by the Japanese.
  KATAKANA, ///< The Katakana script. Used by the Japanese.
  BOPOMOFO, ///< The Bopomofo script. Also called Zhuyin fuhao or Zhuyin. A phonetic notation used for the transcription of spoken Chinese.

  BENGALI,    ///< The Bengali script. Used by Bangla, Assamese, Bishnupriya Manipuri, Daphla, Garo, Hallam, Khasi, Mizo, Munda, Naga, Rian, and Santali.
  BURMESE,    ///< The Burmese script. Used by the Burmese (Myanmar) language.
  DEVANAGARI, ///< The Devanagari script. Used by Hindi, Marathi, Sindhi, Nepali and Sanskrit.
  GUJARATI,   ///< The Gujarati script. Used by Gujarati.
  GURMUKHI,   ///< The Gurmukhi script. Used by Punjabi.
  KANNADA,    ///< The Kannada script. Used by Kannada and Tulu.
  MALAYALAM,  ///< The Malayalam script. Used by Malayalam.
  ORIYA,      ///< The Oriya script. Used by Oriya (Odia), Khondi, and Santali.
  SINHALA,    ///< The Sinhala script. Used by Sinhala and Pali.
  TAMIL,      ///< The Tamil script. Used by Tamil, Badaga, and Saurashtra.
  TELUGU,     ///< The Telugu script. Used by Telugu, Gondi, and Lambadi.

  LAO,       ///< The Lao script. Used by the Lao language.
  THAI,      ///< The Thai script. Used by the Thai language.
  KHMER,     ///< The Khmer script. Used by the Khmer language.
  JAVANESE,  ///< The Javanese script. Used by the Javanese language.
  SUNDANESE, ///< The Sundanese script. Used by the Sundanese language.

  GEEZ,     ///< The Ge'ez script, also known as Ethiopic. Used by the Amharic, Tigrinya and other languages in Ethiopia and Eritrea.
  OL_CHIKI, ///< The Ol Chiki script. Used by the Santali.
  BAYBAYIN, ///< The Baybayin script. Used by the Tagalog, Bikol languages, Ilocano, Pangasinan, Visayan and other languages in Philippines.
  MEITEI,   ///< The Meitei script used for the Meitei language in Manipur, India.

  EMOJI, ///< The Emoji which map to standardized Unicode characters.

  SYMBOLS1, ///< Some symbols.
  SYMBOLS2, ///< Some symbols.
  SYMBOLS3, ///< Some symbols.
  SYMBOLS4, ///< Some symbols.
  SYMBOLS5, ///< Some symbols.

  UNKNOWN,      ///< The script is unknown.
  EMOJI_TEXT,   ///< The Emoji request a text presentation for an emoji character.
  EMOJI_COLOR,  ///< The Emoji request a color-emoji presentation for an emoji character.
  SYMBOLS_NSLCL ///< The Negative Squared Latin Capital Letter.
  //Note: update ScriptName and GetNumberOfScripts when adding new script
};

/**
 * @brief Names of the scripts as strings.
 *
 * There is one entry per enumerator of Script, listed in the same order as the enumerators
 * and spelled exactly like them. See Script for a description of each script.
 */
const char* const ScriptName[] =
{
  "COMMON",

  "ASCII_DIGITS",
  "ASCII_PS",

  "C1_CONTROLS",
  "C1_PS",
  "C1_MATH",

  "SML_P",
  "PHONETIC_U",
  "PHONETIC_SS",

  "NUMERIC_SS",

  "LETTER_LIKE",
  "NUMBER_FORMS",
  "FRACTIONS_NF",
  "NON_LATIN_LED",
  "HWFW_S",

  "CYRILLIC",
  "GREEK",
  "LATIN",

  "ARABIC",
  "HEBREW",

  "ARMENIAN",
  "GEORGIAN",

  "CJK",
  "HANGUL",
  "HIRAGANA",
  "KATAKANA",
  "BOPOMOFO",

  "BENGALI",
  "BURMESE",
  "DEVANAGARI",
  "GUJARATI",
  "GURMUKHI",
  "KANNADA",
  "MALAYALAM",
  "ORIYA",
  "SINHALA",
  "TAMIL",
  "TELUGU",

  "LAO",
  "THAI",
  "KHMER",
  "JAVANESE",
  "SUNDANESE",

  "GEEZ",
  "OL_CHIKI",
  "BAYBAYIN",
  "MEITEI",

  "EMOJI",

  "SYMBOLS1",
  "SYMBOLS2",
  "SYMBOLS3",
  "SYMBOLS4",
  "SYMBOLS5",

  "UNKNOWN",
  "EMOJI_TEXT",
  "EMOJI_COLOR",
  "SYMBOLS_NSLCL"
};

// Compile-time guard for the "keep both lists in step" rule: the build fails when a name is
// added to or removed from ScriptName without the matching change to Script (or the reverse).
// SYMBOLS_NSLCL is the last enumerator today; when a script is appended after it, use the new
// last enumerator here. An enumerator appended without touching ScriptName or this assert is
// NOT detected, so the note inside Script still applies.
static_assert(sizeof(ScriptName) / sizeof(ScriptName[0]) == SYMBOLS_NSLCL + 1u,
              "ScriptName must contain exactly one entry per Script enumerator");
190 * @brief Whether the script is a right to left script.
192 * @param[in] script The script.
194 * @return @e true if the script is right to left.
196 DALI_ADAPTOR_API bool IsRightToLeftScript(Script script);
199 * @brief Retrieves a character's script.
201 * @param[in] character The character.
203 * @return The chraracter's script.
205 DALI_ADAPTOR_API Script GetCharacterScript(Character character);
208 * @brief Whether the character is a white space.
210 * @param[in] character The character.
212 * @return @e true if the character is a white space.
214 DALI_ADAPTOR_API bool IsWhiteSpace(Character character);
217 * @brief Whether the character is a space.
219 * @param[in] character The character.
221 * @return @e true if the character is a space.
223 DALI_ADAPTOR_API bool IsSpace(Character character);
226 * @brief Whether the character is a new paragraph character.
228 * @param[in] character The character.
230 * @return @e true if the character is a new paragraph character.
232 DALI_ADAPTOR_API bool IsNewParagraph(Character character);
235 * @brief Whether the character is a zero width non joiner.
237 * @param[in] character The character.
239 * @return @e true if the character is a zero width non joiner.
241 DALI_ADAPTOR_API bool IsZeroWidthNonJoiner(Character character);
244 * @brief Whether the character is a zero width joiner.
246 * @param[in] character The character.
248 * @return @e true if the character is a zero width joiner.
250 DALI_ADAPTOR_API bool IsZeroWidthJoiner(Character character);
253 * @brief Whether the character is a zero width space.
255 * @param[in] character The character.
257 * @return @e true if the character is a zero width space.
259 DALI_ADAPTOR_API bool IsZeroWidthSpace(Character character);
262 * @brief Whether the character is a left to right mark.
264 * @param[in] character The character.
266 * @return @e true if the character is a left to right mark.
268 DALI_ADAPTOR_API bool IsLeftToRightMark(Character character);
271 * @brief Whether the character is a right to left mark.
273 * @param[in] character The character.
275 * @return @e true if the character is a right to left mark.
277 DALI_ADAPTOR_API bool IsRightToLeftMark(Character character);
280 * @brief Whether the character is a thin space.
282 * @param[in] character The character.
284 * @return @e true if the character is a thin space.
286 DALI_ADAPTOR_API bool IsThinSpace(Character character);
289 * @brief Whether the character is common within all scripts.
291 * @param[in] character The character.
293 * @return @e true if the character is common within all scripts.
295 DALI_ADAPTOR_API bool IsCommonScript(Character character);
298 * @brief Whether the script contains ligatures that must be 'broken' for selection or cursor position.
300 * i.e The latin script has the 'ff' or 'fi' ligatures that need to be broken to position the cursor
301 * between the two characters. Equally the arabic script has the 'ﻻ' ligature that needs to be broken.
303 * @param[in] script The script.
305 * @return @e true if the script has ligatures that must be 'broken'.
307 DALI_ADAPTOR_API bool HasLigatureMustBreak(Script script);
310 * @brief Get the number of elements in enum Script
312 * @return returns the number of Scripts
314 DALI_ADAPTOR_API Length GetNumberOfScripts();
316 } // namespace TextAbstraction
320 #endif // DALI_TOOLKIT_TEXT_ABSTRACTION_SCRIPT_H