dali/devel-api/text-abstraction/script.h

   1 #ifndef DALI_TOOLKIT_TEXT_ABSTRACTION_SCRIPT_H
   2 #define DALI_TOOLKIT_TEXT_ABSTRACTION_SCRIPT_H
   3
   4 /*
   5  * Copyright (c) 2020 Samsung Electronics Co., Ltd.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  * you may not use this file except in compliance with the License.
   9  * You may obtain a copy of the License at
  10  *
  11  * http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  *
  19  */
  20
  21 // INTERNAL INCLUDES
  22 #include <dali/devel-api/text-abstraction/emoji-character-properties.h>
  23 #include <dali/devel-api/text-abstraction/text-abstraction-definitions.h>
  24 #include <dali/public-api/dali-adaptor-common.h>
  25 #include <sys/types.h>
  26
  27 namespace Dali
  28 {
  29 namespace TextAbstraction
  30 {
  31 /**
  32  * @brief Script is the writing system used by a language.
  33  * Typically one script can be used to write different languages although one language could be written in different scrips.
  34  */
  35 enum Script
  36 {
  37   COMMON, ///< Valid for all scripts. i.e white space or '\n'.
  38
  39   ASCII_DIGITS, ///< ASCII digits.
  40   ASCII_PS,     ///< ASCII punctuation and symbols.
  41
  42   C1_CONTROLS, ///< Controls of the C1 Controls and Latin-1 Supplement unicode block.
  43   C1_PS,       ///< Punctuation and symbols of the C1 Controls and Latin-1 Supplement unicode block.
  44   C1_MATH,     ///< Math symbols of the C1 Controls and Latin-1 Supplement unicode block.
  45
  46   SML_P,       ///< Punctuation symbols of the Spacing Modifier Letters unicode block.
  47   PHONETIC_U,  ///< Uralic phonetic symbols of the Phonetic Extensions unicode block.
  48   PHONETIC_SS, ///< Subscripts and superscripts of the Phonetic Extensions unicode block.
  49
  50   NUMERIC_SS, ///< Numeric subscripts and superscripts.
  51
  52   LETTER_LIKE,   ///< Symbols of the Letterlike unicode block.
  53   NUMBER_FORMS,  ///< Number Forms unicode block.
  54   FRACTIONS_NF,  ///< Numeric fraction symbols of the Number Forms unicode block.
  55   NON_LATIN_LED, ///< Non latin symbols within the Latin Extended D unicode block.
  56   HWFW_S,        ///< Non latin symbols within the Halfwidth and fullwidth unicode block.
  57
  58   CYRILLIC, ///< The Cyrillic script. Used by Russian, Bulgarian, Ukrainian, Macedonian, ...
  59   GREEK,    ///< The Greek script. Used by Greek.
  60   LATIN,    ///< The latin script. Used by many western languages and others around the world.
  61
  62   ARABIC, ///< The arabic script. Used by Arab and Urdu among others.
  63   HEBREW, ///< The Hebrew script. Used by the Hebrew, Yiddish, Ladino, and Judeo-Arabic.
  64
  65   ARMENIAN, ///< The Armenian script. Used by Armenian.
  66   GEORGIAN, ///< The Georgian script. Used by Georgian.
  67
  68   CJK,      ///< The CJK script. Used by Chinese, Japanese, Korean and Vietnamese(old writing system).
  69   HANGUL,   ///< The Hangul jamo script. Used by Korean.
  70   HIRAGANA, ///< The Hiragana script. Used by the Japanese.
  71   KATAKANA, ///< The Katakana script. Used by the Japanese.
  72   BOPOMOFO, ///< The Bopomofo script. Also called Zhuyin fuhao or Zhuyin. A phonetic notation used for the transcription of spoken Chinese.
  73
  74   BENGALI,    ///< The Bengali script. Used by Bangla, Assamese, Bishnupriya Manipuri, Daphla, Garo, Hallam, Khasi, Mizo, Munda, Naga, Rian, and Santali.
  75   BURMESE,    ///< The Burmese script. Used by the Burmese (Myanmar) language.
  76   DEVANAGARI, ///< The devanagari script. Used by Hindi, Marathi, Sindhi, Nepali and Sanskrit.
  77   GUJARATI,   ///< The Gujarati script. Used by Gujarati.
  78   GURMUKHI,   ///< The Gurmukhi script. Used by Punjabi.
  79   KANNADA,    ///< The Kannada script. Used by Kannada and Tulu.
  80   MALAYALAM,  ///< The Malayalam script. Used by Malayalam.
  81   ORIYA,      ///< The Oriya script. Used by Oriya (Odia), Khondi, and Santali.
  82   SINHALA,    ///< The Sinhala script. Used by Sinhala and Pali.
  83   TAMIL,      ///< The Tamil script. Used by Tamil, Badaga, and Saurashtra.
  84   TELUGU,     ///< The Telugu script. Used by Telugu, Gondi, and Lambadi.
  85
  86   LAO,       ///< The Lao script. Used by the Lao language.
  87   THAI,      ///< The Thai script. Used by the Thai language
  88   KHMER,     ///< The Khmer script. Used by the Khmer language.
  89   JAVANESE,  ///< The Javanese script. Used by the Javanese language.
  90   SUNDANESE, ///< The Sundanese script. Used by the Sundanese language.
  91
  92   GEEZ,     ///< The Ge'ez script. Used by the Amharic, Tigrinya and other languages in Ethiopia and Eritrea.
  93   OL_CHIKI, ///< The Ol Chiki script. Used by the Santali.
  94   BAYBAYIN, ///< The Baybayin script. Used by the Tagalog, Bikol languages, Ilocano, Pangasinan, Visayan and other languages in Philippines.
  95   MEITEI,   ///< The Meitei script used for the Meitei language in Manipur, India.
  96
  97   EMOJI, ///< The Emoji which map to standardized Unicode characters.
  98
  99   SYMBOLS1, ///< Some symbols.
 100   SYMBOLS2, ///< Some symbols.
 101   SYMBOLS3, ///< Some symbols.
 102   SYMBOLS4, ///< Some symbols.
 103   SYMBOLS5, ///< Some symbols.
 104
 105   UNKNOWN,      ///< The script is unknown.
 106   EMOJI_TEXT,   ///< The Emoji request a text presentation for an emoji character.
 107   EMOJI_COLOR,  ///< The Emoji request a color-emoji presentation for an emoji character.
 108   SYMBOLS_NSLCL ///< THe Negative Squared Latin Capital Letter
 109
 110   //Note: update ScriptName and GetNumberOfScripts when adding new script
 111 };
 112
 113 const char* const ScriptName[] =
 114   {
 115     "COMMON", ///< Valid for all scripts. i.e white space or '\n'.
 116
 117     "ASCII_DIGITS", ///< ASCII digits.
 118     "ASCII_PS",     ///< ASCII punctuation and symbols.
 119
 120     "C1_CONTROLS", ///< Controls of the C1 Controls and Latin-1 Supplement unicode block.
 121     "C1_PS",       ///< Punctuation and symbols of the C1 Controls and Latin-1 Supplement unicode block.
 122     "C1_MATH",     ///< Math symbols of the C1 Controls and Latin-1 Supplement unicode block.
 123
 124     "SML_P",       ///< Punctuation symbols of the Spacing Modifier Letters unicode block.
 125     "PHONETIC_U",  ///< Uralic phonetic symbols of the Phonetic Extensions unicode block.
 126     "PHONETIC_SS", ///< Subscripts and superscripts of the Phonetic Extensions unicode block.
 127
 128     "NUMERIC_SS", ///< Numeric subscripts and superscripts.
 129
 130     "LETTER_LIKE",   ///< Symbols of the Letterlike unicode block.
 131     "NUMBER_FORMS",  ///< Number Forms unicode block.
 132     "FRACTIONS_NF",  ///< Numeric fraction symbols of the Number Forms unicode block.
 133     "NON_LATIN_LED", ///< Non latin symbols within the Latin Extended D unicode block.
 134     "HWFW_S",        ///< Non latin symbols within the Halfwidth and fullwidth unicode block.
 135
 136     "CYRILLIC", ///< The Cyrillic script. Used by Russian, Bulgarian, Ukrainian, Macedonian, ...
 137     "GREEK",    ///< The Greek script. Used by Greek.
 138     "LATIN",    ///< The latin script. Used by many western languages and others around the world.
 139
 140     "ARABIC", ///< The arabic script. Used by Arab and Urdu among others.
 141     "HEBREW", ///< The Hebrew script. Used by the Hebrew, Yiddish, Ladino, and Judeo-Arabic.
 142
 143     "ARMENIAN", ///< The Armenian script. Used by Armenian.
 144     "GEORGIAN", ///< The Georgian script. Used by Georgian.
 145
 146     "CJK",      ///< The CJK script. Used by Chinese, Japanese, Korean and Vietnamese(old writing system).
 147     "HANGUL",   ///< The Hangul jamo script. Used by Korean.
 148     "HIRAGANA", ///< The Hiragana script. Used by the Japanese.
 149     "KATAKANA", ///< The Katakana script. Used by the Japanese.
 150     "BOPOMOFO", ///< The Bopomofo script. Also called Zhuyin fuhao or Zhuyin. A phonetic notation used for the transcription of spoken Chinese.
 151
 152     "BENGALI",    ///< The Bengali script. Used by Bangla, Assamese, Bishnupriya Manipuri, Daphla, Garo, Hallam, Khasi, Mizo, Munda, Naga, Rian, and Santali.
 153     "BURMESE",    ///< The Burmese script. Used by the Burmese (Myanmar) language.
 154     "DEVANAGARI", ///< The devanagari script. Used by Hindi, Marathi, Sindhi, Nepali and Sanskrit.
 155     "GUJARATI",   ///< The Gujarati script. Used by Gujarati.
 156     "GURMUKHI",   ///< The Gurmukhi script. Used by Punjabi.
 157     "KANNADA",    ///< The Kannada script. Used by Kannada and Tulu.
 158     "MALAYALAM",  ///< The Malayalam script. Used by Malayalam.
 159     "ORIYA",      ///< The Oriya script. Used by Oriya (Odia), Khondi, and Santali.
 160     "SINHALA",    ///< The Sinhala script. Used by Sinhala and Pali.
 161     "TAMIL",      ///< The Tamil script. Used by Tamil, Badaga, and Saurashtra.
 162     "TELUGU",     ///< The Telugu script. Used by Telugu, Gondi, and Lambadi.
 163
 164     "LAO",       ///< The Lao script. Used by the Lao language.
 165     "THAI",      ///< The Thai script. Used by the Thai language
 166     "KHMER",     ///< The Khmer script. Used by the Khmer language.
 167     "JAVANESE",  ///< The Javanese script. Used by the Javanese language.
 168     "SUNDANESE", ///< The Sundanese script. Used by the Sundanese language.
 169
 170     "GEEZ",     ///< The Ge'ez script also known as Ethiopic. Used by the Amharic, Tigrinya and other languages in Ethiopia and Eritrea.
 171     "OL_CHIKI", ///< The Ol Chiki script. Used by the Santali.
 172     "BAYBAYIN", ///< The Baybayin script. Used by the Tagalog, Bikol languages, Ilocano, Pangasinan, Visayan and other languages in Philippines.
 173     "MEITEI",   ///< The Meitei script used for the Meitei language in Manipur, India.
 174
 175     "EMOJI", ///< The Emoji which map to standardized Unicode characters.
 176
 177     "SYMBOLS1", ///< Some symbols.
 178     "SYMBOLS2", ///< Some symbols.
 179     "SYMBOLS3", ///< Some symbols.
 180     "SYMBOLS4", ///< Some symbols.
 181     "SYMBOLS5", ///< Some symbols.
 182
 183     "UNKNOWN",      ///< The script is unknown.
 184     "EMOJI_TEXT",   ///< The Emoji request a text presentation for an emoji character.
 185     "EMOJI_COLOR",  ///< The Emoji request a color-emoji presentation for an emoji character.
 186     "SYMBOLS_NSLCL" ///< THe Negative Squared Latin Capital Letter
 187 };
 188
 189 /**
 190  * @brief Checks whether the script is written from right to left.
 191  *
 192  * @param[in] script The script to check.
 193  *
 194  * @return @e true if the script is right to left, @e false otherwise.
 195  */
 196 DALI_ADAPTOR_API bool IsRightToLeftScript(Script script);
 197
 198 /**
 199  * @brief Retrieves a character's script.
 200  *
 201  * @param[in] character The character to query.
 202  *
 203  * @return The character's script.
 204  */
 205 DALI_ADAPTOR_API Script GetCharacterScript(Character character);
 206
 207 /**
 208  * @brief Checks whether the character is a white space.
 209  *
 210  * @param[in] character The character to check.
 211  *
 212  * @return @e true if the character is a white space, @e false otherwise.
 213  */
 214 DALI_ADAPTOR_API bool IsWhiteSpace(Character character);
 215
 216 /**
 217  * @brief Checks whether the character is a space.
 218  *
 219  * @param[in] character The character to check.
 220  *
 221  * @return @e true if the character is a space, @e false otherwise.
 222  */
 223 DALI_ADAPTOR_API bool IsSpace(Character character);
 224
 225 /**
 226  * @brief Checks whether the character is a new paragraph character.
 227  *
 228  * @param[in] character The character to check.
 229  *
 230  * @return @e true if the character is a new paragraph character, @e false otherwise.
 231  */
 232 DALI_ADAPTOR_API bool IsNewParagraph(Character character);
 233
 234 /**
 235  * @brief Checks whether the character is a zero width non joiner.
 236  *
 237  * @param[in] character The character to check.
 238  *
 239  * @return @e true if the character is a zero width non joiner, @e false otherwise.
 240  */
 241 DALI_ADAPTOR_API bool IsZeroWidthNonJoiner(Character character);
 242
 243 /**
 244  * @brief Checks whether the character is a zero width joiner.
 245  *
 246  * @param[in] character The character to check.
 247  *
 248  * @return @e true if the character is a zero width joiner, @e false otherwise.
 249  */
 250 DALI_ADAPTOR_API bool IsZeroWidthJoiner(Character character);
 251
 252 /**
 253  * @brief Checks whether the character is a zero width space.
 254  *
 255  * @param[in] character The character to check.
 256  *
 257  * @return @e true if the character is a zero width space, @e false otherwise.
 258  */
 259 DALI_ADAPTOR_API bool IsZeroWidthSpace(Character character);
 260
 261 /**
 262  * @brief Checks whether the character is a left to right mark.
 263  *
 264  * @param[in] character The character to check.
 265  *
 266  * @return @e true if the character is a left to right mark, @e false otherwise.
 267  */
 268 DALI_ADAPTOR_API bool IsLeftToRightMark(Character character);
 269
 270 /**
 271  * @brief Checks whether the character is a right to left mark.
 272  *
 273  * @param[in] character The character to check.
 274  *
 275  * @return @e true if the character is a right to left mark, @e false otherwise.
 276  */
 277 DALI_ADAPTOR_API bool IsRightToLeftMark(Character character);
 278
 279 /**
 280  * @brief Checks whether the character is a thin space.
 281  *
 282  * @param[in] character The character to check.
 283  *
 284  * @return @e true if the character is a thin space, @e false otherwise.
 285  */
 286 DALI_ADAPTOR_API bool IsThinSpace(Character character);
 287
 288 /**
 289  * @brief Checks whether the character is common within all scripts.
 290  *
 291  * @param[in] character The character to check.
 292  *
 293  * @return @e true if the character is common within all scripts, @e false otherwise.
 294  */
 295 DALI_ADAPTOR_API bool IsCommonScript(Character character);
 296
 297 /**
 298  * @brief Checks whether the script contains ligatures that must be 'broken' for selection or cursor position.
 299  *
 300  * e.g. the Latin script has the 'ff' or 'fi' ligatures that need to be broken to position the cursor
 301  * between the two characters. Equally, the Arabic script has the 'ﻻ' ligature that needs to be broken.
 302  *
 303  * @param[in] script The script to check.
 304  *
 305  * @return @e true if the script has ligatures that must be 'broken', @e false otherwise.
 306  */
 307 DALI_ADAPTOR_API bool HasLigatureMustBreak(Script script);
 308
 309 /**
 310  * @brief Retrieves the number of elements in the Script enum.
 311  * @note Must be updated, together with ScriptName, whenever a new script is added to the enum.
 312  * @return The number of scripts.
 313  */
 314 DALI_ADAPTOR_API Length GetNumberOfScripts();
 315
 316 } // namespace TextAbstraction
 317
 318 } // namespace Dali
 319
 320 #endif // DALI_TOOLKIT_TEXT_ABSTRACTION_SCRIPT_H