Script GetCharacterScript( Character character )
{
- // Latin script:
+ // Latin script: the blocks listed below also contain punctuation characters and symbols which are not part of the Latin script. See https://en.wikipedia.org/wiki/Latin_script_in_Unicode
// 0x0000 - 0x007f C0 Controls and Basic Latin
+ //
+ // ASCII digits (not part of LATIN script):
+ // 0x0030 - 0x0039
+ //
+ // ASCII punctuation and symbols (not part of LATIN script):
+ // 0x0020 - 0x002F
+ // 0x003A - 0x0040
+ // 0x005B - 0x0060
+ // 0x007B - 0x007E
+ //
+ // Controls (not part of LATIN script):
+ // 0x007F
+ //
// 0x0080 - 0x00ff C1 Controls and Latin-1 Supplement
+ //
+ // Controls (not part of LATIN script):
+ // 0x0080 - 0x009F
+ //
+ // Punctuation and symbols (not part of LATIN script):
+ // 0x00A0 - 0x00BF
+ //
+ // Mathematical operators (not part of LATIN script):
+ // 0x00D7
+ // 0x00F7
+ //
// 0x0100 - 0x017f Latin Extended-A
// 0x0180 - 0x024f Latin Extended-B
// 0x0250 - 0x02af IPA Extensions
// 0x02b0 - 0x02ff Spacing Modifier Letters
+ //
+ // Punctuation (not part of LATIN script):
+ // 0x02B9 - 0x02BF
+ //
// 0x1d00 - 0x1d7f Phonetic Extensions
+ //
+ // Uralic Phonetic (not part of LATIN script):
+ // 0x1D26 - 0x1D2B
+ //
+ // Subscripts and superscripts (not part of LATIN script):
+ // 0x1D5D - 0x1D61
+ // 0x1D66 - 0x1D6A
+ // 0x1D78
+ //
// 0x1d80 - 0x1dbf Phonetic Extensions Supplement
+ //
+ // 0x1DBF Subscript or superscript (not part of LATIN script)
+ //
// 0x1e00 - 0x1eff Latin Extended Additional
// 0x2070 - 0x209f Superscripts and Subscripts
- // 0x2100 - 0x214f Letterlike symbols
- // 0x2150 - 0x218f Number Forms
+ //
+ // 0x2070 (not part of LATIN script)
+ // 0x2074 - 0x207E (not part of LATIN script)
+ //
+ // 0x2100 - 0x214f Letterlike symbols (not part of LATIN script)
+ //
+ // 0x212A - 0x212B (are part of LATIN script)
+ // 0x2132 (is part of LATIN script)
+ // 0x214E (is part of LATIN script)
+ //
+ // 0x2150 - 0x2189 Number Forms
+ //
+ // 0x2150 - 0x215F Fractions (not part of LATIN script)
+ // 0x2189 Fractions (not part of LATIN script)
+ //
// 0x2c60 - 0x2c7f Latin Extended-C
// 0xa720 - 0xa7ff Latin Extended-D
+ //
+ // 0xA720 - 0xA721 Uralic Phonetic (not part of LATIN script)
+ // 0xA788 (not part of LATIN script)
+ // 0xA789 - 0xA78A Budu (not part of LATIN script)
+ //
// 0xab30 - 0xab6f Latin Extended-E
+ //
// 0xfb00 - 0xfb06 Latin Alphabetic Presentation Forms
// 0xff00 - 0xffef Halfwidth and Fullwidth Forms
+ //
+ // 0xFF00 - 0xFF20 HWFW Symbols (not part of LATIN script)
+ // 0xFF3B - 0xFF40 HWFW Symbols (not part of LATIN script)
+ // 0xFF5B - 0xFFEF HWFW Symbols (not part of LATIN script)
// Brahmic scripts:
// 0x0900 - 0x097f Devanagari
// 6b. Additional transport and map symbols ( 1F681 - 1F6C5 )
// 6c. Other additional symbols ( 1F30D - 1F567 )
+ // Symbols. Workaround: the following symbols are classified separately (SYMBOLS1 - SYMBOLS5).
+ // 0x25cb
+ // 0x25cf
+ // 0x25a1
+ // 0x25a0
+ // 0x2664
+ // 0x2661
+ // 0x2662
+ // 0x2667
+ // 0x2606
+ // 0x25aa
+ // 0x262a
+
if( IsCommonScript( character ) )
{
return COMMON;
{
if( character <= 0x077f )
{
- if( character == 0x00A9 )
+ if( ( 0x0030 <= character ) && ( character <= 0x0039 ) )
+ {
+ return ASCII_DIGITS;
+ }
+ if( character <= 0x007E )
{
- return EMOJI; // 5. Uncategorized: copyright sign
+ if( ( 0x0020 <= character ) && ( character <= 0x002F ) )
+ {
+ return ASCII_PS;
+ }
+ if( ( 0x003A <= character ) && ( character <= 0x0040 ) )
+ {
+ return ASCII_PS;
+ }
+ if( ( 0x005B <= character ) && ( character <= 0x0060 ) )
+ {
+ return ASCII_PS;
+ }
+ if( ( 0x007B <= character ) && ( character <= 0x007E ) )
+ {
+ return ASCII_PS;
+ }
}
- if( character == 0x00AE )
+ if( ( 0x007F <= character ) && ( character <= 0x009F ) )
{
- return EMOJI; // 5. Uncategorized: registered sign
+ // 0x007F is actually part of the C0 Controls and Basic Latin block. However, it is the last character of that block
+ // and it is immediately followed by the control characters of the next block, so both are handled as a single range.
+ return C1_CONTROLS;
+ }
+ if( ( 0x00A0 <= character ) && ( character <= 0x00BF ) )
+ {
+ if( character == 0x00A9 )
+ {
+ return EMOJI; // 5. Uncategorized: copyright sign
+ }
+ if( character == 0x00AE )
+ {
+ return EMOJI; // 5. Uncategorized: registered sign
+ }
+
+ return C1_PS;
+ }
+ if( character == 0x00D7 )
+ {
+ return C1_MATH;
+ }
+ if( character == 0x00F7 )
+ {
+ return C1_MATH;
}
if( character <= 0x02ff )
{
+ if( ( 0x02B9 <= character ) && ( character <= 0x02BF ) )
+ {
+ return SML_P;
+ }
+
return LATIN;
}
if( ( 0x0370 <= character ) && ( character <= 0x03ff ) )
}
if( ( 0x1d00 <= character ) && ( character <= 0x1eff ) )
{
+ if( ( 0x1D26 <= character ) && ( character <= 0x1D2B ) )
+ {
+ return PHONETIC_U;
+ }
+ if( ( 0x1D5D <= character ) && ( character <= 0x1D61 ) )
+ {
+ return PHONETIC_SS;
+ }
+ if( ( 0x1D66 <= character ) && ( character <= 0x1D6A ) )
+ {
+ return PHONETIC_SS;
+ }
+ if( character == 0x1D78 )
+ {
+ return PHONETIC_SS;
+ }
+ if( character == 0x1DBF)
+ {
+ return PHONETIC_SS;
+ }
+
return LATIN;
}
}
}
if( ( 0x2070 <= character ) && ( character <= 0x209f ) )
{
+ if( character == 0x2070 )
+ {
+ return NUMERIC_SS;
+ }
+ if( ( 0x2074 <= character ) && ( character <= 0x207E ) )
+ {
+ return NUMERIC_SS;
+ }
+
return LATIN;
}
if( character == 0x20e3 )
{
return EMOJI; // 5. Uncategorized: information source
}
- if( ( 0x2100 <= character ) && ( character <= 0x218f ) )
- {
+ if( ( 0x2100 <= character ) && ( character <= 0x2189 ) )
+ {
+ if( ( 0x2100 <= character ) && ( character <= 0x214f ) )
+ {
+ if( ( 0x212A <= character ) && ( character <= 0x212B ) )
+ {
+ return LATIN;
+ }
+ if( character == 0x2132 )
+ {
+ return LATIN;
+ }
+ if( character == 0x214E )
+ {
+ return LATIN;
+ }
+
+ return LETTER_LIKE;
+ }
+ if( ( 0x2150 <= character ) && ( character <= 0x215F ) )
+ {
+ return FRACTIONS_NF;
+ }
+ if( character == 0x2189 )
+ {
+ return FRACTIONS_NF;
+ }
+
return LATIN;
}
+
+ // Symbols
+ if( ( 0x25cb == character ) ||
+ ( 0x25cf == character ) ||
+ ( 0x25a1 == character ) )
+ {
+ return SYMBOLS1;
+ }
+
+ if( 0x25a0 == character )
+ {
+ return SYMBOLS2;
+ }
+
+ if( ( 0x2664 == character ) ||
+ ( 0x2661 == character ) ||
+ ( 0x2662 == character ) ||
+ ( 0x2667 == character ) )
+ {
+ return SYMBOLS3;
+ }
+
+ if( ( 0x2606 == character ) ||
+ ( 0x25aa == character ) )
+ {
+ return SYMBOLS4;
+ }
+
+ if( 0x262a == character )
+ {
+ return SYMBOLS5;
+ }
+
// U+2194 5. Uncategorized: left right arrow
// U+2B55 5. Uncategorized: heavy large circle
if( ( 0x2194 <= character ) && ( character <= 0x2B55 ) )
}
if( ( 0xa720 <= character ) && ( character <= 0xa7ff ) )
{
+ if( character == 0xA720 )
+ {
+ return PHONETIC_U;
+ }
+ if( character == 0xA721 )
+ {
+ return PHONETIC_U;
+ }
+ if( character == 0xA788 )
+ {
+ return NON_LATIN_LED;
+ }
+ if( character == 0xA789 )
+ {
+ return NON_LATIN_LED;
+ }
+ if( character == 0xA78A )
+ {
+ return NON_LATIN_LED;
+ }
+
return LATIN;
}
if( ( 0xa960 <= character ) && ( character <= 0xa97f ) )
}
if( ( 0xff00 <= character ) && ( character <= 0xffef ) )
{
+ if( ( 0xFF00 <= character ) && ( character <= 0xFF20 ) )
+ {
+ return HWFW_S;
+ }
+ if( ( 0xFF3B <= character ) && ( character <= 0xFF40 ) )
+ {
+ return HWFW_S;
+ }
+ if( ( 0xFF5B <= character ) && ( character <= 0xFFEF ) )
+ {
+ return HWFW_S;
+ }
+
return LATIN;
}
if( ( 0x1ee00 <= character ) && ( character <= 0x1eeff ) )
*/
enum Script
{
- COMMON, ///< Valid for all scripts. i.e white space or '\n'.
-
- CYRILLIC, ///< The Cyrillic script. Used by Russian, Bulgarian, Ukrainian, Macedonian, ...
- GREEK, ///< The Greek script. Used by Greek.
- LATIN, ///< The latin script. Used by many western languages and others around the world.
-
- ARABIC, ///< The arabic script. Used by Arab and Urdu among others.
- HEBREW, ///< The Hebrew script. Used by the Hebrew, Yiddish, Ladino, and Judeo-Arabic.
-
- ARMENIAN, ///< The Armenian script. Used by Armenian.
- GEORGIAN, ///< The Georgian script. Used by Georgian.
-
- CJK, ///< The CJK script. Used by Chinese, Japanese, Korean and Vietnamese(old writing system).
- HANGUL, ///< The Hangul jamo script. Used by Korean.
- HIRAGANA, ///< The Hiragana script. Used by the Japanese.
- KATAKANA, ///< The Katakana script. Used by the Japanese.
- BOPOMOFO, ///< The Bopomofo script. Also called Zhuyin fuhao or Zhuyin. A phonetic notation used for the transcription of spoken Chinese.
-
- BENGALI, ///< The Bengali script. Used by Bangla, Assamese, Bishnupriya Manipuri, Daphla, Garo, Hallam, Khasi, Mizo, Munda, Naga, Rian, and Santali.
- BURMESE, ///< The Burmese script. Used by the Burmese (Myanmar) language.
- DEVANAGARI, ///< The devanagari script. Used by Hindi, Marathi, Sindhi, Nepali and Sanskrit.
- GUJARATI, ///< The Gujarati script. Used by Gujarati.
- GURMUKHI, ///< The Gurmukhi script. Used by Punjabi.
- KANNADA, ///< The Kannada script. Used by Kannada and Tulu.
- MALAYALAM, ///< The Malayalam script. Used by Malayalam.
- ORIYA, ///< The Oriya script. Used by Oriya (Odia), Khondi, and Santali.
- SINHALA, ///< The Sinhala script. Used by Sinhala and Pali.
- TAMIL, ///< The Tamil script. Used by Tamil, Badaga, and Saurashtra.
- TELUGU, ///< The Telugu script. Used by Telugu, Gondi, and Lambadi.
-
- LAO, ///< The Lao script. Used by the Lao language.
- THAI, ///< The Thai script. Used by the Thai language
- KHMER, ///< The Khmer script. Used by the Khmer language.
-
- EMOJI, ///< The Emoji which map to standardized Unicode characters.
-
- UNKNOWN ///< The script is unknown.
+ COMMON, ///< Valid for all scripts, e.g. white space or '\n'.
+
+ ASCII_DIGITS, ///< ASCII digits (0x0030 - 0x0039).
+ ASCII_PS, ///< ASCII punctuation and symbols.
+
+ C1_CONTROLS, ///< Controls of the C1 Controls and Latin-1 Supplement Unicode block. GetCharacterScript() also returns it for 0x007F.
+ C1_PS, ///< Punctuation and symbols of the C1 Controls and Latin-1 Supplement Unicode block.
+ C1_MATH, ///< Math symbols of the C1 Controls and Latin-1 Supplement Unicode block (0x00D7 and 0x00F7).
+
+ SML_P, ///< Punctuation symbols of the Spacing Modifier Letters Unicode block.
+ PHONETIC_U, ///< Uralic phonetic symbols of the Phonetic Extensions Unicode block.
+ PHONETIC_SS, ///< Subscripts and superscripts of the Phonetic Extensions Unicode block.
+
+ NUMERIC_SS, ///< Numeric subscripts and superscripts.
+
+ LETTER_LIKE, ///< Symbols of the Letterlike Symbols Unicode block.
+ NUMBER_FORMS, ///< Number Forms Unicode block.
+ FRACTIONS_NF, ///< Numeric fraction symbols of the Number Forms Unicode block.
+ NON_LATIN_LED, ///< Non-Latin symbols within the Latin Extended-D Unicode block.
+ HWFW_S, ///< Non-Latin symbols within the Halfwidth and Fullwidth Forms Unicode block.
+
+ CYRILLIC, ///< The Cyrillic script. Used by Russian, Bulgarian, Ukrainian, Macedonian, ...
+ GREEK, ///< The Greek script. Used by Greek.
+ LATIN, ///< The Latin script. Used by many western languages and others around the world.
+
+ ARABIC, ///< The Arabic script. Used by Arabic and Urdu among others.
+ HEBREW, ///< The Hebrew script. Used by the Hebrew, Yiddish, Ladino, and Judeo-Arabic.
+
+ ARMENIAN, ///< The Armenian script. Used by Armenian.
+ GEORGIAN, ///< The Georgian script. Used by Georgian.
+
+ CJK, ///< The CJK script. Used by Chinese, Japanese, Korean and Vietnamese (old writing system).
+ HANGUL, ///< The Hangul jamo script. Used by Korean.
+ HIRAGANA, ///< The Hiragana script. Used by the Japanese.
+ KATAKANA, ///< The Katakana script. Used by the Japanese.
+ BOPOMOFO, ///< The Bopomofo script. Also called Zhuyin fuhao or Zhuyin. A phonetic notation used for the transcription of spoken Chinese.
+
+ BENGALI, ///< The Bengali script. Used by Bangla, Assamese, Bishnupriya Manipuri, Daphla, Garo, Hallam, Khasi, Mizo, Munda, Naga, Rian, and Santali.
+ BURMESE, ///< The Burmese script. Used by the Burmese (Myanmar) language.
+ DEVANAGARI, ///< The Devanagari script. Used by Hindi, Marathi, Sindhi, Nepali and Sanskrit.
+ GUJARATI, ///< The Gujarati script. Used by Gujarati.
+ GURMUKHI, ///< The Gurmukhi script. Used by Punjabi.
+ KANNADA, ///< The Kannada script. Used by Kannada and Tulu.
+ MALAYALAM, ///< The Malayalam script. Used by Malayalam.
+ ORIYA, ///< The Oriya script. Used by Oriya (Odia), Khondi, and Santali.
+ SINHALA, ///< The Sinhala script. Used by Sinhala and Pali.
+ TAMIL, ///< The Tamil script. Used by Tamil, Badaga, and Saurashtra.
+ TELUGU, ///< The Telugu script. Used by Telugu, Gondi, and Lambadi.
+
+ LAO, ///< The Lao script. Used by the Lao language.
+ THAI, ///< The Thai script. Used by the Thai language.
+ KHMER, ///< The Khmer script. Used by the Khmer language.
+
+ EMOJI, ///< The Emoji which map to standardized Unicode characters.
+
+ SYMBOLS1, ///< Workaround group for the symbols 0x25CB, 0x25CF and 0x25A1.
+ SYMBOLS2, ///< Workaround group for the symbol 0x25A0.
+ SYMBOLS3, ///< Workaround group for the symbols 0x2664, 0x2661, 0x2662 and 0x2667.
+ SYMBOLS4, ///< Workaround group for the symbols 0x2606 and 0x25AA.
+ SYMBOLS5, ///< Workaround group for the symbol 0x262A.
+
+ UNKNOWN ///< The script is unknown.
};
const char* const ScriptName[] =
{
- "COMMON", ///< Valid for all scripts. i.e white space or '\n'.
-
- "CYRILLIC", ///< The Cyrillic script. Used by Russian, Bulgarian, Ukrainian, Macedonian, ...
- "GREEK", ///< The Greek script. Used by Greek.
- "LATIN", ///< The latin script. Used by many western languages and others around the world.
-
- "ARABIC", ///< The arabic script. Used by Arab and Urdu among others.
- "HEBREW", ///< The Hebrew script. Used by the Hebrew, Yiddish, Ladino, and Judeo-Arabic.
-
- "ARMENIAN", ///< The Armenian script. Used by Armenian.
- "GEORGIAN", ///< The Georgian script. Used by Georgian.
-
- "CJK", ///< The CJK script. Used by Chinese, Japanese, Korean and Vietnamese(old writing system).
- "HANGUL", ///< The Hangul jamo script. Used by Korean.
- "HIRAGANA", ///< The Hiragana script. Used by the Japanese.
- "KATAKANA", ///< The Katakana script. Used by the Japanese.
- "BOPOMOFO", ///< The Bopomofo script. Also called Zhuyin fuhao or Zhuyin. A phonetic notation used for the transcription of spoken Chinese.
-
- "BENGALI", ///< The Bengali script. Used by Bangla, Assamese, Bishnupriya Manipuri, Daphla, Garo, Hallam, Khasi, Mizo, Munda, Naga, Rian, and Santali.
- "BURMESE", ///< The Burmese script. Used by the Burmese (Myanmar) language.
- "DEVANAGARI", ///< The devanagari script. Used by Hindi, Marathi, Sindhi, Nepali and Sanskrit.
- "GUJARATI", ///< The Gujarati script. Used by Gujarati.
- "GURMUKHI", ///< The Gurmukhi script. Used by Punjabi.
- "KANNADA", ///< The Kannada script. Used by Kannada and Tulu.
- "MALAYALAM", ///< The Malayalam script. Used by Malayalam.
- "ORIYA", ///< The Oriya script. Used by Oriya (Odia), Khondi, and Santali.
- "SINHALA", ///< The Sinhala script. Used by Sinhala and Pali.
- "TAMIL", ///< The Tamil script. Used by Tamil, Badaga, and Saurashtra.
- "TELUGU", ///< The Telugu script. Used by Telugu, Gondi, and Lambadi.
-
- "LAO", ///< The Lao script. Used by the Lao language.
- "THAI", ///< The Thai script. Used by the Thai language
- "KHMER", ///< The Khmer script. Used by the Khmer language.
-
- "EMOJI", ///< The Emoji which map to standardized Unicode characters.
-
- "UNKNOWN" ///< The script is unknown.
+ "COMMON", ///< Valid for all scripts, e.g. white space or '\n'.
+
+ "ASCII_DIGITS", ///< ASCII digits (0x0030 - 0x0039).
+ "ASCII_PS", ///< ASCII punctuation and symbols.
+
+ "C1_CONTROLS", ///< Controls of the C1 Controls and Latin-1 Supplement Unicode block.
+ "C1_PS", ///< Punctuation and symbols of the C1 Controls and Latin-1 Supplement Unicode block.
+ "C1_MATH", ///< Math symbols of the C1 Controls and Latin-1 Supplement Unicode block (0x00D7 and 0x00F7).
+
+ "SML_P", ///< Punctuation symbols of the Spacing Modifier Letters Unicode block.
+ "PHONETIC_U", ///< Uralic phonetic symbols of the Phonetic Extensions Unicode block.
+ "PHONETIC_SS", ///< Subscripts and superscripts of the Phonetic Extensions Unicode block.
+
+ "NUMERIC_SS", ///< Numeric subscripts and superscripts.
+
+ "LETTER_LIKE", ///< Symbols of the Letterlike Symbols Unicode block.
+ "NUMBER_FORMS", ///< Number Forms Unicode block.
+ "FRACTIONS_NF", ///< Numeric fraction symbols of the Number Forms Unicode block.
+ "NON_LATIN_LED", ///< Non-Latin symbols within the Latin Extended-D Unicode block.
+ "HWFW_S", ///< Non-Latin symbols within the Halfwidth and Fullwidth Forms Unicode block.
+
+ "CYRILLIC", ///< The Cyrillic script. Used by Russian, Bulgarian, Ukrainian, Macedonian, ...
+ "GREEK", ///< The Greek script. Used by Greek.
+ "LATIN", ///< The Latin script. Used by many western languages and others around the world.
+
+ "ARABIC", ///< The Arabic script. Used by Arabic and Urdu among others.
+ "HEBREW", ///< The Hebrew script. Used by the Hebrew, Yiddish, Ladino, and Judeo-Arabic.
+
+ "ARMENIAN", ///< The Armenian script. Used by Armenian.
+ "GEORGIAN", ///< The Georgian script. Used by Georgian.
+
+ "CJK", ///< The CJK script. Used by Chinese, Japanese, Korean and Vietnamese (old writing system).
+ "HANGUL", ///< The Hangul jamo script. Used by Korean.
+ "HIRAGANA", ///< The Hiragana script. Used by the Japanese.
+ "KATAKANA", ///< The Katakana script. Used by the Japanese.
+ "BOPOMOFO", ///< The Bopomofo script. Also called Zhuyin fuhao or Zhuyin. A phonetic notation used for the transcription of spoken Chinese.
+
+ "BENGALI", ///< The Bengali script. Used by Bangla, Assamese, Bishnupriya Manipuri, Daphla, Garo, Hallam, Khasi, Mizo, Munda, Naga, Rian, and Santali.
+ "BURMESE", ///< The Burmese script. Used by the Burmese (Myanmar) language.
+ "DEVANAGARI", ///< The Devanagari script. Used by Hindi, Marathi, Sindhi, Nepali and Sanskrit.
+ "GUJARATI", ///< The Gujarati script. Used by Gujarati.
+ "GURMUKHI", ///< The Gurmukhi script. Used by Punjabi.
+ "KANNADA", ///< The Kannada script. Used by Kannada and Tulu.
+ "MALAYALAM", ///< The Malayalam script. Used by Malayalam.
+ "ORIYA", ///< The Oriya script. Used by Oriya (Odia), Khondi, and Santali.
+ "SINHALA", ///< The Sinhala script. Used by Sinhala and Pali.
+ "TAMIL", ///< The Tamil script. Used by Tamil, Badaga, and Saurashtra.
+ "TELUGU", ///< The Telugu script. Used by Telugu, Gondi, and Lambadi.
+
+ "LAO", ///< The Lao script. Used by the Lao language.
+ "THAI", ///< The Thai script. Used by the Thai language.
+ "KHMER", ///< The Khmer script. Used by the Khmer language.
+
+ "EMOJI", ///< The Emoji which map to standardized Unicode characters.
+
+ "SYMBOLS1", ///< Workaround group for the symbols 0x25CB, 0x25CF and 0x25A1.
+ "SYMBOLS2", ///< Workaround group for the symbol 0x25A0.
+ "SYMBOLS3", ///< Workaround group for the symbols 0x2664, 0x2661, 0x2662 and 0x2667.
+ "SYMBOLS4", ///< Workaround group for the symbols 0x2606 and 0x25AA.
+ "SYMBOLS5", ///< Workaround group for the symbol 0x262A.
+
+ "UNKNOWN" ///< The script is unknown.
};
/**