namespace
{
const unsigned int WHITE_SPACE_THRESHOLD = 0x21; ///< All characters below 0x21 are considered white spaces.
-const unsigned int CHAR_FL = 0x000A; ///< NL Line feed, new line.
+const unsigned int CHAR_LF = 0x000A; ///< NL Line feed, new line.
const unsigned int CHAR_VT = 0x000B; ///< Vertical tab.
const unsigned int CHAR_FF = 0x000C; ///< NP Form feed, new page.
+const unsigned int CHAR_CR = 0x000D; ///< Carriage return, new line.
const unsigned int CHAR_NEL = 0x0085; ///< Next line.
const unsigned int CHAR_LS = 0x2028; ///< Line separator.
const unsigned int CHAR_PS = 0x2029; ///< Paragraph separator
Script GetCharacterScript( Character character )
{
- // Latin script:
+ // Latin script: It contains punctuation characters and symbols which are not part of the latin script. https://en.wikipedia.org/wiki/Latin_script_in_Unicode
// 0x0000 - 0x007f C0 Controls and Basic Latin
+ //
+ // ASCII digits (not part of LATIN script):
+ // 0x0030 - 0x0039
+ //
+ // ASCII punctuation and symbols (not part of LATIN script):
+ // 0x0020 - 0x002F
+ // 0x003A - 0x0040
+ // 0x005B - 0x0060
+ // 0x007B - 0x007E
+ //
+ // Controls (not part of LATIN script):
+ // 0x007F
+ //
// 0x0080 - 0x00ff C1 Controls and Latin-1 Supplement
+ //
+ // Controls (not part of LATIN script):
+ // 0x0080 - 0x009F
+ //
+ // Punctuations and symbols (not part of LATIN script):
+ // 0x00A0 - 0x00BF
+ //
+ // Mathematical operators (not part of LATIN script):
+ // 0x00D7
+ // 0x00F7
+ //
// 0x0100 - 0x017f Latin Extended-A
// 0x0180 - 0x024f Latin Extended-B
// 0x0250 - 0x02af IPA Extensions
// 0x02b0 - 0x02ff Spacing Modifier Letters
+ //
+ // Punctuation (not part of LATIN script):
+ // 0x02B9 - 0x02BF
+ //
// 0x1d00 - 0x1d7f Phonetic Extensions
+ //
+ // Uralic Phonetic (not part of LATIN script):
+ // 0x1D26 - 0x1D2B
+ //
+ // Subscripts and superscripts
+ // 0x1D5D - 0x1D61
+ // 0x1D66 - 0x1D6A
+ // 0x1D78
+ //
// 0x1d80 - 0x1dbf Phonetic Extensions Supplement
+ //
+ // 0x1DBF (subscript or superscript. Not part of LATIN script )
+ //
// 0x1e00 - 0x1eff Latin Extended Additional
// 0x2070 - 0x209f Superscripts and Subscripts
- // 0x2100 - 0x214f Letterlike symbols
- // 0x2150 - 0x218f Number Forms
+ //
+ // 0x2070 (not part of LATIN script)
+ // 0x2074 - 0x207E (not part of LATIN script)
+ //
+ // 0x2100 - 0x214f Letterlike symbols (not part of LATIN script)
+ //
+ // 0x212A - 0x212B (are part of LATIN script)
+ // 0x2132 (are part of LATIN script)
+ // 0x214E (are part of LATIN script)
+ //
+ // 0x2150 - 0x2189 Number Forms
+ //
+ // 0x2150 - 0x215F Fractions (not part of LATIN script)
+ // 0x2189 Fractions (not part of LATIN script)
+ //
// 0x2c60 - 0x2c7f Latin Extended-C
// 0xa720 - 0xa7ff Latin Extended-D
+ //
+ // 0xA720 - 0xA721 Uralic Phonetic (not part of LATIN script)
+ // 0xA788 (not part of LATIN script)
+ // 0xA789 - 0xA78A Budu (not part of LATIN script)
+ //
// 0xab30 - 0xab6f Latin Extended-E
+ //
// 0xfb00 - 0xfb06 Latin Alphabetic Presentation Forms
// 0xff00 - 0xffef Halfwidth and Fullwidth Forms
+ //
+ // 0xFF00 - 0xFF20 HWFW Symbols (not part of LATIN script)
+ // 0xFF3B - 0xFF40 HWFW Symbols (not part of LATIN script)
+ // 0xFF5B - 0xFFEF HWFW Symbols (not part of LATIN script)
// Brahmic scripts:
// 0x0900 - 0x097f Devanagari
// 0xac00 - 0xd7af Hangul Syllables
// 0xd7b0 - 0xd7ff Hangul Jamo Extended-B
+ // Bopomofo script
+ // 0x3100 - 0x312f Bopomofo
+ // 0x31a0 - 0x31bf Bopomofo Extended
+
// Khmer script
// 0x1780 - 0x17ff Khmer
// 0x19e0 - 0x19ff Khmer Symbols
// 0x0530 - 0x058f Armenian
// 0xfb13 - 0xfb17 Armenian subset of Alphabetic prefentation forms
+ // Javanese script
+ // 0xa980 - 0xa9fd Javanese
+
+ // Sundanese script
+ // 0x1b80 - 0x1bbf Sundanese
+ // 0x1cc0 - 0x1ccf Sundanese supplement
+
// The Emoji which map to standardized Unicode characters
// 1. Emoticons ( 1F601 - 1F64F )
// 2. Dingbats ( 2702 - 27B0 )
// 6b. Additional transport and map symbols ( 1F681 - 1F6C5 )
// 6c. Other additional symbols ( 1F30D - 1F567 )
+ // Symbols. Workaround for these symbols.
+ // 0x25cb
+ // 0x25cf
+ // 0x25a1
+ // 0x25a0
+ // 0x2664
+ // 0x2661
+ // 0x2662
+ // 0x2667
+ // 0x2606
+ // 0x25aa
+ // 0x262a
+
+ if( IsCommonScript( character ) )
+ {
+ return COMMON;
+ }
+
if( character <= 0x0cff )
{
if( character <= 0x09ff )
{
if( character <= 0x077f )
{
- if( character == 0x00A9 )
+ if( ( 0x0030 <= character ) && ( character <= 0x0039 ) )
+ {
+ return ASCII_DIGITS;
+ }
+ if( character <= 0x007E )
+ {
+ if( ( 0x0020 <= character ) && ( character <= 0x002F ) )
+ {
+ return ASCII_PS;
+ }
+ if( ( 0x003A <= character ) && ( character <= 0x0040 ) )
+ {
+ return ASCII_PS;
+ }
+ if( ( 0x005B <= character ) && ( character <= 0x0060 ) )
+ {
+ return ASCII_PS;
+ }
+ if( ( 0x007B <= character ) && ( character <= 0x007E ) )
+ {
+ return ASCII_PS;
+ }
+ }
+ if( ( 0x007F <= character ) && ( character <= 0x009F ) )
{
- return EMOJI; // 5. Uncategorized: copyright sign
+ // 0x007F is actually part of the C0 Controls and Basic Latin block. However, it is the last and only control character of its block
+ // and it is immediately followed by the control characters of the next block, so the whole consecutive range is handled here.
+ return C1_CONTROLS;
}
- if( character == 0x00AE )
+ if( ( 0x00A0 <= character ) && ( character <= 0x00BF ) )
{
- return EMOJI; // 5. Uncategorized: registered sign
+ if( character == 0x00A9 )
+ {
+ return EMOJI; // 5. Uncategorized: copyright sign
+ }
+ if( character == 0x00AE )
+ {
+ return EMOJI; // 5. Uncategorized: registered sign
+ }
+
+ return C1_PS;
+ }
+ if( character == 0x00D7 )
+ {
+ return C1_MATH;
+ }
+ if( character == 0x00F7 )
+ {
+ return C1_MATH;
}
if( character <= 0x02ff )
{
+ if( ( 0x02B9 <= character ) && ( character <= 0x02BF ) )
+ {
+ return SML_P;
+ }
+
return LATIN;
}
if( ( 0x0370 <= character ) && ( character <= 0x03ff ) )
{
return KHMER;
}
+ if( ( 0x1b80 <= character ) && ( character <= 0x1bbf ) )
+ {
+ return SUNDANESE;
+ }
+ if( ( 0x1cc0 <= character ) && ( character <= 0x1ccf ) )
+ {
+ return SUNDANESE;
+ }
if( ( 0x1d00 <= character ) && ( character <= 0x1eff ) )
{
+ if( ( 0x1D26 <= character ) && ( character <= 0x1D2B ) )
+ {
+ return PHONETIC_U;
+ }
+ if( ( 0x1D5D <= character ) && ( character <= 0x1D61 ) )
+ {
+ return PHONETIC_SS;
+ }
+ if( ( 0x1D66 <= character ) && ( character <= 0x1D6A ) )
+ {
+ return PHONETIC_SS;
+ }
+ if( character == 0x1D78 )
+ {
+ return PHONETIC_SS;
+ }
+ if( character == 0x1DBF)
+ {
+ return PHONETIC_SS;
+ }
+
return LATIN;
}
}
}
if( ( 0x2070 <= character ) && ( character <= 0x209f ) )
{
+ if( character == 0x2070 )
+ {
+ return NUMERIC_SS;
+ }
+ if( ( 0x2074 <= character ) && ( character <= 0x207E ) )
+ {
+ return NUMERIC_SS;
+ }
+
return LATIN;
}
if( character == 0x20e3 )
{
return EMOJI; // 5. Uncategorized: information source
}
- if( ( 0x2100 <= character ) && ( character <= 0x218f ) )
- {
+ if( ( 0x2100 <= character ) && ( character <= 0x2189 ) )
+ {
+ if( ( 0x2100 <= character ) && ( character <= 0x214f ) )
+ {
+ if( ( 0x212A <= character ) && ( character <= 0x212B ) )
+ {
+ return LATIN;
+ }
+ if( character == 0x2132 )
+ {
+ return LATIN;
+ }
+ if( character == 0x214E )
+ {
+ return LATIN;
+ }
+
+ return LETTER_LIKE;
+ }
+ if( ( 0x2150 <= character ) && ( character <= 0x215F ) )
+ {
+ return FRACTIONS_NF;
+ }
+ if( character == 0x2189 )
+ {
+ return FRACTIONS_NF;
+ }
+
return LATIN;
}
+
+ // Symbols
+ if( ( 0x25cb == character ) ||
+ ( 0x25cf == character ) ||
+ ( 0x25a1 == character ) )
+ {
+ return SYMBOLS1;
+ }
+
+ if( 0x25a0 == character )
+ {
+ return SYMBOLS2;
+ }
+
+ if( ( 0x2664 == character ) ||
+ ( 0x2661 == character ) ||
+ ( 0x2662 == character ) ||
+ ( 0x2667 == character ) )
+ {
+ return SYMBOLS3;
+ }
+
+ if( ( 0x2606 == character ) ||
+ ( 0x25aa == character ) )
+ {
+ return SYMBOLS4;
+ }
+
+ if( 0x262a == character )
+ {
+ return SYMBOLS5;
+ }
+
// U+2194 5. Uncategorized: left right arrow
// U+2B55 5. Uncategorized: heavy large circle
if( ( 0x2194 <= character ) && ( character <= 0x2B55 ) )
{
return KATAKANA;
}
+ if( ( 0x3100 <= character ) && ( character <= 0x312f ) )
+ {
+ return BOPOMOFO;
+ }
if( ( 0x3130 <= character ) && ( character <= 0x318f ) )
{
return HANGUL;
}
+ if( ( 0x31a0 <= character ) && ( character <= 0x31bf ) )
+ {
+ return BOPOMOFO;
+ }
if( ( 0x3200 <= character ) && ( character <= 0x32ff ) )
{
return CJK;
}
if( ( 0xa720 <= character ) && ( character <= 0xa7ff ) )
{
+ if( character == 0xA720 )
+ {
+ return PHONETIC_U;
+ }
+ if( character == 0xA721 )
+ {
+ return PHONETIC_U;
+ }
+ if( character == 0xA788 )
+ {
+ return NON_LATIN_LED;
+ }
+ if( character == 0xA789 )
+ {
+ return NON_LATIN_LED;
+ }
+ if( character == 0xA78A )
+ {
+ return NON_LATIN_LED;
+ }
+
return LATIN;
}
if( ( 0xa960 <= character ) && ( character <= 0xa97f ) )
{
return HANGUL;
}
+ if( ( 0xa980 <= character ) && ( character <= 0xa9fd ) )
+ {
+ return JAVANESE;
+ }
if( ( 0xab30 <= character ) && ( character <= 0xab6f ) )
{
return LATIN;
}
if( ( 0xff00 <= character ) && ( character <= 0xffef ) )
{
+ if( ( 0xFF00 <= character ) && ( character <= 0xFF20 ) )
+ {
+ return HWFW_S;
+ }
+ if( ( 0xFF3B <= character ) && ( character <= 0xFF40 ) )
+ {
+ return HWFW_S;
+ }
+ if( ( 0xFF5B <= character ) && ( character <= 0xFFEF ) )
+ {
+ return HWFW_S;
+ }
+
return LATIN;
}
if( ( 0x1ee00 <= character ) && ( character <= 0x1eeff ) )
bool IsNewParagraph( Character character )
{
- return ( ( CHAR_FL == character ) ||
+ return ( ( CHAR_LF == character ) ||
( CHAR_VT == character ) ||
( CHAR_FF == character ) ||
+ ( CHAR_CR == character ) ||
( CHAR_NEL == character ) ||
( CHAR_LS == character ) ||
( CHAR_PS == character ) );
return CHAR_TS == character;
}
+/**
+ * @brief Tells whether a character is one of the characters treated as script-neutral.
+ *
+ * A character qualifies when any of the following predicates accepts it, checked in this order:
+ * white space, zero width non joiner, zero width joiner, zero width space,
+ * left to right mark, right to left mark, thin space, new paragraph.
+ *
+ * @param[in] character The character to check.
+ *
+ * @return @e true if at least one of the predicates above holds for @p character, @e false otherwise.
+ */
+bool IsCommonScript( Character character )
+{
+  // Each test returns as soon as it matches, so later predicates are not evaluated.
+  if( IsWhiteSpace( character ) )
+  {
+    return true;
+  }
+  if( IsZeroWidthNonJoiner( character ) )
+  {
+    return true;
+  }
+  if( IsZeroWidthJoiner( character ) )
+  {
+    return true;
+  }
+  if( IsZeroWidthSpace( character ) )
+  {
+    return true;
+  }
+  if( IsLeftToRightMark( character ) )
+  {
+    return true;
+  }
+  if( IsRightToLeftMark( character ) )
+  {
+    return true;
+  }
+  if( IsThinSpace( character ) )
+  {
+    return true;
+  }
+
+  // Last candidate: the result is whatever the new paragraph test says.
+  return IsNewParagraph( character );
+}
+
+/**
+ * @brief Tells whether the given script is one of the scripts flagged by this query.
+ *
+ * Only the LATIN and ARABIC scripts are flagged.
+ * NOTE(review): judging by the name, these are presumably the scripts whose ligatures must be broken — confirm against the callers.
+ *
+ * @param[in] script The script to check.
+ *
+ * @return @e true if @p script is LATIN or ARABIC, @e false for any other script.
+ */
+bool HasLigatureMustBreak( Script script )
+{
+  if( script == LATIN )
+  {
+    return true;
+  }
+
+  return script == ARABIC;
+}
+
} // namespace TextAbstraction
} // namespace Dali