[dali_2.3.24] Merge branch 'devel/master'
[platform/core/uifw/dali-adaptor.git] / dali / devel-api / text-abstraction / script.cpp
1 /*
2  * Copyright (c) 2021 Samsung Electronics Co., Ltd.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */
17
18 // FILE HEADER
19 #include <dali/devel-api/text-abstraction/script.h>
20
21 namespace Dali
22 {
23 namespace TextAbstraction
24 {
25 namespace
26 {
27 //TODO: Move the below defined characters to "defined-characters.h"
// Every code point strictly below this value is treated as a white space.
constexpr unsigned int WHITE_SPACE_THRESHOLD{0x21}; ///< All characters below 0x21 are considered white spaces.
constexpr unsigned int CHAR_SPACE{0x20};            ///< Space.

// Code points that start a new paragraph (see IsNewParagraph()).
constexpr unsigned int CHAR_LF{0x000A};  ///< NL Line feed, new line.
constexpr unsigned int CHAR_VT{0x000B};  ///< Vertical tab.
constexpr unsigned int CHAR_FF{0x000C};  ///< NP Form feed, new page.
constexpr unsigned int CHAR_CR{0x000D};  ///< Carriage return, new line.
constexpr unsigned int CHAR_NEL{0x0085}; ///< Next line.
constexpr unsigned int CHAR_LS{0x2028};  ///< Line separator.
constexpr unsigned int CHAR_PS{0x2029};  ///< Paragraph separator.

// Joiners, directional marks and the thin space.
// NOTE(review): CHAR_ZWS, used by IsZeroWidthSpace(), is not defined in this list;
// presumably it is declared in an included header - confirm.
constexpr unsigned int CHAR_ZWNJ{0x200C}; ///< Zero width non joiner.
constexpr unsigned int CHAR_ZWJ{0x200D};  ///< Zero width joiner.
constexpr unsigned int CHAR_LTRM{0x200E}; ///< Left to Right Mark.
constexpr unsigned int CHAR_RTLM{0x200F}; ///< Right to Left Mark.
constexpr unsigned int CHAR_TS{0x2009};   ///< Thin Space.
43
44 // Latin script:   It contains punctuation characters and symbols which are not part of the latin script. https://en.wikipedia.org/wiki/Latin_script_in_Unicode
45 // 0x0000 - 0x007f C0 Controls and Basic Latin
46 //
47 //                 ASCII digits (not part of LATIN script):
48 //                 0x0030 - 0x0039
49 //
50 //                 ASCII punctuation and symbols (not part of LATIN script):
51 //                 0x0020 - 0x002F
52 //                 0x003A - 0x0040
53 //                 0x005B - 0x0060
54 //                 0x007B - 0x007E
55 //
56 //                 Controls (not part of LATIN script):
57 //                 0x007F
58 //
59 // 0x0080 - 0x00ff C1 Controls and Latin-1 Supplement
60 //
61 //                 Controls (not part of LATIN script):
62 //                 0x0080 - 0x009F
63 //
64 //                 Punctuations and symbols (not part of LATIN script):
65 //                 0x00A0 - 0x00BF
66 //
67 //                 Mathematical operators (not part of LATIN script):
68 //                 0x00D7
69 //                 0x00F7
70 //
71 // 0x0100 - 0x017f Latin Extended-A
72 // 0x0180 - 0x024f Latin Extended-B
73 // 0x0250 - 0x02af IPA Extensions
74 // 0x02b0 - 0x02ff Spacing Modifier Letters
75 //
76 //                 Punctuation (not part of LATIN script):
77 //                 0x02B9 - 0x02BF
78 //
79 // 0x1d00 - 0x1d7f Phonetic Extensions
80 //
81 //                 Uralic Phonetic (not part of LATIN script):
82 //                 0x1D26 - 0x1D2B
83 //
84 //                 Subscripts and superscripts
85 //                 0x1D5D - 0x1D61
86 //                 0x1D66 - 0x1D6A
87 //                 0x1D78
88 //
89 // 0x1d80 - 0x1dbf Phonetic Extensions Supplement
90 //
91 //                 0x1DBF (subscript or superscript. Not part of LATIN script )
92 //
93 // 0x1e00 - 0x1eff Latin Extended Additional
94 // 0x2070 - 0x209f Superscripts and Subscripts
95 //
96 //                 0x2070          (not part of LATIN script)
97 //                 0x2074 - 0x207E (not part of LATIN script)
98 //
99 // 0x2100 - 0x214f Letterlike symbols (not part of LATIN script)
100 //
101 //                 0x212A - 0x212B (are part of LATIN script)
102 //                 0x2132          (are part of LATIN script)
103 //                 0x214E          (are part of LATIN script)
104 //
105 // 0x2150 - 0x2189 Number Forms
106 //
107 //                 0x2150 - 0x215F Fractions (not part of LATIN script)
108 //                 0x2189          Fractions (not part of LATIN script)
109 //
110 // 0x2c60 - 0x2c7f Latin Extended-C
111 // 0xa720 - 0xa7ff Latin Extended-D
112 //
113 //                 0xA720 - 0xA721 Uralic Phonetic (not part of LATIN script)
114 //                 0xA788          (not part of LATIN script)
115 //                 0xA789 - 0xA78A Budu (not part of LATIN script)
116 //
117 // 0xab30 - 0xab6f Latin Extended-E
118 //
119 // 0xfb00 - 0xfb06 Latin Alphabetic Presentation Forms
120 // 0xff00 - 0xffef Halfwidth and Fullwidth Forms
121 //
122 //                 0xFF00 - 0xFF20 HWFW Symbols (not part of LATIN script)
123 //                 0xFF3B - 0xFF40 HWFW Symbols (not part of LATIN script)
124 //                 0xFF5B - 0xFFEF HWFW Symbols (not part of LATIN script)
125
126 // Brahmic scripts:
127 // 0x0900 - 0x097f Devanagari
128 // 0x0980 - 0x09ff Bengali
129 // 0x0a00 - 0x0a7f Gurmukhi
130 // 0x0a80 - 0x0aff Gujarati
131 // 0x0b00 - 0x0b7f Oriya
132 // 0x0b80 - 0x0bff Tamil
133 // 0x0c00 - 0x0c7f Telugu
134 // 0x0c80 - 0x0cff Kannada
135 // 0x0d00 - 0x0d7f Malayalam
136
137 // Sinhala script.
138 // 0x0d80 - 0x0dff Sinhala
139
140 // Arabic script.
141 // 0x0600 - 0x06ff Arabic
142 // 0x0750 - 0x077f Arabic Supplement
143 // 0x08A0 - 0x08ff Arabic Extended-A
144 // 0xfb50 - 0xfdff Arabic Presentation Forms-A
145 // 0xfe70 - 0xfeff Arabic Presentation Forms-B
146 // 0x1ee00 - 0x1eeff Arabic Mathematical Alphabetic Symbols
147
148 // CJK (Chinese, Japanese and Korean) and Vietnamese script.
149 // 0x2e80 - 0x2eff CJK Radicals Supplement
150 // 0x2f00 - 0x2fdf Kangxi Radicals
151 // 0x3000 - 0x303f CJK Symbols and Punctuation
152 // 0x3200 - 0x32ff Enclosed CJK Letters and Months
153 // 0x3400 - 0x4dbf CJK Unified Ideographs Extension A
154 // 0x4e00 - 0x62ff CJK Unified Ideographs
155 // 0x6300 - 0x77ff CJK Unified Ideographs
156 // 0x7800 - 0x8cff CJK Unified Ideographs
157 // 0x8d00 - 0x9fff CJK Unified Ideographs
158 // 0x20000 - 0x215ff CJK Unified Ideographs Extension B
159 // 0x21600 - 0x230ff CJK Unified Ideographs Extension B
160 // 0x23100 - 0x245ff CJK Unified Ideographs Extension B
161 // 0x24600 - 0x260ff CJK Unified Ideographs Extension B
162 // 0x26100 - 0x275ff CJK Unified Ideographs Extension B
163 // 0x27600 - 0x290ff CJK Unified Ideographs Extension B
164 // 0x29100 - 0x2a6df CJK Unified Ideographs Extension B
165 // 0x2a700 - 0x2b73f CJK Unified Ideographs Extension C
166 // 0x2b740 - 0x2b81f CJK Unified Ideographs Extension D
167
168 // Japanese scripts.
169 // 0x3040 - 0x309f Hiragana
170 // 0x30a0 - 0x30ff Katakana
171
172 // Hangul script
173 // 0x1100 - 0x11ff Hangul jamo
174 // 0x3130 - 0x318f Hangul Compatibility Jamo
175 // 0xa960 - 0xa97f Hangul Jamo Extended-A
176 // 0xac00 - 0xd7af Hangul Syllables
177 // 0xd7b0 - 0xd7ff Hangul Jamo Extended-B
178
179 // Bopomofo script
180 // 0x3100 - 0x312f Bopomofo
181 // 0x31a0 - 0x31bf Bopomofo Extended
182
183 // Khmer script
184 // 0x1780 - 0x17ff Khmer
185 // 0x19e0 - 0x19ff Khmer Symbols
186
187 // Lao script
188 // 0x0e80 - 0x0eff Lao
189
190 // Thai script
191 // 0x0e00 - 0x0e7f Thai
192
193 // Burmese script
194 // 0x1000 - 0x109f Myanmar
195
196 // Hebrew script
197 // 0x0591 - 0x05f4 Hebrew
198 // 0xfb1d - 0xfb4f Hebrew subset of Alphabetic Presentation Forms
199
200 // Cyrillic script
201 // 0x0400 - 0x04ff Cyrillic
202 // 0x0500 - 0x052f Cyrillic supplement
203 // 0x2de0 - 0x2dff Cyrillic Extended-A
204 // 0xa640 - 0xa69f Cyrillic Extended-B
205
206 // Georgian script
207 // 0x10a0 - 0x10ff Georgian
208 // 0x2d00 - 0x2d2f Georgian supplement
209
210 // Greek script
211 // 0x0370 - 0x03ff Greek & Coptic
212 // 0x1f00 - 0x1fff Greek Extended
213
214 // Armenian script
215 // 0x0530 - 0x058f Armenian
216 // 0xfb13 - 0xfb17 Armenian subset of Alphabetic Presentation Forms
217
218 // Javanese script
219 // 0xa980 - 0xa9fd Javanese
220
221 // Sundanese script
222 // 0x1b80 - 0x1bbf Sundanese
223 // 0x1cc0 - 0x1ccf Sundanese supplement
224
225 // Ge'ez script (Ethiopic)
226 // 0x1200 - 0x137f Ethiopic
227 // 0x1380 - 0x139f Ethiopic supplement
228 // 0x2d80 - 0x2ddf Ethiopic Extended
229 // 0xab00 - 0xab2f Ethiopic Extended-A
230
231 // Baybayin Script
232 // 0x1700 - 0x171f Baybayin
233
234 // Ol Chiki Script
235 // 0x1c50 - 0x1c7f Ol Chiki
236
237 // Meitei Script
238 // 0xabc0 - 0xabff Meetei Mayek
239 // 0xaae0 - 0xaaff Meetei Mayek Extensions
240
241 // The Emoji which map to standardized Unicode characters
242 // 1. Emoticons ( 1F601 - 1F64F )
243 // 2. Dingbats ( 2700 - 27BF )
244 // 3. Transport and map symbols ( 1F680 - 1F6C0 )
245 // 4. Enclosed characters ( 24C2 - 1F251 )
246 // 5. Uncategorized :-S
247 // 6. Additional Emoticons ( 1F600 - 1F636 )
248 // 6b. Additional transport and map symbols ( 1F680 - 1F6FF ): http://unicode.org/charts/PDF/U1F680.pdf
249 // 6c. Other additional symbols ( 1F30D - 1F567 )
250 // 7. Supplemental Symbols and Pictographs ( 1F900–1F9FF ): http://unicode.org/charts/PDF/U1F900.pdf
251
252 // Symbols. Work around for these symbols.
253 // 0x25cb
254 // 0x25cf
255 // 0x25a1
256 // 0x25a0
257 // 0x2664
258 // 0x2661
259 // 0x2662
260 // 0x2667
261 // 0x2606
262 // 0x25aa
263 // 0x262a
264
265 /// character <= 0x077f
266 inline Script GetScriptTillArabicSupplement(Character character)
267 {
268   Script script = UNKNOWN;
269
270   if((0x0030 <= character) && (character <= 0x0039))
271   {
272     script = ASCII_DIGITS;
273   }
274   else if(character <= 0x007E)
275   {
276     if((0x0020 <= character) && (character <= 0x002F))
277     {
278       script = ASCII_PS;
279     }
280     else if((0x003A <= character) && (character <= 0x0040))
281     {
282       script = ASCII_PS;
283     }
284     else if((0x005B <= character) && (character <= 0x0060))
285     {
286       script = ASCII_PS;
287     }
288     else if((0x007B <= character) && (character <= 0x007E))
289     {
290       script = ASCII_PS;
291     }
292     else
293     {
294       script = LATIN;
295     }
296   }
297   else if((0x007F <= character) && (character <= 0x009F))
298   {
299     // 0x007F is actually part of C0 Controls and Basic Latin. However, is the last and only control character of its block
300     // and the following characters of the next block are consecutive.
301     script = C1_CONTROLS;
302   }
303   else if((0x00A0 <= character) && (character <= 0x00BF))
304   {
305     if(character == 0x00A9)
306     {
307       script = EMOJI; // 5. Uncategorized: copyright sign
308     }
309     else if(character == 0x00AE)
310     {
311       script = EMOJI; // 5. Uncategorized: registered sign
312     }
313     else
314     {
315       script = C1_PS;
316     }
317   }
318   else if(character == 0x00D7)
319   {
320     script = C1_MATH;
321   }
322   else if(character == 0x00F7)
323   {
324     script = C1_MATH;
325   }
326   else if((0x00C0 <= character) && (character <= 0x02ff))
327   {
328     if((0x02B9 <= character) && (character <= 0x02BF))
329     {
330       script = SML_P;
331     }
332     else
333     {
334       script = LATIN;
335     }
336   }
337   else if((0x0370 <= character) && (character <= 0x03ff))
338   {
339     script = GREEK;
340   }
341   else if((0x0400 <= character) && (character <= 0x04ff))
342   {
343     script = CYRILLIC;
344   }
345   else if((0x0500 <= character) && (character <= 0x052f))
346   {
347     script = CYRILLIC;
348   }
349   else if((0x0530 <= character) && (character <= 0x058f))
350   {
351     script = ARMENIAN;
352   }
353   else if((0x0591 <= character) && (character <= 0x05f4))
354   {
355     script = HEBREW;
356   }
357   else if((0x0600 <= character) && (character <= 0x06ff))
358   {
359     script = ARABIC;
360   }
361   else if((0x0750 <= character) && (character <= 0x077f))
362   {
363     script = ARABIC;
364   }
365
366   return script;
367 }
368
369 /// character <= 0x09ff
370 inline Script GetScriptTillBengali(Character character)
371 {
372   Script script = UNKNOWN;
373
374   if(character <= 0x077f)
375   {
376     script = GetScriptTillArabicSupplement(character);
377   }
378   else // > 0x077f
379   {
380     if((0x08A0 <= character) && (character <= 0x08ff))
381     {
382       script = ARABIC;
383     }
384     else if((0x0900 <= character) && (character <= 0x097f))
385     {
386       script = DEVANAGARI;
387     }
388     else if((0x0980 <= character) && (character <= 0x09ff))
389     {
390       script = BENGALI;
391     }
392   }
393
394   return script;
395 }
396
397 /// 0x09ff < character <= 0x0cff
398 inline Script GetScriptBetweenBengaliAndKannada(Character character)
399 {
400   Script script = UNKNOWN;
401
402   if(character <= 0x0b7f)
403   {
404     if((0x0a00 <= character) && (character <= 0x0a7f))
405     {
406       script = GURMUKHI;
407     }
408     else if((0x0a80 <= character) && (character <= 0x0aff))
409     {
410       script = GUJARATI;
411     }
412     else if((0x0b00 <= character) && (character <= 0x0b7f))
413     {
414       script = ORIYA;
415     }
416   }
417   else // > 0x0b7f
418   {
419     if((0x0b80 <= character) && (character <= 0x0bff))
420     {
421       script = TAMIL;
422     }
423     else if((0x0c00 <= character) && (character <= 0x0c7f))
424     {
425       script = TELUGU;
426     }
427     else if((0x0c80 <= character) && (character <= 0x0cff))
428     {
429       script = KANNADA;
430     }
431   }
432
433   return script;
434 }
435
436 /// 0x0cff < character <= 0x1eff
437 inline Script GetScriptBetweenKannadaAndLatinExtendedAdditional(Character character)
438 {
439   Script script = UNKNOWN;
440
441   if((0x0d00 <= character) && (character <= 0x0d7f))
442   {
443     script = MALAYALAM;
444   }
445   else if((0x0d80 <= character) && (character <= 0x0dff))
446   {
447     script = SINHALA;
448   }
449   else if((0x0e00 <= character) && (character <= 0x0e7f))
450   {
451     script = THAI;
452   }
453   else if((0x0e80 <= character) && (character <= 0x0eff))
454   {
455     script = LAO;
456   }
457   else if((0x1000 <= character) && (character <= 0x109f))
458   {
459     script = BURMESE;
460   }
461   else if((0x10a0 <= character) && (character <= 0x10ff))
462   {
463     script = GEORGIAN;
464   }
465   else if((0x1100 <= character) && (character <= 0x11ff))
466   {
467     script = HANGUL;
468   }
469   else if((0x1200 <= character) && (character <= 0x137f))
470   {
471     script = GEEZ;
472   }
473   else if((0x1380 <= character) && (character <= 0x139f))
474   {
475     script = GEEZ;
476   }
477   else if((0x1700 <= character) && (character <= 0x171f))
478   {
479     script = BAYBAYIN;
480   }
481   else if((0x1780 <= character) && (character <= 0x17ff))
482   {
483     script = KHMER;
484   }
485   else if((0x19e0 <= character) && (character <= 0x19ff))
486   {
487     script = KHMER;
488   }
489   else if((0x1b80 <= character) && (character <= 0x1bbf))
490   {
491     script = SUNDANESE;
492   }
493   else if((0x1c50 <= character) && (character <= 0x1c7f))
494   {
495     script = OL_CHIKI;
496   }
497   else if((0x1cc0 <= character) && (character <= 0x1ccf))
498   {
499     script = SUNDANESE;
500   }
501   else if((0x1d00 <= character) && (character <= 0x1eff))
502   {
503     if((0x1D26 <= character) && (character <= 0x1D2B))
504     {
505       script = PHONETIC_U;
506     }
507     else if((0x1D5D <= character) && (character <= 0x1D61))
508     {
509       script = PHONETIC_SS;
510     }
511     else if((0x1D66 <= character) && (character <= 0x1D6A))
512     {
513       script = PHONETIC_SS;
514     }
515     else if(character == 0x1D78)
516     {
517       script = PHONETIC_SS;
518     }
519     else if(character == 0x1DBF)
520     {
521       script = PHONETIC_SS;
522     }
523     else
524     {
525       script = LATIN;
526     }
527   }
528
529   return script;
530 }
531
532 /// 0x1eff < character <= 0x2c7f
533 inline Script GetScriptBetweenLatinExtendedAdditionalAndLatinExtendedC(Character character)
534 {
535   Script script = UNKNOWN;
536
537   if((0x1f00 <= character) && (character <= 0x1fff))
538   {
539     script = GREEK;
540   }
541   else if(character == 0x203c)
542   {
543     script = EMOJI; // 5. Uncategorized: double exclamation mark
544   }
545   else if(character == 0x2049)
546   {
547     script = EMOJI; // 5. Uncategorized: exclamation question mark
548   }
549   else if((0x2070 <= character) && (character <= 0x209f))
550   {
551     if(character == 0x2070)
552     {
553       script = NUMERIC_SS;
554     }
555     else if((0x2074 <= character) && (character <= 0x207E))
556     {
557       script = NUMERIC_SS;
558     }
559     else
560     {
561       script = LATIN;
562     }
563   }
564   else if(character == 0x20e3)
565   {
566     script = EMOJI; // 5. Uncategorized: combining enclosing keycap
567   }
568   else if(character == 0x2122)
569   {
570     script = EMOJI; // 5. Uncategorized: trade mark sign
571   }
572   else if(character == 0x2139)
573   {
574     script = EMOJI; // 5. Uncategorized: information source
575   }
576   else if((0x2100 <= character) && (character <= 0x2189))
577   {
578     if((0x2100 <= character) && (character <= 0x214f))
579     {
580       if((0x212A <= character) && (character <= 0x212B))
581       {
582         script = LATIN;
583       }
584       else if(character == 0x2132)
585       {
586         script = LATIN;
587       }
588       else if(character == 0x214E)
589       {
590         script = LATIN;
591       }
592       else
593       {
594         script = LETTER_LIKE;
595       }
596     }
597     else if((0x2150 <= character) && (character <= 0x215F))
598     {
599       script = FRACTIONS_NF;
600     }
601     else if(character == 0x2189)
602     {
603       script = FRACTIONS_NF;
604     }
605     else
606     {
607       script = LATIN;
608     }
609   }
610   // Symbols
611   else if((0x25cb == character) ||
612           (0x25cf == character) ||
613           (0x25a1 == character))
614   {
615     script = SYMBOLS1;
616   }
617   else if(0x25a0 == character)
618   {
619     script = SYMBOLS2;
620   }
621   else if((0x2664 == character) ||
622           (0x2661 == character) ||
623           (0x2662 == character) ||
624           (0x2667 == character))
625   {
626     script = SYMBOLS3;
627   }
628   else if((0x2606 == character) ||
629           (0x25aa == character))
630   {
631     script = SYMBOLS4;
632   }
633   else if(0x262a == character)
634   {
635     script = SYMBOLS5;
636   }
637   // U+2194 5. Uncategorized: left right arrow
638   // U+2B55 5. Uncategorized: heavy large circle
639   else if((0x2194 <= character) && (character <= 0x2B55))
640   {
641     script = EMOJI;
642   }
643   else if((0x2c60 <= character) && (character <= 0x2c7f))
644   {
645     script = LATIN;
646   }
647
648   return script;
649 }
650
651 /// 0x0cff < character <= 0x2c7f
652 inline Script GetScriptBetweenKannadaAndLatinExtendedC(Character character)
653 {
654   Script script = UNKNOWN;
655
656   if(character <= 0x1eff)
657   {
658     script = GetScriptBetweenKannadaAndLatinExtendedAdditional(character);
659   }
660   else // > 0x1eff
661   {
662     script = GetScriptBetweenLatinExtendedAdditionalAndLatinExtendedC(character);
663   }
664
665   return script;
666 }
667
668 /// 0x2c7f < character <= 0xa7ff
669 inline Script GetScriptBetweenLatinExtendedCAndLatinExtendedD(Character character)
670 {
671   Script script = UNKNOWN;
672
673   if((0x2d00 <= character) && (character <= 0x2d2f))
674   {
675     script = GEORGIAN;
676   }
677   else if((0x2d80 <= character) && (character <= 0x2ddf))
678   {
679     script = GEEZ;
680   }
681   else if((0x2de0 <= character) && (character <= 0x2dff))
682   {
683     script = CYRILLIC;
684   }
685   else if((0x2e80 <= character) && (character <= 0x2eff))
686   {
687     script = CJK;
688   }
689   else if((0x2f00 <= character) && (character <= 0x2fdf))
690   {
691     script = CJK;
692   }
693   else if((0x3000 <= character) && (character <= 0x303f))
694   {
695     script = CJK;
696   }
697   else if((0x3040 <= character) && (character <= 0x309f))
698   {
699     script = HIRAGANA;
700   }
701   else if((0x30a0 <= character) && (character <= 0x30ff))
702   {
703     script = KATAKANA;
704   }
705   else if((0x3100 <= character) && (character <= 0x312f))
706   {
707     script = BOPOMOFO;
708   }
709   else if((0x3130 <= character) && (character <= 0x318f))
710   {
711     script = HANGUL;
712   }
713   else if((0x31a0 <= character) && (character <= 0x31bf))
714   {
715     script = BOPOMOFO;
716   }
717   else if((0x3200 <= character) && (character <= 0x32ff))
718   {
719     script = CJK;
720   }
721   else if((0x3400 <= character) && (character <= 0x4dbf))
722   {
723     script = CJK;
724   }
725   else if((0x4e00 <= character) && (character <= 0x62ff))
726   {
727     script = CJK;
728   }
729   else if((0x6300 <= character) && (character <= 0x77ff))
730   {
731     script = CJK;
732   }
733   else if((0x7800 <= character) && (character <= 0x8cff))
734   {
735     script = CJK;
736   }
737   else if((0x8d00 <= character) && (character <= 0x9fff))
738   {
739     script = CJK;
740   }
741   else if((0xa640 <= character) && (character <= 0xa69f))
742   {
743     script = CYRILLIC;
744   }
745   else if((0xa720 <= character) && (character <= 0xa7ff))
746   {
747     if(character == 0xA720)
748     {
749       script = PHONETIC_U;
750     }
751     else if(character == 0xA721)
752     {
753       script = PHONETIC_U;
754     }
755     else if(character == 0xA788)
756     {
757       script = NON_LATIN_LED;
758     }
759     else if(character == 0xA789)
760     {
761       script = NON_LATIN_LED;
762     }
763     else if(character == 0xA78A)
764     {
765       script = NON_LATIN_LED;
766     }
767     else
768     {
769       script = LATIN;
770     }
771   }
772
773   return script;
774 }
775
776 /// 0x2c7f < character <= 0xfdff
777 inline Script GetScriptBetweenLatinExtendedCAndArabicPresentationFormsA(Character character)
778 {
779   Script script = GetScriptBetweenLatinExtendedCAndLatinExtendedD(character);
780
781   if((0xa960 <= character) && (character <= 0xa97f))
782   {
783     script = HANGUL;
784   }
785   else if((0xa980 <= character) && (character <= 0xa9fd))
786   {
787     script = JAVANESE;
788   }
789   else if((0xab00 <= character) && (character <= 0xab2f))
790   {
791     script = GEEZ;
792   }
793   else if((0xab30 <= character) && (character <= 0xab6f))
794   {
795     script = LATIN;
796   }
797   else if((0xaae0 <= character) && (character <= 0xaaff))
798   {
799     script = MEITEI;
800   }
801   else if((0xabc0 <= character) && (character <= 0xabff))
802   {
803     script = MEITEI;
804   }
805   else if((0xac00 <= character) && (character <= 0xd7af))
806   {
807     script = HANGUL;
808   }
809   else if((0xd7b0 <= character) && (character <= 0xd7ff))
810   {
811     script = HANGUL;
812   }
813   else if((0xfb00 <= character) && (character <= 0xfb06))
814   {
815     script = LATIN;
816   }
817   else if((0xfb13 <= character) && (character <= 0xfb17))
818   {
819     script = ARMENIAN;
820   }
821   else if((0xfb1d <= character) && (character <= 0xfb4f))
822   {
823     script = HEBREW;
824   }
825   else if((0xfb50 <= character) && (character <= 0xfdff))
826   {
827     script = ARABIC;
828   }
829
830   return script;
831 }
832
833 /// character > 0xfdff
834 inline Script GetScriptAboveArabicPresentationFormsA(Character character)
835 {
836   Script script = UNKNOWN;
837
838   if((0xfe70 <= character) && (character <= 0xfeff))
839   {
840     script = ARABIC;
841   }
842   else if((0xff00 <= character) && (character <= 0xffef))
843   {
844     if((0xFF00 <= character) && (character <= 0xFF20))
845     {
846       script = HWFW_S;
847     }
848     else if((0xFF3B <= character) && (character <= 0xFF40))
849     {
850       script = HWFW_S;
851     }
852     else if((0xFF5B <= character) && (character <= 0xFFEF))
853     {
854       script = HWFW_S;
855     }
856     else
857     {
858       script = LATIN;
859     }
860   }
861   else if((0x1ee00 <= character) && (character <= 0x1eeff))
862   {
863     script = ARABIC;
864   }
865   // U+1f170 4. Enclosed characters: negative squared latin capital letter A
866   // U+1f6ff 6b. Additional transport and map symbols
867   // Exclude U+1f170 ~ U+1f189. They are SYMBOLS_NSLCL (negative squared latin capital letter)
868   else if((0x1f170 <= character) && (character <= 0x1f6ff))
869   {
870     script = EMOJI;
871   }
872   // 7. Supplemental Symbols and Pictographs
873   else if((0x1f900 <= character) && (character <= 0x1f9ff))
874   {
875     script = EMOJI;
876   }
877   else if((0x20000 <= character) && (character <= 0x215ff))
878   {
879     script = CJK;
880   }
881   else if((0x21600 <= character) && (character <= 0x230ff))
882   {
883     script = CJK;
884   }
885   else if((0x23100 <= character) && (character <= 0x245ff))
886   {
887     script = CJK;
888   }
889   else if((0x24600 <= character) && (character <= 0x260ff))
890   {
891     script = CJK;
892   }
893   else if((0x26100 <= character) && (character <= 0x275ff))
894   {
895     script = CJK;
896   }
897   else if((0x27600 <= character) && (character <= 0x290ff))
898   {
899     script = CJK;
900   }
901   else if((0x29100 <= character) && (character <= 0x2a6df))
902   {
903     script = CJK;
904   }
905   else if((0x2a700 <= character) && (character <= 0x2b73f))
906   {
907     script = CJK;
908   }
909   else if((0x2b740 <= character) && (character <= 0x2b81f))
910   {
911     script = CJK;
912   }
913
914   return script;
915 }
916
917 /// character > 0x2c7f
918 inline Script GetScriptAboveLatinExtendedC(Character character)
919 {
920   Script script = UNKNOWN;
921
922   if(character <= 0xfdff)
923   {
924     script = GetScriptBetweenLatinExtendedCAndArabicPresentationFormsA(character);
925   }
926   else // > 0xfdff
927   {
928     script = GetScriptAboveArabicPresentationFormsA(character);
929   }
930
931   return script;
932 }
933
934 } // namespace
935
936 bool IsRightToLeftScript(Script script)
937 {
938   return ((ARABIC == script) ||
939           (HEBREW == script));
940 }
941
942 Script GetCharacterScript(Character character)
943 {
944   Script script = UNKNOWN;
945
946   if(IsTextPresentationSelector(character))
947   {
948     script = EMOJI_TEXT;
949   }
950   else if(IsEmojiPresentationSelector(character))
951   {
952     script = EMOJI_COLOR;
953   }
954   else if(IsEmojiItem(character))
955   {
956     script = EMOJI;
957   }
958   else if(IsNegativeSquaredLatinCapitalLetter(character))
959   {
960     script = SYMBOLS_NSLCL;
961   }
962   else if(IsCommonScript(character))
963   {
964     script = COMMON;
965   }
966   else if(character <= 0x0cff)
967   {
968     if(character <= 0x09ff)
969     {
970       script = GetScriptTillBengali(character);
971     }
972     else // > 0x09ff
973     {
974       script = GetScriptBetweenBengaliAndKannada(character);
975     }
976   }
977   else // > 0x0cff
978   {
979     if(character <= 0x2c7f)
980     {
981       script = GetScriptBetweenKannadaAndLatinExtendedC(character);
982     }
983     else // > 0x2c7f
984     {
985       script = GetScriptAboveLatinExtendedC(character);
986     }
987   }
988
989   return script;
990 }
991
992 bool IsWhiteSpace(Character character)
993 {
994   return character < WHITE_SPACE_THRESHOLD;
995 }
996
997 bool IsSpace(Character character)
998 {
999   return CHAR_SPACE == character;
1000 }
1001
1002 bool IsNewParagraph(Character character)
1003 {
1004   return ((CHAR_LF == character) ||
1005           (CHAR_VT == character) ||
1006           (CHAR_FF == character) ||
1007           (CHAR_CR == character) ||
1008           (CHAR_NEL == character) ||
1009           (CHAR_LS == character) ||
1010           (CHAR_PS == character));
1011 }
1012
1013 bool IsZeroWidthNonJoiner(Character character)
1014 {
1015   return CHAR_ZWNJ == character;
1016 }
1017
1018 bool IsZeroWidthJoiner(Character character)
1019 {
1020   return CHAR_ZWJ == character;
1021 }
1022
1023 bool IsZeroWidthSpace(Character character)
1024 {
1025   return CHAR_ZWS == character;
1026 }
1027
1028 bool IsLeftToRightMark(Character character)
1029 {
1030   return CHAR_LTRM == character;
1031 }
1032
1033 bool IsRightToLeftMark(Character character)
1034 {
1035   return CHAR_RTLM == character;
1036 }
1037
1038 bool IsThinSpace(Character character)
1039 {
1040   return CHAR_TS == character;
1041 }
1042
1043 bool IsCommonScript(Character character)
1044 {
1045   return (IsWhiteSpace(character) ||
1046           IsZeroWidthNonJoiner(character) ||
1047           IsZeroWidthJoiner(character) ||
1048           IsZeroWidthSpace(character) ||
1049           IsLeftToRightMark(character) ||
1050           IsRightToLeftMark(character) ||
1051           IsThinSpace(character) ||
1052           IsNewParagraph(character));
1053 }
1054
1055 bool HasLigatureMustBreak(Script script)
1056 {
1057   return ((LATIN == script) ||
1058           (ARABIC == script));
1059 }
1060
1061 Length GetNumberOfScripts()
1062 {
1063   return SYMBOLS_NSLCL + 1;
1064 }
1065
1066 } // namespace TextAbstraction
1067
1068 } // namespace Dali