62dfaa92044c0879dd26ab0ca1e2f2ba951ae92d
[platform/core/uifw/dali-adaptor.git] / dali / devel-api / text-abstraction / script.cpp
1 /*
2  * Copyright (c) 2021 Samsung Electronics Co., Ltd.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */
17
18 // FILE HEADER
19 #include <dali/devel-api/text-abstraction/script.h>
20
21 namespace Dali
22 {
23 namespace TextAbstraction
24 {
25 namespace
26 {
// Upper bound used to detect white spaces.
constexpr unsigned int WHITE_SPACE_THRESHOLD{0x21}; ///< Exclusive bound: all characters below 0x21 are considered white spaces.

// Characters recognised as paragraph separators, sorted by code point.
constexpr unsigned int CHAR_LF{0x000A};  ///< NL Line feed, new line.
constexpr unsigned int CHAR_VT{0x000B};  ///< Vertical tab.
constexpr unsigned int CHAR_FF{0x000C};  ///< NP Form feed, new page.
constexpr unsigned int CHAR_CR{0x000D};  ///< Carriage return, new line.
constexpr unsigned int CHAR_NEL{0x0085}; ///< Next line.
constexpr unsigned int CHAR_LS{0x2028};  ///< Line separator.
constexpr unsigned int CHAR_PS{0x2029};  ///< Paragraph separator.

// Spacing, joining and directional characters, sorted by code point.
constexpr unsigned int CHAR_TS{0x2009};   ///< Thin space.
constexpr unsigned int CHAR_ZWS{0x200B};  ///< Zero width space.
constexpr unsigned int CHAR_ZWNJ{0x200C}; ///< Zero width non joiner.
constexpr unsigned int CHAR_ZWJ{0x200D};  ///< Zero width joiner.
constexpr unsigned int CHAR_LTRM{0x200E}; ///< Left to right mark.
constexpr unsigned int CHAR_RTLM{0x200F}; ///< Right to left mark.
42
43 // Latin script:   It contains punctuation characters and symbols which are not part of the latin script. https://en.wikipedia.org/wiki/Latin_script_in_Unicode
44 // 0x0000 - 0x007f C0 Controls and Basic Latin
45 //
46 //                 ASCII digits (not part of LATIN script):
47 //                 0x0030 - 0x0039
48 //
49 //                 ASCII punctuation and symbols (not part of LATIN script):
50 //                 0x0020 - 0x002F
51 //                 0x003A - 0x0040
52 //                 0x005B - 0x0060
53 //                 0x007B - 0x007E
54 //
55 //                 Controls (not part of LATIN script):
56 //                 0x007F
57 //
58 // 0x0080 - 0x00ff C1 Controls and Latin-1 Supplement
59 //
60 //                 Controls (not part of LATIN script):
61 //                 0x0080 - 0x009F
62 //
63 //                 Punctuations and symbols (not part of LATIN script):
64 //                 0x00A0 - 0x00BF
65 //
66 //                 Mathematical operators (not part of LATIN script):
67 //                 0x00D7
68 //                 0x00F7
69 //
70 // 0x0100 - 0x017f Latin Extended-A
71 // 0x0180 - 0x024f Latin Extended-B
72 // 0x0250 - 0x02af IPA Extensions
73 // 0x02b0 - 0x02ff Spacing Modifier Letters
74 //
75 //                 Punctuation (not part of LATIN script):
76 //                 0x02B9 - 0x02BF
77 //
78 // 0x1d00 - 0x1d7f Phonetic Extensions
79 //
80 //                 Uralic Phonetic (not part of LATIN script):
81 //                 0x1D26 - 0x1D2B
82 //
83 //                 Subscripts and superscripts
84 //                 0x1D5D - 0x1D61
85 //                 0x1D66 - 0x1D6A
86 //                 0x1D78
87 //
88 // 0x1d80 - 0x1dbf Phonetic Extensions Supplement
89 //
90 //                 0x1DBF (subscript or superscript. Not part of LATIN script )
91 //
92 // 0x1e00 - 0x1eff Latin Extended Additional
93 // 0x2070 - 0x209f Superscripts and Subscripts
94 //
95 //                 0x2070          (not part of LATIN script)
96 //                 0x2074 - 0x207E (not part of LATIN script)
97 //
98 // 0x2100 - 0x214f Letterlike symbols (not part of LATIN script)
99 //
100 //                 0x212A - 0x212B (are part of LATIN script)
101 //                 0x2132          (are part of LATIN script)
102 //                 0x214E          (are part of LATIN script)
103 //
104 // 0x2150 - 0x2189 Number Forms
105 //
106 //                 0x2150 - 0x215F Fractions (not part of LATIN script)
107 //                 0x2189          Fractions (not part of LATIN script)
108 //
109 // 0x2c60 - 0x2c7f Latin Extended-C
110 // 0xa720 - 0xa7ff Latin Extended-D
111 //
112 //                 0xA720 - 0xA721 Uralic Phonetic (not part of LATIN script)
113 //                 0xA788          (not part of LATIN script)
114 //                 0xA789 - 0xA78A Budu (not part of LATIN script)
115 //
116 // 0xab30 - 0xab6f Latin Extended-E
117 //
118 // 0xfb00 - 0xfb06 Latin Alphabetic Presentation Forms
119 // 0xff00 - 0xffef Halfwidth and Fullwidth Forms
120 //
121 //                 0xFF00 - 0xFF20 HWFW Symbols (not part of LATIN script)
122 //                 0xFF3B - 0xFF40 HWFW Symbols (not part of LATIN script)
123 //                 0xFF5B - 0xFFEF HWFW Symbols (not part of LATIN script)
124
125 // Brahmic scripts:
126 // 0x0900 - 0x097f Devanagari
127 // 0x0980 - 0x09ff Bengali
128 // 0x0a00 - 0x0a7f Gurmukhi
129 // 0x0a80 - 0x0aff Gujarati
130 // 0x0b00 - 0x0b7f Oriya
131 // 0x0b80 - 0x0bff Tamil
132 // 0x0c00 - 0x0c7f Telugu
133 // 0x0c80 - 0x0cff Kannada
134 // 0x0d00 - 0x0d7f Malayalam
135
136 // Sinhala script.
137 // 0x0d80 - 0x0dff Sinhala
138
139 // Arabic script.
140 // 0x0600 - 0x06ff Arabic
141 // 0x0750 - 0x077f Arabic Supplement
142 // 0x08A0 - 0x08ff Arabic Extended-A
143 // 0xfb50 - 0xfdff Arabic Presentation Forms-A
144 // 0xfe70 - 0xfeff Arabic Presentation Forms-B
145 // 0x1ee00 - 0x1eeff Arabic Mathematical Alphabetic Symbols
146
147 // CJK (Chinese, Japanese and Korean) and Vietnamese script.
148 // 0x2e80 - 0x2eff CJK Radicals Supplement
149 // 0x2f00 - 0x2fdf Kangxi Radicals
150 // 0x3000 - 0x303f CJK Symbols and Punctuation
151 // 0x3200 - 0x32ff Enclosed CJK Letters and Months
152 // 0x3400 - 0x4dbf CJK Unified Ideographs Extension A
153 // 0x4e00 - 0x62ff CJK Unified Ideographs
154 // 0x6300 - 0x77ff CJK Unified Ideographs
155 // 0x7800 - 0x8cff CJK Unified Ideographs
156 // 0x8d00 - 0x9fff CJK Unified Ideographs
157 // 0x20000 - 0x215ff CJK Unified Ideographs Extension B
158 // 0x21600 - 0x230ff CJK Unified Ideographs Extension B
159 // 0x23100 - 0x245ff CJK Unified Ideographs Extension B
160 // 0x24600 - 0x260ff CJK Unified Ideographs Extension B
161 // 0x26100 - 0x275ff CJK Unified Ideographs Extension B
162 // 0x27600 - 0x290ff CJK Unified Ideographs Extension B
163 // 0x29100 - 0x2a6df CJK Unified Ideographs Extension B
164 // 0x2a700 - 0x2b73f CJK Unified Ideographs Extension C
165 // 0x2b740 - 0x2b81f CJK Unified Ideographs Extension D
166
167 // Japanese scripts.
168 // 0x3040 - 0x309f Hiragana
169 // 0x30a0 - 0x30ff Katakana
170
171 // Hangul script
172 // 0x1100 - 0x11ff Hangul jamo
173 // 0x3130 - 0x318f Hangul Compatibility Jamo
174 // 0xa960 - 0xa97f Hangul Jamo Extended-A
175 // 0xac00 - 0xd7af Hangul Syllables
176 // 0xd7b0 - 0xd7ff Hangul Jamo Extended-B
177
178 // Bopomofo script
179 // 0x3100 - 0x312f Bopomofo
180 // 0x31a0 - 0x31bf Bopomofo Extended
181
182 // Khmer script
183 // 0x1780 - 0x17ff Khmer
184 // 0x19e0 - 0x19ff Khmer Symbols
185
186 // Lao script
187 // 0x0e80 - 0x0eff Lao
188
189 // Thai script
190 // 0x0e00 - 0x0e7f Thai
191
192 // Burmese script
193 // 0x1000 - 0x109f Myanmar
194
195 // Hebrew script
196 // 0x0591 - 0x05f4 Hebrew
197 // 0xfb1d - 0xfb4f Hebrew subset of Alphabetic Presentation Forms
198
199 // Cyrillic script
200 // 0x0400 - 0x04ff Cyrillic
201 // 0x0500 - 0x052f Cyrillic supplement
202 // 0x2de0 - 0x2dff Cyrillic Extended-A
203 // 0xa640 - 0xa69f Cyrillic Extended-B
204
205 // Georgian script
206 // 0x10a0 - 0x10ff Georgian
207 // 0x2d00 - 0x2d2f Georgian supplement
208
209 // Greek script
210 // 0x0370 - 0x03ff Greek & Coptic
211 // 0x1f00 - 0x1fff Greek Extended
212
213 // Armenian script
214 // 0x0530 - 0x058f Armenian
215 // 0xfb13 - 0xfb17 Armenian subset of Alphabetic presentation forms
216
217 // Javanese script
218 // 0xa980 - 0xa9fd Javanese
219
220 // Sundanese script
221 // 0x1b80 - 0x1bbf Sundanese
222 // 0x1cc0 - 0x1ccf Sundanese supplement
223
224 // Ge'ez script (Ethiopic)
225 // 0x1200 - 0x137f Ethiopic
226 // 0x1380 - 0x139f Ethiopic supplement
227 // 0x2d80 - 0x2ddf Ethiopic Extended
228 // 0xab00 - 0xab2f Ethiopic Extended-A
229
230 // Baybayin Script
231 // 0x1700 - 0x171f Baybayin
232
233 // Ol Chiki Script
234 // 0x1c50 - 0x1c7f Ol Chiki
235
236 // Meitei Script
237 // 0xabc0 - 0xabff Meetei Mayek
238 // 0xaae0 - 0xaaff Meetei Mayek Extensions
239
240 // The Emoji which map to standardized Unicode characters
241 // 1. Emoticons ( 1F601 - 1F64F )
242 // 2. Dingbats ( 2700 - 27BF )
243 // 3. Transport and map symbols ( 1F680 - 1F6C0 )
244 // 4. Enclosed characters ( 24C2 - 1F251 )
245 // 5. Uncategorized :-S
246 // 6. Additional Emoticons ( 1F600 - 1F636 )
247 // 6b. Additional transport and map symbols ( 1F680 - 1F6FF ): http://unicode.org/charts/PDF/U1F680.pdf
248 // 6c. Other additional symbols ( 1F30D - 1F567 )
249 // 7. Supplemental Symbols and Pictographs ( 1F900–1F9FF ): http://unicode.org/charts/PDF/U1F900.pdf
250
251 // Symbols. Work around for these symbols.
252 // 0x25cb
253 // 0x25cf
254 // 0x25a1
255 // 0x25a0
256 // 0x2664
257 // 0x2661
258 // 0x2662
259 // 0x2667
260 // 0x2606
261 // 0x25aa
262 // 0x262a
263
264 /// character <= 0x077f
265 inline Script GetScriptTillArabicSupplement(Character character)
266 {
267   Script script = UNKNOWN;
268
269   if((0x0030 <= character) && (character <= 0x0039))
270   {
271     script = ASCII_DIGITS;
272   }
273   else if(character <= 0x007E)
274   {
275     if((0x0020 <= character) && (character <= 0x002F))
276     {
277       script = ASCII_PS;
278     }
279     else if((0x003A <= character) && (character <= 0x0040))
280     {
281       script = ASCII_PS;
282     }
283     else if((0x005B <= character) && (character <= 0x0060))
284     {
285       script = ASCII_PS;
286     }
287     else if((0x007B <= character) && (character <= 0x007E))
288     {
289       script = ASCII_PS;
290     }
291     else
292     {
293       script = LATIN;
294     }
295   }
296   else if((0x007F <= character) && (character <= 0x009F))
297   {
298     // 0x007F is actually part of C0 Controls and Basic Latin. However, is the last and only control character of its block
299     // and the following characters of the next block are consecutive.
300     script = C1_CONTROLS;
301   }
302   else if((0x00A0 <= character) && (character <= 0x00BF))
303   {
304     if(character == 0x00A9)
305     {
306       script = EMOJI; // 5. Uncategorized: copyright sign
307     }
308     else if(character == 0x00AE)
309     {
310       script = EMOJI; // 5. Uncategorized: registered sign
311     }
312     else
313     {
314       script = C1_PS;
315     }
316   }
317   else if(character == 0x00D7)
318   {
319     script = C1_MATH;
320   }
321   else if(character == 0x00F7)
322   {
323     script = C1_MATH;
324   }
325   else if((0x00C0 <= character) && (character <= 0x02ff))
326   {
327     if((0x02B9 <= character) && (character <= 0x02BF))
328     {
329       script = SML_P;
330     }
331     else
332     {
333       script = LATIN;
334     }
335   }
336   else if((0x0370 <= character) && (character <= 0x03ff))
337   {
338     script = GREEK;
339   }
340   else if((0x0400 <= character) && (character <= 0x04ff))
341   {
342     script = CYRILLIC;
343   }
344   else if((0x0500 <= character) && (character <= 0x052f))
345   {
346     script = CYRILLIC;
347   }
348   else if((0x0530 <= character) && (character <= 0x058f))
349   {
350     script = ARMENIAN;
351   }
352   else if((0x0591 <= character) && (character <= 0x05f4))
353   {
354     script = HEBREW;
355   }
356   else if((0x0600 <= character) && (character <= 0x06ff))
357   {
358     script = ARABIC;
359   }
360   else if((0x0750 <= character) && (character <= 0x077f))
361   {
362     script = ARABIC;
363   }
364
365   return script;
366 }
367
368 /// character <= 0x09ff
369 inline Script GetScriptTillBengali(Character character)
370 {
371   Script script = UNKNOWN;
372
373   if(character <= 0x077f)
374   {
375     script = GetScriptTillArabicSupplement(character);
376   }
377   else // > 0x077f
378   {
379     if((0x08A0 <= character) && (character <= 0x08ff))
380     {
381       script = ARABIC;
382     }
383     else if((0x0900 <= character) && (character <= 0x097f))
384     {
385       script = DEVANAGARI;
386     }
387     else if((0x0980 <= character) && (character <= 0x09ff))
388     {
389       script = BENGALI;
390     }
391   }
392
393   return script;
394 }
395
396 /// 0x09ff < character <= 0x0cff
397 inline Script GetScriptBetweenBengaliAndKannada(Character character)
398 {
399   Script script = UNKNOWN;
400
401   if(character <= 0x0b7f)
402   {
403     if((0x0a00 <= character) && (character <= 0x0a7f))
404     {
405       script = GURMUKHI;
406     }
407     else if((0x0a80 <= character) && (character <= 0x0aff))
408     {
409       script = GUJARATI;
410     }
411     else if((0x0b00 <= character) && (character <= 0x0b7f))
412     {
413       script = ORIYA;
414     }
415   }
416   else // > 0x0b7f
417   {
418     if((0x0b80 <= character) && (character <= 0x0bff))
419     {
420       script = TAMIL;
421     }
422     else if((0x0c00 <= character) && (character <= 0x0c7f))
423     {
424       script = TELUGU;
425     }
426     else if((0x0c80 <= character) && (character <= 0x0cff))
427     {
428       script = KANNADA;
429     }
430   }
431
432   return script;
433 }
434
435 /// 0x0cff < character <= 0x1eff
436 inline Script GetScriptBetweenKannadaAndLatinExtendedAdditional(Character character)
437 {
438   Script script = UNKNOWN;
439
440   if((0x0d00 <= character) && (character <= 0x0d7f))
441   {
442     script = MALAYALAM;
443   }
444   else if((0x0d80 <= character) && (character <= 0x0dff))
445   {
446     script = SINHALA;
447   }
448   else if((0x0e00 <= character) && (character <= 0x0e7f))
449   {
450     script = THAI;
451   }
452   else if((0x0e80 <= character) && (character <= 0x0eff))
453   {
454     script = LAO;
455   }
456   else if((0x1000 <= character) && (character <= 0x109f))
457   {
458     script = BURMESE;
459   }
460   else if((0x10a0 <= character) && (character <= 0x10ff))
461   {
462     script = GEORGIAN;
463   }
464   else if((0x1100 <= character) && (character <= 0x11ff))
465   {
466     script = HANGUL;
467   }
468   else if((0x1200 <= character) && (character <= 0x137f))
469   {
470     script = GEEZ;
471   }
472   else if((0x1380 <= character) && (character <= 0x139f))
473   {
474     script = GEEZ;
475   }
476   else if((0x1700 <= character) && (character <= 0x171f))
477   {
478     script = BAYBAYIN;
479   }
480   else if((0x1780 <= character) && (character <= 0x17ff))
481   {
482     script = KHMER;
483   }
484   else if((0x19e0 <= character) && (character <= 0x19ff))
485   {
486     script = KHMER;
487   }
488   else if((0x1b80 <= character) && (character <= 0x1bbf))
489   {
490     script = SUNDANESE;
491   }
492   else if((0x1c50 <= character) && (character <= 0x1c7f))
493   {
494     script = OL_CHIKI;
495   }
496   else if((0x1cc0 <= character) && (character <= 0x1ccf))
497   {
498     script = SUNDANESE;
499   }
500   else if((0x1d00 <= character) && (character <= 0x1eff))
501   {
502     if((0x1D26 <= character) && (character <= 0x1D2B))
503     {
504       script = PHONETIC_U;
505     }
506     else if((0x1D5D <= character) && (character <= 0x1D61))
507     {
508       script = PHONETIC_SS;
509     }
510     else if((0x1D66 <= character) && (character <= 0x1D6A))
511     {
512       script = PHONETIC_SS;
513     }
514     else if(character == 0x1D78)
515     {
516       script = PHONETIC_SS;
517     }
518     else if(character == 0x1DBF)
519     {
520       script = PHONETIC_SS;
521     }
522     else
523     {
524       script = LATIN;
525     }
526   }
527
528   return script;
529 }
530
531 /// 0x1eff < character <= 0x2c7f
532 inline Script GetScriptBetweenLatinExtendedAdditionalAndLatinExtendedC(Character character)
533 {
534   Script script = UNKNOWN;
535
536   if((0x1f00 <= character) && (character <= 0x1fff))
537   {
538     script = GREEK;
539   }
540   else if(character == 0x203c)
541   {
542     script = EMOJI; // 5. Uncategorized: double exclamation mark
543   }
544   else if(character == 0x2049)
545   {
546     script = EMOJI; // 5. Uncategorized: exclamation question mark
547   }
548   else if((0x2070 <= character) && (character <= 0x209f))
549   {
550     if(character == 0x2070)
551     {
552       script = NUMERIC_SS;
553     }
554     else if((0x2074 <= character) && (character <= 0x207E))
555     {
556       script = NUMERIC_SS;
557     }
558     else
559     {
560       script = LATIN;
561     }
562   }
563   else if(character == 0x20e3)
564   {
565     script = EMOJI; // 5. Uncategorized: combining enclosing keycap
566   }
567   else if(character == 0x2122)
568   {
569     script = EMOJI; // 5. Uncategorized: trade mark sign
570   }
571   else if(character == 0x2139)
572   {
573     script = EMOJI; // 5. Uncategorized: information source
574   }
575   else if((0x2100 <= character) && (character <= 0x2189))
576   {
577     if((0x2100 <= character) && (character <= 0x214f))
578     {
579       if((0x212A <= character) && (character <= 0x212B))
580       {
581         script = LATIN;
582       }
583       else if(character == 0x2132)
584       {
585         script = LATIN;
586       }
587       else if(character == 0x214E)
588       {
589         script = LATIN;
590       }
591       else
592       {
593         script = LETTER_LIKE;
594       }
595     }
596     else if((0x2150 <= character) && (character <= 0x215F))
597     {
598       script = FRACTIONS_NF;
599     }
600     else if(character == 0x2189)
601     {
602       script = FRACTIONS_NF;
603     }
604     else
605     {
606       script = LATIN;
607     }
608   }
609   // Symbols
610   else if((0x25cb == character) ||
611           (0x25cf == character) ||
612           (0x25a1 == character))
613   {
614     script = SYMBOLS1;
615   }
616   else if(0x25a0 == character)
617   {
618     script = SYMBOLS2;
619   }
620   else if((0x2664 == character) ||
621           (0x2661 == character) ||
622           (0x2662 == character) ||
623           (0x2667 == character))
624   {
625     script = SYMBOLS3;
626   }
627   else if((0x2606 == character) ||
628           (0x25aa == character))
629   {
630     script = SYMBOLS4;
631   }
632   else if(0x262a == character)
633   {
634     script = SYMBOLS5;
635   }
636   // U+2194 5. Uncategorized: left right arrow
637   // U+2B55 5. Uncategorized: heavy large circle
638   else if((0x2194 <= character) && (character <= 0x2B55))
639   {
640     script = EMOJI;
641   }
642   else if((0x2c60 <= character) && (character <= 0x2c7f))
643   {
644     script = LATIN;
645   }
646
647   return script;
648 }
649
650 /// 0x0cff < character <= 0x2c7f
651 inline Script GetScriptBetweenKannadaAndLatinExtendedC(Character character)
652 {
653   Script script = UNKNOWN;
654
655   if(character <= 0x1eff)
656   {
657     script = GetScriptBetweenKannadaAndLatinExtendedAdditional(character);
658   }
659   else // > 0x1eff
660   {
661     script = GetScriptBetweenLatinExtendedAdditionalAndLatinExtendedC(character);
662   }
663
664   return script;
665 }
666
667 /// 0x2c7f < character <= 0xa7ff
668 inline Script GetScriptBetweenLatinExtendedCAndLatinExtendedD(Character character)
669 {
670   Script script = UNKNOWN;
671
672   if((0x2d00 <= character) && (character <= 0x2d2f))
673   {
674     script = GEORGIAN;
675   }
676   else if((0x2d80 <= character) && (character <= 0x2ddf))
677   {
678     script = GEEZ;
679   }
680   else if((0x2de0 <= character) && (character <= 0x2dff))
681   {
682     script = CYRILLIC;
683   }
684   else if((0x2e80 <= character) && (character <= 0x2eff))
685   {
686     script = CJK;
687   }
688   else if((0x2f00 <= character) && (character <= 0x2fdf))
689   {
690     script = CJK;
691   }
692   else if((0x3000 <= character) && (character <= 0x303f))
693   {
694     script = CJK;
695   }
696   else if((0x3040 <= character) && (character <= 0x309f))
697   {
698     script = HIRAGANA;
699   }
700   else if((0x30a0 <= character) && (character <= 0x30ff))
701   {
702     script = KATAKANA;
703   }
704   else if((0x3100 <= character) && (character <= 0x312f))
705   {
706     script = BOPOMOFO;
707   }
708   else if((0x3130 <= character) && (character <= 0x318f))
709   {
710     script = HANGUL;
711   }
712   else if((0x31a0 <= character) && (character <= 0x31bf))
713   {
714     script = BOPOMOFO;
715   }
716   else if((0x3200 <= character) && (character <= 0x32ff))
717   {
718     script = CJK;
719   }
720   else if((0x3400 <= character) && (character <= 0x4dbf))
721   {
722     script = CJK;
723   }
724   else if((0x4e00 <= character) && (character <= 0x62ff))
725   {
726     script = CJK;
727   }
728   else if((0x6300 <= character) && (character <= 0x77ff))
729   {
730     script = CJK;
731   }
732   else if((0x7800 <= character) && (character <= 0x8cff))
733   {
734     script = CJK;
735   }
736   else if((0x8d00 <= character) && (character <= 0x9fff))
737   {
738     script = CJK;
739   }
740   else if((0xa640 <= character) && (character <= 0xa69f))
741   {
742     script = CYRILLIC;
743   }
744   else if((0xa720 <= character) && (character <= 0xa7ff))
745   {
746     if(character == 0xA720)
747     {
748       script = PHONETIC_U;
749     }
750     else if(character == 0xA721)
751     {
752       script = PHONETIC_U;
753     }
754     else if(character == 0xA788)
755     {
756       script = NON_LATIN_LED;
757     }
758     else if(character == 0xA789)
759     {
760       script = NON_LATIN_LED;
761     }
762     else if(character == 0xA78A)
763     {
764       script = NON_LATIN_LED;
765     }
766     else
767     {
768       script = LATIN;
769     }
770   }
771
772   return script;
773 }
774
775 /// 0x2c7f < character <= 0xfdff
776 inline Script GetScriptBetweenLatinExtendedCAndArabicPresentationFormsA(Character character)
777 {
778   Script script = GetScriptBetweenLatinExtendedCAndLatinExtendedD(character);
779
780   if((0xa960 <= character) && (character <= 0xa97f))
781   {
782     script = HANGUL;
783   }
784   else if((0xa980 <= character) && (character <= 0xa9fd))
785   {
786     script = JAVANESE;
787   }
788   else if((0xab00 <= character) && (character <= 0xab2f))
789   {
790     script = GEEZ;
791   }
792   else if((0xab30 <= character) && (character <= 0xab6f))
793   {
794     script = LATIN;
795   }
796   else if((0xaae0 <= character) && (character <= 0xaaff))
797   {
798     script = MEITEI;
799   }
800   else if((0xabc0 <= character) && (character <= 0xabff))
801   {
802     script = MEITEI;
803   }
804   else if((0xac00 <= character) && (character <= 0xd7af))
805   {
806     script = HANGUL;
807   }
808   else if((0xd7b0 <= character) && (character <= 0xd7ff))
809   {
810     script = HANGUL;
811   }
812   else if((0xfb00 <= character) && (character <= 0xfb06))
813   {
814     script = LATIN;
815   }
816   else if((0xfb13 <= character) && (character <= 0xfb17))
817   {
818     script = ARMENIAN;
819   }
820   else if((0xfb1d <= character) && (character <= 0xfb4f))
821   {
822     script = HEBREW;
823   }
824   else if((0xfb50 <= character) && (character <= 0xfdff))
825   {
826     script = ARABIC;
827   }
828
829   return script;
830 }
831
832 /// character > 0xfdff
833 inline Script GetScriptAboveArabicPresentationFormsA(Character character)
834 {
835   Script script = UNKNOWN;
836
837   if((0xfe70 <= character) && (character <= 0xfeff))
838   {
839     script = ARABIC;
840   }
841   else if((0xff00 <= character) && (character <= 0xffef))
842   {
843     if((0xFF00 <= character) && (character <= 0xFF20))
844     {
845       script = HWFW_S;
846     }
847     else if((0xFF3B <= character) && (character <= 0xFF40))
848     {
849       script = HWFW_S;
850     }
851     else if((0xFF5B <= character) && (character <= 0xFFEF))
852     {
853       script = HWFW_S;
854     }
855     else
856     {
857       script = LATIN;
858     }
859   }
860   else if((0x1ee00 <= character) && (character <= 0x1eeff))
861   {
862     script = ARABIC;
863   }
864   // U+1f170 4. Enclosed characters: negative squared latin capital letter A
865   // U+1f6ff 6b. Additional transport and map symbols
866   else if((0x1f170 <= character) && (character <= 0x1f6ff))
867   {
868     script = EMOJI;
869   }
870   // 7. Supplemental Symbols and Pictographs
871   else if((0x1f900 <= character) && (character <= 0x1f9ff))
872   {
873     script = EMOJI;
874   }
875   else if((0x20000 <= character) && (character <= 0x215ff))
876   {
877     script = CJK;
878   }
879   else if((0x21600 <= character) && (character <= 0x230ff))
880   {
881     script = CJK;
882   }
883   else if((0x23100 <= character) && (character <= 0x245ff))
884   {
885     script = CJK;
886   }
887   else if((0x24600 <= character) && (character <= 0x260ff))
888   {
889     script = CJK;
890   }
891   else if((0x26100 <= character) && (character <= 0x275ff))
892   {
893     script = CJK;
894   }
895   else if((0x27600 <= character) && (character <= 0x290ff))
896   {
897     script = CJK;
898   }
899   else if((0x29100 <= character) && (character <= 0x2a6df))
900   {
901     script = CJK;
902   }
903   else if((0x2a700 <= character) && (character <= 0x2b73f))
904   {
905     script = CJK;
906   }
907   else if((0x2b740 <= character) && (character <= 0x2b81f))
908   {
909     script = CJK;
910   }
911
912   return script;
913 }
914
915 /// character > 0x2c7f
916 inline Script GetScriptAboveLatinExtendedC(Character character)
917 {
918   Script script = UNKNOWN;
919
920   if(character <= 0xfdff)
921   {
922     script = GetScriptBetweenLatinExtendedCAndArabicPresentationFormsA(character);
923   }
924   else // > 0xfdff
925   {
926     script = GetScriptAboveArabicPresentationFormsA(character);
927   }
928
929   return script;
930 }
931
932 } // namespace
933
934 bool IsRightToLeftScript(Script script)
935 {
936   return ((ARABIC == script) ||
937           (HEBREW == script));
938 }
939
940 Script GetCharacterScript(Character character)
941 {
942   Script script = UNKNOWN;
943
944   if(IsCommonScript(character))
945   {
946     script = COMMON;
947   }
948   else if(character <= 0x0cff)
949   {
950     if(character <= 0x09ff)
951     {
952       script = GetScriptTillBengali(character);
953     }
954     else // > 0x09ff
955     {
956       script = GetScriptBetweenBengaliAndKannada(character);
957     }
958   }
959   else // > 0x0cff
960   {
961     if(character <= 0x2c7f)
962     {
963       script = GetScriptBetweenKannadaAndLatinExtendedC(character);
964     }
965     else // > 0x2c7f
966     {
967       script = GetScriptAboveLatinExtendedC(character);
968     }
969   }
970
971   return script;
972 }
973
974 bool IsWhiteSpace(Character character)
975 {
976   return character < WHITE_SPACE_THRESHOLD;
977 }
978
979 bool IsNewParagraph(Character character)
980 {
981   return ((CHAR_LF == character) ||
982           (CHAR_VT == character) ||
983           (CHAR_FF == character) ||
984           (CHAR_CR == character) ||
985           (CHAR_NEL == character) ||
986           (CHAR_LS == character) ||
987           (CHAR_PS == character));
988 }
989
990 bool IsZeroWidthNonJoiner(Character character)
991 {
992   return CHAR_ZWNJ == character;
993 }
994
995 bool IsZeroWidthJoiner(Character character)
996 {
997   return CHAR_ZWJ == character;
998 }
999
1000 bool IsZeroWidthSpace(Character character)
1001 {
1002   return CHAR_ZWS == character;
1003 }
1004
1005 bool IsLeftToRightMark(Character character)
1006 {
1007   return CHAR_LTRM == character;
1008 }
1009
1010 bool IsRightToLeftMark(Character character)
1011 {
1012   return CHAR_RTLM == character;
1013 }
1014
1015 bool IsThinSpace(Character character)
1016 {
1017   return CHAR_TS == character;
1018 }
1019
1020 bool IsCommonScript(Character character)
1021 {
1022   return (IsWhiteSpace(character) ||
1023           IsZeroWidthNonJoiner(character) ||
1024           IsZeroWidthJoiner(character) ||
1025           IsZeroWidthSpace(character) ||
1026           IsLeftToRightMark(character) ||
1027           IsRightToLeftMark(character) ||
1028           IsThinSpace(character) ||
1029           IsNewParagraph(character));
1030 }
1031
1032 bool HasLigatureMustBreak(Script script)
1033 {
1034   return ((LATIN == script) ||
1035           (ARABIC == script));
1036 }
1037
1038 } // namespace TextAbstraction
1039
1040 } // namespace Dali